{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 539476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00037073011589023425, "grad_norm": 99.50769805908203, "learning_rate": 1.835032437442076e-06, "loss": 3.9874, "step": 100 }, { "epoch": 0.0007414602317804685, "grad_norm": 44.35987091064453, "learning_rate": 3.688600556070436e-06, "loss": 3.1948, "step": 200 }, { "epoch": 0.0011121903476707027, "grad_norm": 8.010477066040039, "learning_rate": 5.542168674698795e-06, "loss": 2.6292, "step": 300 }, { "epoch": 0.001482920463560937, "grad_norm": 11.574338912963867, "learning_rate": 7.395736793327156e-06, "loss": 2.384, "step": 400 }, { "epoch": 0.0018536505794511712, "grad_norm": 10.132732391357422, "learning_rate": 9.249304911955514e-06, "loss": 2.2273, "step": 500 }, { "epoch": 0.0022243806953414053, "grad_norm": 13.844990730285645, "learning_rate": 1.1102873030583874e-05, "loss": 2.1615, "step": 600 }, { "epoch": 0.0025951108112316395, "grad_norm": 18.074670791625977, "learning_rate": 1.2956441149212234e-05, "loss": 2.0617, "step": 700 }, { "epoch": 0.002965840927121874, "grad_norm": 9.796236991882324, "learning_rate": 1.4810009267840593e-05, "loss": 1.9833, "step": 800 }, { "epoch": 0.003336571043012108, "grad_norm": 7.66437292098999, "learning_rate": 1.6663577386468955e-05, "loss": 1.9092, "step": 900 }, { "epoch": 0.0037073011589023423, "grad_norm": 4.6178202629089355, "learning_rate": 1.8517145505097313e-05, "loss": 1.8398, "step": 1000 }, { "epoch": 0.004078031274792577, "grad_norm": 9.62745475769043, "learning_rate": 2.0370713623725674e-05, "loss": 1.842, "step": 1100 }, { "epoch": 0.004448761390682811, "grad_norm": 7.242320537567139, "learning_rate": 2.2224281742354032e-05, "loss": 1.7525, "step": 1200 }, { "epoch": 0.004819491506573045, "grad_norm": 6.867676258087158, "learning_rate": 2.4077849860982393e-05, "loss": 1.7033, "step": 1300 }, { "epoch": 0.005190221622463279, "grad_norm": 6.068170547485352, "learning_rate": 2.593141797961075e-05, "loss": 1.5716, "step": 1400 }, { "epoch": 0.0055609517383535135, "grad_norm": 2.5415446758270264, "learning_rate": 2.7784986098239112e-05, "loss": 1.6233, "step": 1500 }, { "epoch": 0.005931681854243748, "grad_norm": 5.603061199188232, "learning_rate": 2.963855421686747e-05, "loss": 1.5702, "step": 1600 }, { "epoch": 0.006302411970133982, "grad_norm": 4.061676025390625, "learning_rate": 3.1492122335495835e-05, "loss": 1.528, "step": 1700 }, { "epoch": 0.006673142086024216, "grad_norm": 4.216681957244873, "learning_rate": 3.334569045412419e-05, "loss": 1.5159, "step": 1800 }, { "epoch": 0.00704387220191445, "grad_norm": 2.9881560802459717, "learning_rate": 3.519925857275255e-05, "loss": 1.4747, "step": 1900 }, { "epoch": 0.007414602317804685, "grad_norm": 17.180503845214844, "learning_rate": 3.705282669138091e-05, "loss": 1.3845, "step": 2000 }, { "epoch": 0.007785332433694918, "grad_norm": 4.084206581115723, "learning_rate": 3.8906394810009274e-05, "loss": 1.3662, "step": 2100 }, { "epoch": 0.008156062549585154, "grad_norm": 4.966704368591309, "learning_rate": 4.075996292863763e-05, "loss": 1.3262, "step": 2200 }, { "epoch": 0.008526792665475387, "grad_norm": 3.110306739807129, "learning_rate": 4.261353104726599e-05, "loss": 1.3219, "step": 2300 }, { "epoch": 0.008897522781365621, "grad_norm": 3.8534443378448486, "learning_rate": 4.446709916589435e-05, "loss": 1.2575, "step": 2400 }, { "epoch": 0.009268252897255856, "grad_norm": 3.130038261413574, "learning_rate": 4.6320667284522705e-05, "loss": 1.236, "step": 2500 }, { "epoch": 0.00963898301314609, "grad_norm": 4.050836086273193, "learning_rate": 4.817423540315107e-05, "loss": 1.2172, "step": 2600 }, { "epoch": 0.010009713129036325, "grad_norm": 4.245081901550293, "learning_rate": 5.002780352177943e-05, "loss": 1.203, "step": 2700 }, { "epoch": 0.010380443244926558, "grad_norm": 3.6672215461730957, "learning_rate": 5.1881371640407786e-05, "loss": 1.2067, "step": 2800 }, { "epoch": 0.010751173360816792, "grad_norm": 2.455749988555908, "learning_rate": 5.373493975903615e-05, "loss": 1.1555, "step": 2900 }, { "epoch": 0.011121903476707027, "grad_norm": 2.1241626739501953, "learning_rate": 5.558850787766451e-05, "loss": 1.1941, "step": 3000 }, { "epoch": 0.011492633592597262, "grad_norm": 2.6115756034851074, "learning_rate": 5.744207599629287e-05, "loss": 1.1288, "step": 3100 }, { "epoch": 0.011863363708487496, "grad_norm": 2.2570788860321045, "learning_rate": 5.9295644114921225e-05, "loss": 1.1431, "step": 3200 }, { "epoch": 0.012234093824377729, "grad_norm": 3.3484842777252197, "learning_rate": 6.114921223354959e-05, "loss": 1.1097, "step": 3300 }, { "epoch": 0.012604823940267964, "grad_norm": 2.3199462890625, "learning_rate": 6.300278035217795e-05, "loss": 1.1034, "step": 3400 }, { "epoch": 0.012975554056158198, "grad_norm": 2.10054087638855, "learning_rate": 6.48563484708063e-05, "loss": 1.0673, "step": 3500 }, { "epoch": 0.013346284172048433, "grad_norm": 2.364109516143799, "learning_rate": 6.670991658943466e-05, "loss": 1.1267, "step": 3600 }, { "epoch": 0.013717014287938666, "grad_norm": 3.1956734657287598, "learning_rate": 6.856348470806302e-05, "loss": 1.0211, "step": 3700 }, { "epoch": 0.0140877444038289, "grad_norm": 1.5975918769836426, "learning_rate": 7.041705282669139e-05, "loss": 1.017, "step": 3800 }, { "epoch": 0.014458474519719135, "grad_norm": 3.158853769302368, "learning_rate": 7.227062094531975e-05, "loss": 0.994, "step": 3900 }, { "epoch": 0.01482920463560937, "grad_norm": 1.7349014282226562, "learning_rate": 7.41241890639481e-05, "loss": 0.9945, "step": 4000 }, { "epoch": 0.015199934751499604, "grad_norm": 1.7549678087234497, "learning_rate": 7.597775718257645e-05, "loss": 1.0048, "step": 4100 }, { "epoch": 0.015570664867389837, "grad_norm": 1.659894585609436, "learning_rate": 7.783132530120483e-05, "loss": 1.0242, "step": 4200 }, { "epoch": 0.015941394983280073, "grad_norm": 1.955856442451477, "learning_rate": 7.968489341983318e-05, "loss": 1.0364, "step": 4300 }, { "epoch": 0.016312125099170308, "grad_norm": 1.9671708345413208, "learning_rate": 8.153846153846155e-05, "loss": 1.0208, "step": 4400 }, { "epoch": 0.01668285521506054, "grad_norm": 1.4204425811767578, "learning_rate": 8.33920296570899e-05, "loss": 0.993, "step": 4500 }, { "epoch": 0.017053585330950773, "grad_norm": 1.71530282497406, "learning_rate": 8.524559777571826e-05, "loss": 0.9565, "step": 4600 }, { "epoch": 0.017424315446841008, "grad_norm": 1.5386372804641724, "learning_rate": 8.709916589434663e-05, "loss": 0.9865, "step": 4700 }, { "epoch": 0.017795045562731242, "grad_norm": 0.9895398616790771, "learning_rate": 8.895273401297498e-05, "loss": 0.9414, "step": 4800 }, { "epoch": 0.018165775678621477, "grad_norm": 2.2124781608581543, "learning_rate": 9.080630213160334e-05, "loss": 0.964, "step": 4900 }, { "epoch": 0.01853650579451171, "grad_norm": 2.047820806503296, "learning_rate": 9.26598702502317e-05, "loss": 0.9562, "step": 5000 }, { "epoch": 0.018907235910401946, "grad_norm": 1.625494122505188, "learning_rate": 9.451343836886006e-05, "loss": 0.9527, "step": 5100 }, { "epoch": 0.01927796602629218, "grad_norm": 1.5729025602340698, "learning_rate": 9.636700648748842e-05, "loss": 0.9133, "step": 5200 }, { "epoch": 0.019648696142182415, "grad_norm": 1.7960329055786133, "learning_rate": 9.822057460611678e-05, "loss": 0.9448, "step": 5300 }, { "epoch": 0.02001942625807265, "grad_norm": 1.4072473049163818, "learning_rate": 9.99999999861597e-05, "loss": 0.9632, "step": 5400 }, { "epoch": 0.02039015637396288, "grad_norm": 0.9697661399841309, "learning_rate": 9.9999990643958e-05, "loss": 0.9608, "step": 5500 }, { "epoch": 0.020760886489853116, "grad_norm": 2.303354263305664, "learning_rate": 9.999996400138598e-05, "loss": 0.9419, "step": 5600 }, { "epoch": 0.02113161660574335, "grad_norm": 0.9205586910247803, "learning_rate": 9.999992005845285e-05, "loss": 0.9177, "step": 5700 }, { "epoch": 0.021502346721633585, "grad_norm": 2.723714590072632, "learning_rate": 9.999985881517379e-05, "loss": 0.9043, "step": 5800 }, { "epoch": 0.02187307683752382, "grad_norm": 0.8938530087471008, "learning_rate": 9.999978027157004e-05, "loss": 0.8984, "step": 5900 }, { "epoch": 0.022243806953414054, "grad_norm": 2.183746576309204, "learning_rate": 9.999968442766874e-05, "loss": 0.8893, "step": 6000 }, { "epoch": 0.02261453706930429, "grad_norm": 1.1989552974700928, "learning_rate": 9.999957128350309e-05, "loss": 0.959, "step": 6100 }, { "epoch": 0.022985267185194523, "grad_norm": 1.3131054639816284, "learning_rate": 9.99994408391122e-05, "loss": 0.8816, "step": 6200 }, { "epoch": 0.023355997301084758, "grad_norm": 1.1144073009490967, "learning_rate": 9.999929309454121e-05, "loss": 0.9563, "step": 6300 }, { "epoch": 0.023726727416974992, "grad_norm": 0.9642235040664673, "learning_rate": 9.999912804984127e-05, "loss": 0.8873, "step": 6400 }, { "epoch": 0.024097457532865223, "grad_norm": 1.3094288110733032, "learning_rate": 9.999894570506946e-05, "loss": 0.8977, "step": 6500 }, { "epoch": 0.024468187648755458, "grad_norm": 0.8472939133644104, "learning_rate": 9.999874606028888e-05, "loss": 0.8245, "step": 6600 }, { "epoch": 0.024838917764645693, "grad_norm": 0.7795704007148743, "learning_rate": 9.999852911556861e-05, "loss": 0.8506, "step": 6700 }, { "epoch": 0.025209647880535927, "grad_norm": 0.8795124888420105, "learning_rate": 9.999829487098371e-05, "loss": 0.8453, "step": 6800 }, { "epoch": 0.02558037799642616, "grad_norm": 1.1115046739578247, "learning_rate": 9.999804332661522e-05, "loss": 0.8509, "step": 6900 }, { "epoch": 0.025951108112316396, "grad_norm": 1.0760468244552612, "learning_rate": 9.99977744825502e-05, "loss": 0.9414, "step": 7000 }, { "epoch": 0.02632183822820663, "grad_norm": 1.0083982944488525, "learning_rate": 9.999748833888167e-05, "loss": 0.8564, "step": 7100 }, { "epoch": 0.026692568344096865, "grad_norm": 0.8405463099479675, "learning_rate": 9.999718489570863e-05, "loss": 0.9633, "step": 7200 }, { "epoch": 0.0270632984599871, "grad_norm": 0.890576958656311, "learning_rate": 9.999686415313608e-05, "loss": 0.855, "step": 7300 }, { "epoch": 0.02743402857587733, "grad_norm": 1.1116279363632202, "learning_rate": 9.999652611127497e-05, "loss": 0.842, "step": 7400 }, { "epoch": 0.027804758691767566, "grad_norm": 0.6366739869117737, "learning_rate": 9.999617077024229e-05, "loss": 0.8556, "step": 7500 }, { "epoch": 0.0281754888076578, "grad_norm": 1.018994688987732, "learning_rate": 9.9995798130161e-05, "loss": 0.9164, "step": 7600 }, { "epoch": 0.028546218923548035, "grad_norm": 0.8336512446403503, "learning_rate": 9.999540819116003e-05, "loss": 0.8334, "step": 7700 }, { "epoch": 0.02891694903943827, "grad_norm": 0.8718096613883972, "learning_rate": 9.999500095337427e-05, "loss": 0.8468, "step": 7800 }, { "epoch": 0.029287679155328504, "grad_norm": 0.8324469327926636, "learning_rate": 9.999457641694467e-05, "loss": 0.8607, "step": 7900 }, { "epoch": 0.02965840927121874, "grad_norm": 0.6960898637771606, "learning_rate": 9.99941345820181e-05, "loss": 0.8406, "step": 8000 }, { "epoch": 0.030029139387108973, "grad_norm": 0.7328154444694519, "learning_rate": 9.999367544874745e-05, "loss": 0.8565, "step": 8100 }, { "epoch": 0.030399869502999208, "grad_norm": 1.4901723861694336, "learning_rate": 9.999319901729158e-05, "loss": 0.8697, "step": 8200 }, { "epoch": 0.030770599618889442, "grad_norm": 0.7893081307411194, "learning_rate": 9.999270528781533e-05, "loss": 0.8551, "step": 8300 }, { "epoch": 0.031141329734779673, "grad_norm": 0.7583390474319458, "learning_rate": 9.999219426048954e-05, "loss": 0.8833, "step": 8400 }, { "epoch": 0.03151205985066991, "grad_norm": 0.9233383536338806, "learning_rate": 9.999166593549105e-05, "loss": 0.8231, "step": 8500 }, { "epoch": 0.031882789966560146, "grad_norm": 0.6884192228317261, "learning_rate": 9.999112031300261e-05, "loss": 0.8444, "step": 8600 }, { "epoch": 0.03225352008245038, "grad_norm": 0.6741942763328552, "learning_rate": 9.999055739321308e-05, "loss": 0.8478, "step": 8700 }, { "epoch": 0.032624250198340615, "grad_norm": 0.6759223937988281, "learning_rate": 9.998997717631718e-05, "loss": 0.9164, "step": 8800 }, { "epoch": 0.03299498031423084, "grad_norm": 0.6478690505027771, "learning_rate": 9.99893796625157e-05, "loss": 0.8236, "step": 8900 }, { "epoch": 0.03336571043012108, "grad_norm": 0.7130744457244873, "learning_rate": 9.998876485201535e-05, "loss": 0.7647, "step": 9000 }, { "epoch": 0.03373644054601131, "grad_norm": 0.6797969341278076, "learning_rate": 9.99881327450289e-05, "loss": 0.8776, "step": 9100 }, { "epoch": 0.03410717066190155, "grad_norm": 0.6688681244850159, "learning_rate": 9.998748334177504e-05, "loss": 0.8961, "step": 9200 }, { "epoch": 0.03447790077779178, "grad_norm": 0.6628736257553101, "learning_rate": 9.998681664247847e-05, "loss": 0.8361, "step": 9300 }, { "epoch": 0.034848630893682016, "grad_norm": 0.82839435338974, "learning_rate": 9.998613264736987e-05, "loss": 0.8152, "step": 9400 }, { "epoch": 0.03521936100957225, "grad_norm": 0.5164697766304016, "learning_rate": 9.998543135668592e-05, "loss": 0.7927, "step": 9500 }, { "epoch": 0.035590091125462485, "grad_norm": 0.6574541330337524, "learning_rate": 9.998471277066927e-05, "loss": 0.8496, "step": 9600 }, { "epoch": 0.03596082124135272, "grad_norm": 0.9331479072570801, "learning_rate": 9.998397688956854e-05, "loss": 0.7948, "step": 9700 }, { "epoch": 0.036331551357242954, "grad_norm": 0.6393089890480042, "learning_rate": 9.998322371363837e-05, "loss": 0.8056, "step": 9800 }, { "epoch": 0.03670228147313319, "grad_norm": 0.9109394550323486, "learning_rate": 9.998245324313936e-05, "loss": 0.8647, "step": 9900 }, { "epoch": 0.03707301158902342, "grad_norm": 1.0814851522445679, "learning_rate": 9.998166547833809e-05, "loss": 0.8624, "step": 10000 }, { "epoch": 0.03744374170491366, "grad_norm": 0.6044529676437378, "learning_rate": 9.998086041950714e-05, "loss": 0.8689, "step": 10100 }, { "epoch": 0.03781447182080389, "grad_norm": 0.7124137878417969, "learning_rate": 9.998003806692506e-05, "loss": 0.8161, "step": 10200 }, { "epoch": 0.03818520193669413, "grad_norm": 0.6464531421661377, "learning_rate": 9.99791984208764e-05, "loss": 0.7868, "step": 10300 }, { "epoch": 0.03855593205258436, "grad_norm": 0.8116011619567871, "learning_rate": 9.997834148165166e-05, "loss": 0.8269, "step": 10400 }, { "epoch": 0.038926662168474596, "grad_norm": 0.4251263737678528, "learning_rate": 9.997746724954738e-05, "loss": 0.8202, "step": 10500 }, { "epoch": 0.03929739228436483, "grad_norm": 0.72599196434021, "learning_rate": 9.997657572486603e-05, "loss": 0.8342, "step": 10600 }, { "epoch": 0.039668122400255065, "grad_norm": 0.5540903806686401, "learning_rate": 9.997566690791612e-05, "loss": 0.8045, "step": 10700 }, { "epoch": 0.0400388525161453, "grad_norm": 0.7491291165351868, "learning_rate": 9.997474079901203e-05, "loss": 0.8552, "step": 10800 }, { "epoch": 0.04040958263203553, "grad_norm": 0.8369801640510559, "learning_rate": 9.997379739847428e-05, "loss": 0.7919, "step": 10900 }, { "epoch": 0.04078031274792576, "grad_norm": 0.6872873306274414, "learning_rate": 9.997283670662924e-05, "loss": 0.8403, "step": 11000 }, { "epoch": 0.041151042863816, "grad_norm": 0.5743688941001892, "learning_rate": 9.997185872380937e-05, "loss": 0.7826, "step": 11100 }, { "epoch": 0.04152177297970623, "grad_norm": 0.6225058436393738, "learning_rate": 9.9970863450353e-05, "loss": 0.8294, "step": 11200 }, { "epoch": 0.041892503095596466, "grad_norm": 0.6883350610733032, "learning_rate": 9.996985088660453e-05, "loss": 0.7438, "step": 11300 }, { "epoch": 0.0422632332114867, "grad_norm": 0.4370023012161255, "learning_rate": 9.996882103291433e-05, "loss": 0.8048, "step": 11400 }, { "epoch": 0.042633963327376935, "grad_norm": 0.6123729944229126, "learning_rate": 9.996777388963872e-05, "loss": 0.7889, "step": 11500 }, { "epoch": 0.04300469344326717, "grad_norm": 0.7943214178085327, "learning_rate": 9.996670945714002e-05, "loss": 0.8618, "step": 11600 }, { "epoch": 0.043375423559157404, "grad_norm": 0.9636479020118713, "learning_rate": 9.996562773578652e-05, "loss": 0.8063, "step": 11700 }, { "epoch": 0.04374615367504764, "grad_norm": 0.6917877793312073, "learning_rate": 9.996452872595253e-05, "loss": 0.8049, "step": 11800 }, { "epoch": 0.04411688379093787, "grad_norm": 0.5412705540657043, "learning_rate": 9.996341242801828e-05, "loss": 0.7866, "step": 11900 }, { "epoch": 0.04448761390682811, "grad_norm": 0.6093032956123352, "learning_rate": 9.996227884237005e-05, "loss": 0.8564, "step": 12000 }, { "epoch": 0.04485834402271834, "grad_norm": 0.6109910011291504, "learning_rate": 9.996112796940006e-05, "loss": 0.7477, "step": 12100 }, { "epoch": 0.04522907413860858, "grad_norm": 0.7013096809387207, "learning_rate": 9.995995980950654e-05, "loss": 0.7901, "step": 12200 }, { "epoch": 0.04559980425449881, "grad_norm": 0.5616400837898254, "learning_rate": 9.995877436309365e-05, "loss": 0.7869, "step": 12300 }, { "epoch": 0.045970534370389046, "grad_norm": 0.4203181862831116, "learning_rate": 9.995757163057155e-05, "loss": 0.7724, "step": 12400 }, { "epoch": 0.04634126448627928, "grad_norm": 0.780398428440094, "learning_rate": 9.995635161235645e-05, "loss": 0.7714, "step": 12500 }, { "epoch": 0.046711994602169515, "grad_norm": 0.7432860136032104, "learning_rate": 9.995511430887043e-05, "loss": 0.8325, "step": 12600 }, { "epoch": 0.04708272471805975, "grad_norm": 0.5596086382865906, "learning_rate": 9.995385972054165e-05, "loss": 0.8184, "step": 12700 }, { "epoch": 0.047453454833949985, "grad_norm": 0.5370872616767883, "learning_rate": 9.995258784780417e-05, "loss": 0.7772, "step": 12800 }, { "epoch": 0.04782418494984021, "grad_norm": 0.4655928611755371, "learning_rate": 9.995129869109809e-05, "loss": 0.8469, "step": 12900 }, { "epoch": 0.04819491506573045, "grad_norm": 0.5215294361114502, "learning_rate": 9.994999225086947e-05, "loss": 0.7401, "step": 13000 }, { "epoch": 0.04856564518162068, "grad_norm": 0.7074815034866333, "learning_rate": 9.994866852757034e-05, "loss": 0.8051, "step": 13100 }, { "epoch": 0.048936375297510916, "grad_norm": 0.6238998770713806, "learning_rate": 9.994732752165872e-05, "loss": 0.7783, "step": 13200 }, { "epoch": 0.04930710541340115, "grad_norm": 0.9087201356887817, "learning_rate": 9.99459692335986e-05, "loss": 0.8446, "step": 13300 }, { "epoch": 0.049677835529291385, "grad_norm": 0.652288019657135, "learning_rate": 9.994459366385996e-05, "loss": 0.8223, "step": 13400 }, { "epoch": 0.05004856564518162, "grad_norm": 0.6203054785728455, "learning_rate": 9.994320081291877e-05, "loss": 0.8227, "step": 13500 }, { "epoch": 0.050419295761071854, "grad_norm": 0.687816858291626, "learning_rate": 9.994179068125697e-05, "loss": 0.8349, "step": 13600 }, { "epoch": 0.05079002587696209, "grad_norm": 0.7644866108894348, "learning_rate": 9.994036326936245e-05, "loss": 0.7296, "step": 13700 }, { "epoch": 0.05116075599285232, "grad_norm": 0.7271385192871094, "learning_rate": 9.993891857772912e-05, "loss": 0.8248, "step": 13800 }, { "epoch": 0.05153148610874256, "grad_norm": 0.5891247391700745, "learning_rate": 9.993745660685686e-05, "loss": 0.7981, "step": 13900 }, { "epoch": 0.05190221622463279, "grad_norm": 0.4777681231498718, "learning_rate": 9.993597735725151e-05, "loss": 0.8221, "step": 14000 }, { "epoch": 0.05227294634052303, "grad_norm": 0.4985211491584778, "learning_rate": 9.993448082942491e-05, "loss": 0.8109, "step": 14100 }, { "epoch": 0.05264367645641326, "grad_norm": 0.4887671172618866, "learning_rate": 9.993296702389488e-05, "loss": 0.7404, "step": 14200 }, { "epoch": 0.053014406572303496, "grad_norm": 0.5071700811386108, "learning_rate": 9.99314359411852e-05, "loss": 0.8458, "step": 14300 }, { "epoch": 0.05338513668819373, "grad_norm": 0.9853917956352234, "learning_rate": 9.992988758182562e-05, "loss": 0.8352, "step": 14400 }, { "epoch": 0.053755866804083965, "grad_norm": 0.45613810420036316, "learning_rate": 9.992832194635188e-05, "loss": 0.7881, "step": 14500 }, { "epoch": 0.0541265969199742, "grad_norm": 0.39730435609817505, "learning_rate": 9.992673903530575e-05, "loss": 0.8529, "step": 14600 }, { "epoch": 0.054497327035864435, "grad_norm": 0.5308089852333069, "learning_rate": 9.992513884923489e-05, "loss": 0.8183, "step": 14700 }, { "epoch": 0.05486805715175466, "grad_norm": 0.7841264605522156, "learning_rate": 9.992352138869296e-05, "loss": 0.7831, "step": 14800 }, { "epoch": 0.0552387872676449, "grad_norm": 0.4979688823223114, "learning_rate": 9.992188665423967e-05, "loss": 0.7725, "step": 14900 }, { "epoch": 0.05560951738353513, "grad_norm": 0.6372047662734985, "learning_rate": 9.99202346464406e-05, "loss": 0.7676, "step": 15000 }, { "epoch": 0.055980247499425366, "grad_norm": 0.44077157974243164, "learning_rate": 9.991856536586736e-05, "loss": 0.7975, "step": 15100 }, { "epoch": 0.0563509776153156, "grad_norm": 0.5222585797309875, "learning_rate": 9.991687881309757e-05, "loss": 0.8025, "step": 15200 }, { "epoch": 0.056721707731205835, "grad_norm": 0.6299792528152466, "learning_rate": 9.991517498871477e-05, "loss": 0.8163, "step": 15300 }, { "epoch": 0.05709243784709607, "grad_norm": 0.6398350596427917, "learning_rate": 9.991345389330848e-05, "loss": 0.7845, "step": 15400 }, { "epoch": 0.057463167962986304, "grad_norm": 0.9026748538017273, "learning_rate": 9.991171552747423e-05, "loss": 0.7519, "step": 15500 }, { "epoch": 0.05783389807887654, "grad_norm": 0.5133185386657715, "learning_rate": 9.99099598918135e-05, "loss": 0.7724, "step": 15600 }, { "epoch": 0.05820462819476677, "grad_norm": 0.4883072078227997, "learning_rate": 9.990818698693376e-05, "loss": 0.7881, "step": 15700 }, { "epoch": 0.05857535831065701, "grad_norm": 0.7494258880615234, "learning_rate": 9.990639681344845e-05, "loss": 0.7392, "step": 15800 }, { "epoch": 0.05894608842654724, "grad_norm": 0.6197894215583801, "learning_rate": 9.990458937197696e-05, "loss": 0.7488, "step": 15900 }, { "epoch": 0.05931681854243748, "grad_norm": 0.603993833065033, "learning_rate": 9.990276466314471e-05, "loss": 0.7786, "step": 16000 }, { "epoch": 0.05968754865832771, "grad_norm": 0.5468663573265076, "learning_rate": 9.990092268758304e-05, "loss": 0.8226, "step": 16100 }, { "epoch": 0.060058278774217946, "grad_norm": 0.5717090368270874, "learning_rate": 9.989906344592931e-05, "loss": 0.8173, "step": 16200 }, { "epoch": 0.06042900889010818, "grad_norm": 0.5591000914573669, "learning_rate": 9.989718693882681e-05, "loss": 0.7462, "step": 16300 }, { "epoch": 0.060799739005998416, "grad_norm": 0.5306983590126038, "learning_rate": 9.989529316692484e-05, "loss": 0.756, "step": 16400 }, { "epoch": 0.06117046912188865, "grad_norm": 0.5714496374130249, "learning_rate": 9.989338213087864e-05, "loss": 0.824, "step": 16500 }, { "epoch": 0.061541199237778885, "grad_norm": 0.7544708251953125, "learning_rate": 9.989145383134946e-05, "loss": 0.7794, "step": 16600 }, { "epoch": 0.06191192935366912, "grad_norm": 0.8115096688270569, "learning_rate": 9.98895082690045e-05, "loss": 0.8197, "step": 16700 }, { "epoch": 0.06228265946955935, "grad_norm": 0.47384631633758545, "learning_rate": 9.988754544451695e-05, "loss": 0.7452, "step": 16800 }, { "epoch": 0.06265338958544958, "grad_norm": 1.166999101638794, "learning_rate": 9.988556535856595e-05, "loss": 0.7666, "step": 16900 }, { "epoch": 0.06302411970133982, "grad_norm": 0.6271091103553772, "learning_rate": 9.98835680118366e-05, "loss": 0.761, "step": 17000 }, { "epoch": 0.06339484981723005, "grad_norm": 0.47625523805618286, "learning_rate": 9.988155340502005e-05, "loss": 0.739, "step": 17100 }, { "epoch": 0.06376557993312029, "grad_norm": 0.5383567810058594, "learning_rate": 9.987952153881335e-05, "loss": 0.7884, "step": 17200 }, { "epoch": 0.06413631004901052, "grad_norm": 0.5605003237724304, "learning_rate": 9.987747241391951e-05, "loss": 0.7511, "step": 17300 }, { "epoch": 0.06450704016490076, "grad_norm": 0.778030276298523, "learning_rate": 9.987540603104759e-05, "loss": 0.7673, "step": 17400 }, { "epoch": 0.06487777028079099, "grad_norm": 0.5792937874794006, "learning_rate": 9.987332239091253e-05, "loss": 0.8043, "step": 17500 }, { "epoch": 0.06524850039668123, "grad_norm": 0.5209659934043884, "learning_rate": 9.987122149423531e-05, "loss": 0.7319, "step": 17600 }, { "epoch": 0.06561923051257146, "grad_norm": 0.48258358240127563, "learning_rate": 9.986910334174287e-05, "loss": 0.7526, "step": 17700 }, { "epoch": 0.06598996062846169, "grad_norm": 0.5678555369377136, "learning_rate": 9.986696793416806e-05, "loss": 0.8043, "step": 17800 }, { "epoch": 0.06636069074435193, "grad_norm": 0.49832242727279663, "learning_rate": 9.98648152722498e-05, "loss": 0.8453, "step": 17900 }, { "epoch": 0.06673142086024215, "grad_norm": 0.5377302765846252, "learning_rate": 9.986264535673288e-05, "loss": 0.7545, "step": 18000 }, { "epoch": 0.0671021509761324, "grad_norm": 0.6725029945373535, "learning_rate": 9.986045818836813e-05, "loss": 0.7416, "step": 18100 }, { "epoch": 0.06747288109202262, "grad_norm": 0.6792699098587036, "learning_rate": 9.985825376791233e-05, "loss": 0.7186, "step": 18200 }, { "epoch": 0.06784361120791287, "grad_norm": 0.5447192192077637, "learning_rate": 9.985603209612823e-05, "loss": 0.7253, "step": 18300 }, { "epoch": 0.0682143413238031, "grad_norm": 0.6599334478378296, "learning_rate": 9.985379317378454e-05, "loss": 0.7404, "step": 18400 }, { "epoch": 0.06858507143969333, "grad_norm": 1.6448434591293335, "learning_rate": 9.985153700165593e-05, "loss": 0.697, "step": 18500 }, { "epoch": 0.06895580155558356, "grad_norm": 1.1069124937057495, "learning_rate": 9.984926358052306e-05, "loss": 0.7265, "step": 18600 }, { "epoch": 0.0693265316714738, "grad_norm": 0.7190589308738708, "learning_rate": 9.984697291117256e-05, "loss": 0.7235, "step": 18700 }, { "epoch": 0.06969726178736403, "grad_norm": 0.8267783522605896, "learning_rate": 9.984466499439699e-05, "loss": 0.777, "step": 18800 }, { "epoch": 0.07006799190325427, "grad_norm": 0.7216712832450867, "learning_rate": 9.984233983099495e-05, "loss": 0.6984, "step": 18900 }, { "epoch": 0.0704387220191445, "grad_norm": 1.0311315059661865, "learning_rate": 9.983999742177095e-05, "loss": 0.7246, "step": 19000 }, { "epoch": 0.07080945213503474, "grad_norm": 1.3983759880065918, "learning_rate": 9.983763776753547e-05, "loss": 0.6138, "step": 19100 }, { "epoch": 0.07118018225092497, "grad_norm": 1.0749683380126953, "learning_rate": 9.983526086910495e-05, "loss": 0.6323, "step": 19200 }, { "epoch": 0.07155091236681521, "grad_norm": 0.7352898716926575, "learning_rate": 9.983286672730186e-05, "loss": 0.6022, "step": 19300 }, { "epoch": 0.07192164248270544, "grad_norm": 1.4705175161361694, "learning_rate": 9.983045534295456e-05, "loss": 0.6257, "step": 19400 }, { "epoch": 0.07229237259859568, "grad_norm": 1.990744709968567, "learning_rate": 9.98280267168974e-05, "loss": 0.5688, "step": 19500 }, { "epoch": 0.07266310271448591, "grad_norm": 1.717031717300415, "learning_rate": 9.982558084997073e-05, "loss": 0.539, "step": 19600 }, { "epoch": 0.07303383283037615, "grad_norm": 1.5657066106796265, "learning_rate": 9.982311774302083e-05, "loss": 0.5476, "step": 19700 }, { "epoch": 0.07340456294626638, "grad_norm": 1.8046010732650757, "learning_rate": 9.982063739689993e-05, "loss": 0.5286, "step": 19800 }, { "epoch": 0.0737752930621566, "grad_norm": 0.736138105392456, "learning_rate": 9.981813981246628e-05, "loss": 0.574, "step": 19900 }, { "epoch": 0.07414602317804685, "grad_norm": 1.7566463947296143, "learning_rate": 9.981562499058406e-05, "loss": 0.605, "step": 20000 }, { "epoch": 0.07451675329393707, "grad_norm": 1.8791098594665527, "learning_rate": 9.981309293212338e-05, "loss": 0.5392, "step": 20100 }, { "epoch": 0.07488748340982732, "grad_norm": 1.6982605457305908, "learning_rate": 9.98105436379604e-05, "loss": 0.5419, "step": 20200 }, { "epoch": 0.07525821352571754, "grad_norm": 1.5657070875167847, "learning_rate": 9.980797710897716e-05, "loss": 0.6017, "step": 20300 }, { "epoch": 0.07562894364160778, "grad_norm": 1.1628602743148804, "learning_rate": 9.980539334606172e-05, "loss": 0.5444, "step": 20400 }, { "epoch": 0.07599967375749801, "grad_norm": 1.4740490913391113, "learning_rate": 9.980279235010807e-05, "loss": 0.604, "step": 20500 }, { "epoch": 0.07637040387338825, "grad_norm": 1.7399816513061523, "learning_rate": 9.980017412201616e-05, "loss": 0.546, "step": 20600 }, { "epoch": 0.07674113398927848, "grad_norm": 0.6821767687797546, "learning_rate": 9.979753866269196e-05, "loss": 0.5354, "step": 20700 }, { "epoch": 0.07711186410516872, "grad_norm": 0.9485724568367004, "learning_rate": 9.979488597304731e-05, "loss": 0.4978, "step": 20800 }, { "epoch": 0.07748259422105895, "grad_norm": 0.5564303994178772, "learning_rate": 9.979221605400009e-05, "loss": 0.5778, "step": 20900 }, { "epoch": 0.07785332433694919, "grad_norm": 1.1101696491241455, "learning_rate": 9.978952890647411e-05, "loss": 0.5032, "step": 21000 }, { "epoch": 0.07822405445283942, "grad_norm": 1.1107808351516724, "learning_rate": 9.978682453139913e-05, "loss": 0.515, "step": 21100 }, { "epoch": 0.07859478456872966, "grad_norm": 1.1792526245117188, "learning_rate": 9.978410292971089e-05, "loss": 0.5268, "step": 21200 }, { "epoch": 0.07896551468461989, "grad_norm": 2.102752447128296, "learning_rate": 9.978136410235108e-05, "loss": 0.504, "step": 21300 }, { "epoch": 0.07933624480051013, "grad_norm": 1.1307857036590576, "learning_rate": 9.977860805026737e-05, "loss": 0.4886, "step": 21400 }, { "epoch": 0.07970697491640036, "grad_norm": 1.9100521802902222, "learning_rate": 9.977583477441337e-05, "loss": 0.5073, "step": 21500 }, { "epoch": 0.0800777050322906, "grad_norm": 0.8448984622955322, "learning_rate": 9.977304427574863e-05, "loss": 0.4945, "step": 21600 }, { "epoch": 0.08044843514818083, "grad_norm": 1.6560086011886597, "learning_rate": 9.977023655523871e-05, "loss": 0.5096, "step": 21700 }, { "epoch": 0.08081916526407106, "grad_norm": 1.6005879640579224, "learning_rate": 9.97674116138551e-05, "loss": 0.5135, "step": 21800 }, { "epoch": 0.0811898953799613, "grad_norm": 0.9383556246757507, "learning_rate": 9.976456945257524e-05, "loss": 0.4914, "step": 21900 }, { "epoch": 0.08156062549585152, "grad_norm": 1.1251165866851807, "learning_rate": 9.976171007238255e-05, "loss": 0.533, "step": 22000 }, { "epoch": 0.08193135561174177, "grad_norm": 1.192872166633606, "learning_rate": 9.975883347426637e-05, "loss": 0.5158, "step": 22100 }, { "epoch": 0.082302085727632, "grad_norm": 0.7229394912719727, "learning_rate": 9.975593965922208e-05, "loss": 0.5044, "step": 22200 }, { "epoch": 0.08267281584352223, "grad_norm": 1.106888771057129, "learning_rate": 9.975302862825091e-05, "loss": 0.474, "step": 22300 }, { "epoch": 0.08304354595941246, "grad_norm": 1.3041913509368896, "learning_rate": 9.975010038236011e-05, "loss": 0.4988, "step": 22400 }, { "epoch": 0.0834142760753027, "grad_norm": 0.7788728475570679, "learning_rate": 9.974715492256289e-05, "loss": 0.4485, "step": 22500 }, { "epoch": 0.08378500619119293, "grad_norm": 0.9619131684303284, "learning_rate": 9.974419224987838e-05, "loss": 0.4824, "step": 22600 }, { "epoch": 0.08415573630708317, "grad_norm": 2.3877079486846924, "learning_rate": 9.974121236533172e-05, "loss": 0.4875, "step": 22700 }, { "epoch": 0.0845264664229734, "grad_norm": 0.8470420241355896, "learning_rate": 9.973821526995392e-05, "loss": 0.4626, "step": 22800 }, { "epoch": 0.08489719653886364, "grad_norm": 0.657673716545105, "learning_rate": 9.973520096478207e-05, "loss": 0.4839, "step": 22900 }, { "epoch": 0.08526792665475387, "grad_norm": 0.9254263639450073, "learning_rate": 9.973216945085907e-05, "loss": 0.4543, "step": 23000 }, { "epoch": 0.08563865677064411, "grad_norm": 0.868180513381958, "learning_rate": 9.972912072923388e-05, "loss": 0.4744, "step": 23100 }, { "epoch": 0.08600938688653434, "grad_norm": 0.84515380859375, "learning_rate": 9.972605480096138e-05, "loss": 0.4624, "step": 23200 }, { "epoch": 0.08638011700242458, "grad_norm": 1.2031004428863525, "learning_rate": 9.97229716671024e-05, "loss": 0.5003, "step": 23300 }, { "epoch": 0.08675084711831481, "grad_norm": 0.8008503317832947, "learning_rate": 9.971987132872373e-05, "loss": 0.4612, "step": 23400 }, { "epoch": 0.08712157723420505, "grad_norm": 1.7768985033035278, "learning_rate": 9.971675378689813e-05, "loss": 0.49, "step": 23500 }, { "epoch": 0.08749230735009528, "grad_norm": 1.3287744522094727, "learning_rate": 9.971361904270425e-05, "loss": 0.4905, "step": 23600 }, { "epoch": 0.0878630374659855, "grad_norm": 0.9405617117881775, "learning_rate": 9.971046709722676e-05, "loss": 0.4453, "step": 23700 }, { "epoch": 0.08823376758187575, "grad_norm": 1.3922568559646606, "learning_rate": 9.970729795155625e-05, "loss": 0.4558, "step": 23800 }, { "epoch": 0.08860449769776597, "grad_norm": 1.5456054210662842, "learning_rate": 9.970411160678927e-05, "loss": 0.4401, "step": 23900 }, { "epoch": 0.08897522781365622, "grad_norm": 0.7666104435920715, "learning_rate": 9.970090806402832e-05, "loss": 0.454, "step": 24000 }, { "epoch": 0.08934595792954644, "grad_norm": 1.226173758506775, "learning_rate": 9.969768732438188e-05, "loss": 0.4714, "step": 24100 }, { "epoch": 0.08971668804543668, "grad_norm": 2.249293327331543, "learning_rate": 9.969444938896428e-05, "loss": 0.4524, "step": 24200 }, { "epoch": 0.09008741816132691, "grad_norm": 0.9888800978660583, "learning_rate": 9.969119425889592e-05, "loss": 0.4162, "step": 24300 }, { "epoch": 0.09045814827721715, "grad_norm": 1.1247888803482056, "learning_rate": 9.968792193530309e-05, "loss": 0.4629, "step": 24400 }, { "epoch": 0.09082887839310738, "grad_norm": 1.2108274698257446, "learning_rate": 9.968463241931806e-05, "loss": 0.4353, "step": 24500 }, { "epoch": 0.09119960850899762, "grad_norm": 0.9741836786270142, "learning_rate": 9.968132571207898e-05, "loss": 0.4378, "step": 24600 }, { "epoch": 0.09157033862488785, "grad_norm": 0.9862518906593323, "learning_rate": 9.967800181473004e-05, "loss": 0.4402, "step": 24700 }, { "epoch": 0.09194106874077809, "grad_norm": 0.7815281748771667, "learning_rate": 9.96746607284213e-05, "loss": 0.404, "step": 24800 }, { "epoch": 0.09231179885666832, "grad_norm": 0.5445035099983215, "learning_rate": 9.967130245430882e-05, "loss": 0.4287, "step": 24900 }, { "epoch": 0.09268252897255856, "grad_norm": 0.5694192051887512, "learning_rate": 9.966792699355458e-05, "loss": 0.4142, "step": 25000 }, { "epoch": 0.09305325908844879, "grad_norm": 2.238434314727783, "learning_rate": 9.966453434732652e-05, "loss": 0.4392, "step": 25100 }, { "epoch": 0.09342398920433903, "grad_norm": 0.5861896276473999, "learning_rate": 9.96611245167985e-05, "loss": 0.4475, "step": 25200 }, { "epoch": 0.09379471932022926, "grad_norm": 1.899655818939209, "learning_rate": 9.96576975031504e-05, "loss": 0.4393, "step": 25300 }, { "epoch": 0.0941654494361195, "grad_norm": 1.261289119720459, "learning_rate": 9.965425330756792e-05, "loss": 0.4761, "step": 25400 }, { "epoch": 0.09453617955200973, "grad_norm": 0.7905645966529846, "learning_rate": 9.965079193124284e-05, "loss": 0.4316, "step": 25500 }, { "epoch": 0.09490690966789997, "grad_norm": 1.2371140718460083, "learning_rate": 9.964731337537279e-05, "loss": 0.4474, "step": 25600 }, { "epoch": 0.0952776397837902, "grad_norm": 1.352743148803711, "learning_rate": 9.964381764116138e-05, "loss": 0.4197, "step": 25700 }, { "epoch": 0.09564836989968042, "grad_norm": 0.997117280960083, "learning_rate": 9.964030472981818e-05, "loss": 0.4535, "step": 25800 }, { "epoch": 0.09601910001557067, "grad_norm": 1.300774335861206, "learning_rate": 9.963677464255866e-05, "loss": 0.4369, "step": 25900 }, { "epoch": 0.0963898301314609, "grad_norm": 1.042384386062622, "learning_rate": 9.963322738060424e-05, "loss": 0.4092, "step": 26000 }, { "epoch": 0.09676056024735114, "grad_norm": 0.4633723795413971, "learning_rate": 9.962966294518234e-05, "loss": 0.4593, "step": 26100 }, { "epoch": 0.09713129036324136, "grad_norm": 0.8582066893577576, "learning_rate": 9.962608133752627e-05, "loss": 0.4037, "step": 26200 }, { "epoch": 0.0975020204791316, "grad_norm": 0.8240644335746765, "learning_rate": 9.962248255887528e-05, "loss": 0.4502, "step": 26300 }, { "epoch": 0.09787275059502183, "grad_norm": 1.3806111812591553, "learning_rate": 9.961886661047456e-05, "loss": 0.4239, "step": 26400 }, { "epoch": 0.09824348071091207, "grad_norm": 0.8272907733917236, "learning_rate": 9.96152334935753e-05, "loss": 0.4079, "step": 26500 }, { "epoch": 0.0986142108268023, "grad_norm": 0.6999910473823547, "learning_rate": 9.961158320943457e-05, "loss": 0.4499, "step": 26600 }, { "epoch": 0.09898494094269254, "grad_norm": 1.3631682395935059, "learning_rate": 9.960791575931537e-05, "loss": 0.4152, "step": 26700 }, { "epoch": 0.09935567105858277, "grad_norm": 0.9533482789993286, "learning_rate": 9.960423114448669e-05, "loss": 0.4331, "step": 26800 }, { "epoch": 0.09972640117447301, "grad_norm": 0.956407368183136, "learning_rate": 9.960052936622343e-05, "loss": 0.4382, "step": 26900 }, { "epoch": 0.10009713129036324, "grad_norm": 1.198620080947876, "learning_rate": 9.959681042580642e-05, "loss": 0.4367, "step": 27000 }, { "epoch": 0.10046786140625348, "grad_norm": 0.5099947452545166, "learning_rate": 9.959307432452247e-05, "loss": 0.4532, "step": 27100 }, { "epoch": 0.10083859152214371, "grad_norm": 1.3656061887741089, "learning_rate": 9.958932106366426e-05, "loss": 0.4007, "step": 27200 }, { "epoch": 0.10120932163803395, "grad_norm": 0.7944419384002686, "learning_rate": 9.958555064453046e-05, "loss": 0.4434, "step": 27300 }, { "epoch": 0.10158005175392418, "grad_norm": 1.3978921175003052, "learning_rate": 9.958176306842569e-05, "loss": 0.4008, "step": 27400 }, { "epoch": 0.10195078186981442, "grad_norm": 2.168423652648926, "learning_rate": 9.957795833666045e-05, "loss": 0.4286, "step": 27500 }, { "epoch": 0.10232151198570465, "grad_norm": 0.5136455297470093, "learning_rate": 9.957413645055121e-05, "loss": 0.4345, "step": 27600 }, { "epoch": 0.10269224210159487, "grad_norm": 1.3693057298660278, "learning_rate": 9.957029741142039e-05, "loss": 0.4309, "step": 27700 }, { "epoch": 0.10306297221748512, "grad_norm": 0.7976776361465454, "learning_rate": 9.95664412205963e-05, "loss": 0.4229, "step": 27800 }, { "epoch": 0.10343370233337534, "grad_norm": 1.3799725770950317, "learning_rate": 9.956256787941323e-05, "loss": 0.4125, "step": 27900 }, { "epoch": 0.10380443244926559, "grad_norm": 1.1051102876663208, "learning_rate": 9.955867738921136e-05, "loss": 0.4119, "step": 28000 }, { "epoch": 0.10417516256515581, "grad_norm": 1.2142465114593506, "learning_rate": 9.955476975133688e-05, "loss": 0.4009, "step": 28100 }, { "epoch": 0.10454589268104605, "grad_norm": 0.8687050938606262, "learning_rate": 9.95508449671418e-05, "loss": 0.4303, "step": 28200 }, { "epoch": 0.10491662279693628, "grad_norm": 0.5500572919845581, "learning_rate": 9.954690303798416e-05, "loss": 0.4463, "step": 28300 }, { "epoch": 0.10528735291282652, "grad_norm": 1.0125454664230347, "learning_rate": 9.95429439652279e-05, "loss": 0.393, "step": 28400 }, { "epoch": 0.10565808302871675, "grad_norm": 1.7513258457183838, "learning_rate": 9.953896775024288e-05, "loss": 0.4121, "step": 28500 }, { "epoch": 0.10602881314460699, "grad_norm": 1.1876221895217896, "learning_rate": 9.953497439440487e-05, "loss": 0.4448, "step": 28600 }, { "epoch": 0.10639954326049722, "grad_norm": 1.0689870119094849, "learning_rate": 9.953096389909565e-05, "loss": 0.4362, "step": 28700 }, { "epoch": 0.10677027337638746, "grad_norm": 0.8184880614280701, "learning_rate": 9.952693626570286e-05, "loss": 0.3968, "step": 28800 }, { "epoch": 0.10714100349227769, "grad_norm": 0.5506743788719177, "learning_rate": 9.95228914956201e-05, "loss": 0.4382, "step": 28900 }, { "epoch": 0.10751173360816793, "grad_norm": 0.6252792477607727, "learning_rate": 9.951882959024687e-05, "loss": 0.394, "step": 29000 }, { "epoch": 0.10788246372405816, "grad_norm": 1.1262739896774292, "learning_rate": 9.951475055098864e-05, "loss": 0.3852, "step": 29100 }, { "epoch": 0.1082531938399484, "grad_norm": 0.4469977021217346, "learning_rate": 9.951065437925678e-05, "loss": 0.3903, "step": 29200 }, { "epoch": 0.10862392395583863, "grad_norm": 1.522959589958191, "learning_rate": 9.95065410764686e-05, "loss": 0.4217, "step": 29300 }, { "epoch": 0.10899465407172887, "grad_norm": 0.581974446773529, "learning_rate": 9.950241064404733e-05, "loss": 0.4187, "step": 29400 }, { "epoch": 0.1093653841876191, "grad_norm": 0.7690785527229309, "learning_rate": 9.949826308342211e-05, "loss": 0.4403, "step": 29500 }, { "epoch": 0.10973611430350932, "grad_norm": 0.45510146021842957, "learning_rate": 9.949409839602808e-05, "loss": 0.3841, "step": 29600 }, { "epoch": 0.11010684441939957, "grad_norm": 0.6556417942047119, "learning_rate": 9.94899165833062e-05, "loss": 0.3722, "step": 29700 }, { "epoch": 0.1104775745352898, "grad_norm": 0.8600026369094849, "learning_rate": 9.948571764670344e-05, "loss": 0.4178, "step": 29800 }, { "epoch": 0.11084830465118004, "grad_norm": 0.7175588607788086, "learning_rate": 9.948150158767265e-05, "loss": 0.3722, "step": 29900 }, { "epoch": 0.11121903476707026, "grad_norm": 1.0878361463546753, "learning_rate": 9.947726840767262e-05, "loss": 0.4214, "step": 30000 }, { "epoch": 0.1115897648829605, "grad_norm": 0.7353511452674866, "learning_rate": 9.947301810816804e-05, "loss": 0.3777, "step": 30100 }, { "epoch": 0.11196049499885073, "grad_norm": 0.9460669755935669, "learning_rate": 9.94687506906296e-05, "loss": 0.3814, "step": 30200 }, { "epoch": 0.11233122511474097, "grad_norm": 1.4069336652755737, "learning_rate": 9.946446615653382e-05, "loss": 0.4468, "step": 30300 }, { "epoch": 0.1127019552306312, "grad_norm": 1.5949651002883911, "learning_rate": 9.946016450736318e-05, "loss": 0.42, "step": 30400 }, { "epoch": 0.11307268534652144, "grad_norm": 0.67017662525177, "learning_rate": 9.945584574460609e-05, "loss": 0.4071, "step": 30500 }, { "epoch": 0.11344341546241167, "grad_norm": 2.0182559490203857, "learning_rate": 9.945150986975687e-05, "loss": 0.4176, "step": 30600 }, { "epoch": 0.11381414557830191, "grad_norm": 0.9569240212440491, "learning_rate": 9.944715688431577e-05, "loss": 0.3665, "step": 30700 }, { "epoch": 0.11418487569419214, "grad_norm": 0.9506590962409973, "learning_rate": 9.944278678978896e-05, "loss": 0.4215, "step": 30800 }, { "epoch": 0.11455560581008238, "grad_norm": 0.7828270196914673, "learning_rate": 9.943839958768852e-05, "loss": 0.3757, "step": 30900 }, { "epoch": 0.11492633592597261, "grad_norm": 4.558686256408691, "learning_rate": 9.943399527953245e-05, "loss": 0.4271, "step": 31000 }, { "epoch": 0.11529706604186285, "grad_norm": 1.229203224182129, "learning_rate": 9.942957386684468e-05, "loss": 0.4131, "step": 31100 }, { "epoch": 0.11566779615775308, "grad_norm": 0.9783921241760254, "learning_rate": 9.942513535115505e-05, "loss": 0.4129, "step": 31200 }, { "epoch": 0.11603852627364332, "grad_norm": 0.6398488283157349, "learning_rate": 9.942067973399932e-05, "loss": 0.4086, "step": 31300 }, { "epoch": 0.11640925638953355, "grad_norm": 2.064410448074341, "learning_rate": 9.941620701691916e-05, "loss": 0.4099, "step": 31400 }, { "epoch": 0.11677998650542379, "grad_norm": 0.976294994354248, "learning_rate": 9.941171720146219e-05, "loss": 0.3838, "step": 31500 }, { "epoch": 0.11715071662131402, "grad_norm": 0.8531506061553955, "learning_rate": 9.940721028918188e-05, "loss": 0.4249, "step": 31600 }, { "epoch": 0.11752144673720424, "grad_norm": 1.2768926620483398, "learning_rate": 9.940268628163767e-05, "loss": 0.395, "step": 31700 }, { "epoch": 0.11789217685309449, "grad_norm": 1.0741896629333496, "learning_rate": 9.939814518039491e-05, "loss": 0.3789, "step": 31800 }, { "epoch": 0.11826290696898471, "grad_norm": 0.6934481263160706, "learning_rate": 9.939358698702487e-05, "loss": 0.4133, "step": 31900 }, { "epoch": 0.11863363708487495, "grad_norm": 0.7474868893623352, "learning_rate": 9.938901170310468e-05, "loss": 0.4249, "step": 32000 }, { "epoch": 0.11900436720076518, "grad_norm": 0.8844146132469177, "learning_rate": 9.938441933021745e-05, "loss": 0.3877, "step": 32100 }, { "epoch": 0.11937509731665542, "grad_norm": 1.07782781124115, "learning_rate": 9.937980986995217e-05, "loss": 0.4095, "step": 32200 }, { "epoch": 0.11974582743254565, "grad_norm": 1.007277250289917, "learning_rate": 9.937518332390372e-05, "loss": 0.4068, "step": 32300 }, { "epoch": 0.12011655754843589, "grad_norm": 1.0482805967330933, "learning_rate": 9.937053969367298e-05, "loss": 0.3887, "step": 32400 }, { "epoch": 0.12048728766432612, "grad_norm": 1.0186012983322144, "learning_rate": 9.936587898086662e-05, "loss": 0.4167, "step": 32500 }, { "epoch": 0.12085801778021636, "grad_norm": 1.2777408361434937, "learning_rate": 9.936120118709731e-05, "loss": 0.3956, "step": 32600 }, { "epoch": 0.12122874789610659, "grad_norm": 0.9621116518974304, "learning_rate": 9.93565063139836e-05, "loss": 0.3867, "step": 32700 }, { "epoch": 0.12159947801199683, "grad_norm": 0.6215800642967224, "learning_rate": 9.935179436314994e-05, "loss": 0.3929, "step": 32800 }, { "epoch": 0.12197020812788706, "grad_norm": 0.9234066009521484, "learning_rate": 9.934706533622673e-05, "loss": 0.4338, "step": 32900 }, { "epoch": 0.1223409382437773, "grad_norm": 0.7025009393692017, "learning_rate": 9.934231923485021e-05, "loss": 0.368, "step": 33000 }, { "epoch": 0.12271166835966753, "grad_norm": 0.8187329769134521, "learning_rate": 9.933755606066259e-05, "loss": 0.3883, "step": 33100 }, { "epoch": 0.12308239847555777, "grad_norm": 0.5711250901222229, "learning_rate": 9.933277581531197e-05, "loss": 0.4207, "step": 33200 }, { "epoch": 0.123453128591448, "grad_norm": 1.069798469543457, "learning_rate": 9.932797850045233e-05, "loss": 0.4199, "step": 33300 }, { "epoch": 0.12382385870733824, "grad_norm": 0.9460178017616272, "learning_rate": 9.932316411774358e-05, "loss": 0.4207, "step": 33400 }, { "epoch": 0.12419458882322847, "grad_norm": 0.7864503860473633, "learning_rate": 9.931833266885153e-05, "loss": 0.3836, "step": 33500 }, { "epoch": 0.1245653189391187, "grad_norm": 0.7371729612350464, "learning_rate": 9.931348415544793e-05, "loss": 0.3992, "step": 33600 }, { "epoch": 0.12493604905500894, "grad_norm": 0.7971158027648926, "learning_rate": 9.930861857921036e-05, "loss": 0.3979, "step": 33700 }, { "epoch": 0.12530677917089916, "grad_norm": 1.1577402353286743, "learning_rate": 9.930373594182236e-05, "loss": 0.3965, "step": 33800 }, { "epoch": 0.1256775092867894, "grad_norm": 1.366352915763855, "learning_rate": 9.929883624497337e-05, "loss": 0.3954, "step": 33900 }, { "epoch": 0.12604823940267965, "grad_norm": 1.005003809928894, "learning_rate": 9.929391949035872e-05, "loss": 0.3769, "step": 34000 }, { "epoch": 0.12641896951856987, "grad_norm": 1.2329858541488647, "learning_rate": 9.928898567967963e-05, "loss": 0.3887, "step": 34100 }, { "epoch": 0.1267896996344601, "grad_norm": 0.35148122906684875, "learning_rate": 9.928403481464324e-05, "loss": 0.4265, "step": 34200 }, { "epoch": 0.12716042975035033, "grad_norm": 0.7385274767875671, "learning_rate": 9.927906689696259e-05, "loss": 0.3926, "step": 34300 }, { "epoch": 0.12753115986624058, "grad_norm": 0.5814616680145264, "learning_rate": 9.927408192835662e-05, "loss": 0.357, "step": 34400 }, { "epoch": 0.1279018899821308, "grad_norm": 0.8198263049125671, "learning_rate": 9.926907991055017e-05, "loss": 0.3587, "step": 34500 }, { "epoch": 0.12827262009802104, "grad_norm": 0.914941132068634, "learning_rate": 9.926406084527396e-05, "loss": 0.3922, "step": 34600 }, { "epoch": 0.12864335021391127, "grad_norm": 2.3482911586761475, "learning_rate": 9.925902473426462e-05, "loss": 0.3994, "step": 34700 }, { "epoch": 0.12901408032980152, "grad_norm": 1.0699213743209839, "learning_rate": 9.925397157926471e-05, "loss": 0.3752, "step": 34800 }, { "epoch": 0.12938481044569175, "grad_norm": 1.1112794876098633, "learning_rate": 9.924890138202263e-05, "loss": 0.3889, "step": 34900 }, { "epoch": 0.12975554056158198, "grad_norm": 1.5959069728851318, "learning_rate": 9.924381414429274e-05, "loss": 0.3947, "step": 35000 }, { "epoch": 0.1301262706774722, "grad_norm": 0.49021199345588684, "learning_rate": 9.923870986783522e-05, "loss": 0.3987, "step": 35100 }, { "epoch": 0.13049700079336246, "grad_norm": 0.4471418559551239, "learning_rate": 9.923358855441623e-05, "loss": 0.3938, "step": 35200 }, { "epoch": 0.1308677309092527, "grad_norm": 0.4847422242164612, "learning_rate": 9.922845020580774e-05, "loss": 0.4021, "step": 35300 }, { "epoch": 0.13123846102514292, "grad_norm": 0.4569307267665863, "learning_rate": 9.922329482378769e-05, "loss": 0.3731, "step": 35400 }, { "epoch": 0.13160919114103314, "grad_norm": 0.6080461740493774, "learning_rate": 9.921812241013987e-05, "loss": 0.4223, "step": 35500 }, { "epoch": 0.13197992125692337, "grad_norm": 1.401538610458374, "learning_rate": 9.921293296665398e-05, "loss": 0.4312, "step": 35600 }, { "epoch": 0.13235065137281363, "grad_norm": 0.8613282442092896, "learning_rate": 9.920772649512559e-05, "loss": 0.3835, "step": 35700 }, { "epoch": 0.13272138148870385, "grad_norm": 0.49238401651382446, "learning_rate": 9.920250299735619e-05, "loss": 0.3632, "step": 35800 }, { "epoch": 0.13309211160459408, "grad_norm": 1.6451711654663086, "learning_rate": 9.919726247515315e-05, "loss": 0.3738, "step": 35900 }, { "epoch": 0.1334628417204843, "grad_norm": 0.5846077799797058, "learning_rate": 9.919200493032971e-05, "loss": 0.374, "step": 36000 }, { "epoch": 0.13383357183637457, "grad_norm": 0.5510419607162476, "learning_rate": 9.918673036470507e-05, "loss": 0.3746, "step": 36100 }, { "epoch": 0.1342043019522648, "grad_norm": 0.6924232244491577, "learning_rate": 9.918143878010422e-05, "loss": 0.4054, "step": 36200 }, { "epoch": 0.13457503206815502, "grad_norm": 1.0893176794052124, "learning_rate": 9.91761301783581e-05, "loss": 0.3754, "step": 36300 }, { "epoch": 0.13494576218404525, "grad_norm": 0.7199399471282959, "learning_rate": 9.917080456130353e-05, "loss": 0.4026, "step": 36400 }, { "epoch": 0.1353164922999355, "grad_norm": 0.9734033942222595, "learning_rate": 9.91654619307832e-05, "loss": 0.4067, "step": 36500 }, { "epoch": 0.13568722241582573, "grad_norm": 0.6576399803161621, "learning_rate": 9.916010228864574e-05, "loss": 0.3901, "step": 36600 }, { "epoch": 0.13605795253171596, "grad_norm": 0.6766926050186157, "learning_rate": 9.915472563674559e-05, "loss": 0.3913, "step": 36700 }, { "epoch": 0.1364286826476062, "grad_norm": 0.6358031630516052, "learning_rate": 9.914933197694311e-05, "loss": 0.4055, "step": 36800 }, { "epoch": 0.13679941276349644, "grad_norm": 1.3703324794769287, "learning_rate": 9.914392131110456e-05, "loss": 0.3839, "step": 36900 }, { "epoch": 0.13717014287938667, "grad_norm": 1.290383219718933, "learning_rate": 9.913849364110209e-05, "loss": 0.3846, "step": 37000 }, { "epoch": 0.1375408729952769, "grad_norm": 1.18392014503479, "learning_rate": 9.913304896881366e-05, "loss": 0.4023, "step": 37100 }, { "epoch": 0.13791160311116712, "grad_norm": 0.6272390484809875, "learning_rate": 9.912758729612322e-05, "loss": 0.41, "step": 37200 }, { "epoch": 0.13828233322705738, "grad_norm": 0.6405853629112244, "learning_rate": 9.912210862492053e-05, "loss": 0.4319, "step": 37300 }, { "epoch": 0.1386530633429476, "grad_norm": 0.5350852012634277, "learning_rate": 9.911661295710125e-05, "loss": 0.3855, "step": 37400 }, { "epoch": 0.13902379345883784, "grad_norm": 0.8961043357849121, "learning_rate": 9.911110029456692e-05, "loss": 0.4032, "step": 37500 }, { "epoch": 0.13939452357472806, "grad_norm": 1.57973313331604, "learning_rate": 9.910557063922497e-05, "loss": 0.3854, "step": 37600 }, { "epoch": 0.1397652536906183, "grad_norm": 0.7320783734321594, "learning_rate": 9.910002399298868e-05, "loss": 0.3546, "step": 37700 }, { "epoch": 0.14013598380650855, "grad_norm": 0.7619235515594482, "learning_rate": 9.909446035777726e-05, "loss": 0.4011, "step": 37800 }, { "epoch": 0.14050671392239877, "grad_norm": 0.905255913734436, "learning_rate": 9.908887973551576e-05, "loss": 0.3597, "step": 37900 }, { "epoch": 0.140877444038289, "grad_norm": 0.5582278370857239, "learning_rate": 9.908328212813512e-05, "loss": 0.4006, "step": 38000 }, { "epoch": 0.14124817415417923, "grad_norm": 0.4821436107158661, "learning_rate": 9.907766753757212e-05, "loss": 0.381, "step": 38100 }, { "epoch": 0.14161890427006948, "grad_norm": 1.0152881145477295, "learning_rate": 9.907203596576952e-05, "loss": 0.3613, "step": 38200 }, { "epoch": 0.1419896343859597, "grad_norm": 0.5244572162628174, "learning_rate": 9.906638741467582e-05, "loss": 0.3908, "step": 38300 }, { "epoch": 0.14236036450184994, "grad_norm": 0.8075941801071167, "learning_rate": 9.906072188624549e-05, "loss": 0.3984, "step": 38400 }, { "epoch": 0.14273109461774017, "grad_norm": 1.0834766626358032, "learning_rate": 9.905503938243884e-05, "loss": 0.3361, "step": 38500 }, { "epoch": 0.14310182473363042, "grad_norm": 1.1602801084518433, "learning_rate": 9.904933990522206e-05, "loss": 0.3781, "step": 38600 }, { "epoch": 0.14347255484952065, "grad_norm": 0.8608340620994568, "learning_rate": 9.904362345656721e-05, "loss": 0.3662, "step": 38700 }, { "epoch": 0.14384328496541088, "grad_norm": 1.4288878440856934, "learning_rate": 9.903789003845223e-05, "loss": 0.3657, "step": 38800 }, { "epoch": 0.1442140150813011, "grad_norm": 0.9570560455322266, "learning_rate": 9.903213965286093e-05, "loss": 0.4142, "step": 38900 }, { "epoch": 0.14458474519719136, "grad_norm": 0.7262715697288513, "learning_rate": 9.902637230178296e-05, "loss": 0.3526, "step": 39000 }, { "epoch": 0.1449554753130816, "grad_norm": 0.6706365346908569, "learning_rate": 9.902058798721389e-05, "loss": 0.3751, "step": 39100 }, { "epoch": 0.14532620542897182, "grad_norm": 0.3726540207862854, "learning_rate": 9.901478671115512e-05, "loss": 0.3823, "step": 39200 }, { "epoch": 0.14569693554486204, "grad_norm": 0.5851821303367615, "learning_rate": 9.900896847561397e-05, "loss": 0.3612, "step": 39300 }, { "epoch": 0.1460676656607523, "grad_norm": 0.5934658646583557, "learning_rate": 9.900313328260354e-05, "loss": 0.393, "step": 39400 }, { "epoch": 0.14643839577664253, "grad_norm": 1.14723539352417, "learning_rate": 9.899728113414289e-05, "loss": 0.3574, "step": 39500 }, { "epoch": 0.14680912589253275, "grad_norm": 0.5957729816436768, "learning_rate": 9.899141203225691e-05, "loss": 0.4032, "step": 39600 }, { "epoch": 0.14717985600842298, "grad_norm": 0.3523477017879486, "learning_rate": 9.898552597897632e-05, "loss": 0.342, "step": 39700 }, { "epoch": 0.1475505861243132, "grad_norm": 0.6433749794960022, "learning_rate": 9.897962297633775e-05, "loss": 0.3616, "step": 39800 }, { "epoch": 0.14792131624020347, "grad_norm": 1.0100681781768799, "learning_rate": 9.89737030263837e-05, "loss": 0.3798, "step": 39900 }, { "epoch": 0.1482920463560937, "grad_norm": 1.2726715803146362, "learning_rate": 9.896776613116251e-05, "loss": 0.413, "step": 40000 }, { "epoch": 0.14866277647198392, "grad_norm": 1.1614658832550049, "learning_rate": 9.896181229272839e-05, "loss": 0.3968, "step": 40100 }, { "epoch": 0.14903350658787415, "grad_norm": 0.7709549069404602, "learning_rate": 9.895584151314139e-05, "loss": 0.3871, "step": 40200 }, { "epoch": 0.1494042367037644, "grad_norm": 0.7924789190292358, "learning_rate": 9.894985379446749e-05, "loss": 0.3575, "step": 40300 }, { "epoch": 0.14977496681965463, "grad_norm": 0.6633001565933228, "learning_rate": 9.894384913877843e-05, "loss": 0.4011, "step": 40400 }, { "epoch": 0.15014569693554486, "grad_norm": 0.5799396634101868, "learning_rate": 9.893782754815192e-05, "loss": 0.3915, "step": 40500 }, { "epoch": 0.1505164270514351, "grad_norm": 1.0985636711120605, "learning_rate": 9.893178902467144e-05, "loss": 0.3795, "step": 40600 }, { "epoch": 0.15088715716732534, "grad_norm": 1.0541958808898926, "learning_rate": 9.892573357042637e-05, "loss": 0.3581, "step": 40700 }, { "epoch": 0.15125788728321557, "grad_norm": 0.9108558297157288, "learning_rate": 9.891966118751195e-05, "loss": 0.3574, "step": 40800 }, { "epoch": 0.1516286173991058, "grad_norm": 1.0703885555267334, "learning_rate": 9.891357187802926e-05, "loss": 0.3903, "step": 40900 }, { "epoch": 0.15199934751499602, "grad_norm": 0.4145256578922272, "learning_rate": 9.890746564408527e-05, "loss": 0.3771, "step": 41000 }, { "epoch": 0.15237007763088628, "grad_norm": 0.6815521121025085, "learning_rate": 9.890134248779274e-05, "loss": 0.4015, "step": 41100 }, { "epoch": 0.1527408077467765, "grad_norm": 0.4615787863731384, "learning_rate": 9.889520241127038e-05, "loss": 0.3706, "step": 41200 }, { "epoch": 0.15311153786266674, "grad_norm": 0.8048752546310425, "learning_rate": 9.888904541664268e-05, "loss": 0.3982, "step": 41300 }, { "epoch": 0.15348226797855696, "grad_norm": 0.5554524660110474, "learning_rate": 9.888287150603998e-05, "loss": 0.3673, "step": 41400 }, { "epoch": 0.1538529980944472, "grad_norm": 1.343424677848816, "learning_rate": 9.887668068159852e-05, "loss": 0.419, "step": 41500 }, { "epoch": 0.15422372821033745, "grad_norm": 0.4478367865085602, "learning_rate": 9.887047294546039e-05, "loss": 0.3577, "step": 41600 }, { "epoch": 0.15459445832622767, "grad_norm": 1.0038611888885498, "learning_rate": 9.886424829977349e-05, "loss": 0.3763, "step": 41700 }, { "epoch": 0.1549651884421179, "grad_norm": 0.43131861090660095, "learning_rate": 9.885800674669159e-05, "loss": 0.403, "step": 41800 }, { "epoch": 0.15533591855800813, "grad_norm": 0.6983708143234253, "learning_rate": 9.885174828837433e-05, "loss": 0.4057, "step": 41900 }, { "epoch": 0.15570664867389838, "grad_norm": 1.0479744672775269, "learning_rate": 9.884547292698718e-05, "loss": 0.3859, "step": 42000 }, { "epoch": 0.1560773787897886, "grad_norm": 0.48142319917678833, "learning_rate": 9.883918066470145e-05, "loss": 0.3789, "step": 42100 }, { "epoch": 0.15644810890567884, "grad_norm": 1.0730355978012085, "learning_rate": 9.883287150369433e-05, "loss": 0.3811, "step": 42200 }, { "epoch": 0.15681883902156907, "grad_norm": 0.8086548447608948, "learning_rate": 9.882654544614882e-05, "loss": 0.4002, "step": 42300 }, { "epoch": 0.15718956913745932, "grad_norm": 0.5838381052017212, "learning_rate": 9.88202024942538e-05, "loss": 0.412, "step": 42400 }, { "epoch": 0.15756029925334955, "grad_norm": 1.0335663557052612, "learning_rate": 9.881384265020394e-05, "loss": 0.3867, "step": 42500 }, { "epoch": 0.15793102936923978, "grad_norm": 0.9181808829307556, "learning_rate": 9.880746591619984e-05, "loss": 0.375, "step": 42600 }, { "epoch": 0.15830175948513, "grad_norm": 0.846128523349762, "learning_rate": 9.88010722944479e-05, "loss": 0.4061, "step": 42700 }, { "epoch": 0.15867248960102026, "grad_norm": 0.7639880776405334, "learning_rate": 9.87946617871603e-05, "loss": 0.371, "step": 42800 }, { "epoch": 0.1590432197169105, "grad_norm": 1.083258867263794, "learning_rate": 9.878823439655519e-05, "loss": 0.4029, "step": 42900 }, { "epoch": 0.15941394983280072, "grad_norm": 0.45794960856437683, "learning_rate": 9.878179012485647e-05, "loss": 0.3895, "step": 43000 }, { "epoch": 0.15978467994869094, "grad_norm": 1.2927484512329102, "learning_rate": 9.877532897429388e-05, "loss": 0.382, "step": 43100 }, { "epoch": 0.1601554100645812, "grad_norm": 0.714248538017273, "learning_rate": 9.876885094710307e-05, "loss": 0.352, "step": 43200 }, { "epoch": 0.16052614018047143, "grad_norm": 0.9089926481246948, "learning_rate": 9.876235604552546e-05, "loss": 0.3972, "step": 43300 }, { "epoch": 0.16089687029636165, "grad_norm": 0.7815203070640564, "learning_rate": 9.875584427180833e-05, "loss": 0.362, "step": 43400 }, { "epoch": 0.16126760041225188, "grad_norm": 1.4779435396194458, "learning_rate": 9.874931562820483e-05, "loss": 0.3723, "step": 43500 }, { "epoch": 0.1616383305281421, "grad_norm": 0.5384011268615723, "learning_rate": 9.87427701169739e-05, "loss": 0.3869, "step": 43600 }, { "epoch": 0.16200906064403237, "grad_norm": 1.0917903184890747, "learning_rate": 9.873620774038033e-05, "loss": 0.377, "step": 43700 }, { "epoch": 0.1623797907599226, "grad_norm": 0.9190884828567505, "learning_rate": 9.872962850069476e-05, "loss": 0.3781, "step": 43800 }, { "epoch": 0.16275052087581282, "grad_norm": 0.5819013714790344, "learning_rate": 9.872303240019368e-05, "loss": 0.3798, "step": 43900 }, { "epoch": 0.16312125099170305, "grad_norm": 0.6868224143981934, "learning_rate": 9.871641944115933e-05, "loss": 0.3684, "step": 44000 }, { "epoch": 0.1634919811075933, "grad_norm": 1.24089515209198, "learning_rate": 9.870978962587991e-05, "loss": 0.3721, "step": 44100 }, { "epoch": 0.16386271122348353, "grad_norm": 0.4502257704734802, "learning_rate": 9.870314295664933e-05, "loss": 0.3705, "step": 44200 }, { "epoch": 0.16423344133937376, "grad_norm": 0.6515984535217285, "learning_rate": 9.869647943576743e-05, "loss": 0.405, "step": 44300 }, { "epoch": 0.164604171455264, "grad_norm": 0.7325881719589233, "learning_rate": 9.868979906553981e-05, "loss": 0.4088, "step": 44400 }, { "epoch": 0.16497490157115424, "grad_norm": 0.7683349251747131, "learning_rate": 9.868310184827794e-05, "loss": 0.3909, "step": 44500 }, { "epoch": 0.16534563168704447, "grad_norm": 1.365806221961975, "learning_rate": 9.867638778629911e-05, "loss": 0.391, "step": 44600 }, { "epoch": 0.1657163618029347, "grad_norm": 1.5272507667541504, "learning_rate": 9.866965688192643e-05, "loss": 0.3525, "step": 44700 }, { "epoch": 0.16608709191882493, "grad_norm": 0.9756461381912231, "learning_rate": 9.866290913748884e-05, "loss": 0.3842, "step": 44800 }, { "epoch": 0.16645782203471518, "grad_norm": 0.6198223233222961, "learning_rate": 9.865614455532112e-05, "loss": 0.406, "step": 44900 }, { "epoch": 0.1668285521506054, "grad_norm": 0.7222094535827637, "learning_rate": 9.864936313776386e-05, "loss": 0.4047, "step": 45000 }, { "epoch": 0.16719928226649564, "grad_norm": 0.7432527542114258, "learning_rate": 9.864256488716348e-05, "loss": 0.3522, "step": 45100 }, { "epoch": 0.16757001238238586, "grad_norm": 0.5385957956314087, "learning_rate": 9.863574980587222e-05, "loss": 0.3701, "step": 45200 }, { "epoch": 0.16794074249827612, "grad_norm": 0.31520283222198486, "learning_rate": 9.862891789624816e-05, "loss": 0.349, "step": 45300 }, { "epoch": 0.16831147261416635, "grad_norm": 0.9010740518569946, "learning_rate": 9.86220691606552e-05, "loss": 0.3703, "step": 45400 }, { "epoch": 0.16868220273005657, "grad_norm": 0.749337911605835, "learning_rate": 9.861520360146302e-05, "loss": 0.4206, "step": 45500 }, { "epoch": 0.1690529328459468, "grad_norm": 0.32539835572242737, "learning_rate": 9.86083212210472e-05, "loss": 0.3955, "step": 45600 }, { "epoch": 0.16942366296183703, "grad_norm": 0.46987849473953247, "learning_rate": 9.860142202178904e-05, "loss": 0.3865, "step": 45700 }, { "epoch": 0.16979439307772728, "grad_norm": 0.3132653832435608, "learning_rate": 9.859450600607576e-05, "loss": 0.3318, "step": 45800 }, { "epoch": 0.1701651231936175, "grad_norm": 1.2576539516448975, "learning_rate": 9.858757317630034e-05, "loss": 0.3544, "step": 45900 }, { "epoch": 0.17053585330950774, "grad_norm": 0.7871999144554138, "learning_rate": 9.85806235348616e-05, "loss": 0.3561, "step": 46000 }, { "epoch": 0.17090658342539797, "grad_norm": 0.8973472118377686, "learning_rate": 9.857365708416414e-05, "loss": 0.3378, "step": 46100 }, { "epoch": 0.17127731354128822, "grad_norm": 1.1179981231689453, "learning_rate": 9.856667382661844e-05, "loss": 0.3685, "step": 46200 }, { "epoch": 0.17164804365717845, "grad_norm": 0.6617026925086975, "learning_rate": 9.855967376464072e-05, "loss": 0.3929, "step": 46300 }, { "epoch": 0.17201877377306868, "grad_norm": 0.5913839340209961, "learning_rate": 9.85526569006531e-05, "loss": 0.3814, "step": 46400 }, { "epoch": 0.1723895038889589, "grad_norm": 0.6911515593528748, "learning_rate": 9.854562323708341e-05, "loss": 0.3465, "step": 46500 }, { "epoch": 0.17276023400484916, "grad_norm": 0.6539260745048523, "learning_rate": 9.85385727763654e-05, "loss": 0.3491, "step": 46600 }, { "epoch": 0.1731309641207394, "grad_norm": 0.6901100873947144, "learning_rate": 9.853150552093857e-05, "loss": 0.3995, "step": 46700 }, { "epoch": 0.17350169423662962, "grad_norm": 1.4316084384918213, "learning_rate": 9.852442147324822e-05, "loss": 0.3474, "step": 46800 }, { "epoch": 0.17387242435251984, "grad_norm": 0.5367276072502136, "learning_rate": 9.851732063574553e-05, "loss": 0.3624, "step": 46900 }, { "epoch": 0.1742431544684101, "grad_norm": 1.2768089771270752, "learning_rate": 9.851020301088737e-05, "loss": 0.4034, "step": 47000 }, { "epoch": 0.17461388458430033, "grad_norm": 0.7111820578575134, "learning_rate": 9.850306860113659e-05, "loss": 0.3817, "step": 47100 }, { "epoch": 0.17498461470019055, "grad_norm": 0.5636797547340393, "learning_rate": 9.849591740896165e-05, "loss": 0.3783, "step": 47200 }, { "epoch": 0.17535534481608078, "grad_norm": 0.6631003022193909, "learning_rate": 9.848874943683697e-05, "loss": 0.3759, "step": 47300 }, { "epoch": 0.175726074931971, "grad_norm": 0.48340287804603577, "learning_rate": 9.848156468724272e-05, "loss": 0.3962, "step": 47400 }, { "epoch": 0.17609680504786127, "grad_norm": 0.3689079284667969, "learning_rate": 9.847436316266485e-05, "loss": 0.3792, "step": 47500 }, { "epoch": 0.1764675351637515, "grad_norm": 0.49792641401290894, "learning_rate": 9.846714486559518e-05, "loss": 0.372, "step": 47600 }, { "epoch": 0.17683826527964172, "grad_norm": 0.8697020411491394, "learning_rate": 9.845990979853127e-05, "loss": 0.3715, "step": 47700 }, { "epoch": 0.17720899539553195, "grad_norm": 0.5789051055908203, "learning_rate": 9.845265796397651e-05, "loss": 0.3659, "step": 47800 }, { "epoch": 0.1775797255114222, "grad_norm": 0.2441825270652771, "learning_rate": 9.84453893644401e-05, "loss": 0.3623, "step": 47900 }, { "epoch": 0.17795045562731243, "grad_norm": 0.7283831238746643, "learning_rate": 9.8438104002437e-05, "loss": 0.3685, "step": 48000 }, { "epoch": 0.17832118574320266, "grad_norm": 0.5871769189834595, "learning_rate": 9.843080188048803e-05, "loss": 0.364, "step": 48100 }, { "epoch": 0.1786919158590929, "grad_norm": 0.4995728135108948, "learning_rate": 9.842348300111979e-05, "loss": 0.3575, "step": 48200 }, { "epoch": 0.17906264597498314, "grad_norm": 0.6821136474609375, "learning_rate": 9.841614736686462e-05, "loss": 0.3427, "step": 48300 }, { "epoch": 0.17943337609087337, "grad_norm": 0.8241394758224487, "learning_rate": 9.840879498026075e-05, "loss": 0.3719, "step": 48400 }, { "epoch": 0.1798041062067636, "grad_norm": 0.8200068473815918, "learning_rate": 9.840142584385212e-05, "loss": 0.3676, "step": 48500 }, { "epoch": 0.18017483632265383, "grad_norm": 0.6707514524459839, "learning_rate": 9.839403996018854e-05, "loss": 0.3567, "step": 48600 }, { "epoch": 0.18054556643854408, "grad_norm": 0.771382749080658, "learning_rate": 9.838663733182556e-05, "loss": 0.3646, "step": 48700 }, { "epoch": 0.1809162965544343, "grad_norm": 0.6944271922111511, "learning_rate": 9.837921796132455e-05, "loss": 0.3823, "step": 48800 }, { "epoch": 0.18128702667032454, "grad_norm": 0.7673566937446594, "learning_rate": 9.837178185125268e-05, "loss": 0.4027, "step": 48900 }, { "epoch": 0.18165775678621476, "grad_norm": 1.729353904724121, "learning_rate": 9.83643290041829e-05, "loss": 0.3362, "step": 49000 }, { "epoch": 0.18202848690210502, "grad_norm": 0.33002689480781555, "learning_rate": 9.835685942269393e-05, "loss": 0.3464, "step": 49100 }, { "epoch": 0.18239921701799525, "grad_norm": 0.9963817000389099, "learning_rate": 9.834937310937031e-05, "loss": 0.3789, "step": 49200 }, { "epoch": 0.18276994713388547, "grad_norm": 0.4570348858833313, "learning_rate": 9.834187006680235e-05, "loss": 0.3985, "step": 49300 }, { "epoch": 0.1831406772497757, "grad_norm": 0.3605092167854309, "learning_rate": 9.83343502975862e-05, "loss": 0.3494, "step": 49400 }, { "epoch": 0.18351140736566593, "grad_norm": 0.6342818737030029, "learning_rate": 9.832681380432371e-05, "loss": 0.4299, "step": 49500 }, { "epoch": 0.18388213748155618, "grad_norm": 0.4942997694015503, "learning_rate": 9.831926058962259e-05, "loss": 0.3604, "step": 49600 }, { "epoch": 0.1842528675974464, "grad_norm": 0.5094645619392395, "learning_rate": 9.83116906560963e-05, "loss": 0.3923, "step": 49700 }, { "epoch": 0.18462359771333664, "grad_norm": 0.6798987984657288, "learning_rate": 9.830410400636407e-05, "loss": 0.342, "step": 49800 }, { "epoch": 0.18499432782922687, "grad_norm": 0.7176726460456848, "learning_rate": 9.829650064305098e-05, "loss": 0.3468, "step": 49900 }, { "epoch": 0.18536505794511712, "grad_norm": 0.869711697101593, "learning_rate": 9.828888056878783e-05, "loss": 0.3734, "step": 50000 }, { "epoch": 0.18573578806100735, "grad_norm": 0.9728835821151733, "learning_rate": 9.828124378621122e-05, "loss": 0.3745, "step": 50100 }, { "epoch": 0.18610651817689758, "grad_norm": 0.937781572341919, "learning_rate": 9.827359029796353e-05, "loss": 0.4075, "step": 50200 }, { "epoch": 0.1864772482927878, "grad_norm": 0.5540777444839478, "learning_rate": 9.826592010669295e-05, "loss": 0.3411, "step": 50300 }, { "epoch": 0.18684797840867806, "grad_norm": 0.6325036287307739, "learning_rate": 9.825823321505338e-05, "loss": 0.3517, "step": 50400 }, { "epoch": 0.1872187085245683, "grad_norm": 1.4309221506118774, "learning_rate": 9.82505296257046e-05, "loss": 0.3802, "step": 50500 }, { "epoch": 0.18758943864045852, "grad_norm": 0.8033241629600525, "learning_rate": 9.824280934131205e-05, "loss": 0.3527, "step": 50600 }, { "epoch": 0.18796016875634874, "grad_norm": 0.4912353456020355, "learning_rate": 9.823507236454704e-05, "loss": 0.3521, "step": 50700 }, { "epoch": 0.188330898872239, "grad_norm": 0.5067408084869385, "learning_rate": 9.822731869808662e-05, "loss": 0.3386, "step": 50800 }, { "epoch": 0.18870162898812923, "grad_norm": 0.8838446736335754, "learning_rate": 9.82195483446136e-05, "loss": 0.3428, "step": 50900 }, { "epoch": 0.18907235910401946, "grad_norm": 0.38964954018592834, "learning_rate": 9.821176130681659e-05, "loss": 0.3634, "step": 51000 }, { "epoch": 0.18944308921990968, "grad_norm": 0.6245937347412109, "learning_rate": 9.820395758738997e-05, "loss": 0.3738, "step": 51100 }, { "epoch": 0.18981381933579994, "grad_norm": 0.8470584154129028, "learning_rate": 9.819613718903388e-05, "loss": 0.3525, "step": 51200 }, { "epoch": 0.19018454945169017, "grad_norm": 0.8135424852371216, "learning_rate": 9.818830011445423e-05, "loss": 0.375, "step": 51300 }, { "epoch": 0.1905552795675804, "grad_norm": 0.7029352784156799, "learning_rate": 9.818044636636272e-05, "loss": 0.394, "step": 51400 }, { "epoch": 0.19092600968347062, "grad_norm": 0.6948062777519226, "learning_rate": 9.81725759474768e-05, "loss": 0.3706, "step": 51500 }, { "epoch": 0.19129673979936085, "grad_norm": 0.7840569615364075, "learning_rate": 9.816468886051968e-05, "loss": 0.3653, "step": 51600 }, { "epoch": 0.1916674699152511, "grad_norm": 0.6917842626571655, "learning_rate": 9.815678510822038e-05, "loss": 0.3758, "step": 51700 }, { "epoch": 0.19203820003114133, "grad_norm": 0.36393699049949646, "learning_rate": 9.814886469331361e-05, "loss": 0.3785, "step": 51800 }, { "epoch": 0.19240893014703156, "grad_norm": 0.8935218453407288, "learning_rate": 9.814092761853994e-05, "loss": 0.396, "step": 51900 }, { "epoch": 0.1927796602629218, "grad_norm": 1.0060265064239502, "learning_rate": 9.813297388664565e-05, "loss": 0.3825, "step": 52000 }, { "epoch": 0.19315039037881204, "grad_norm": 1.1629791259765625, "learning_rate": 9.812500350038276e-05, "loss": 0.3722, "step": 52100 }, { "epoch": 0.19352112049470227, "grad_norm": 0.6194243431091309, "learning_rate": 9.811701646250911e-05, "loss": 0.3778, "step": 52200 }, { "epoch": 0.1938918506105925, "grad_norm": 0.8039470911026001, "learning_rate": 9.810901277578827e-05, "loss": 0.3758, "step": 52300 }, { "epoch": 0.19426258072648273, "grad_norm": 1.8217400312423706, "learning_rate": 9.810099244298958e-05, "loss": 0.3166, "step": 52400 }, { "epoch": 0.19463331084237298, "grad_norm": 0.5527364611625671, "learning_rate": 9.809295546688813e-05, "loss": 0.3989, "step": 52500 }, { "epoch": 0.1950040409582632, "grad_norm": 0.7049033045768738, "learning_rate": 9.808490185026476e-05, "loss": 0.3809, "step": 52600 }, { "epoch": 0.19537477107415344, "grad_norm": 0.45510464906692505, "learning_rate": 9.807683159590611e-05, "loss": 0.3535, "step": 52700 }, { "epoch": 0.19574550119004366, "grad_norm": 0.8817132711410522, "learning_rate": 9.806874470660451e-05, "loss": 0.3618, "step": 52800 }, { "epoch": 0.19611623130593392, "grad_norm": 0.6368924975395203, "learning_rate": 9.806064118515812e-05, "loss": 0.3818, "step": 52900 }, { "epoch": 0.19648696142182415, "grad_norm": 0.2905023396015167, "learning_rate": 9.80525210343708e-05, "loss": 0.3485, "step": 53000 }, { "epoch": 0.19685769153771437, "grad_norm": 1.1719486713409424, "learning_rate": 9.80443842570522e-05, "loss": 0.3337, "step": 53100 }, { "epoch": 0.1972284216536046, "grad_norm": 1.0739079713821411, "learning_rate": 9.803623085601769e-05, "loss": 0.3707, "step": 53200 }, { "epoch": 0.19759915176949483, "grad_norm": 0.870187520980835, "learning_rate": 9.80280608340884e-05, "loss": 0.3552, "step": 53300 }, { "epoch": 0.19796988188538509, "grad_norm": 0.37773334980010986, "learning_rate": 9.801987419409125e-05, "loss": 0.3417, "step": 53400 }, { "epoch": 0.1983406120012753, "grad_norm": 0.41910427808761597, "learning_rate": 9.801167093885884e-05, "loss": 0.3662, "step": 53500 }, { "epoch": 0.19871134211716554, "grad_norm": 0.7127732038497925, "learning_rate": 9.800345107122958e-05, "loss": 0.3522, "step": 53600 }, { "epoch": 0.19908207223305577, "grad_norm": 0.944989025592804, "learning_rate": 9.79952145940476e-05, "loss": 0.386, "step": 53700 }, { "epoch": 0.19945280234894602, "grad_norm": 0.7370226383209229, "learning_rate": 9.79869615101628e-05, "loss": 0.372, "step": 53800 }, { "epoch": 0.19982353246483625, "grad_norm": 0.8160644769668579, "learning_rate": 9.797869182243078e-05, "loss": 0.3974, "step": 53900 }, { "epoch": 0.20019426258072648, "grad_norm": 0.4148170053958893, "learning_rate": 9.797040553371293e-05, "loss": 0.3782, "step": 54000 }, { "epoch": 0.2005649926966167, "grad_norm": 1.2088254690170288, "learning_rate": 9.796210264687636e-05, "loss": 0.3562, "step": 54100 }, { "epoch": 0.20093572281250696, "grad_norm": 0.5893823504447937, "learning_rate": 9.795378316479395e-05, "loss": 0.3666, "step": 54200 }, { "epoch": 0.2013064529283972, "grad_norm": 0.47151169180870056, "learning_rate": 9.794544709034428e-05, "loss": 0.3693, "step": 54300 }, { "epoch": 0.20167718304428742, "grad_norm": 0.5725803971290588, "learning_rate": 9.79370944264117e-05, "loss": 0.353, "step": 54400 }, { "epoch": 0.20204791316017764, "grad_norm": 0.5169137716293335, "learning_rate": 9.792872517588629e-05, "loss": 0.3225, "step": 54500 }, { "epoch": 0.2024186432760679, "grad_norm": 0.8139997124671936, "learning_rate": 9.79203393416639e-05, "loss": 0.3842, "step": 54600 }, { "epoch": 0.20278937339195813, "grad_norm": 1.0760337114334106, "learning_rate": 9.791193692664607e-05, "loss": 0.3596, "step": 54700 }, { "epoch": 0.20316010350784836, "grad_norm": 1.0085989236831665, "learning_rate": 9.790351793374009e-05, "loss": 0.386, "step": 54800 }, { "epoch": 0.20353083362373858, "grad_norm": 0.8236402869224548, "learning_rate": 9.7895082365859e-05, "loss": 0.4019, "step": 54900 }, { "epoch": 0.20390156373962884, "grad_norm": 0.8032817244529724, "learning_rate": 9.788663022592158e-05, "loss": 0.3792, "step": 55000 }, { "epoch": 0.20427229385551907, "grad_norm": 1.2027666568756104, "learning_rate": 9.787816151685233e-05, "loss": 0.3765, "step": 55100 }, { "epoch": 0.2046430239714093, "grad_norm": 0.689149022102356, "learning_rate": 9.786967624158147e-05, "loss": 0.3481, "step": 55200 }, { "epoch": 0.20501375408729952, "grad_norm": 0.6838663220405579, "learning_rate": 9.786117440304502e-05, "loss": 0.3823, "step": 55300 }, { "epoch": 0.20538448420318975, "grad_norm": 1.0465706586837769, "learning_rate": 9.785265600418459e-05, "loss": 0.3608, "step": 55400 }, { "epoch": 0.20575521431908, "grad_norm": 1.0617756843566895, "learning_rate": 9.784412104794769e-05, "loss": 0.365, "step": 55500 }, { "epoch": 0.20612594443497023, "grad_norm": 1.0373103618621826, "learning_rate": 9.783556953728745e-05, "loss": 0.4071, "step": 55600 }, { "epoch": 0.20649667455086046, "grad_norm": 0.6396840214729309, "learning_rate": 9.782700147516276e-05, "loss": 0.3442, "step": 55700 }, { "epoch": 0.2068674046667507, "grad_norm": 0.6196049451828003, "learning_rate": 9.781841686453823e-05, "loss": 0.365, "step": 55800 }, { "epoch": 0.20723813478264094, "grad_norm": 1.1356470584869385, "learning_rate": 9.780981570838419e-05, "loss": 0.3845, "step": 55900 }, { "epoch": 0.20760886489853117, "grad_norm": 0.6908576488494873, "learning_rate": 9.780119800967672e-05, "loss": 0.3851, "step": 56000 }, { "epoch": 0.2079795950144214, "grad_norm": 0.5850679278373718, "learning_rate": 9.779256377139761e-05, "loss": 0.3799, "step": 56100 }, { "epoch": 0.20835032513031163, "grad_norm": 0.9054037928581238, "learning_rate": 9.778391299653435e-05, "loss": 0.3649, "step": 56200 }, { "epoch": 0.20872105524620188, "grad_norm": 0.6351814866065979, "learning_rate": 9.777524568808018e-05, "loss": 0.3796, "step": 56300 }, { "epoch": 0.2090917853620921, "grad_norm": 0.44937562942504883, "learning_rate": 9.776656184903409e-05, "loss": 0.3447, "step": 56400 }, { "epoch": 0.20946251547798234, "grad_norm": 0.6332768797874451, "learning_rate": 9.775786148240071e-05, "loss": 0.3527, "step": 56500 }, { "epoch": 0.20983324559387256, "grad_norm": 0.6769325137138367, "learning_rate": 9.774914459119042e-05, "loss": 0.3931, "step": 56600 }, { "epoch": 0.21020397570976282, "grad_norm": 0.8558517098426819, "learning_rate": 9.774041117841939e-05, "loss": 0.363, "step": 56700 }, { "epoch": 0.21057470582565305, "grad_norm": 0.6749073266983032, "learning_rate": 9.77316612471094e-05, "loss": 0.3645, "step": 56800 }, { "epoch": 0.21094543594154327, "grad_norm": 1.1493905782699585, "learning_rate": 9.772289480028798e-05, "loss": 0.3795, "step": 56900 }, { "epoch": 0.2113161660574335, "grad_norm": 0.7513492107391357, "learning_rate": 9.771411184098842e-05, "loss": 0.397, "step": 57000 }, { "epoch": 0.21168689617332376, "grad_norm": 0.6098231077194214, "learning_rate": 9.770531237224969e-05, "loss": 0.3768, "step": 57100 }, { "epoch": 0.21205762628921399, "grad_norm": 0.6908873319625854, "learning_rate": 9.769649639711644e-05, "loss": 0.3346, "step": 57200 }, { "epoch": 0.2124283564051042, "grad_norm": 0.776894211769104, "learning_rate": 9.76876639186391e-05, "loss": 0.3713, "step": 57300 }, { "epoch": 0.21279908652099444, "grad_norm": 0.3146040439605713, "learning_rate": 9.767881493987376e-05, "loss": 0.4173, "step": 57400 }, { "epoch": 0.21316981663688467, "grad_norm": 0.886759877204895, "learning_rate": 9.766994946388222e-05, "loss": 0.3375, "step": 57500 }, { "epoch": 0.21354054675277492, "grad_norm": 0.3919335901737213, "learning_rate": 9.766106749373201e-05, "loss": 0.3395, "step": 57600 }, { "epoch": 0.21391127686866515, "grad_norm": 0.5349755883216858, "learning_rate": 9.765216903249634e-05, "loss": 0.3523, "step": 57700 }, { "epoch": 0.21428200698455538, "grad_norm": 0.539717435836792, "learning_rate": 9.764325408325419e-05, "loss": 0.3909, "step": 57800 }, { "epoch": 0.2146527371004456, "grad_norm": 0.8564454317092896, "learning_rate": 9.763432264909016e-05, "loss": 0.3409, "step": 57900 }, { "epoch": 0.21502346721633586, "grad_norm": 0.6874887943267822, "learning_rate": 9.76253747330946e-05, "loss": 0.3398, "step": 58000 }, { "epoch": 0.2153941973322261, "grad_norm": 0.8667436242103577, "learning_rate": 9.761641033836356e-05, "loss": 0.3484, "step": 58100 }, { "epoch": 0.21576492744811632, "grad_norm": 0.5847264528274536, "learning_rate": 9.760742946799879e-05, "loss": 0.3773, "step": 58200 }, { "epoch": 0.21613565756400654, "grad_norm": 0.4855090081691742, "learning_rate": 9.759843212510775e-05, "loss": 0.3442, "step": 58300 }, { "epoch": 0.2165063876798968, "grad_norm": 1.1861770153045654, "learning_rate": 9.758941831280355e-05, "loss": 0.3493, "step": 58400 }, { "epoch": 0.21687711779578703, "grad_norm": 1.4939110279083252, "learning_rate": 9.758038803420507e-05, "loss": 0.3626, "step": 58500 }, { "epoch": 0.21724784791167726, "grad_norm": 0.9408119320869446, "learning_rate": 9.757134129243684e-05, "loss": 0.3676, "step": 58600 }, { "epoch": 0.21761857802756748, "grad_norm": 0.801580011844635, "learning_rate": 9.75622780906291e-05, "loss": 0.3474, "step": 58700 }, { "epoch": 0.21798930814345774, "grad_norm": 1.359091877937317, "learning_rate": 9.755319843191781e-05, "loss": 0.3883, "step": 58800 }, { "epoch": 0.21836003825934797, "grad_norm": 0.5596160888671875, "learning_rate": 9.754410231944456e-05, "loss": 0.327, "step": 58900 }, { "epoch": 0.2187307683752382, "grad_norm": 0.3220389187335968, "learning_rate": 9.75349897563567e-05, "loss": 0.358, "step": 59000 }, { "epoch": 0.21910149849112842, "grad_norm": 0.707231879234314, "learning_rate": 9.752586074580723e-05, "loss": 0.3716, "step": 59100 }, { "epoch": 0.21947222860701865, "grad_norm": 0.807435154914856, "learning_rate": 9.751671529095488e-05, "loss": 0.3733, "step": 59200 }, { "epoch": 0.2198429587229089, "grad_norm": 0.5805105566978455, "learning_rate": 9.7507553394964e-05, "loss": 0.3632, "step": 59300 }, { "epoch": 0.22021368883879913, "grad_norm": 0.5349781513214111, "learning_rate": 9.749837506100472e-05, "loss": 0.3577, "step": 59400 }, { "epoch": 0.22058441895468936, "grad_norm": 0.6428444981575012, "learning_rate": 9.748918029225281e-05, "loss": 0.4002, "step": 59500 }, { "epoch": 0.2209551490705796, "grad_norm": 0.4472569525241852, "learning_rate": 9.74799690918897e-05, "loss": 0.3397, "step": 59600 }, { "epoch": 0.22132587918646984, "grad_norm": 0.9210056662559509, "learning_rate": 9.747074146310253e-05, "loss": 0.3829, "step": 59700 }, { "epoch": 0.22169660930236007, "grad_norm": 0.4971759021282196, "learning_rate": 9.746149740908417e-05, "loss": 0.3605, "step": 59800 }, { "epoch": 0.2220673394182503, "grad_norm": 3.4187655448913574, "learning_rate": 9.74522369330331e-05, "loss": 0.3887, "step": 59900 }, { "epoch": 0.22243806953414053, "grad_norm": 0.8485919237136841, "learning_rate": 9.744296003815351e-05, "loss": 0.3738, "step": 60000 }, { "epoch": 0.22280879965003078, "grad_norm": 0.9981324672698975, "learning_rate": 9.743366672765532e-05, "loss": 0.3486, "step": 60100 }, { "epoch": 0.223179529765921, "grad_norm": 0.6504378914833069, "learning_rate": 9.742435700475403e-05, "loss": 0.3423, "step": 60200 }, { "epoch": 0.22355025988181124, "grad_norm": 0.555202841758728, "learning_rate": 9.74150308726709e-05, "loss": 0.358, "step": 60300 }, { "epoch": 0.22392098999770146, "grad_norm": 0.985366940498352, "learning_rate": 9.740568833463284e-05, "loss": 0.3712, "step": 60400 }, { "epoch": 0.22429172011359172, "grad_norm": 0.48157423734664917, "learning_rate": 9.739632939387242e-05, "loss": 0.3828, "step": 60500 }, { "epoch": 0.22466245022948195, "grad_norm": 0.9967151880264282, "learning_rate": 9.738695405362793e-05, "loss": 0.3387, "step": 60600 }, { "epoch": 0.22503318034537217, "grad_norm": 1.3273298740386963, "learning_rate": 9.73775623171433e-05, "loss": 0.3523, "step": 60700 }, { "epoch": 0.2254039104612624, "grad_norm": 0.6448636651039124, "learning_rate": 9.736815418766813e-05, "loss": 0.3337, "step": 60800 }, { "epoch": 0.22577464057715266, "grad_norm": 1.2838014364242554, "learning_rate": 9.735872966845772e-05, "loss": 0.3736, "step": 60900 }, { "epoch": 0.22614537069304289, "grad_norm": 1.139573335647583, "learning_rate": 9.7349288762773e-05, "loss": 0.3456, "step": 61000 }, { "epoch": 0.2265161008089331, "grad_norm": 0.6108189821243286, "learning_rate": 9.733983147388062e-05, "loss": 0.3643, "step": 61100 }, { "epoch": 0.22688683092482334, "grad_norm": 0.916182279586792, "learning_rate": 9.733035780505286e-05, "loss": 0.3576, "step": 61200 }, { "epoch": 0.22725756104071357, "grad_norm": 0.8896015286445618, "learning_rate": 9.732086775956766e-05, "loss": 0.3323, "step": 61300 }, { "epoch": 0.22762829115660382, "grad_norm": 0.6119237542152405, "learning_rate": 9.73113613407087e-05, "loss": 0.3884, "step": 61400 }, { "epoch": 0.22799902127249405, "grad_norm": 0.4675068259239197, "learning_rate": 9.730183855176523e-05, "loss": 0.3694, "step": 61500 }, { "epoch": 0.22836975138838428, "grad_norm": 0.8635196685791016, "learning_rate": 9.72922993960322e-05, "loss": 0.3561, "step": 61600 }, { "epoch": 0.2287404815042745, "grad_norm": 1.1783031225204468, "learning_rate": 9.728274387681024e-05, "loss": 0.3619, "step": 61700 }, { "epoch": 0.22911121162016476, "grad_norm": 1.0662380456924438, "learning_rate": 9.727317199740566e-05, "loss": 0.3758, "step": 61800 }, { "epoch": 0.229481941736055, "grad_norm": 0.4212286174297333, "learning_rate": 9.726358376113037e-05, "loss": 0.3777, "step": 61900 }, { "epoch": 0.22985267185194522, "grad_norm": 0.976498007774353, "learning_rate": 9.725397917130198e-05, "loss": 0.3973, "step": 62000 }, { "epoch": 0.23022340196783544, "grad_norm": 1.263175368309021, "learning_rate": 9.724435823124375e-05, "loss": 0.3445, "step": 62100 }, { "epoch": 0.2305941320837257, "grad_norm": 0.6847081780433655, "learning_rate": 9.723472094428459e-05, "loss": 0.3394, "step": 62200 }, { "epoch": 0.23096486219961593, "grad_norm": 0.7184637784957886, "learning_rate": 9.722506731375909e-05, "loss": 0.3402, "step": 62300 }, { "epoch": 0.23133559231550616, "grad_norm": 0.8394955396652222, "learning_rate": 9.721539734300745e-05, "loss": 0.3835, "step": 62400 }, { "epoch": 0.23170632243139638, "grad_norm": 0.9207004308700562, "learning_rate": 9.720571103537559e-05, "loss": 0.3202, "step": 62500 }, { "epoch": 0.23207705254728664, "grad_norm": 0.8707634210586548, "learning_rate": 9.7196008394215e-05, "loss": 0.3363, "step": 62600 }, { "epoch": 0.23244778266317687, "grad_norm": 0.49225497245788574, "learning_rate": 9.71862894228829e-05, "loss": 0.3826, "step": 62700 }, { "epoch": 0.2328185127790671, "grad_norm": 0.7625803351402283, "learning_rate": 9.717655412474214e-05, "loss": 0.3612, "step": 62800 }, { "epoch": 0.23318924289495732, "grad_norm": 0.6199244260787964, "learning_rate": 9.716680250316116e-05, "loss": 0.3142, "step": 62900 }, { "epoch": 0.23355997301084758, "grad_norm": 0.8140596151351929, "learning_rate": 9.715703456151412e-05, "loss": 0.3637, "step": 63000 }, { "epoch": 0.2339307031267378, "grad_norm": 0.32811716198921204, "learning_rate": 9.714725030318082e-05, "loss": 0.3345, "step": 63100 }, { "epoch": 0.23430143324262803, "grad_norm": 0.3818732500076294, "learning_rate": 9.713744973154664e-05, "loss": 0.3544, "step": 63200 }, { "epoch": 0.23467216335851826, "grad_norm": 0.5978304743766785, "learning_rate": 9.712763285000267e-05, "loss": 0.3502, "step": 63300 }, { "epoch": 0.2350428934744085, "grad_norm": 0.6954530477523804, "learning_rate": 9.711779966194565e-05, "loss": 0.3342, "step": 63400 }, { "epoch": 0.23541362359029874, "grad_norm": 0.6988713145256042, "learning_rate": 9.710795017077792e-05, "loss": 0.3565, "step": 63500 }, { "epoch": 0.23578435370618897, "grad_norm": 0.7381346821784973, "learning_rate": 9.709808437990746e-05, "loss": 0.3742, "step": 63600 }, { "epoch": 0.2361550838220792, "grad_norm": 0.5175548791885376, "learning_rate": 9.708820229274792e-05, "loss": 0.3315, "step": 63700 }, { "epoch": 0.23652581393796943, "grad_norm": 2.1821794509887695, "learning_rate": 9.707830391271858e-05, "loss": 0.4064, "step": 63800 }, { "epoch": 0.23689654405385968, "grad_norm": 0.44789940118789673, "learning_rate": 9.706838924324435e-05, "loss": 0.3451, "step": 63900 }, { "epoch": 0.2372672741697499, "grad_norm": 0.3515869677066803, "learning_rate": 9.70584582877558e-05, "loss": 0.3926, "step": 64000 }, { "epoch": 0.23763800428564014, "grad_norm": 0.7288669943809509, "learning_rate": 9.704851104968907e-05, "loss": 0.3541, "step": 64100 }, { "epoch": 0.23800873440153036, "grad_norm": 0.5523846745491028, "learning_rate": 9.703854753248601e-05, "loss": 0.358, "step": 64200 }, { "epoch": 0.23837946451742062, "grad_norm": 0.6629263162612915, "learning_rate": 9.702856773959407e-05, "loss": 0.3448, "step": 64300 }, { "epoch": 0.23875019463331085, "grad_norm": 0.6133660078048706, "learning_rate": 9.701857167446634e-05, "loss": 0.3612, "step": 64400 }, { "epoch": 0.23912092474920107, "grad_norm": 0.7145023345947266, "learning_rate": 9.70085593405615e-05, "loss": 0.3952, "step": 64500 }, { "epoch": 0.2394916548650913, "grad_norm": 0.6501260995864868, "learning_rate": 9.699853074134394e-05, "loss": 0.3595, "step": 64600 }, { "epoch": 0.23986238498098156, "grad_norm": 0.5389569401741028, "learning_rate": 9.698848588028357e-05, "loss": 0.3603, "step": 64700 }, { "epoch": 0.24023311509687179, "grad_norm": 0.6581391096115112, "learning_rate": 9.697842476085604e-05, "loss": 0.3363, "step": 64800 }, { "epoch": 0.240603845212762, "grad_norm": 0.5940340161323547, "learning_rate": 9.696834738654256e-05, "loss": 0.3526, "step": 64900 }, { "epoch": 0.24097457532865224, "grad_norm": 0.615859866142273, "learning_rate": 9.695825376082996e-05, "loss": 0.3461, "step": 65000 }, { "epoch": 0.24134530544454247, "grad_norm": 0.7272951602935791, "learning_rate": 9.694814388721072e-05, "loss": 0.364, "step": 65100 }, { "epoch": 0.24171603556043272, "grad_norm": 0.6760991811752319, "learning_rate": 9.693801776918295e-05, "loss": 0.3417, "step": 65200 }, { "epoch": 0.24208676567632295, "grad_norm": 0.6610513925552368, "learning_rate": 9.692787541025033e-05, "loss": 0.3914, "step": 65300 }, { "epoch": 0.24245749579221318, "grad_norm": 0.6815891265869141, "learning_rate": 9.69177168139222e-05, "loss": 0.3432, "step": 65400 }, { "epoch": 0.2428282259081034, "grad_norm": 0.7524155974388123, "learning_rate": 9.690754198371352e-05, "loss": 0.3507, "step": 65500 }, { "epoch": 0.24319895602399366, "grad_norm": 0.6979447603225708, "learning_rate": 9.689735092314486e-05, "loss": 0.3568, "step": 65600 }, { "epoch": 0.2435696861398839, "grad_norm": 0.7990106344223022, "learning_rate": 9.688714363574239e-05, "loss": 0.3629, "step": 65700 }, { "epoch": 0.24394041625577412, "grad_norm": 0.6339213848114014, "learning_rate": 9.687692012503793e-05, "loss": 0.3531, "step": 65800 }, { "epoch": 0.24431114637166435, "grad_norm": 0.87986820936203, "learning_rate": 9.686668039456887e-05, "loss": 0.397, "step": 65900 }, { "epoch": 0.2446818764875546, "grad_norm": 0.7055337429046631, "learning_rate": 9.685642444787823e-05, "loss": 0.3838, "step": 66000 }, { "epoch": 0.24505260660344483, "grad_norm": 1.0909887552261353, "learning_rate": 9.684615228851466e-05, "loss": 0.3619, "step": 66100 }, { "epoch": 0.24542333671933506, "grad_norm": 0.42110034823417664, "learning_rate": 9.68358639200324e-05, "loss": 0.3235, "step": 66200 }, { "epoch": 0.24579406683522528, "grad_norm": 0.7030036449432373, "learning_rate": 9.682555934599129e-05, "loss": 0.3793, "step": 66300 }, { "epoch": 0.24616479695111554, "grad_norm": 0.7912965416908264, "learning_rate": 9.681523856995681e-05, "loss": 0.357, "step": 66400 }, { "epoch": 0.24653552706700577, "grad_norm": 0.5939860343933105, "learning_rate": 9.680490159550002e-05, "loss": 0.356, "step": 66500 }, { "epoch": 0.246906257182896, "grad_norm": 0.6061980128288269, "learning_rate": 9.679454842619757e-05, "loss": 0.3479, "step": 66600 }, { "epoch": 0.24727698729878622, "grad_norm": 0.7972367405891418, "learning_rate": 9.678417906563175e-05, "loss": 0.3413, "step": 66700 }, { "epoch": 0.24764771741467648, "grad_norm": 0.7738129496574402, "learning_rate": 9.677379351739044e-05, "loss": 0.3563, "step": 66800 }, { "epoch": 0.2480184475305667, "grad_norm": 0.6912312507629395, "learning_rate": 9.676339178506713e-05, "loss": 0.3742, "step": 66900 }, { "epoch": 0.24838917764645693, "grad_norm": 0.6595388650894165, "learning_rate": 9.675297387226085e-05, "loss": 0.3153, "step": 67000 }, { "epoch": 0.24875990776234716, "grad_norm": 0.6095815300941467, "learning_rate": 9.674253978257632e-05, "loss": 0.3699, "step": 67100 }, { "epoch": 0.2491306378782374, "grad_norm": 0.8531883955001831, "learning_rate": 9.67320895196238e-05, "loss": 0.354, "step": 67200 }, { "epoch": 0.24950136799412764, "grad_norm": 0.4452097713947296, "learning_rate": 9.672162308701917e-05, "loss": 0.3759, "step": 67300 }, { "epoch": 0.24987209811001787, "grad_norm": 0.4527115523815155, "learning_rate": 9.671114048838386e-05, "loss": 0.3587, "step": 67400 }, { "epoch": 0.2502428282259081, "grad_norm": 0.29029154777526855, "learning_rate": 9.670064172734496e-05, "loss": 0.381, "step": 67500 }, { "epoch": 0.2506135583417983, "grad_norm": 0.5344186425209045, "learning_rate": 9.66901268075351e-05, "loss": 0.3651, "step": 67600 }, { "epoch": 0.25098428845768855, "grad_norm": 0.43897882103919983, "learning_rate": 9.667959573259253e-05, "loss": 0.3354, "step": 67700 }, { "epoch": 0.2513550185735788, "grad_norm": 1.213488221168518, "learning_rate": 9.66690485061611e-05, "loss": 0.3824, "step": 67800 }, { "epoch": 0.25172574868946906, "grad_norm": 0.752863883972168, "learning_rate": 9.66584851318902e-05, "loss": 0.3528, "step": 67900 }, { "epoch": 0.2520964788053593, "grad_norm": 1.4865788221359253, "learning_rate": 9.664790561343483e-05, "loss": 0.3392, "step": 68000 }, { "epoch": 0.2524672089212495, "grad_norm": 1.2769784927368164, "learning_rate": 9.663730995445562e-05, "loss": 0.3537, "step": 68100 }, { "epoch": 0.25283793903713975, "grad_norm": 0.5997257828712463, "learning_rate": 9.66266981586187e-05, "loss": 0.3687, "step": 68200 }, { "epoch": 0.25320866915303, "grad_norm": 0.7657833099365234, "learning_rate": 9.661607022959587e-05, "loss": 0.3847, "step": 68300 }, { "epoch": 0.2535793992689202, "grad_norm": 0.5288935303688049, "learning_rate": 9.660542617106446e-05, "loss": 0.3611, "step": 68400 }, { "epoch": 0.25395012938481043, "grad_norm": 0.7719181776046753, "learning_rate": 9.659476598670739e-05, "loss": 0.3466, "step": 68500 }, { "epoch": 0.25432085950070066, "grad_norm": 0.5314151048660278, "learning_rate": 9.658408968021318e-05, "loss": 0.3979, "step": 68600 }, { "epoch": 0.25469158961659094, "grad_norm": 0.4889487624168396, "learning_rate": 9.657339725527587e-05, "loss": 0.365, "step": 68700 }, { "epoch": 0.25506231973248117, "grad_norm": 0.5485001802444458, "learning_rate": 9.656268871559516e-05, "loss": 0.3405, "step": 68800 }, { "epoch": 0.2554330498483714, "grad_norm": 1.0604158639907837, "learning_rate": 9.655196406487627e-05, "loss": 0.3266, "step": 68900 }, { "epoch": 0.2558037799642616, "grad_norm": 0.5189512968063354, "learning_rate": 9.654122330683001e-05, "loss": 0.3855, "step": 69000 }, { "epoch": 0.25617451008015185, "grad_norm": 0.5871772170066833, "learning_rate": 9.653046644517278e-05, "loss": 0.3595, "step": 69100 }, { "epoch": 0.2565452401960421, "grad_norm": 0.3116289973258972, "learning_rate": 9.651969348362651e-05, "loss": 0.3719, "step": 69200 }, { "epoch": 0.2569159703119323, "grad_norm": 0.49889594316482544, "learning_rate": 9.650890442591872e-05, "loss": 0.3845, "step": 69300 }, { "epoch": 0.25728670042782253, "grad_norm": 1.1153746843338013, "learning_rate": 9.649809927578252e-05, "loss": 0.3505, "step": 69400 }, { "epoch": 0.25765743054371276, "grad_norm": 0.38753369450569153, "learning_rate": 9.648727803695659e-05, "loss": 0.3336, "step": 69500 }, { "epoch": 0.25802816065960305, "grad_norm": 0.9947963356971741, "learning_rate": 9.647644071318512e-05, "loss": 0.3474, "step": 69600 }, { "epoch": 0.2583988907754933, "grad_norm": 1.1864749193191528, "learning_rate": 9.646558730821793e-05, "loss": 0.3642, "step": 69700 }, { "epoch": 0.2587696208913835, "grad_norm": 1.6150169372558594, "learning_rate": 9.645471782581037e-05, "loss": 0.3278, "step": 69800 }, { "epoch": 0.25914035100727373, "grad_norm": 0.6177868843078613, "learning_rate": 9.644383226972338e-05, "loss": 0.3664, "step": 69900 }, { "epoch": 0.25951108112316396, "grad_norm": 0.9959515333175659, "learning_rate": 9.643293064372341e-05, "loss": 0.3341, "step": 70000 }, { "epoch": 0.2598818112390542, "grad_norm": 0.503052830696106, "learning_rate": 9.642201295158253e-05, "loss": 0.366, "step": 70100 }, { "epoch": 0.2602525413549444, "grad_norm": 0.5561308264732361, "learning_rate": 9.641107919707835e-05, "loss": 0.3271, "step": 70200 }, { "epoch": 0.26062327147083464, "grad_norm": 0.6276100277900696, "learning_rate": 9.640012938399401e-05, "loss": 0.363, "step": 70300 }, { "epoch": 0.2609940015867249, "grad_norm": 0.7223142385482788, "learning_rate": 9.638916351611822e-05, "loss": 0.3388, "step": 70400 }, { "epoch": 0.26136473170261515, "grad_norm": 0.693469226360321, "learning_rate": 9.637818159724528e-05, "loss": 0.3678, "step": 70500 }, { "epoch": 0.2617354618185054, "grad_norm": 0.9387418031692505, "learning_rate": 9.636718363117501e-05, "loss": 0.3635, "step": 70600 }, { "epoch": 0.2621061919343956, "grad_norm": 0.9423599243164062, "learning_rate": 9.635616962171276e-05, "loss": 0.353, "step": 70700 }, { "epoch": 0.26247692205028583, "grad_norm": 0.7595126628875732, "learning_rate": 9.63451395726695e-05, "loss": 0.3447, "step": 70800 }, { "epoch": 0.26284765216617606, "grad_norm": 0.7434208989143372, "learning_rate": 9.633409348786169e-05, "loss": 0.3454, "step": 70900 }, { "epoch": 0.2632183822820663, "grad_norm": 0.44860655069351196, "learning_rate": 9.632303137111136e-05, "loss": 0.3614, "step": 71000 }, { "epoch": 0.2635891123979565, "grad_norm": 0.7484694719314575, "learning_rate": 9.631195322624607e-05, "loss": 0.3613, "step": 71100 }, { "epoch": 0.26395984251384674, "grad_norm": 0.8013439774513245, "learning_rate": 9.630085905709896e-05, "loss": 0.3935, "step": 71200 }, { "epoch": 0.264330572629737, "grad_norm": 0.5944663882255554, "learning_rate": 9.62897488675087e-05, "loss": 0.3608, "step": 71300 }, { "epoch": 0.26470130274562725, "grad_norm": 0.41204744577407837, "learning_rate": 9.627862266131947e-05, "loss": 0.3267, "step": 71400 }, { "epoch": 0.2650720328615175, "grad_norm": 1.0238357782363892, "learning_rate": 9.626748044238105e-05, "loss": 0.3373, "step": 71500 }, { "epoch": 0.2654427629774077, "grad_norm": 0.4463097155094147, "learning_rate": 9.625632221454871e-05, "loss": 0.3503, "step": 71600 }, { "epoch": 0.26581349309329794, "grad_norm": 0.775566577911377, "learning_rate": 9.624514798168329e-05, "loss": 0.379, "step": 71700 }, { "epoch": 0.26618422320918816, "grad_norm": 1.3316633701324463, "learning_rate": 9.623395774765118e-05, "loss": 0.3376, "step": 71800 }, { "epoch": 0.2665549533250784, "grad_norm": 0.7643953561782837, "learning_rate": 9.622275151632424e-05, "loss": 0.3724, "step": 71900 }, { "epoch": 0.2669256834409686, "grad_norm": 0.7884383797645569, "learning_rate": 9.621152929157994e-05, "loss": 0.3437, "step": 72000 }, { "epoch": 0.2672964135568589, "grad_norm": 0.6692783832550049, "learning_rate": 9.620029107730124e-05, "loss": 0.3411, "step": 72100 }, { "epoch": 0.26766714367274913, "grad_norm": 0.9335283637046814, "learning_rate": 9.618903687737666e-05, "loss": 0.3494, "step": 72200 }, { "epoch": 0.26803787378863936, "grad_norm": 0.8560910224914551, "learning_rate": 9.617776669570022e-05, "loss": 0.3463, "step": 72300 }, { "epoch": 0.2684086039045296, "grad_norm": 0.3624962866306305, "learning_rate": 9.61664805361715e-05, "loss": 0.3324, "step": 72400 }, { "epoch": 0.2687793340204198, "grad_norm": 2.1085305213928223, "learning_rate": 9.615517840269559e-05, "loss": 0.3318, "step": 72500 }, { "epoch": 0.26915006413631004, "grad_norm": 0.6248554587364197, "learning_rate": 9.614386029918312e-05, "loss": 0.3747, "step": 72600 }, { "epoch": 0.26952079425220027, "grad_norm": 0.9493477940559387, "learning_rate": 9.613252622955023e-05, "loss": 0.3365, "step": 72700 }, { "epoch": 0.2698915243680905, "grad_norm": 0.4188714921474457, "learning_rate": 9.612117619771862e-05, "loss": 0.3787, "step": 72800 }, { "epoch": 0.2702622544839808, "grad_norm": 0.3951663374900818, "learning_rate": 9.610981020761542e-05, "loss": 0.3215, "step": 72900 }, { "epoch": 0.270632984599871, "grad_norm": 0.3296673893928528, "learning_rate": 9.60984282631734e-05, "loss": 0.3228, "step": 73000 }, { "epoch": 0.27100371471576123, "grad_norm": 0.8805108070373535, "learning_rate": 9.60870303683308e-05, "loss": 0.3554, "step": 73100 }, { "epoch": 0.27137444483165146, "grad_norm": 0.649010419845581, "learning_rate": 9.607561652703135e-05, "loss": 0.3522, "step": 73200 }, { "epoch": 0.2717451749475417, "grad_norm": 0.6383349299430847, "learning_rate": 9.606418674322435e-05, "loss": 0.3369, "step": 73300 }, { "epoch": 0.2721159050634319, "grad_norm": 0.8060072660446167, "learning_rate": 9.605274102086456e-05, "loss": 0.3846, "step": 73400 }, { "epoch": 0.27248663517932215, "grad_norm": 1.0057214498519897, "learning_rate": 9.604127936391232e-05, "loss": 0.3808, "step": 73500 }, { "epoch": 0.2728573652952124, "grad_norm": 0.51446133852005, "learning_rate": 9.602980177633343e-05, "loss": 0.3281, "step": 73600 }, { "epoch": 0.2732280954111026, "grad_norm": 0.8221081495285034, "learning_rate": 9.601830826209924e-05, "loss": 0.3705, "step": 73700 }, { "epoch": 0.2735988255269929, "grad_norm": 0.5957756638526917, "learning_rate": 9.600679882518656e-05, "loss": 0.346, "step": 73800 }, { "epoch": 0.2739695556428831, "grad_norm": 0.6864870190620422, "learning_rate": 9.599527346957774e-05, "loss": 0.3478, "step": 73900 }, { "epoch": 0.27434028575877334, "grad_norm": 0.7624675035476685, "learning_rate": 9.598373219926067e-05, "loss": 0.374, "step": 74000 }, { "epoch": 0.27471101587466357, "grad_norm": 0.3440595269203186, "learning_rate": 9.59721750182287e-05, "loss": 0.3237, "step": 74100 }, { "epoch": 0.2750817459905538, "grad_norm": 0.5018372535705566, "learning_rate": 9.596060193048072e-05, "loss": 0.3324, "step": 74200 }, { "epoch": 0.275452476106444, "grad_norm": 0.5941722393035889, "learning_rate": 9.594901294002106e-05, "loss": 0.3702, "step": 74300 }, { "epoch": 0.27582320622233425, "grad_norm": 0.655626118183136, "learning_rate": 9.593740805085965e-05, "loss": 0.3139, "step": 74400 }, { "epoch": 0.2761939363382245, "grad_norm": 0.8820034265518188, "learning_rate": 9.592578726701182e-05, "loss": 0.3598, "step": 74500 }, { "epoch": 0.27656466645411476, "grad_norm": 0.9605242609977722, "learning_rate": 9.591415059249849e-05, "loss": 0.3499, "step": 74600 }, { "epoch": 0.276935396570005, "grad_norm": 0.8249090313911438, "learning_rate": 9.5902498031346e-05, "loss": 0.3439, "step": 74700 }, { "epoch": 0.2773061266858952, "grad_norm": 0.2974480390548706, "learning_rate": 9.589082958758625e-05, "loss": 0.3613, "step": 74800 }, { "epoch": 0.27767685680178544, "grad_norm": 0.9232938885688782, "learning_rate": 9.587914526525659e-05, "loss": 0.3497, "step": 74900 }, { "epoch": 0.27804758691767567, "grad_norm": 0.8066462874412537, "learning_rate": 9.586744506839991e-05, "loss": 0.3351, "step": 75000 }, { "epoch": 0.2784183170335659, "grad_norm": 1.019874095916748, "learning_rate": 9.585572900106455e-05, "loss": 0.3605, "step": 75100 }, { "epoch": 0.2787890471494561, "grad_norm": 0.4863557517528534, "learning_rate": 9.584399706730434e-05, "loss": 0.3601, "step": 75200 }, { "epoch": 0.27915977726534635, "grad_norm": 0.9166316986083984, "learning_rate": 9.583224927117862e-05, "loss": 0.3343, "step": 75300 }, { "epoch": 0.2795305073812366, "grad_norm": 1.0016956329345703, "learning_rate": 9.582048561675226e-05, "loss": 0.3717, "step": 75400 }, { "epoch": 0.27990123749712686, "grad_norm": 0.2846522629261017, "learning_rate": 9.580870610809551e-05, "loss": 0.336, "step": 75500 }, { "epoch": 0.2802719676130171, "grad_norm": 0.6594480276107788, "learning_rate": 9.57969107492842e-05, "loss": 0.345, "step": 75600 }, { "epoch": 0.2806426977289073, "grad_norm": 0.5041579604148865, "learning_rate": 9.578509954439962e-05, "loss": 0.3741, "step": 75700 }, { "epoch": 0.28101342784479755, "grad_norm": 0.718885600566864, "learning_rate": 9.577327249752852e-05, "loss": 0.3612, "step": 75800 }, { "epoch": 0.2813841579606878, "grad_norm": 0.3689005970954895, "learning_rate": 9.576142961276315e-05, "loss": 0.3304, "step": 75900 }, { "epoch": 0.281754888076578, "grad_norm": 0.4837969243526459, "learning_rate": 9.574957089420125e-05, "loss": 0.3784, "step": 76000 }, { "epoch": 0.28212561819246823, "grad_norm": 1.0212740898132324, "learning_rate": 9.5737696345946e-05, "loss": 0.3351, "step": 76100 }, { "epoch": 0.28249634830835846, "grad_norm": 0.663528323173523, "learning_rate": 9.57258059721061e-05, "loss": 0.3291, "step": 76200 }, { "epoch": 0.28286707842424874, "grad_norm": 0.5245704650878906, "learning_rate": 9.57138997767957e-05, "loss": 0.3545, "step": 76300 }, { "epoch": 0.28323780854013897, "grad_norm": 1.386476755142212, "learning_rate": 9.570197776413443e-05, "loss": 0.3529, "step": 76400 }, { "epoch": 0.2836085386560292, "grad_norm": 0.5430529713630676, "learning_rate": 9.569003993824741e-05, "loss": 0.335, "step": 76500 }, { "epoch": 0.2839792687719194, "grad_norm": 0.4308852553367615, "learning_rate": 9.567808630326521e-05, "loss": 0.3722, "step": 76600 }, { "epoch": 0.28434999888780965, "grad_norm": 0.8661644458770752, "learning_rate": 9.566611686332387e-05, "loss": 0.3218, "step": 76700 }, { "epoch": 0.2847207290036999, "grad_norm": 1.0365442037582397, "learning_rate": 9.565413162256491e-05, "loss": 0.3536, "step": 76800 }, { "epoch": 0.2850914591195901, "grad_norm": 0.6220062971115112, "learning_rate": 9.564213058513532e-05, "loss": 0.344, "step": 76900 }, { "epoch": 0.28546218923548033, "grad_norm": 0.3636418581008911, "learning_rate": 9.563011375518754e-05, "loss": 0.3399, "step": 77000 }, { "epoch": 0.28583291935137056, "grad_norm": 0.6465820074081421, "learning_rate": 9.561808113687947e-05, "loss": 0.3422, "step": 77100 }, { "epoch": 0.28620364946726085, "grad_norm": 1.054902195930481, "learning_rate": 9.560603273437452e-05, "loss": 0.3438, "step": 77200 }, { "epoch": 0.2865743795831511, "grad_norm": 0.9905043244361877, "learning_rate": 9.55939685518415e-05, "loss": 0.3263, "step": 77300 }, { "epoch": 0.2869451096990413, "grad_norm": 0.8655000925064087, "learning_rate": 9.558188859345471e-05, "loss": 0.347, "step": 77400 }, { "epoch": 0.28731583981493153, "grad_norm": 0.4352448582649231, "learning_rate": 9.556979286339394e-05, "loss": 0.3442, "step": 77500 }, { "epoch": 0.28768656993082176, "grad_norm": 0.7868867516517639, "learning_rate": 9.555768136584435e-05, "loss": 0.3587, "step": 77600 }, { "epoch": 0.288057300046712, "grad_norm": 1.4708112478256226, "learning_rate": 9.554555410499664e-05, "loss": 0.3527, "step": 77700 }, { "epoch": 0.2884280301626022, "grad_norm": 0.7180978655815125, "learning_rate": 9.553341108504693e-05, "loss": 0.3501, "step": 77800 }, { "epoch": 0.28879876027849244, "grad_norm": 0.892936110496521, "learning_rate": 9.552125231019678e-05, "loss": 0.3602, "step": 77900 }, { "epoch": 0.2891694903943827, "grad_norm": 0.7871927618980408, "learning_rate": 9.550907778465324e-05, "loss": 0.3602, "step": 78000 }, { "epoch": 0.28954022051027295, "grad_norm": 0.3739321529865265, "learning_rate": 9.549688751262875e-05, "loss": 0.3334, "step": 78100 }, { "epoch": 0.2899109506261632, "grad_norm": 1.0137110948562622, "learning_rate": 9.548468149834129e-05, "loss": 0.3919, "step": 78200 }, { "epoch": 0.2902816807420534, "grad_norm": 0.411663293838501, "learning_rate": 9.54724597460142e-05, "loss": 0.3654, "step": 78300 }, { "epoch": 0.29065241085794363, "grad_norm": 0.728925883769989, "learning_rate": 9.546022225987629e-05, "loss": 0.3593, "step": 78400 }, { "epoch": 0.29102314097383386, "grad_norm": 0.4923003017902374, "learning_rate": 9.544796904416182e-05, "loss": 0.3638, "step": 78500 }, { "epoch": 0.2913938710897241, "grad_norm": 0.46122556924819946, "learning_rate": 9.543570010311053e-05, "loss": 0.3484, "step": 78600 }, { "epoch": 0.2917646012056143, "grad_norm": 0.928170919418335, "learning_rate": 9.542341544096754e-05, "loss": 0.332, "step": 78700 }, { "epoch": 0.2921353313215046, "grad_norm": 0.8534809350967407, "learning_rate": 9.541111506198343e-05, "loss": 0.3828, "step": 78800 }, { "epoch": 0.2925060614373948, "grad_norm": 0.5449038147926331, "learning_rate": 9.539879897041421e-05, "loss": 0.3573, "step": 78900 }, { "epoch": 0.29287679155328505, "grad_norm": 0.44897860288619995, "learning_rate": 9.538646717052138e-05, "loss": 0.3612, "step": 79000 }, { "epoch": 0.2932475216691753, "grad_norm": 0.6319025158882141, "learning_rate": 9.537411966657182e-05, "loss": 0.3764, "step": 79100 }, { "epoch": 0.2936182517850655, "grad_norm": 0.7926188111305237, "learning_rate": 9.536175646283784e-05, "loss": 0.3672, "step": 79200 }, { "epoch": 0.29398898190095574, "grad_norm": 0.31281498074531555, "learning_rate": 9.534937756359723e-05, "loss": 0.3291, "step": 79300 }, { "epoch": 0.29435971201684596, "grad_norm": 0.5771576762199402, "learning_rate": 9.533698297313314e-05, "loss": 0.345, "step": 79400 }, { "epoch": 0.2947304421327362, "grad_norm": 0.3008382320404053, "learning_rate": 9.532457269573423e-05, "loss": 0.3393, "step": 79500 }, { "epoch": 0.2951011722486264, "grad_norm": 0.6382130980491638, "learning_rate": 9.531214673569453e-05, "loss": 0.3463, "step": 79600 }, { "epoch": 0.2954719023645167, "grad_norm": 0.38460248708724976, "learning_rate": 9.529970509731353e-05, "loss": 0.3402, "step": 79700 }, { "epoch": 0.29584263248040693, "grad_norm": 0.6337429881095886, "learning_rate": 9.52872477848961e-05, "loss": 0.3708, "step": 79800 }, { "epoch": 0.29621336259629716, "grad_norm": 0.31621912121772766, "learning_rate": 9.52747748027526e-05, "loss": 0.3408, "step": 79900 }, { "epoch": 0.2965840927121874, "grad_norm": 0.7498133778572083, "learning_rate": 9.526228615519876e-05, "loss": 0.3561, "step": 80000 }, { "epoch": 0.2969548228280776, "grad_norm": 0.6927025318145752, "learning_rate": 9.524978184655573e-05, "loss": 0.3245, "step": 80100 }, { "epoch": 0.29732555294396784, "grad_norm": 0.7404581904411316, "learning_rate": 9.523726188115012e-05, "loss": 0.3505, "step": 80200 }, { "epoch": 0.29769628305985807, "grad_norm": 0.5153583288192749, "learning_rate": 9.522472626331391e-05, "loss": 0.3835, "step": 80300 }, { "epoch": 0.2980670131757483, "grad_norm": 0.6740410327911377, "learning_rate": 9.521217499738453e-05, "loss": 0.3513, "step": 80400 }, { "epoch": 0.2984377432916386, "grad_norm": 0.6466183066368103, "learning_rate": 9.51996080877048e-05, "loss": 0.3629, "step": 80500 }, { "epoch": 0.2988084734075288, "grad_norm": 0.7051879167556763, "learning_rate": 9.518702553862297e-05, "loss": 0.3347, "step": 80600 }, { "epoch": 0.29917920352341904, "grad_norm": 1.102298378944397, "learning_rate": 9.517442735449272e-05, "loss": 0.3531, "step": 80700 }, { "epoch": 0.29954993363930926, "grad_norm": 0.6700214743614197, "learning_rate": 9.516181353967308e-05, "loss": 0.3389, "step": 80800 }, { "epoch": 0.2999206637551995, "grad_norm": 0.7355265021324158, "learning_rate": 9.514918409852854e-05, "loss": 0.3409, "step": 80900 }, { "epoch": 0.3002913938710897, "grad_norm": 0.4035002887248993, "learning_rate": 9.513653903542898e-05, "loss": 0.3283, "step": 81000 }, { "epoch": 0.30066212398697995, "grad_norm": 0.6572360396385193, "learning_rate": 9.51238783547497e-05, "loss": 0.3483, "step": 81100 }, { "epoch": 0.3010328541028702, "grad_norm": 0.6741189360618591, "learning_rate": 9.511120206087135e-05, "loss": 0.3466, "step": 81200 }, { "epoch": 0.3014035842187604, "grad_norm": 0.6768749356269836, "learning_rate": 9.509851015818005e-05, "loss": 0.3498, "step": 81300 }, { "epoch": 0.3017743143346507, "grad_norm": 0.8793819546699524, "learning_rate": 9.508580265106733e-05, "loss": 0.3661, "step": 81400 }, { "epoch": 0.3021450444505409, "grad_norm": 1.2207599878311157, "learning_rate": 9.507307954393e-05, "loss": 0.3562, "step": 81500 }, { "epoch": 0.30251577456643114, "grad_norm": 0.2393585443496704, "learning_rate": 9.506034084117042e-05, "loss": 0.3458, "step": 81600 }, { "epoch": 0.30288650468232137, "grad_norm": 0.7078492045402527, "learning_rate": 9.504758654719623e-05, "loss": 0.3223, "step": 81700 }, { "epoch": 0.3032572347982116, "grad_norm": 0.7494010925292969, "learning_rate": 9.503481666642055e-05, "loss": 0.3407, "step": 81800 }, { "epoch": 0.3036279649141018, "grad_norm": 0.9579745531082153, "learning_rate": 9.502203120326183e-05, "loss": 0.3408, "step": 81900 }, { "epoch": 0.30399869502999205, "grad_norm": 0.3516921401023865, "learning_rate": 9.500923016214394e-05, "loss": 0.3601, "step": 82000 }, { "epoch": 0.3043694251458823, "grad_norm": 0.2891921401023865, "learning_rate": 9.499641354749613e-05, "loss": 0.3283, "step": 82100 }, { "epoch": 0.30474015526177256, "grad_norm": 0.8769057989120483, "learning_rate": 9.498358136375306e-05, "loss": 0.3436, "step": 82200 }, { "epoch": 0.3051108853776628, "grad_norm": 0.5751267075538635, "learning_rate": 9.497073361535475e-05, "loss": 0.3602, "step": 82300 }, { "epoch": 0.305481615493553, "grad_norm": 0.4647534489631653, "learning_rate": 9.495787030674662e-05, "loss": 0.3651, "step": 82400 }, { "epoch": 0.30585234560944324, "grad_norm": 0.6700929403305054, "learning_rate": 9.494499144237947e-05, "loss": 0.3906, "step": 82500 }, { "epoch": 0.30622307572533347, "grad_norm": 0.5064874887466431, "learning_rate": 9.49320970267095e-05, "loss": 0.3585, "step": 82600 }, { "epoch": 0.3065938058412237, "grad_norm": 1.0749011039733887, "learning_rate": 9.491918706419824e-05, "loss": 0.3657, "step": 82700 }, { "epoch": 0.3069645359571139, "grad_norm": 0.9154279828071594, "learning_rate": 9.490626155931266e-05, "loss": 0.3814, "step": 82800 }, { "epoch": 0.30733526607300415, "grad_norm": 0.792879581451416, "learning_rate": 9.489332051652506e-05, "loss": 0.3351, "step": 82900 }, { "epoch": 0.3077059961888944, "grad_norm": 0.7561848759651184, "learning_rate": 9.488036394031316e-05, "loss": 0.3511, "step": 83000 }, { "epoch": 0.30807672630478466, "grad_norm": 1.4005093574523926, "learning_rate": 9.486739183516003e-05, "loss": 0.3346, "step": 83100 }, { "epoch": 0.3084474564206749, "grad_norm": 0.9498708844184875, "learning_rate": 9.48544042055541e-05, "loss": 0.3465, "step": 83200 }, { "epoch": 0.3088181865365651, "grad_norm": 0.6331781148910522, "learning_rate": 9.48414010559892e-05, "loss": 0.3264, "step": 83300 }, { "epoch": 0.30918891665245535, "grad_norm": 0.33727601170539856, "learning_rate": 9.482838239096451e-05, "loss": 0.3734, "step": 83400 }, { "epoch": 0.3095596467683456, "grad_norm": 0.3161174952983856, "learning_rate": 9.481534821498459e-05, "loss": 0.3502, "step": 83500 }, { "epoch": 0.3099303768842358, "grad_norm": 0.8974553942680359, "learning_rate": 9.480229853255935e-05, "loss": 0.3367, "step": 83600 }, { "epoch": 0.31030110700012603, "grad_norm": 0.5984868407249451, "learning_rate": 9.47892333482041e-05, "loss": 0.323, "step": 83700 }, { "epoch": 0.31067183711601626, "grad_norm": 0.9366351962089539, "learning_rate": 9.477615266643947e-05, "loss": 0.3681, "step": 83800 }, { "epoch": 0.31104256723190654, "grad_norm": 0.8739509582519531, "learning_rate": 9.47630564917915e-05, "loss": 0.3563, "step": 83900 }, { "epoch": 0.31141329734779677, "grad_norm": 0.7719703316688538, "learning_rate": 9.474994482879151e-05, "loss": 0.3224, "step": 84000 }, { "epoch": 0.311784027463687, "grad_norm": 0.8346322178840637, "learning_rate": 9.47368176819763e-05, "loss": 0.346, "step": 84100 }, { "epoch": 0.3121547575795772, "grad_norm": 0.4350517988204956, "learning_rate": 9.472367505588792e-05, "loss": 0.3353, "step": 84200 }, { "epoch": 0.31252548769546745, "grad_norm": 0.9867724180221558, "learning_rate": 9.471051695507382e-05, "loss": 0.3727, "step": 84300 }, { "epoch": 0.3128962178113577, "grad_norm": 1.2590768337249756, "learning_rate": 9.469734338408682e-05, "loss": 0.347, "step": 84400 }, { "epoch": 0.3132669479272479, "grad_norm": 0.6027205586433411, "learning_rate": 9.468415434748506e-05, "loss": 0.322, "step": 84500 }, { "epoch": 0.31363767804313814, "grad_norm": 0.4864099323749542, "learning_rate": 9.467094984983205e-05, "loss": 0.3373, "step": 84600 }, { "epoch": 0.3140084081590284, "grad_norm": 0.7066247463226318, "learning_rate": 9.465772989569663e-05, "loss": 0.3564, "step": 84700 }, { "epoch": 0.31437913827491865, "grad_norm": 0.7795847654342651, "learning_rate": 9.464449448965305e-05, "loss": 0.3443, "step": 84800 }, { "epoch": 0.3147498683908089, "grad_norm": 0.6811952590942383, "learning_rate": 9.463124363628078e-05, "loss": 0.3412, "step": 84900 }, { "epoch": 0.3151205985066991, "grad_norm": 0.4456767141819, "learning_rate": 9.461797734016479e-05, "loss": 0.353, "step": 85000 }, { "epoch": 0.31549132862258933, "grad_norm": 0.8719674944877625, "learning_rate": 9.460469560589528e-05, "loss": 0.3394, "step": 85100 }, { "epoch": 0.31586205873847956, "grad_norm": 0.7343171834945679, "learning_rate": 9.459139843806783e-05, "loss": 0.3459, "step": 85200 }, { "epoch": 0.3162327888543698, "grad_norm": 1.9570255279541016, "learning_rate": 9.457808584128337e-05, "loss": 0.3506, "step": 85300 }, { "epoch": 0.31660351897026, "grad_norm": 0.36517420411109924, "learning_rate": 9.456475782014816e-05, "loss": 0.3413, "step": 85400 }, { "epoch": 0.31697424908615024, "grad_norm": 1.1147810220718384, "learning_rate": 9.455141437927377e-05, "loss": 0.3484, "step": 85500 }, { "epoch": 0.3173449792020405, "grad_norm": 0.5239797234535217, "learning_rate": 9.453805552327717e-05, "loss": 0.3583, "step": 85600 }, { "epoch": 0.31771570931793075, "grad_norm": 0.5234938859939575, "learning_rate": 9.452468125678058e-05, "loss": 0.3569, "step": 85700 }, { "epoch": 0.318086439433821, "grad_norm": 0.731163740158081, "learning_rate": 9.451129158441164e-05, "loss": 0.373, "step": 85800 }, { "epoch": 0.3184571695497112, "grad_norm": 0.8244116306304932, "learning_rate": 9.449788651080324e-05, "loss": 0.3536, "step": 85900 }, { "epoch": 0.31882789966560143, "grad_norm": 0.5985147953033447, "learning_rate": 9.448446604059366e-05, "loss": 0.3148, "step": 86000 }, { "epoch": 0.31919862978149166, "grad_norm": 0.5850010514259338, "learning_rate": 9.447103017842647e-05, "loss": 0.3294, "step": 86100 }, { "epoch": 0.3195693598973819, "grad_norm": 0.32781678438186646, "learning_rate": 9.445757892895059e-05, "loss": 0.3299, "step": 86200 }, { "epoch": 0.3199400900132721, "grad_norm": 0.6766141057014465, "learning_rate": 9.444411229682025e-05, "loss": 0.3402, "step": 86300 }, { "epoch": 0.3203108201291624, "grad_norm": 0.7660307288169861, "learning_rate": 9.443063028669498e-05, "loss": 0.3562, "step": 86400 }, { "epoch": 0.3206815502450526, "grad_norm": 0.6381827592849731, "learning_rate": 9.441713290323968e-05, "loss": 0.3091, "step": 86500 }, { "epoch": 0.32105228036094285, "grad_norm": 0.5372316241264343, "learning_rate": 9.440362015112456e-05, "loss": 0.3415, "step": 86600 }, { "epoch": 0.3214230104768331, "grad_norm": 0.47964444756507874, "learning_rate": 9.43900920350251e-05, "loss": 0.3368, "step": 86700 }, { "epoch": 0.3217937405927233, "grad_norm": 0.5521647930145264, "learning_rate": 9.437654855962214e-05, "loss": 0.3521, "step": 86800 }, { "epoch": 0.32216447070861354, "grad_norm": 1.2784754037857056, "learning_rate": 9.436298972960185e-05, "loss": 0.3114, "step": 86900 }, { "epoch": 0.32253520082450376, "grad_norm": 0.7403432130813599, "learning_rate": 9.434941554965567e-05, "loss": 0.3765, "step": 87000 }, { "epoch": 0.322905930940394, "grad_norm": 0.35429203510284424, "learning_rate": 9.433582602448034e-05, "loss": 0.3451, "step": 87100 }, { "epoch": 0.3232766610562842, "grad_norm": 0.830359935760498, "learning_rate": 9.432222115877796e-05, "loss": 0.3611, "step": 87200 }, { "epoch": 0.3236473911721745, "grad_norm": 0.7524223327636719, "learning_rate": 9.430860095725593e-05, "loss": 0.3136, "step": 87300 }, { "epoch": 0.32401812128806473, "grad_norm": 0.5921102166175842, "learning_rate": 9.429496542462691e-05, "loss": 0.3364, "step": 87400 }, { "epoch": 0.32438885140395496, "grad_norm": 0.32605230808258057, "learning_rate": 9.428131456560894e-05, "loss": 0.3507, "step": 87500 }, { "epoch": 0.3247595815198452, "grad_norm": 1.3099547624588013, "learning_rate": 9.426764838492526e-05, "loss": 0.3607, "step": 87600 }, { "epoch": 0.3251303116357354, "grad_norm": 0.5710757374763489, "learning_rate": 9.425396688730452e-05, "loss": 0.3411, "step": 87700 }, { "epoch": 0.32550104175162564, "grad_norm": 0.409616082906723, "learning_rate": 9.42402700774806e-05, "loss": 0.3252, "step": 87800 }, { "epoch": 0.32587177186751587, "grad_norm": 0.4961477220058441, "learning_rate": 9.422655796019271e-05, "loss": 0.3378, "step": 87900 }, { "epoch": 0.3262425019834061, "grad_norm": 0.9957465529441833, "learning_rate": 9.421283054018533e-05, "loss": 0.3473, "step": 88000 }, { "epoch": 0.3266132320992964, "grad_norm": 0.6207696795463562, "learning_rate": 9.419908782220826e-05, "loss": 0.3214, "step": 88100 }, { "epoch": 0.3269839622151866, "grad_norm": 0.6447293758392334, "learning_rate": 9.418532981101657e-05, "loss": 0.3678, "step": 88200 }, { "epoch": 0.32735469233107684, "grad_norm": 0.5210201740264893, "learning_rate": 9.417155651137064e-05, "loss": 0.3728, "step": 88300 }, { "epoch": 0.32772542244696706, "grad_norm": 0.3636564612388611, "learning_rate": 9.415776792803616e-05, "loss": 0.3446, "step": 88400 }, { "epoch": 0.3280961525628573, "grad_norm": 1.2763452529907227, "learning_rate": 9.414396406578405e-05, "loss": 0.3371, "step": 88500 }, { "epoch": 0.3284668826787475, "grad_norm": 0.6253642439842224, "learning_rate": 9.413014492939056e-05, "loss": 0.3678, "step": 88600 }, { "epoch": 0.32883761279463775, "grad_norm": 0.5380815267562866, "learning_rate": 9.41163105236372e-05, "loss": 0.3347, "step": 88700 }, { "epoch": 0.329208342910528, "grad_norm": 0.6819657683372498, "learning_rate": 9.410246085331082e-05, "loss": 0.335, "step": 88800 }, { "epoch": 0.3295790730264182, "grad_norm": 0.732197105884552, "learning_rate": 9.408859592320348e-05, "loss": 0.3472, "step": 88900 }, { "epoch": 0.3299498031423085, "grad_norm": 0.5149137377738953, "learning_rate": 9.407471573811256e-05, "loss": 0.3247, "step": 89000 }, { "epoch": 0.3303205332581987, "grad_norm": 0.4090103805065155, "learning_rate": 9.406082030284067e-05, "loss": 0.3583, "step": 89100 }, { "epoch": 0.33069126337408894, "grad_norm": 0.8114193677902222, "learning_rate": 9.404690962219579e-05, "loss": 0.3109, "step": 89200 }, { "epoch": 0.33106199348997917, "grad_norm": 0.7784156799316406, "learning_rate": 9.403298370099108e-05, "loss": 0.3408, "step": 89300 }, { "epoch": 0.3314327236058694, "grad_norm": 1.0750620365142822, "learning_rate": 9.401904254404504e-05, "loss": 0.3535, "step": 89400 }, { "epoch": 0.3318034537217596, "grad_norm": 0.5734215974807739, "learning_rate": 9.400508615618137e-05, "loss": 0.3663, "step": 89500 }, { "epoch": 0.33217418383764985, "grad_norm": 0.251658171415329, "learning_rate": 9.399111454222915e-05, "loss": 0.3482, "step": 89600 }, { "epoch": 0.3325449139535401, "grad_norm": 0.3379223048686981, "learning_rate": 9.397712770702262e-05, "loss": 0.3191, "step": 89700 }, { "epoch": 0.33291564406943036, "grad_norm": 0.5715618133544922, "learning_rate": 9.396312565540133e-05, "loss": 0.3375, "step": 89800 }, { "epoch": 0.3332863741853206, "grad_norm": 0.7891431450843811, "learning_rate": 9.394910839221012e-05, "loss": 0.3228, "step": 89900 }, { "epoch": 0.3336571043012108, "grad_norm": 0.8558475375175476, "learning_rate": 9.393507592229902e-05, "loss": 0.3534, "step": 90000 }, { "epoch": 0.33402783441710104, "grad_norm": 1.1853523254394531, "learning_rate": 9.392102825052342e-05, "loss": 0.3333, "step": 90100 }, { "epoch": 0.33439856453299127, "grad_norm": 0.8322643041610718, "learning_rate": 9.390696538174391e-05, "loss": 0.3822, "step": 90200 }, { "epoch": 0.3347692946488815, "grad_norm": 1.0044609308242798, "learning_rate": 9.38928873208263e-05, "loss": 0.3337, "step": 90300 }, { "epoch": 0.3351400247647717, "grad_norm": 0.713130533695221, "learning_rate": 9.387879407264177e-05, "loss": 0.3446, "step": 90400 }, { "epoch": 0.33551075488066195, "grad_norm": 0.5549774169921875, "learning_rate": 9.386468564206666e-05, "loss": 0.3236, "step": 90500 }, { "epoch": 0.33588148499655224, "grad_norm": 0.5690083503723145, "learning_rate": 9.385056203398259e-05, "loss": 0.3337, "step": 90600 }, { "epoch": 0.33625221511244247, "grad_norm": 0.5513113737106323, "learning_rate": 9.383642325327642e-05, "loss": 0.3475, "step": 90700 }, { "epoch": 0.3366229452283327, "grad_norm": 1.2766674757003784, "learning_rate": 9.38222693048403e-05, "loss": 0.3307, "step": 90800 }, { "epoch": 0.3369936753442229, "grad_norm": 0.5249356031417847, "learning_rate": 9.38081001935716e-05, "loss": 0.3233, "step": 90900 }, { "epoch": 0.33736440546011315, "grad_norm": 0.6492719054222107, "learning_rate": 9.379391592437293e-05, "loss": 0.3165, "step": 91000 }, { "epoch": 0.3377351355760034, "grad_norm": 1.2065249681472778, "learning_rate": 9.377971650215215e-05, "loss": 0.3283, "step": 91100 }, { "epoch": 0.3381058656918936, "grad_norm": 0.45945852994918823, "learning_rate": 9.376550193182237e-05, "loss": 0.3163, "step": 91200 }, { "epoch": 0.33847659580778383, "grad_norm": 0.9408405423164368, "learning_rate": 9.375127221830193e-05, "loss": 0.3714, "step": 91300 }, { "epoch": 0.33884732592367406, "grad_norm": 0.5192344188690186, "learning_rate": 9.373702736651442e-05, "loss": 0.3319, "step": 91400 }, { "epoch": 0.33921805603956434, "grad_norm": 0.4743747115135193, "learning_rate": 9.37227673813887e-05, "loss": 0.3755, "step": 91500 }, { "epoch": 0.33958878615545457, "grad_norm": 0.5763564705848694, "learning_rate": 9.370849226785876e-05, "loss": 0.341, "step": 91600 }, { "epoch": 0.3399595162713448, "grad_norm": 0.9384293556213379, "learning_rate": 9.369420203086396e-05, "loss": 0.3379, "step": 91700 }, { "epoch": 0.340330246387235, "grad_norm": 0.7881439328193665, "learning_rate": 9.36798966753488e-05, "loss": 0.3423, "step": 91800 }, { "epoch": 0.34070097650312525, "grad_norm": 0.521366536617279, "learning_rate": 9.366557620626306e-05, "loss": 0.3537, "step": 91900 }, { "epoch": 0.3410717066190155, "grad_norm": 1.3091648817062378, "learning_rate": 9.36512406285617e-05, "loss": 0.314, "step": 92000 }, { "epoch": 0.3414424367349057, "grad_norm": 0.5936567187309265, "learning_rate": 9.363688994720496e-05, "loss": 0.3262, "step": 92100 }, { "epoch": 0.34181316685079594, "grad_norm": 0.4979558289051056, "learning_rate": 9.362252416715827e-05, "loss": 0.3549, "step": 92200 }, { "epoch": 0.3421838969666862, "grad_norm": 0.6515896916389465, "learning_rate": 9.360814329339229e-05, "loss": 0.3173, "step": 92300 }, { "epoch": 0.34255462708257645, "grad_norm": 0.6886466145515442, "learning_rate": 9.359374733088292e-05, "loss": 0.3392, "step": 92400 }, { "epoch": 0.3429253571984667, "grad_norm": 0.6698784828186035, "learning_rate": 9.35793362846113e-05, "loss": 0.3567, "step": 92500 }, { "epoch": 0.3432960873143569, "grad_norm": 0.45944657921791077, "learning_rate": 9.356491015956372e-05, "loss": 0.3367, "step": 92600 }, { "epoch": 0.34366681743024713, "grad_norm": 0.2257796823978424, "learning_rate": 9.355046896073174e-05, "loss": 0.3072, "step": 92700 }, { "epoch": 0.34403754754613736, "grad_norm": 0.7996974587440491, "learning_rate": 9.353601269311211e-05, "loss": 0.3559, "step": 92800 }, { "epoch": 0.3444082776620276, "grad_norm": 0.4967000484466553, "learning_rate": 9.352154136170683e-05, "loss": 0.3354, "step": 92900 }, { "epoch": 0.3447790077779178, "grad_norm": 2.139761209487915, "learning_rate": 9.350705497152307e-05, "loss": 0.3532, "step": 93000 }, { "epoch": 0.34514973789380804, "grad_norm": 0.7974853515625, "learning_rate": 9.349255352757324e-05, "loss": 0.3517, "step": 93100 }, { "epoch": 0.3455204680096983, "grad_norm": 0.8458948135375977, "learning_rate": 9.347803703487495e-05, "loss": 0.3536, "step": 93200 }, { "epoch": 0.34589119812558855, "grad_norm": 1.1717087030410767, "learning_rate": 9.346350549845101e-05, "loss": 0.3276, "step": 93300 }, { "epoch": 0.3462619282414788, "grad_norm": 0.8090676069259644, "learning_rate": 9.344895892332943e-05, "loss": 0.3347, "step": 93400 }, { "epoch": 0.346632658357369, "grad_norm": 0.6055586934089661, "learning_rate": 9.343439731454345e-05, "loss": 0.3322, "step": 93500 }, { "epoch": 0.34700338847325923, "grad_norm": 0.6283841729164124, "learning_rate": 9.341982067713148e-05, "loss": 0.3294, "step": 93600 }, { "epoch": 0.34737411858914946, "grad_norm": 0.4273974597454071, "learning_rate": 9.340522901613717e-05, "loss": 0.3577, "step": 93700 }, { "epoch": 0.3477448487050397, "grad_norm": 0.8069254159927368, "learning_rate": 9.339062233660931e-05, "loss": 0.3444, "step": 93800 }, { "epoch": 0.3481155788209299, "grad_norm": 0.7318072319030762, "learning_rate": 9.337600064360195e-05, "loss": 0.334, "step": 93900 }, { "epoch": 0.3484863089368202, "grad_norm": 0.9682247042655945, "learning_rate": 9.336136394217428e-05, "loss": 0.382, "step": 94000 }, { "epoch": 0.3488570390527104, "grad_norm": 0.5984449982643127, "learning_rate": 9.334671223739073e-05, "loss": 0.325, "step": 94100 }, { "epoch": 0.34922776916860065, "grad_norm": 0.9897327423095703, "learning_rate": 9.33320455343209e-05, "loss": 0.3187, "step": 94200 }, { "epoch": 0.3495984992844909, "grad_norm": 0.8240169286727905, "learning_rate": 9.331736383803955e-05, "loss": 0.3332, "step": 94300 }, { "epoch": 0.3499692294003811, "grad_norm": 0.5817030668258667, "learning_rate": 9.330266715362669e-05, "loss": 0.3256, "step": 94400 }, { "epoch": 0.35033995951627134, "grad_norm": 0.49510458111763, "learning_rate": 9.328795548616747e-05, "loss": 0.3457, "step": 94500 }, { "epoch": 0.35071068963216157, "grad_norm": 0.6322345733642578, "learning_rate": 9.327322884075223e-05, "loss": 0.3274, "step": 94600 }, { "epoch": 0.3510814197480518, "grad_norm": 0.49303868412971497, "learning_rate": 9.32584872224765e-05, "loss": 0.361, "step": 94700 }, { "epoch": 0.351452149863942, "grad_norm": 0.6157641410827637, "learning_rate": 9.3243730636441e-05, "loss": 0.3117, "step": 94800 }, { "epoch": 0.3518228799798323, "grad_norm": 0.5318772792816162, "learning_rate": 9.32289590877516e-05, "loss": 0.3603, "step": 94900 }, { "epoch": 0.35219361009572253, "grad_norm": 0.3497699499130249, "learning_rate": 9.32141725815194e-05, "loss": 0.3565, "step": 95000 }, { "epoch": 0.35256434021161276, "grad_norm": 0.7721067070960999, "learning_rate": 9.31993711228606e-05, "loss": 0.3447, "step": 95100 }, { "epoch": 0.352935070327503, "grad_norm": 0.7092555165290833, "learning_rate": 9.318455471689667e-05, "loss": 0.3295, "step": 95200 }, { "epoch": 0.3533058004433932, "grad_norm": 0.4105045795440674, "learning_rate": 9.316972336875414e-05, "loss": 0.3551, "step": 95300 }, { "epoch": 0.35367653055928344, "grad_norm": 0.7077154517173767, "learning_rate": 9.315487708356479e-05, "loss": 0.3439, "step": 95400 }, { "epoch": 0.35404726067517367, "grad_norm": 0.6228048205375671, "learning_rate": 9.314001586646555e-05, "loss": 0.3613, "step": 95500 }, { "epoch": 0.3544179907910639, "grad_norm": 0.20358939468860626, "learning_rate": 9.312513972259851e-05, "loss": 0.3313, "step": 95600 }, { "epoch": 0.3547887209069542, "grad_norm": 0.36436960101127625, "learning_rate": 9.311024865711093e-05, "loss": 0.327, "step": 95700 }, { "epoch": 0.3551594510228444, "grad_norm": 0.43325650691986084, "learning_rate": 9.309534267515522e-05, "loss": 0.3269, "step": 95800 }, { "epoch": 0.35553018113873464, "grad_norm": 0.47999343276023865, "learning_rate": 9.308042178188898e-05, "loss": 0.3385, "step": 95900 }, { "epoch": 0.35590091125462486, "grad_norm": 0.8582195043563843, "learning_rate": 9.306548598247492e-05, "loss": 0.3721, "step": 96000 }, { "epoch": 0.3562716413705151, "grad_norm": 0.3283604085445404, "learning_rate": 9.305053528208096e-05, "loss": 0.3594, "step": 96100 }, { "epoch": 0.3566423714864053, "grad_norm": 0.588557243347168, "learning_rate": 9.303556968588017e-05, "loss": 0.3107, "step": 96200 }, { "epoch": 0.35701310160229555, "grad_norm": 0.8220109343528748, "learning_rate": 9.30205891990507e-05, "loss": 0.3168, "step": 96300 }, { "epoch": 0.3573838317181858, "grad_norm": 0.8434810042381287, "learning_rate": 9.300559382677598e-05, "loss": 0.3203, "step": 96400 }, { "epoch": 0.35775456183407606, "grad_norm": 1.4650628566741943, "learning_rate": 9.299058357424449e-05, "loss": 0.3585, "step": 96500 }, { "epoch": 0.3581252919499663, "grad_norm": 0.6643173694610596, "learning_rate": 9.297555844664987e-05, "loss": 0.3578, "step": 96600 }, { "epoch": 0.3584960220658565, "grad_norm": 0.4747444689273834, "learning_rate": 9.296051844919093e-05, "loss": 0.3466, "step": 96700 }, { "epoch": 0.35886675218174674, "grad_norm": 1.0474804639816284, "learning_rate": 9.294546358707168e-05, "loss": 0.3358, "step": 96800 }, { "epoch": 0.35923748229763697, "grad_norm": 0.7518901824951172, "learning_rate": 9.293039386550115e-05, "loss": 0.3574, "step": 96900 }, { "epoch": 0.3596082124135272, "grad_norm": 0.5850396752357483, "learning_rate": 9.291530928969359e-05, "loss": 0.3362, "step": 97000 }, { "epoch": 0.3599789425294174, "grad_norm": 0.9023662209510803, "learning_rate": 9.290020986486838e-05, "loss": 0.34, "step": 97100 }, { "epoch": 0.36034967264530765, "grad_norm": 0.5387855172157288, "learning_rate": 9.288509559625006e-05, "loss": 0.3726, "step": 97200 }, { "epoch": 0.3607204027611979, "grad_norm": 0.5117036700248718, "learning_rate": 9.286996648906824e-05, "loss": 0.3854, "step": 97300 }, { "epoch": 0.36109113287708816, "grad_norm": 0.4845927357673645, "learning_rate": 9.285482254855772e-05, "loss": 0.2949, "step": 97400 }, { "epoch": 0.3614618629929784, "grad_norm": 0.6733394861221313, "learning_rate": 9.283966377995843e-05, "loss": 0.3118, "step": 97500 }, { "epoch": 0.3618325931088686, "grad_norm": 0.7416907548904419, "learning_rate": 9.282449018851541e-05, "loss": 0.3727, "step": 97600 }, { "epoch": 0.36220332322475884, "grad_norm": 0.8765612840652466, "learning_rate": 9.28093017794788e-05, "loss": 0.3305, "step": 97700 }, { "epoch": 0.36257405334064907, "grad_norm": 1.2032748460769653, "learning_rate": 9.279409855810397e-05, "loss": 0.352, "step": 97800 }, { "epoch": 0.3629447834565393, "grad_norm": 0.7933481335639954, "learning_rate": 9.27788805296513e-05, "loss": 0.3583, "step": 97900 }, { "epoch": 0.3633155135724295, "grad_norm": 0.6588338613510132, "learning_rate": 9.276364769938633e-05, "loss": 0.3451, "step": 98000 }, { "epoch": 0.36368624368831975, "grad_norm": 0.3400072455406189, "learning_rate": 9.274840007257979e-05, "loss": 0.327, "step": 98100 }, { "epoch": 0.36405697380421004, "grad_norm": 0.7582906484603882, "learning_rate": 9.273313765450742e-05, "loss": 0.3344, "step": 98200 }, { "epoch": 0.36442770392010027, "grad_norm": 0.845819354057312, "learning_rate": 9.271786045045016e-05, "loss": 0.3251, "step": 98300 }, { "epoch": 0.3647984340359905, "grad_norm": 0.8719211220741272, "learning_rate": 9.270256846569401e-05, "loss": 0.3297, "step": 98400 }, { "epoch": 0.3651691641518807, "grad_norm": 3.20678448677063, "learning_rate": 9.268726170553015e-05, "loss": 0.3334, "step": 98500 }, { "epoch": 0.36553989426777095, "grad_norm": 0.7989726066589355, "learning_rate": 9.26719401752548e-05, "loss": 0.3426, "step": 98600 }, { "epoch": 0.3659106243836612, "grad_norm": 0.5385003089904785, "learning_rate": 9.265660388016934e-05, "loss": 0.3344, "step": 98700 }, { "epoch": 0.3662813544995514, "grad_norm": 1.5159116983413696, "learning_rate": 9.264125282558024e-05, "loss": 0.3579, "step": 98800 }, { "epoch": 0.36665208461544163, "grad_norm": 0.8425242900848389, "learning_rate": 9.262588701679906e-05, "loss": 0.3384, "step": 98900 }, { "epoch": 0.36702281473133186, "grad_norm": 0.4901089072227478, "learning_rate": 9.261050645914254e-05, "loss": 0.361, "step": 99000 }, { "epoch": 0.36739354484722214, "grad_norm": 1.1367238759994507, "learning_rate": 9.259511115793239e-05, "loss": 0.3439, "step": 99100 }, { "epoch": 0.36776427496311237, "grad_norm": 0.37864795327186584, "learning_rate": 9.257970111849556e-05, "loss": 0.3616, "step": 99200 }, { "epoch": 0.3681350050790026, "grad_norm": 0.8787299394607544, "learning_rate": 9.2564276346164e-05, "loss": 0.3267, "step": 99300 }, { "epoch": 0.3685057351948928, "grad_norm": 0.5474388003349304, "learning_rate": 9.254883684627483e-05, "loss": 0.3496, "step": 99400 }, { "epoch": 0.36887646531078305, "grad_norm": 0.4167841970920563, "learning_rate": 9.253338262417021e-05, "loss": 0.3418, "step": 99500 }, { "epoch": 0.3692471954266733, "grad_norm": 0.7579504251480103, "learning_rate": 9.251791368519744e-05, "loss": 0.3329, "step": 99600 }, { "epoch": 0.3696179255425635, "grad_norm": 0.3717666566371918, "learning_rate": 9.250243003470887e-05, "loss": 0.362, "step": 99700 }, { "epoch": 0.36998865565845374, "grad_norm": 0.5370324850082397, "learning_rate": 9.248693167806192e-05, "loss": 0.3573, "step": 99800 }, { "epoch": 0.370359385774344, "grad_norm": 1.055830717086792, "learning_rate": 9.247141862061924e-05, "loss": 0.348, "step": 99900 }, { "epoch": 0.37073011589023425, "grad_norm": 1.1238776445388794, "learning_rate": 9.245589086774835e-05, "loss": 0.3409, "step": 100000 }, { "epoch": 0.3711008460061245, "grad_norm": 0.5316088795661926, "learning_rate": 9.244034842482204e-05, "loss": 0.3453, "step": 100100 }, { "epoch": 0.3714715761220147, "grad_norm": 0.9842131733894348, "learning_rate": 9.242479129721808e-05, "loss": 0.293, "step": 100200 }, { "epoch": 0.37184230623790493, "grad_norm": 0.8437684178352356, "learning_rate": 9.240921949031938e-05, "loss": 0.3235, "step": 100300 }, { "epoch": 0.37221303635379516, "grad_norm": 0.5072168707847595, "learning_rate": 9.239363300951387e-05, "loss": 0.3226, "step": 100400 }, { "epoch": 0.3725837664696854, "grad_norm": 0.9968106746673584, "learning_rate": 9.237803186019462e-05, "loss": 0.3326, "step": 100500 }, { "epoch": 0.3729544965855756, "grad_norm": 0.3727928102016449, "learning_rate": 9.236241604775973e-05, "loss": 0.3385, "step": 100600 }, { "epoch": 0.37332522670146584, "grad_norm": 0.840183436870575, "learning_rate": 9.234678557761237e-05, "loss": 0.3259, "step": 100700 }, { "epoch": 0.3736959568173561, "grad_norm": 0.8141234517097473, "learning_rate": 9.233114045516083e-05, "loss": 0.3129, "step": 100800 }, { "epoch": 0.37406668693324635, "grad_norm": 0.8707042336463928, "learning_rate": 9.23154806858184e-05, "loss": 0.3431, "step": 100900 }, { "epoch": 0.3744374170491366, "grad_norm": 0.5025699138641357, "learning_rate": 9.229980627500353e-05, "loss": 0.3319, "step": 101000 }, { "epoch": 0.3748081471650268, "grad_norm": 0.8001802563667297, "learning_rate": 9.228411722813964e-05, "loss": 0.32, "step": 101100 }, { "epoch": 0.37517887728091703, "grad_norm": 0.42514559626579285, "learning_rate": 9.226841355065527e-05, "loss": 0.3523, "step": 101200 }, { "epoch": 0.37554960739680726, "grad_norm": 0.18524208664894104, "learning_rate": 9.225269524798403e-05, "loss": 0.3669, "step": 101300 }, { "epoch": 0.3759203375126975, "grad_norm": 0.9158624410629272, "learning_rate": 9.223696232556453e-05, "loss": 0.3257, "step": 101400 }, { "epoch": 0.3762910676285877, "grad_norm": 0.438209593296051, "learning_rate": 9.222121478884052e-05, "loss": 0.3652, "step": 101500 }, { "epoch": 0.376661797744478, "grad_norm": 0.2667672634124756, "learning_rate": 9.220545264326074e-05, "loss": 0.3424, "step": 101600 }, { "epoch": 0.3770325278603682, "grad_norm": 0.4034643769264221, "learning_rate": 9.2189675894279e-05, "loss": 0.3389, "step": 101700 }, { "epoch": 0.37740325797625845, "grad_norm": 0.5279093980789185, "learning_rate": 9.217388454735421e-05, "loss": 0.355, "step": 101800 }, { "epoch": 0.3777739880921487, "grad_norm": 0.5742658376693726, "learning_rate": 9.215807860795028e-05, "loss": 0.3301, "step": 101900 }, { "epoch": 0.3781447182080389, "grad_norm": 0.4910024404525757, "learning_rate": 9.214225808153616e-05, "loss": 0.3171, "step": 102000 }, { "epoch": 0.37851544832392914, "grad_norm": 1.0892994403839111, "learning_rate": 9.212642297358589e-05, "loss": 0.335, "step": 102100 }, { "epoch": 0.37888617843981937, "grad_norm": 1.0148341655731201, "learning_rate": 9.211057328957854e-05, "loss": 0.336, "step": 102200 }, { "epoch": 0.3792569085557096, "grad_norm": 0.6240531802177429, "learning_rate": 9.209470903499821e-05, "loss": 0.3619, "step": 102300 }, { "epoch": 0.3796276386715999, "grad_norm": 0.5928137898445129, "learning_rate": 9.207883021533404e-05, "loss": 0.3335, "step": 102400 }, { "epoch": 0.3799983687874901, "grad_norm": 0.76870197057724, "learning_rate": 9.206293683608024e-05, "loss": 0.3592, "step": 102500 }, { "epoch": 0.38036909890338033, "grad_norm": 0.9269668459892273, "learning_rate": 9.204702890273605e-05, "loss": 0.3047, "step": 102600 }, { "epoch": 0.38073982901927056, "grad_norm": 0.32405000925064087, "learning_rate": 9.20311064208057e-05, "loss": 0.3083, "step": 102700 }, { "epoch": 0.3811105591351608, "grad_norm": 0.43341919779777527, "learning_rate": 9.201516939579849e-05, "loss": 0.3412, "step": 102800 }, { "epoch": 0.381481289251051, "grad_norm": 0.41278401017189026, "learning_rate": 9.199921783322877e-05, "loss": 0.3442, "step": 102900 }, { "epoch": 0.38185201936694124, "grad_norm": 0.8278214931488037, "learning_rate": 9.198325173861589e-05, "loss": 0.3309, "step": 103000 }, { "epoch": 0.38222274948283147, "grad_norm": 1.124145269393921, "learning_rate": 9.196727111748424e-05, "loss": 0.3318, "step": 103100 }, { "epoch": 0.3825934795987217, "grad_norm": 0.3276994228363037, "learning_rate": 9.195127597536324e-05, "loss": 0.3226, "step": 103200 }, { "epoch": 0.382964209714612, "grad_norm": 0.7374022006988525, "learning_rate": 9.193526631778733e-05, "loss": 0.3103, "step": 103300 }, { "epoch": 0.3833349398305022, "grad_norm": 0.7916367650032043, "learning_rate": 9.191924215029595e-05, "loss": 0.3227, "step": 103400 }, { "epoch": 0.38370566994639244, "grad_norm": 0.7895129323005676, "learning_rate": 9.19032034784336e-05, "loss": 0.3674, "step": 103500 }, { "epoch": 0.38407640006228266, "grad_norm": 0.5610736012458801, "learning_rate": 9.188715030774976e-05, "loss": 0.3435, "step": 103600 }, { "epoch": 0.3844471301781729, "grad_norm": 0.23312751948833466, "learning_rate": 9.187108264379899e-05, "loss": 0.3541, "step": 103700 }, { "epoch": 0.3848178602940631, "grad_norm": 0.5601670145988464, "learning_rate": 9.185500049214077e-05, "loss": 0.3301, "step": 103800 }, { "epoch": 0.38518859040995335, "grad_norm": 0.4852913022041321, "learning_rate": 9.183890385833969e-05, "loss": 0.3456, "step": 103900 }, { "epoch": 0.3855593205258436, "grad_norm": 0.8383184671401978, "learning_rate": 9.182279274796528e-05, "loss": 0.3441, "step": 104000 }, { "epoch": 0.38593005064173386, "grad_norm": 0.8916184306144714, "learning_rate": 9.18066671665921e-05, "loss": 0.3189, "step": 104100 }, { "epoch": 0.3863007807576241, "grad_norm": 0.3617243468761444, "learning_rate": 9.179052711979974e-05, "loss": 0.3643, "step": 104200 }, { "epoch": 0.3866715108735143, "grad_norm": 0.5950339436531067, "learning_rate": 9.177437261317277e-05, "loss": 0.3367, "step": 104300 }, { "epoch": 0.38704224098940454, "grad_norm": 1.4513438940048218, "learning_rate": 9.175820365230076e-05, "loss": 0.3107, "step": 104400 }, { "epoch": 0.38741297110529477, "grad_norm": 0.38993415236473083, "learning_rate": 9.174202024277831e-05, "loss": 0.3272, "step": 104500 }, { "epoch": 0.387783701221185, "grad_norm": 1.0211920738220215, "learning_rate": 9.172582239020497e-05, "loss": 0.3339, "step": 104600 }, { "epoch": 0.3881544313370752, "grad_norm": 0.39457330107688904, "learning_rate": 9.170961010018535e-05, "loss": 0.3279, "step": 104700 }, { "epoch": 0.38852516145296545, "grad_norm": 0.7699959874153137, "learning_rate": 9.1693383378329e-05, "loss": 0.3265, "step": 104800 }, { "epoch": 0.3888958915688557, "grad_norm": 0.5681918263435364, "learning_rate": 9.167714223025052e-05, "loss": 0.3409, "step": 104900 }, { "epoch": 0.38926662168474596, "grad_norm": 0.5813969373703003, "learning_rate": 9.166088666156943e-05, "loss": 0.3381, "step": 105000 }, { "epoch": 0.3896373518006362, "grad_norm": 0.38692739605903625, "learning_rate": 9.164461667791031e-05, "loss": 0.3521, "step": 105100 }, { "epoch": 0.3900080819165264, "grad_norm": 0.8726442456245422, "learning_rate": 9.162833228490266e-05, "loss": 0.3338, "step": 105200 }, { "epoch": 0.39037881203241664, "grad_norm": 0.6215270757675171, "learning_rate": 9.161203348818104e-05, "loss": 0.3285, "step": 105300 }, { "epoch": 0.39074954214830687, "grad_norm": 0.871218204498291, "learning_rate": 9.159572029338493e-05, "loss": 0.3226, "step": 105400 }, { "epoch": 0.3911202722641971, "grad_norm": 0.8004810214042664, "learning_rate": 9.157939270615882e-05, "loss": 0.3378, "step": 105500 }, { "epoch": 0.3914910023800873, "grad_norm": 0.6713256239891052, "learning_rate": 9.15630507321522e-05, "loss": 0.3372, "step": 105600 }, { "epoch": 0.39186173249597755, "grad_norm": 0.6418102979660034, "learning_rate": 9.154669437701949e-05, "loss": 0.3275, "step": 105700 }, { "epoch": 0.39223246261186784, "grad_norm": 0.5474681258201599, "learning_rate": 9.153032364642012e-05, "loss": 0.3677, "step": 105800 }, { "epoch": 0.39260319272775807, "grad_norm": 1.2297829389572144, "learning_rate": 9.151393854601847e-05, "loss": 0.3152, "step": 105900 }, { "epoch": 0.3929739228436483, "grad_norm": 1.180177092552185, "learning_rate": 9.149753908148393e-05, "loss": 0.312, "step": 106000 }, { "epoch": 0.3933446529595385, "grad_norm": 0.4470861256122589, "learning_rate": 9.148112525849082e-05, "loss": 0.3418, "step": 106100 }, { "epoch": 0.39371538307542875, "grad_norm": 0.45931535959243774, "learning_rate": 9.146469708271846e-05, "loss": 0.3247, "step": 106200 }, { "epoch": 0.394086113191319, "grad_norm": 0.8460520505905151, "learning_rate": 9.144825455985111e-05, "loss": 0.3249, "step": 106300 }, { "epoch": 0.3944568433072092, "grad_norm": 0.7163436412811279, "learning_rate": 9.143179769557801e-05, "loss": 0.3106, "step": 106400 }, { "epoch": 0.39482757342309943, "grad_norm": 0.7780168652534485, "learning_rate": 9.141532649559337e-05, "loss": 0.3302, "step": 106500 }, { "epoch": 0.39519830353898966, "grad_norm": 1.3201627731323242, "learning_rate": 9.139884096559634e-05, "loss": 0.3231, "step": 106600 }, { "epoch": 0.39556903365487994, "grad_norm": 0.28372249007225037, "learning_rate": 9.138234111129103e-05, "loss": 0.3181, "step": 106700 }, { "epoch": 0.39593976377077017, "grad_norm": 0.6903362274169922, "learning_rate": 9.13658269383865e-05, "loss": 0.3297, "step": 106800 }, { "epoch": 0.3963104938866604, "grad_norm": 0.7018836736679077, "learning_rate": 9.13492984525968e-05, "loss": 0.3318, "step": 106900 }, { "epoch": 0.3966812240025506, "grad_norm": 0.485586553812027, "learning_rate": 9.133275565964093e-05, "loss": 0.3709, "step": 107000 }, { "epoch": 0.39705195411844085, "grad_norm": 0.3178842067718506, "learning_rate": 9.131619856524276e-05, "loss": 0.3385, "step": 107100 }, { "epoch": 0.3974226842343311, "grad_norm": 0.4991243779659271, "learning_rate": 9.129962717513121e-05, "loss": 0.3751, "step": 107200 }, { "epoch": 0.3977934143502213, "grad_norm": 1.267915964126587, "learning_rate": 9.12830414950401e-05, "loss": 0.3275, "step": 107300 }, { "epoch": 0.39816414446611154, "grad_norm": 1.2061325311660767, "learning_rate": 9.12664415307082e-05, "loss": 0.3207, "step": 107400 }, { "epoch": 0.3985348745820018, "grad_norm": 0.43230292201042175, "learning_rate": 9.124982728787921e-05, "loss": 0.363, "step": 107500 }, { "epoch": 0.39890560469789205, "grad_norm": 0.49006402492523193, "learning_rate": 9.123319877230179e-05, "loss": 0.3113, "step": 107600 }, { "epoch": 0.3992763348137823, "grad_norm": 0.97110915184021, "learning_rate": 9.121655598972953e-05, "loss": 0.3554, "step": 107700 }, { "epoch": 0.3996470649296725, "grad_norm": 0.6153727173805237, "learning_rate": 9.119989894592096e-05, "loss": 0.351, "step": 107800 }, { "epoch": 0.40001779504556273, "grad_norm": 0.7296960353851318, "learning_rate": 9.118322764663953e-05, "loss": 0.3101, "step": 107900 }, { "epoch": 0.40038852516145296, "grad_norm": 1.046919345855713, "learning_rate": 9.116654209765365e-05, "loss": 0.3474, "step": 108000 }, { "epoch": 0.4007592552773432, "grad_norm": 0.9118031859397888, "learning_rate": 9.114984230473664e-05, "loss": 0.3125, "step": 108100 }, { "epoch": 0.4011299853932334, "grad_norm": 0.7791998386383057, "learning_rate": 9.113312827366675e-05, "loss": 0.3362, "step": 108200 }, { "epoch": 0.4015007155091237, "grad_norm": 0.48444485664367676, "learning_rate": 9.111640001022716e-05, "loss": 0.3619, "step": 108300 }, { "epoch": 0.4018714456250139, "grad_norm": 0.4027473032474518, "learning_rate": 9.109965752020597e-05, "loss": 0.3431, "step": 108400 }, { "epoch": 0.40224217574090415, "grad_norm": 0.6359562277793884, "learning_rate": 9.108290080939621e-05, "loss": 0.3021, "step": 108500 }, { "epoch": 0.4026129058567944, "grad_norm": 0.2671416401863098, "learning_rate": 9.106612988359585e-05, "loss": 0.3167, "step": 108600 }, { "epoch": 0.4029836359726846, "grad_norm": 0.28509512543678284, "learning_rate": 9.10493447486077e-05, "loss": 0.3401, "step": 108700 }, { "epoch": 0.40335436608857483, "grad_norm": 0.32149213552474976, "learning_rate": 9.103254541023959e-05, "loss": 0.3245, "step": 108800 }, { "epoch": 0.40372509620446506, "grad_norm": 0.4606223702430725, "learning_rate": 9.101573187430421e-05, "loss": 0.3331, "step": 108900 }, { "epoch": 0.4040958263203553, "grad_norm": 0.8097549676895142, "learning_rate": 9.099890414661914e-05, "loss": 0.3529, "step": 109000 }, { "epoch": 0.4044665564362455, "grad_norm": 0.5603088736534119, "learning_rate": 9.098206223300693e-05, "loss": 0.3574, "step": 109100 }, { "epoch": 0.4048372865521358, "grad_norm": 0.6863930821418762, "learning_rate": 9.096520613929503e-05, "loss": 0.3366, "step": 109200 }, { "epoch": 0.405208016668026, "grad_norm": 0.5491046905517578, "learning_rate": 9.09483358713157e-05, "loss": 0.3393, "step": 109300 }, { "epoch": 0.40557874678391626, "grad_norm": 0.59660804271698, "learning_rate": 9.093145143490625e-05, "loss": 0.3244, "step": 109400 }, { "epoch": 0.4059494768998065, "grad_norm": 0.9176308512687683, "learning_rate": 9.091455283590877e-05, "loss": 0.3299, "step": 109500 }, { "epoch": 0.4063202070156967, "grad_norm": 1.4202650785446167, "learning_rate": 9.089764008017034e-05, "loss": 0.3448, "step": 109600 }, { "epoch": 0.40669093713158694, "grad_norm": 0.48554670810699463, "learning_rate": 9.088071317354287e-05, "loss": 0.3223, "step": 109700 }, { "epoch": 0.40706166724747717, "grad_norm": 0.7899838089942932, "learning_rate": 9.086377212188324e-05, "loss": 0.3396, "step": 109800 }, { "epoch": 0.4074323973633674, "grad_norm": 0.16108044981956482, "learning_rate": 9.084681693105312e-05, "loss": 0.3221, "step": 109900 }, { "epoch": 0.4078031274792577, "grad_norm": 0.46110406517982483, "learning_rate": 9.082984760691919e-05, "loss": 0.344, "step": 110000 }, { "epoch": 0.4081738575951479, "grad_norm": 2.056623697280884, "learning_rate": 9.081286415535292e-05, "loss": 0.3321, "step": 110100 }, { "epoch": 0.40854458771103813, "grad_norm": 0.5816348791122437, "learning_rate": 9.079586658223074e-05, "loss": 0.3345, "step": 110200 }, { "epoch": 0.40891531782692836, "grad_norm": 0.5618560314178467, "learning_rate": 9.077885489343392e-05, "loss": 0.3527, "step": 110300 }, { "epoch": 0.4092860479428186, "grad_norm": 0.5037525296211243, "learning_rate": 9.076182909484862e-05, "loss": 0.35, "step": 110400 }, { "epoch": 0.4096567780587088, "grad_norm": 0.6426706910133362, "learning_rate": 9.074478919236593e-05, "loss": 0.3591, "step": 110500 }, { "epoch": 0.41002750817459904, "grad_norm": 1.9947469234466553, "learning_rate": 9.072773519188175e-05, "loss": 0.34, "step": 110600 }, { "epoch": 0.41039823829048927, "grad_norm": 0.8876546025276184, "learning_rate": 9.071066709929692e-05, "loss": 0.3357, "step": 110700 }, { "epoch": 0.4107689684063795, "grad_norm": 0.7403554916381836, "learning_rate": 9.06935849205171e-05, "loss": 0.324, "step": 110800 }, { "epoch": 0.4111396985222698, "grad_norm": 0.7658447623252869, "learning_rate": 9.067648866145288e-05, "loss": 0.3626, "step": 110900 }, { "epoch": 0.41151042863816, "grad_norm": 0.3899599611759186, "learning_rate": 9.065937832801966e-05, "loss": 0.3381, "step": 111000 }, { "epoch": 0.41188115875405024, "grad_norm": 0.40966057777404785, "learning_rate": 9.064225392613777e-05, "loss": 0.3492, "step": 111100 }, { "epoch": 0.41225188886994046, "grad_norm": 0.6038539409637451, "learning_rate": 9.062511546173237e-05, "loss": 0.3188, "step": 111200 }, { "epoch": 0.4126226189858307, "grad_norm": 0.5042416453361511, "learning_rate": 9.060796294073351e-05, "loss": 0.3249, "step": 111300 }, { "epoch": 0.4129933491017209, "grad_norm": 0.7415258884429932, "learning_rate": 9.059079636907606e-05, "loss": 0.344, "step": 111400 }, { "epoch": 0.41336407921761115, "grad_norm": 2.555881977081299, "learning_rate": 9.057361575269981e-05, "loss": 0.3638, "step": 111500 }, { "epoch": 0.4137348093335014, "grad_norm": 0.43271780014038086, "learning_rate": 9.055642109754936e-05, "loss": 0.3604, "step": 111600 }, { "epoch": 0.41410553944939166, "grad_norm": 0.9742223024368286, "learning_rate": 9.053921240957422e-05, "loss": 0.3173, "step": 111700 }, { "epoch": 0.4144762695652819, "grad_norm": 0.4747956395149231, "learning_rate": 9.052198969472869e-05, "loss": 0.3243, "step": 111800 }, { "epoch": 0.4148469996811721, "grad_norm": 0.9841435551643372, "learning_rate": 9.0504752958972e-05, "loss": 0.368, "step": 111900 }, { "epoch": 0.41521772979706234, "grad_norm": 1.2099230289459229, "learning_rate": 9.048750220826814e-05, "loss": 0.3441, "step": 112000 }, { "epoch": 0.41558845991295257, "grad_norm": 0.2806797921657562, "learning_rate": 9.047023744858603e-05, "loss": 0.3301, "step": 112100 }, { "epoch": 0.4159591900288428, "grad_norm": 0.44305258989334106, "learning_rate": 9.04529586858994e-05, "loss": 0.345, "step": 112200 }, { "epoch": 0.416329920144733, "grad_norm": 0.7579039335250854, "learning_rate": 9.043566592618683e-05, "loss": 0.3141, "step": 112300 }, { "epoch": 0.41670065026062325, "grad_norm": 0.7567089200019836, "learning_rate": 9.041835917543175e-05, "loss": 0.3337, "step": 112400 }, { "epoch": 0.4170713803765135, "grad_norm": 0.6553892493247986, "learning_rate": 9.040103843962239e-05, "loss": 0.3566, "step": 112500 }, { "epoch": 0.41744211049240376, "grad_norm": 0.3758779466152191, "learning_rate": 9.038370372475191e-05, "loss": 0.3287, "step": 112600 }, { "epoch": 0.417812840608294, "grad_norm": 1.5614092350006104, "learning_rate": 9.036635503681822e-05, "loss": 0.3461, "step": 112700 }, { "epoch": 0.4181835707241842, "grad_norm": 0.5693640112876892, "learning_rate": 9.034899238182409e-05, "loss": 0.3536, "step": 112800 }, { "epoch": 0.41855430084007444, "grad_norm": 0.6917423605918884, "learning_rate": 9.033161576577714e-05, "loss": 0.3319, "step": 112900 }, { "epoch": 0.4189250309559647, "grad_norm": 0.5170223116874695, "learning_rate": 9.031422519468982e-05, "loss": 0.3432, "step": 113000 }, { "epoch": 0.4192957610718549, "grad_norm": 1.1400420665740967, "learning_rate": 9.029682067457936e-05, "loss": 0.3607, "step": 113100 }, { "epoch": 0.4196664911877451, "grad_norm": 0.8493146896362305, "learning_rate": 9.027940221146789e-05, "loss": 0.3146, "step": 113200 }, { "epoch": 0.42003722130363536, "grad_norm": 0.8129609227180481, "learning_rate": 9.026196981138232e-05, "loss": 0.3356, "step": 113300 }, { "epoch": 0.42040795141952564, "grad_norm": 0.708017110824585, "learning_rate": 9.024452348035437e-05, "loss": 0.3388, "step": 113400 }, { "epoch": 0.42077868153541587, "grad_norm": 0.750062882900238, "learning_rate": 9.022706322442064e-05, "loss": 0.3336, "step": 113500 }, { "epoch": 0.4211494116513061, "grad_norm": 0.5924072265625, "learning_rate": 9.020958904962246e-05, "loss": 0.3095, "step": 113600 }, { "epoch": 0.4215201417671963, "grad_norm": 0.34617000818252563, "learning_rate": 9.019210096200608e-05, "loss": 0.3642, "step": 113700 }, { "epoch": 0.42189087188308655, "grad_norm": 0.6242346167564392, "learning_rate": 9.017459896762247e-05, "loss": 0.3359, "step": 113800 }, { "epoch": 0.4222616019989768, "grad_norm": 0.5208154320716858, "learning_rate": 9.015708307252747e-05, "loss": 0.3268, "step": 113900 }, { "epoch": 0.422632332114867, "grad_norm": 0.45826470851898193, "learning_rate": 9.013955328278169e-05, "loss": 0.3217, "step": 114000 }, { "epoch": 0.42300306223075723, "grad_norm": 0.6816144585609436, "learning_rate": 9.012200960445058e-05, "loss": 0.3473, "step": 114100 }, { "epoch": 0.4233737923466475, "grad_norm": 0.874652087688446, "learning_rate": 9.010445204360437e-05, "loss": 0.3272, "step": 114200 }, { "epoch": 0.42374452246253774, "grad_norm": 0.9392380714416504, "learning_rate": 9.008688060631816e-05, "loss": 0.3558, "step": 114300 }, { "epoch": 0.42411525257842797, "grad_norm": 0.7675385475158691, "learning_rate": 9.006929529867173e-05, "loss": 0.3122, "step": 114400 }, { "epoch": 0.4244859826943182, "grad_norm": 0.7133064866065979, "learning_rate": 9.005169612674976e-05, "loss": 0.3402, "step": 114500 }, { "epoch": 0.4248567128102084, "grad_norm": 0.5190449953079224, "learning_rate": 9.00340830966417e-05, "loss": 0.3694, "step": 114600 }, { "epoch": 0.42522744292609865, "grad_norm": 0.5263517498970032, "learning_rate": 9.001645621444178e-05, "loss": 0.315, "step": 114700 }, { "epoch": 0.4255981730419889, "grad_norm": 0.4652208089828491, "learning_rate": 8.999881548624905e-05, "loss": 0.3249, "step": 114800 }, { "epoch": 0.4259689031578791, "grad_norm": 0.9704657196998596, "learning_rate": 8.998116091816728e-05, "loss": 0.3591, "step": 114900 }, { "epoch": 0.42633963327376934, "grad_norm": 0.6267880201339722, "learning_rate": 8.996349251630516e-05, "loss": 0.3578, "step": 115000 }, { "epoch": 0.4267103633896596, "grad_norm": 0.46275976300239563, "learning_rate": 8.994581028677604e-05, "loss": 0.3584, "step": 115100 }, { "epoch": 0.42708109350554985, "grad_norm": 0.612018346786499, "learning_rate": 8.992811423569811e-05, "loss": 0.3064, "step": 115200 }, { "epoch": 0.4274518236214401, "grad_norm": 0.31399431824684143, "learning_rate": 8.991040436919435e-05, "loss": 0.3451, "step": 115300 }, { "epoch": 0.4278225537373303, "grad_norm": 0.6778253316879272, "learning_rate": 8.989268069339248e-05, "loss": 0.319, "step": 115400 }, { "epoch": 0.42819328385322053, "grad_norm": 0.47380542755126953, "learning_rate": 8.987494321442506e-05, "loss": 0.336, "step": 115500 }, { "epoch": 0.42856401396911076, "grad_norm": 0.5363597869873047, "learning_rate": 8.985719193842935e-05, "loss": 0.3181, "step": 115600 }, { "epoch": 0.428934744085001, "grad_norm": 0.7450547814369202, "learning_rate": 8.983942687154746e-05, "loss": 0.3164, "step": 115700 }, { "epoch": 0.4293054742008912, "grad_norm": 1.3432536125183105, "learning_rate": 8.98216480199262e-05, "loss": 0.3263, "step": 115800 }, { "epoch": 0.4296762043167815, "grad_norm": 0.81186443567276, "learning_rate": 8.980385538971724e-05, "loss": 0.3033, "step": 115900 }, { "epoch": 0.4300469344326717, "grad_norm": 0.46001842617988586, "learning_rate": 8.978604898707692e-05, "loss": 0.3162, "step": 116000 }, { "epoch": 0.43041766454856195, "grad_norm": 0.5632451176643372, "learning_rate": 8.976822881816637e-05, "loss": 0.3308, "step": 116100 }, { "epoch": 0.4307883946644522, "grad_norm": 0.9337729215621948, "learning_rate": 8.975039488915156e-05, "loss": 0.3188, "step": 116200 }, { "epoch": 0.4311591247803424, "grad_norm": 0.5265591740608215, "learning_rate": 8.973254720620312e-05, "loss": 0.3809, "step": 116300 }, { "epoch": 0.43152985489623263, "grad_norm": 0.6146958470344543, "learning_rate": 8.97146857754965e-05, "loss": 0.3543, "step": 116400 }, { "epoch": 0.43190058501212286, "grad_norm": 0.6869450211524963, "learning_rate": 8.969681060321188e-05, "loss": 0.3553, "step": 116500 }, { "epoch": 0.4322713151280131, "grad_norm": 0.34091928601264954, "learning_rate": 8.96789216955342e-05, "loss": 0.3365, "step": 116600 }, { "epoch": 0.4326420452439033, "grad_norm": 0.3500145673751831, "learning_rate": 8.966101905865316e-05, "loss": 0.3363, "step": 116700 }, { "epoch": 0.4330127753597936, "grad_norm": 0.7276604771614075, "learning_rate": 8.964310269876321e-05, "loss": 0.3001, "step": 116800 }, { "epoch": 0.43338350547568383, "grad_norm": 0.305853933095932, "learning_rate": 8.962517262206354e-05, "loss": 0.3248, "step": 116900 }, { "epoch": 0.43375423559157406, "grad_norm": 0.4419228136539459, "learning_rate": 8.960722883475809e-05, "loss": 0.3448, "step": 117000 }, { "epoch": 0.4341249657074643, "grad_norm": 0.6690040230751038, "learning_rate": 8.958927134305557e-05, "loss": 0.3284, "step": 117100 }, { "epoch": 0.4344956958233545, "grad_norm": 0.3984261453151703, "learning_rate": 8.957130015316935e-05, "loss": 0.3238, "step": 117200 }, { "epoch": 0.43486642593924474, "grad_norm": 0.295657217502594, "learning_rate": 8.955331527131765e-05, "loss": 0.3238, "step": 117300 }, { "epoch": 0.43523715605513497, "grad_norm": 0.5306917428970337, "learning_rate": 8.953531670372335e-05, "loss": 0.3396, "step": 117400 }, { "epoch": 0.4356078861710252, "grad_norm": 0.6393373012542725, "learning_rate": 8.951730445661408e-05, "loss": 0.3448, "step": 117500 }, { "epoch": 0.4359786162869155, "grad_norm": 0.7369088530540466, "learning_rate": 8.949927853622222e-05, "loss": 0.3146, "step": 117600 }, { "epoch": 0.4363493464028057, "grad_norm": 0.6221359968185425, "learning_rate": 8.948123894878489e-05, "loss": 0.3374, "step": 117700 }, { "epoch": 0.43672007651869593, "grad_norm": 0.8261216878890991, "learning_rate": 8.946318570054391e-05, "loss": 0.3217, "step": 117800 }, { "epoch": 0.43709080663458616, "grad_norm": 0.782930850982666, "learning_rate": 8.944511879774582e-05, "loss": 0.3274, "step": 117900 }, { "epoch": 0.4374615367504764, "grad_norm": 0.4863019287586212, "learning_rate": 8.942703824664193e-05, "loss": 0.3071, "step": 118000 }, { "epoch": 0.4378322668663666, "grad_norm": 0.8954169750213623, "learning_rate": 8.940894405348822e-05, "loss": 0.3283, "step": 118100 }, { "epoch": 0.43820299698225684, "grad_norm": 0.7546159625053406, "learning_rate": 8.939083622454547e-05, "loss": 0.3443, "step": 118200 }, { "epoch": 0.43857372709814707, "grad_norm": 0.8558548092842102, "learning_rate": 8.937271476607906e-05, "loss": 0.3447, "step": 118300 }, { "epoch": 0.4389444572140373, "grad_norm": 0.4052615165710449, "learning_rate": 8.935457968435919e-05, "loss": 0.3082, "step": 118400 }, { "epoch": 0.4393151873299276, "grad_norm": 0.27811044454574585, "learning_rate": 8.933643098566071e-05, "loss": 0.3157, "step": 118500 }, { "epoch": 0.4396859174458178, "grad_norm": 0.3674699068069458, "learning_rate": 8.93182686762632e-05, "loss": 0.3009, "step": 118600 }, { "epoch": 0.44005664756170804, "grad_norm": 0.57839435338974, "learning_rate": 8.930009276245099e-05, "loss": 0.3527, "step": 118700 }, { "epoch": 0.44042737767759826, "grad_norm": 0.43637606501579285, "learning_rate": 8.928190325051307e-05, "loss": 0.3199, "step": 118800 }, { "epoch": 0.4407981077934885, "grad_norm": 0.7029194831848145, "learning_rate": 8.926370014674313e-05, "loss": 0.3523, "step": 118900 }, { "epoch": 0.4411688379093787, "grad_norm": 0.696883499622345, "learning_rate": 8.924548345743959e-05, "loss": 0.3399, "step": 119000 }, { "epoch": 0.44153956802526895, "grad_norm": 0.6023645401000977, "learning_rate": 8.922725318890554e-05, "loss": 0.3274, "step": 119100 }, { "epoch": 0.4419102981411592, "grad_norm": 0.6686874032020569, "learning_rate": 8.920900934744883e-05, "loss": 0.3474, "step": 119200 }, { "epoch": 0.44228102825704946, "grad_norm": 0.5677143335342407, "learning_rate": 8.919075193938194e-05, "loss": 0.3517, "step": 119300 }, { "epoch": 0.4426517583729397, "grad_norm": 0.7738279104232788, "learning_rate": 8.917248097102209e-05, "loss": 0.3186, "step": 119400 }, { "epoch": 0.4430224884888299, "grad_norm": 0.534642219543457, "learning_rate": 8.915419644869114e-05, "loss": 0.3251, "step": 119500 }, { "epoch": 0.44339321860472014, "grad_norm": 0.6297026872634888, "learning_rate": 8.913589837871568e-05, "loss": 0.3192, "step": 119600 }, { "epoch": 0.44376394872061037, "grad_norm": 0.6582304239273071, "learning_rate": 8.911758676742699e-05, "loss": 0.3341, "step": 119700 }, { "epoch": 0.4441346788365006, "grad_norm": 0.6355113983154297, "learning_rate": 8.909926162116102e-05, "loss": 0.3413, "step": 119800 }, { "epoch": 0.4445054089523908, "grad_norm": 0.5417113900184631, "learning_rate": 8.908092294625841e-05, "loss": 0.3344, "step": 119900 }, { "epoch": 0.44487613906828105, "grad_norm": 0.41559723019599915, "learning_rate": 8.906257074906447e-05, "loss": 0.3417, "step": 120000 }, { "epoch": 0.44524686918417133, "grad_norm": 1.1016979217529297, "learning_rate": 8.90442050359292e-05, "loss": 0.3475, "step": 120100 }, { "epoch": 0.44561759930006156, "grad_norm": 1.6406201124191284, "learning_rate": 8.902582581320731e-05, "loss": 0.3273, "step": 120200 }, { "epoch": 0.4459883294159518, "grad_norm": 0.9386048316955566, "learning_rate": 8.900743308725807e-05, "loss": 0.3168, "step": 120300 }, { "epoch": 0.446359059531842, "grad_norm": 0.9136038422584534, "learning_rate": 8.898902686444558e-05, "loss": 0.3333, "step": 120400 }, { "epoch": 0.44672978964773224, "grad_norm": 0.7121193408966064, "learning_rate": 8.897060715113848e-05, "loss": 0.3242, "step": 120500 }, { "epoch": 0.4471005197636225, "grad_norm": 0.9924787282943726, "learning_rate": 8.895217395371016e-05, "loss": 0.3506, "step": 120600 }, { "epoch": 0.4474712498795127, "grad_norm": 0.5255289077758789, "learning_rate": 8.893372727853862e-05, "loss": 0.3103, "step": 120700 }, { "epoch": 0.44784197999540293, "grad_norm": 0.5155327916145325, "learning_rate": 8.891526713200655e-05, "loss": 0.3382, "step": 120800 }, { "epoch": 0.44821271011129316, "grad_norm": 0.3337087333202362, "learning_rate": 8.889679352050131e-05, "loss": 0.3377, "step": 120900 }, { "epoch": 0.44858344022718344, "grad_norm": 0.5516032576560974, "learning_rate": 8.88783064504149e-05, "loss": 0.3378, "step": 121000 }, { "epoch": 0.44895417034307367, "grad_norm": 0.7628347873687744, "learning_rate": 8.8859805928144e-05, "loss": 0.3621, "step": 121100 }, { "epoch": 0.4493249004589639, "grad_norm": 0.43531614542007446, "learning_rate": 8.88412919600899e-05, "loss": 0.3444, "step": 121200 }, { "epoch": 0.4496956305748541, "grad_norm": 0.6984626650810242, "learning_rate": 8.882276455265861e-05, "loss": 0.3079, "step": 121300 }, { "epoch": 0.45006636069074435, "grad_norm": 0.44845253229141235, "learning_rate": 8.880422371226071e-05, "loss": 0.3357, "step": 121400 }, { "epoch": 0.4504370908066346, "grad_norm": 0.6871994137763977, "learning_rate": 8.87856694453115e-05, "loss": 0.3469, "step": 121500 }, { "epoch": 0.4508078209225248, "grad_norm": 0.5631246566772461, "learning_rate": 8.876710175823087e-05, "loss": 0.3244, "step": 121600 }, { "epoch": 0.45117855103841503, "grad_norm": 0.9051106572151184, "learning_rate": 8.87485206574434e-05, "loss": 0.328, "step": 121700 }, { "epoch": 0.4515492811543053, "grad_norm": 0.6563377976417542, "learning_rate": 8.872992614937828e-05, "loss": 0.316, "step": 121800 }, { "epoch": 0.45192001127019554, "grad_norm": 0.640887439250946, "learning_rate": 8.871131824046935e-05, "loss": 0.3202, "step": 121900 }, { "epoch": 0.45229074138608577, "grad_norm": 0.5345094203948975, "learning_rate": 8.869269693715509e-05, "loss": 0.3242, "step": 122000 }, { "epoch": 0.452661471501976, "grad_norm": 0.37606698274612427, "learning_rate": 8.86740622458786e-05, "loss": 0.3233, "step": 122100 }, { "epoch": 0.4530322016178662, "grad_norm": 0.6615543961524963, "learning_rate": 8.865541417308763e-05, "loss": 0.3208, "step": 122200 }, { "epoch": 0.45340293173375645, "grad_norm": 0.8321110010147095, "learning_rate": 8.863675272523456e-05, "loss": 0.3375, "step": 122300 }, { "epoch": 0.4537736618496467, "grad_norm": 1.5832992792129517, "learning_rate": 8.861807790877637e-05, "loss": 0.3324, "step": 122400 }, { "epoch": 0.4541443919655369, "grad_norm": 0.42303821444511414, "learning_rate": 8.85993897301747e-05, "loss": 0.3565, "step": 122500 }, { "epoch": 0.45451512208142714, "grad_norm": 0.4684585928916931, "learning_rate": 8.858068819589579e-05, "loss": 0.3628, "step": 122600 }, { "epoch": 0.4548858521973174, "grad_norm": 0.28542405366897583, "learning_rate": 8.856197331241052e-05, "loss": 0.3345, "step": 122700 }, { "epoch": 0.45525658231320765, "grad_norm": 20.48404884338379, "learning_rate": 8.854324508619438e-05, "loss": 0.3455, "step": 122800 }, { "epoch": 0.4556273124290979, "grad_norm": 0.759093701839447, "learning_rate": 8.852450352372748e-05, "loss": 0.3229, "step": 122900 }, { "epoch": 0.4559980425449881, "grad_norm": 0.4470805823802948, "learning_rate": 8.850574863149452e-05, "loss": 0.3069, "step": 123000 }, { "epoch": 0.45636877266087833, "grad_norm": 1.0615532398223877, "learning_rate": 8.848698041598484e-05, "loss": 0.3103, "step": 123100 }, { "epoch": 0.45673950277676856, "grad_norm": 0.48713886737823486, "learning_rate": 8.84681988836924e-05, "loss": 0.3192, "step": 123200 }, { "epoch": 0.4571102328926588, "grad_norm": 0.7990370988845825, "learning_rate": 8.844940404111574e-05, "loss": 0.3161, "step": 123300 }, { "epoch": 0.457480963008549, "grad_norm": 0.8258106112480164, "learning_rate": 8.843059589475802e-05, "loss": 0.3708, "step": 123400 }, { "epoch": 0.4578516931244393, "grad_norm": 0.488482803106308, "learning_rate": 8.841177445112698e-05, "loss": 0.3409, "step": 123500 }, { "epoch": 0.4582224232403295, "grad_norm": 0.6419281363487244, "learning_rate": 8.839293971673501e-05, "loss": 0.329, "step": 123600 }, { "epoch": 0.45859315335621975, "grad_norm": 0.4407488703727722, "learning_rate": 8.837409169809904e-05, "loss": 0.3089, "step": 123700 }, { "epoch": 0.45896388347211, "grad_norm": 0.7720552086830139, "learning_rate": 8.835523040174065e-05, "loss": 0.3262, "step": 123800 }, { "epoch": 0.4593346135880002, "grad_norm": 1.0552111864089966, "learning_rate": 8.833635583418599e-05, "loss": 0.3597, "step": 123900 }, { "epoch": 0.45970534370389043, "grad_norm": 0.2529876232147217, "learning_rate": 8.831746800196577e-05, "loss": 0.3335, "step": 124000 }, { "epoch": 0.46007607381978066, "grad_norm": 1.2079278230667114, "learning_rate": 8.829856691161535e-05, "loss": 0.3421, "step": 124100 }, { "epoch": 0.4604468039356709, "grad_norm": 0.7541272640228271, "learning_rate": 8.827965256967463e-05, "loss": 0.3158, "step": 124200 }, { "epoch": 0.4608175340515611, "grad_norm": 0.3348942697048187, "learning_rate": 8.826072498268813e-05, "loss": 0.308, "step": 124300 }, { "epoch": 0.4611882641674514, "grad_norm": 1.0587422847747803, "learning_rate": 8.824178415720492e-05, "loss": 0.34, "step": 124400 }, { "epoch": 0.46155899428334163, "grad_norm": 0.5045888423919678, "learning_rate": 8.822283009977869e-05, "loss": 0.3005, "step": 124500 }, { "epoch": 0.46192972439923186, "grad_norm": 0.6448861360549927, "learning_rate": 8.820386281696766e-05, "loss": 0.3257, "step": 124600 }, { "epoch": 0.4623004545151221, "grad_norm": 0.3537586033344269, "learning_rate": 8.818488231533467e-05, "loss": 0.3433, "step": 124700 }, { "epoch": 0.4626711846310123, "grad_norm": 0.6741796731948853, "learning_rate": 8.816588860144711e-05, "loss": 0.3324, "step": 124800 }, { "epoch": 0.46304191474690254, "grad_norm": 0.41582441329956055, "learning_rate": 8.814688168187694e-05, "loss": 0.3516, "step": 124900 }, { "epoch": 0.46341264486279277, "grad_norm": 0.5512292385101318, "learning_rate": 8.812786156320071e-05, "loss": 0.3223, "step": 125000 }, { "epoch": 0.463783374978683, "grad_norm": 0.7549867033958435, "learning_rate": 8.810882825199951e-05, "loss": 0.3522, "step": 125100 }, { "epoch": 0.4641541050945733, "grad_norm": 0.6088722944259644, "learning_rate": 8.808978175485901e-05, "loss": 0.3685, "step": 125200 }, { "epoch": 0.4645248352104635, "grad_norm": 0.5127632021903992, "learning_rate": 8.807072207836944e-05, "loss": 0.3336, "step": 125300 }, { "epoch": 0.46489556532635373, "grad_norm": 0.3300882577896118, "learning_rate": 8.805164922912561e-05, "loss": 0.3191, "step": 125400 }, { "epoch": 0.46526629544224396, "grad_norm": 0.9587972164154053, "learning_rate": 8.803256321372684e-05, "loss": 0.335, "step": 125500 }, { "epoch": 0.4656370255581342, "grad_norm": 0.6120812892913818, "learning_rate": 8.801346403877706e-05, "loss": 0.332, "step": 125600 }, { "epoch": 0.4660077556740244, "grad_norm": 0.33075910806655884, "learning_rate": 8.79943517108847e-05, "loss": 0.3407, "step": 125700 }, { "epoch": 0.46637848578991464, "grad_norm": 0.19530253112316132, "learning_rate": 8.797522623666278e-05, "loss": 0.3193, "step": 125800 }, { "epoch": 0.46674921590580487, "grad_norm": 1.040759563446045, "learning_rate": 8.795608762272887e-05, "loss": 0.3444, "step": 125900 }, { "epoch": 0.46711994602169515, "grad_norm": 0.9512879252433777, "learning_rate": 8.793693587570505e-05, "loss": 0.3091, "step": 126000 }, { "epoch": 0.4674906761375854, "grad_norm": 0.2732359766960144, "learning_rate": 8.791777100221798e-05, "loss": 0.3107, "step": 126100 }, { "epoch": 0.4678614062534756, "grad_norm": 0.6678459048271179, "learning_rate": 8.789859300889885e-05, "loss": 0.3468, "step": 126200 }, { "epoch": 0.46823213636936584, "grad_norm": 0.8075119853019714, "learning_rate": 8.787940190238339e-05, "loss": 0.3248, "step": 126300 }, { "epoch": 0.46860286648525606, "grad_norm": 0.7504758238792419, "learning_rate": 8.786019768931188e-05, "loss": 0.3545, "step": 126400 }, { "epoch": 0.4689735966011463, "grad_norm": 0.46952739357948303, "learning_rate": 8.784098037632908e-05, "loss": 0.3333, "step": 126500 }, { "epoch": 0.4693443267170365, "grad_norm": 0.6172811985015869, "learning_rate": 8.782174997008436e-05, "loss": 0.3559, "step": 126600 }, { "epoch": 0.46971505683292675, "grad_norm": 0.5846688151359558, "learning_rate": 8.780250647723158e-05, "loss": 0.3324, "step": 126700 }, { "epoch": 0.470085786948817, "grad_norm": 1.0070252418518066, "learning_rate": 8.778324990442911e-05, "loss": 0.3234, "step": 126800 }, { "epoch": 0.47045651706470726, "grad_norm": 1.01193106174469, "learning_rate": 8.776398025833989e-05, "loss": 0.3506, "step": 126900 }, { "epoch": 0.4708272471805975, "grad_norm": 0.423578143119812, "learning_rate": 8.774469754563137e-05, "loss": 0.3493, "step": 127000 }, { "epoch": 0.4711979772964877, "grad_norm": 0.4373902678489685, "learning_rate": 8.772540177297548e-05, "loss": 0.3112, "step": 127100 }, { "epoch": 0.47156870741237794, "grad_norm": 0.6626511812210083, "learning_rate": 8.770609294704873e-05, "loss": 0.3474, "step": 127200 }, { "epoch": 0.47193943752826817, "grad_norm": 0.47178223729133606, "learning_rate": 8.768677107453209e-05, "loss": 0.3287, "step": 127300 }, { "epoch": 0.4723101676441584, "grad_norm": 0.7957141995429993, "learning_rate": 8.766743616211111e-05, "loss": 0.3201, "step": 127400 }, { "epoch": 0.4726808977600486, "grad_norm": 0.8285620808601379, "learning_rate": 8.764808821647579e-05, "loss": 0.3262, "step": 127500 }, { "epoch": 0.47305162787593885, "grad_norm": 0.8475161790847778, "learning_rate": 8.762872724432067e-05, "loss": 0.3422, "step": 127600 }, { "epoch": 0.47342235799182913, "grad_norm": 1.0094292163848877, "learning_rate": 8.760935325234477e-05, "loss": 0.3317, "step": 127700 }, { "epoch": 0.47379308810771936, "grad_norm": 0.9778109192848206, "learning_rate": 8.758996624725166e-05, "loss": 0.3392, "step": 127800 }, { "epoch": 0.4741638182236096, "grad_norm": 1.0625665187835693, "learning_rate": 8.75705662357494e-05, "loss": 0.3624, "step": 127900 }, { "epoch": 0.4745345483394998, "grad_norm": 0.741449773311615, "learning_rate": 8.755115322455052e-05, "loss": 0.3139, "step": 128000 }, { "epoch": 0.47490527845539005, "grad_norm": 0.6159365773200989, "learning_rate": 8.753172722037206e-05, "loss": 0.3239, "step": 128100 }, { "epoch": 0.4752760085712803, "grad_norm": 1.007365345954895, "learning_rate": 8.751228822993556e-05, "loss": 0.3369, "step": 128200 }, { "epoch": 0.4756467386871705, "grad_norm": 1.4690980911254883, "learning_rate": 8.74928362599671e-05, "loss": 0.3125, "step": 128300 }, { "epoch": 0.47601746880306073, "grad_norm": 2.602311134338379, "learning_rate": 8.747337131719714e-05, "loss": 0.2958, "step": 128400 }, { "epoch": 0.47638819891895096, "grad_norm": 0.47696006298065186, "learning_rate": 8.745389340836075e-05, "loss": 0.3353, "step": 128500 }, { "epoch": 0.47675892903484124, "grad_norm": 0.8578053712844849, "learning_rate": 8.743440254019741e-05, "loss": 0.3155, "step": 128600 }, { "epoch": 0.47712965915073147, "grad_norm": 0.7569879293441772, "learning_rate": 8.741489871945111e-05, "loss": 0.3429, "step": 128700 }, { "epoch": 0.4775003892666217, "grad_norm": 0.48034045100212097, "learning_rate": 8.739538195287032e-05, "loss": 0.3205, "step": 128800 }, { "epoch": 0.4778711193825119, "grad_norm": 0.528186023235321, "learning_rate": 8.737585224720797e-05, "loss": 0.3339, "step": 128900 }, { "epoch": 0.47824184949840215, "grad_norm": 0.7839371562004089, "learning_rate": 8.73563096092215e-05, "loss": 0.3343, "step": 129000 }, { "epoch": 0.4786125796142924, "grad_norm": 0.5103738307952881, "learning_rate": 8.73367540456728e-05, "loss": 0.3101, "step": 129100 }, { "epoch": 0.4789833097301826, "grad_norm": 0.5065139532089233, "learning_rate": 8.731718556332826e-05, "loss": 0.34, "step": 129200 }, { "epoch": 0.47935403984607283, "grad_norm": 1.1691548824310303, "learning_rate": 8.72976041689587e-05, "loss": 0.3354, "step": 129300 }, { "epoch": 0.4797247699619631, "grad_norm": 1.1157091856002808, "learning_rate": 8.727800986933945e-05, "loss": 0.3439, "step": 129400 }, { "epoch": 0.48009550007785334, "grad_norm": 0.515683650970459, "learning_rate": 8.725840267125025e-05, "loss": 0.3116, "step": 129500 }, { "epoch": 0.48046623019374357, "grad_norm": 0.291209876537323, "learning_rate": 8.723878258147538e-05, "loss": 0.3399, "step": 129600 }, { "epoch": 0.4808369603096338, "grad_norm": 1.0062898397445679, "learning_rate": 8.721914960680349e-05, "loss": 0.3685, "step": 129700 }, { "epoch": 0.481207690425524, "grad_norm": 0.3446211814880371, "learning_rate": 8.719950375402778e-05, "loss": 0.3476, "step": 129800 }, { "epoch": 0.48157842054141425, "grad_norm": 0.562185525894165, "learning_rate": 8.717984502994582e-05, "loss": 0.3388, "step": 129900 }, { "epoch": 0.4819491506573045, "grad_norm": 0.6113969087600708, "learning_rate": 8.716017344135973e-05, "loss": 0.3146, "step": 130000 }, { "epoch": 0.4823198807731947, "grad_norm": 0.9625849723815918, "learning_rate": 8.714048899507598e-05, "loss": 0.3441, "step": 130100 }, { "epoch": 0.48269061088908494, "grad_norm": 0.4304167926311493, "learning_rate": 8.712079169790555e-05, "loss": 0.3379, "step": 130200 }, { "epoch": 0.4830613410049752, "grad_norm": 0.5222867131233215, "learning_rate": 8.710108155666385e-05, "loss": 0.3675, "step": 130300 }, { "epoch": 0.48343207112086545, "grad_norm": 0.7842108011245728, "learning_rate": 8.708135857817075e-05, "loss": 0.3191, "step": 130400 }, { "epoch": 0.4838028012367557, "grad_norm": 0.5013927221298218, "learning_rate": 8.706162276925054e-05, "loss": 0.315, "step": 130500 }, { "epoch": 0.4841735313526459, "grad_norm": 0.812247097492218, "learning_rate": 8.704187413673192e-05, "loss": 0.3538, "step": 130600 }, { "epoch": 0.48454426146853613, "grad_norm": 0.9885092377662659, "learning_rate": 8.702211268744813e-05, "loss": 0.3116, "step": 130700 }, { "epoch": 0.48491499158442636, "grad_norm": 0.24859827756881714, "learning_rate": 8.700233842823675e-05, "loss": 0.3321, "step": 130800 }, { "epoch": 0.4852857217003166, "grad_norm": 1.2315515279769897, "learning_rate": 8.69825513659398e-05, "loss": 0.3591, "step": 130900 }, { "epoch": 0.4856564518162068, "grad_norm": 0.8179681897163391, "learning_rate": 8.696275150740378e-05, "loss": 0.3212, "step": 131000 }, { "epoch": 0.4860271819320971, "grad_norm": 1.2286796569824219, "learning_rate": 8.694293885947959e-05, "loss": 0.3243, "step": 131100 }, { "epoch": 0.4863979120479873, "grad_norm": 0.6638644933700562, "learning_rate": 8.692311342902253e-05, "loss": 0.3473, "step": 131200 }, { "epoch": 0.48676864216387755, "grad_norm": 0.5278761982917786, "learning_rate": 8.690327522289236e-05, "loss": 0.3416, "step": 131300 }, { "epoch": 0.4871393722797678, "grad_norm": 1.0583370923995972, "learning_rate": 8.688342424795324e-05, "loss": 0.3274, "step": 131400 }, { "epoch": 0.487510102395658, "grad_norm": 0.6022370457649231, "learning_rate": 8.686356051107378e-05, "loss": 0.3224, "step": 131500 }, { "epoch": 0.48788083251154823, "grad_norm": 0.506994903087616, "learning_rate": 8.684368401912697e-05, "loss": 0.3167, "step": 131600 }, { "epoch": 0.48825156262743846, "grad_norm": 1.0919255018234253, "learning_rate": 8.682379477899021e-05, "loss": 0.2905, "step": 131700 }, { "epoch": 0.4886222927433287, "grad_norm": 0.9689866900444031, "learning_rate": 8.680389279754533e-05, "loss": 0.336, "step": 131800 }, { "epoch": 0.488993022859219, "grad_norm": 0.4358638525009155, "learning_rate": 8.678397808167857e-05, "loss": 0.3383, "step": 131900 }, { "epoch": 0.4893637529751092, "grad_norm": 0.3884432017803192, "learning_rate": 8.676405063828057e-05, "loss": 0.3428, "step": 132000 }, { "epoch": 0.48973448309099943, "grad_norm": 0.6578744649887085, "learning_rate": 8.674411047424639e-05, "loss": 0.327, "step": 132100 }, { "epoch": 0.49010521320688966, "grad_norm": 0.6814656257629395, "learning_rate": 8.672415759647545e-05, "loss": 0.3256, "step": 132200 }, { "epoch": 0.4904759433227799, "grad_norm": 1.2418067455291748, "learning_rate": 8.67041920118716e-05, "loss": 0.3644, "step": 132300 }, { "epoch": 0.4908466734386701, "grad_norm": 0.5686973333358765, "learning_rate": 8.66842137273431e-05, "loss": 0.3248, "step": 132400 }, { "epoch": 0.49121740355456034, "grad_norm": 0.6391801834106445, "learning_rate": 8.666422274980255e-05, "loss": 0.3248, "step": 132500 }, { "epoch": 0.49158813367045057, "grad_norm": 0.4619053900241852, "learning_rate": 8.664421908616702e-05, "loss": 0.3447, "step": 132600 }, { "epoch": 0.4919588637863408, "grad_norm": 0.4215847849845886, "learning_rate": 8.66242027433579e-05, "loss": 0.3496, "step": 132700 }, { "epoch": 0.4923295939022311, "grad_norm": 0.46924135088920593, "learning_rate": 8.6604173728301e-05, "loss": 0.3499, "step": 132800 }, { "epoch": 0.4927003240181213, "grad_norm": 1.0738557577133179, "learning_rate": 8.658413204792653e-05, "loss": 0.3442, "step": 132900 }, { "epoch": 0.49307105413401153, "grad_norm": 1.0488382577896118, "learning_rate": 8.656407770916901e-05, "loss": 0.3489, "step": 133000 }, { "epoch": 0.49344178424990176, "grad_norm": 0.8059034943580627, "learning_rate": 8.654401071896742e-05, "loss": 0.3487, "step": 133100 }, { "epoch": 0.493812514365792, "grad_norm": 0.6818497776985168, "learning_rate": 8.652393108426512e-05, "loss": 0.3284, "step": 133200 }, { "epoch": 0.4941832444816822, "grad_norm": 0.6463399529457092, "learning_rate": 8.650383881200977e-05, "loss": 0.359, "step": 133300 }, { "epoch": 0.49455397459757244, "grad_norm": 0.40140455961227417, "learning_rate": 8.648373390915347e-05, "loss": 0.3508, "step": 133400 }, { "epoch": 0.49492470471346267, "grad_norm": 1.282117486000061, "learning_rate": 8.646361638265263e-05, "loss": 0.345, "step": 133500 }, { "epoch": 0.49529543482935295, "grad_norm": 0.7285473942756653, "learning_rate": 8.644348623946812e-05, "loss": 0.318, "step": 133600 }, { "epoch": 0.4956661649452432, "grad_norm": 0.6820040345191956, "learning_rate": 8.642334348656507e-05, "loss": 0.3422, "step": 133700 }, { "epoch": 0.4960368950611334, "grad_norm": 0.6920766234397888, "learning_rate": 8.640318813091308e-05, "loss": 0.3447, "step": 133800 }, { "epoch": 0.49640762517702364, "grad_norm": 1.0044747591018677, "learning_rate": 8.638302017948599e-05, "loss": 0.3222, "step": 133900 }, { "epoch": 0.49677835529291386, "grad_norm": 0.9380411505699158, "learning_rate": 8.636283963926209e-05, "loss": 0.3562, "step": 134000 }, { "epoch": 0.4971490854088041, "grad_norm": 0.6474587321281433, "learning_rate": 8.634264651722399e-05, "loss": 0.3119, "step": 134100 }, { "epoch": 0.4975198155246943, "grad_norm": 0.5671485662460327, "learning_rate": 8.63224408203587e-05, "loss": 0.3671, "step": 134200 }, { "epoch": 0.49789054564058455, "grad_norm": 0.6687446236610413, "learning_rate": 8.630222255565747e-05, "loss": 0.3529, "step": 134300 }, { "epoch": 0.4982612757564748, "grad_norm": 0.3487033545970917, "learning_rate": 8.628199173011603e-05, "loss": 0.3287, "step": 134400 }, { "epoch": 0.49863200587236506, "grad_norm": 0.358464777469635, "learning_rate": 8.62617483507344e-05, "loss": 0.3286, "step": 134500 }, { "epoch": 0.4990027359882553, "grad_norm": 0.24219462275505066, "learning_rate": 8.62414924245169e-05, "loss": 0.3429, "step": 134600 }, { "epoch": 0.4993734661041455, "grad_norm": 0.6188703775405884, "learning_rate": 8.622122395847225e-05, "loss": 0.3429, "step": 134700 }, { "epoch": 0.49974419622003574, "grad_norm": 0.4733023941516876, "learning_rate": 8.620094295961349e-05, "loss": 0.3141, "step": 134800 }, { "epoch": 0.500114926335926, "grad_norm": 0.4413136839866638, "learning_rate": 8.6180649434958e-05, "loss": 0.3258, "step": 134900 }, { "epoch": 0.5004856564518162, "grad_norm": 0.9144728779792786, "learning_rate": 8.61603433915275e-05, "loss": 0.3434, "step": 135000 }, { "epoch": 0.5008563865677065, "grad_norm": 0.5987548828125, "learning_rate": 8.614002483634801e-05, "loss": 0.3189, "step": 135100 }, { "epoch": 0.5012271166835967, "grad_norm": 0.8051921725273132, "learning_rate": 8.611969377644991e-05, "loss": 0.3339, "step": 135200 }, { "epoch": 0.5015978467994869, "grad_norm": 0.32952263951301575, "learning_rate": 8.609935021886792e-05, "loss": 0.3254, "step": 135300 }, { "epoch": 0.5019685769153771, "grad_norm": 1.0524775981903076, "learning_rate": 8.607899417064103e-05, "loss": 0.3349, "step": 135400 }, { "epoch": 0.5023393070312674, "grad_norm": 0.5578548908233643, "learning_rate": 8.605862563881262e-05, "loss": 0.3305, "step": 135500 }, { "epoch": 0.5027100371471576, "grad_norm": 0.3317144215106964, "learning_rate": 8.603824463043031e-05, "loss": 0.3178, "step": 135600 }, { "epoch": 0.5030807672630478, "grad_norm": 1.4792650938034058, "learning_rate": 8.601785115254612e-05, "loss": 0.3731, "step": 135700 }, { "epoch": 0.5034514973789381, "grad_norm": 0.766248345375061, "learning_rate": 8.599744521221633e-05, "loss": 0.3208, "step": 135800 }, { "epoch": 0.5038222274948283, "grad_norm": 0.6224705576896667, "learning_rate": 8.597702681650155e-05, "loss": 0.3812, "step": 135900 }, { "epoch": 0.5041929576107186, "grad_norm": 0.6708068251609802, "learning_rate": 8.59565959724667e-05, "loss": 0.3686, "step": 136000 }, { "epoch": 0.5045636877266088, "grad_norm": 0.7637072801589966, "learning_rate": 8.5936152687181e-05, "loss": 0.3148, "step": 136100 }, { "epoch": 0.504934417842499, "grad_norm": 0.8754328489303589, "learning_rate": 8.591569696771798e-05, "loss": 0.3097, "step": 136200 }, { "epoch": 0.5053051479583892, "grad_norm": 0.2624710202217102, "learning_rate": 8.589522882115547e-05, "loss": 0.3227, "step": 136300 }, { "epoch": 0.5056758780742795, "grad_norm": 0.6375905871391296, "learning_rate": 8.587474825457561e-05, "loss": 0.3233, "step": 136400 }, { "epoch": 0.5060466081901697, "grad_norm": 0.4594990313053131, "learning_rate": 8.585425527506482e-05, "loss": 0.2944, "step": 136500 }, { "epoch": 0.50641733830606, "grad_norm": 1.2614006996154785, "learning_rate": 8.583374988971383e-05, "loss": 0.3141, "step": 136600 }, { "epoch": 0.5067880684219502, "grad_norm": 0.9044752717018127, "learning_rate": 8.581323210561764e-05, "loss": 0.3509, "step": 136700 }, { "epoch": 0.5071587985378404, "grad_norm": 0.5360621213912964, "learning_rate": 8.57927019298756e-05, "loss": 0.3144, "step": 136800 }, { "epoch": 0.5075295286537307, "grad_norm": 0.5477878451347351, "learning_rate": 8.577215936959126e-05, "loss": 0.3576, "step": 136900 }, { "epoch": 0.5079002587696209, "grad_norm": 0.5886459350585938, "learning_rate": 8.575160443187251e-05, "loss": 0.3112, "step": 137000 }, { "epoch": 0.5082709888855111, "grad_norm": 0.7082052826881409, "learning_rate": 8.573103712383151e-05, "loss": 0.34, "step": 137100 }, { "epoch": 0.5086417190014013, "grad_norm": 0.8314849734306335, "learning_rate": 8.571045745258473e-05, "loss": 0.3506, "step": 137200 }, { "epoch": 0.5090124491172916, "grad_norm": 0.42521926760673523, "learning_rate": 8.568986542525285e-05, "loss": 0.3587, "step": 137300 }, { "epoch": 0.5093831792331819, "grad_norm": 0.602470874786377, "learning_rate": 8.566926104896089e-05, "loss": 0.3239, "step": 137400 }, { "epoch": 0.509753909349072, "grad_norm": 1.305550217628479, "learning_rate": 8.56486443308381e-05, "loss": 0.3124, "step": 137500 }, { "epoch": 0.5101246394649623, "grad_norm": 0.6297417879104614, "learning_rate": 8.562801527801804e-05, "loss": 0.323, "step": 137600 }, { "epoch": 0.5104953695808525, "grad_norm": 1.1771701574325562, "learning_rate": 8.560737389763853e-05, "loss": 0.3133, "step": 137700 }, { "epoch": 0.5108660996967428, "grad_norm": 0.35580500960350037, "learning_rate": 8.558672019684158e-05, "loss": 0.32, "step": 137800 }, { "epoch": 0.511236829812633, "grad_norm": 1.0580075979232788, "learning_rate": 8.55660541827736e-05, "loss": 0.303, "step": 137900 }, { "epoch": 0.5116075599285232, "grad_norm": 0.8973690867424011, "learning_rate": 8.554537586258512e-05, "loss": 0.3276, "step": 138000 }, { "epoch": 0.5119782900444134, "grad_norm": 0.5439248085021973, "learning_rate": 8.552468524343102e-05, "loss": 0.3252, "step": 138100 }, { "epoch": 0.5123490201603037, "grad_norm": 0.37419673800468445, "learning_rate": 8.550398233247044e-05, "loss": 0.3177, "step": 138200 }, { "epoch": 0.512719750276194, "grad_norm": 0.71340411901474, "learning_rate": 8.548326713686669e-05, "loss": 0.3128, "step": 138300 }, { "epoch": 0.5130904803920842, "grad_norm": 0.8240370154380798, "learning_rate": 8.546253966378742e-05, "loss": 0.3336, "step": 138400 }, { "epoch": 0.5134612105079744, "grad_norm": 0.8755084872245789, "learning_rate": 8.544179992040446e-05, "loss": 0.3207, "step": 138500 }, { "epoch": 0.5138319406238646, "grad_norm": 0.2523043155670166, "learning_rate": 8.542104791389394e-05, "loss": 0.3314, "step": 138600 }, { "epoch": 0.5142026707397549, "grad_norm": 1.0992518663406372, "learning_rate": 8.540028365143619e-05, "loss": 0.3328, "step": 138700 }, { "epoch": 0.5145734008556451, "grad_norm": 0.7551282048225403, "learning_rate": 8.537950714021583e-05, "loss": 0.3136, "step": 138800 }, { "epoch": 0.5149441309715354, "grad_norm": 0.32728463411331177, "learning_rate": 8.535871838742166e-05, "loss": 0.3683, "step": 138900 }, { "epoch": 0.5153148610874255, "grad_norm": 0.8188005685806274, "learning_rate": 8.533791740024675e-05, "loss": 0.3316, "step": 139000 }, { "epoch": 0.5156855912033158, "grad_norm": 0.998979926109314, "learning_rate": 8.53171041858884e-05, "loss": 0.2936, "step": 139100 }, { "epoch": 0.5160563213192061, "grad_norm": 0.6150734424591064, "learning_rate": 8.529627875154815e-05, "loss": 0.336, "step": 139200 }, { "epoch": 0.5164270514350963, "grad_norm": 0.14756588637828827, "learning_rate": 8.527544110443173e-05, "loss": 0.3247, "step": 139300 }, { "epoch": 0.5167977815509865, "grad_norm": 0.6253288388252258, "learning_rate": 8.525459125174913e-05, "loss": 0.353, "step": 139400 }, { "epoch": 0.5171685116668767, "grad_norm": 1.0562703609466553, "learning_rate": 8.523372920071457e-05, "loss": 0.3548, "step": 139500 }, { "epoch": 0.517539241782767, "grad_norm": 0.6198911666870117, "learning_rate": 8.521285495854647e-05, "loss": 0.3105, "step": 139600 }, { "epoch": 0.5179099718986572, "grad_norm": 0.37351205945014954, "learning_rate": 8.519196853246746e-05, "loss": 0.3137, "step": 139700 }, { "epoch": 0.5182807020145475, "grad_norm": 0.3834395706653595, "learning_rate": 8.517106992970442e-05, "loss": 0.3281, "step": 139800 }, { "epoch": 0.5186514321304376, "grad_norm": 0.9661216735839844, "learning_rate": 8.515015915748841e-05, "loss": 0.3526, "step": 139900 }, { "epoch": 0.5190221622463279, "grad_norm": 0.6062168478965759, "learning_rate": 8.51292362230547e-05, "loss": 0.3536, "step": 140000 }, { "epoch": 0.5193928923622182, "grad_norm": 0.34353721141815186, "learning_rate": 8.51083011336428e-05, "loss": 0.3233, "step": 140100 }, { "epoch": 0.5197636224781084, "grad_norm": 0.6811355948448181, "learning_rate": 8.508735389649642e-05, "loss": 0.325, "step": 140200 }, { "epoch": 0.5201343525939987, "grad_norm": 0.26427844166755676, "learning_rate": 8.506639451886342e-05, "loss": 0.3019, "step": 140300 }, { "epoch": 0.5205050827098888, "grad_norm": 0.6732288002967834, "learning_rate": 8.504542300799594e-05, "loss": 0.3217, "step": 140400 }, { "epoch": 0.5208758128257791, "grad_norm": 0.4661412835121155, "learning_rate": 8.502443937115028e-05, "loss": 0.3247, "step": 140500 }, { "epoch": 0.5212465429416693, "grad_norm": 0.536270797252655, "learning_rate": 8.500344361558689e-05, "loss": 0.337, "step": 140600 }, { "epoch": 0.5216172730575596, "grad_norm": 0.43403246998786926, "learning_rate": 8.49824357485705e-05, "loss": 0.346, "step": 140700 }, { "epoch": 0.5219880031734498, "grad_norm": 0.8519464135169983, "learning_rate": 8.496141577736997e-05, "loss": 0.2833, "step": 140800 }, { "epoch": 0.52235873328934, "grad_norm": 0.8312713503837585, "learning_rate": 8.494038370925841e-05, "loss": 0.3527, "step": 140900 }, { "epoch": 0.5227294634052303, "grad_norm": 0.8901540637016296, "learning_rate": 8.491933955151301e-05, "loss": 0.3281, "step": 141000 }, { "epoch": 0.5231001935211205, "grad_norm": 0.48609161376953125, "learning_rate": 8.489828331141523e-05, "loss": 0.3159, "step": 141100 }, { "epoch": 0.5234709236370108, "grad_norm": 0.31898683309555054, "learning_rate": 8.487721499625068e-05, "loss": 0.3271, "step": 141200 }, { "epoch": 0.5238416537529009, "grad_norm": 0.7508733868598938, "learning_rate": 8.485613461330919e-05, "loss": 0.3241, "step": 141300 }, { "epoch": 0.5242123838687912, "grad_norm": 0.6590599417686462, "learning_rate": 8.483504216988469e-05, "loss": 0.3465, "step": 141400 }, { "epoch": 0.5245831139846814, "grad_norm": 0.6584334373474121, "learning_rate": 8.481393767327535e-05, "loss": 0.3471, "step": 141500 }, { "epoch": 0.5249538441005717, "grad_norm": 0.5732010006904602, "learning_rate": 8.479282113078344e-05, "loss": 0.333, "step": 141600 }, { "epoch": 0.525324574216462, "grad_norm": 0.3301706612110138, "learning_rate": 8.477169254971551e-05, "loss": 0.3293, "step": 141700 }, { "epoch": 0.5256953043323521, "grad_norm": 0.9054283499717712, "learning_rate": 8.475055193738214e-05, "loss": 0.313, "step": 141800 }, { "epoch": 0.5260660344482424, "grad_norm": 0.2827327251434326, "learning_rate": 8.472939930109818e-05, "loss": 0.3263, "step": 141900 }, { "epoch": 0.5264367645641326, "grad_norm": 1.334529995918274, "learning_rate": 8.470823464818259e-05, "loss": 0.3287, "step": 142000 }, { "epoch": 0.5268074946800229, "grad_norm": 1.2268656492233276, "learning_rate": 8.468705798595848e-05, "loss": 0.3076, "step": 142100 }, { "epoch": 0.527178224795913, "grad_norm": 1.1989068984985352, "learning_rate": 8.466586932175318e-05, "loss": 0.3082, "step": 142200 }, { "epoch": 0.5275489549118033, "grad_norm": 0.7488361597061157, "learning_rate": 8.464466866289807e-05, "loss": 0.3287, "step": 142300 }, { "epoch": 0.5279196850276935, "grad_norm": 0.6912151575088501, "learning_rate": 8.462345601672877e-05, "loss": 0.3571, "step": 142400 }, { "epoch": 0.5282904151435838, "grad_norm": 0.537708044052124, "learning_rate": 8.460223139058501e-05, "loss": 0.3164, "step": 142500 }, { "epoch": 0.528661145259474, "grad_norm": 1.3145033121109009, "learning_rate": 8.458099479181066e-05, "loss": 0.3084, "step": 142600 }, { "epoch": 0.5290318753753642, "grad_norm": 0.9997704029083252, "learning_rate": 8.455974622775376e-05, "loss": 0.3237, "step": 142700 }, { "epoch": 0.5294026054912545, "grad_norm": 0.6095898151397705, "learning_rate": 8.453848570576646e-05, "loss": 0.3346, "step": 142800 }, { "epoch": 0.5297733356071447, "grad_norm": 0.8238364458084106, "learning_rate": 8.451721323320504e-05, "loss": 0.3274, "step": 142900 }, { "epoch": 0.530144065723035, "grad_norm": 0.48383551836013794, "learning_rate": 8.449592881742996e-05, "loss": 0.3221, "step": 143000 }, { "epoch": 0.5305147958389251, "grad_norm": 0.6795052886009216, "learning_rate": 8.447463246580578e-05, "loss": 0.333, "step": 143100 }, { "epoch": 0.5308855259548154, "grad_norm": 0.4301062226295471, "learning_rate": 8.445332418570122e-05, "loss": 0.332, "step": 143200 }, { "epoch": 0.5312562560707057, "grad_norm": 0.5889010429382324, "learning_rate": 8.443200398448906e-05, "loss": 0.303, "step": 143300 }, { "epoch": 0.5316269861865959, "grad_norm": 0.314588338136673, "learning_rate": 8.441067186954626e-05, "loss": 0.3523, "step": 143400 }, { "epoch": 0.5319977163024862, "grad_norm": 0.656849205493927, "learning_rate": 8.438932784825392e-05, "loss": 0.3167, "step": 143500 }, { "epoch": 0.5323684464183763, "grad_norm": 0.6773498058319092, "learning_rate": 8.436797192799722e-05, "loss": 0.34, "step": 143600 }, { "epoch": 0.5327391765342666, "grad_norm": 0.9157997369766235, "learning_rate": 8.434660411616544e-05, "loss": 0.3574, "step": 143700 }, { "epoch": 0.5331099066501568, "grad_norm": 1.2048577070236206, "learning_rate": 8.432522442015202e-05, "loss": 0.3394, "step": 143800 }, { "epoch": 0.5334806367660471, "grad_norm": 0.2848084568977356, "learning_rate": 8.430383284735451e-05, "loss": 0.3125, "step": 143900 }, { "epoch": 0.5338513668819372, "grad_norm": 0.7055698037147522, "learning_rate": 8.428242940517455e-05, "loss": 0.3094, "step": 144000 }, { "epoch": 0.5342220969978275, "grad_norm": 0.6577404737472534, "learning_rate": 8.426101410101786e-05, "loss": 0.2965, "step": 144100 }, { "epoch": 0.5345928271137178, "grad_norm": 0.2808060646057129, "learning_rate": 8.423958694229432e-05, "loss": 0.341, "step": 144200 }, { "epoch": 0.534963557229608, "grad_norm": 0.33684754371643066, "learning_rate": 8.42181479364179e-05, "loss": 0.307, "step": 144300 }, { "epoch": 0.5353342873454983, "grad_norm": 0.5107066035270691, "learning_rate": 8.419669709080665e-05, "loss": 0.3103, "step": 144400 }, { "epoch": 0.5357050174613884, "grad_norm": 0.6448732018470764, "learning_rate": 8.417523441288269e-05, "loss": 0.3614, "step": 144500 }, { "epoch": 0.5360757475772787, "grad_norm": 0.47724467515945435, "learning_rate": 8.415375991007228e-05, "loss": 0.32, "step": 144600 }, { "epoch": 0.5364464776931689, "grad_norm": 0.48915883898735046, "learning_rate": 8.41322735898058e-05, "loss": 0.3518, "step": 144700 }, { "epoch": 0.5368172078090592, "grad_norm": 0.7778482437133789, "learning_rate": 8.411077545951764e-05, "loss": 0.3129, "step": 144800 }, { "epoch": 0.5371879379249493, "grad_norm": 0.8147164583206177, "learning_rate": 8.40892655266463e-05, "loss": 0.3223, "step": 144900 }, { "epoch": 0.5375586680408396, "grad_norm": 0.5587433576583862, "learning_rate": 8.40677437986344e-05, "loss": 0.3412, "step": 145000 }, { "epoch": 0.5379293981567299, "grad_norm": 0.5495920777320862, "learning_rate": 8.404621028292863e-05, "loss": 0.3613, "step": 145100 }, { "epoch": 0.5383001282726201, "grad_norm": 0.36760666966438293, "learning_rate": 8.402466498697973e-05, "loss": 0.3184, "step": 145200 }, { "epoch": 0.5386708583885104, "grad_norm": 0.6292432546615601, "learning_rate": 8.400310791824253e-05, "loss": 0.3209, "step": 145300 }, { "epoch": 0.5390415885044005, "grad_norm": 0.8243033289909363, "learning_rate": 8.398153908417595e-05, "loss": 0.323, "step": 145400 }, { "epoch": 0.5394123186202908, "grad_norm": 0.6290979385375977, "learning_rate": 8.395995849224296e-05, "loss": 0.3363, "step": 145500 }, { "epoch": 0.539783048736181, "grad_norm": 0.33283117413520813, "learning_rate": 8.393836614991061e-05, "loss": 0.3231, "step": 145600 }, { "epoch": 0.5401537788520713, "grad_norm": 0.711925208568573, "learning_rate": 8.391676206465e-05, "loss": 0.3224, "step": 145700 }, { "epoch": 0.5405245089679616, "grad_norm": 0.7453523874282837, "learning_rate": 8.389514624393632e-05, "loss": 0.311, "step": 145800 }, { "epoch": 0.5408952390838517, "grad_norm": 0.6955090165138245, "learning_rate": 8.387351869524881e-05, "loss": 0.3217, "step": 145900 }, { "epoch": 0.541265969199742, "grad_norm": 0.6250924468040466, "learning_rate": 8.385187942607074e-05, "loss": 0.3524, "step": 146000 }, { "epoch": 0.5416366993156322, "grad_norm": 0.5832008719444275, "learning_rate": 8.383022844388947e-05, "loss": 0.3199, "step": 146100 }, { "epoch": 0.5420074294315225, "grad_norm": 0.4926455020904541, "learning_rate": 8.38085657561964e-05, "loss": 0.3383, "step": 146200 }, { "epoch": 0.5423781595474126, "grad_norm": 0.8114420175552368, "learning_rate": 8.378689137048701e-05, "loss": 0.3342, "step": 146300 }, { "epoch": 0.5427488896633029, "grad_norm": 0.610555112361908, "learning_rate": 8.376520529426075e-05, "loss": 0.3089, "step": 146400 }, { "epoch": 0.5431196197791931, "grad_norm": 0.4345783591270447, "learning_rate": 8.374350753502119e-05, "loss": 0.3126, "step": 146500 }, { "epoch": 0.5434903498950834, "grad_norm": 0.5771620273590088, "learning_rate": 8.372179810027591e-05, "loss": 0.3489, "step": 146600 }, { "epoch": 0.5438610800109737, "grad_norm": 0.7393174171447754, "learning_rate": 8.370007699753656e-05, "loss": 0.3329, "step": 146700 }, { "epoch": 0.5442318101268638, "grad_norm": 0.8391637802124023, "learning_rate": 8.367834423431878e-05, "loss": 0.2979, "step": 146800 }, { "epoch": 0.5446025402427541, "grad_norm": 0.4920552670955658, "learning_rate": 8.365659981814227e-05, "loss": 0.3148, "step": 146900 }, { "epoch": 0.5449732703586443, "grad_norm": 0.8017041087150574, "learning_rate": 8.36348437565308e-05, "loss": 0.327, "step": 147000 }, { "epoch": 0.5453440004745346, "grad_norm": 0.3872503638267517, "learning_rate": 8.361307605701203e-05, "loss": 0.3157, "step": 147100 }, { "epoch": 0.5457147305904247, "grad_norm": 0.508532702922821, "learning_rate": 8.359129672711785e-05, "loss": 0.3384, "step": 147200 }, { "epoch": 0.546085460706315, "grad_norm": 0.8714615106582642, "learning_rate": 8.356950577438403e-05, "loss": 0.3127, "step": 147300 }, { "epoch": 0.5464561908222052, "grad_norm": 1.0603781938552856, "learning_rate": 8.354770320635041e-05, "loss": 0.3109, "step": 147400 }, { "epoch": 0.5468269209380955, "grad_norm": 0.4521978795528412, "learning_rate": 8.352588903056082e-05, "loss": 0.3726, "step": 147500 }, { "epoch": 0.5471976510539858, "grad_norm": 1.7427945137023926, "learning_rate": 8.350406325456316e-05, "loss": 0.3388, "step": 147600 }, { "epoch": 0.5475683811698759, "grad_norm": 0.5300511121749878, "learning_rate": 8.348222588590929e-05, "loss": 0.3431, "step": 147700 }, { "epoch": 0.5479391112857662, "grad_norm": 0.5015584230422974, "learning_rate": 8.34603769321551e-05, "loss": 0.2951, "step": 147800 }, { "epoch": 0.5483098414016564, "grad_norm": 0.6796565055847168, "learning_rate": 8.34385164008605e-05, "loss": 0.3024, "step": 147900 }, { "epoch": 0.5486805715175467, "grad_norm": 0.7475005984306335, "learning_rate": 8.341664429958942e-05, "loss": 0.3245, "step": 148000 }, { "epoch": 0.5490513016334369, "grad_norm": 0.9571236371994019, "learning_rate": 8.339476063590972e-05, "loss": 0.3273, "step": 148100 }, { "epoch": 0.5494220317493271, "grad_norm": 0.642557680606842, "learning_rate": 8.337286541739335e-05, "loss": 0.3655, "step": 148200 }, { "epoch": 0.5497927618652173, "grad_norm": 0.6258786916732788, "learning_rate": 8.335095865161621e-05, "loss": 0.3223, "step": 148300 }, { "epoch": 0.5501634919811076, "grad_norm": 0.6219207048416138, "learning_rate": 8.33290403461582e-05, "loss": 0.3256, "step": 148400 }, { "epoch": 0.5505342220969979, "grad_norm": 0.4723232090473175, "learning_rate": 8.330711050860322e-05, "loss": 0.3375, "step": 148500 }, { "epoch": 0.550904952212888, "grad_norm": 1.7193678617477417, "learning_rate": 8.328516914653914e-05, "loss": 0.3505, "step": 148600 }, { "epoch": 0.5512756823287783, "grad_norm": 1.053464412689209, "learning_rate": 8.326321626755789e-05, "loss": 0.3123, "step": 148700 }, { "epoch": 0.5516464124446685, "grad_norm": 0.4490877687931061, "learning_rate": 8.324125187925527e-05, "loss": 0.3327, "step": 148800 }, { "epoch": 0.5520171425605588, "grad_norm": 0.537993848323822, "learning_rate": 8.321927598923113e-05, "loss": 0.3099, "step": 148900 }, { "epoch": 0.552387872676449, "grad_norm": 0.8266271352767944, "learning_rate": 8.319728860508933e-05, "loss": 0.3243, "step": 149000 }, { "epoch": 0.5527586027923392, "grad_norm": 1.5167702436447144, "learning_rate": 8.317528973443763e-05, "loss": 0.3329, "step": 149100 }, { "epoch": 0.5531293329082295, "grad_norm": 0.7116677165031433, "learning_rate": 8.315327938488783e-05, "loss": 0.3046, "step": 149200 }, { "epoch": 0.5535000630241197, "grad_norm": 0.34489843249320984, "learning_rate": 8.313125756405567e-05, "loss": 0.3452, "step": 149300 }, { "epoch": 0.55387079314001, "grad_norm": 0.5531739592552185, "learning_rate": 8.310922427956085e-05, "loss": 0.3298, "step": 149400 }, { "epoch": 0.5542415232559001, "grad_norm": 1.1582592725753784, "learning_rate": 8.308717953902708e-05, "loss": 0.3189, "step": 149500 }, { "epoch": 0.5546122533717904, "grad_norm": 0.7119314670562744, "learning_rate": 8.306512335008196e-05, "loss": 0.3502, "step": 149600 }, { "epoch": 0.5549829834876806, "grad_norm": 0.7101984024047852, "learning_rate": 8.304305572035713e-05, "loss": 0.3161, "step": 149700 }, { "epoch": 0.5553537136035709, "grad_norm": 0.5362114906311035, "learning_rate": 8.302097665748816e-05, "loss": 0.3135, "step": 149800 }, { "epoch": 0.5557244437194611, "grad_norm": 0.4047814607620239, "learning_rate": 8.299888616911454e-05, "loss": 0.3338, "step": 149900 }, { "epoch": 0.5560951738353513, "grad_norm": 0.9390084743499756, "learning_rate": 8.297678426287976e-05, "loss": 0.3406, "step": 150000 }, { "epoch": 0.5564659039512416, "grad_norm": 1.0159021615982056, "learning_rate": 8.295467094643127e-05, "loss": 0.2961, "step": 150100 }, { "epoch": 0.5568366340671318, "grad_norm": 0.335974782705307, "learning_rate": 8.293254622742039e-05, "loss": 0.3319, "step": 150200 }, { "epoch": 0.5572073641830221, "grad_norm": 0.5082780718803406, "learning_rate": 8.291041011350249e-05, "loss": 0.3456, "step": 150300 }, { "epoch": 0.5575780942989123, "grad_norm": 0.7200536727905273, "learning_rate": 8.28882626123368e-05, "loss": 0.3001, "step": 150400 }, { "epoch": 0.5579488244148025, "grad_norm": 0.6461792588233948, "learning_rate": 8.286610373158653e-05, "loss": 0.3628, "step": 150500 }, { "epoch": 0.5583195545306927, "grad_norm": 0.8352541327476501, "learning_rate": 8.284393347891882e-05, "loss": 0.3255, "step": 150600 }, { "epoch": 0.558690284646583, "grad_norm": 0.7582707405090332, "learning_rate": 8.282175186200473e-05, "loss": 0.3242, "step": 150700 }, { "epoch": 0.5590610147624732, "grad_norm": 0.8924834132194519, "learning_rate": 8.279955888851928e-05, "loss": 0.3053, "step": 150800 }, { "epoch": 0.5594317448783634, "grad_norm": 0.2911580502986908, "learning_rate": 8.277735456614139e-05, "loss": 0.3198, "step": 150900 }, { "epoch": 0.5598024749942537, "grad_norm": 0.5526257753372192, "learning_rate": 8.275513890255396e-05, "loss": 0.3382, "step": 151000 }, { "epoch": 0.5601732051101439, "grad_norm": 0.7693712115287781, "learning_rate": 8.273291190544371e-05, "loss": 0.3404, "step": 151100 }, { "epoch": 0.5605439352260342, "grad_norm": 0.5775411128997803, "learning_rate": 8.271067358250142e-05, "loss": 0.281, "step": 151200 }, { "epoch": 0.5609146653419244, "grad_norm": 0.45190078020095825, "learning_rate": 8.268842394142165e-05, "loss": 0.3418, "step": 151300 }, { "epoch": 0.5612853954578146, "grad_norm": 0.9048945903778076, "learning_rate": 8.266616298990298e-05, "loss": 0.3393, "step": 151400 }, { "epoch": 0.5616561255737048, "grad_norm": 0.5141841769218445, "learning_rate": 8.264389073564787e-05, "loss": 0.3459, "step": 151500 }, { "epoch": 0.5620268556895951, "grad_norm": 1.1340882778167725, "learning_rate": 8.262160718636267e-05, "loss": 0.3404, "step": 151600 }, { "epoch": 0.5623975858054854, "grad_norm": 1.0512477159500122, "learning_rate": 8.259931234975764e-05, "loss": 0.3482, "step": 151700 }, { "epoch": 0.5627683159213756, "grad_norm": 0.7288060188293457, "learning_rate": 8.257700623354698e-05, "loss": 0.343, "step": 151800 }, { "epoch": 0.5631390460372658, "grad_norm": 0.31551048159599304, "learning_rate": 8.255468884544879e-05, "loss": 0.3406, "step": 151900 }, { "epoch": 0.563509776153156, "grad_norm": 0.6274349689483643, "learning_rate": 8.253236019318503e-05, "loss": 0.2915, "step": 152000 }, { "epoch": 0.5638805062690463, "grad_norm": 0.5241157412528992, "learning_rate": 8.251002028448158e-05, "loss": 0.3318, "step": 152100 }, { "epoch": 0.5642512363849365, "grad_norm": 0.6362565755844116, "learning_rate": 8.248766912706822e-05, "loss": 0.3135, "step": 152200 }, { "epoch": 0.5646219665008267, "grad_norm": 0.6876683831214905, "learning_rate": 8.246530672867862e-05, "loss": 0.3422, "step": 152300 }, { "epoch": 0.5649926966167169, "grad_norm": 0.7570697069168091, "learning_rate": 8.244293309705033e-05, "loss": 0.3441, "step": 152400 }, { "epoch": 0.5653634267326072, "grad_norm": 1.2314379215240479, "learning_rate": 8.242054823992481e-05, "loss": 0.3272, "step": 152500 }, { "epoch": 0.5657341568484975, "grad_norm": 0.684101402759552, "learning_rate": 8.239815216504736e-05, "loss": 0.3355, "step": 152600 }, { "epoch": 0.5661048869643877, "grad_norm": 0.6730090975761414, "learning_rate": 8.237574488016721e-05, "loss": 0.3362, "step": 152700 }, { "epoch": 0.5664756170802779, "grad_norm": 0.616828978061676, "learning_rate": 8.235332639303745e-05, "loss": 0.3754, "step": 152800 }, { "epoch": 0.5668463471961681, "grad_norm": 0.9472378492355347, "learning_rate": 8.233089671141503e-05, "loss": 0.3245, "step": 152900 }, { "epoch": 0.5672170773120584, "grad_norm": 0.5413978695869446, "learning_rate": 8.23084558430608e-05, "loss": 0.3155, "step": 153000 }, { "epoch": 0.5675878074279486, "grad_norm": 0.5516555905342102, "learning_rate": 8.228600379573947e-05, "loss": 0.3205, "step": 153100 }, { "epoch": 0.5679585375438388, "grad_norm": 0.8880279064178467, "learning_rate": 8.22635405772196e-05, "loss": 0.3158, "step": 153200 }, { "epoch": 0.568329267659729, "grad_norm": 0.7535635828971863, "learning_rate": 8.224106619527363e-05, "loss": 0.3004, "step": 153300 }, { "epoch": 0.5686999977756193, "grad_norm": 0.3343527317047119, "learning_rate": 8.22185806576779e-05, "loss": 0.3208, "step": 153400 }, { "epoch": 0.5690707278915096, "grad_norm": 0.7308222055435181, "learning_rate": 8.219608397221252e-05, "loss": 0.3244, "step": 153500 }, { "epoch": 0.5694414580073998, "grad_norm": 1.1357159614562988, "learning_rate": 8.217357614666156e-05, "loss": 0.3247, "step": 153600 }, { "epoch": 0.56981218812329, "grad_norm": 0.4936862289905548, "learning_rate": 8.215105718881287e-05, "loss": 0.329, "step": 153700 }, { "epoch": 0.5701829182391802, "grad_norm": 0.5399175882339478, "learning_rate": 8.212852710645817e-05, "loss": 0.3157, "step": 153800 }, { "epoch": 0.5705536483550705, "grad_norm": 0.7294113039970398, "learning_rate": 8.210598590739307e-05, "loss": 0.3785, "step": 153900 }, { "epoch": 0.5709243784709607, "grad_norm": 0.7251603007316589, "learning_rate": 8.208343359941696e-05, "loss": 0.3072, "step": 154000 }, { "epoch": 0.571295108586851, "grad_norm": 0.2763591706752777, "learning_rate": 8.206087019033313e-05, "loss": 0.3578, "step": 154100 }, { "epoch": 0.5716658387027411, "grad_norm": 0.5613310933113098, "learning_rate": 8.203829568794866e-05, "loss": 0.314, "step": 154200 }, { "epoch": 0.5720365688186314, "grad_norm": 0.45749056339263916, "learning_rate": 8.201571010007453e-05, "loss": 0.3299, "step": 154300 }, { "epoch": 0.5724072989345217, "grad_norm": 0.9635785818099976, "learning_rate": 8.19931134345255e-05, "loss": 0.3544, "step": 154400 }, { "epoch": 0.5727780290504119, "grad_norm": 0.4360646605491638, "learning_rate": 8.197050569912021e-05, "loss": 0.3457, "step": 154500 }, { "epoch": 0.5731487591663021, "grad_norm": 0.5196271538734436, "learning_rate": 8.194788690168106e-05, "loss": 0.306, "step": 154600 }, { "epoch": 0.5735194892821923, "grad_norm": 0.4708426892757416, "learning_rate": 8.192525705003437e-05, "loss": 0.3056, "step": 154700 }, { "epoch": 0.5738902193980826, "grad_norm": 0.3194808065891266, "learning_rate": 8.190261615201021e-05, "loss": 0.3283, "step": 154800 }, { "epoch": 0.5742609495139728, "grad_norm": 2.9564123153686523, "learning_rate": 8.187996421544251e-05, "loss": 0.308, "step": 154900 }, { "epoch": 0.5746316796298631, "grad_norm": 0.8081748485565186, "learning_rate": 8.185730124816901e-05, "loss": 0.3301, "step": 155000 }, { "epoch": 0.5750024097457533, "grad_norm": 0.7777862548828125, "learning_rate": 8.183462725803126e-05, "loss": 0.3367, "step": 155100 }, { "epoch": 0.5753731398616435, "grad_norm": 0.942192554473877, "learning_rate": 8.181194225287464e-05, "loss": 0.3207, "step": 155200 }, { "epoch": 0.5757438699775338, "grad_norm": 0.9899280071258545, "learning_rate": 8.178924624054831e-05, "loss": 0.3325, "step": 155300 }, { "epoch": 0.576114600093424, "grad_norm": 1.1787716150283813, "learning_rate": 8.176653922890529e-05, "loss": 0.3445, "step": 155400 }, { "epoch": 0.5764853302093143, "grad_norm": 1.1626032590866089, "learning_rate": 8.174382122580235e-05, "loss": 0.3294, "step": 155500 }, { "epoch": 0.5768560603252044, "grad_norm": 0.49781936407089233, "learning_rate": 8.17210922391001e-05, "loss": 0.343, "step": 155600 }, { "epoch": 0.5772267904410947, "grad_norm": 1.4051926136016846, "learning_rate": 8.169835227666293e-05, "loss": 0.3214, "step": 155700 }, { "epoch": 0.5775975205569849, "grad_norm": 0.7454832792282104, "learning_rate": 8.167560134635908e-05, "loss": 0.3283, "step": 155800 }, { "epoch": 0.5779682506728752, "grad_norm": 0.8939104080200195, "learning_rate": 8.165283945606047e-05, "loss": 0.3274, "step": 155900 }, { "epoch": 0.5783389807887654, "grad_norm": 1.1849110126495361, "learning_rate": 8.163006661364294e-05, "loss": 0.3326, "step": 156000 }, { "epoch": 0.5787097109046556, "grad_norm": 0.6986303329467773, "learning_rate": 8.160728282698603e-05, "loss": 0.325, "step": 156100 }, { "epoch": 0.5790804410205459, "grad_norm": 0.4276401996612549, "learning_rate": 8.158448810397312e-05, "loss": 0.3097, "step": 156200 }, { "epoch": 0.5794511711364361, "grad_norm": 0.8611465096473694, "learning_rate": 8.156168245249136e-05, "loss": 0.3137, "step": 156300 }, { "epoch": 0.5798219012523264, "grad_norm": 0.6652678847312927, "learning_rate": 8.153886588043165e-05, "loss": 0.3391, "step": 156400 }, { "epoch": 0.5801926313682165, "grad_norm": 0.7935054302215576, "learning_rate": 8.151603839568871e-05, "loss": 0.3487, "step": 156500 }, { "epoch": 0.5805633614841068, "grad_norm": 0.6341293454170227, "learning_rate": 8.149320000616103e-05, "loss": 0.3383, "step": 156600 }, { "epoch": 0.580934091599997, "grad_norm": 0.6399911642074585, "learning_rate": 8.147035071975087e-05, "loss": 0.3046, "step": 156700 }, { "epoch": 0.5813048217158873, "grad_norm": 1.5658175945281982, "learning_rate": 8.14474905443642e-05, "loss": 0.3457, "step": 156800 }, { "epoch": 0.5816755518317775, "grad_norm": 0.5389863848686218, "learning_rate": 8.142461948791086e-05, "loss": 0.3303, "step": 156900 }, { "epoch": 0.5820462819476677, "grad_norm": 0.9050301313400269, "learning_rate": 8.14017375583044e-05, "loss": 0.3074, "step": 157000 }, { "epoch": 0.582417012063558, "grad_norm": 0.6462245583534241, "learning_rate": 8.137884476346214e-05, "loss": 0.3304, "step": 157100 }, { "epoch": 0.5827877421794482, "grad_norm": 0.778964102268219, "learning_rate": 8.135594111130516e-05, "loss": 0.3404, "step": 157200 }, { "epoch": 0.5831584722953385, "grad_norm": 1.3134076595306396, "learning_rate": 8.133302660975826e-05, "loss": 0.3072, "step": 157300 }, { "epoch": 0.5835292024112286, "grad_norm": 0.59549480676651, "learning_rate": 8.131010126675008e-05, "loss": 0.3114, "step": 157400 }, { "epoch": 0.5838999325271189, "grad_norm": 0.7614802718162537, "learning_rate": 8.128716509021291e-05, "loss": 0.3265, "step": 157500 }, { "epoch": 0.5842706626430092, "grad_norm": 0.7225677967071533, "learning_rate": 8.126421808808287e-05, "loss": 0.3222, "step": 157600 }, { "epoch": 0.5846413927588994, "grad_norm": 0.4899285137653351, "learning_rate": 8.124126026829981e-05, "loss": 0.3756, "step": 157700 }, { "epoch": 0.5850121228747897, "grad_norm": 0.7417983412742615, "learning_rate": 8.121829163880727e-05, "loss": 0.3085, "step": 157800 }, { "epoch": 0.5853828529906798, "grad_norm": 1.1237941980361938, "learning_rate": 8.119531220755256e-05, "loss": 0.3102, "step": 157900 }, { "epoch": 0.5857535831065701, "grad_norm": 0.7322901487350464, "learning_rate": 8.117232198248679e-05, "loss": 0.3196, "step": 158000 }, { "epoch": 0.5861243132224603, "grad_norm": 1.1301379203796387, "learning_rate": 8.114932097156471e-05, "loss": 0.3174, "step": 158100 }, { "epoch": 0.5864950433383506, "grad_norm": 0.8789128661155701, "learning_rate": 8.112630918274485e-05, "loss": 0.3449, "step": 158200 }, { "epoch": 0.5868657734542407, "grad_norm": 0.8041693568229675, "learning_rate": 8.110328662398945e-05, "loss": 0.3146, "step": 158300 }, { "epoch": 0.587236503570131, "grad_norm": 0.5157312154769897, "learning_rate": 8.10802533032645e-05, "loss": 0.3093, "step": 158400 }, { "epoch": 0.5876072336860213, "grad_norm": 0.5623337626457214, "learning_rate": 8.105720922853969e-05, "loss": 0.3216, "step": 158500 }, { "epoch": 0.5879779638019115, "grad_norm": 0.5649289488792419, "learning_rate": 8.103415440778846e-05, "loss": 0.3335, "step": 158600 }, { "epoch": 0.5883486939178018, "grad_norm": 0.3827555477619171, "learning_rate": 8.101108884898794e-05, "loss": 0.3407, "step": 158700 }, { "epoch": 0.5887194240336919, "grad_norm": 1.2988214492797852, "learning_rate": 8.098801256011899e-05, "loss": 0.3179, "step": 158800 }, { "epoch": 0.5890901541495822, "grad_norm": 1.3475226163864136, "learning_rate": 8.096492554916616e-05, "loss": 0.3168, "step": 158900 }, { "epoch": 0.5894608842654724, "grad_norm": 0.7034050226211548, "learning_rate": 8.094182782411775e-05, "loss": 0.3044, "step": 159000 }, { "epoch": 0.5898316143813627, "grad_norm": 0.45348671078681946, "learning_rate": 8.091871939296573e-05, "loss": 0.335, "step": 159100 }, { "epoch": 0.5902023444972528, "grad_norm": 0.5348803997039795, "learning_rate": 8.089560026370582e-05, "loss": 0.3315, "step": 159200 }, { "epoch": 0.5905730746131431, "grad_norm": 0.685468852519989, "learning_rate": 8.087247044433737e-05, "loss": 0.3082, "step": 159300 }, { "epoch": 0.5909438047290334, "grad_norm": 0.9230271577835083, "learning_rate": 8.084932994286349e-05, "loss": 0.3484, "step": 159400 }, { "epoch": 0.5913145348449236, "grad_norm": 0.7604371309280396, "learning_rate": 8.082617876729097e-05, "loss": 0.3412, "step": 159500 }, { "epoch": 0.5916852649608139, "grad_norm": 0.8036301136016846, "learning_rate": 8.080301692563028e-05, "loss": 0.3372, "step": 159600 }, { "epoch": 0.592055995076704, "grad_norm": 0.40108364820480347, "learning_rate": 8.077984442589562e-05, "loss": 0.3309, "step": 159700 }, { "epoch": 0.5924267251925943, "grad_norm": 0.5793288946151733, "learning_rate": 8.07566612761048e-05, "loss": 0.314, "step": 159800 }, { "epoch": 0.5927974553084845, "grad_norm": 0.7993284463882446, "learning_rate": 8.073346748427937e-05, "loss": 0.3337, "step": 159900 }, { "epoch": 0.5931681854243748, "grad_norm": 0.6807709336280823, "learning_rate": 8.07102630584446e-05, "loss": 0.3542, "step": 160000 }, { "epoch": 0.5935389155402649, "grad_norm": 0.5663362741470337, "learning_rate": 8.068704800662936e-05, "loss": 0.3235, "step": 160100 }, { "epoch": 0.5939096456561552, "grad_norm": 0.34325289726257324, "learning_rate": 8.066382233686624e-05, "loss": 0.3129, "step": 160200 }, { "epoch": 0.5942803757720455, "grad_norm": 1.9744770526885986, "learning_rate": 8.064058605719149e-05, "loss": 0.3076, "step": 160300 }, { "epoch": 0.5946511058879357, "grad_norm": 1.1668487787246704, "learning_rate": 8.061733917564504e-05, "loss": 0.3193, "step": 160400 }, { "epoch": 0.595021836003826, "grad_norm": 0.7204917669296265, "learning_rate": 8.05940817002705e-05, "loss": 0.3536, "step": 160500 }, { "epoch": 0.5953925661197161, "grad_norm": 0.5551474094390869, "learning_rate": 8.057081363911509e-05, "loss": 0.2872, "step": 160600 }, { "epoch": 0.5957632962356064, "grad_norm": 0.7016410827636719, "learning_rate": 8.054753500022977e-05, "loss": 0.3132, "step": 160700 }, { "epoch": 0.5961340263514966, "grad_norm": 0.664836585521698, "learning_rate": 8.052424579166911e-05, "loss": 0.3176, "step": 160800 }, { "epoch": 0.5965047564673869, "grad_norm": 1.1354440450668335, "learning_rate": 8.050094602149135e-05, "loss": 0.3105, "step": 160900 }, { "epoch": 0.5968754865832772, "grad_norm": 0.41729605197906494, "learning_rate": 8.047763569775839e-05, "loss": 0.3272, "step": 161000 }, { "epoch": 0.5972462166991673, "grad_norm": 0.7669975161552429, "learning_rate": 8.045431482853576e-05, "loss": 0.3267, "step": 161100 }, { "epoch": 0.5976169468150576, "grad_norm": 0.4365997612476349, "learning_rate": 8.043098342189267e-05, "loss": 0.3204, "step": 161200 }, { "epoch": 0.5979876769309478, "grad_norm": 0.5898780822753906, "learning_rate": 8.040764148590196e-05, "loss": 0.2892, "step": 161300 }, { "epoch": 0.5983584070468381, "grad_norm": 0.8510682582855225, "learning_rate": 8.038428902864011e-05, "loss": 0.3485, "step": 161400 }, { "epoch": 0.5987291371627282, "grad_norm": 0.7477826476097107, "learning_rate": 8.036092605818726e-05, "loss": 0.3218, "step": 161500 }, { "epoch": 0.5990998672786185, "grad_norm": 0.6159084439277649, "learning_rate": 8.033755258262714e-05, "loss": 0.3161, "step": 161600 }, { "epoch": 0.5994705973945087, "grad_norm": 0.5803883075714111, "learning_rate": 8.031416861004717e-05, "loss": 0.3272, "step": 161700 }, { "epoch": 0.599841327510399, "grad_norm": 0.8687874674797058, "learning_rate": 8.029077414853838e-05, "loss": 0.3482, "step": 161800 }, { "epoch": 0.6002120576262893, "grad_norm": 0.563754141330719, "learning_rate": 8.026736920619542e-05, "loss": 0.3291, "step": 161900 }, { "epoch": 0.6005827877421794, "grad_norm": 0.2796816825866699, "learning_rate": 8.02439537911166e-05, "loss": 0.3377, "step": 162000 }, { "epoch": 0.6009535178580697, "grad_norm": 0.9562301635742188, "learning_rate": 8.022052791140377e-05, "loss": 0.3138, "step": 162100 }, { "epoch": 0.6013242479739599, "grad_norm": 0.46490639448165894, "learning_rate": 8.019709157516252e-05, "loss": 0.3057, "step": 162200 }, { "epoch": 0.6016949780898502, "grad_norm": 0.6560518741607666, "learning_rate": 8.017364479050198e-05, "loss": 0.3293, "step": 162300 }, { "epoch": 0.6020657082057403, "grad_norm": 0.888691782951355, "learning_rate": 8.015018756553489e-05, "loss": 0.3218, "step": 162400 }, { "epoch": 0.6024364383216306, "grad_norm": 0.3368174433708191, "learning_rate": 8.012671990837766e-05, "loss": 0.2876, "step": 162500 }, { "epoch": 0.6028071684375208, "grad_norm": 0.25612902641296387, "learning_rate": 8.010324182715024e-05, "loss": 0.3032, "step": 162600 }, { "epoch": 0.6031778985534111, "grad_norm": 0.3191491961479187, "learning_rate": 8.007975332997624e-05, "loss": 0.3181, "step": 162700 }, { "epoch": 0.6035486286693014, "grad_norm": 0.6718581914901733, "learning_rate": 8.005625442498285e-05, "loss": 0.3279, "step": 162800 }, { "epoch": 0.6039193587851915, "grad_norm": 0.9097219109535217, "learning_rate": 8.003274512030088e-05, "loss": 0.3077, "step": 162900 }, { "epoch": 0.6042900889010818, "grad_norm": 0.6105613112449646, "learning_rate": 8.000922542406472e-05, "loss": 0.3189, "step": 163000 }, { "epoch": 0.604660819016972, "grad_norm": 0.5543099641799927, "learning_rate": 7.998569534441234e-05, "loss": 0.3569, "step": 163100 }, { "epoch": 0.6050315491328623, "grad_norm": 0.611444354057312, "learning_rate": 7.996215488948536e-05, "loss": 0.3385, "step": 163200 }, { "epoch": 0.6054022792487525, "grad_norm": 0.2771601974964142, "learning_rate": 7.99386040674289e-05, "loss": 0.3167, "step": 163300 }, { "epoch": 0.6057730093646427, "grad_norm": 0.42016780376434326, "learning_rate": 7.991504288639178e-05, "loss": 0.3246, "step": 163400 }, { "epoch": 0.606143739480533, "grad_norm": 0.38545694947242737, "learning_rate": 7.98914713545263e-05, "loss": 0.3261, "step": 163500 }, { "epoch": 0.6065144695964232, "grad_norm": 0.8454581499099731, "learning_rate": 7.986788947998843e-05, "loss": 0.3525, "step": 163600 }, { "epoch": 0.6068851997123135, "grad_norm": 0.6218124032020569, "learning_rate": 7.984429727093762e-05, "loss": 0.2875, "step": 163700 }, { "epoch": 0.6072559298282036, "grad_norm": 0.6464570760726929, "learning_rate": 7.9820694735537e-05, "loss": 0.3439, "step": 163800 }, { "epoch": 0.6076266599440939, "grad_norm": 1.1486072540283203, "learning_rate": 7.979708188195319e-05, "loss": 0.3277, "step": 163900 }, { "epoch": 0.6079973900599841, "grad_norm": 1.0173972845077515, "learning_rate": 7.977345871835643e-05, "loss": 0.3189, "step": 164000 }, { "epoch": 0.6083681201758744, "grad_norm": 1.0218851566314697, "learning_rate": 7.97498252529205e-05, "loss": 0.3147, "step": 164100 }, { "epoch": 0.6087388502917646, "grad_norm": 0.9714198112487793, "learning_rate": 7.972618149382278e-05, "loss": 0.3172, "step": 164200 }, { "epoch": 0.6091095804076548, "grad_norm": 0.5056530237197876, "learning_rate": 7.970252744924416e-05, "loss": 0.3472, "step": 164300 }, { "epoch": 0.6094803105235451, "grad_norm": 0.8827295303344727, "learning_rate": 7.967886312736913e-05, "loss": 0.343, "step": 164400 }, { "epoch": 0.6098510406394353, "grad_norm": 0.5658206939697266, "learning_rate": 7.965518853638572e-05, "loss": 0.3161, "step": 164500 }, { "epoch": 0.6102217707553256, "grad_norm": 0.7020500302314758, "learning_rate": 7.963150368448552e-05, "loss": 0.3157, "step": 164600 }, { "epoch": 0.6105925008712157, "grad_norm": 0.9323051571846008, "learning_rate": 7.960780857986364e-05, "loss": 0.3069, "step": 164700 }, { "epoch": 0.610963230987106, "grad_norm": 1.3231618404388428, "learning_rate": 7.958410323071879e-05, "loss": 0.3362, "step": 164800 }, { "epoch": 0.6113339611029962, "grad_norm": 1.4321640729904175, "learning_rate": 7.95603876452532e-05, "loss": 0.3557, "step": 164900 }, { "epoch": 0.6117046912188865, "grad_norm": 0.5015091896057129, "learning_rate": 7.953666183167263e-05, "loss": 0.3507, "step": 165000 }, { "epoch": 0.6120754213347767, "grad_norm": 0.7500291466712952, "learning_rate": 7.951292579818638e-05, "loss": 0.3123, "step": 165100 }, { "epoch": 0.6124461514506669, "grad_norm": 0.8980146646499634, "learning_rate": 7.94891795530073e-05, "loss": 0.3226, "step": 165200 }, { "epoch": 0.6128168815665572, "grad_norm": 0.7955583930015564, "learning_rate": 7.94654231043518e-05, "loss": 0.3274, "step": 165300 }, { "epoch": 0.6131876116824474, "grad_norm": 0.6423786878585815, "learning_rate": 7.944165646043975e-05, "loss": 0.3501, "step": 165400 }, { "epoch": 0.6135583417983377, "grad_norm": 0.5148004293441772, "learning_rate": 7.941787962949458e-05, "loss": 0.2995, "step": 165500 }, { "epoch": 0.6139290719142279, "grad_norm": 0.6537490487098694, "learning_rate": 7.939409261974329e-05, "loss": 0.3141, "step": 165600 }, { "epoch": 0.6142998020301181, "grad_norm": 0.9412434101104736, "learning_rate": 7.937029543941633e-05, "loss": 0.3245, "step": 165700 }, { "epoch": 0.6146705321460083, "grad_norm": 0.41764935851097107, "learning_rate": 7.934648809674772e-05, "loss": 0.324, "step": 165800 }, { "epoch": 0.6150412622618986, "grad_norm": 0.6327851414680481, "learning_rate": 7.932267059997496e-05, "loss": 0.3118, "step": 165900 }, { "epoch": 0.6154119923777888, "grad_norm": 0.5886436104774475, "learning_rate": 7.929884295733912e-05, "loss": 0.3547, "step": 166000 }, { "epoch": 0.615782722493679, "grad_norm": 1.5694185495376587, "learning_rate": 7.927500517708469e-05, "loss": 0.3648, "step": 166100 }, { "epoch": 0.6161534526095693, "grad_norm": 0.5889244675636292, "learning_rate": 7.925115726745975e-05, "loss": 0.305, "step": 166200 }, { "epoch": 0.6165241827254595, "grad_norm": 0.617215096950531, "learning_rate": 7.922729923671587e-05, "loss": 0.3251, "step": 166300 }, { "epoch": 0.6168949128413498, "grad_norm": 0.6008740067481995, "learning_rate": 7.920343109310808e-05, "loss": 0.2871, "step": 166400 }, { "epoch": 0.61726564295724, "grad_norm": 0.42335259914398193, "learning_rate": 7.917955284489494e-05, "loss": 0.3325, "step": 166500 }, { "epoch": 0.6176363730731302, "grad_norm": 0.7214980721473694, "learning_rate": 7.915566450033851e-05, "loss": 0.327, "step": 166600 }, { "epoch": 0.6180071031890204, "grad_norm": 0.45618027448654175, "learning_rate": 7.913176606770433e-05, "loss": 0.3381, "step": 166700 }, { "epoch": 0.6183778333049107, "grad_norm": 0.5154787302017212, "learning_rate": 7.910785755526145e-05, "loss": 0.3558, "step": 166800 }, { "epoch": 0.618748563420801, "grad_norm": 0.45641377568244934, "learning_rate": 7.908393897128236e-05, "loss": 0.3153, "step": 166900 }, { "epoch": 0.6191192935366912, "grad_norm": 0.7833675742149353, "learning_rate": 7.906001032404313e-05, "loss": 0.3128, "step": 167000 }, { "epoch": 0.6194900236525814, "grad_norm": 0.9445479512214661, "learning_rate": 7.903607162182318e-05, "loss": 0.3522, "step": 167100 }, { "epoch": 0.6198607537684716, "grad_norm": 0.7050688862800598, "learning_rate": 7.901212287290553e-05, "loss": 0.2984, "step": 167200 }, { "epoch": 0.6202314838843619, "grad_norm": 1.2139892578125, "learning_rate": 7.898816408557658e-05, "loss": 0.3136, "step": 167300 }, { "epoch": 0.6206022140002521, "grad_norm": 0.5112932324409485, "learning_rate": 7.89641952681263e-05, "loss": 0.3291, "step": 167400 }, { "epoch": 0.6209729441161423, "grad_norm": 1.1079003810882568, "learning_rate": 7.894021642884806e-05, "loss": 0.3057, "step": 167500 }, { "epoch": 0.6213436742320325, "grad_norm": 0.9953511953353882, "learning_rate": 7.891622757603872e-05, "loss": 0.3207, "step": 167600 }, { "epoch": 0.6217144043479228, "grad_norm": 0.9882906079292297, "learning_rate": 7.889222871799859e-05, "loss": 0.3189, "step": 167700 }, { "epoch": 0.6220851344638131, "grad_norm": 0.4893054664134979, "learning_rate": 7.886821986303144e-05, "loss": 0.3494, "step": 167800 }, { "epoch": 0.6224558645797033, "grad_norm": 0.3455788195133209, "learning_rate": 7.884420101944456e-05, "loss": 0.3069, "step": 167900 }, { "epoch": 0.6228265946955935, "grad_norm": 0.8653073906898499, "learning_rate": 7.882017219554863e-05, "loss": 0.3529, "step": 168000 }, { "epoch": 0.6231973248114837, "grad_norm": 0.6778702139854431, "learning_rate": 7.879613339965778e-05, "loss": 0.3127, "step": 168100 }, { "epoch": 0.623568054927374, "grad_norm": 0.3486931622028351, "learning_rate": 7.877208464008963e-05, "loss": 0.336, "step": 168200 }, { "epoch": 0.6239387850432642, "grad_norm": 0.5252026319503784, "learning_rate": 7.874802592516524e-05, "loss": 0.3266, "step": 168300 }, { "epoch": 0.6243095151591544, "grad_norm": 0.4913678765296936, "learning_rate": 7.872395726320909e-05, "loss": 0.3145, "step": 168400 }, { "epoch": 0.6246802452750446, "grad_norm": 0.45413070917129517, "learning_rate": 7.869987866254912e-05, "loss": 0.3136, "step": 168500 }, { "epoch": 0.6250509753909349, "grad_norm": 0.42494797706604004, "learning_rate": 7.867579013151671e-05, "loss": 0.3248, "step": 168600 }, { "epoch": 0.6254217055068252, "grad_norm": 0.8874185085296631, "learning_rate": 7.865169167844667e-05, "loss": 0.3115, "step": 168700 }, { "epoch": 0.6257924356227154, "grad_norm": 0.6738488078117371, "learning_rate": 7.862758331167724e-05, "loss": 0.3078, "step": 168800 }, { "epoch": 0.6261631657386056, "grad_norm": 1.0391923189163208, "learning_rate": 7.86034650395501e-05, "loss": 0.3281, "step": 168900 }, { "epoch": 0.6265338958544958, "grad_norm": 0.7682541608810425, "learning_rate": 7.857933687041036e-05, "loss": 0.3391, "step": 169000 }, { "epoch": 0.6269046259703861, "grad_norm": 0.45600491762161255, "learning_rate": 7.855519881260654e-05, "loss": 0.304, "step": 169100 }, { "epoch": 0.6272753560862763, "grad_norm": 0.5751388072967529, "learning_rate": 7.853105087449057e-05, "loss": 0.3191, "step": 169200 }, { "epoch": 0.6276460862021666, "grad_norm": 0.6437799334526062, "learning_rate": 7.850689306441785e-05, "loss": 0.3471, "step": 169300 }, { "epoch": 0.6280168163180568, "grad_norm": 0.6553826928138733, "learning_rate": 7.848272539074714e-05, "loss": 0.3177, "step": 169400 }, { "epoch": 0.628387546433947, "grad_norm": 0.46359783411026, "learning_rate": 7.845854786184065e-05, "loss": 0.3448, "step": 169500 }, { "epoch": 0.6287582765498373, "grad_norm": 0.6250042915344238, "learning_rate": 7.843436048606396e-05, "loss": 0.3504, "step": 169600 }, { "epoch": 0.6291290066657275, "grad_norm": 0.5439067482948303, "learning_rate": 7.841016327178612e-05, "loss": 0.3333, "step": 169700 }, { "epoch": 0.6294997367816177, "grad_norm": 0.3582840859889984, "learning_rate": 7.838595622737952e-05, "loss": 0.3097, "step": 169800 }, { "epoch": 0.6298704668975079, "grad_norm": 0.36283212900161743, "learning_rate": 7.836173936122e-05, "loss": 0.3247, "step": 169900 }, { "epoch": 0.6302411970133982, "grad_norm": 1.0198293924331665, "learning_rate": 7.833751268168674e-05, "loss": 0.3233, "step": 170000 }, { "epoch": 0.6306119271292884, "grad_norm": 0.9235473871231079, "learning_rate": 7.831327619716238e-05, "loss": 0.2966, "step": 170100 }, { "epoch": 0.6309826572451787, "grad_norm": 0.8856103420257568, "learning_rate": 7.828902991603292e-05, "loss": 0.3273, "step": 170200 }, { "epoch": 0.6313533873610689, "grad_norm": 0.4523926377296448, "learning_rate": 7.826477384668776e-05, "loss": 0.3018, "step": 170300 }, { "epoch": 0.6317241174769591, "grad_norm": 0.8163206577301025, "learning_rate": 7.824050799751967e-05, "loss": 0.3328, "step": 170400 }, { "epoch": 0.6320948475928494, "grad_norm": 0.32069477438926697, "learning_rate": 7.821623237692482e-05, "loss": 0.3342, "step": 170500 }, { "epoch": 0.6324655777087396, "grad_norm": 0.49772587418556213, "learning_rate": 7.819194699330276e-05, "loss": 0.3187, "step": 170600 }, { "epoch": 0.6328363078246299, "grad_norm": 0.3447984755039215, "learning_rate": 7.816765185505641e-05, "loss": 0.3385, "step": 170700 }, { "epoch": 0.63320703794052, "grad_norm": 0.6331614255905151, "learning_rate": 7.814334697059207e-05, "loss": 0.3148, "step": 170800 }, { "epoch": 0.6335777680564103, "grad_norm": 0.731684684753418, "learning_rate": 7.811903234831939e-05, "loss": 0.3259, "step": 170900 }, { "epoch": 0.6339484981723005, "grad_norm": 0.34974369406700134, "learning_rate": 7.809470799665147e-05, "loss": 0.3151, "step": 171000 }, { "epoch": 0.6343192282881908, "grad_norm": 0.7753848433494568, "learning_rate": 7.807037392400466e-05, "loss": 0.3413, "step": 171100 }, { "epoch": 0.634689958404081, "grad_norm": 0.504486620426178, "learning_rate": 7.804603013879874e-05, "loss": 0.3232, "step": 171200 }, { "epoch": 0.6350606885199712, "grad_norm": 0.9640689492225647, "learning_rate": 7.802167664945687e-05, "loss": 0.2976, "step": 171300 }, { "epoch": 0.6354314186358615, "grad_norm": 0.38343390822410583, "learning_rate": 7.799731346440552e-05, "loss": 0.3172, "step": 171400 }, { "epoch": 0.6358021487517517, "grad_norm": 0.6581412553787231, "learning_rate": 7.797294059207454e-05, "loss": 0.3416, "step": 171500 }, { "epoch": 0.636172878867642, "grad_norm": 1.9074287414550781, "learning_rate": 7.79485580408971e-05, "loss": 0.3327, "step": 171600 }, { "epoch": 0.6365436089835321, "grad_norm": 0.9837217926979065, "learning_rate": 7.792416581930978e-05, "loss": 0.368, "step": 171700 }, { "epoch": 0.6369143390994224, "grad_norm": 0.5056324601173401, "learning_rate": 7.789976393575246e-05, "loss": 0.3236, "step": 171800 }, { "epoch": 0.6372850692153126, "grad_norm": 0.35706183314323425, "learning_rate": 7.787535239866836e-05, "loss": 0.3311, "step": 171900 }, { "epoch": 0.6376557993312029, "grad_norm": 0.22083061933517456, "learning_rate": 7.785093121650406e-05, "loss": 0.3495, "step": 172000 }, { "epoch": 0.6380265294470931, "grad_norm": 0.7825510501861572, "learning_rate": 7.78265003977095e-05, "loss": 0.3302, "step": 172100 }, { "epoch": 0.6383972595629833, "grad_norm": 0.5891920328140259, "learning_rate": 7.780205995073791e-05, "loss": 0.3525, "step": 172200 }, { "epoch": 0.6387679896788736, "grad_norm": 0.625011146068573, "learning_rate": 7.777760988404584e-05, "loss": 0.3539, "step": 172300 }, { "epoch": 0.6391387197947638, "grad_norm": 0.9532029032707214, "learning_rate": 7.775315020609321e-05, "loss": 0.3296, "step": 172400 }, { "epoch": 0.6395094499106541, "grad_norm": 0.5624498724937439, "learning_rate": 7.772868092534325e-05, "loss": 0.3209, "step": 172500 }, { "epoch": 0.6398801800265442, "grad_norm": 0.5225188136100769, "learning_rate": 7.770420205026255e-05, "loss": 0.3375, "step": 172600 }, { "epoch": 0.6402509101424345, "grad_norm": 0.4058457016944885, "learning_rate": 7.767971358932093e-05, "loss": 0.3203, "step": 172700 }, { "epoch": 0.6406216402583248, "grad_norm": 0.4984307289123535, "learning_rate": 7.765521555099162e-05, "loss": 0.302, "step": 172800 }, { "epoch": 0.640992370374215, "grad_norm": 0.4690432846546173, "learning_rate": 7.763070794375111e-05, "loss": 0.3071, "step": 172900 }, { "epoch": 0.6413631004901053, "grad_norm": 0.3485882580280304, "learning_rate": 7.760619077607921e-05, "loss": 0.3285, "step": 173000 }, { "epoch": 0.6417338306059954, "grad_norm": 1.7520594596862793, "learning_rate": 7.758166405645905e-05, "loss": 0.3232, "step": 173100 }, { "epoch": 0.6421045607218857, "grad_norm": 0.8150789737701416, "learning_rate": 7.755712779337705e-05, "loss": 0.303, "step": 173200 }, { "epoch": 0.6424752908377759, "grad_norm": 1.266662359237671, "learning_rate": 7.753258199532296e-05, "loss": 0.3211, "step": 173300 }, { "epoch": 0.6428460209536662, "grad_norm": 0.48772305250167847, "learning_rate": 7.750802667078978e-05, "loss": 0.3339, "step": 173400 }, { "epoch": 0.6432167510695563, "grad_norm": 0.7318339347839355, "learning_rate": 7.748346182827387e-05, "loss": 0.3282, "step": 173500 }, { "epoch": 0.6435874811854466, "grad_norm": 0.5176146030426025, "learning_rate": 7.74588874762748e-05, "loss": 0.3389, "step": 173600 }, { "epoch": 0.6439582113013369, "grad_norm": 0.46523693203926086, "learning_rate": 7.743430362329554e-05, "loss": 0.3181, "step": 173700 }, { "epoch": 0.6443289414172271, "grad_norm": 0.41381171345710754, "learning_rate": 7.740971027784226e-05, "loss": 0.3205, "step": 173800 }, { "epoch": 0.6446996715331174, "grad_norm": 0.18193502724170685, "learning_rate": 7.73851074484244e-05, "loss": 0.3276, "step": 173900 }, { "epoch": 0.6450704016490075, "grad_norm": 0.486369788646698, "learning_rate": 7.736049514355478e-05, "loss": 0.3051, "step": 174000 }, { "epoch": 0.6454411317648978, "grad_norm": 1.0983823537826538, "learning_rate": 7.733587337174945e-05, "loss": 0.3125, "step": 174100 }, { "epoch": 0.645811861880788, "grad_norm": 0.8734340667724609, "learning_rate": 7.731124214152767e-05, "loss": 0.3804, "step": 174200 }, { "epoch": 0.6461825919966783, "grad_norm": 0.5609524846076965, "learning_rate": 7.728660146141207e-05, "loss": 0.3227, "step": 174300 }, { "epoch": 0.6465533221125684, "grad_norm": 0.8946533799171448, "learning_rate": 7.726195133992848e-05, "loss": 0.3143, "step": 174400 }, { "epoch": 0.6469240522284587, "grad_norm": 0.8029590845108032, "learning_rate": 7.723729178560609e-05, "loss": 0.3354, "step": 174500 }, { "epoch": 0.647294782344349, "grad_norm": 0.2535370886325836, "learning_rate": 7.721262280697719e-05, "loss": 0.3064, "step": 174600 }, { "epoch": 0.6476655124602392, "grad_norm": 0.6093637943267822, "learning_rate": 7.718794441257751e-05, "loss": 0.2988, "step": 174700 }, { "epoch": 0.6480362425761295, "grad_norm": 0.7915005087852478, "learning_rate": 7.716325661094595e-05, "loss": 0.3472, "step": 174800 }, { "epoch": 0.6484069726920196, "grad_norm": 0.722996175289154, "learning_rate": 7.713855941062463e-05, "loss": 0.3248, "step": 174900 }, { "epoch": 0.6487777028079099, "grad_norm": 0.9427354335784912, "learning_rate": 7.711385282015902e-05, "loss": 0.2914, "step": 175000 }, { "epoch": 0.6491484329238001, "grad_norm": 0.6179879903793335, "learning_rate": 7.708913684809773e-05, "loss": 0.3019, "step": 175100 }, { "epoch": 0.6495191630396904, "grad_norm": 0.33187904953956604, "learning_rate": 7.706441150299273e-05, "loss": 0.3228, "step": 175200 }, { "epoch": 0.6498898931555807, "grad_norm": 0.5731324553489685, "learning_rate": 7.703967679339912e-05, "loss": 0.3164, "step": 175300 }, { "epoch": 0.6502606232714708, "grad_norm": 0.22423574328422546, "learning_rate": 7.701493272787533e-05, "loss": 0.3213, "step": 175400 }, { "epoch": 0.6506313533873611, "grad_norm": 0.8272051215171814, "learning_rate": 7.699017931498298e-05, "loss": 0.341, "step": 175500 }, { "epoch": 0.6510020835032513, "grad_norm": 1.2736979722976685, "learning_rate": 7.696541656328694e-05, "loss": 0.2901, "step": 175600 }, { "epoch": 0.6513728136191416, "grad_norm": 0.6438654065132141, "learning_rate": 7.694064448135528e-05, "loss": 0.3107, "step": 175700 }, { "epoch": 0.6517435437350317, "grad_norm": 0.8650057315826416, "learning_rate": 7.691586307775937e-05, "loss": 0.3004, "step": 175800 }, { "epoch": 0.652114273850922, "grad_norm": 0.5065053105354309, "learning_rate": 7.689107236107374e-05, "loss": 0.3465, "step": 175900 }, { "epoch": 0.6524850039668122, "grad_norm": 0.8852288126945496, "learning_rate": 7.686627233987616e-05, "loss": 0.3008, "step": 176000 }, { "epoch": 0.6528557340827025, "grad_norm": 0.5538991093635559, "learning_rate": 7.684146302274762e-05, "loss": 0.3426, "step": 176100 }, { "epoch": 0.6532264641985928, "grad_norm": 1.0335015058517456, "learning_rate": 7.681664441827233e-05, "loss": 0.2862, "step": 176200 }, { "epoch": 0.6535971943144829, "grad_norm": 1.002989649772644, "learning_rate": 7.679181653503772e-05, "loss": 0.3136, "step": 176300 }, { "epoch": 0.6539679244303732, "grad_norm": 0.6089767217636108, "learning_rate": 7.676697938163442e-05, "loss": 0.317, "step": 176400 }, { "epoch": 0.6543386545462634, "grad_norm": 1.1149976253509521, "learning_rate": 7.674213296665627e-05, "loss": 0.3339, "step": 176500 }, { "epoch": 0.6547093846621537, "grad_norm": 0.4255412518978119, "learning_rate": 7.671727729870032e-05, "loss": 0.3185, "step": 176600 }, { "epoch": 0.6550801147780438, "grad_norm": 0.6290963888168335, "learning_rate": 7.669241238636681e-05, "loss": 0.3076, "step": 176700 }, { "epoch": 0.6554508448939341, "grad_norm": 0.6539090275764465, "learning_rate": 7.66675382382592e-05, "loss": 0.3457, "step": 176800 }, { "epoch": 0.6558215750098243, "grad_norm": 0.4805006682872772, "learning_rate": 7.66426548629841e-05, "loss": 0.3255, "step": 176900 }, { "epoch": 0.6561923051257146, "grad_norm": 0.5355345606803894, "learning_rate": 7.661776226915137e-05, "loss": 0.3382, "step": 177000 }, { "epoch": 0.6565630352416049, "grad_norm": 0.42525121569633484, "learning_rate": 7.659286046537403e-05, "loss": 0.298, "step": 177100 }, { "epoch": 0.656933765357495, "grad_norm": 0.8569908738136292, "learning_rate": 7.656794946026827e-05, "loss": 0.3362, "step": 177200 }, { "epoch": 0.6573044954733853, "grad_norm": 0.21998730301856995, "learning_rate": 7.654302926245352e-05, "loss": 0.3366, "step": 177300 }, { "epoch": 0.6576752255892755, "grad_norm": 0.4755604565143585, "learning_rate": 7.651809988055232e-05, "loss": 0.3029, "step": 177400 }, { "epoch": 0.6580459557051658, "grad_norm": 0.509870707988739, "learning_rate": 7.649316132319046e-05, "loss": 0.3256, "step": 177500 }, { "epoch": 0.658416685821056, "grad_norm": 0.7125570178031921, "learning_rate": 7.646821359899681e-05, "loss": 0.3141, "step": 177600 }, { "epoch": 0.6587874159369462, "grad_norm": 1.0448472499847412, "learning_rate": 7.644325671660353e-05, "loss": 0.3021, "step": 177700 }, { "epoch": 0.6591581460528364, "grad_norm": 1.4346028566360474, "learning_rate": 7.641829068464585e-05, "loss": 0.3421, "step": 177800 }, { "epoch": 0.6595288761687267, "grad_norm": 1.0625039339065552, "learning_rate": 7.639331551176222e-05, "loss": 0.3058, "step": 177900 }, { "epoch": 0.659899606284617, "grad_norm": 0.6959584355354309, "learning_rate": 7.636833120659424e-05, "loss": 0.3022, "step": 178000 }, { "epoch": 0.6602703364005071, "grad_norm": 0.7739969491958618, "learning_rate": 7.634333777778664e-05, "loss": 0.296, "step": 178100 }, { "epoch": 0.6606410665163974, "grad_norm": 1.0780497789382935, "learning_rate": 7.631833523398735e-05, "loss": 0.3173, "step": 178200 }, { "epoch": 0.6610117966322876, "grad_norm": 0.730991542339325, "learning_rate": 7.629332358384744e-05, "loss": 0.3368, "step": 178300 }, { "epoch": 0.6613825267481779, "grad_norm": 0.8907705545425415, "learning_rate": 7.626830283602113e-05, "loss": 0.3813, "step": 178400 }, { "epoch": 0.661753256864068, "grad_norm": 0.2506537437438965, "learning_rate": 7.624327299916578e-05, "loss": 0.3299, "step": 178500 }, { "epoch": 0.6621239869799583, "grad_norm": 0.7824212908744812, "learning_rate": 7.62182340819419e-05, "loss": 0.3406, "step": 178600 }, { "epoch": 0.6624947170958486, "grad_norm": 0.5176686644554138, "learning_rate": 7.619318609301314e-05, "loss": 0.3038, "step": 178700 }, { "epoch": 0.6628654472117388, "grad_norm": 0.6650992631912231, "learning_rate": 7.616812904104631e-05, "loss": 0.3023, "step": 178800 }, { "epoch": 0.6632361773276291, "grad_norm": 0.7321762442588806, "learning_rate": 7.614306293471131e-05, "loss": 0.3219, "step": 178900 }, { "epoch": 0.6636069074435192, "grad_norm": 0.6811278462409973, "learning_rate": 7.611798778268123e-05, "loss": 0.333, "step": 179000 }, { "epoch": 0.6639776375594095, "grad_norm": 0.5278980731964111, "learning_rate": 7.609290359363221e-05, "loss": 0.3068, "step": 179100 }, { "epoch": 0.6643483676752997, "grad_norm": 0.6801670789718628, "learning_rate": 7.606781037624362e-05, "loss": 0.2892, "step": 179200 }, { "epoch": 0.66471909779119, "grad_norm": 0.8099891543388367, "learning_rate": 7.604270813919788e-05, "loss": 0.2991, "step": 179300 }, { "epoch": 0.6650898279070802, "grad_norm": 1.5796775817871094, "learning_rate": 7.601759689118054e-05, "loss": 0.3359, "step": 179400 }, { "epoch": 0.6654605580229704, "grad_norm": 0.2431613951921463, "learning_rate": 7.59924766408803e-05, "loss": 0.3121, "step": 179500 }, { "epoch": 0.6658312881388607, "grad_norm": 0.5416744351387024, "learning_rate": 7.596734739698894e-05, "loss": 0.3353, "step": 179600 }, { "epoch": 0.6662020182547509, "grad_norm": 0.6308532357215881, "learning_rate": 7.594220916820137e-05, "loss": 0.2871, "step": 179700 }, { "epoch": 0.6665727483706412, "grad_norm": 0.37236660718917847, "learning_rate": 7.591706196321562e-05, "loss": 0.3188, "step": 179800 }, { "epoch": 0.6669434784865313, "grad_norm": 0.34211260080337524, "learning_rate": 7.589190579073279e-05, "loss": 0.2942, "step": 179900 }, { "epoch": 0.6673142086024216, "grad_norm": 0.34535935521125793, "learning_rate": 7.58667406594571e-05, "loss": 0.3113, "step": 180000 }, { "epoch": 0.6676849387183118, "grad_norm": 0.5744240880012512, "learning_rate": 7.584156657809589e-05, "loss": 0.3295, "step": 180100 }, { "epoch": 0.6680556688342021, "grad_norm": 0.6442566514015198, "learning_rate": 7.581638355535959e-05, "loss": 0.3199, "step": 180200 }, { "epoch": 0.6684263989500923, "grad_norm": 0.5782174468040466, "learning_rate": 7.579119159996169e-05, "loss": 0.3316, "step": 180300 }, { "epoch": 0.6687971290659825, "grad_norm": 0.4303435981273651, "learning_rate": 7.576599072061879e-05, "loss": 0.3344, "step": 180400 }, { "epoch": 0.6691678591818728, "grad_norm": 0.8309177160263062, "learning_rate": 7.57407809260506e-05, "loss": 0.3218, "step": 180500 }, { "epoch": 0.669538589297763, "grad_norm": 0.3717585504055023, "learning_rate": 7.571556222497991e-05, "loss": 0.3564, "step": 180600 }, { "epoch": 0.6699093194136533, "grad_norm": 0.5631848573684692, "learning_rate": 7.569033462613254e-05, "loss": 0.3189, "step": 180700 }, { "epoch": 0.6702800495295435, "grad_norm": 1.5231022834777832, "learning_rate": 7.566509813823746e-05, "loss": 0.3282, "step": 180800 }, { "epoch": 0.6706507796454337, "grad_norm": 0.9653101563453674, "learning_rate": 7.563985277002667e-05, "loss": 0.322, "step": 180900 }, { "epoch": 0.6710215097613239, "grad_norm": 1.0818897485733032, "learning_rate": 7.561459853023527e-05, "loss": 0.2999, "step": 181000 }, { "epoch": 0.6713922398772142, "grad_norm": 0.6261563897132874, "learning_rate": 7.558933542760138e-05, "loss": 0.3285, "step": 181100 }, { "epoch": 0.6717629699931045, "grad_norm": 0.7915939092636108, "learning_rate": 7.556406347086627e-05, "loss": 0.3309, "step": 181200 }, { "epoch": 0.6721337001089946, "grad_norm": 0.4921512007713318, "learning_rate": 7.55387826687742e-05, "loss": 0.3273, "step": 181300 }, { "epoch": 0.6725044302248849, "grad_norm": 0.9209569692611694, "learning_rate": 7.551349303007252e-05, "loss": 0.3419, "step": 181400 }, { "epoch": 0.6728751603407751, "grad_norm": 0.8162488341331482, "learning_rate": 7.548819456351162e-05, "loss": 0.3174, "step": 181500 }, { "epoch": 0.6732458904566654, "grad_norm": 0.24493291974067688, "learning_rate": 7.546288727784498e-05, "loss": 0.3208, "step": 181600 }, { "epoch": 0.6736166205725556, "grad_norm": 0.6124547123908997, "learning_rate": 7.54375711818291e-05, "loss": 0.314, "step": 181700 }, { "epoch": 0.6739873506884458, "grad_norm": 0.5064467787742615, "learning_rate": 7.541224628422354e-05, "loss": 0.3006, "step": 181800 }, { "epoch": 0.674358080804336, "grad_norm": 0.666159987449646, "learning_rate": 7.53869125937909e-05, "loss": 0.3069, "step": 181900 }, { "epoch": 0.6747288109202263, "grad_norm": 0.4678840637207031, "learning_rate": 7.536157011929684e-05, "loss": 0.3133, "step": 182000 }, { "epoch": 0.6750995410361166, "grad_norm": 1.6993242502212524, "learning_rate": 7.533621886951004e-05, "loss": 0.3742, "step": 182100 }, { "epoch": 0.6754702711520068, "grad_norm": 0.7357749342918396, "learning_rate": 7.531085885320219e-05, "loss": 0.3016, "step": 182200 }, { "epoch": 0.675841001267897, "grad_norm": 0.29037564992904663, "learning_rate": 7.528549007914809e-05, "loss": 0.3189, "step": 182300 }, { "epoch": 0.6762117313837872, "grad_norm": 0.5905086994171143, "learning_rate": 7.52601125561255e-05, "loss": 0.3156, "step": 182400 }, { "epoch": 0.6765824614996775, "grad_norm": 0.49776536226272583, "learning_rate": 7.523472629291526e-05, "loss": 0.3298, "step": 182500 }, { "epoch": 0.6769531916155677, "grad_norm": 0.4483221769332886, "learning_rate": 7.520933129830116e-05, "loss": 0.342, "step": 182600 }, { "epoch": 0.677323921731458, "grad_norm": 0.8897309303283691, "learning_rate": 7.518392758107012e-05, "loss": 0.3457, "step": 182700 }, { "epoch": 0.6776946518473481, "grad_norm": 0.7779741883277893, "learning_rate": 7.515851515001195e-05, "loss": 0.3182, "step": 182800 }, { "epoch": 0.6780653819632384, "grad_norm": 0.5704395174980164, "learning_rate": 7.513309401391959e-05, "loss": 0.3229, "step": 182900 }, { "epoch": 0.6784361120791287, "grad_norm": 1.2463005781173706, "learning_rate": 7.51076641815889e-05, "loss": 0.3019, "step": 183000 }, { "epoch": 0.6788068421950189, "grad_norm": 0.772339403629303, "learning_rate": 7.508222566181885e-05, "loss": 0.303, "step": 183100 }, { "epoch": 0.6791775723109091, "grad_norm": 1.0049304962158203, "learning_rate": 7.505677846341131e-05, "loss": 0.3193, "step": 183200 }, { "epoch": 0.6795483024267993, "grad_norm": 0.5658488273620605, "learning_rate": 7.50313225951712e-05, "loss": 0.3128, "step": 183300 }, { "epoch": 0.6799190325426896, "grad_norm": 0.661724328994751, "learning_rate": 7.500585806590646e-05, "loss": 0.3463, "step": 183400 }, { "epoch": 0.6802897626585798, "grad_norm": 0.5728402733802795, "learning_rate": 7.498038488442803e-05, "loss": 0.3071, "step": 183500 }, { "epoch": 0.68066049277447, "grad_norm": 0.859163224697113, "learning_rate": 7.495490305954977e-05, "loss": 0.3307, "step": 183600 }, { "epoch": 0.6810312228903602, "grad_norm": 0.45489948987960815, "learning_rate": 7.492941260008862e-05, "loss": 0.2977, "step": 183700 }, { "epoch": 0.6814019530062505, "grad_norm": 0.8212697505950928, "learning_rate": 7.490391351486443e-05, "loss": 0.3204, "step": 183800 }, { "epoch": 0.6817726831221408, "grad_norm": 1.0903757810592651, "learning_rate": 7.487840581270013e-05, "loss": 0.3486, "step": 183900 }, { "epoch": 0.682143413238031, "grad_norm": 0.8270167112350464, "learning_rate": 7.485288950242153e-05, "loss": 0.3242, "step": 184000 }, { "epoch": 0.6825141433539212, "grad_norm": 0.349461168050766, "learning_rate": 7.482736459285749e-05, "loss": 0.3351, "step": 184100 }, { "epoch": 0.6828848734698114, "grad_norm": 0.3962986469268799, "learning_rate": 7.48018310928398e-05, "loss": 0.3189, "step": 184200 }, { "epoch": 0.6832556035857017, "grad_norm": 0.7375141382217407, "learning_rate": 7.477628901120325e-05, "loss": 0.3229, "step": 184300 }, { "epoch": 0.6836263337015919, "grad_norm": 0.710038959980011, "learning_rate": 7.475073835678558e-05, "loss": 0.3463, "step": 184400 }, { "epoch": 0.6839970638174822, "grad_norm": 0.686808705329895, "learning_rate": 7.472517913842754e-05, "loss": 0.302, "step": 184500 }, { "epoch": 0.6843677939333724, "grad_norm": 0.5359973311424255, "learning_rate": 7.469961136497279e-05, "loss": 0.306, "step": 184600 }, { "epoch": 0.6847385240492626, "grad_norm": 0.4455412030220032, "learning_rate": 7.467403504526795e-05, "loss": 0.3149, "step": 184700 }, { "epoch": 0.6851092541651529, "grad_norm": 1.4339040517807007, "learning_rate": 7.464845018816265e-05, "loss": 0.2975, "step": 184800 }, { "epoch": 0.6854799842810431, "grad_norm": 0.45278799533843994, "learning_rate": 7.462285680250943e-05, "loss": 0.3077, "step": 184900 }, { "epoch": 0.6858507143969333, "grad_norm": 0.5447257161140442, "learning_rate": 7.459725489716379e-05, "loss": 0.3153, "step": 185000 }, { "epoch": 0.6862214445128235, "grad_norm": 0.5486783385276794, "learning_rate": 7.457164448098419e-05, "loss": 0.3258, "step": 185100 }, { "epoch": 0.6865921746287138, "grad_norm": 0.507307231426239, "learning_rate": 7.454602556283202e-05, "loss": 0.3034, "step": 185200 }, { "epoch": 0.686962904744604, "grad_norm": 1.0625334978103638, "learning_rate": 7.452039815157159e-05, "loss": 0.3388, "step": 185300 }, { "epoch": 0.6873336348604943, "grad_norm": 0.7222393751144409, "learning_rate": 7.449476225607022e-05, "loss": 0.3179, "step": 185400 }, { "epoch": 0.6877043649763845, "grad_norm": 0.7006638646125793, "learning_rate": 7.44691178851981e-05, "loss": 0.3061, "step": 185500 }, { "epoch": 0.6880750950922747, "grad_norm": 0.2547164261341095, "learning_rate": 7.444346504782838e-05, "loss": 0.3299, "step": 185600 }, { "epoch": 0.688445825208165, "grad_norm": 0.7784324288368225, "learning_rate": 7.441780375283713e-05, "loss": 0.347, "step": 185700 }, { "epoch": 0.6888165553240552, "grad_norm": 0.5545558333396912, "learning_rate": 7.439213400910334e-05, "loss": 0.314, "step": 185800 }, { "epoch": 0.6891872854399455, "grad_norm": 0.7974479794502258, "learning_rate": 7.436645582550894e-05, "loss": 0.3248, "step": 185900 }, { "epoch": 0.6895580155558356, "grad_norm": 0.6119667291641235, "learning_rate": 7.434076921093878e-05, "loss": 0.3104, "step": 186000 }, { "epoch": 0.6899287456717259, "grad_norm": 0.6379722356796265, "learning_rate": 7.431507417428062e-05, "loss": 0.3352, "step": 186100 }, { "epoch": 0.6902994757876161, "grad_norm": 0.6623979210853577, "learning_rate": 7.428937072442512e-05, "loss": 0.3041, "step": 186200 }, { "epoch": 0.6906702059035064, "grad_norm": 0.38964736461639404, "learning_rate": 7.42636588702659e-05, "loss": 0.3265, "step": 186300 }, { "epoch": 0.6910409360193966, "grad_norm": 0.5108358263969421, "learning_rate": 7.423793862069941e-05, "loss": 0.3237, "step": 186400 }, { "epoch": 0.6914116661352868, "grad_norm": 0.320951372385025, "learning_rate": 7.421220998462508e-05, "loss": 0.3077, "step": 186500 }, { "epoch": 0.6917823962511771, "grad_norm": 0.5915641188621521, "learning_rate": 7.418647297094519e-05, "loss": 0.3175, "step": 186600 }, { "epoch": 0.6921531263670673, "grad_norm": 0.4457387924194336, "learning_rate": 7.416072758856495e-05, "loss": 0.3459, "step": 186700 }, { "epoch": 0.6925238564829576, "grad_norm": 0.7447243332862854, "learning_rate": 7.413497384639246e-05, "loss": 0.3045, "step": 186800 }, { "epoch": 0.6928945865988477, "grad_norm": 0.6434532403945923, "learning_rate": 7.410921175333869e-05, "loss": 0.3383, "step": 186900 }, { "epoch": 0.693265316714738, "grad_norm": 0.5028476715087891, "learning_rate": 7.408344131831753e-05, "loss": 0.3181, "step": 187000 }, { "epoch": 0.6936360468306283, "grad_norm": 0.6377689838409424, "learning_rate": 7.405766255024575e-05, "loss": 0.3071, "step": 187100 }, { "epoch": 0.6940067769465185, "grad_norm": 0.43788260221481323, "learning_rate": 7.403187545804297e-05, "loss": 0.3298, "step": 187200 }, { "epoch": 0.6943775070624087, "grad_norm": 1.222129464149475, "learning_rate": 7.400608005063174e-05, "loss": 0.3059, "step": 187300 }, { "epoch": 0.6947482371782989, "grad_norm": 0.7180872559547424, "learning_rate": 7.398027633693746e-05, "loss": 0.3369, "step": 187400 }, { "epoch": 0.6951189672941892, "grad_norm": 0.7142365574836731, "learning_rate": 7.39544643258884e-05, "loss": 0.3044, "step": 187500 }, { "epoch": 0.6954896974100794, "grad_norm": 1.2585376501083374, "learning_rate": 7.392864402641571e-05, "loss": 0.3206, "step": 187600 }, { "epoch": 0.6958604275259697, "grad_norm": 0.9578207731246948, "learning_rate": 7.390281544745342e-05, "loss": 0.2967, "step": 187700 }, { "epoch": 0.6962311576418598, "grad_norm": 0.8429668545722961, "learning_rate": 7.387697859793838e-05, "loss": 0.3082, "step": 187800 }, { "epoch": 0.6966018877577501, "grad_norm": 0.5438064932823181, "learning_rate": 7.385113348681037e-05, "loss": 0.329, "step": 187900 }, { "epoch": 0.6969726178736404, "grad_norm": 0.35344892740249634, "learning_rate": 7.382528012301198e-05, "loss": 0.3175, "step": 188000 }, { "epoch": 0.6973433479895306, "grad_norm": 1.2347345352172852, "learning_rate": 7.379941851548864e-05, "loss": 0.357, "step": 188100 }, { "epoch": 0.6977140781054209, "grad_norm": 0.46499601006507874, "learning_rate": 7.37735486731887e-05, "loss": 0.3354, "step": 188200 }, { "epoch": 0.698084808221311, "grad_norm": 0.7512760162353516, "learning_rate": 7.37476706050633e-05, "loss": 0.3102, "step": 188300 }, { "epoch": 0.6984555383372013, "grad_norm": 0.4467937648296356, "learning_rate": 7.372178432006643e-05, "loss": 0.3306, "step": 188400 }, { "epoch": 0.6988262684530915, "grad_norm": 0.4034179449081421, "learning_rate": 7.369588982715496e-05, "loss": 0.342, "step": 188500 }, { "epoch": 0.6991969985689818, "grad_norm": 0.8997436761856079, "learning_rate": 7.366998713528858e-05, "loss": 0.3282, "step": 188600 }, { "epoch": 0.6995677286848719, "grad_norm": 0.31567975878715515, "learning_rate": 7.36440762534298e-05, "loss": 0.297, "step": 188700 }, { "epoch": 0.6999384588007622, "grad_norm": 0.37675440311431885, "learning_rate": 7.361815719054399e-05, "loss": 0.3253, "step": 188800 }, { "epoch": 0.7003091889166525, "grad_norm": 0.566193163394928, "learning_rate": 7.359222995559933e-05, "loss": 0.3038, "step": 188900 }, { "epoch": 0.7006799190325427, "grad_norm": 1.7031123638153076, "learning_rate": 7.356629455756685e-05, "loss": 0.3318, "step": 189000 }, { "epoch": 0.701050649148433, "grad_norm": 0.9364111423492432, "learning_rate": 7.354035100542036e-05, "loss": 0.3201, "step": 189100 }, { "epoch": 0.7014213792643231, "grad_norm": 0.6027671694755554, "learning_rate": 7.351439930813655e-05, "loss": 0.3137, "step": 189200 }, { "epoch": 0.7017921093802134, "grad_norm": 1.0103943347930908, "learning_rate": 7.348843947469492e-05, "loss": 0.3346, "step": 189300 }, { "epoch": 0.7021628394961036, "grad_norm": 0.5706481337547302, "learning_rate": 7.346247151407772e-05, "loss": 0.3475, "step": 189400 }, { "epoch": 0.7025335696119939, "grad_norm": 1.1142982244491577, "learning_rate": 7.343649543527009e-05, "loss": 0.3322, "step": 189500 }, { "epoch": 0.702904299727884, "grad_norm": 1.0004854202270508, "learning_rate": 7.341051124725994e-05, "loss": 0.3208, "step": 189600 }, { "epoch": 0.7032750298437743, "grad_norm": 0.2938682734966278, "learning_rate": 7.3384518959038e-05, "loss": 0.3227, "step": 189700 }, { "epoch": 0.7036457599596646, "grad_norm": 0.5536258220672607, "learning_rate": 7.335851857959777e-05, "loss": 0.321, "step": 189800 }, { "epoch": 0.7040164900755548, "grad_norm": 0.6051995754241943, "learning_rate": 7.33325101179356e-05, "loss": 0.3103, "step": 189900 }, { "epoch": 0.7043872201914451, "grad_norm": 0.3693952262401581, "learning_rate": 7.33064935830506e-05, "loss": 0.309, "step": 190000 }, { "epoch": 0.7047579503073352, "grad_norm": 0.372228741645813, "learning_rate": 7.328046898394471e-05, "loss": 0.3214, "step": 190100 }, { "epoch": 0.7051286804232255, "grad_norm": 0.8089569211006165, "learning_rate": 7.325443632962261e-05, "loss": 0.3415, "step": 190200 }, { "epoch": 0.7054994105391157, "grad_norm": 0.6680608987808228, "learning_rate": 7.322839562909178e-05, "loss": 0.2992, "step": 190300 }, { "epoch": 0.705870140655006, "grad_norm": 0.47035372257232666, "learning_rate": 7.320234689136254e-05, "loss": 0.2924, "step": 190400 }, { "epoch": 0.7062408707708963, "grad_norm": 0.5862602591514587, "learning_rate": 7.317629012544792e-05, "loss": 0.3139, "step": 190500 }, { "epoch": 0.7066116008867864, "grad_norm": 0.7823419570922852, "learning_rate": 7.315022534036375e-05, "loss": 0.3263, "step": 190600 }, { "epoch": 0.7069823310026767, "grad_norm": 0.6277729868888855, "learning_rate": 7.312415254512866e-05, "loss": 0.2909, "step": 190700 }, { "epoch": 0.7073530611185669, "grad_norm": 1.2645689249038696, "learning_rate": 7.309807174876402e-05, "loss": 0.3155, "step": 190800 }, { "epoch": 0.7077237912344572, "grad_norm": 0.4409535229206085, "learning_rate": 7.307198296029398e-05, "loss": 0.3341, "step": 190900 }, { "epoch": 0.7080945213503473, "grad_norm": 0.48958879709243774, "learning_rate": 7.304588618874545e-05, "loss": 0.3147, "step": 191000 }, { "epoch": 0.7084652514662376, "grad_norm": 0.29427191615104675, "learning_rate": 7.301978144314812e-05, "loss": 0.303, "step": 191100 }, { "epoch": 0.7088359815821278, "grad_norm": 0.8111467957496643, "learning_rate": 7.299366873253443e-05, "loss": 0.3109, "step": 191200 }, { "epoch": 0.7092067116980181, "grad_norm": 0.5425082445144653, "learning_rate": 7.296754806593958e-05, "loss": 0.3116, "step": 191300 }, { "epoch": 0.7095774418139084, "grad_norm": 0.6464499831199646, "learning_rate": 7.294141945240147e-05, "loss": 0.3158, "step": 191400 }, { "epoch": 0.7099481719297985, "grad_norm": 0.5491740107536316, "learning_rate": 7.291528290096085e-05, "loss": 0.3296, "step": 191500 }, { "epoch": 0.7103189020456888, "grad_norm": 0.44192609190940857, "learning_rate": 7.288913842066114e-05, "loss": 0.3378, "step": 191600 }, { "epoch": 0.710689632161579, "grad_norm": 0.6680096983909607, "learning_rate": 7.286298602054852e-05, "loss": 0.2921, "step": 191700 }, { "epoch": 0.7110603622774693, "grad_norm": 0.4591479003429413, "learning_rate": 7.283682570967192e-05, "loss": 0.2913, "step": 191800 }, { "epoch": 0.7114310923933594, "grad_norm": 1.4235607385635376, "learning_rate": 7.2810657497083e-05, "loss": 0.3083, "step": 191900 }, { "epoch": 0.7118018225092497, "grad_norm": 0.3024100363254547, "learning_rate": 7.278448139183618e-05, "loss": 0.3304, "step": 192000 }, { "epoch": 0.7121725526251399, "grad_norm": 0.4079939126968384, "learning_rate": 7.275829740298854e-05, "loss": 0.3142, "step": 192100 }, { "epoch": 0.7125432827410302, "grad_norm": 0.8111346364021301, "learning_rate": 7.273210553959998e-05, "loss": 0.3198, "step": 192200 }, { "epoch": 0.7129140128569205, "grad_norm": 0.573196530342102, "learning_rate": 7.270590581073308e-05, "loss": 0.3237, "step": 192300 }, { "epoch": 0.7132847429728106, "grad_norm": 0.3612860441207886, "learning_rate": 7.267969822545311e-05, "loss": 0.3162, "step": 192400 }, { "epoch": 0.7136554730887009, "grad_norm": 0.7423903346061707, "learning_rate": 7.26534827928281e-05, "loss": 0.2919, "step": 192500 }, { "epoch": 0.7140262032045911, "grad_norm": 0.9822202920913696, "learning_rate": 7.262725952192882e-05, "loss": 0.3158, "step": 192600 }, { "epoch": 0.7143969333204814, "grad_norm": 1.0921885967254639, "learning_rate": 7.260102842182867e-05, "loss": 0.3239, "step": 192700 }, { "epoch": 0.7147676634363715, "grad_norm": 0.1742420643568039, "learning_rate": 7.257478950160384e-05, "loss": 0.2824, "step": 192800 }, { "epoch": 0.7151383935522618, "grad_norm": 0.6755160689353943, "learning_rate": 7.254854277033317e-05, "loss": 0.3007, "step": 192900 }, { "epoch": 0.7155091236681521, "grad_norm": 0.5193705558776855, "learning_rate": 7.252228823709823e-05, "loss": 0.3057, "step": 193000 }, { "epoch": 0.7158798537840423, "grad_norm": 0.5497666597366333, "learning_rate": 7.24960259109833e-05, "loss": 0.3363, "step": 193100 }, { "epoch": 0.7162505838999326, "grad_norm": 0.872668445110321, "learning_rate": 7.246975580107532e-05, "loss": 0.3309, "step": 193200 }, { "epoch": 0.7166213140158227, "grad_norm": 1.3178340196609497, "learning_rate": 7.244347791646397e-05, "loss": 0.3398, "step": 193300 }, { "epoch": 0.716992044131713, "grad_norm": 0.33384308218955994, "learning_rate": 7.241719226624158e-05, "loss": 0.3283, "step": 193400 }, { "epoch": 0.7173627742476032, "grad_norm": 1.4054746627807617, "learning_rate": 7.239089885950316e-05, "loss": 0.2849, "step": 193500 }, { "epoch": 0.7177335043634935, "grad_norm": 1.027898907661438, "learning_rate": 7.236459770534645e-05, "loss": 0.3479, "step": 193600 }, { "epoch": 0.7181042344793837, "grad_norm": 0.9759005308151245, "learning_rate": 7.233828881287186e-05, "loss": 0.3349, "step": 193700 }, { "epoch": 0.7184749645952739, "grad_norm": 0.4213370382785797, "learning_rate": 7.231197219118244e-05, "loss": 0.3098, "step": 193800 }, { "epoch": 0.7188456947111642, "grad_norm": 0.5491893291473389, "learning_rate": 7.228564784938394e-05, "loss": 0.3152, "step": 193900 }, { "epoch": 0.7192164248270544, "grad_norm": 0.6764132380485535, "learning_rate": 7.225931579658476e-05, "loss": 0.3208, "step": 194000 }, { "epoch": 0.7195871549429447, "grad_norm": 0.9070931673049927, "learning_rate": 7.223297604189605e-05, "loss": 0.3289, "step": 194100 }, { "epoch": 0.7199578850588348, "grad_norm": 0.4266166687011719, "learning_rate": 7.220662859443149e-05, "loss": 0.2873, "step": 194200 }, { "epoch": 0.7203286151747251, "grad_norm": 0.5277365446090698, "learning_rate": 7.218027346330753e-05, "loss": 0.3275, "step": 194300 }, { "epoch": 0.7206993452906153, "grad_norm": 0.34685370326042175, "learning_rate": 7.215391065764325e-05, "loss": 0.3584, "step": 194400 }, { "epoch": 0.7210700754065056, "grad_norm": 0.6163163185119629, "learning_rate": 7.212754018656035e-05, "loss": 0.3178, "step": 194500 }, { "epoch": 0.7214408055223958, "grad_norm": 0.43930721282958984, "learning_rate": 7.210116205918322e-05, "loss": 0.291, "step": 194600 }, { "epoch": 0.721811535638286, "grad_norm": 0.31461799144744873, "learning_rate": 7.207477628463891e-05, "loss": 0.3164, "step": 194700 }, { "epoch": 0.7221822657541763, "grad_norm": 0.5337316989898682, "learning_rate": 7.204838287205707e-05, "loss": 0.3091, "step": 194800 }, { "epoch": 0.7225529958700665, "grad_norm": 0.8510362505912781, "learning_rate": 7.202198183057003e-05, "loss": 0.2975, "step": 194900 }, { "epoch": 0.7229237259859568, "grad_norm": 0.5381998419761658, "learning_rate": 7.199557316931274e-05, "loss": 0.3337, "step": 195000 }, { "epoch": 0.723294456101847, "grad_norm": 0.34809380769729614, "learning_rate": 7.19691568974228e-05, "loss": 0.3309, "step": 195100 }, { "epoch": 0.7236651862177372, "grad_norm": 1.469008207321167, "learning_rate": 7.194273302404044e-05, "loss": 0.3479, "step": 195200 }, { "epoch": 0.7240359163336274, "grad_norm": 1.8340859413146973, "learning_rate": 7.191630155830851e-05, "loss": 0.3188, "step": 195300 }, { "epoch": 0.7244066464495177, "grad_norm": 0.8306624889373779, "learning_rate": 7.188986250937249e-05, "loss": 0.3491, "step": 195400 }, { "epoch": 0.7247773765654079, "grad_norm": 0.6426912546157837, "learning_rate": 7.186341588638051e-05, "loss": 0.3158, "step": 195500 }, { "epoch": 0.7251481066812981, "grad_norm": 0.572904109954834, "learning_rate": 7.183696169848328e-05, "loss": 0.3415, "step": 195600 }, { "epoch": 0.7255188367971884, "grad_norm": 0.4712825119495392, "learning_rate": 7.181049995483416e-05, "loss": 0.3016, "step": 195700 }, { "epoch": 0.7258895669130786, "grad_norm": 0.3878413438796997, "learning_rate": 7.178403066458909e-05, "loss": 0.3169, "step": 195800 }, { "epoch": 0.7262602970289689, "grad_norm": 0.5720041990280151, "learning_rate": 7.175755383690666e-05, "loss": 0.3298, "step": 195900 }, { "epoch": 0.726631027144859, "grad_norm": 0.6043065786361694, "learning_rate": 7.173106948094805e-05, "loss": 0.3148, "step": 196000 }, { "epoch": 0.7270017572607493, "grad_norm": 0.9010827541351318, "learning_rate": 7.170457760587702e-05, "loss": 0.2922, "step": 196100 }, { "epoch": 0.7273724873766395, "grad_norm": 0.8916376829147339, "learning_rate": 7.167807822086e-05, "loss": 0.3194, "step": 196200 }, { "epoch": 0.7277432174925298, "grad_norm": 0.42507851123809814, "learning_rate": 7.165157133506595e-05, "loss": 0.3191, "step": 196300 }, { "epoch": 0.7281139476084201, "grad_norm": 0.7227169275283813, "learning_rate": 7.162505695766645e-05, "loss": 0.3293, "step": 196400 }, { "epoch": 0.7284846777243102, "grad_norm": 0.6014047861099243, "learning_rate": 7.159853509783567e-05, "loss": 0.3265, "step": 196500 }, { "epoch": 0.7288554078402005, "grad_norm": 0.6476625800132751, "learning_rate": 7.157200576475038e-05, "loss": 0.3057, "step": 196600 }, { "epoch": 0.7292261379560907, "grad_norm": 0.5827168226242065, "learning_rate": 7.154546896758993e-05, "loss": 0.32, "step": 196700 }, { "epoch": 0.729596868071981, "grad_norm": 0.708744466304779, "learning_rate": 7.151892471553626e-05, "loss": 0.3042, "step": 196800 }, { "epoch": 0.7299675981878712, "grad_norm": 1.241117000579834, "learning_rate": 7.149237301777384e-05, "loss": 0.3116, "step": 196900 }, { "epoch": 0.7303383283037614, "grad_norm": 0.33352959156036377, "learning_rate": 7.146581388348979e-05, "loss": 0.341, "step": 197000 }, { "epoch": 0.7307090584196516, "grad_norm": 0.7126948833465576, "learning_rate": 7.143924732187375e-05, "loss": 0.3141, "step": 197100 }, { "epoch": 0.7310797885355419, "grad_norm": 0.7836889624595642, "learning_rate": 7.141267334211797e-05, "loss": 0.3289, "step": 197200 }, { "epoch": 0.7314505186514322, "grad_norm": 1.2285977602005005, "learning_rate": 7.138609195341722e-05, "loss": 0.3198, "step": 197300 }, { "epoch": 0.7318212487673224, "grad_norm": 1.0989876985549927, "learning_rate": 7.135950316496888e-05, "loss": 0.3209, "step": 197400 }, { "epoch": 0.7321919788832126, "grad_norm": 0.4869151711463928, "learning_rate": 7.133290698597287e-05, "loss": 0.3237, "step": 197500 }, { "epoch": 0.7325627089991028, "grad_norm": 0.5135291218757629, "learning_rate": 7.130630342563165e-05, "loss": 0.3336, "step": 197600 }, { "epoch": 0.7329334391149931, "grad_norm": 0.5585698485374451, "learning_rate": 7.127969249315026e-05, "loss": 0.3376, "step": 197700 }, { "epoch": 0.7333041692308833, "grad_norm": 1.2863810062408447, "learning_rate": 7.125307419773628e-05, "loss": 0.3096, "step": 197800 }, { "epoch": 0.7336748993467735, "grad_norm": 0.7672189474105835, "learning_rate": 7.122644854859985e-05, "loss": 0.2961, "step": 197900 }, { "epoch": 0.7340456294626637, "grad_norm": 0.9674240350723267, "learning_rate": 7.11998155549536e-05, "loss": 0.2929, "step": 198000 }, { "epoch": 0.734416359578554, "grad_norm": 0.8717834949493408, "learning_rate": 7.117317522601282e-05, "loss": 0.3272, "step": 198100 }, { "epoch": 0.7347870896944443, "grad_norm": 0.5999615788459778, "learning_rate": 7.11465275709952e-05, "loss": 0.3322, "step": 198200 }, { "epoch": 0.7351578198103345, "grad_norm": 1.3955028057098389, "learning_rate": 7.111987259912105e-05, "loss": 0.3213, "step": 198300 }, { "epoch": 0.7355285499262247, "grad_norm": 0.34414756298065186, "learning_rate": 7.109321031961318e-05, "loss": 0.3222, "step": 198400 }, { "epoch": 0.7358992800421149, "grad_norm": 0.6956194639205933, "learning_rate": 7.106654074169697e-05, "loss": 0.3063, "step": 198500 }, { "epoch": 0.7362700101580052, "grad_norm": 0.8057518005371094, "learning_rate": 7.103986387460027e-05, "loss": 0.3101, "step": 198600 }, { "epoch": 0.7366407402738954, "grad_norm": 0.5814351439476013, "learning_rate": 7.101317972755346e-05, "loss": 0.3318, "step": 198700 }, { "epoch": 0.7370114703897857, "grad_norm": 0.6112227439880371, "learning_rate": 7.098648830978946e-05, "loss": 0.3207, "step": 198800 }, { "epoch": 0.7373822005056759, "grad_norm": 0.9435766339302063, "learning_rate": 7.095978963054373e-05, "loss": 0.3304, "step": 198900 }, { "epoch": 0.7377529306215661, "grad_norm": 0.4269554913043976, "learning_rate": 7.093308369905418e-05, "loss": 0.286, "step": 199000 }, { "epoch": 0.7381236607374564, "grad_norm": 0.6470925807952881, "learning_rate": 7.090637052456126e-05, "loss": 0.3155, "step": 199100 }, { "epoch": 0.7384943908533466, "grad_norm": 1.3067463636398315, "learning_rate": 7.087965011630797e-05, "loss": 0.3396, "step": 199200 }, { "epoch": 0.7388651209692368, "grad_norm": 0.8024086356163025, "learning_rate": 7.085292248353972e-05, "loss": 0.3136, "step": 199300 }, { "epoch": 0.739235851085127, "grad_norm": 0.8889555931091309, "learning_rate": 7.082618763550451e-05, "loss": 0.3243, "step": 199400 }, { "epoch": 0.7396065812010173, "grad_norm": 0.571980893611908, "learning_rate": 7.079944558145275e-05, "loss": 0.3397, "step": 199500 }, { "epoch": 0.7399773113169075, "grad_norm": 0.9070281386375427, "learning_rate": 7.077269633063744e-05, "loss": 0.297, "step": 199600 }, { "epoch": 0.7403480414327978, "grad_norm": 0.6628014445304871, "learning_rate": 7.074593989231402e-05, "loss": 0.3314, "step": 199700 }, { "epoch": 0.740718771548688, "grad_norm": 0.8994859457015991, "learning_rate": 7.071917627574037e-05, "loss": 0.3413, "step": 199800 }, { "epoch": 0.7410895016645782, "grad_norm": 1.1658884286880493, "learning_rate": 7.069240549017692e-05, "loss": 0.3203, "step": 199900 }, { "epoch": 0.7414602317804685, "grad_norm": 0.5242553949356079, "learning_rate": 7.066562754488661e-05, "loss": 0.3197, "step": 200000 }, { "epoch": 0.7418309618963587, "grad_norm": 2.133249282836914, "learning_rate": 7.063884244913474e-05, "loss": 0.3277, "step": 200100 }, { "epoch": 0.742201692012249, "grad_norm": 0.5624331831932068, "learning_rate": 7.061205021218919e-05, "loss": 0.2984, "step": 200200 }, { "epoch": 0.7425724221281391, "grad_norm": 1.2335107326507568, "learning_rate": 7.058525084332028e-05, "loss": 0.2998, "step": 200300 }, { "epoch": 0.7429431522440294, "grad_norm": 0.6902004480361938, "learning_rate": 7.05584443518008e-05, "loss": 0.2933, "step": 200400 }, { "epoch": 0.7433138823599196, "grad_norm": 0.39860767126083374, "learning_rate": 7.053163074690592e-05, "loss": 0.3019, "step": 200500 }, { "epoch": 0.7436846124758099, "grad_norm": 0.9121852517127991, "learning_rate": 7.050481003791344e-05, "loss": 0.3456, "step": 200600 }, { "epoch": 0.7440553425917001, "grad_norm": 1.4677352905273438, "learning_rate": 7.047798223410348e-05, "loss": 0.3293, "step": 200700 }, { "epoch": 0.7444260727075903, "grad_norm": 0.49643298983573914, "learning_rate": 7.045114734475869e-05, "loss": 0.3294, "step": 200800 }, { "epoch": 0.7447968028234806, "grad_norm": 0.7204617857933044, "learning_rate": 7.04243053791641e-05, "loss": 0.3232, "step": 200900 }, { "epoch": 0.7451675329393708, "grad_norm": 0.7800295948982239, "learning_rate": 7.039745634660726e-05, "loss": 0.3488, "step": 201000 }, { "epoch": 0.745538263055261, "grad_norm": 0.7416294813156128, "learning_rate": 7.037060025637814e-05, "loss": 0.3128, "step": 201100 }, { "epoch": 0.7459089931711512, "grad_norm": 0.4878130257129669, "learning_rate": 7.034373711776911e-05, "loss": 0.3194, "step": 201200 }, { "epoch": 0.7462797232870415, "grad_norm": 0.7884534001350403, "learning_rate": 7.031686694007505e-05, "loss": 0.3108, "step": 201300 }, { "epoch": 0.7466504534029317, "grad_norm": 0.6076580882072449, "learning_rate": 7.028998973259325e-05, "loss": 0.3529, "step": 201400 }, { "epoch": 0.747021183518822, "grad_norm": 0.7954737544059753, "learning_rate": 7.026310550462342e-05, "loss": 0.3055, "step": 201500 }, { "epoch": 0.7473919136347122, "grad_norm": 0.6339066624641418, "learning_rate": 7.023621426546767e-05, "loss": 0.3189, "step": 201600 }, { "epoch": 0.7477626437506024, "grad_norm": 0.37482863664627075, "learning_rate": 7.020931602443059e-05, "loss": 0.3211, "step": 201700 }, { "epoch": 0.7481333738664927, "grad_norm": 0.5946445465087891, "learning_rate": 7.01824107908192e-05, "loss": 0.2964, "step": 201800 }, { "epoch": 0.7485041039823829, "grad_norm": 0.6195926666259766, "learning_rate": 7.015549857394288e-05, "loss": 0.34, "step": 201900 }, { "epoch": 0.7488748340982732, "grad_norm": 0.43932044506073, "learning_rate": 7.012857938311346e-05, "loss": 0.3353, "step": 202000 }, { "epoch": 0.7492455642141633, "grad_norm": 1.3954063653945923, "learning_rate": 7.010165322764518e-05, "loss": 0.3421, "step": 202100 }, { "epoch": 0.7496162943300536, "grad_norm": 0.5094516277313232, "learning_rate": 7.007472011685472e-05, "loss": 0.3309, "step": 202200 }, { "epoch": 0.7499870244459439, "grad_norm": 0.6015029549598694, "learning_rate": 7.00477800600611e-05, "loss": 0.3197, "step": 202300 }, { "epoch": 0.7503577545618341, "grad_norm": 0.32585474848747253, "learning_rate": 7.00208330665858e-05, "loss": 0.3307, "step": 202400 }, { "epoch": 0.7507284846777244, "grad_norm": 0.531728208065033, "learning_rate": 6.999387914575268e-05, "loss": 0.2999, "step": 202500 }, { "epoch": 0.7510992147936145, "grad_norm": 0.7991048693656921, "learning_rate": 6.996691830688798e-05, "loss": 0.3305, "step": 202600 }, { "epoch": 0.7514699449095048, "grad_norm": 0.7161388993263245, "learning_rate": 6.993995055932039e-05, "loss": 0.3004, "step": 202700 }, { "epoch": 0.751840675025395, "grad_norm": 0.6533925533294678, "learning_rate": 6.991297591238091e-05, "loss": 0.2934, "step": 202800 }, { "epoch": 0.7522114051412853, "grad_norm": 0.37406623363494873, "learning_rate": 6.988599437540301e-05, "loss": 0.304, "step": 202900 }, { "epoch": 0.7525821352571754, "grad_norm": 0.7458063364028931, "learning_rate": 6.985900595772248e-05, "loss": 0.3511, "step": 203000 }, { "epoch": 0.7529528653730657, "grad_norm": 0.6428525447845459, "learning_rate": 6.98320106686775e-05, "loss": 0.2987, "step": 203100 }, { "epoch": 0.753323595488956, "grad_norm": 0.3017052114009857, "learning_rate": 6.980500851760866e-05, "loss": 0.3401, "step": 203200 }, { "epoch": 0.7536943256048462, "grad_norm": 0.6199378371238708, "learning_rate": 6.977799951385892e-05, "loss": 0.3096, "step": 203300 }, { "epoch": 0.7540650557207365, "grad_norm": 0.5517586469650269, "learning_rate": 6.975098366677358e-05, "loss": 0.3403, "step": 203400 }, { "epoch": 0.7544357858366266, "grad_norm": 0.5021377801895142, "learning_rate": 6.97239609857003e-05, "loss": 0.3287, "step": 203500 }, { "epoch": 0.7548065159525169, "grad_norm": 0.46017348766326904, "learning_rate": 6.969693147998917e-05, "loss": 0.3073, "step": 203600 }, { "epoch": 0.7551772460684071, "grad_norm": 0.9235522747039795, "learning_rate": 6.96698951589926e-05, "loss": 0.2845, "step": 203700 }, { "epoch": 0.7555479761842974, "grad_norm": 0.5486223697662354, "learning_rate": 6.964285203206533e-05, "loss": 0.326, "step": 203800 }, { "epoch": 0.7559187063001875, "grad_norm": 1.2264740467071533, "learning_rate": 6.96158021085645e-05, "loss": 0.3233, "step": 203900 }, { "epoch": 0.7562894364160778, "grad_norm": 1.1207319498062134, "learning_rate": 6.95887453978496e-05, "loss": 0.3268, "step": 204000 }, { "epoch": 0.7566601665319681, "grad_norm": 0.6873946785926819, "learning_rate": 6.956168190928242e-05, "loss": 0.3, "step": 204100 }, { "epoch": 0.7570308966478583, "grad_norm": 0.8785740733146667, "learning_rate": 6.953461165222716e-05, "loss": 0.3201, "step": 204200 }, { "epoch": 0.7574016267637486, "grad_norm": 0.7796656489372253, "learning_rate": 6.950753463605033e-05, "loss": 0.3211, "step": 204300 }, { "epoch": 0.7577723568796387, "grad_norm": 0.5560203194618225, "learning_rate": 6.948045087012074e-05, "loss": 0.3143, "step": 204400 }, { "epoch": 0.758143086995529, "grad_norm": 1.0751484632492065, "learning_rate": 6.945336036380962e-05, "loss": 0.2733, "step": 204500 }, { "epoch": 0.7585138171114192, "grad_norm": 1.0168095827102661, "learning_rate": 6.942626312649048e-05, "loss": 0.2911, "step": 204600 }, { "epoch": 0.7588845472273095, "grad_norm": 0.4691696763038635, "learning_rate": 6.939915916753915e-05, "loss": 0.3065, "step": 204700 }, { "epoch": 0.7592552773431998, "grad_norm": 0.873715341091156, "learning_rate": 6.937204849633383e-05, "loss": 0.2927, "step": 204800 }, { "epoch": 0.7596260074590899, "grad_norm": 0.5877065658569336, "learning_rate": 6.934493112225497e-05, "loss": 0.3345, "step": 204900 }, { "epoch": 0.7599967375749802, "grad_norm": 1.132154941558838, "learning_rate": 6.931780705468542e-05, "loss": 0.3229, "step": 205000 }, { "epoch": 0.7603674676908704, "grad_norm": 0.8767668604850769, "learning_rate": 6.929067630301032e-05, "loss": 0.3369, "step": 205100 }, { "epoch": 0.7607381978067607, "grad_norm": 0.6177182793617249, "learning_rate": 6.926353887661707e-05, "loss": 0.2942, "step": 205200 }, { "epoch": 0.7611089279226508, "grad_norm": 0.3501046895980835, "learning_rate": 6.923639478489545e-05, "loss": 0.3269, "step": 205300 }, { "epoch": 0.7614796580385411, "grad_norm": 0.42563992738723755, "learning_rate": 6.920924403723751e-05, "loss": 0.3013, "step": 205400 }, { "epoch": 0.7618503881544313, "grad_norm": 1.0885518789291382, "learning_rate": 6.918208664303762e-05, "loss": 0.3142, "step": 205500 }, { "epoch": 0.7622211182703216, "grad_norm": 0.8619701266288757, "learning_rate": 6.915492261169244e-05, "loss": 0.3098, "step": 205600 }, { "epoch": 0.7625918483862119, "grad_norm": 1.2247068881988525, "learning_rate": 6.912775195260093e-05, "loss": 0.311, "step": 205700 }, { "epoch": 0.762962578502102, "grad_norm": 0.2911297082901001, "learning_rate": 6.91005746751643e-05, "loss": 0.3499, "step": 205800 }, { "epoch": 0.7633333086179923, "grad_norm": 0.7690377831459045, "learning_rate": 6.907339078878617e-05, "loss": 0.2977, "step": 205900 }, { "epoch": 0.7637040387338825, "grad_norm": 1.3321866989135742, "learning_rate": 6.904620030287231e-05, "loss": 0.2848, "step": 206000 }, { "epoch": 0.7640747688497728, "grad_norm": 1.129806399345398, "learning_rate": 6.901900322683084e-05, "loss": 0.2883, "step": 206100 }, { "epoch": 0.7644454989656629, "grad_norm": 0.38468143343925476, "learning_rate": 6.899179957007217e-05, "loss": 0.3172, "step": 206200 }, { "epoch": 0.7648162290815532, "grad_norm": 0.9336357116699219, "learning_rate": 6.896458934200895e-05, "loss": 0.3177, "step": 206300 }, { "epoch": 0.7651869591974434, "grad_norm": 0.5407620072364807, "learning_rate": 6.893737255205611e-05, "loss": 0.3018, "step": 206400 }, { "epoch": 0.7655576893133337, "grad_norm": 0.6193931102752686, "learning_rate": 6.89101492096309e-05, "loss": 0.3289, "step": 206500 }, { "epoch": 0.765928419429224, "grad_norm": 0.5232301354408264, "learning_rate": 6.888291932415278e-05, "loss": 0.3177, "step": 206600 }, { "epoch": 0.7662991495451141, "grad_norm": 0.4669756591320038, "learning_rate": 6.88556829050435e-05, "loss": 0.3386, "step": 206700 }, { "epoch": 0.7666698796610044, "grad_norm": 1.132905125617981, "learning_rate": 6.882843996172703e-05, "loss": 0.3021, "step": 206800 }, { "epoch": 0.7670406097768946, "grad_norm": 2.196772813796997, "learning_rate": 6.880119050362968e-05, "loss": 0.3383, "step": 206900 }, { "epoch": 0.7674113398927849, "grad_norm": 0.621311366558075, "learning_rate": 6.877393454017996e-05, "loss": 0.2897, "step": 207000 }, { "epoch": 0.767782070008675, "grad_norm": 0.8657070398330688, "learning_rate": 6.87466720808086e-05, "loss": 0.3285, "step": 207100 }, { "epoch": 0.7681528001245653, "grad_norm": 0.8168548941612244, "learning_rate": 6.871940313494866e-05, "loss": 0.329, "step": 207200 }, { "epoch": 0.7685235302404555, "grad_norm": 0.8084076642990112, "learning_rate": 6.869212771203536e-05, "loss": 0.3353, "step": 207300 }, { "epoch": 0.7688942603563458, "grad_norm": 0.5358544588088989, "learning_rate": 6.866484582150622e-05, "loss": 0.3184, "step": 207400 }, { "epoch": 0.7692649904722361, "grad_norm": 0.47492632269859314, "learning_rate": 6.863755747280097e-05, "loss": 0.3214, "step": 207500 }, { "epoch": 0.7696357205881262, "grad_norm": 0.6547372341156006, "learning_rate": 6.86102626753616e-05, "loss": 0.3307, "step": 207600 }, { "epoch": 0.7700064507040165, "grad_norm": 1.0061274766921997, "learning_rate": 6.858296143863231e-05, "loss": 0.351, "step": 207700 }, { "epoch": 0.7703771808199067, "grad_norm": 0.4372013509273529, "learning_rate": 6.85556537720595e-05, "loss": 0.3318, "step": 207800 }, { "epoch": 0.770747910935797, "grad_norm": 0.24043744802474976, "learning_rate": 6.852833968509187e-05, "loss": 0.3068, "step": 207900 }, { "epoch": 0.7711186410516871, "grad_norm": 0.419929563999176, "learning_rate": 6.850101918718027e-05, "loss": 0.2947, "step": 208000 }, { "epoch": 0.7714893711675774, "grad_norm": 0.7054558992385864, "learning_rate": 6.847369228777781e-05, "loss": 0.2989, "step": 208100 }, { "epoch": 0.7718601012834677, "grad_norm": 0.614231526851654, "learning_rate": 6.844635899633979e-05, "loss": 0.319, "step": 208200 }, { "epoch": 0.7722308313993579, "grad_norm": 0.9969512820243835, "learning_rate": 6.841901932232373e-05, "loss": 0.3208, "step": 208300 }, { "epoch": 0.7726015615152482, "grad_norm": 0.46414944529533386, "learning_rate": 6.839167327518939e-05, "loss": 0.3119, "step": 208400 }, { "epoch": 0.7729722916311383, "grad_norm": 1.1410635709762573, "learning_rate": 6.836432086439867e-05, "loss": 0.3195, "step": 208500 }, { "epoch": 0.7733430217470286, "grad_norm": 0.47132185101509094, "learning_rate": 6.833696209941575e-05, "loss": 0.2908, "step": 208600 }, { "epoch": 0.7737137518629188, "grad_norm": 0.48051026463508606, "learning_rate": 6.830959698970691e-05, "loss": 0.3336, "step": 208700 }, { "epoch": 0.7740844819788091, "grad_norm": 1.5586639642715454, "learning_rate": 6.828222554474076e-05, "loss": 0.327, "step": 208800 }, { "epoch": 0.7744552120946993, "grad_norm": 0.9017274975776672, "learning_rate": 6.825484777398793e-05, "loss": 0.3063, "step": 208900 }, { "epoch": 0.7748259422105895, "grad_norm": 0.46842658519744873, "learning_rate": 6.822746368692142e-05, "loss": 0.299, "step": 209000 }, { "epoch": 0.7751966723264798, "grad_norm": 1.4246816635131836, "learning_rate": 6.820007329301627e-05, "loss": 0.2967, "step": 209100 }, { "epoch": 0.77556740244237, "grad_norm": 0.8373568654060364, "learning_rate": 6.81726766017498e-05, "loss": 0.3348, "step": 209200 }, { "epoch": 0.7759381325582603, "grad_norm": 0.9130806922912598, "learning_rate": 6.814527362260144e-05, "loss": 0.2974, "step": 209300 }, { "epoch": 0.7763088626741504, "grad_norm": 0.34992679953575134, "learning_rate": 6.811786436505284e-05, "loss": 0.3086, "step": 209400 }, { "epoch": 0.7766795927900407, "grad_norm": 0.7490069270133972, "learning_rate": 6.809044883858782e-05, "loss": 0.2991, "step": 209500 }, { "epoch": 0.7770503229059309, "grad_norm": 0.273250937461853, "learning_rate": 6.806302705269234e-05, "loss": 0.3193, "step": 209600 }, { "epoch": 0.7774210530218212, "grad_norm": 0.4195079207420349, "learning_rate": 6.803559901685452e-05, "loss": 0.2855, "step": 209700 }, { "epoch": 0.7777917831377114, "grad_norm": 0.6205518245697021, "learning_rate": 6.800816474056471e-05, "loss": 0.3144, "step": 209800 }, { "epoch": 0.7781625132536016, "grad_norm": 1.0212677717208862, "learning_rate": 6.798072423331536e-05, "loss": 0.3383, "step": 209900 }, { "epoch": 0.7785332433694919, "grad_norm": 0.6253033876419067, "learning_rate": 6.795327750460108e-05, "loss": 0.2899, "step": 210000 }, { "epoch": 0.7789039734853821, "grad_norm": 0.6635670065879822, "learning_rate": 6.792582456391865e-05, "loss": 0.3506, "step": 210100 }, { "epoch": 0.7792747036012724, "grad_norm": 0.8136152029037476, "learning_rate": 6.7898365420767e-05, "loss": 0.3206, "step": 210200 }, { "epoch": 0.7796454337171626, "grad_norm": 0.9446188807487488, "learning_rate": 6.787090008464718e-05, "loss": 0.3337, "step": 210300 }, { "epoch": 0.7800161638330528, "grad_norm": 2.1359519958496094, "learning_rate": 6.78434285650624e-05, "loss": 0.2875, "step": 210400 }, { "epoch": 0.780386893948943, "grad_norm": 0.7881209850311279, "learning_rate": 6.781595087151804e-05, "loss": 0.3167, "step": 210500 }, { "epoch": 0.7807576240648333, "grad_norm": 0.641974151134491, "learning_rate": 6.778846701352157e-05, "loss": 0.333, "step": 210600 }, { "epoch": 0.7811283541807236, "grad_norm": 0.4060905873775482, "learning_rate": 6.776097700058261e-05, "loss": 0.3131, "step": 210700 }, { "epoch": 0.7814990842966137, "grad_norm": 0.9936855435371399, "learning_rate": 6.773348084221291e-05, "loss": 0.3251, "step": 210800 }, { "epoch": 0.781869814412504, "grad_norm": 0.937762439250946, "learning_rate": 6.770597854792635e-05, "loss": 0.3393, "step": 210900 }, { "epoch": 0.7822405445283942, "grad_norm": 1.1134600639343262, "learning_rate": 6.767847012723894e-05, "loss": 0.3515, "step": 211000 }, { "epoch": 0.7826112746442845, "grad_norm": 0.9024290442466736, "learning_rate": 6.765095558966875e-05, "loss": 0.3258, "step": 211100 }, { "epoch": 0.7829820047601747, "grad_norm": 0.6683372855186462, "learning_rate": 6.762343494473607e-05, "loss": 0.3399, "step": 211200 }, { "epoch": 0.7833527348760649, "grad_norm": 0.42752566933631897, "learning_rate": 6.759590820196323e-05, "loss": 0.3226, "step": 211300 }, { "epoch": 0.7837234649919551, "grad_norm": 0.6444758176803589, "learning_rate": 6.75683753708747e-05, "loss": 0.3059, "step": 211400 }, { "epoch": 0.7840941951078454, "grad_norm": 0.44265517592430115, "learning_rate": 6.7540836460997e-05, "loss": 0.3294, "step": 211500 }, { "epoch": 0.7844649252237357, "grad_norm": 1.4465370178222656, "learning_rate": 6.751329148185885e-05, "loss": 0.3091, "step": 211600 }, { "epoch": 0.7848356553396258, "grad_norm": 0.5418146848678589, "learning_rate": 6.748574044299101e-05, "loss": 0.3088, "step": 211700 }, { "epoch": 0.7852063854555161, "grad_norm": 0.6203121542930603, "learning_rate": 6.745818335392632e-05, "loss": 0.2914, "step": 211800 }, { "epoch": 0.7855771155714063, "grad_norm": 0.7060174345970154, "learning_rate": 6.743062022419977e-05, "loss": 0.304, "step": 211900 }, { "epoch": 0.7859478456872966, "grad_norm": 0.6669508814811707, "learning_rate": 6.740305106334838e-05, "loss": 0.2793, "step": 212000 }, { "epoch": 0.7863185758031868, "grad_norm": 0.7302775382995605, "learning_rate": 6.737547588091131e-05, "loss": 0.3521, "step": 212100 }, { "epoch": 0.786689305919077, "grad_norm": 0.6964245438575745, "learning_rate": 6.734789468642976e-05, "loss": 0.3153, "step": 212200 }, { "epoch": 0.7870600360349672, "grad_norm": 0.583748996257782, "learning_rate": 6.732030748944704e-05, "loss": 0.3271, "step": 212300 }, { "epoch": 0.7874307661508575, "grad_norm": 0.7438490390777588, "learning_rate": 6.729271429950853e-05, "loss": 0.3025, "step": 212400 }, { "epoch": 0.7878014962667478, "grad_norm": 0.6366211771965027, "learning_rate": 6.726511512616166e-05, "loss": 0.3212, "step": 212500 }, { "epoch": 0.788172226382638, "grad_norm": 0.8779458999633789, "learning_rate": 6.723750997895598e-05, "loss": 0.3155, "step": 212600 }, { "epoch": 0.7885429564985282, "grad_norm": 0.2891848385334015, "learning_rate": 6.720989886744304e-05, "loss": 0.3096, "step": 212700 }, { "epoch": 0.7889136866144184, "grad_norm": 1.5043823719024658, "learning_rate": 6.718228180117655e-05, "loss": 0.3227, "step": 212800 }, { "epoch": 0.7892844167303087, "grad_norm": 0.8283397555351257, "learning_rate": 6.715465878971214e-05, "loss": 0.3374, "step": 212900 }, { "epoch": 0.7896551468461989, "grad_norm": 0.37572938203811646, "learning_rate": 6.712702984260764e-05, "loss": 0.2929, "step": 213000 }, { "epoch": 0.7900258769620891, "grad_norm": 0.6094139814376831, "learning_rate": 6.709939496942285e-05, "loss": 0.3166, "step": 213100 }, { "epoch": 0.7903966070779793, "grad_norm": 0.42399919033050537, "learning_rate": 6.707175417971965e-05, "loss": 0.3182, "step": 213200 }, { "epoch": 0.7907673371938696, "grad_norm": 0.6061890721321106, "learning_rate": 6.704410748306195e-05, "loss": 0.3053, "step": 213300 }, { "epoch": 0.7911380673097599, "grad_norm": 0.6341848373413086, "learning_rate": 6.701645488901573e-05, "loss": 0.316, "step": 213400 }, { "epoch": 0.7915087974256501, "grad_norm": 0.46314120292663574, "learning_rate": 6.698879640714897e-05, "loss": 0.301, "step": 213500 }, { "epoch": 0.7918795275415403, "grad_norm": 0.3992922008037567, "learning_rate": 6.696113204703174e-05, "loss": 0.3024, "step": 213600 }, { "epoch": 0.7922502576574305, "grad_norm": 0.5197064876556396, "learning_rate": 6.69334618182361e-05, "loss": 0.3142, "step": 213700 }, { "epoch": 0.7926209877733208, "grad_norm": 0.32292619347572327, "learning_rate": 6.690578573033615e-05, "loss": 0.3037, "step": 213800 }, { "epoch": 0.792991717889211, "grad_norm": 0.6675283908843994, "learning_rate": 6.687810379290802e-05, "loss": 0.3091, "step": 213900 }, { "epoch": 0.7933624480051013, "grad_norm": 0.42287442088127136, "learning_rate": 6.685041601552988e-05, "loss": 0.3287, "step": 214000 }, { "epoch": 0.7937331781209915, "grad_norm": 0.5739895105361938, "learning_rate": 6.682272240778193e-05, "loss": 0.3069, "step": 214100 }, { "epoch": 0.7941039082368817, "grad_norm": 0.9513316750526428, "learning_rate": 6.67950229792463e-05, "loss": 0.3091, "step": 214200 }, { "epoch": 0.794474638352772, "grad_norm": 0.7072402238845825, "learning_rate": 6.676731773950728e-05, "loss": 0.326, "step": 214300 }, { "epoch": 0.7948453684686622, "grad_norm": 0.5798244476318359, "learning_rate": 6.673960669815101e-05, "loss": 0.3429, "step": 214400 }, { "epoch": 0.7952160985845524, "grad_norm": 0.8645613193511963, "learning_rate": 6.671188986476577e-05, "loss": 0.3312, "step": 214500 }, { "epoch": 0.7955868287004426, "grad_norm": 0.9280213117599487, "learning_rate": 6.668416724894179e-05, "loss": 0.3125, "step": 214600 }, { "epoch": 0.7959575588163329, "grad_norm": 0.5828027725219727, "learning_rate": 6.665643886027129e-05, "loss": 0.3287, "step": 214700 }, { "epoch": 0.7963282889322231, "grad_norm": 0.5976018309593201, "learning_rate": 6.662870470834848e-05, "loss": 0.3235, "step": 214800 }, { "epoch": 0.7966990190481134, "grad_norm": 0.944126307964325, "learning_rate": 6.660096480276961e-05, "loss": 0.3058, "step": 214900 }, { "epoch": 0.7970697491640036, "grad_norm": 1.0730150938034058, "learning_rate": 6.65732191531329e-05, "loss": 0.324, "step": 215000 }, { "epoch": 0.7974404792798938, "grad_norm": 1.051536202430725, "learning_rate": 6.654546776903853e-05, "loss": 0.3021, "step": 215100 }, { "epoch": 0.7978112093957841, "grad_norm": 0.5440613627433777, "learning_rate": 6.651771066008869e-05, "loss": 0.2915, "step": 215200 }, { "epoch": 0.7981819395116743, "grad_norm": 0.632527232170105, "learning_rate": 6.648994783588756e-05, "loss": 0.345, "step": 215300 }, { "epoch": 0.7985526696275645, "grad_norm": 0.9354051947593689, "learning_rate": 6.646217930604129e-05, "loss": 0.2979, "step": 215400 }, { "epoch": 0.7989233997434547, "grad_norm": 0.7052352428436279, "learning_rate": 6.643440508015797e-05, "loss": 0.3133, "step": 215500 }, { "epoch": 0.799294129859345, "grad_norm": 0.8514916896820068, "learning_rate": 6.640662516784771e-05, "loss": 0.3035, "step": 215600 }, { "epoch": 0.7996648599752352, "grad_norm": 0.6174629926681519, "learning_rate": 6.637883957872256e-05, "loss": 0.3047, "step": 215700 }, { "epoch": 0.8000355900911255, "grad_norm": 0.5582329630851746, "learning_rate": 6.635104832239656e-05, "loss": 0.3255, "step": 215800 }, { "epoch": 0.8004063202070157, "grad_norm": 0.49659520387649536, "learning_rate": 6.632325140848566e-05, "loss": 0.3265, "step": 215900 }, { "epoch": 0.8007770503229059, "grad_norm": 0.4644688665866852, "learning_rate": 6.629544884660782e-05, "loss": 0.2759, "step": 216000 }, { "epoch": 0.8011477804387962, "grad_norm": 0.8786397576332092, "learning_rate": 6.626764064638294e-05, "loss": 0.3143, "step": 216100 }, { "epoch": 0.8015185105546864, "grad_norm": 0.32554078102111816, "learning_rate": 6.623982681743285e-05, "loss": 0.3133, "step": 216200 }, { "epoch": 0.8018892406705767, "grad_norm": 0.7726821899414062, "learning_rate": 6.621200736938133e-05, "loss": 0.3113, "step": 216300 }, { "epoch": 0.8022599707864668, "grad_norm": 0.8514429330825806, "learning_rate": 6.618418231185417e-05, "loss": 0.3102, "step": 216400 }, { "epoch": 0.8026307009023571, "grad_norm": 1.5117733478546143, "learning_rate": 6.6156351654479e-05, "loss": 0.3151, "step": 216500 }, { "epoch": 0.8030014310182474, "grad_norm": 0.6686671376228333, "learning_rate": 6.612851540688545e-05, "loss": 0.3289, "step": 216600 }, { "epoch": 0.8033721611341376, "grad_norm": 0.4649638831615448, "learning_rate": 6.610067357870507e-05, "loss": 0.33, "step": 216700 }, { "epoch": 0.8037428912500278, "grad_norm": 0.31841006875038147, "learning_rate": 6.607282617957132e-05, "loss": 0.3321, "step": 216800 }, { "epoch": 0.804113621365918, "grad_norm": 1.0697213411331177, "learning_rate": 6.604497321911965e-05, "loss": 0.3006, "step": 216900 }, { "epoch": 0.8044843514818083, "grad_norm": 0.9204962849617004, "learning_rate": 6.601711470698736e-05, "loss": 0.3235, "step": 217000 }, { "epoch": 0.8048550815976985, "grad_norm": 0.9542434215545654, "learning_rate": 6.598925065281371e-05, "loss": 0.3414, "step": 217100 }, { "epoch": 0.8052258117135888, "grad_norm": 1.0570188760757446, "learning_rate": 6.59613810662399e-05, "loss": 0.3056, "step": 217200 }, { "epoch": 0.8055965418294789, "grad_norm": 0.5721613168716431, "learning_rate": 6.593350595690896e-05, "loss": 0.3165, "step": 217300 }, { "epoch": 0.8059672719453692, "grad_norm": 0.9715991616249084, "learning_rate": 6.590562533446592e-05, "loss": 0.2964, "step": 217400 }, { "epoch": 0.8063380020612595, "grad_norm": 0.670779824256897, "learning_rate": 6.58777392085577e-05, "loss": 0.3234, "step": 217500 }, { "epoch": 0.8067087321771497, "grad_norm": 0.46015724539756775, "learning_rate": 6.584984758883306e-05, "loss": 0.2995, "step": 217600 }, { "epoch": 0.80707946229304, "grad_norm": 0.8787519931793213, "learning_rate": 6.582195048494274e-05, "loss": 0.2882, "step": 217700 }, { "epoch": 0.8074501924089301, "grad_norm": 1.18183434009552, "learning_rate": 6.579404790653933e-05, "loss": 0.3016, "step": 217800 }, { "epoch": 0.8078209225248204, "grad_norm": 0.4890477955341339, "learning_rate": 6.576613986327738e-05, "loss": 0.3121, "step": 217900 }, { "epoch": 0.8081916526407106, "grad_norm": 1.5542182922363281, "learning_rate": 6.573822636481322e-05, "loss": 0.3247, "step": 218000 }, { "epoch": 0.8085623827566009, "grad_norm": 0.9828373193740845, "learning_rate": 6.571030742080515e-05, "loss": 0.3013, "step": 218100 }, { "epoch": 0.808933112872491, "grad_norm": 1.9731333255767822, "learning_rate": 6.568238304091334e-05, "loss": 0.3197, "step": 218200 }, { "epoch": 0.8093038429883813, "grad_norm": 0.7355413436889648, "learning_rate": 6.565445323479985e-05, "loss": 0.2854, "step": 218300 }, { "epoch": 0.8096745731042716, "grad_norm": 0.6894125938415527, "learning_rate": 6.562651801212855e-05, "loss": 0.3504, "step": 218400 }, { "epoch": 0.8100453032201618, "grad_norm": 0.41569647192955017, "learning_rate": 6.559857738256528e-05, "loss": 0.3201, "step": 218500 }, { "epoch": 0.810416033336052, "grad_norm": 0.5063262581825256, "learning_rate": 6.55706313557777e-05, "loss": 0.3048, "step": 218600 }, { "epoch": 0.8107867634519422, "grad_norm": 1.3197649717330933, "learning_rate": 6.554267994143535e-05, "loss": 0.3173, "step": 218700 }, { "epoch": 0.8111574935678325, "grad_norm": 0.9051018953323364, "learning_rate": 6.551472314920959e-05, "loss": 0.3222, "step": 218800 }, { "epoch": 0.8115282236837227, "grad_norm": 0.6142867803573608, "learning_rate": 6.548676098877371e-05, "loss": 0.3129, "step": 218900 }, { "epoch": 0.811898953799613, "grad_norm": 0.8190245628356934, "learning_rate": 6.545879346980284e-05, "loss": 0.3038, "step": 219000 }, { "epoch": 0.8122696839155031, "grad_norm": 0.5662413835525513, "learning_rate": 6.543082060197393e-05, "loss": 0.3111, "step": 219100 }, { "epoch": 0.8126404140313934, "grad_norm": 0.947487473487854, "learning_rate": 6.540284239496579e-05, "loss": 0.3033, "step": 219200 }, { "epoch": 0.8130111441472837, "grad_norm": 0.4951702356338501, "learning_rate": 6.53748588584591e-05, "loss": 0.3171, "step": 219300 }, { "epoch": 0.8133818742631739, "grad_norm": 0.8440731763839722, "learning_rate": 6.53468700021364e-05, "loss": 0.3058, "step": 219400 }, { "epoch": 0.8137526043790642, "grad_norm": 0.7638470530509949, "learning_rate": 6.531887583568199e-05, "loss": 0.2806, "step": 219500 }, { "epoch": 0.8141233344949543, "grad_norm": 0.6297881007194519, "learning_rate": 6.52908763687821e-05, "loss": 0.332, "step": 219600 }, { "epoch": 0.8144940646108446, "grad_norm": 0.5919526815414429, "learning_rate": 6.526287161112475e-05, "loss": 0.3442, "step": 219700 }, { "epoch": 0.8148647947267348, "grad_norm": 0.8064566850662231, "learning_rate": 6.523486157239979e-05, "loss": 0.3119, "step": 219800 }, { "epoch": 0.8152355248426251, "grad_norm": 0.7276919484138489, "learning_rate": 6.52068462622989e-05, "loss": 0.3278, "step": 219900 }, { "epoch": 0.8156062549585154, "grad_norm": 0.8975847959518433, "learning_rate": 6.517882569051557e-05, "loss": 0.3071, "step": 220000 }, { "epoch": 0.8159769850744055, "grad_norm": 0.7774543762207031, "learning_rate": 6.515079986674516e-05, "loss": 0.3244, "step": 220100 }, { "epoch": 0.8163477151902958, "grad_norm": 0.6606189012527466, "learning_rate": 6.512276880068481e-05, "loss": 0.3443, "step": 220200 }, { "epoch": 0.816718445306186, "grad_norm": 0.69038987159729, "learning_rate": 6.509473250203345e-05, "loss": 0.3054, "step": 220300 }, { "epoch": 0.8170891754220763, "grad_norm": 0.48976612091064453, "learning_rate": 6.506669098049188e-05, "loss": 0.3073, "step": 220400 }, { "epoch": 0.8174599055379664, "grad_norm": 0.6559173464775085, "learning_rate": 6.503864424576266e-05, "loss": 0.3224, "step": 220500 }, { "epoch": 0.8178306356538567, "grad_norm": 0.3810791075229645, "learning_rate": 6.501059230755018e-05, "loss": 0.2926, "step": 220600 }, { "epoch": 0.8182013657697469, "grad_norm": 0.6550244092941284, "learning_rate": 6.49825351755606e-05, "loss": 0.3022, "step": 220700 }, { "epoch": 0.8185720958856372, "grad_norm": 0.5227406024932861, "learning_rate": 6.495447285950191e-05, "loss": 0.325, "step": 220800 }, { "epoch": 0.8189428260015275, "grad_norm": 0.5928474068641663, "learning_rate": 6.492640536908389e-05, "loss": 0.332, "step": 220900 }, { "epoch": 0.8193135561174176, "grad_norm": 0.8501703143119812, "learning_rate": 6.48983327140181e-05, "loss": 0.348, "step": 221000 }, { "epoch": 0.8196842862333079, "grad_norm": 0.8046396970748901, "learning_rate": 6.487025490401786e-05, "loss": 0.3354, "step": 221100 }, { "epoch": 0.8200550163491981, "grad_norm": 0.3009081482887268, "learning_rate": 6.484217194879835e-05, "loss": 0.3232, "step": 221200 }, { "epoch": 0.8204257464650884, "grad_norm": 0.7277501821517944, "learning_rate": 6.481408385807644e-05, "loss": 0.3259, "step": 221300 }, { "epoch": 0.8207964765809785, "grad_norm": 0.6539939045906067, "learning_rate": 6.478599064157082e-05, "loss": 0.3049, "step": 221400 }, { "epoch": 0.8211672066968688, "grad_norm": 1.216936707496643, "learning_rate": 6.475789230900199e-05, "loss": 0.3262, "step": 221500 }, { "epoch": 0.821537936812759, "grad_norm": 0.45903280377388, "learning_rate": 6.472978887009216e-05, "loss": 0.3179, "step": 221600 }, { "epoch": 0.8219086669286493, "grad_norm": 1.0928237438201904, "learning_rate": 6.470168033456533e-05, "loss": 0.3429, "step": 221700 }, { "epoch": 0.8222793970445396, "grad_norm": 0.7292802333831787, "learning_rate": 6.467356671214724e-05, "loss": 0.3083, "step": 221800 }, { "epoch": 0.8226501271604297, "grad_norm": 1.02112877368927, "learning_rate": 6.464544801256547e-05, "loss": 0.3223, "step": 221900 }, { "epoch": 0.82302085727632, "grad_norm": 1.10211181640625, "learning_rate": 6.461732424554926e-05, "loss": 0.3284, "step": 222000 }, { "epoch": 0.8233915873922102, "grad_norm": 0.5184034705162048, "learning_rate": 6.458919542082964e-05, "loss": 0.2987, "step": 222100 }, { "epoch": 0.8237623175081005, "grad_norm": 0.74056476354599, "learning_rate": 6.456106154813941e-05, "loss": 0.3337, "step": 222200 }, { "epoch": 0.8241330476239906, "grad_norm": 0.7979455590248108, "learning_rate": 6.453292263721309e-05, "loss": 0.3095, "step": 222300 }, { "epoch": 0.8245037777398809, "grad_norm": 0.7813823819160461, "learning_rate": 6.450477869778696e-05, "loss": 0.3353, "step": 222400 }, { "epoch": 0.8248745078557712, "grad_norm": 0.5278380513191223, "learning_rate": 6.447662973959903e-05, "loss": 0.3215, "step": 222500 }, { "epoch": 0.8252452379716614, "grad_norm": 0.5204999446868896, "learning_rate": 6.444847577238906e-05, "loss": 0.308, "step": 222600 }, { "epoch": 0.8256159680875517, "grad_norm": 0.5632990002632141, "learning_rate": 6.442031680589851e-05, "loss": 0.29, "step": 222700 }, { "epoch": 0.8259866982034418, "grad_norm": 0.6297847628593445, "learning_rate": 6.439215284987061e-05, "loss": 0.3414, "step": 222800 }, { "epoch": 0.8263574283193321, "grad_norm": 0.7115164399147034, "learning_rate": 6.436398391405033e-05, "loss": 0.3189, "step": 222900 }, { "epoch": 0.8267281584352223, "grad_norm": 0.774985134601593, "learning_rate": 6.433581000818425e-05, "loss": 0.3006, "step": 223000 }, { "epoch": 0.8270988885511126, "grad_norm": 1.0499509572982788, "learning_rate": 6.430763114202083e-05, "loss": 0.2867, "step": 223100 }, { "epoch": 0.8274696186670027, "grad_norm": 1.0882424116134644, "learning_rate": 6.427944732531012e-05, "loss": 0.3444, "step": 223200 }, { "epoch": 0.827840348782893, "grad_norm": 0.8718858361244202, "learning_rate": 6.425125856780396e-05, "loss": 0.305, "step": 223300 }, { "epoch": 0.8282110788987833, "grad_norm": 0.47110214829444885, "learning_rate": 6.422306487925587e-05, "loss": 0.2799, "step": 223400 }, { "epoch": 0.8285818090146735, "grad_norm": 0.47875508666038513, "learning_rate": 6.419486626942104e-05, "loss": 0.2984, "step": 223500 }, { "epoch": 0.8289525391305638, "grad_norm": 0.6230697631835938, "learning_rate": 6.416666274805645e-05, "loss": 0.3192, "step": 223600 }, { "epoch": 0.8293232692464539, "grad_norm": 0.6795237064361572, "learning_rate": 6.413845432492069e-05, "loss": 0.3192, "step": 223700 }, { "epoch": 0.8296939993623442, "grad_norm": 0.439881831407547, "learning_rate": 6.411024100977412e-05, "loss": 0.2898, "step": 223800 }, { "epoch": 0.8300647294782344, "grad_norm": 1.0361223220825195, "learning_rate": 6.408202281237872e-05, "loss": 0.3218, "step": 223900 }, { "epoch": 0.8304354595941247, "grad_norm": 1.1426048278808594, "learning_rate": 6.405379974249824e-05, "loss": 0.3335, "step": 224000 }, { "epoch": 0.8308061897100149, "grad_norm": 0.7749142646789551, "learning_rate": 6.402557180989803e-05, "loss": 0.3221, "step": 224100 }, { "epoch": 0.8311769198259051, "grad_norm": 0.6499410271644592, "learning_rate": 6.39973390243452e-05, "loss": 0.3127, "step": 224200 }, { "epoch": 0.8315476499417954, "grad_norm": 0.7007929086685181, "learning_rate": 6.396910139560846e-05, "loss": 0.3113, "step": 224300 }, { "epoch": 0.8319183800576856, "grad_norm": 0.27084869146347046, "learning_rate": 6.39408589334583e-05, "loss": 0.2969, "step": 224400 }, { "epoch": 0.8322891101735759, "grad_norm": 0.8482775688171387, "learning_rate": 6.391261164766678e-05, "loss": 0.2928, "step": 224500 }, { "epoch": 0.832659840289466, "grad_norm": 0.16886985301971436, "learning_rate": 6.388435954800769e-05, "loss": 0.3307, "step": 224600 }, { "epoch": 0.8330305704053563, "grad_norm": 0.9027968645095825, "learning_rate": 6.385610264425645e-05, "loss": 0.326, "step": 224700 }, { "epoch": 0.8334013005212465, "grad_norm": 0.545166552066803, "learning_rate": 6.382784094619019e-05, "loss": 0.3167, "step": 224800 }, { "epoch": 0.8337720306371368, "grad_norm": 0.3496576249599457, "learning_rate": 6.379957446358764e-05, "loss": 0.2989, "step": 224900 }, { "epoch": 0.834142760753027, "grad_norm": 0.5797802805900574, "learning_rate": 6.377130320622922e-05, "loss": 0.3182, "step": 225000 }, { "epoch": 0.8345134908689172, "grad_norm": 0.914718508720398, "learning_rate": 6.374302718389702e-05, "loss": 0.3665, "step": 225100 }, { "epoch": 0.8348842209848075, "grad_norm": 1.0576204061508179, "learning_rate": 6.371474640637472e-05, "loss": 0.3136, "step": 225200 }, { "epoch": 0.8352549511006977, "grad_norm": 0.6889545321464539, "learning_rate": 6.36864608834477e-05, "loss": 0.3105, "step": 225300 }, { "epoch": 0.835625681216588, "grad_norm": 0.8284688591957092, "learning_rate": 6.365817062490295e-05, "loss": 0.3023, "step": 225400 }, { "epoch": 0.8359964113324782, "grad_norm": 0.8025351166725159, "learning_rate": 6.362987564052913e-05, "loss": 0.3088, "step": 225500 }, { "epoch": 0.8363671414483684, "grad_norm": 0.9093849062919617, "learning_rate": 6.360157594011651e-05, "loss": 0.3219, "step": 225600 }, { "epoch": 0.8367378715642586, "grad_norm": 0.6028510332107544, "learning_rate": 6.3573271533457e-05, "loss": 0.2872, "step": 225700 }, { "epoch": 0.8371086016801489, "grad_norm": 0.5529880523681641, "learning_rate": 6.35449624303441e-05, "loss": 0.3231, "step": 225800 }, { "epoch": 0.8374793317960392, "grad_norm": 0.858705461025238, "learning_rate": 6.351664864057302e-05, "loss": 0.3244, "step": 225900 }, { "epoch": 0.8378500619119293, "grad_norm": 0.6317663192749023, "learning_rate": 6.348833017394054e-05, "loss": 0.3227, "step": 226000 }, { "epoch": 0.8382207920278196, "grad_norm": 1.416038155555725, "learning_rate": 6.346000704024503e-05, "loss": 0.3297, "step": 226100 }, { "epoch": 0.8385915221437098, "grad_norm": 1.2111091613769531, "learning_rate": 6.34316792492865e-05, "loss": 0.3271, "step": 226200 }, { "epoch": 0.8389622522596001, "grad_norm": 0.7541542053222656, "learning_rate": 6.340334681086664e-05, "loss": 0.2892, "step": 226300 }, { "epoch": 0.8393329823754903, "grad_norm": 0.49355563521385193, "learning_rate": 6.337500973478861e-05, "loss": 0.313, "step": 226400 }, { "epoch": 0.8397037124913805, "grad_norm": 0.5295121073722839, "learning_rate": 6.334666803085729e-05, "loss": 0.3136, "step": 226500 }, { "epoch": 0.8400744426072707, "grad_norm": 0.7824192643165588, "learning_rate": 6.331832170887912e-05, "loss": 0.3243, "step": 226600 }, { "epoch": 0.840445172723161, "grad_norm": 0.30753687024116516, "learning_rate": 6.328997077866214e-05, "loss": 0.2915, "step": 226700 }, { "epoch": 0.8408159028390513, "grad_norm": 0.39556625485420227, "learning_rate": 6.326161525001598e-05, "loss": 0.3361, "step": 226800 }, { "epoch": 0.8411866329549414, "grad_norm": 0.4916647672653198, "learning_rate": 6.323325513275184e-05, "loss": 0.3161, "step": 226900 }, { "epoch": 0.8415573630708317, "grad_norm": 0.45607560873031616, "learning_rate": 6.320489043668256e-05, "loss": 0.3021, "step": 227000 }, { "epoch": 0.8419280931867219, "grad_norm": 1.1411148309707642, "learning_rate": 6.317652117162255e-05, "loss": 0.322, "step": 227100 }, { "epoch": 0.8422988233026122, "grad_norm": 0.7165958881378174, "learning_rate": 6.314814734738775e-05, "loss": 0.3584, "step": 227200 }, { "epoch": 0.8426695534185024, "grad_norm": 0.680641770362854, "learning_rate": 6.311976897379575e-05, "loss": 0.3149, "step": 227300 }, { "epoch": 0.8430402835343926, "grad_norm": 1.7283273935317993, "learning_rate": 6.309138606066565e-05, "loss": 0.3029, "step": 227400 }, { "epoch": 0.8434110136502828, "grad_norm": 0.6269592642784119, "learning_rate": 6.306299861781817e-05, "loss": 0.2784, "step": 227500 }, { "epoch": 0.8437817437661731, "grad_norm": 0.6266169548034668, "learning_rate": 6.303460665507557e-05, "loss": 0.3138, "step": 227600 }, { "epoch": 0.8441524738820634, "grad_norm": 0.6902216076850891, "learning_rate": 6.300621018226168e-05, "loss": 0.3251, "step": 227700 }, { "epoch": 0.8445232039979536, "grad_norm": 0.3155316710472107, "learning_rate": 6.297780920920189e-05, "loss": 0.2946, "step": 227800 }, { "epoch": 0.8448939341138438, "grad_norm": 0.5382066369056702, "learning_rate": 6.294940374572317e-05, "loss": 0.3236, "step": 227900 }, { "epoch": 0.845264664229734, "grad_norm": 1.2426416873931885, "learning_rate": 6.292099380165398e-05, "loss": 0.3193, "step": 228000 }, { "epoch": 0.8456353943456243, "grad_norm": 0.7725491523742676, "learning_rate": 6.289257938682441e-05, "loss": 0.302, "step": 228100 }, { "epoch": 0.8460061244615145, "grad_norm": 0.48354676365852356, "learning_rate": 6.286416051106607e-05, "loss": 0.3045, "step": 228200 }, { "epoch": 0.8463768545774047, "grad_norm": 0.6681873798370361, "learning_rate": 6.283573718421205e-05, "loss": 0.3038, "step": 228300 }, { "epoch": 0.846747584693295, "grad_norm": 0.6992483735084534, "learning_rate": 6.280730941609707e-05, "loss": 0.2939, "step": 228400 }, { "epoch": 0.8471183148091852, "grad_norm": 0.6894723773002625, "learning_rate": 6.277887721655736e-05, "loss": 0.3054, "step": 228500 }, { "epoch": 0.8474890449250755, "grad_norm": 0.7877651453018188, "learning_rate": 6.275044059543064e-05, "loss": 0.3147, "step": 228600 }, { "epoch": 0.8478597750409657, "grad_norm": 0.6067752242088318, "learning_rate": 6.272199956255623e-05, "loss": 0.301, "step": 228700 }, { "epoch": 0.8482305051568559, "grad_norm": 1.009690284729004, "learning_rate": 6.26935541277749e-05, "loss": 0.2875, "step": 228800 }, { "epoch": 0.8486012352727461, "grad_norm": 0.5240969657897949, "learning_rate": 6.266510430092903e-05, "loss": 0.2993, "step": 228900 }, { "epoch": 0.8489719653886364, "grad_norm": 0.5127419829368591, "learning_rate": 6.263665009186244e-05, "loss": 0.3126, "step": 229000 }, { "epoch": 0.8493426955045266, "grad_norm": 0.7835715413093567, "learning_rate": 6.26081915104205e-05, "loss": 0.3246, "step": 229100 }, { "epoch": 0.8497134256204169, "grad_norm": 1.3097344636917114, "learning_rate": 6.257972856645009e-05, "loss": 0.318, "step": 229200 }, { "epoch": 0.8500841557363071, "grad_norm": 0.798805832862854, "learning_rate": 6.255126126979964e-05, "loss": 0.3168, "step": 229300 }, { "epoch": 0.8504548858521973, "grad_norm": 0.7932723164558411, "learning_rate": 6.252278963031899e-05, "loss": 0.2998, "step": 229400 }, { "epoch": 0.8508256159680876, "grad_norm": 0.6930146813392639, "learning_rate": 6.249431365785956e-05, "loss": 0.3082, "step": 229500 }, { "epoch": 0.8511963460839778, "grad_norm": 0.34123495221138, "learning_rate": 6.246583336227427e-05, "loss": 0.3013, "step": 229600 }, { "epoch": 0.851567076199868, "grad_norm": 0.9882388710975647, "learning_rate": 6.243734875341751e-05, "loss": 0.3053, "step": 229700 }, { "epoch": 0.8519378063157582, "grad_norm": 0.8687009811401367, "learning_rate": 6.240885984114515e-05, "loss": 0.3445, "step": 229800 }, { "epoch": 0.8523085364316485, "grad_norm": 0.6177666783332825, "learning_rate": 6.238036663531455e-05, "loss": 0.2846, "step": 229900 }, { "epoch": 0.8526792665475387, "grad_norm": 0.4587991535663605, "learning_rate": 6.235186914578464e-05, "loss": 0.3156, "step": 230000 }, { "epoch": 0.853049996663429, "grad_norm": 1.0118937492370605, "learning_rate": 6.23233673824157e-05, "loss": 0.2803, "step": 230100 }, { "epoch": 0.8534207267793192, "grad_norm": 0.5942692756652832, "learning_rate": 6.229486135506957e-05, "loss": 0.3074, "step": 230200 }, { "epoch": 0.8537914568952094, "grad_norm": 0.6272984147071838, "learning_rate": 6.226635107360954e-05, "loss": 0.3001, "step": 230300 }, { "epoch": 0.8541621870110997, "grad_norm": 0.9739845395088196, "learning_rate": 6.223783654790042e-05, "loss": 0.3192, "step": 230400 }, { "epoch": 0.8545329171269899, "grad_norm": 0.42963576316833496, "learning_rate": 6.220931778780842e-05, "loss": 0.3032, "step": 230500 }, { "epoch": 0.8549036472428801, "grad_norm": 0.5633513927459717, "learning_rate": 6.218079480320123e-05, "loss": 0.3062, "step": 230600 }, { "epoch": 0.8552743773587703, "grad_norm": 1.1789005994796753, "learning_rate": 6.215226760394803e-05, "loss": 0.3387, "step": 230700 }, { "epoch": 0.8556451074746606, "grad_norm": 0.667142391204834, "learning_rate": 6.212373619991947e-05, "loss": 0.3213, "step": 230800 }, { "epoch": 0.8560158375905508, "grad_norm": 0.718724250793457, "learning_rate": 6.20952006009876e-05, "loss": 0.3126, "step": 230900 }, { "epoch": 0.8563865677064411, "grad_norm": 1.3830571174621582, "learning_rate": 6.206666081702595e-05, "loss": 0.3229, "step": 231000 }, { "epoch": 0.8567572978223313, "grad_norm": 1.385448932647705, "learning_rate": 6.20381168579095e-05, "loss": 0.2955, "step": 231100 }, { "epoch": 0.8571280279382215, "grad_norm": 0.973398745059967, "learning_rate": 6.200956873351468e-05, "loss": 0.2944, "step": 231200 }, { "epoch": 0.8574987580541118, "grad_norm": 0.6259701251983643, "learning_rate": 6.198101645371934e-05, "loss": 0.2969, "step": 231300 }, { "epoch": 0.857869488170002, "grad_norm": 0.7257755994796753, "learning_rate": 6.19524600284028e-05, "loss": 0.3045, "step": 231400 }, { "epoch": 0.8582402182858923, "grad_norm": 0.6354368329048157, "learning_rate": 6.192389946744581e-05, "loss": 0.277, "step": 231500 }, { "epoch": 0.8586109484017824, "grad_norm": 0.6392965912818909, "learning_rate": 6.18953347807305e-05, "loss": 0.2927, "step": 231600 }, { "epoch": 0.8589816785176727, "grad_norm": 0.7399715185165405, "learning_rate": 6.186676597814047e-05, "loss": 0.3044, "step": 231700 }, { "epoch": 0.859352408633563, "grad_norm": 0.49541354179382324, "learning_rate": 6.183819306956076e-05, "loss": 0.2891, "step": 231800 }, { "epoch": 0.8597231387494532, "grad_norm": 0.39729467034339905, "learning_rate": 6.180961606487781e-05, "loss": 0.3024, "step": 231900 }, { "epoch": 0.8600938688653434, "grad_norm": 0.5273852944374084, "learning_rate": 6.178103497397947e-05, "loss": 0.3154, "step": 232000 }, { "epoch": 0.8604645989812336, "grad_norm": 1.0113320350646973, "learning_rate": 6.1752449806755e-05, "loss": 0.2947, "step": 232100 }, { "epoch": 0.8608353290971239, "grad_norm": 0.42898333072662354, "learning_rate": 6.17238605730951e-05, "loss": 0.2928, "step": 232200 }, { "epoch": 0.8612060592130141, "grad_norm": 1.0972541570663452, "learning_rate": 6.169526728289185e-05, "loss": 0.3222, "step": 232300 }, { "epoch": 0.8615767893289044, "grad_norm": 0.5575063824653625, "learning_rate": 6.166666994603874e-05, "loss": 0.3216, "step": 232400 }, { "epoch": 0.8619475194447945, "grad_norm": 0.6838763952255249, "learning_rate": 6.163806857243065e-05, "loss": 0.2773, "step": 232500 }, { "epoch": 0.8623182495606848, "grad_norm": 1.2027119398117065, "learning_rate": 6.16094631719639e-05, "loss": 0.3257, "step": 232600 }, { "epoch": 0.8626889796765751, "grad_norm": 0.7807101607322693, "learning_rate": 6.158085375453613e-05, "loss": 0.3316, "step": 232700 }, { "epoch": 0.8630597097924653, "grad_norm": 1.7168744802474976, "learning_rate": 6.155224033004647e-05, "loss": 0.3071, "step": 232800 }, { "epoch": 0.8634304399083556, "grad_norm": 0.6471925973892212, "learning_rate": 6.152362290839532e-05, "loss": 0.3133, "step": 232900 }, { "epoch": 0.8638011700242457, "grad_norm": 0.7518587708473206, "learning_rate": 6.149500149948456e-05, "loss": 0.2942, "step": 233000 }, { "epoch": 0.864171900140136, "grad_norm": 0.5746093392372131, "learning_rate": 6.14663761132174e-05, "loss": 0.3115, "step": 233100 }, { "epoch": 0.8645426302560262, "grad_norm": 0.7616492509841919, "learning_rate": 6.143774675949844e-05, "loss": 0.2813, "step": 233200 }, { "epoch": 0.8649133603719165, "grad_norm": 0.6007071733474731, "learning_rate": 6.140911344823363e-05, "loss": 0.3101, "step": 233300 }, { "epoch": 0.8652840904878066, "grad_norm": 0.9176962375640869, "learning_rate": 6.138047618933034e-05, "loss": 0.3244, "step": 233400 }, { "epoch": 0.8656548206036969, "grad_norm": 0.7776033282279968, "learning_rate": 6.135183499269726e-05, "loss": 0.3004, "step": 233500 }, { "epoch": 0.8660255507195872, "grad_norm": 0.7329208254814148, "learning_rate": 6.132318986824445e-05, "loss": 0.3055, "step": 233600 }, { "epoch": 0.8663962808354774, "grad_norm": 1.0025224685668945, "learning_rate": 6.129454082588336e-05, "loss": 0.297, "step": 233700 }, { "epoch": 0.8667670109513677, "grad_norm": 0.6311556696891785, "learning_rate": 6.126588787552675e-05, "loss": 0.3138, "step": 233800 }, { "epoch": 0.8671377410672578, "grad_norm": 0.5927762389183044, "learning_rate": 6.123723102708877e-05, "loss": 0.3056, "step": 233900 }, { "epoch": 0.8675084711831481, "grad_norm": 1.1481530666351318, "learning_rate": 6.12085702904849e-05, "loss": 0.3094, "step": 234000 }, { "epoch": 0.8678792012990383, "grad_norm": 1.1620229482650757, "learning_rate": 6.117990567563196e-05, "loss": 0.2875, "step": 234100 }, { "epoch": 0.8682499314149286, "grad_norm": 0.5600576996803284, "learning_rate": 6.115123719244813e-05, "loss": 0.3138, "step": 234200 }, { "epoch": 0.8686206615308188, "grad_norm": 0.5350291728973389, "learning_rate": 6.112256485085293e-05, "loss": 0.2955, "step": 234300 }, { "epoch": 0.868991391646709, "grad_norm": 0.7910351157188416, "learning_rate": 6.109388866076717e-05, "loss": 0.3587, "step": 234400 }, { "epoch": 0.8693621217625993, "grad_norm": 0.41735321283340454, "learning_rate": 6.106520863211306e-05, "loss": 0.3116, "step": 234500 }, { "epoch": 0.8697328518784895, "grad_norm": 0.8257538676261902, "learning_rate": 6.103652477481409e-05, "loss": 0.3317, "step": 234600 }, { "epoch": 0.8701035819943798, "grad_norm": 0.5605790019035339, "learning_rate": 6.1007837098795094e-05, "loss": 0.3161, "step": 234700 }, { "epoch": 0.8704743121102699, "grad_norm": 0.5438708066940308, "learning_rate": 6.097914561398222e-05, "loss": 0.2653, "step": 234800 }, { "epoch": 0.8708450422261602, "grad_norm": 1.1793732643127441, "learning_rate": 6.095045033030293e-05, "loss": 0.3217, "step": 234900 }, { "epoch": 0.8712157723420504, "grad_norm": 0.5226495265960693, "learning_rate": 6.0921751257686e-05, "loss": 0.2929, "step": 235000 }, { "epoch": 0.8715865024579407, "grad_norm": 0.809857964515686, "learning_rate": 6.0893048406061556e-05, "loss": 0.3044, "step": 235100 }, { "epoch": 0.871957232573831, "grad_norm": 0.8264075517654419, "learning_rate": 6.086434178536098e-05, "loss": 0.3288, "step": 235200 }, { "epoch": 0.8723279626897211, "grad_norm": 0.6544350385665894, "learning_rate": 6.083563140551696e-05, "loss": 0.2986, "step": 235300 }, { "epoch": 0.8726986928056114, "grad_norm": 0.6402145624160767, "learning_rate": 6.0806917276463526e-05, "loss": 0.2918, "step": 235400 }, { "epoch": 0.8730694229215016, "grad_norm": 0.5681881904602051, "learning_rate": 6.0778199408135986e-05, "loss": 0.3143, "step": 235500 }, { "epoch": 0.8734401530373919, "grad_norm": 0.8028451800346375, "learning_rate": 6.07494778104709e-05, "loss": 0.3281, "step": 235600 }, { "epoch": 0.873810883153282, "grad_norm": 0.8188853859901428, "learning_rate": 6.0720752493406184e-05, "loss": 0.2852, "step": 235700 }, { "epoch": 0.8741816132691723, "grad_norm": 0.6942448019981384, "learning_rate": 6.069202346688102e-05, "loss": 0.2862, "step": 235800 }, { "epoch": 0.8745523433850625, "grad_norm": 0.4696281850337982, "learning_rate": 6.0663290740835844e-05, "loss": 0.2888, "step": 235900 }, { "epoch": 0.8749230735009528, "grad_norm": 0.4981880486011505, "learning_rate": 6.06345543252124e-05, "loss": 0.3249, "step": 236000 }, { "epoch": 0.8752938036168431, "grad_norm": 0.25447162985801697, "learning_rate": 6.0605814229953704e-05, "loss": 0.3121, "step": 236100 }, { "epoch": 0.8756645337327332, "grad_norm": 0.4914533197879791, "learning_rate": 6.057707046500406e-05, "loss": 0.3139, "step": 236200 }, { "epoch": 0.8760352638486235, "grad_norm": 0.20800840854644775, "learning_rate": 6.0548323040309004e-05, "loss": 0.2989, "step": 236300 }, { "epoch": 0.8764059939645137, "grad_norm": 0.78923499584198, "learning_rate": 6.051957196581537e-05, "loss": 0.329, "step": 236400 }, { "epoch": 0.876776724080404, "grad_norm": 0.5414384007453918, "learning_rate": 6.049081725147122e-05, "loss": 0.3115, "step": 236500 }, { "epoch": 0.8771474541962941, "grad_norm": 0.6467755436897278, "learning_rate": 6.046205890722595e-05, "loss": 0.3304, "step": 236600 }, { "epoch": 0.8775181843121844, "grad_norm": 1.3376952409744263, "learning_rate": 6.0433296943030124e-05, "loss": 0.3196, "step": 236700 }, { "epoch": 0.8778889144280746, "grad_norm": 0.35335204005241394, "learning_rate": 6.040453136883559e-05, "loss": 0.3139, "step": 236800 }, { "epoch": 0.8782596445439649, "grad_norm": 0.37025076150894165, "learning_rate": 6.0375762194595474e-05, "loss": 0.3138, "step": 236900 }, { "epoch": 0.8786303746598552, "grad_norm": 0.9959751963615417, "learning_rate": 6.034698943026415e-05, "loss": 0.2852, "step": 237000 }, { "epoch": 0.8790011047757453, "grad_norm": 0.535211980342865, "learning_rate": 6.031821308579715e-05, "loss": 0.3299, "step": 237100 }, { "epoch": 0.8793718348916356, "grad_norm": 1.0584503412246704, "learning_rate": 6.0289433171151324e-05, "loss": 0.325, "step": 237200 }, { "epoch": 0.8797425650075258, "grad_norm": 0.8323512077331543, "learning_rate": 6.0260649696284757e-05, "loss": 0.3059, "step": 237300 }, { "epoch": 0.8801132951234161, "grad_norm": 1.2020180225372314, "learning_rate": 6.023186267115674e-05, "loss": 0.3105, "step": 237400 }, { "epoch": 0.8804840252393062, "grad_norm": 0.6153997778892517, "learning_rate": 6.0203072105727774e-05, "loss": 0.3039, "step": 237500 }, { "epoch": 0.8808547553551965, "grad_norm": 0.49143603444099426, "learning_rate": 6.017427800995964e-05, "loss": 0.3453, "step": 237600 }, { "epoch": 0.8812254854710868, "grad_norm": 0.7657976150512695, "learning_rate": 6.014548039381531e-05, "loss": 0.2882, "step": 237700 }, { "epoch": 0.881596215586977, "grad_norm": 0.44675540924072266, "learning_rate": 6.0116679267258955e-05, "loss": 0.3056, "step": 237800 }, { "epoch": 0.8819669457028673, "grad_norm": 0.781511127948761, "learning_rate": 6.008787464025599e-05, "loss": 0.3119, "step": 237900 }, { "epoch": 0.8823376758187574, "grad_norm": 1.0829347372055054, "learning_rate": 6.0059066522773024e-05, "loss": 0.3011, "step": 238000 }, { "epoch": 0.8827084059346477, "grad_norm": 0.8294603824615479, "learning_rate": 6.003025492477791e-05, "loss": 0.2945, "step": 238100 }, { "epoch": 0.8830791360505379, "grad_norm": 0.6861506700515747, "learning_rate": 6.0001439856239624e-05, "loss": 0.3237, "step": 238200 }, { "epoch": 0.8834498661664282, "grad_norm": 1.2101250886917114, "learning_rate": 5.997262132712844e-05, "loss": 0.291, "step": 238300 }, { "epoch": 0.8838205962823183, "grad_norm": 0.5553228855133057, "learning_rate": 5.9943799347415765e-05, "loss": 0.3012, "step": 238400 }, { "epoch": 0.8841913263982086, "grad_norm": 0.5994368195533752, "learning_rate": 5.991497392707424e-05, "loss": 0.3181, "step": 238500 }, { "epoch": 0.8845620565140989, "grad_norm": 0.3545340597629547, "learning_rate": 5.9886145076077625e-05, "loss": 0.2749, "step": 238600 }, { "epoch": 0.8849327866299891, "grad_norm": 0.5372537970542908, "learning_rate": 5.9857312804400966e-05, "loss": 0.3435, "step": 238700 }, { "epoch": 0.8853035167458794, "grad_norm": 4.904990196228027, "learning_rate": 5.982847712202043e-05, "loss": 0.3305, "step": 238800 }, { "epoch": 0.8856742468617695, "grad_norm": 0.5841454267501831, "learning_rate": 5.97996380389134e-05, "loss": 0.3079, "step": 238900 }, { "epoch": 0.8860449769776598, "grad_norm": 0.5707395672798157, "learning_rate": 5.9770795565058356e-05, "loss": 0.3031, "step": 239000 }, { "epoch": 0.88641570709355, "grad_norm": 1.1908012628555298, "learning_rate": 5.9741949710435066e-05, "loss": 0.297, "step": 239100 }, { "epoch": 0.8867864372094403, "grad_norm": 0.6841947436332703, "learning_rate": 5.97131004850244e-05, "loss": 0.3065, "step": 239200 }, { "epoch": 0.8871571673253305, "grad_norm": 0.6740678548812866, "learning_rate": 5.9684247898808374e-05, "loss": 0.3084, "step": 239300 }, { "epoch": 0.8875278974412207, "grad_norm": 0.7646487355232239, "learning_rate": 5.9655391961770235e-05, "loss": 0.3186, "step": 239400 }, { "epoch": 0.887898627557111, "grad_norm": 1.100368618965149, "learning_rate": 5.9626532683894344e-05, "loss": 0.3122, "step": 239500 }, { "epoch": 0.8882693576730012, "grad_norm": 0.5611932873725891, "learning_rate": 5.959767007516621e-05, "loss": 0.3014, "step": 239600 }, { "epoch": 0.8886400877888915, "grad_norm": 0.7864255905151367, "learning_rate": 5.9568804145572535e-05, "loss": 0.3201, "step": 239700 }, { "epoch": 0.8890108179047816, "grad_norm": 0.8573973178863525, "learning_rate": 5.953993490510112e-05, "loss": 0.2712, "step": 239800 }, { "epoch": 0.8893815480206719, "grad_norm": 0.8668106198310852, "learning_rate": 5.9511062363740955e-05, "loss": 0.3058, "step": 239900 }, { "epoch": 0.8897522781365621, "grad_norm": 1.157335877418518, "learning_rate": 5.9482186531482166e-05, "loss": 0.305, "step": 240000 }, { "epoch": 0.8901230082524524, "grad_norm": 0.47324228286743164, "learning_rate": 5.9453307418315984e-05, "loss": 0.2972, "step": 240100 }, { "epoch": 0.8904937383683427, "grad_norm": 0.4443867802619934, "learning_rate": 5.9424425034234796e-05, "loss": 0.3255, "step": 240200 }, { "epoch": 0.8908644684842328, "grad_norm": 0.6135109066963196, "learning_rate": 5.939553938923215e-05, "loss": 0.3077, "step": 240300 }, { "epoch": 0.8912351986001231, "grad_norm": 0.8718215227127075, "learning_rate": 5.9366650493302676e-05, "loss": 0.341, "step": 240400 }, { "epoch": 0.8916059287160133, "grad_norm": 0.567984938621521, "learning_rate": 5.933775835644214e-05, "loss": 0.2821, "step": 240500 }, { "epoch": 0.8919766588319036, "grad_norm": 0.8806360960006714, "learning_rate": 5.9308862988647464e-05, "loss": 0.2949, "step": 240600 }, { "epoch": 0.8923473889477938, "grad_norm": 0.9088647961616516, "learning_rate": 5.9279964399916635e-05, "loss": 0.3306, "step": 240700 }, { "epoch": 0.892718119063684, "grad_norm": 0.4564671814441681, "learning_rate": 5.9251062600248796e-05, "loss": 0.3081, "step": 240800 }, { "epoch": 0.8930888491795742, "grad_norm": 0.922457754611969, "learning_rate": 5.922215759964417e-05, "loss": 0.2924, "step": 240900 }, { "epoch": 0.8934595792954645, "grad_norm": 0.6092894077301025, "learning_rate": 5.919324940810412e-05, "loss": 0.2926, "step": 241000 }, { "epoch": 0.8938303094113548, "grad_norm": 1.1598548889160156, "learning_rate": 5.9164338035631094e-05, "loss": 0.3128, "step": 241100 }, { "epoch": 0.894201039527245, "grad_norm": 0.8805066347122192, "learning_rate": 5.9135423492228626e-05, "loss": 0.3341, "step": 241200 }, { "epoch": 0.8945717696431352, "grad_norm": 0.8580049872398376, "learning_rate": 5.910650578790138e-05, "loss": 0.3065, "step": 241300 }, { "epoch": 0.8949424997590254, "grad_norm": 0.8677487969398499, "learning_rate": 5.90775849326551e-05, "loss": 0.313, "step": 241400 }, { "epoch": 0.8953132298749157, "grad_norm": 1.0360970497131348, "learning_rate": 5.9048660936496614e-05, "loss": 0.3007, "step": 241500 }, { "epoch": 0.8956839599908059, "grad_norm": 0.9716653227806091, "learning_rate": 5.901973380943383e-05, "loss": 0.3466, "step": 241600 }, { "epoch": 0.8960546901066961, "grad_norm": 0.622301459312439, "learning_rate": 5.899080356147577e-05, "loss": 0.3052, "step": 241700 }, { "epoch": 0.8964254202225863, "grad_norm": 0.7517666816711426, "learning_rate": 5.896187020263249e-05, "loss": 0.3112, "step": 241800 }, { "epoch": 0.8967961503384766, "grad_norm": 1.6015961170196533, "learning_rate": 5.893293374291517e-05, "loss": 0.3217, "step": 241900 }, { "epoch": 0.8971668804543669, "grad_norm": 0.5483868718147278, "learning_rate": 5.890399419233603e-05, "loss": 0.3069, "step": 242000 }, { "epoch": 0.897537610570257, "grad_norm": 1.6562453508377075, "learning_rate": 5.887505156090839e-05, "loss": 0.2929, "step": 242100 }, { "epoch": 0.8979083406861473, "grad_norm": 0.16915281116962433, "learning_rate": 5.884610585864658e-05, "loss": 0.3339, "step": 242200 }, { "epoch": 0.8982790708020375, "grad_norm": 1.0005415678024292, "learning_rate": 5.881715709556608e-05, "loss": 0.2867, "step": 242300 }, { "epoch": 0.8986498009179278, "grad_norm": 0.33858463168144226, "learning_rate": 5.878820528168333e-05, "loss": 0.2963, "step": 242400 }, { "epoch": 0.899020531033818, "grad_norm": 0.4902627468109131, "learning_rate": 5.87592504270159e-05, "loss": 0.295, "step": 242500 }, { "epoch": 0.8993912611497082, "grad_norm": 0.3581118881702423, "learning_rate": 5.8730292541582364e-05, "loss": 0.2889, "step": 242600 }, { "epoch": 0.8997619912655984, "grad_norm": 0.4854952096939087, "learning_rate": 5.87013316354024e-05, "loss": 0.2722, "step": 242700 }, { "epoch": 0.9001327213814887, "grad_norm": 0.36473965644836426, "learning_rate": 5.867236771849667e-05, "loss": 0.2888, "step": 242800 }, { "epoch": 0.900503451497379, "grad_norm": 0.40740278363227844, "learning_rate": 5.8643400800886926e-05, "loss": 0.2838, "step": 242900 }, { "epoch": 0.9008741816132692, "grad_norm": 1.0453367233276367, "learning_rate": 5.8614430892595915e-05, "loss": 0.2708, "step": 243000 }, { "epoch": 0.9012449117291594, "grad_norm": 1.1334424018859863, "learning_rate": 5.858545800364746e-05, "loss": 0.3018, "step": 243100 }, { "epoch": 0.9016156418450496, "grad_norm": 1.1115158796310425, "learning_rate": 5.8556482144066404e-05, "loss": 0.309, "step": 243200 }, { "epoch": 0.9019863719609399, "grad_norm": 0.5301451086997986, "learning_rate": 5.8527503323878585e-05, "loss": 0.3018, "step": 243300 }, { "epoch": 0.9023571020768301, "grad_norm": 0.6482408046722412, "learning_rate": 5.849852155311091e-05, "loss": 0.3271, "step": 243400 }, { "epoch": 0.9027278321927203, "grad_norm": 0.5146307945251465, "learning_rate": 5.8469536841791275e-05, "loss": 0.3386, "step": 243500 }, { "epoch": 0.9030985623086106, "grad_norm": 0.5974158048629761, "learning_rate": 5.8440549199948624e-05, "loss": 0.3068, "step": 243600 }, { "epoch": 0.9034692924245008, "grad_norm": 0.789176344871521, "learning_rate": 5.8411558637612874e-05, "loss": 0.3381, "step": 243700 }, { "epoch": 0.9038400225403911, "grad_norm": 0.5067803263664246, "learning_rate": 5.8382565164815e-05, "loss": 0.2726, "step": 243800 }, { "epoch": 0.9042107526562813, "grad_norm": 1.3453717231750488, "learning_rate": 5.8353568791586945e-05, "loss": 0.3128, "step": 243900 }, { "epoch": 0.9045814827721715, "grad_norm": 1.3996098041534424, "learning_rate": 5.832456952796169e-05, "loss": 0.3022, "step": 244000 }, { "epoch": 0.9049522128880617, "grad_norm": 0.6063675880432129, "learning_rate": 5.829556738397317e-05, "loss": 0.2877, "step": 244100 }, { "epoch": 0.905322943003952, "grad_norm": 0.7194257378578186, "learning_rate": 5.8266562369656364e-05, "loss": 0.3025, "step": 244200 }, { "epoch": 0.9056936731198422, "grad_norm": 0.5596180558204651, "learning_rate": 5.823755449504722e-05, "loss": 0.316, "step": 244300 }, { "epoch": 0.9060644032357325, "grad_norm": 2.2716822624206543, "learning_rate": 5.820854377018265e-05, "loss": 0.3191, "step": 244400 }, { "epoch": 0.9064351333516227, "grad_norm": 0.5278255343437195, "learning_rate": 5.817953020510063e-05, "loss": 0.3358, "step": 244500 }, { "epoch": 0.9068058634675129, "grad_norm": 0.6747839450836182, "learning_rate": 5.815051380984004e-05, "loss": 0.3212, "step": 244600 }, { "epoch": 0.9071765935834032, "grad_norm": 0.4288410544395447, "learning_rate": 5.812149459444078e-05, "loss": 0.3104, "step": 244700 }, { "epoch": 0.9075473236992934, "grad_norm": 1.1491221189498901, "learning_rate": 5.809247256894371e-05, "loss": 0.3314, "step": 244800 }, { "epoch": 0.9079180538151836, "grad_norm": 0.6171171069145203, "learning_rate": 5.806344774339067e-05, "loss": 0.2816, "step": 244900 }, { "epoch": 0.9082887839310738, "grad_norm": 0.6349907517433167, "learning_rate": 5.803442012782447e-05, "loss": 0.3585, "step": 245000 }, { "epoch": 0.9086595140469641, "grad_norm": 1.222754716873169, "learning_rate": 5.8005389732288886e-05, "loss": 0.3273, "step": 245100 }, { "epoch": 0.9090302441628543, "grad_norm": 0.4354826807975769, "learning_rate": 5.797635656682863e-05, "loss": 0.3016, "step": 245200 }, { "epoch": 0.9094009742787446, "grad_norm": 0.311976820230484, "learning_rate": 5.794732064148942e-05, "loss": 0.3077, "step": 245300 }, { "epoch": 0.9097717043946348, "grad_norm": 0.604902446269989, "learning_rate": 5.7918281966317885e-05, "loss": 0.3113, "step": 245400 }, { "epoch": 0.910142434510525, "grad_norm": 0.855357825756073, "learning_rate": 5.7889240551361634e-05, "loss": 0.3258, "step": 245500 }, { "epoch": 0.9105131646264153, "grad_norm": 0.7147024869918823, "learning_rate": 5.78601964066692e-05, "loss": 0.3213, "step": 245600 }, { "epoch": 0.9108838947423055, "grad_norm": 0.5352205634117126, "learning_rate": 5.7831149542290086e-05, "loss": 0.3046, "step": 245700 }, { "epoch": 0.9112546248581957, "grad_norm": 0.8428664803504944, "learning_rate": 5.7802099968274733e-05, "loss": 0.3329, "step": 245800 }, { "epoch": 0.9116253549740859, "grad_norm": 0.8428768515586853, "learning_rate": 5.777304769467449e-05, "loss": 0.3027, "step": 245900 }, { "epoch": 0.9119960850899762, "grad_norm": 0.7721765637397766, "learning_rate": 5.7743992731541665e-05, "loss": 0.3108, "step": 246000 }, { "epoch": 0.9123668152058665, "grad_norm": 0.3975095748901367, "learning_rate": 5.77149350889295e-05, "loss": 0.3245, "step": 246100 }, { "epoch": 0.9127375453217567, "grad_norm": 0.2469290792942047, "learning_rate": 5.7685874776892166e-05, "loss": 0.3296, "step": 246200 }, { "epoch": 0.9131082754376469, "grad_norm": 0.804861307144165, "learning_rate": 5.7656811805484724e-05, "loss": 0.2945, "step": 246300 }, { "epoch": 0.9134790055535371, "grad_norm": 0.2675977945327759, "learning_rate": 5.7627746184763185e-05, "loss": 0.3235, "step": 246400 }, { "epoch": 0.9138497356694274, "grad_norm": 0.4415036737918854, "learning_rate": 5.7598677924784505e-05, "loss": 0.2903, "step": 246500 }, { "epoch": 0.9142204657853176, "grad_norm": 0.7119426727294922, "learning_rate": 5.7569607035606464e-05, "loss": 0.3147, "step": 246600 }, { "epoch": 0.9145911959012079, "grad_norm": 0.6002748608589172, "learning_rate": 5.7540533527287835e-05, "loss": 0.2836, "step": 246700 }, { "epoch": 0.914961926017098, "grad_norm": 0.6955744028091431, "learning_rate": 5.751145740988828e-05, "loss": 0.3406, "step": 246800 }, { "epoch": 0.9153326561329883, "grad_norm": 1.4733420610427856, "learning_rate": 5.7482378693468344e-05, "loss": 0.3245, "step": 246900 }, { "epoch": 0.9157033862488786, "grad_norm": 0.7437728047370911, "learning_rate": 5.745329738808947e-05, "loss": 0.3344, "step": 247000 }, { "epoch": 0.9160741163647688, "grad_norm": 0.8017998933792114, "learning_rate": 5.7424213503814e-05, "loss": 0.3177, "step": 247100 }, { "epoch": 0.916444846480659, "grad_norm": 0.7883774638175964, "learning_rate": 5.739512705070521e-05, "loss": 0.3069, "step": 247200 }, { "epoch": 0.9168155765965492, "grad_norm": 0.8120819926261902, "learning_rate": 5.736603803882721e-05, "loss": 0.3006, "step": 247300 }, { "epoch": 0.9171863067124395, "grad_norm": 0.6216634511947632, "learning_rate": 5.733694647824499e-05, "loss": 0.2951, "step": 247400 }, { "epoch": 0.9175570368283297, "grad_norm": 0.6969355344772339, "learning_rate": 5.7307852379024485e-05, "loss": 0.3047, "step": 247500 }, { "epoch": 0.91792776694422, "grad_norm": 0.36515867710113525, "learning_rate": 5.7278755751232486e-05, "loss": 0.2678, "step": 247600 }, { "epoch": 0.9182984970601101, "grad_norm": 0.6566796898841858, "learning_rate": 5.724965660493659e-05, "loss": 0.2995, "step": 247700 }, { "epoch": 0.9186692271760004, "grad_norm": 1.020294427871704, "learning_rate": 5.722055495020533e-05, "loss": 0.3004, "step": 247800 }, { "epoch": 0.9190399572918907, "grad_norm": 0.5244026780128479, "learning_rate": 5.719145079710813e-05, "loss": 0.3097, "step": 247900 }, { "epoch": 0.9194106874077809, "grad_norm": 0.6567115783691406, "learning_rate": 5.716234415571523e-05, "loss": 0.3436, "step": 248000 }, { "epoch": 0.9197814175236712, "grad_norm": 0.3676627576351166, "learning_rate": 5.713323503609772e-05, "loss": 0.3128, "step": 248100 }, { "epoch": 0.9201521476395613, "grad_norm": 0.6948099732398987, "learning_rate": 5.7104123448327604e-05, "loss": 0.3072, "step": 248200 }, { "epoch": 0.9205228777554516, "grad_norm": 0.47420066595077515, "learning_rate": 5.707500940247771e-05, "loss": 0.3129, "step": 248300 }, { "epoch": 0.9208936078713418, "grad_norm": 0.8142419457435608, "learning_rate": 5.704589290862169e-05, "loss": 0.322, "step": 248400 }, { "epoch": 0.9212643379872321, "grad_norm": 0.8642900586128235, "learning_rate": 5.7016773976834084e-05, "loss": 0.3255, "step": 248500 }, { "epoch": 0.9216350681031222, "grad_norm": 0.6360352039337158, "learning_rate": 5.6987652617190244e-05, "loss": 0.2735, "step": 248600 }, { "epoch": 0.9220057982190125, "grad_norm": 0.5854160189628601, "learning_rate": 5.695852883976641e-05, "loss": 0.3073, "step": 248700 }, { "epoch": 0.9223765283349028, "grad_norm": 1.041788935661316, "learning_rate": 5.692940265463961e-05, "loss": 0.2959, "step": 248800 }, { "epoch": 0.922747258450793, "grad_norm": 0.6428635716438293, "learning_rate": 5.6900274071887703e-05, "loss": 0.326, "step": 248900 }, { "epoch": 0.9231179885666833, "grad_norm": 0.3367425203323364, "learning_rate": 5.687114310158942e-05, "loss": 0.3012, "step": 249000 }, { "epoch": 0.9234887186825734, "grad_norm": 0.5933238863945007, "learning_rate": 5.68420097538243e-05, "loss": 0.2979, "step": 249100 }, { "epoch": 0.9238594487984637, "grad_norm": 0.8194847106933594, "learning_rate": 5.681287403867268e-05, "loss": 0.3087, "step": 249200 }, { "epoch": 0.9242301789143539, "grad_norm": 0.9859776496887207, "learning_rate": 5.6783735966215733e-05, "loss": 0.3215, "step": 249300 }, { "epoch": 0.9246009090302442, "grad_norm": 0.36367067694664, "learning_rate": 5.675459554653547e-05, "loss": 0.3134, "step": 249400 }, { "epoch": 0.9249716391461345, "grad_norm": 0.9642863273620605, "learning_rate": 5.672545278971468e-05, "loss": 0.3159, "step": 249500 }, { "epoch": 0.9253423692620246, "grad_norm": 0.9408420920372009, "learning_rate": 5.6696307705836973e-05, "loss": 0.3025, "step": 249600 }, { "epoch": 0.9257130993779149, "grad_norm": 0.8983610272407532, "learning_rate": 5.6667160304986765e-05, "loss": 0.3048, "step": 249700 }, { "epoch": 0.9260838294938051, "grad_norm": 0.5554744601249695, "learning_rate": 5.663801059724929e-05, "loss": 0.3049, "step": 249800 }, { "epoch": 0.9264545596096954, "grad_norm": 0.3466656506061554, "learning_rate": 5.660885859271056e-05, "loss": 0.3198, "step": 249900 }, { "epoch": 0.9268252897255855, "grad_norm": 0.7136577367782593, "learning_rate": 5.657970430145736e-05, "loss": 0.2991, "step": 250000 }, { "epoch": 0.9271960198414758, "grad_norm": 0.7360095381736755, "learning_rate": 5.655054773357732e-05, "loss": 0.293, "step": 250100 }, { "epoch": 0.927566749957366, "grad_norm": 1.155311107635498, "learning_rate": 5.6521388899158824e-05, "loss": 0.2932, "step": 250200 }, { "epoch": 0.9279374800732563, "grad_norm": 1.3536592721939087, "learning_rate": 5.649222780829105e-05, "loss": 0.3214, "step": 250300 }, { "epoch": 0.9283082101891466, "grad_norm": 0.898088812828064, "learning_rate": 5.646306447106394e-05, "loss": 0.3299, "step": 250400 }, { "epoch": 0.9286789403050367, "grad_norm": 0.3659202754497528, "learning_rate": 5.643389889756825e-05, "loss": 0.2927, "step": 250500 }, { "epoch": 0.929049670420927, "grad_norm": 1.2659732103347778, "learning_rate": 5.640473109789546e-05, "loss": 0.3167, "step": 250600 }, { "epoch": 0.9294204005368172, "grad_norm": 0.5965259075164795, "learning_rate": 5.637556108213787e-05, "loss": 0.314, "step": 250700 }, { "epoch": 0.9297911306527075, "grad_norm": 1.280868411064148, "learning_rate": 5.63463888603885e-05, "loss": 0.2986, "step": 250800 }, { "epoch": 0.9301618607685976, "grad_norm": 0.7690989971160889, "learning_rate": 5.6317214442741176e-05, "loss": 0.3358, "step": 250900 }, { "epoch": 0.9305325908844879, "grad_norm": 0.5422789454460144, "learning_rate": 5.628803783929045e-05, "loss": 0.3252, "step": 251000 }, { "epoch": 0.9309033210003781, "grad_norm": 0.4484233260154724, "learning_rate": 5.625885906013166e-05, "loss": 0.2865, "step": 251100 }, { "epoch": 0.9312740511162684, "grad_norm": 0.3531385660171509, "learning_rate": 5.622967811536086e-05, "loss": 0.3281, "step": 251200 }, { "epoch": 0.9316447812321587, "grad_norm": 0.539555549621582, "learning_rate": 5.620049501507491e-05, "loss": 0.3271, "step": 251300 }, { "epoch": 0.9320155113480488, "grad_norm": 2.744760274887085, "learning_rate": 5.617130976937135e-05, "loss": 0.3391, "step": 251400 }, { "epoch": 0.9323862414639391, "grad_norm": 0.548507809638977, "learning_rate": 5.614212238834849e-05, "loss": 0.2929, "step": 251500 }, { "epoch": 0.9327569715798293, "grad_norm": 1.0977928638458252, "learning_rate": 5.611293288210542e-05, "loss": 0.3341, "step": 251600 }, { "epoch": 0.9331277016957196, "grad_norm": 0.39370763301849365, "learning_rate": 5.608374126074187e-05, "loss": 0.3009, "step": 251700 }, { "epoch": 0.9334984318116097, "grad_norm": 0.8710266351699829, "learning_rate": 5.605454753435842e-05, "loss": 0.3087, "step": 251800 }, { "epoch": 0.9338691619275, "grad_norm": 0.5283187627792358, "learning_rate": 5.602535171305626e-05, "loss": 0.3146, "step": 251900 }, { "epoch": 0.9342398920433903, "grad_norm": 1.12079656124115, "learning_rate": 5.599615380693741e-05, "loss": 0.2958, "step": 252000 }, { "epoch": 0.9346106221592805, "grad_norm": 0.34388697147369385, "learning_rate": 5.596695382610453e-05, "loss": 0.3359, "step": 252100 }, { "epoch": 0.9349813522751708, "grad_norm": 0.6716346740722656, "learning_rate": 5.593775178066106e-05, "loss": 0.3096, "step": 252200 }, { "epoch": 0.9353520823910609, "grad_norm": 0.7841528654098511, "learning_rate": 5.59085476807111e-05, "loss": 0.3171, "step": 252300 }, { "epoch": 0.9357228125069512, "grad_norm": 0.893317699432373, "learning_rate": 5.587934153635951e-05, "loss": 0.2832, "step": 252400 }, { "epoch": 0.9360935426228414, "grad_norm": 0.2880883812904358, "learning_rate": 5.585013335771181e-05, "loss": 0.2957, "step": 252500 }, { "epoch": 0.9364642727387317, "grad_norm": 0.7132927179336548, "learning_rate": 5.582092315487426e-05, "loss": 0.3079, "step": 252600 }, { "epoch": 0.9368350028546218, "grad_norm": 0.5565809607505798, "learning_rate": 5.579171093795382e-05, "loss": 0.2957, "step": 252700 }, { "epoch": 0.9372057329705121, "grad_norm": 0.5633884072303772, "learning_rate": 5.576249671705811e-05, "loss": 0.2892, "step": 252800 }, { "epoch": 0.9375764630864024, "grad_norm": 0.6940320134162903, "learning_rate": 5.5733280502295494e-05, "loss": 0.3286, "step": 252900 }, { "epoch": 0.9379471932022926, "grad_norm": 0.9075203537940979, "learning_rate": 5.570406230377499e-05, "loss": 0.3297, "step": 253000 }, { "epoch": 0.9383179233181829, "grad_norm": 1.2508563995361328, "learning_rate": 5.567484213160631e-05, "loss": 0.3374, "step": 253100 }, { "epoch": 0.938688653434073, "grad_norm": 0.5174074172973633, "learning_rate": 5.564561999589984e-05, "loss": 0.2978, "step": 253200 }, { "epoch": 0.9390593835499633, "grad_norm": 0.7017688751220703, "learning_rate": 5.561639590676669e-05, "loss": 0.3048, "step": 253300 }, { "epoch": 0.9394301136658535, "grad_norm": 1.015933871269226, "learning_rate": 5.558716987431858e-05, "loss": 0.3125, "step": 253400 }, { "epoch": 0.9398008437817438, "grad_norm": 0.34008297324180603, "learning_rate": 5.555794190866797e-05, "loss": 0.3269, "step": 253500 }, { "epoch": 0.940171573897634, "grad_norm": 1.1185014247894287, "learning_rate": 5.552871201992791e-05, "loss": 0.3318, "step": 253600 }, { "epoch": 0.9405423040135242, "grad_norm": 1.0694942474365234, "learning_rate": 5.5499480218212206e-05, "loss": 0.2777, "step": 253700 }, { "epoch": 0.9409130341294145, "grad_norm": 0.2972624599933624, "learning_rate": 5.547024651363524e-05, "loss": 0.3269, "step": 253800 }, { "epoch": 0.9412837642453047, "grad_norm": 0.8011226058006287, "learning_rate": 5.544101091631213e-05, "loss": 0.3492, "step": 253900 }, { "epoch": 0.941654494361195, "grad_norm": 0.8461817502975464, "learning_rate": 5.5411773436358574e-05, "loss": 0.2942, "step": 254000 }, { "epoch": 0.9420252244770851, "grad_norm": 0.2897057831287384, "learning_rate": 5.538253408389099e-05, "loss": 0.2956, "step": 254100 }, { "epoch": 0.9423959545929754, "grad_norm": 0.6042524576187134, "learning_rate": 5.53532928690264e-05, "loss": 0.3012, "step": 254200 }, { "epoch": 0.9427666847088656, "grad_norm": 1.0565314292907715, "learning_rate": 5.532404980188246e-05, "loss": 0.3226, "step": 254300 }, { "epoch": 0.9431374148247559, "grad_norm": 0.5520265102386475, "learning_rate": 5.5294804892577525e-05, "loss": 0.2931, "step": 254400 }, { "epoch": 0.943508144940646, "grad_norm": 0.926365077495575, "learning_rate": 5.5265558151230536e-05, "loss": 0.2809, "step": 254500 }, { "epoch": 0.9438788750565363, "grad_norm": 0.4076917767524719, "learning_rate": 5.52363095879611e-05, "loss": 0.322, "step": 254600 }, { "epoch": 0.9442496051724266, "grad_norm": 0.4944913983345032, "learning_rate": 5.52070592128894e-05, "loss": 0.3093, "step": 254700 }, { "epoch": 0.9446203352883168, "grad_norm": 0.4577043950557709, "learning_rate": 5.517780703613632e-05, "loss": 0.2841, "step": 254800 }, { "epoch": 0.9449910654042071, "grad_norm": 1.1274546384811401, "learning_rate": 5.5148553067823325e-05, "loss": 0.3392, "step": 254900 }, { "epoch": 0.9453617955200972, "grad_norm": 0.8338509202003479, "learning_rate": 5.5119297318072504e-05, "loss": 0.3265, "step": 255000 }, { "epoch": 0.9457325256359875, "grad_norm": 0.9318355321884155, "learning_rate": 5.5090039797006554e-05, "loss": 0.3318, "step": 255100 }, { "epoch": 0.9461032557518777, "grad_norm": 0.8160555958747864, "learning_rate": 5.506078051474881e-05, "loss": 0.3108, "step": 255200 }, { "epoch": 0.946473985867768, "grad_norm": 0.6923002004623413, "learning_rate": 5.503151948142321e-05, "loss": 0.3014, "step": 255300 }, { "epoch": 0.9468447159836583, "grad_norm": 1.0753068923950195, "learning_rate": 5.500225670715425e-05, "loss": 0.2882, "step": 255400 }, { "epoch": 0.9472154460995484, "grad_norm": 1.3178011178970337, "learning_rate": 5.497299220206711e-05, "loss": 0.3306, "step": 255500 }, { "epoch": 0.9475861762154387, "grad_norm": 0.3396689295768738, "learning_rate": 5.494372597628752e-05, "loss": 0.3126, "step": 255600 }, { "epoch": 0.9479569063313289, "grad_norm": 0.6971357464790344, "learning_rate": 5.491445803994181e-05, "loss": 0.3079, "step": 255700 }, { "epoch": 0.9483276364472192, "grad_norm": 0.7587060332298279, "learning_rate": 5.4885188403156886e-05, "loss": 0.2865, "step": 255800 }, { "epoch": 0.9486983665631094, "grad_norm": 0.5014783143997192, "learning_rate": 5.485591707606027e-05, "loss": 0.3045, "step": 255900 }, { "epoch": 0.9490690966789996, "grad_norm": 0.6951929330825806, "learning_rate": 5.4826644068780087e-05, "loss": 0.3055, "step": 256000 }, { "epoch": 0.9494398267948898, "grad_norm": 0.6298203468322754, "learning_rate": 5.479736939144499e-05, "loss": 0.3051, "step": 256100 }, { "epoch": 0.9498105569107801, "grad_norm": 0.524243175983429, "learning_rate": 5.476809305418422e-05, "loss": 0.278, "step": 256200 }, { "epoch": 0.9501812870266704, "grad_norm": 0.7436094880104065, "learning_rate": 5.4738815067127645e-05, "loss": 0.292, "step": 256300 }, { "epoch": 0.9505520171425605, "grad_norm": 0.44147321581840515, "learning_rate": 5.470953544040567e-05, "loss": 0.312, "step": 256400 }, { "epoch": 0.9509227472584508, "grad_norm": 1.0785224437713623, "learning_rate": 5.468025418414922e-05, "loss": 0.3371, "step": 256500 }, { "epoch": 0.951293477374341, "grad_norm": 1.2491481304168701, "learning_rate": 5.465097130848986e-05, "loss": 0.3361, "step": 256600 }, { "epoch": 0.9516642074902313, "grad_norm": 0.801481306552887, "learning_rate": 5.462168682355968e-05, "loss": 0.3099, "step": 256700 }, { "epoch": 0.9520349376061215, "grad_norm": 1.4741556644439697, "learning_rate": 5.459240073949134e-05, "loss": 0.312, "step": 256800 }, { "epoch": 0.9524056677220117, "grad_norm": 0.9866992831230164, "learning_rate": 5.456311306641802e-05, "loss": 0.3378, "step": 256900 }, { "epoch": 0.9527763978379019, "grad_norm": 0.8414381742477417, "learning_rate": 5.453382381447348e-05, "loss": 0.2868, "step": 257000 }, { "epoch": 0.9531471279537922, "grad_norm": 0.7459807395935059, "learning_rate": 5.4504532993792045e-05, "loss": 0.2856, "step": 257100 }, { "epoch": 0.9535178580696825, "grad_norm": 0.4009762108325958, "learning_rate": 5.447524061450854e-05, "loss": 0.2766, "step": 257200 }, { "epoch": 0.9538885881855727, "grad_norm": 0.7319802045822144, "learning_rate": 5.4445946686758334e-05, "loss": 0.3168, "step": 257300 }, { "epoch": 0.9542593183014629, "grad_norm": 0.712274432182312, "learning_rate": 5.441665122067735e-05, "loss": 0.3233, "step": 257400 }, { "epoch": 0.9546300484173531, "grad_norm": 0.4087705612182617, "learning_rate": 5.4387354226402076e-05, "loss": 0.3117, "step": 257500 }, { "epoch": 0.9550007785332434, "grad_norm": 0.8949238061904907, "learning_rate": 5.435805571406944e-05, "loss": 0.3044, "step": 257600 }, { "epoch": 0.9553715086491336, "grad_norm": 0.7281532287597656, "learning_rate": 5.432875569381697e-05, "loss": 0.3432, "step": 257700 }, { "epoch": 0.9557422387650238, "grad_norm": 1.480070948600769, "learning_rate": 5.4299454175782686e-05, "loss": 0.3336, "step": 257800 }, { "epoch": 0.9561129688809141, "grad_norm": 0.7527721524238586, "learning_rate": 5.427015117010514e-05, "loss": 0.3066, "step": 257900 }, { "epoch": 0.9564836989968043, "grad_norm": 0.7607856392860413, "learning_rate": 5.4240846686923374e-05, "loss": 0.3176, "step": 258000 }, { "epoch": 0.9568544291126946, "grad_norm": 0.8956596851348877, "learning_rate": 5.421154073637697e-05, "loss": 0.3291, "step": 258100 }, { "epoch": 0.9572251592285848, "grad_norm": 0.9085289835929871, "learning_rate": 5.418223332860603e-05, "loss": 0.3121, "step": 258200 }, { "epoch": 0.957595889344475, "grad_norm": 0.8868082165718079, "learning_rate": 5.415292447375109e-05, "loss": 0.3085, "step": 258300 }, { "epoch": 0.9579666194603652, "grad_norm": 0.6445107460021973, "learning_rate": 5.412361418195325e-05, "loss": 0.3001, "step": 258400 }, { "epoch": 0.9583373495762555, "grad_norm": 0.7448208928108215, "learning_rate": 5.409430246335411e-05, "loss": 0.2926, "step": 258500 }, { "epoch": 0.9587080796921457, "grad_norm": 0.5058093070983887, "learning_rate": 5.406498932809571e-05, "loss": 0.3159, "step": 258600 }, { "epoch": 0.959078809808036, "grad_norm": 0.7616602182388306, "learning_rate": 5.403567478632066e-05, "loss": 0.3255, "step": 258700 }, { "epoch": 0.9594495399239262, "grad_norm": 0.5414878726005554, "learning_rate": 5.400635884817196e-05, "loss": 0.3021, "step": 258800 }, { "epoch": 0.9598202700398164, "grad_norm": 0.7485790848731995, "learning_rate": 5.397704152379318e-05, "loss": 0.2981, "step": 258900 }, { "epoch": 0.9601910001557067, "grad_norm": 0.5353637337684631, "learning_rate": 5.394772282332831e-05, "loss": 0.2957, "step": 259000 }, { "epoch": 0.9605617302715969, "grad_norm": 0.3084680438041687, "learning_rate": 5.391840275692185e-05, "loss": 0.2955, "step": 259100 }, { "epoch": 0.9609324603874871, "grad_norm": 0.989575207233429, "learning_rate": 5.388908133471875e-05, "loss": 0.293, "step": 259200 }, { "epoch": 0.9613031905033773, "grad_norm": 1.0693316459655762, "learning_rate": 5.385975856686447e-05, "loss": 0.3267, "step": 259300 }, { "epoch": 0.9616739206192676, "grad_norm": 0.9386118054389954, "learning_rate": 5.3830434463504884e-05, "loss": 0.2906, "step": 259400 }, { "epoch": 0.9620446507351578, "grad_norm": 0.7738717794418335, "learning_rate": 5.380110903478634e-05, "loss": 0.2997, "step": 259500 }, { "epoch": 0.962415380851048, "grad_norm": 0.39804327487945557, "learning_rate": 5.3771782290855674e-05, "loss": 0.2756, "step": 259600 }, { "epoch": 0.9627861109669383, "grad_norm": 1.6626147031784058, "learning_rate": 5.374245424186017e-05, "loss": 0.3039, "step": 259700 }, { "epoch": 0.9631568410828285, "grad_norm": 0.7547833323478699, "learning_rate": 5.371312489794754e-05, "loss": 0.3346, "step": 259800 }, { "epoch": 0.9635275711987188, "grad_norm": 0.6924284100532532, "learning_rate": 5.368379426926594e-05, "loss": 0.3097, "step": 259900 }, { "epoch": 0.963898301314609, "grad_norm": 0.7743933796882629, "learning_rate": 5.3654462365963995e-05, "loss": 0.311, "step": 260000 }, { "epoch": 0.9642690314304992, "grad_norm": 1.0572869777679443, "learning_rate": 5.362512919819079e-05, "loss": 0.2908, "step": 260100 }, { "epoch": 0.9646397615463894, "grad_norm": 0.8637348413467407, "learning_rate": 5.3595794776095784e-05, "loss": 0.3067, "step": 260200 }, { "epoch": 0.9650104916622797, "grad_norm": 1.2545220851898193, "learning_rate": 5.3566459109828925e-05, "loss": 0.3143, "step": 260300 }, { "epoch": 0.9653812217781699, "grad_norm": 0.30668458342552185, "learning_rate": 5.353712220954057e-05, "loss": 0.2746, "step": 260400 }, { "epoch": 0.9657519518940602, "grad_norm": 1.0704774856567383, "learning_rate": 5.350778408538148e-05, "loss": 0.3148, "step": 260500 }, { "epoch": 0.9661226820099504, "grad_norm": 0.9562327861785889, "learning_rate": 5.347844474750292e-05, "loss": 0.304, "step": 260600 }, { "epoch": 0.9664934121258406, "grad_norm": 0.506866455078125, "learning_rate": 5.344910420605648e-05, "loss": 0.299, "step": 260700 }, { "epoch": 0.9668641422417309, "grad_norm": 0.5319231748580933, "learning_rate": 5.341976247119421e-05, "loss": 0.339, "step": 260800 }, { "epoch": 0.9672348723576211, "grad_norm": 1.0069701671600342, "learning_rate": 5.339041955306857e-05, "loss": 0.3038, "step": 260900 }, { "epoch": 0.9676056024735114, "grad_norm": 1.00048828125, "learning_rate": 5.336107546183244e-05, "loss": 0.3183, "step": 261000 }, { "epoch": 0.9679763325894015, "grad_norm": 0.7481801509857178, "learning_rate": 5.333173020763909e-05, "loss": 0.3229, "step": 261100 }, { "epoch": 0.9683470627052918, "grad_norm": 0.8170436024665833, "learning_rate": 5.33023838006422e-05, "loss": 0.2875, "step": 261200 }, { "epoch": 0.9687177928211821, "grad_norm": 0.6789833307266235, "learning_rate": 5.327303625099585e-05, "loss": 0.319, "step": 261300 }, { "epoch": 0.9690885229370723, "grad_norm": 0.5428205728530884, "learning_rate": 5.3243687568854484e-05, "loss": 0.3195, "step": 261400 }, { "epoch": 0.9694592530529625, "grad_norm": 0.7265328168869019, "learning_rate": 5.321433776437299e-05, "loss": 0.3415, "step": 261500 }, { "epoch": 0.9698299831688527, "grad_norm": 0.7714012265205383, "learning_rate": 5.318498684770661e-05, "loss": 0.3017, "step": 261600 }, { "epoch": 0.970200713284743, "grad_norm": 0.30842480063438416, "learning_rate": 5.315563482901099e-05, "loss": 0.2842, "step": 261700 }, { "epoch": 0.9705714434006332, "grad_norm": 0.6948258280754089, "learning_rate": 5.312628171844214e-05, "loss": 0.2992, "step": 261800 }, { "epoch": 0.9709421735165235, "grad_norm": 0.6908165812492371, "learning_rate": 5.309692752615647e-05, "loss": 0.3088, "step": 261900 }, { "epoch": 0.9713129036324136, "grad_norm": 1.0788521766662598, "learning_rate": 5.3067572262310725e-05, "loss": 0.2952, "step": 262000 }, { "epoch": 0.9716836337483039, "grad_norm": 1.2768758535385132, "learning_rate": 5.3038215937062064e-05, "loss": 0.3146, "step": 262100 }, { "epoch": 0.9720543638641942, "grad_norm": 1.394485354423523, "learning_rate": 5.300885856056799e-05, "loss": 0.2936, "step": 262200 }, { "epoch": 0.9724250939800844, "grad_norm": 0.7913326621055603, "learning_rate": 5.2979500142986375e-05, "loss": 0.2939, "step": 262300 }, { "epoch": 0.9727958240959746, "grad_norm": 0.7045042514801025, "learning_rate": 5.295014069447544e-05, "loss": 0.288, "step": 262400 }, { "epoch": 0.9731665542118648, "grad_norm": 0.49015629291534424, "learning_rate": 5.292078022519379e-05, "loss": 0.28, "step": 262500 }, { "epoch": 0.9735372843277551, "grad_norm": 0.40397319197654724, "learning_rate": 5.289141874530037e-05, "loss": 0.3144, "step": 262600 }, { "epoch": 0.9739080144436453, "grad_norm": 0.6997429132461548, "learning_rate": 5.286205626495444e-05, "loss": 0.2961, "step": 262700 }, { "epoch": 0.9742787445595356, "grad_norm": 0.9345607161521912, "learning_rate": 5.283269279431566e-05, "loss": 0.3005, "step": 262800 }, { "epoch": 0.9746494746754257, "grad_norm": 0.6519402265548706, "learning_rate": 5.2803328343544034e-05, "loss": 0.3033, "step": 262900 }, { "epoch": 0.975020204791316, "grad_norm": 0.7430440187454224, "learning_rate": 5.2773962922799844e-05, "loss": 0.3088, "step": 263000 }, { "epoch": 0.9753909349072063, "grad_norm": 0.9847736954689026, "learning_rate": 5.274459654224375e-05, "loss": 0.2968, "step": 263100 }, { "epoch": 0.9757616650230965, "grad_norm": 0.17451761662960052, "learning_rate": 5.271522921203677e-05, "loss": 0.3169, "step": 263200 }, { "epoch": 0.9761323951389868, "grad_norm": 0.39320626854896545, "learning_rate": 5.268586094234017e-05, "loss": 0.2889, "step": 263300 }, { "epoch": 0.9765031252548769, "grad_norm": 1.0548518896102905, "learning_rate": 5.265649174331563e-05, "loss": 0.299, "step": 263400 }, { "epoch": 0.9768738553707672, "grad_norm": 0.7994017004966736, "learning_rate": 5.262712162512509e-05, "loss": 0.293, "step": 263500 }, { "epoch": 0.9772445854866574, "grad_norm": 1.2883504629135132, "learning_rate": 5.2597750597930854e-05, "loss": 0.3108, "step": 263600 }, { "epoch": 0.9776153156025477, "grad_norm": 0.8702749013900757, "learning_rate": 5.2568378671895505e-05, "loss": 0.3322, "step": 263700 }, { "epoch": 0.977986045718438, "grad_norm": 0.6138551831245422, "learning_rate": 5.253900585718192e-05, "loss": 0.3213, "step": 263800 }, { "epoch": 0.9783567758343281, "grad_norm": 0.6872825622558594, "learning_rate": 5.250963216395335e-05, "loss": 0.3303, "step": 263900 }, { "epoch": 0.9787275059502184, "grad_norm": 0.4365454316139221, "learning_rate": 5.24802576023733e-05, "loss": 0.3417, "step": 264000 }, { "epoch": 0.9790982360661086, "grad_norm": 0.4978713393211365, "learning_rate": 5.245088218260561e-05, "loss": 0.3072, "step": 264100 }, { "epoch": 0.9794689661819989, "grad_norm": 0.5979584455490112, "learning_rate": 5.242150591481435e-05, "loss": 0.3135, "step": 264200 }, { "epoch": 0.979839696297889, "grad_norm": 1.3805888891220093, "learning_rate": 5.2392128809163953e-05, "loss": 0.295, "step": 264300 }, { "epoch": 0.9802104264137793, "grad_norm": 0.7209807634353638, "learning_rate": 5.2362750875819134e-05, "loss": 0.3135, "step": 264400 }, { "epoch": 0.9805811565296695, "grad_norm": 1.099200963973999, "learning_rate": 5.233337212494486e-05, "loss": 0.3232, "step": 264500 }, { "epoch": 0.9809518866455598, "grad_norm": 0.32959747314453125, "learning_rate": 5.2303992566706374e-05, "loss": 0.2981, "step": 264600 }, { "epoch": 0.98132261676145, "grad_norm": 0.8088615536689758, "learning_rate": 5.227461221126925e-05, "loss": 0.3185, "step": 264700 }, { "epoch": 0.9816933468773402, "grad_norm": 0.6739004850387573, "learning_rate": 5.224523106879933e-05, "loss": 0.2817, "step": 264800 }, { "epoch": 0.9820640769932305, "grad_norm": 0.5674151182174683, "learning_rate": 5.221584914946266e-05, "loss": 0.3006, "step": 264900 }, { "epoch": 0.9824348071091207, "grad_norm": 0.44842374324798584, "learning_rate": 5.218646646342563e-05, "loss": 0.2998, "step": 265000 }, { "epoch": 0.982805537225011, "grad_norm": 0.78691565990448, "learning_rate": 5.215708302085488e-05, "loss": 0.3176, "step": 265100 }, { "epoch": 0.9831762673409011, "grad_norm": 1.5515974760055542, "learning_rate": 5.2127698831917306e-05, "loss": 0.3065, "step": 265200 }, { "epoch": 0.9835469974567914, "grad_norm": 0.6948597431182861, "learning_rate": 5.2098313906780006e-05, "loss": 0.2987, "step": 265300 }, { "epoch": 0.9839177275726816, "grad_norm": 0.6863521933555603, "learning_rate": 5.206892825561043e-05, "loss": 0.2826, "step": 265400 }, { "epoch": 0.9842884576885719, "grad_norm": 0.5995004177093506, "learning_rate": 5.2039541888576214e-05, "loss": 0.2764, "step": 265500 }, { "epoch": 0.9846591878044622, "grad_norm": 0.7336591482162476, "learning_rate": 5.201015481584528e-05, "loss": 0.3028, "step": 265600 }, { "epoch": 0.9850299179203523, "grad_norm": 0.7124290466308594, "learning_rate": 5.1980767047585754e-05, "loss": 0.2579, "step": 265700 }, { "epoch": 0.9854006480362426, "grad_norm": 0.9301703572273254, "learning_rate": 5.195137859396603e-05, "loss": 0.307, "step": 265800 }, { "epoch": 0.9857713781521328, "grad_norm": 0.8516147136688232, "learning_rate": 5.192198946515473e-05, "loss": 0.293, "step": 265900 }, { "epoch": 0.9861421082680231, "grad_norm": 0.9922652840614319, "learning_rate": 5.189259967132073e-05, "loss": 0.2874, "step": 266000 }, { "epoch": 0.9865128383839132, "grad_norm": 0.5369066596031189, "learning_rate": 5.186320922263308e-05, "loss": 0.3188, "step": 266100 }, { "epoch": 0.9868835684998035, "grad_norm": 0.6715562343597412, "learning_rate": 5.1833818129261125e-05, "loss": 0.2945, "step": 266200 }, { "epoch": 0.9872542986156937, "grad_norm": 0.7435858845710754, "learning_rate": 5.1804426401374415e-05, "loss": 0.2934, "step": 266300 }, { "epoch": 0.987625028731584, "grad_norm": 0.8587777018547058, "learning_rate": 5.177503404914268e-05, "loss": 0.3065, "step": 266400 }, { "epoch": 0.9879957588474743, "grad_norm": 0.8074586987495422, "learning_rate": 5.174564108273589e-05, "loss": 0.3192, "step": 266500 }, { "epoch": 0.9883664889633644, "grad_norm": 0.8968228697776794, "learning_rate": 5.171624751232426e-05, "loss": 0.299, "step": 266600 }, { "epoch": 0.9887372190792547, "grad_norm": 0.3808428645133972, "learning_rate": 5.168685334807816e-05, "loss": 0.2938, "step": 266700 }, { "epoch": 0.9891079491951449, "grad_norm": 0.41922667622566223, "learning_rate": 5.165745860016819e-05, "loss": 0.3161, "step": 266800 }, { "epoch": 0.9894786793110352, "grad_norm": 0.2746697962284088, "learning_rate": 5.162806327876517e-05, "loss": 0.3331, "step": 266900 }, { "epoch": 0.9898494094269253, "grad_norm": 1.6861145496368408, "learning_rate": 5.159866739404009e-05, "loss": 0.32, "step": 267000 }, { "epoch": 0.9902201395428156, "grad_norm": 0.9158071279525757, "learning_rate": 5.156927095616416e-05, "loss": 0.3213, "step": 267100 }, { "epoch": 0.9905908696587059, "grad_norm": 0.5734014511108398, "learning_rate": 5.153987397530874e-05, "loss": 0.2957, "step": 267200 }, { "epoch": 0.9909615997745961, "grad_norm": 0.5565531849861145, "learning_rate": 5.1510476461645406e-05, "loss": 0.3145, "step": 267300 }, { "epoch": 0.9913323298904864, "grad_norm": 0.9104280471801758, "learning_rate": 5.1481078425345974e-05, "loss": 0.308, "step": 267400 }, { "epoch": 0.9917030600063765, "grad_norm": 0.8332270383834839, "learning_rate": 5.1451679876582305e-05, "loss": 0.3137, "step": 267500 }, { "epoch": 0.9920737901222668, "grad_norm": 0.3147716522216797, "learning_rate": 5.142228082552656e-05, "loss": 0.2876, "step": 267600 }, { "epoch": 0.992444520238157, "grad_norm": 1.9208390712738037, "learning_rate": 5.139288128235102e-05, "loss": 0.3251, "step": 267700 }, { "epoch": 0.9928152503540473, "grad_norm": 0.65997314453125, "learning_rate": 5.136348125722818e-05, "loss": 0.3274, "step": 267800 }, { "epoch": 0.9931859804699374, "grad_norm": 0.48048269748687744, "learning_rate": 5.133408076033059e-05, "loss": 0.2834, "step": 267900 }, { "epoch": 0.9935567105858277, "grad_norm": 0.2911989092826843, "learning_rate": 5.1304679801831115e-05, "loss": 0.283, "step": 268000 }, { "epoch": 0.993927440701718, "grad_norm": 0.5457310676574707, "learning_rate": 5.127527839190269e-05, "loss": 0.3035, "step": 268100 }, { "epoch": 0.9942981708176082, "grad_norm": 0.6424395442008972, "learning_rate": 5.124587654071841e-05, "loss": 0.321, "step": 268200 }, { "epoch": 0.9946689009334985, "grad_norm": 0.7393209934234619, "learning_rate": 5.121647425845152e-05, "loss": 0.2911, "step": 268300 }, { "epoch": 0.9950396310493886, "grad_norm": 0.8980629444122314, "learning_rate": 5.1187071555275454e-05, "loss": 0.3067, "step": 268400 }, { "epoch": 0.9954103611652789, "grad_norm": 0.4344116151332855, "learning_rate": 5.115766844136377e-05, "loss": 0.3112, "step": 268500 }, { "epoch": 0.9957810912811691, "grad_norm": 1.1181306838989258, "learning_rate": 5.1128264926890154e-05, "loss": 0.2951, "step": 268600 }, { "epoch": 0.9961518213970594, "grad_norm": 0.9424721598625183, "learning_rate": 5.109886102202845e-05, "loss": 0.279, "step": 268700 }, { "epoch": 0.9965225515129496, "grad_norm": 1.2004516124725342, "learning_rate": 5.1069456736952616e-05, "loss": 0.295, "step": 268800 }, { "epoch": 0.9968932816288398, "grad_norm": 0.6202601790428162, "learning_rate": 5.1040052081836754e-05, "loss": 0.321, "step": 268900 }, { "epoch": 0.9972640117447301, "grad_norm": 1.0181478261947632, "learning_rate": 5.101064706685511e-05, "loss": 0.2909, "step": 269000 }, { "epoch": 0.9976347418606203, "grad_norm": 1.6529115438461304, "learning_rate": 5.098124170218202e-05, "loss": 0.3169, "step": 269100 }, { "epoch": 0.9980054719765106, "grad_norm": 0.8403481841087341, "learning_rate": 5.0951835997991984e-05, "loss": 0.2967, "step": 269200 }, { "epoch": 0.9983762020924007, "grad_norm": 0.40968039631843567, "learning_rate": 5.0922429964459576e-05, "loss": 0.3299, "step": 269300 }, { "epoch": 0.998746932208291, "grad_norm": 0.4022876024246216, "learning_rate": 5.08930236117595e-05, "loss": 0.2865, "step": 269400 }, { "epoch": 0.9991176623241812, "grad_norm": 1.3588131666183472, "learning_rate": 5.0863616950066586e-05, "loss": 0.3331, "step": 269500 }, { "epoch": 0.9994883924400715, "grad_norm": 0.39031630754470825, "learning_rate": 5.083420998955576e-05, "loss": 0.3095, "step": 269600 }, { "epoch": 0.9998591225559618, "grad_norm": 0.4925670027732849, "learning_rate": 5.080480274040205e-05, "loss": 0.3173, "step": 269700 }, { "epoch": 1.000229852671852, "grad_norm": 0.5261515378952026, "learning_rate": 5.0775395212780566e-05, "loss": 0.3096, "step": 269800 }, { "epoch": 1.0006005827877422, "grad_norm": 0.4487745761871338, "learning_rate": 5.0745987416866556e-05, "loss": 0.3059, "step": 269900 }, { "epoch": 1.0009713129036324, "grad_norm": 0.5003412961959839, "learning_rate": 5.0716579362835315e-05, "loss": 0.3059, "step": 270000 }, { "epoch": 1.0013420430195226, "grad_norm": 0.6085633635520935, "learning_rate": 5.068717106086227e-05, "loss": 0.2994, "step": 270100 }, { "epoch": 1.001712773135413, "grad_norm": 0.4629164934158325, "learning_rate": 5.0657762521122896e-05, "loss": 0.2931, "step": 270200 }, { "epoch": 1.0020835032513031, "grad_norm": 0.43695616722106934, "learning_rate": 5.062835375379279e-05, "loss": 0.3043, "step": 270300 }, { "epoch": 1.0024542333671933, "grad_norm": 0.5053547620773315, "learning_rate": 5.0598944769047574e-05, "loss": 0.3149, "step": 270400 }, { "epoch": 1.0028249634830835, "grad_norm": 0.6539624333381653, "learning_rate": 5.0569535577063e-05, "loss": 0.3159, "step": 270500 }, { "epoch": 1.0031956935989739, "grad_norm": 1.118529200553894, "learning_rate": 5.054012618801486e-05, "loss": 0.3115, "step": 270600 }, { "epoch": 1.003566423714864, "grad_norm": 1.0983824729919434, "learning_rate": 5.0510716612079026e-05, "loss": 0.2939, "step": 270700 }, { "epoch": 1.0039371538307542, "grad_norm": 1.0920246839523315, "learning_rate": 5.048130685943142e-05, "loss": 0.2924, "step": 270800 }, { "epoch": 1.0043078839466446, "grad_norm": 0.6910731196403503, "learning_rate": 5.0451896940248055e-05, "loss": 0.2972, "step": 270900 }, { "epoch": 1.0046786140625348, "grad_norm": 0.6507776975631714, "learning_rate": 5.0422486864704964e-05, "loss": 0.3187, "step": 271000 }, { "epoch": 1.005049344178425, "grad_norm": 0.7517284154891968, "learning_rate": 5.0393076642978277e-05, "loss": 0.3024, "step": 271100 }, { "epoch": 1.0054200742943151, "grad_norm": 0.8236764073371887, "learning_rate": 5.036366628524411e-05, "loss": 0.2997, "step": 271200 }, { "epoch": 1.0057908044102055, "grad_norm": 0.8866660594940186, "learning_rate": 5.033425580167872e-05, "loss": 0.3116, "step": 271300 }, { "epoch": 1.0061615345260957, "grad_norm": 0.318185955286026, "learning_rate": 5.030484520245832e-05, "loss": 0.3139, "step": 271400 }, { "epoch": 1.0065322646419859, "grad_norm": 0.2881660759449005, "learning_rate": 5.027543449775918e-05, "loss": 0.3032, "step": 271500 }, { "epoch": 1.0069029947578763, "grad_norm": 0.4831421971321106, "learning_rate": 5.024602369775766e-05, "loss": 0.3588, "step": 271600 }, { "epoch": 1.0072737248737664, "grad_norm": 0.7667540907859802, "learning_rate": 5.0216612812630103e-05, "loss": 0.2806, "step": 271700 }, { "epoch": 1.0076444549896566, "grad_norm": 0.7532679438591003, "learning_rate": 5.018720185255289e-05, "loss": 0.2924, "step": 271800 }, { "epoch": 1.0080151851055468, "grad_norm": 0.7438379526138306, "learning_rate": 5.0157790827702445e-05, "loss": 0.3054, "step": 271900 }, { "epoch": 1.0083859152214372, "grad_norm": 0.4842260479927063, "learning_rate": 5.012837974825519e-05, "loss": 0.2868, "step": 272000 }, { "epoch": 1.0087566453373273, "grad_norm": 0.6209085583686829, "learning_rate": 5.0098968624387574e-05, "loss": 0.327, "step": 272100 }, { "epoch": 1.0091273754532175, "grad_norm": 0.858718752861023, "learning_rate": 5.0069557466276094e-05, "loss": 0.2981, "step": 272200 }, { "epoch": 1.009498105569108, "grad_norm": 1.458554744720459, "learning_rate": 5.004014628409719e-05, "loss": 0.3231, "step": 272300 }, { "epoch": 1.009868835684998, "grad_norm": 0.37366998195648193, "learning_rate": 5.001073508802739e-05, "loss": 0.2967, "step": 272400 }, { "epoch": 1.0102395658008883, "grad_norm": 0.6041092276573181, "learning_rate": 4.9981323888243165e-05, "loss": 0.287, "step": 272500 }, { "epoch": 1.0106102959167784, "grad_norm": 0.6159973740577698, "learning_rate": 4.995191269492101e-05, "loss": 0.328, "step": 272600 }, { "epoch": 1.0109810260326688, "grad_norm": 1.645533800125122, "learning_rate": 4.9922501518237415e-05, "loss": 0.3017, "step": 272700 }, { "epoch": 1.011351756148559, "grad_norm": 0.4256337583065033, "learning_rate": 4.989309036836888e-05, "loss": 0.3282, "step": 272800 }, { "epoch": 1.0117224862644492, "grad_norm": 0.7617577314376831, "learning_rate": 4.9863679255491874e-05, "loss": 0.3013, "step": 272900 }, { "epoch": 1.0120932163803393, "grad_norm": 0.8090320825576782, "learning_rate": 4.9834268189782886e-05, "loss": 0.3231, "step": 273000 }, { "epoch": 1.0124639464962297, "grad_norm": 0.4910306930541992, "learning_rate": 4.980485718141832e-05, "loss": 0.3165, "step": 273100 }, { "epoch": 1.01283467661212, "grad_norm": 0.4540660083293915, "learning_rate": 4.977544624057463e-05, "loss": 0.2669, "step": 273200 }, { "epoch": 1.01320540672801, "grad_norm": 0.4651670455932617, "learning_rate": 4.9746035377428215e-05, "loss": 0.2898, "step": 273300 }, { "epoch": 1.0135761368439005, "grad_norm": 0.9308462142944336, "learning_rate": 4.971662460215546e-05, "loss": 0.2892, "step": 273400 }, { "epoch": 1.0139468669597906, "grad_norm": 0.8655659556388855, "learning_rate": 4.96872139249327e-05, "loss": 0.2916, "step": 273500 }, { "epoch": 1.0143175970756808, "grad_norm": 1.3576668500900269, "learning_rate": 4.965780335593628e-05, "loss": 0.317, "step": 273600 }, { "epoch": 1.014688327191571, "grad_norm": 0.6036360859870911, "learning_rate": 4.9628392905342464e-05, "loss": 0.31, "step": 273700 }, { "epoch": 1.0150590573074614, "grad_norm": 0.8284690976142883, "learning_rate": 4.9598982583327455e-05, "loss": 0.3108, "step": 273800 }, { "epoch": 1.0154297874233515, "grad_norm": 1.039859414100647, "learning_rate": 4.956957240006747e-05, "loss": 0.3155, "step": 273900 }, { "epoch": 1.0158005175392417, "grad_norm": 0.4408203363418579, "learning_rate": 4.9540162365738655e-05, "loss": 0.3125, "step": 274000 }, { "epoch": 1.0161712476551321, "grad_norm": 0.288603812456131, "learning_rate": 4.9510752490517094e-05, "loss": 0.3191, "step": 274100 }, { "epoch": 1.0165419777710223, "grad_norm": 0.7318848371505737, "learning_rate": 4.948134278457882e-05, "loss": 0.3027, "step": 274200 }, { "epoch": 1.0169127078869125, "grad_norm": 0.5661537647247314, "learning_rate": 4.945193325809983e-05, "loss": 0.3007, "step": 274300 }, { "epoch": 1.0172834380028026, "grad_norm": 1.3249680995941162, "learning_rate": 4.9422523921256025e-05, "loss": 0.2927, "step": 274400 }, { "epoch": 1.017654168118693, "grad_norm": 1.4018747806549072, "learning_rate": 4.939311478422327e-05, "loss": 0.2991, "step": 274500 }, { "epoch": 1.0180248982345832, "grad_norm": 0.6778904795646667, "learning_rate": 4.9363705857177315e-05, "loss": 0.2789, "step": 274600 }, { "epoch": 1.0183956283504734, "grad_norm": 1.3769911527633667, "learning_rate": 4.93342971502939e-05, "loss": 0.2836, "step": 274700 }, { "epoch": 1.0187663584663638, "grad_norm": 1.3441896438598633, "learning_rate": 4.930488867374863e-05, "loss": 0.3, "step": 274800 }, { "epoch": 1.019137088582254, "grad_norm": 0.7177808880805969, "learning_rate": 4.9275480437717074e-05, "loss": 0.2901, "step": 274900 }, { "epoch": 1.019507818698144, "grad_norm": 0.40595749020576477, "learning_rate": 4.9246072452374716e-05, "loss": 0.3021, "step": 275000 }, { "epoch": 1.0198785488140343, "grad_norm": 2.193157196044922, "learning_rate": 4.9216664727896903e-05, "loss": 0.3129, "step": 275100 }, { "epoch": 1.0202492789299247, "grad_norm": 2.212768793106079, "learning_rate": 4.9187257274458954e-05, "loss": 0.2992, "step": 275200 }, { "epoch": 1.0206200090458148, "grad_norm": 0.8229432106018066, "learning_rate": 4.915785010223607e-05, "loss": 0.3015, "step": 275300 }, { "epoch": 1.020990739161705, "grad_norm": 0.3583860397338867, "learning_rate": 4.912844322140333e-05, "loss": 0.3144, "step": 275400 }, { "epoch": 1.0213614692775952, "grad_norm": 0.6615760326385498, "learning_rate": 4.909903664213576e-05, "loss": 0.2893, "step": 275500 }, { "epoch": 1.0217321993934856, "grad_norm": 0.6790965795516968, "learning_rate": 4.906963037460823e-05, "loss": 0.341, "step": 275600 }, { "epoch": 1.0221029295093758, "grad_norm": 0.9097020626068115, "learning_rate": 4.904022442899554e-05, "loss": 0.2964, "step": 275700 }, { "epoch": 1.022473659625266, "grad_norm": 0.3794010281562805, "learning_rate": 4.901081881547238e-05, "loss": 0.3028, "step": 275800 }, { "epoch": 1.0228443897411563, "grad_norm": 0.325444757938385, "learning_rate": 4.898141354421329e-05, "loss": 0.3429, "step": 275900 }, { "epoch": 1.0232151198570465, "grad_norm": 0.761991560459137, "learning_rate": 4.895200862539274e-05, "loss": 0.282, "step": 276000 }, { "epoch": 1.0235858499729367, "grad_norm": 0.6794321537017822, "learning_rate": 4.8922604069185023e-05, "loss": 0.2991, "step": 276100 }, { "epoch": 1.0239565800888268, "grad_norm": 0.42103177309036255, "learning_rate": 4.8893199885764354e-05, "loss": 0.2971, "step": 276200 }, { "epoch": 1.0243273102047172, "grad_norm": 0.7186094522476196, "learning_rate": 4.886379608530478e-05, "loss": 0.2934, "step": 276300 }, { "epoch": 1.0246980403206074, "grad_norm": 0.9539751410484314, "learning_rate": 4.8834392677980254e-05, "loss": 0.3384, "step": 276400 }, { "epoch": 1.0250687704364976, "grad_norm": 0.7265830039978027, "learning_rate": 4.880498967396458e-05, "loss": 0.3201, "step": 276500 }, { "epoch": 1.025439500552388, "grad_norm": 0.7231929302215576, "learning_rate": 4.8775587083431396e-05, "loss": 0.3013, "step": 276600 }, { "epoch": 1.0258102306682781, "grad_norm": 0.812411367893219, "learning_rate": 4.874618491655423e-05, "loss": 0.2579, "step": 276700 }, { "epoch": 1.0261809607841683, "grad_norm": 1.1397231817245483, "learning_rate": 4.871678318350647e-05, "loss": 0.3223, "step": 276800 }, { "epoch": 1.0265516909000585, "grad_norm": 0.8874804973602295, "learning_rate": 4.8687381894461296e-05, "loss": 0.2923, "step": 276900 }, { "epoch": 1.0269224210159489, "grad_norm": 0.8135974407196045, "learning_rate": 4.86579810595918e-05, "loss": 0.3016, "step": 277000 }, { "epoch": 1.027293151131839, "grad_norm": 1.7726153135299683, "learning_rate": 4.862858068907087e-05, "loss": 0.283, "step": 277100 }, { "epoch": 1.0276638812477292, "grad_norm": 1.0282557010650635, "learning_rate": 4.859918079307128e-05, "loss": 0.3131, "step": 277200 }, { "epoch": 1.0280346113636196, "grad_norm": 0.6342775821685791, "learning_rate": 4.856978138176561e-05, "loss": 0.295, "step": 277300 }, { "epoch": 1.0284053414795098, "grad_norm": 0.8334500193595886, "learning_rate": 4.854038246532625e-05, "loss": 0.2836, "step": 277400 }, { "epoch": 1.0287760715954, "grad_norm": 1.0705689191818237, "learning_rate": 4.8510984053925485e-05, "loss": 0.3043, "step": 277500 }, { "epoch": 1.0291468017112901, "grad_norm": 0.5293753147125244, "learning_rate": 4.8481586157735345e-05, "loss": 0.284, "step": 277600 }, { "epoch": 1.0295175318271805, "grad_norm": 1.022678017616272, "learning_rate": 4.8452188786927744e-05, "loss": 0.2884, "step": 277700 }, { "epoch": 1.0298882619430707, "grad_norm": 0.6032870411872864, "learning_rate": 4.8422791951674376e-05, "loss": 0.2782, "step": 277800 }, { "epoch": 1.0302589920589609, "grad_norm": 1.313863754272461, "learning_rate": 4.8393395662146786e-05, "loss": 0.2992, "step": 277900 }, { "epoch": 1.030629722174851, "grad_norm": 0.7085054516792297, "learning_rate": 4.83639999285163e-05, "loss": 0.2943, "step": 278000 }, { "epoch": 1.0310004522907414, "grad_norm": 0.778872013092041, "learning_rate": 4.833460476095406e-05, "loss": 0.283, "step": 278100 }, { "epoch": 1.0313711824066316, "grad_norm": 0.9876055717468262, "learning_rate": 4.830521016963101e-05, "loss": 0.3078, "step": 278200 }, { "epoch": 1.0317419125225218, "grad_norm": 0.7847422361373901, "learning_rate": 4.827581616471792e-05, "loss": 0.295, "step": 278300 }, { "epoch": 1.0321126426384122, "grad_norm": 0.8312188982963562, "learning_rate": 4.824642275638532e-05, "loss": 0.3088, "step": 278400 }, { "epoch": 1.0324833727543024, "grad_norm": 0.7072118520736694, "learning_rate": 4.8217029954803526e-05, "loss": 0.2924, "step": 278500 }, { "epoch": 1.0328541028701925, "grad_norm": 0.8706657886505127, "learning_rate": 4.81876377701427e-05, "loss": 0.3127, "step": 278600 }, { "epoch": 1.0332248329860827, "grad_norm": 0.8296810388565063, "learning_rate": 4.815824621257275e-05, "loss": 0.2878, "step": 278700 }, { "epoch": 1.033595563101973, "grad_norm": 0.900955080986023, "learning_rate": 4.812885529226336e-05, "loss": 0.317, "step": 278800 }, { "epoch": 1.0339662932178633, "grad_norm": 0.9201534390449524, "learning_rate": 4.8099465019384024e-05, "loss": 0.3115, "step": 278900 }, { "epoch": 1.0343370233337534, "grad_norm": 0.7738629579544067, "learning_rate": 4.8070075404103994e-05, "loss": 0.2958, "step": 279000 }, { "epoch": 1.0347077534496438, "grad_norm": 1.1279526948928833, "learning_rate": 4.8040686456592294e-05, "loss": 0.2892, "step": 279100 }, { "epoch": 1.035078483565534, "grad_norm": 0.6883434057235718, "learning_rate": 4.8011298187017726e-05, "loss": 0.2886, "step": 279200 }, { "epoch": 1.0354492136814242, "grad_norm": 0.7773301601409912, "learning_rate": 4.7981910605548824e-05, "loss": 0.2649, "step": 279300 }, { "epoch": 1.0358199437973143, "grad_norm": 1.6901229619979858, "learning_rate": 4.7952523722353934e-05, "loss": 0.3403, "step": 279400 }, { "epoch": 1.0361906739132047, "grad_norm": 0.45716550946235657, "learning_rate": 4.7923137547601145e-05, "loss": 0.261, "step": 279500 }, { "epoch": 1.036561404029095, "grad_norm": 0.4819570481777191, "learning_rate": 4.789375209145827e-05, "loss": 0.302, "step": 279600 }, { "epoch": 1.036932134144985, "grad_norm": 0.9368488192558289, "learning_rate": 4.78643673640929e-05, "loss": 0.2941, "step": 279700 }, { "epoch": 1.0373028642608753, "grad_norm": 1.293733835220337, "learning_rate": 4.783498337567241e-05, "loss": 0.3004, "step": 279800 }, { "epoch": 1.0376735943767657, "grad_norm": 1.0426751375198364, "learning_rate": 4.780560013636383e-05, "loss": 0.2974, "step": 279900 }, { "epoch": 1.0380443244926558, "grad_norm": 0.3109659254550934, "learning_rate": 4.7776217656333994e-05, "loss": 0.2943, "step": 280000 }, { "epoch": 1.038415054608546, "grad_norm": 0.4480859041213989, "learning_rate": 4.774683594574946e-05, "loss": 0.3055, "step": 280100 }, { "epoch": 1.0387857847244364, "grad_norm": 0.737432599067688, "learning_rate": 4.771745501477653e-05, "loss": 0.2882, "step": 280200 }, { "epoch": 1.0391565148403266, "grad_norm": 0.774242639541626, "learning_rate": 4.7688074873581213e-05, "loss": 0.3263, "step": 280300 }, { "epoch": 1.0395272449562167, "grad_norm": 0.8915555477142334, "learning_rate": 4.7658695532329265e-05, "loss": 0.3092, "step": 280400 }, { "epoch": 1.039897975072107, "grad_norm": 0.49766167998313904, "learning_rate": 4.762931700118616e-05, "loss": 0.285, "step": 280500 }, { "epoch": 1.0402687051879973, "grad_norm": 0.3961893618106842, "learning_rate": 4.75999392903171e-05, "loss": 0.3305, "step": 280600 }, { "epoch": 1.0406394353038875, "grad_norm": 0.8052985668182373, "learning_rate": 4.7570562409886945e-05, "loss": 0.2916, "step": 280700 }, { "epoch": 1.0410101654197776, "grad_norm": 0.8831441402435303, "learning_rate": 4.754118637006036e-05, "loss": 0.3108, "step": 280800 }, { "epoch": 1.041380895535668, "grad_norm": 0.5403960943222046, "learning_rate": 4.751181118100166e-05, "loss": 0.2714, "step": 280900 }, { "epoch": 1.0417516256515582, "grad_norm": 0.8258278369903564, "learning_rate": 4.748243685287488e-05, "loss": 0.3397, "step": 281000 }, { "epoch": 1.0421223557674484, "grad_norm": 1.4889802932739258, "learning_rate": 4.745306339584376e-05, "loss": 0.276, "step": 281100 }, { "epoch": 1.0424930858833386, "grad_norm": 0.7503752708435059, "learning_rate": 4.742369082007174e-05, "loss": 0.2876, "step": 281200 }, { "epoch": 1.042863815999229, "grad_norm": 0.4069075882434845, "learning_rate": 4.7394319135721946e-05, "loss": 0.3316, "step": 281300 }, { "epoch": 1.0432345461151191, "grad_norm": 0.5023167133331299, "learning_rate": 4.7364948352957195e-05, "loss": 0.3113, "step": 281400 }, { "epoch": 1.0436052762310093, "grad_norm": 1.00737464427948, "learning_rate": 4.7335578481939996e-05, "loss": 0.3088, "step": 281500 }, { "epoch": 1.0439760063468997, "grad_norm": 1.063388466835022, "learning_rate": 4.730620953283255e-05, "loss": 0.2843, "step": 281600 }, { "epoch": 1.0443467364627899, "grad_norm": 0.6477052569389343, "learning_rate": 4.7276841515796735e-05, "loss": 0.3124, "step": 281700 }, { "epoch": 1.04471746657868, "grad_norm": 1.0181857347488403, "learning_rate": 4.724747444099409e-05, "loss": 0.3088, "step": 281800 }, { "epoch": 1.0450881966945702, "grad_norm": 0.3461955785751343, "learning_rate": 4.721810831858586e-05, "loss": 0.3012, "step": 281900 }, { "epoch": 1.0454589268104606, "grad_norm": 0.9640982151031494, "learning_rate": 4.7188743158732946e-05, "loss": 0.3169, "step": 282000 }, { "epoch": 1.0458296569263508, "grad_norm": 0.9239721894264221, "learning_rate": 4.715937897159591e-05, "loss": 0.3086, "step": 282100 }, { "epoch": 1.046200387042241, "grad_norm": 0.9508348703384399, "learning_rate": 4.713001576733496e-05, "loss": 0.3245, "step": 282200 }, { "epoch": 1.0465711171581313, "grad_norm": 0.5041287541389465, "learning_rate": 4.710065355610999e-05, "loss": 0.3313, "step": 282300 }, { "epoch": 1.0469418472740215, "grad_norm": 0.1999703049659729, "learning_rate": 4.7071292348080555e-05, "loss": 0.3033, "step": 282400 }, { "epoch": 1.0473125773899117, "grad_norm": 0.9261777400970459, "learning_rate": 4.704193215340584e-05, "loss": 0.3237, "step": 282500 }, { "epoch": 1.0476833075058019, "grad_norm": 0.48549216985702515, "learning_rate": 4.701257298224471e-05, "loss": 0.2731, "step": 282600 }, { "epoch": 1.0480540376216922, "grad_norm": 1.7874999046325684, "learning_rate": 4.6983214844755654e-05, "loss": 0.3051, "step": 282700 }, { "epoch": 1.0484247677375824, "grad_norm": 0.5548740029335022, "learning_rate": 4.695385775109679e-05, "loss": 0.3282, "step": 282800 }, { "epoch": 1.0487954978534726, "grad_norm": 1.0487303733825684, "learning_rate": 4.692450171142593e-05, "loss": 0.2867, "step": 282900 }, { "epoch": 1.0491662279693628, "grad_norm": 0.7498404383659363, "learning_rate": 4.6895146735900426e-05, "loss": 0.315, "step": 283000 }, { "epoch": 1.0495369580852532, "grad_norm": 4.890678882598877, "learning_rate": 4.686579283467735e-05, "loss": 0.2866, "step": 283100 }, { "epoch": 1.0499076882011433, "grad_norm": 1.1581186056137085, "learning_rate": 4.6836440017913385e-05, "loss": 0.3168, "step": 283200 }, { "epoch": 1.0502784183170335, "grad_norm": 0.4981914460659027, "learning_rate": 4.68070882957648e-05, "loss": 0.3032, "step": 283300 }, { "epoch": 1.050649148432924, "grad_norm": 0.3785054683685303, "learning_rate": 4.6777737678387514e-05, "loss": 0.2965, "step": 283400 }, { "epoch": 1.051019878548814, "grad_norm": 0.46771666407585144, "learning_rate": 4.674838817593707e-05, "loss": 0.28, "step": 283500 }, { "epoch": 1.0513906086647042, "grad_norm": 0.31207600235939026, "learning_rate": 4.6719039798568634e-05, "loss": 0.3093, "step": 283600 }, { "epoch": 1.0517613387805944, "grad_norm": 0.9776681661605835, "learning_rate": 4.668969255643692e-05, "loss": 0.2926, "step": 283700 }, { "epoch": 1.0521320688964848, "grad_norm": 0.9331262111663818, "learning_rate": 4.666034645969631e-05, "loss": 0.2755, "step": 283800 }, { "epoch": 1.052502799012375, "grad_norm": 1.2266769409179688, "learning_rate": 4.663100151850078e-05, "loss": 0.3046, "step": 283900 }, { "epoch": 1.0528735291282652, "grad_norm": 0.8341668844223022, "learning_rate": 4.66016577430039e-05, "loss": 0.3102, "step": 284000 }, { "epoch": 1.0532442592441555, "grad_norm": 0.6020668745040894, "learning_rate": 4.657231514335881e-05, "loss": 0.3074, "step": 284100 }, { "epoch": 1.0536149893600457, "grad_norm": 1.2823050022125244, "learning_rate": 4.654297372971832e-05, "loss": 0.3179, "step": 284200 }, { "epoch": 1.053985719475936, "grad_norm": 0.41515684127807617, "learning_rate": 4.651363351223473e-05, "loss": 0.2866, "step": 284300 }, { "epoch": 1.054356449591826, "grad_norm": 0.7105672955513, "learning_rate": 4.648429450106001e-05, "loss": 0.3054, "step": 284400 }, { "epoch": 1.0547271797077165, "grad_norm": 0.47530844807624817, "learning_rate": 4.645495670634564e-05, "loss": 0.2834, "step": 284500 }, { "epoch": 1.0550979098236066, "grad_norm": 0.6945728659629822, "learning_rate": 4.642562013824275e-05, "loss": 0.3364, "step": 284600 }, { "epoch": 1.0554686399394968, "grad_norm": 1.6111114025115967, "learning_rate": 4.639628480690198e-05, "loss": 0.3306, "step": 284700 }, { "epoch": 1.055839370055387, "grad_norm": 1.2201203107833862, "learning_rate": 4.6366950722473596e-05, "loss": 0.3119, "step": 284800 }, { "epoch": 1.0562101001712774, "grad_norm": 0.6830409169197083, "learning_rate": 4.6337617895107415e-05, "loss": 0.3323, "step": 284900 }, { "epoch": 1.0565808302871675, "grad_norm": 0.9572984576225281, "learning_rate": 4.6308286334952796e-05, "loss": 0.2648, "step": 285000 }, { "epoch": 1.0569515604030577, "grad_norm": 1.02972412109375, "learning_rate": 4.6278956052158684e-05, "loss": 0.2834, "step": 285100 }, { "epoch": 1.057322290518948, "grad_norm": 0.7865321636199951, "learning_rate": 4.62496270568736e-05, "loss": 0.2796, "step": 285200 }, { "epoch": 1.0576930206348383, "grad_norm": 0.9588046073913574, "learning_rate": 4.622029935924556e-05, "loss": 0.3115, "step": 285300 }, { "epoch": 1.0580637507507284, "grad_norm": 0.9739672541618347, "learning_rate": 4.619097296942218e-05, "loss": 0.3227, "step": 285400 }, { "epoch": 1.0584344808666186, "grad_norm": 0.8741119503974915, "learning_rate": 4.6161647897550606e-05, "loss": 0.3072, "step": 285500 }, { "epoch": 1.058805210982509, "grad_norm": 0.6314988136291504, "learning_rate": 4.613232415377755e-05, "loss": 0.292, "step": 285600 }, { "epoch": 1.0591759410983992, "grad_norm": 0.5874539017677307, "learning_rate": 4.6103001748249226e-05, "loss": 0.3022, "step": 285700 }, { "epoch": 1.0595466712142894, "grad_norm": 0.9649267792701721, "learning_rate": 4.6073680691111405e-05, "loss": 0.3121, "step": 285800 }, { "epoch": 1.0599174013301798, "grad_norm": 1.0037870407104492, "learning_rate": 4.6044360992509426e-05, "loss": 0.3178, "step": 285900 }, { "epoch": 1.06028813144607, "grad_norm": 0.5880357027053833, "learning_rate": 4.601504266258807e-05, "loss": 0.302, "step": 286000 }, { "epoch": 1.06065886156196, "grad_norm": 0.8851994276046753, "learning_rate": 4.598572571149174e-05, "loss": 0.3109, "step": 286100 }, { "epoch": 1.0610295916778503, "grad_norm": 0.5874459743499756, "learning_rate": 4.59564101493643e-05, "loss": 0.3152, "step": 286200 }, { "epoch": 1.0614003217937407, "grad_norm": 0.8906790614128113, "learning_rate": 4.592709598634915e-05, "loss": 0.3027, "step": 286300 }, { "epoch": 1.0617710519096308, "grad_norm": 1.421059012413025, "learning_rate": 4.589778323258923e-05, "loss": 0.3047, "step": 286400 }, { "epoch": 1.062141782025521, "grad_norm": 1.1765897274017334, "learning_rate": 4.586847189822696e-05, "loss": 0.3067, "step": 286500 }, { "epoch": 1.0625125121414114, "grad_norm": 0.7866582870483398, "learning_rate": 4.5839161993404265e-05, "loss": 0.2891, "step": 286600 }, { "epoch": 1.0628832422573016, "grad_norm": 0.3889412581920624, "learning_rate": 4.580985352826264e-05, "loss": 0.3023, "step": 286700 }, { "epoch": 1.0632539723731917, "grad_norm": 0.8044344186782837, "learning_rate": 4.5780546512942986e-05, "loss": 0.2644, "step": 286800 }, { "epoch": 1.063624702489082, "grad_norm": 1.71160888671875, "learning_rate": 4.575124095758575e-05, "loss": 0.2959, "step": 286900 }, { "epoch": 1.0639954326049723, "grad_norm": 0.24148821830749512, "learning_rate": 4.572193687233089e-05, "loss": 0.2919, "step": 287000 }, { "epoch": 1.0643661627208625, "grad_norm": 0.5856324434280396, "learning_rate": 4.569263426731785e-05, "loss": 0.2888, "step": 287100 }, { "epoch": 1.0647368928367527, "grad_norm": 1.4365171194076538, "learning_rate": 4.566333315268552e-05, "loss": 0.3067, "step": 287200 }, { "epoch": 1.065107622952643, "grad_norm": 0.6578074097633362, "learning_rate": 4.5634033538572314e-05, "loss": 0.2799, "step": 287300 }, { "epoch": 1.0654783530685332, "grad_norm": 0.8811532855033875, "learning_rate": 4.5604735435116146e-05, "loss": 0.3263, "step": 287400 }, { "epoch": 1.0658490831844234, "grad_norm": 0.5924971103668213, "learning_rate": 4.557543885245434e-05, "loss": 0.2917, "step": 287500 }, { "epoch": 1.0662198133003136, "grad_norm": 0.8023910522460938, "learning_rate": 4.554614380072376e-05, "loss": 0.2928, "step": 287600 }, { "epoch": 1.066590543416204, "grad_norm": 0.8843216896057129, "learning_rate": 4.551685029006068e-05, "loss": 0.3169, "step": 287700 }, { "epoch": 1.0669612735320941, "grad_norm": 0.6253518462181091, "learning_rate": 4.54875583306009e-05, "loss": 0.2944, "step": 287800 }, { "epoch": 1.0673320036479843, "grad_norm": 0.6621477007865906, "learning_rate": 4.545826793247967e-05, "loss": 0.3321, "step": 287900 }, { "epoch": 1.0677027337638745, "grad_norm": 0.5781023502349854, "learning_rate": 4.542897910583165e-05, "loss": 0.2957, "step": 288000 }, { "epoch": 1.0680734638797649, "grad_norm": 0.698681652545929, "learning_rate": 4.5399691860791006e-05, "loss": 0.2888, "step": 288100 }, { "epoch": 1.068444193995655, "grad_norm": 0.46907249093055725, "learning_rate": 4.537040620749138e-05, "loss": 0.3179, "step": 288200 }, { "epoch": 1.0688149241115452, "grad_norm": 1.235528588294983, "learning_rate": 4.534112215606577e-05, "loss": 0.3115, "step": 288300 }, { "epoch": 1.0691856542274356, "grad_norm": 1.0418250560760498, "learning_rate": 4.531183971664671e-05, "loss": 0.3283, "step": 288400 }, { "epoch": 1.0695563843433258, "grad_norm": 0.5634387135505676, "learning_rate": 4.528255889936612e-05, "loss": 0.2854, "step": 288500 }, { "epoch": 1.069927114459216, "grad_norm": 0.7086278796195984, "learning_rate": 4.525327971435541e-05, "loss": 0.3175, "step": 288600 }, { "epoch": 1.0702978445751061, "grad_norm": 0.4146464765071869, "learning_rate": 4.522400217174538e-05, "loss": 0.2758, "step": 288700 }, { "epoch": 1.0706685746909965, "grad_norm": 0.5158166885375977, "learning_rate": 4.519472628166627e-05, "loss": 0.2902, "step": 288800 }, { "epoch": 1.0710393048068867, "grad_norm": 0.467392235994339, "learning_rate": 4.5165452054247787e-05, "loss": 0.2939, "step": 288900 }, { "epoch": 1.0714100349227769, "grad_norm": 0.8680809736251831, "learning_rate": 4.513617949961901e-05, "loss": 0.3306, "step": 289000 }, { "epoch": 1.071780765038667, "grad_norm": 0.5474904775619507, "learning_rate": 4.510690862790847e-05, "loss": 0.2898, "step": 289100 }, { "epoch": 1.0721514951545574, "grad_norm": 0.46418216824531555, "learning_rate": 4.5077639449244087e-05, "loss": 0.2949, "step": 289200 }, { "epoch": 1.0725222252704476, "grad_norm": 0.6022411584854126, "learning_rate": 4.504837197375323e-05, "loss": 0.3074, "step": 289300 }, { "epoch": 1.0728929553863378, "grad_norm": 1.054656982421875, "learning_rate": 4.5019106211562674e-05, "loss": 0.3068, "step": 289400 }, { "epoch": 1.0732636855022282, "grad_norm": 0.9163946509361267, "learning_rate": 4.498984217279857e-05, "loss": 0.2939, "step": 289500 }, { "epoch": 1.0736344156181183, "grad_norm": 0.24184857308864594, "learning_rate": 4.4960579867586514e-05, "loss": 0.2766, "step": 289600 }, { "epoch": 1.0740051457340085, "grad_norm": 1.330430269241333, "learning_rate": 4.4931319306051496e-05, "loss": 0.3005, "step": 289700 }, { "epoch": 1.0743758758498987, "grad_norm": 1.1925045251846313, "learning_rate": 4.4902060498317854e-05, "loss": 0.304, "step": 289800 }, { "epoch": 1.074746605965789, "grad_norm": 0.3950655162334442, "learning_rate": 4.487280345450936e-05, "loss": 0.3031, "step": 289900 }, { "epoch": 1.0751173360816793, "grad_norm": 0.5798438191413879, "learning_rate": 4.484354818474917e-05, "loss": 0.3037, "step": 290000 }, { "epoch": 1.0754880661975694, "grad_norm": 0.7104446887969971, "learning_rate": 4.481429469915985e-05, "loss": 0.32, "step": 290100 }, { "epoch": 1.0758587963134598, "grad_norm": 0.45570772886276245, "learning_rate": 4.4785043007863295e-05, "loss": 0.305, "step": 290200 }, { "epoch": 1.07622952642935, "grad_norm": 0.7701956629753113, "learning_rate": 4.4755793120980825e-05, "loss": 0.2722, "step": 290300 }, { "epoch": 1.0766002565452402, "grad_norm": 0.7945518493652344, "learning_rate": 4.472654504863313e-05, "loss": 0.2778, "step": 290400 }, { "epoch": 1.0769709866611303, "grad_norm": 0.493869423866272, "learning_rate": 4.469729880094025e-05, "loss": 0.283, "step": 290500 }, { "epoch": 1.0773417167770207, "grad_norm": 0.1876228302717209, "learning_rate": 4.4668054388021586e-05, "loss": 0.2986, "step": 290600 }, { "epoch": 1.077712446892911, "grad_norm": 1.0018877983093262, "learning_rate": 4.463881181999594e-05, "loss": 0.2871, "step": 290700 }, { "epoch": 1.078083177008801, "grad_norm": 1.1508018970489502, "learning_rate": 4.460957110698147e-05, "loss": 0.288, "step": 290800 }, { "epoch": 1.0784539071246915, "grad_norm": 0.5392220616340637, "learning_rate": 4.4580332259095654e-05, "loss": 0.2644, "step": 290900 }, { "epoch": 1.0788246372405816, "grad_norm": 0.5525040626525879, "learning_rate": 4.4551095286455376e-05, "loss": 0.3162, "step": 291000 }, { "epoch": 1.0791953673564718, "grad_norm": 0.6950918436050415, "learning_rate": 4.4521860199176845e-05, "loss": 0.2834, "step": 291100 }, { "epoch": 1.079566097472362, "grad_norm": 0.5589026212692261, "learning_rate": 4.4492627007375606e-05, "loss": 0.3118, "step": 291200 }, { "epoch": 1.0799368275882524, "grad_norm": 0.5493624210357666, "learning_rate": 4.446339572116659e-05, "loss": 0.2792, "step": 291300 }, { "epoch": 1.0803075577041426, "grad_norm": 0.6823329329490662, "learning_rate": 4.4434166350664e-05, "loss": 0.2967, "step": 291400 }, { "epoch": 1.0806782878200327, "grad_norm": 0.8365862965583801, "learning_rate": 4.440493890598143e-05, "loss": 0.2944, "step": 291500 }, { "epoch": 1.0810490179359231, "grad_norm": 1.2373430728912354, "learning_rate": 4.437571339723181e-05, "loss": 0.2806, "step": 291600 }, { "epoch": 1.0814197480518133, "grad_norm": 0.6829968690872192, "learning_rate": 4.4346489834527376e-05, "loss": 0.2934, "step": 291700 }, { "epoch": 1.0817904781677035, "grad_norm": 0.29732245206832886, "learning_rate": 4.431726822797969e-05, "loss": 0.2823, "step": 291800 }, { "epoch": 1.0821612082835936, "grad_norm": 0.5400180220603943, "learning_rate": 4.428804858769967e-05, "loss": 0.2956, "step": 291900 }, { "epoch": 1.082531938399484, "grad_norm": 0.5818275213241577, "learning_rate": 4.425883092379752e-05, "loss": 0.2825, "step": 292000 }, { "epoch": 1.0829026685153742, "grad_norm": 0.7631511688232422, "learning_rate": 4.422961524638275e-05, "loss": 0.3082, "step": 292100 }, { "epoch": 1.0832733986312644, "grad_norm": 0.6136875152587891, "learning_rate": 4.420040156556421e-05, "loss": 0.2997, "step": 292200 }, { "epoch": 1.0836441287471545, "grad_norm": 0.7251676917076111, "learning_rate": 4.417118989145008e-05, "loss": 0.301, "step": 292300 }, { "epoch": 1.084014858863045, "grad_norm": 0.9091297388076782, "learning_rate": 4.4141980234147775e-05, "loss": 0.3008, "step": 292400 }, { "epoch": 1.084385588978935, "grad_norm": 0.3598489761352539, "learning_rate": 4.4112772603764074e-05, "loss": 0.3001, "step": 292500 }, { "epoch": 1.0847563190948253, "grad_norm": 1.1589632034301758, "learning_rate": 4.408356701040506e-05, "loss": 0.2829, "step": 292600 }, { "epoch": 1.0851270492107157, "grad_norm": 0.4276159405708313, "learning_rate": 4.405436346417604e-05, "loss": 0.3074, "step": 292700 }, { "epoch": 1.0854977793266058, "grad_norm": 0.39240211248397827, "learning_rate": 4.4025161975181714e-05, "loss": 0.2901, "step": 292800 }, { "epoch": 1.085868509442496, "grad_norm": 0.8717288374900818, "learning_rate": 4.3995962553525954e-05, "loss": 0.29, "step": 292900 }, { "epoch": 1.0862392395583862, "grad_norm": 0.7124008536338806, "learning_rate": 4.3966765209312024e-05, "loss": 0.3075, "step": 293000 }, { "epoch": 1.0866099696742766, "grad_norm": 0.9149613976478577, "learning_rate": 4.39375699526424e-05, "loss": 0.3088, "step": 293100 }, { "epoch": 1.0869806997901668, "grad_norm": 0.6193835735321045, "learning_rate": 4.390837679361886e-05, "loss": 0.2946, "step": 293200 }, { "epoch": 1.087351429906057, "grad_norm": 0.8843166828155518, "learning_rate": 4.387918574234248e-05, "loss": 0.3119, "step": 293300 }, { "epoch": 1.0877221600219473, "grad_norm": 0.6777443289756775, "learning_rate": 4.384999680891355e-05, "loss": 0.3076, "step": 293400 }, { "epoch": 1.0880928901378375, "grad_norm": 1.3140084743499756, "learning_rate": 4.382081000343169e-05, "loss": 0.3297, "step": 293500 }, { "epoch": 1.0884636202537277, "grad_norm": 0.5809606909751892, "learning_rate": 4.3791625335995715e-05, "loss": 0.3283, "step": 293600 }, { "epoch": 1.0888343503696178, "grad_norm": 1.1249074935913086, "learning_rate": 4.3762442816703755e-05, "loss": 0.3053, "step": 293700 }, { "epoch": 1.0892050804855082, "grad_norm": 1.050443172454834, "learning_rate": 4.37332624556532e-05, "loss": 0.3113, "step": 293800 }, { "epoch": 1.0895758106013984, "grad_norm": 0.49581682682037354, "learning_rate": 4.3704084262940647e-05, "loss": 0.3032, "step": 293900 }, { "epoch": 1.0899465407172886, "grad_norm": 0.37050244212150574, "learning_rate": 4.367490824866197e-05, "loss": 0.301, "step": 294000 }, { "epoch": 1.0903172708331788, "grad_norm": 0.6580499410629272, "learning_rate": 4.364573442291231e-05, "loss": 0.2474, "step": 294100 }, { "epoch": 1.0906880009490691, "grad_norm": 0.815534234046936, "learning_rate": 4.3616562795786005e-05, "loss": 0.3112, "step": 294200 }, { "epoch": 1.0910587310649593, "grad_norm": 0.6034706830978394, "learning_rate": 4.358739337737668e-05, "loss": 0.3152, "step": 294300 }, { "epoch": 1.0914294611808495, "grad_norm": 0.5357242822647095, "learning_rate": 4.355822617777714e-05, "loss": 0.2958, "step": 294400 }, { "epoch": 1.0918001912967399, "grad_norm": 1.1053791046142578, "learning_rate": 4.3529061207079494e-05, "loss": 0.2922, "step": 294500 }, { "epoch": 1.09217092141263, "grad_norm": 0.7128204703330994, "learning_rate": 4.349989847537499e-05, "loss": 0.307, "step": 294600 }, { "epoch": 1.0925416515285202, "grad_norm": 1.0462942123413086, "learning_rate": 4.3470737992754176e-05, "loss": 0.3241, "step": 294700 }, { "epoch": 1.0929123816444104, "grad_norm": 1.1022716760635376, "learning_rate": 4.3441579769306816e-05, "loss": 0.3414, "step": 294800 }, { "epoch": 1.0932831117603008, "grad_norm": 0.4002056419849396, "learning_rate": 4.341242381512184e-05, "loss": 0.3173, "step": 294900 }, { "epoch": 1.093653841876191, "grad_norm": 0.5612232089042664, "learning_rate": 4.3383270140287446e-05, "loss": 0.3078, "step": 295000 }, { "epoch": 1.0940245719920811, "grad_norm": 1.3711729049682617, "learning_rate": 4.3354118754891045e-05, "loss": 0.32, "step": 295100 }, { "epoch": 1.0943953021079715, "grad_norm": 0.9097443222999573, "learning_rate": 4.332496966901918e-05, "loss": 0.2727, "step": 295200 }, { "epoch": 1.0947660322238617, "grad_norm": 0.7259271740913391, "learning_rate": 4.329582289275769e-05, "loss": 0.3199, "step": 295300 }, { "epoch": 1.0951367623397519, "grad_norm": 0.5514920949935913, "learning_rate": 4.326667843619156e-05, "loss": 0.3122, "step": 295400 }, { "epoch": 1.095507492455642, "grad_norm": 0.9752296805381775, "learning_rate": 4.3237536309404994e-05, "loss": 0.3047, "step": 295500 }, { "epoch": 1.0958782225715324, "grad_norm": 0.6874715685844421, "learning_rate": 4.320839652248141e-05, "loss": 0.3072, "step": 295600 }, { "epoch": 1.0962489526874226, "grad_norm": 0.7154441475868225, "learning_rate": 4.317925908550335e-05, "loss": 0.3217, "step": 295700 }, { "epoch": 1.0966196828033128, "grad_norm": 0.5756542682647705, "learning_rate": 4.315012400855263e-05, "loss": 0.3169, "step": 295800 }, { "epoch": 1.0969904129192032, "grad_norm": 1.119326114654541, "learning_rate": 4.312099130171016e-05, "loss": 0.3158, "step": 295900 }, { "epoch": 1.0973611430350934, "grad_norm": 1.1239763498306274, "learning_rate": 4.3091860975056106e-05, "loss": 0.3137, "step": 296000 }, { "epoch": 1.0977318731509835, "grad_norm": 0.5962138772010803, "learning_rate": 4.306273303866976e-05, "loss": 0.3123, "step": 296100 }, { "epoch": 1.0981026032668737, "grad_norm": 0.8304513096809387, "learning_rate": 4.30336075026296e-05, "loss": 0.2957, "step": 296200 }, { "epoch": 1.098473333382764, "grad_norm": 0.9646735787391663, "learning_rate": 4.3004484377013315e-05, "loss": 0.2891, "step": 296300 }, { "epoch": 1.0988440634986543, "grad_norm": 0.5172989368438721, "learning_rate": 4.297536367189769e-05, "loss": 0.3093, "step": 296400 }, { "epoch": 1.0992147936145444, "grad_norm": 0.7152236104011536, "learning_rate": 4.2946245397358716e-05, "loss": 0.2631, "step": 296500 }, { "epoch": 1.0995855237304348, "grad_norm": 1.3868489265441895, "learning_rate": 4.2917129563471553e-05, "loss": 0.3053, "step": 296600 }, { "epoch": 1.099956253846325, "grad_norm": 0.39127564430236816, "learning_rate": 4.288801618031047e-05, "loss": 0.2815, "step": 296700 }, { "epoch": 1.1003269839622152, "grad_norm": 0.5323225855827332, "learning_rate": 4.285890525794891e-05, "loss": 0.3083, "step": 296800 }, { "epoch": 1.1006977140781053, "grad_norm": 0.8774656057357788, "learning_rate": 4.282979680645949e-05, "loss": 0.3122, "step": 296900 }, { "epoch": 1.1010684441939957, "grad_norm": 1.3147341012954712, "learning_rate": 4.280069083591396e-05, "loss": 0.284, "step": 297000 }, { "epoch": 1.101439174309886, "grad_norm": 0.7371772527694702, "learning_rate": 4.277158735638317e-05, "loss": 0.3097, "step": 297100 }, { "epoch": 1.101809904425776, "grad_norm": 0.5918312668800354, "learning_rate": 4.274248637793716e-05, "loss": 0.3035, "step": 297200 }, { "epoch": 1.1021806345416663, "grad_norm": 0.706451952457428, "learning_rate": 4.2713387910645096e-05, "loss": 0.3169, "step": 297300 }, { "epoch": 1.1025513646575567, "grad_norm": 0.7916515469551086, "learning_rate": 4.2684291964575265e-05, "loss": 0.3104, "step": 297400 }, { "epoch": 1.1029220947734468, "grad_norm": 1.0715466737747192, "learning_rate": 4.265519854979506e-05, "loss": 0.2825, "step": 297500 }, { "epoch": 1.103292824889337, "grad_norm": 0.6445251703262329, "learning_rate": 4.262610767637103e-05, "loss": 0.2797, "step": 297600 }, { "epoch": 1.1036635550052274, "grad_norm": 0.8577436804771423, "learning_rate": 4.259701935436883e-05, "loss": 0.2764, "step": 297700 }, { "epoch": 1.1040342851211176, "grad_norm": 0.949336588382721, "learning_rate": 4.256793359385326e-05, "loss": 0.3031, "step": 297800 }, { "epoch": 1.1044050152370077, "grad_norm": 2.073784351348877, "learning_rate": 4.2538850404888176e-05, "loss": 0.3118, "step": 297900 }, { "epoch": 1.104775745352898, "grad_norm": 0.478779673576355, "learning_rate": 4.2509769797536603e-05, "loss": 0.3203, "step": 298000 }, { "epoch": 1.1051464754687883, "grad_norm": 1.5264478921890259, "learning_rate": 4.248069178186066e-05, "loss": 0.2792, "step": 298100 }, { "epoch": 1.1055172055846785, "grad_norm": 0.8859559297561646, "learning_rate": 4.245161636792153e-05, "loss": 0.3277, "step": 298200 }, { "epoch": 1.1058879357005686, "grad_norm": 0.8161627054214478, "learning_rate": 4.242254356577951e-05, "loss": 0.2966, "step": 298300 }, { "epoch": 1.1062586658164588, "grad_norm": 0.5054191946983337, "learning_rate": 4.2393473385494045e-05, "loss": 0.2946, "step": 298400 }, { "epoch": 1.1066293959323492, "grad_norm": 0.815573513507843, "learning_rate": 4.236440583712363e-05, "loss": 0.3223, "step": 298500 }, { "epoch": 1.1070001260482394, "grad_norm": 0.7359567880630493, "learning_rate": 4.2335340930725824e-05, "loss": 0.3053, "step": 298600 }, { "epoch": 1.1073708561641296, "grad_norm": 0.9979274868965149, "learning_rate": 4.2306278676357325e-05, "loss": 0.3072, "step": 298700 }, { "epoch": 1.10774158628002, "grad_norm": 0.6763911843299866, "learning_rate": 4.2277219084073896e-05, "loss": 0.3416, "step": 298800 }, { "epoch": 1.1081123163959101, "grad_norm": 0.6919307112693787, "learning_rate": 4.224816216393037e-05, "loss": 0.2857, "step": 298900 }, { "epoch": 1.1084830465118003, "grad_norm": 0.742584764957428, "learning_rate": 4.2219107925980636e-05, "loss": 0.3029, "step": 299000 }, { "epoch": 1.1088537766276905, "grad_norm": 0.5021302103996277, "learning_rate": 4.219005638027769e-05, "loss": 0.2898, "step": 299100 }, { "epoch": 1.1092245067435809, "grad_norm": 0.7071950435638428, "learning_rate": 4.2161007536873596e-05, "loss": 0.3134, "step": 299200 }, { "epoch": 1.109595236859471, "grad_norm": 0.7571181058883667, "learning_rate": 4.213196140581945e-05, "loss": 0.2957, "step": 299300 }, { "epoch": 1.1099659669753612, "grad_norm": 1.5716912746429443, "learning_rate": 4.210291799716545e-05, "loss": 0.3077, "step": 299400 }, { "epoch": 1.1103366970912516, "grad_norm": 0.6044061183929443, "learning_rate": 4.207387732096082e-05, "loss": 0.2948, "step": 299500 }, { "epoch": 1.1107074272071418, "grad_norm": 0.49772873520851135, "learning_rate": 4.2044839387253876e-05, "loss": 0.2785, "step": 299600 }, { "epoch": 1.111078157323032, "grad_norm": 0.662408709526062, "learning_rate": 4.201580420609192e-05, "loss": 0.3043, "step": 299700 }, { "epoch": 1.1114488874389221, "grad_norm": 0.6704857349395752, "learning_rate": 4.1986771787521364e-05, "loss": 0.3164, "step": 299800 }, { "epoch": 1.1118196175548125, "grad_norm": 1.0496279001235962, "learning_rate": 4.195774214158763e-05, "loss": 0.2915, "step": 299900 }, { "epoch": 1.1121903476707027, "grad_norm": 0.8353812098503113, "learning_rate": 4.1928715278335214e-05, "loss": 0.3255, "step": 300000 }, { "epoch": 1.1125610777865929, "grad_norm": 1.050309658050537, "learning_rate": 4.1899691207807604e-05, "loss": 0.2901, "step": 300100 }, { "epoch": 1.1129318079024832, "grad_norm": 0.7197265028953552, "learning_rate": 4.187066994004736e-05, "loss": 0.3075, "step": 300200 }, { "epoch": 1.1133025380183734, "grad_norm": 0.624214768409729, "learning_rate": 4.1841651485096056e-05, "loss": 0.3132, "step": 300300 }, { "epoch": 1.1136732681342636, "grad_norm": 0.5552815794944763, "learning_rate": 4.181263585299431e-05, "loss": 0.2842, "step": 300400 }, { "epoch": 1.1140439982501538, "grad_norm": 0.45681607723236084, "learning_rate": 4.178362305378171e-05, "loss": 0.2649, "step": 300500 }, { "epoch": 1.1144147283660442, "grad_norm": 0.37325018644332886, "learning_rate": 4.175461309749691e-05, "loss": 0.2795, "step": 300600 }, { "epoch": 1.1147854584819343, "grad_norm": 0.8557956218719482, "learning_rate": 4.17256059941776e-05, "loss": 0.3193, "step": 300700 }, { "epoch": 1.1151561885978245, "grad_norm": 1.112954020500183, "learning_rate": 4.169660175386041e-05, "loss": 0.3088, "step": 300800 }, { "epoch": 1.115526918713715, "grad_norm": 1.357448697090149, "learning_rate": 4.166760038658107e-05, "loss": 0.3145, "step": 300900 }, { "epoch": 1.115897648829605, "grad_norm": 0.7540902495384216, "learning_rate": 4.163860190237425e-05, "loss": 0.3004, "step": 301000 }, { "epoch": 1.1162683789454952, "grad_norm": 1.2975016832351685, "learning_rate": 4.1609606311273636e-05, "loss": 0.3318, "step": 301100 }, { "epoch": 1.1166391090613854, "grad_norm": 0.8284198045730591, "learning_rate": 4.158061362331195e-05, "loss": 0.2815, "step": 301200 }, { "epoch": 1.1170098391772758, "grad_norm": 0.4272201657295227, "learning_rate": 4.1551623848520825e-05, "loss": 0.2886, "step": 301300 }, { "epoch": 1.117380569293166, "grad_norm": 0.8068810105323792, "learning_rate": 4.152263699693097e-05, "loss": 0.2646, "step": 301400 }, { "epoch": 1.1177512994090562, "grad_norm": 0.8665661215782166, "learning_rate": 4.149365307857206e-05, "loss": 0.3089, "step": 301500 }, { "epoch": 1.1181220295249463, "grad_norm": 0.5977432727813721, "learning_rate": 4.146467210347274e-05, "loss": 0.3243, "step": 301600 }, { "epoch": 1.1184927596408367, "grad_norm": 0.646581768989563, "learning_rate": 4.143569408166063e-05, "loss": 0.2972, "step": 301700 }, { "epoch": 1.118863489756727, "grad_norm": 0.9209235906600952, "learning_rate": 4.140671902316236e-05, "loss": 0.3105, "step": 301800 }, { "epoch": 1.119234219872617, "grad_norm": 0.7019659876823425, "learning_rate": 4.137774693800353e-05, "loss": 0.3023, "step": 301900 }, { "epoch": 1.1196049499885075, "grad_norm": 0.7438374161720276, "learning_rate": 4.1348777836208656e-05, "loss": 0.2927, "step": 302000 }, { "epoch": 1.1199756801043976, "grad_norm": 0.8144124746322632, "learning_rate": 4.131981172780128e-05, "loss": 0.3084, "step": 302100 }, { "epoch": 1.1203464102202878, "grad_norm": 0.4462498426437378, "learning_rate": 4.1290848622803925e-05, "loss": 0.2814, "step": 302200 }, { "epoch": 1.120717140336178, "grad_norm": 0.7189239263534546, "learning_rate": 4.126188853123799e-05, "loss": 0.2914, "step": 302300 }, { "epoch": 1.1210878704520684, "grad_norm": 1.0190259218215942, "learning_rate": 4.123293146312391e-05, "loss": 0.3049, "step": 302400 }, { "epoch": 1.1214586005679585, "grad_norm": 0.5026878118515015, "learning_rate": 4.120397742848106e-05, "loss": 0.3127, "step": 302500 }, { "epoch": 1.1218293306838487, "grad_norm": 1.992948293685913, "learning_rate": 4.117502643732772e-05, "loss": 0.3256, "step": 302600 }, { "epoch": 1.122200060799739, "grad_norm": 0.45878463983535767, "learning_rate": 4.1146078499681204e-05, "loss": 0.3035, "step": 302700 }, { "epoch": 1.1225707909156293, "grad_norm": 0.48074615001678467, "learning_rate": 4.111713362555764e-05, "loss": 0.2902, "step": 302800 }, { "epoch": 1.1229415210315195, "grad_norm": 0.42101413011550903, "learning_rate": 4.1088191824972224e-05, "loss": 0.3199, "step": 302900 }, { "epoch": 1.1233122511474096, "grad_norm": 0.4867750406265259, "learning_rate": 4.105925310793901e-05, "loss": 0.2992, "step": 303000 }, { "epoch": 1.1236829812633, "grad_norm": 0.5250921249389648, "learning_rate": 4.103031748447102e-05, "loss": 0.3095, "step": 303100 }, { "epoch": 1.1240537113791902, "grad_norm": 1.663285255432129, "learning_rate": 4.1001384964580195e-05, "loss": 0.3113, "step": 303200 }, { "epoch": 1.1244244414950804, "grad_norm": 0.5961251258850098, "learning_rate": 4.097245555827741e-05, "loss": 0.3054, "step": 303300 }, { "epoch": 1.1247951716109705, "grad_norm": 0.7583829164505005, "learning_rate": 4.0943529275572437e-05, "loss": 0.3321, "step": 303400 }, { "epoch": 1.125165901726861, "grad_norm": 0.9855505228042603, "learning_rate": 4.091460612647402e-05, "loss": 0.2875, "step": 303500 }, { "epoch": 1.125536631842751, "grad_norm": 0.5078909397125244, "learning_rate": 4.088568612098975e-05, "loss": 0.3082, "step": 303600 }, { "epoch": 1.1259073619586413, "grad_norm": 0.6774519085884094, "learning_rate": 4.085676926912618e-05, "loss": 0.2861, "step": 303700 }, { "epoch": 1.1262780920745317, "grad_norm": 0.5961154699325562, "learning_rate": 4.0827855580888744e-05, "loss": 0.3026, "step": 303800 }, { "epoch": 1.1266488221904218, "grad_norm": 0.5977916717529297, "learning_rate": 4.079894506628181e-05, "loss": 0.2737, "step": 303900 }, { "epoch": 1.127019552306312, "grad_norm": 0.7523824572563171, "learning_rate": 4.077003773530863e-05, "loss": 0.3081, "step": 304000 }, { "epoch": 1.1273902824222022, "grad_norm": 0.870893657207489, "learning_rate": 4.074113359797135e-05, "loss": 0.3166, "step": 304100 }, { "epoch": 1.1277610125380926, "grad_norm": 0.46500930190086365, "learning_rate": 4.071223266427104e-05, "loss": 0.3126, "step": 304200 }, { "epoch": 1.1281317426539827, "grad_norm": 0.9710444211959839, "learning_rate": 4.068333494420759e-05, "loss": 0.2847, "step": 304300 }, { "epoch": 1.128502472769873, "grad_norm": 1.0671918392181396, "learning_rate": 4.0654440447779886e-05, "loss": 0.3171, "step": 304400 }, { "epoch": 1.1288732028857633, "grad_norm": 0.4697538912296295, "learning_rate": 4.0625549184985585e-05, "loss": 0.2749, "step": 304500 }, { "epoch": 1.1292439330016535, "grad_norm": 1.2136576175689697, "learning_rate": 4.0596661165821306e-05, "loss": 0.2858, "step": 304600 }, { "epoch": 1.1296146631175437, "grad_norm": 0.22350743412971497, "learning_rate": 4.056777640028253e-05, "loss": 0.2897, "step": 304700 }, { "epoch": 1.1299853932334338, "grad_norm": 1.2903366088867188, "learning_rate": 4.053889489836359e-05, "loss": 0.2919, "step": 304800 }, { "epoch": 1.1303561233493242, "grad_norm": 0.4405432641506195, "learning_rate": 4.0510016670057685e-05, "loss": 0.2904, "step": 304900 }, { "epoch": 1.1307268534652144, "grad_norm": 1.1184744834899902, "learning_rate": 4.048114172535695e-05, "loss": 0.3112, "step": 305000 }, { "epoch": 1.1310975835811046, "grad_norm": 0.5487484335899353, "learning_rate": 4.045227007425228e-05, "loss": 0.2696, "step": 305100 }, { "epoch": 1.131468313696995, "grad_norm": 0.7549682259559631, "learning_rate": 4.0423401726733485e-05, "loss": 0.3021, "step": 305200 }, { "epoch": 1.1318390438128851, "grad_norm": 1.0360175371170044, "learning_rate": 4.039453669278924e-05, "loss": 0.2888, "step": 305300 }, { "epoch": 1.1322097739287753, "grad_norm": 0.5527519583702087, "learning_rate": 4.0365674982407075e-05, "loss": 0.287, "step": 305400 }, { "epoch": 1.1325805040446655, "grad_norm": 0.4944321811199188, "learning_rate": 4.0336816605573345e-05, "loss": 0.2949, "step": 305500 }, { "epoch": 1.1329512341605559, "grad_norm": 1.0714786052703857, "learning_rate": 4.030796157227327e-05, "loss": 0.3219, "step": 305600 }, { "epoch": 1.133321964276446, "grad_norm": 1.3309879302978516, "learning_rate": 4.027910989249091e-05, "loss": 0.2918, "step": 305700 }, { "epoch": 1.1336926943923362, "grad_norm": 0.7037000060081482, "learning_rate": 4.0250261576209134e-05, "loss": 0.3095, "step": 305800 }, { "epoch": 1.1340634245082266, "grad_norm": 0.6775796413421631, "learning_rate": 4.022141663340971e-05, "loss": 0.3081, "step": 305900 }, { "epoch": 1.1344341546241168, "grad_norm": 1.1025545597076416, "learning_rate": 4.0192575074073184e-05, "loss": 0.2818, "step": 306000 }, { "epoch": 1.134804884740007, "grad_norm": 0.8170901536941528, "learning_rate": 4.0163736908178956e-05, "loss": 0.3114, "step": 306100 }, { "epoch": 1.1351756148558971, "grad_norm": 0.6710562705993652, "learning_rate": 4.013490214570525e-05, "loss": 0.2973, "step": 306200 }, { "epoch": 1.1355463449717875, "grad_norm": 0.5549588799476624, "learning_rate": 4.01060707966291e-05, "loss": 0.3156, "step": 306300 }, { "epoch": 1.1359170750876777, "grad_norm": 0.7952961325645447, "learning_rate": 4.0077242870926376e-05, "loss": 0.288, "step": 306400 }, { "epoch": 1.1362878052035679, "grad_norm": 1.2263115644454956, "learning_rate": 4.004841837857178e-05, "loss": 0.3192, "step": 306500 }, { "epoch": 1.1366585353194583, "grad_norm": 0.4956980347633362, "learning_rate": 4.0019597329538755e-05, "loss": 0.2965, "step": 306600 }, { "epoch": 1.1370292654353484, "grad_norm": 0.29198580980300903, "learning_rate": 3.999077973379962e-05, "loss": 0.3315, "step": 306700 }, { "epoch": 1.1373999955512386, "grad_norm": 0.7778264284133911, "learning_rate": 3.9961965601325466e-05, "loss": 0.3062, "step": 306800 }, { "epoch": 1.1377707256671288, "grad_norm": 0.7475540637969971, "learning_rate": 3.993315494208623e-05, "loss": 0.2945, "step": 306900 }, { "epoch": 1.1381414557830192, "grad_norm": 0.4215249717235565, "learning_rate": 3.9904347766050586e-05, "loss": 0.2857, "step": 307000 }, { "epoch": 1.1385121858989093, "grad_norm": 0.9137313365936279, "learning_rate": 3.987554408318604e-05, "loss": 0.2941, "step": 307100 }, { "epoch": 1.1388829160147995, "grad_norm": 0.7133863568305969, "learning_rate": 3.984674390345889e-05, "loss": 0.2653, "step": 307200 }, { "epoch": 1.1392536461306897, "grad_norm": 1.0393520593643188, "learning_rate": 3.981794723683422e-05, "loss": 0.3084, "step": 307300 }, { "epoch": 1.13962437624658, "grad_norm": 0.6396570801734924, "learning_rate": 3.9789154093275874e-05, "loss": 0.2792, "step": 307400 }, { "epoch": 1.1399951063624703, "grad_norm": 0.6539766192436218, "learning_rate": 3.976036448274648e-05, "loss": 0.279, "step": 307500 }, { "epoch": 1.1403658364783604, "grad_norm": 0.43206748366355896, "learning_rate": 3.9731578415207484e-05, "loss": 0.3041, "step": 307600 }, { "epoch": 1.1407365665942506, "grad_norm": 0.8811590671539307, "learning_rate": 3.970279590061909e-05, "loss": 0.3025, "step": 307700 }, { "epoch": 1.141107296710141, "grad_norm": 0.353255033493042, "learning_rate": 3.967401694894024e-05, "loss": 0.2735, "step": 307800 }, { "epoch": 1.1414780268260312, "grad_norm": 0.7819467782974243, "learning_rate": 3.964524157012866e-05, "loss": 0.3036, "step": 307900 }, { "epoch": 1.1418487569419213, "grad_norm": 0.9634025692939758, "learning_rate": 3.961646977414089e-05, "loss": 0.317, "step": 308000 }, { "epoch": 1.1422194870578117, "grad_norm": 0.8227671384811401, "learning_rate": 3.9587701570932146e-05, "loss": 0.2796, "step": 308100 }, { "epoch": 1.142590217173702, "grad_norm": 0.606634259223938, "learning_rate": 3.955893697045644e-05, "loss": 0.2755, "step": 308200 }, { "epoch": 1.142960947289592, "grad_norm": 0.6149530410766602, "learning_rate": 3.953017598266655e-05, "loss": 0.2858, "step": 308300 }, { "epoch": 1.1433316774054822, "grad_norm": 0.9430896043777466, "learning_rate": 3.9501418617514005e-05, "loss": 0.2857, "step": 308400 }, { "epoch": 1.1437024075213726, "grad_norm": 1.136301040649414, "learning_rate": 3.947266488494904e-05, "loss": 0.2826, "step": 308500 }, { "epoch": 1.1440731376372628, "grad_norm": 0.9674627184867859, "learning_rate": 3.9443914794920684e-05, "loss": 0.3008, "step": 308600 }, { "epoch": 1.144443867753153, "grad_norm": 0.6253500580787659, "learning_rate": 3.9415168357376686e-05, "loss": 0.284, "step": 308700 }, { "epoch": 1.1448145978690434, "grad_norm": 0.916181206703186, "learning_rate": 3.938642558226353e-05, "loss": 0.2961, "step": 308800 }, { "epoch": 1.1451853279849336, "grad_norm": 1.0564873218536377, "learning_rate": 3.935768647952639e-05, "loss": 0.2741, "step": 308900 }, { "epoch": 1.1455560581008237, "grad_norm": 0.7715355157852173, "learning_rate": 3.932895105910925e-05, "loss": 0.3157, "step": 309000 }, { "epoch": 1.145926788216714, "grad_norm": 2.4274492263793945, "learning_rate": 3.930021933095477e-05, "loss": 0.3085, "step": 309100 }, { "epoch": 1.1462975183326043, "grad_norm": 0.5352848768234253, "learning_rate": 3.927149130500434e-05, "loss": 0.3231, "step": 309200 }, { "epoch": 1.1466682484484945, "grad_norm": 0.49896711111068726, "learning_rate": 3.9242766991198074e-05, "loss": 0.3242, "step": 309300 }, { "epoch": 1.1470389785643846, "grad_norm": 2.4483790397644043, "learning_rate": 3.9214046399474806e-05, "loss": 0.2825, "step": 309400 }, { "epoch": 1.147409708680275, "grad_norm": 0.6767386198043823, "learning_rate": 3.9185329539772076e-05, "loss": 0.2904, "step": 309500 }, { "epoch": 1.1477804387961652, "grad_norm": 0.8204732537269592, "learning_rate": 3.915661642202612e-05, "loss": 0.284, "step": 309600 }, { "epoch": 1.1481511689120554, "grad_norm": 1.3841530084609985, "learning_rate": 3.9127907056171885e-05, "loss": 0.3035, "step": 309700 }, { "epoch": 1.1485218990279455, "grad_norm": 0.8423539996147156, "learning_rate": 3.909920145214304e-05, "loss": 0.2694, "step": 309800 }, { "epoch": 1.148892629143836, "grad_norm": 0.5755272507667542, "learning_rate": 3.907049961987193e-05, "loss": 0.2989, "step": 309900 }, { "epoch": 1.1492633592597261, "grad_norm": 0.7492812275886536, "learning_rate": 3.904180156928962e-05, "loss": 0.2719, "step": 310000 }, { "epoch": 1.1496340893756163, "grad_norm": 1.6269514560699463, "learning_rate": 3.901310731032583e-05, "loss": 0.2908, "step": 310100 }, { "epoch": 1.1500048194915067, "grad_norm": 1.5295771360397339, "learning_rate": 3.8984416852909014e-05, "loss": 0.315, "step": 310200 }, { "epoch": 1.1503755496073969, "grad_norm": 0.234854057431221, "learning_rate": 3.8955730206966266e-05, "loss": 0.2609, "step": 310300 }, { "epoch": 1.150746279723287, "grad_norm": 0.40025994181632996, "learning_rate": 3.8927047382423365e-05, "loss": 0.3301, "step": 310400 }, { "epoch": 1.1511170098391772, "grad_norm": 0.6489104628562927, "learning_rate": 3.8898368389204794e-05, "loss": 0.3094, "step": 310500 }, { "epoch": 1.1514877399550676, "grad_norm": 0.6022928953170776, "learning_rate": 3.886969323723373e-05, "loss": 0.2844, "step": 310600 }, { "epoch": 1.1518584700709578, "grad_norm": 1.1191405057907104, "learning_rate": 3.884102193643194e-05, "loss": 0.3283, "step": 310700 }, { "epoch": 1.152229200186848, "grad_norm": 1.3130525350570679, "learning_rate": 3.8812354496719936e-05, "loss": 0.3251, "step": 310800 }, { "epoch": 1.1525999303027383, "grad_norm": 0.682442843914032, "learning_rate": 3.878369092801687e-05, "loss": 0.301, "step": 310900 }, { "epoch": 1.1529706604186285, "grad_norm": 0.8368661403656006, "learning_rate": 3.8755031240240535e-05, "loss": 0.2764, "step": 311000 }, { "epoch": 1.1533413905345187, "grad_norm": 0.8313582539558411, "learning_rate": 3.8726375443307426e-05, "loss": 0.292, "step": 311100 }, { "epoch": 1.1537121206504088, "grad_norm": 1.044495701789856, "learning_rate": 3.869772354713262e-05, "loss": 0.2745, "step": 311200 }, { "epoch": 1.1540828507662992, "grad_norm": 0.9500941634178162, "learning_rate": 3.866907556162991e-05, "loss": 0.2714, "step": 311300 }, { "epoch": 1.1544535808821894, "grad_norm": 0.7893732786178589, "learning_rate": 3.864043149671172e-05, "loss": 0.2904, "step": 311400 }, { "epoch": 1.1548243109980796, "grad_norm": 0.7226828336715698, "learning_rate": 3.8611791362289094e-05, "loss": 0.2871, "step": 311500 }, { "epoch": 1.1551950411139698, "grad_norm": 1.667790412902832, "learning_rate": 3.8583155168271745e-05, "loss": 0.2955, "step": 311600 }, { "epoch": 1.1555657712298602, "grad_norm": 0.7912875413894653, "learning_rate": 3.855452292456802e-05, "loss": 0.3226, "step": 311700 }, { "epoch": 1.1559365013457503, "grad_norm": 0.7970796823501587, "learning_rate": 3.852589464108487e-05, "loss": 0.292, "step": 311800 }, { "epoch": 1.1563072314616405, "grad_norm": 0.8096898794174194, "learning_rate": 3.849727032772789e-05, "loss": 0.2744, "step": 311900 }, { "epoch": 1.1566779615775307, "grad_norm": 0.9842033982276917, "learning_rate": 3.846864999440132e-05, "loss": 0.3031, "step": 312000 }, { "epoch": 1.157048691693421, "grad_norm": 1.115329623222351, "learning_rate": 3.8440033651008003e-05, "loss": 0.3139, "step": 312100 }, { "epoch": 1.1574194218093112, "grad_norm": 0.716185450553894, "learning_rate": 3.841142130744941e-05, "loss": 0.3144, "step": 312200 }, { "epoch": 1.1577901519252014, "grad_norm": 0.5564904808998108, "learning_rate": 3.838281297362563e-05, "loss": 0.3095, "step": 312300 }, { "epoch": 1.1581608820410918, "grad_norm": 0.39447125792503357, "learning_rate": 3.835420865943536e-05, "loss": 0.3644, "step": 312400 }, { "epoch": 1.158531612156982, "grad_norm": 0.9522678852081299, "learning_rate": 3.8325608374775894e-05, "loss": 0.3098, "step": 312500 }, { "epoch": 1.1589023422728721, "grad_norm": 0.876488208770752, "learning_rate": 3.8297012129543175e-05, "loss": 0.3119, "step": 312600 }, { "epoch": 1.1592730723887623, "grad_norm": 0.16932447254657745, "learning_rate": 3.826841993363167e-05, "loss": 0.2979, "step": 312700 }, { "epoch": 1.1596438025046527, "grad_norm": 0.5575760006904602, "learning_rate": 3.823983179693453e-05, "loss": 0.3012, "step": 312800 }, { "epoch": 1.1600145326205429, "grad_norm": 0.4712035357952118, "learning_rate": 3.8211247729343445e-05, "loss": 0.2735, "step": 312900 }, { "epoch": 1.160385262736433, "grad_norm": 1.0875294208526611, "learning_rate": 3.818266774074872e-05, "loss": 0.2912, "step": 313000 }, { "epoch": 1.1607559928523234, "grad_norm": 1.091566562652588, "learning_rate": 3.8154091841039264e-05, "loss": 0.3053, "step": 313100 }, { "epoch": 1.1611267229682136, "grad_norm": 0.6991491317749023, "learning_rate": 3.812552004010252e-05, "loss": 0.3126, "step": 313200 }, { "epoch": 1.1614974530841038, "grad_norm": 1.1680042743682861, "learning_rate": 3.809695234782456e-05, "loss": 0.2878, "step": 313300 }, { "epoch": 1.161868183199994, "grad_norm": 0.6867779493331909, "learning_rate": 3.806838877409004e-05, "loss": 0.2707, "step": 313400 }, { "epoch": 1.1622389133158844, "grad_norm": 0.9529985189437866, "learning_rate": 3.803982932878213e-05, "loss": 0.2977, "step": 313500 }, { "epoch": 1.1626096434317745, "grad_norm": 0.4093089699745178, "learning_rate": 3.801127402178263e-05, "loss": 0.2894, "step": 313600 }, { "epoch": 1.1629803735476647, "grad_norm": 0.7639794945716858, "learning_rate": 3.798272286297189e-05, "loss": 0.3025, "step": 313700 }, { "epoch": 1.163351103663555, "grad_norm": 1.0382874011993408, "learning_rate": 3.795417586222882e-05, "loss": 0.2711, "step": 313800 }, { "epoch": 1.1637218337794453, "grad_norm": 0.8922341465950012, "learning_rate": 3.792563302943091e-05, "loss": 0.2706, "step": 313900 }, { "epoch": 1.1640925638953354, "grad_norm": 0.5975860357284546, "learning_rate": 3.789709437445417e-05, "loss": 0.2992, "step": 314000 }, { "epoch": 1.1644632940112256, "grad_norm": 1.409071922302246, "learning_rate": 3.786855990717322e-05, "loss": 0.2799, "step": 314100 }, { "epoch": 1.164834024127116, "grad_norm": 0.9604749083518982, "learning_rate": 3.784002963746116e-05, "loss": 0.2829, "step": 314200 }, { "epoch": 1.1652047542430062, "grad_norm": 0.6955172419548035, "learning_rate": 3.7811503575189714e-05, "loss": 0.2836, "step": 314300 }, { "epoch": 1.1655754843588964, "grad_norm": 0.6324862241744995, "learning_rate": 3.7782981730229084e-05, "loss": 0.2897, "step": 314400 }, { "epoch": 1.1659462144747867, "grad_norm": 0.48825183510780334, "learning_rate": 3.775446411244804e-05, "loss": 0.2882, "step": 314500 }, { "epoch": 1.166316944590677, "grad_norm": 0.5258966684341431, "learning_rate": 3.772595073171392e-05, "loss": 0.3007, "step": 314600 }, { "epoch": 1.166687674706567, "grad_norm": 0.5105317234992981, "learning_rate": 3.7697441597892546e-05, "loss": 0.2791, "step": 314700 }, { "epoch": 1.1670584048224573, "grad_norm": 0.8947299122810364, "learning_rate": 3.766893672084829e-05, "loss": 0.2998, "step": 314800 }, { "epoch": 1.1674291349383477, "grad_norm": 0.8342703580856323, "learning_rate": 3.7640436110444075e-05, "loss": 0.3014, "step": 314900 }, { "epoch": 1.1677998650542378, "grad_norm": 1.752206802368164, "learning_rate": 3.76119397765413e-05, "loss": 0.2992, "step": 315000 }, { "epoch": 1.168170595170128, "grad_norm": 1.0185868740081787, "learning_rate": 3.758344772899991e-05, "loss": 0.2914, "step": 315100 }, { "epoch": 1.1685413252860184, "grad_norm": 1.1229710578918457, "learning_rate": 3.7554959977678364e-05, "loss": 0.2894, "step": 315200 }, { "epoch": 1.1689120554019086, "grad_norm": 0.8136422634124756, "learning_rate": 3.7526476532433664e-05, "loss": 0.2706, "step": 315300 }, { "epoch": 1.1692827855177987, "grad_norm": 0.4602573812007904, "learning_rate": 3.7497997403121266e-05, "loss": 0.293, "step": 315400 }, { "epoch": 1.169653515633689, "grad_norm": 1.238317847251892, "learning_rate": 3.7469522599595166e-05, "loss": 0.2974, "step": 315500 }, { "epoch": 1.1700242457495793, "grad_norm": 0.19252151250839233, "learning_rate": 3.7441052131707896e-05, "loss": 0.3163, "step": 315600 }, { "epoch": 1.1703949758654695, "grad_norm": 0.49765267968177795, "learning_rate": 3.741258600931039e-05, "loss": 0.2987, "step": 315700 }, { "epoch": 1.1707657059813597, "grad_norm": 0.6170503497123718, "learning_rate": 3.738412424225218e-05, "loss": 0.2794, "step": 315800 }, { "epoch": 1.17113643609725, "grad_norm": 1.0731086730957031, "learning_rate": 3.735566684038123e-05, "loss": 0.2911, "step": 315900 }, { "epoch": 1.1715071662131402, "grad_norm": 0.8744787573814392, "learning_rate": 3.732721381354403e-05, "loss": 0.3038, "step": 316000 }, { "epoch": 1.1718778963290304, "grad_norm": 1.4278892278671265, "learning_rate": 3.729876517158554e-05, "loss": 0.2907, "step": 316100 }, { "epoch": 1.1722486264449206, "grad_norm": 0.6319417953491211, "learning_rate": 3.7270320924349175e-05, "loss": 0.2916, "step": 316200 }, { "epoch": 1.172619356560811, "grad_norm": 0.24956731498241425, "learning_rate": 3.724188108167689e-05, "loss": 0.2858, "step": 316300 }, { "epoch": 1.1729900866767011, "grad_norm": 0.5438283681869507, "learning_rate": 3.7213445653409084e-05, "loss": 0.2967, "step": 316400 }, { "epoch": 1.1733608167925913, "grad_norm": 0.7030737400054932, "learning_rate": 3.71850146493846e-05, "loss": 0.3165, "step": 316500 }, { "epoch": 1.1737315469084815, "grad_norm": 0.8833681344985962, "learning_rate": 3.715658807944079e-05, "loss": 0.299, "step": 316600 }, { "epoch": 1.1741022770243719, "grad_norm": 0.8890489935874939, "learning_rate": 3.7128165953413454e-05, "loss": 0.3013, "step": 316700 }, { "epoch": 1.174473007140262, "grad_norm": 0.6286076903343201, "learning_rate": 3.709974828113688e-05, "loss": 0.254, "step": 316800 }, { "epoch": 1.1748437372561522, "grad_norm": 0.7439261674880981, "learning_rate": 3.7071335072443775e-05, "loss": 0.2744, "step": 316900 }, { "epoch": 1.1752144673720424, "grad_norm": 0.8445565104484558, "learning_rate": 3.704292633716533e-05, "loss": 0.2984, "step": 317000 }, { "epoch": 1.1755851974879328, "grad_norm": 0.512174665927887, "learning_rate": 3.701452208513118e-05, "loss": 0.2791, "step": 317100 }, { "epoch": 1.175955927603823, "grad_norm": 0.336786150932312, "learning_rate": 3.698612232616941e-05, "loss": 0.2799, "step": 317200 }, { "epoch": 1.1763266577197131, "grad_norm": 0.4217027723789215, "learning_rate": 3.695772707010655e-05, "loss": 0.2671, "step": 317300 }, { "epoch": 1.1766973878356035, "grad_norm": 0.7107738852500916, "learning_rate": 3.692933632676755e-05, "loss": 0.308, "step": 317400 }, { "epoch": 1.1770681179514937, "grad_norm": 0.6185529232025146, "learning_rate": 3.690095010597584e-05, "loss": 0.3199, "step": 317500 }, { "epoch": 1.1774388480673839, "grad_norm": 0.946199357509613, "learning_rate": 3.687256841755327e-05, "loss": 0.3163, "step": 317600 }, { "epoch": 1.177809578183274, "grad_norm": 0.5231097340583801, "learning_rate": 3.6844191271320093e-05, "loss": 0.3126, "step": 317700 }, { "epoch": 1.1781803082991644, "grad_norm": 1.0086960792541504, "learning_rate": 3.681581867709503e-05, "loss": 0.285, "step": 317800 }, { "epoch": 1.1785510384150546, "grad_norm": 0.32791683077812195, "learning_rate": 3.6787450644695224e-05, "loss": 0.2953, "step": 317900 }, { "epoch": 1.1789217685309448, "grad_norm": 0.4171833395957947, "learning_rate": 3.67590871839362e-05, "loss": 0.3206, "step": 318000 }, { "epoch": 1.1792924986468352, "grad_norm": 0.28946393728256226, "learning_rate": 3.673072830463194e-05, "loss": 0.3028, "step": 318100 }, { "epoch": 1.1796632287627253, "grad_norm": 0.9751963019371033, "learning_rate": 3.670237401659481e-05, "loss": 0.3155, "step": 318200 }, { "epoch": 1.1800339588786155, "grad_norm": 0.3482624590396881, "learning_rate": 3.667402432963563e-05, "loss": 0.2811, "step": 318300 }, { "epoch": 1.1804046889945057, "grad_norm": 0.563030481338501, "learning_rate": 3.664567925356359e-05, "loss": 0.3163, "step": 318400 }, { "epoch": 1.180775419110396, "grad_norm": 0.36036673188209534, "learning_rate": 3.6617338798186295e-05, "loss": 0.3057, "step": 318500 }, { "epoch": 1.1811461492262862, "grad_norm": 0.5289510488510132, "learning_rate": 3.658900297330978e-05, "loss": 0.2901, "step": 318600 }, { "epoch": 1.1815168793421764, "grad_norm": 0.7156248092651367, "learning_rate": 3.6560671788738444e-05, "loss": 0.316, "step": 318700 }, { "epoch": 1.1818876094580668, "grad_norm": 0.9434233903884888, "learning_rate": 3.6532345254275055e-05, "loss": 0.2754, "step": 318800 }, { "epoch": 1.182258339573957, "grad_norm": 1.38754141330719, "learning_rate": 3.650402337972082e-05, "loss": 0.3042, "step": 318900 }, { "epoch": 1.1826290696898472, "grad_norm": 0.8475192189216614, "learning_rate": 3.647570617487535e-05, "loss": 0.2945, "step": 319000 }, { "epoch": 1.1829997998057373, "grad_norm": 0.4481724798679352, "learning_rate": 3.644739364953658e-05, "loss": 0.2888, "step": 319100 }, { "epoch": 1.1833705299216277, "grad_norm": 0.8915621042251587, "learning_rate": 3.6419085813500855e-05, "loss": 0.3229, "step": 319200 }, { "epoch": 1.183741260037518, "grad_norm": 0.36987295746803284, "learning_rate": 3.639078267656291e-05, "loss": 0.271, "step": 319300 }, { "epoch": 1.184111990153408, "grad_norm": 1.3728951215744019, "learning_rate": 3.636248424851584e-05, "loss": 0.3116, "step": 319400 }, { "epoch": 1.1844827202692985, "grad_norm": 0.5537104606628418, "learning_rate": 3.633419053915112e-05, "loss": 0.2889, "step": 319500 }, { "epoch": 1.1848534503851886, "grad_norm": 0.6035724878311157, "learning_rate": 3.630590155825856e-05, "loss": 0.3135, "step": 319600 }, { "epoch": 1.1852241805010788, "grad_norm": 1.0421808958053589, "learning_rate": 3.627761731562637e-05, "loss": 0.2869, "step": 319700 }, { "epoch": 1.185594910616969, "grad_norm": 0.4305557310581207, "learning_rate": 3.624933782104112e-05, "loss": 0.3049, "step": 319800 }, { "epoch": 1.1859656407328594, "grad_norm": 0.4891347885131836, "learning_rate": 3.622106308428771e-05, "loss": 0.2766, "step": 319900 }, { "epoch": 1.1863363708487495, "grad_norm": 0.61009281873703, "learning_rate": 3.6192793115149424e-05, "loss": 0.2894, "step": 320000 }, { "epoch": 1.1867071009646397, "grad_norm": 0.347599059343338, "learning_rate": 3.6164527923407885e-05, "loss": 0.2984, "step": 320100 }, { "epoch": 1.18707783108053, "grad_norm": 0.6945781707763672, "learning_rate": 3.613626751884307e-05, "loss": 0.2906, "step": 320200 }, { "epoch": 1.1874485611964203, "grad_norm": 0.44343534111976624, "learning_rate": 3.6108011911233244e-05, "loss": 0.3189, "step": 320300 }, { "epoch": 1.1878192913123105, "grad_norm": 0.7170314788818359, "learning_rate": 3.607976111035509e-05, "loss": 0.2852, "step": 320400 }, { "epoch": 1.1881900214282006, "grad_norm": 0.5434401035308838, "learning_rate": 3.60515151259836e-05, "loss": 0.3085, "step": 320500 }, { "epoch": 1.188560751544091, "grad_norm": 0.976170539855957, "learning_rate": 3.602327396789209e-05, "loss": 0.267, "step": 320600 }, { "epoch": 1.1889314816599812, "grad_norm": 0.6890117526054382, "learning_rate": 3.5995037645852205e-05, "loss": 0.3016, "step": 320700 }, { "epoch": 1.1893022117758714, "grad_norm": 0.6361458897590637, "learning_rate": 3.596680616963394e-05, "loss": 0.3407, "step": 320800 }, { "epoch": 1.1896729418917618, "grad_norm": 0.5332069993019104, "learning_rate": 3.593857954900557e-05, "loss": 0.3121, "step": 320900 }, { "epoch": 1.190043672007652, "grad_norm": 0.8180859684944153, "learning_rate": 3.591035779373376e-05, "loss": 0.2791, "step": 321000 }, { "epoch": 1.190414402123542, "grad_norm": 0.7341098189353943, "learning_rate": 3.58821409135834e-05, "loss": 0.3313, "step": 321100 }, { "epoch": 1.1907851322394323, "grad_norm": 1.106126308441162, "learning_rate": 3.585392891831777e-05, "loss": 0.282, "step": 321200 }, { "epoch": 1.1911558623553224, "grad_norm": 0.5699649453163147, "learning_rate": 3.582572181769842e-05, "loss": 0.2911, "step": 321300 }, { "epoch": 1.1915265924712128, "grad_norm": 0.8061725497245789, "learning_rate": 3.579751962148521e-05, "loss": 0.3116, "step": 321400 }, { "epoch": 1.191897322587103, "grad_norm": 0.9350426197052002, "learning_rate": 3.576932233943635e-05, "loss": 0.2833, "step": 321500 }, { "epoch": 1.1922680527029932, "grad_norm": 1.159969687461853, "learning_rate": 3.574112998130825e-05, "loss": 0.3199, "step": 321600 }, { "epoch": 1.1926387828188836, "grad_norm": 0.8622734546661377, "learning_rate": 3.5712942556855746e-05, "loss": 0.2578, "step": 321700 }, { "epoch": 1.1930095129347738, "grad_norm": 1.3241548538208008, "learning_rate": 3.5684760075831836e-05, "loss": 0.3075, "step": 321800 }, { "epoch": 1.193380243050664, "grad_norm": 0.46996235847473145, "learning_rate": 3.565658254798788e-05, "loss": 0.3062, "step": 321900 }, { "epoch": 1.193750973166554, "grad_norm": 0.40048637986183167, "learning_rate": 3.562840998307355e-05, "loss": 0.2411, "step": 322000 }, { "epoch": 1.1941217032824445, "grad_norm": 0.5478109121322632, "learning_rate": 3.560024239083672e-05, "loss": 0.3202, "step": 322100 }, { "epoch": 1.1944924333983347, "grad_norm": 1.0975698232650757, "learning_rate": 3.557207978102362e-05, "loss": 0.2977, "step": 322200 }, { "epoch": 1.1948631635142248, "grad_norm": 0.26537537574768066, "learning_rate": 3.554392216337872e-05, "loss": 0.2975, "step": 322300 }, { "epoch": 1.1952338936301152, "grad_norm": 0.8443964123725891, "learning_rate": 3.551576954764474e-05, "loss": 0.2781, "step": 322400 }, { "epoch": 1.1956046237460054, "grad_norm": 0.8928636908531189, "learning_rate": 3.548762194356273e-05, "loss": 0.2877, "step": 322500 }, { "epoch": 1.1959753538618956, "grad_norm": 0.25968682765960693, "learning_rate": 3.545947936087194e-05, "loss": 0.2895, "step": 322600 }, { "epoch": 1.1963460839777857, "grad_norm": 0.9225914478302002, "learning_rate": 3.5431341809309935e-05, "loss": 0.293, "step": 322700 }, { "epoch": 1.1967168140936761, "grad_norm": 1.1563149690628052, "learning_rate": 3.540320929861249e-05, "loss": 0.3022, "step": 322800 }, { "epoch": 1.1970875442095663, "grad_norm": 0.7266859412193298, "learning_rate": 3.53750818385137e-05, "loss": 0.2898, "step": 322900 }, { "epoch": 1.1974582743254565, "grad_norm": 0.7514382600784302, "learning_rate": 3.534695943874586e-05, "loss": 0.2837, "step": 323000 }, { "epoch": 1.1978290044413469, "grad_norm": 0.844694197177887, "learning_rate": 3.531884210903953e-05, "loss": 0.2848, "step": 323100 }, { "epoch": 1.198199734557237, "grad_norm": 0.6584658622741699, "learning_rate": 3.529072985912352e-05, "loss": 0.2847, "step": 323200 }, { "epoch": 1.1985704646731272, "grad_norm": 1.5621814727783203, "learning_rate": 3.526262269872488e-05, "loss": 0.2688, "step": 323300 }, { "epoch": 1.1989411947890174, "grad_norm": 0.8096466064453125, "learning_rate": 3.523452063756889e-05, "loss": 0.2808, "step": 323400 }, { "epoch": 1.1993119249049078, "grad_norm": 0.7810752391815186, "learning_rate": 3.520642368537907e-05, "loss": 0.3156, "step": 323500 }, { "epoch": 1.199682655020798, "grad_norm": 0.8722066283226013, "learning_rate": 3.517833185187719e-05, "loss": 0.2869, "step": 323600 }, { "epoch": 1.2000533851366881, "grad_norm": 1.304978609085083, "learning_rate": 3.515024514678321e-05, "loss": 0.2709, "step": 323700 }, { "epoch": 1.2004241152525785, "grad_norm": 0.6331710815429688, "learning_rate": 3.512216357981538e-05, "loss": 0.2903, "step": 323800 }, { "epoch": 1.2007948453684687, "grad_norm": 0.7791673541069031, "learning_rate": 3.509408716069009e-05, "loss": 0.2789, "step": 323900 }, { "epoch": 1.2011655754843589, "grad_norm": 1.378676414489746, "learning_rate": 3.506601589912203e-05, "loss": 0.3045, "step": 324000 }, { "epoch": 1.201536305600249, "grad_norm": 0.7027153968811035, "learning_rate": 3.503794980482402e-05, "loss": 0.2974, "step": 324100 }, { "epoch": 1.2019070357161394, "grad_norm": 0.7143474817276001, "learning_rate": 3.500988888750718e-05, "loss": 0.2942, "step": 324200 }, { "epoch": 1.2022777658320296, "grad_norm": 0.64955073595047, "learning_rate": 3.498183315688077e-05, "loss": 0.3172, "step": 324300 }, { "epoch": 1.2026484959479198, "grad_norm": 0.9181506633758545, "learning_rate": 3.495378262265229e-05, "loss": 0.2901, "step": 324400 }, { "epoch": 1.2030192260638102, "grad_norm": 0.6267714500427246, "learning_rate": 3.492573729452745e-05, "loss": 0.2752, "step": 324500 }, { "epoch": 1.2033899561797003, "grad_norm": 0.8254978060722351, "learning_rate": 3.489769718221011e-05, "loss": 0.2725, "step": 324600 }, { "epoch": 1.2037606862955905, "grad_norm": 0.46210891008377075, "learning_rate": 3.4869662295402387e-05, "loss": 0.3042, "step": 324700 }, { "epoch": 1.2041314164114807, "grad_norm": 0.6721453666687012, "learning_rate": 3.484163264380458e-05, "loss": 0.2978, "step": 324800 }, { "epoch": 1.204502146527371, "grad_norm": 1.2176671028137207, "learning_rate": 3.481360823711511e-05, "loss": 0.2819, "step": 324900 }, { "epoch": 1.2048728766432613, "grad_norm": 1.4358655214309692, "learning_rate": 3.478558908503065e-05, "loss": 0.2928, "step": 325000 }, { "epoch": 1.2052436067591514, "grad_norm": 0.6040534377098083, "learning_rate": 3.475757519724603e-05, "loss": 0.271, "step": 325100 }, { "epoch": 1.2056143368750418, "grad_norm": 0.7980223894119263, "learning_rate": 3.472956658345428e-05, "loss": 0.2803, "step": 325200 }, { "epoch": 1.205985066990932, "grad_norm": 0.6284632682800293, "learning_rate": 3.470156325334658e-05, "loss": 0.2937, "step": 325300 }, { "epoch": 1.2063557971068222, "grad_norm": 1.7421720027923584, "learning_rate": 3.467356521661229e-05, "loss": 0.315, "step": 325400 }, { "epoch": 1.2067265272227123, "grad_norm": 0.7833635210990906, "learning_rate": 3.464557248293895e-05, "loss": 0.2917, "step": 325500 }, { "epoch": 1.2070972573386027, "grad_norm": 0.7289316058158875, "learning_rate": 3.461758506201225e-05, "loss": 0.2835, "step": 325600 }, { "epoch": 1.207467987454493, "grad_norm": 0.6287752985954285, "learning_rate": 3.458960296351604e-05, "loss": 0.2985, "step": 325700 }, { "epoch": 1.207838717570383, "grad_norm": 0.8567129969596863, "learning_rate": 3.456162619713234e-05, "loss": 0.2888, "step": 325800 }, { "epoch": 1.2082094476862733, "grad_norm": 0.3850800395011902, "learning_rate": 3.453365477254131e-05, "loss": 0.3184, "step": 325900 }, { "epoch": 1.2085801778021636, "grad_norm": 0.5888515710830688, "learning_rate": 3.45056886994213e-05, "loss": 0.3166, "step": 326000 }, { "epoch": 1.2089509079180538, "grad_norm": 0.6543995141983032, "learning_rate": 3.4477727987448754e-05, "loss": 0.2761, "step": 326100 }, { "epoch": 1.209321638033944, "grad_norm": 1.430224061012268, "learning_rate": 3.4449772646298296e-05, "loss": 0.2911, "step": 326200 }, { "epoch": 1.2096923681498342, "grad_norm": 0.5956946015357971, "learning_rate": 3.4421822685642694e-05, "loss": 0.2947, "step": 326300 }, { "epoch": 1.2100630982657246, "grad_norm": 0.6737325191497803, "learning_rate": 3.439387811515283e-05, "loss": 0.2935, "step": 326400 }, { "epoch": 1.2104338283816147, "grad_norm": 0.8973527550697327, "learning_rate": 3.436593894449772e-05, "loss": 0.3257, "step": 326500 }, { "epoch": 1.210804558497505, "grad_norm": 0.5406954288482666, "learning_rate": 3.4338005183344546e-05, "loss": 0.2821, "step": 326600 }, { "epoch": 1.2111752886133953, "grad_norm": 1.10105299949646, "learning_rate": 3.4310076841358595e-05, "loss": 0.2548, "step": 326700 }, { "epoch": 1.2115460187292855, "grad_norm": 0.5532819628715515, "learning_rate": 3.428215392820328e-05, "loss": 0.2822, "step": 326800 }, { "epoch": 1.2119167488451756, "grad_norm": 0.991688072681427, "learning_rate": 3.425423645354013e-05, "loss": 0.2809, "step": 326900 }, { "epoch": 1.2122874789610658, "grad_norm": 0.9968979954719543, "learning_rate": 3.4226324427028835e-05, "loss": 0.3372, "step": 327000 }, { "epoch": 1.2126582090769562, "grad_norm": 1.394853949546814, "learning_rate": 3.4198417858327136e-05, "loss": 0.3043, "step": 327100 }, { "epoch": 1.2130289391928464, "grad_norm": 0.5711827278137207, "learning_rate": 3.4170516757090894e-05, "loss": 0.2887, "step": 327200 }, { "epoch": 1.2133996693087366, "grad_norm": 0.6787301898002625, "learning_rate": 3.414262113297412e-05, "loss": 0.2914, "step": 327300 }, { "epoch": 1.213770399424627, "grad_norm": 0.8415539860725403, "learning_rate": 3.411473099562892e-05, "loss": 0.2756, "step": 327400 }, { "epoch": 1.2141411295405171, "grad_norm": 0.7247799038887024, "learning_rate": 3.4086846354705475e-05, "loss": 0.339, "step": 327500 }, { "epoch": 1.2145118596564073, "grad_norm": 0.8776405453681946, "learning_rate": 3.405896721985208e-05, "loss": 0.2642, "step": 327600 }, { "epoch": 1.2148825897722975, "grad_norm": 0.556689441204071, "learning_rate": 3.403109360071514e-05, "loss": 0.2951, "step": 327700 }, { "epoch": 1.2152533198881879, "grad_norm": 0.8359660506248474, "learning_rate": 3.400322550693911e-05, "loss": 0.2813, "step": 327800 }, { "epoch": 1.215624050004078, "grad_norm": 1.1281981468200684, "learning_rate": 3.3975362948166577e-05, "loss": 0.3049, "step": 327900 }, { "epoch": 1.2159947801199682, "grad_norm": 0.776417076587677, "learning_rate": 3.394750593403818e-05, "loss": 0.2764, "step": 328000 }, { "epoch": 1.2163655102358586, "grad_norm": 0.6139997839927673, "learning_rate": 3.391965447419267e-05, "loss": 0.2949, "step": 328100 }, { "epoch": 1.2167362403517488, "grad_norm": 0.8623173236846924, "learning_rate": 3.3891808578266854e-05, "loss": 0.2848, "step": 328200 }, { "epoch": 1.217106970467639, "grad_norm": 1.0468287467956543, "learning_rate": 3.3863968255895605e-05, "loss": 0.2985, "step": 328300 }, { "epoch": 1.217477700583529, "grad_norm": 1.0114657878875732, "learning_rate": 3.383613351671191e-05, "loss": 0.2743, "step": 328400 }, { "epoch": 1.2178484306994195, "grad_norm": 1.0319418907165527, "learning_rate": 3.3808304370346776e-05, "loss": 0.316, "step": 328500 }, { "epoch": 1.2182191608153097, "grad_norm": 0.37671375274658203, "learning_rate": 3.378048082642933e-05, "loss": 0.2936, "step": 328600 }, { "epoch": 1.2185898909311998, "grad_norm": 0.6055976748466492, "learning_rate": 3.375266289458667e-05, "loss": 0.309, "step": 328700 }, { "epoch": 1.2189606210470902, "grad_norm": 1.2362816333770752, "learning_rate": 3.3724850584444036e-05, "loss": 0.2893, "step": 328800 }, { "epoch": 1.2193313511629804, "grad_norm": 1.2564811706542969, "learning_rate": 3.36970439056247e-05, "loss": 0.3067, "step": 328900 }, { "epoch": 1.2197020812788706, "grad_norm": 0.3186763823032379, "learning_rate": 3.366924286774998e-05, "loss": 0.3087, "step": 329000 }, { "epoch": 1.2200728113947608, "grad_norm": 1.3609800338745117, "learning_rate": 3.364144748043923e-05, "loss": 0.2935, "step": 329100 }, { "epoch": 1.2204435415106512, "grad_norm": 0.6055441498756409, "learning_rate": 3.3613657753309885e-05, "loss": 0.2552, "step": 329200 }, { "epoch": 1.2208142716265413, "grad_norm": 1.0364692211151123, "learning_rate": 3.358587369597738e-05, "loss": 0.2667, "step": 329300 }, { "epoch": 1.2211850017424315, "grad_norm": 0.8302369713783264, "learning_rate": 3.355809531805521e-05, "loss": 0.2985, "step": 329400 }, { "epoch": 1.221555731858322, "grad_norm": 0.4156440496444702, "learning_rate": 3.3530322629154904e-05, "loss": 0.2923, "step": 329500 }, { "epoch": 1.221926461974212, "grad_norm": 0.5626766681671143, "learning_rate": 3.3502555638886015e-05, "loss": 0.2675, "step": 329600 }, { "epoch": 1.2222971920901022, "grad_norm": 0.5444937944412231, "learning_rate": 3.347479435685613e-05, "loss": 0.2888, "step": 329700 }, { "epoch": 1.2226679222059924, "grad_norm": 1.6904546022415161, "learning_rate": 3.344703879267086e-05, "loss": 0.2985, "step": 329800 }, { "epoch": 1.2230386523218828, "grad_norm": 0.6921955943107605, "learning_rate": 3.341928895593383e-05, "loss": 0.269, "step": 329900 }, { "epoch": 1.223409382437773, "grad_norm": 0.4625169336795807, "learning_rate": 3.339154485624673e-05, "loss": 0.2938, "step": 330000 }, { "epoch": 1.2237801125536631, "grad_norm": 0.419898122549057, "learning_rate": 3.336380650320919e-05, "loss": 0.3362, "step": 330100 }, { "epoch": 1.2241508426695535, "grad_norm": 0.5732890963554382, "learning_rate": 3.333607390641889e-05, "loss": 0.2656, "step": 330200 }, { "epoch": 1.2245215727854437, "grad_norm": 0.7199653387069702, "learning_rate": 3.33083470754715e-05, "loss": 0.2879, "step": 330300 }, { "epoch": 1.2248923029013339, "grad_norm": 0.7143241763114929, "learning_rate": 3.328062601996075e-05, "loss": 0.2643, "step": 330400 }, { "epoch": 1.225263033017224, "grad_norm": 1.1887190341949463, "learning_rate": 3.3252910749478295e-05, "loss": 0.2777, "step": 330500 }, { "epoch": 1.2256337631331145, "grad_norm": 0.25690460205078125, "learning_rate": 3.3225201273613845e-05, "loss": 0.2822, "step": 330600 }, { "epoch": 1.2260044932490046, "grad_norm": 0.8591539263725281, "learning_rate": 3.3197497601955104e-05, "loss": 0.2595, "step": 330700 }, { "epoch": 1.2263752233648948, "grad_norm": 0.6982511281967163, "learning_rate": 3.3169799744087706e-05, "loss": 0.2811, "step": 330800 }, { "epoch": 1.226745953480785, "grad_norm": 0.7081834673881531, "learning_rate": 3.314210770959537e-05, "loss": 0.2875, "step": 330900 }, { "epoch": 1.2271166835966754, "grad_norm": 0.8014085292816162, "learning_rate": 3.311442150805969e-05, "loss": 0.2814, "step": 331000 }, { "epoch": 1.2274874137125655, "grad_norm": 0.8321959376335144, "learning_rate": 3.308674114906034e-05, "loss": 0.2984, "step": 331100 }, { "epoch": 1.2278581438284557, "grad_norm": 0.5394162535667419, "learning_rate": 3.3059066642174895e-05, "loss": 0.279, "step": 331200 }, { "epoch": 1.2282288739443459, "grad_norm": 1.3413299322128296, "learning_rate": 3.303139799697898e-05, "loss": 0.3103, "step": 331300 }, { "epoch": 1.2285996040602363, "grad_norm": 0.8135436773300171, "learning_rate": 3.300373522304613e-05, "loss": 0.2933, "step": 331400 }, { "epoch": 1.2289703341761264, "grad_norm": 0.5948440432548523, "learning_rate": 3.297607832994787e-05, "loss": 0.2916, "step": 331500 }, { "epoch": 1.2293410642920166, "grad_norm": 0.6822120547294617, "learning_rate": 3.2948427327253705e-05, "loss": 0.303, "step": 331600 }, { "epoch": 1.229711794407907, "grad_norm": 0.5508038401603699, "learning_rate": 3.2920782224531096e-05, "loss": 0.2866, "step": 331700 }, { "epoch": 1.2300825245237972, "grad_norm": 0.6446106433868408, "learning_rate": 3.289314303134542e-05, "loss": 0.3193, "step": 331800 }, { "epoch": 1.2304532546396874, "grad_norm": 1.3407903909683228, "learning_rate": 3.286550975726008e-05, "loss": 0.2999, "step": 331900 }, { "epoch": 1.2308239847555775, "grad_norm": 0.5730645656585693, "learning_rate": 3.283788241183637e-05, "loss": 0.2866, "step": 332000 }, { "epoch": 1.231194714871468, "grad_norm": 0.859276533126831, "learning_rate": 3.281026100463356e-05, "loss": 0.2849, "step": 332100 }, { "epoch": 1.231565444987358, "grad_norm": 1.4643527269363403, "learning_rate": 3.278264554520889e-05, "loss": 0.2899, "step": 332200 }, { "epoch": 1.2319361751032483, "grad_norm": 0.9891154170036316, "learning_rate": 3.2755036043117486e-05, "loss": 0.297, "step": 332300 }, { "epoch": 1.2323069052191387, "grad_norm": 0.44582685828208923, "learning_rate": 3.2727432507912474e-05, "loss": 0.2698, "step": 332400 }, { "epoch": 1.2326776353350288, "grad_norm": 0.7786964774131775, "learning_rate": 3.2699834949144844e-05, "loss": 0.3098, "step": 332500 }, { "epoch": 1.233048365450919, "grad_norm": 1.6788084506988525, "learning_rate": 3.267224337636358e-05, "loss": 0.2893, "step": 332600 }, { "epoch": 1.2334190955668092, "grad_norm": 1.6093376874923706, "learning_rate": 3.264465779911556e-05, "loss": 0.3172, "step": 332700 }, { "epoch": 1.2337898256826996, "grad_norm": 0.3671197295188904, "learning_rate": 3.2617078226945615e-05, "loss": 0.2879, "step": 332800 }, { "epoch": 1.2341605557985897, "grad_norm": 0.5765570998191833, "learning_rate": 3.258950466939648e-05, "loss": 0.2948, "step": 332900 }, { "epoch": 1.23453128591448, "grad_norm": 0.6689721345901489, "learning_rate": 3.25619371360088e-05, "loss": 0.2749, "step": 333000 }, { "epoch": 1.2349020160303703, "grad_norm": 0.7708108425140381, "learning_rate": 3.253437563632115e-05, "loss": 0.265, "step": 333100 }, { "epoch": 1.2352727461462605, "grad_norm": 0.33011069893836975, "learning_rate": 3.250682017987005e-05, "loss": 0.2812, "step": 333200 }, { "epoch": 1.2356434762621507, "grad_norm": 0.6556157469749451, "learning_rate": 3.2479270776189846e-05, "loss": 0.2793, "step": 333300 }, { "epoch": 1.2360142063780408, "grad_norm": 1.187172770500183, "learning_rate": 3.245172743481285e-05, "loss": 0.3005, "step": 333400 }, { "epoch": 1.2363849364939312, "grad_norm": 1.147513508796692, "learning_rate": 3.242419016526927e-05, "loss": 0.3313, "step": 333500 }, { "epoch": 1.2367556666098214, "grad_norm": 0.7271843552589417, "learning_rate": 3.239665897708721e-05, "loss": 0.2807, "step": 333600 }, { "epoch": 1.2371263967257116, "grad_norm": 1.1557551622390747, "learning_rate": 3.2369133879792656e-05, "loss": 0.3113, "step": 333700 }, { "epoch": 1.237497126841602, "grad_norm": 0.9429115653038025, "learning_rate": 3.234161488290951e-05, "loss": 0.2742, "step": 333800 }, { "epoch": 1.2378678569574921, "grad_norm": 0.6592848300933838, "learning_rate": 3.2314101995959566e-05, "loss": 0.2733, "step": 333900 }, { "epoch": 1.2382385870733823, "grad_norm": 0.6802929639816284, "learning_rate": 3.228659522846244e-05, "loss": 0.3048, "step": 334000 }, { "epoch": 1.2386093171892725, "grad_norm": 0.905689537525177, "learning_rate": 3.225909458993572e-05, "loss": 0.298, "step": 334100 }, { "epoch": 1.2389800473051629, "grad_norm": 0.5612341165542603, "learning_rate": 3.223160008989481e-05, "loss": 0.3167, "step": 334200 }, { "epoch": 1.239350777421053, "grad_norm": 0.5202826857566833, "learning_rate": 3.2204111737853025e-05, "loss": 0.3023, "step": 334300 }, { "epoch": 1.2397215075369432, "grad_norm": 1.8481545448303223, "learning_rate": 3.2176629543321544e-05, "loss": 0.2973, "step": 334400 }, { "epoch": 1.2400922376528336, "grad_norm": 1.5184437036514282, "learning_rate": 3.214915351580939e-05, "loss": 0.3099, "step": 334500 }, { "epoch": 1.2404629677687238, "grad_norm": 0.8446977138519287, "learning_rate": 3.21216836648235e-05, "loss": 0.291, "step": 334600 }, { "epoch": 1.240833697884614, "grad_norm": 0.5614951848983765, "learning_rate": 3.2094219999868645e-05, "loss": 0.2577, "step": 334700 }, { "epoch": 1.2412044280005041, "grad_norm": 1.293229341506958, "learning_rate": 3.206676253044744e-05, "loss": 0.3147, "step": 334800 }, { "epoch": 1.2415751581163945, "grad_norm": 1.0659452676773071, "learning_rate": 3.203931126606038e-05, "loss": 0.3049, "step": 334900 }, { "epoch": 1.2419458882322847, "grad_norm": 1.0005970001220703, "learning_rate": 3.20118662162058e-05, "loss": 0.3021, "step": 335000 }, { "epoch": 1.2423166183481749, "grad_norm": 0.5686892867088318, "learning_rate": 3.198442739037992e-05, "loss": 0.327, "step": 335100 }, { "epoch": 1.242687348464065, "grad_norm": 0.8662936091423035, "learning_rate": 3.1956994798076756e-05, "loss": 0.2759, "step": 335200 }, { "epoch": 1.2430580785799554, "grad_norm": 0.8775308728218079, "learning_rate": 3.1929568448788195e-05, "loss": 0.2793, "step": 335300 }, { "epoch": 1.2434288086958456, "grad_norm": 0.8791937232017517, "learning_rate": 3.190214835200397e-05, "loss": 0.2897, "step": 335400 }, { "epoch": 1.2437995388117358, "grad_norm": 0.5155762434005737, "learning_rate": 3.187473451721162e-05, "loss": 0.3259, "step": 335500 }, { "epoch": 1.244170268927626, "grad_norm": 0.8431792259216309, "learning_rate": 3.184732695389656e-05, "loss": 0.3434, "step": 335600 }, { "epoch": 1.2445409990435163, "grad_norm": 0.5949411988258362, "learning_rate": 3.181992567154198e-05, "loss": 0.2771, "step": 335700 }, { "epoch": 1.2449117291594065, "grad_norm": 0.82327800989151, "learning_rate": 3.1792530679628953e-05, "loss": 0.2917, "step": 335800 }, { "epoch": 1.2452824592752967, "grad_norm": 0.644843339920044, "learning_rate": 3.176514198763635e-05, "loss": 0.2686, "step": 335900 }, { "epoch": 1.245653189391187, "grad_norm": 0.628823459148407, "learning_rate": 3.173775960504084e-05, "loss": 0.2743, "step": 336000 }, { "epoch": 1.2460239195070772, "grad_norm": 0.3404079079627991, "learning_rate": 3.171038354131696e-05, "loss": 0.2772, "step": 336100 }, { "epoch": 1.2463946496229674, "grad_norm": 0.8132339715957642, "learning_rate": 3.168301380593703e-05, "loss": 0.2944, "step": 336200 }, { "epoch": 1.2467653797388576, "grad_norm": 0.8937737941741943, "learning_rate": 3.165565040837117e-05, "loss": 0.3027, "step": 336300 }, { "epoch": 1.247136109854748, "grad_norm": 0.8510625958442688, "learning_rate": 3.1628293358087316e-05, "loss": 0.3148, "step": 336400 }, { "epoch": 1.2475068399706382, "grad_norm": 1.530387043952942, "learning_rate": 3.160094266455122e-05, "loss": 0.2957, "step": 336500 }, { "epoch": 1.2478775700865283, "grad_norm": 1.6429272890090942, "learning_rate": 3.157359833722644e-05, "loss": 0.2709, "step": 336600 }, { "epoch": 1.2482483002024187, "grad_norm": 0.5057021379470825, "learning_rate": 3.154626038557429e-05, "loss": 0.2811, "step": 336700 }, { "epoch": 1.248619030318309, "grad_norm": 0.5643353462219238, "learning_rate": 3.1518928819053915e-05, "loss": 0.3085, "step": 336800 }, { "epoch": 1.248989760434199, "grad_norm": 0.8996047973632812, "learning_rate": 3.149160364712226e-05, "loss": 0.2766, "step": 336900 }, { "epoch": 1.2493604905500892, "grad_norm": 1.1205815076828003, "learning_rate": 3.146428487923404e-05, "loss": 0.3062, "step": 337000 }, { "epoch": 1.2497312206659796, "grad_norm": 0.6949426531791687, "learning_rate": 3.143697252484171e-05, "loss": 0.2798, "step": 337100 }, { "epoch": 1.2501019507818698, "grad_norm": 1.0529121160507202, "learning_rate": 3.1409666593395584e-05, "loss": 0.2948, "step": 337200 }, { "epoch": 1.25047268089776, "grad_norm": 0.8453161120414734, "learning_rate": 3.138236709434371e-05, "loss": 0.294, "step": 337300 }, { "epoch": 1.2508434110136504, "grad_norm": 0.44544661045074463, "learning_rate": 3.1355074037131915e-05, "loss": 0.2698, "step": 337400 }, { "epoch": 1.2512141411295405, "grad_norm": 0.6506984233856201, "learning_rate": 3.1327787431203805e-05, "loss": 0.278, "step": 337500 }, { "epoch": 1.2515848712454307, "grad_norm": 0.5362005829811096, "learning_rate": 3.130050728600075e-05, "loss": 0.3112, "step": 337600 }, { "epoch": 1.251955601361321, "grad_norm": 1.0012495517730713, "learning_rate": 3.127323361096189e-05, "loss": 0.2873, "step": 337700 }, { "epoch": 1.2523263314772113, "grad_norm": 0.6326090693473816, "learning_rate": 3.124596641552412e-05, "loss": 0.2973, "step": 337800 }, { "epoch": 1.2526970615931015, "grad_norm": 1.0068843364715576, "learning_rate": 3.1218705709122065e-05, "loss": 0.2856, "step": 337900 }, { "epoch": 1.2530677917089916, "grad_norm": 1.1874839067459106, "learning_rate": 3.119145150118816e-05, "loss": 0.2671, "step": 338000 }, { "epoch": 1.253438521824882, "grad_norm": 0.9870126843452454, "learning_rate": 3.116420380115258e-05, "loss": 0.2931, "step": 338100 }, { "epoch": 1.2538092519407722, "grad_norm": 1.0272694826126099, "learning_rate": 3.113696261844319e-05, "loss": 0.3004, "step": 338200 }, { "epoch": 1.2541799820566624, "grad_norm": 0.9125121831893921, "learning_rate": 3.110972796248567e-05, "loss": 0.2868, "step": 338300 }, { "epoch": 1.2545507121725525, "grad_norm": 0.4688582122325897, "learning_rate": 3.108249984270342e-05, "loss": 0.2916, "step": 338400 }, { "epoch": 1.254921442288443, "grad_norm": 0.6390019059181213, "learning_rate": 3.105527826851756e-05, "loss": 0.3095, "step": 338500 }, { "epoch": 1.255292172404333, "grad_norm": 0.21954426169395447, "learning_rate": 3.102806324934695e-05, "loss": 0.318, "step": 338600 }, { "epoch": 1.2556629025202233, "grad_norm": 0.5983810424804688, "learning_rate": 3.1000854794608205e-05, "loss": 0.294, "step": 338700 }, { "epoch": 1.2560336326361137, "grad_norm": 0.6520207524299622, "learning_rate": 3.097365291371566e-05, "loss": 0.3033, "step": 338800 }, { "epoch": 1.2564043627520038, "grad_norm": 1.4797192811965942, "learning_rate": 3.094645761608134e-05, "loss": 0.2687, "step": 338900 }, { "epoch": 1.256775092867894, "grad_norm": 1.018478512763977, "learning_rate": 3.091926891111504e-05, "loss": 0.2865, "step": 339000 }, { "epoch": 1.2571458229837842, "grad_norm": 0.6615822911262512, "learning_rate": 3.0892086808224266e-05, "loss": 0.2823, "step": 339100 }, { "epoch": 1.2575165530996744, "grad_norm": 0.817698061466217, "learning_rate": 3.08649113168142e-05, "loss": 0.2974, "step": 339200 }, { "epoch": 1.2578872832155648, "grad_norm": 0.6369227170944214, "learning_rate": 3.0837742446287815e-05, "loss": 0.2704, "step": 339300 }, { "epoch": 1.258258013331455, "grad_norm": 1.2033982276916504, "learning_rate": 3.081058020604569e-05, "loss": 0.299, "step": 339400 }, { "epoch": 1.2586287434473453, "grad_norm": 0.7431185841560364, "learning_rate": 3.078342460548618e-05, "loss": 0.2691, "step": 339500 }, { "epoch": 1.2589994735632355, "grad_norm": 1.0327426195144653, "learning_rate": 3.075627565400533e-05, "loss": 0.2767, "step": 339600 }, { "epoch": 1.2593702036791257, "grad_norm": 0.5828046798706055, "learning_rate": 3.072913336099688e-05, "loss": 0.2826, "step": 339700 }, { "epoch": 1.2597409337950158, "grad_norm": 0.8097419738769531, "learning_rate": 3.070199773585225e-05, "loss": 0.2935, "step": 339800 }, { "epoch": 1.260111663910906, "grad_norm": 0.6730536818504333, "learning_rate": 3.067486878796061e-05, "loss": 0.2958, "step": 339900 }, { "epoch": 1.2604823940267964, "grad_norm": 0.6442365646362305, "learning_rate": 3.0647746526708757e-05, "loss": 0.3022, "step": 340000 }, { "epoch": 1.2608531241426866, "grad_norm": 1.4258694648742676, "learning_rate": 3.062063096148118e-05, "loss": 0.2747, "step": 340100 }, { "epoch": 1.261223854258577, "grad_norm": 1.3933933973312378, "learning_rate": 3.0593522101660066e-05, "loss": 0.2908, "step": 340200 }, { "epoch": 1.2615945843744671, "grad_norm": 0.5321272015571594, "learning_rate": 3.0566419956625314e-05, "loss": 0.3093, "step": 340300 }, { "epoch": 1.2619653144903573, "grad_norm": 0.9717157483100891, "learning_rate": 3.053932453575444e-05, "loss": 0.3, "step": 340400 }, { "epoch": 1.2623360446062475, "grad_norm": 0.7943215370178223, "learning_rate": 3.0512235848422666e-05, "loss": 0.3105, "step": 340500 }, { "epoch": 1.2627067747221377, "grad_norm": 0.7128025889396667, "learning_rate": 3.0485153904002905e-05, "loss": 0.2991, "step": 340600 }, { "epoch": 1.263077504838028, "grad_norm": 0.5075551867485046, "learning_rate": 3.0458078711865678e-05, "loss": 0.3003, "step": 340700 }, { "epoch": 1.2634482349539182, "grad_norm": 1.077022910118103, "learning_rate": 3.0431010281379247e-05, "loss": 0.2992, "step": 340800 }, { "epoch": 1.2638189650698084, "grad_norm": 0.6372740268707275, "learning_rate": 3.040394862190944e-05, "loss": 0.2817, "step": 340900 }, { "epoch": 1.2641896951856988, "grad_norm": 0.6804953217506409, "learning_rate": 3.037689374281983e-05, "loss": 0.306, "step": 341000 }, { "epoch": 1.264560425301589, "grad_norm": 0.7992532253265381, "learning_rate": 3.0349845653471582e-05, "loss": 0.297, "step": 341100 }, { "epoch": 1.2649311554174791, "grad_norm": 0.8114628195762634, "learning_rate": 3.0322804363223545e-05, "loss": 0.2746, "step": 341200 }, { "epoch": 1.2653018855333693, "grad_norm": 0.16131573915481567, "learning_rate": 3.0295769881432233e-05, "loss": 0.2922, "step": 341300 }, { "epoch": 1.2656726156492597, "grad_norm": 0.6398255228996277, "learning_rate": 3.026874221745174e-05, "loss": 0.2713, "step": 341400 }, { "epoch": 1.2660433457651499, "grad_norm": 1.9287335872650146, "learning_rate": 3.0241721380633858e-05, "loss": 0.3192, "step": 341500 }, { "epoch": 1.26641407588104, "grad_norm": 1.659342646598816, "learning_rate": 3.0214707380328022e-05, "loss": 0.2824, "step": 341600 }, { "epoch": 1.2667848059969304, "grad_norm": 0.644237220287323, "learning_rate": 3.0187700225881233e-05, "loss": 0.2897, "step": 341700 }, { "epoch": 1.2671555361128206, "grad_norm": 0.8940252065658569, "learning_rate": 3.0160699926638196e-05, "loss": 0.3025, "step": 341800 }, { "epoch": 1.2675262662287108, "grad_norm": 0.7095876336097717, "learning_rate": 3.0133706491941203e-05, "loss": 0.2936, "step": 341900 }, { "epoch": 1.267896996344601, "grad_norm": 1.0331358909606934, "learning_rate": 3.010671993113019e-05, "loss": 0.3068, "step": 342000 }, { "epoch": 1.2682677264604914, "grad_norm": 1.1302483081817627, "learning_rate": 3.0079740253542716e-05, "loss": 0.2924, "step": 342100 }, { "epoch": 1.2686384565763815, "grad_norm": 0.49842941761016846, "learning_rate": 3.005276746851393e-05, "loss": 0.2687, "step": 342200 }, { "epoch": 1.2690091866922717, "grad_norm": 0.6318409442901611, "learning_rate": 3.0025801585376663e-05, "loss": 0.2843, "step": 342300 }, { "epoch": 1.269379916808162, "grad_norm": 1.0067545175552368, "learning_rate": 2.9998842613461254e-05, "loss": 0.3007, "step": 342400 }, { "epoch": 1.2697506469240523, "grad_norm": 1.1674336194992065, "learning_rate": 2.9971890562095746e-05, "loss": 0.309, "step": 342500 }, { "epoch": 1.2701213770399424, "grad_norm": 0.8291154503822327, "learning_rate": 2.994494544060573e-05, "loss": 0.2738, "step": 342600 }, { "epoch": 1.2704921071558326, "grad_norm": 0.2401881217956543, "learning_rate": 2.991800725831443e-05, "loss": 0.2951, "step": 342700 }, { "epoch": 1.270862837271723, "grad_norm": 0.7219059467315674, "learning_rate": 2.9891076024542658e-05, "loss": 0.2843, "step": 342800 }, { "epoch": 1.2712335673876132, "grad_norm": 0.2596327066421509, "learning_rate": 2.9864151748608814e-05, "loss": 0.252, "step": 342900 }, { "epoch": 1.2716042975035033, "grad_norm": 0.7575201392173767, "learning_rate": 2.9837234439828908e-05, "loss": 0.2821, "step": 343000 }, { "epoch": 1.2719750276193937, "grad_norm": 0.6029621362686157, "learning_rate": 2.9810324107516552e-05, "loss": 0.2887, "step": 343100 }, { "epoch": 1.272345757735284, "grad_norm": 0.6042705774307251, "learning_rate": 2.9783420760982882e-05, "loss": 0.2888, "step": 343200 }, { "epoch": 1.272716487851174, "grad_norm": 0.7918636798858643, "learning_rate": 2.9756524409536667e-05, "loss": 0.3044, "step": 343300 }, { "epoch": 1.2730872179670643, "grad_norm": 0.9494282007217407, "learning_rate": 2.9729635062484245e-05, "loss": 0.256, "step": 343400 }, { "epoch": 1.2734579480829546, "grad_norm": 1.0624327659606934, "learning_rate": 2.970275272912955e-05, "loss": 0.2775, "step": 343500 }, { "epoch": 1.2738286781988448, "grad_norm": 0.7842395901679993, "learning_rate": 2.9675877418774047e-05, "loss": 0.2897, "step": 343600 }, { "epoch": 1.274199408314735, "grad_norm": 0.5572916865348816, "learning_rate": 2.9649009140716806e-05, "loss": 0.269, "step": 343700 }, { "epoch": 1.2745701384306254, "grad_norm": 0.9732170701026917, "learning_rate": 2.9622147904254472e-05, "loss": 0.2969, "step": 343800 }, { "epoch": 1.2749408685465156, "grad_norm": 0.7869042158126831, "learning_rate": 2.9595293718681183e-05, "loss": 0.2609, "step": 343900 }, { "epoch": 1.2753115986624057, "grad_norm": 0.8459057807922363, "learning_rate": 2.9568446593288733e-05, "loss": 0.2603, "step": 344000 }, { "epoch": 1.275682328778296, "grad_norm": 1.1853020191192627, "learning_rate": 2.95416065373664e-05, "loss": 0.2986, "step": 344100 }, { "epoch": 1.276053058894186, "grad_norm": 0.9483721256256104, "learning_rate": 2.951477356020106e-05, "loss": 0.3309, "step": 344200 }, { "epoch": 1.2764237890100765, "grad_norm": 1.4359995126724243, "learning_rate": 2.948794767107712e-05, "loss": 0.2939, "step": 344300 }, { "epoch": 1.2767945191259666, "grad_norm": 0.8780760765075684, "learning_rate": 2.9461128879276535e-05, "loss": 0.3056, "step": 344400 }, { "epoch": 1.277165249241857, "grad_norm": 0.9349070191383362, "learning_rate": 2.943431719407881e-05, "loss": 0.2943, "step": 344500 }, { "epoch": 1.2775359793577472, "grad_norm": 1.2549184560775757, "learning_rate": 2.9407512624761e-05, "loss": 0.2789, "step": 344600 }, { "epoch": 1.2779067094736374, "grad_norm": 0.6267966032028198, "learning_rate": 2.9380715180597673e-05, "loss": 0.262, "step": 344700 }, { "epoch": 1.2782774395895276, "grad_norm": 0.6777209043502808, "learning_rate": 2.935392487086093e-05, "loss": 0.284, "step": 344800 }, { "epoch": 1.2786481697054177, "grad_norm": 0.7947162985801697, "learning_rate": 2.9327141704820437e-05, "loss": 0.3115, "step": 344900 }, { "epoch": 1.2790188998213081, "grad_norm": 0.729923665523529, "learning_rate": 2.9300365691743374e-05, "loss": 0.2845, "step": 345000 }, { "epoch": 1.2793896299371983, "grad_norm": 1.1135084629058838, "learning_rate": 2.9273596840894424e-05, "loss": 0.2994, "step": 345100 }, { "epoch": 1.2797603600530887, "grad_norm": 0.7051349878311157, "learning_rate": 2.9246835161535825e-05, "loss": 0.2976, "step": 345200 }, { "epoch": 1.2801310901689789, "grad_norm": 1.256906270980835, "learning_rate": 2.9220080662927318e-05, "loss": 0.3044, "step": 345300 }, { "epoch": 1.280501820284869, "grad_norm": 1.2389612197875977, "learning_rate": 2.9193333354326168e-05, "loss": 0.2876, "step": 345400 }, { "epoch": 1.2808725504007592, "grad_norm": 0.9361628293991089, "learning_rate": 2.9166593244987106e-05, "loss": 0.2958, "step": 345500 }, { "epoch": 1.2812432805166494, "grad_norm": 0.7560179829597473, "learning_rate": 2.9139860344162452e-05, "loss": 0.2786, "step": 345600 }, { "epoch": 1.2816140106325398, "grad_norm": 0.7783770561218262, "learning_rate": 2.9113134661101938e-05, "loss": 0.3078, "step": 345700 }, { "epoch": 1.28198474074843, "grad_norm": 0.46253111958503723, "learning_rate": 2.9086416205052914e-05, "loss": 0.2801, "step": 345800 }, { "epoch": 1.2823554708643201, "grad_norm": 1.0879919528961182, "learning_rate": 2.905970498526013e-05, "loss": 0.3023, "step": 345900 }, { "epoch": 1.2827262009802105, "grad_norm": 0.5743886232376099, "learning_rate": 2.903300101096585e-05, "loss": 0.2775, "step": 346000 }, { "epoch": 1.2830969310961007, "grad_norm": 0.6744182109832764, "learning_rate": 2.9006304291409913e-05, "loss": 0.2758, "step": 346100 }, { "epoch": 1.2834676612119909, "grad_norm": 0.7947282195091248, "learning_rate": 2.89796148358295e-05, "loss": 0.2618, "step": 346200 }, { "epoch": 1.283838391327881, "grad_norm": 0.6749051809310913, "learning_rate": 2.895293265345942e-05, "loss": 0.2806, "step": 346300 }, { "epoch": 1.2842091214437714, "grad_norm": 2.633596658706665, "learning_rate": 2.8926257753531894e-05, "loss": 0.2792, "step": 346400 }, { "epoch": 1.2845798515596616, "grad_norm": 1.3443878889083862, "learning_rate": 2.8899590145276613e-05, "loss": 0.2975, "step": 346500 }, { "epoch": 1.2849505816755518, "grad_norm": 1.1234698295593262, "learning_rate": 2.887292983792081e-05, "loss": 0.3265, "step": 346600 }, { "epoch": 1.2853213117914422, "grad_norm": 0.3485521674156189, "learning_rate": 2.884627684068913e-05, "loss": 0.2687, "step": 346700 }, { "epoch": 1.2856920419073323, "grad_norm": 0.5014833807945251, "learning_rate": 2.8819631162803686e-05, "loss": 0.283, "step": 346800 }, { "epoch": 1.2860627720232225, "grad_norm": 0.922800600528717, "learning_rate": 2.8792992813484142e-05, "loss": 0.2804, "step": 346900 }, { "epoch": 1.2864335021391127, "grad_norm": 1.5812201499938965, "learning_rate": 2.8766361801947494e-05, "loss": 0.2775, "step": 347000 }, { "epoch": 1.286804232255003, "grad_norm": 0.6724244356155396, "learning_rate": 2.8739738137408323e-05, "loss": 0.2826, "step": 347100 }, { "epoch": 1.2871749623708932, "grad_norm": 1.3165172338485718, "learning_rate": 2.8713121829078592e-05, "loss": 0.303, "step": 347200 }, { "epoch": 1.2875456924867834, "grad_norm": 1.6527501344680786, "learning_rate": 2.8686512886167736e-05, "loss": 0.3071, "step": 347300 }, { "epoch": 1.2879164226026738, "grad_norm": 0.7932207584381104, "learning_rate": 2.8659911317882676e-05, "loss": 0.2871, "step": 347400 }, { "epoch": 1.288287152718564, "grad_norm": 0.9350433945655823, "learning_rate": 2.863331713342774e-05, "loss": 0.2811, "step": 347500 }, { "epoch": 1.2886578828344541, "grad_norm": 1.1482244729995728, "learning_rate": 2.860673034200469e-05, "loss": 0.2848, "step": 347600 }, { "epoch": 1.2890286129503443, "grad_norm": 0.7119553685188293, "learning_rate": 2.8580150952812818e-05, "loss": 0.2846, "step": 347700 }, { "epoch": 1.2893993430662347, "grad_norm": 1.3546262979507446, "learning_rate": 2.8553578975048733e-05, "loss": 0.2753, "step": 347800 }, { "epoch": 1.2897700731821249, "grad_norm": 0.6805325746536255, "learning_rate": 2.852701441790653e-05, "loss": 0.2744, "step": 347900 }, { "epoch": 1.290140803298015, "grad_norm": 0.665729284286499, "learning_rate": 2.850045729057779e-05, "loss": 0.2701, "step": 348000 }, { "epoch": 1.2905115334139055, "grad_norm": 0.39937320351600647, "learning_rate": 2.847390760225146e-05, "loss": 0.2945, "step": 348100 }, { "epoch": 1.2908822635297956, "grad_norm": 2.9980239868164062, "learning_rate": 2.8447365362113902e-05, "loss": 0.2922, "step": 348200 }, { "epoch": 1.2912529936456858, "grad_norm": 0.6909918785095215, "learning_rate": 2.8420830579348977e-05, "loss": 0.2846, "step": 348300 }, { "epoch": 1.291623723761576, "grad_norm": 1.0311745405197144, "learning_rate": 2.8394303263137902e-05, "loss": 0.2941, "step": 348400 }, { "epoch": 1.2919944538774664, "grad_norm": 1.0066585540771484, "learning_rate": 2.836778342265931e-05, "loss": 0.2534, "step": 348500 }, { "epoch": 1.2923651839933565, "grad_norm": 0.5794774293899536, "learning_rate": 2.8341271067089282e-05, "loss": 0.2779, "step": 348600 }, { "epoch": 1.2927359141092467, "grad_norm": 0.6787636280059814, "learning_rate": 2.8314766205601255e-05, "loss": 0.2869, "step": 348700 }, { "epoch": 1.293106644225137, "grad_norm": 1.2888652086257935, "learning_rate": 2.828826884736616e-05, "loss": 0.2898, "step": 348800 }, { "epoch": 1.2934773743410273, "grad_norm": 0.739841639995575, "learning_rate": 2.8261779001552263e-05, "loss": 0.2891, "step": 348900 }, { "epoch": 1.2938481044569174, "grad_norm": 0.6241795420646667, "learning_rate": 2.8235296677325217e-05, "loss": 0.3029, "step": 349000 }, { "epoch": 1.2942188345728076, "grad_norm": 0.8693567514419556, "learning_rate": 2.8208821883848158e-05, "loss": 0.2879, "step": 349100 }, { "epoch": 1.2945895646886978, "grad_norm": 0.6575790047645569, "learning_rate": 2.8182354630281536e-05, "loss": 0.2803, "step": 349200 }, { "epoch": 1.2949602948045882, "grad_norm": 0.4420260488986969, "learning_rate": 2.815589492578321e-05, "loss": 0.2479, "step": 349300 }, { "epoch": 1.2953310249204784, "grad_norm": 1.8569773435592651, "learning_rate": 2.8129442779508453e-05, "loss": 0.3164, "step": 349400 }, { "epoch": 1.2957017550363688, "grad_norm": 0.5991716384887695, "learning_rate": 2.8102998200609863e-05, "loss": 0.2934, "step": 349500 }, { "epoch": 1.296072485152259, "grad_norm": 0.5708280801773071, "learning_rate": 2.8076561198237522e-05, "loss": 0.2706, "step": 349600 }, { "epoch": 1.296443215268149, "grad_norm": 0.5299630165100098, "learning_rate": 2.8050131781538804e-05, "loss": 0.3062, "step": 349700 }, { "epoch": 1.2968139453840393, "grad_norm": 1.1558029651641846, "learning_rate": 2.802370995965846e-05, "loss": 0.2728, "step": 349800 }, { "epoch": 1.2971846754999294, "grad_norm": 0.597369909286499, "learning_rate": 2.79972957417387e-05, "loss": 0.2926, "step": 349900 }, { "epoch": 1.2975554056158198, "grad_norm": 0.61481773853302, "learning_rate": 2.7970889136918983e-05, "loss": 0.3091, "step": 350000 }, { "epoch": 1.29792613573171, "grad_norm": 1.2346858978271484, "learning_rate": 2.794449015433619e-05, "loss": 0.2716, "step": 350100 }, { "epoch": 1.2982968658476004, "grad_norm": 1.0703519582748413, "learning_rate": 2.7918098803124603e-05, "loss": 0.2911, "step": 350200 }, { "epoch": 1.2986675959634906, "grad_norm": 0.7046018242835999, "learning_rate": 2.7891715092415815e-05, "loss": 0.285, "step": 350300 }, { "epoch": 1.2990383260793807, "grad_norm": 0.8760456442832947, "learning_rate": 2.7865339031338768e-05, "loss": 0.2963, "step": 350400 }, { "epoch": 1.299409056195271, "grad_norm": 0.9711882472038269, "learning_rate": 2.78389706290198e-05, "loss": 0.2822, "step": 350500 }, { "epoch": 1.299779786311161, "grad_norm": 1.0113439559936523, "learning_rate": 2.7812609894582568e-05, "loss": 0.2649, "step": 350600 }, { "epoch": 1.3001505164270515, "grad_norm": 0.9086369872093201, "learning_rate": 2.7786256837148085e-05, "loss": 0.2892, "step": 350700 }, { "epoch": 1.3005212465429417, "grad_norm": 0.9340677261352539, "learning_rate": 2.77599114658347e-05, "loss": 0.2767, "step": 350800 }, { "epoch": 1.3008919766588318, "grad_norm": 0.867019772529602, "learning_rate": 2.773357378975808e-05, "loss": 0.2878, "step": 350900 }, { "epoch": 1.3012627067747222, "grad_norm": 0.6412786245346069, "learning_rate": 2.7707243818031324e-05, "loss": 0.2836, "step": 351000 }, { "epoch": 1.3016334368906124, "grad_norm": 0.8220049738883972, "learning_rate": 2.768092155976475e-05, "loss": 0.2767, "step": 351100 }, { "epoch": 1.3020041670065026, "grad_norm": 0.49287524819374084, "learning_rate": 2.7654607024066053e-05, "loss": 0.3081, "step": 351200 }, { "epoch": 1.3023748971223927, "grad_norm": 0.7780212163925171, "learning_rate": 2.7628300220040292e-05, "loss": 0.2836, "step": 351300 }, { "epoch": 1.3027456272382831, "grad_norm": 0.9202233552932739, "learning_rate": 2.7602001156789804e-05, "loss": 0.2702, "step": 351400 }, { "epoch": 1.3031163573541733, "grad_norm": 0.6707475185394287, "learning_rate": 2.7575709843414255e-05, "loss": 0.2587, "step": 351500 }, { "epoch": 1.3034870874700635, "grad_norm": 1.02622389793396, "learning_rate": 2.754942628901064e-05, "loss": 0.2884, "step": 351600 }, { "epoch": 1.3038578175859539, "grad_norm": 0.7540799975395203, "learning_rate": 2.7523150502673245e-05, "loss": 0.3022, "step": 351700 }, { "epoch": 1.304228547701844, "grad_norm": 0.6048611402511597, "learning_rate": 2.7496882493493726e-05, "loss": 0.2591, "step": 351800 }, { "epoch": 1.3045992778177342, "grad_norm": 0.6314045786857605, "learning_rate": 2.7470622270560986e-05, "loss": 0.2914, "step": 351900 }, { "epoch": 1.3049700079336244, "grad_norm": 0.5885927081108093, "learning_rate": 2.744436984296125e-05, "loss": 0.309, "step": 352000 }, { "epoch": 1.3053407380495148, "grad_norm": 1.132943868637085, "learning_rate": 2.7418125219778083e-05, "loss": 0.3012, "step": 352100 }, { "epoch": 1.305711468165405, "grad_norm": 0.5140805840492249, "learning_rate": 2.7391888410092324e-05, "loss": 0.2674, "step": 352200 }, { "epoch": 1.3060821982812951, "grad_norm": 0.7483682632446289, "learning_rate": 2.736565942298206e-05, "loss": 0.2643, "step": 352300 }, { "epoch": 1.3064529283971855, "grad_norm": 0.5402369499206543, "learning_rate": 2.7339438267522748e-05, "loss": 0.3101, "step": 352400 }, { "epoch": 1.3068236585130757, "grad_norm": 0.554337203502655, "learning_rate": 2.7313224952787108e-05, "loss": 0.2672, "step": 352500 }, { "epoch": 1.3071943886289659, "grad_norm": 1.086210012435913, "learning_rate": 2.7287019487845113e-05, "loss": 0.2672, "step": 352600 }, { "epoch": 1.307565118744856, "grad_norm": 1.085532546043396, "learning_rate": 2.7260821881764082e-05, "loss": 0.244, "step": 352700 }, { "epoch": 1.3079358488607464, "grad_norm": 1.6111128330230713, "learning_rate": 2.7234632143608574e-05, "loss": 0.2862, "step": 352800 }, { "epoch": 1.3083065789766366, "grad_norm": 0.6878329515457153, "learning_rate": 2.720845028244041e-05, "loss": 0.3117, "step": 352900 }, { "epoch": 1.3086773090925268, "grad_norm": 0.7080169320106506, "learning_rate": 2.7182276307318776e-05, "loss": 0.3324, "step": 353000 }, { "epoch": 1.3090480392084172, "grad_norm": 1.8181285858154297, "learning_rate": 2.715611022729998e-05, "loss": 0.2939, "step": 353100 }, { "epoch": 1.3094187693243073, "grad_norm": 0.6616932153701782, "learning_rate": 2.712995205143773e-05, "loss": 0.3015, "step": 353200 }, { "epoch": 1.3097894994401975, "grad_norm": 1.2958288192749023, "learning_rate": 2.7103801788782944e-05, "loss": 0.3009, "step": 353300 }, { "epoch": 1.3101602295560877, "grad_norm": 1.1045286655426025, "learning_rate": 2.707765944838378e-05, "loss": 0.2877, "step": 353400 }, { "epoch": 1.3105309596719779, "grad_norm": 1.0415332317352295, "learning_rate": 2.7051525039285725e-05, "loss": 0.2995, "step": 353500 }, { "epoch": 1.3109016897878683, "grad_norm": 0.7043631076812744, "learning_rate": 2.7025398570531464e-05, "loss": 0.3175, "step": 353600 }, { "epoch": 1.3112724199037584, "grad_norm": 0.8296023607254028, "learning_rate": 2.6999280051160925e-05, "loss": 0.2891, "step": 353700 }, { "epoch": 1.3116431500196488, "grad_norm": 0.6394186019897461, "learning_rate": 2.6973169490211377e-05, "loss": 0.265, "step": 353800 }, { "epoch": 1.312013880135539, "grad_norm": 0.8807701468467712, "learning_rate": 2.6947066896717178e-05, "loss": 0.3142, "step": 353900 }, { "epoch": 1.3123846102514292, "grad_norm": 0.6262776255607605, "learning_rate": 2.6920972279710095e-05, "loss": 0.2898, "step": 354000 }, { "epoch": 1.3127553403673193, "grad_norm": 0.35244885087013245, "learning_rate": 2.6894885648219025e-05, "loss": 0.2784, "step": 354100 }, { "epoch": 1.3131260704832095, "grad_norm": 0.2093198001384735, "learning_rate": 2.6868807011270138e-05, "loss": 0.2902, "step": 354200 }, { "epoch": 1.3134968005991, "grad_norm": 0.5072487592697144, "learning_rate": 2.6842736377886858e-05, "loss": 0.2819, "step": 354300 }, { "epoch": 1.31386753071499, "grad_norm": 1.7393938302993774, "learning_rate": 2.6816673757089805e-05, "loss": 0.2664, "step": 354400 }, { "epoch": 1.3142382608308805, "grad_norm": 0.8347476720809937, "learning_rate": 2.679061915789684e-05, "loss": 0.2868, "step": 354500 }, { "epoch": 1.3146089909467706, "grad_norm": 0.471675306558609, "learning_rate": 2.676457258932305e-05, "loss": 0.2722, "step": 354600 }, { "epoch": 1.3149797210626608, "grad_norm": 0.6897680163383484, "learning_rate": 2.6738534060380728e-05, "loss": 0.2839, "step": 354700 }, { "epoch": 1.315350451178551, "grad_norm": 0.3206407129764557, "learning_rate": 2.6712503580079428e-05, "loss": 0.2988, "step": 354800 }, { "epoch": 1.3157211812944412, "grad_norm": 1.0141313076019287, "learning_rate": 2.668648115742588e-05, "loss": 0.2912, "step": 354900 }, { "epoch": 1.3160919114103315, "grad_norm": 0.6060977578163147, "learning_rate": 2.666046680142401e-05, "loss": 0.2916, "step": 355000 }, { "epoch": 1.3164626415262217, "grad_norm": 0.48270025849342346, "learning_rate": 2.6634460521075023e-05, "loss": 0.303, "step": 355100 }, { "epoch": 1.316833371642112, "grad_norm": 0.867194414138794, "learning_rate": 2.6608462325377272e-05, "loss": 0.2992, "step": 355200 }, { "epoch": 1.3172041017580023, "grad_norm": 1.083543300628662, "learning_rate": 2.6582472223326317e-05, "loss": 0.2545, "step": 355300 }, { "epoch": 1.3175748318738925, "grad_norm": 0.5984652638435364, "learning_rate": 2.6556490223914933e-05, "loss": 0.3078, "step": 355400 }, { "epoch": 1.3179455619897826, "grad_norm": 1.0627326965332031, "learning_rate": 2.6530516336133082e-05, "loss": 0.275, "step": 355500 }, { "epoch": 1.3183162921056728, "grad_norm": 0.7738039493560791, "learning_rate": 2.650455056896791e-05, "loss": 0.2742, "step": 355600 }, { "epoch": 1.3186870222215632, "grad_norm": 1.0855098962783813, "learning_rate": 2.64785929314038e-05, "loss": 0.3003, "step": 355700 }, { "epoch": 1.3190577523374534, "grad_norm": 0.829347550868988, "learning_rate": 2.645264343242227e-05, "loss": 0.2742, "step": 355800 }, { "epoch": 1.3194284824533435, "grad_norm": 0.5303807854652405, "learning_rate": 2.642670208100202e-05, "loss": 0.2698, "step": 355900 }, { "epoch": 1.319799212569234, "grad_norm": 0.5872277617454529, "learning_rate": 2.6400768886119014e-05, "loss": 0.2817, "step": 356000 }, { "epoch": 1.320169942685124, "grad_norm": 0.3260321617126465, "learning_rate": 2.637484385674625e-05, "loss": 0.2802, "step": 356100 }, { "epoch": 1.3205406728010143, "grad_norm": 0.9661682844161987, "learning_rate": 2.6348927001854045e-05, "loss": 0.2907, "step": 356200 }, { "epoch": 1.3209114029169045, "grad_norm": 1.5876041650772095, "learning_rate": 2.63230183304098e-05, "loss": 0.3216, "step": 356300 }, { "epoch": 1.3212821330327948, "grad_norm": 0.6672738790512085, "learning_rate": 2.6297117851378083e-05, "loss": 0.2811, "step": 356400 }, { "epoch": 1.321652863148685, "grad_norm": 0.3262385129928589, "learning_rate": 2.627122557372071e-05, "loss": 0.2814, "step": 356500 }, { "epoch": 1.3220235932645752, "grad_norm": 0.7676641941070557, "learning_rate": 2.624534150639657e-05, "loss": 0.3086, "step": 356600 }, { "epoch": 1.3223943233804656, "grad_norm": 1.2228243350982666, "learning_rate": 2.6219465658361725e-05, "loss": 0.2742, "step": 356700 }, { "epoch": 1.3227650534963558, "grad_norm": 0.6403907537460327, "learning_rate": 2.6193598038569473e-05, "loss": 0.2857, "step": 356800 }, { "epoch": 1.323135783612246, "grad_norm": 0.6900531649589539, "learning_rate": 2.6167738655970118e-05, "loss": 0.2844, "step": 356900 }, { "epoch": 1.323506513728136, "grad_norm": 0.8783509135246277, "learning_rate": 2.6141887519511276e-05, "loss": 0.2826, "step": 357000 }, { "epoch": 1.3238772438440265, "grad_norm": 0.9286769032478333, "learning_rate": 2.61160446381376e-05, "loss": 0.237, "step": 357100 }, { "epoch": 1.3242479739599167, "grad_norm": 1.207969069480896, "learning_rate": 2.6090210020790905e-05, "loss": 0.2684, "step": 357200 }, { "epoch": 1.3246187040758068, "grad_norm": 0.6399251818656921, "learning_rate": 2.60643836764102e-05, "loss": 0.2909, "step": 357300 }, { "epoch": 1.3249894341916972, "grad_norm": 0.9829773306846619, "learning_rate": 2.6038565613931563e-05, "loss": 0.2827, "step": 357400 }, { "epoch": 1.3253601643075874, "grad_norm": 0.7385883927345276, "learning_rate": 2.6012755842288234e-05, "loss": 0.3022, "step": 357500 }, { "epoch": 1.3257308944234776, "grad_norm": 1.5172590017318726, "learning_rate": 2.5986954370410633e-05, "loss": 0.2985, "step": 357600 }, { "epoch": 1.3261016245393678, "grad_norm": 1.551173448562622, "learning_rate": 2.5961161207226216e-05, "loss": 0.3001, "step": 357700 }, { "epoch": 1.3264723546552581, "grad_norm": 0.8196588158607483, "learning_rate": 2.5935376361659596e-05, "loss": 0.2757, "step": 357800 }, { "epoch": 1.3268430847711483, "grad_norm": 1.04291832447052, "learning_rate": 2.590959984263257e-05, "loss": 0.2936, "step": 357900 }, { "epoch": 1.3272138148870385, "grad_norm": 0.7488998770713806, "learning_rate": 2.5883831659063982e-05, "loss": 0.3168, "step": 358000 }, { "epoch": 1.3275845450029289, "grad_norm": 0.8816910982131958, "learning_rate": 2.585807181986981e-05, "loss": 0.2782, "step": 358100 }, { "epoch": 1.327955275118819, "grad_norm": 0.7189591526985168, "learning_rate": 2.5832320333963168e-05, "loss": 0.277, "step": 358200 }, { "epoch": 1.3283260052347092, "grad_norm": 1.0800156593322754, "learning_rate": 2.5806577210254257e-05, "loss": 0.2855, "step": 358300 }, { "epoch": 1.3286967353505994, "grad_norm": 0.8075501918792725, "learning_rate": 2.578084245765039e-05, "loss": 0.2706, "step": 358400 }, { "epoch": 1.3290674654664896, "grad_norm": 1.336424469947815, "learning_rate": 2.5755116085055976e-05, "loss": 0.295, "step": 358500 }, { "epoch": 1.32943819558238, "grad_norm": 0.6730695366859436, "learning_rate": 2.5729398101372526e-05, "loss": 0.2885, "step": 358600 }, { "epoch": 1.3298089256982701, "grad_norm": 0.7345704436302185, "learning_rate": 2.5703688515498675e-05, "loss": 0.2648, "step": 358700 }, { "epoch": 1.3301796558141605, "grad_norm": 1.3181475400924683, "learning_rate": 2.5677987336330124e-05, "loss": 0.2723, "step": 358800 }, { "epoch": 1.3305503859300507, "grad_norm": 0.6939221620559692, "learning_rate": 2.5652294572759656e-05, "loss": 0.2615, "step": 358900 }, { "epoch": 1.3309211160459409, "grad_norm": 0.8497469425201416, "learning_rate": 2.5626610233677197e-05, "loss": 0.3044, "step": 359000 }, { "epoch": 1.331291846161831, "grad_norm": 0.6435534358024597, "learning_rate": 2.5600934327969684e-05, "loss": 0.2847, "step": 359100 }, { "epoch": 1.3316625762777212, "grad_norm": 1.3406113386154175, "learning_rate": 2.5575266864521195e-05, "loss": 0.2823, "step": 359200 }, { "epoch": 1.3320333063936116, "grad_norm": 0.5861217379570007, "learning_rate": 2.5549607852212855e-05, "loss": 0.2629, "step": 359300 }, { "epoch": 1.3324040365095018, "grad_norm": 1.55866277217865, "learning_rate": 2.552395729992285e-05, "loss": 0.2654, "step": 359400 }, { "epoch": 1.3327747666253922, "grad_norm": 0.5302572250366211, "learning_rate": 2.5498315216526514e-05, "loss": 0.2812, "step": 359500 }, { "epoch": 1.3331454967412824, "grad_norm": 0.9017696380615234, "learning_rate": 2.5472681610896166e-05, "loss": 0.2932, "step": 359600 }, { "epoch": 1.3335162268571725, "grad_norm": 1.0160523653030396, "learning_rate": 2.544705649190121e-05, "loss": 0.2601, "step": 359700 }, { "epoch": 1.3338869569730627, "grad_norm": 1.1165549755096436, "learning_rate": 2.5421439868408193e-05, "loss": 0.2937, "step": 359800 }, { "epoch": 1.3342576870889529, "grad_norm": 0.7371346950531006, "learning_rate": 2.5395831749280595e-05, "loss": 0.2951, "step": 359900 }, { "epoch": 1.3346284172048433, "grad_norm": 0.8573924899101257, "learning_rate": 2.537023214337902e-05, "loss": 0.2956, "step": 360000 }, { "epoch": 1.3349991473207334, "grad_norm": 0.9354628920555115, "learning_rate": 2.534464105956115e-05, "loss": 0.3036, "step": 360100 }, { "epoch": 1.3353698774366236, "grad_norm": 1.7329566478729248, "learning_rate": 2.5319058506681693e-05, "loss": 0.2967, "step": 360200 }, { "epoch": 1.335740607552514, "grad_norm": 0.39499616622924805, "learning_rate": 2.5293484493592357e-05, "loss": 0.3025, "step": 360300 }, { "epoch": 1.3361113376684042, "grad_norm": 0.8963720202445984, "learning_rate": 2.5267919029142007e-05, "loss": 0.2831, "step": 360400 }, { "epoch": 1.3364820677842943, "grad_norm": 0.6480953097343445, "learning_rate": 2.5242362122176442e-05, "loss": 0.277, "step": 360500 }, { "epoch": 1.3368527979001845, "grad_norm": 1.245055913925171, "learning_rate": 2.5216813781538562e-05, "loss": 0.2749, "step": 360600 }, { "epoch": 1.337223528016075, "grad_norm": 0.528232216835022, "learning_rate": 2.519127401606828e-05, "loss": 0.2735, "step": 360700 }, { "epoch": 1.337594258131965, "grad_norm": 0.8791388869285583, "learning_rate": 2.5165742834602512e-05, "loss": 0.2695, "step": 360800 }, { "epoch": 1.3379649882478553, "grad_norm": 1.017381191253662, "learning_rate": 2.5140220245975286e-05, "loss": 0.2859, "step": 360900 }, { "epoch": 1.3383357183637457, "grad_norm": 0.31722667813301086, "learning_rate": 2.5114706259017595e-05, "loss": 0.2898, "step": 361000 }, { "epoch": 1.3387064484796358, "grad_norm": 0.6019331812858582, "learning_rate": 2.5089200882557445e-05, "loss": 0.2835, "step": 361100 }, { "epoch": 1.339077178595526, "grad_norm": 0.5804650187492371, "learning_rate": 2.5063704125419924e-05, "loss": 0.2674, "step": 361200 }, { "epoch": 1.3394479087114162, "grad_norm": 0.7629022002220154, "learning_rate": 2.5038215996427077e-05, "loss": 0.2954, "step": 361300 }, { "epoch": 1.3398186388273066, "grad_norm": 0.9410708546638489, "learning_rate": 2.5012736504397993e-05, "loss": 0.2684, "step": 361400 }, { "epoch": 1.3401893689431967, "grad_norm": 1.2471709251403809, "learning_rate": 2.498726565814877e-05, "loss": 0.2841, "step": 361500 }, { "epoch": 1.340560099059087, "grad_norm": 1.4465922117233276, "learning_rate": 2.4961803466492483e-05, "loss": 0.2796, "step": 361600 }, { "epoch": 1.3409308291749773, "grad_norm": 1.245929479598999, "learning_rate": 2.4936349938239284e-05, "loss": 0.2825, "step": 361700 }, { "epoch": 1.3413015592908675, "grad_norm": 1.0104849338531494, "learning_rate": 2.4910905082196266e-05, "loss": 0.2653, "step": 361800 }, { "epoch": 1.3416722894067576, "grad_norm": 0.5260530114173889, "learning_rate": 2.4885468907167513e-05, "loss": 0.2794, "step": 361900 }, { "epoch": 1.3420430195226478, "grad_norm": 0.9658180475234985, "learning_rate": 2.4860041421954177e-05, "loss": 0.2717, "step": 362000 }, { "epoch": 1.3424137496385382, "grad_norm": 0.7693400382995605, "learning_rate": 2.483462263535436e-05, "loss": 0.2928, "step": 362100 }, { "epoch": 1.3427844797544284, "grad_norm": 0.7961878180503845, "learning_rate": 2.4809212556163086e-05, "loss": 0.2682, "step": 362200 }, { "epoch": 1.3431552098703186, "grad_norm": 1.0180163383483887, "learning_rate": 2.4783811193172496e-05, "loss": 0.2975, "step": 362300 }, { "epoch": 1.343525939986209, "grad_norm": 0.5236241817474365, "learning_rate": 2.475841855517163e-05, "loss": 0.2766, "step": 362400 }, { "epoch": 1.3438966701020991, "grad_norm": 0.9010047912597656, "learning_rate": 2.4733034650946512e-05, "loss": 0.3074, "step": 362500 }, { "epoch": 1.3442674002179893, "grad_norm": 0.945859432220459, "learning_rate": 2.4707659489280205e-05, "loss": 0.288, "step": 362600 }, { "epoch": 1.3446381303338795, "grad_norm": 1.4003080129623413, "learning_rate": 2.4682293078952673e-05, "loss": 0.2929, "step": 362700 }, { "epoch": 1.3450088604497696, "grad_norm": 0.813473105430603, "learning_rate": 2.4656935428740875e-05, "loss": 0.2737, "step": 362800 }, { "epoch": 1.34537959056566, "grad_norm": 0.6749415397644043, "learning_rate": 2.4631586547418805e-05, "loss": 0.3019, "step": 362900 }, { "epoch": 1.3457503206815502, "grad_norm": 0.4241899847984314, "learning_rate": 2.460624644375728e-05, "loss": 0.2813, "step": 363000 }, { "epoch": 1.3461210507974406, "grad_norm": 0.8215797543525696, "learning_rate": 2.4580915126524233e-05, "loss": 0.2668, "step": 363100 }, { "epoch": 1.3464917809133308, "grad_norm": 1.0536340475082397, "learning_rate": 2.4555592604484467e-05, "loss": 0.2632, "step": 363200 }, { "epoch": 1.346862511029221, "grad_norm": 0.9701337218284607, "learning_rate": 2.4530278886399736e-05, "loss": 0.2728, "step": 363300 }, { "epoch": 1.3472332411451111, "grad_norm": 1.2999005317687988, "learning_rate": 2.450497398102883e-05, "loss": 0.2829, "step": 363400 }, { "epoch": 1.3476039712610013, "grad_norm": 0.28870734572410583, "learning_rate": 2.44796778971274e-05, "loss": 0.2787, "step": 363500 }, { "epoch": 1.3479747013768917, "grad_norm": 0.9656482934951782, "learning_rate": 2.445439064344807e-05, "loss": 0.2914, "step": 363600 }, { "epoch": 1.3483454314927819, "grad_norm": 1.2373545169830322, "learning_rate": 2.4429112228740474e-05, "loss": 0.298, "step": 363700 }, { "epoch": 1.3487161616086722, "grad_norm": 0.5357387065887451, "learning_rate": 2.440384266175106e-05, "loss": 0.3006, "step": 363800 }, { "epoch": 1.3490868917245624, "grad_norm": 1.4222042560577393, "learning_rate": 2.4378581951223335e-05, "loss": 0.2978, "step": 363900 }, { "epoch": 1.3494576218404526, "grad_norm": 0.895623505115509, "learning_rate": 2.435333010589768e-05, "loss": 0.2838, "step": 364000 }, { "epoch": 1.3498283519563428, "grad_norm": 0.7900087237358093, "learning_rate": 2.4328087134511403e-05, "loss": 0.3046, "step": 364100 }, { "epoch": 1.350199082072233, "grad_norm": 2.4674532413482666, "learning_rate": 2.4302853045798803e-05, "loss": 0.2853, "step": 364200 }, { "epoch": 1.3505698121881233, "grad_norm": 0.5580422878265381, "learning_rate": 2.4277627848491037e-05, "loss": 0.2808, "step": 364300 }, { "epoch": 1.3509405423040135, "grad_norm": 0.7153894305229187, "learning_rate": 2.4252411551316214e-05, "loss": 0.2786, "step": 364400 }, { "epoch": 1.351311272419904, "grad_norm": 0.36846408247947693, "learning_rate": 2.4227204162999356e-05, "loss": 0.277, "step": 364500 }, { "epoch": 1.351682002535794, "grad_norm": 0.8118723034858704, "learning_rate": 2.420200569226242e-05, "loss": 0.3097, "step": 364600 }, { "epoch": 1.3520527326516842, "grad_norm": 0.3675038516521454, "learning_rate": 2.417681614782424e-05, "loss": 0.2909, "step": 364700 }, { "epoch": 1.3524234627675744, "grad_norm": 0.5718204379081726, "learning_rate": 2.4151635538400623e-05, "loss": 0.2816, "step": 364800 }, { "epoch": 1.3527941928834646, "grad_norm": 0.4748682975769043, "learning_rate": 2.412646387270423e-05, "loss": 0.2923, "step": 364900 }, { "epoch": 1.353164922999355, "grad_norm": 0.9618183374404907, "learning_rate": 2.4101301159444623e-05, "loss": 0.3025, "step": 365000 }, { "epoch": 1.3535356531152452, "grad_norm": 1.1596566438674927, "learning_rate": 2.4076147407328336e-05, "loss": 0.295, "step": 365100 }, { "epoch": 1.3539063832311353, "grad_norm": 0.5529922842979431, "learning_rate": 2.4051002625058727e-05, "loss": 0.2842, "step": 365200 }, { "epoch": 1.3542771133470257, "grad_norm": 0.6762419939041138, "learning_rate": 2.4025866821336085e-05, "loss": 0.2832, "step": 365300 }, { "epoch": 1.354647843462916, "grad_norm": 1.4706660509109497, "learning_rate": 2.400074000485758e-05, "loss": 0.3174, "step": 365400 }, { "epoch": 1.355018573578806, "grad_norm": 0.726020336151123, "learning_rate": 2.3975622184317258e-05, "loss": 0.2757, "step": 365500 }, { "epoch": 1.3553893036946962, "grad_norm": 0.2678129971027374, "learning_rate": 2.3950513368406124e-05, "loss": 0.2801, "step": 365600 }, { "epoch": 1.3557600338105866, "grad_norm": 0.5909460783004761, "learning_rate": 2.3925413565811983e-05, "loss": 0.2913, "step": 365700 }, { "epoch": 1.3561307639264768, "grad_norm": 1.5310108661651611, "learning_rate": 2.3900322785219538e-05, "loss": 0.3057, "step": 365800 }, { "epoch": 1.356501494042367, "grad_norm": 0.7067875862121582, "learning_rate": 2.3875241035310446e-05, "loss": 0.3058, "step": 365900 }, { "epoch": 1.3568722241582574, "grad_norm": 1.251495599746704, "learning_rate": 2.3850168324763104e-05, "loss": 0.3039, "step": 366000 }, { "epoch": 1.3572429542741475, "grad_norm": 0.6515965461730957, "learning_rate": 2.3825104662252917e-05, "loss": 0.2879, "step": 366100 }, { "epoch": 1.3576136843900377, "grad_norm": 1.2369458675384521, "learning_rate": 2.3800050056452072e-05, "loss": 0.3008, "step": 366200 }, { "epoch": 1.3579844145059279, "grad_norm": 0.4946245849132538, "learning_rate": 2.3775004516029635e-05, "loss": 0.2924, "step": 366300 }, { "epoch": 1.3583551446218183, "grad_norm": 1.7159677743911743, "learning_rate": 2.3749968049651595e-05, "loss": 0.2978, "step": 366400 }, { "epoch": 1.3587258747377085, "grad_norm": 0.8869773745536804, "learning_rate": 2.372494066598073e-05, "loss": 0.2715, "step": 366500 }, { "epoch": 1.3590966048535986, "grad_norm": 0.3882772922515869, "learning_rate": 2.3699922373676676e-05, "loss": 0.3201, "step": 366600 }, { "epoch": 1.359467334969489, "grad_norm": 1.739863634109497, "learning_rate": 2.367491318139602e-05, "loss": 0.2726, "step": 366700 }, { "epoch": 1.3598380650853792, "grad_norm": 0.38049787282943726, "learning_rate": 2.364991309779205e-05, "loss": 0.3166, "step": 366800 }, { "epoch": 1.3602087952012694, "grad_norm": 1.1564191579818726, "learning_rate": 2.3624922131515033e-05, "loss": 0.3127, "step": 366900 }, { "epoch": 1.3605795253171595, "grad_norm": 1.386584997177124, "learning_rate": 2.3599940291212008e-05, "loss": 0.2672, "step": 367000 }, { "epoch": 1.36095025543305, "grad_norm": 0.7671076655387878, "learning_rate": 2.3574967585526873e-05, "loss": 0.2965, "step": 367100 }, { "epoch": 1.36132098554894, "grad_norm": 1.729109764099121, "learning_rate": 2.35500040231004e-05, "loss": 0.2869, "step": 367200 }, { "epoch": 1.3616917156648303, "grad_norm": 1.1250944137573242, "learning_rate": 2.3525049612570148e-05, "loss": 0.2922, "step": 367300 }, { "epoch": 1.3620624457807207, "grad_norm": 1.1842031478881836, "learning_rate": 2.3500104362570518e-05, "loss": 0.2892, "step": 367400 }, { "epoch": 1.3624331758966108, "grad_norm": 0.8891779184341431, "learning_rate": 2.3475168281732802e-05, "loss": 0.2966, "step": 367500 }, { "epoch": 1.362803906012501, "grad_norm": 0.6639518737792969, "learning_rate": 2.3450241378685012e-05, "loss": 0.2907, "step": 367600 }, { "epoch": 1.3631746361283912, "grad_norm": 1.3786250352859497, "learning_rate": 2.342532366205205e-05, "loss": 0.3178, "step": 367700 }, { "epoch": 1.3635453662442814, "grad_norm": 0.788500964641571, "learning_rate": 2.3400415140455666e-05, "loss": 0.3104, "step": 367800 }, { "epoch": 1.3639160963601717, "grad_norm": 1.5127904415130615, "learning_rate": 2.3375515822514377e-05, "loss": 0.2928, "step": 367900 }, { "epoch": 1.364286826476062, "grad_norm": 1.3988937139511108, "learning_rate": 2.3350625716843517e-05, "loss": 0.2961, "step": 368000 }, { "epoch": 1.3646575565919523, "grad_norm": 0.7724320292472839, "learning_rate": 2.332574483205528e-05, "loss": 0.2715, "step": 368100 }, { "epoch": 1.3650282867078425, "grad_norm": 0.8119176626205444, "learning_rate": 2.330087317675862e-05, "loss": 0.2547, "step": 368200 }, { "epoch": 1.3653990168237327, "grad_norm": 0.9148111939430237, "learning_rate": 2.327601075955932e-05, "loss": 0.2743, "step": 368300 }, { "epoch": 1.3657697469396228, "grad_norm": 1.7938941717147827, "learning_rate": 2.3251157589059957e-05, "loss": 0.3055, "step": 368400 }, { "epoch": 1.366140477055513, "grad_norm": 1.0323405265808105, "learning_rate": 2.32263136738599e-05, "loss": 0.2994, "step": 368500 }, { "epoch": 1.3665112071714034, "grad_norm": 0.9493085145950317, "learning_rate": 2.320147902255536e-05, "loss": 0.2842, "step": 368600 }, { "epoch": 1.3668819372872936, "grad_norm": 0.7521066665649414, "learning_rate": 2.3176653643739303e-05, "loss": 0.2622, "step": 368700 }, { "epoch": 1.367252667403184, "grad_norm": 0.8046944737434387, "learning_rate": 2.3151837546001472e-05, "loss": 0.265, "step": 368800 }, { "epoch": 1.3676233975190741, "grad_norm": 0.7474866509437561, "learning_rate": 2.3127030737928456e-05, "loss": 0.288, "step": 368900 }, { "epoch": 1.3679941276349643, "grad_norm": 1.0268464088439941, "learning_rate": 2.310223322810358e-05, "loss": 0.2849, "step": 369000 }, { "epoch": 1.3683648577508545, "grad_norm": 0.9434842467308044, "learning_rate": 2.3077445025106968e-05, "loss": 0.2655, "step": 369100 }, { "epoch": 1.3687355878667447, "grad_norm": 1.6043834686279297, "learning_rate": 2.3052666137515522e-05, "loss": 0.2955, "step": 369200 }, { "epoch": 1.369106317982635, "grad_norm": 1.280275583267212, "learning_rate": 2.3027896573902907e-05, "loss": 0.2974, "step": 369300 }, { "epoch": 1.3694770480985252, "grad_norm": 0.5113385915756226, "learning_rate": 2.300313634283961e-05, "loss": 0.2666, "step": 369400 }, { "epoch": 1.3698477782144154, "grad_norm": 1.048891544342041, "learning_rate": 2.2978385452892842e-05, "loss": 0.299, "step": 369500 }, { "epoch": 1.3702185083303058, "grad_norm": 0.8047981262207031, "learning_rate": 2.2953643912626576e-05, "loss": 0.2831, "step": 369600 }, { "epoch": 1.370589238446196, "grad_norm": 1.5819560289382935, "learning_rate": 2.2928911730601604e-05, "loss": 0.2776, "step": 369700 }, { "epoch": 1.3709599685620861, "grad_norm": 1.4913783073425293, "learning_rate": 2.2904188915375445e-05, "loss": 0.2929, "step": 369800 }, { "epoch": 1.3713306986779763, "grad_norm": 0.6968981623649597, "learning_rate": 2.2879475475502326e-05, "loss": 0.2685, "step": 369900 }, { "epoch": 1.3717014287938667, "grad_norm": 0.9078231453895569, "learning_rate": 2.285477141953334e-05, "loss": 0.311, "step": 370000 }, { "epoch": 1.3720721589097569, "grad_norm": 1.692625641822815, "learning_rate": 2.283007675601625e-05, "loss": 0.2694, "step": 370100 }, { "epoch": 1.372442889025647, "grad_norm": 1.7504152059555054, "learning_rate": 2.2805391493495577e-05, "loss": 0.3008, "step": 370200 }, { "epoch": 1.3728136191415374, "grad_norm": 0.8823074698448181, "learning_rate": 2.2780715640512645e-05, "loss": 0.286, "step": 370300 }, { "epoch": 1.3731843492574276, "grad_norm": 0.3838784694671631, "learning_rate": 2.275604920560546e-05, "loss": 0.2709, "step": 370400 }, { "epoch": 1.3735550793733178, "grad_norm": 0.8330515623092651, "learning_rate": 2.2731392197308805e-05, "loss": 0.2847, "step": 370500 }, { "epoch": 1.373925809489208, "grad_norm": 0.6513704657554626, "learning_rate": 2.2706744624154175e-05, "loss": 0.3027, "step": 370600 }, { "epoch": 1.3742965396050983, "grad_norm": 1.2567628622055054, "learning_rate": 2.2682106494669803e-05, "loss": 0.3307, "step": 370700 }, { "epoch": 1.3746672697209885, "grad_norm": 1.367504358291626, "learning_rate": 2.2657477817380707e-05, "loss": 0.2885, "step": 370800 }, { "epoch": 1.3750379998368787, "grad_norm": 1.072507619857788, "learning_rate": 2.2632858600808564e-05, "loss": 0.2659, "step": 370900 }, { "epoch": 1.375408729952769, "grad_norm": 1.023005485534668, "learning_rate": 2.2608248853471796e-05, "loss": 0.2748, "step": 371000 }, { "epoch": 1.3757794600686593, "grad_norm": 0.9052010178565979, "learning_rate": 2.2583648583885597e-05, "loss": 0.2977, "step": 371100 }, { "epoch": 1.3761501901845494, "grad_norm": 1.322707176208496, "learning_rate": 2.255905780056182e-05, "loss": 0.2886, "step": 371200 }, { "epoch": 1.3765209203004396, "grad_norm": 0.8049766421318054, "learning_rate": 2.2534476512009062e-05, "loss": 0.28, "step": 371300 }, { "epoch": 1.37689165041633, "grad_norm": 0.9666003584861755, "learning_rate": 2.250990472673264e-05, "loss": 0.2767, "step": 371400 }, { "epoch": 1.3772623805322202, "grad_norm": 0.6796230673789978, "learning_rate": 2.248534245323454e-05, "loss": 0.2766, "step": 371500 }, { "epoch": 1.3776331106481103, "grad_norm": 0.7117855548858643, "learning_rate": 2.246078970001355e-05, "loss": 0.2766, "step": 371600 }, { "epoch": 1.3780038407640007, "grad_norm": 0.818467915058136, "learning_rate": 2.243624647556507e-05, "loss": 0.3087, "step": 371700 }, { "epoch": 1.378374570879891, "grad_norm": 1.0551584959030151, "learning_rate": 2.2411712788381235e-05, "loss": 0.2816, "step": 371800 }, { "epoch": 1.378745300995781, "grad_norm": 0.950619101524353, "learning_rate": 2.2387188646950906e-05, "loss": 0.2971, "step": 371900 }, { "epoch": 1.3791160311116712, "grad_norm": 0.9362244606018066, "learning_rate": 2.2362674059759632e-05, "loss": 0.2788, "step": 372000 }, { "epoch": 1.3794867612275616, "grad_norm": 1.188486099243164, "learning_rate": 2.2338169035289575e-05, "loss": 0.2711, "step": 372100 }, { "epoch": 1.3798574913434518, "grad_norm": 0.8289785385131836, "learning_rate": 2.231367358201973e-05, "loss": 0.2698, "step": 372200 }, { "epoch": 1.380228221459342, "grad_norm": 1.3901516199111938, "learning_rate": 2.228918770842567e-05, "loss": 0.2813, "step": 372300 }, { "epoch": 1.3805989515752324, "grad_norm": 1.6378222703933716, "learning_rate": 2.2264711422979682e-05, "loss": 0.3062, "step": 372400 }, { "epoch": 1.3809696816911226, "grad_norm": 0.6736930012702942, "learning_rate": 2.224024473415078e-05, "loss": 0.2895, "step": 372500 }, { "epoch": 1.3813404118070127, "grad_norm": 0.9685173630714417, "learning_rate": 2.2215787650404595e-05, "loss": 0.3094, "step": 372600 }, { "epoch": 1.381711141922903, "grad_norm": 1.0597585439682007, "learning_rate": 2.2191340180203452e-05, "loss": 0.2838, "step": 372700 }, { "epoch": 1.382081872038793, "grad_norm": 0.43741700053215027, "learning_rate": 2.216690233200641e-05, "loss": 0.2811, "step": 372800 }, { "epoch": 1.3824526021546835, "grad_norm": 0.6119074821472168, "learning_rate": 2.2142474114269074e-05, "loss": 0.2913, "step": 372900 }, { "epoch": 1.3828233322705736, "grad_norm": 0.16466599702835083, "learning_rate": 2.2118055535443842e-05, "loss": 0.2748, "step": 373000 }, { "epoch": 1.383194062386464, "grad_norm": 0.51839679479599, "learning_rate": 2.2093646603979713e-05, "loss": 0.3135, "step": 373100 }, { "epoch": 1.3835647925023542, "grad_norm": 1.3252391815185547, "learning_rate": 2.2069247328322334e-05, "loss": 0.27, "step": 373200 }, { "epoch": 1.3839355226182444, "grad_norm": 0.8695566058158875, "learning_rate": 2.2044857716914076e-05, "loss": 0.2902, "step": 373300 }, { "epoch": 1.3843062527341345, "grad_norm": 0.8225809335708618, "learning_rate": 2.202047777819391e-05, "loss": 0.2728, "step": 373400 }, { "epoch": 1.3846769828500247, "grad_norm": 0.7643478512763977, "learning_rate": 2.1996107520597458e-05, "loss": 0.285, "step": 373500 }, { "epoch": 1.3850477129659151, "grad_norm": 1.397139549255371, "learning_rate": 2.1971746952557065e-05, "loss": 0.2754, "step": 373600 }, { "epoch": 1.3854184430818053, "grad_norm": 1.95527184009552, "learning_rate": 2.1947396082501594e-05, "loss": 0.2847, "step": 373700 }, { "epoch": 1.3857891731976957, "grad_norm": 0.6739106178283691, "learning_rate": 2.192305491885669e-05, "loss": 0.2888, "step": 373800 }, { "epoch": 1.3861599033135859, "grad_norm": 1.2186832427978516, "learning_rate": 2.1898723470044546e-05, "loss": 0.296, "step": 373900 }, { "epoch": 1.386530633429476, "grad_norm": 0.9028012156486511, "learning_rate": 2.1874401744484025e-05, "loss": 0.2908, "step": 374000 }, { "epoch": 1.3869013635453662, "grad_norm": 1.2045128345489502, "learning_rate": 2.1850089750590646e-05, "loss": 0.2986, "step": 374100 }, { "epoch": 1.3872720936612564, "grad_norm": 0.8587411642074585, "learning_rate": 2.1825787496776534e-05, "loss": 0.2779, "step": 374200 }, { "epoch": 1.3876428237771468, "grad_norm": 0.8167155981063843, "learning_rate": 2.1801494991450443e-05, "loss": 0.2741, "step": 374300 }, { "epoch": 1.388013553893037, "grad_norm": 0.6938347816467285, "learning_rate": 2.177721224301777e-05, "loss": 0.2648, "step": 374400 }, { "epoch": 1.388384284008927, "grad_norm": 1.0391590595245361, "learning_rate": 2.1752939259880516e-05, "loss": 0.2901, "step": 374500 }, { "epoch": 1.3887550141248175, "grad_norm": 1.3935176134109497, "learning_rate": 2.1728676050437304e-05, "loss": 0.2966, "step": 374600 }, { "epoch": 1.3891257442407077, "grad_norm": 1.7053841352462769, "learning_rate": 2.1704422623083415e-05, "loss": 0.2832, "step": 374700 }, { "epoch": 1.3894964743565978, "grad_norm": 1.7305458784103394, "learning_rate": 2.1680178986210702e-05, "loss": 0.3014, "step": 374800 }, { "epoch": 1.389867204472488, "grad_norm": 0.9196724891662598, "learning_rate": 2.1655945148207635e-05, "loss": 0.298, "step": 374900 }, { "epoch": 1.3902379345883784, "grad_norm": 0.4340369999408722, "learning_rate": 2.1631721117459325e-05, "loss": 0.263, "step": 375000 }, { "epoch": 1.3906086647042686, "grad_norm": 1.2007790803909302, "learning_rate": 2.1607506902347457e-05, "loss": 0.2704, "step": 375100 }, { "epoch": 1.3909793948201588, "grad_norm": 0.7974711060523987, "learning_rate": 2.158330251125033e-05, "loss": 0.2885, "step": 375200 }, { "epoch": 1.3913501249360491, "grad_norm": 1.2964855432510376, "learning_rate": 2.1559107952542845e-05, "loss": 0.269, "step": 375300 }, { "epoch": 1.3917208550519393, "grad_norm": 0.7662702798843384, "learning_rate": 2.1534923234596477e-05, "loss": 0.2964, "step": 375400 }, { "epoch": 1.3920915851678295, "grad_norm": 0.33518901467323303, "learning_rate": 2.151074836577936e-05, "loss": 0.2944, "step": 375500 }, { "epoch": 1.3924623152837197, "grad_norm": 0.6659185886383057, "learning_rate": 2.148658335445616e-05, "loss": 0.2674, "step": 375600 }, { "epoch": 1.39283304539961, "grad_norm": 0.8077853322029114, "learning_rate": 2.1462428208988127e-05, "loss": 0.251, "step": 375700 }, { "epoch": 1.3932037755155002, "grad_norm": 0.8331159353256226, "learning_rate": 2.1438282937733173e-05, "loss": 0.2826, "step": 375800 }, { "epoch": 1.3935745056313904, "grad_norm": 0.6098957061767578, "learning_rate": 2.141414754904571e-05, "loss": 0.2865, "step": 375900 }, { "epoch": 1.3939452357472808, "grad_norm": 0.7920171618461609, "learning_rate": 2.139002205127677e-05, "loss": 0.287, "step": 376000 }, { "epoch": 1.394315965863171, "grad_norm": 0.8996031284332275, "learning_rate": 2.1365906452773955e-05, "loss": 0.2762, "step": 376100 }, { "epoch": 1.3946866959790611, "grad_norm": 0.30785802006721497, "learning_rate": 2.134180076188142e-05, "loss": 0.2595, "step": 376200 }, { "epoch": 1.3950574260949513, "grad_norm": 0.9265574216842651, "learning_rate": 2.1317704986939953e-05, "loss": 0.2558, "step": 376300 }, { "epoch": 1.3954281562108417, "grad_norm": 0.5372949242591858, "learning_rate": 2.1293619136286853e-05, "loss": 0.2802, "step": 376400 }, { "epoch": 1.3957988863267319, "grad_norm": 0.5697913765907288, "learning_rate": 2.1269543218255987e-05, "loss": 0.2706, "step": 376500 }, { "epoch": 1.396169616442622, "grad_norm": 0.7540616393089294, "learning_rate": 2.124547724117786e-05, "loss": 0.2706, "step": 376600 }, { "epoch": 1.3965403465585124, "grad_norm": 0.570479691028595, "learning_rate": 2.122142121337942e-05, "loss": 0.2901, "step": 376700 }, { "epoch": 1.3969110766744026, "grad_norm": 0.6511395573616028, "learning_rate": 2.1197375143184238e-05, "loss": 0.3003, "step": 376800 }, { "epoch": 1.3972818067902928, "grad_norm": 0.31400492787361145, "learning_rate": 2.1173339038912466e-05, "loss": 0.2766, "step": 376900 }, { "epoch": 1.397652536906183, "grad_norm": 0.36280557513237, "learning_rate": 2.1149312908880735e-05, "loss": 0.2945, "step": 377000 }, { "epoch": 1.3980232670220731, "grad_norm": 1.499492883682251, "learning_rate": 2.1125296761402312e-05, "loss": 0.2871, "step": 377100 }, { "epoch": 1.3983939971379635, "grad_norm": 1.2666056156158447, "learning_rate": 2.1101290604786945e-05, "loss": 0.2734, "step": 377200 }, { "epoch": 1.3987647272538537, "grad_norm": 0.47935983538627625, "learning_rate": 2.1077294447340918e-05, "loss": 0.279, "step": 377300 }, { "epoch": 1.399135457369744, "grad_norm": 0.4764808416366577, "learning_rate": 2.1053308297367134e-05, "loss": 0.2763, "step": 377400 }, { "epoch": 1.3995061874856343, "grad_norm": 0.8976060152053833, "learning_rate": 2.1029332163164932e-05, "loss": 0.2857, "step": 377500 }, { "epoch": 1.3998769176015244, "grad_norm": 0.6579723954200745, "learning_rate": 2.1005366053030233e-05, "loss": 0.285, "step": 377600 }, { "epoch": 1.4002476477174146, "grad_norm": 0.8446161150932312, "learning_rate": 2.0981409975255524e-05, "loss": 0.3074, "step": 377700 }, { "epoch": 1.4006183778333048, "grad_norm": 1.1573541164398193, "learning_rate": 2.0957463938129768e-05, "loss": 0.281, "step": 377800 }, { "epoch": 1.4009891079491952, "grad_norm": 0.7784209847450256, "learning_rate": 2.0933527949938457e-05, "loss": 0.283, "step": 377900 }, { "epoch": 1.4013598380650854, "grad_norm": 0.8739211559295654, "learning_rate": 2.0909602018963658e-05, "loss": 0.3025, "step": 378000 }, { "epoch": 1.4017305681809757, "grad_norm": 0.6080581545829773, "learning_rate": 2.0885686153483902e-05, "loss": 0.2883, "step": 378100 }, { "epoch": 1.402101298296866, "grad_norm": 0.7567920684814453, "learning_rate": 2.086178036177426e-05, "loss": 0.2944, "step": 378200 }, { "epoch": 1.402472028412756, "grad_norm": 1.1335183382034302, "learning_rate": 2.083788465210631e-05, "loss": 0.2879, "step": 378300 }, { "epoch": 1.4028427585286463, "grad_norm": 0.5818317532539368, "learning_rate": 2.081399903274813e-05, "loss": 0.2799, "step": 378400 }, { "epoch": 1.4032134886445364, "grad_norm": 1.061147689819336, "learning_rate": 2.0790123511964364e-05, "loss": 0.2555, "step": 378500 }, { "epoch": 1.4035842187604268, "grad_norm": 0.5608941316604614, "learning_rate": 2.0766258098016094e-05, "loss": 0.3156, "step": 378600 }, { "epoch": 1.403954948876317, "grad_norm": 0.8161286115646362, "learning_rate": 2.074240279916092e-05, "loss": 0.2962, "step": 378700 }, { "epoch": 1.4043256789922072, "grad_norm": 1.962334156036377, "learning_rate": 2.071855762365299e-05, "loss": 0.2757, "step": 378800 }, { "epoch": 1.4046964091080976, "grad_norm": 0.886726975440979, "learning_rate": 2.0694722579742885e-05, "loss": 0.2708, "step": 378900 }, { "epoch": 1.4050671392239877, "grad_norm": 0.529423713684082, "learning_rate": 2.067089767567772e-05, "loss": 0.2889, "step": 379000 }, { "epoch": 1.405437869339878, "grad_norm": 1.1950204372406006, "learning_rate": 2.0647082919701085e-05, "loss": 0.2695, "step": 379100 }, { "epoch": 1.405808599455768, "grad_norm": 0.6384178996086121, "learning_rate": 2.0623278320053045e-05, "loss": 0.2945, "step": 379200 }, { "epoch": 1.4061793295716585, "grad_norm": 1.3595060110092163, "learning_rate": 2.0599483884970206e-05, "loss": 0.2814, "step": 379300 }, { "epoch": 1.4065500596875486, "grad_norm": 0.9158879518508911, "learning_rate": 2.0575699622685603e-05, "loss": 0.2843, "step": 379400 }, { "epoch": 1.4069207898034388, "grad_norm": 1.3006573915481567, "learning_rate": 2.0551925541428746e-05, "loss": 0.2804, "step": 379500 }, { "epoch": 1.4072915199193292, "grad_norm": 1.4103118181228638, "learning_rate": 2.0528161649425688e-05, "loss": 0.2622, "step": 379600 }, { "epoch": 1.4076622500352194, "grad_norm": 0.8525390028953552, "learning_rate": 2.0504407954898912e-05, "loss": 0.2835, "step": 379700 }, { "epoch": 1.4080329801511096, "grad_norm": 1.0117833614349365, "learning_rate": 2.0480664466067313e-05, "loss": 0.2907, "step": 379800 }, { "epoch": 1.4084037102669997, "grad_norm": 2.1972544193267822, "learning_rate": 2.0456931191146377e-05, "loss": 0.2695, "step": 379900 }, { "epoch": 1.4087744403828901, "grad_norm": 1.4235129356384277, "learning_rate": 2.043320813834797e-05, "loss": 0.2734, "step": 380000 }, { "epoch": 1.4091451704987803, "grad_norm": 1.1610021591186523, "learning_rate": 2.0409495315880434e-05, "loss": 0.2603, "step": 380100 }, { "epoch": 1.4095159006146705, "grad_norm": 3.0940966606140137, "learning_rate": 2.0385792731948616e-05, "loss": 0.292, "step": 380200 }, { "epoch": 1.4098866307305609, "grad_norm": 0.6424523591995239, "learning_rate": 2.0362100394753765e-05, "loss": 0.3004, "step": 380300 }, { "epoch": 1.410257360846451, "grad_norm": 0.8126674890518188, "learning_rate": 2.0338418312493613e-05, "loss": 0.3095, "step": 380400 }, { "epoch": 1.4106280909623412, "grad_norm": 1.2360458374023438, "learning_rate": 2.0314746493362336e-05, "loss": 0.3156, "step": 380500 }, { "epoch": 1.4109988210782314, "grad_norm": 2.006652355194092, "learning_rate": 2.0291084945550535e-05, "loss": 0.2659, "step": 380600 }, { "epoch": 1.4113695511941218, "grad_norm": 1.4564369916915894, "learning_rate": 2.0267433677245334e-05, "loss": 0.2739, "step": 380700 }, { "epoch": 1.411740281310012, "grad_norm": 3.392552375793457, "learning_rate": 2.0243792696630215e-05, "loss": 0.2973, "step": 380800 }, { "epoch": 1.4121110114259021, "grad_norm": 0.4590910077095032, "learning_rate": 2.0220162011885125e-05, "loss": 0.2764, "step": 380900 }, { "epoch": 1.4124817415417925, "grad_norm": 3.6380655765533447, "learning_rate": 2.019654163118649e-05, "loss": 0.255, "step": 381000 }, { "epoch": 1.4128524716576827, "grad_norm": 1.9104173183441162, "learning_rate": 2.017293156270712e-05, "loss": 0.2869, "step": 381100 }, { "epoch": 1.4132232017735729, "grad_norm": 1.0982208251953125, "learning_rate": 2.0149331814616273e-05, "loss": 0.2628, "step": 381200 }, { "epoch": 1.413593931889463, "grad_norm": 0.6330453157424927, "learning_rate": 2.0125742395079644e-05, "loss": 0.2745, "step": 381300 }, { "epoch": 1.4139646620053534, "grad_norm": 0.4961959719657898, "learning_rate": 2.0102163312259327e-05, "loss": 0.2682, "step": 381400 }, { "epoch": 1.4143353921212436, "grad_norm": 1.194434404373169, "learning_rate": 2.0078594574313896e-05, "loss": 0.2863, "step": 381500 }, { "epoch": 1.4147061222371338, "grad_norm": 1.3436322212219238, "learning_rate": 2.0055036189398297e-05, "loss": 0.3, "step": 381600 }, { "epoch": 1.4150768523530242, "grad_norm": 1.1230812072753906, "learning_rate": 2.003148816566388e-05, "loss": 0.2834, "step": 381700 }, { "epoch": 1.4154475824689143, "grad_norm": 2.093935251235962, "learning_rate": 2.0007950511258484e-05, "loss": 0.2974, "step": 381800 }, { "epoch": 1.4158183125848045, "grad_norm": 0.6303021311759949, "learning_rate": 1.9984423234326282e-05, "loss": 0.2765, "step": 381900 }, { "epoch": 1.4161890427006947, "grad_norm": 0.9276920557022095, "learning_rate": 1.9960906343007907e-05, "loss": 0.2722, "step": 382000 }, { "epoch": 1.4165597728165849, "grad_norm": 1.561571717262268, "learning_rate": 1.9937399845440356e-05, "loss": 0.2779, "step": 382100 }, { "epoch": 1.4169305029324752, "grad_norm": 1.3023698329925537, "learning_rate": 1.9913903749757072e-05, "loss": 0.2784, "step": 382200 }, { "epoch": 1.4173012330483654, "grad_norm": 1.3516894578933716, "learning_rate": 1.9890418064087856e-05, "loss": 0.2776, "step": 382300 }, { "epoch": 1.4176719631642558, "grad_norm": 1.1279276609420776, "learning_rate": 1.986694279655896e-05, "loss": 0.2716, "step": 382400 }, { "epoch": 1.418042693280146, "grad_norm": 0.36618945002555847, "learning_rate": 1.9843477955292995e-05, "loss": 0.2732, "step": 382500 }, { "epoch": 1.4184134233960362, "grad_norm": 0.5780854821205139, "learning_rate": 1.982002354840894e-05, "loss": 0.2805, "step": 382600 }, { "epoch": 1.4187841535119263, "grad_norm": 0.4307127594947815, "learning_rate": 1.9796579584022263e-05, "loss": 0.2835, "step": 382700 }, { "epoch": 1.4191548836278165, "grad_norm": 1.1192848682403564, "learning_rate": 1.977314607024467e-05, "loss": 0.2666, "step": 382800 }, { "epoch": 1.419525613743707, "grad_norm": 1.16902494430542, "learning_rate": 1.9749723015184385e-05, "loss": 0.2492, "step": 382900 }, { "epoch": 1.419896343859597, "grad_norm": 1.3545572757720947, "learning_rate": 1.9726310426945943e-05, "loss": 0.2937, "step": 383000 }, { "epoch": 1.4202670739754875, "grad_norm": 0.528661847114563, "learning_rate": 1.9702908313630265e-05, "loss": 0.2434, "step": 383100 }, { "epoch": 1.4206378040913776, "grad_norm": 0.43843919038772583, "learning_rate": 1.9679516683334686e-05, "loss": 0.2815, "step": 383200 }, { "epoch": 1.4210085342072678, "grad_norm": 0.86357182264328, "learning_rate": 1.9656135544152866e-05, "loss": 0.2865, "step": 383300 }, { "epoch": 1.421379264323158, "grad_norm": 1.216890573501587, "learning_rate": 1.9632764904174834e-05, "loss": 0.2956, "step": 383400 }, { "epoch": 1.4217499944390481, "grad_norm": 0.46760377287864685, "learning_rate": 1.9609404771487068e-05, "loss": 0.2799, "step": 383500 }, { "epoch": 1.4221207245549385, "grad_norm": 1.9394701719284058, "learning_rate": 1.9586055154172267e-05, "loss": 0.2568, "step": 383600 }, { "epoch": 1.4224914546708287, "grad_norm": 0.6849600672721863, "learning_rate": 1.956271606030964e-05, "loss": 0.2796, "step": 383700 }, { "epoch": 1.4228621847867189, "grad_norm": 0.18819165229797363, "learning_rate": 1.9539387497974653e-05, "loss": 0.3059, "step": 383800 }, { "epoch": 1.4232329149026093, "grad_norm": 0.6270031332969666, "learning_rate": 1.9516069475239158e-05, "loss": 0.2839, "step": 383900 }, { "epoch": 1.4236036450184995, "grad_norm": 1.3605588674545288, "learning_rate": 1.9492762000171396e-05, "loss": 0.2753, "step": 384000 }, { "epoch": 1.4239743751343896, "grad_norm": 0.6797065138816833, "learning_rate": 1.9469465080835906e-05, "loss": 0.2668, "step": 384100 }, { "epoch": 1.4243451052502798, "grad_norm": 0.5450316071510315, "learning_rate": 1.9446178725293608e-05, "loss": 0.2656, "step": 384200 }, { "epoch": 1.4247158353661702, "grad_norm": 0.7463541626930237, "learning_rate": 1.942290294160174e-05, "loss": 0.2851, "step": 384300 }, { "epoch": 1.4250865654820604, "grad_norm": 0.9292182326316833, "learning_rate": 1.9399637737813908e-05, "loss": 0.2565, "step": 384400 }, { "epoch": 1.4254572955979505, "grad_norm": 0.8238021731376648, "learning_rate": 1.9376383121980025e-05, "loss": 0.2708, "step": 384500 }, { "epoch": 1.425828025713841, "grad_norm": 2.2298994064331055, "learning_rate": 1.9353139102146396e-05, "loss": 0.2882, "step": 384600 }, { "epoch": 1.426198755829731, "grad_norm": 0.8514468669891357, "learning_rate": 1.9329905686355605e-05, "loss": 0.2777, "step": 384700 }, { "epoch": 1.4265694859456213, "grad_norm": 0.28486981987953186, "learning_rate": 1.9306682882646583e-05, "loss": 0.2589, "step": 384800 }, { "epoch": 1.4269402160615114, "grad_norm": 0.7669302225112915, "learning_rate": 1.928347069905461e-05, "loss": 0.2689, "step": 384900 }, { "epoch": 1.4273109461774018, "grad_norm": 1.1747255325317383, "learning_rate": 1.926026914361127e-05, "loss": 0.2623, "step": 385000 }, { "epoch": 1.427681676293292, "grad_norm": 0.6421029567718506, "learning_rate": 1.9237078224344475e-05, "loss": 0.2509, "step": 385100 }, { "epoch": 1.4280524064091822, "grad_norm": 0.46416208148002625, "learning_rate": 1.9213897949278454e-05, "loss": 0.2896, "step": 385200 }, { "epoch": 1.4284231365250726, "grad_norm": 1.2954211235046387, "learning_rate": 1.9190728326433734e-05, "loss": 0.3126, "step": 385300 }, { "epoch": 1.4287938666409628, "grad_norm": 0.758083164691925, "learning_rate": 1.9167569363827208e-05, "loss": 0.3005, "step": 385400 }, { "epoch": 1.429164596756853, "grad_norm": 1.2561029195785522, "learning_rate": 1.9144421069472053e-05, "loss": 0.3126, "step": 385500 }, { "epoch": 1.429535326872743, "grad_norm": 1.0495526790618896, "learning_rate": 1.9121283451377708e-05, "loss": 0.2854, "step": 385600 }, { "epoch": 1.4299060569886335, "grad_norm": 0.541677713394165, "learning_rate": 1.909815651755002e-05, "loss": 0.2839, "step": 385700 }, { "epoch": 1.4302767871045237, "grad_norm": 0.4058993458747864, "learning_rate": 1.907504027599105e-05, "loss": 0.2945, "step": 385800 }, { "epoch": 1.4306475172204138, "grad_norm": 1.4192376136779785, "learning_rate": 1.9051934734699197e-05, "loss": 0.2854, "step": 385900 }, { "epoch": 1.4310182473363042, "grad_norm": 0.48552393913269043, "learning_rate": 1.9028839901669137e-05, "loss": 0.2814, "step": 386000 }, { "epoch": 1.4313889774521944, "grad_norm": 0.5726983547210693, "learning_rate": 1.900575578489186e-05, "loss": 0.2635, "step": 386100 }, { "epoch": 1.4317597075680846, "grad_norm": 0.7665042281150818, "learning_rate": 1.8982682392354654e-05, "loss": 0.2804, "step": 386200 }, { "epoch": 1.4321304376839747, "grad_norm": 1.2182258367538452, "learning_rate": 1.895961973204109e-05, "loss": 0.2821, "step": 386300 }, { "epoch": 1.432501167799865, "grad_norm": 0.6185856461524963, "learning_rate": 1.8936567811930988e-05, "loss": 0.2819, "step": 386400 }, { "epoch": 1.4328718979157553, "grad_norm": 0.9906799793243408, "learning_rate": 1.891352664000054e-05, "loss": 0.2747, "step": 386500 }, { "epoch": 1.4332426280316455, "grad_norm": 2.0001142024993896, "learning_rate": 1.8890496224222114e-05, "loss": 0.2832, "step": 386600 }, { "epoch": 1.4336133581475359, "grad_norm": 1.1457252502441406, "learning_rate": 1.886747657256441e-05, "loss": 0.27, "step": 386700 }, { "epoch": 1.433984088263426, "grad_norm": 1.3189849853515625, "learning_rate": 1.8844467692992425e-05, "loss": 0.3049, "step": 386800 }, { "epoch": 1.4343548183793162, "grad_norm": 2.045835494995117, "learning_rate": 1.8821469593467388e-05, "loss": 0.2565, "step": 386900 }, { "epoch": 1.4347255484952064, "grad_norm": 1.550140619277954, "learning_rate": 1.87984822819468e-05, "loss": 0.3113, "step": 387000 }, { "epoch": 1.4350962786110966, "grad_norm": 0.8428137302398682, "learning_rate": 1.877550576638447e-05, "loss": 0.2782, "step": 387100 }, { "epoch": 1.435467008726987, "grad_norm": 0.42004290223121643, "learning_rate": 1.8752540054730437e-05, "loss": 0.2666, "step": 387200 }, { "epoch": 1.4358377388428771, "grad_norm": 1.1244804859161377, "learning_rate": 1.8729585154931006e-05, "loss": 0.3032, "step": 387300 }, { "epoch": 1.4362084689587675, "grad_norm": 0.4709664583206177, "learning_rate": 1.8706641074928738e-05, "loss": 0.2798, "step": 387400 }, { "epoch": 1.4365791990746577, "grad_norm": 0.7836841940879822, "learning_rate": 1.8683707822662445e-05, "loss": 0.2594, "step": 387500 }, { "epoch": 1.4369499291905479, "grad_norm": 0.8597096800804138, "learning_rate": 1.8660785406067234e-05, "loss": 0.2782, "step": 387600 }, { "epoch": 1.437320659306438, "grad_norm": 0.8820332288742065, "learning_rate": 1.8637873833074416e-05, "loss": 0.2755, "step": 387700 }, { "epoch": 1.4376913894223282, "grad_norm": 0.432829886674881, "learning_rate": 1.861497311161155e-05, "loss": 0.2715, "step": 387800 }, { "epoch": 1.4380621195382186, "grad_norm": 2.5472452640533447, "learning_rate": 1.8592083249602488e-05, "loss": 0.2962, "step": 387900 }, { "epoch": 1.4384328496541088, "grad_norm": 1.138940691947937, "learning_rate": 1.856920425496728e-05, "loss": 0.2919, "step": 388000 }, { "epoch": 1.4388035797699992, "grad_norm": 0.8653557300567627, "learning_rate": 1.8546336135622234e-05, "loss": 0.2555, "step": 388100 }, { "epoch": 1.4391743098858893, "grad_norm": 0.682138204574585, "learning_rate": 1.852347889947988e-05, "loss": 0.2468, "step": 388200 }, { "epoch": 1.4395450400017795, "grad_norm": 1.3574106693267822, "learning_rate": 1.8500632554448978e-05, "loss": 0.2707, "step": 388300 }, { "epoch": 1.4399157701176697, "grad_norm": 0.777789294719696, "learning_rate": 1.847779710843457e-05, "loss": 0.275, "step": 388400 }, { "epoch": 1.4402865002335599, "grad_norm": 0.8391427993774414, "learning_rate": 1.8454972569337877e-05, "loss": 0.2749, "step": 388500 }, { "epoch": 1.4406572303494503, "grad_norm": 0.7181114554405212, "learning_rate": 1.8432158945056338e-05, "loss": 0.2833, "step": 388600 }, { "epoch": 1.4410279604653404, "grad_norm": 1.6038129329681396, "learning_rate": 1.8409356243483673e-05, "loss": 0.2864, "step": 388700 }, { "epoch": 1.4413986905812306, "grad_norm": 1.1341460943222046, "learning_rate": 1.838656447250977e-05, "loss": 0.3006, "step": 388800 }, { "epoch": 1.441769420697121, "grad_norm": 2.0402190685272217, "learning_rate": 1.8363783640020753e-05, "loss": 0.2735, "step": 388900 }, { "epoch": 1.4421401508130112, "grad_norm": 0.9192044138908386, "learning_rate": 1.8341013753898962e-05, "loss": 0.278, "step": 389000 }, { "epoch": 1.4425108809289013, "grad_norm": 0.877964973449707, "learning_rate": 1.8318254822022922e-05, "loss": 0.2734, "step": 389100 }, { "epoch": 1.4428816110447915, "grad_norm": 0.8374612927436829, "learning_rate": 1.8295506852267436e-05, "loss": 0.2913, "step": 389200 }, { "epoch": 1.443252341160682, "grad_norm": 0.7347968220710754, "learning_rate": 1.8272769852503456e-05, "loss": 0.2892, "step": 389300 }, { "epoch": 1.443623071276572, "grad_norm": 0.9862411618232727, "learning_rate": 1.825004383059813e-05, "loss": 0.2546, "step": 389400 }, { "epoch": 1.4439938013924623, "grad_norm": 0.8705859184265137, "learning_rate": 1.822732879441487e-05, "loss": 0.3035, "step": 389500 }, { "epoch": 1.4443645315083526, "grad_norm": 0.7827274203300476, "learning_rate": 1.8204624751813248e-05, "loss": 0.2812, "step": 389600 }, { "epoch": 1.4447352616242428, "grad_norm": 0.8861241340637207, "learning_rate": 1.8181931710648975e-05, "loss": 0.2843, "step": 389700 }, { "epoch": 1.445105991740133, "grad_norm": 0.7627475261688232, "learning_rate": 1.8159249678774075e-05, "loss": 0.281, "step": 389800 }, { "epoch": 1.4454767218560232, "grad_norm": 0.7889427542686462, "learning_rate": 1.8136578664036675e-05, "loss": 0.2919, "step": 389900 }, { "epoch": 1.4458474519719136, "grad_norm": 0.8166193962097168, "learning_rate": 1.8113918674281098e-05, "loss": 0.305, "step": 390000 }, { "epoch": 1.4462181820878037, "grad_norm": 0.35234376788139343, "learning_rate": 1.809126971734791e-05, "loss": 0.2882, "step": 390100 }, { "epoch": 1.446588912203694, "grad_norm": 0.4679870009422302, "learning_rate": 1.80686318010738e-05, "loss": 0.2883, "step": 390200 }, { "epoch": 1.4469596423195843, "grad_norm": 0.6594972014427185, "learning_rate": 1.804600493329165e-05, "loss": 0.2706, "step": 390300 }, { "epoch": 1.4473303724354745, "grad_norm": 2.5387253761291504, "learning_rate": 1.8023389121830532e-05, "loss": 0.3014, "step": 390400 }, { "epoch": 1.4477011025513646, "grad_norm": 0.9216254949569702, "learning_rate": 1.8000784374515667e-05, "loss": 0.2665, "step": 390500 }, { "epoch": 1.4480718326672548, "grad_norm": 1.3130618333816528, "learning_rate": 1.7978190699168502e-05, "loss": 0.2993, "step": 390600 }, { "epoch": 1.4484425627831452, "grad_norm": 0.7715280055999756, "learning_rate": 1.7955608103606603e-05, "loss": 0.2919, "step": 390700 }, { "epoch": 1.4488132928990354, "grad_norm": 0.9507124423980713, "learning_rate": 1.7933036595643687e-05, "loss": 0.3058, "step": 390800 }, { "epoch": 1.4491840230149255, "grad_norm": 0.35857313871383667, "learning_rate": 1.7910476183089714e-05, "loss": 0.3004, "step": 390900 }, { "epoch": 1.449554753130816, "grad_norm": 0.27257442474365234, "learning_rate": 1.7887926873750728e-05, "loss": 0.3039, "step": 391000 }, { "epoch": 1.4499254832467061, "grad_norm": 1.021169900894165, "learning_rate": 1.7865388675428962e-05, "loss": 0.2825, "step": 391100 }, { "epoch": 1.4502962133625963, "grad_norm": 0.7640347480773926, "learning_rate": 1.7842861595922805e-05, "loss": 0.2758, "step": 391200 }, { "epoch": 1.4506669434784865, "grad_norm": 0.5371448993682861, "learning_rate": 1.782034564302677e-05, "loss": 0.2836, "step": 391300 }, { "epoch": 1.4510376735943766, "grad_norm": 1.2235608100891113, "learning_rate": 1.7797840824531585e-05, "loss": 0.2819, "step": 391400 }, { "epoch": 1.451408403710267, "grad_norm": 0.5800812840461731, "learning_rate": 1.7775347148224065e-05, "loss": 0.2987, "step": 391500 }, { "epoch": 1.4517791338261572, "grad_norm": 0.602894127368927, "learning_rate": 1.775286462188717e-05, "loss": 0.2833, "step": 391600 }, { "epoch": 1.4521498639420476, "grad_norm": 1.5138332843780518, "learning_rate": 1.7730393253300066e-05, "loss": 0.2994, "step": 391700 }, { "epoch": 1.4525205940579378, "grad_norm": 1.0034348964691162, "learning_rate": 1.770793305023799e-05, "loss": 0.2856, "step": 391800 }, { "epoch": 1.452891324173828, "grad_norm": 0.7375285625457764, "learning_rate": 1.7685484020472344e-05, "loss": 0.2684, "step": 391900 }, { "epoch": 1.453262054289718, "grad_norm": 0.8064634799957275, "learning_rate": 1.7663046171770654e-05, "loss": 0.2945, "step": 392000 }, { "epoch": 1.4536327844056083, "grad_norm": 2.0045206546783447, "learning_rate": 1.764061951189659e-05, "loss": 0.2896, "step": 392100 }, { "epoch": 1.4540035145214987, "grad_norm": 0.667773962020874, "learning_rate": 1.761820404860992e-05, "loss": 0.2781, "step": 392200 }, { "epoch": 1.4543742446373888, "grad_norm": 0.24558934569358826, "learning_rate": 1.7595799789666607e-05, "loss": 0.274, "step": 392300 }, { "epoch": 1.4547449747532792, "grad_norm": 1.019173502922058, "learning_rate": 1.757340674281866e-05, "loss": 0.2565, "step": 392400 }, { "epoch": 1.4551157048691694, "grad_norm": 1.8485286235809326, "learning_rate": 1.7551024915814236e-05, "loss": 0.2935, "step": 392500 }, { "epoch": 1.4554864349850596, "grad_norm": 0.841194748878479, "learning_rate": 1.752865431639766e-05, "loss": 0.274, "step": 392600 }, { "epoch": 1.4558571651009498, "grad_norm": 0.9467856287956238, "learning_rate": 1.7506294952309255e-05, "loss": 0.2753, "step": 392700 }, { "epoch": 1.45622789521684, "grad_norm": 0.6465731263160706, "learning_rate": 1.7483946831285585e-05, "loss": 0.2806, "step": 392800 }, { "epoch": 1.4565986253327303, "grad_norm": 1.3485413789749146, "learning_rate": 1.746160996105925e-05, "loss": 0.2542, "step": 392900 }, { "epoch": 1.4569693554486205, "grad_norm": 2.7005131244659424, "learning_rate": 1.7439284349358953e-05, "loss": 0.2852, "step": 393000 }, { "epoch": 1.4573400855645107, "grad_norm": 0.48463740944862366, "learning_rate": 1.7416970003909556e-05, "loss": 0.2824, "step": 393100 }, { "epoch": 1.457710815680401, "grad_norm": 0.8820876479148865, "learning_rate": 1.739466693243198e-05, "loss": 0.2577, "step": 393200 }, { "epoch": 1.4580815457962912, "grad_norm": 1.1738495826721191, "learning_rate": 1.7372375142643226e-05, "loss": 0.2748, "step": 393300 }, { "epoch": 1.4584522759121814, "grad_norm": 0.6542091369628906, "learning_rate": 1.735009464225648e-05, "loss": 0.284, "step": 393400 }, { "epoch": 1.4588230060280716, "grad_norm": 0.9092134833335876, "learning_rate": 1.7327825438980895e-05, "loss": 0.295, "step": 393500 }, { "epoch": 1.459193736143962, "grad_norm": 0.7979613542556763, "learning_rate": 1.7305567540521823e-05, "loss": 0.2725, "step": 393600 }, { "epoch": 1.4595644662598521, "grad_norm": 0.3844234347343445, "learning_rate": 1.7283320954580652e-05, "loss": 0.3149, "step": 393700 }, { "epoch": 1.4599351963757423, "grad_norm": 1.287514090538025, "learning_rate": 1.726108568885485e-05, "loss": 0.2744, "step": 393800 }, { "epoch": 1.4603059264916327, "grad_norm": 0.49437612295150757, "learning_rate": 1.7238861751038014e-05, "loss": 0.2749, "step": 393900 }, { "epoch": 1.4606766566075229, "grad_norm": 0.38660210371017456, "learning_rate": 1.7216649148819787e-05, "loss": 0.2805, "step": 394000 }, { "epoch": 1.461047386723413, "grad_norm": 0.5464182496070862, "learning_rate": 1.7194447889885883e-05, "loss": 0.2633, "step": 394100 }, { "epoch": 1.4614181168393032, "grad_norm": 1.1514973640441895, "learning_rate": 1.717225798191811e-05, "loss": 0.3024, "step": 394200 }, { "epoch": 1.4617888469551936, "grad_norm": 0.8899003863334656, "learning_rate": 1.715007943259434e-05, "loss": 0.2833, "step": 394300 }, { "epoch": 1.4621595770710838, "grad_norm": 0.3626788854598999, "learning_rate": 1.7127912249588502e-05, "loss": 0.2534, "step": 394400 }, { "epoch": 1.462530307186974, "grad_norm": 1.1622650623321533, "learning_rate": 1.7105756440570636e-05, "loss": 0.2871, "step": 394500 }, { "epoch": 1.4629010373028644, "grad_norm": 0.8765597343444824, "learning_rate": 1.7083612013206805e-05, "loss": 0.2728, "step": 394600 }, { "epoch": 1.4632717674187545, "grad_norm": 0.583191454410553, "learning_rate": 1.7061478975159134e-05, "loss": 0.2625, "step": 394700 }, { "epoch": 1.4636424975346447, "grad_norm": 1.3766921758651733, "learning_rate": 1.703935733408584e-05, "loss": 0.2767, "step": 394800 }, { "epoch": 1.4640132276505349, "grad_norm": 0.7086800932884216, "learning_rate": 1.701724709764117e-05, "loss": 0.2613, "step": 394900 }, { "epoch": 1.4643839577664253, "grad_norm": 0.5653570890426636, "learning_rate": 1.6995148273475426e-05, "loss": 0.2868, "step": 395000 }, { "epoch": 1.4647546878823154, "grad_norm": 1.4400379657745361, "learning_rate": 1.697306086923497e-05, "loss": 0.2868, "step": 395100 }, { "epoch": 1.4651254179982056, "grad_norm": 1.4871832132339478, "learning_rate": 1.6950984892562188e-05, "loss": 0.2564, "step": 395200 }, { "epoch": 1.465496148114096, "grad_norm": 1.6570520401000977, "learning_rate": 1.6928920351095568e-05, "loss": 0.2523, "step": 395300 }, { "epoch": 1.4658668782299862, "grad_norm": 0.18814845383167267, "learning_rate": 1.6906867252469593e-05, "loss": 0.2563, "step": 395400 }, { "epoch": 1.4662376083458764, "grad_norm": 1.0166471004486084, "learning_rate": 1.6884825604314775e-05, "loss": 0.2767, "step": 395500 }, { "epoch": 1.4666083384617665, "grad_norm": 0.8718078136444092, "learning_rate": 1.6862795414257733e-05, "loss": 0.2894, "step": 395600 }, { "epoch": 1.466979068577657, "grad_norm": 1.0343222618103027, "learning_rate": 1.6840776689921052e-05, "loss": 0.243, "step": 395700 }, { "epoch": 1.467349798693547, "grad_norm": 1.3361783027648926, "learning_rate": 1.6818769438923377e-05, "loss": 0.2468, "step": 395800 }, { "epoch": 1.4677205288094373, "grad_norm": 1.949912428855896, "learning_rate": 1.6796773668879385e-05, "loss": 0.2739, "step": 395900 }, { "epoch": 1.4680912589253277, "grad_norm": 1.366259217262268, "learning_rate": 1.677478938739976e-05, "loss": 0.2702, "step": 396000 }, { "epoch": 1.4684619890412178, "grad_norm": 0.7674074769020081, "learning_rate": 1.6752816602091253e-05, "loss": 0.2635, "step": 396100 }, { "epoch": 1.468832719157108, "grad_norm": 0.299409955739975, "learning_rate": 1.67308553205566e-05, "loss": 0.2757, "step": 396200 }, { "epoch": 1.4692034492729982, "grad_norm": 0.3901430666446686, "learning_rate": 1.6708905550394554e-05, "loss": 0.299, "step": 396300 }, { "epoch": 1.4695741793888883, "grad_norm": 0.7582091093063354, "learning_rate": 1.6686967299199953e-05, "loss": 0.2548, "step": 396400 }, { "epoch": 1.4699449095047787, "grad_norm": 1.384488821029663, "learning_rate": 1.6665040574563532e-05, "loss": 0.2712, "step": 396500 }, { "epoch": 1.470315639620669, "grad_norm": 0.9981831908226013, "learning_rate": 1.6643125384072116e-05, "loss": 0.2821, "step": 396600 }, { "epoch": 1.4706863697365593, "grad_norm": 0.8209317922592163, "learning_rate": 1.6621221735308556e-05, "loss": 0.286, "step": 396700 }, { "epoch": 1.4710570998524495, "grad_norm": 1.3785920143127441, "learning_rate": 1.659932963585165e-05, "loss": 0.3077, "step": 396800 }, { "epoch": 1.4714278299683397, "grad_norm": 0.527532696723938, "learning_rate": 1.6577449093276226e-05, "loss": 0.2618, "step": 396900 }, { "epoch": 1.4717985600842298, "grad_norm": 0.6166320443153381, "learning_rate": 1.6555580115153136e-05, "loss": 0.2747, "step": 397000 }, { "epoch": 1.47216929020012, "grad_norm": 1.2388172149658203, "learning_rate": 1.6533722709049204e-05, "loss": 0.3047, "step": 397100 }, { "epoch": 1.4725400203160104, "grad_norm": 0.8613278269767761, "learning_rate": 1.651187688252725e-05, "loss": 0.278, "step": 397200 }, { "epoch": 1.4729107504319006, "grad_norm": 0.9882612228393555, "learning_rate": 1.6490042643146087e-05, "loss": 0.292, "step": 397300 }, { "epoch": 1.473281480547791, "grad_norm": 0.591375470161438, "learning_rate": 1.6468219998460522e-05, "loss": 0.2811, "step": 397400 }, { "epoch": 1.4736522106636811, "grad_norm": 0.789742648601532, "learning_rate": 1.6446408956021375e-05, "loss": 0.3021, "step": 397500 }, { "epoch": 1.4740229407795713, "grad_norm": 1.562896728515625, "learning_rate": 1.6424609523375424e-05, "loss": 0.287, "step": 397600 }, { "epoch": 1.4743936708954615, "grad_norm": 1.165548324584961, "learning_rate": 1.6402821708065407e-05, "loss": 0.2901, "step": 397700 }, { "epoch": 1.4747644010113516, "grad_norm": 1.591862440109253, "learning_rate": 1.6381045517630112e-05, "loss": 0.2796, "step": 397800 }, { "epoch": 1.475135131127242, "grad_norm": 0.7414871454238892, "learning_rate": 1.635928095960424e-05, "loss": 0.2721, "step": 397900 }, { "epoch": 1.4755058612431322, "grad_norm": 0.547579824924469, "learning_rate": 1.6337528041518503e-05, "loss": 0.2748, "step": 398000 }, { "epoch": 1.4758765913590224, "grad_norm": 1.022239089012146, "learning_rate": 1.631578677089956e-05, "loss": 0.2547, "step": 398100 }, { "epoch": 1.4762473214749128, "grad_norm": 0.8342387080192566, "learning_rate": 1.6294057155270043e-05, "loss": 0.2754, "step": 398200 }, { "epoch": 1.476618051590803, "grad_norm": 0.31683602929115295, "learning_rate": 1.627233920214859e-05, "loss": 0.2707, "step": 398300 }, { "epoch": 1.4769887817066931, "grad_norm": 1.3046176433563232, "learning_rate": 1.625063291904977e-05, "loss": 0.2836, "step": 398400 }, { "epoch": 1.4773595118225833, "grad_norm": 0.7547867298126221, "learning_rate": 1.622893831348409e-05, "loss": 0.2959, "step": 398500 }, { "epoch": 1.4777302419384737, "grad_norm": 1.6911712884902954, "learning_rate": 1.6207255392958083e-05, "loss": 0.2801, "step": 398600 }, { "epoch": 1.4781009720543639, "grad_norm": 0.521090567111969, "learning_rate": 1.6185584164974197e-05, "loss": 0.2743, "step": 398700 }, { "epoch": 1.478471702170254, "grad_norm": 0.6671949028968811, "learning_rate": 1.61639246370308e-05, "loss": 0.2691, "step": 398800 }, { "epoch": 1.4788424322861444, "grad_norm": 0.43417680263519287, "learning_rate": 1.6142276816622293e-05, "loss": 0.286, "step": 398900 }, { "epoch": 1.4792131624020346, "grad_norm": 0.6573916673660278, "learning_rate": 1.6120640711238965e-05, "loss": 0.2902, "step": 399000 }, { "epoch": 1.4795838925179248, "grad_norm": 0.44646722078323364, "learning_rate": 1.609901632836706e-05, "loss": 0.2974, "step": 399100 }, { "epoch": 1.479954622633815, "grad_norm": 0.6710375547409058, "learning_rate": 1.6077403675488807e-05, "loss": 0.2421, "step": 399200 }, { "epoch": 1.4803253527497053, "grad_norm": 0.5707090497016907, "learning_rate": 1.6055802760082327e-05, "loss": 0.2783, "step": 399300 }, { "epoch": 1.4806960828655955, "grad_norm": 1.3959362506866455, "learning_rate": 1.6034213589621676e-05, "loss": 0.2807, "step": 399400 }, { "epoch": 1.4810668129814857, "grad_norm": 1.0853166580200195, "learning_rate": 1.6012636171576933e-05, "loss": 0.251, "step": 399500 }, { "epoch": 1.481437543097376, "grad_norm": 1.119031548500061, "learning_rate": 1.5991070513413962e-05, "loss": 0.3047, "step": 399600 }, { "epoch": 1.4818082732132662, "grad_norm": 1.8283720016479492, "learning_rate": 1.5969516622594706e-05, "loss": 0.2876, "step": 399700 }, { "epoch": 1.4821790033291564, "grad_norm": 0.5763857960700989, "learning_rate": 1.594797450657695e-05, "loss": 0.279, "step": 399800 }, { "epoch": 1.4825497334450466, "grad_norm": 0.7999750375747681, "learning_rate": 1.5926444172814407e-05, "loss": 0.2509, "step": 399900 }, { "epoch": 1.482920463560937, "grad_norm": 0.5491151809692383, "learning_rate": 1.5904925628756774e-05, "loss": 0.2718, "step": 400000 }, { "epoch": 1.4832911936768272, "grad_norm": 1.5480302572250366, "learning_rate": 1.5883418881849603e-05, "loss": 0.2845, "step": 400100 }, { "epoch": 1.4836619237927173, "grad_norm": 1.1053102016448975, "learning_rate": 1.5861923939534394e-05, "loss": 0.276, "step": 400200 }, { "epoch": 1.4840326539086077, "grad_norm": 1.0610272884368896, "learning_rate": 1.5840440809248557e-05, "loss": 0.2908, "step": 400300 }, { "epoch": 1.484403384024498, "grad_norm": 1.3994258642196655, "learning_rate": 1.5818969498425402e-05, "loss": 0.2677, "step": 400400 }, { "epoch": 1.484774114140388, "grad_norm": 1.0338612794876099, "learning_rate": 1.5797510014494194e-05, "loss": 0.3019, "step": 400500 }, { "epoch": 1.4851448442562782, "grad_norm": 1.5674831867218018, "learning_rate": 1.5776062364880058e-05, "loss": 0.2968, "step": 400600 }, { "epoch": 1.4855155743721684, "grad_norm": 0.9517698884010315, "learning_rate": 1.5754626557004015e-05, "loss": 0.3036, "step": 400700 }, { "epoch": 1.4858863044880588, "grad_norm": 0.9326066374778748, "learning_rate": 1.5733202598283063e-05, "loss": 0.2689, "step": 400800 }, { "epoch": 1.486257034603949, "grad_norm": 0.5578441023826599, "learning_rate": 1.571179049613003e-05, "loss": 0.2656, "step": 400900 }, { "epoch": 1.4866277647198394, "grad_norm": 0.8633666634559631, "learning_rate": 1.5690390257953663e-05, "loss": 0.2885, "step": 401000 }, { "epoch": 1.4869984948357295, "grad_norm": 1.3233823776245117, "learning_rate": 1.5669001891158598e-05, "loss": 0.2826, "step": 401100 }, { "epoch": 1.4873692249516197, "grad_norm": 0.9780126214027405, "learning_rate": 1.564762540314536e-05, "loss": 0.2836, "step": 401200 }, { "epoch": 1.48773995506751, "grad_norm": 0.6370356678962708, "learning_rate": 1.562626080131041e-05, "loss": 0.2872, "step": 401300 }, { "epoch": 1.4881106851834, "grad_norm": 0.6798115968704224, "learning_rate": 1.5604908093046033e-05, "loss": 0.2676, "step": 401400 }, { "epoch": 1.4884814152992905, "grad_norm": 0.42964285612106323, "learning_rate": 1.558356728574042e-05, "loss": 0.2789, "step": 401500 }, { "epoch": 1.4888521454151806, "grad_norm": 0.3348149359226227, "learning_rate": 1.5562238386777672e-05, "loss": 0.2715, "step": 401600 }, { "epoch": 1.489222875531071, "grad_norm": 0.5199164748191833, "learning_rate": 1.5540921403537744e-05, "loss": 0.2867, "step": 401700 }, { "epoch": 1.4895936056469612, "grad_norm": 1.4553406238555908, "learning_rate": 1.551961634339646e-05, "loss": 0.3081, "step": 401800 }, { "epoch": 1.4899643357628514, "grad_norm": 0.7109498381614685, "learning_rate": 1.549832321372553e-05, "loss": 0.284, "step": 401900 }, { "epoch": 1.4903350658787415, "grad_norm": 0.3685062825679779, "learning_rate": 1.547704202189254e-05, "loss": 0.2701, "step": 402000 }, { "epoch": 1.4907057959946317, "grad_norm": 1.3110010623931885, "learning_rate": 1.545577277526093e-05, "loss": 0.2977, "step": 402100 }, { "epoch": 1.491076526110522, "grad_norm": 0.6173861622810364, "learning_rate": 1.5434515481190043e-05, "loss": 0.2551, "step": 402200 }, { "epoch": 1.4914472562264123, "grad_norm": 0.9503574371337891, "learning_rate": 1.5413270147035048e-05, "loss": 0.2819, "step": 402300 }, { "epoch": 1.4918179863423024, "grad_norm": 1.0951422452926636, "learning_rate": 1.5392036780146973e-05, "loss": 0.2944, "step": 402400 }, { "epoch": 1.4921887164581928, "grad_norm": 0.31473425030708313, "learning_rate": 1.537081538787277e-05, "loss": 0.2705, "step": 402500 }, { "epoch": 1.492559446574083, "grad_norm": 0.9188143610954285, "learning_rate": 1.5349605977555137e-05, "loss": 0.2833, "step": 402600 }, { "epoch": 1.4929301766899732, "grad_norm": 2.1965689659118652, "learning_rate": 1.532840855653273e-05, "loss": 0.2657, "step": 402700 }, { "epoch": 1.4933009068058634, "grad_norm": 1.0432971715927124, "learning_rate": 1.5307223132140007e-05, "loss": 0.2796, "step": 402800 }, { "epoch": 1.4936716369217538, "grad_norm": 0.5932414531707764, "learning_rate": 1.5286049711707267e-05, "loss": 0.2419, "step": 402900 }, { "epoch": 1.494042367037644, "grad_norm": 1.632681131362915, "learning_rate": 1.526488830256069e-05, "loss": 0.2621, "step": 403000 }, { "epoch": 1.494413097153534, "grad_norm": 0.4399764835834503, "learning_rate": 1.5243738912022287e-05, "loss": 0.2871, "step": 403100 }, { "epoch": 1.4947838272694245, "grad_norm": 0.7139073610305786, "learning_rate": 1.5222601547409875e-05, "loss": 0.2699, "step": 403200 }, { "epoch": 1.4951545573853147, "grad_norm": 0.776397168636322, "learning_rate": 1.5201476216037191e-05, "loss": 0.2873, "step": 403300 }, { "epoch": 1.4955252875012048, "grad_norm": 0.8488737344741821, "learning_rate": 1.5180362925213692e-05, "loss": 0.2512, "step": 403400 }, { "epoch": 1.495896017617095, "grad_norm": 0.8328301906585693, "learning_rate": 1.5159261682244785e-05, "loss": 0.2762, "step": 403500 }, { "epoch": 1.4962667477329854, "grad_norm": 1.8797223567962646, "learning_rate": 1.5138172494431641e-05, "loss": 0.2763, "step": 403600 }, { "epoch": 1.4966374778488756, "grad_norm": 1.0419235229492188, "learning_rate": 1.511709536907126e-05, "loss": 0.2877, "step": 403700 }, { "epoch": 1.4970082079647657, "grad_norm": 0.7497771382331848, "learning_rate": 1.5096030313456522e-05, "loss": 0.3023, "step": 403800 }, { "epoch": 1.4973789380806561, "grad_norm": 0.7678084969520569, "learning_rate": 1.5074977334876067e-05, "loss": 0.2932, "step": 403900 }, { "epoch": 1.4977496681965463, "grad_norm": 1.1418770551681519, "learning_rate": 1.5053936440614374e-05, "loss": 0.2805, "step": 404000 }, { "epoch": 1.4981203983124365, "grad_norm": 1.5438727140426636, "learning_rate": 1.50329076379518e-05, "loss": 0.3023, "step": 404100 }, { "epoch": 1.4984911284283267, "grad_norm": 0.7700592875480652, "learning_rate": 1.5011890934164413e-05, "loss": 0.2737, "step": 404200 }, { "epoch": 1.498861858544217, "grad_norm": 0.28036341071128845, "learning_rate": 1.4990886336524151e-05, "loss": 0.2656, "step": 404300 }, { "epoch": 1.4992325886601072, "grad_norm": 1.2779067754745483, "learning_rate": 1.49698938522988e-05, "loss": 0.2742, "step": 404400 }, { "epoch": 1.4996033187759974, "grad_norm": 1.2266558408737183, "learning_rate": 1.4948913488751892e-05, "loss": 0.2884, "step": 404500 }, { "epoch": 1.4999740488918878, "grad_norm": 2.0907773971557617, "learning_rate": 1.4927945253142777e-05, "loss": 0.2667, "step": 404600 }, { "epoch": 1.500344779007778, "grad_norm": 0.6368429660797119, "learning_rate": 1.4906989152726659e-05, "loss": 0.2749, "step": 404700 }, { "epoch": 1.5007155091236681, "grad_norm": 0.5933489203453064, "learning_rate": 1.4886045194754478e-05, "loss": 0.2714, "step": 404800 }, { "epoch": 1.5010862392395583, "grad_norm": 1.3625558614730835, "learning_rate": 1.4865113386473012e-05, "loss": 0.2724, "step": 404900 }, { "epoch": 1.5014569693554485, "grad_norm": 1.0079436302185059, "learning_rate": 1.4844193735124811e-05, "loss": 0.2985, "step": 405000 }, { "epoch": 1.5018276994713389, "grad_norm": 0.9780500531196594, "learning_rate": 1.482328624794822e-05, "loss": 0.2593, "step": 405100 }, { "epoch": 1.502198429587229, "grad_norm": 1.0193352699279785, "learning_rate": 1.4802390932177424e-05, "loss": 0.2888, "step": 405200 }, { "epoch": 1.5025691597031194, "grad_norm": 1.519148826599121, "learning_rate": 1.4781507795042332e-05, "loss": 0.2913, "step": 405300 }, { "epoch": 1.5029398898190096, "grad_norm": 1.2276324033737183, "learning_rate": 1.4760636843768654e-05, "loss": 0.2792, "step": 405400 }, { "epoch": 1.5033106199348998, "grad_norm": 0.8307383060455322, "learning_rate": 1.4739778085577931e-05, "loss": 0.2475, "step": 405500 }, { "epoch": 1.50368135005079, "grad_norm": 0.3358030617237091, "learning_rate": 1.471893152768743e-05, "loss": 0.2679, "step": 405600 }, { "epoch": 1.5040520801666801, "grad_norm": 0.6561837792396545, "learning_rate": 1.469809717731021e-05, "loss": 0.274, "step": 405700 }, { "epoch": 1.5044228102825705, "grad_norm": 1.1363157033920288, "learning_rate": 1.467727504165512e-05, "loss": 0.2793, "step": 405800 }, { "epoch": 1.5047935403984607, "grad_norm": 1.2608516216278076, "learning_rate": 1.4656465127926754e-05, "loss": 0.2669, "step": 405900 }, { "epoch": 1.505164270514351, "grad_norm": 0.5412049293518066, "learning_rate": 1.4635667443325534e-05, "loss": 0.2922, "step": 406000 }, { "epoch": 1.5055350006302413, "grad_norm": 0.5738481283187866, "learning_rate": 1.4614881995047597e-05, "loss": 0.2848, "step": 406100 }, { "epoch": 1.5059057307461314, "grad_norm": 1.5285279750823975, "learning_rate": 1.4594108790284843e-05, "loss": 0.2664, "step": 406200 }, { "epoch": 1.5062764608620216, "grad_norm": 1.1747339963912964, "learning_rate": 1.4573347836225003e-05, "loss": 0.3159, "step": 406300 }, { "epoch": 1.5066471909779118, "grad_norm": 1.1690343618392944, "learning_rate": 1.4552599140051488e-05, "loss": 0.2987, "step": 406400 }, { "epoch": 1.5070179210938022, "grad_norm": 0.955106258392334, "learning_rate": 1.4531862708943483e-05, "loss": 0.272, "step": 406500 }, { "epoch": 1.5073886512096923, "grad_norm": 0.4580312967300415, "learning_rate": 1.4511138550075993e-05, "loss": 0.2453, "step": 406600 }, { "epoch": 1.5077593813255827, "grad_norm": 0.8658204078674316, "learning_rate": 1.4490426670619712e-05, "loss": 0.2807, "step": 406700 }, { "epoch": 1.508130111441473, "grad_norm": 1.4048740863800049, "learning_rate": 1.4469727077741086e-05, "loss": 0.2872, "step": 406800 }, { "epoch": 1.508500841557363, "grad_norm": 0.2808430790901184, "learning_rate": 1.4449039778602368e-05, "loss": 0.2632, "step": 406900 }, { "epoch": 1.5088715716732533, "grad_norm": 1.1399019956588745, "learning_rate": 1.4428364780361492e-05, "loss": 0.2767, "step": 407000 }, { "epoch": 1.5092423017891434, "grad_norm": 1.1121392250061035, "learning_rate": 1.4407702090172176e-05, "loss": 0.2494, "step": 407100 }, { "epoch": 1.5096130319050338, "grad_norm": 0.9256900548934937, "learning_rate": 1.4387051715183852e-05, "loss": 0.2814, "step": 407200 }, { "epoch": 1.509983762020924, "grad_norm": 0.6553705930709839, "learning_rate": 1.436641366254169e-05, "loss": 0.2453, "step": 407300 }, { "epoch": 1.5103544921368144, "grad_norm": 1.0531084537506104, "learning_rate": 1.434578793938665e-05, "loss": 0.2631, "step": 407400 }, { "epoch": 1.5107252222527046, "grad_norm": 0.5401178002357483, "learning_rate": 1.4325174552855364e-05, "loss": 0.282, "step": 407500 }, { "epoch": 1.5110959523685947, "grad_norm": 0.7324489951133728, "learning_rate": 1.4304573510080205e-05, "loss": 0.284, "step": 407600 }, { "epoch": 1.511466682484485, "grad_norm": 1.192817211151123, "learning_rate": 1.4283984818189316e-05, "loss": 0.2818, "step": 407700 }, { "epoch": 1.511837412600375, "grad_norm": 1.4687849283218384, "learning_rate": 1.4263408484306524e-05, "loss": 0.2619, "step": 407800 }, { "epoch": 1.5122081427162652, "grad_norm": 0.8906898498535156, "learning_rate": 1.4242844515551396e-05, "loss": 0.2939, "step": 407900 }, { "epoch": 1.5125788728321556, "grad_norm": 0.9381018877029419, "learning_rate": 1.4222292919039216e-05, "loss": 0.276, "step": 408000 }, { "epoch": 1.512949602948046, "grad_norm": 0.32151710987091064, "learning_rate": 1.4201753701880977e-05, "loss": 0.293, "step": 408100 }, { "epoch": 1.5133203330639362, "grad_norm": 0.8988072872161865, "learning_rate": 1.4181226871183429e-05, "loss": 0.2894, "step": 408200 }, { "epoch": 1.5136910631798264, "grad_norm": 1.6286277770996094, "learning_rate": 1.4160712434049e-05, "loss": 0.283, "step": 408300 }, { "epoch": 1.5140617932957166, "grad_norm": 1.3416082859039307, "learning_rate": 1.4140210397575815e-05, "loss": 0.2783, "step": 408400 }, { "epoch": 1.5144325234116067, "grad_norm": 1.1636731624603271, "learning_rate": 1.4119720768857769e-05, "loss": 0.2862, "step": 408500 }, { "epoch": 1.514803253527497, "grad_norm": 0.8892748951911926, "learning_rate": 1.409924355498442e-05, "loss": 0.2658, "step": 408600 }, { "epoch": 1.5151739836433873, "grad_norm": 0.8533502817153931, "learning_rate": 1.4078778763041e-05, "loss": 0.2693, "step": 408700 }, { "epoch": 1.5155447137592777, "grad_norm": 0.659007728099823, "learning_rate": 1.4058326400108518e-05, "loss": 0.2766, "step": 408800 }, { "epoch": 1.5159154438751679, "grad_norm": 0.6706336140632629, "learning_rate": 1.4037886473263633e-05, "loss": 0.2701, "step": 408900 }, { "epoch": 1.516286173991058, "grad_norm": 0.8718041181564331, "learning_rate": 1.4017458989578691e-05, "loss": 0.2781, "step": 409000 }, { "epoch": 1.5166569041069482, "grad_norm": 0.8932418823242188, "learning_rate": 1.3997043956121796e-05, "loss": 0.3084, "step": 409100 }, { "epoch": 1.5170276342228384, "grad_norm": 1.24361252784729, "learning_rate": 1.397664137995668e-05, "loss": 0.2584, "step": 409200 }, { "epoch": 1.5173983643387285, "grad_norm": 1.0173463821411133, "learning_rate": 1.3956251268142767e-05, "loss": 0.2675, "step": 409300 }, { "epoch": 1.517769094454619, "grad_norm": 1.330600619316101, "learning_rate": 1.3935873627735246e-05, "loss": 0.2672, "step": 409400 }, { "epoch": 1.518139824570509, "grad_norm": 1.1001394987106323, "learning_rate": 1.391550846578486e-05, "loss": 0.2834, "step": 409500 }, { "epoch": 1.5185105546863995, "grad_norm": 0.7069059014320374, "learning_rate": 1.3895155789338154e-05, "loss": 0.2631, "step": 409600 }, { "epoch": 1.5188812848022897, "grad_norm": 0.8453354239463806, "learning_rate": 1.3874815605437303e-05, "loss": 0.2935, "step": 409700 }, { "epoch": 1.5192520149181798, "grad_norm": 0.4570026993751526, "learning_rate": 1.385448792112013e-05, "loss": 0.2542, "step": 409800 }, { "epoch": 1.51962274503407, "grad_norm": 1.3369286060333252, "learning_rate": 1.383417274342021e-05, "loss": 0.2546, "step": 409900 }, { "epoch": 1.5199934751499602, "grad_norm": 1.446542501449585, "learning_rate": 1.3813870079366726e-05, "loss": 0.2802, "step": 410000 }, { "epoch": 1.5203642052658506, "grad_norm": 0.7453349232673645, "learning_rate": 1.3793579935984551e-05, "loss": 0.2744, "step": 410100 }, { "epoch": 1.5207349353817408, "grad_norm": 1.0685555934906006, "learning_rate": 1.3773302320294223e-05, "loss": 0.279, "step": 410200 }, { "epoch": 1.5211056654976312, "grad_norm": 0.9098944664001465, "learning_rate": 1.3753037239311939e-05, "loss": 0.2636, "step": 410300 }, { "epoch": 1.5214763956135213, "grad_norm": 0.9621092677116394, "learning_rate": 1.37327847000496e-05, "loss": 0.2801, "step": 410400 }, { "epoch": 1.5218471257294115, "grad_norm": 0.6051278710365295, "learning_rate": 1.3712544709514714e-05, "loss": 0.2859, "step": 410500 }, { "epoch": 1.5222178558453017, "grad_norm": 1.2200486660003662, "learning_rate": 1.3692317274710453e-05, "loss": 0.2735, "step": 410600 }, { "epoch": 1.5225885859611918, "grad_norm": 0.7796865701675415, "learning_rate": 1.36721024026357e-05, "loss": 0.2587, "step": 410700 }, { "epoch": 1.5229593160770822, "grad_norm": 0.9389694333076477, "learning_rate": 1.365190010028493e-05, "loss": 0.2642, "step": 410800 }, { "epoch": 1.5233300461929724, "grad_norm": 1.3571953773498535, "learning_rate": 1.3631710374648293e-05, "loss": 0.2774, "step": 410900 }, { "epoch": 1.5237007763088628, "grad_norm": 2.470778465270996, "learning_rate": 1.3611533232711588e-05, "loss": 0.2763, "step": 411000 }, { "epoch": 1.524071506424753, "grad_norm": 0.9509580135345459, "learning_rate": 1.3591368681456245e-05, "loss": 0.2834, "step": 411100 }, { "epoch": 1.5244422365406431, "grad_norm": 1.385262131690979, "learning_rate": 1.3571216727859349e-05, "loss": 0.2794, "step": 411200 }, { "epoch": 1.5248129666565333, "grad_norm": 1.1259516477584839, "learning_rate": 1.3551077378893645e-05, "loss": 0.2715, "step": 411300 }, { "epoch": 1.5251836967724235, "grad_norm": 1.0775794982910156, "learning_rate": 1.3530950641527478e-05, "loss": 0.2783, "step": 411400 }, { "epoch": 1.5255544268883139, "grad_norm": 1.0320038795471191, "learning_rate": 1.3510836522724873e-05, "loss": 0.2783, "step": 411500 }, { "epoch": 1.525925157004204, "grad_norm": 1.016383171081543, "learning_rate": 1.349073502944545e-05, "loss": 0.2759, "step": 411600 }, { "epoch": 1.5262958871200945, "grad_norm": 1.7705334424972534, "learning_rate": 1.347064616864448e-05, "loss": 0.2658, "step": 411700 }, { "epoch": 1.5266666172359846, "grad_norm": 0.3546404242515564, "learning_rate": 1.345056994727285e-05, "loss": 0.2886, "step": 411800 }, { "epoch": 1.5270373473518748, "grad_norm": 0.6409999132156372, "learning_rate": 1.3430506372277097e-05, "loss": 0.2404, "step": 411900 }, { "epoch": 1.527408077467765, "grad_norm": 0.9793638586997986, "learning_rate": 1.341045545059934e-05, "loss": 0.2671, "step": 412000 }, { "epoch": 1.5277788075836551, "grad_norm": 0.9708759784698486, "learning_rate": 1.339041718917739e-05, "loss": 0.2713, "step": 412100 }, { "epoch": 1.5281495376995455, "grad_norm": 1.1600686311721802, "learning_rate": 1.337039159494461e-05, "loss": 0.2566, "step": 412200 }, { "epoch": 1.5285202678154357, "grad_norm": 0.4640007019042969, "learning_rate": 1.3350378674829989e-05, "loss": 0.2852, "step": 412300 }, { "epoch": 1.528890997931326, "grad_norm": 2.24881649017334, "learning_rate": 1.33303784357582e-05, "loss": 0.2596, "step": 412400 }, { "epoch": 1.5292617280472163, "grad_norm": 0.7538591027259827, "learning_rate": 1.331039088464941e-05, "loss": 0.2675, "step": 412500 }, { "epoch": 1.5296324581631064, "grad_norm": 1.5913153886795044, "learning_rate": 1.3290416028419505e-05, "loss": 0.267, "step": 412600 }, { "epoch": 1.5300031882789966, "grad_norm": 1.1633919477462769, "learning_rate": 1.3270453873979927e-05, "loss": 0.2613, "step": 412700 }, { "epoch": 1.5303739183948868, "grad_norm": 1.9687963724136353, "learning_rate": 1.3250504428237709e-05, "loss": 0.2686, "step": 412800 }, { "epoch": 1.530744648510777, "grad_norm": 0.7004048824310303, "learning_rate": 1.3230567698095536e-05, "loss": 0.2994, "step": 412900 }, { "epoch": 1.5311153786266674, "grad_norm": 1.7105233669281006, "learning_rate": 1.3210643690451658e-05, "loss": 0.2746, "step": 413000 }, { "epoch": 1.5314861087425578, "grad_norm": 0.5762355923652649, "learning_rate": 1.3190732412199908e-05, "loss": 0.2665, "step": 413100 }, { "epoch": 1.531856838858448, "grad_norm": 1.0586909055709839, "learning_rate": 1.3170833870229788e-05, "loss": 0.2859, "step": 413200 }, { "epoch": 1.532227568974338, "grad_norm": 1.4695943593978882, "learning_rate": 1.3150948071426283e-05, "loss": 0.2654, "step": 413300 }, { "epoch": 1.5325982990902283, "grad_norm": 0.783207356929779, "learning_rate": 1.3131075022670065e-05, "loss": 0.2637, "step": 413400 }, { "epoch": 1.5329690292061184, "grad_norm": 0.5795162320137024, "learning_rate": 1.3111214730837351e-05, "loss": 0.2576, "step": 413500 }, { "epoch": 1.5333397593220086, "grad_norm": 0.7408479452133179, "learning_rate": 1.3091367202799931e-05, "loss": 0.29, "step": 413600 }, { "epoch": 1.533710489437899, "grad_norm": 0.5605857372283936, "learning_rate": 1.3071532445425227e-05, "loss": 0.2497, "step": 413700 }, { "epoch": 1.5340812195537892, "grad_norm": 0.8384562730789185, "learning_rate": 1.30517104655762e-05, "loss": 0.2644, "step": 413800 }, { "epoch": 1.5344519496696796, "grad_norm": 1.0456936359405518, "learning_rate": 1.3031901270111386e-05, "loss": 0.2703, "step": 413900 }, { "epoch": 1.5348226797855697, "grad_norm": 0.7231103181838989, "learning_rate": 1.3012104865884966e-05, "loss": 0.2959, "step": 414000 }, { "epoch": 1.53519340990146, "grad_norm": 1.5044435262680054, "learning_rate": 1.2992321259746592e-05, "loss": 0.289, "step": 414100 }, { "epoch": 1.53556414001735, "grad_norm": 1.5003025531768799, "learning_rate": 1.2972550458541543e-05, "loss": 0.2601, "step": 414200 }, { "epoch": 1.5359348701332403, "grad_norm": 0.7479950189590454, "learning_rate": 1.2952792469110692e-05, "loss": 0.2882, "step": 414300 }, { "epoch": 1.5363056002491307, "grad_norm": 0.5365775227546692, "learning_rate": 1.2933047298290435e-05, "loss": 0.2754, "step": 414400 }, { "epoch": 1.5366763303650208, "grad_norm": 1.482074499130249, "learning_rate": 1.2913314952912737e-05, "loss": 0.2932, "step": 414500 }, { "epoch": 1.5370470604809112, "grad_norm": 1.3020206689834595, "learning_rate": 1.2893595439805167e-05, "loss": 0.265, "step": 414600 }, { "epoch": 1.5374177905968014, "grad_norm": 0.9648169875144958, "learning_rate": 1.2873888765790803e-05, "loss": 0.3112, "step": 414700 }, { "epoch": 1.5377885207126916, "grad_norm": 0.5020970106124878, "learning_rate": 1.2854194937688307e-05, "loss": 0.2818, "step": 414800 }, { "epoch": 1.5381592508285817, "grad_norm": 0.4791215658187866, "learning_rate": 1.2834513962311895e-05, "loss": 0.2674, "step": 414900 }, { "epoch": 1.538529980944472, "grad_norm": 0.7893198132514954, "learning_rate": 1.2814845846471301e-05, "loss": 0.3216, "step": 415000 }, { "epoch": 1.5389007110603623, "grad_norm": 0.8670728206634521, "learning_rate": 1.2795190596971889e-05, "loss": 0.2694, "step": 415100 }, { "epoch": 1.5392714411762525, "grad_norm": 1.5254470109939575, "learning_rate": 1.2775548220614491e-05, "loss": 0.2788, "step": 415200 }, { "epoch": 1.5396421712921429, "grad_norm": 0.43049395084381104, "learning_rate": 1.2755918724195515e-05, "loss": 0.2954, "step": 415300 }, { "epoch": 1.540012901408033, "grad_norm": 0.9245679378509521, "learning_rate": 1.2736302114506932e-05, "loss": 0.2851, "step": 415400 }, { "epoch": 1.5403836315239232, "grad_norm": 1.8086676597595215, "learning_rate": 1.2716698398336225e-05, "loss": 0.3023, "step": 415500 }, { "epoch": 1.5407543616398134, "grad_norm": 0.4707808494567871, "learning_rate": 1.2697107582466428e-05, "loss": 0.2509, "step": 415600 }, { "epoch": 1.5411250917557036, "grad_norm": 0.8158500790596008, "learning_rate": 1.2677529673676109e-05, "loss": 0.3003, "step": 415700 }, { "epoch": 1.541495821871594, "grad_norm": 1.7489575147628784, "learning_rate": 1.2657964678739349e-05, "loss": 0.2666, "step": 415800 }, { "epoch": 1.5418665519874841, "grad_norm": 1.2672710418701172, "learning_rate": 1.263841260442582e-05, "loss": 0.3412, "step": 415900 }, { "epoch": 1.5422372821033745, "grad_norm": 0.7182813882827759, "learning_rate": 1.2618873457500674e-05, "loss": 0.2735, "step": 416000 }, { "epoch": 1.5426080122192647, "grad_norm": 0.7310736179351807, "learning_rate": 1.2599347244724575e-05, "loss": 0.2915, "step": 416100 }, { "epoch": 1.5429787423351549, "grad_norm": 0.5551763772964478, "learning_rate": 1.2579833972853794e-05, "loss": 0.2666, "step": 416200 }, { "epoch": 1.543349472451045, "grad_norm": 1.2087570428848267, "learning_rate": 1.2560333648640015e-05, "loss": 0.2811, "step": 416300 }, { "epoch": 1.5437202025669352, "grad_norm": 0.9411323666572571, "learning_rate": 1.254084627883051e-05, "loss": 0.2625, "step": 416400 }, { "epoch": 1.5440909326828256, "grad_norm": 1.27108633518219, "learning_rate": 1.2521371870168069e-05, "loss": 0.2592, "step": 416500 }, { "epoch": 1.5444616627987158, "grad_norm": 1.2345770597457886, "learning_rate": 1.2501910429390973e-05, "loss": 0.2701, "step": 416600 }, { "epoch": 1.5448323929146062, "grad_norm": 0.7974886298179626, "learning_rate": 1.2482461963233016e-05, "loss": 0.283, "step": 416700 }, { "epoch": 1.5452031230304963, "grad_norm": 1.2915699481964111, "learning_rate": 1.246302647842354e-05, "loss": 0.28, "step": 416800 }, { "epoch": 1.5455738531463865, "grad_norm": 0.7792947888374329, "learning_rate": 1.2443603981687346e-05, "loss": 0.2447, "step": 416900 }, { "epoch": 1.5459445832622767, "grad_norm": 0.9681490063667297, "learning_rate": 1.2424194479744771e-05, "loss": 0.2691, "step": 417000 }, { "epoch": 1.5463153133781669, "grad_norm": 1.0000087022781372, "learning_rate": 1.2404797979311644e-05, "loss": 0.2748, "step": 417100 }, { "epoch": 1.5466860434940572, "grad_norm": 1.440463900566101, "learning_rate": 1.238541448709929e-05, "loss": 0.2858, "step": 417200 }, { "epoch": 1.5470567736099474, "grad_norm": 0.6358212828636169, "learning_rate": 1.2366044009814564e-05, "loss": 0.2854, "step": 417300 }, { "epoch": 1.5474275037258378, "grad_norm": 0.7932697534561157, "learning_rate": 1.2346686554159786e-05, "loss": 0.2827, "step": 417400 }, { "epoch": 1.547798233841728, "grad_norm": 2.4450278282165527, "learning_rate": 1.2327342126832763e-05, "loss": 0.2648, "step": 417500 }, { "epoch": 1.5481689639576182, "grad_norm": 1.0413553714752197, "learning_rate": 1.2308010734526842e-05, "loss": 0.3104, "step": 417600 }, { "epoch": 1.5485396940735083, "grad_norm": 0.5719080567359924, "learning_rate": 1.2288692383930817e-05, "loss": 0.2654, "step": 417700 }, { "epoch": 1.5489104241893985, "grad_norm": 1.0303013324737549, "learning_rate": 1.2269387081728978e-05, "loss": 0.2625, "step": 417800 }, { "epoch": 1.5492811543052887, "grad_norm": 1.1024374961853027, "learning_rate": 1.2250094834601106e-05, "loss": 0.2662, "step": 417900 }, { "epoch": 1.549651884421179, "grad_norm": 1.0215222835540771, "learning_rate": 1.2230815649222444e-05, "loss": 0.2776, "step": 418000 }, { "epoch": 1.5500226145370695, "grad_norm": 1.1435202360153198, "learning_rate": 1.2211549532263766e-05, "loss": 0.2683, "step": 418100 }, { "epoch": 1.5503933446529596, "grad_norm": 1.3240054845809937, "learning_rate": 1.2192296490391276e-05, "loss": 0.2685, "step": 418200 }, { "epoch": 1.5507640747688498, "grad_norm": 0.5021076202392578, "learning_rate": 1.2173056530266657e-05, "loss": 0.2731, "step": 418300 }, { "epoch": 1.55113480488474, "grad_norm": 1.8009437322616577, "learning_rate": 1.2153829658547117e-05, "loss": 0.2682, "step": 418400 }, { "epoch": 1.5515055350006302, "grad_norm": 0.9025293588638306, "learning_rate": 1.213461588188528e-05, "loss": 0.2698, "step": 418500 }, { "epoch": 1.5518762651165203, "grad_norm": 0.5233442187309265, "learning_rate": 1.2115415206929226e-05, "loss": 0.2962, "step": 418600 }, { "epoch": 1.5522469952324107, "grad_norm": 0.9804129600524902, "learning_rate": 1.2096227640322571e-05, "loss": 0.2874, "step": 418700 }, { "epoch": 1.552617725348301, "grad_norm": 0.6622077822685242, "learning_rate": 1.2077053188704346e-05, "loss": 0.2711, "step": 418800 }, { "epoch": 1.5529884554641913, "grad_norm": 0.8644052743911743, "learning_rate": 1.2057891858709031e-05, "loss": 0.2932, "step": 418900 }, { "epoch": 1.5533591855800815, "grad_norm": 2.5669381618499756, "learning_rate": 1.2038743656966623e-05, "loss": 0.261, "step": 419000 }, { "epoch": 1.5537299156959716, "grad_norm": 0.6473680138587952, "learning_rate": 1.2019608590102533e-05, "loss": 0.2807, "step": 419100 }, { "epoch": 1.5541006458118618, "grad_norm": 0.6191958785057068, "learning_rate": 1.2000486664737615e-05, "loss": 0.2627, "step": 419200 }, { "epoch": 1.554471375927752, "grad_norm": 0.5626499652862549, "learning_rate": 1.198137788748825e-05, "loss": 0.2472, "step": 419300 }, { "epoch": 1.5548421060436424, "grad_norm": 0.9056015610694885, "learning_rate": 1.1962282264966146e-05, "loss": 0.2585, "step": 419400 }, { "epoch": 1.5552128361595325, "grad_norm": 1.5016272068023682, "learning_rate": 1.1943199803778593e-05, "loss": 0.2946, "step": 419500 }, { "epoch": 1.555583566275423, "grad_norm": 1.0403705835342407, "learning_rate": 1.1924130510528237e-05, "loss": 0.2675, "step": 419600 }, { "epoch": 1.555954296391313, "grad_norm": 2.1395578384399414, "learning_rate": 1.1905074391813182e-05, "loss": 0.251, "step": 419700 }, { "epoch": 1.5563250265072033, "grad_norm": 1.2910209894180298, "learning_rate": 1.188603145422702e-05, "loss": 0.2979, "step": 419800 }, { "epoch": 1.5566957566230935, "grad_norm": 1.3253872394561768, "learning_rate": 1.1867001704358737e-05, "loss": 0.2962, "step": 419900 }, { "epoch": 1.5570664867389836, "grad_norm": 0.9726186394691467, "learning_rate": 1.1847985148792757e-05, "loss": 0.2733, "step": 420000 }, { "epoch": 1.557437216854874, "grad_norm": 0.46592649817466736, "learning_rate": 1.1828981794108985e-05, "loss": 0.3107, "step": 420100 }, { "epoch": 1.5578079469707642, "grad_norm": 1.367501139640808, "learning_rate": 1.1809991646882668e-05, "loss": 0.2852, "step": 420200 }, { "epoch": 1.5581786770866546, "grad_norm": 1.166236162185669, "learning_rate": 1.1791014713684584e-05, "loss": 0.2906, "step": 420300 }, { "epoch": 1.5585494072025448, "grad_norm": 0.2105102390050888, "learning_rate": 1.177205100108088e-05, "loss": 0.2373, "step": 420400 }, { "epoch": 1.558920137318435, "grad_norm": 0.8279871940612793, "learning_rate": 1.175310051563313e-05, "loss": 0.2875, "step": 420500 }, { "epoch": 1.559290867434325, "grad_norm": 2.6210930347442627, "learning_rate": 1.1734163263898367e-05, "loss": 0.23, "step": 420600 }, { "epoch": 1.5596615975502153, "grad_norm": 0.4822622835636139, "learning_rate": 1.1715239252429006e-05, "loss": 0.2944, "step": 420700 }, { "epoch": 1.5600323276661057, "grad_norm": 0.4928251802921295, "learning_rate": 1.1696328487772906e-05, "loss": 0.2949, "step": 420800 }, { "epoch": 1.5604030577819958, "grad_norm": 1.1337767839431763, "learning_rate": 1.1677430976473325e-05, "loss": 0.2602, "step": 420900 }, { "epoch": 1.5607737878978862, "grad_norm": 2.0757715702056885, "learning_rate": 1.1658546725068942e-05, "loss": 0.2818, "step": 421000 }, { "epoch": 1.5611445180137764, "grad_norm": 0.7073217630386353, "learning_rate": 1.1639675740093836e-05, "loss": 0.3095, "step": 421100 }, { "epoch": 1.5615152481296666, "grad_norm": 1.1231553554534912, "learning_rate": 1.1620818028077546e-05, "loss": 0.2818, "step": 421200 }, { "epoch": 1.5618859782455567, "grad_norm": 1.0348260402679443, "learning_rate": 1.1601973595544957e-05, "loss": 0.2706, "step": 421300 }, { "epoch": 1.562256708361447, "grad_norm": 2.64319109916687, "learning_rate": 1.1583142449016371e-05, "loss": 0.2879, "step": 421400 }, { "epoch": 1.5626274384773373, "grad_norm": 0.30683600902557373, "learning_rate": 1.1564324595007531e-05, "loss": 0.2509, "step": 421500 }, { "epoch": 1.5629981685932275, "grad_norm": 1.291129469871521, "learning_rate": 1.154552004002955e-05, "loss": 0.2651, "step": 421600 }, { "epoch": 1.5633688987091179, "grad_norm": 1.126009225845337, "learning_rate": 1.1526728790588943e-05, "loss": 0.2956, "step": 421700 }, { "epoch": 1.563739628825008, "grad_norm": 0.9946134686470032, "learning_rate": 1.150795085318761e-05, "loss": 0.2741, "step": 421800 }, { "epoch": 1.5641103589408982, "grad_norm": 1.1682281494140625, "learning_rate": 1.1489186234322857e-05, "loss": 0.29, "step": 421900 }, { "epoch": 1.5644810890567884, "grad_norm": 1.9310364723205566, "learning_rate": 1.1470434940487406e-05, "loss": 0.263, "step": 422000 }, { "epoch": 1.5648518191726786, "grad_norm": 0.9156919121742249, "learning_rate": 1.1451696978169334e-05, "loss": 0.2731, "step": 422100 }, { "epoch": 1.5652225492885687, "grad_norm": 0.7615335583686829, "learning_rate": 1.1432972353852095e-05, "loss": 0.261, "step": 422200 }, { "epoch": 1.5655932794044591, "grad_norm": 0.5587055087089539, "learning_rate": 1.1414261074014598e-05, "loss": 0.2918, "step": 422300 }, { "epoch": 1.5659640095203495, "grad_norm": 0.5703988671302795, "learning_rate": 1.1395563145131022e-05, "loss": 0.2943, "step": 422400 }, { "epoch": 1.5663347396362397, "grad_norm": 0.531376302242279, "learning_rate": 1.137687857367104e-05, "loss": 0.2713, "step": 422500 }, { "epoch": 1.5667054697521299, "grad_norm": 0.3296523988246918, "learning_rate": 1.1358207366099632e-05, "loss": 0.2639, "step": 422600 }, { "epoch": 1.56707619986802, "grad_norm": 1.0059541463851929, "learning_rate": 1.1339549528877168e-05, "loss": 0.2725, "step": 422700 }, { "epoch": 1.5674469299839102, "grad_norm": 1.6908345222473145, "learning_rate": 1.132090506845942e-05, "loss": 0.3047, "step": 422800 }, { "epoch": 1.5678176600998004, "grad_norm": 0.9722269773483276, "learning_rate": 1.1302273991297508e-05, "loss": 0.2803, "step": 422900 }, { "epoch": 1.5681883902156908, "grad_norm": 1.7161227464675903, "learning_rate": 1.1283656303837897e-05, "loss": 0.2477, "step": 423000 }, { "epoch": 1.5685591203315812, "grad_norm": 1.3393669128417969, "learning_rate": 1.1265052012522493e-05, "loss": 0.2599, "step": 423100 }, { "epoch": 1.5689298504474714, "grad_norm": 1.00547456741333, "learning_rate": 1.1246461123788466e-05, "loss": 0.284, "step": 423200 }, { "epoch": 1.5693005805633615, "grad_norm": 0.20741547644138336, "learning_rate": 1.1227883644068444e-05, "loss": 0.2748, "step": 423300 }, { "epoch": 1.5696713106792517, "grad_norm": 1.5249981880187988, "learning_rate": 1.1209319579790356e-05, "loss": 0.2732, "step": 423400 }, { "epoch": 1.5700420407951419, "grad_norm": 0.9520147442817688, "learning_rate": 1.1190768937377499e-05, "loss": 0.2572, "step": 423500 }, { "epoch": 1.570412770911032, "grad_norm": 1.439780354499817, "learning_rate": 1.1172231723248555e-05, "loss": 0.2482, "step": 423600 }, { "epoch": 1.5707835010269224, "grad_norm": 0.8057112693786621, "learning_rate": 1.1153707943817533e-05, "loss": 0.2854, "step": 423700 }, { "epoch": 1.5711542311428126, "grad_norm": 0.8431739807128906, "learning_rate": 1.1135197605493774e-05, "loss": 0.265, "step": 423800 }, { "epoch": 1.571524961258703, "grad_norm": 1.1475062370300293, "learning_rate": 1.1116700714682043e-05, "loss": 0.2622, "step": 423900 }, { "epoch": 1.5718956913745932, "grad_norm": 0.8156816363334656, "learning_rate": 1.1098217277782358e-05, "loss": 0.254, "step": 424000 }, { "epoch": 1.5722664214904833, "grad_norm": 0.48395803570747375, "learning_rate": 1.107974730119013e-05, "loss": 0.2845, "step": 424100 }, { "epoch": 1.5726371516063735, "grad_norm": 0.711969792842865, "learning_rate": 1.106129079129613e-05, "loss": 0.2752, "step": 424200 }, { "epoch": 1.5730078817222637, "grad_norm": 1.20499587059021, "learning_rate": 1.104284775448644e-05, "loss": 0.2688, "step": 424300 }, { "epoch": 1.573378611838154, "grad_norm": 1.8412880897521973, "learning_rate": 1.1024418197142478e-05, "loss": 0.2537, "step": 424400 }, { "epoch": 1.5737493419540443, "grad_norm": 0.7061262726783752, "learning_rate": 1.1006002125641024e-05, "loss": 0.2988, "step": 424500 }, { "epoch": 1.5741200720699347, "grad_norm": 0.8552757501602173, "learning_rate": 1.0987599546354171e-05, "loss": 0.2739, "step": 424600 }, { "epoch": 1.5744908021858248, "grad_norm": 0.599959135055542, "learning_rate": 1.0969210465649348e-05, "loss": 0.2796, "step": 424700 }, { "epoch": 1.574861532301715, "grad_norm": 0.767019510269165, "learning_rate": 1.0950834889889316e-05, "loss": 0.2799, "step": 424800 }, { "epoch": 1.5752322624176052, "grad_norm": 0.8153753876686096, "learning_rate": 1.0932472825432144e-05, "loss": 0.2989, "step": 424900 }, { "epoch": 1.5756029925334953, "grad_norm": 0.7276557087898254, "learning_rate": 1.0914124278631276e-05, "loss": 0.2592, "step": 425000 }, { "epoch": 1.5759737226493857, "grad_norm": 0.9695584774017334, "learning_rate": 1.0895789255835426e-05, "loss": 0.2666, "step": 425100 }, { "epoch": 1.576344452765276, "grad_norm": 0.9838254451751709, "learning_rate": 1.0877467763388639e-05, "loss": 0.2654, "step": 425200 }, { "epoch": 1.5767151828811663, "grad_norm": 0.7815879583358765, "learning_rate": 1.0859159807630315e-05, "loss": 0.2568, "step": 425300 }, { "epoch": 1.5770859129970565, "grad_norm": 0.45907866954803467, "learning_rate": 1.0840865394895128e-05, "loss": 0.2762, "step": 425400 }, { "epoch": 1.5774566431129466, "grad_norm": 0.8603971004486084, "learning_rate": 1.0822584531513086e-05, "loss": 0.287, "step": 425500 }, { "epoch": 1.5778273732288368, "grad_norm": 0.41211169958114624, "learning_rate": 1.0804317223809495e-05, "loss": 0.2545, "step": 425600 }, { "epoch": 1.578198103344727, "grad_norm": 1.3677407503128052, "learning_rate": 1.078606347810498e-05, "loss": 0.265, "step": 425700 }, { "epoch": 1.5785688334606174, "grad_norm": 0.7031418085098267, "learning_rate": 1.076782330071549e-05, "loss": 0.2823, "step": 425800 }, { "epoch": 1.5789395635765076, "grad_norm": 0.5268321633338928, "learning_rate": 1.0749596697952257e-05, "loss": 0.2913, "step": 425900 }, { "epoch": 1.579310293692398, "grad_norm": 3.3799195289611816, "learning_rate": 1.0731383676121797e-05, "loss": 0.2749, "step": 426000 }, { "epoch": 1.5796810238082881, "grad_norm": 1.1968905925750732, "learning_rate": 1.0713184241525992e-05, "loss": 0.2654, "step": 426100 }, { "epoch": 1.5800517539241783, "grad_norm": 1.8084579706192017, "learning_rate": 1.0694998400461975e-05, "loss": 0.2745, "step": 426200 }, { "epoch": 1.5804224840400685, "grad_norm": 0.8389817476272583, "learning_rate": 1.0676826159222142e-05, "loss": 0.2959, "step": 426300 }, { "epoch": 1.5807932141559586, "grad_norm": 1.1861565113067627, "learning_rate": 1.0658667524094262e-05, "loss": 0.2873, "step": 426400 }, { "epoch": 1.581163944271849, "grad_norm": 1.9235754013061523, "learning_rate": 1.0640522501361355e-05, "loss": 0.2648, "step": 426500 }, { "epoch": 1.5815346743877392, "grad_norm": 1.1397416591644287, "learning_rate": 1.062239109730171e-05, "loss": 0.2545, "step": 426600 }, { "epoch": 1.5819054045036296, "grad_norm": 0.983186662197113, "learning_rate": 1.0604273318188962e-05, "loss": 0.2804, "step": 426700 }, { "epoch": 1.5822761346195198, "grad_norm": 1.1784063577651978, "learning_rate": 1.0586169170291982e-05, "loss": 0.2794, "step": 426800 }, { "epoch": 1.58264686473541, "grad_norm": 0.7076811194419861, "learning_rate": 1.0568078659874942e-05, "loss": 0.263, "step": 426900 }, { "epoch": 1.5830175948513001, "grad_norm": 0.8720622658729553, "learning_rate": 1.055000179319729e-05, "loss": 0.2785, "step": 427000 }, { "epoch": 1.5833883249671903, "grad_norm": 0.4866648316383362, "learning_rate": 1.053193857651375e-05, "loss": 0.2964, "step": 427100 }, { "epoch": 1.5837590550830805, "grad_norm": 1.2741472721099854, "learning_rate": 1.0513889016074347e-05, "loss": 0.2905, "step": 427200 }, { "epoch": 1.5841297851989709, "grad_norm": 0.6436259150505066, "learning_rate": 1.049585311812436e-05, "loss": 0.2696, "step": 427300 }, { "epoch": 1.5845005153148612, "grad_norm": 0.31295228004455566, "learning_rate": 1.0477830888904327e-05, "loss": 0.2621, "step": 427400 }, { "epoch": 1.5848712454307514, "grad_norm": 0.7451156973838806, "learning_rate": 1.0459822334650105e-05, "loss": 0.2853, "step": 427500 }, { "epoch": 1.5852419755466416, "grad_norm": 0.9603539705276489, "learning_rate": 1.0441827461592767e-05, "loss": 0.2595, "step": 427600 }, { "epoch": 1.5856127056625318, "grad_norm": 1.012787938117981, "learning_rate": 1.0423846275958676e-05, "loss": 0.2907, "step": 427700 }, { "epoch": 1.585983435778422, "grad_norm": 1.2012661695480347, "learning_rate": 1.0405878783969458e-05, "loss": 0.2868, "step": 427800 }, { "epoch": 1.586354165894312, "grad_norm": 0.7511788606643677, "learning_rate": 1.0387924991841991e-05, "loss": 0.2687, "step": 427900 }, { "epoch": 1.5867248960102025, "grad_norm": 1.317022442817688, "learning_rate": 1.0369984905788438e-05, "loss": 0.2736, "step": 428000 }, { "epoch": 1.5870956261260927, "grad_norm": 1.3479949235916138, "learning_rate": 1.03520585320162e-05, "loss": 0.2437, "step": 428100 }, { "epoch": 1.587466356241983, "grad_norm": 0.46043843030929565, "learning_rate": 1.0334145876727913e-05, "loss": 0.2721, "step": 428200 }, { "epoch": 1.5878370863578732, "grad_norm": 0.9455525279045105, "learning_rate": 1.0316246946121522e-05, "loss": 0.2779, "step": 428300 }, { "epoch": 1.5882078164737634, "grad_norm": 1.401236653327942, "learning_rate": 1.029836174639019e-05, "loss": 0.2605, "step": 428400 }, { "epoch": 1.5885785465896536, "grad_norm": 1.0933598279953003, "learning_rate": 1.0280490283722288e-05, "loss": 0.2707, "step": 428500 }, { "epoch": 1.5889492767055438, "grad_norm": 0.6865474581718445, "learning_rate": 1.0262632564301516e-05, "loss": 0.283, "step": 428600 }, { "epoch": 1.5893200068214342, "grad_norm": 1.0722981691360474, "learning_rate": 1.024478859430677e-05, "loss": 0.2584, "step": 428700 }, { "epoch": 1.5896907369373243, "grad_norm": 1.6262202262878418, "learning_rate": 1.0226958379912177e-05, "loss": 0.3102, "step": 428800 }, { "epoch": 1.5900614670532147, "grad_norm": 0.5823125243186951, "learning_rate": 1.0209141927287153e-05, "loss": 0.2703, "step": 428900 }, { "epoch": 1.5904321971691049, "grad_norm": 0.9859658479690552, "learning_rate": 1.019133924259631e-05, "loss": 0.2715, "step": 429000 }, { "epoch": 1.590802927284995, "grad_norm": 1.3475481271743774, "learning_rate": 1.0173550331999499e-05, "loss": 0.2809, "step": 429100 }, { "epoch": 1.5911736574008852, "grad_norm": 1.7569077014923096, "learning_rate": 1.0155775201651852e-05, "loss": 0.2631, "step": 429200 }, { "epoch": 1.5915443875167754, "grad_norm": 0.6437801122665405, "learning_rate": 1.0138013857703649e-05, "loss": 0.3001, "step": 429300 }, { "epoch": 1.5919151176326658, "grad_norm": 1.1111927032470703, "learning_rate": 1.0120266306300486e-05, "loss": 0.2884, "step": 429400 }, { "epoch": 1.592285847748556, "grad_norm": 0.9853131175041199, "learning_rate": 1.0102532553583127e-05, "loss": 0.2905, "step": 429500 }, { "epoch": 1.5926565778644464, "grad_norm": 0.8236687779426575, "learning_rate": 1.0084812605687582e-05, "loss": 0.2719, "step": 429600 }, { "epoch": 1.5930273079803365, "grad_norm": 1.0329608917236328, "learning_rate": 1.0067106468745108e-05, "loss": 0.2858, "step": 429700 }, { "epoch": 1.5933980380962267, "grad_norm": 0.9984715580940247, "learning_rate": 1.0049414148882142e-05, "loss": 0.2458, "step": 429800 }, { "epoch": 1.5937687682121169, "grad_norm": 0.9345030188560486, "learning_rate": 1.0031735652220348e-05, "loss": 0.2521, "step": 429900 }, { "epoch": 1.594139498328007, "grad_norm": 2.6236510276794434, "learning_rate": 1.001407098487666e-05, "loss": 0.2625, "step": 430000 }, { "epoch": 1.5945102284438974, "grad_norm": 0.7788618803024292, "learning_rate": 9.996420152963131e-06, "loss": 0.2648, "step": 430100 }, { "epoch": 1.5948809585597876, "grad_norm": 0.8249890208244324, "learning_rate": 9.978783162587124e-06, "loss": 0.271, "step": 430200 }, { "epoch": 1.595251688675678, "grad_norm": 1.0685348510742188, "learning_rate": 9.961160019851147e-06, "loss": 0.2616, "step": 430300 }, { "epoch": 1.5956224187915682, "grad_norm": 1.1358428001403809, "learning_rate": 9.943550730852935e-06, "loss": 0.2698, "step": 430400 }, { "epoch": 1.5959931489074584, "grad_norm": 1.0531165599822998, "learning_rate": 9.925955301685453e-06, "loss": 0.259, "step": 430500 }, { "epoch": 1.5963638790233485, "grad_norm": 1.3295660018920898, "learning_rate": 9.908373738436843e-06, "loss": 0.2777, "step": 430600 }, { "epoch": 1.5967346091392387, "grad_norm": 0.7181895971298218, "learning_rate": 9.890806047190453e-06, "loss": 0.2734, "step": 430700 }, { "epoch": 1.597105339255129, "grad_norm": 1.1930946111679077, "learning_rate": 9.873252234024844e-06, "loss": 0.2956, "step": 430800 }, { "epoch": 1.5974760693710193, "grad_norm": 0.715694010257721, "learning_rate": 9.855712305013753e-06, "loss": 0.2385, "step": 430900 }, { "epoch": 1.5978467994869097, "grad_norm": 0.8921847343444824, "learning_rate": 9.83818626622613e-06, "loss": 0.2452, "step": 431000 }, { "epoch": 1.5982175296027998, "grad_norm": 1.1264493465423584, "learning_rate": 9.820674123726126e-06, "loss": 0.2728, "step": 431100 }, { "epoch": 1.59858825971869, "grad_norm": 0.9412246942520142, "learning_rate": 9.803175883573062e-06, "loss": 0.2729, "step": 431200 }, { "epoch": 1.5989589898345802, "grad_norm": 1.999316692352295, "learning_rate": 9.785691551821457e-06, "loss": 0.2791, "step": 431300 }, { "epoch": 1.5993297199504704, "grad_norm": 0.6728819012641907, "learning_rate": 9.768221134521038e-06, "loss": 0.2671, "step": 431400 }, { "epoch": 1.5997004500663607, "grad_norm": 0.7963821887969971, "learning_rate": 9.750764637716691e-06, "loss": 0.2638, "step": 431500 }, { "epoch": 1.600071180182251, "grad_norm": 0.8820111751556396, "learning_rate": 9.733322067448492e-06, "loss": 0.2634, "step": 431600 }, { "epoch": 1.6004419102981413, "grad_norm": 1.2562901973724365, "learning_rate": 9.715893429751699e-06, "loss": 0.2882, "step": 431700 }, { "epoch": 1.6008126404140315, "grad_norm": 0.5839844942092896, "learning_rate": 9.698478730656746e-06, "loss": 0.2825, "step": 431800 }, { "epoch": 1.6011833705299217, "grad_norm": 1.6888002157211304, "learning_rate": 9.681077976189263e-06, "loss": 0.2906, "step": 431900 }, { "epoch": 1.6015541006458118, "grad_norm": 1.2986587285995483, "learning_rate": 9.663691172370037e-06, "loss": 0.2771, "step": 432000 }, { "epoch": 1.601924830761702, "grad_norm": 1.1158517599105835, "learning_rate": 9.64631832521502e-06, "loss": 0.2737, "step": 432100 }, { "epoch": 1.6022955608775922, "grad_norm": 0.3392268419265747, "learning_rate": 9.628959440735363e-06, "loss": 0.2821, "step": 432200 }, { "epoch": 1.6026662909934826, "grad_norm": 0.30108729004859924, "learning_rate": 9.611614524937368e-06, "loss": 0.2901, "step": 432300 }, { "epoch": 1.603037021109373, "grad_norm": 0.9448392391204834, "learning_rate": 9.594283583822494e-06, "loss": 0.271, "step": 432400 }, { "epoch": 1.6034077512252631, "grad_norm": 1.2695125341415405, "learning_rate": 9.576966623387384e-06, "loss": 0.3115, "step": 432500 }, { "epoch": 1.6037784813411533, "grad_norm": 0.40582069754600525, "learning_rate": 9.55966364962383e-06, "loss": 0.2573, "step": 432600 }, { "epoch": 1.6041492114570435, "grad_norm": 0.8234401345252991, "learning_rate": 9.542374668518794e-06, "loss": 0.2814, "step": 432700 }, { "epoch": 1.6045199415729337, "grad_norm": 1.167030692100525, "learning_rate": 9.5250996860544e-06, "loss": 0.2641, "step": 432800 }, { "epoch": 1.6048906716888238, "grad_norm": 1.355307698249817, "learning_rate": 9.507838708207906e-06, "loss": 0.2666, "step": 432900 }, { "epoch": 1.6052614018047142, "grad_norm": 1.3492358922958374, "learning_rate": 9.490591740951765e-06, "loss": 0.3121, "step": 433000 }, { "epoch": 1.6056321319206044, "grad_norm": 1.072737455368042, "learning_rate": 9.473358790253528e-06, "loss": 0.2695, "step": 433100 }, { "epoch": 1.6060028620364948, "grad_norm": 0.984992504119873, "learning_rate": 9.456139862075925e-06, "loss": 0.2918, "step": 433200 }, { "epoch": 1.606373592152385, "grad_norm": 1.0734270811080933, "learning_rate": 9.438934962376856e-06, "loss": 0.2894, "step": 433300 }, { "epoch": 1.6067443222682751, "grad_norm": 0.4501027762889862, "learning_rate": 9.421744097109331e-06, "loss": 0.2613, "step": 433400 }, { "epoch": 1.6071150523841653, "grad_norm": 0.880838930606842, "learning_rate": 9.40456727222151e-06, "loss": 0.2913, "step": 433500 }, { "epoch": 1.6074857825000555, "grad_norm": 1.175792932510376, "learning_rate": 9.387404493656721e-06, "loss": 0.2627, "step": 433600 }, { "epoch": 1.6078565126159459, "grad_norm": 0.9245967864990234, "learning_rate": 9.370255767353397e-06, "loss": 0.2881, "step": 433700 }, { "epoch": 1.608227242731836, "grad_norm": 0.6649535298347473, "learning_rate": 9.35312109924516e-06, "loss": 0.2766, "step": 433800 }, { "epoch": 1.6085979728477264, "grad_norm": 0.6055174469947815, "learning_rate": 9.336000495260688e-06, "loss": 0.2621, "step": 433900 }, { "epoch": 1.6089687029636166, "grad_norm": 1.1649309396743774, "learning_rate": 9.318893961323849e-06, "loss": 0.2731, "step": 434000 }, { "epoch": 1.6093394330795068, "grad_norm": 0.8061076402664185, "learning_rate": 9.301801503353653e-06, "loss": 0.283, "step": 434100 }, { "epoch": 1.609710163195397, "grad_norm": 1.631717562675476, "learning_rate": 9.2847231272642e-06, "loss": 0.2553, "step": 434200 }, { "epoch": 1.6100808933112871, "grad_norm": 0.39758583903312683, "learning_rate": 9.267658838964733e-06, "loss": 0.2535, "step": 434300 }, { "epoch": 1.6104516234271775, "grad_norm": 0.4298534095287323, "learning_rate": 9.250608644359638e-06, "loss": 0.2896, "step": 434400 }, { "epoch": 1.6108223535430677, "grad_norm": 2.354724884033203, "learning_rate": 9.233572549348408e-06, "loss": 0.2805, "step": 434500 }, { "epoch": 1.611193083658958, "grad_norm": 1.7905901670455933, "learning_rate": 9.216550559825648e-06, "loss": 0.262, "step": 434600 }, { "epoch": 1.6115638137748483, "grad_norm": 0.8321895003318787, "learning_rate": 9.1995426816811e-06, "loss": 0.2728, "step": 434700 }, { "epoch": 1.6119345438907384, "grad_norm": 0.3570813536643982, "learning_rate": 9.182548920799605e-06, "loss": 0.2581, "step": 434800 }, { "epoch": 1.6123052740066286, "grad_norm": 0.7523700594902039, "learning_rate": 9.165569283061155e-06, "loss": 0.2572, "step": 434900 }, { "epoch": 1.6126760041225188, "grad_norm": 0.41789352893829346, "learning_rate": 9.14860377434082e-06, "loss": 0.2882, "step": 435000 }, { "epoch": 1.6130467342384092, "grad_norm": 1.3768147230148315, "learning_rate": 9.131652400508771e-06, "loss": 0.3001, "step": 435100 }, { "epoch": 1.6134174643542993, "grad_norm": 1.0018376111984253, "learning_rate": 9.114715167430348e-06, "loss": 0.2654, "step": 435200 }, { "epoch": 1.6137881944701897, "grad_norm": 0.8850124478340149, "learning_rate": 9.097792080965944e-06, "loss": 0.2729, "step": 435300 }, { "epoch": 1.61415892458608, "grad_norm": 0.695933997631073, "learning_rate": 9.08088314697107e-06, "loss": 0.2604, "step": 435400 }, { "epoch": 1.61452965470197, "grad_norm": 0.9044082164764404, "learning_rate": 9.063988371296344e-06, "loss": 0.2617, "step": 435500 }, { "epoch": 1.6149003848178602, "grad_norm": 0.8882888555526733, "learning_rate": 9.047107759787477e-06, "loss": 0.2749, "step": 435600 }, { "epoch": 1.6152711149337504, "grad_norm": 0.9092077016830444, "learning_rate": 9.03024131828531e-06, "loss": 0.2735, "step": 435700 }, { "epoch": 1.6156418450496408, "grad_norm": 1.189103364944458, "learning_rate": 9.013389052625738e-06, "loss": 0.259, "step": 435800 }, { "epoch": 1.616012575165531, "grad_norm": 1.0547256469726562, "learning_rate": 8.996550968639766e-06, "loss": 0.2579, "step": 435900 }, { "epoch": 1.6163833052814214, "grad_norm": 0.9721396565437317, "learning_rate": 8.979727072153521e-06, "loss": 0.2778, "step": 436000 }, { "epoch": 1.6167540353973116, "grad_norm": 0.7240614295005798, "learning_rate": 8.962917368988194e-06, "loss": 0.2863, "step": 436100 }, { "epoch": 1.6171247655132017, "grad_norm": 1.6890053749084473, "learning_rate": 8.946121864960028e-06, "loss": 0.2883, "step": 436200 }, { "epoch": 1.617495495629092, "grad_norm": 1.013601541519165, "learning_rate": 8.929340565880439e-06, "loss": 0.2633, "step": 436300 }, { "epoch": 1.617866225744982, "grad_norm": 0.9711277484893799, "learning_rate": 8.912573477555864e-06, "loss": 0.2718, "step": 436400 }, { "epoch": 1.6182369558608722, "grad_norm": 0.6809570789337158, "learning_rate": 8.89582060578783e-06, "loss": 0.2712, "step": 436500 }, { "epoch": 1.6186076859767626, "grad_norm": 0.34403082728385925, "learning_rate": 8.879081956372975e-06, "loss": 0.2648, "step": 436600 }, { "epoch": 1.618978416092653, "grad_norm": 1.0164450407028198, "learning_rate": 8.862357535102994e-06, "loss": 0.2686, "step": 436700 }, { "epoch": 1.6193491462085432, "grad_norm": 1.38005530834198, "learning_rate": 8.845647347764658e-06, "loss": 0.281, "step": 436800 }, { "epoch": 1.6197198763244334, "grad_norm": 0.8761342763900757, "learning_rate": 8.828951400139812e-06, "loss": 0.2891, "step": 436900 }, { "epoch": 1.6200906064403235, "grad_norm": 0.6319555044174194, "learning_rate": 8.812269698005371e-06, "loss": 0.2708, "step": 437000 }, { "epoch": 1.6204613365562137, "grad_norm": 0.9748546481132507, "learning_rate": 8.79560224713335e-06, "loss": 0.2788, "step": 437100 }, { "epoch": 1.6208320666721039, "grad_norm": 1.2098987102508545, "learning_rate": 8.778949053290802e-06, "loss": 0.2608, "step": 437200 }, { "epoch": 1.6212027967879943, "grad_norm": 0.87134850025177, "learning_rate": 8.762310122239842e-06, "loss": 0.3152, "step": 437300 }, { "epoch": 1.6215735269038847, "grad_norm": 0.45271456241607666, "learning_rate": 8.745685459737695e-06, "loss": 0.2692, "step": 437400 }, { "epoch": 1.6219442570197748, "grad_norm": 0.5589248538017273, "learning_rate": 8.729075071536597e-06, "loss": 0.28, "step": 437500 }, { "epoch": 1.622314987135665, "grad_norm": 0.9365842342376709, "learning_rate": 8.712478963383875e-06, "loss": 0.2895, "step": 437600 }, { "epoch": 1.6226857172515552, "grad_norm": 1.0550915002822876, "learning_rate": 8.695897141021896e-06, "loss": 0.2451, "step": 437700 }, { "epoch": 1.6230564473674454, "grad_norm": 0.5498074293136597, "learning_rate": 8.679329610188091e-06, "loss": 0.2684, "step": 437800 }, { "epoch": 1.6234271774833355, "grad_norm": 0.9327119588851929, "learning_rate": 8.662776376614972e-06, "loss": 0.2676, "step": 437900 }, { "epoch": 1.623797907599226, "grad_norm": 1.016305685043335, "learning_rate": 8.646237446030076e-06, "loss": 0.2794, "step": 438000 }, { "epoch": 1.624168637715116, "grad_norm": 0.3471633791923523, "learning_rate": 8.629712824155967e-06, "loss": 0.255, "step": 438100 }, { "epoch": 1.6245393678310065, "grad_norm": 1.4733232259750366, "learning_rate": 8.613202516710328e-06, "loss": 0.269, "step": 438200 }, { "epoch": 1.6249100979468967, "grad_norm": 0.627934455871582, "learning_rate": 8.59670652940584e-06, "loss": 0.2985, "step": 438300 }, { "epoch": 1.6252808280627868, "grad_norm": 1.3485788106918335, "learning_rate": 8.580224867950204e-06, "loss": 0.2801, "step": 438400 }, { "epoch": 1.625651558178677, "grad_norm": 2.2360777854919434, "learning_rate": 8.563757538046236e-06, "loss": 0.2675, "step": 438500 }, { "epoch": 1.6260222882945672, "grad_norm": 1.37287175655365, "learning_rate": 8.547304545391738e-06, "loss": 0.2525, "step": 438600 }, { "epoch": 1.6263930184104576, "grad_norm": 1.2214555740356445, "learning_rate": 8.530865895679563e-06, "loss": 0.3044, "step": 438700 }, { "epoch": 1.6267637485263478, "grad_norm": 0.6987506151199341, "learning_rate": 8.514441594597627e-06, "loss": 0.2742, "step": 438800 }, { "epoch": 1.6271344786422381, "grad_norm": 1.63481867313385, "learning_rate": 8.498031647828852e-06, "loss": 0.2727, "step": 438900 }, { "epoch": 1.6275052087581283, "grad_norm": 0.785466194152832, "learning_rate": 8.481636061051184e-06, "loss": 0.2676, "step": 439000 }, { "epoch": 1.6278759388740185, "grad_norm": 0.7725498080253601, "learning_rate": 8.465254839937658e-06, "loss": 0.2574, "step": 439100 }, { "epoch": 1.6282466689899087, "grad_norm": 1.0720007419586182, "learning_rate": 8.448887990156256e-06, "loss": 0.251, "step": 439200 }, { "epoch": 1.6286173991057988, "grad_norm": 0.5629092454910278, "learning_rate": 8.43253551737006e-06, "loss": 0.2795, "step": 439300 }, { "epoch": 1.6289881292216892, "grad_norm": 0.79253751039505, "learning_rate": 8.416197427237133e-06, "loss": 0.2758, "step": 439400 }, { "epoch": 1.6293588593375794, "grad_norm": 1.2755682468414307, "learning_rate": 8.399873725410573e-06, "loss": 0.2768, "step": 439500 }, { "epoch": 1.6297295894534698, "grad_norm": 2.555133104324341, "learning_rate": 8.38356441753852e-06, "loss": 0.2657, "step": 439600 }, { "epoch": 1.63010031956936, "grad_norm": 0.772835910320282, "learning_rate": 8.367269509264108e-06, "loss": 0.2866, "step": 439700 }, { "epoch": 1.6304710496852501, "grad_norm": 0.4254125654697418, "learning_rate": 8.350989006225479e-06, "loss": 0.256, "step": 439800 }, { "epoch": 1.6308417798011403, "grad_norm": 0.9071197509765625, "learning_rate": 8.334722914055854e-06, "loss": 0.2613, "step": 439900 }, { "epoch": 1.6312125099170305, "grad_norm": 0.7129960656166077, "learning_rate": 8.318471238383368e-06, "loss": 0.2717, "step": 440000 }, { "epoch": 1.6315832400329209, "grad_norm": 1.3636972904205322, "learning_rate": 8.302233984831264e-06, "loss": 0.2747, "step": 440100 }, { "epoch": 1.631953970148811, "grad_norm": 1.163549542427063, "learning_rate": 8.286011159017732e-06, "loss": 0.2634, "step": 440200 }, { "epoch": 1.6323247002647014, "grad_norm": 1.8368840217590332, "learning_rate": 8.269802766555985e-06, "loss": 0.2912, "step": 440300 }, { "epoch": 1.6326954303805916, "grad_norm": 1.0927997827529907, "learning_rate": 8.25360881305427e-06, "loss": 0.2808, "step": 440400 }, { "epoch": 1.6330661604964818, "grad_norm": 0.850825309753418, "learning_rate": 8.237429304115807e-06, "loss": 0.2728, "step": 440500 }, { "epoch": 1.633436890612372, "grad_norm": 0.6368454694747925, "learning_rate": 8.221264245338828e-06, "loss": 0.2397, "step": 440600 }, { "epoch": 1.6338076207282621, "grad_norm": 0.7371348738670349, "learning_rate": 8.205113642316554e-06, "loss": 0.2752, "step": 440700 }, { "epoch": 1.6341783508441525, "grad_norm": 0.2461865395307541, "learning_rate": 8.188977500637224e-06, "loss": 0.2374, "step": 440800 }, { "epoch": 1.6345490809600427, "grad_norm": 1.8441286087036133, "learning_rate": 8.172855825884045e-06, "loss": 0.2883, "step": 440900 }, { "epoch": 1.634919811075933, "grad_norm": 1.220971703529358, "learning_rate": 8.156748623635269e-06, "loss": 0.2487, "step": 441000 }, { "epoch": 1.6352905411918233, "grad_norm": 0.44855839014053345, "learning_rate": 8.140655899464095e-06, "loss": 0.2612, "step": 441100 }, { "epoch": 1.6356612713077134, "grad_norm": 1.110564947128296, "learning_rate": 8.124577658938704e-06, "loss": 0.2767, "step": 441200 }, { "epoch": 1.6360320014236036, "grad_norm": 0.21516260504722595, "learning_rate": 8.108513907622323e-06, "loss": 0.2648, "step": 441300 }, { "epoch": 1.6364027315394938, "grad_norm": 0.549366295337677, "learning_rate": 8.092464651073106e-06, "loss": 0.2661, "step": 441400 }, { "epoch": 1.636773461655384, "grad_norm": 0.9979655742645264, "learning_rate": 8.076429894844229e-06, "loss": 0.2643, "step": 441500 }, { "epoch": 1.6371441917712743, "grad_norm": 0.306366503238678, "learning_rate": 8.06040964448383e-06, "loss": 0.2497, "step": 441600 }, { "epoch": 1.6375149218871647, "grad_norm": 0.3124513328075409, "learning_rate": 8.044403905535025e-06, "loss": 0.2861, "step": 441700 }, { "epoch": 1.637885652003055, "grad_norm": 2.1207873821258545, "learning_rate": 8.028412683535941e-06, "loss": 0.2486, "step": 441800 }, { "epoch": 1.638256382118945, "grad_norm": 0.7493254542350769, "learning_rate": 8.012435984019651e-06, "loss": 0.2651, "step": 441900 }, { "epoch": 1.6386271122348353, "grad_norm": 0.6418402194976807, "learning_rate": 7.9964738125142e-06, "loss": 0.2665, "step": 442000 }, { "epoch": 1.6389978423507254, "grad_norm": 0.6755656599998474, "learning_rate": 7.98052617454264e-06, "loss": 0.2555, "step": 442100 }, { "epoch": 1.6393685724666156, "grad_norm": 0.9085489511489868, "learning_rate": 7.964593075622962e-06, "loss": 0.2463, "step": 442200 }, { "epoch": 1.639739302582506, "grad_norm": 0.9396021962165833, "learning_rate": 7.94867452126814e-06, "loss": 0.2735, "step": 442300 }, { "epoch": 1.6401100326983962, "grad_norm": 1.0098880529403687, "learning_rate": 7.93277051698611e-06, "loss": 0.2532, "step": 442400 }, { "epoch": 1.6404807628142866, "grad_norm": 1.7050931453704834, "learning_rate": 7.916881068279768e-06, "loss": 0.2742, "step": 442500 }, { "epoch": 1.6408514929301767, "grad_norm": 0.7208085060119629, "learning_rate": 7.901006180647002e-06, "loss": 0.2611, "step": 442600 }, { "epoch": 1.641222223046067, "grad_norm": 0.9738473296165466, "learning_rate": 7.885145859580629e-06, "loss": 0.2702, "step": 442700 }, { "epoch": 1.641592953161957, "grad_norm": 1.174720048904419, "learning_rate": 7.869300110568434e-06, "loss": 0.281, "step": 442800 }, { "epoch": 1.6419636832778473, "grad_norm": 0.7072051167488098, "learning_rate": 7.85346893909319e-06, "loss": 0.2547, "step": 442900 }, { "epoch": 1.6423344133937376, "grad_norm": 2.022030830383301, "learning_rate": 7.837652350632568e-06, "loss": 0.2719, "step": 443000 }, { "epoch": 1.6427051435096278, "grad_norm": 1.1673109531402588, "learning_rate": 7.821850350659231e-06, "loss": 0.2592, "step": 443100 }, { "epoch": 1.6430758736255182, "grad_norm": 0.8436570763587952, "learning_rate": 7.806062944640807e-06, "loss": 0.2372, "step": 443200 }, { "epoch": 1.6434466037414084, "grad_norm": 0.7066800594329834, "learning_rate": 7.790290138039851e-06, "loss": 0.2433, "step": 443300 }, { "epoch": 1.6438173338572986, "grad_norm": 0.9081747531890869, "learning_rate": 7.77453193631385e-06, "loss": 0.2895, "step": 443400 }, { "epoch": 1.6441880639731887, "grad_norm": 0.9134851098060608, "learning_rate": 7.758788344915297e-06, "loss": 0.2712, "step": 443500 }, { "epoch": 1.644558794089079, "grad_norm": 0.8450581431388855, "learning_rate": 7.743059369291567e-06, "loss": 0.2978, "step": 443600 }, { "epoch": 1.6449295242049693, "grad_norm": 1.1301175355911255, "learning_rate": 7.727345014885006e-06, "loss": 0.2567, "step": 443700 }, { "epoch": 1.6453002543208595, "grad_norm": 0.7090445160865784, "learning_rate": 7.711645287132908e-06, "loss": 0.2864, "step": 443800 }, { "epoch": 1.6456709844367499, "grad_norm": 1.5397111177444458, "learning_rate": 7.695960191467472e-06, "loss": 0.2712, "step": 443900 }, { "epoch": 1.64604171455264, "grad_norm": 1.417982816696167, "learning_rate": 7.680289733315892e-06, "loss": 0.2954, "step": 444000 }, { "epoch": 1.6464124446685302, "grad_norm": 1.131514072418213, "learning_rate": 7.66463391810024e-06, "loss": 0.2677, "step": 444100 }, { "epoch": 1.6467831747844204, "grad_norm": 0.8210633993148804, "learning_rate": 7.648992751237543e-06, "loss": 0.2512, "step": 444200 }, { "epoch": 1.6471539049003106, "grad_norm": 0.8689486384391785, "learning_rate": 7.633366238139783e-06, "loss": 0.3049, "step": 444300 }, { "epoch": 1.647524635016201, "grad_norm": 0.6666261553764343, "learning_rate": 7.6177543842138375e-06, "loss": 0.2945, "step": 444400 }, { "epoch": 1.6478953651320911, "grad_norm": 1.3674116134643555, "learning_rate": 7.602157194861526e-06, "loss": 0.2785, "step": 444500 }, { "epoch": 1.6482660952479815, "grad_norm": 0.6418777108192444, "learning_rate": 7.586574675479591e-06, "loss": 0.2392, "step": 444600 }, { "epoch": 1.6486368253638717, "grad_norm": 1.3173434734344482, "learning_rate": 7.571006831459687e-06, "loss": 0.2893, "step": 444700 }, { "epoch": 1.6490075554797619, "grad_norm": 1.2148782014846802, "learning_rate": 7.555453668188428e-06, "loss": 0.2882, "step": 444800 }, { "epoch": 1.649378285595652, "grad_norm": 0.42820486426353455, "learning_rate": 7.53991519104732e-06, "loss": 0.2883, "step": 444900 }, { "epoch": 1.6497490157115422, "grad_norm": 0.8128684163093567, "learning_rate": 7.524391405412773e-06, "loss": 0.2621, "step": 445000 }, { "epoch": 1.6501197458274326, "grad_norm": 0.7126721739768982, "learning_rate": 7.5088823166561575e-06, "loss": 0.2444, "step": 445100 }, { "epoch": 1.6504904759433228, "grad_norm": 0.8227465748786926, "learning_rate": 7.493387930143736e-06, "loss": 0.3022, "step": 445200 }, { "epoch": 1.6508612060592132, "grad_norm": 1.439645767211914, "learning_rate": 7.477908251236649e-06, "loss": 0.2908, "step": 445300 }, { "epoch": 1.6512319361751033, "grad_norm": 2.34385085105896, "learning_rate": 7.46244328529101e-06, "loss": 0.289, "step": 445400 }, { "epoch": 1.6516026662909935, "grad_norm": 0.8082863092422485, "learning_rate": 7.446993037657796e-06, "loss": 0.2558, "step": 445500 }, { "epoch": 1.6519733964068837, "grad_norm": 0.9427903890609741, "learning_rate": 7.431557513682924e-06, "loss": 0.2635, "step": 445600 }, { "epoch": 1.6523441265227738, "grad_norm": 1.1656283140182495, "learning_rate": 7.416136718707195e-06, "loss": 0.2534, "step": 445700 }, { "epoch": 1.652714856638664, "grad_norm": 1.0704691410064697, "learning_rate": 7.400730658066302e-06, "loss": 0.2787, "step": 445800 }, { "epoch": 1.6530855867545544, "grad_norm": 1.031025767326355, "learning_rate": 7.385339337090891e-06, "loss": 0.2625, "step": 445900 }, { "epoch": 1.6534563168704448, "grad_norm": 1.4385545253753662, "learning_rate": 7.369962761106464e-06, "loss": 0.2664, "step": 446000 }, { "epoch": 1.653827046986335, "grad_norm": 0.8092523217201233, "learning_rate": 7.3546009354333975e-06, "loss": 0.2633, "step": 446100 }, { "epoch": 1.6541977771022252, "grad_norm": 0.927738606929779, "learning_rate": 7.339253865387036e-06, "loss": 0.2517, "step": 446200 }, { "epoch": 1.6545685072181153, "grad_norm": 0.5259364247322083, "learning_rate": 7.323921556277569e-06, "loss": 0.2638, "step": 446300 }, { "epoch": 1.6549392373340055, "grad_norm": 1.0320645570755005, "learning_rate": 7.308604013410076e-06, "loss": 0.2432, "step": 446400 }, { "epoch": 1.6553099674498957, "grad_norm": 1.5153539180755615, "learning_rate": 7.29330124208456e-06, "loss": 0.2614, "step": 446500 }, { "epoch": 1.655680697565786, "grad_norm": 2.1029324531555176, "learning_rate": 7.278013247595894e-06, "loss": 0.2777, "step": 446600 }, { "epoch": 1.6560514276816765, "grad_norm": 0.5257918834686279, "learning_rate": 7.262740035233834e-06, "loss": 0.2419, "step": 446700 }, { "epoch": 1.6564221577975666, "grad_norm": 0.2830110192298889, "learning_rate": 7.247481610283019e-06, "loss": 0.2483, "step": 446800 }, { "epoch": 1.6567928879134568, "grad_norm": 1.1102041006088257, "learning_rate": 7.232237978022971e-06, "loss": 0.2796, "step": 446900 }, { "epoch": 1.657163618029347, "grad_norm": 0.4103546142578125, "learning_rate": 7.217009143728132e-06, "loss": 0.2716, "step": 447000 }, { "epoch": 1.6575343481452371, "grad_norm": 0.6573817729949951, "learning_rate": 7.201795112667764e-06, "loss": 0.2696, "step": 447100 }, { "epoch": 1.6579050782611273, "grad_norm": 1.2901395559310913, "learning_rate": 7.186595890106035e-06, "loss": 0.2954, "step": 447200 }, { "epoch": 1.6582758083770177, "grad_norm": 0.5576220750808716, "learning_rate": 7.171411481302015e-06, "loss": 0.2435, "step": 447300 }, { "epoch": 1.6586465384929079, "grad_norm": 0.9704784154891968, "learning_rate": 7.156241891509602e-06, "loss": 0.2867, "step": 447400 }, { "epoch": 1.6590172686087983, "grad_norm": 1.1405389308929443, "learning_rate": 7.141087125977597e-06, "loss": 0.2687, "step": 447500 }, { "epoch": 1.6593879987246885, "grad_norm": 0.7967250943183899, "learning_rate": 7.125947189949656e-06, "loss": 0.2816, "step": 447600 }, { "epoch": 1.6597587288405786, "grad_norm": 0.8923775553703308, "learning_rate": 7.110822088664293e-06, "loss": 0.2698, "step": 447700 }, { "epoch": 1.6601294589564688, "grad_norm": 0.5215062499046326, "learning_rate": 7.095711827354945e-06, "loss": 0.2775, "step": 447800 }, { "epoch": 1.660500189072359, "grad_norm": 1.398775577545166, "learning_rate": 7.080616411249852e-06, "loss": 0.2683, "step": 447900 }, { "epoch": 1.6608709191882494, "grad_norm": 1.055212378501892, "learning_rate": 7.06553584557213e-06, "loss": 0.3052, "step": 448000 }, { "epoch": 1.6612416493041395, "grad_norm": 0.8017084002494812, "learning_rate": 7.050470135539794e-06, "loss": 0.2886, "step": 448100 }, { "epoch": 1.66161237942003, "grad_norm": 1.1486412286758423, "learning_rate": 7.03541928636568e-06, "loss": 0.2651, "step": 448200 }, { "epoch": 1.66198310953592, "grad_norm": 0.7516258358955383, "learning_rate": 7.0203833032574885e-06, "loss": 0.259, "step": 448300 }, { "epoch": 1.6623538396518103, "grad_norm": 1.4459190368652344, "learning_rate": 7.005362191417791e-06, "loss": 0.2818, "step": 448400 }, { "epoch": 1.6627245697677004, "grad_norm": 1.1883857250213623, "learning_rate": 6.990355956043998e-06, "loss": 0.2675, "step": 448500 }, { "epoch": 1.6630952998835906, "grad_norm": 0.8303999304771423, "learning_rate": 6.975364602328371e-06, "loss": 0.2707, "step": 448600 }, { "epoch": 1.663466029999481, "grad_norm": 0.9845766425132751, "learning_rate": 6.960388135458057e-06, "loss": 0.2639, "step": 448700 }, { "epoch": 1.6638367601153712, "grad_norm": 1.6768616437911987, "learning_rate": 6.945426560615004e-06, "loss": 0.2807, "step": 448800 }, { "epoch": 1.6642074902312616, "grad_norm": 1.7713204622268677, "learning_rate": 6.930479882976021e-06, "loss": 0.2446, "step": 448900 }, { "epoch": 1.6645782203471517, "grad_norm": 1.4850385189056396, "learning_rate": 6.915548107712811e-06, "loss": 0.279, "step": 449000 }, { "epoch": 1.664948950463042, "grad_norm": 1.990812063217163, "learning_rate": 6.900631239991823e-06, "loss": 0.2806, "step": 449100 }, { "epoch": 1.665319680578932, "grad_norm": 1.1675734519958496, "learning_rate": 6.885729284974451e-06, "loss": 0.2552, "step": 449200 }, { "epoch": 1.6656904106948223, "grad_norm": 0.96539306640625, "learning_rate": 6.870842247816861e-06, "loss": 0.2733, "step": 449300 }, { "epoch": 1.6660611408107127, "grad_norm": 2.40336012840271, "learning_rate": 6.85597013367007e-06, "loss": 0.281, "step": 449400 }, { "epoch": 1.6664318709266028, "grad_norm": 0.9211331605911255, "learning_rate": 6.841112947679973e-06, "loss": 0.2688, "step": 449500 }, { "epoch": 1.6668026010424932, "grad_norm": 0.8905049562454224, "learning_rate": 6.8262706949872414e-06, "loss": 0.2601, "step": 449600 }, { "epoch": 1.6671733311583834, "grad_norm": 0.540223240852356, "learning_rate": 6.8114433807274056e-06, "loss": 0.2564, "step": 449700 }, { "epoch": 1.6675440612742736, "grad_norm": 1.1121909618377686, "learning_rate": 6.796631010030851e-06, "loss": 0.2841, "step": 449800 }, { "epoch": 1.6679147913901637, "grad_norm": 3.953554391860962, "learning_rate": 6.781833588022735e-06, "loss": 0.2696, "step": 449900 }, { "epoch": 1.668285521506054, "grad_norm": 0.8037930727005005, "learning_rate": 6.767051119823098e-06, "loss": 0.2586, "step": 450000 }, { "epoch": 1.6686562516219443, "grad_norm": 1.553105354309082, "learning_rate": 6.752283610546784e-06, "loss": 0.2682, "step": 450100 }, { "epoch": 1.6690269817378345, "grad_norm": 0.547755241394043, "learning_rate": 6.7375310653034426e-06, "loss": 0.2749, "step": 450200 }, { "epoch": 1.6693977118537249, "grad_norm": 1.953466534614563, "learning_rate": 6.722793489197593e-06, "loss": 0.2916, "step": 450300 }, { "epoch": 1.669768441969615, "grad_norm": 1.9869996309280396, "learning_rate": 6.708070887328527e-06, "loss": 0.2716, "step": 450400 }, { "epoch": 1.6701391720855052, "grad_norm": 2.3818793296813965, "learning_rate": 6.693363264790381e-06, "loss": 0.2773, "step": 450500 }, { "epoch": 1.6705099022013954, "grad_norm": 0.8423219919204712, "learning_rate": 6.678670626672106e-06, "loss": 0.2491, "step": 450600 }, { "epoch": 1.6708806323172856, "grad_norm": 1.1674364805221558, "learning_rate": 6.663992978057459e-06, "loss": 0.2835, "step": 450700 }, { "epoch": 1.6712513624331757, "grad_norm": 0.4758494794368744, "learning_rate": 6.649330324025005e-06, "loss": 0.2585, "step": 450800 }, { "epoch": 1.6716220925490661, "grad_norm": 1.2344334125518799, "learning_rate": 6.634682669648151e-06, "loss": 0.2572, "step": 450900 }, { "epoch": 1.6719928226649565, "grad_norm": 1.0608562231063843, "learning_rate": 6.620050019995095e-06, "loss": 0.2781, "step": 451000 }, { "epoch": 1.6723635527808467, "grad_norm": 0.6894645690917969, "learning_rate": 6.605432380128817e-06, "loss": 0.2776, "step": 451100 }, { "epoch": 1.6727342828967369, "grad_norm": 0.5485194325447083, "learning_rate": 6.590829755107164e-06, "loss": 0.2615, "step": 451200 }, { "epoch": 1.673105013012627, "grad_norm": 1.985517978668213, "learning_rate": 6.576242149982742e-06, "loss": 0.2513, "step": 451300 }, { "epoch": 1.6734757431285172, "grad_norm": 1.0573856830596924, "learning_rate": 6.561669569802959e-06, "loss": 0.259, "step": 451400 }, { "epoch": 1.6738464732444074, "grad_norm": 1.7727720737457275, "learning_rate": 6.547112019610052e-06, "loss": 0.2737, "step": 451500 }, { "epoch": 1.6742172033602978, "grad_norm": 2.271575927734375, "learning_rate": 6.5325695044410164e-06, "loss": 0.2611, "step": 451600 }, { "epoch": 1.674587933476188, "grad_norm": 1.6156085729599, "learning_rate": 6.5180420293277086e-06, "loss": 0.295, "step": 451700 }, { "epoch": 1.6749586635920783, "grad_norm": 0.4983724355697632, "learning_rate": 6.503529599296721e-06, "loss": 0.264, "step": 451800 }, { "epoch": 1.6753293937079685, "grad_norm": 1.27884840965271, "learning_rate": 6.4890322193694505e-06, "loss": 0.2571, "step": 451900 }, { "epoch": 1.6757001238238587, "grad_norm": 0.7283310294151306, "learning_rate": 6.474549894562132e-06, "loss": 0.3051, "step": 452000 }, { "epoch": 1.6760708539397489, "grad_norm": 0.8997449278831482, "learning_rate": 6.460082629885739e-06, "loss": 0.2259, "step": 452100 }, { "epoch": 1.676441584055639, "grad_norm": 0.8740087151527405, "learning_rate": 6.4456304303460555e-06, "loss": 0.2776, "step": 452200 }, { "epoch": 1.6768123141715294, "grad_norm": 1.3361930847167969, "learning_rate": 6.43119330094365e-06, "loss": 0.2972, "step": 452300 }, { "epoch": 1.6771830442874196, "grad_norm": 0.32292336225509644, "learning_rate": 6.416771246673869e-06, "loss": 0.2673, "step": 452400 }, { "epoch": 1.67755377440331, "grad_norm": 0.49236080050468445, "learning_rate": 6.402364272526867e-06, "loss": 0.2546, "step": 452500 }, { "epoch": 1.6779245045192002, "grad_norm": 0.7242713570594788, "learning_rate": 6.387972383487556e-06, "loss": 0.2746, "step": 452600 }, { "epoch": 1.6782952346350903, "grad_norm": 1.735625982284546, "learning_rate": 6.37359558453563e-06, "loss": 0.2545, "step": 452700 }, { "epoch": 1.6786659647509805, "grad_norm": 1.6984846591949463, "learning_rate": 6.359233880645593e-06, "loss": 0.2769, "step": 452800 }, { "epoch": 1.6790366948668707, "grad_norm": 1.6945559978485107, "learning_rate": 6.344887276786682e-06, "loss": 0.2311, "step": 452900 }, { "epoch": 1.679407424982761, "grad_norm": 0.8612693548202515, "learning_rate": 6.330555777922914e-06, "loss": 0.2499, "step": 453000 }, { "epoch": 1.6797781550986512, "grad_norm": 1.0352541208267212, "learning_rate": 6.316239389013129e-06, "loss": 0.2526, "step": 453100 }, { "epoch": 1.6801488852145416, "grad_norm": 1.1981514692306519, "learning_rate": 6.301938115010891e-06, "loss": 0.2751, "step": 453200 }, { "epoch": 1.6805196153304318, "grad_norm": 0.8348137736320496, "learning_rate": 6.287651960864532e-06, "loss": 0.2701, "step": 453300 }, { "epoch": 1.680890345446322, "grad_norm": 1.2895296812057495, "learning_rate": 6.273380931517198e-06, "loss": 0.2732, "step": 453400 }, { "epoch": 1.6812610755622122, "grad_norm": 0.9798968434333801, "learning_rate": 6.2591250319067485e-06, "loss": 0.2632, "step": 453500 }, { "epoch": 1.6816318056781023, "grad_norm": 2.308608293533325, "learning_rate": 6.244884266965839e-06, "loss": 0.2717, "step": 453600 }, { "epoch": 1.6820025357939927, "grad_norm": 1.1862900257110596, "learning_rate": 6.230658641621884e-06, "loss": 0.2599, "step": 453700 }, { "epoch": 1.682373265909883, "grad_norm": 0.4699949324131012, "learning_rate": 6.216448160797039e-06, "loss": 0.2558, "step": 453800 }, { "epoch": 1.6827439960257733, "grad_norm": 0.5423595905303955, "learning_rate": 6.202252829408261e-06, "loss": 0.2566, "step": 453900 }, { "epoch": 1.6831147261416635, "grad_norm": 0.6256350874900818, "learning_rate": 6.188072652367227e-06, "loss": 0.2753, "step": 454000 }, { "epoch": 1.6834854562575536, "grad_norm": 1.2415119409561157, "learning_rate": 6.173907634580378e-06, "loss": 0.2692, "step": 454100 }, { "epoch": 1.6838561863734438, "grad_norm": 0.8332960605621338, "learning_rate": 6.159757780948938e-06, "loss": 0.2674, "step": 454200 }, { "epoch": 1.684226916489334, "grad_norm": 2.1058380603790283, "learning_rate": 6.145623096368841e-06, "loss": 0.2465, "step": 454300 }, { "epoch": 1.6845976466052244, "grad_norm": 1.7291224002838135, "learning_rate": 6.131503585730808e-06, "loss": 0.2717, "step": 454400 }, { "epoch": 1.6849683767211145, "grad_norm": 0.2718738913536072, "learning_rate": 6.117399253920286e-06, "loss": 0.2697, "step": 454500 }, { "epoch": 1.685339106837005, "grad_norm": 0.7042381763458252, "learning_rate": 6.103310105817467e-06, "loss": 0.2644, "step": 454600 }, { "epoch": 1.6857098369528951, "grad_norm": 0.747858464717865, "learning_rate": 6.089236146297334e-06, "loss": 0.2544, "step": 454700 }, { "epoch": 1.6860805670687853, "grad_norm": 1.2314015626907349, "learning_rate": 6.075177380229557e-06, "loss": 0.237, "step": 454800 }, { "epoch": 1.6864512971846755, "grad_norm": 1.6858927011489868, "learning_rate": 6.061133812478576e-06, "loss": 0.2482, "step": 454900 }, { "epoch": 1.6868220273005656, "grad_norm": 1.4337347745895386, "learning_rate": 6.0471054479035835e-06, "loss": 0.2873, "step": 455000 }, { "epoch": 1.687192757416456, "grad_norm": 0.4057309329509735, "learning_rate": 6.0330922913585e-06, "loss": 0.2711, "step": 455100 }, { "epoch": 1.6875634875323462, "grad_norm": 0.9640792608261108, "learning_rate": 6.01909434769195e-06, "loss": 0.2805, "step": 455200 }, { "epoch": 1.6879342176482366, "grad_norm": 0.9594936966896057, "learning_rate": 6.005111621747367e-06, "loss": 0.2978, "step": 455300 }, { "epoch": 1.6883049477641268, "grad_norm": 0.8198885321617126, "learning_rate": 5.991144118362857e-06, "loss": 0.2669, "step": 455400 }, { "epoch": 1.688675677880017, "grad_norm": 0.6744568943977356, "learning_rate": 5.9771918423712715e-06, "loss": 0.2566, "step": 455500 }, { "epoch": 1.689046407995907, "grad_norm": 0.759061336517334, "learning_rate": 5.963254798600226e-06, "loss": 0.2744, "step": 455600 }, { "epoch": 1.6894171381117973, "grad_norm": 0.7586274147033691, "learning_rate": 5.949332991872036e-06, "loss": 0.2888, "step": 455700 }, { "epoch": 1.6897878682276875, "grad_norm": 1.9117318391799927, "learning_rate": 5.935426427003732e-06, "loss": 0.2639, "step": 455800 }, { "epoch": 1.6901585983435778, "grad_norm": 0.7399161458015442, "learning_rate": 5.921535108807125e-06, "loss": 0.2448, "step": 455900 }, { "epoch": 1.6905293284594682, "grad_norm": 1.0870819091796875, "learning_rate": 5.9076590420886704e-06, "loss": 0.2571, "step": 456000 }, { "epoch": 1.6909000585753584, "grad_norm": 1.0515432357788086, "learning_rate": 5.893798231649622e-06, "loss": 0.3079, "step": 456100 }, { "epoch": 1.6912707886912486, "grad_norm": 0.6854209899902344, "learning_rate": 5.879952682285922e-06, "loss": 0.2476, "step": 456200 }, { "epoch": 1.6916415188071388, "grad_norm": 0.3610565662384033, "learning_rate": 5.86612239878821e-06, "loss": 0.2706, "step": 456300 }, { "epoch": 1.692012248923029, "grad_norm": 0.6459169387817383, "learning_rate": 5.852307385941902e-06, "loss": 0.2548, "step": 456400 }, { "epoch": 1.692382979038919, "grad_norm": 0.6945053339004517, "learning_rate": 5.8385076485270754e-06, "loss": 0.2349, "step": 456500 }, { "epoch": 1.6927537091548095, "grad_norm": 0.9970824122428894, "learning_rate": 5.824723191318543e-06, "loss": 0.2587, "step": 456600 }, { "epoch": 1.6931244392706997, "grad_norm": 0.848026692867279, "learning_rate": 5.810954019085835e-06, "loss": 0.2786, "step": 456700 }, { "epoch": 1.69349516938659, "grad_norm": 0.4415178894996643, "learning_rate": 5.797200136593173e-06, "loss": 0.2502, "step": 456800 }, { "epoch": 1.6938658995024802, "grad_norm": 1.4043852090835571, "learning_rate": 5.783461548599523e-06, "loss": 0.2704, "step": 456900 }, { "epoch": 1.6942366296183704, "grad_norm": 1.0752079486846924, "learning_rate": 5.769738259858537e-06, "loss": 0.2406, "step": 457000 }, { "epoch": 1.6946073597342606, "grad_norm": 0.7045899033546448, "learning_rate": 5.756030275118557e-06, "loss": 0.2395, "step": 457100 }, { "epoch": 1.6949780898501507, "grad_norm": 0.8203831315040588, "learning_rate": 5.742337599122671e-06, "loss": 0.2772, "step": 457200 }, { "epoch": 1.6953488199660411, "grad_norm": 1.9197996854782104, "learning_rate": 5.728660236608635e-06, "loss": 0.2633, "step": 457300 }, { "epoch": 1.6957195500819313, "grad_norm": 0.37615013122558594, "learning_rate": 5.714998192308924e-06, "loss": 0.2587, "step": 457400 }, { "epoch": 1.6960902801978217, "grad_norm": 1.154141902923584, "learning_rate": 5.701351470950705e-06, "loss": 0.2666, "step": 457500 }, { "epoch": 1.6964610103137119, "grad_norm": 1.2582743167877197, "learning_rate": 5.6877200772558316e-06, "loss": 0.2652, "step": 457600 }, { "epoch": 1.696831740429602, "grad_norm": 1.0360419750213623, "learning_rate": 5.6741040159408896e-06, "loss": 0.2496, "step": 457700 }, { "epoch": 1.6972024705454922, "grad_norm": 0.8874717354774475, "learning_rate": 5.660503291717134e-06, "loss": 0.2846, "step": 457800 }, { "epoch": 1.6975732006613824, "grad_norm": 0.7773609757423401, "learning_rate": 5.6469179092905e-06, "loss": 0.2646, "step": 457900 }, { "epoch": 1.6979439307772728, "grad_norm": 0.9688079357147217, "learning_rate": 5.633347873361655e-06, "loss": 0.2643, "step": 458000 }, { "epoch": 1.698314660893163, "grad_norm": 0.5747445225715637, "learning_rate": 5.619793188625921e-06, "loss": 0.2841, "step": 458100 }, { "epoch": 1.6986853910090534, "grad_norm": 0.6941773295402527, "learning_rate": 5.6062538597733195e-06, "loss": 0.2506, "step": 458200 }, { "epoch": 1.6990561211249435, "grad_norm": 0.9163083434104919, "learning_rate": 5.592729891488563e-06, "loss": 0.2531, "step": 458300 }, { "epoch": 1.6994268512408337, "grad_norm": 1.089479923248291, "learning_rate": 5.579221288451037e-06, "loss": 0.2844, "step": 458400 }, { "epoch": 1.6997975813567239, "grad_norm": 1.0658237934112549, "learning_rate": 5.565728055334823e-06, "loss": 0.2836, "step": 458500 }, { "epoch": 1.700168311472614, "grad_norm": 1.4096611738204956, "learning_rate": 5.552250196808689e-06, "loss": 0.2877, "step": 458600 }, { "epoch": 1.7005390415885044, "grad_norm": 1.165905237197876, "learning_rate": 5.538787717536071e-06, "loss": 0.297, "step": 458700 }, { "epoch": 1.7009097717043946, "grad_norm": 1.001394271850586, "learning_rate": 5.525340622175074e-06, "loss": 0.2505, "step": 458800 }, { "epoch": 1.701280501820285, "grad_norm": 0.9843615889549255, "learning_rate": 5.511908915378522e-06, "loss": 0.2665, "step": 458900 }, { "epoch": 1.7016512319361752, "grad_norm": 1.2547672986984253, "learning_rate": 5.4984926017938474e-06, "loss": 0.2755, "step": 459000 }, { "epoch": 1.7020219620520654, "grad_norm": 0.9287433624267578, "learning_rate": 5.485091686063232e-06, "loss": 0.2353, "step": 459100 }, { "epoch": 1.7023926921679555, "grad_norm": 0.8501111268997192, "learning_rate": 5.471706172823471e-06, "loss": 0.264, "step": 459200 }, { "epoch": 1.7027634222838457, "grad_norm": 0.7887113094329834, "learning_rate": 5.458336066706049e-06, "loss": 0.2735, "step": 459300 }, { "epoch": 1.703134152399736, "grad_norm": 1.3088834285736084, "learning_rate": 5.444981372337144e-06, "loss": 0.2696, "step": 459400 }, { "epoch": 1.7035048825156263, "grad_norm": 1.2255067825317383, "learning_rate": 5.431642094337563e-06, "loss": 0.2699, "step": 459500 }, { "epoch": 1.7038756126315167, "grad_norm": 1.150579571723938, "learning_rate": 5.418318237322794e-06, "loss": 0.2842, "step": 459600 }, { "epoch": 1.7042463427474068, "grad_norm": 1.509373426437378, "learning_rate": 5.4050098059030165e-06, "loss": 0.2528, "step": 459700 }, { "epoch": 1.704617072863297, "grad_norm": 0.8952323198318481, "learning_rate": 5.391716804683006e-06, "loss": 0.2694, "step": 459800 }, { "epoch": 1.7049878029791872, "grad_norm": 0.5835772752761841, "learning_rate": 5.378439238262273e-06, "loss": 0.2793, "step": 459900 }, { "epoch": 1.7053585330950773, "grad_norm": 1.3710159063339233, "learning_rate": 5.365177111234943e-06, "loss": 0.2848, "step": 460000 }, { "epoch": 1.7057292632109675, "grad_norm": 0.6394947171211243, "learning_rate": 5.3519304281898e-06, "loss": 0.2692, "step": 460100 }, { "epoch": 1.706099993326858, "grad_norm": 0.5650017261505127, "learning_rate": 5.338699193710323e-06, "loss": 0.2688, "step": 460200 }, { "epoch": 1.7064707234427483, "grad_norm": 0.8047268986701965, "learning_rate": 5.325483412374593e-06, "loss": 0.245, "step": 460300 }, { "epoch": 1.7068414535586385, "grad_norm": 1.0398794412612915, "learning_rate": 5.312283088755388e-06, "loss": 0.2817, "step": 460400 }, { "epoch": 1.7072121836745286, "grad_norm": 0.8131577968597412, "learning_rate": 5.2990982274201015e-06, "loss": 0.2647, "step": 460500 }, { "epoch": 1.7075829137904188, "grad_norm": 2.146981954574585, "learning_rate": 5.285928832930809e-06, "loss": 0.2517, "step": 460600 }, { "epoch": 1.707953643906309, "grad_norm": 1.8917089700698853, "learning_rate": 5.272774909844197e-06, "loss": 0.2435, "step": 460700 }, { "epoch": 1.7083243740221992, "grad_norm": 1.1992532014846802, "learning_rate": 5.259636462711642e-06, "loss": 0.2622, "step": 460800 }, { "epoch": 1.7086951041380896, "grad_norm": 1.034328579902649, "learning_rate": 5.246513496079142e-06, "loss": 0.2734, "step": 460900 }, { "epoch": 1.70906583425398, "grad_norm": 1.7529715299606323, "learning_rate": 5.23340601448733e-06, "loss": 0.2521, "step": 461000 }, { "epoch": 1.7094365643698701, "grad_norm": 1.5269354581832886, "learning_rate": 5.220314022471512e-06, "loss": 0.2468, "step": 461100 }, { "epoch": 1.7098072944857603, "grad_norm": 1.0462931394577026, "learning_rate": 5.207237524561598e-06, "loss": 0.2642, "step": 461200 }, { "epoch": 1.7101780246016505, "grad_norm": 0.9681651592254639, "learning_rate": 5.194176525282162e-06, "loss": 0.2713, "step": 461300 }, { "epoch": 1.7105487547175406, "grad_norm": 1.1960257291793823, "learning_rate": 5.181131029152408e-06, "loss": 0.2426, "step": 461400 }, { "epoch": 1.7109194848334308, "grad_norm": 0.9325838685035706, "learning_rate": 5.168101040686163e-06, "loss": 0.2614, "step": 461500 }, { "epoch": 1.7112902149493212, "grad_norm": 1.4796744585037231, "learning_rate": 5.1550865643919145e-06, "loss": 0.2421, "step": 461600 }, { "epoch": 1.7116609450652114, "grad_norm": 1.0133765935897827, "learning_rate": 5.142087604772772e-06, "loss": 0.2758, "step": 461700 }, { "epoch": 1.7120316751811018, "grad_norm": 1.049285650253296, "learning_rate": 5.129104166326448e-06, "loss": 0.2603, "step": 461800 }, { "epoch": 1.712402405296992, "grad_norm": 0.9715127944946289, "learning_rate": 5.1161362535453335e-06, "loss": 0.2647, "step": 461900 }, { "epoch": 1.7127731354128821, "grad_norm": 1.480015516281128, "learning_rate": 5.103183870916417e-06, "loss": 0.2562, "step": 462000 }, { "epoch": 1.7131438655287723, "grad_norm": 0.6853845119476318, "learning_rate": 5.090247022921324e-06, "loss": 0.2969, "step": 462100 }, { "epoch": 1.7135145956446625, "grad_norm": 1.116734504699707, "learning_rate": 5.077325714036285e-06, "loss": 0.2729, "step": 462200 }, { "epoch": 1.7138853257605529, "grad_norm": 1.099605917930603, "learning_rate": 5.0644199487321745e-06, "loss": 0.2589, "step": 462300 }, { "epoch": 1.714256055876443, "grad_norm": 0.8690333366394043, "learning_rate": 5.051529731474497e-06, "loss": 0.2547, "step": 462400 }, { "epoch": 1.7146267859923334, "grad_norm": 0.7819579839706421, "learning_rate": 5.038655066723358e-06, "loss": 0.2794, "step": 462500 }, { "epoch": 1.7149975161082236, "grad_norm": 0.9263894557952881, "learning_rate": 5.025795958933471e-06, "loss": 0.2708, "step": 462600 }, { "epoch": 1.7153682462241138, "grad_norm": 1.2588112354278564, "learning_rate": 5.012952412554217e-06, "loss": 0.2545, "step": 462700 }, { "epoch": 1.715738976340004, "grad_norm": 1.0687519311904907, "learning_rate": 5.00012443202953e-06, "loss": 0.2576, "step": 462800 }, { "epoch": 1.7161097064558941, "grad_norm": 2.153301477432251, "learning_rate": 4.9873120217979855e-06, "loss": 0.2607, "step": 462900 }, { "epoch": 1.7164804365717845, "grad_norm": 0.9296573400497437, "learning_rate": 4.974515186292794e-06, "loss": 0.2483, "step": 463000 }, { "epoch": 1.7168511666876747, "grad_norm": 0.532751202583313, "learning_rate": 4.961733929941747e-06, "loss": 0.301, "step": 463100 }, { "epoch": 1.717221896803565, "grad_norm": 1.1218891143798828, "learning_rate": 4.9489682571672466e-06, "loss": 0.2706, "step": 463200 }, { "epoch": 1.7175926269194552, "grad_norm": 0.8309715390205383, "learning_rate": 4.9362181723863255e-06, "loss": 0.2786, "step": 463300 }, { "epoch": 1.7179633570353454, "grad_norm": 0.7430174946784973, "learning_rate": 4.9234836800105985e-06, "loss": 0.2814, "step": 463400 }, { "epoch": 1.7183340871512356, "grad_norm": 0.8614826202392578, "learning_rate": 4.910764784446298e-06, "loss": 0.2581, "step": 463500 }, { "epoch": 1.7187048172671258, "grad_norm": 1.36961030960083, "learning_rate": 4.898061490094258e-06, "loss": 0.2799, "step": 463600 }, { "epoch": 1.7190755473830162, "grad_norm": 0.8702954053878784, "learning_rate": 4.885373801349902e-06, "loss": 0.2563, "step": 463700 }, { "epoch": 1.7194462774989063, "grad_norm": 1.4011646509170532, "learning_rate": 4.872701722603285e-06, "loss": 0.255, "step": 463800 }, { "epoch": 1.7198170076147967, "grad_norm": 1.1718250513076782, "learning_rate": 4.860045258239032e-06, "loss": 0.2586, "step": 463900 }, { "epoch": 1.720187737730687, "grad_norm": 0.5843679904937744, "learning_rate": 4.847404412636358e-06, "loss": 0.291, "step": 464000 }, { "epoch": 1.720558467846577, "grad_norm": 0.49473151564598083, "learning_rate": 4.834779190169119e-06, "loss": 0.2905, "step": 464100 }, { "epoch": 1.7209291979624672, "grad_norm": 1.531345009803772, "learning_rate": 4.822169595205722e-06, "loss": 0.2582, "step": 464200 }, { "epoch": 1.7212999280783574, "grad_norm": 1.5654586553573608, "learning_rate": 4.809575632109181e-06, "loss": 0.2595, "step": 464300 }, { "epoch": 1.7216706581942478, "grad_norm": 2.360568046569824, "learning_rate": 4.7969973052371e-06, "loss": 0.3185, "step": 464400 }, { "epoch": 1.722041388310138, "grad_norm": 0.6330832242965698, "learning_rate": 4.784434618941669e-06, "loss": 0.2973, "step": 464500 }, { "epoch": 1.7224121184260284, "grad_norm": 1.9197975397109985, "learning_rate": 4.771887577569684e-06, "loss": 0.2572, "step": 464600 }, { "epoch": 1.7227828485419185, "grad_norm": 2.0941104888916016, "learning_rate": 4.75935618546251e-06, "loss": 0.2782, "step": 464700 }, { "epoch": 1.7231535786578087, "grad_norm": 1.1483163833618164, "learning_rate": 4.7468404469560965e-06, "loss": 0.2477, "step": 464800 }, { "epoch": 1.7235243087736989, "grad_norm": 0.8579285740852356, "learning_rate": 4.73434036638099e-06, "loss": 0.2878, "step": 464900 }, { "epoch": 1.723895038889589, "grad_norm": 3.4015164375305176, "learning_rate": 4.721855948062326e-06, "loss": 0.294, "step": 465000 }, { "epoch": 1.7242657690054792, "grad_norm": 1.0732221603393555, "learning_rate": 4.709387196319764e-06, "loss": 0.2685, "step": 465100 }, { "epoch": 1.7246364991213696, "grad_norm": 1.0938019752502441, "learning_rate": 4.6969341154676275e-06, "loss": 0.2926, "step": 465200 }, { "epoch": 1.72500722923726, "grad_norm": 1.6346945762634277, "learning_rate": 4.6844967098147504e-06, "loss": 0.2694, "step": 465300 }, { "epoch": 1.7253779593531502, "grad_norm": 0.9274721741676331, "learning_rate": 4.67207498366457e-06, "loss": 0.2938, "step": 465400 }, { "epoch": 1.7257486894690404, "grad_norm": 1.1946529150009155, "learning_rate": 4.659668941315115e-06, "loss": 0.2676, "step": 465500 }, { "epoch": 1.7261194195849305, "grad_norm": 0.8639439940452576, "learning_rate": 4.647278587058957e-06, "loss": 0.2619, "step": 465600 }, { "epoch": 1.7264901497008207, "grad_norm": 0.7452884912490845, "learning_rate": 4.634903925183243e-06, "loss": 0.2618, "step": 465700 }, { "epoch": 1.7268608798167109, "grad_norm": 1.308076024055481, "learning_rate": 4.622544959969721e-06, "loss": 0.3207, "step": 465800 }, { "epoch": 1.7272316099326013, "grad_norm": 0.5046842098236084, "learning_rate": 4.6102016956946494e-06, "loss": 0.2763, "step": 465900 }, { "epoch": 1.7276023400484914, "grad_norm": 1.1941509246826172, "learning_rate": 4.597874136628927e-06, "loss": 0.2846, "step": 466000 }, { "epoch": 1.7279730701643818, "grad_norm": 1.2963203191757202, "learning_rate": 4.585562287037959e-06, "loss": 0.2754, "step": 466100 }, { "epoch": 1.728343800280272, "grad_norm": 0.5602049827575684, "learning_rate": 4.573266151181737e-06, "loss": 0.2787, "step": 466200 }, { "epoch": 1.7287145303961622, "grad_norm": 1.6215664148330688, "learning_rate": 4.560985733314832e-06, "loss": 0.2853, "step": 466300 }, { "epoch": 1.7290852605120524, "grad_norm": 1.5586398839950562, "learning_rate": 4.548721037686354e-06, "loss": 0.2699, "step": 466400 }, { "epoch": 1.7294559906279425, "grad_norm": 1.716631293296814, "learning_rate": 4.536472068539971e-06, "loss": 0.2667, "step": 466500 }, { "epoch": 1.729826720743833, "grad_norm": 1.0508164167404175, "learning_rate": 4.5242388301139245e-06, "loss": 0.2494, "step": 466600 }, { "epoch": 1.730197450859723, "grad_norm": 0.7375841736793518, "learning_rate": 4.512021326640997e-06, "loss": 0.2491, "step": 466700 }, { "epoch": 1.7305681809756135, "grad_norm": 0.973644495010376, "learning_rate": 4.499819562348556e-06, "loss": 0.2641, "step": 466800 }, { "epoch": 1.7309389110915037, "grad_norm": 0.5276886224746704, "learning_rate": 4.487633541458486e-06, "loss": 0.272, "step": 466900 }, { "epoch": 1.7313096412073938, "grad_norm": 1.2161145210266113, "learning_rate": 4.475463268187241e-06, "loss": 0.2672, "step": 467000 }, { "epoch": 1.731680371323284, "grad_norm": 0.5807017087936401, "learning_rate": 4.4633087467458365e-06, "loss": 0.2746, "step": 467100 }, { "epoch": 1.7320511014391742, "grad_norm": 1.1620020866394043, "learning_rate": 4.451169981339831e-06, "loss": 0.2472, "step": 467200 }, { "epoch": 1.7324218315550646, "grad_norm": 0.8418462872505188, "learning_rate": 4.439046976169315e-06, "loss": 0.2947, "step": 467300 }, { "epoch": 1.7327925616709547, "grad_norm": 0.8959442973136902, "learning_rate": 4.426939735428942e-06, "loss": 0.2528, "step": 467400 }, { "epoch": 1.7331632917868451, "grad_norm": 4.907917022705078, "learning_rate": 4.414848263307913e-06, "loss": 0.2614, "step": 467500 }, { "epoch": 1.7335340219027353, "grad_norm": 0.8006325364112854, "learning_rate": 4.402772563989954e-06, "loss": 0.2839, "step": 467600 }, { "epoch": 1.7339047520186255, "grad_norm": 1.6741604804992676, "learning_rate": 4.390712641653366e-06, "loss": 0.2742, "step": 467700 }, { "epoch": 1.7342754821345157, "grad_norm": 0.7699487805366516, "learning_rate": 4.378668500470962e-06, "loss": 0.2737, "step": 467800 }, { "epoch": 1.7346462122504058, "grad_norm": 0.4285014867782593, "learning_rate": 4.3666401446101015e-06, "loss": 0.2679, "step": 467900 }, { "epoch": 1.7350169423662962, "grad_norm": 1.224151372909546, "learning_rate": 4.354627578232695e-06, "loss": 0.2392, "step": 468000 }, { "epoch": 1.7353876724821864, "grad_norm": 0.833739161491394, "learning_rate": 4.342630805495179e-06, "loss": 0.2715, "step": 468100 }, { "epoch": 1.7357584025980768, "grad_norm": 0.3331376910209656, "learning_rate": 4.330649830548522e-06, "loss": 0.2618, "step": 468200 }, { "epoch": 1.736129132713967, "grad_norm": 0.8806135654449463, "learning_rate": 4.318684657538236e-06, "loss": 0.2428, "step": 468300 }, { "epoch": 1.7364998628298571, "grad_norm": 2.307185411453247, "learning_rate": 4.306735290604347e-06, "loss": 0.2717, "step": 468400 }, { "epoch": 1.7368705929457473, "grad_norm": 0.9971423149108887, "learning_rate": 4.294801733881437e-06, "loss": 0.2675, "step": 468500 }, { "epoch": 1.7372413230616375, "grad_norm": 1.7016847133636475, "learning_rate": 4.2828839914986095e-06, "loss": 0.2806, "step": 468600 }, { "epoch": 1.7376120531775279, "grad_norm": 1.5396534204483032, "learning_rate": 4.270982067579476e-06, "loss": 0.2721, "step": 468700 }, { "epoch": 1.737982783293418, "grad_norm": 0.46256154775619507, "learning_rate": 4.259095966242216e-06, "loss": 0.2546, "step": 468800 }, { "epoch": 1.7383535134093084, "grad_norm": 1.7436906099319458, "learning_rate": 4.247225691599482e-06, "loss": 0.2518, "step": 468900 }, { "epoch": 1.7387242435251986, "grad_norm": 0.6436664462089539, "learning_rate": 4.2353712477584965e-06, "loss": 0.2624, "step": 469000 }, { "epoch": 1.7390949736410888, "grad_norm": 1.7161049842834473, "learning_rate": 4.223532638820982e-06, "loss": 0.248, "step": 469100 }, { "epoch": 1.739465703756979, "grad_norm": 0.9227671027183533, "learning_rate": 4.211709868883174e-06, "loss": 0.266, "step": 469200 }, { "epoch": 1.7398364338728691, "grad_norm": 0.22543881833553314, "learning_rate": 4.199902942035855e-06, "loss": 0.2741, "step": 469300 }, { "epoch": 1.7402071639887593, "grad_norm": 0.977307915687561, "learning_rate": 4.188111862364308e-06, "loss": 0.2521, "step": 469400 }, { "epoch": 1.7405778941046497, "grad_norm": 1.012514591217041, "learning_rate": 4.176336633948313e-06, "loss": 0.2617, "step": 469500 }, { "epoch": 1.74094862422054, "grad_norm": 1.458134651184082, "learning_rate": 4.164577260862229e-06, "loss": 0.2693, "step": 469600 }, { "epoch": 1.7413193543364303, "grad_norm": 1.0480701923370361, "learning_rate": 4.1528337471748445e-06, "loss": 0.2701, "step": 469700 }, { "epoch": 1.7416900844523204, "grad_norm": 1.0777523517608643, "learning_rate": 4.141106096949527e-06, "loss": 0.2675, "step": 469800 }, { "epoch": 1.7420608145682106, "grad_norm": 1.0515462160110474, "learning_rate": 4.129394314244123e-06, "loss": 0.2629, "step": 469900 }, { "epoch": 1.7424315446841008, "grad_norm": 1.0478118658065796, "learning_rate": 4.1176984031109925e-06, "loss": 0.2521, "step": 470000 }, { "epoch": 1.742802274799991, "grad_norm": 1.0925236940383911, "learning_rate": 4.10601836759702e-06, "loss": 0.2711, "step": 470100 }, { "epoch": 1.7431730049158813, "grad_norm": 0.3915126323699951, "learning_rate": 4.094354211743584e-06, "loss": 0.2669, "step": 470200 }, { "epoch": 1.7435437350317717, "grad_norm": 1.4284617900848389, "learning_rate": 4.082705939586551e-06, "loss": 0.2436, "step": 470300 }, { "epoch": 1.743914465147662, "grad_norm": 0.9822292327880859, "learning_rate": 4.071073555156346e-06, "loss": 0.2607, "step": 470400 }, { "epoch": 1.744285195263552, "grad_norm": 0.9346534609794617, "learning_rate": 4.059457062477828e-06, "loss": 0.237, "step": 470500 }, { "epoch": 1.7446559253794423, "grad_norm": 1.2499600648880005, "learning_rate": 4.0478564655703934e-06, "loss": 0.2706, "step": 470600 }, { "epoch": 1.7450266554953324, "grad_norm": 0.9353005886077881, "learning_rate": 4.0362717684479465e-06, "loss": 0.2736, "step": 470700 }, { "epoch": 1.7453973856112226, "grad_norm": 1.0662838220596313, "learning_rate": 4.024702975118883e-06, "loss": 0.2731, "step": 470800 }, { "epoch": 1.745768115727113, "grad_norm": 1.0592081546783447, "learning_rate": 4.013150089586071e-06, "loss": 0.2727, "step": 470900 }, { "epoch": 1.7461388458430032, "grad_norm": 0.7986031174659729, "learning_rate": 4.00161311584692e-06, "loss": 0.2581, "step": 471000 }, { "epoch": 1.7465095759588936, "grad_norm": 0.9045782685279846, "learning_rate": 3.990092057893297e-06, "loss": 0.2479, "step": 471100 }, { "epoch": 1.7468803060747837, "grad_norm": 1.532966136932373, "learning_rate": 3.978586919711575e-06, "loss": 0.2609, "step": 471200 }, { "epoch": 1.747251036190674, "grad_norm": 1.5782955884933472, "learning_rate": 3.967097705282613e-06, "loss": 0.2716, "step": 471300 }, { "epoch": 1.747621766306564, "grad_norm": 1.0177602767944336, "learning_rate": 3.955624418581766e-06, "loss": 0.2932, "step": 471400 }, { "epoch": 1.7479924964224542, "grad_norm": 0.9888424873352051, "learning_rate": 3.944167063578891e-06, "loss": 0.2623, "step": 471500 }, { "epoch": 1.7483632265383446, "grad_norm": 2.5795369148254395, "learning_rate": 3.932725644238305e-06, "loss": 0.2578, "step": 471600 }, { "epoch": 1.7487339566542348, "grad_norm": 0.7608141899108887, "learning_rate": 3.9213001645188155e-06, "loss": 0.2786, "step": 471700 }, { "epoch": 1.7491046867701252, "grad_norm": 1.704999566078186, "learning_rate": 3.909890628373752e-06, "loss": 0.2424, "step": 471800 }, { "epoch": 1.7494754168860154, "grad_norm": 1.1331104040145874, "learning_rate": 3.898497039750876e-06, "loss": 0.2512, "step": 471900 }, { "epoch": 1.7498461470019055, "grad_norm": 0.8403963446617126, "learning_rate": 3.887119402592465e-06, "loss": 0.2573, "step": 472000 }, { "epoch": 1.7502168771177957, "grad_norm": 1.8777539730072021, "learning_rate": 3.875757720835266e-06, "loss": 0.2943, "step": 472100 }, { "epoch": 1.750587607233686, "grad_norm": 0.8965407013893127, "learning_rate": 3.8644119984104945e-06, "loss": 0.2396, "step": 472200 }, { "epoch": 1.7509583373495763, "grad_norm": 0.934965968132019, "learning_rate": 3.853082239243866e-06, "loss": 0.2395, "step": 472300 }, { "epoch": 1.7513290674654665, "grad_norm": 1.036609172821045, "learning_rate": 3.8417684472555615e-06, "loss": 0.2755, "step": 472400 }, { "epoch": 1.7516997975813569, "grad_norm": 1.3701921701431274, "learning_rate": 3.830470626360228e-06, "loss": 0.2476, "step": 472500 }, { "epoch": 1.752070527697247, "grad_norm": 0.43265700340270996, "learning_rate": 3.81918878046702e-06, "loss": 0.2534, "step": 472600 }, { "epoch": 1.7524412578131372, "grad_norm": 0.6697891354560852, "learning_rate": 3.8079229134795168e-06, "loss": 0.2625, "step": 472700 }, { "epoch": 1.7528119879290274, "grad_norm": 0.8589888215065002, "learning_rate": 3.7966730292957885e-06, "loss": 0.2835, "step": 472800 }, { "epoch": 1.7531827180449175, "grad_norm": 1.6228798627853394, "learning_rate": 3.7854391318083993e-06, "loss": 0.2558, "step": 472900 }, { "epoch": 1.753553448160808, "grad_norm": 1.1228030920028687, "learning_rate": 3.7742212249043565e-06, "loss": 0.2713, "step": 473000 }, { "epoch": 1.753924178276698, "grad_norm": 0.9574868679046631, "learning_rate": 3.7630193124651214e-06, "loss": 0.2856, "step": 473100 }, { "epoch": 1.7542949083925885, "grad_norm": 0.8991975784301758, "learning_rate": 3.7518333983666666e-06, "loss": 0.2624, "step": 473200 }, { "epoch": 1.7546656385084787, "grad_norm": 0.5350012183189392, "learning_rate": 3.740663486479384e-06, "loss": 0.266, "step": 473300 }, { "epoch": 1.7550363686243688, "grad_norm": 0.48034054040908813, "learning_rate": 3.7295095806681557e-06, "loss": 0.2607, "step": 473400 }, { "epoch": 1.755407098740259, "grad_norm": 1.053760290145874, "learning_rate": 3.71837168479231e-06, "loss": 0.2795, "step": 473500 }, { "epoch": 1.7557778288561492, "grad_norm": 0.7385583519935608, "learning_rate": 3.7072498027056368e-06, "loss": 0.2616, "step": 473600 }, { "epoch": 1.7561485589720396, "grad_norm": 1.0890570878982544, "learning_rate": 3.696143938256408e-06, "loss": 0.259, "step": 473700 }, { "epoch": 1.7565192890879298, "grad_norm": 1.6163493394851685, "learning_rate": 3.6850540952873213e-06, "loss": 0.245, "step": 473800 }, { "epoch": 1.7568900192038202, "grad_norm": 1.1676549911499023, "learning_rate": 3.673980277635547e-06, "loss": 0.256, "step": 473900 }, { "epoch": 1.7572607493197103, "grad_norm": 0.587378978729248, "learning_rate": 3.6629224891327142e-06, "loss": 0.2702, "step": 474000 }, { "epoch": 1.7576314794356005, "grad_norm": 1.2639639377593994, "learning_rate": 3.651880733604901e-06, "loss": 0.258, "step": 474100 }, { "epoch": 1.7580022095514907, "grad_norm": 0.9604103565216064, "learning_rate": 3.640855014872635e-06, "loss": 0.2786, "step": 474200 }, { "epoch": 1.7583729396673808, "grad_norm": 0.20269820094108582, "learning_rate": 3.6298453367508923e-06, "loss": 0.2556, "step": 474300 }, { "epoch": 1.758743669783271, "grad_norm": 0.7047596573829651, "learning_rate": 3.618851703049103e-06, "loss": 0.2962, "step": 474400 }, { "epoch": 1.7591143998991614, "grad_norm": 1.5951811075210571, "learning_rate": 3.6078741175711584e-06, "loss": 0.292, "step": 474500 }, { "epoch": 1.7594851300150518, "grad_norm": 1.4751733541488647, "learning_rate": 3.596912584115375e-06, "loss": 0.2842, "step": 474600 }, { "epoch": 1.759855860130942, "grad_norm": 0.7744534015655518, "learning_rate": 3.5859671064745192e-06, "loss": 0.2484, "step": 474700 }, { "epoch": 1.7602265902468321, "grad_norm": 0.8853980302810669, "learning_rate": 3.5750376884358285e-06, "loss": 0.2933, "step": 474800 }, { "epoch": 1.7605973203627223, "grad_norm": 0.5509510040283203, "learning_rate": 3.5641243337809605e-06, "loss": 0.2547, "step": 474900 }, { "epoch": 1.7609680504786125, "grad_norm": 0.9715139865875244, "learning_rate": 3.5532270462859894e-06, "loss": 0.2659, "step": 475000 }, { "epoch": 1.7613387805945027, "grad_norm": 1.0288612842559814, "learning_rate": 3.542345829721483e-06, "loss": 0.2738, "step": 475100 }, { "epoch": 1.761709510710393, "grad_norm": 0.4905116856098175, "learning_rate": 3.531480687852423e-06, "loss": 0.2668, "step": 475200 }, { "epoch": 1.7620802408262832, "grad_norm": 0.8160320520401001, "learning_rate": 3.5206316244382144e-06, "loss": 0.257, "step": 475300 }, { "epoch": 1.7624509709421736, "grad_norm": 1.668597936630249, "learning_rate": 3.5097986432327313e-06, "loss": 0.2667, "step": 475400 }, { "epoch": 1.7628217010580638, "grad_norm": 0.6880292296409607, "learning_rate": 3.498981747984259e-06, "loss": 0.278, "step": 475500 }, { "epoch": 1.763192431173954, "grad_norm": 1.1272066831588745, "learning_rate": 3.488180942435515e-06, "loss": 0.2507, "step": 475600 }, { "epoch": 1.7635631612898441, "grad_norm": 0.7850258946418762, "learning_rate": 3.4773962303236873e-06, "loss": 0.2658, "step": 475700 }, { "epoch": 1.7639338914057343, "grad_norm": 0.9743553400039673, "learning_rate": 3.46662761538033e-06, "loss": 0.2652, "step": 475800 }, { "epoch": 1.7643046215216247, "grad_norm": 0.6117173433303833, "learning_rate": 3.4558751013314906e-06, "loss": 0.2469, "step": 475900 }, { "epoch": 1.7646753516375149, "grad_norm": 0.5054346919059753, "learning_rate": 3.445138691897609e-06, "loss": 0.2697, "step": 476000 }, { "epoch": 1.7650460817534053, "grad_norm": 1.7701359987258911, "learning_rate": 3.4344183907935535e-06, "loss": 0.2671, "step": 476100 }, { "epoch": 1.7654168118692954, "grad_norm": 0.3261033892631531, "learning_rate": 3.4237142017286504e-06, "loss": 0.2814, "step": 476200 }, { "epoch": 1.7657875419851856, "grad_norm": 0.7193737030029297, "learning_rate": 3.4130261284066212e-06, "loss": 0.2631, "step": 476300 }, { "epoch": 1.7661582721010758, "grad_norm": 1.8280943632125854, "learning_rate": 3.4023541745256015e-06, "loss": 0.2719, "step": 476400 }, { "epoch": 1.766529002216966, "grad_norm": 1.2405204772949219, "learning_rate": 3.391698343778199e-06, "loss": 0.2709, "step": 476500 }, { "epoch": 1.7668997323328564, "grad_norm": 0.8246700167655945, "learning_rate": 3.381058639851381e-06, "loss": 0.2702, "step": 476600 }, { "epoch": 1.7672704624487465, "grad_norm": 1.688012957572937, "learning_rate": 3.3704350664265805e-06, "loss": 0.2838, "step": 476700 }, { "epoch": 1.767641192564637, "grad_norm": 0.5566148161888123, "learning_rate": 3.359827627179635e-06, "loss": 0.2634, "step": 476800 }, { "epoch": 1.768011922680527, "grad_norm": 1.6927053928375244, "learning_rate": 3.34923632578078e-06, "loss": 0.2805, "step": 476900 }, { "epoch": 1.7683826527964173, "grad_norm": 0.8365062475204468, "learning_rate": 3.3386611658947077e-06, "loss": 0.2688, "step": 477000 }, { "epoch": 1.7687533829123074, "grad_norm": 1.3370680809020996, "learning_rate": 3.32810215118049e-06, "loss": 0.2536, "step": 477100 }, { "epoch": 1.7691241130281976, "grad_norm": 1.5083138942718506, "learning_rate": 3.317559285291627e-06, "loss": 0.2817, "step": 477200 }, { "epoch": 1.769494843144088, "grad_norm": 1.4467188119888306, "learning_rate": 3.3070325718760285e-06, "loss": 0.2759, "step": 477300 }, { "epoch": 1.7698655732599782, "grad_norm": 0.8112276196479797, "learning_rate": 3.2965220145760202e-06, "loss": 0.2879, "step": 477400 }, { "epoch": 1.7702363033758686, "grad_norm": 1.1798981428146362, "learning_rate": 3.2860276170283143e-06, "loss": 0.2826, "step": 477500 }, { "epoch": 1.7706070334917587, "grad_norm": 1.423201560974121, "learning_rate": 3.2755493828640795e-06, "loss": 0.2696, "step": 477600 }, { "epoch": 1.770977763607649, "grad_norm": 0.8858146071434021, "learning_rate": 3.2650873157088536e-06, "loss": 0.287, "step": 477700 }, { "epoch": 1.771348493723539, "grad_norm": 0.9625779986381531, "learning_rate": 3.254641419182569e-06, "loss": 0.2556, "step": 477800 }, { "epoch": 1.7717192238394293, "grad_norm": 0.9240179657936096, "learning_rate": 3.244211696899613e-06, "loss": 0.2538, "step": 477900 }, { "epoch": 1.7720899539553197, "grad_norm": 0.9124000072479248, "learning_rate": 3.2337981524687365e-06, "loss": 0.2798, "step": 478000 }, { "epoch": 1.7724606840712098, "grad_norm": 1.3753811120986938, "learning_rate": 3.223400789493103e-06, "loss": 0.2383, "step": 478100 }, { "epoch": 1.7728314141871002, "grad_norm": 1.2311664819717407, "learning_rate": 3.213019611570278e-06, "loss": 0.2574, "step": 478200 }, { "epoch": 1.7732021443029904, "grad_norm": 1.1619879007339478, "learning_rate": 3.2026546222922172e-06, "loss": 0.2573, "step": 478300 }, { "epoch": 1.7735728744188806, "grad_norm": 2.4417474269866943, "learning_rate": 3.192305825245301e-06, "loss": 0.295, "step": 478400 }, { "epoch": 1.7739436045347707, "grad_norm": 0.9464538097381592, "learning_rate": 3.1819732240102818e-06, "loss": 0.2538, "step": 478500 }, { "epoch": 1.774314334650661, "grad_norm": 1.3431862592697144, "learning_rate": 3.171656822162311e-06, "loss": 0.2973, "step": 478600 }, { "epoch": 1.7746850647665513, "grad_norm": 0.7325722575187683, "learning_rate": 3.1613566232709614e-06, "loss": 0.2696, "step": 478700 }, { "epoch": 1.7750557948824415, "grad_norm": 2.409623384475708, "learning_rate": 3.151072630900148e-06, "loss": 0.2743, "step": 478800 }, { "epoch": 1.7754265249983319, "grad_norm": 1.095247745513916, "learning_rate": 3.1408048486082364e-06, "loss": 0.2582, "step": 478900 }, { "epoch": 1.775797255114222, "grad_norm": 0.7989535927772522, "learning_rate": 3.1305532799479452e-06, "loss": 0.2704, "step": 479000 }, { "epoch": 1.7761679852301122, "grad_norm": 1.3110302686691284, "learning_rate": 3.1203179284663876e-06, "loss": 0.2497, "step": 479100 }, { "epoch": 1.7765387153460024, "grad_norm": 1.3005894422531128, "learning_rate": 3.1100987977050853e-06, "loss": 0.2689, "step": 479200 }, { "epoch": 1.7769094454618926, "grad_norm": 1.2252295017242432, "learning_rate": 3.099895891199922e-06, "loss": 0.2828, "step": 479300 }, { "epoch": 1.7772801755777827, "grad_norm": 1.411049723625183, "learning_rate": 3.0897092124811844e-06, "loss": 0.2805, "step": 479400 }, { "epoch": 1.7776509056936731, "grad_norm": 0.7726469039916992, "learning_rate": 3.0795387650735473e-06, "loss": 0.2786, "step": 479500 }, { "epoch": 1.7780216358095635, "grad_norm": 0.9516721963882446, "learning_rate": 3.069384552496052e-06, "loss": 0.2506, "step": 479600 }, { "epoch": 1.7783923659254537, "grad_norm": 0.7635670900344849, "learning_rate": 3.0592465782621203e-06, "loss": 0.2676, "step": 479700 }, { "epoch": 1.7787630960413439, "grad_norm": 0.9910292625427246, "learning_rate": 3.0491248458795916e-06, "loss": 0.2779, "step": 479800 }, { "epoch": 1.779133826157234, "grad_norm": 0.944811999797821, "learning_rate": 3.0390193588506364e-06, "loss": 0.2661, "step": 479900 }, { "epoch": 1.7795045562731242, "grad_norm": 0.5664013624191284, "learning_rate": 3.0289301206718525e-06, "loss": 0.2799, "step": 480000 }, { "epoch": 1.7798752863890144, "grad_norm": 1.4228800535202026, "learning_rate": 3.0188571348341853e-06, "loss": 0.2389, "step": 480100 }, { "epoch": 1.7802460165049048, "grad_norm": 0.2395518571138382, "learning_rate": 3.008800404822942e-06, "loss": 0.2735, "step": 480200 }, { "epoch": 1.780616746620795, "grad_norm": 1.5547553300857544, "learning_rate": 2.9987599341178673e-06, "loss": 0.2627, "step": 480300 }, { "epoch": 1.7809874767366853, "grad_norm": 1.8118470907211304, "learning_rate": 2.9887357261930037e-06, "loss": 0.2929, "step": 480400 }, { "epoch": 1.7813582068525755, "grad_norm": 1.4198181629180908, "learning_rate": 2.978727784516805e-06, "loss": 0.2713, "step": 480500 }, { "epoch": 1.7817289369684657, "grad_norm": 0.9878463745117188, "learning_rate": 2.9687361125521064e-06, "loss": 0.2911, "step": 480600 }, { "epoch": 1.7820996670843559, "grad_norm": 1.0474220514297485, "learning_rate": 2.9587607137561036e-06, "loss": 0.2622, "step": 480700 }, { "epoch": 1.782470397200246, "grad_norm": 1.8589011430740356, "learning_rate": 2.9488015915803412e-06, "loss": 0.2599, "step": 480800 }, { "epoch": 1.7828411273161364, "grad_norm": 1.336263656616211, "learning_rate": 2.9388587494707687e-06, "loss": 0.285, "step": 480900 }, { "epoch": 1.7832118574320266, "grad_norm": 0.8291609287261963, "learning_rate": 2.9289321908676727e-06, "loss": 0.2602, "step": 481000 }, { "epoch": 1.783582587547917, "grad_norm": 0.39953431487083435, "learning_rate": 2.919021919205722e-06, "loss": 0.2672, "step": 481100 }, { "epoch": 1.7839533176638072, "grad_norm": 0.4237014353275299, "learning_rate": 2.9091279379139467e-06, "loss": 0.2356, "step": 481200 }, { "epoch": 1.7843240477796973, "grad_norm": 1.0928659439086914, "learning_rate": 2.899250250415719e-06, "loss": 0.2745, "step": 481300 }, { "epoch": 1.7846947778955875, "grad_norm": 1.241208791732788, "learning_rate": 2.8893888601288223e-06, "loss": 0.2539, "step": 481400 }, { "epoch": 1.7850655080114777, "grad_norm": 1.404119610786438, "learning_rate": 2.879543770465354e-06, "loss": 0.2712, "step": 481500 }, { "epoch": 1.785436238127368, "grad_norm": 0.7895044684410095, "learning_rate": 2.8697149848317838e-06, "loss": 0.2712, "step": 481600 }, { "epoch": 1.7858069682432582, "grad_norm": 0.9193483591079712, "learning_rate": 2.8599025066289587e-06, "loss": 0.2571, "step": 481700 }, { "epoch": 1.7861776983591486, "grad_norm": 0.6227703094482422, "learning_rate": 2.850106339252062e-06, "loss": 0.2748, "step": 481800 }, { "epoch": 1.7865484284750388, "grad_norm": 0.42512765526771545, "learning_rate": 2.8403264860906443e-06, "loss": 0.2571, "step": 481900 }, { "epoch": 1.786919158590929, "grad_norm": 1.0903176069259644, "learning_rate": 2.8305629505286037e-06, "loss": 0.2445, "step": 482000 }, { "epoch": 1.7872898887068192, "grad_norm": 1.2993882894515991, "learning_rate": 2.8208157359441933e-06, "loss": 0.269, "step": 482100 }, { "epoch": 1.7876606188227093, "grad_norm": 0.7641462087631226, "learning_rate": 2.8110848457100326e-06, "loss": 0.2524, "step": 482200 }, { "epoch": 1.7880313489385997, "grad_norm": 1.5651074647903442, "learning_rate": 2.8013702831930776e-06, "loss": 0.2537, "step": 482300 }, { "epoch": 1.78840207905449, "grad_norm": 0.7292250990867615, "learning_rate": 2.7916720517546346e-06, "loss": 0.2441, "step": 482400 }, { "epoch": 1.7887728091703803, "grad_norm": 0.7961540818214417, "learning_rate": 2.7819901547503745e-06, "loss": 0.2581, "step": 482500 }, { "epoch": 1.7891435392862705, "grad_norm": 0.6666664481163025, "learning_rate": 2.772324595530301e-06, "loss": 0.2918, "step": 482600 }, { "epoch": 1.7895142694021606, "grad_norm": 0.6200628280639648, "learning_rate": 2.7626753774387616e-06, "loss": 0.2626, "step": 482700 }, { "epoch": 1.7898849995180508, "grad_norm": 0.5579478144645691, "learning_rate": 2.753042503814468e-06, "loss": 0.2588, "step": 482800 }, { "epoch": 1.790255729633941, "grad_norm": 0.5290198922157288, "learning_rate": 2.7434259779904657e-06, "loss": 0.2724, "step": 482900 }, { "epoch": 1.7906264597498314, "grad_norm": 1.8992480039596558, "learning_rate": 2.7338258032941323e-06, "loss": 0.2816, "step": 483000 }, { "epoch": 1.7909971898657215, "grad_norm": 0.49972569942474365, "learning_rate": 2.724241983047221e-06, "loss": 0.2362, "step": 483100 }, { "epoch": 1.791367919981612, "grad_norm": 0.42614254355430603, "learning_rate": 2.714674520565791e-06, "loss": 0.2335, "step": 483200 }, { "epoch": 1.791738650097502, "grad_norm": 2.4388320446014404, "learning_rate": 2.7051234191602604e-06, "loss": 0.2727, "step": 483300 }, { "epoch": 1.7921093802133923, "grad_norm": 0.9806447625160217, "learning_rate": 2.6955886821353803e-06, "loss": 0.266, "step": 483400 }, { "epoch": 1.7924801103292825, "grad_norm": 0.25646117329597473, "learning_rate": 2.6860703127902287e-06, "loss": 0.2706, "step": 483500 }, { "epoch": 1.7928508404451726, "grad_norm": 0.8434768915176392, "learning_rate": 2.6765683144182598e-06, "loss": 0.2631, "step": 483600 }, { "epoch": 1.7932215705610628, "grad_norm": 1.165117859840393, "learning_rate": 2.6670826903072154e-06, "loss": 0.252, "step": 483700 }, { "epoch": 1.7935923006769532, "grad_norm": 0.9998282194137573, "learning_rate": 2.657613443739193e-06, "loss": 0.2587, "step": 483800 }, { "epoch": 1.7939630307928436, "grad_norm": 0.8191498517990112, "learning_rate": 2.6481605779906325e-06, "loss": 0.2544, "step": 483900 }, { "epoch": 1.7943337609087338, "grad_norm": 1.2204657793045044, "learning_rate": 2.63872409633229e-06, "loss": 0.2653, "step": 484000 }, { "epoch": 1.794704491024624, "grad_norm": 2.3270621299743652, "learning_rate": 2.6293040020292594e-06, "loss": 0.2386, "step": 484100 }, { "epoch": 1.795075221140514, "grad_norm": 2.2118330001831055, "learning_rate": 2.619900298340966e-06, "loss": 0.2492, "step": 484200 }, { "epoch": 1.7954459512564043, "grad_norm": 0.925565779209137, "learning_rate": 2.610512988521152e-06, "loss": 0.2882, "step": 484300 }, { "epoch": 1.7958166813722944, "grad_norm": 1.0321640968322754, "learning_rate": 2.601142075817914e-06, "loss": 0.2517, "step": 484400 }, { "epoch": 1.7961874114881848, "grad_norm": 0.8356673121452332, "learning_rate": 2.5917875634736466e-06, "loss": 0.2583, "step": 484500 }, { "epoch": 1.7965581416040752, "grad_norm": 1.1183966398239136, "learning_rate": 2.582449454725072e-06, "loss": 0.2555, "step": 484600 }, { "epoch": 1.7969288717199654, "grad_norm": 0.6415712237358093, "learning_rate": 2.5731277528032615e-06, "loss": 0.2406, "step": 484700 }, { "epoch": 1.7972996018358556, "grad_norm": 1.4790823459625244, "learning_rate": 2.5638224609336015e-06, "loss": 0.2615, "step": 484800 }, { "epoch": 1.7976703319517457, "grad_norm": 0.7762015461921692, "learning_rate": 2.554533582335761e-06, "loss": 0.2837, "step": 484900 }, { "epoch": 1.798041062067636, "grad_norm": 1.1355154514312744, "learning_rate": 2.5452611202237853e-06, "loss": 0.2854, "step": 485000 }, { "epoch": 1.798411792183526, "grad_norm": 1.5414893627166748, "learning_rate": 2.536005077806014e-06, "loss": 0.2698, "step": 485100 }, { "epoch": 1.7987825222994165, "grad_norm": 1.0787721872329712, "learning_rate": 2.526765458285091e-06, "loss": 0.2613, "step": 485200 }, { "epoch": 1.7991532524153067, "grad_norm": 1.1649255752563477, "learning_rate": 2.517542264858014e-06, "loss": 0.2772, "step": 485300 }, { "epoch": 1.799523982531197, "grad_norm": 1.2573189735412598, "learning_rate": 2.508335500716069e-06, "loss": 0.2592, "step": 485400 }, { "epoch": 1.7998947126470872, "grad_norm": 0.6999520659446716, "learning_rate": 2.4991451690448586e-06, "loss": 0.2822, "step": 485500 }, { "epoch": 1.8002654427629774, "grad_norm": 0.5044705867767334, "learning_rate": 2.4899712730243274e-06, "loss": 0.2705, "step": 485600 }, { "epoch": 1.8006361728788676, "grad_norm": 1.1324622631072998, "learning_rate": 2.4808138158286755e-06, "loss": 0.2804, "step": 485700 }, { "epoch": 1.8010069029947577, "grad_norm": 1.315787434577942, "learning_rate": 2.4716728006264846e-06, "loss": 0.2432, "step": 485800 }, { "epoch": 1.8013776331106481, "grad_norm": 0.6896074414253235, "learning_rate": 2.4625482305806025e-06, "loss": 0.2904, "step": 485900 }, { "epoch": 1.8017483632265383, "grad_norm": 1.1488738059997559, "learning_rate": 2.4534401088481873e-06, "loss": 0.2562, "step": 486000 }, { "epoch": 1.8021190933424287, "grad_norm": 0.751736044883728, "learning_rate": 2.4443484385807404e-06, "loss": 0.245, "step": 486100 }, { "epoch": 1.8024898234583189, "grad_norm": 1.2569317817687988, "learning_rate": 2.435273222924034e-06, "loss": 0.27, "step": 486200 }, { "epoch": 1.802860553574209, "grad_norm": 1.353493332862854, "learning_rate": 2.4262144650181507e-06, "loss": 0.2591, "step": 486300 }, { "epoch": 1.8032312836900992, "grad_norm": 0.7643929719924927, "learning_rate": 2.4171721679975167e-06, "loss": 0.2389, "step": 486400 }, { "epoch": 1.8036020138059894, "grad_norm": 0.569882869720459, "learning_rate": 2.408146334990802e-06, "loss": 0.2504, "step": 486500 }, { "epoch": 1.8039727439218798, "grad_norm": 2.5066163539886475, "learning_rate": 2.3991369691210353e-06, "loss": 0.2413, "step": 486600 }, { "epoch": 1.80434347403777, "grad_norm": 0.27489569783210754, "learning_rate": 2.390144073505518e-06, "loss": 0.2393, "step": 486700 }, { "epoch": 1.8047142041536604, "grad_norm": 2.192762613296509, "learning_rate": 2.38116765125585e-06, "loss": 0.2589, "step": 486800 }, { "epoch": 1.8050849342695505, "grad_norm": 1.0129424333572388, "learning_rate": 2.372207705477958e-06, "loss": 0.2575, "step": 486900 }, { "epoch": 1.8054556643854407, "grad_norm": 0.5523486137390137, "learning_rate": 2.363264239272034e-06, "loss": 0.2826, "step": 487000 }, { "epoch": 1.8058263945013309, "grad_norm": 1.001538872718811, "learning_rate": 2.3543372557325973e-06, "loss": 0.2515, "step": 487100 }, { "epoch": 1.806197124617221, "grad_norm": 1.2420902252197266, "learning_rate": 2.345426757948438e-06, "loss": 0.2473, "step": 487200 }, { "epoch": 1.8065678547331114, "grad_norm": 0.8084650635719299, "learning_rate": 2.336532749002668e-06, "loss": 0.2736, "step": 487300 }, { "epoch": 1.8069385848490016, "grad_norm": 0.7521071434020996, "learning_rate": 2.327655231972664e-06, "loss": 0.2584, "step": 487400 }, { "epoch": 1.807309314964892, "grad_norm": 1.1719350814819336, "learning_rate": 2.3187942099301254e-06, "loss": 0.2873, "step": 487500 }, { "epoch": 1.8076800450807822, "grad_norm": 1.1211252212524414, "learning_rate": 2.3099496859410373e-06, "loss": 0.2472, "step": 487600 }, { "epoch": 1.8080507751966723, "grad_norm": 0.588954746723175, "learning_rate": 2.301121663065653e-06, "loss": 0.2439, "step": 487700 }, { "epoch": 1.8084215053125625, "grad_norm": 0.9538997411727905, "learning_rate": 2.2923101443585503e-06, "loss": 0.2828, "step": 487800 }, { "epoch": 1.8087922354284527, "grad_norm": 0.6099550127983093, "learning_rate": 2.28351513286858e-06, "loss": 0.2669, "step": 487900 }, { "epoch": 1.809162965544343, "grad_norm": 1.3413537740707397, "learning_rate": 2.274736631638874e-06, "loss": 0.2637, "step": 488000 }, { "epoch": 1.8095336956602333, "grad_norm": 1.0559062957763672, "learning_rate": 2.2659746437068643e-06, "loss": 0.2921, "step": 488100 }, { "epoch": 1.8099044257761236, "grad_norm": 1.0379785299301147, "learning_rate": 2.2572291721042525e-06, "loss": 0.2901, "step": 488200 }, { "epoch": 1.8102751558920138, "grad_norm": 1.1784915924072266, "learning_rate": 2.2485002198570527e-06, "loss": 0.2499, "step": 488300 }, { "epoch": 1.810645886007904, "grad_norm": 1.8049230575561523, "learning_rate": 2.239787789985548e-06, "loss": 0.2737, "step": 488400 }, { "epoch": 1.8110166161237942, "grad_norm": 1.5252137184143066, "learning_rate": 2.231091885504283e-06, "loss": 0.2652, "step": 488500 }, { "epoch": 1.8113873462396843, "grad_norm": 0.965323269367218, "learning_rate": 2.2224125094221393e-06, "loss": 0.265, "step": 488600 }, { "epoch": 1.8117580763555745, "grad_norm": 1.3610422611236572, "learning_rate": 2.213749664742215e-06, "loss": 0.2889, "step": 488700 }, { "epoch": 1.812128806471465, "grad_norm": 1.3670655488967896, "learning_rate": 2.205103354461935e-06, "loss": 0.2899, "step": 488800 }, { "epoch": 1.8124995365873553, "grad_norm": 1.089738368988037, "learning_rate": 2.1964735815729833e-06, "loss": 0.2718, "step": 488900 }, { "epoch": 1.8128702667032455, "grad_norm": 1.5592621564865112, "learning_rate": 2.1878603490613213e-06, "loss": 0.2615, "step": 489000 }, { "epoch": 1.8132409968191356, "grad_norm": 1.8680031299591064, "learning_rate": 2.179263659907199e-06, "loss": 0.2761, "step": 489100 }, { "epoch": 1.8136117269350258, "grad_norm": 1.2812317609786987, "learning_rate": 2.170683517085137e-06, "loss": 0.2733, "step": 489200 }, { "epoch": 1.813982457050916, "grad_norm": 1.890928864479065, "learning_rate": 2.1621199235639155e-06, "loss": 0.2519, "step": 489300 }, { "epoch": 1.8143531871668062, "grad_norm": 0.723751425743103, "learning_rate": 2.1535728823066205e-06, "loss": 0.2721, "step": 489400 }, { "epoch": 1.8147239172826966, "grad_norm": 1.139295220375061, "learning_rate": 2.1450423962705803e-06, "loss": 0.2787, "step": 489500 }, { "epoch": 1.8150946473985867, "grad_norm": 0.9537789821624756, "learning_rate": 2.1365284684074005e-06, "loss": 0.2746, "step": 489600 }, { "epoch": 1.8154653775144771, "grad_norm": 0.9303369522094727, "learning_rate": 2.1280311016629806e-06, "loss": 0.2866, "step": 489700 }, { "epoch": 1.8158361076303673, "grad_norm": 1.2546216249465942, "learning_rate": 2.119550298977463e-06, "loss": 0.2761, "step": 489800 }, { "epoch": 1.8162068377462575, "grad_norm": 1.4627972841262817, "learning_rate": 2.111086063285267e-06, "loss": 0.2738, "step": 489900 }, { "epoch": 1.8165775678621476, "grad_norm": 1.5048861503601074, "learning_rate": 2.1026383975150886e-06, "loss": 0.2605, "step": 490000 }, { "epoch": 1.8169482979780378, "grad_norm": 0.8757205605506897, "learning_rate": 2.0942073045898793e-06, "loss": 0.2893, "step": 490100 }, { "epoch": 1.8173190280939282, "grad_norm": 1.0043760538101196, "learning_rate": 2.0857927874268603e-06, "loss": 0.2699, "step": 490200 }, { "epoch": 1.8176897582098184, "grad_norm": 1.0403964519500732, "learning_rate": 2.077394848937514e-06, "loss": 0.2521, "step": 490300 }, { "epoch": 1.8180604883257088, "grad_norm": 0.44245097041130066, "learning_rate": 2.069013492027594e-06, "loss": 0.269, "step": 490400 }, { "epoch": 1.818431218441599, "grad_norm": 0.44171956181526184, "learning_rate": 2.060648719597114e-06, "loss": 0.2723, "step": 490500 }, { "epoch": 1.8188019485574891, "grad_norm": 1.0776466131210327, "learning_rate": 2.0523005345403412e-06, "loss": 0.2798, "step": 490600 }, { "epoch": 1.8191726786733793, "grad_norm": 1.0792908668518066, "learning_rate": 2.0439689397458162e-06, "loss": 0.2484, "step": 490700 }, { "epoch": 1.8195434087892695, "grad_norm": 0.8893476128578186, "learning_rate": 2.0356539380963325e-06, "loss": 0.2842, "step": 490800 }, { "epoch": 1.8199141389051599, "grad_norm": 1.6463406085968018, "learning_rate": 2.0273555324689385e-06, "loss": 0.262, "step": 490900 }, { "epoch": 1.82028486902105, "grad_norm": 0.8300613164901733, "learning_rate": 2.0190737257349546e-06, "loss": 0.2595, "step": 491000 }, { "epoch": 1.8206555991369404, "grad_norm": 1.5076366662979126, "learning_rate": 2.0108085207599324e-06, "loss": 0.249, "step": 491100 }, { "epoch": 1.8210263292528306, "grad_norm": 0.6020153164863586, "learning_rate": 2.0025599204037014e-06, "loss": 0.2736, "step": 491200 }, { "epoch": 1.8213970593687208, "grad_norm": 1.0060005187988281, "learning_rate": 1.9943279275203454e-06, "loss": 0.2498, "step": 491300 }, { "epoch": 1.821767789484611, "grad_norm": 1.002991795539856, "learning_rate": 1.986112544958191e-06, "loss": 0.2904, "step": 491400 }, { "epoch": 1.822138519600501, "grad_norm": 0.7030296325683594, "learning_rate": 1.97791377555982e-06, "loss": 0.2736, "step": 491500 }, { "epoch": 1.8225092497163915, "grad_norm": 1.1415504217147827, "learning_rate": 1.969731622162069e-06, "loss": 0.2567, "step": 491600 }, { "epoch": 1.8228799798322817, "grad_norm": 1.7021290063858032, "learning_rate": 1.961566087596023e-06, "loss": 0.2505, "step": 491700 }, { "epoch": 1.823250709948172, "grad_norm": 1.142049789428711, "learning_rate": 1.953417174687022e-06, "loss": 0.2778, "step": 491800 }, { "epoch": 1.8236214400640622, "grad_norm": 1.4135066270828247, "learning_rate": 1.94528488625465e-06, "loss": 0.2892, "step": 491900 }, { "epoch": 1.8239921701799524, "grad_norm": 0.785319983959198, "learning_rate": 1.937169225112728e-06, "loss": 0.251, "step": 492000 }, { "epoch": 1.8243629002958426, "grad_norm": 1.46189284324646, "learning_rate": 1.9290701940693546e-06, "loss": 0.2459, "step": 492100 }, { "epoch": 1.8247336304117328, "grad_norm": 1.5359301567077637, "learning_rate": 1.920987795926843e-06, "loss": 0.265, "step": 492200 }, { "epoch": 1.8251043605276231, "grad_norm": 0.6381931304931641, "learning_rate": 1.912922033481762e-06, "loss": 0.2792, "step": 492300 }, { "epoch": 1.8254750906435133, "grad_norm": 1.1798925399780273, "learning_rate": 1.9048729095249352e-06, "loss": 0.273, "step": 492400 }, { "epoch": 1.8258458207594037, "grad_norm": 0.2807241380214691, "learning_rate": 1.8968404268414174e-06, "loss": 0.2741, "step": 492500 }, { "epoch": 1.8262165508752939, "grad_norm": 0.5500292181968689, "learning_rate": 1.8888245882104916e-06, "loss": 0.2628, "step": 492600 }, { "epoch": 1.826587280991184, "grad_norm": 1.447951078414917, "learning_rate": 1.8808253964057166e-06, "loss": 0.277, "step": 492700 }, { "epoch": 1.8269580111070742, "grad_norm": 0.648189127445221, "learning_rate": 1.8728428541948728e-06, "loss": 0.2441, "step": 492800 }, { "epoch": 1.8273287412229644, "grad_norm": 1.104740023612976, "learning_rate": 1.8648769643399622e-06, "loss": 0.2582, "step": 492900 }, { "epoch": 1.8276994713388546, "grad_norm": 1.599278450012207, "learning_rate": 1.8569277295972576e-06, "loss": 0.2741, "step": 493000 }, { "epoch": 1.828070201454745, "grad_norm": 1.9609551429748535, "learning_rate": 1.8489951527172478e-06, "loss": 0.2504, "step": 493100 }, { "epoch": 1.8284409315706354, "grad_norm": 0.7106577754020691, "learning_rate": 1.8410792364446706e-06, "loss": 0.2511, "step": 493200 }, { "epoch": 1.8288116616865255, "grad_norm": 0.85588538646698, "learning_rate": 1.8331799835184792e-06, "loss": 0.2731, "step": 493300 }, { "epoch": 1.8291823918024157, "grad_norm": 2.4011261463165283, "learning_rate": 1.8252973966718822e-06, "loss": 0.2639, "step": 493400 }, { "epoch": 1.8295531219183059, "grad_norm": 0.7538250088691711, "learning_rate": 1.8174314786323143e-06, "loss": 0.2513, "step": 493500 }, { "epoch": 1.829923852034196, "grad_norm": 0.9296600222587585, "learning_rate": 1.8095822321214429e-06, "loss": 0.26, "step": 493600 }, { "epoch": 1.8302945821500862, "grad_norm": 1.1396729946136475, "learning_rate": 1.8017496598551565e-06, "loss": 0.273, "step": 493700 }, { "epoch": 1.8306653122659766, "grad_norm": 0.749600350856781, "learning_rate": 1.7939337645435984e-06, "loss": 0.2917, "step": 493800 }, { "epoch": 1.831036042381867, "grad_norm": 0.9689933061599731, "learning_rate": 1.7861345488911162e-06, "loss": 0.2613, "step": 493900 }, { "epoch": 1.8314067724977572, "grad_norm": 2.234438419342041, "learning_rate": 1.7783520155962963e-06, "loss": 0.2893, "step": 494000 }, { "epoch": 1.8317775026136474, "grad_norm": 0.9609447121620178, "learning_rate": 1.7705861673519618e-06, "loss": 0.2981, "step": 494100 }, { "epoch": 1.8321482327295375, "grad_norm": 1.289180874824524, "learning_rate": 1.7628370068451416e-06, "loss": 0.254, "step": 494200 }, { "epoch": 1.8325189628454277, "grad_norm": 2.1156086921691895, "learning_rate": 1.7551045367571129e-06, "loss": 0.2587, "step": 494300 }, { "epoch": 1.8328896929613179, "grad_norm": 1.197017788887024, "learning_rate": 1.7473887597633687e-06, "loss": 0.2682, "step": 494400 }, { "epoch": 1.8332604230772083, "grad_norm": 1.1994671821594238, "learning_rate": 1.7396896785336181e-06, "loss": 0.2348, "step": 494500 }, { "epoch": 1.8336311531930984, "grad_norm": 0.47025981545448303, "learning_rate": 1.7320072957318079e-06, "loss": 0.2727, "step": 494600 }, { "epoch": 1.8340018833089888, "grad_norm": 0.7005573511123657, "learning_rate": 1.7243416140161063e-06, "loss": 0.2528, "step": 494700 }, { "epoch": 1.834372613424879, "grad_norm": 1.30863618850708, "learning_rate": 1.7166926360388746e-06, "loss": 0.2572, "step": 494800 }, { "epoch": 1.8347433435407692, "grad_norm": 1.366794466972351, "learning_rate": 1.7090603644467352e-06, "loss": 0.2577, "step": 494900 }, { "epoch": 1.8351140736566594, "grad_norm": 1.3748326301574707, "learning_rate": 1.701444801880503e-06, "loss": 0.2524, "step": 495000 }, { "epoch": 1.8354848037725495, "grad_norm": 0.8908235430717468, "learning_rate": 1.6938459509752147e-06, "loss": 0.284, "step": 495100 }, { "epoch": 1.83585553388844, "grad_norm": 0.6746916174888611, "learning_rate": 1.686263814360145e-06, "loss": 0.2726, "step": 495200 }, { "epoch": 1.83622626400433, "grad_norm": 1.2685539722442627, "learning_rate": 1.6786983946587564e-06, "loss": 0.2494, "step": 495300 }, { "epoch": 1.8365969941202205, "grad_norm": 1.00760817527771, "learning_rate": 1.671149694488744e-06, "loss": 0.2477, "step": 495400 }, { "epoch": 1.8369677242361107, "grad_norm": 0.8489825129508972, "learning_rate": 1.6636177164620293e-06, "loss": 0.2809, "step": 495500 }, { "epoch": 1.8373384543520008, "grad_norm": 0.35584959387779236, "learning_rate": 1.6561024631847e-06, "loss": 0.2553, "step": 495600 }, { "epoch": 1.837709184467891, "grad_norm": 0.6342896819114685, "learning_rate": 1.6486039372571204e-06, "loss": 0.2561, "step": 495700 }, { "epoch": 1.8380799145837812, "grad_norm": 0.6256861686706543, "learning_rate": 1.6411221412738265e-06, "loss": 0.2483, "step": 495800 }, { "epoch": 1.8384506446996716, "grad_norm": 0.6360771059989929, "learning_rate": 1.6336570778235695e-06, "loss": 0.289, "step": 495900 }, { "epoch": 1.8388213748155617, "grad_norm": 0.4148041009902954, "learning_rate": 1.626208749489322e-06, "loss": 0.2738, "step": 496000 }, { "epoch": 1.8391921049314521, "grad_norm": 0.588826060295105, "learning_rate": 1.6187771588482669e-06, "loss": 0.2689, "step": 496100 }, { "epoch": 1.8395628350473423, "grad_norm": 1.0112463235855103, "learning_rate": 1.6113623084717755e-06, "loss": 0.2918, "step": 496200 }, { "epoch": 1.8399335651632325, "grad_norm": 0.8610304594039917, "learning_rate": 1.6039642009254618e-06, "loss": 0.2529, "step": 496300 }, { "epoch": 1.8403042952791226, "grad_norm": 0.7557651996612549, "learning_rate": 1.5965828387691062e-06, "loss": 0.2884, "step": 496400 }, { "epoch": 1.8406750253950128, "grad_norm": 0.848896324634552, "learning_rate": 1.5892182245567266e-06, "loss": 0.2865, "step": 496500 }, { "epoch": 1.8410457555109032, "grad_norm": 0.8705933094024658, "learning_rate": 1.581870360836535e-06, "loss": 0.258, "step": 496600 }, { "epoch": 1.8414164856267934, "grad_norm": 0.8147608637809753, "learning_rate": 1.574539250150936e-06, "loss": 0.2528, "step": 496700 }, { "epoch": 1.8417872157426838, "grad_norm": 1.1971162557601929, "learning_rate": 1.567224895036562e-06, "loss": 0.2564, "step": 496800 }, { "epoch": 1.842157945858574, "grad_norm": 0.9711613655090332, "learning_rate": 1.5599272980242275e-06, "loss": 0.2543, "step": 496900 }, { "epoch": 1.8425286759744641, "grad_norm": 0.7071180939674377, "learning_rate": 1.5526464616389625e-06, "loss": 0.2751, "step": 497000 }, { "epoch": 1.8428994060903543, "grad_norm": 1.0565555095672607, "learning_rate": 1.5453823883999852e-06, "loss": 0.2669, "step": 497100 }, { "epoch": 1.8432701362062445, "grad_norm": 1.0245354175567627, "learning_rate": 1.5381350808207128e-06, "loss": 0.2479, "step": 497200 }, { "epoch": 1.8436408663221349, "grad_norm": 0.6120325922966003, "learning_rate": 1.5309045414087786e-06, "loss": 0.2919, "step": 497300 }, { "epoch": 1.844011596438025, "grad_norm": 0.5841442942619324, "learning_rate": 1.5236907726659978e-06, "loss": 0.2697, "step": 497400 }, { "epoch": 1.8443823265539154, "grad_norm": 0.7598605155944824, "learning_rate": 1.5164937770883902e-06, "loss": 0.2561, "step": 497500 }, { "epoch": 1.8447530566698056, "grad_norm": 0.6929736733436584, "learning_rate": 1.50931355716617e-06, "loss": 0.2675, "step": 497600 }, { "epoch": 1.8451237867856958, "grad_norm": 0.7702512145042419, "learning_rate": 1.5021501153837436e-06, "loss": 0.2561, "step": 497700 }, { "epoch": 1.845494516901586, "grad_norm": 0.6433400511741638, "learning_rate": 1.4950034542197232e-06, "loss": 0.2966, "step": 497800 }, { "epoch": 1.8458652470174761, "grad_norm": 1.3732655048370361, "learning_rate": 1.487873576146903e-06, "loss": 0.2836, "step": 497900 }, { "epoch": 1.8462359771333663, "grad_norm": 0.3903549909591675, "learning_rate": 1.480760483632271e-06, "loss": 0.2522, "step": 498000 }, { "epoch": 1.8466067072492567, "grad_norm": 1.6700646877288818, "learning_rate": 1.4736641791370075e-06, "loss": 0.2708, "step": 498100 }, { "epoch": 1.846977437365147, "grad_norm": 0.5384502410888672, "learning_rate": 1.4665846651164995e-06, "loss": 0.2724, "step": 498200 }, { "epoch": 1.8473481674810373, "grad_norm": 0.9856400489807129, "learning_rate": 1.4595219440203033e-06, "loss": 0.2717, "step": 498300 }, { "epoch": 1.8477188975969274, "grad_norm": 0.6840747594833374, "learning_rate": 1.4524760182921649e-06, "loss": 0.2632, "step": 498400 }, { "epoch": 1.8480896277128176, "grad_norm": 0.7086251378059387, "learning_rate": 1.4454468903700446e-06, "loss": 0.2837, "step": 498500 }, { "epoch": 1.8484603578287078, "grad_norm": 1.9021472930908203, "learning_rate": 1.438434562686064e-06, "loss": 0.2815, "step": 498600 }, { "epoch": 1.848831087944598, "grad_norm": 1.1262710094451904, "learning_rate": 1.4314390376665432e-06, "loss": 0.2524, "step": 498700 }, { "epoch": 1.8492018180604883, "grad_norm": 0.8761334419250488, "learning_rate": 1.424460317731985e-06, "loss": 0.2368, "step": 498800 }, { "epoch": 1.8495725481763785, "grad_norm": 0.6342275738716125, "learning_rate": 1.4174984052970742e-06, "loss": 0.2852, "step": 498900 }, { "epoch": 1.849943278292269, "grad_norm": 1.2402769327163696, "learning_rate": 1.4105533027706896e-06, "loss": 0.2547, "step": 499000 }, { "epoch": 1.850314008408159, "grad_norm": 0.7437649369239807, "learning_rate": 1.4036250125558926e-06, "loss": 0.2442, "step": 499100 }, { "epoch": 1.8506847385240492, "grad_norm": 0.6484910845756531, "learning_rate": 1.3967135370499095e-06, "loss": 0.2587, "step": 499200 }, { "epoch": 1.8510554686399394, "grad_norm": 3.032609224319458, "learning_rate": 1.3898188786441835e-06, "loss": 0.2688, "step": 499300 }, { "epoch": 1.8514261987558296, "grad_norm": 0.809641420841217, "learning_rate": 1.3829410397243002e-06, "loss": 0.2599, "step": 499400 }, { "epoch": 1.85179692887172, "grad_norm": 1.6868163347244263, "learning_rate": 1.37608002267004e-06, "loss": 0.2566, "step": 499500 }, { "epoch": 1.8521676589876102, "grad_norm": 0.8580741286277771, "learning_rate": 1.3692358298553864e-06, "loss": 0.2619, "step": 499600 }, { "epoch": 1.8525383891035005, "grad_norm": 1.3672674894332886, "learning_rate": 1.362408463648468e-06, "loss": 0.2634, "step": 499700 }, { "epoch": 1.8529091192193907, "grad_norm": 1.0110387802124023, "learning_rate": 1.3555979264116003e-06, "loss": 0.2713, "step": 499800 }, { "epoch": 1.853279849335281, "grad_norm": 0.6585411429405212, "learning_rate": 1.3488042205012874e-06, "loss": 0.273, "step": 499900 }, { "epoch": 1.853650579451171, "grad_norm": 1.763874888420105, "learning_rate": 1.3420273482682044e-06, "loss": 0.2434, "step": 500000 }, { "epoch": 1.8540213095670612, "grad_norm": 0.5883867144584656, "learning_rate": 1.335267312057198e-06, "loss": 0.2677, "step": 500100 }, { "epoch": 1.8543920396829516, "grad_norm": 0.5176870226860046, "learning_rate": 1.3285241142072913e-06, "loss": 0.2636, "step": 500200 }, { "epoch": 1.8547627697988418, "grad_norm": 1.808978796005249, "learning_rate": 1.3217977570516682e-06, "loss": 0.25, "step": 500300 }, { "epoch": 1.8551334999147322, "grad_norm": 3.0268280506134033, "learning_rate": 1.315088242917717e-06, "loss": 0.2657, "step": 500400 }, { "epoch": 1.8555042300306224, "grad_norm": 1.553043007850647, "learning_rate": 1.308395574126975e-06, "loss": 0.2843, "step": 500500 }, { "epoch": 1.8558749601465125, "grad_norm": 0.8986644148826599, "learning_rate": 1.3017197529951452e-06, "loss": 0.2505, "step": 500600 }, { "epoch": 1.8562456902624027, "grad_norm": 1.213782787322998, "learning_rate": 1.2950607818321248e-06, "loss": 0.2646, "step": 500700 }, { "epoch": 1.8566164203782929, "grad_norm": 1.1497365236282349, "learning_rate": 1.2884186629419648e-06, "loss": 0.2769, "step": 500800 }, { "epoch": 1.8569871504941833, "grad_norm": 1.0702672004699707, "learning_rate": 1.2817933986228825e-06, "loss": 0.2813, "step": 500900 }, { "epoch": 1.8573578806100735, "grad_norm": 0.7409522533416748, "learning_rate": 1.275184991167272e-06, "loss": 0.248, "step": 501000 }, { "epoch": 1.8577286107259638, "grad_norm": 2.5461723804473877, "learning_rate": 1.2685934428616874e-06, "loss": 0.2692, "step": 501100 }, { "epoch": 1.858099340841854, "grad_norm": 0.2758546471595764, "learning_rate": 1.2620187559868602e-06, "loss": 0.2684, "step": 501200 }, { "epoch": 1.8584700709577442, "grad_norm": 0.9450598359107971, "learning_rate": 1.2554609328176814e-06, "loss": 0.2493, "step": 501300 }, { "epoch": 1.8588408010736344, "grad_norm": 1.417708158493042, "learning_rate": 1.2489199756231972e-06, "loss": 0.2694, "step": 501400 }, { "epoch": 1.8592115311895245, "grad_norm": 1.020751714706421, "learning_rate": 1.2423958866666419e-06, "loss": 0.2793, "step": 501500 }, { "epoch": 1.859582261305415, "grad_norm": 0.9329217672348022, "learning_rate": 1.2358886682053926e-06, "loss": 0.2755, "step": 501600 }, { "epoch": 1.859952991421305, "grad_norm": 1.1965678930282593, "learning_rate": 1.2293983224909878e-06, "loss": 0.276, "step": 501700 }, { "epoch": 1.8603237215371955, "grad_norm": 1.0749369859695435, "learning_rate": 1.2229248517691472e-06, "loss": 0.2551, "step": 501800 }, { "epoch": 1.8606944516530857, "grad_norm": 1.3361629247665405, "learning_rate": 1.2164682582797294e-06, "loss": 0.2733, "step": 501900 }, { "epoch": 1.8610651817689758, "grad_norm": 2.058793783187866, "learning_rate": 1.2100285442567694e-06, "loss": 0.2591, "step": 502000 }, { "epoch": 1.861435911884866, "grad_norm": 1.7734078168869019, "learning_rate": 1.2036057119284627e-06, "loss": 0.2806, "step": 502100 }, { "epoch": 1.8618066420007562, "grad_norm": 1.075834035873413, "learning_rate": 1.1971997635171539e-06, "loss": 0.2646, "step": 502200 }, { "epoch": 1.8621773721166466, "grad_norm": 0.6824052333831787, "learning_rate": 1.1908107012393366e-06, "loss": 0.2807, "step": 502300 }, { "epoch": 1.8625481022325368, "grad_norm": 1.1639630794525146, "learning_rate": 1.1844385273056924e-06, "loss": 0.2718, "step": 502400 }, { "epoch": 1.8629188323484271, "grad_norm": 0.8106762766838074, "learning_rate": 1.1780832439210243e-06, "loss": 0.2529, "step": 502500 }, { "epoch": 1.8632895624643173, "grad_norm": 0.61008220911026, "learning_rate": 1.1717448532843235e-06, "loss": 0.2602, "step": 502600 }, { "epoch": 1.8636602925802075, "grad_norm": 1.1032484769821167, "learning_rate": 1.1654233575887131e-06, "loss": 0.2798, "step": 502700 }, { "epoch": 1.8640310226960977, "grad_norm": 1.6709301471710205, "learning_rate": 1.1591187590214715e-06, "loss": 0.2528, "step": 502800 }, { "epoch": 1.8644017528119878, "grad_norm": 1.3402363061904907, "learning_rate": 1.152831059764048e-06, "loss": 0.2709, "step": 502900 }, { "epoch": 1.864772482927878, "grad_norm": 0.5889074206352234, "learning_rate": 1.1465602619920246e-06, "loss": 0.2912, "step": 503000 }, { "epoch": 1.8651432130437684, "grad_norm": 1.5510735511779785, "learning_rate": 1.1403063678751547e-06, "loss": 0.2651, "step": 503100 }, { "epoch": 1.8655139431596588, "grad_norm": 1.5457922220230103, "learning_rate": 1.1340693795773237e-06, "loss": 0.2748, "step": 503200 }, { "epoch": 1.865884673275549, "grad_norm": 0.7537457942962646, "learning_rate": 1.1278492992565727e-06, "loss": 0.287, "step": 503300 }, { "epoch": 1.8662554033914391, "grad_norm": 0.6342324018478394, "learning_rate": 1.1216461290651071e-06, "loss": 0.2655, "step": 503400 }, { "epoch": 1.8666261335073293, "grad_norm": 0.8493213057518005, "learning_rate": 1.1154598711492658e-06, "loss": 0.2507, "step": 503500 }, { "epoch": 1.8669968636232195, "grad_norm": 1.7312947511672974, "learning_rate": 1.1092905276495313e-06, "loss": 0.2768, "step": 503600 }, { "epoch": 1.8673675937391097, "grad_norm": 0.6540948748588562, "learning_rate": 1.103138100700557e-06, "loss": 0.2429, "step": 503700 }, { "epoch": 1.867738323855, "grad_norm": 0.6995697617530823, "learning_rate": 1.0970025924311234e-06, "loss": 0.3032, "step": 503800 }, { "epoch": 1.8681090539708902, "grad_norm": 2.1710166931152344, "learning_rate": 1.09088400496416e-06, "loss": 0.2595, "step": 503900 }, { "epoch": 1.8684797840867806, "grad_norm": 1.4451045989990234, "learning_rate": 1.0847823404167456e-06, "loss": 0.2773, "step": 504000 }, { "epoch": 1.8688505142026708, "grad_norm": 0.7082656025886536, "learning_rate": 1.0786976009000971e-06, "loss": 0.2803, "step": 504100 }, { "epoch": 1.869221244318561, "grad_norm": 0.4334729313850403, "learning_rate": 1.0726297885195856e-06, "loss": 0.2369, "step": 504200 }, { "epoch": 1.8695919744344511, "grad_norm": 0.9487056732177734, "learning_rate": 1.0665789053747154e-06, "loss": 0.2671, "step": 504300 }, { "epoch": 1.8699627045503413, "grad_norm": 1.6667695045471191, "learning_rate": 1.0605449535591393e-06, "loss": 0.2937, "step": 504400 }, { "epoch": 1.8703334346662317, "grad_norm": 0.9240500330924988, "learning_rate": 1.054527935160654e-06, "loss": 0.2504, "step": 504500 }, { "epoch": 1.8707041647821219, "grad_norm": 0.8618529438972473, "learning_rate": 1.0485278522611886e-06, "loss": 0.2543, "step": 504600 }, { "epoch": 1.8710748948980123, "grad_norm": 1.0105303525924683, "learning_rate": 1.04254470693681e-06, "loss": 0.2731, "step": 504700 }, { "epoch": 1.8714456250139024, "grad_norm": 0.8016411066055298, "learning_rate": 1.0365785012577456e-06, "loss": 0.2555, "step": 504800 }, { "epoch": 1.8718163551297926, "grad_norm": 0.4699464738368988, "learning_rate": 1.030629237288333e-06, "loss": 0.2603, "step": 504900 }, { "epoch": 1.8721870852456828, "grad_norm": 0.6153027415275574, "learning_rate": 1.0246969170870702e-06, "loss": 0.2509, "step": 505000 }, { "epoch": 1.872557815361573, "grad_norm": 0.7194880247116089, "learning_rate": 1.0187815427065816e-06, "loss": 0.2448, "step": 505100 }, { "epoch": 1.8729285454774633, "grad_norm": 1.534828543663025, "learning_rate": 1.0128831161936358e-06, "loss": 0.2319, "step": 505200 }, { "epoch": 1.8732992755933535, "grad_norm": 1.4582576751708984, "learning_rate": 1.0070016395891224e-06, "loss": 0.2544, "step": 505300 }, { "epoch": 1.873670005709244, "grad_norm": 1.636646032333374, "learning_rate": 1.0011371149280857e-06, "loss": 0.2521, "step": 505400 }, { "epoch": 1.874040735825134, "grad_norm": 1.339247226715088, "learning_rate": 9.952895442396915e-07, "loss": 0.2892, "step": 505500 }, { "epoch": 1.8744114659410243, "grad_norm": 1.8869880437850952, "learning_rate": 9.894589295472434e-07, "loss": 0.2593, "step": 505600 }, { "epoch": 1.8747821960569144, "grad_norm": 0.6663370728492737, "learning_rate": 9.836452728681722e-07, "loss": 0.2749, "step": 505700 }, { "epoch": 1.8751529261728046, "grad_norm": 0.6715912818908691, "learning_rate": 9.778485762140522e-07, "loss": 0.2675, "step": 505800 }, { "epoch": 1.875523656288695, "grad_norm": 1.1693367958068848, "learning_rate": 9.720688415905898e-07, "loss": 0.2784, "step": 505900 }, { "epoch": 1.8758943864045852, "grad_norm": 1.5506597757339478, "learning_rate": 9.66306070997608e-07, "loss": 0.2846, "step": 506000 }, { "epoch": 1.8762651165204756, "grad_norm": 0.6962424516677856, "learning_rate": 9.605602664290614e-07, "loss": 0.2586, "step": 506100 }, { "epoch": 1.8766358466363657, "grad_norm": 1.0458142757415771, "learning_rate": 9.548314298730654e-07, "loss": 0.2773, "step": 506200 }, { "epoch": 1.877006576752256, "grad_norm": 0.6673435568809509, "learning_rate": 9.491195633118122e-07, "loss": 0.2412, "step": 506300 }, { "epoch": 1.877377306868146, "grad_norm": 1.340226650238037, "learning_rate": 9.434246687216652e-07, "loss": 0.2581, "step": 506400 }, { "epoch": 1.8777480369840363, "grad_norm": 1.2350270748138428, "learning_rate": 9.377467480731039e-07, "loss": 0.2785, "step": 506500 }, { "epoch": 1.8781187670999266, "grad_norm": 1.572129726409912, "learning_rate": 9.320858033307179e-07, "loss": 0.2522, "step": 506600 }, { "epoch": 1.8784894972158168, "grad_norm": 0.8877524137496948, "learning_rate": 9.264418364532512e-07, "loss": 0.2737, "step": 506700 }, { "epoch": 1.8788602273317072, "grad_norm": 0.4195556342601776, "learning_rate": 9.208148493935476e-07, "loss": 0.2606, "step": 506800 }, { "epoch": 1.8792309574475974, "grad_norm": 0.9837801456451416, "learning_rate": 9.152048440985938e-07, "loss": 0.2379, "step": 506900 }, { "epoch": 1.8796016875634876, "grad_norm": 0.9315747618675232, "learning_rate": 9.096118225094874e-07, "loss": 0.2597, "step": 507000 }, { "epoch": 1.8799724176793777, "grad_norm": 0.6236509084701538, "learning_rate": 9.04035786561458e-07, "loss": 0.2448, "step": 507100 }, { "epoch": 1.880343147795268, "grad_norm": 0.7774379849433899, "learning_rate": 8.984767381838566e-07, "loss": 0.265, "step": 507200 }, { "epoch": 1.880713877911158, "grad_norm": 0.6086320877075195, "learning_rate": 8.929346793001558e-07, "loss": 0.2644, "step": 507300 }, { "epoch": 1.8810846080270485, "grad_norm": 2.161343812942505, "learning_rate": 8.874096118279496e-07, "loss": 0.2835, "step": 507400 }, { "epoch": 1.8814553381429389, "grad_norm": 0.9758574962615967, "learning_rate": 8.819015376789475e-07, "loss": 0.2471, "step": 507500 }, { "epoch": 1.881826068258829, "grad_norm": 0.8286194801330566, "learning_rate": 8.764104587589916e-07, "loss": 0.2495, "step": 507600 }, { "epoch": 1.8821967983747192, "grad_norm": 0.29668551683425903, "learning_rate": 8.709363769680345e-07, "loss": 0.233, "step": 507700 }, { "epoch": 1.8825675284906094, "grad_norm": 1.262487530708313, "learning_rate": 8.654792942001444e-07, "loss": 0.3042, "step": 507800 }, { "epoch": 1.8829382586064995, "grad_norm": 1.0454424619674683, "learning_rate": 8.600392123435219e-07, "loss": 0.2644, "step": 507900 }, { "epoch": 1.8833089887223897, "grad_norm": 0.6958454847335815, "learning_rate": 8.546161332804614e-07, "loss": 0.2483, "step": 508000 }, { "epoch": 1.8836797188382801, "grad_norm": 0.4377243220806122, "learning_rate": 8.492100588874064e-07, "loss": 0.2568, "step": 508100 }, { "epoch": 1.8840504489541705, "grad_norm": 1.2155628204345703, "learning_rate": 8.438209910348882e-07, "loss": 0.2501, "step": 508200 }, { "epoch": 1.8844211790700607, "grad_norm": 1.5076215267181396, "learning_rate": 8.384489315875599e-07, "loss": 0.2478, "step": 508300 }, { "epoch": 1.8847919091859509, "grad_norm": 1.1598292589187622, "learning_rate": 8.330938824042067e-07, "loss": 0.2949, "step": 508400 }, { "epoch": 1.885162639301841, "grad_norm": 1.1863600015640259, "learning_rate": 8.277558453377133e-07, "loss": 0.2652, "step": 508500 }, { "epoch": 1.8855333694177312, "grad_norm": 1.7103724479675293, "learning_rate": 8.224348222350742e-07, "loss": 0.2745, "step": 508600 }, { "epoch": 1.8859040995336214, "grad_norm": 1.1121573448181152, "learning_rate": 8.171308149374057e-07, "loss": 0.2693, "step": 508700 }, { "epoch": 1.8862748296495118, "grad_norm": 1.3693927526474, "learning_rate": 8.118438252799343e-07, "loss": 0.2592, "step": 508800 }, { "epoch": 1.886645559765402, "grad_norm": 1.3202447891235352, "learning_rate": 8.065738550919965e-07, "loss": 0.2557, "step": 508900 }, { "epoch": 1.8870162898812923, "grad_norm": 1.3018826246261597, "learning_rate": 8.01320906197045e-07, "loss": 0.2826, "step": 509000 }, { "epoch": 1.8873870199971825, "grad_norm": 1.0843721628189087, "learning_rate": 7.960849804126314e-07, "loss": 0.2591, "step": 509100 }, { "epoch": 1.8877577501130727, "grad_norm": 0.9976444244384766, "learning_rate": 7.908660795504397e-07, "loss": 0.2578, "step": 509200 }, { "epoch": 1.8881284802289628, "grad_norm": 1.020138144493103, "learning_rate": 7.856642054162367e-07, "loss": 0.2548, "step": 509300 }, { "epoch": 1.888499210344853, "grad_norm": 1.534122109413147, "learning_rate": 7.804793598099103e-07, "loss": 0.2662, "step": 509400 }, { "epoch": 1.8888699404607434, "grad_norm": 0.6359313130378723, "learning_rate": 7.753115445254589e-07, "loss": 0.276, "step": 509500 }, { "epoch": 1.8892406705766336, "grad_norm": 0.7640830874443054, "learning_rate": 7.701607613509909e-07, "loss": 0.2537, "step": 509600 }, { "epoch": 1.889611400692524, "grad_norm": 1.0219042301177979, "learning_rate": 7.650270120686976e-07, "loss": 0.2724, "step": 509700 }, { "epoch": 1.8899821308084142, "grad_norm": 3.233304977416992, "learning_rate": 7.599102984549133e-07, "loss": 0.2485, "step": 509800 }, { "epoch": 1.8903528609243043, "grad_norm": 1.1682251691818237, "learning_rate": 7.548106222800555e-07, "loss": 0.2871, "step": 509900 }, { "epoch": 1.8907235910401945, "grad_norm": 0.5237810611724854, "learning_rate": 7.497279853086403e-07, "loss": 0.255, "step": 510000 }, { "epoch": 1.8910943211560847, "grad_norm": 0.6027798652648926, "learning_rate": 7.446623892993109e-07, "loss": 0.2394, "step": 510100 }, { "epoch": 1.891465051271975, "grad_norm": 0.6219244599342346, "learning_rate": 7.396138360047877e-07, "loss": 0.2918, "step": 510200 }, { "epoch": 1.8918357813878652, "grad_norm": 0.6679556369781494, "learning_rate": 7.345823271719232e-07, "loss": 0.2525, "step": 510300 }, { "epoch": 1.8922065115037556, "grad_norm": 0.5003328919410706, "learning_rate": 7.29567864541647e-07, "loss": 0.2593, "step": 510400 }, { "epoch": 1.8925772416196458, "grad_norm": 0.6183279156684875, "learning_rate": 7.245704498489991e-07, "loss": 0.2372, "step": 510500 }, { "epoch": 1.892947971735536, "grad_norm": 0.7856457233428955, "learning_rate": 7.195900848231297e-07, "loss": 0.2305, "step": 510600 }, { "epoch": 1.8933187018514261, "grad_norm": 1.541520118713379, "learning_rate": 7.146267711872767e-07, "loss": 0.278, "step": 510700 }, { "epoch": 1.8936894319673163, "grad_norm": 0.8096623420715332, "learning_rate": 7.09680510658789e-07, "loss": 0.2587, "step": 510800 }, { "epoch": 1.8940601620832067, "grad_norm": 0.5674014091491699, "learning_rate": 7.047513049491083e-07, "loss": 0.2767, "step": 510900 }, { "epoch": 1.8944308921990969, "grad_norm": 0.6608348488807678, "learning_rate": 6.998391557637652e-07, "loss": 0.2517, "step": 511000 }, { "epoch": 1.8948016223149873, "grad_norm": 1.023116111755371, "learning_rate": 6.949440648024164e-07, "loss": 0.2445, "step": 511100 }, { "epoch": 1.8951723524308774, "grad_norm": 1.140552043914795, "learning_rate": 6.900660337587905e-07, "loss": 0.2894, "step": 511200 }, { "epoch": 1.8955430825467676, "grad_norm": 1.4275007247924805, "learning_rate": 6.852050643207263e-07, "loss": 0.2531, "step": 511300 }, { "epoch": 1.8959138126626578, "grad_norm": 1.5861694812774658, "learning_rate": 6.803611581701563e-07, "loss": 0.2658, "step": 511400 }, { "epoch": 1.896284542778548, "grad_norm": 1.2270591259002686, "learning_rate": 6.755343169831119e-07, "loss": 0.2673, "step": 511500 }, { "epoch": 1.8966552728944384, "grad_norm": 0.9912776947021484, "learning_rate": 6.70724542429696e-07, "loss": 0.2849, "step": 511600 }, { "epoch": 1.8970260030103285, "grad_norm": 1.3984625339508057, "learning_rate": 6.659318361741551e-07, "loss": 0.2793, "step": 511700 }, { "epoch": 1.897396733126219, "grad_norm": 1.1479425430297852, "learning_rate": 6.611561998747795e-07, "loss": 0.282, "step": 511800 }, { "epoch": 1.897767463242109, "grad_norm": 0.8653640747070312, "learning_rate": 6.563976351839807e-07, "loss": 0.2584, "step": 511900 }, { "epoch": 1.8981381933579993, "grad_norm": 0.7304574847221375, "learning_rate": 6.516561437482693e-07, "loss": 0.2579, "step": 512000 }, { "epoch": 1.8985089234738894, "grad_norm": 1.1566510200500488, "learning_rate": 6.469317272082221e-07, "loss": 0.2516, "step": 512100 }, { "epoch": 1.8988796535897796, "grad_norm": 0.840890109539032, "learning_rate": 6.422243871985256e-07, "loss": 0.2612, "step": 512200 }, { "epoch": 1.8992503837056698, "grad_norm": 2.2045979499816895, "learning_rate": 6.375341253479661e-07, "loss": 0.2493, "step": 512300 }, { "epoch": 1.8996211138215602, "grad_norm": 0.9765920639038086, "learning_rate": 6.328609432793897e-07, "loss": 0.2804, "step": 512400 }, { "epoch": 1.8999918439374506, "grad_norm": 0.5201328992843628, "learning_rate": 6.282048426097698e-07, "loss": 0.2793, "step": 512500 }, { "epoch": 1.9003625740533407, "grad_norm": 0.7813957929611206, "learning_rate": 6.235658249501397e-07, "loss": 0.2434, "step": 512600 }, { "epoch": 1.900733304169231, "grad_norm": 1.3246798515319824, "learning_rate": 6.189438919056434e-07, "loss": 0.2525, "step": 512700 }, { "epoch": 1.901104034285121, "grad_norm": 2.228605270385742, "learning_rate": 6.14339045075496e-07, "loss": 0.2751, "step": 512800 }, { "epoch": 1.9014747644010113, "grad_norm": 1.2227208614349365, "learning_rate": 6.097512860530175e-07, "loss": 0.259, "step": 512900 }, { "epoch": 1.9018454945169014, "grad_norm": 1.5668725967407227, "learning_rate": 6.051806164256047e-07, "loss": 0.2587, "step": 513000 }, { "epoch": 1.9022162246327918, "grad_norm": 0.4891645312309265, "learning_rate": 6.006270377747369e-07, "loss": 0.2768, "step": 513100 }, { "epoch": 1.902586954748682, "grad_norm": 1.0590860843658447, "learning_rate": 5.960905516759874e-07, "loss": 0.2543, "step": 513200 }, { "epoch": 1.9029576848645724, "grad_norm": 0.47756698727607727, "learning_rate": 5.915711596990226e-07, "loss": 0.2702, "step": 513300 }, { "epoch": 1.9033284149804626, "grad_norm": 1.1592774391174316, "learning_rate": 5.870688634075805e-07, "loss": 0.2741, "step": 513400 }, { "epoch": 1.9036991450963527, "grad_norm": 0.7624789476394653, "learning_rate": 5.825836643594873e-07, "loss": 0.2819, "step": 513500 }, { "epoch": 1.904069875212243, "grad_norm": 0.9075707197189331, "learning_rate": 5.781155641066572e-07, "loss": 0.2797, "step": 513600 }, { "epoch": 1.904440605328133, "grad_norm": 2.102741241455078, "learning_rate": 5.736645641950922e-07, "loss": 0.251, "step": 513700 }, { "epoch": 1.9048113354440235, "grad_norm": 1.4115492105484009, "learning_rate": 5.692306661648605e-07, "loss": 0.2784, "step": 513800 }, { "epoch": 1.9051820655599137, "grad_norm": 0.8239919543266296, "learning_rate": 5.648138715501295e-07, "loss": 0.2667, "step": 513900 }, { "epoch": 1.905552795675804, "grad_norm": 0.9188171625137329, "learning_rate": 5.604141818791487e-07, "loss": 0.2687, "step": 514000 }, { "epoch": 1.9059235257916942, "grad_norm": 0.6819309592247009, "learning_rate": 5.560315986742282e-07, "loss": 0.2951, "step": 514100 }, { "epoch": 1.9062942559075844, "grad_norm": 1.0102192163467407, "learning_rate": 5.516661234517939e-07, "loss": 0.2533, "step": 514200 }, { "epoch": 1.9066649860234746, "grad_norm": 1.1851967573165894, "learning_rate": 5.473177577223155e-07, "loss": 0.292, "step": 514300 }, { "epoch": 1.9070357161393647, "grad_norm": 3.212658405303955, "learning_rate": 5.429865029903725e-07, "loss": 0.2471, "step": 514400 }, { "epoch": 1.9074064462552551, "grad_norm": 2.5487799644470215, "learning_rate": 5.386723607546107e-07, "loss": 0.2756, "step": 514500 }, { "epoch": 1.9077771763711453, "grad_norm": 2.1883463859558105, "learning_rate": 5.343753325077472e-07, "loss": 0.3009, "step": 514600 }, { "epoch": 1.9081479064870357, "grad_norm": 1.121572494506836, "learning_rate": 5.300954197365871e-07, "loss": 0.2735, "step": 514700 }, { "epoch": 1.9085186366029259, "grad_norm": 0.9567686915397644, "learning_rate": 5.258326239220179e-07, "loss": 0.2711, "step": 514800 }, { "epoch": 1.908889366718816, "grad_norm": 0.9185101985931396, "learning_rate": 5.215869465389988e-07, "loss": 0.2542, "step": 514900 }, { "epoch": 1.9092600968347062, "grad_norm": 0.9349265098571777, "learning_rate": 5.173583890565604e-07, "loss": 0.2562, "step": 515000 }, { "epoch": 1.9096308269505964, "grad_norm": 1.0668970346450806, "learning_rate": 5.131469529378208e-07, "loss": 0.2676, "step": 515100 }, { "epoch": 1.9100015570664868, "grad_norm": 0.6993744969367981, "learning_rate": 5.089526396399591e-07, "loss": 0.2407, "step": 515200 }, { "epoch": 1.910372287182377, "grad_norm": 1.2225303649902344, "learning_rate": 5.047754506142533e-07, "loss": 0.2506, "step": 515300 }, { "epoch": 1.9107430172982673, "grad_norm": 0.9582035541534424, "learning_rate": 5.006153873060304e-07, "loss": 0.2459, "step": 515400 }, { "epoch": 1.9111137474141575, "grad_norm": 1.3103809356689453, "learning_rate": 4.964724511547059e-07, "loss": 0.2875, "step": 515500 }, { "epoch": 1.9114844775300477, "grad_norm": 1.1331599950790405, "learning_rate": 4.923466435937718e-07, "loss": 0.2595, "step": 515600 }, { "epoch": 1.9118552076459379, "grad_norm": 0.6846863627433777, "learning_rate": 4.882379660507808e-07, "loss": 0.2632, "step": 515700 }, { "epoch": 1.912225937761828, "grad_norm": 1.143405795097351, "learning_rate": 4.841464199473733e-07, "loss": 0.2726, "step": 515800 }, { "epoch": 1.9125966678777184, "grad_norm": 1.1685466766357422, "learning_rate": 4.800720066992503e-07, "loss": 0.2502, "step": 515900 }, { "epoch": 1.9129673979936086, "grad_norm": 2.6063497066497803, "learning_rate": 4.7601472771618974e-07, "loss": 0.276, "step": 516000 }, { "epoch": 1.913338128109499, "grad_norm": 2.2750439643859863, "learning_rate": 4.719745844020462e-07, "loss": 0.2668, "step": 516100 }, { "epoch": 1.9137088582253892, "grad_norm": 0.8233124017715454, "learning_rate": 4.679515781547239e-07, "loss": 0.2631, "step": 516200 }, { "epoch": 1.9140795883412793, "grad_norm": 0.9013885259628296, "learning_rate": 4.6394571036623145e-07, "loss": 0.2347, "step": 516300 }, { "epoch": 1.9144503184571695, "grad_norm": 0.973146378993988, "learning_rate": 4.5995698242262133e-07, "loss": 0.2754, "step": 516400 }, { "epoch": 1.9148210485730597, "grad_norm": 0.8071316480636597, "learning_rate": 4.559853957040172e-07, "loss": 0.2526, "step": 516500 }, { "epoch": 1.9151917786889499, "grad_norm": 0.69666987657547, "learning_rate": 4.520309515846255e-07, "loss": 0.249, "step": 516600 }, { "epoch": 1.9155625088048402, "grad_norm": 1.9928343296051025, "learning_rate": 4.4809365143271276e-07, "loss": 0.2636, "step": 516700 }, { "epoch": 1.9159332389207306, "grad_norm": 0.39625632762908936, "learning_rate": 4.4417349661061146e-07, "loss": 0.2495, "step": 516800 }, { "epoch": 1.9163039690366208, "grad_norm": 0.6878416538238525, "learning_rate": 4.402704884747311e-07, "loss": 0.2379, "step": 516900 }, { "epoch": 1.916674699152511, "grad_norm": 1.4967353343963623, "learning_rate": 4.363846283755302e-07, "loss": 0.2668, "step": 517000 }, { "epoch": 1.9170454292684012, "grad_norm": 0.9888267517089844, "learning_rate": 4.3251591765754995e-07, "loss": 0.2741, "step": 517100 }, { "epoch": 1.9174161593842913, "grad_norm": 1.8590275049209595, "learning_rate": 4.2866435765939737e-07, "loss": 0.2545, "step": 517200 }, { "epoch": 1.9177868895001815, "grad_norm": 1.0441639423370361, "learning_rate": 4.248299497137398e-07, "loss": 0.2795, "step": 517300 }, { "epoch": 1.918157619616072, "grad_norm": 0.5968843102455139, "learning_rate": 4.210126951473103e-07, "loss": 0.2544, "step": 517400 }, { "epoch": 1.9185283497319623, "grad_norm": 0.9024745225906372, "learning_rate": 4.172125952809025e-07, "loss": 0.2578, "step": 517500 }, { "epoch": 1.9188990798478525, "grad_norm": 0.7111920118331909, "learning_rate": 4.1342965142939226e-07, "loss": 0.2432, "step": 517600 }, { "epoch": 1.9192698099637426, "grad_norm": 0.46983230113983154, "learning_rate": 4.096638649016882e-07, "loss": 0.2694, "step": 517700 }, { "epoch": 1.9196405400796328, "grad_norm": 0.7017293572425842, "learning_rate": 4.059152370007979e-07, "loss": 0.2541, "step": 517800 }, { "epoch": 1.920011270195523, "grad_norm": 1.0084121227264404, "learning_rate": 4.0218376902376174e-07, "loss": 0.2895, "step": 517900 }, { "epoch": 1.9203820003114132, "grad_norm": 0.9156650304794312, "learning_rate": 3.984694622617025e-07, "loss": 0.2566, "step": 518000 }, { "epoch": 1.9207527304273035, "grad_norm": 1.2651407718658447, "learning_rate": 3.9477231799979753e-07, "loss": 0.2577, "step": 518100 }, { "epoch": 1.9211234605431937, "grad_norm": 1.1222113370895386, "learning_rate": 3.9109233751727926e-07, "loss": 0.2621, "step": 518200 }, { "epoch": 1.9214941906590841, "grad_norm": 0.6219333410263062, "learning_rate": 3.8742952208746243e-07, "loss": 0.2529, "step": 518300 }, { "epoch": 1.9218649207749743, "grad_norm": 0.7948572039604187, "learning_rate": 3.8378387297769437e-07, "loss": 0.2514, "step": 518400 }, { "epoch": 1.9222356508908645, "grad_norm": 1.0707191228866577, "learning_rate": 3.801553914494049e-07, "loss": 0.2764, "step": 518500 }, { "epoch": 1.9226063810067546, "grad_norm": 0.945325493812561, "learning_rate": 3.7654407875807874e-07, "loss": 0.2649, "step": 518600 }, { "epoch": 1.9229771111226448, "grad_norm": 0.8534600138664246, "learning_rate": 3.7294993615324403e-07, "loss": 0.2483, "step": 518700 }, { "epoch": 1.9233478412385352, "grad_norm": 0.7344462275505066, "learning_rate": 3.693729648785116e-07, "loss": 0.2568, "step": 518800 }, { "epoch": 1.9237185713544254, "grad_norm": 1.0525729656219482, "learning_rate": 3.658131661715414e-07, "loss": 0.2552, "step": 518900 }, { "epoch": 1.9240893014703158, "grad_norm": 0.7297746539115906, "learning_rate": 3.622705412640426e-07, "loss": 0.2583, "step": 519000 }, { "epoch": 1.924460031586206, "grad_norm": 1.5294065475463867, "learning_rate": 3.587450913817958e-07, "loss": 0.2715, "step": 519100 }, { "epoch": 1.924830761702096, "grad_norm": 1.0646370649337769, "learning_rate": 3.552368177446308e-07, "loss": 0.2852, "step": 519200 }, { "epoch": 1.9252014918179863, "grad_norm": 0.7906355857849121, "learning_rate": 3.517457215664377e-07, "loss": 0.2567, "step": 519300 }, { "epoch": 1.9255722219338764, "grad_norm": 1.4049230813980103, "learning_rate": 3.4827180405516134e-07, "loss": 0.2412, "step": 519400 }, { "epoch": 1.9259429520497668, "grad_norm": 1.7152695655822754, "learning_rate": 3.448150664128014e-07, "loss": 0.2826, "step": 519500 }, { "epoch": 1.926313682165657, "grad_norm": 0.8279920220375061, "learning_rate": 3.413755098354121e-07, "loss": 0.2625, "step": 519600 }, { "epoch": 1.9266844122815474, "grad_norm": 1.8941413164138794, "learning_rate": 3.379531355131138e-07, "loss": 0.2639, "step": 519700 }, { "epoch": 1.9270551423974376, "grad_norm": 1.1063035726547241, "learning_rate": 3.345479446300703e-07, "loss": 0.2789, "step": 519800 }, { "epoch": 1.9274258725133278, "grad_norm": 0.5604801177978516, "learning_rate": 3.3115993836450585e-07, "loss": 0.2688, "step": 519900 }, { "epoch": 1.927796602629218, "grad_norm": 0.6989477872848511, "learning_rate": 3.2778911788868825e-07, "loss": 0.2508, "step": 520000 }, { "epoch": 1.928167332745108, "grad_norm": 0.8481685519218445, "learning_rate": 3.2443548436895124e-07, "loss": 0.2188, "step": 520100 }, { "epoch": 1.9285380628609985, "grad_norm": 1.0133596658706665, "learning_rate": 3.2109903896567763e-07, "loss": 0.2799, "step": 520200 }, { "epoch": 1.9289087929768887, "grad_norm": 1.8191819190979004, "learning_rate": 3.1777978283329956e-07, "loss": 0.2523, "step": 520300 }, { "epoch": 1.929279523092779, "grad_norm": 1.6287511587142944, "learning_rate": 3.144777171203095e-07, "loss": 0.2702, "step": 520400 }, { "epoch": 1.9296502532086692, "grad_norm": 2.3578248023986816, "learning_rate": 3.1119284296924345e-07, "loss": 0.2434, "step": 520500 }, { "epoch": 1.9300209833245594, "grad_norm": 1.2321525812149048, "learning_rate": 3.0792516151669227e-07, "loss": 0.2817, "step": 520600 }, { "epoch": 1.9303917134404496, "grad_norm": 1.1836297512054443, "learning_rate": 3.04674673893296e-07, "loss": 0.2686, "step": 520700 }, { "epoch": 1.9307624435563397, "grad_norm": 0.6658021807670593, "learning_rate": 3.014413812237549e-07, "loss": 0.264, "step": 520800 }, { "epoch": 1.9311331736722301, "grad_norm": 0.7550407648086548, "learning_rate": 2.982252846268019e-07, "loss": 0.2736, "step": 520900 }, { "epoch": 1.9315039037881203, "grad_norm": 0.7785226702690125, "learning_rate": 2.9502638521524127e-07, "loss": 0.2548, "step": 521000 }, { "epoch": 1.9318746339040107, "grad_norm": 1.0178636312484741, "learning_rate": 2.918446840959099e-07, "loss": 0.2327, "step": 521100 }, { "epoch": 1.9322453640199009, "grad_norm": 1.5404537916183472, "learning_rate": 2.8868018236969943e-07, "loss": 0.2723, "step": 521200 }, { "epoch": 1.932616094135791, "grad_norm": 1.4452933073043823, "learning_rate": 2.855328811315561e-07, "loss": 0.2734, "step": 521300 }, { "epoch": 1.9329868242516812, "grad_norm": 1.0901228189468384, "learning_rate": 2.8240278147046996e-07, "loss": 0.2492, "step": 521400 }, { "epoch": 1.9333575543675714, "grad_norm": 0.7192224264144897, "learning_rate": 2.7928988446946913e-07, "loss": 0.2368, "step": 521500 }, { "epoch": 1.9337282844834616, "grad_norm": 0.6962764263153076, "learning_rate": 2.761941912056476e-07, "loss": 0.2548, "step": 521600 }, { "epoch": 1.934099014599352, "grad_norm": 1.3855719566345215, "learning_rate": 2.7311570275013743e-07, "loss": 0.2983, "step": 521700 }, { "epoch": 1.9344697447152424, "grad_norm": 0.841182291507721, "learning_rate": 2.7005442016811434e-07, "loss": 0.2976, "step": 521800 }, { "epoch": 1.9348404748311325, "grad_norm": 1.0531740188598633, "learning_rate": 2.670103445188088e-07, "loss": 0.2561, "step": 521900 }, { "epoch": 1.9352112049470227, "grad_norm": 1.9207888841629028, "learning_rate": 2.6398347685549493e-07, "loss": 0.268, "step": 522000 }, { "epoch": 1.9355819350629129, "grad_norm": 1.3035073280334473, "learning_rate": 2.609738182254906e-07, "loss": 0.2569, "step": 522100 }, { "epoch": 1.935952665178803, "grad_norm": 1.6272573471069336, "learning_rate": 2.5798136967016275e-07, "loss": 0.2618, "step": 522200 }, { "epoch": 1.9363233952946932, "grad_norm": 1.7440496683120728, "learning_rate": 2.5500613222491086e-07, "loss": 0.2885, "step": 522300 }, { "epoch": 1.9366941254105836, "grad_norm": 1.1391962766647339, "learning_rate": 2.520481069191949e-07, "loss": 0.2549, "step": 522400 }, { "epoch": 1.9370648555264738, "grad_norm": 1.554221272468567, "learning_rate": 2.491072947765238e-07, "loss": 0.2459, "step": 522500 }, { "epoch": 1.9374355856423642, "grad_norm": 1.1267379522323608, "learning_rate": 2.4618369681442267e-07, "loss": 0.2847, "step": 522600 }, { "epoch": 1.9378063157582543, "grad_norm": 1.7069565057754517, "learning_rate": 2.4327731404449325e-07, "loss": 0.2565, "step": 522700 }, { "epoch": 1.9381770458741445, "grad_norm": 1.3603123426437378, "learning_rate": 2.403881474723535e-07, "loss": 0.2606, "step": 522800 }, { "epoch": 1.9385477759900347, "grad_norm": 1.1292660236358643, "learning_rate": 2.3751619809768698e-07, "loss": 0.2941, "step": 522900 }, { "epoch": 1.9389185061059249, "grad_norm": 1.0747523307800293, "learning_rate": 2.3466146691420997e-07, "loss": 0.281, "step": 523000 }, { "epoch": 1.9392892362218153, "grad_norm": 0.3407441973686218, "learning_rate": 2.3182395490966568e-07, "loss": 0.253, "step": 523100 }, { "epoch": 1.9396599663377054, "grad_norm": 1.001861572265625, "learning_rate": 2.2900366306587428e-07, "loss": 0.257, "step": 523200 }, { "epoch": 1.9400306964535958, "grad_norm": 0.9868470430374146, "learning_rate": 2.262005923586663e-07, "loss": 0.2644, "step": 523300 }, { "epoch": 1.940401426569486, "grad_norm": 0.44324612617492676, "learning_rate": 2.234147437579215e-07, "loss": 0.2699, "step": 523400 }, { "epoch": 1.9407721566853762, "grad_norm": 1.3214995861053467, "learning_rate": 2.206461182275743e-07, "loss": 0.2451, "step": 523500 }, { "epoch": 1.9411428868012663, "grad_norm": 1.4291385412216187, "learning_rate": 2.178947167255918e-07, "loss": 0.2577, "step": 523600 }, { "epoch": 1.9415136169171565, "grad_norm": 0.8443538546562195, "learning_rate": 2.1516054020396802e-07, "loss": 0.2665, "step": 523700 }, { "epoch": 1.941884347033047, "grad_norm": 0.7954469323158264, "learning_rate": 2.1244358960875176e-07, "loss": 0.2789, "step": 523800 }, { "epoch": 1.942255077148937, "grad_norm": 1.1385616064071655, "learning_rate": 2.0974386588003548e-07, "loss": 0.299, "step": 523900 }, { "epoch": 1.9426258072648275, "grad_norm": 0.5795850157737732, "learning_rate": 2.0706136995193305e-07, "loss": 0.2492, "step": 524000 }, { "epoch": 1.9429965373807176, "grad_norm": 0.8735650181770325, "learning_rate": 2.0439610275261312e-07, "loss": 0.2692, "step": 524100 }, { "epoch": 1.9433672674966078, "grad_norm": 1.8635873794555664, "learning_rate": 2.0174806520428247e-07, "loss": 0.2593, "step": 524200 }, { "epoch": 1.943737997612498, "grad_norm": 0.671255886554718, "learning_rate": 1.9911725822317484e-07, "loss": 0.2604, "step": 524300 }, { "epoch": 1.9441087277283882, "grad_norm": 1.1133140325546265, "learning_rate": 1.9650368271957874e-07, "loss": 0.27, "step": 524400 }, { "epoch": 1.9444794578442786, "grad_norm": 1.3160934448242188, "learning_rate": 1.9390733959779861e-07, "loss": 0.2816, "step": 524500 }, { "epoch": 1.9448501879601687, "grad_norm": 0.6984559893608093, "learning_rate": 1.9132822975619358e-07, "loss": 0.2357, "step": 524600 }, { "epoch": 1.9452209180760591, "grad_norm": 0.9552838206291199, "learning_rate": 1.8876635408716093e-07, "loss": 0.2918, "step": 524700 }, { "epoch": 1.9455916481919493, "grad_norm": 0.7741240859031677, "learning_rate": 1.8622171347711937e-07, "loss": 0.2399, "step": 524800 }, { "epoch": 1.9459623783078395, "grad_norm": 1.0200904607772827, "learning_rate": 1.8369430880653681e-07, "loss": 0.2509, "step": 524900 }, { "epoch": 1.9463331084237296, "grad_norm": 0.8431881070137024, "learning_rate": 1.811841409499193e-07, "loss": 0.2694, "step": 525000 }, { "epoch": 1.9467038385396198, "grad_norm": 0.9309492707252502, "learning_rate": 1.7869121077579432e-07, "loss": 0.2757, "step": 525100 }, { "epoch": 1.9470745686555102, "grad_norm": 1.0970373153686523, "learning_rate": 1.7621551914674406e-07, "loss": 0.2596, "step": 525200 }, { "epoch": 1.9474452987714004, "grad_norm": 1.2237489223480225, "learning_rate": 1.7375706691936666e-07, "loss": 0.2481, "step": 525300 }, { "epoch": 1.9478160288872908, "grad_norm": 0.990932047367096, "learning_rate": 1.7131585494431502e-07, "loss": 0.2676, "step": 525400 }, { "epoch": 1.948186759003181, "grad_norm": 1.2484216690063477, "learning_rate": 1.6889188406625789e-07, "loss": 0.2662, "step": 525500 }, { "epoch": 1.9485574891190711, "grad_norm": 1.2114578485488892, "learning_rate": 1.6648515512391328e-07, "loss": 0.2594, "step": 525600 }, { "epoch": 1.9489282192349613, "grad_norm": 1.236899971961975, "learning_rate": 1.6409566895002615e-07, "loss": 0.2359, "step": 525700 }, { "epoch": 1.9492989493508515, "grad_norm": 0.7644926905632019, "learning_rate": 1.6172342637137406e-07, "loss": 0.257, "step": 525800 }, { "epoch": 1.9496696794667419, "grad_norm": 0.7775477766990662, "learning_rate": 1.5936842820877262e-07, "loss": 0.2627, "step": 525900 }, { "epoch": 1.950040409582632, "grad_norm": 1.268415093421936, "learning_rate": 1.5703067527707005e-07, "loss": 0.2433, "step": 526000 }, { "epoch": 1.9504111396985224, "grad_norm": 0.948347806930542, "learning_rate": 1.5471016838514153e-07, "loss": 0.2461, "step": 526100 }, { "epoch": 1.9507818698144126, "grad_norm": 0.9423221945762634, "learning_rate": 1.5240690833590032e-07, "loss": 0.2404, "step": 526200 }, { "epoch": 1.9511525999303028, "grad_norm": 0.6180194020271301, "learning_rate": 1.5012089592629785e-07, "loss": 0.277, "step": 526300 }, { "epoch": 1.951523330046193, "grad_norm": 1.5021088123321533, "learning_rate": 1.4785213194730695e-07, "loss": 0.2741, "step": 526400 }, { "epoch": 1.951894060162083, "grad_norm": 0.921113908290863, "learning_rate": 1.4560061718394413e-07, "loss": 0.2672, "step": 526500 }, { "epoch": 1.9522647902779733, "grad_norm": 0.6574723720550537, "learning_rate": 1.4336635241523622e-07, "loss": 0.248, "step": 526600 }, { "epoch": 1.9526355203938637, "grad_norm": 1.6767754554748535, "learning_rate": 1.411493384142648e-07, "loss": 0.273, "step": 526700 }, { "epoch": 1.953006250509754, "grad_norm": 1.0359675884246826, "learning_rate": 1.3894957594813295e-07, "loss": 0.2246, "step": 526800 }, { "epoch": 1.9533769806256442, "grad_norm": 0.6845061779022217, "learning_rate": 1.3676706577797628e-07, "loss": 0.3005, "step": 526900 }, { "epoch": 1.9537477107415344, "grad_norm": 0.9207216501235962, "learning_rate": 1.346018086589518e-07, "loss": 0.2579, "step": 527000 }, { "epoch": 1.9541184408574246, "grad_norm": 1.4961565732955933, "learning_rate": 1.3245380534026574e-07, "loss": 0.2879, "step": 527100 }, { "epoch": 1.9544891709733148, "grad_norm": 0.5834993720054626, "learning_rate": 1.3032305656512923e-07, "loss": 0.2777, "step": 527200 }, { "epoch": 1.954859901089205, "grad_norm": 0.8850821256637573, "learning_rate": 1.2820956307081355e-07, "loss": 0.3056, "step": 527300 }, { "epoch": 1.9552306312050953, "grad_norm": 0.8639688491821289, "learning_rate": 1.2611332558858935e-07, "loss": 0.2773, "step": 527400 }, { "epoch": 1.9556013613209855, "grad_norm": 1.2574058771133423, "learning_rate": 1.2403434484378197e-07, "loss": 0.2506, "step": 527500 }, { "epoch": 1.955972091436876, "grad_norm": 0.5194036960601807, "learning_rate": 1.2197262155572707e-07, "loss": 0.2451, "step": 527600 }, { "epoch": 1.956342821552766, "grad_norm": 1.0406614542007446, "learning_rate": 1.1992815643779297e-07, "loss": 0.2968, "step": 527700 }, { "epoch": 1.9567135516686562, "grad_norm": 1.072026252746582, "learning_rate": 1.1790095019739156e-07, "loss": 0.2526, "step": 527800 }, { "epoch": 1.9570842817845464, "grad_norm": 1.235876202583313, "learning_rate": 1.1589100353593952e-07, "loss": 0.2796, "step": 527900 }, { "epoch": 1.9574550119004366, "grad_norm": 1.3992373943328857, "learning_rate": 1.1389831714890276e-07, "loss": 0.2733, "step": 528000 }, { "epoch": 1.957825742016327, "grad_norm": 0.7872364521026611, "learning_rate": 1.1192289172575754e-07, "loss": 0.2903, "step": 528100 }, { "epoch": 1.9581964721322171, "grad_norm": 0.6681755185127258, "learning_rate": 1.0996472795001823e-07, "loss": 0.2557, "step": 528200 }, { "epoch": 1.9585672022481075, "grad_norm": 0.9715235233306885, "learning_rate": 1.0802382649923171e-07, "loss": 0.2587, "step": 528300 }, { "epoch": 1.9589379323639977, "grad_norm": 0.8085317015647888, "learning_rate": 1.0610018804495525e-07, "loss": 0.2711, "step": 528400 }, { "epoch": 1.9593086624798879, "grad_norm": 2.127732276916504, "learning_rate": 1.0419381325278421e-07, "loss": 0.2479, "step": 528500 }, { "epoch": 1.959679392595778, "grad_norm": 1.1550869941711426, "learning_rate": 1.023047027823354e-07, "loss": 0.2725, "step": 528600 }, { "epoch": 1.9600501227116682, "grad_norm": 0.7168514728546143, "learning_rate": 1.004328572872637e-07, "loss": 0.2717, "step": 528700 }, { "epoch": 1.9604208528275586, "grad_norm": 1.6938797235488892, "learning_rate": 9.857827741523995e-08, "loss": 0.2685, "step": 528800 }, { "epoch": 1.9607915829434488, "grad_norm": 2.208064079284668, "learning_rate": 9.674096380795638e-08, "loss": 0.2575, "step": 528900 }, { "epoch": 1.9611623130593392, "grad_norm": 0.5559431314468384, "learning_rate": 9.492091710114337e-08, "loss": 0.2644, "step": 529000 }, { "epoch": 1.9615330431752294, "grad_norm": 1.174103856086731, "learning_rate": 9.31181379245416e-08, "loss": 0.2585, "step": 529100 }, { "epoch": 1.9619037732911195, "grad_norm": 2.4535696506500244, "learning_rate": 9.133262690193545e-08, "loss": 0.2806, "step": 529200 }, { "epoch": 1.9622745034070097, "grad_norm": 1.1910173892974854, "learning_rate": 8.956438465112516e-08, "loss": 0.2609, "step": 529300 }, { "epoch": 1.9626452335228999, "grad_norm": 1.306016206741333, "learning_rate": 8.781341178393244e-08, "loss": 0.2384, "step": 529400 }, { "epoch": 1.9630159636387903, "grad_norm": 1.0111640691757202, "learning_rate": 8.607970890620044e-08, "loss": 0.2629, "step": 529500 }, { "epoch": 1.9633866937546804, "grad_norm": 0.8981440663337708, "learning_rate": 8.436327661781595e-08, "loss": 0.2535, "step": 529600 }, { "epoch": 1.9637574238705708, "grad_norm": 1.073011040687561, "learning_rate": 8.266411551267062e-08, "loss": 0.2616, "step": 529700 }, { "epoch": 1.964128153986461, "grad_norm": 2.778616428375244, "learning_rate": 8.098222617868856e-08, "loss": 0.2538, "step": 529800 }, { "epoch": 1.9644988841023512, "grad_norm": 1.2248859405517578, "learning_rate": 7.931760919781539e-08, "loss": 0.2691, "step": 529900 }, { "epoch": 1.9648696142182414, "grad_norm": 1.511271595954895, "learning_rate": 7.767026514601816e-08, "loss": 0.2422, "step": 530000 }, { "epoch": 1.9652403443341315, "grad_norm": 1.8893510103225708, "learning_rate": 7.604019459329647e-08, "loss": 0.2747, "step": 530100 }, { "epoch": 1.965611074450022, "grad_norm": 0.6982595324516296, "learning_rate": 7.442739810366029e-08, "loss": 0.2679, "step": 530200 }, { "epoch": 1.965981804565912, "grad_norm": 0.7671345472335815, "learning_rate": 7.283187623515208e-08, "loss": 0.2656, "step": 530300 }, { "epoch": 1.9663525346818025, "grad_norm": 0.6844850778579712, "learning_rate": 7.125362953983583e-08, "loss": 0.2601, "step": 530400 }, { "epoch": 1.9667232647976927, "grad_norm": 1.0589451789855957, "learning_rate": 6.969265856379693e-08, "loss": 0.2638, "step": 530500 }, { "epoch": 1.9670939949135828, "grad_norm": 2.502464532852173, "learning_rate": 6.81489638471422e-08, "loss": 0.2595, "step": 530600 }, { "epoch": 1.967464725029473, "grad_norm": 0.47240760922431946, "learning_rate": 6.662254592399442e-08, "loss": 0.2613, "step": 530700 }, { "epoch": 1.9678354551453632, "grad_norm": 1.0824003219604492, "learning_rate": 6.511340532251997e-08, "loss": 0.2919, "step": 530800 }, { "epoch": 1.9682061852612533, "grad_norm": 0.8804799318313599, "learning_rate": 6.362154256487896e-08, "loss": 0.2436, "step": 530900 }, { "epoch": 1.9685769153771437, "grad_norm": 0.27883675694465637, "learning_rate": 6.214695816727511e-08, "loss": 0.2838, "step": 531000 }, { "epoch": 1.9689476454930341, "grad_norm": 2.0598297119140625, "learning_rate": 6.068965263992254e-08, "loss": 0.272, "step": 531100 }, { "epoch": 1.9693183756089243, "grad_norm": 0.7119441032409668, "learning_rate": 5.924962648706234e-08, "loss": 0.2433, "step": 531200 }, { "epoch": 1.9696891057248145, "grad_norm": 0.7738451361656189, "learning_rate": 5.7826880206957036e-08, "loss": 0.2531, "step": 531300 }, { "epoch": 1.9700598358407047, "grad_norm": 0.20328280329704285, "learning_rate": 5.6421414291873974e-08, "loss": 0.2811, "step": 531400 }, { "epoch": 1.9704305659565948, "grad_norm": 0.7742271423339844, "learning_rate": 5.50332292281297e-08, "loss": 0.2938, "step": 531500 }, { "epoch": 1.970801296072485, "grad_norm": 1.088539958000183, "learning_rate": 5.366232549604e-08, "loss": 0.248, "step": 531600 }, { "epoch": 1.9711720261883754, "grad_norm": 1.1020554304122925, "learning_rate": 5.230870356994766e-08, "loss": 0.2742, "step": 531700 }, { "epoch": 1.9715427563042658, "grad_norm": 1.4092004299163818, "learning_rate": 5.097236391822247e-08, "loss": 0.2691, "step": 531800 }, { "epoch": 1.971913486420156, "grad_norm": 0.5208079218864441, "learning_rate": 4.965330700323345e-08, "loss": 0.2694, "step": 531900 }, { "epoch": 1.9722842165360461, "grad_norm": 0.8766278624534607, "learning_rate": 4.835153328139885e-08, "loss": 0.2829, "step": 532000 }, { "epoch": 1.9726549466519363, "grad_norm": 1.678202509880066, "learning_rate": 4.7067043203136154e-08, "loss": 0.2447, "step": 532100 }, { "epoch": 1.9730256767678265, "grad_norm": 1.5930906534194946, "learning_rate": 4.579983721288983e-08, "loss": 0.2822, "step": 532200 }, { "epoch": 1.9733964068837166, "grad_norm": 2.2611236572265625, "learning_rate": 4.454991574912026e-08, "loss": 0.2677, "step": 532300 }, { "epoch": 1.973767136999607, "grad_norm": 1.3755618333816528, "learning_rate": 4.3317279244309286e-08, "loss": 0.2422, "step": 532400 }, { "epoch": 1.9741378671154972, "grad_norm": 0.9833665490150452, "learning_rate": 4.2101928124965714e-08, "loss": 0.2883, "step": 532500 }, { "epoch": 1.9745085972313876, "grad_norm": 0.9862990379333496, "learning_rate": 4.090386281159764e-08, "loss": 0.257, "step": 532600 }, { "epoch": 1.9748793273472778, "grad_norm": 1.3692221641540527, "learning_rate": 3.972308371875677e-08, "loss": 0.252, "step": 532700 }, { "epoch": 1.975250057463168, "grad_norm": 0.8539320230484009, "learning_rate": 3.855959125499409e-08, "loss": 0.2709, "step": 532800 }, { "epoch": 1.9756207875790581, "grad_norm": 0.7126438617706299, "learning_rate": 3.741338582288756e-08, "loss": 0.2685, "step": 532900 }, { "epoch": 1.9759915176949483, "grad_norm": 0.32471904158592224, "learning_rate": 3.6284467819036605e-08, "loss": 0.2445, "step": 533000 }, { "epoch": 1.9763622478108387, "grad_norm": 0.4666435718536377, "learning_rate": 3.517283763405099e-08, "loss": 0.2688, "step": 533100 }, { "epoch": 1.9767329779267289, "grad_norm": 1.1601201295852661, "learning_rate": 3.407849565256749e-08, "loss": 0.2588, "step": 533200 }, { "epoch": 1.9771037080426193, "grad_norm": 1.2195132970809937, "learning_rate": 3.3001442253227654e-08, "loss": 0.279, "step": 533300 }, { "epoch": 1.9774744381585094, "grad_norm": 1.1505283117294312, "learning_rate": 3.1941677808711154e-08, "loss": 0.269, "step": 533400 }, { "epoch": 1.9778451682743996, "grad_norm": 0.8945961594581604, "learning_rate": 3.0899202685702455e-08, "loss": 0.2671, "step": 533500 }, { "epoch": 1.9782158983902898, "grad_norm": 1.019197702407837, "learning_rate": 2.987401724489636e-08, "loss": 0.2495, "step": 533600 }, { "epoch": 1.97858662850618, "grad_norm": 0.41027262806892395, "learning_rate": 2.8866121841025774e-08, "loss": 0.2577, "step": 533700 }, { "epoch": 1.9789573586220703, "grad_norm": 0.31804269552230835, "learning_rate": 2.7875516822822855e-08, "loss": 0.2428, "step": 533800 }, { "epoch": 1.9793280887379605, "grad_norm": 0.8226633667945862, "learning_rate": 2.690220253304676e-08, "loss": 0.2593, "step": 533900 }, { "epoch": 1.979698818853851, "grad_norm": 0.3580420911312103, "learning_rate": 2.5946179308472542e-08, "loss": 0.2405, "step": 534000 }, { "epoch": 1.980069548969741, "grad_norm": 1.2143442630767822, "learning_rate": 2.5007447479891146e-08, "loss": 0.2875, "step": 534100 }, { "epoch": 1.9804402790856312, "grad_norm": 1.038336157798767, "learning_rate": 2.408600737210942e-08, "loss": 0.2553, "step": 534200 }, { "epoch": 1.9808110092015214, "grad_norm": 0.7830453515052795, "learning_rate": 2.3181859303955666e-08, "loss": 0.268, "step": 534300 }, { "epoch": 1.9811817393174116, "grad_norm": 1.5607683658599854, "learning_rate": 2.2295003588262975e-08, "loss": 0.2393, "step": 534400 }, { "epoch": 1.981552469433302, "grad_norm": 0.7774990797042847, "learning_rate": 2.142544053190254e-08, "loss": 0.2595, "step": 534500 }, { "epoch": 1.9819231995491922, "grad_norm": 0.9557089805603027, "learning_rate": 2.0573170435739255e-08, "loss": 0.2594, "step": 534600 }, { "epoch": 1.9822939296650826, "grad_norm": 1.0173733234405518, "learning_rate": 1.9738193594676103e-08, "loss": 0.2435, "step": 534700 }, { "epoch": 1.9826646597809727, "grad_norm": 0.8246684670448303, "learning_rate": 1.892051029760977e-08, "loss": 0.2793, "step": 534800 }, { "epoch": 1.983035389896863, "grad_norm": 0.7758829593658447, "learning_rate": 1.81201208274695e-08, "loss": 0.2294, "step": 534900 }, { "epoch": 1.983406120012753, "grad_norm": 0.5600193738937378, "learning_rate": 1.7337025461194868e-08, "loss": 0.2698, "step": 535000 }, { "epoch": 1.9837768501286432, "grad_norm": 1.0241050720214844, "learning_rate": 1.6571224469746905e-08, "loss": 0.2489, "step": 535100 }, { "epoch": 1.9841475802445336, "grad_norm": 0.5697911977767944, "learning_rate": 1.582271811809144e-08, "loss": 0.2708, "step": 535200 }, { "epoch": 1.9845183103604238, "grad_norm": 0.7136486172676086, "learning_rate": 1.5091506665226853e-08, "loss": 0.2661, "step": 535300 }, { "epoch": 1.9848890404763142, "grad_norm": 0.7559804916381836, "learning_rate": 1.4377590364150761e-08, "loss": 0.2684, "step": 535400 }, { "epoch": 1.9852597705922044, "grad_norm": 1.1117522716522217, "learning_rate": 1.3680969461882242e-08, "loss": 0.2759, "step": 535500 }, { "epoch": 1.9856305007080945, "grad_norm": 0.39842909574508667, "learning_rate": 1.3001644199461815e-08, "loss": 0.2668, "step": 535600 }, { "epoch": 1.9860012308239847, "grad_norm": 0.6799736618995667, "learning_rate": 1.2339614811940348e-08, "loss": 0.2904, "step": 535700 }, { "epoch": 1.986371960939875, "grad_norm": 2.2267022132873535, "learning_rate": 1.1694881528384605e-08, "loss": 0.2718, "step": 535800 }, { "epoch": 1.986742691055765, "grad_norm": 0.9197942614555359, "learning_rate": 1.1067444571871699e-08, "loss": 0.2721, "step": 535900 }, { "epoch": 1.9871134211716555, "grad_norm": 0.6634698510169983, "learning_rate": 1.0457304159511294e-08, "loss": 0.2879, "step": 536000 }, { "epoch": 1.9874841512875459, "grad_norm": 1.6803314685821533, "learning_rate": 9.864460502401196e-09, "loss": 0.2912, "step": 536100 }, { "epoch": 1.987854881403436, "grad_norm": 0.846348226070404, "learning_rate": 9.288913805682863e-09, "loss": 0.2609, "step": 536200 }, { "epoch": 1.9882256115193262, "grad_norm": 0.37686920166015625, "learning_rate": 8.730664268497001e-09, "loss": 0.2788, "step": 536300 }, { "epoch": 1.9885963416352164, "grad_norm": 1.3998162746429443, "learning_rate": 8.189712084000211e-09, "loss": 0.2689, "step": 536400 }, { "epoch": 1.9889670717511065, "grad_norm": 1.1240495443344116, "learning_rate": 7.666057439359441e-09, "loss": 0.2456, "step": 536500 }, { "epoch": 1.9893378018669967, "grad_norm": 0.9786272048950195, "learning_rate": 7.1597005157741926e-09, "loss": 0.274, "step": 536600 }, { "epoch": 1.989708531982887, "grad_norm": 0.9902150630950928, "learning_rate": 6.670641488443208e-09, "loss": 0.2331, "step": 536700 }, { "epoch": 1.9900792620987773, "grad_norm": 2.136213541030884, "learning_rate": 6.1988805265866815e-09, "loss": 0.2585, "step": 536800 }, { "epoch": 1.9904499922146677, "grad_norm": 1.240695595741272, "learning_rate": 5.744417793429602e-09, "loss": 0.256, "step": 536900 }, { "epoch": 1.9908207223305578, "grad_norm": 0.4733426570892334, "learning_rate": 5.3072534462295095e-09, "loss": 0.2493, "step": 537000 }, { "epoch": 1.991191452446448, "grad_norm": 1.3026540279388428, "learning_rate": 4.88738763624319e-09, "loss": 0.2352, "step": 537100 }, { "epoch": 1.9915621825623382, "grad_norm": 0.6901194453239441, "learning_rate": 4.484820508748877e-09, "loss": 0.2675, "step": 537200 }, { "epoch": 1.9919329126782284, "grad_norm": 1.647271752357483, "learning_rate": 4.0995522030351505e-09, "loss": 0.2826, "step": 537300 }, { "epoch": 1.9923036427941188, "grad_norm": 1.050615668296814, "learning_rate": 3.7315828524120414e-09, "loss": 0.2552, "step": 537400 }, { "epoch": 1.992674372910009, "grad_norm": 0.7285329103469849, "learning_rate": 3.3809125841943735e-09, "loss": 0.2533, "step": 537500 }, { "epoch": 1.9930451030258993, "grad_norm": 0.7338119149208069, "learning_rate": 3.0475415197239733e-09, "loss": 0.2759, "step": 537600 }, { "epoch": 1.9934158331417895, "grad_norm": 1.1742608547210693, "learning_rate": 2.7314697743419106e-09, "loss": 0.2791, "step": 537700 }, { "epoch": 1.9937865632576797, "grad_norm": 2.3422696590423584, "learning_rate": 2.4326974574162554e-09, "loss": 0.2502, "step": 537800 }, { "epoch": 1.9941572933735698, "grad_norm": 1.305364966392517, "learning_rate": 2.1512246723254246e-09, "loss": 0.2472, "step": 537900 }, { "epoch": 1.99452802348946, "grad_norm": 0.9575023055076599, "learning_rate": 1.8870515164526316e-09, "loss": 0.2758, "step": 538000 }, { "epoch": 1.9948987536053504, "grad_norm": 0.6690237522125244, "learning_rate": 1.6401780812136391e-09, "loss": 0.2759, "step": 538100 }, { "epoch": 1.9952694837212406, "grad_norm": 1.0878537893295288, "learning_rate": 1.4106044520290073e-09, "loss": 0.249, "step": 538200 }, { "epoch": 1.995640213837131, "grad_norm": 1.406073808670044, "learning_rate": 1.1983307083240913e-09, "loss": 0.2738, "step": 538300 }, { "epoch": 1.9960109439530211, "grad_norm": 0.41225093603134155, "learning_rate": 1.0033569235512463e-09, "loss": 0.2424, "step": 538400 }, { "epoch": 1.9963816740689113, "grad_norm": 0.7758455872535706, "learning_rate": 8.256831651731744e-10, "loss": 0.2626, "step": 538500 }, { "epoch": 1.9967524041848015, "grad_norm": 0.5801597833633423, "learning_rate": 6.653094946684757e-10, "loss": 0.2351, "step": 538600 }, { "epoch": 1.9971231343006917, "grad_norm": 0.2951740026473999, "learning_rate": 5.222359675260969e-10, "loss": 0.2727, "step": 538700 }, { "epoch": 1.997493864416582, "grad_norm": 1.107856273651123, "learning_rate": 3.964626332508825e-10, "loss": 0.2415, "step": 538800 }, { "epoch": 1.9978645945324722, "grad_norm": 1.5402660369873047, "learning_rate": 2.879895353580242e-10, "loss": 0.2702, "step": 538900 }, { "epoch": 1.9982353246483626, "grad_norm": 1.4619864225387573, "learning_rate": 1.9681671138416237e-10, "loss": 0.2726, "step": 539000 }, { "epoch": 1.9986060547642528, "grad_norm": 0.7005214691162109, "learning_rate": 1.2294419287628423e-10, "loss": 0.2992, "step": 539100 }, { "epoch": 1.998976784880143, "grad_norm": 1.1344046592712402, "learning_rate": 6.637200539172384e-11, "loss": 0.2712, "step": 539200 }, { "epoch": 1.9993475149960331, "grad_norm": 1.4308596849441528, "learning_rate": 2.710016850371311e-11, "loss": 0.2766, "step": 539300 }, { "epoch": 1.9997182451119233, "grad_norm": 5.191880226135254, "learning_rate": 5.128695801381867e-12, "loss": 0.2642, "step": 539400 } ], "logging_steps": 100, "max_steps": 539476, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.415140669038081e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }