{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6290752946581611, "eval_steps": 500, "global_step": 35600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008835327172165185, "grad_norm": 5.665971279144287, "learning_rate": 4.3286219081272084e-07, "loss": 1.3738, "step": 50 }, { "epoch": 0.001767065434433037, "grad_norm": 5.6161651611328125, "learning_rate": 8.745583038869259e-07, "loss": 1.1661, "step": 100 }, { "epoch": 0.0026505981516495554, "grad_norm": 7.866199970245361, "learning_rate": 1.3162544169611309e-06, "loss": 1.2107, "step": 150 }, { "epoch": 0.003534130868866074, "grad_norm": 5.07379674911499, "learning_rate": 1.7579505300353357e-06, "loss": 0.9855, "step": 200 }, { "epoch": 0.004417663586082593, "grad_norm": 3.2607851028442383, "learning_rate": 2.199646643109541e-06, "loss": 0.9431, "step": 250 }, { "epoch": 0.005301196303299111, "grad_norm": 6.517599105834961, "learning_rate": 2.6413427561837457e-06, "loss": 0.8566, "step": 300 }, { "epoch": 0.00618472902051563, "grad_norm": 2.8523333072662354, "learning_rate": 3.0830388692579506e-06, "loss": 0.8697, "step": 350 }, { "epoch": 0.007068261737732148, "grad_norm": 3.460226058959961, "learning_rate": 3.5247349823321555e-06, "loss": 0.8099, "step": 400 }, { "epoch": 0.007951794454948667, "grad_norm": 3.2528891563415527, "learning_rate": 3.966431095406361e-06, "loss": 0.766, "step": 450 }, { "epoch": 0.008835327172165185, "grad_norm": 4.1086039543151855, "learning_rate": 4.408127208480566e-06, "loss": 0.7402, "step": 500 }, { "epoch": 0.009718859889381704, "grad_norm": 3.8160510063171387, "learning_rate": 4.849823321554771e-06, "loss": 0.8769, "step": 550 }, { "epoch": 0.010602392606598222, "grad_norm": 2.901653289794922, "learning_rate": 5.291519434628975e-06, "loss": 0.6827, "step": 600 }, { "epoch": 0.011485925323814742, "grad_norm": 2.5824739933013916, "learning_rate": 5.73321554770318e-06, "loss": 0.7252, "step": 650 }, { "epoch": 0.01236945804103126, "grad_norm": 2.586138963699341, "learning_rate": 6.174911660777385e-06, "loss": 0.7701, "step": 700 }, { "epoch": 0.013252990758247778, "grad_norm": 2.3450210094451904, "learning_rate": 6.6166077738515904e-06, "loss": 0.7525, "step": 750 }, { "epoch": 0.014136523475464296, "grad_norm": 2.7902042865753174, "learning_rate": 7.058303886925795e-06, "loss": 0.7097, "step": 800 }, { "epoch": 0.015020056192680814, "grad_norm": 3.297929286956787, "learning_rate": 7.5e-06, "loss": 0.7575, "step": 850 }, { "epoch": 0.015903588909897334, "grad_norm": 4.028406143188477, "learning_rate": 7.941696113074205e-06, "loss": 0.6899, "step": 900 }, { "epoch": 0.016787121627113853, "grad_norm": 2.2513041496276855, "learning_rate": 8.38339222614841e-06, "loss": 0.6655, "step": 950 }, { "epoch": 0.01767065434433037, "grad_norm": 2.402355670928955, "learning_rate": 8.825088339222614e-06, "loss": 0.6601, "step": 1000 }, { "epoch": 0.01855418706154689, "grad_norm": 4.492621898651123, "learning_rate": 9.26678445229682e-06, "loss": 0.6925, "step": 1050 }, { "epoch": 0.019437719778763407, "grad_norm": 3.8099517822265625, "learning_rate": 9.708480565371025e-06, "loss": 0.6169, "step": 1100 }, { "epoch": 0.020321252495979925, "grad_norm": 4.58193826675415, "learning_rate": 1.0150176678445231e-05, "loss": 0.6367, "step": 1150 }, { "epoch": 0.021204785213196443, "grad_norm": 4.745123863220215, "learning_rate": 1.0591872791519434e-05, "loss": 0.615, "step": 1200 }, { "epoch": 0.02208831793041296, "grad_norm": 3.260239601135254, "learning_rate": 1.103356890459364e-05, "loss": 0.6869, "step": 1250 }, { "epoch": 0.022971850647629483, "grad_norm": 2.485383987426758, "learning_rate": 1.1475265017667845e-05, "loss": 0.7527, "step": 1300 }, { "epoch": 0.023855383364846, "grad_norm": 2.26680326461792, "learning_rate": 1.191696113074205e-05, "loss": 0.6124, "step": 1350 }, { "epoch": 0.02473891608206252, "grad_norm": 2.348688840866089, "learning_rate": 1.2358657243816255e-05, "loss": 0.6511, "step": 1400 }, { "epoch": 0.025622448799279038, "grad_norm": 2.770859956741333, "learning_rate": 1.280035335689046e-05, "loss": 0.7047, "step": 1450 }, { "epoch": 0.026505981516495556, "grad_norm": 3.188656806945801, "learning_rate": 1.3242049469964666e-05, "loss": 0.6639, "step": 1500 }, { "epoch": 0.027389514233712074, "grad_norm": 2.7158899307250977, "learning_rate": 1.368374558303887e-05, "loss": 0.6795, "step": 1550 }, { "epoch": 0.028273046950928592, "grad_norm": 2.7986080646514893, "learning_rate": 1.4125441696113076e-05, "loss": 0.6341, "step": 1600 }, { "epoch": 0.02915657966814511, "grad_norm": 1.9698214530944824, "learning_rate": 1.456713780918728e-05, "loss": 0.6031, "step": 1650 }, { "epoch": 0.03004011238536163, "grad_norm": 2.495985507965088, "learning_rate": 1.5008833922261484e-05, "loss": 0.5959, "step": 1700 }, { "epoch": 0.030923645102578147, "grad_norm": 2.990360975265503, "learning_rate": 1.545053003533569e-05, "loss": 0.6412, "step": 1750 }, { "epoch": 0.03180717781979467, "grad_norm": 3.658212184906006, "learning_rate": 1.5892226148409894e-05, "loss": 0.5065, "step": 1800 }, { "epoch": 0.03269071053701118, "grad_norm": 2.010875940322876, "learning_rate": 1.63339222614841e-05, "loss": 0.5611, "step": 1850 }, { "epoch": 0.033574243254227705, "grad_norm": 2.408937692642212, "learning_rate": 1.6775618374558306e-05, "loss": 0.5298, "step": 1900 }, { "epoch": 0.03445777597144422, "grad_norm": 2.3144407272338867, "learning_rate": 1.721731448763251e-05, "loss": 0.5759, "step": 1950 }, { "epoch": 0.03534130868866074, "grad_norm": 2.944115400314331, "learning_rate": 1.7659010600706715e-05, "loss": 0.5782, "step": 2000 }, { "epoch": 0.03622484140587726, "grad_norm": 2.3239428997039795, "learning_rate": 1.810070671378092e-05, "loss": 0.5221, "step": 2050 }, { "epoch": 0.03710837412309378, "grad_norm": 4.565939426422119, "learning_rate": 1.8542402826855124e-05, "loss": 0.5966, "step": 2100 }, { "epoch": 0.0379919068403103, "grad_norm": 2.6089091300964355, "learning_rate": 1.898409893992933e-05, "loss": 0.5989, "step": 2150 }, { "epoch": 0.038875439557526814, "grad_norm": 2.4395945072174072, "learning_rate": 1.9425795053003533e-05, "loss": 0.5097, "step": 2200 }, { "epoch": 0.039758972274743336, "grad_norm": 2.274600028991699, "learning_rate": 1.986749116607774e-05, "loss": 0.4934, "step": 2250 }, { "epoch": 0.04064250499195985, "grad_norm": 2.393251895904541, "learning_rate": 2.0309187279151945e-05, "loss": 0.5354, "step": 2300 }, { "epoch": 0.04152603770917637, "grad_norm": 2.613900899887085, "learning_rate": 2.075088339222615e-05, "loss": 0.5236, "step": 2350 }, { "epoch": 0.04240957042639289, "grad_norm": 2.233302116394043, "learning_rate": 2.1192579505300354e-05, "loss": 0.5057, "step": 2400 }, { "epoch": 0.04329310314360941, "grad_norm": 2.2634503841400146, "learning_rate": 2.163427561837456e-05, "loss": 0.5448, "step": 2450 }, { "epoch": 0.04417663586082592, "grad_norm": 1.6744658946990967, "learning_rate": 2.2075971731448763e-05, "loss": 0.5418, "step": 2500 }, { "epoch": 0.045060168578042445, "grad_norm": 2.9320178031921387, "learning_rate": 2.2517667844522968e-05, "loss": 0.5944, "step": 2550 }, { "epoch": 0.04594370129525897, "grad_norm": 2.2643797397613525, "learning_rate": 2.2959363957597176e-05, "loss": 0.4945, "step": 2600 }, { "epoch": 0.04682723401247548, "grad_norm": 2.389902114868164, "learning_rate": 2.340106007067138e-05, "loss": 0.5225, "step": 2650 }, { "epoch": 0.047710766729692, "grad_norm": 2.2676665782928467, "learning_rate": 2.3842756183745584e-05, "loss": 0.5661, "step": 2700 }, { "epoch": 0.04859429944690852, "grad_norm": 2.340926170349121, "learning_rate": 2.428445229681979e-05, "loss": 0.6125, "step": 2750 }, { "epoch": 0.04947783216412504, "grad_norm": 1.925943374633789, "learning_rate": 2.4726148409893997e-05, "loss": 0.5105, "step": 2800 }, { "epoch": 0.050361364881341554, "grad_norm": 3.1281192302703857, "learning_rate": 2.5167844522968198e-05, "loss": 0.5893, "step": 2850 }, { "epoch": 0.051244897598558076, "grad_norm": 2.345649242401123, "learning_rate": 2.5609540636042406e-05, "loss": 0.545, "step": 2900 }, { "epoch": 0.05212843031577459, "grad_norm": 2.9023561477661133, "learning_rate": 2.605123674911661e-05, "loss": 0.5299, "step": 2950 }, { "epoch": 0.05301196303299111, "grad_norm": 2.491269588470459, "learning_rate": 2.649293286219081e-05, "loss": 0.5186, "step": 3000 }, { "epoch": 0.05389549575020763, "grad_norm": 1.842517375946045, "learning_rate": 2.693462897526502e-05, "loss": 0.5259, "step": 3050 }, { "epoch": 0.05477902846742415, "grad_norm": 3.319514274597168, "learning_rate": 2.7376325088339223e-05, "loss": 0.6663, "step": 3100 }, { "epoch": 0.05566256118464067, "grad_norm": 2.7143654823303223, "learning_rate": 2.781802120141343e-05, "loss": 0.5152, "step": 3150 }, { "epoch": 0.056546093901857185, "grad_norm": 2.8187732696533203, "learning_rate": 2.8259717314487632e-05, "loss": 0.5417, "step": 3200 }, { "epoch": 0.057429626619073706, "grad_norm": 2.8348097801208496, "learning_rate": 2.870141342756184e-05, "loss": 0.5039, "step": 3250 }, { "epoch": 0.05831315933629022, "grad_norm": 3.6297833919525146, "learning_rate": 2.9143109540636045e-05, "loss": 0.4647, "step": 3300 }, { "epoch": 0.05919669205350674, "grad_norm": 2.6729063987731934, "learning_rate": 2.9584805653710253e-05, "loss": 0.4652, "step": 3350 }, { "epoch": 0.06008022477072326, "grad_norm": 3.030548572540283, "learning_rate": 3.0026501766784454e-05, "loss": 0.4914, "step": 3400 }, { "epoch": 0.06096375748793978, "grad_norm": 1.844643235206604, "learning_rate": 3.0468197879858658e-05, "loss": 0.5449, "step": 3450 }, { "epoch": 0.061847290205156294, "grad_norm": 1.6973118782043457, "learning_rate": 3.090989399293286e-05, "loss": 0.5072, "step": 3500 }, { "epoch": 0.06273082292237281, "grad_norm": 2.626692295074463, "learning_rate": 3.135159010600707e-05, "loss": 0.5639, "step": 3550 }, { "epoch": 0.06361435563958934, "grad_norm": 2.971773624420166, "learning_rate": 3.179328621908128e-05, "loss": 0.4729, "step": 3600 }, { "epoch": 0.06449788835680585, "grad_norm": 2.134610414505005, "learning_rate": 3.2234982332155476e-05, "loss": 0.6047, "step": 3650 }, { "epoch": 0.06538142107402237, "grad_norm": 1.8596552610397339, "learning_rate": 3.267667844522969e-05, "loss": 0.5369, "step": 3700 }, { "epoch": 0.0662649537912389, "grad_norm": 2.5137698650360107, "learning_rate": 3.311837455830389e-05, "loss": 0.5014, "step": 3750 }, { "epoch": 0.06714848650845541, "grad_norm": 2.8211522102355957, "learning_rate": 3.356007067137809e-05, "loss": 0.5128, "step": 3800 }, { "epoch": 0.06803201922567192, "grad_norm": 2.095426559448242, "learning_rate": 3.40017667844523e-05, "loss": 0.5345, "step": 3850 }, { "epoch": 0.06891555194288844, "grad_norm": 2.1965081691741943, "learning_rate": 3.4443462897526505e-05, "loss": 0.479, "step": 3900 }, { "epoch": 0.06979908466010497, "grad_norm": 2.1722958087921143, "learning_rate": 3.488515901060071e-05, "loss": 0.5652, "step": 3950 }, { "epoch": 0.07068261737732148, "grad_norm": 2.7183449268341064, "learning_rate": 3.5326855123674914e-05, "loss": 0.5272, "step": 4000 }, { "epoch": 0.071566150094538, "grad_norm": 2.356076717376709, "learning_rate": 3.576855123674912e-05, "loss": 0.4904, "step": 4050 }, { "epoch": 0.07244968281175453, "grad_norm": 1.7549006938934326, "learning_rate": 3.621024734982332e-05, "loss": 0.4755, "step": 4100 }, { "epoch": 0.07333321552897104, "grad_norm": 2.0377912521362305, "learning_rate": 3.665194346289753e-05, "loss": 0.4897, "step": 4150 }, { "epoch": 0.07421674824618756, "grad_norm": 2.4711716175079346, "learning_rate": 3.709363957597173e-05, "loss": 0.4679, "step": 4200 }, { "epoch": 0.07510028096340407, "grad_norm": 2.700162649154663, "learning_rate": 3.7535335689045936e-05, "loss": 0.4712, "step": 4250 }, { "epoch": 0.0759838136806206, "grad_norm": 1.9648590087890625, "learning_rate": 3.797703180212015e-05, "loss": 0.4779, "step": 4300 }, { "epoch": 0.07686734639783711, "grad_norm": 2.4238970279693604, "learning_rate": 3.8418727915194345e-05, "loss": 0.4463, "step": 4350 }, { "epoch": 0.07775087911505363, "grad_norm": 1.745356798171997, "learning_rate": 3.8860424028268556e-05, "loss": 0.4917, "step": 4400 }, { "epoch": 0.07863441183227014, "grad_norm": 5.889612197875977, "learning_rate": 3.930212014134276e-05, "loss": 0.5572, "step": 4450 }, { "epoch": 0.07951794454948667, "grad_norm": 2.7529609203338623, "learning_rate": 3.9743816254416965e-05, "loss": 0.4553, "step": 4500 }, { "epoch": 0.08040147726670319, "grad_norm": 2.4175944328308105, "learning_rate": 4.018551236749117e-05, "loss": 0.4598, "step": 4550 }, { "epoch": 0.0812850099839197, "grad_norm": 2.2330217361450195, "learning_rate": 4.0627208480565374e-05, "loss": 0.5445, "step": 4600 }, { "epoch": 0.08216854270113623, "grad_norm": 2.4177329540252686, "learning_rate": 4.106890459363958e-05, "loss": 0.4537, "step": 4650 }, { "epoch": 0.08305207541835274, "grad_norm": 2.6188764572143555, "learning_rate": 4.151060070671378e-05, "loss": 0.5158, "step": 4700 }, { "epoch": 0.08393560813556926, "grad_norm": 3.5044455528259277, "learning_rate": 4.195229681978799e-05, "loss": 0.4598, "step": 4750 }, { "epoch": 0.08481914085278577, "grad_norm": 2.2751505374908447, "learning_rate": 4.239399293286219e-05, "loss": 0.4662, "step": 4800 }, { "epoch": 0.0857026735700023, "grad_norm": 2.0289080142974854, "learning_rate": 4.28356890459364e-05, "loss": 0.459, "step": 4850 }, { "epoch": 0.08658620628721882, "grad_norm": 2.6102516651153564, "learning_rate": 4.32773851590106e-05, "loss": 0.4275, "step": 4900 }, { "epoch": 0.08746973900443533, "grad_norm": 2.5842251777648926, "learning_rate": 4.3719081272084805e-05, "loss": 0.5575, "step": 4950 }, { "epoch": 0.08835327172165185, "grad_norm": 3.6427652835845947, "learning_rate": 4.4160777385159016e-05, "loss": 0.4197, "step": 5000 }, { "epoch": 0.08923680443886838, "grad_norm": 1.8962676525115967, "learning_rate": 4.4602473498233214e-05, "loss": 0.4525, "step": 5050 }, { "epoch": 0.09012033715608489, "grad_norm": 2.1373822689056396, "learning_rate": 4.5044169611307425e-05, "loss": 0.4469, "step": 5100 }, { "epoch": 0.0910038698733014, "grad_norm": 5.542126178741455, "learning_rate": 4.548586572438163e-05, "loss": 0.5283, "step": 5150 }, { "epoch": 0.09188740259051793, "grad_norm": 2.4414310455322266, "learning_rate": 4.5927561837455834e-05, "loss": 0.4826, "step": 5200 }, { "epoch": 0.09277093530773445, "grad_norm": 3.52422833442688, "learning_rate": 4.636925795053004e-05, "loss": 0.3895, "step": 5250 }, { "epoch": 0.09365446802495096, "grad_norm": 2.1975631713867188, "learning_rate": 4.681095406360424e-05, "loss": 0.4873, "step": 5300 }, { "epoch": 0.09453800074216748, "grad_norm": 3.4910616874694824, "learning_rate": 4.725265017667845e-05, "loss": 0.4895, "step": 5350 }, { "epoch": 0.095421533459384, "grad_norm": 2.1225690841674805, "learning_rate": 4.769434628975265e-05, "loss": 0.4686, "step": 5400 }, { "epoch": 0.09630506617660052, "grad_norm": 2.2319257259368896, "learning_rate": 4.8136042402826856e-05, "loss": 0.4723, "step": 5450 }, { "epoch": 0.09718859889381704, "grad_norm": 2.2340879440307617, "learning_rate": 4.857773851590106e-05, "loss": 0.5258, "step": 5500 }, { "epoch": 0.09807213161103355, "grad_norm": 3.2808139324188232, "learning_rate": 4.901943462897527e-05, "loss": 0.4851, "step": 5550 }, { "epoch": 0.09895566432825008, "grad_norm": 2.4828484058380127, "learning_rate": 4.946113074204947e-05, "loss": 0.5311, "step": 5600 }, { "epoch": 0.0998391970454666, "grad_norm": 1.7307246923446655, "learning_rate": 4.990282685512368e-05, "loss": 0.411, "step": 5650 }, { "epoch": 0.10072272976268311, "grad_norm": 1.9073278903961182, "learning_rate": 4.996171290569595e-05, "loss": 0.4184, "step": 5700 }, { "epoch": 0.10160626247989964, "grad_norm": 1.8571208715438843, "learning_rate": 4.9912626887357406e-05, "loss": 0.4071, "step": 5750 }, { "epoch": 0.10248979519711615, "grad_norm": 1.7524621486663818, "learning_rate": 4.986354086901887e-05, "loss": 0.4712, "step": 5800 }, { "epoch": 0.10337332791433267, "grad_norm": 4.2943434715271, "learning_rate": 4.9814454850680335e-05, "loss": 0.4912, "step": 5850 }, { "epoch": 0.10425686063154918, "grad_norm": 2.398043632507324, "learning_rate": 4.97653688323418e-05, "loss": 0.5589, "step": 5900 }, { "epoch": 0.10514039334876571, "grad_norm": 1.9587973356246948, "learning_rate": 4.9716282814003265e-05, "loss": 0.4507, "step": 5950 }, { "epoch": 0.10602392606598222, "grad_norm": 2.0629475116729736, "learning_rate": 4.966719679566473e-05, "loss": 0.5429, "step": 6000 }, { "epoch": 0.10690745878319874, "grad_norm": 1.6127039194107056, "learning_rate": 4.961811077732619e-05, "loss": 0.3789, "step": 6050 }, { "epoch": 0.10779099150041525, "grad_norm": 2.230015993118286, "learning_rate": 4.956902475898765e-05, "loss": 0.3949, "step": 6100 }, { "epoch": 0.10867452421763178, "grad_norm": 1.9963310956954956, "learning_rate": 4.9519938740649116e-05, "loss": 0.4491, "step": 6150 }, { "epoch": 0.1095580569348483, "grad_norm": 2.2731542587280273, "learning_rate": 4.947085272231058e-05, "loss": 0.435, "step": 6200 }, { "epoch": 0.11044158965206481, "grad_norm": 2.447551727294922, "learning_rate": 4.9421766703972046e-05, "loss": 0.3865, "step": 6250 }, { "epoch": 0.11132512236928134, "grad_norm": 2.126950740814209, "learning_rate": 4.9372680685633504e-05, "loss": 0.4175, "step": 6300 }, { "epoch": 0.11220865508649785, "grad_norm": 2.22995924949646, "learning_rate": 4.932359466729497e-05, "loss": 0.4387, "step": 6350 }, { "epoch": 0.11309218780371437, "grad_norm": 1.5801736116409302, "learning_rate": 4.927450864895643e-05, "loss": 0.4554, "step": 6400 }, { "epoch": 0.11397572052093088, "grad_norm": 4.113645553588867, "learning_rate": 4.92254226306179e-05, "loss": 0.581, "step": 6450 }, { "epoch": 0.11485925323814741, "grad_norm": 1.6027569770812988, "learning_rate": 4.917633661227936e-05, "loss": 0.4746, "step": 6500 }, { "epoch": 0.11574278595536393, "grad_norm": 2.0555272102355957, "learning_rate": 4.912725059394083e-05, "loss": 0.4511, "step": 6550 }, { "epoch": 0.11662631867258044, "grad_norm": 2.6827495098114014, "learning_rate": 4.9078164575602285e-05, "loss": 0.3871, "step": 6600 }, { "epoch": 0.11750985138979697, "grad_norm": 1.969202995300293, "learning_rate": 4.902907855726375e-05, "loss": 0.449, "step": 6650 }, { "epoch": 0.11839338410701349, "grad_norm": 1.9535086154937744, "learning_rate": 4.8979992538925214e-05, "loss": 0.3458, "step": 6700 }, { "epoch": 0.11927691682423, "grad_norm": 1.7251821756362915, "learning_rate": 4.893090652058668e-05, "loss": 0.4791, "step": 6750 }, { "epoch": 0.12016044954144652, "grad_norm": 1.7175688743591309, "learning_rate": 4.8881820502248144e-05, "loss": 0.4445, "step": 6800 }, { "epoch": 0.12104398225866304, "grad_norm": 3.1055896282196045, "learning_rate": 4.88327344839096e-05, "loss": 0.4907, "step": 6850 }, { "epoch": 0.12192751497587956, "grad_norm": 3.251380681991577, "learning_rate": 4.8783648465571066e-05, "loss": 0.5377, "step": 6900 }, { "epoch": 0.12281104769309607, "grad_norm": 2.909510850906372, "learning_rate": 4.873456244723254e-05, "loss": 0.5275, "step": 6950 }, { "epoch": 0.12369458041031259, "grad_norm": 2.0700035095214844, "learning_rate": 4.8685476428893995e-05, "loss": 0.5489, "step": 7000 }, { "epoch": 0.12457811312752912, "grad_norm": 1.9759315252304077, "learning_rate": 4.863639041055546e-05, "loss": 0.3931, "step": 7050 }, { "epoch": 0.12546164584474562, "grad_norm": 1.9036837816238403, "learning_rate": 4.8587304392216925e-05, "loss": 0.5155, "step": 7100 }, { "epoch": 0.12634517856196215, "grad_norm": 3.4224536418914795, "learning_rate": 4.853821837387838e-05, "loss": 0.4282, "step": 7150 }, { "epoch": 0.12722871127917867, "grad_norm": 3.1725916862487793, "learning_rate": 4.8489132355539854e-05, "loss": 0.4639, "step": 7200 }, { "epoch": 0.12811224399639518, "grad_norm": 1.7154817581176758, "learning_rate": 4.844004633720131e-05, "loss": 0.5294, "step": 7250 }, { "epoch": 0.1289957767136117, "grad_norm": 2.130659580230713, "learning_rate": 4.839096031886278e-05, "loss": 0.4121, "step": 7300 }, { "epoch": 0.12987930943082823, "grad_norm": 1.8878060579299927, "learning_rate": 4.834187430052424e-05, "loss": 0.4139, "step": 7350 }, { "epoch": 0.13076284214804473, "grad_norm": 1.9885565042495728, "learning_rate": 4.82927882821857e-05, "loss": 0.4311, "step": 7400 }, { "epoch": 0.13164637486526126, "grad_norm": 2.3639650344848633, "learning_rate": 4.824370226384717e-05, "loss": 0.4025, "step": 7450 }, { "epoch": 0.1325299075824778, "grad_norm": 3.4997270107269287, "learning_rate": 4.8194616245508635e-05, "loss": 0.4791, "step": 7500 }, { "epoch": 0.1334134402996943, "grad_norm": 1.644084095954895, "learning_rate": 4.814553022717009e-05, "loss": 0.4498, "step": 7550 }, { "epoch": 0.13429697301691082, "grad_norm": 1.8292336463928223, "learning_rate": 4.809644420883156e-05, "loss": 0.4538, "step": 7600 }, { "epoch": 0.13518050573412735, "grad_norm": 3.380443572998047, "learning_rate": 4.804735819049302e-05, "loss": 0.4596, "step": 7650 }, { "epoch": 0.13606403845134385, "grad_norm": 1.6248747110366821, "learning_rate": 4.799827217215449e-05, "loss": 0.3508, "step": 7700 }, { "epoch": 0.13694757116856038, "grad_norm": 1.6644774675369263, "learning_rate": 4.794918615381595e-05, "loss": 0.5145, "step": 7750 }, { "epoch": 0.13783110388577688, "grad_norm": 1.8441638946533203, "learning_rate": 4.790010013547741e-05, "loss": 0.3505, "step": 7800 }, { "epoch": 0.1387146366029934, "grad_norm": 1.761982798576355, "learning_rate": 4.7851014117138874e-05, "loss": 0.3354, "step": 7850 }, { "epoch": 0.13959816932020994, "grad_norm": 3.417602777481079, "learning_rate": 4.780192809880034e-05, "loss": 0.4474, "step": 7900 }, { "epoch": 0.14048170203742644, "grad_norm": 1.7687017917633057, "learning_rate": 4.7752842080461804e-05, "loss": 0.3524, "step": 7950 }, { "epoch": 0.14136523475464297, "grad_norm": 3.2442593574523926, "learning_rate": 4.770375606212327e-05, "loss": 0.4957, "step": 8000 }, { "epoch": 0.1422487674718595, "grad_norm": 1.813818335533142, "learning_rate": 4.765467004378473e-05, "loss": 0.4461, "step": 8050 }, { "epoch": 0.143132300189076, "grad_norm": 1.936123013496399, "learning_rate": 4.760558402544619e-05, "loss": 0.4983, "step": 8100 }, { "epoch": 0.14401583290629252, "grad_norm": 2.0068929195404053, "learning_rate": 4.7556498007107656e-05, "loss": 0.4535, "step": 8150 }, { "epoch": 0.14489936562350905, "grad_norm": 1.6743545532226562, "learning_rate": 4.750741198876913e-05, "loss": 0.3668, "step": 8200 }, { "epoch": 0.14578289834072555, "grad_norm": 1.9963476657867432, "learning_rate": 4.7458325970430585e-05, "loss": 0.4688, "step": 8250 }, { "epoch": 0.14666643105794208, "grad_norm": 1.7402074337005615, "learning_rate": 4.740923995209205e-05, "loss": 0.3967, "step": 8300 }, { "epoch": 0.14754996377515858, "grad_norm": 2.0074145793914795, "learning_rate": 4.736015393375351e-05, "loss": 0.4911, "step": 8350 }, { "epoch": 0.1484334964923751, "grad_norm": 1.7804876565933228, "learning_rate": 4.731106791541497e-05, "loss": 0.4076, "step": 8400 }, { "epoch": 0.14931702920959164, "grad_norm": 2.1234054565429688, "learning_rate": 4.7261981897076444e-05, "loss": 0.398, "step": 8450 }, { "epoch": 0.15020056192680814, "grad_norm": 2.1532113552093506, "learning_rate": 4.72128958787379e-05, "loss": 0.4203, "step": 8500 }, { "epoch": 0.15108409464402467, "grad_norm": 1.8909550905227661, "learning_rate": 4.7163809860399366e-05, "loss": 0.414, "step": 8550 }, { "epoch": 0.1519676273612412, "grad_norm": 1.9415462017059326, "learning_rate": 4.711472384206083e-05, "loss": 0.3436, "step": 8600 }, { "epoch": 0.1528511600784577, "grad_norm": 2.2018544673919678, "learning_rate": 4.706563782372229e-05, "loss": 0.436, "step": 8650 }, { "epoch": 0.15373469279567423, "grad_norm": 1.5418767929077148, "learning_rate": 4.701655180538376e-05, "loss": 0.3761, "step": 8700 }, { "epoch": 0.15461822551289076, "grad_norm": 4.974616050720215, "learning_rate": 4.6967465787045225e-05, "loss": 0.5579, "step": 8750 }, { "epoch": 0.15550175823010726, "grad_norm": 1.8653486967086792, "learning_rate": 4.691837976870668e-05, "loss": 0.441, "step": 8800 }, { "epoch": 0.15638529094732379, "grad_norm": 2.2241523265838623, "learning_rate": 4.686929375036815e-05, "loss": 0.5877, "step": 8850 }, { "epoch": 0.15726882366454029, "grad_norm": 1.8084393739700317, "learning_rate": 4.6820207732029605e-05, "loss": 0.4081, "step": 8900 }, { "epoch": 0.15815235638175681, "grad_norm": 1.5464160442352295, "learning_rate": 4.677112171369108e-05, "loss": 0.4648, "step": 8950 }, { "epoch": 0.15903588909897334, "grad_norm": 1.7731395959854126, "learning_rate": 4.672203569535254e-05, "loss": 0.4321, "step": 9000 }, { "epoch": 0.15991942181618984, "grad_norm": 1.8130481243133545, "learning_rate": 4.6672949677014e-05, "loss": 0.4226, "step": 9050 }, { "epoch": 0.16080295453340637, "grad_norm": 2.4127371311187744, "learning_rate": 4.6623863658675464e-05, "loss": 0.3634, "step": 9100 }, { "epoch": 0.1616864872506229, "grad_norm": 2.362494707107544, "learning_rate": 4.657477764033693e-05, "loss": 0.4252, "step": 9150 }, { "epoch": 0.1625700199678394, "grad_norm": 1.855000615119934, "learning_rate": 4.6525691621998393e-05, "loss": 0.3899, "step": 9200 }, { "epoch": 0.16345355268505593, "grad_norm": 1.8728185892105103, "learning_rate": 4.647660560365986e-05, "loss": 0.4335, "step": 9250 }, { "epoch": 0.16433708540227246, "grad_norm": 1.977250576019287, "learning_rate": 4.642751958532132e-05, "loss": 0.4204, "step": 9300 }, { "epoch": 0.16522061811948896, "grad_norm": 4.992434978485107, "learning_rate": 4.637843356698278e-05, "loss": 0.5576, "step": 9350 }, { "epoch": 0.1661041508367055, "grad_norm": 1.673086166381836, "learning_rate": 4.6329347548644245e-05, "loss": 0.4712, "step": 9400 }, { "epoch": 0.166987683553922, "grad_norm": 1.8109374046325684, "learning_rate": 4.628026153030571e-05, "loss": 0.366, "step": 9450 }, { "epoch": 0.16787121627113852, "grad_norm": 1.9352269172668457, "learning_rate": 4.6231175511967175e-05, "loss": 0.3932, "step": 9500 }, { "epoch": 0.16875474898835505, "grad_norm": 1.7740451097488403, "learning_rate": 4.618208949362864e-05, "loss": 0.4836, "step": 9550 }, { "epoch": 0.16963828170557155, "grad_norm": 2.0106916427612305, "learning_rate": 4.61330034752901e-05, "loss": 0.3989, "step": 9600 }, { "epoch": 0.17052181442278808, "grad_norm": 1.5831292867660522, "learning_rate": 4.608391745695156e-05, "loss": 0.4025, "step": 9650 }, { "epoch": 0.1714053471400046, "grad_norm": 5.1861371994018555, "learning_rate": 4.6034831438613027e-05, "loss": 0.467, "step": 9700 }, { "epoch": 0.1722888798572211, "grad_norm": 3.7466721534729004, "learning_rate": 4.598574542027449e-05, "loss": 0.3558, "step": 9750 }, { "epoch": 0.17317241257443763, "grad_norm": 2.143721342086792, "learning_rate": 4.5936659401935956e-05, "loss": 0.3623, "step": 9800 }, { "epoch": 0.17405594529165416, "grad_norm": 2.1482434272766113, "learning_rate": 4.588757338359742e-05, "loss": 0.3438, "step": 9850 }, { "epoch": 0.17493947800887066, "grad_norm": 1.458309531211853, "learning_rate": 4.583848736525888e-05, "loss": 0.4193, "step": 9900 }, { "epoch": 0.1758230107260872, "grad_norm": 1.8698090314865112, "learning_rate": 4.578940134692034e-05, "loss": 0.3173, "step": 9950 }, { "epoch": 0.1767065434433037, "grad_norm": 2.087970018386841, "learning_rate": 4.574031532858181e-05, "loss": 0.4569, "step": 10000 }, { "epoch": 0.17759007616052022, "grad_norm": 1.6226812601089478, "learning_rate": 4.569122931024327e-05, "loss": 0.4538, "step": 10050 }, { "epoch": 0.17847360887773675, "grad_norm": 1.9845385551452637, "learning_rate": 4.564214329190474e-05, "loss": 0.4422, "step": 10100 }, { "epoch": 0.17935714159495325, "grad_norm": 1.7016047239303589, "learning_rate": 4.5593057273566195e-05, "loss": 0.3747, "step": 10150 }, { "epoch": 0.18024067431216978, "grad_norm": 2.2167670726776123, "learning_rate": 4.5543971255227666e-05, "loss": 0.3989, "step": 10200 }, { "epoch": 0.1811242070293863, "grad_norm": 1.464385747909546, "learning_rate": 4.549488523688913e-05, "loss": 0.5315, "step": 10250 }, { "epoch": 0.1820077397466028, "grad_norm": 1.2073971033096313, "learning_rate": 4.544579921855059e-05, "loss": 0.3565, "step": 10300 }, { "epoch": 0.18289127246381934, "grad_norm": 1.1773017644882202, "learning_rate": 4.5396713200212054e-05, "loss": 0.4409, "step": 10350 }, { "epoch": 0.18377480518103587, "grad_norm": 2.4389290809631348, "learning_rate": 4.534762718187352e-05, "loss": 0.3762, "step": 10400 }, { "epoch": 0.18465833789825237, "grad_norm": 3.560997247695923, "learning_rate": 4.529854116353498e-05, "loss": 0.4571, "step": 10450 }, { "epoch": 0.1855418706154689, "grad_norm": 2.0075438022613525, "learning_rate": 4.524945514519645e-05, "loss": 0.3561, "step": 10500 }, { "epoch": 0.1864254033326854, "grad_norm": 2.405439853668213, "learning_rate": 4.5200369126857906e-05, "loss": 0.4595, "step": 10550 }, { "epoch": 0.18730893604990193, "grad_norm": 1.6211732625961304, "learning_rate": 4.515128310851937e-05, "loss": 0.4576, "step": 10600 }, { "epoch": 0.18819246876711845, "grad_norm": 1.7272285223007202, "learning_rate": 4.5102197090180835e-05, "loss": 0.4957, "step": 10650 }, { "epoch": 0.18907600148433495, "grad_norm": 1.529583215713501, "learning_rate": 4.50531110718423e-05, "loss": 0.3533, "step": 10700 }, { "epoch": 0.18995953420155148, "grad_norm": 1.3267425298690796, "learning_rate": 4.5004025053503764e-05, "loss": 0.5213, "step": 10750 }, { "epoch": 0.190843066918768, "grad_norm": 2.40889573097229, "learning_rate": 4.495493903516523e-05, "loss": 0.4372, "step": 10800 }, { "epoch": 0.1917265996359845, "grad_norm": 2.532017230987549, "learning_rate": 4.4906834737193457e-05, "loss": 0.3286, "step": 10850 }, { "epoch": 0.19261013235320104, "grad_norm": 3.721505641937256, "learning_rate": 4.485774871885493e-05, "loss": 0.4082, "step": 10900 }, { "epoch": 0.19349366507041757, "grad_norm": 2.2368271350860596, "learning_rate": 4.4808662700516386e-05, "loss": 0.4056, "step": 10950 }, { "epoch": 0.19437719778763407, "grad_norm": 2.2011897563934326, "learning_rate": 4.475957668217785e-05, "loss": 0.4435, "step": 11000 }, { "epoch": 0.1952607305048506, "grad_norm": 2.1512463092803955, "learning_rate": 4.4710490663839315e-05, "loss": 0.4272, "step": 11050 }, { "epoch": 0.1961442632220671, "grad_norm": 1.5526123046875, "learning_rate": 4.466140464550077e-05, "loss": 0.4334, "step": 11100 }, { "epoch": 0.19702779593928363, "grad_norm": 1.4258567094802856, "learning_rate": 4.4612318627162245e-05, "loss": 0.4479, "step": 11150 }, { "epoch": 0.19791132865650016, "grad_norm": 3.2408463954925537, "learning_rate": 4.456323260882371e-05, "loss": 0.3545, "step": 11200 }, { "epoch": 0.19879486137371666, "grad_norm": 2.1903252601623535, "learning_rate": 4.451414659048517e-05, "loss": 0.3192, "step": 11250 }, { "epoch": 0.1996783940909332, "grad_norm": 1.9699974060058594, "learning_rate": 4.446506057214663e-05, "loss": 0.3883, "step": 11300 }, { "epoch": 0.20056192680814972, "grad_norm": 1.7133831977844238, "learning_rate": 4.441597455380809e-05, "loss": 0.3312, "step": 11350 }, { "epoch": 0.20144545952536622, "grad_norm": 3.0174543857574463, "learning_rate": 4.436688853546956e-05, "loss": 0.4888, "step": 11400 }, { "epoch": 0.20232899224258274, "grad_norm": 2.010566473007202, "learning_rate": 4.4317802517131026e-05, "loss": 0.5102, "step": 11450 }, { "epoch": 0.20321252495979927, "grad_norm": 2.093271493911743, "learning_rate": 4.4268716498792484e-05, "loss": 0.4133, "step": 11500 }, { "epoch": 0.20409605767701577, "grad_norm": 1.9231561422348022, "learning_rate": 4.421963048045395e-05, "loss": 0.4255, "step": 11550 }, { "epoch": 0.2049795903942323, "grad_norm": 1.561781644821167, "learning_rate": 4.417054446211541e-05, "loss": 0.3766, "step": 11600 }, { "epoch": 0.2058631231114488, "grad_norm": 2.006748676300049, "learning_rate": 4.412145844377688e-05, "loss": 0.3651, "step": 11650 }, { "epoch": 0.20674665582866533, "grad_norm": 1.5192091464996338, "learning_rate": 4.407237242543834e-05, "loss": 0.4562, "step": 11700 }, { "epoch": 0.20763018854588186, "grad_norm": 1.820331335067749, "learning_rate": 4.402328640709981e-05, "loss": 0.3946, "step": 11750 }, { "epoch": 0.20851372126309836, "grad_norm": 3.302582025527954, "learning_rate": 4.3974200388761265e-05, "loss": 0.4075, "step": 11800 }, { "epoch": 0.2093972539803149, "grad_norm": 2.601897716522217, "learning_rate": 4.392511437042273e-05, "loss": 0.4304, "step": 11850 }, { "epoch": 0.21028078669753142, "grad_norm": 1.58085036277771, "learning_rate": 4.3876028352084194e-05, "loss": 0.3404, "step": 11900 }, { "epoch": 0.21116431941474792, "grad_norm": 1.7569571733474731, "learning_rate": 4.382694233374566e-05, "loss": 0.4013, "step": 11950 }, { "epoch": 0.21204785213196445, "grad_norm": 1.9872467517852783, "learning_rate": 4.3777856315407124e-05, "loss": 0.4278, "step": 12000 }, { "epoch": 0.21293138484918098, "grad_norm": 1.4981114864349365, "learning_rate": 4.372877029706858e-05, "loss": 0.3905, "step": 12050 }, { "epoch": 0.21381491756639748, "grad_norm": 1.6444882154464722, "learning_rate": 4.3679684278730046e-05, "loss": 0.4082, "step": 12100 }, { "epoch": 0.214698450283614, "grad_norm": 1.9731707572937012, "learning_rate": 4.363059826039151e-05, "loss": 0.3855, "step": 12150 }, { "epoch": 0.2155819830008305, "grad_norm": 2.66648268699646, "learning_rate": 4.3581512242052976e-05, "loss": 0.4567, "step": 12200 }, { "epoch": 0.21646551571804704, "grad_norm": 2.0770373344421387, "learning_rate": 4.353242622371444e-05, "loss": 0.4368, "step": 12250 }, { "epoch": 0.21734904843526356, "grad_norm": 1.4739536046981812, "learning_rate": 4.3483340205375905e-05, "loss": 0.3686, "step": 12300 }, { "epoch": 0.21823258115248007, "grad_norm": 1.8857239484786987, "learning_rate": 4.343425418703736e-05, "loss": 0.4163, "step": 12350 }, { "epoch": 0.2191161138696966, "grad_norm": 1.722424030303955, "learning_rate": 4.3385168168698834e-05, "loss": 0.3595, "step": 12400 }, { "epoch": 0.21999964658691312, "grad_norm": 1.5602166652679443, "learning_rate": 4.333608215036029e-05, "loss": 0.3326, "step": 12450 }, { "epoch": 0.22088317930412962, "grad_norm": 1.7230535745620728, "learning_rate": 4.328699613202176e-05, "loss": 0.3775, "step": 12500 }, { "epoch": 0.22176671202134615, "grad_norm": 1.8666094541549683, "learning_rate": 4.323791011368322e-05, "loss": 0.3695, "step": 12550 }, { "epoch": 0.22265024473856268, "grad_norm": 3.1689233779907227, "learning_rate": 4.318882409534468e-05, "loss": 0.3545, "step": 12600 }, { "epoch": 0.22353377745577918, "grad_norm": 1.8885284662246704, "learning_rate": 4.313973807700615e-05, "loss": 0.3548, "step": 12650 }, { "epoch": 0.2244173101729957, "grad_norm": 1.8508330583572388, "learning_rate": 4.3090652058667615e-05, "loss": 0.4847, "step": 12700 }, { "epoch": 0.22530084289021224, "grad_norm": 2.1445882320404053, "learning_rate": 4.304156604032907e-05, "loss": 0.4, "step": 12750 }, { "epoch": 0.22618437560742874, "grad_norm": 1.721024990081787, "learning_rate": 4.299248002199054e-05, "loss": 0.4755, "step": 12800 }, { "epoch": 0.22706790832464527, "grad_norm": 1.7713844776153564, "learning_rate": 4.2943394003652e-05, "loss": 0.3399, "step": 12850 }, { "epoch": 0.22795144104186177, "grad_norm": 1.2936394214630127, "learning_rate": 4.289528970568024e-05, "loss": 0.3297, "step": 12900 }, { "epoch": 0.2288349737590783, "grad_norm": 1.6622658967971802, "learning_rate": 4.28462036873417e-05, "loss": 0.4071, "step": 12950 }, { "epoch": 0.22971850647629483, "grad_norm": 1.3949196338653564, "learning_rate": 4.279711766900316e-05, "loss": 0.4069, "step": 13000 }, { "epoch": 0.23060203919351133, "grad_norm": 1.8681453466415405, "learning_rate": 4.2748031650664624e-05, "loss": 0.5156, "step": 13050 }, { "epoch": 0.23148557191072786, "grad_norm": 1.6242793798446655, "learning_rate": 4.2698945632326096e-05, "loss": 0.4359, "step": 13100 }, { "epoch": 0.23236910462794438, "grad_norm": 2.897428035736084, "learning_rate": 4.2649859613987554e-05, "loss": 0.3702, "step": 13150 }, { "epoch": 0.23325263734516088, "grad_norm": 1.855938196182251, "learning_rate": 4.260077359564902e-05, "loss": 0.5026, "step": 13200 }, { "epoch": 0.2341361700623774, "grad_norm": 1.818076252937317, "learning_rate": 4.2551687577310476e-05, "loss": 0.5201, "step": 13250 }, { "epoch": 0.23501970277959394, "grad_norm": 1.9688682556152344, "learning_rate": 4.250260155897194e-05, "loss": 0.3857, "step": 13300 }, { "epoch": 0.23590323549681044, "grad_norm": 2.4908297061920166, "learning_rate": 4.245351554063341e-05, "loss": 0.3555, "step": 13350 }, { "epoch": 0.23678676821402697, "grad_norm": 1.9015276432037354, "learning_rate": 4.240442952229487e-05, "loss": 0.381, "step": 13400 }, { "epoch": 0.23767030093124347, "grad_norm": 3.011683225631714, "learning_rate": 4.2355343503956335e-05, "loss": 0.3804, "step": 13450 }, { "epoch": 0.23855383364846, "grad_norm": 3.5077691078186035, "learning_rate": 4.23062574856178e-05, "loss": 0.3666, "step": 13500 }, { "epoch": 0.23943736636567653, "grad_norm": 2.875953197479248, "learning_rate": 4.225717146727926e-05, "loss": 0.3792, "step": 13550 }, { "epoch": 0.24032089908289303, "grad_norm": 2.3432717323303223, "learning_rate": 4.220808544894073e-05, "loss": 0.3341, "step": 13600 }, { "epoch": 0.24120443180010956, "grad_norm": 1.6648529767990112, "learning_rate": 4.2158999430602194e-05, "loss": 0.4906, "step": 13650 }, { "epoch": 0.2420879645173261, "grad_norm": 2.034646987915039, "learning_rate": 4.210991341226365e-05, "loss": 0.541, "step": 13700 }, { "epoch": 0.2429714972345426, "grad_norm": 1.2273883819580078, "learning_rate": 4.2060827393925116e-05, "loss": 0.3936, "step": 13750 }, { "epoch": 0.24385502995175912, "grad_norm": 1.6031947135925293, "learning_rate": 4.201174137558658e-05, "loss": 0.3871, "step": 13800 }, { "epoch": 0.24473856266897565, "grad_norm": 1.7289350032806396, "learning_rate": 4.1962655357248045e-05, "loss": 0.2983, "step": 13850 }, { "epoch": 0.24562209538619215, "grad_norm": 1.792413592338562, "learning_rate": 4.191356933890951e-05, "loss": 0.4071, "step": 13900 }, { "epoch": 0.24650562810340867, "grad_norm": 1.5456571578979492, "learning_rate": 4.186448332057097e-05, "loss": 0.3434, "step": 13950 }, { "epoch": 0.24738916082062518, "grad_norm": 1.9666177034378052, "learning_rate": 4.181539730223243e-05, "loss": 0.3885, "step": 14000 }, { "epoch": 0.2482726935378417, "grad_norm": 2.5290989875793457, "learning_rate": 4.17663112838939e-05, "loss": 0.4296, "step": 14050 }, { "epoch": 0.24915622625505823, "grad_norm": 1.9654839038848877, "learning_rate": 4.171722526555536e-05, "loss": 0.3853, "step": 14100 }, { "epoch": 0.25003975897227476, "grad_norm": 1.68603515625, "learning_rate": 4.166813924721683e-05, "loss": 0.4068, "step": 14150 }, { "epoch": 0.25092329168949123, "grad_norm": 1.9062405824661255, "learning_rate": 4.161905322887829e-05, "loss": 0.4071, "step": 14200 }, { "epoch": 0.25180682440670776, "grad_norm": 1.7028473615646362, "learning_rate": 4.156996721053975e-05, "loss": 0.3588, "step": 14250 }, { "epoch": 0.2526903571239243, "grad_norm": 1.6032434701919556, "learning_rate": 4.1520881192201214e-05, "loss": 0.4161, "step": 14300 }, { "epoch": 0.2535738898411408, "grad_norm": 1.6103026866912842, "learning_rate": 4.147179517386268e-05, "loss": 0.3431, "step": 14350 }, { "epoch": 0.25445742255835735, "grad_norm": 3.727078914642334, "learning_rate": 4.142270915552414e-05, "loss": 0.3576, "step": 14400 }, { "epoch": 0.2553409552755739, "grad_norm": 1.3540493249893188, "learning_rate": 4.137362313718561e-05, "loss": 0.3563, "step": 14450 }, { "epoch": 0.25622448799279035, "grad_norm": 1.7373064756393433, "learning_rate": 4.1324537118847066e-05, "loss": 0.3406, "step": 14500 }, { "epoch": 0.2571080207100069, "grad_norm": 2.6311392784118652, "learning_rate": 4.127545110050853e-05, "loss": 0.4397, "step": 14550 }, { "epoch": 0.2579915534272234, "grad_norm": 1.845186471939087, "learning_rate": 4.122636508217e-05, "loss": 0.411, "step": 14600 }, { "epoch": 0.25887508614443994, "grad_norm": 1.5897334814071655, "learning_rate": 4.117727906383146e-05, "loss": 0.3742, "step": 14650 }, { "epoch": 0.25975861886165647, "grad_norm": 3.667428970336914, "learning_rate": 4.1128193045492924e-05, "loss": 0.3622, "step": 14700 }, { "epoch": 0.26064215157887294, "grad_norm": 1.7393996715545654, "learning_rate": 4.107910702715439e-05, "loss": 0.2782, "step": 14750 }, { "epoch": 0.26152568429608947, "grad_norm": 1.6495802402496338, "learning_rate": 4.103002100881585e-05, "loss": 0.36, "step": 14800 }, { "epoch": 0.262409217013306, "grad_norm": 1.5133942365646362, "learning_rate": 4.098093499047732e-05, "loss": 0.486, "step": 14850 }, { "epoch": 0.2632927497305225, "grad_norm": 1.848177194595337, "learning_rate": 4.0932830692505546e-05, "loss": 0.406, "step": 14900 }, { "epoch": 0.26417628244773905, "grad_norm": 3.320469379425049, "learning_rate": 4.088374467416701e-05, "loss": 0.357, "step": 14950 }, { "epoch": 0.2650598151649556, "grad_norm": 1.417015790939331, "learning_rate": 4.0834658655828475e-05, "loss": 0.2855, "step": 15000 }, { "epoch": 0.26594334788217205, "grad_norm": 1.8597488403320312, "learning_rate": 4.078557263748994e-05, "loss": 0.4424, "step": 15050 }, { "epoch": 0.2668268805993886, "grad_norm": 1.651663899421692, "learning_rate": 4.0736486619151405e-05, "loss": 0.352, "step": 15100 }, { "epoch": 0.2677104133166051, "grad_norm": 1.452006459236145, "learning_rate": 4.068740060081286e-05, "loss": 0.3638, "step": 15150 }, { "epoch": 0.26859394603382164, "grad_norm": 2.7887187004089355, "learning_rate": 4.063831458247433e-05, "loss": 0.3727, "step": 15200 }, { "epoch": 0.26947747875103817, "grad_norm": 1.9209206104278564, "learning_rate": 4.058922856413579e-05, "loss": 0.3842, "step": 15250 }, { "epoch": 0.2703610114682547, "grad_norm": 1.946022868156433, "learning_rate": 4.054014254579726e-05, "loss": 0.3625, "step": 15300 }, { "epoch": 0.27124454418547117, "grad_norm": 1.4893426895141602, "learning_rate": 4.049105652745872e-05, "loss": 0.4088, "step": 15350 }, { "epoch": 0.2721280769026877, "grad_norm": 1.7391968965530396, "learning_rate": 4.0441970509120186e-05, "loss": 0.4126, "step": 15400 }, { "epoch": 0.2730116096199042, "grad_norm": 1.7254865169525146, "learning_rate": 4.0392884490781644e-05, "loss": 0.4662, "step": 15450 }, { "epoch": 0.27389514233712076, "grad_norm": 4.502954483032227, "learning_rate": 4.034379847244311e-05, "loss": 0.3889, "step": 15500 }, { "epoch": 0.2747786750543373, "grad_norm": 2.4406206607818604, "learning_rate": 4.029471245410458e-05, "loss": 0.3618, "step": 15550 }, { "epoch": 0.27566220777155376, "grad_norm": 1.6272777318954468, "learning_rate": 4.024562643576604e-05, "loss": 0.4126, "step": 15600 }, { "epoch": 0.2765457404887703, "grad_norm": 1.5262032747268677, "learning_rate": 4.01965404174275e-05, "loss": 0.3771, "step": 15650 }, { "epoch": 0.2774292732059868, "grad_norm": 1.8245854377746582, "learning_rate": 4.014745439908896e-05, "loss": 0.4377, "step": 15700 }, { "epoch": 0.27831280592320334, "grad_norm": 2.8566267490386963, "learning_rate": 4.0098368380750425e-05, "loss": 0.4041, "step": 15750 }, { "epoch": 0.27919633864041987, "grad_norm": 2.0167641639709473, "learning_rate": 4.00492823624119e-05, "loss": 0.375, "step": 15800 }, { "epoch": 0.2800798713576364, "grad_norm": 1.9363830089569092, "learning_rate": 4.0000196344073355e-05, "loss": 0.3339, "step": 15850 }, { "epoch": 0.2809634040748529, "grad_norm": 2.208641767501831, "learning_rate": 3.995111032573482e-05, "loss": 0.348, "step": 15900 }, { "epoch": 0.2818469367920694, "grad_norm": 1.5789657831192017, "learning_rate": 3.9902024307396284e-05, "loss": 0.367, "step": 15950 }, { "epoch": 0.28273046950928593, "grad_norm": 1.6666336059570312, "learning_rate": 3.985293828905775e-05, "loss": 0.3427, "step": 16000 }, { "epoch": 0.28361400222650246, "grad_norm": 3.725020170211792, "learning_rate": 3.980385227071921e-05, "loss": 0.3637, "step": 16050 }, { "epoch": 0.284497534943719, "grad_norm": 1.5958735942840576, "learning_rate": 3.975476625238068e-05, "loss": 0.3489, "step": 16100 }, { "epoch": 0.28538106766093546, "grad_norm": 1.3779951333999634, "learning_rate": 3.9705680234042136e-05, "loss": 0.4209, "step": 16150 }, { "epoch": 0.286264600378152, "grad_norm": 1.6636724472045898, "learning_rate": 3.96565942157036e-05, "loss": 0.2984, "step": 16200 }, { "epoch": 0.2871481330953685, "grad_norm": 1.705592155456543, "learning_rate": 3.9607508197365065e-05, "loss": 0.3877, "step": 16250 }, { "epoch": 0.28803166581258505, "grad_norm": 1.5367944240570068, "learning_rate": 3.955842217902653e-05, "loss": 0.3508, "step": 16300 }, { "epoch": 0.2889151985298016, "grad_norm": 3.140960693359375, "learning_rate": 3.9509336160687994e-05, "loss": 0.3443, "step": 16350 }, { "epoch": 0.2897987312470181, "grad_norm": 1.2341272830963135, "learning_rate": 3.946025014234945e-05, "loss": 0.4346, "step": 16400 }, { "epoch": 0.2906822639642346, "grad_norm": 1.9500783681869507, "learning_rate": 3.941116412401092e-05, "loss": 0.4262, "step": 16450 }, { "epoch": 0.2915657966814511, "grad_norm": 1.344519853591919, "learning_rate": 3.936207810567238e-05, "loss": 0.3065, "step": 16500 }, { "epoch": 0.29244932939866763, "grad_norm": 1.4747456312179565, "learning_rate": 3.9312992087333846e-05, "loss": 0.4003, "step": 16550 }, { "epoch": 0.29333286211588416, "grad_norm": 1.5639158487319946, "learning_rate": 3.926390606899531e-05, "loss": 0.5295, "step": 16600 }, { "epoch": 0.2942163948331007, "grad_norm": 1.9425716400146484, "learning_rate": 3.9214820050656776e-05, "loss": 0.3582, "step": 16650 }, { "epoch": 0.29509992755031716, "grad_norm": 3.003871440887451, "learning_rate": 3.9165734032318234e-05, "loss": 0.3299, "step": 16700 }, { "epoch": 0.2959834602675337, "grad_norm": 3.689194679260254, "learning_rate": 3.91166480139797e-05, "loss": 0.3493, "step": 16750 }, { "epoch": 0.2968669929847502, "grad_norm": 1.9439842700958252, "learning_rate": 3.906756199564116e-05, "loss": 0.2752, "step": 16800 }, { "epoch": 0.29775052570196675, "grad_norm": 1.8846018314361572, "learning_rate": 3.901847597730263e-05, "loss": 0.3254, "step": 16850 }, { "epoch": 0.2986340584191833, "grad_norm": 2.9167964458465576, "learning_rate": 3.896938995896409e-05, "loss": 0.3352, "step": 16900 }, { "epoch": 0.2995175911363998, "grad_norm": 2.6470940113067627, "learning_rate": 3.892128566099233e-05, "loss": 0.3812, "step": 16950 }, { "epoch": 0.3004011238536163, "grad_norm": 2.1021623611450195, "learning_rate": 3.887219964265379e-05, "loss": 0.3332, "step": 17000 }, { "epoch": 0.3012846565708328, "grad_norm": 1.9923433065414429, "learning_rate": 3.882311362431525e-05, "loss": 0.3472, "step": 17050 }, { "epoch": 0.30216818928804934, "grad_norm": 1.5736125707626343, "learning_rate": 3.8774027605976714e-05, "loss": 0.4207, "step": 17100 }, { "epoch": 0.30305172200526587, "grad_norm": 2.2181496620178223, "learning_rate": 3.872494158763818e-05, "loss": 0.3849, "step": 17150 }, { "epoch": 0.3039352547224824, "grad_norm": 1.5112169981002808, "learning_rate": 3.867585556929964e-05, "loss": 0.3272, "step": 17200 }, { "epoch": 0.30481878743969887, "grad_norm": 1.5218919515609741, "learning_rate": 3.862676955096111e-05, "loss": 0.3037, "step": 17250 }, { "epoch": 0.3057023201569154, "grad_norm": 1.5864076614379883, "learning_rate": 3.857768353262257e-05, "loss": 0.2924, "step": 17300 }, { "epoch": 0.3065858528741319, "grad_norm": 1.8895894289016724, "learning_rate": 3.852859751428403e-05, "loss": 0.4029, "step": 17350 }, { "epoch": 0.30746938559134845, "grad_norm": 1.4156498908996582, "learning_rate": 3.8479511495945495e-05, "loss": 0.5016, "step": 17400 }, { "epoch": 0.308352918308565, "grad_norm": 1.4788236618041992, "learning_rate": 3.843042547760696e-05, "loss": 0.3648, "step": 17450 }, { "epoch": 0.3092364510257815, "grad_norm": 1.7631937265396118, "learning_rate": 3.8381339459268424e-05, "loss": 0.3045, "step": 17500 }, { "epoch": 0.310119983742998, "grad_norm": 1.9122941493988037, "learning_rate": 3.833225344092989e-05, "loss": 0.3271, "step": 17550 }, { "epoch": 0.3110035164602145, "grad_norm": 1.6838266849517822, "learning_rate": 3.828316742259135e-05, "loss": 0.519, "step": 17600 }, { "epoch": 0.31188704917743104, "grad_norm": 4.507582187652588, "learning_rate": 3.823408140425281e-05, "loss": 0.341, "step": 17650 }, { "epoch": 0.31277058189464757, "grad_norm": 1.3272327184677124, "learning_rate": 3.8184995385914276e-05, "loss": 0.3352, "step": 17700 }, { "epoch": 0.3136541146118641, "grad_norm": 2.516676664352417, "learning_rate": 3.813590936757574e-05, "loss": 0.4406, "step": 17750 }, { "epoch": 0.31453764732908057, "grad_norm": 1.8230887651443481, "learning_rate": 3.8086823349237206e-05, "loss": 0.3822, "step": 17800 }, { "epoch": 0.3154211800462971, "grad_norm": 1.5267698764801025, "learning_rate": 3.803773733089867e-05, "loss": 0.287, "step": 17850 }, { "epoch": 0.31630471276351363, "grad_norm": 2.647895574569702, "learning_rate": 3.798865131256013e-05, "loss": 0.4349, "step": 17900 }, { "epoch": 0.31718824548073016, "grad_norm": 1.5159648656845093, "learning_rate": 3.793956529422159e-05, "loss": 0.3633, "step": 17950 }, { "epoch": 0.3180717781979467, "grad_norm": 1.9135470390319824, "learning_rate": 3.7890479275883064e-05, "loss": 0.3431, "step": 18000 }, { "epoch": 0.3189553109151632, "grad_norm": 1.6438477039337158, "learning_rate": 3.784139325754452e-05, "loss": 0.3986, "step": 18050 }, { "epoch": 0.3198388436323797, "grad_norm": 1.6794339418411255, "learning_rate": 3.779230723920599e-05, "loss": 0.3279, "step": 18100 }, { "epoch": 0.3207223763495962, "grad_norm": 1.5067431926727295, "learning_rate": 3.7743221220867445e-05, "loss": 0.3062, "step": 18150 }, { "epoch": 0.32160590906681275, "grad_norm": 1.6953719854354858, "learning_rate": 3.7694135202528916e-05, "loss": 0.2973, "step": 18200 }, { "epoch": 0.3224894417840293, "grad_norm": 2.819748640060425, "learning_rate": 3.764504918419038e-05, "loss": 0.4078, "step": 18250 }, { "epoch": 0.3233729745012458, "grad_norm": 1.5743447542190552, "learning_rate": 3.759596316585184e-05, "loss": 0.31, "step": 18300 }, { "epoch": 0.3242565072184623, "grad_norm": 1.8966853618621826, "learning_rate": 3.7546877147513303e-05, "loss": 0.306, "step": 18350 }, { "epoch": 0.3251400399356788, "grad_norm": 2.7652056217193604, "learning_rate": 3.749779112917477e-05, "loss": 0.3426, "step": 18400 }, { "epoch": 0.32602357265289533, "grad_norm": 3.006504535675049, "learning_rate": 3.744870511083623e-05, "loss": 0.2807, "step": 18450 }, { "epoch": 0.32690710537011186, "grad_norm": 1.5666753053665161, "learning_rate": 3.73996190924977e-05, "loss": 0.3856, "step": 18500 }, { "epoch": 0.3277906380873284, "grad_norm": 1.9692752361297607, "learning_rate": 3.735053307415916e-05, "loss": 0.3575, "step": 18550 }, { "epoch": 0.3286741708045449, "grad_norm": 3.517622232437134, "learning_rate": 3.730144705582062e-05, "loss": 0.347, "step": 18600 }, { "epoch": 0.3295577035217614, "grad_norm": 1.8076531887054443, "learning_rate": 3.7252361037482085e-05, "loss": 0.3195, "step": 18650 }, { "epoch": 0.3304412362389779, "grad_norm": 1.8082791566848755, "learning_rate": 3.720327501914355e-05, "loss": 0.3543, "step": 18700 }, { "epoch": 0.33132476895619445, "grad_norm": 1.3712306022644043, "learning_rate": 3.7154189000805014e-05, "loss": 0.3642, "step": 18750 }, { "epoch": 0.332208301673411, "grad_norm": 1.5654476881027222, "learning_rate": 3.710510298246648e-05, "loss": 0.3415, "step": 18800 }, { "epoch": 0.3330918343906275, "grad_norm": 1.4388914108276367, "learning_rate": 3.7056016964127937e-05, "loss": 0.3069, "step": 18850 }, { "epoch": 0.333975367107844, "grad_norm": 1.5527664422988892, "learning_rate": 3.70069309457894e-05, "loss": 0.2962, "step": 18900 }, { "epoch": 0.3348588998250605, "grad_norm": 1.6680736541748047, "learning_rate": 3.6957844927450866e-05, "loss": 0.3156, "step": 18950 }, { "epoch": 0.33574243254227704, "grad_norm": 2.266108274459839, "learning_rate": 3.69097406294791e-05, "loss": 0.3791, "step": 19000 }, { "epoch": 0.33662596525949356, "grad_norm": 1.4146838188171387, "learning_rate": 3.6860654611140565e-05, "loss": 0.3287, "step": 19050 }, { "epoch": 0.3375094979767101, "grad_norm": 1.640153169631958, "learning_rate": 3.681156859280202e-05, "loss": 0.4034, "step": 19100 }, { "epoch": 0.3383930306939266, "grad_norm": 1.670589804649353, "learning_rate": 3.6762482574463494e-05, "loss": 0.3476, "step": 19150 }, { "epoch": 0.3392765634111431, "grad_norm": 3.375941753387451, "learning_rate": 3.671339655612496e-05, "loss": 0.363, "step": 19200 }, { "epoch": 0.3401600961283596, "grad_norm": 1.965834379196167, "learning_rate": 3.666431053778642e-05, "loss": 0.3182, "step": 19250 }, { "epoch": 0.34104362884557615, "grad_norm": 1.607900857925415, "learning_rate": 3.661522451944788e-05, "loss": 0.3238, "step": 19300 }, { "epoch": 0.3419271615627927, "grad_norm": 1.4051165580749512, "learning_rate": 3.6566138501109346e-05, "loss": 0.3043, "step": 19350 }, { "epoch": 0.3428106942800092, "grad_norm": 1.4679523706436157, "learning_rate": 3.651705248277081e-05, "loss": 0.3902, "step": 19400 }, { "epoch": 0.3436942269972257, "grad_norm": 1.5135536193847656, "learning_rate": 3.6467966464432276e-05, "loss": 0.3085, "step": 19450 }, { "epoch": 0.3445777597144422, "grad_norm": 2.2533581256866455, "learning_rate": 3.6418880446093734e-05, "loss": 0.3162, "step": 19500 }, { "epoch": 0.34546129243165874, "grad_norm": 1.625067949295044, "learning_rate": 3.63697944277552e-05, "loss": 0.345, "step": 19550 }, { "epoch": 0.34634482514887527, "grad_norm": 1.1573612689971924, "learning_rate": 3.632070840941666e-05, "loss": 0.3017, "step": 19600 }, { "epoch": 0.3472283578660918, "grad_norm": 3.46663498878479, "learning_rate": 3.627162239107813e-05, "loss": 0.4232, "step": 19650 }, { "epoch": 0.3481118905833083, "grad_norm": 1.5614382028579712, "learning_rate": 3.622253637273959e-05, "loss": 0.3363, "step": 19700 }, { "epoch": 0.3489954233005248, "grad_norm": 1.3841484785079956, "learning_rate": 3.617345035440106e-05, "loss": 0.3484, "step": 19750 }, { "epoch": 0.3498789560177413, "grad_norm": 1.941517949104309, "learning_rate": 3.6124364336062515e-05, "loss": 0.3719, "step": 19800 }, { "epoch": 0.35076248873495786, "grad_norm": 4.908963680267334, "learning_rate": 3.607527831772398e-05, "loss": 0.3226, "step": 19850 }, { "epoch": 0.3516460214521744, "grad_norm": 1.5221627950668335, "learning_rate": 3.6026192299385444e-05, "loss": 0.3636, "step": 19900 }, { "epoch": 0.3525295541693909, "grad_norm": 1.8089814186096191, "learning_rate": 3.597710628104691e-05, "loss": 0.3704, "step": 19950 }, { "epoch": 0.3534130868866074, "grad_norm": 2.786560535430908, "learning_rate": 3.5928020262708373e-05, "loss": 0.3459, "step": 20000 }, { "epoch": 0.3542966196038239, "grad_norm": 2.97851824760437, "learning_rate": 3.587893424436983e-05, "loss": 0.3226, "step": 20050 }, { "epoch": 0.35518015232104044, "grad_norm": 2.1979775428771973, "learning_rate": 3.5829848226031296e-05, "loss": 0.3256, "step": 20100 }, { "epoch": 0.35606368503825697, "grad_norm": 1.762453556060791, "learning_rate": 3.578076220769276e-05, "loss": 0.3179, "step": 20150 }, { "epoch": 0.3569472177554735, "grad_norm": 1.4908533096313477, "learning_rate": 3.5731676189354225e-05, "loss": 0.4226, "step": 20200 }, { "epoch": 0.35783075047269003, "grad_norm": 1.3192092180252075, "learning_rate": 3.568259017101569e-05, "loss": 0.4196, "step": 20250 }, { "epoch": 0.3587142831899065, "grad_norm": 1.421736717224121, "learning_rate": 3.5633504152677155e-05, "loss": 0.3618, "step": 20300 }, { "epoch": 0.35959781590712303, "grad_norm": 2.0631330013275146, "learning_rate": 3.558441813433861e-05, "loss": 0.4093, "step": 20350 }, { "epoch": 0.36048134862433956, "grad_norm": 1.6250920295715332, "learning_rate": 3.5535332116000084e-05, "loss": 0.3051, "step": 20400 }, { "epoch": 0.3613648813415561, "grad_norm": 1.4659417867660522, "learning_rate": 3.548624609766155e-05, "loss": 0.3379, "step": 20450 }, { "epoch": 0.3622484140587726, "grad_norm": 1.520573616027832, "learning_rate": 3.5437160079323007e-05, "loss": 0.3582, "step": 20500 }, { "epoch": 0.3631319467759891, "grad_norm": 2.158830165863037, "learning_rate": 3.538807406098447e-05, "loss": 0.4004, "step": 20550 }, { "epoch": 0.3640154794932056, "grad_norm": 1.7503968477249146, "learning_rate": 3.533898804264593e-05, "loss": 0.33, "step": 20600 }, { "epoch": 0.36489901221042215, "grad_norm": 1.5064153671264648, "learning_rate": 3.52899020243074e-05, "loss": 0.3072, "step": 20650 }, { "epoch": 0.3657825449276387, "grad_norm": 3.5023598670959473, "learning_rate": 3.5240816005968865e-05, "loss": 0.35, "step": 20700 }, { "epoch": 0.3666660776448552, "grad_norm": 1.7911083698272705, "learning_rate": 3.519172998763032e-05, "loss": 0.3241, "step": 20750 }, { "epoch": 0.36754961036207173, "grad_norm": 1.50026273727417, "learning_rate": 3.514264396929179e-05, "loss": 0.37, "step": 20800 }, { "epoch": 0.3684331430792882, "grad_norm": 1.5556259155273438, "learning_rate": 3.509355795095325e-05, "loss": 0.2689, "step": 20850 }, { "epoch": 0.36931667579650473, "grad_norm": 1.6530933380126953, "learning_rate": 3.504447193261472e-05, "loss": 0.4061, "step": 20900 }, { "epoch": 0.37020020851372126, "grad_norm": 1.250317931175232, "learning_rate": 3.499538591427618e-05, "loss": 0.3412, "step": 20950 }, { "epoch": 0.3710837412309378, "grad_norm": 1.9599151611328125, "learning_rate": 3.494728161630441e-05, "loss": 0.3619, "step": 21000 }, { "epoch": 0.3719672739481543, "grad_norm": 1.3728086948394775, "learning_rate": 3.4898195597965874e-05, "loss": 0.314, "step": 21050 }, { "epoch": 0.3728508066653708, "grad_norm": 1.6389710903167725, "learning_rate": 3.4849109579627346e-05, "loss": 0.2912, "step": 21100 }, { "epoch": 0.3737343393825873, "grad_norm": 3.552582025527954, "learning_rate": 3.4800023561288803e-05, "loss": 0.3402, "step": 21150 }, { "epoch": 0.37461787209980385, "grad_norm": 1.6479156017303467, "learning_rate": 3.475093754295027e-05, "loss": 0.3462, "step": 21200 }, { "epoch": 0.3755014048170204, "grad_norm": 1.593705415725708, "learning_rate": 3.470185152461173e-05, "loss": 0.2775, "step": 21250 }, { "epoch": 0.3763849375342369, "grad_norm": 2.1807069778442383, "learning_rate": 3.465276550627319e-05, "loss": 0.3825, "step": 21300 }, { "epoch": 0.37726847025145344, "grad_norm": 1.6359409093856812, "learning_rate": 3.460367948793466e-05, "loss": 0.3931, "step": 21350 }, { "epoch": 0.3781520029686699, "grad_norm": 1.5960018634796143, "learning_rate": 3.455459346959612e-05, "loss": 0.4059, "step": 21400 }, { "epoch": 0.37903553568588644, "grad_norm": 3.367835283279419, "learning_rate": 3.4505507451257585e-05, "loss": 0.3264, "step": 21450 }, { "epoch": 0.37991906840310297, "grad_norm": 1.5965161323547363, "learning_rate": 3.445642143291905e-05, "loss": 0.2605, "step": 21500 }, { "epoch": 0.3808026011203195, "grad_norm": 1.5011396408081055, "learning_rate": 3.440733541458051e-05, "loss": 0.3658, "step": 21550 }, { "epoch": 0.381686133837536, "grad_norm": 1.5021259784698486, "learning_rate": 3.435824939624198e-05, "loss": 0.3274, "step": 21600 }, { "epoch": 0.3825696665547525, "grad_norm": 1.5224860906600952, "learning_rate": 3.430916337790344e-05, "loss": 0.3094, "step": 21650 }, { "epoch": 0.383453199271969, "grad_norm": 3.36433482170105, "learning_rate": 3.42600773595649e-05, "loss": 0.3556, "step": 21700 }, { "epoch": 0.38433673198918555, "grad_norm": 1.9824773073196411, "learning_rate": 3.4210991341226366e-05, "loss": 0.2877, "step": 21750 }, { "epoch": 0.3852202647064021, "grad_norm": 1.5103614330291748, "learning_rate": 3.416190532288783e-05, "loss": 0.3203, "step": 21800 }, { "epoch": 0.3861037974236186, "grad_norm": 1.1625959873199463, "learning_rate": 3.4112819304549295e-05, "loss": 0.2553, "step": 21850 }, { "epoch": 0.38698733014083514, "grad_norm": 1.5695985555648804, "learning_rate": 3.406373328621076e-05, "loss": 0.4425, "step": 21900 }, { "epoch": 0.3878708628580516, "grad_norm": 1.6758594512939453, "learning_rate": 3.401464726787222e-05, "loss": 0.3249, "step": 21950 }, { "epoch": 0.38875439557526814, "grad_norm": 3.6129748821258545, "learning_rate": 3.396556124953368e-05, "loss": 0.3649, "step": 22000 }, { "epoch": 0.38963792829248467, "grad_norm": 1.6155461072921753, "learning_rate": 3.391647523119515e-05, "loss": 0.3621, "step": 22050 }, { "epoch": 0.3905214610097012, "grad_norm": 1.7477047443389893, "learning_rate": 3.386738921285661e-05, "loss": 0.4232, "step": 22100 }, { "epoch": 0.3914049937269177, "grad_norm": 3.0512797832489014, "learning_rate": 3.3818303194518076e-05, "loss": 0.266, "step": 22150 }, { "epoch": 0.3922885264441342, "grad_norm": 1.4074236154556274, "learning_rate": 3.376921717617954e-05, "loss": 0.3767, "step": 22200 }, { "epoch": 0.39317205916135073, "grad_norm": 1.7168455123901367, "learning_rate": 3.3720131157841e-05, "loss": 0.366, "step": 22250 }, { "epoch": 0.39405559187856726, "grad_norm": 3.360104560852051, "learning_rate": 3.3671045139502464e-05, "loss": 0.3211, "step": 22300 }, { "epoch": 0.3949391245957838, "grad_norm": 1.527031660079956, "learning_rate": 3.3621959121163935e-05, "loss": 0.2505, "step": 22350 }, { "epoch": 0.3958226573130003, "grad_norm": 1.7586029767990112, "learning_rate": 3.357287310282539e-05, "loss": 0.3824, "step": 22400 }, { "epoch": 0.39670619003021684, "grad_norm": 2.3490004539489746, "learning_rate": 3.352378708448686e-05, "loss": 0.331, "step": 22450 }, { "epoch": 0.3975897227474333, "grad_norm": 1.5686146020889282, "learning_rate": 3.3474701066148316e-05, "loss": 0.3136, "step": 22500 }, { "epoch": 0.39847325546464984, "grad_norm": 1.5068285465240479, "learning_rate": 3.342561504780978e-05, "loss": 0.297, "step": 22550 }, { "epoch": 0.3993567881818664, "grad_norm": 1.81602942943573, "learning_rate": 3.337652902947125e-05, "loss": 0.2933, "step": 22600 }, { "epoch": 0.4002403208990829, "grad_norm": 3.4516189098358154, "learning_rate": 3.332744301113271e-05, "loss": 0.4026, "step": 22650 }, { "epoch": 0.40112385361629943, "grad_norm": 1.5759230852127075, "learning_rate": 3.3278356992794174e-05, "loss": 0.3567, "step": 22700 }, { "epoch": 0.4020073863335159, "grad_norm": 1.9385254383087158, "learning_rate": 3.322927097445564e-05, "loss": 0.3711, "step": 22750 }, { "epoch": 0.40289091905073243, "grad_norm": 1.6334116458892822, "learning_rate": 3.31801849561171e-05, "loss": 0.378, "step": 22800 }, { "epoch": 0.40377445176794896, "grad_norm": 2.0981173515319824, "learning_rate": 3.313109893777857e-05, "loss": 0.355, "step": 22850 }, { "epoch": 0.4046579844851655, "grad_norm": 1.6996448040008545, "learning_rate": 3.308201291944003e-05, "loss": 0.3044, "step": 22900 }, { "epoch": 0.405541517202382, "grad_norm": 1.3511463403701782, "learning_rate": 3.303292690110149e-05, "loss": 0.357, "step": 22950 }, { "epoch": 0.40642504991959855, "grad_norm": 1.7596737146377563, "learning_rate": 3.2983840882762956e-05, "loss": 0.3616, "step": 23000 }, { "epoch": 0.407308582636815, "grad_norm": 2.8382747173309326, "learning_rate": 3.2934754864424413e-05, "loss": 0.3139, "step": 23050 }, { "epoch": 0.40819211535403155, "grad_norm": 3.052281618118286, "learning_rate": 3.2885668846085885e-05, "loss": 0.3474, "step": 23100 }, { "epoch": 0.4090756480712481, "grad_norm": 1.373552680015564, "learning_rate": 3.283756454811412e-05, "loss": 0.3208, "step": 23150 }, { "epoch": 0.4099591807884646, "grad_norm": 1.6797386407852173, "learning_rate": 3.278847852977558e-05, "loss": 0.3798, "step": 23200 }, { "epoch": 0.41084271350568113, "grad_norm": 1.8930203914642334, "learning_rate": 3.273939251143704e-05, "loss": 0.3282, "step": 23250 }, { "epoch": 0.4117262462228976, "grad_norm": 1.256135106086731, "learning_rate": 3.2690306493098507e-05, "loss": 0.3302, "step": 23300 }, { "epoch": 0.41260977894011414, "grad_norm": 1.952988862991333, "learning_rate": 3.264122047475997e-05, "loss": 0.3599, "step": 23350 }, { "epoch": 0.41349331165733066, "grad_norm": 1.3686082363128662, "learning_rate": 3.2592134456421436e-05, "loss": 0.3608, "step": 23400 }, { "epoch": 0.4143768443745472, "grad_norm": 1.56107759475708, "learning_rate": 3.2543048438082894e-05, "loss": 0.3387, "step": 23450 }, { "epoch": 0.4152603770917637, "grad_norm": 1.823240876197815, "learning_rate": 3.249396241974436e-05, "loss": 0.3987, "step": 23500 }, { "epoch": 0.41614390980898025, "grad_norm": 1.2912514209747314, "learning_rate": 3.244487640140583e-05, "loss": 0.3387, "step": 23550 }, { "epoch": 0.4170274425261967, "grad_norm": 1.5520604848861694, "learning_rate": 3.239579038306729e-05, "loss": 0.2989, "step": 23600 }, { "epoch": 0.41791097524341325, "grad_norm": 1.4236600399017334, "learning_rate": 3.234670436472875e-05, "loss": 0.2629, "step": 23650 }, { "epoch": 0.4187945079606298, "grad_norm": 3.2101380825042725, "learning_rate": 3.229761834639022e-05, "loss": 0.2905, "step": 23700 }, { "epoch": 0.4196780406778463, "grad_norm": 1.3380919694900513, "learning_rate": 3.2248532328051675e-05, "loss": 0.3234, "step": 23750 }, { "epoch": 0.42056157339506284, "grad_norm": 1.5015414953231812, "learning_rate": 3.2199446309713146e-05, "loss": 0.3063, "step": 23800 }, { "epoch": 0.4214451061122793, "grad_norm": 1.289444923400879, "learning_rate": 3.2150360291374604e-05, "loss": 0.3386, "step": 23850 }, { "epoch": 0.42232863882949584, "grad_norm": 2.95922589302063, "learning_rate": 3.210127427303607e-05, "loss": 0.3431, "step": 23900 }, { "epoch": 0.42321217154671237, "grad_norm": 1.6753530502319336, "learning_rate": 3.2052188254697534e-05, "loss": 0.2902, "step": 23950 }, { "epoch": 0.4240957042639289, "grad_norm": 1.6901003122329712, "learning_rate": 3.2003102236359e-05, "loss": 0.3136, "step": 24000 }, { "epoch": 0.4249792369811454, "grad_norm": 4.797271251678467, "learning_rate": 3.195401621802046e-05, "loss": 0.4001, "step": 24050 }, { "epoch": 0.42586276969836195, "grad_norm": 1.4796360731124878, "learning_rate": 3.190493019968193e-05, "loss": 0.285, "step": 24100 }, { "epoch": 0.4267463024155784, "grad_norm": 1.4410722255706787, "learning_rate": 3.1855844181343386e-05, "loss": 0.4717, "step": 24150 }, { "epoch": 0.42762983513279496, "grad_norm": 1.398037075996399, "learning_rate": 3.180675816300485e-05, "loss": 0.3391, "step": 24200 }, { "epoch": 0.4285133678500115, "grad_norm": 1.3054397106170654, "learning_rate": 3.1757672144666315e-05, "loss": 0.2913, "step": 24250 }, { "epoch": 0.429396900567228, "grad_norm": 1.7768748998641968, "learning_rate": 3.170858612632778e-05, "loss": 0.3417, "step": 24300 }, { "epoch": 0.43028043328444454, "grad_norm": 1.2682479619979858, "learning_rate": 3.1659500107989244e-05, "loss": 0.2909, "step": 24350 }, { "epoch": 0.431163966001661, "grad_norm": 1.791175365447998, "learning_rate": 3.16104140896507e-05, "loss": 0.2871, "step": 24400 }, { "epoch": 0.43204749871887754, "grad_norm": 1.5249110460281372, "learning_rate": 3.156132807131217e-05, "loss": 0.3929, "step": 24450 }, { "epoch": 0.43293103143609407, "grad_norm": 1.2778598070144653, "learning_rate": 3.151224205297363e-05, "loss": 0.278, "step": 24500 }, { "epoch": 0.4338145641533106, "grad_norm": 3.55033278465271, "learning_rate": 3.1463156034635096e-05, "loss": 0.4386, "step": 24550 }, { "epoch": 0.43469809687052713, "grad_norm": 1.4700381755828857, "learning_rate": 3.141407001629656e-05, "loss": 0.4193, "step": 24600 }, { "epoch": 0.43558162958774366, "grad_norm": 1.150854468345642, "learning_rate": 3.1364983997958025e-05, "loss": 0.367, "step": 24650 }, { "epoch": 0.43646516230496013, "grad_norm": 1.6972355842590332, "learning_rate": 3.131589797961948e-05, "loss": 0.3474, "step": 24700 }, { "epoch": 0.43734869502217666, "grad_norm": 1.355474829673767, "learning_rate": 3.126681196128095e-05, "loss": 0.3116, "step": 24750 }, { "epoch": 0.4382322277393932, "grad_norm": 1.4246526956558228, "learning_rate": 3.121772594294242e-05, "loss": 0.2733, "step": 24800 }, { "epoch": 0.4391157604566097, "grad_norm": 1.5642348527908325, "learning_rate": 3.116863992460388e-05, "loss": 0.3046, "step": 24850 }, { "epoch": 0.43999929317382624, "grad_norm": 1.5843394994735718, "learning_rate": 3.111955390626534e-05, "loss": 0.3627, "step": 24900 }, { "epoch": 0.4408828258910427, "grad_norm": 1.6260349750518799, "learning_rate": 3.10704678879268e-05, "loss": 0.3403, "step": 24950 }, { "epoch": 0.44176635860825925, "grad_norm": 1.7742459774017334, "learning_rate": 3.1021381869588265e-05, "loss": 0.349, "step": 25000 }, { "epoch": 0.4426498913254758, "grad_norm": 1.4080630540847778, "learning_rate": 3.0972295851249736e-05, "loss": 0.3527, "step": 25050 }, { "epoch": 0.4435334240426923, "grad_norm": 1.7197438478469849, "learning_rate": 3.0923209832911194e-05, "loss": 0.3773, "step": 25100 }, { "epoch": 0.44441695675990883, "grad_norm": 1.5831055641174316, "learning_rate": 3.087510553493943e-05, "loss": 0.3372, "step": 25150 }, { "epoch": 0.44530048947712536, "grad_norm": 1.7535090446472168, "learning_rate": 3.082601951660089e-05, "loss": 0.3178, "step": 25200 }, { "epoch": 0.44618402219434183, "grad_norm": 1.6131466627120972, "learning_rate": 3.077693349826236e-05, "loss": 0.2745, "step": 25250 }, { "epoch": 0.44706755491155836, "grad_norm": 1.5419201850891113, "learning_rate": 3.072784747992382e-05, "loss": 0.2773, "step": 25300 }, { "epoch": 0.4479510876287749, "grad_norm": 1.6418931484222412, "learning_rate": 3.067876146158528e-05, "loss": 0.3822, "step": 25350 }, { "epoch": 0.4488346203459914, "grad_norm": 1.288121223449707, "learning_rate": 3.0629675443246745e-05, "loss": 0.3851, "step": 25400 }, { "epoch": 0.44971815306320795, "grad_norm": 1.9523035287857056, "learning_rate": 3.058058942490821e-05, "loss": 0.3805, "step": 25450 }, { "epoch": 0.4506016857804245, "grad_norm": 3.3735404014587402, "learning_rate": 3.0531503406569674e-05, "loss": 0.3245, "step": 25500 }, { "epoch": 0.45148521849764095, "grad_norm": 1.4013001918792725, "learning_rate": 3.048241738823114e-05, "loss": 0.2978, "step": 25550 }, { "epoch": 0.4523687512148575, "grad_norm": 1.9055225849151611, "learning_rate": 3.0433331369892604e-05, "loss": 0.3397, "step": 25600 }, { "epoch": 0.453252283932074, "grad_norm": 3.319705009460449, "learning_rate": 3.0384245351554065e-05, "loss": 0.4655, "step": 25650 }, { "epoch": 0.45413581664929054, "grad_norm": 1.3729950189590454, "learning_rate": 3.033515933321553e-05, "loss": 0.2669, "step": 25700 }, { "epoch": 0.45501934936650706, "grad_norm": 1.3527820110321045, "learning_rate": 3.028607331487699e-05, "loss": 0.3316, "step": 25750 }, { "epoch": 0.45590288208372354, "grad_norm": 1.4500503540039062, "learning_rate": 3.0236987296538455e-05, "loss": 0.3395, "step": 25800 }, { "epoch": 0.45678641480094007, "grad_norm": 2.8250796794891357, "learning_rate": 3.018790127819992e-05, "loss": 0.3631, "step": 25850 }, { "epoch": 0.4576699475181566, "grad_norm": 1.1532173156738281, "learning_rate": 3.013881525986138e-05, "loss": 0.3418, "step": 25900 }, { "epoch": 0.4585534802353731, "grad_norm": 1.687465786933899, "learning_rate": 3.0089729241522846e-05, "loss": 0.3351, "step": 25950 }, { "epoch": 0.45943701295258965, "grad_norm": 4.05789852142334, "learning_rate": 3.004064322318431e-05, "loss": 0.3117, "step": 26000 }, { "epoch": 0.4603205456698062, "grad_norm": 1.4303230047225952, "learning_rate": 2.9991557204845772e-05, "loss": 0.3197, "step": 26050 }, { "epoch": 0.46120407838702265, "grad_norm": 3.692739248275757, "learning_rate": 2.9942471186507237e-05, "loss": 0.2856, "step": 26100 }, { "epoch": 0.4620876111042392, "grad_norm": 2.6494288444519043, "learning_rate": 2.98933851681687e-05, "loss": 0.3668, "step": 26150 }, { "epoch": 0.4629711438214557, "grad_norm": 1.832560420036316, "learning_rate": 2.9844299149830163e-05, "loss": 0.4672, "step": 26200 }, { "epoch": 0.46385467653867224, "grad_norm": 3.4169373512268066, "learning_rate": 2.9795213131491627e-05, "loss": 0.373, "step": 26250 }, { "epoch": 0.46473820925588877, "grad_norm": 1.5430257320404053, "learning_rate": 2.974612711315309e-05, "loss": 0.3232, "step": 26300 }, { "epoch": 0.46562174197310524, "grad_norm": 1.674177646636963, "learning_rate": 2.9697041094814553e-05, "loss": 0.3461, "step": 26350 }, { "epoch": 0.46650527469032177, "grad_norm": 1.7116457223892212, "learning_rate": 2.9647955076476018e-05, "loss": 0.2937, "step": 26400 }, { "epoch": 0.4673888074075383, "grad_norm": 1.3711694478988647, "learning_rate": 2.9599850778504252e-05, "loss": 0.3511, "step": 26450 }, { "epoch": 0.4682723401247548, "grad_norm": 3.0807628631591797, "learning_rate": 2.9550764760165717e-05, "loss": 0.3204, "step": 26500 }, { "epoch": 0.46915587284197136, "grad_norm": 1.5949090719223022, "learning_rate": 2.950167874182718e-05, "loss": 0.2698, "step": 26550 }, { "epoch": 0.4700394055591879, "grad_norm": 1.6748404502868652, "learning_rate": 2.9452592723488643e-05, "loss": 0.3019, "step": 26600 }, { "epoch": 0.47092293827640436, "grad_norm": 1.6362017393112183, "learning_rate": 2.9403506705150108e-05, "loss": 0.276, "step": 26650 }, { "epoch": 0.4718064709936209, "grad_norm": 1.5143210887908936, "learning_rate": 2.935442068681157e-05, "loss": 0.2572, "step": 26700 }, { "epoch": 0.4726900037108374, "grad_norm": 2.1000730991363525, "learning_rate": 2.9305334668473034e-05, "loss": 0.2821, "step": 26750 }, { "epoch": 0.47357353642805394, "grad_norm": 1.9400396347045898, "learning_rate": 2.9256248650134498e-05, "loss": 0.2753, "step": 26800 }, { "epoch": 0.47445706914527047, "grad_norm": 1.7398908138275146, "learning_rate": 2.920716263179596e-05, "loss": 0.2789, "step": 26850 }, { "epoch": 0.47534060186248694, "grad_norm": 1.456929087638855, "learning_rate": 2.9158076613457424e-05, "loss": 0.5175, "step": 26900 }, { "epoch": 0.4762241345797035, "grad_norm": 1.4763001203536987, "learning_rate": 2.910899059511889e-05, "loss": 0.3398, "step": 26950 }, { "epoch": 0.47710766729692, "grad_norm": 1.3316082954406738, "learning_rate": 2.905990457678035e-05, "loss": 0.3683, "step": 27000 }, { "epoch": 0.47799120001413653, "grad_norm": 1.1095103025436401, "learning_rate": 2.9010818558441815e-05, "loss": 0.341, "step": 27050 }, { "epoch": 0.47887473273135306, "grad_norm": 1.5168321132659912, "learning_rate": 2.8961732540103276e-05, "loss": 0.2753, "step": 27100 }, { "epoch": 0.4797582654485696, "grad_norm": 1.9980124235153198, "learning_rate": 2.891264652176474e-05, "loss": 0.35, "step": 27150 }, { "epoch": 0.48064179816578606, "grad_norm": 1.6252918243408203, "learning_rate": 2.8863560503426205e-05, "loss": 0.3143, "step": 27200 }, { "epoch": 0.4815253308830026, "grad_norm": 1.6409038305282593, "learning_rate": 2.8814474485087667e-05, "loss": 0.3968, "step": 27250 }, { "epoch": 0.4824088636002191, "grad_norm": 1.4830607175827026, "learning_rate": 2.876538846674913e-05, "loss": 0.3246, "step": 27300 }, { "epoch": 0.48329239631743565, "grad_norm": 1.6359367370605469, "learning_rate": 2.87163024484106e-05, "loss": 0.3131, "step": 27350 }, { "epoch": 0.4841759290346522, "grad_norm": 1.1834681034088135, "learning_rate": 2.8667216430072057e-05, "loss": 0.3078, "step": 27400 }, { "epoch": 0.48505946175186865, "grad_norm": 1.3667497634887695, "learning_rate": 2.8618130411733522e-05, "loss": 0.419, "step": 27450 }, { "epoch": 0.4859429944690852, "grad_norm": 4.66032075881958, "learning_rate": 2.856904439339499e-05, "loss": 0.3959, "step": 27500 }, { "epoch": 0.4868265271863017, "grad_norm": 1.530393362045288, "learning_rate": 2.8519958375056448e-05, "loss": 0.3754, "step": 27550 }, { "epoch": 0.48771005990351823, "grad_norm": 0.9399372935295105, "learning_rate": 2.8470872356717916e-05, "loss": 0.3163, "step": 27600 }, { "epoch": 0.48859359262073476, "grad_norm": 1.654520869255066, "learning_rate": 2.8421786338379374e-05, "loss": 0.35, "step": 27650 }, { "epoch": 0.4894771253379513, "grad_norm": 1.5777958631515503, "learning_rate": 2.8372700320040842e-05, "loss": 0.3397, "step": 27700 }, { "epoch": 0.49036065805516776, "grad_norm": 1.4474226236343384, "learning_rate": 2.8323614301702307e-05, "loss": 0.3853, "step": 27750 }, { "epoch": 0.4912441907723843, "grad_norm": 1.603667140007019, "learning_rate": 2.8274528283363765e-05, "loss": 0.2568, "step": 27800 }, { "epoch": 0.4921277234896008, "grad_norm": 1.727280855178833, "learning_rate": 2.8225442265025233e-05, "loss": 0.3108, "step": 27850 }, { "epoch": 0.49301125620681735, "grad_norm": 1.4632737636566162, "learning_rate": 2.8176356246686697e-05, "loss": 0.4098, "step": 27900 }, { "epoch": 0.4938947889240339, "grad_norm": 1.5443991422653198, "learning_rate": 2.812727022834816e-05, "loss": 0.3364, "step": 27950 }, { "epoch": 0.49477832164125035, "grad_norm": 1.7304097414016724, "learning_rate": 2.8078184210009623e-05, "loss": 0.3354, "step": 28000 }, { "epoch": 0.4956618543584669, "grad_norm": 1.141662359237671, "learning_rate": 2.8029098191671088e-05, "loss": 0.2879, "step": 28050 }, { "epoch": 0.4965453870756834, "grad_norm": 1.5769354104995728, "learning_rate": 2.798001217333255e-05, "loss": 0.3604, "step": 28100 }, { "epoch": 0.49742891979289994, "grad_norm": 2.3104453086853027, "learning_rate": 2.7930926154994014e-05, "loss": 0.2612, "step": 28150 }, { "epoch": 0.49831245251011647, "grad_norm": 0.764305830001831, "learning_rate": 2.7881840136655475e-05, "loss": 0.3593, "step": 28200 }, { "epoch": 0.499195985227333, "grad_norm": 1.1693766117095947, "learning_rate": 2.783275411831694e-05, "loss": 0.2961, "step": 28250 }, { "epoch": 0.5000795179445495, "grad_norm": 1.65450918674469, "learning_rate": 2.7783668099978404e-05, "loss": 0.3338, "step": 28300 }, { "epoch": 0.500963050661766, "grad_norm": 1.438693642616272, "learning_rate": 2.7734582081639866e-05, "loss": 0.3109, "step": 28350 }, { "epoch": 0.5018465833789825, "grad_norm": 1.5170999765396118, "learning_rate": 2.768549606330133e-05, "loss": 0.3234, "step": 28400 }, { "epoch": 0.502730116096199, "grad_norm": 1.497454285621643, "learning_rate": 2.7636410044962795e-05, "loss": 0.3257, "step": 28450 }, { "epoch": 0.5036136488134155, "grad_norm": 3.3886194229125977, "learning_rate": 2.7587324026624256e-05, "loss": 0.4675, "step": 28500 }, { "epoch": 0.504497181530632, "grad_norm": 1.6604270935058594, "learning_rate": 2.753823800828572e-05, "loss": 0.4318, "step": 28550 }, { "epoch": 0.5053807142478486, "grad_norm": 1.7005223035812378, "learning_rate": 2.7489151989947186e-05, "loss": 0.3594, "step": 28600 }, { "epoch": 0.5062642469650651, "grad_norm": 1.109703540802002, "learning_rate": 2.7440065971608647e-05, "loss": 0.3214, "step": 28650 }, { "epoch": 0.5071477796822816, "grad_norm": 1.9164469242095947, "learning_rate": 2.739097995327011e-05, "loss": 0.2856, "step": 28700 }, { "epoch": 0.5080313123994982, "grad_norm": 1.3944114446640015, "learning_rate": 2.7341893934931573e-05, "loss": 0.3094, "step": 28750 }, { "epoch": 0.5089148451167147, "grad_norm": 1.3844256401062012, "learning_rate": 2.7292807916593038e-05, "loss": 0.3933, "step": 28800 }, { "epoch": 0.5097983778339312, "grad_norm": 3.18278431892395, "learning_rate": 2.7243721898254506e-05, "loss": 0.3432, "step": 28850 }, { "epoch": 0.5106819105511478, "grad_norm": 1.7024506330490112, "learning_rate": 2.7194635879915964e-05, "loss": 0.3766, "step": 28900 }, { "epoch": 0.5115654432683642, "grad_norm": 1.4224214553833008, "learning_rate": 2.7145549861577428e-05, "loss": 0.3308, "step": 28950 }, { "epoch": 0.5124489759855807, "grad_norm": 1.5428136587142944, "learning_rate": 2.7096463843238896e-05, "loss": 0.3453, "step": 29000 }, { "epoch": 0.5133325087027972, "grad_norm": 1.4710556268692017, "learning_rate": 2.7047377824900354e-05, "loss": 0.2904, "step": 29050 }, { "epoch": 0.5142160414200138, "grad_norm": 1.5080032348632812, "learning_rate": 2.6998291806561822e-05, "loss": 0.2647, "step": 29100 }, { "epoch": 0.5150995741372303, "grad_norm": 1.7176605463027954, "learning_rate": 2.6949205788223287e-05, "loss": 0.4395, "step": 29150 }, { "epoch": 0.5159831068544468, "grad_norm": 1.4339267015457153, "learning_rate": 2.6900119769884745e-05, "loss": 0.295, "step": 29200 }, { "epoch": 0.5168666395716633, "grad_norm": 1.1258848905563354, "learning_rate": 2.6851033751546213e-05, "loss": 0.3927, "step": 29250 }, { "epoch": 0.5177501722888799, "grad_norm": 2.5667836666107178, "learning_rate": 2.680194773320767e-05, "loss": 0.3492, "step": 29300 }, { "epoch": 0.5186337050060964, "grad_norm": 1.7218468189239502, "learning_rate": 2.675286171486914e-05, "loss": 0.3304, "step": 29350 }, { "epoch": 0.5195172377233129, "grad_norm": 2.4908971786499023, "learning_rate": 2.6703775696530603e-05, "loss": 0.3557, "step": 29400 }, { "epoch": 0.5204007704405295, "grad_norm": 1.787463665008545, "learning_rate": 2.665468967819206e-05, "loss": 0.3389, "step": 29450 }, { "epoch": 0.5212843031577459, "grad_norm": 3.174107789993286, "learning_rate": 2.660560365985353e-05, "loss": 0.3322, "step": 29500 }, { "epoch": 0.5221678358749624, "grad_norm": 1.648913025856018, "learning_rate": 2.6556517641514994e-05, "loss": 0.3053, "step": 29550 }, { "epoch": 0.5230513685921789, "grad_norm": 1.648561954498291, "learning_rate": 2.6507431623176455e-05, "loss": 0.2486, "step": 29600 }, { "epoch": 0.5239349013093955, "grad_norm": 1.199449062347412, "learning_rate": 2.645834560483792e-05, "loss": 0.282, "step": 29650 }, { "epoch": 0.524818434026612, "grad_norm": 0.9432544112205505, "learning_rate": 2.6409259586499385e-05, "loss": 0.3791, "step": 29700 }, { "epoch": 0.5257019667438285, "grad_norm": 2.9582953453063965, "learning_rate": 2.6360173568160846e-05, "loss": 0.3346, "step": 29750 }, { "epoch": 0.526585499461045, "grad_norm": 1.5263501405715942, "learning_rate": 2.631108754982231e-05, "loss": 0.2743, "step": 29800 }, { "epoch": 0.5274690321782616, "grad_norm": 1.63582181930542, "learning_rate": 2.6262001531483772e-05, "loss": 0.2927, "step": 29850 }, { "epoch": 0.5283525648954781, "grad_norm": 1.843386173248291, "learning_rate": 2.6212915513145237e-05, "loss": 0.3775, "step": 29900 }, { "epoch": 0.5292360976126946, "grad_norm": 1.236327886581421, "learning_rate": 2.61638294948067e-05, "loss": 0.3114, "step": 29950 }, { "epoch": 0.5301196303299112, "grad_norm": 1.5327879190444946, "learning_rate": 2.6114743476468162e-05, "loss": 0.2383, "step": 30000 }, { "epoch": 0.5310031630471276, "grad_norm": 1.6281217336654663, "learning_rate": 2.6065657458129627e-05, "loss": 0.3798, "step": 30050 }, { "epoch": 0.5318866957643441, "grad_norm": 1.1688692569732666, "learning_rate": 2.6016571439791092e-05, "loss": 0.3204, "step": 30100 }, { "epoch": 0.5327702284815606, "grad_norm": 1.354048490524292, "learning_rate": 2.5967485421452553e-05, "loss": 0.2496, "step": 30150 }, { "epoch": 0.5336537611987772, "grad_norm": 2.8124821186065674, "learning_rate": 2.5918399403114018e-05, "loss": 0.4147, "step": 30200 }, { "epoch": 0.5345372939159937, "grad_norm": 1.886425495147705, "learning_rate": 2.5869313384775486e-05, "loss": 0.3021, "step": 30250 }, { "epoch": 0.5354208266332102, "grad_norm": 1.6316314935684204, "learning_rate": 2.5820227366436944e-05, "loss": 0.2758, "step": 30300 }, { "epoch": 0.5363043593504268, "grad_norm": 1.3990044593811035, "learning_rate": 2.577114134809841e-05, "loss": 0.3166, "step": 30350 }, { "epoch": 0.5371878920676433, "grad_norm": 2.1562857627868652, "learning_rate": 2.572205532975987e-05, "loss": 0.35, "step": 30400 }, { "epoch": 0.5380714247848598, "grad_norm": 1.1287676095962524, "learning_rate": 2.5672969311421334e-05, "loss": 0.3391, "step": 30450 }, { "epoch": 0.5389549575020763, "grad_norm": 1.7524675130844116, "learning_rate": 2.5623883293082802e-05, "loss": 0.3576, "step": 30500 }, { "epoch": 0.5398384902192929, "grad_norm": 1.1238594055175781, "learning_rate": 2.5575778995111033e-05, "loss": 0.295, "step": 30550 }, { "epoch": 0.5407220229365094, "grad_norm": 0.9298042058944702, "learning_rate": 2.5526692976772498e-05, "loss": 0.3449, "step": 30600 }, { "epoch": 0.5416055556537258, "grad_norm": 1.5093685388565063, "learning_rate": 2.547760695843396e-05, "loss": 0.3274, "step": 30650 }, { "epoch": 0.5424890883709423, "grad_norm": 1.4606502056121826, "learning_rate": 2.5428520940095424e-05, "loss": 0.3094, "step": 30700 }, { "epoch": 0.5433726210881589, "grad_norm": 1.7957881689071655, "learning_rate": 2.537943492175689e-05, "loss": 0.3077, "step": 30750 }, { "epoch": 0.5442561538053754, "grad_norm": 1.4665497541427612, "learning_rate": 2.533034890341835e-05, "loss": 0.3505, "step": 30800 }, { "epoch": 0.5451396865225919, "grad_norm": 1.785367488861084, "learning_rate": 2.5281262885079815e-05, "loss": 0.3485, "step": 30850 }, { "epoch": 0.5460232192398085, "grad_norm": 4.639885425567627, "learning_rate": 2.523217686674128e-05, "loss": 0.331, "step": 30900 }, { "epoch": 0.546906751957025, "grad_norm": 1.308772325515747, "learning_rate": 2.518309084840274e-05, "loss": 0.2846, "step": 30950 }, { "epoch": 0.5477902846742415, "grad_norm": 1.3961265087127686, "learning_rate": 2.5134004830064205e-05, "loss": 0.3647, "step": 31000 }, { "epoch": 0.548673817391458, "grad_norm": 1.0688265562057495, "learning_rate": 2.5084918811725673e-05, "loss": 0.3475, "step": 31050 }, { "epoch": 0.5495573501086746, "grad_norm": 1.7052621841430664, "learning_rate": 2.503583279338713e-05, "loss": 0.2833, "step": 31100 }, { "epoch": 0.5504408828258911, "grad_norm": 1.5378305912017822, "learning_rate": 2.4986746775048596e-05, "loss": 0.3, "step": 31150 }, { "epoch": 0.5513244155431075, "grad_norm": 3.8670883178710938, "learning_rate": 2.493766075671006e-05, "loss": 0.3568, "step": 31200 }, { "epoch": 0.552207948260324, "grad_norm": 1.8015788793563843, "learning_rate": 2.4888574738371522e-05, "loss": 0.3268, "step": 31250 }, { "epoch": 0.5530914809775406, "grad_norm": 2.7606303691864014, "learning_rate": 2.4839488720032987e-05, "loss": 0.4005, "step": 31300 }, { "epoch": 0.5539750136947571, "grad_norm": 1.3418834209442139, "learning_rate": 2.479040270169445e-05, "loss": 0.2993, "step": 31350 }, { "epoch": 0.5548585464119736, "grad_norm": 1.3790879249572754, "learning_rate": 2.4741316683355912e-05, "loss": 0.3463, "step": 31400 }, { "epoch": 0.5557420791291902, "grad_norm": 1.5994555950164795, "learning_rate": 2.4692230665017377e-05, "loss": 0.3654, "step": 31450 }, { "epoch": 0.5566256118464067, "grad_norm": 1.528947114944458, "learning_rate": 2.4643144646678842e-05, "loss": 0.3329, "step": 31500 }, { "epoch": 0.5575091445636232, "grad_norm": 1.4391777515411377, "learning_rate": 2.4594058628340306e-05, "loss": 0.2794, "step": 31550 }, { "epoch": 0.5583926772808397, "grad_norm": 4.419312953948975, "learning_rate": 2.4544972610001768e-05, "loss": 0.4189, "step": 31600 }, { "epoch": 0.5592762099980563, "grad_norm": 1.5030118227005005, "learning_rate": 2.4495886591663232e-05, "loss": 0.3643, "step": 31650 }, { "epoch": 0.5601597427152728, "grad_norm": 1.3483951091766357, "learning_rate": 2.4446800573324697e-05, "loss": 0.3578, "step": 31700 }, { "epoch": 0.5610432754324892, "grad_norm": 1.5314035415649414, "learning_rate": 2.439771455498616e-05, "loss": 0.3193, "step": 31750 }, { "epoch": 0.5619268081497057, "grad_norm": 1.1020389795303345, "learning_rate": 2.4348628536647623e-05, "loss": 0.327, "step": 31800 }, { "epoch": 0.5628103408669223, "grad_norm": 1.445654034614563, "learning_rate": 2.4299542518309084e-05, "loss": 0.3429, "step": 31850 }, { "epoch": 0.5636938735841388, "grad_norm": 1.3795325756072998, "learning_rate": 2.425045649997055e-05, "loss": 0.2994, "step": 31900 }, { "epoch": 0.5645774063013553, "grad_norm": 1.7217411994934082, "learning_rate": 2.4201370481632014e-05, "loss": 0.3219, "step": 31950 }, { "epoch": 0.5654609390185719, "grad_norm": 1.3482351303100586, "learning_rate": 2.4152284463293475e-05, "loss": 0.2902, "step": 32000 }, { "epoch": 0.5663444717357884, "grad_norm": 2.785452365875244, "learning_rate": 2.4103198444954943e-05, "loss": 0.3896, "step": 32050 }, { "epoch": 0.5672280044530049, "grad_norm": 2.5383968353271484, "learning_rate": 2.4054112426616404e-05, "loss": 0.2491, "step": 32100 }, { "epoch": 0.5681115371702214, "grad_norm": 1.584861397743225, "learning_rate": 2.4005026408277866e-05, "loss": 0.2663, "step": 32150 }, { "epoch": 0.568995069887438, "grad_norm": 1.5586644411087036, "learning_rate": 2.395594038993933e-05, "loss": 0.3433, "step": 32200 }, { "epoch": 0.5698786026046545, "grad_norm": 1.4697036743164062, "learning_rate": 2.3906854371600795e-05, "loss": 0.3375, "step": 32250 }, { "epoch": 0.5707621353218709, "grad_norm": 2.39277720451355, "learning_rate": 2.385776835326226e-05, "loss": 0.2891, "step": 32300 }, { "epoch": 0.5716456680390875, "grad_norm": 1.5755674839019775, "learning_rate": 2.380868233492372e-05, "loss": 0.296, "step": 32350 }, { "epoch": 0.572529200756304, "grad_norm": 1.5802369117736816, "learning_rate": 2.3759596316585182e-05, "loss": 0.2478, "step": 32400 }, { "epoch": 0.5734127334735205, "grad_norm": 2.731212615966797, "learning_rate": 2.371051029824665e-05, "loss": 0.3514, "step": 32450 }, { "epoch": 0.574296266190737, "grad_norm": 1.70058274269104, "learning_rate": 2.366142427990811e-05, "loss": 0.2741, "step": 32500 }, { "epoch": 0.5751797989079536, "grad_norm": 3.394753932952881, "learning_rate": 2.3612338261569576e-05, "loss": 0.3546, "step": 32550 }, { "epoch": 0.5760633316251701, "grad_norm": 2.7270805835723877, "learning_rate": 2.356423396359781e-05, "loss": 0.3927, "step": 32600 }, { "epoch": 0.5769468643423866, "grad_norm": 2.3731272220611572, "learning_rate": 2.3515147945259272e-05, "loss": 0.2725, "step": 32650 }, { "epoch": 0.5778303970596032, "grad_norm": 1.4900075197219849, "learning_rate": 2.3466061926920737e-05, "loss": 0.3167, "step": 32700 }, { "epoch": 0.5787139297768197, "grad_norm": 1.2145545482635498, "learning_rate": 2.34169759085822e-05, "loss": 0.3249, "step": 32750 }, { "epoch": 0.5795974624940362, "grad_norm": 1.725298285484314, "learning_rate": 2.3367889890243662e-05, "loss": 0.2443, "step": 32800 }, { "epoch": 0.5804809952112526, "grad_norm": 1.316084384918213, "learning_rate": 2.331880387190513e-05, "loss": 0.4113, "step": 32850 }, { "epoch": 0.5813645279284692, "grad_norm": 1.8195414543151855, "learning_rate": 2.3269717853566592e-05, "loss": 0.3106, "step": 32900 }, { "epoch": 0.5822480606456857, "grad_norm": 1.1715435981750488, "learning_rate": 2.3220631835228053e-05, "loss": 0.2841, "step": 32950 }, { "epoch": 0.5831315933629022, "grad_norm": 1.3928303718566895, "learning_rate": 2.3171545816889518e-05, "loss": 0.2786, "step": 33000 }, { "epoch": 0.5840151260801187, "grad_norm": 1.4881165027618408, "learning_rate": 2.3122459798550982e-05, "loss": 0.3576, "step": 33050 }, { "epoch": 0.5848986587973353, "grad_norm": 2.8615384101867676, "learning_rate": 2.3073373780212447e-05, "loss": 0.2475, "step": 33100 }, { "epoch": 0.5857821915145518, "grad_norm": 1.819924235343933, "learning_rate": 2.302428776187391e-05, "loss": 0.348, "step": 33150 }, { "epoch": 0.5866657242317683, "grad_norm": 1.5402089357376099, "learning_rate": 2.297520174353537e-05, "loss": 0.2779, "step": 33200 }, { "epoch": 0.5875492569489849, "grad_norm": 1.7234498262405396, "learning_rate": 2.2926115725196838e-05, "loss": 0.3166, "step": 33250 }, { "epoch": 0.5884327896662014, "grad_norm": 1.4789388179779053, "learning_rate": 2.28770297068583e-05, "loss": 0.3448, "step": 33300 }, { "epoch": 0.5893163223834179, "grad_norm": 0.8780321478843689, "learning_rate": 2.2827943688519764e-05, "loss": 0.2409, "step": 33350 }, { "epoch": 0.5901998551006343, "grad_norm": 1.9462053775787354, "learning_rate": 2.277885767018123e-05, "loss": 0.3313, "step": 33400 }, { "epoch": 0.5910833878178509, "grad_norm": 1.6026935577392578, "learning_rate": 2.272977165184269e-05, "loss": 0.2981, "step": 33450 }, { "epoch": 0.5919669205350674, "grad_norm": 2.3030807971954346, "learning_rate": 2.2680685633504154e-05, "loss": 0.269, "step": 33500 }, { "epoch": 0.5928504532522839, "grad_norm": 2.8911454677581787, "learning_rate": 2.2631599615165616e-05, "loss": 0.4098, "step": 33550 }, { "epoch": 0.5937339859695004, "grad_norm": 1.4643045663833618, "learning_rate": 2.258251359682708e-05, "loss": 0.2924, "step": 33600 }, { "epoch": 0.594617518686717, "grad_norm": 2.0076584815979004, "learning_rate": 2.2533427578488545e-05, "loss": 0.2952, "step": 33650 }, { "epoch": 0.5955010514039335, "grad_norm": 1.203574299812317, "learning_rate": 2.2484341560150006e-05, "loss": 0.2754, "step": 33700 }, { "epoch": 0.59638458412115, "grad_norm": 2.815420150756836, "learning_rate": 2.243525554181147e-05, "loss": 0.3434, "step": 33750 }, { "epoch": 0.5972681168383666, "grad_norm": 1.487236499786377, "learning_rate": 2.2386169523472935e-05, "loss": 0.2541, "step": 33800 }, { "epoch": 0.5981516495555831, "grad_norm": 1.532326102256775, "learning_rate": 2.23370835051344e-05, "loss": 0.2923, "step": 33850 }, { "epoch": 0.5990351822727996, "grad_norm": 1.543256402015686, "learning_rate": 2.228799748679586e-05, "loss": 0.2361, "step": 33900 }, { "epoch": 0.599918714990016, "grad_norm": 1.5733423233032227, "learning_rate": 2.2238911468457326e-05, "loss": 0.4459, "step": 33950 }, { "epoch": 0.6008022477072326, "grad_norm": 1.2398439645767212, "learning_rate": 2.218982545011879e-05, "loss": 0.3169, "step": 34000 }, { "epoch": 0.6016857804244491, "grad_norm": 1.1555734872817993, "learning_rate": 2.2140739431780252e-05, "loss": 0.3146, "step": 34050 }, { "epoch": 0.6025693131416656, "grad_norm": 1.4827885627746582, "learning_rate": 2.2091653413441717e-05, "loss": 0.3125, "step": 34100 }, { "epoch": 0.6034528458588821, "grad_norm": 1.5724104642868042, "learning_rate": 2.204256739510318e-05, "loss": 0.2738, "step": 34150 }, { "epoch": 0.6043363785760987, "grad_norm": 1.5903054475784302, "learning_rate": 2.1993481376764643e-05, "loss": 0.3062, "step": 34200 }, { "epoch": 0.6052199112933152, "grad_norm": 1.5402554273605347, "learning_rate": 2.1944395358426107e-05, "loss": 0.3566, "step": 34250 }, { "epoch": 0.6061034440105317, "grad_norm": 1.7631182670593262, "learning_rate": 2.189530934008757e-05, "loss": 0.3466, "step": 34300 }, { "epoch": 0.6069869767277483, "grad_norm": 1.2873070240020752, "learning_rate": 2.1846223321749033e-05, "loss": 0.2608, "step": 34350 }, { "epoch": 0.6078705094449648, "grad_norm": 1.8117417097091675, "learning_rate": 2.1797137303410498e-05, "loss": 0.3239, "step": 34400 }, { "epoch": 0.6087540421621813, "grad_norm": 1.5316294431686401, "learning_rate": 2.174805128507196e-05, "loss": 0.3557, "step": 34450 }, { "epoch": 0.6096375748793977, "grad_norm": 1.539382815361023, "learning_rate": 2.1698965266733427e-05, "loss": 0.4175, "step": 34500 }, { "epoch": 0.6105211075966143, "grad_norm": 1.6773380041122437, "learning_rate": 2.164987924839489e-05, "loss": 0.3104, "step": 34550 }, { "epoch": 0.6114046403138308, "grad_norm": 1.3534982204437256, "learning_rate": 2.1601774950423123e-05, "loss": 0.3305, "step": 34600 }, { "epoch": 0.6122881730310473, "grad_norm": 1.416923999786377, "learning_rate": 2.1552688932084584e-05, "loss": 0.2832, "step": 34650 }, { "epoch": 0.6131717057482639, "grad_norm": 1.7992863655090332, "learning_rate": 2.150360291374605e-05, "loss": 0.3007, "step": 34700 }, { "epoch": 0.6140552384654804, "grad_norm": 1.3988946676254272, "learning_rate": 2.1454516895407514e-05, "loss": 0.3932, "step": 34750 }, { "epoch": 0.6149387711826969, "grad_norm": 1.7125048637390137, "learning_rate": 2.1405430877068978e-05, "loss": 0.3, "step": 34800 }, { "epoch": 0.6158223038999134, "grad_norm": 1.4415560960769653, "learning_rate": 2.135634485873044e-05, "loss": 0.2785, "step": 34850 }, { "epoch": 0.61670583661713, "grad_norm": 1.8688596487045288, "learning_rate": 2.1307258840391904e-05, "loss": 0.3015, "step": 34900 }, { "epoch": 0.6175893693343465, "grad_norm": 3.085685968399048, "learning_rate": 2.125817282205337e-05, "loss": 0.3291, "step": 34950 }, { "epoch": 0.618472902051563, "grad_norm": 1.3053193092346191, "learning_rate": 2.120908680371483e-05, "loss": 0.2634, "step": 35000 }, { "epoch": 0.6193564347687794, "grad_norm": 1.4780889749526978, "learning_rate": 2.1160000785376295e-05, "loss": 0.3212, "step": 35050 }, { "epoch": 0.620239967485996, "grad_norm": 1.699916124343872, "learning_rate": 2.1110914767037756e-05, "loss": 0.2965, "step": 35100 }, { "epoch": 0.6211235002032125, "grad_norm": 1.6198956966400146, "learning_rate": 2.106182874869922e-05, "loss": 0.3557, "step": 35150 }, { "epoch": 0.622007032920429, "grad_norm": 1.2697581052780151, "learning_rate": 2.1012742730360685e-05, "loss": 0.3535, "step": 35200 }, { "epoch": 0.6228905656376456, "grad_norm": 1.9256399869918823, "learning_rate": 2.0963656712022147e-05, "loss": 0.4183, "step": 35250 }, { "epoch": 0.6237740983548621, "grad_norm": 1.4346308708190918, "learning_rate": 2.0914570693683615e-05, "loss": 0.3355, "step": 35300 }, { "epoch": 0.6246576310720786, "grad_norm": 1.3797852993011475, "learning_rate": 2.0865484675345076e-05, "loss": 0.3626, "step": 35350 }, { "epoch": 0.6255411637892951, "grad_norm": 3.1976869106292725, "learning_rate": 2.0816398657006537e-05, "loss": 0.2542, "step": 35400 }, { "epoch": 0.6264246965065117, "grad_norm": 1.4315252304077148, "learning_rate": 2.0767312638668002e-05, "loss": 0.2555, "step": 35450 }, { "epoch": 0.6273082292237282, "grad_norm": 2.861154079437256, "learning_rate": 2.0718226620329467e-05, "loss": 0.3418, "step": 35500 }, { "epoch": 0.6281917619409447, "grad_norm": 1.378416895866394, "learning_rate": 2.066914060199093e-05, "loss": 0.3118, "step": 35550 }, { "epoch": 0.6290752946581611, "grad_norm": 4.129642486572266, "learning_rate": 2.0620054583652393e-05, "loss": 0.2889, "step": 35600 } ], "logging_steps": 50, "max_steps": 56591, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }