{ "best_metric": null, "best_model_checkpoint": null, "epoch": 66.85236768802228, "eval_steps": 500, "global_step": 24000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.055710306406685235, "grad_norm": 14.402867878604889, "learning_rate": 3.2000000000000005e-05, "loss": 10.1886, "step": 20 }, { "epoch": 0.11142061281337047, "grad_norm": 13.635298426580595, "learning_rate": 6.400000000000001e-05, "loss": 8.826, "step": 40 }, { "epoch": 0.1671309192200557, "grad_norm": 4.640229834066508, "learning_rate": 7.999996773810157e-05, "loss": 7.8841, "step": 60 }, { "epoch": 0.22284122562674094, "grad_norm": 4.771787721259069, "learning_rate": 7.999970964324714e-05, "loss": 5.9641, "step": 80 }, { "epoch": 0.2785515320334262, "grad_norm": 2.673493851023645, "learning_rate": 7.999919345531461e-05, "loss": 4.8468, "step": 100 }, { "epoch": 0.3342618384401114, "grad_norm": 4.303381236511131, "learning_rate": 7.999841917785668e-05, "loss": 4.5736, "step": 120 }, { "epoch": 0.38997214484679665, "grad_norm": 3.0474327568183464, "learning_rate": 7.999738681620232e-05, "loss": 4.492, "step": 140 }, { "epoch": 0.4456824512534819, "grad_norm": 4.693843207680094, "learning_rate": 7.999609637745683e-05, "loss": 4.2847, "step": 160 }, { "epoch": 0.5013927576601671, "grad_norm": 2.9247230468930265, "learning_rate": 7.999454787050167e-05, "loss": 3.8923, "step": 180 }, { "epoch": 0.5571030640668524, "grad_norm": 1.489196861741293, "learning_rate": 7.999274130599451e-05, "loss": 3.7348, "step": 200 }, { "epoch": 0.6128133704735376, "grad_norm": 1.61920958013195, "learning_rate": 7.999067669636909e-05, "loss": 3.6525, "step": 220 }, { "epoch": 0.6685236768802229, "grad_norm": 1.7425863171418448, "learning_rate": 7.998835405583514e-05, "loss": 3.606, "step": 240 }, { "epoch": 0.724233983286908, "grad_norm": 1.5257744205454054, "learning_rate": 7.998577340037835e-05, "loss": 3.5769, "step": 260 }, { "epoch": 0.7799442896935933, "grad_norm": 2.667757094800373, "learning_rate": 7.998293474776016e-05, "loss": 3.5703, "step": 280 }, { "epoch": 0.8356545961002786, "grad_norm": 1.7404959255578403, "learning_rate": 7.997983811751768e-05, "loss": 3.5621, "step": 300 }, { "epoch": 0.8913649025069638, "grad_norm": 1.634375074224456, "learning_rate": 7.99764835309636e-05, "loss": 3.5179, "step": 320 }, { "epoch": 0.947075208913649, "grad_norm": 1.7342700062168488, "learning_rate": 7.997287101118597e-05, "loss": 3.4854, "step": 340 }, { "epoch": 1.0027855153203342, "grad_norm": 1.3788139455421626, "learning_rate": 7.996900058304807e-05, "loss": 3.4837, "step": 360 }, { "epoch": 1.0584958217270195, "grad_norm": 2.0400921187647367, "learning_rate": 7.996487227318829e-05, "loss": 3.4779, "step": 380 }, { "epoch": 1.1142061281337048, "grad_norm": 1.3557958841205344, "learning_rate": 7.996048611001985e-05, "loss": 3.4484, "step": 400 }, { "epoch": 1.16991643454039, "grad_norm": 1.2968269160609383, "learning_rate": 7.995584212373067e-05, "loss": 3.4364, "step": 420 }, { "epoch": 1.2256267409470751, "grad_norm": 1.552015046378834, "learning_rate": 7.995094034628315e-05, "loss": 3.428, "step": 440 }, { "epoch": 1.2813370473537604, "grad_norm": 1.5830966652724554, "learning_rate": 7.994578081141396e-05, "loss": 3.4002, "step": 460 }, { "epoch": 1.3370473537604457, "grad_norm": 1.1969791506799554, "learning_rate": 7.994036355463378e-05, "loss": 3.3879, "step": 480 }, { "epoch": 1.392757660167131, "grad_norm": 1.615715341656064, "learning_rate": 7.993468861322705e-05, "loss": 3.3804, "step": 500 }, { "epoch": 1.448467966573816, "grad_norm": 1.7044068053200199, "learning_rate": 7.992875602625179e-05, "loss": 3.3872, "step": 520 }, { "epoch": 1.5041782729805013, "grad_norm": 1.376009869207643, "learning_rate": 7.99225658345392e-05, "loss": 3.3729, "step": 540 }, { "epoch": 1.5598885793871866, "grad_norm": 1.3796323292210666, "learning_rate": 7.991611808069354e-05, "loss": 3.3832, "step": 560 }, { "epoch": 1.615598885793872, "grad_norm": 1.5448889484720325, "learning_rate": 7.990941280909165e-05, "loss": 3.372, "step": 580 }, { "epoch": 1.6713091922005572, "grad_norm": 1.492544519723889, "learning_rate": 7.990245006588282e-05, "loss": 3.3374, "step": 600 }, { "epoch": 1.7270194986072425, "grad_norm": 1.9983170136669428, "learning_rate": 7.98952298989884e-05, "loss": 3.3147, "step": 620 }, { "epoch": 1.7827298050139275, "grad_norm": 1.3253294659757162, "learning_rate": 7.988775235810143e-05, "loss": 3.3236, "step": 640 }, { "epoch": 1.8384401114206128, "grad_norm": 1.235983274303143, "learning_rate": 7.988001749468634e-05, "loss": 3.293, "step": 660 }, { "epoch": 1.894150417827298, "grad_norm": 1.4225853711687855, "learning_rate": 7.987202536197861e-05, "loss": 3.3039, "step": 680 }, { "epoch": 1.9498607242339832, "grad_norm": 1.3045957220402902, "learning_rate": 7.986377601498437e-05, "loss": 3.2981, "step": 700 }, { "epoch": 2.0055710306406684, "grad_norm": 1.3450960062726653, "learning_rate": 7.985526951048004e-05, "loss": 3.2797, "step": 720 }, { "epoch": 2.0612813370473537, "grad_norm": 1.3839789031500191, "learning_rate": 7.984650590701197e-05, "loss": 3.2485, "step": 740 }, { "epoch": 2.116991643454039, "grad_norm": 1.2061805725063066, "learning_rate": 7.983748526489592e-05, "loss": 3.2598, "step": 760 }, { "epoch": 2.1727019498607243, "grad_norm": 1.272502438657944, "learning_rate": 7.98282076462168e-05, "loss": 3.2587, "step": 780 }, { "epoch": 2.2284122562674096, "grad_norm": 1.2869044872960342, "learning_rate": 7.981867311482816e-05, "loss": 3.2227, "step": 800 }, { "epoch": 2.284122562674095, "grad_norm": 1.3303768462152665, "learning_rate": 7.980888173635174e-05, "loss": 3.2648, "step": 820 }, { "epoch": 2.33983286908078, "grad_norm": 1.3112716268503581, "learning_rate": 7.979883357817706e-05, "loss": 3.2745, "step": 840 }, { "epoch": 2.3955431754874654, "grad_norm": 1.286190262319452, "learning_rate": 7.978852870946091e-05, "loss": 3.2425, "step": 860 }, { "epoch": 2.4512534818941503, "grad_norm": 1.220565008870163, "learning_rate": 7.977796720112692e-05, "loss": 3.2243, "step": 880 }, { "epoch": 2.5069637883008355, "grad_norm": 1.2416999049169055, "learning_rate": 7.976714912586503e-05, "loss": 3.2217, "step": 900 }, { "epoch": 2.562674094707521, "grad_norm": 1.649909126615191, "learning_rate": 7.975607455813105e-05, "loss": 3.2232, "step": 920 }, { "epoch": 2.618384401114206, "grad_norm": 1.2360938114511875, "learning_rate": 7.974474357414606e-05, "loss": 3.1888, "step": 940 }, { "epoch": 2.6740947075208914, "grad_norm": 1.3519864827851877, "learning_rate": 7.973315625189597e-05, "loss": 3.1782, "step": 960 }, { "epoch": 2.7298050139275767, "grad_norm": 1.2890437174386116, "learning_rate": 7.972131267113096e-05, "loss": 3.192, "step": 980 }, { "epoch": 2.785515320334262, "grad_norm": 1.2063188725654215, "learning_rate": 7.970921291336485e-05, "loss": 3.1869, "step": 1000 }, { "epoch": 2.841225626740947, "grad_norm": 1.2749039180973243, "learning_rate": 7.969685706187467e-05, "loss": 3.1663, "step": 1020 }, { "epoch": 2.896935933147632, "grad_norm": 1.1624439689926256, "learning_rate": 7.968424520170001e-05, "loss": 3.1558, "step": 1040 }, { "epoch": 2.9526462395543174, "grad_norm": 1.3645084603771966, "learning_rate": 7.967137741964243e-05, "loss": 3.2151, "step": 1060 }, { "epoch": 3.0083565459610027, "grad_norm": 1.3095011308845006, "learning_rate": 7.965825380426492e-05, "loss": 3.1241, "step": 1080 }, { "epoch": 3.064066852367688, "grad_norm": 1.3067653478209693, "learning_rate": 7.96448744458912e-05, "loss": 3.1433, "step": 1100 }, { "epoch": 3.1197771587743732, "grad_norm": 1.299162093757844, "learning_rate": 7.963123943660518e-05, "loss": 3.1515, "step": 1120 }, { "epoch": 3.1754874651810585, "grad_norm": 1.1814155508957564, "learning_rate": 7.961734887025032e-05, "loss": 3.1658, "step": 1140 }, { "epoch": 3.231197771587744, "grad_norm": 1.4814086313964214, "learning_rate": 7.96032028424289e-05, "loss": 3.1477, "step": 1160 }, { "epoch": 3.286908077994429, "grad_norm": 1.2408741761500102, "learning_rate": 7.958880145050149e-05, "loss": 3.1562, "step": 1180 }, { "epoch": 3.3426183844011144, "grad_norm": 1.240867953717542, "learning_rate": 7.957414479358615e-05, "loss": 3.128, "step": 1200 }, { "epoch": 3.3983286908077996, "grad_norm": 1.3259135820081311, "learning_rate": 7.955923297255786e-05, "loss": 3.1341, "step": 1220 }, { "epoch": 3.4540389972144845, "grad_norm": 1.24731715301278, "learning_rate": 7.954406609004775e-05, "loss": 3.1352, "step": 1240 }, { "epoch": 3.5097493036211698, "grad_norm": 1.4328700677390847, "learning_rate": 7.952864425044241e-05, "loss": 3.1776, "step": 1260 }, { "epoch": 3.565459610027855, "grad_norm": 1.2128896131543132, "learning_rate": 7.951296755988323e-05, "loss": 3.155, "step": 1280 }, { "epoch": 3.6211699164345403, "grad_norm": 1.2116171750344513, "learning_rate": 7.949703612626555e-05, "loss": 3.1577, "step": 1300 }, { "epoch": 3.6768802228412256, "grad_norm": 1.1238332423339203, "learning_rate": 7.948085005923804e-05, "loss": 3.1176, "step": 1320 }, { "epoch": 3.732590529247911, "grad_norm": 1.2861584858089754, "learning_rate": 7.94644094702019e-05, "loss": 3.1444, "step": 1340 }, { "epoch": 3.788300835654596, "grad_norm": 1.392865877163545, "learning_rate": 7.944771447231002e-05, "loss": 3.1275, "step": 1360 }, { "epoch": 3.8440111420612815, "grad_norm": 1.319680796984498, "learning_rate": 7.943076518046636e-05, "loss": 3.1178, "step": 1380 }, { "epoch": 3.8997214484679663, "grad_norm": 1.6517648664445108, "learning_rate": 7.9413561711325e-05, "loss": 3.113, "step": 1400 }, { "epoch": 3.9554317548746516, "grad_norm": 1.2610682032071576, "learning_rate": 7.939610418328943e-05, "loss": 3.1197, "step": 1420 }, { "epoch": 4.011142061281337, "grad_norm": 1.451143416466195, "learning_rate": 7.937839271651169e-05, "loss": 3.081, "step": 1440 }, { "epoch": 4.066852367688022, "grad_norm": 1.377524653184373, "learning_rate": 7.936042743289158e-05, "loss": 3.0716, "step": 1460 }, { "epoch": 4.1225626740947074, "grad_norm": 1.2134154671846935, "learning_rate": 7.934220845607582e-05, "loss": 3.0934, "step": 1480 }, { "epoch": 4.178272980501393, "grad_norm": 1.1951444826901634, "learning_rate": 7.932373591145714e-05, "loss": 3.0666, "step": 1500 }, { "epoch": 4.233983286908078, "grad_norm": 1.3540057345367065, "learning_rate": 7.93050099261735e-05, "loss": 3.1106, "step": 1520 }, { "epoch": 4.289693593314763, "grad_norm": 1.0915162692508482, "learning_rate": 7.928603062910715e-05, "loss": 3.0979, "step": 1540 }, { "epoch": 4.345403899721449, "grad_norm": 1.3408430692556432, "learning_rate": 7.926679815088376e-05, "loss": 3.0822, "step": 1560 }, { "epoch": 4.401114206128134, "grad_norm": 1.244967969301378, "learning_rate": 7.924731262387156e-05, "loss": 3.0636, "step": 1580 }, { "epoch": 4.456824512534819, "grad_norm": 1.2480585827601505, "learning_rate": 7.922757418218038e-05, "loss": 3.0699, "step": 1600 }, { "epoch": 4.512534818941504, "grad_norm": 1.5801893483515768, "learning_rate": 7.920758296166072e-05, "loss": 3.0814, "step": 1620 }, { "epoch": 4.56824512534819, "grad_norm": 1.2273710344040911, "learning_rate": 7.918733909990287e-05, "loss": 3.0844, "step": 1640 }, { "epoch": 4.623955431754875, "grad_norm": 1.2276563480543434, "learning_rate": 7.916684273623593e-05, "loss": 3.042, "step": 1660 }, { "epoch": 4.67966573816156, "grad_norm": 1.2696026342294924, "learning_rate": 7.914609401172687e-05, "loss": 3.0693, "step": 1680 }, { "epoch": 4.735376044568245, "grad_norm": 1.2199175574002186, "learning_rate": 7.912509306917949e-05, "loss": 3.0728, "step": 1700 }, { "epoch": 4.791086350974931, "grad_norm": 1.2583579762159638, "learning_rate": 7.910384005313353e-05, "loss": 3.0661, "step": 1720 }, { "epoch": 4.846796657381615, "grad_norm": 1.1722768148065734, "learning_rate": 7.908233510986363e-05, "loss": 3.0687, "step": 1740 }, { "epoch": 4.9025069637883005, "grad_norm": 1.3035051706656053, "learning_rate": 7.906057838737831e-05, "loss": 3.032, "step": 1760 }, { "epoch": 4.958217270194986, "grad_norm": 1.1974756008535596, "learning_rate": 7.903857003541898e-05, "loss": 3.0866, "step": 1780 }, { "epoch": 5.013927576601671, "grad_norm": 1.1642430602684481, "learning_rate": 7.901631020545893e-05, "loss": 3.0565, "step": 1800 }, { "epoch": 5.069637883008356, "grad_norm": 1.4004497345779519, "learning_rate": 7.899379905070219e-05, "loss": 3.0445, "step": 1820 }, { "epoch": 5.125348189415042, "grad_norm": 1.17997512711719, "learning_rate": 7.89710367260826e-05, "loss": 3.035, "step": 1840 }, { "epoch": 5.181058495821727, "grad_norm": 1.3826862632813843, "learning_rate": 7.894802338826267e-05, "loss": 3.0447, "step": 1860 }, { "epoch": 5.236768802228412, "grad_norm": 1.1816414577958267, "learning_rate": 7.89247591956325e-05, "loss": 3.0637, "step": 1880 }, { "epoch": 5.2924791086350975, "grad_norm": 1.1654353865755456, "learning_rate": 7.890124430830871e-05, "loss": 3.0468, "step": 1900 }, { "epoch": 5.348189415041783, "grad_norm": 1.165385013993255, "learning_rate": 7.887747888813336e-05, "loss": 3.0313, "step": 1920 }, { "epoch": 5.403899721448468, "grad_norm": 1.2162837114192384, "learning_rate": 7.88534630986728e-05, "loss": 3.0466, "step": 1940 }, { "epoch": 5.459610027855153, "grad_norm": 1.20720099919841, "learning_rate": 7.882919710521653e-05, "loss": 3.0551, "step": 1960 }, { "epoch": 5.515320334261839, "grad_norm": 1.177139436715742, "learning_rate": 7.880468107477611e-05, "loss": 3.0376, "step": 1980 }, { "epoch": 5.571030640668524, "grad_norm": 1.1524202569240398, "learning_rate": 7.8779915176084e-05, "loss": 3.0291, "step": 2000 }, { "epoch": 5.626740947075209, "grad_norm": 1.2146587030127356, "learning_rate": 7.875489957959237e-05, "loss": 3.0191, "step": 2020 }, { "epoch": 5.6824512534818945, "grad_norm": 1.128843488220497, "learning_rate": 7.872963445747195e-05, "loss": 3.0227, "step": 2040 }, { "epoch": 5.73816155988858, "grad_norm": 1.0994966342019519, "learning_rate": 7.870411998361084e-05, "loss": 3.02, "step": 2060 }, { "epoch": 5.793871866295264, "grad_norm": 1.2808892137589254, "learning_rate": 7.867835633361329e-05, "loss": 3.0469, "step": 2080 }, { "epoch": 5.84958217270195, "grad_norm": 1.5413808282647536, "learning_rate": 7.865234368479853e-05, "loss": 3.0436, "step": 2100 }, { "epoch": 5.905292479108635, "grad_norm": 1.2503860090167789, "learning_rate": 7.862608221619959e-05, "loss": 3.0106, "step": 2120 }, { "epoch": 5.96100278551532, "grad_norm": 1.125474961018013, "learning_rate": 7.859957210856188e-05, "loss": 3.0519, "step": 2140 }, { "epoch": 6.016713091922005, "grad_norm": 1.1689492826416197, "learning_rate": 7.857281354434221e-05, "loss": 2.9989, "step": 2160 }, { "epoch": 6.072423398328691, "grad_norm": 1.154286021730802, "learning_rate": 7.854580670770731e-05, "loss": 3.0334, "step": 2180 }, { "epoch": 6.128133704735376, "grad_norm": 1.4350398534810123, "learning_rate": 7.851855178453272e-05, "loss": 2.988, "step": 2200 }, { "epoch": 6.183844011142061, "grad_norm": 1.113383813885033, "learning_rate": 7.84910489624014e-05, "loss": 2.9763, "step": 2220 }, { "epoch": 6.2395543175487465, "grad_norm": 1.1728629584155144, "learning_rate": 7.846329843060248e-05, "loss": 3.0121, "step": 2240 }, { "epoch": 6.295264623955432, "grad_norm": 1.3311805526252927, "learning_rate": 7.843530038012998e-05, "loss": 3.0093, "step": 2260 }, { "epoch": 6.350974930362117, "grad_norm": 1.6172845191063454, "learning_rate": 7.840705500368151e-05, "loss": 3.006, "step": 2280 }, { "epoch": 6.406685236768802, "grad_norm": 1.11224616315367, "learning_rate": 7.837856249565682e-05, "loss": 3.0092, "step": 2300 }, { "epoch": 6.462395543175488, "grad_norm": 1.2756438414289364, "learning_rate": 7.834982305215663e-05, "loss": 2.992, "step": 2320 }, { "epoch": 6.518105849582173, "grad_norm": 1.2405302540136935, "learning_rate": 7.832083687098119e-05, "loss": 3.0005, "step": 2340 }, { "epoch": 6.573816155988858, "grad_norm": 1.16507749139941, "learning_rate": 7.829160415162888e-05, "loss": 2.9687, "step": 2360 }, { "epoch": 6.629526462395543, "grad_norm": 1.1740957839848314, "learning_rate": 7.826212509529497e-05, "loss": 2.99, "step": 2380 }, { "epoch": 6.685236768802229, "grad_norm": 1.1161100776511597, "learning_rate": 7.823239990487008e-05, "loss": 2.9827, "step": 2400 }, { "epoch": 6.740947075208914, "grad_norm": 1.157820224450206, "learning_rate": 7.820242878493888e-05, "loss": 2.9993, "step": 2420 }, { "epoch": 6.796657381615599, "grad_norm": 1.1204269029527796, "learning_rate": 7.817221194177869e-05, "loss": 2.9845, "step": 2440 }, { "epoch": 6.852367688022284, "grad_norm": 1.1227241115622848, "learning_rate": 7.814174958335797e-05, "loss": 3.0135, "step": 2460 }, { "epoch": 6.908077994428969, "grad_norm": 1.314572529939298, "learning_rate": 7.8111041919335e-05, "loss": 3.0121, "step": 2480 }, { "epoch": 6.963788300835654, "grad_norm": 1.3816466804127303, "learning_rate": 7.808008916105636e-05, "loss": 3.0031, "step": 2500 }, { "epoch": 7.0194986072423395, "grad_norm": 1.1289207328451878, "learning_rate": 7.804889152155548e-05, "loss": 2.9677, "step": 2520 }, { "epoch": 7.075208913649025, "grad_norm": 1.1378808677658065, "learning_rate": 7.801744921555127e-05, "loss": 2.9911, "step": 2540 }, { "epoch": 7.13091922005571, "grad_norm": 1.2254491174881055, "learning_rate": 7.798576245944647e-05, "loss": 2.9853, "step": 2560 }, { "epoch": 7.186629526462395, "grad_norm": 1.1570802675245002, "learning_rate": 7.795383147132631e-05, "loss": 2.9589, "step": 2580 }, { "epoch": 7.242339832869081, "grad_norm": 1.2894488302976834, "learning_rate": 7.792165647095696e-05, "loss": 2.9776, "step": 2600 }, { "epoch": 7.298050139275766, "grad_norm": 1.0528540016974788, "learning_rate": 7.788923767978396e-05, "loss": 2.96, "step": 2620 }, { "epoch": 7.353760445682451, "grad_norm": 1.2125786891416614, "learning_rate": 7.785657532093085e-05, "loss": 3.0041, "step": 2640 }, { "epoch": 7.4094707520891365, "grad_norm": 1.1920890603213412, "learning_rate": 7.78236696191974e-05, "loss": 2.9508, "step": 2660 }, { "epoch": 7.465181058495822, "grad_norm": 1.2174124036610061, "learning_rate": 7.779052080105831e-05, "loss": 2.9744, "step": 2680 }, { "epoch": 7.520891364902507, "grad_norm": 1.1450363781873376, "learning_rate": 7.77571290946615e-05, "loss": 2.9648, "step": 2700 }, { "epoch": 7.576601671309192, "grad_norm": 1.0906384870617993, "learning_rate": 7.772349472982652e-05, "loss": 2.9472, "step": 2720 }, { "epoch": 7.632311977715878, "grad_norm": 1.321934725618673, "learning_rate": 7.768961793804312e-05, "loss": 2.9812, "step": 2740 }, { "epoch": 7.688022284122563, "grad_norm": 1.3145051192938724, "learning_rate": 7.765549895246952e-05, "loss": 2.9936, "step": 2760 }, { "epoch": 7.743732590529248, "grad_norm": 1.1688841213500007, "learning_rate": 7.762113800793083e-05, "loss": 2.9673, "step": 2780 }, { "epoch": 7.7994428969359335, "grad_norm": 1.3151433447911725, "learning_rate": 7.758653534091746e-05, "loss": 2.9899, "step": 2800 }, { "epoch": 7.855153203342619, "grad_norm": 1.1890842453445192, "learning_rate": 7.75516911895835e-05, "loss": 2.9372, "step": 2820 }, { "epoch": 7.910863509749303, "grad_norm": 1.1869067775771354, "learning_rate": 7.751660579374505e-05, "loss": 2.9741, "step": 2840 }, { "epoch": 7.9665738161559885, "grad_norm": 1.2713091007536645, "learning_rate": 7.74812793948786e-05, "loss": 2.9583, "step": 2860 }, { "epoch": 8.022284122562674, "grad_norm": 1.2300818760281924, "learning_rate": 7.74457122361193e-05, "loss": 2.9214, "step": 2880 }, { "epoch": 8.07799442896936, "grad_norm": 1.1631255228333053, "learning_rate": 7.740990456225944e-05, "loss": 2.9644, "step": 2900 }, { "epoch": 8.133704735376044, "grad_norm": 1.2750643723536295, "learning_rate": 7.737385661974655e-05, "loss": 2.9401, "step": 2920 }, { "epoch": 8.18941504178273, "grad_norm": 1.1034800758000585, "learning_rate": 7.733756865668189e-05, "loss": 2.9726, "step": 2940 }, { "epoch": 8.245125348189415, "grad_norm": 1.1418415045379222, "learning_rate": 7.730104092281867e-05, "loss": 2.9504, "step": 2960 }, { "epoch": 8.300835654596101, "grad_norm": 1.1744854672216198, "learning_rate": 7.726427366956026e-05, "loss": 2.9361, "step": 2980 }, { "epoch": 8.356545961002785, "grad_norm": 1.3072658416879444, "learning_rate": 7.722726714995862e-05, "loss": 2.9589, "step": 3000 }, { "epoch": 8.412256267409472, "grad_norm": 1.441058710439587, "learning_rate": 7.719002161871242e-05, "loss": 2.9417, "step": 3020 }, { "epoch": 8.467966573816156, "grad_norm": 1.067582173805221, "learning_rate": 7.715253733216534e-05, "loss": 2.9067, "step": 3040 }, { "epoch": 8.52367688022284, "grad_norm": 1.1284427897815015, "learning_rate": 7.711481454830433e-05, "loss": 2.899, "step": 3060 }, { "epoch": 8.579387186629527, "grad_norm": 1.081964247959463, "learning_rate": 7.707685352675777e-05, "loss": 2.9379, "step": 3080 }, { "epoch": 8.635097493036211, "grad_norm": 1.1987325305883298, "learning_rate": 7.703865452879372e-05, "loss": 2.9327, "step": 3100 }, { "epoch": 8.690807799442897, "grad_norm": 1.285067490039356, "learning_rate": 7.700021781731815e-05, "loss": 2.9105, "step": 3120 }, { "epoch": 8.746518105849582, "grad_norm": 1.2495135447033165, "learning_rate": 7.696154365687308e-05, "loss": 2.9324, "step": 3140 }, { "epoch": 8.802228412256268, "grad_norm": 1.2973981114841804, "learning_rate": 7.69226323136348e-05, "loss": 2.9255, "step": 3160 }, { "epoch": 8.857938718662952, "grad_norm": 1.4170513439543635, "learning_rate": 7.6883484055412e-05, "loss": 2.9497, "step": 3180 }, { "epoch": 8.913649025069638, "grad_norm": 1.1690142712727585, "learning_rate": 7.684409915164392e-05, "loss": 2.923, "step": 3200 }, { "epoch": 8.969359331476323, "grad_norm": 1.1510038194930527, "learning_rate": 7.680447787339861e-05, "loss": 2.926, "step": 3220 }, { "epoch": 9.025069637883009, "grad_norm": 1.2532790582579474, "learning_rate": 7.676462049337088e-05, "loss": 2.9202, "step": 3240 }, { "epoch": 9.080779944289693, "grad_norm": 1.1281216086435024, "learning_rate": 7.672452728588057e-05, "loss": 2.962, "step": 3260 }, { "epoch": 9.13649025069638, "grad_norm": 1.1715186252733858, "learning_rate": 7.668419852687062e-05, "loss": 2.9135, "step": 3280 }, { "epoch": 9.192200557103064, "grad_norm": 1.1013980953917584, "learning_rate": 7.664363449390508e-05, "loss": 2.9017, "step": 3300 }, { "epoch": 9.24791086350975, "grad_norm": 1.2577466487795455, "learning_rate": 7.660283546616741e-05, "loss": 2.9397, "step": 3320 }, { "epoch": 9.303621169916434, "grad_norm": 1.2666590722136728, "learning_rate": 7.656180172445832e-05, "loss": 2.9291, "step": 3340 }, { "epoch": 9.35933147632312, "grad_norm": 1.0677411210264725, "learning_rate": 7.6520533551194e-05, "loss": 2.8936, "step": 3360 }, { "epoch": 9.415041782729805, "grad_norm": 1.2801627452281688, "learning_rate": 7.647903123040411e-05, "loss": 2.9053, "step": 3380 }, { "epoch": 9.47075208913649, "grad_norm": 1.4058461947277687, "learning_rate": 7.643729504772985e-05, "loss": 2.9267, "step": 3400 }, { "epoch": 9.526462395543176, "grad_norm": 1.0476841151648881, "learning_rate": 7.639532529042196e-05, "loss": 2.9067, "step": 3420 }, { "epoch": 9.58217270194986, "grad_norm": 1.1192055741834313, "learning_rate": 7.635312224733879e-05, "loss": 2.9217, "step": 3440 }, { "epoch": 9.637883008356546, "grad_norm": 1.0978122554766025, "learning_rate": 7.631068620894427e-05, "loss": 2.9008, "step": 3460 }, { "epoch": 9.69359331476323, "grad_norm": 1.1633036806799766, "learning_rate": 7.626801746730594e-05, "loss": 2.9058, "step": 3480 }, { "epoch": 9.749303621169917, "grad_norm": 1.09083755501122, "learning_rate": 7.622511631609293e-05, "loss": 2.9128, "step": 3500 }, { "epoch": 9.805013927576601, "grad_norm": 1.0388545560459703, "learning_rate": 7.618198305057391e-05, "loss": 2.9161, "step": 3520 }, { "epoch": 9.860724233983287, "grad_norm": 1.1015281389024363, "learning_rate": 7.613861796761513e-05, "loss": 2.901, "step": 3540 }, { "epoch": 9.916434540389972, "grad_norm": 1.0877360587312859, "learning_rate": 7.609502136567829e-05, "loss": 2.9284, "step": 3560 }, { "epoch": 9.972144846796658, "grad_norm": 1.0365425361156384, "learning_rate": 7.605119354481855e-05, "loss": 2.902, "step": 3580 }, { "epoch": 10.027855153203342, "grad_norm": 1.2163943871082232, "learning_rate": 7.600713480668244e-05, "loss": 2.8877, "step": 3600 }, { "epoch": 10.083565459610028, "grad_norm": 1.3467097823122347, "learning_rate": 7.596284545450579e-05, "loss": 2.902, "step": 3620 }, { "epoch": 10.139275766016713, "grad_norm": 1.1614707108221107, "learning_rate": 7.591832579311162e-05, "loss": 2.8924, "step": 3640 }, { "epoch": 10.194986072423399, "grad_norm": 1.154263677555927, "learning_rate": 7.587357612890807e-05, "loss": 2.8906, "step": 3660 }, { "epoch": 10.250696378830083, "grad_norm": 1.2048421202419115, "learning_rate": 7.582859676988631e-05, "loss": 2.91, "step": 3680 }, { "epoch": 10.30640668523677, "grad_norm": 1.0867049785413572, "learning_rate": 7.578338802561835e-05, "loss": 2.9205, "step": 3700 }, { "epoch": 10.362116991643454, "grad_norm": 1.2226191180056192, "learning_rate": 7.573795020725498e-05, "loss": 2.891, "step": 3720 }, { "epoch": 10.41782729805014, "grad_norm": 1.0288993145273457, "learning_rate": 7.569228362752359e-05, "loss": 2.8813, "step": 3740 }, { "epoch": 10.473537604456824, "grad_norm": 1.322014772637416, "learning_rate": 7.564638860072602e-05, "loss": 2.8942, "step": 3760 }, { "epoch": 10.52924791086351, "grad_norm": 1.7224079895109572, "learning_rate": 7.560026544273644e-05, "loss": 2.89, "step": 3780 }, { "epoch": 10.584958217270195, "grad_norm": 1.06553603535958, "learning_rate": 7.555391447099909e-05, "loss": 2.8933, "step": 3800 }, { "epoch": 10.64066852367688, "grad_norm": 1.6005573677632197, "learning_rate": 7.550733600452618e-05, "loss": 2.8778, "step": 3820 }, { "epoch": 10.696378830083566, "grad_norm": 1.183378791005643, "learning_rate": 7.546053036389568e-05, "loss": 2.8785, "step": 3840 }, { "epoch": 10.75208913649025, "grad_norm": 1.0673503444473083, "learning_rate": 7.541349787124903e-05, "loss": 2.8656, "step": 3860 }, { "epoch": 10.807799442896936, "grad_norm": 1.1161036292131443, "learning_rate": 7.536623885028903e-05, "loss": 2.8949, "step": 3880 }, { "epoch": 10.86350974930362, "grad_norm": 1.1044441414641109, "learning_rate": 7.53187536262776e-05, "loss": 2.8852, "step": 3900 }, { "epoch": 10.919220055710307, "grad_norm": 1.1221763618181442, "learning_rate": 7.527104252603341e-05, "loss": 2.8687, "step": 3920 }, { "epoch": 10.974930362116991, "grad_norm": 1.0239803282921194, "learning_rate": 7.522310587792984e-05, "loss": 2.8738, "step": 3940 }, { "epoch": 11.030640668523677, "grad_norm": 1.0403254568733065, "learning_rate": 7.517494401189256e-05, "loss": 2.8654, "step": 3960 }, { "epoch": 11.086350974930362, "grad_norm": 1.3774438500736799, "learning_rate": 7.512655725939733e-05, "loss": 2.8514, "step": 3980 }, { "epoch": 11.142061281337048, "grad_norm": 1.0311405284757509, "learning_rate": 7.507794595346767e-05, "loss": 2.8698, "step": 4000 }, { "epoch": 11.197771587743732, "grad_norm": 1.0136275961779155, "learning_rate": 7.502911042867261e-05, "loss": 2.8141, "step": 4020 }, { "epoch": 11.253481894150418, "grad_norm": 1.2412045833199774, "learning_rate": 7.498005102112435e-05, "loss": 2.894, "step": 4040 }, { "epoch": 11.309192200557103, "grad_norm": 1.070331458091606, "learning_rate": 7.493076806847605e-05, "loss": 2.8753, "step": 4060 }, { "epoch": 11.364902506963789, "grad_norm": 1.1760961554541343, "learning_rate": 7.488126190991936e-05, "loss": 2.8722, "step": 4080 }, { "epoch": 11.420612813370473, "grad_norm": 1.1616540292064435, "learning_rate": 7.483153288618215e-05, "loss": 2.8909, "step": 4100 }, { "epoch": 11.47632311977716, "grad_norm": 1.0331404489020481, "learning_rate": 7.478158133952619e-05, "loss": 2.853, "step": 4120 }, { "epoch": 11.532033426183844, "grad_norm": 1.254677923406745, "learning_rate": 7.473140761374479e-05, "loss": 2.8674, "step": 4140 }, { "epoch": 11.587743732590528, "grad_norm": 1.4723808775722143, "learning_rate": 7.468101205416035e-05, "loss": 2.8738, "step": 4160 }, { "epoch": 11.643454038997215, "grad_norm": 1.2019875338287322, "learning_rate": 7.463039500762213e-05, "loss": 2.8878, "step": 4180 }, { "epoch": 11.699164345403899, "grad_norm": 1.2787778339880642, "learning_rate": 7.457955682250372e-05, "loss": 2.8797, "step": 4200 }, { "epoch": 11.754874651810585, "grad_norm": 1.054866858779706, "learning_rate": 7.452849784870072e-05, "loss": 2.8617, "step": 4220 }, { "epoch": 11.81058495821727, "grad_norm": 1.1047418002149563, "learning_rate": 7.447721843762836e-05, "loss": 2.8519, "step": 4240 }, { "epoch": 11.866295264623956, "grad_norm": 1.3479580629482741, "learning_rate": 7.442571894221898e-05, "loss": 2.8764, "step": 4260 }, { "epoch": 11.92200557103064, "grad_norm": 1.0996191502382984, "learning_rate": 7.437399971691968e-05, "loss": 2.8742, "step": 4280 }, { "epoch": 11.977715877437326, "grad_norm": 1.1829758402369137, "learning_rate": 7.432206111768985e-05, "loss": 2.8795, "step": 4300 }, { "epoch": 12.03342618384401, "grad_norm": 1.2260323749324813, "learning_rate": 7.426990350199874e-05, "loss": 2.8393, "step": 4320 }, { "epoch": 12.089136490250697, "grad_norm": 1.0838502363024154, "learning_rate": 7.421752722882299e-05, "loss": 2.8434, "step": 4340 }, { "epoch": 12.144846796657381, "grad_norm": 1.3481847345508688, "learning_rate": 7.416493265864415e-05, "loss": 2.8609, "step": 4360 }, { "epoch": 12.200557103064067, "grad_norm": 1.0824899172058005, "learning_rate": 7.411212015344622e-05, "loss": 2.8521, "step": 4380 }, { "epoch": 12.256267409470752, "grad_norm": 1.2218416915168235, "learning_rate": 7.40590900767131e-05, "loss": 2.8913, "step": 4400 }, { "epoch": 12.311977715877438, "grad_norm": 1.1304024863305357, "learning_rate": 7.400584279342621e-05, "loss": 2.8493, "step": 4420 }, { "epoch": 12.367688022284122, "grad_norm": 1.0421994761055569, "learning_rate": 7.395237867006185e-05, "loss": 2.8292, "step": 4440 }, { "epoch": 12.423398328690809, "grad_norm": 1.265322995175785, "learning_rate": 7.389869807458872e-05, "loss": 2.8576, "step": 4460 }, { "epoch": 12.479108635097493, "grad_norm": 1.2148541733313956, "learning_rate": 7.384480137646545e-05, "loss": 2.8684, "step": 4480 }, { "epoch": 12.534818941504179, "grad_norm": 1.0816243901094347, "learning_rate": 7.379068894663795e-05, "loss": 2.8608, "step": 4500 }, { "epoch": 12.590529247910863, "grad_norm": 1.0319313826952614, "learning_rate": 7.373636115753691e-05, "loss": 2.8381, "step": 4520 }, { "epoch": 12.64623955431755, "grad_norm": 1.362502756526822, "learning_rate": 7.368181838307531e-05, "loss": 2.8361, "step": 4540 }, { "epoch": 12.701949860724234, "grad_norm": 1.1691483178804676, "learning_rate": 7.36270609986457e-05, "loss": 2.8476, "step": 4560 }, { "epoch": 12.757660167130918, "grad_norm": 1.099737609334872, "learning_rate": 7.357208938111772e-05, "loss": 2.8317, "step": 4580 }, { "epoch": 12.813370473537605, "grad_norm": 1.2465031640095632, "learning_rate": 7.351690390883547e-05, "loss": 2.8607, "step": 4600 }, { "epoch": 12.869080779944289, "grad_norm": 1.0626741966239892, "learning_rate": 7.346150496161489e-05, "loss": 2.8482, "step": 4620 }, { "epoch": 12.924791086350975, "grad_norm": 1.4215858090007158, "learning_rate": 7.340589292074123e-05, "loss": 2.828, "step": 4640 }, { "epoch": 12.98050139275766, "grad_norm": 1.189746289044543, "learning_rate": 7.33500681689663e-05, "loss": 2.8392, "step": 4660 }, { "epoch": 13.036211699164346, "grad_norm": 1.1816710302425453, "learning_rate": 7.329403109050598e-05, "loss": 2.8439, "step": 4680 }, { "epoch": 13.09192200557103, "grad_norm": 1.1422328461301028, "learning_rate": 7.323778207103738e-05, "loss": 2.8458, "step": 4700 }, { "epoch": 13.147632311977716, "grad_norm": 1.286190919467895, "learning_rate": 7.318132149769639e-05, "loss": 2.8373, "step": 4720 }, { "epoch": 13.2033426183844, "grad_norm": 1.1535174701098847, "learning_rate": 7.312464975907494e-05, "loss": 2.8287, "step": 4740 }, { "epoch": 13.259052924791087, "grad_norm": 1.098410005946985, "learning_rate": 7.306776724521822e-05, "loss": 2.8347, "step": 4760 }, { "epoch": 13.314763231197771, "grad_norm": 1.0730663678590038, "learning_rate": 7.301067434762217e-05, "loss": 2.8022, "step": 4780 }, { "epoch": 13.370473537604457, "grad_norm": 1.2860001127878453, "learning_rate": 7.295337145923068e-05, "loss": 2.8209, "step": 4800 }, { "epoch": 13.426183844011142, "grad_norm": 1.1788256509864643, "learning_rate": 7.28958589744329e-05, "loss": 2.8202, "step": 4820 }, { "epoch": 13.481894150417828, "grad_norm": 1.2476156265733942, "learning_rate": 7.283813728906054e-05, "loss": 2.8301, "step": 4840 }, { "epoch": 13.537604456824512, "grad_norm": 1.1806963623362805, "learning_rate": 7.278020680038514e-05, "loss": 2.8325, "step": 4860 }, { "epoch": 13.593314763231199, "grad_norm": 1.312577644195395, "learning_rate": 7.272206790711534e-05, "loss": 2.8268, "step": 4880 }, { "epoch": 13.649025069637883, "grad_norm": 1.2945260257216111, "learning_rate": 7.266372100939415e-05, "loss": 2.8474, "step": 4900 }, { "epoch": 13.704735376044567, "grad_norm": 1.1825196002989207, "learning_rate": 7.26051665087961e-05, "loss": 2.8245, "step": 4920 }, { "epoch": 13.760445682451254, "grad_norm": 1.1409651239961929, "learning_rate": 7.254640480832468e-05, "loss": 2.8342, "step": 4940 }, { "epoch": 13.816155988857938, "grad_norm": 1.1047252543056303, "learning_rate": 7.248743631240934e-05, "loss": 2.8504, "step": 4960 }, { "epoch": 13.871866295264624, "grad_norm": 1.1114472045482278, "learning_rate": 7.242826142690284e-05, "loss": 2.8238, "step": 4980 }, { "epoch": 13.927576601671309, "grad_norm": 1.0521613836121042, "learning_rate": 7.236888055907841e-05, "loss": 2.8524, "step": 5000 }, { "epoch": 13.983286908077995, "grad_norm": 1.1464930432732499, "learning_rate": 7.230929411762698e-05, "loss": 2.8309, "step": 5020 }, { "epoch": 14.038997214484679, "grad_norm": 1.2249548851439938, "learning_rate": 7.224950251265438e-05, "loss": 2.8166, "step": 5040 }, { "epoch": 14.094707520891365, "grad_norm": 1.1496013157017706, "learning_rate": 7.218950615567839e-05, "loss": 2.8176, "step": 5060 }, { "epoch": 14.15041782729805, "grad_norm": 1.2406955049253514, "learning_rate": 7.212930545962609e-05, "loss": 2.8452, "step": 5080 }, { "epoch": 14.206128133704736, "grad_norm": 1.0793023104931123, "learning_rate": 7.206890083883089e-05, "loss": 2.7934, "step": 5100 }, { "epoch": 14.26183844011142, "grad_norm": 1.0555332439739342, "learning_rate": 7.200829270902974e-05, "loss": 2.7967, "step": 5120 }, { "epoch": 14.317548746518106, "grad_norm": 1.088008092694001, "learning_rate": 7.194748148736022e-05, "loss": 2.8118, "step": 5140 }, { "epoch": 14.37325905292479, "grad_norm": 1.1664773713662093, "learning_rate": 7.18864675923577e-05, "loss": 2.8322, "step": 5160 }, { "epoch": 14.428969359331477, "grad_norm": 1.180830286270773, "learning_rate": 7.182525144395254e-05, "loss": 2.7889, "step": 5180 }, { "epoch": 14.484679665738161, "grad_norm": 1.289182787809941, "learning_rate": 7.176383346346697e-05, "loss": 2.8145, "step": 5200 }, { "epoch": 14.540389972144848, "grad_norm": 1.3156558914225063, "learning_rate": 7.170221407361246e-05, "loss": 2.8057, "step": 5220 }, { "epoch": 14.596100278551532, "grad_norm": 1.311742340109156, "learning_rate": 7.164039369848662e-05, "loss": 2.7996, "step": 5240 }, { "epoch": 14.651810584958218, "grad_norm": 1.1167094205760877, "learning_rate": 7.157837276357038e-05, "loss": 2.8106, "step": 5260 }, { "epoch": 14.707520891364902, "grad_norm": 1.0697751811552614, "learning_rate": 7.151615169572499e-05, "loss": 2.8089, "step": 5280 }, { "epoch": 14.763231197771589, "grad_norm": 1.3217920715681262, "learning_rate": 7.145373092318921e-05, "loss": 2.8295, "step": 5300 }, { "epoch": 14.818941504178273, "grad_norm": 1.2030049127163769, "learning_rate": 7.139111087557614e-05, "loss": 2.8208, "step": 5320 }, { "epoch": 14.874651810584957, "grad_norm": 1.1560181548327002, "learning_rate": 7.132829198387052e-05, "loss": 2.7894, "step": 5340 }, { "epoch": 14.930362116991644, "grad_norm": 1.3677024675114284, "learning_rate": 7.12652746804256e-05, "loss": 2.8208, "step": 5360 }, { "epoch": 14.986072423398328, "grad_norm": 1.1758085824901958, "learning_rate": 7.120205939896016e-05, "loss": 2.7816, "step": 5380 }, { "epoch": 15.041782729805014, "grad_norm": 1.236149394894879, "learning_rate": 7.113864657455565e-05, "loss": 2.8242, "step": 5400 }, { "epoch": 15.097493036211699, "grad_norm": 1.2013719116832535, "learning_rate": 7.107503664365306e-05, "loss": 2.8048, "step": 5420 }, { "epoch": 15.153203342618385, "grad_norm": 1.1890270902537186, "learning_rate": 7.101123004404999e-05, "loss": 2.7988, "step": 5440 }, { "epoch": 15.20891364902507, "grad_norm": 0.9947321024811, "learning_rate": 7.094722721489762e-05, "loss": 2.8023, "step": 5460 }, { "epoch": 15.264623955431755, "grad_norm": 1.097217550580232, "learning_rate": 7.088302859669767e-05, "loss": 2.7876, "step": 5480 }, { "epoch": 15.32033426183844, "grad_norm": 1.1297335196918232, "learning_rate": 7.081863463129943e-05, "loss": 2.81, "step": 5500 }, { "epoch": 15.376044568245126, "grad_norm": 1.1252485351464636, "learning_rate": 7.075404576189664e-05, "loss": 2.8104, "step": 5520 }, { "epoch": 15.43175487465181, "grad_norm": 1.0952308860470021, "learning_rate": 7.068926243302446e-05, "loss": 2.8134, "step": 5540 }, { "epoch": 15.487465181058496, "grad_norm": 1.1033772218566738, "learning_rate": 7.062428509055645e-05, "loss": 2.7919, "step": 5560 }, { "epoch": 15.54317548746518, "grad_norm": 1.1682663344591195, "learning_rate": 7.055911418170146e-05, "loss": 2.8255, "step": 5580 }, { "epoch": 15.598885793871867, "grad_norm": 1.131084807002329, "learning_rate": 7.049375015500061e-05, "loss": 2.7911, "step": 5600 }, { "epoch": 15.654596100278551, "grad_norm": 1.200817519578834, "learning_rate": 7.042819346032408e-05, "loss": 2.8178, "step": 5620 }, { "epoch": 15.710306406685238, "grad_norm": 1.14175820047776, "learning_rate": 7.036244454886818e-05, "loss": 2.7656, "step": 5640 }, { "epoch": 15.766016713091922, "grad_norm": 1.1227562173996573, "learning_rate": 7.029650387315208e-05, "loss": 2.8176, "step": 5660 }, { "epoch": 15.821727019498606, "grad_norm": 1.1575039434693588, "learning_rate": 7.023037188701485e-05, "loss": 2.7942, "step": 5680 }, { "epoch": 15.877437325905293, "grad_norm": 1.229577946005525, "learning_rate": 7.01640490456122e-05, "loss": 2.786, "step": 5700 }, { "epoch": 15.933147632311977, "grad_norm": 1.2021995784653015, "learning_rate": 7.009753580541344e-05, "loss": 2.7857, "step": 5720 }, { "epoch": 15.988857938718663, "grad_norm": 1.1325819395096217, "learning_rate": 7.003083262419829e-05, "loss": 2.7999, "step": 5740 }, { "epoch": 16.044568245125348, "grad_norm": 1.1155399032222173, "learning_rate": 6.996393996105378e-05, "loss": 2.7835, "step": 5760 }, { "epoch": 16.100278551532032, "grad_norm": 1.152416883414678, "learning_rate": 6.989685827637099e-05, "loss": 2.7879, "step": 5780 }, { "epoch": 16.15598885793872, "grad_norm": 1.0579264010465572, "learning_rate": 6.982958803184201e-05, "loss": 2.7968, "step": 5800 }, { "epoch": 16.211699164345404, "grad_norm": 1.2482769970776515, "learning_rate": 6.976212969045668e-05, "loss": 2.7628, "step": 5820 }, { "epoch": 16.26740947075209, "grad_norm": 1.1876572666161167, "learning_rate": 6.969448371649945e-05, "loss": 2.7645, "step": 5840 }, { "epoch": 16.323119777158773, "grad_norm": 1.0502937038876212, "learning_rate": 6.962665057554606e-05, "loss": 2.7836, "step": 5860 }, { "epoch": 16.37883008356546, "grad_norm": 1.1579505600465934, "learning_rate": 6.955863073446054e-05, "loss": 2.8117, "step": 5880 }, { "epoch": 16.434540389972145, "grad_norm": 1.1026773625827397, "learning_rate": 6.949042466139187e-05, "loss": 2.7684, "step": 5900 }, { "epoch": 16.49025069637883, "grad_norm": 1.080576186890865, "learning_rate": 6.942203282577072e-05, "loss": 2.8201, "step": 5920 }, { "epoch": 16.545961002785514, "grad_norm": 1.1201896108002356, "learning_rate": 6.935345569830636e-05, "loss": 2.7998, "step": 5940 }, { "epoch": 16.601671309192202, "grad_norm": 1.0674451923986512, "learning_rate": 6.928469375098327e-05, "loss": 2.7513, "step": 5960 }, { "epoch": 16.657381615598887, "grad_norm": 1.393151947824109, "learning_rate": 6.921574745705798e-05, "loss": 2.7765, "step": 5980 }, { "epoch": 16.71309192200557, "grad_norm": 1.0998714589532412, "learning_rate": 6.91466172910558e-05, "loss": 2.7645, "step": 6000 }, { "epoch": 16.768802228412255, "grad_norm": 1.161893735809738, "learning_rate": 6.907730372876756e-05, "loss": 2.7775, "step": 6020 }, { "epoch": 16.824512534818943, "grad_norm": 1.1249294683555988, "learning_rate": 6.90078072472463e-05, "loss": 2.7751, "step": 6040 }, { "epoch": 16.880222841225628, "grad_norm": 1.0278563705072414, "learning_rate": 6.8938128324804e-05, "loss": 2.7886, "step": 6060 }, { "epoch": 16.935933147632312, "grad_norm": 1.061083533262939, "learning_rate": 6.886826744100831e-05, "loss": 2.7706, "step": 6080 }, { "epoch": 16.991643454038996, "grad_norm": 1.1962487610461527, "learning_rate": 6.879822507667925e-05, "loss": 2.778, "step": 6100 }, { "epoch": 17.04735376044568, "grad_norm": 1.084120872740396, "learning_rate": 6.872800171388584e-05, "loss": 2.758, "step": 6120 }, { "epoch": 17.10306406685237, "grad_norm": 1.2302739964645966, "learning_rate": 6.865759783594288e-05, "loss": 2.7437, "step": 6140 }, { "epoch": 17.158774373259053, "grad_norm": 1.2503733070591003, "learning_rate": 6.858701392740755e-05, "loss": 2.7828, "step": 6160 }, { "epoch": 17.214484679665738, "grad_norm": 1.0453068724351287, "learning_rate": 6.85162504740761e-05, "loss": 2.7804, "step": 6180 }, { "epoch": 17.270194986072422, "grad_norm": 1.102788495625949, "learning_rate": 6.844530796298049e-05, "loss": 2.7794, "step": 6200 }, { "epoch": 17.32590529247911, "grad_norm": 1.1692774985464567, "learning_rate": 6.837418688238506e-05, "loss": 2.7432, "step": 6220 }, { "epoch": 17.381615598885794, "grad_norm": 1.1331591899737494, "learning_rate": 6.830288772178319e-05, "loss": 2.7716, "step": 6240 }, { "epoch": 17.43732590529248, "grad_norm": 1.2148411223909634, "learning_rate": 6.823141097189384e-05, "loss": 2.7696, "step": 6260 }, { "epoch": 17.493036211699163, "grad_norm": 1.393055281753607, "learning_rate": 6.815975712465829e-05, "loss": 2.7415, "step": 6280 }, { "epoch": 17.54874651810585, "grad_norm": 1.3474136405073431, "learning_rate": 6.808792667323665e-05, "loss": 2.781, "step": 6300 }, { "epoch": 17.604456824512535, "grad_norm": 1.036698414343895, "learning_rate": 6.80159201120046e-05, "loss": 2.7695, "step": 6320 }, { "epoch": 17.66016713091922, "grad_norm": 1.1451609472767672, "learning_rate": 6.79437379365498e-05, "loss": 2.7744, "step": 6340 }, { "epoch": 17.715877437325904, "grad_norm": 1.0591481814927388, "learning_rate": 6.787138064366862e-05, "loss": 2.7892, "step": 6360 }, { "epoch": 17.771587743732592, "grad_norm": 1.1623299583698332, "learning_rate": 6.779884873136271e-05, "loss": 2.7675, "step": 6380 }, { "epoch": 17.827298050139277, "grad_norm": 1.0147233449592903, "learning_rate": 6.772614269883552e-05, "loss": 2.7427, "step": 6400 }, { "epoch": 17.88300835654596, "grad_norm": 1.1368649796660046, "learning_rate": 6.765326304648889e-05, "loss": 2.7683, "step": 6420 }, { "epoch": 17.938718662952645, "grad_norm": 1.1490699285660757, "learning_rate": 6.758021027591959e-05, "loss": 2.7886, "step": 6440 }, { "epoch": 17.99442896935933, "grad_norm": 0.9785051726498183, "learning_rate": 6.75069848899159e-05, "loss": 2.7515, "step": 6460 }, { "epoch": 18.050139275766018, "grad_norm": 1.2031239642894627, "learning_rate": 6.743358739245416e-05, "loss": 2.7646, "step": 6480 }, { "epoch": 18.105849582172702, "grad_norm": 1.0544436866470264, "learning_rate": 6.736001828869522e-05, "loss": 2.7755, "step": 6500 }, { "epoch": 18.161559888579387, "grad_norm": 1.2123837569916929, "learning_rate": 6.728627808498102e-05, "loss": 2.726, "step": 6520 }, { "epoch": 18.21727019498607, "grad_norm": 1.0826550987274146, "learning_rate": 6.721236728883116e-05, "loss": 2.7447, "step": 6540 }, { "epoch": 18.27298050139276, "grad_norm": 1.0759411138187283, "learning_rate": 6.71382864089393e-05, "loss": 2.7457, "step": 6560 }, { "epoch": 18.328690807799443, "grad_norm": 1.097324968536256, "learning_rate": 6.706403595516969e-05, "loss": 2.7833, "step": 6580 }, { "epoch": 18.384401114206128, "grad_norm": 1.057266074357713, "learning_rate": 6.69896164385537e-05, "loss": 2.7441, "step": 6600 }, { "epoch": 18.440111420612812, "grad_norm": 1.1747200892285241, "learning_rate": 6.691502837128632e-05, "loss": 2.7255, "step": 6620 }, { "epoch": 18.4958217270195, "grad_norm": 0.9921075391724958, "learning_rate": 6.684027226672256e-05, "loss": 2.749, "step": 6640 }, { "epoch": 18.551532033426184, "grad_norm": 1.1919910460153382, "learning_rate": 6.676534863937394e-05, "loss": 2.7244, "step": 6660 }, { "epoch": 18.60724233983287, "grad_norm": 0.9700416046708717, "learning_rate": 6.669025800490496e-05, "loss": 2.7578, "step": 6680 }, { "epoch": 18.662952646239553, "grad_norm": 1.7798390717297705, "learning_rate": 6.66150008801296e-05, "loss": 2.7497, "step": 6700 }, { "epoch": 18.71866295264624, "grad_norm": 1.3075654856552488, "learning_rate": 6.653957778300764e-05, "loss": 2.7627, "step": 6720 }, { "epoch": 18.774373259052926, "grad_norm": 1.109643118621012, "learning_rate": 6.646398923264127e-05, "loss": 2.7451, "step": 6740 }, { "epoch": 18.83008356545961, "grad_norm": 1.0116510422849132, "learning_rate": 6.638823574927133e-05, "loss": 2.7904, "step": 6760 }, { "epoch": 18.885793871866294, "grad_norm": 1.154541906977704, "learning_rate": 6.631231785427385e-05, "loss": 2.7375, "step": 6780 }, { "epoch": 18.94150417827298, "grad_norm": 0.9834780422588326, "learning_rate": 6.623623607015642e-05, "loss": 2.7324, "step": 6800 }, { "epoch": 18.997214484679667, "grad_norm": 1.0967675089569602, "learning_rate": 6.615999092055462e-05, "loss": 2.7377, "step": 6820 }, { "epoch": 19.05292479108635, "grad_norm": 1.033720234986334, "learning_rate": 6.608358293022839e-05, "loss": 2.7455, "step": 6840 }, { "epoch": 19.108635097493035, "grad_norm": 1.4074135325652137, "learning_rate": 6.600701262505844e-05, "loss": 2.7175, "step": 6860 }, { "epoch": 19.16434540389972, "grad_norm": 1.2217671501431764, "learning_rate": 6.593028053204258e-05, "loss": 2.7459, "step": 6880 }, { "epoch": 19.220055710306408, "grad_norm": 1.157829388585607, "learning_rate": 6.585338717929218e-05, "loss": 2.7437, "step": 6900 }, { "epoch": 19.275766016713092, "grad_norm": 1.0614455080094343, "learning_rate": 6.577633309602842e-05, "loss": 2.7703, "step": 6920 }, { "epoch": 19.331476323119777, "grad_norm": 1.1146275976097708, "learning_rate": 6.569911881257878e-05, "loss": 2.7435, "step": 6940 }, { "epoch": 19.38718662952646, "grad_norm": 1.0865432136684192, "learning_rate": 6.56217448603733e-05, "loss": 2.732, "step": 6960 }, { "epoch": 19.44289693593315, "grad_norm": 1.1353040144610025, "learning_rate": 6.554421177194095e-05, "loss": 2.7285, "step": 6980 }, { "epoch": 19.498607242339833, "grad_norm": 1.0801239077744584, "learning_rate": 6.546652008090591e-05, "loss": 2.7449, "step": 7000 }, { "epoch": 19.554317548746518, "grad_norm": 1.3301081392287104, "learning_rate": 6.538867032198405e-05, "loss": 2.758, "step": 7020 }, { "epoch": 19.610027855153202, "grad_norm": 1.378308356595679, "learning_rate": 6.531066303097907e-05, "loss": 2.7296, "step": 7040 }, { "epoch": 19.66573816155989, "grad_norm": 1.0272775678623267, "learning_rate": 6.523249874477889e-05, "loss": 2.7366, "step": 7060 }, { "epoch": 19.721448467966574, "grad_norm": 1.3280539231741249, "learning_rate": 6.515417800135199e-05, "loss": 2.7206, "step": 7080 }, { "epoch": 19.77715877437326, "grad_norm": 1.2190042029662624, "learning_rate": 6.507570133974366e-05, "loss": 2.7413, "step": 7100 }, { "epoch": 19.832869080779943, "grad_norm": 1.0253976846781938, "learning_rate": 6.499706930007227e-05, "loss": 2.7194, "step": 7120 }, { "epoch": 19.88857938718663, "grad_norm": 1.0998801088450254, "learning_rate": 6.491828242352565e-05, "loss": 2.7299, "step": 7140 }, { "epoch": 19.944289693593316, "grad_norm": 1.2547720080479265, "learning_rate": 6.483934125235726e-05, "loss": 2.6907, "step": 7160 }, { "epoch": 20.0, "grad_norm": 1.1249680628198624, "learning_rate": 6.47602463298825e-05, "loss": 2.7427, "step": 7180 }, { "epoch": 20.055710306406684, "grad_norm": 1.438020280162261, "learning_rate": 6.468099820047495e-05, "loss": 2.7324, "step": 7200 }, { "epoch": 20.11142061281337, "grad_norm": 1.2513883791091014, "learning_rate": 6.46015974095627e-05, "loss": 2.7433, "step": 7220 }, { "epoch": 20.167130919220057, "grad_norm": 1.1732322456488424, "learning_rate": 6.452204450362446e-05, "loss": 2.7287, "step": 7240 }, { "epoch": 20.22284122562674, "grad_norm": 1.099413306167727, "learning_rate": 6.444234003018595e-05, "loss": 2.7166, "step": 7260 }, { "epoch": 20.278551532033426, "grad_norm": 1.1933888070643845, "learning_rate": 6.436248453781604e-05, "loss": 2.7084, "step": 7280 }, { "epoch": 20.33426183844011, "grad_norm": 1.1295980078738417, "learning_rate": 6.428247857612295e-05, "loss": 2.7101, "step": 7300 }, { "epoch": 20.389972144846798, "grad_norm": 1.1784232492865596, "learning_rate": 6.420232269575055e-05, "loss": 2.7238, "step": 7320 }, { "epoch": 20.445682451253482, "grad_norm": 1.600869680659427, "learning_rate": 6.412201744837451e-05, "loss": 2.7048, "step": 7340 }, { "epoch": 20.501392757660167, "grad_norm": 1.15499859140262, "learning_rate": 6.404156338669859e-05, "loss": 2.6977, "step": 7360 }, { "epoch": 20.55710306406685, "grad_norm": 1.1378208760575172, "learning_rate": 6.396096106445064e-05, "loss": 2.7181, "step": 7380 }, { "epoch": 20.61281337047354, "grad_norm": 1.123871481777876, "learning_rate": 6.388021103637904e-05, "loss": 2.7155, "step": 7400 }, { "epoch": 20.668523676880223, "grad_norm": 1.0841648486082098, "learning_rate": 6.37993138582487e-05, "loss": 2.7354, "step": 7420 }, { "epoch": 20.724233983286908, "grad_norm": 1.1664188982324037, "learning_rate": 6.371827008683732e-05, "loss": 2.7238, "step": 7440 }, { "epoch": 20.779944289693592, "grad_norm": 1.104589633589391, "learning_rate": 6.363708027993152e-05, "loss": 2.6975, "step": 7460 }, { "epoch": 20.83565459610028, "grad_norm": 1.421916066233274, "learning_rate": 6.355574499632301e-05, "loss": 2.7423, "step": 7480 }, { "epoch": 20.891364902506965, "grad_norm": 1.081254160033794, "learning_rate": 6.347426479580477e-05, "loss": 2.725, "step": 7500 }, { "epoch": 20.94707520891365, "grad_norm": 1.1098202109359585, "learning_rate": 6.339264023916715e-05, "loss": 2.7272, "step": 7520 }, { "epoch": 21.002785515320333, "grad_norm": 1.0514058782353737, "learning_rate": 6.331087188819405e-05, "loss": 2.739, "step": 7540 }, { "epoch": 21.058495821727018, "grad_norm": 1.1184046430910828, "learning_rate": 6.322896030565905e-05, "loss": 2.703, "step": 7560 }, { "epoch": 21.114206128133706, "grad_norm": 1.1842229516973584, "learning_rate": 6.31469060553215e-05, "loss": 2.7151, "step": 7580 }, { "epoch": 21.16991643454039, "grad_norm": 1.0858640050274855, "learning_rate": 6.30647097019227e-05, "loss": 2.7033, "step": 7600 }, { "epoch": 21.225626740947074, "grad_norm": 1.2333467648992407, "learning_rate": 6.298237181118193e-05, "loss": 2.6952, "step": 7620 }, { "epoch": 21.28133704735376, "grad_norm": 1.3879484824315327, "learning_rate": 6.289989294979264e-05, "loss": 2.7024, "step": 7640 }, { "epoch": 21.337047353760447, "grad_norm": 1.2106236210149603, "learning_rate": 6.281727368541853e-05, "loss": 2.7047, "step": 7660 }, { "epoch": 21.39275766016713, "grad_norm": 1.0115849386843587, "learning_rate": 6.273451458668961e-05, "loss": 2.7075, "step": 7680 }, { "epoch": 21.448467966573816, "grad_norm": 1.051437728204155, "learning_rate": 6.265161622319829e-05, "loss": 2.7247, "step": 7700 }, { "epoch": 21.5041782729805, "grad_norm": 1.1019904640067848, "learning_rate": 6.256857916549548e-05, "loss": 2.691, "step": 7720 }, { "epoch": 21.559888579387188, "grad_norm": 1.1014027785637281, "learning_rate": 6.248540398508673e-05, "loss": 2.6992, "step": 7740 }, { "epoch": 21.615598885793872, "grad_norm": 1.0988814252962915, "learning_rate": 6.240209125442806e-05, "loss": 2.714, "step": 7760 }, { "epoch": 21.671309192200557, "grad_norm": 1.2025654774415926, "learning_rate": 6.231864154692237e-05, "loss": 2.7042, "step": 7780 }, { "epoch": 21.72701949860724, "grad_norm": 1.1583280781305814, "learning_rate": 6.223505543691518e-05, "loss": 2.7081, "step": 7800 }, { "epoch": 21.78272980501393, "grad_norm": 1.1105375147778487, "learning_rate": 6.215133349969086e-05, "loss": 2.6869, "step": 7820 }, { "epoch": 21.838440111420613, "grad_norm": 1.1146300728660752, "learning_rate": 6.206747631146862e-05, "loss": 2.6988, "step": 7840 }, { "epoch": 21.894150417827298, "grad_norm": 1.0660501800448277, "learning_rate": 6.198348444939849e-05, "loss": 2.6491, "step": 7860 }, { "epoch": 21.949860724233982, "grad_norm": 1.1013215569007113, "learning_rate": 6.189935849155747e-05, "loss": 2.7103, "step": 7880 }, { "epoch": 22.00557103064067, "grad_norm": 1.1802406207769038, "learning_rate": 6.18150990169454e-05, "loss": 2.7193, "step": 7900 }, { "epoch": 22.061281337047355, "grad_norm": 1.1108973776747364, "learning_rate": 6.173070660548112e-05, "loss": 2.6831, "step": 7920 }, { "epoch": 22.11699164345404, "grad_norm": 1.092182682917437, "learning_rate": 6.16461818379984e-05, "loss": 2.6557, "step": 7940 }, { "epoch": 22.172701949860723, "grad_norm": 1.145560140100733, "learning_rate": 6.156152529624193e-05, "loss": 2.6672, "step": 7960 }, { "epoch": 22.228412256267408, "grad_norm": 1.2287470982936533, "learning_rate": 6.147673756286334e-05, "loss": 2.7312, "step": 7980 }, { "epoch": 22.284122562674096, "grad_norm": 1.0846368953094574, "learning_rate": 6.139181922141721e-05, "loss": 2.7017, "step": 8000 }, { "epoch": 22.33983286908078, "grad_norm": 1.1767612465335586, "learning_rate": 6.130677085635704e-05, "loss": 2.7118, "step": 8020 }, { "epoch": 22.395543175487465, "grad_norm": 1.1460490982941247, "learning_rate": 6.12215930530312e-05, "loss": 2.6819, "step": 8040 }, { "epoch": 22.45125348189415, "grad_norm": 1.1479085095640083, "learning_rate": 6.113628639767893e-05, "loss": 2.6877, "step": 8060 }, { "epoch": 22.506963788300837, "grad_norm": 1.1375246029213462, "learning_rate": 6.105085147742632e-05, "loss": 2.6925, "step": 8080 }, { "epoch": 22.56267409470752, "grad_norm": 1.1123231981092023, "learning_rate": 6.0965288880282214e-05, "loss": 2.6822, "step": 8100 }, { "epoch": 22.618384401114206, "grad_norm": 1.2752545626840799, "learning_rate": 6.087959919513422e-05, "loss": 2.7205, "step": 8120 }, { "epoch": 22.67409470752089, "grad_norm": 1.2262605320485462, "learning_rate": 6.079378301174464e-05, "loss": 2.6924, "step": 8140 }, { "epoch": 22.729805013927578, "grad_norm": 1.0730006469584497, "learning_rate": 6.0707840920746374e-05, "loss": 2.7124, "step": 8160 }, { "epoch": 22.785515320334262, "grad_norm": 1.3257454551313221, "learning_rate": 6.0621773513638905e-05, "loss": 2.6762, "step": 8180 }, { "epoch": 22.841225626740947, "grad_norm": 1.2930384450871677, "learning_rate": 6.0535581382784216e-05, "loss": 2.6623, "step": 8200 }, { "epoch": 22.89693593314763, "grad_norm": 1.139885910912536, "learning_rate": 6.0449265121402686e-05, "loss": 2.6867, "step": 8220 }, { "epoch": 22.95264623955432, "grad_norm": 1.125710423580146, "learning_rate": 6.036282532356904e-05, "loss": 2.6742, "step": 8240 }, { "epoch": 23.008356545961004, "grad_norm": 1.131377041116149, "learning_rate": 6.027626258420825e-05, "loss": 2.7031, "step": 8260 }, { "epoch": 23.064066852367688, "grad_norm": 1.0869417103268262, "learning_rate": 6.0189577499091424e-05, "loss": 2.6683, "step": 8280 }, { "epoch": 23.119777158774372, "grad_norm": 1.1391157676417962, "learning_rate": 6.010277066483174e-05, "loss": 2.707, "step": 8300 }, { "epoch": 23.175487465181057, "grad_norm": 1.2945583722046563, "learning_rate": 6.001584267888028e-05, "loss": 2.6522, "step": 8320 }, { "epoch": 23.231197771587745, "grad_norm": 1.1565608753713996, "learning_rate": 5.9928794139522025e-05, "loss": 2.6717, "step": 8340 }, { "epoch": 23.28690807799443, "grad_norm": 1.2394122917368424, "learning_rate": 5.9841625645871575e-05, "loss": 2.7024, "step": 8360 }, { "epoch": 23.342618384401113, "grad_norm": 1.0903861377505122, "learning_rate": 5.975433779786921e-05, "loss": 2.6455, "step": 8380 }, { "epoch": 23.398328690807798, "grad_norm": 1.0661921555569798, "learning_rate": 5.966693119627662e-05, "loss": 2.6706, "step": 8400 }, { "epoch": 23.454038997214486, "grad_norm": 1.0655312130218653, "learning_rate": 5.957940644267282e-05, "loss": 2.6906, "step": 8420 }, { "epoch": 23.50974930362117, "grad_norm": 1.1134166788328776, "learning_rate": 5.949176413945003e-05, "loss": 2.6432, "step": 8440 }, { "epoch": 23.565459610027855, "grad_norm": 1.650527244066039, "learning_rate": 5.94040048898095e-05, "loss": 2.6937, "step": 8460 }, { "epoch": 23.62116991643454, "grad_norm": 1.1235056862121802, "learning_rate": 5.931612929775738e-05, "loss": 2.6705, "step": 8480 }, { "epoch": 23.676880222841227, "grad_norm": 1.0368298826041793, "learning_rate": 5.922813796810054e-05, "loss": 2.6724, "step": 8500 }, { "epoch": 23.73259052924791, "grad_norm": 1.215524406135999, "learning_rate": 5.914003150644242e-05, "loss": 2.6768, "step": 8520 }, { "epoch": 23.788300835654596, "grad_norm": 1.2276511679169064, "learning_rate": 5.905181051917883e-05, "loss": 2.7046, "step": 8540 }, { "epoch": 23.84401114206128, "grad_norm": 1.0144214891191534, "learning_rate": 5.896347561349387e-05, "loss": 2.652, "step": 8560 }, { "epoch": 23.899721448467968, "grad_norm": 1.206243520777862, "learning_rate": 5.887502739735565e-05, "loss": 2.6965, "step": 8580 }, { "epoch": 23.955431754874652, "grad_norm": 1.0056037720791713, "learning_rate": 5.878646647951213e-05, "loss": 2.6475, "step": 8600 }, { "epoch": 24.011142061281337, "grad_norm": 1.0641520059001517, "learning_rate": 5.8697793469486964e-05, "loss": 2.6991, "step": 8620 }, { "epoch": 24.06685236768802, "grad_norm": 1.2174985172315418, "learning_rate": 5.860900897757528e-05, "loss": 2.6711, "step": 8640 }, { "epoch": 24.12256267409471, "grad_norm": 1.0515823218930573, "learning_rate": 5.852011361483949e-05, "loss": 2.6625, "step": 8660 }, { "epoch": 24.178272980501394, "grad_norm": 1.3132128877951654, "learning_rate": 5.8431107993105076e-05, "loss": 2.6604, "step": 8680 }, { "epoch": 24.233983286908078, "grad_norm": 1.0603177321270034, "learning_rate": 5.834199272495636e-05, "loss": 2.6663, "step": 8700 }, { "epoch": 24.289693593314762, "grad_norm": 1.262687868955379, "learning_rate": 5.8252768423732364e-05, "loss": 2.6708, "step": 8720 }, { "epoch": 24.345403899721447, "grad_norm": 1.0873862910591228, "learning_rate": 5.816343570352244e-05, "loss": 2.6367, "step": 8740 }, { "epoch": 24.401114206128135, "grad_norm": 1.2746436419501377, "learning_rate": 5.8073995179162254e-05, "loss": 2.7081, "step": 8760 }, { "epoch": 24.45682451253482, "grad_norm": 1.092203478224612, "learning_rate": 5.798444746622934e-05, "loss": 2.6693, "step": 8780 }, { "epoch": 24.512534818941504, "grad_norm": 1.1045845154960057, "learning_rate": 5.7894793181039e-05, "loss": 2.6981, "step": 8800 }, { "epoch": 24.568245125348188, "grad_norm": 1.139876543282688, "learning_rate": 5.780503294064005e-05, "loss": 2.6539, "step": 8820 }, { "epoch": 24.623955431754876, "grad_norm": 1.1892780737352568, "learning_rate": 5.771516736281051e-05, "loss": 2.6676, "step": 8840 }, { "epoch": 24.67966573816156, "grad_norm": 1.124479629411898, "learning_rate": 5.7625197066053374e-05, "loss": 2.6712, "step": 8860 }, { "epoch": 24.735376044568245, "grad_norm": 1.078433196751875, "learning_rate": 5.753512266959242e-05, "loss": 2.6658, "step": 8880 }, { "epoch": 24.79108635097493, "grad_norm": 1.0663315697490754, "learning_rate": 5.744494479336786e-05, "loss": 2.6488, "step": 8900 }, { "epoch": 24.846796657381617, "grad_norm": 1.1044363572012328, "learning_rate": 5.735466405803211e-05, "loss": 2.6905, "step": 8920 }, { "epoch": 24.9025069637883, "grad_norm": 1.0926049616035345, "learning_rate": 5.7264281084945534e-05, "loss": 2.6744, "step": 8940 }, { "epoch": 24.958217270194986, "grad_norm": 1.0597627637210976, "learning_rate": 5.717379649617212e-05, "loss": 2.6501, "step": 8960 }, { "epoch": 25.01392757660167, "grad_norm": 0.9918782369429666, "learning_rate": 5.70832109144753e-05, "loss": 2.6394, "step": 8980 }, { "epoch": 25.069637883008358, "grad_norm": 1.1550215185121195, "learning_rate": 5.6992524963313494e-05, "loss": 2.6491, "step": 9000 }, { "epoch": 25.125348189415043, "grad_norm": 1.2681147669552022, "learning_rate": 5.6901739266835976e-05, "loss": 2.6637, "step": 9020 }, { "epoch": 25.181058495821727, "grad_norm": 1.1669892724232989, "learning_rate": 5.681085444987855e-05, "loss": 2.6595, "step": 9040 }, { "epoch": 25.23676880222841, "grad_norm": 1.118544311154742, "learning_rate": 5.6719871137959136e-05, "loss": 2.6602, "step": 9060 }, { "epoch": 25.2924791086351, "grad_norm": 1.1886485765626151, "learning_rate": 5.6628789957273634e-05, "loss": 2.6209, "step": 9080 }, { "epoch": 25.348189415041784, "grad_norm": 1.170833233617083, "learning_rate": 5.653761153469147e-05, "loss": 2.6986, "step": 9100 }, { "epoch": 25.403899721448468, "grad_norm": 1.248618232385229, "learning_rate": 5.644633649775136e-05, "loss": 2.6686, "step": 9120 }, { "epoch": 25.459610027855152, "grad_norm": 1.1165667219670397, "learning_rate": 5.6354965474657e-05, "loss": 2.6708, "step": 9140 }, { "epoch": 25.515320334261837, "grad_norm": 1.1574567530077358, "learning_rate": 5.626349909427265e-05, "loss": 2.6521, "step": 9160 }, { "epoch": 25.571030640668525, "grad_norm": 1.0835835883220495, "learning_rate": 5.617193798611895e-05, "loss": 2.6581, "step": 9180 }, { "epoch": 25.62674094707521, "grad_norm": 1.144587311838764, "learning_rate": 5.6080282780368435e-05, "loss": 2.6602, "step": 9200 }, { "epoch": 25.682451253481894, "grad_norm": 1.112741624680344, "learning_rate": 5.598853410784133e-05, "loss": 2.6598, "step": 9220 }, { "epoch": 25.738161559888578, "grad_norm": 1.168044637764499, "learning_rate": 5.589669260000109e-05, "loss": 2.6645, "step": 9240 }, { "epoch": 25.793871866295266, "grad_norm": 1.120774568517253, "learning_rate": 5.580475888895015e-05, "loss": 2.6602, "step": 9260 }, { "epoch": 25.84958217270195, "grad_norm": 1.2114817500368666, "learning_rate": 5.571273360742552e-05, "loss": 2.6328, "step": 9280 }, { "epoch": 25.905292479108635, "grad_norm": 1.2356256005701014, "learning_rate": 5.5620617388794466e-05, "loss": 2.6384, "step": 9300 }, { "epoch": 25.96100278551532, "grad_norm": 1.2235414625020526, "learning_rate": 5.552841086705014e-05, "loss": 2.6681, "step": 9320 }, { "epoch": 26.016713091922007, "grad_norm": 1.2819636542569275, "learning_rate": 5.5436114676807156e-05, "loss": 2.6561, "step": 9340 }, { "epoch": 26.07242339832869, "grad_norm": 1.2032746744995522, "learning_rate": 5.534372945329733e-05, "loss": 2.6384, "step": 9360 }, { "epoch": 26.128133704735376, "grad_norm": 1.1226061787109827, "learning_rate": 5.525125583236522e-05, "loss": 2.6294, "step": 9380 }, { "epoch": 26.18384401114206, "grad_norm": 1.021042107378643, "learning_rate": 5.515869445046379e-05, "loss": 2.6588, "step": 9400 }, { "epoch": 26.23955431754875, "grad_norm": 1.291042490575208, "learning_rate": 5.506604594465004e-05, "loss": 2.6264, "step": 9420 }, { "epoch": 26.295264623955433, "grad_norm": 1.1032208560398462, "learning_rate": 5.4973310952580576e-05, "loss": 2.6169, "step": 9440 }, { "epoch": 26.350974930362117, "grad_norm": 1.2761063536817212, "learning_rate": 5.488049011250727e-05, "loss": 2.6506, "step": 9460 }, { "epoch": 26.4066852367688, "grad_norm": 1.1511514672695646, "learning_rate": 5.478758406327282e-05, "loss": 2.6698, "step": 9480 }, { "epoch": 26.462395543175486, "grad_norm": 1.0992455713094436, "learning_rate": 5.469459344430642e-05, "loss": 2.6097, "step": 9500 }, { "epoch": 26.518105849582174, "grad_norm": 1.172205701527116, "learning_rate": 5.4601518895619284e-05, "loss": 2.6293, "step": 9520 }, { "epoch": 26.573816155988858, "grad_norm": 1.2031374723785744, "learning_rate": 5.4508361057800276e-05, "loss": 2.6199, "step": 9540 }, { "epoch": 26.629526462395543, "grad_norm": 1.0768039312990048, "learning_rate": 5.441512057201152e-05, "loss": 2.6497, "step": 9560 }, { "epoch": 26.685236768802227, "grad_norm": 1.182782939690952, "learning_rate": 5.432179807998395e-05, "loss": 2.6439, "step": 9580 }, { "epoch": 26.740947075208915, "grad_norm": 1.1202931202000697, "learning_rate": 5.422839422401295e-05, "loss": 2.622, "step": 9600 }, { "epoch": 26.7966573816156, "grad_norm": 1.291606489378618, "learning_rate": 5.413490964695381e-05, "loss": 2.6146, "step": 9620 }, { "epoch": 26.852367688022284, "grad_norm": 1.1538604314310363, "learning_rate": 5.404134499221748e-05, "loss": 2.6338, "step": 9640 }, { "epoch": 26.908077994428968, "grad_norm": 1.6695600582971142, "learning_rate": 5.3947700903765986e-05, "loss": 2.6499, "step": 9660 }, { "epoch": 26.963788300835656, "grad_norm": 1.245827651961817, "learning_rate": 5.3853978026108086e-05, "loss": 2.6421, "step": 9680 }, { "epoch": 27.01949860724234, "grad_norm": 1.173940924924453, "learning_rate": 5.37601770042948e-05, "loss": 2.6403, "step": 9700 }, { "epoch": 27.075208913649025, "grad_norm": 1.0519719376965715, "learning_rate": 5.3666298483914984e-05, "loss": 2.6203, "step": 9720 }, { "epoch": 27.13091922005571, "grad_norm": 1.133593745206024, "learning_rate": 5.357234311109086e-05, "loss": 2.6574, "step": 9740 }, { "epoch": 27.186629526462397, "grad_norm": 1.2653772970355646, "learning_rate": 5.347831153247361e-05, "loss": 2.6414, "step": 9760 }, { "epoch": 27.24233983286908, "grad_norm": 1.109574259928109, "learning_rate": 5.338420439523891e-05, "loss": 2.6147, "step": 9780 }, { "epoch": 27.298050139275766, "grad_norm": 1.2411069069646816, "learning_rate": 5.329002234708245e-05, "loss": 2.608, "step": 9800 }, { "epoch": 27.35376044568245, "grad_norm": 1.1182716681713758, "learning_rate": 5.319576603621553e-05, "loss": 2.6413, "step": 9820 }, { "epoch": 27.409470752089135, "grad_norm": 1.2003086684148825, "learning_rate": 5.3101436111360504e-05, "loss": 2.6275, "step": 9840 }, { "epoch": 27.465181058495823, "grad_norm": 1.0717097302386294, "learning_rate": 5.300703322174646e-05, "loss": 2.6328, "step": 9860 }, { "epoch": 27.520891364902507, "grad_norm": 1.4444979446572614, "learning_rate": 5.29125580171046e-05, "loss": 2.6201, "step": 9880 }, { "epoch": 27.57660167130919, "grad_norm": 1.0746363462650246, "learning_rate": 5.281801114766385e-05, "loss": 2.6123, "step": 9900 }, { "epoch": 27.632311977715876, "grad_norm": 1.0661192079701574, "learning_rate": 5.272339326414642e-05, "loss": 2.5964, "step": 9920 }, { "epoch": 27.688022284122564, "grad_norm": 1.1015082831508238, "learning_rate": 5.262870501776321e-05, "loss": 2.5953, "step": 9940 }, { "epoch": 27.74373259052925, "grad_norm": 1.1382427443811807, "learning_rate": 5.253394706020944e-05, "loss": 2.6181, "step": 9960 }, { "epoch": 27.799442896935933, "grad_norm": 1.0943370244357078, "learning_rate": 5.243912004366008e-05, "loss": 2.6116, "step": 9980 }, { "epoch": 27.855153203342617, "grad_norm": 1.0772056082741257, "learning_rate": 5.234422462076547e-05, "loss": 2.5998, "step": 10000 }, { "epoch": 27.910863509749305, "grad_norm": 1.0546759553565561, "learning_rate": 5.2249261444646674e-05, "loss": 2.5937, "step": 10020 }, { "epoch": 27.96657381615599, "grad_norm": 1.176478667272748, "learning_rate": 5.2154231168891134e-05, "loss": 2.6093, "step": 10040 }, { "epoch": 28.022284122562674, "grad_norm": 1.1321140906627838, "learning_rate": 5.2059134447548076e-05, "loss": 2.6229, "step": 10060 }, { "epoch": 28.077994428969358, "grad_norm": 1.065907796206171, "learning_rate": 5.196397193512405e-05, "loss": 2.6205, "step": 10080 }, { "epoch": 28.133704735376046, "grad_norm": 1.233651032400036, "learning_rate": 5.1868744286578406e-05, "loss": 2.5931, "step": 10100 }, { "epoch": 28.18941504178273, "grad_norm": 1.0599465164229271, "learning_rate": 5.177345215731881e-05, "loss": 2.6147, "step": 10120 }, { "epoch": 28.245125348189415, "grad_norm": 1.1538820509055618, "learning_rate": 5.167809620319672e-05, "loss": 2.6167, "step": 10140 }, { "epoch": 28.3008356545961, "grad_norm": 1.154317454601146, "learning_rate": 5.158267708050286e-05, "loss": 2.5937, "step": 10160 }, { "epoch": 28.356545961002787, "grad_norm": 1.0665328766519204, "learning_rate": 5.1487195445962715e-05, "loss": 2.604, "step": 10180 }, { "epoch": 28.41225626740947, "grad_norm": 1.1694960107465548, "learning_rate": 5.139165195673201e-05, "loss": 2.5995, "step": 10200 }, { "epoch": 28.467966573816156, "grad_norm": 1.0526752329813267, "learning_rate": 5.1296047270392175e-05, "loss": 2.6209, "step": 10220 }, { "epoch": 28.52367688022284, "grad_norm": 1.1414513270535445, "learning_rate": 5.120038204494588e-05, "loss": 2.5929, "step": 10240 }, { "epoch": 28.579387186629525, "grad_norm": 1.0966114959097728, "learning_rate": 5.1104656938812394e-05, "loss": 2.5924, "step": 10260 }, { "epoch": 28.635097493036213, "grad_norm": 1.1174765080610298, "learning_rate": 5.1008872610823155e-05, "loss": 2.6202, "step": 10280 }, { "epoch": 28.690807799442897, "grad_norm": 1.1145726415269017, "learning_rate": 5.091302972021719e-05, "loss": 2.5968, "step": 10300 }, { "epoch": 28.74651810584958, "grad_norm": 1.0586956135779206, "learning_rate": 5.08171289266366e-05, "loss": 2.6123, "step": 10320 }, { "epoch": 28.802228412256266, "grad_norm": 1.3940508320205856, "learning_rate": 5.072117089012195e-05, "loss": 2.597, "step": 10340 }, { "epoch": 28.857938718662954, "grad_norm": 1.0350493924845274, "learning_rate": 5.062515627110785e-05, "loss": 2.6207, "step": 10360 }, { "epoch": 28.91364902506964, "grad_norm": 1.2315404210100842, "learning_rate": 5.0529085730418306e-05, "loss": 2.6179, "step": 10380 }, { "epoch": 28.969359331476323, "grad_norm": 1.1045820694597503, "learning_rate": 5.0432959929262205e-05, "loss": 2.6008, "step": 10400 }, { "epoch": 29.025069637883007, "grad_norm": 1.146579867692895, "learning_rate": 5.03367795292288e-05, "loss": 2.6202, "step": 10420 }, { "epoch": 29.080779944289695, "grad_norm": 1.2733330944071208, "learning_rate": 5.0240545192283056e-05, "loss": 2.6123, "step": 10440 }, { "epoch": 29.13649025069638, "grad_norm": 1.0494794859758667, "learning_rate": 5.0144257580761224e-05, "loss": 2.5829, "step": 10460 }, { "epoch": 29.192200557103064, "grad_norm": 1.1190765883905507, "learning_rate": 5.0047917357366194e-05, "loss": 2.6223, "step": 10480 }, { "epoch": 29.24791086350975, "grad_norm": 1.5745640876405715, "learning_rate": 4.995152518516296e-05, "loss": 2.6133, "step": 10500 }, { "epoch": 29.303621169916436, "grad_norm": 1.1765754248618923, "learning_rate": 4.9855081727574066e-05, "loss": 2.6047, "step": 10520 }, { "epoch": 29.35933147632312, "grad_norm": 1.498763487641365, "learning_rate": 4.975858764837501e-05, "loss": 2.5656, "step": 10540 }, { "epoch": 29.415041782729805, "grad_norm": 1.1132252850299105, "learning_rate": 4.966204361168971e-05, "loss": 2.5914, "step": 10560 }, { "epoch": 29.47075208913649, "grad_norm": 1.1686019155914007, "learning_rate": 4.956545028198591e-05, "loss": 2.5874, "step": 10580 }, { "epoch": 29.526462395543177, "grad_norm": 1.1003078614700978, "learning_rate": 4.946880832407062e-05, "loss": 2.6143, "step": 10600 }, { "epoch": 29.58217270194986, "grad_norm": 1.0784823374299444, "learning_rate": 4.937211840308553e-05, "loss": 2.6153, "step": 10620 }, { "epoch": 29.637883008356546, "grad_norm": 1.2454601562477818, "learning_rate": 4.927538118450244e-05, "loss": 2.5872, "step": 10640 }, { "epoch": 29.69359331476323, "grad_norm": 1.2988040165269858, "learning_rate": 4.917859733411869e-05, "loss": 2.603, "step": 10660 }, { "epoch": 29.749303621169915, "grad_norm": 1.1745460752963417, "learning_rate": 4.908176751805253e-05, "loss": 2.5681, "step": 10680 }, { "epoch": 29.805013927576603, "grad_norm": 1.3297312991458439, "learning_rate": 4.898489240273864e-05, "loss": 2.6095, "step": 10700 }, { "epoch": 29.860724233983287, "grad_norm": 1.1339679906193685, "learning_rate": 4.888797265492338e-05, "loss": 2.6067, "step": 10720 }, { "epoch": 29.91643454038997, "grad_norm": 1.1806251948614392, "learning_rate": 4.879100894166038e-05, "loss": 2.5967, "step": 10740 }, { "epoch": 29.972144846796656, "grad_norm": 1.165860935100357, "learning_rate": 4.8694001930305794e-05, "loss": 2.5785, "step": 10760 }, { "epoch": 30.027855153203344, "grad_norm": 1.3776136936276104, "learning_rate": 4.859695228851381e-05, "loss": 2.5897, "step": 10780 }, { "epoch": 30.08356545961003, "grad_norm": 1.2587071827316874, "learning_rate": 4.8499860684232066e-05, "loss": 2.5797, "step": 10800 }, { "epoch": 30.139275766016713, "grad_norm": 1.1970210105742216, "learning_rate": 4.84027277856969e-05, "loss": 2.5672, "step": 10820 }, { "epoch": 30.194986072423397, "grad_norm": 1.3454554228099718, "learning_rate": 4.830555426142899e-05, "loss": 2.5934, "step": 10840 }, { "epoch": 30.250696378830085, "grad_norm": 1.264747247911362, "learning_rate": 4.8208340780228475e-05, "loss": 2.5894, "step": 10860 }, { "epoch": 30.30640668523677, "grad_norm": 1.05872829319226, "learning_rate": 4.811108801117065e-05, "loss": 2.5867, "step": 10880 }, { "epoch": 30.362116991643454, "grad_norm": 1.084586036460413, "learning_rate": 4.80137966236011e-05, "loss": 2.5901, "step": 10900 }, { "epoch": 30.41782729805014, "grad_norm": 1.229834581800877, "learning_rate": 4.7916467287131244e-05, "loss": 2.5604, "step": 10920 }, { "epoch": 30.473537604456826, "grad_norm": 1.4451033607435961, "learning_rate": 4.7819100671633706e-05, "loss": 2.597, "step": 10940 }, { "epoch": 30.52924791086351, "grad_norm": 1.2548193540172925, "learning_rate": 4.772169744723762e-05, "loss": 2.5529, "step": 10960 }, { "epoch": 30.584958217270195, "grad_norm": 1.5052254853750195, "learning_rate": 4.762425828432416e-05, "loss": 2.6054, "step": 10980 }, { "epoch": 30.64066852367688, "grad_norm": 1.1519592174538793, "learning_rate": 4.7526783853521796e-05, "loss": 2.5836, "step": 11000 }, { "epoch": 30.696378830083564, "grad_norm": 1.1375488161597391, "learning_rate": 4.742927482570176e-05, "loss": 2.5621, "step": 11020 }, { "epoch": 30.75208913649025, "grad_norm": 1.0552495588001027, "learning_rate": 4.733173187197335e-05, "loss": 2.5886, "step": 11040 }, { "epoch": 30.807799442896936, "grad_norm": 1.2026052803495149, "learning_rate": 4.723415566367945e-05, "loss": 2.576, "step": 11060 }, { "epoch": 30.86350974930362, "grad_norm": 1.2031132145653618, "learning_rate": 4.713654687239171e-05, "loss": 2.5871, "step": 11080 }, { "epoch": 30.919220055710305, "grad_norm": 1.152263728928741, "learning_rate": 4.703890616990612e-05, "loss": 2.586, "step": 11100 }, { "epoch": 30.974930362116993, "grad_norm": 1.3435496121402817, "learning_rate": 4.6941234228238256e-05, "loss": 2.5813, "step": 11120 }, { "epoch": 31.030640668523677, "grad_norm": 1.1328862858818538, "learning_rate": 4.684353171961873e-05, "loss": 2.5917, "step": 11140 }, { "epoch": 31.08635097493036, "grad_norm": 1.2484233364341746, "learning_rate": 4.674579931648851e-05, "loss": 2.5619, "step": 11160 }, { "epoch": 31.142061281337046, "grad_norm": 1.3584774397600772, "learning_rate": 4.664803769149427e-05, "loss": 2.5569, "step": 11180 }, { "epoch": 31.197771587743734, "grad_norm": 1.1992013763670537, "learning_rate": 4.6550247517483926e-05, "loss": 2.5468, "step": 11200 }, { "epoch": 31.25348189415042, "grad_norm": 1.1878315761868057, "learning_rate": 4.645242946750176e-05, "loss": 2.5693, "step": 11220 }, { "epoch": 31.309192200557103, "grad_norm": 1.2856028595876672, "learning_rate": 4.635458421478398e-05, "loss": 2.5959, "step": 11240 }, { "epoch": 31.364902506963787, "grad_norm": 1.0789628687819908, "learning_rate": 4.6256712432754e-05, "loss": 2.5813, "step": 11260 }, { "epoch": 31.420612813370475, "grad_norm": 1.3393871601470777, "learning_rate": 4.615881479501779e-05, "loss": 2.5487, "step": 11280 }, { "epoch": 31.47632311977716, "grad_norm": 1.4170061638611984, "learning_rate": 4.606089197535936e-05, "loss": 2.5672, "step": 11300 }, { "epoch": 31.532033426183844, "grad_norm": 1.200423025841862, "learning_rate": 4.5962944647735934e-05, "loss": 2.5586, "step": 11320 }, { "epoch": 31.58774373259053, "grad_norm": 1.1437564881838662, "learning_rate": 4.586497348627349e-05, "loss": 2.5968, "step": 11340 }, { "epoch": 31.643454038997213, "grad_norm": 1.08141997515126, "learning_rate": 4.576697916526199e-05, "loss": 2.5688, "step": 11360 }, { "epoch": 31.6991643454039, "grad_norm": 1.2066477041237875, "learning_rate": 4.5668962359150815e-05, "loss": 2.593, "step": 11380 }, { "epoch": 31.754874651810585, "grad_norm": 1.2032961351385616, "learning_rate": 4.557092374254412e-05, "loss": 2.5883, "step": 11400 }, { "epoch": 31.81058495821727, "grad_norm": 1.1746121842250725, "learning_rate": 4.547286399019614e-05, "loss": 2.5669, "step": 11420 }, { "epoch": 31.866295264623954, "grad_norm": 1.1916678733066106, "learning_rate": 4.53747837770066e-05, "loss": 2.5613, "step": 11440 }, { "epoch": 31.922005571030642, "grad_norm": 1.151296833307102, "learning_rate": 4.5276683778015984e-05, "loss": 2.5574, "step": 11460 }, { "epoch": 31.977715877437326, "grad_norm": 1.1387505510412506, "learning_rate": 4.517856466840108e-05, "loss": 2.5778, "step": 11480 }, { "epoch": 32.033426183844014, "grad_norm": 1.1307628256541518, "learning_rate": 4.50804271234701e-05, "loss": 2.58, "step": 11500 }, { "epoch": 32.089136490250695, "grad_norm": 1.143654969969148, "learning_rate": 4.498227181865816e-05, "loss": 2.5342, "step": 11520 }, { "epoch": 32.14484679665738, "grad_norm": 1.0872870621719841, "learning_rate": 4.488409942952261e-05, "loss": 2.5615, "step": 11540 }, { "epoch": 32.200557103064064, "grad_norm": 1.258161916655144, "learning_rate": 4.478591063173842e-05, "loss": 2.5566, "step": 11560 }, { "epoch": 32.25626740947075, "grad_norm": 1.2208070919269458, "learning_rate": 4.468770610109344e-05, "loss": 2.5549, "step": 11580 }, { "epoch": 32.31197771587744, "grad_norm": 1.169221911082448, "learning_rate": 4.458948651348383e-05, "loss": 2.5896, "step": 11600 }, { "epoch": 32.36768802228412, "grad_norm": 1.095014238667817, "learning_rate": 4.4491252544909394e-05, "loss": 2.5633, "step": 11620 }, { "epoch": 32.42339832869081, "grad_norm": 1.2970244190994054, "learning_rate": 4.439300487146887e-05, "loss": 2.5643, "step": 11640 }, { "epoch": 32.4791086350975, "grad_norm": 1.2886541818686938, "learning_rate": 4.429474416935536e-05, "loss": 2.6024, "step": 11660 }, { "epoch": 32.53481894150418, "grad_norm": 1.1502247567681423, "learning_rate": 4.419647111485162e-05, "loss": 2.5393, "step": 11680 }, { "epoch": 32.590529247910865, "grad_norm": 1.2887795563474687, "learning_rate": 4.4098186384325424e-05, "loss": 2.5511, "step": 11700 }, { "epoch": 32.646239554317546, "grad_norm": 1.1497087680966827, "learning_rate": 4.399989065422491e-05, "loss": 2.5538, "step": 11720 }, { "epoch": 32.701949860724234, "grad_norm": 1.2553351424149752, "learning_rate": 4.39015846010739e-05, "loss": 2.5896, "step": 11740 }, { "epoch": 32.75766016713092, "grad_norm": 1.1172142499914282, "learning_rate": 4.380326890146732e-05, "loss": 2.5503, "step": 11760 }, { "epoch": 32.8133704735376, "grad_norm": 1.0957163400827985, "learning_rate": 4.370494423206639e-05, "loss": 2.5527, "step": 11780 }, { "epoch": 32.86908077994429, "grad_norm": 1.1727453881618946, "learning_rate": 4.360661126959418e-05, "loss": 2.5808, "step": 11800 }, { "epoch": 32.92479108635097, "grad_norm": 1.5043031377227738, "learning_rate": 4.3508270690830764e-05, "loss": 2.5809, "step": 11820 }, { "epoch": 32.98050139275766, "grad_norm": 3.061321585302142, "learning_rate": 4.340992317260865e-05, "loss": 2.5672, "step": 11840 }, { "epoch": 33.03621169916435, "grad_norm": 1.2959039372508918, "learning_rate": 4.3311569391808116e-05, "loss": 2.5542, "step": 11860 }, { "epoch": 33.09192200557103, "grad_norm": 1.1013587892712957, "learning_rate": 4.321321002535253e-05, "loss": 2.5175, "step": 11880 }, { "epoch": 33.147632311977716, "grad_norm": 1.0924446035892936, "learning_rate": 4.311484575020373e-05, "loss": 2.538, "step": 11900 }, { "epoch": 33.203342618384404, "grad_norm": 1.4345799681219358, "learning_rate": 4.3016477243357297e-05, "loss": 2.5775, "step": 11920 }, { "epoch": 33.259052924791085, "grad_norm": 1.2806133296858657, "learning_rate": 4.291810518183797e-05, "loss": 2.5358, "step": 11940 }, { "epoch": 33.31476323119777, "grad_norm": 1.1660089750322047, "learning_rate": 4.2819730242694924e-05, "loss": 2.5516, "step": 11960 }, { "epoch": 33.370473537604454, "grad_norm": 1.5552478865676143, "learning_rate": 4.272135310299719e-05, "loss": 2.5551, "step": 11980 }, { "epoch": 33.42618384401114, "grad_norm": 1.3019947250320747, "learning_rate": 4.262297443982888e-05, "loss": 2.5147, "step": 12000 }, { "epoch": 33.48189415041783, "grad_norm": 1.1712469646586066, "learning_rate": 4.252459493028466e-05, "loss": 2.5448, "step": 12020 }, { "epoch": 33.53760445682451, "grad_norm": 1.190656040631796, "learning_rate": 4.2426215251464944e-05, "loss": 2.5421, "step": 12040 }, { "epoch": 33.5933147632312, "grad_norm": 1.166057800751934, "learning_rate": 4.232783608047138e-05, "loss": 2.5225, "step": 12060 }, { "epoch": 33.64902506963789, "grad_norm": 1.1827485566010056, "learning_rate": 4.222945809440208e-05, "loss": 2.5264, "step": 12080 }, { "epoch": 33.70473537604457, "grad_norm": 1.0982238340826698, "learning_rate": 4.213108197034701e-05, "loss": 2.5311, "step": 12100 }, { "epoch": 33.760445682451255, "grad_norm": 1.3570517309524062, "learning_rate": 4.2032708385383325e-05, "loss": 2.5381, "step": 12120 }, { "epoch": 33.816155988857936, "grad_norm": 1.221101343240307, "learning_rate": 4.193433801657072e-05, "loss": 2.5085, "step": 12140 }, { "epoch": 33.871866295264624, "grad_norm": 1.1074955804736837, "learning_rate": 4.183597154094672e-05, "loss": 2.554, "step": 12160 }, { "epoch": 33.92757660167131, "grad_norm": 1.2061033239097223, "learning_rate": 4.173760963552209e-05, "loss": 2.5144, "step": 12180 }, { "epoch": 33.98328690807799, "grad_norm": 1.5819083614189133, "learning_rate": 4.1639252977276076e-05, "loss": 2.5495, "step": 12200 }, { "epoch": 34.03899721448468, "grad_norm": 1.2608802822567775, "learning_rate": 4.1540902243151906e-05, "loss": 2.5386, "step": 12220 }, { "epoch": 34.09470752089136, "grad_norm": 1.1987980606427822, "learning_rate": 4.144255811005199e-05, "loss": 2.5521, "step": 12240 }, { "epoch": 34.15041782729805, "grad_norm": 1.5620621378858097, "learning_rate": 4.134422125483328e-05, "loss": 2.547, "step": 12260 }, { "epoch": 34.20612813370474, "grad_norm": 1.1486672391725685, "learning_rate": 4.124589235430266e-05, "loss": 2.5527, "step": 12280 }, { "epoch": 34.26183844011142, "grad_norm": 1.0658317601206686, "learning_rate": 4.114757208521229e-05, "loss": 2.5188, "step": 12300 }, { "epoch": 34.317548746518106, "grad_norm": 1.1632885977667755, "learning_rate": 4.104926112425487e-05, "loss": 2.5066, "step": 12320 }, { "epoch": 34.373259052924794, "grad_norm": 1.1978332103431018, "learning_rate": 4.095096014805907e-05, "loss": 2.5242, "step": 12340 }, { "epoch": 34.428969359331475, "grad_norm": 1.2329715563026837, "learning_rate": 4.0852669833184864e-05, "loss": 2.5121, "step": 12360 }, { "epoch": 34.48467966573816, "grad_norm": 1.4254336559578342, "learning_rate": 4.075439085611879e-05, "loss": 2.5327, "step": 12380 }, { "epoch": 34.540389972144844, "grad_norm": 1.3455389003917915, "learning_rate": 4.065612389326941e-05, "loss": 2.5282, "step": 12400 }, { "epoch": 34.59610027855153, "grad_norm": 1.1098645710519481, "learning_rate": 4.055786962096253e-05, "loss": 2.5414, "step": 12420 }, { "epoch": 34.65181058495822, "grad_norm": 1.0712441022651848, "learning_rate": 4.04596287154367e-05, "loss": 2.5243, "step": 12440 }, { "epoch": 34.7075208913649, "grad_norm": 1.2229530919327418, "learning_rate": 4.0361401852838415e-05, "loss": 2.5391, "step": 12460 }, { "epoch": 34.76323119777159, "grad_norm": 1.1116414230831893, "learning_rate": 4.026318970921751e-05, "loss": 2.5549, "step": 12480 }, { "epoch": 34.81894150417827, "grad_norm": 1.3725266835406396, "learning_rate": 4.016499296052257e-05, "loss": 2.5375, "step": 12500 }, { "epoch": 34.87465181058496, "grad_norm": 1.1846019558051342, "learning_rate": 4.0066812282596165e-05, "loss": 2.5508, "step": 12520 }, { "epoch": 34.930362116991645, "grad_norm": 1.155996133148295, "learning_rate": 3.9968648351170285e-05, "loss": 2.5284, "step": 12540 }, { "epoch": 34.986072423398326, "grad_norm": 1.3251336278575234, "learning_rate": 3.987050184186168e-05, "loss": 2.5112, "step": 12560 }, { "epoch": 35.041782729805014, "grad_norm": 1.2962788212433607, "learning_rate": 3.9772373430167165e-05, "loss": 2.5334, "step": 12580 }, { "epoch": 35.0974930362117, "grad_norm": 1.1720387548872857, "learning_rate": 3.967426379145899e-05, "loss": 2.5233, "step": 12600 }, { "epoch": 35.15320334261838, "grad_norm": 1.0933571811040883, "learning_rate": 3.957617360098023e-05, "loss": 2.5134, "step": 12620 }, { "epoch": 35.20891364902507, "grad_norm": 1.2440055643446601, "learning_rate": 3.9478103533840095e-05, "loss": 2.5155, "step": 12640 }, { "epoch": 35.26462395543175, "grad_norm": 1.1329596221636413, "learning_rate": 3.938005426500927e-05, "loss": 2.5335, "step": 12660 }, { "epoch": 35.32033426183844, "grad_norm": 1.185464659161853, "learning_rate": 3.928202646931534e-05, "loss": 2.5438, "step": 12680 }, { "epoch": 35.37604456824513, "grad_norm": 1.3785291897615783, "learning_rate": 3.918402082143804e-05, "loss": 2.5442, "step": 12700 }, { "epoch": 35.43175487465181, "grad_norm": 2.546941401301199, "learning_rate": 3.908603799590476e-05, "loss": 2.5113, "step": 12720 }, { "epoch": 35.4874651810585, "grad_norm": 1.1505394021830595, "learning_rate": 3.898807866708572e-05, "loss": 2.5527, "step": 12740 }, { "epoch": 35.543175487465184, "grad_norm": 1.1406223829986795, "learning_rate": 3.889014350918947e-05, "loss": 2.5169, "step": 12760 }, { "epoch": 35.598885793871865, "grad_norm": 1.5268262564835722, "learning_rate": 3.8792233196258226e-05, "loss": 2.5385, "step": 12780 }, { "epoch": 35.65459610027855, "grad_norm": 1.4366052923997545, "learning_rate": 3.869434840216315e-05, "loss": 2.5138, "step": 12800 }, { "epoch": 35.710306406685234, "grad_norm": 1.3493022957735832, "learning_rate": 3.8596489800599826e-05, "loss": 2.5012, "step": 12820 }, { "epoch": 35.76601671309192, "grad_norm": 1.2612297939888255, "learning_rate": 3.849865806508352e-05, "loss": 2.5167, "step": 12840 }, { "epoch": 35.82172701949861, "grad_norm": 1.1655785400988308, "learning_rate": 3.8400853868944604e-05, "loss": 2.5054, "step": 12860 }, { "epoch": 35.87743732590529, "grad_norm": 1.195789035259279, "learning_rate": 3.8303077885323945e-05, "loss": 2.5038, "step": 12880 }, { "epoch": 35.93314763231198, "grad_norm": 1.1195435490995402, "learning_rate": 3.820533078716821e-05, "loss": 2.5628, "step": 12900 }, { "epoch": 35.98885793871866, "grad_norm": 1.7921872957808758, "learning_rate": 3.810761324722523e-05, "loss": 2.5052, "step": 12920 }, { "epoch": 36.04456824512535, "grad_norm": 1.2261058827590274, "learning_rate": 3.800992593803946e-05, "loss": 2.5112, "step": 12940 }, { "epoch": 36.100278551532035, "grad_norm": 1.135621182926646, "learning_rate": 3.791226953194725e-05, "loss": 2.5028, "step": 12960 }, { "epoch": 36.155988857938716, "grad_norm": 1.6277687709086734, "learning_rate": 3.7814644701072246e-05, "loss": 2.5162, "step": 12980 }, { "epoch": 36.211699164345404, "grad_norm": 1.1697686185114848, "learning_rate": 3.771705211732085e-05, "loss": 2.4937, "step": 13000 }, { "epoch": 36.26740947075209, "grad_norm": 1.1470445311609152, "learning_rate": 3.761949245237742e-05, "loss": 2.4959, "step": 13020 }, { "epoch": 36.32311977715877, "grad_norm": 1.1830067451201864, "learning_rate": 3.752196637769983e-05, "loss": 2.5184, "step": 13040 }, { "epoch": 36.37883008356546, "grad_norm": 1.467366443353488, "learning_rate": 3.742447456451474e-05, "loss": 2.5167, "step": 13060 }, { "epoch": 36.43454038997214, "grad_norm": 1.4104527622849412, "learning_rate": 3.732701768381299e-05, "loss": 2.5044, "step": 13080 }, { "epoch": 36.49025069637883, "grad_norm": 1.2155633839198112, "learning_rate": 3.722959640634501e-05, "loss": 2.5472, "step": 13100 }, { "epoch": 36.54596100278552, "grad_norm": 1.083177335128614, "learning_rate": 3.713221140261619e-05, "loss": 2.5002, "step": 13120 }, { "epoch": 36.6016713091922, "grad_norm": 1.2014071998350737, "learning_rate": 3.703486334288228e-05, "loss": 2.5114, "step": 13140 }, { "epoch": 36.65738161559889, "grad_norm": 1.1428988294911389, "learning_rate": 3.693755289714471e-05, "loss": 2.4979, "step": 13160 }, { "epoch": 36.713091922005574, "grad_norm": 1.1304274747808816, "learning_rate": 3.68402807351461e-05, "loss": 2.4936, "step": 13180 }, { "epoch": 36.768802228412255, "grad_norm": 1.3650882764211232, "learning_rate": 3.674304752636551e-05, "loss": 2.5157, "step": 13200 }, { "epoch": 36.82451253481894, "grad_norm": 1.2182169011081385, "learning_rate": 3.664585394001398e-05, "loss": 2.5035, "step": 13220 }, { "epoch": 36.880222841225624, "grad_norm": 1.272624756548235, "learning_rate": 3.654870064502978e-05, "loss": 2.4992, "step": 13240 }, { "epoch": 36.93593314763231, "grad_norm": 1.0715589202820812, "learning_rate": 3.6451588310073895e-05, "loss": 2.5021, "step": 13260 }, { "epoch": 36.991643454039, "grad_norm": 1.2942781983724116, "learning_rate": 3.6354517603525434e-05, "loss": 2.4859, "step": 13280 }, { "epoch": 37.04735376044568, "grad_norm": 1.2606778534888776, "learning_rate": 3.625748919347694e-05, "loss": 2.506, "step": 13300 }, { "epoch": 37.10306406685237, "grad_norm": 1.2777601010216262, "learning_rate": 3.616050374772989e-05, "loss": 2.4778, "step": 13320 }, { "epoch": 37.15877437325905, "grad_norm": 1.176481978013248, "learning_rate": 3.606356193379004e-05, "loss": 2.5033, "step": 13340 }, { "epoch": 37.21448467966574, "grad_norm": 1.2169531494104022, "learning_rate": 3.596666441886285e-05, "loss": 2.4996, "step": 13360 }, { "epoch": 37.270194986072426, "grad_norm": 1.3217947072787108, "learning_rate": 3.586981186984891e-05, "loss": 2.4884, "step": 13380 }, { "epoch": 37.325905292479106, "grad_norm": 1.2091550344263509, "learning_rate": 3.577300495333929e-05, "loss": 2.4643, "step": 13400 }, { "epoch": 37.381615598885794, "grad_norm": 1.3020086095843864, "learning_rate": 3.5676244335611045e-05, "loss": 2.5115, "step": 13420 }, { "epoch": 37.43732590529248, "grad_norm": 1.224631401229228, "learning_rate": 3.5579530682622527e-05, "loss": 2.5052, "step": 13440 }, { "epoch": 37.49303621169916, "grad_norm": 1.2937153649240984, "learning_rate": 3.548286466000888e-05, "loss": 2.4887, "step": 13460 }, { "epoch": 37.54874651810585, "grad_norm": 1.1629156767485442, "learning_rate": 3.5386246933077437e-05, "loss": 2.4835, "step": 13480 }, { "epoch": 37.60445682451253, "grad_norm": 1.409151745433056, "learning_rate": 3.52896781668031e-05, "loss": 2.4844, "step": 13500 }, { "epoch": 37.66016713091922, "grad_norm": 1.219993695453392, "learning_rate": 3.519315902582384e-05, "loss": 2.4891, "step": 13520 }, { "epoch": 37.71587743732591, "grad_norm": 1.286568356632355, "learning_rate": 3.509669017443603e-05, "loss": 2.5028, "step": 13540 }, { "epoch": 37.77158774373259, "grad_norm": 1.2144839115218449, "learning_rate": 3.500027227658998e-05, "loss": 2.4808, "step": 13560 }, { "epoch": 37.82729805013928, "grad_norm": 1.4206777606666374, "learning_rate": 3.490390599588527e-05, "loss": 2.4884, "step": 13580 }, { "epoch": 37.88300835654596, "grad_norm": 1.119769219553682, "learning_rate": 3.480759199556625e-05, "loss": 2.532, "step": 13600 }, { "epoch": 37.938718662952645, "grad_norm": 1.216344088823688, "learning_rate": 3.4711330938517415e-05, "loss": 2.4825, "step": 13620 }, { "epoch": 37.99442896935933, "grad_norm": 1.2356798590645826, "learning_rate": 3.4615123487258904e-05, "loss": 2.477, "step": 13640 }, { "epoch": 38.050139275766014, "grad_norm": 1.1889430421938458, "learning_rate": 3.45189703039419e-05, "loss": 2.4657, "step": 13660 }, { "epoch": 38.1058495821727, "grad_norm": 1.2167871317287668, "learning_rate": 3.442287205034409e-05, "loss": 2.4873, "step": 13680 }, { "epoch": 38.16155988857939, "grad_norm": 1.3417090804209535, "learning_rate": 3.4326829387865105e-05, "loss": 2.4978, "step": 13700 }, { "epoch": 38.21727019498607, "grad_norm": 1.215214848291835, "learning_rate": 3.423084297752197e-05, "loss": 2.4873, "step": 13720 }, { "epoch": 38.27298050139276, "grad_norm": 1.3942080399553185, "learning_rate": 3.413491347994455e-05, "loss": 2.4869, "step": 13740 }, { "epoch": 38.32869080779944, "grad_norm": 1.1596082595198587, "learning_rate": 3.4039041555370985e-05, "loss": 2.4742, "step": 13760 }, { "epoch": 38.38440111420613, "grad_norm": 1.300823510553577, "learning_rate": 3.394322786364321e-05, "loss": 2.4824, "step": 13780 }, { "epoch": 38.440111420612816, "grad_norm": 1.4023204575319392, "learning_rate": 3.384747306420234e-05, "loss": 2.5132, "step": 13800 }, { "epoch": 38.4958217270195, "grad_norm": 1.169311221589696, "learning_rate": 3.375177781608417e-05, "loss": 2.4931, "step": 13820 }, { "epoch": 38.551532033426184, "grad_norm": 1.3513790522732192, "learning_rate": 3.365614277791463e-05, "loss": 2.5037, "step": 13840 }, { "epoch": 38.60724233983287, "grad_norm": 1.127031391091575, "learning_rate": 3.3560568607905244e-05, "loss": 2.5187, "step": 13860 }, { "epoch": 38.66295264623955, "grad_norm": 1.2275938957516415, "learning_rate": 3.346505596384864e-05, "loss": 2.4657, "step": 13880 }, { "epoch": 38.71866295264624, "grad_norm": 1.3320190791797168, "learning_rate": 3.336960550311395e-05, "loss": 2.4951, "step": 13900 }, { "epoch": 38.77437325905292, "grad_norm": 1.5869673766120833, "learning_rate": 3.3274217882642355e-05, "loss": 2.5087, "step": 13920 }, { "epoch": 38.83008356545961, "grad_norm": 1.2695582913500114, "learning_rate": 3.317889375894252e-05, "loss": 2.4826, "step": 13940 }, { "epoch": 38.8857938718663, "grad_norm": 1.430015517022631, "learning_rate": 3.3083633788086115e-05, "loss": 2.4652, "step": 13960 }, { "epoch": 38.94150417827298, "grad_norm": 1.2238102984760655, "learning_rate": 3.2988438625703226e-05, "loss": 2.5151, "step": 13980 }, { "epoch": 38.99721448467967, "grad_norm": 1.1505694948469867, "learning_rate": 3.2893308926977964e-05, "loss": 2.4639, "step": 14000 }, { "epoch": 39.05292479108635, "grad_norm": 1.3358804025647248, "learning_rate": 3.2798245346643826e-05, "loss": 2.4831, "step": 14020 }, { "epoch": 39.108635097493035, "grad_norm": 1.1965450681906031, "learning_rate": 3.270324853897926e-05, "loss": 2.4934, "step": 14040 }, { "epoch": 39.16434540389972, "grad_norm": 1.2724701888326422, "learning_rate": 3.260831915780317e-05, "loss": 2.515, "step": 14060 }, { "epoch": 39.220055710306404, "grad_norm": 1.226431107683309, "learning_rate": 3.251345785647037e-05, "loss": 2.4912, "step": 14080 }, { "epoch": 39.27576601671309, "grad_norm": 1.4978021366863286, "learning_rate": 3.241866528786712e-05, "loss": 2.4666, "step": 14100 }, { "epoch": 39.33147632311978, "grad_norm": 1.1852102410219578, "learning_rate": 3.232394210440664e-05, "loss": 2.453, "step": 14120 }, { "epoch": 39.38718662952646, "grad_norm": 1.1514347696655127, "learning_rate": 3.222928895802457e-05, "loss": 2.492, "step": 14140 }, { "epoch": 39.44289693593315, "grad_norm": 1.1727022967870606, "learning_rate": 3.213470650017457e-05, "loss": 2.4671, "step": 14160 }, { "epoch": 39.49860724233983, "grad_norm": 1.2255693825516534, "learning_rate": 3.204019538182371e-05, "loss": 2.47, "step": 14180 }, { "epoch": 39.55431754874652, "grad_norm": 1.3308024547484585, "learning_rate": 3.194575625344813e-05, "loss": 2.4705, "step": 14200 }, { "epoch": 39.610027855153206, "grad_norm": 1.29922812166292, "learning_rate": 3.185138976502847e-05, "loss": 2.4756, "step": 14220 }, { "epoch": 39.66573816155989, "grad_norm": 1.172050649565444, "learning_rate": 3.175709656604543e-05, "loss": 2.4795, "step": 14240 }, { "epoch": 39.721448467966574, "grad_norm": 1.1860108514337921, "learning_rate": 3.166287730547528e-05, "loss": 2.4682, "step": 14260 }, { "epoch": 39.77715877437326, "grad_norm": 1.1464631864975399, "learning_rate": 3.1568732631785405e-05, "loss": 2.4649, "step": 14280 }, { "epoch": 39.83286908077994, "grad_norm": 1.2912682284287014, "learning_rate": 3.147466319292988e-05, "loss": 2.458, "step": 14300 }, { "epoch": 39.88857938718663, "grad_norm": 1.24473741562901, "learning_rate": 3.138066963634491e-05, "loss": 2.4418, "step": 14320 }, { "epoch": 39.94428969359331, "grad_norm": 1.2188645511392617, "learning_rate": 3.1286752608944504e-05, "loss": 2.4666, "step": 14340 }, { "epoch": 40.0, "grad_norm": 1.515575618356932, "learning_rate": 3.11929127571159e-05, "loss": 2.4695, "step": 14360 }, { "epoch": 40.05571030640669, "grad_norm": 1.267322258199449, "learning_rate": 3.10991507267152e-05, "loss": 2.4631, "step": 14380 }, { "epoch": 40.11142061281337, "grad_norm": 1.2899851180690782, "learning_rate": 3.100546716306292e-05, "loss": 2.4461, "step": 14400 }, { "epoch": 40.16713091922006, "grad_norm": 1.14649768727926, "learning_rate": 3.091186271093947e-05, "loss": 2.4534, "step": 14420 }, { "epoch": 40.22284122562674, "grad_norm": 1.2682086289441403, "learning_rate": 3.081833801458084e-05, "loss": 2.4369, "step": 14440 }, { "epoch": 40.278551532033426, "grad_norm": 1.2091006451986415, "learning_rate": 3.0724893717674023e-05, "loss": 2.4586, "step": 14460 }, { "epoch": 40.33426183844011, "grad_norm": 1.3592584263818426, "learning_rate": 3.063153046335271e-05, "loss": 2.4591, "step": 14480 }, { "epoch": 40.389972144846794, "grad_norm": 1.3946508542140756, "learning_rate": 3.0538248894192804e-05, "loss": 2.4411, "step": 14500 }, { "epoch": 40.44568245125348, "grad_norm": 1.1777413578534581, "learning_rate": 3.0445049652207995e-05, "loss": 2.4261, "step": 14520 }, { "epoch": 40.50139275766017, "grad_norm": 1.2195721573601, "learning_rate": 3.035193337884538e-05, "loss": 2.4421, "step": 14540 }, { "epoch": 40.55710306406685, "grad_norm": 1.3520200371008158, "learning_rate": 3.0258900714981e-05, "loss": 2.4602, "step": 14560 }, { "epoch": 40.61281337047354, "grad_norm": 1.3624148046187146, "learning_rate": 3.016595230091545e-05, "loss": 2.4655, "step": 14580 }, { "epoch": 40.66852367688022, "grad_norm": 1.2699195161380987, "learning_rate": 3.0073088776369473e-05, "loss": 2.4279, "step": 14600 }, { "epoch": 40.72423398328691, "grad_norm": 1.1758469344658353, "learning_rate": 2.998031078047958e-05, "loss": 2.473, "step": 14620 }, { "epoch": 40.779944289693596, "grad_norm": 1.215388378764313, "learning_rate": 2.9887618951793587e-05, "loss": 2.4955, "step": 14640 }, { "epoch": 40.83565459610028, "grad_norm": 1.2477068837194507, "learning_rate": 2.97950139282663e-05, "loss": 2.4784, "step": 14660 }, { "epoch": 40.891364902506965, "grad_norm": 1.3420186707298443, "learning_rate": 2.9702496347255056e-05, "loss": 2.4768, "step": 14680 }, { "epoch": 40.94707520891365, "grad_norm": 1.39249618130456, "learning_rate": 2.9610066845515383e-05, "loss": 2.4385, "step": 14700 }, { "epoch": 41.00278551532033, "grad_norm": 1.2754818915657757, "learning_rate": 2.9517726059196613e-05, "loss": 2.4569, "step": 14720 }, { "epoch": 41.05849582172702, "grad_norm": 1.4277410738801621, "learning_rate": 2.942547462383744e-05, "loss": 2.4587, "step": 14740 }, { "epoch": 41.1142061281337, "grad_norm": 1.1986405858066078, "learning_rate": 2.9333313174361673e-05, "loss": 2.4533, "step": 14760 }, { "epoch": 41.16991643454039, "grad_norm": 1.4106771315477926, "learning_rate": 2.924124234507371e-05, "loss": 2.4564, "step": 14780 }, { "epoch": 41.22562674094708, "grad_norm": 1.1572284880554229, "learning_rate": 2.9149262769654307e-05, "loss": 2.4403, "step": 14800 }, { "epoch": 41.28133704735376, "grad_norm": 1.3398664349194382, "learning_rate": 2.9057375081156153e-05, "loss": 2.4632, "step": 14820 }, { "epoch": 41.33704735376045, "grad_norm": 1.4232783195653564, "learning_rate": 2.89655799119995e-05, "loss": 2.4457, "step": 14840 }, { "epoch": 41.39275766016713, "grad_norm": 1.2303189537876713, "learning_rate": 2.887387789396784e-05, "loss": 2.4454, "step": 14860 }, { "epoch": 41.448467966573816, "grad_norm": 1.4753554867267846, "learning_rate": 2.8782269658203593e-05, "loss": 2.4708, "step": 14880 }, { "epoch": 41.5041782729805, "grad_norm": 2.0554338333623225, "learning_rate": 2.8690755835203644e-05, "loss": 2.4174, "step": 14900 }, { "epoch": 41.559888579387184, "grad_norm": 1.2638050115107629, "learning_rate": 2.8599337054815128e-05, "loss": 2.4576, "step": 14920 }, { "epoch": 41.61559888579387, "grad_norm": 1.6592795850932565, "learning_rate": 2.8508013946231054e-05, "loss": 2.4439, "step": 14940 }, { "epoch": 41.67130919220056, "grad_norm": 1.2018369861968858, "learning_rate": 2.8416787137985912e-05, "loss": 2.4677, "step": 14960 }, { "epoch": 41.72701949860724, "grad_norm": 1.1787125181340552, "learning_rate": 2.832565725795147e-05, "loss": 2.4423, "step": 14980 }, { "epoch": 41.78272980501393, "grad_norm": 1.3144876376584371, "learning_rate": 2.8234624933332324e-05, "loss": 2.4166, "step": 15000 }, { "epoch": 41.83844011142061, "grad_norm": 1.2101663058378904, "learning_rate": 2.8143690790661687e-05, "loss": 2.431, "step": 15020 }, { "epoch": 41.8941504178273, "grad_norm": 1.3306396247714227, "learning_rate": 2.8052855455797008e-05, "loss": 2.423, "step": 15040 }, { "epoch": 41.949860724233986, "grad_norm": 1.1740048371108092, "learning_rate": 2.7962119553915685e-05, "loss": 2.4543, "step": 15060 }, { "epoch": 42.00557103064067, "grad_norm": 1.2505959391308659, "learning_rate": 2.7871483709510788e-05, "loss": 2.4612, "step": 15080 }, { "epoch": 42.061281337047355, "grad_norm": 1.1705839887196592, "learning_rate": 2.7780948546386702e-05, "loss": 2.4248, "step": 15100 }, { "epoch": 42.116991643454035, "grad_norm": 1.5770301620040164, "learning_rate": 2.76905146876549e-05, "loss": 2.4475, "step": 15120 }, { "epoch": 42.17270194986072, "grad_norm": 1.3540734118986908, "learning_rate": 2.760018275572962e-05, "loss": 2.4186, "step": 15140 }, { "epoch": 42.22841225626741, "grad_norm": 1.146718032535289, "learning_rate": 2.750995337232356e-05, "loss": 2.4091, "step": 15160 }, { "epoch": 42.28412256267409, "grad_norm": 1.2196868218322996, "learning_rate": 2.7419827158443667e-05, "loss": 2.4309, "step": 15180 }, { "epoch": 42.33983286908078, "grad_norm": 1.5485243840943164, "learning_rate": 2.7329804734386765e-05, "loss": 2.4602, "step": 15200 }, { "epoch": 42.39554317548747, "grad_norm": 1.2206227305933974, "learning_rate": 2.723988671973541e-05, "loss": 2.4701, "step": 15220 }, { "epoch": 42.45125348189415, "grad_norm": 1.26332460678578, "learning_rate": 2.7150073733353484e-05, "loss": 2.4528, "step": 15240 }, { "epoch": 42.50696378830084, "grad_norm": 1.311901210503493, "learning_rate": 2.706036639338207e-05, "loss": 2.4283, "step": 15260 }, { "epoch": 42.56267409470752, "grad_norm": 1.2690533418017822, "learning_rate": 2.6970765317235096e-05, "loss": 2.4345, "step": 15280 }, { "epoch": 42.618384401114206, "grad_norm": 1.2676520230160475, "learning_rate": 2.6881271121595137e-05, "loss": 2.4048, "step": 15300 }, { "epoch": 42.674094707520894, "grad_norm": 1.4516895593566883, "learning_rate": 2.6791884422409157e-05, "loss": 2.4279, "step": 15320 }, { "epoch": 42.729805013927574, "grad_norm": 1.3872513872471008, "learning_rate": 2.6702605834884283e-05, "loss": 2.4026, "step": 15340 }, { "epoch": 42.78551532033426, "grad_norm": 1.3767239373202538, "learning_rate": 2.6613435973483546e-05, "loss": 2.4219, "step": 15360 }, { "epoch": 42.84122562674095, "grad_norm": 1.206741931800155, "learning_rate": 2.6524375451921694e-05, "loss": 2.426, "step": 15380 }, { "epoch": 42.89693593314763, "grad_norm": 1.314107492262272, "learning_rate": 2.643542488316087e-05, "loss": 2.4027, "step": 15400 }, { "epoch": 42.95264623955432, "grad_norm": 1.2591566602374167, "learning_rate": 2.6346584879406546e-05, "loss": 2.4105, "step": 15420 }, { "epoch": 43.008356545961, "grad_norm": 1.2543202609095945, "learning_rate": 2.6257856052103176e-05, "loss": 2.4174, "step": 15440 }, { "epoch": 43.06406685236769, "grad_norm": 1.3161836673091634, "learning_rate": 2.616923901193006e-05, "loss": 2.4146, "step": 15460 }, { "epoch": 43.119777158774376, "grad_norm": 1.171075292652416, "learning_rate": 2.6080734368797124e-05, "loss": 2.4159, "step": 15480 }, { "epoch": 43.17548746518106, "grad_norm": 1.207718728407823, "learning_rate": 2.599234273184067e-05, "loss": 2.404, "step": 15500 }, { "epoch": 43.231197771587745, "grad_norm": 1.2836648932544974, "learning_rate": 2.5904064709419275e-05, "loss": 2.4147, "step": 15520 }, { "epoch": 43.286908077994426, "grad_norm": 1.2867738641320774, "learning_rate": 2.5815900909109578e-05, "loss": 2.4405, "step": 15540 }, { "epoch": 43.34261838440111, "grad_norm": 1.314400827907675, "learning_rate": 2.572785193770205e-05, "loss": 2.384, "step": 15560 }, { "epoch": 43.3983286908078, "grad_norm": 1.4291537299918844, "learning_rate": 2.5639918401196828e-05, "loss": 2.4408, "step": 15580 }, { "epoch": 43.45403899721448, "grad_norm": 1.5382813225617216, "learning_rate": 2.555210090479959e-05, "loss": 2.4224, "step": 15600 }, { "epoch": 43.50974930362117, "grad_norm": 1.2172238724441946, "learning_rate": 2.5464400052917377e-05, "loss": 2.4273, "step": 15620 }, { "epoch": 43.56545961002786, "grad_norm": 1.3185716102890666, "learning_rate": 2.537681644915439e-05, "loss": 2.4399, "step": 15640 }, { "epoch": 43.62116991643454, "grad_norm": 1.7970207701573762, "learning_rate": 2.528935069630791e-05, "loss": 2.438, "step": 15660 }, { "epoch": 43.67688022284123, "grad_norm": 1.336384852624976, "learning_rate": 2.5202003396364028e-05, "loss": 2.4104, "step": 15680 }, { "epoch": 43.73259052924791, "grad_norm": 1.2492741812810837, "learning_rate": 2.5114775150493652e-05, "loss": 2.4372, "step": 15700 }, { "epoch": 43.788300835654596, "grad_norm": 1.1839597940386342, "learning_rate": 2.5027666559048265e-05, "loss": 2.4374, "step": 15720 }, { "epoch": 43.844011142061284, "grad_norm": 1.5042190873869037, "learning_rate": 2.4940678221555836e-05, "loss": 2.4131, "step": 15740 }, { "epoch": 43.899721448467965, "grad_norm": 1.263884884008274, "learning_rate": 2.485381073671668e-05, "loss": 2.4481, "step": 15760 }, { "epoch": 43.95543175487465, "grad_norm": 1.3105766012420574, "learning_rate": 2.4767064702399307e-05, "loss": 2.4316, "step": 15780 }, { "epoch": 44.01114206128134, "grad_norm": 1.3070696897883654, "learning_rate": 2.4680440715636386e-05, "loss": 2.4113, "step": 15800 }, { "epoch": 44.06685236768802, "grad_norm": 1.3679934235167148, "learning_rate": 2.459393937262057e-05, "loss": 2.462, "step": 15820 }, { "epoch": 44.12256267409471, "grad_norm": 1.2961531328086042, "learning_rate": 2.45075612687004e-05, "loss": 2.3913, "step": 15840 }, { "epoch": 44.17827298050139, "grad_norm": 1.5918138107382298, "learning_rate": 2.4421306998376247e-05, "loss": 2.4062, "step": 15860 }, { "epoch": 44.23398328690808, "grad_norm": 1.5407959855411433, "learning_rate": 2.4335177155296173e-05, "loss": 2.4135, "step": 15880 }, { "epoch": 44.289693593314766, "grad_norm": 1.3864718482505074, "learning_rate": 2.4249172332251867e-05, "loss": 2.435, "step": 15900 }, { "epoch": 44.34540389972145, "grad_norm": 1.5629168020574962, "learning_rate": 2.4163293121174586e-05, "loss": 2.42, "step": 15920 }, { "epoch": 44.401114206128135, "grad_norm": 1.3404420567150592, "learning_rate": 2.4077540113131e-05, "loss": 2.3939, "step": 15940 }, { "epoch": 44.456824512534816, "grad_norm": 1.2610215128317497, "learning_rate": 2.3991913898319236e-05, "loss": 2.3981, "step": 15960 }, { "epoch": 44.5125348189415, "grad_norm": 1.1948628905515135, "learning_rate": 2.390641506606475e-05, "loss": 2.4259, "step": 15980 }, { "epoch": 44.56824512534819, "grad_norm": 1.5442691993168876, "learning_rate": 2.3821044204816285e-05, "loss": 2.4106, "step": 16000 }, { "epoch": 44.62395543175487, "grad_norm": 1.3385181560005985, "learning_rate": 2.3735801902141812e-05, "loss": 2.4231, "step": 16020 }, { "epoch": 44.67966573816156, "grad_norm": 1.5429534444435276, "learning_rate": 2.3650688744724484e-05, "loss": 2.4094, "step": 16040 }, { "epoch": 44.73537604456825, "grad_norm": 1.3480847228783814, "learning_rate": 2.356570531835862e-05, "loss": 2.3925, "step": 16060 }, { "epoch": 44.79108635097493, "grad_norm": 1.3495527852819211, "learning_rate": 2.348085220794566e-05, "loss": 2.4055, "step": 16080 }, { "epoch": 44.84679665738162, "grad_norm": 1.3160942477102502, "learning_rate": 2.3396129997490143e-05, "loss": 2.4, "step": 16100 }, { "epoch": 44.9025069637883, "grad_norm": 1.1577155684233915, "learning_rate": 2.3311539270095685e-05, "loss": 2.4214, "step": 16120 }, { "epoch": 44.958217270194986, "grad_norm": 1.1408523761505251, "learning_rate": 2.3227080607960936e-05, "loss": 2.3958, "step": 16140 }, { "epoch": 45.013927576601674, "grad_norm": 1.2730620543264026, "learning_rate": 2.314275459237564e-05, "loss": 2.3903, "step": 16160 }, { "epoch": 45.069637883008355, "grad_norm": 1.1827924678817745, "learning_rate": 2.3058561803716587e-05, "loss": 2.4268, "step": 16180 }, { "epoch": 45.12534818941504, "grad_norm": 1.3034192039686017, "learning_rate": 2.2974502821443615e-05, "loss": 2.3954, "step": 16200 }, { "epoch": 45.18105849582173, "grad_norm": 1.4633461110410906, "learning_rate": 2.289057822409564e-05, "loss": 2.3956, "step": 16220 }, { "epoch": 45.23676880222841, "grad_norm": 1.4777159861247156, "learning_rate": 2.2806788589286683e-05, "loss": 2.3643, "step": 16240 }, { "epoch": 45.2924791086351, "grad_norm": 1.2317517708690167, "learning_rate": 2.2723134493701863e-05, "loss": 2.3884, "step": 16260 }, { "epoch": 45.34818941504178, "grad_norm": 1.2821403751825975, "learning_rate": 2.2639616513093453e-05, "loss": 2.4146, "step": 16280 }, { "epoch": 45.40389972144847, "grad_norm": 1.445641063656784, "learning_rate": 2.2556235222276924e-05, "loss": 2.4316, "step": 16300 }, { "epoch": 45.459610027855156, "grad_norm": 1.4577958162948974, "learning_rate": 2.2472991195126933e-05, "loss": 2.417, "step": 16320 }, { "epoch": 45.51532033426184, "grad_norm": 1.3311809796824847, "learning_rate": 2.2389885004573452e-05, "loss": 2.4165, "step": 16340 }, { "epoch": 45.571030640668525, "grad_norm": 1.404962587268908, "learning_rate": 2.2306917222597776e-05, "loss": 2.4204, "step": 16360 }, { "epoch": 45.626740947075206, "grad_norm": 1.2925568482512864, "learning_rate": 2.2224088420228597e-05, "loss": 2.3624, "step": 16380 }, { "epoch": 45.682451253481894, "grad_norm": 1.2480563390495507, "learning_rate": 2.21413991675381e-05, "loss": 2.4226, "step": 16400 }, { "epoch": 45.73816155988858, "grad_norm": 1.395538579066591, "learning_rate": 2.2058850033637958e-05, "loss": 2.4103, "step": 16420 }, { "epoch": 45.79387186629526, "grad_norm": 1.2886437393173196, "learning_rate": 2.197644158667552e-05, "loss": 2.4027, "step": 16440 }, { "epoch": 45.84958217270195, "grad_norm": 1.2805552396725532, "learning_rate": 2.1894174393829843e-05, "loss": 2.3974, "step": 16460 }, { "epoch": 45.90529247910864, "grad_norm": 1.3616296005412893, "learning_rate": 2.1812049021307776e-05, "loss": 2.389, "step": 16480 }, { "epoch": 45.96100278551532, "grad_norm": 1.2441651106621028, "learning_rate": 2.1730066034340133e-05, "loss": 2.397, "step": 16500 }, { "epoch": 46.01671309192201, "grad_norm": 1.294168032433046, "learning_rate": 2.1648225997177664e-05, "loss": 2.4032, "step": 16520 }, { "epoch": 46.07242339832869, "grad_norm": 1.3259623680432362, "learning_rate": 2.1566529473087366e-05, "loss": 2.409, "step": 16540 }, { "epoch": 46.128133704735376, "grad_norm": 1.4193156574119963, "learning_rate": 2.1484977024348456e-05, "loss": 2.3973, "step": 16560 }, { "epoch": 46.183844011142064, "grad_norm": 1.423133534891623, "learning_rate": 2.1403569212248545e-05, "loss": 2.4221, "step": 16580 }, { "epoch": 46.239554317548745, "grad_norm": 1.289312718105723, "learning_rate": 2.1322306597079752e-05, "loss": 2.4058, "step": 16600 }, { "epoch": 46.29526462395543, "grad_norm": 1.3742336820519754, "learning_rate": 2.1241189738134925e-05, "loss": 2.3803, "step": 16620 }, { "epoch": 46.35097493036211, "grad_norm": 1.3293445511084065, "learning_rate": 2.116021919370371e-05, "loss": 2.3779, "step": 16640 }, { "epoch": 46.4066852367688, "grad_norm": 1.438873266798883, "learning_rate": 2.1079395521068736e-05, "loss": 2.4125, "step": 16660 }, { "epoch": 46.46239554317549, "grad_norm": 1.5684362505574445, "learning_rate": 2.099871927650181e-05, "loss": 2.4172, "step": 16680 }, { "epoch": 46.51810584958217, "grad_norm": 1.219039146798224, "learning_rate": 2.091819101526001e-05, "loss": 2.3556, "step": 16700 }, { "epoch": 46.57381615598886, "grad_norm": 1.290744476185817, "learning_rate": 2.083781129158196e-05, "loss": 2.3915, "step": 16720 }, { "epoch": 46.629526462395546, "grad_norm": 1.2783924627018073, "learning_rate": 2.075758065868394e-05, "loss": 2.4021, "step": 16740 }, { "epoch": 46.68523676880223, "grad_norm": 1.4723296505015109, "learning_rate": 2.0677499668756148e-05, "loss": 2.4076, "step": 16760 }, { "epoch": 46.740947075208915, "grad_norm": 1.4361000627178464, "learning_rate": 2.0597568872958793e-05, "loss": 2.3704, "step": 16780 }, { "epoch": 46.796657381615596, "grad_norm": 1.3406172199532498, "learning_rate": 2.051778882141842e-05, "loss": 2.4095, "step": 16800 }, { "epoch": 46.852367688022284, "grad_norm": 1.2728857237580478, "learning_rate": 2.0438160063224055e-05, "loss": 2.3951, "step": 16820 }, { "epoch": 46.90807799442897, "grad_norm": 1.307601696226092, "learning_rate": 2.035868314642344e-05, "loss": 2.3762, "step": 16840 }, { "epoch": 46.96378830083565, "grad_norm": 1.341375077755919, "learning_rate": 2.0279358618019277e-05, "loss": 2.3747, "step": 16860 }, { "epoch": 47.01949860724234, "grad_norm": 1.3661705880679602, "learning_rate": 2.0200187023965426e-05, "loss": 2.3762, "step": 16880 }, { "epoch": 47.07520891364903, "grad_norm": 1.498474803717406, "learning_rate": 2.0121168909163192e-05, "loss": 2.3757, "step": 16900 }, { "epoch": 47.13091922005571, "grad_norm": 1.30345057097787, "learning_rate": 2.0042304817457542e-05, "loss": 2.4065, "step": 16920 }, { "epoch": 47.1866295264624, "grad_norm": 1.8505180253101237, "learning_rate": 1.9963595291633392e-05, "loss": 2.3871, "step": 16940 }, { "epoch": 47.24233983286908, "grad_norm": 1.450593355694572, "learning_rate": 1.9885040873411806e-05, "loss": 2.3871, "step": 16960 }, { "epoch": 47.298050139275766, "grad_norm": 1.3343700388758895, "learning_rate": 1.980664210344637e-05, "loss": 2.3649, "step": 16980 }, { "epoch": 47.353760445682454, "grad_norm": 1.355886861721978, "learning_rate": 1.9728399521319373e-05, "loss": 2.4009, "step": 17000 }, { "epoch": 47.409470752089135, "grad_norm": 1.271470013080887, "learning_rate": 1.9650313665538177e-05, "loss": 2.3921, "step": 17020 }, { "epoch": 47.46518105849582, "grad_norm": 1.3641748668938396, "learning_rate": 1.957238507353144e-05, "loss": 2.3785, "step": 17040 }, { "epoch": 47.5208913649025, "grad_norm": 1.3453345779522274, "learning_rate": 1.9494614281645438e-05, "loss": 2.3535, "step": 17060 }, { "epoch": 47.57660167130919, "grad_norm": 1.4507718501211375, "learning_rate": 1.9417001825140412e-05, "loss": 2.3866, "step": 17080 }, { "epoch": 47.63231197771588, "grad_norm": 1.246940923314926, "learning_rate": 1.9339548238186828e-05, "loss": 2.3664, "step": 17100 }, { "epoch": 47.68802228412256, "grad_norm": 1.369776524169899, "learning_rate": 1.9262254053861745e-05, "loss": 2.4068, "step": 17120 }, { "epoch": 47.74373259052925, "grad_norm": 1.4449578327650376, "learning_rate": 1.9185119804145137e-05, "loss": 2.386, "step": 17140 }, { "epoch": 47.799442896935936, "grad_norm": 1.3430555560772082, "learning_rate": 1.9108146019916174e-05, "loss": 2.4116, "step": 17160 }, { "epoch": 47.85515320334262, "grad_norm": 1.3639059020256794, "learning_rate": 1.9031333230949668e-05, "loss": 2.3732, "step": 17180 }, { "epoch": 47.910863509749305, "grad_norm": 1.2408281799864953, "learning_rate": 1.8954681965912332e-05, "loss": 2.3787, "step": 17200 }, { "epoch": 47.966573816155986, "grad_norm": 1.3061826212099938, "learning_rate": 1.8878192752359258e-05, "loss": 2.3728, "step": 17220 }, { "epoch": 48.022284122562674, "grad_norm": 1.538238890457239, "learning_rate": 1.8801866116730123e-05, "loss": 2.3755, "step": 17240 }, { "epoch": 48.07799442896936, "grad_norm": 1.3762558112205403, "learning_rate": 1.872570258434571e-05, "loss": 2.3727, "step": 17260 }, { "epoch": 48.13370473537604, "grad_norm": 1.2476738434905705, "learning_rate": 1.8649702679404223e-05, "loss": 2.3652, "step": 17280 }, { "epoch": 48.18941504178273, "grad_norm": 1.278376392904509, "learning_rate": 1.8573866924977697e-05, "loss": 2.3867, "step": 17300 }, { "epoch": 48.24512534818942, "grad_norm": 1.5831828264230967, "learning_rate": 1.84981958430084e-05, "loss": 2.382, "step": 17320 }, { "epoch": 48.3008356545961, "grad_norm": 1.2626351145463612, "learning_rate": 1.842268995430522e-05, "loss": 2.36, "step": 17340 }, { "epoch": 48.35654596100279, "grad_norm": 1.3775126997716187, "learning_rate": 1.834734977854011e-05, "loss": 2.3553, "step": 17360 }, { "epoch": 48.41225626740947, "grad_norm": 1.2704692987761135, "learning_rate": 1.8272175834244497e-05, "loss": 2.3722, "step": 17380 }, { "epoch": 48.467966573816156, "grad_norm": 1.2840542491455302, "learning_rate": 1.8197168638805704e-05, "loss": 2.3766, "step": 17400 }, { "epoch": 48.523676880222844, "grad_norm": 1.2405421296938253, "learning_rate": 1.812232870846343e-05, "loss": 2.3796, "step": 17420 }, { "epoch": 48.579387186629525, "grad_norm": 1.5352436430288825, "learning_rate": 1.8047656558306114e-05, "loss": 2.3297, "step": 17440 }, { "epoch": 48.63509749303621, "grad_norm": 1.3152930932933073, "learning_rate": 1.797315270226748e-05, "loss": 2.3763, "step": 17460 }, { "epoch": 48.690807799442894, "grad_norm": 1.162288641400069, "learning_rate": 1.789881765312296e-05, "loss": 2.378, "step": 17480 }, { "epoch": 48.74651810584958, "grad_norm": 1.5801279533275956, "learning_rate": 1.7824651922486156e-05, "loss": 2.3697, "step": 17500 }, { "epoch": 48.80222841225627, "grad_norm": 1.471173585562619, "learning_rate": 1.7750656020805324e-05, "loss": 2.3874, "step": 17520 }, { "epoch": 48.85793871866295, "grad_norm": 1.4109592484040796, "learning_rate": 1.767683045735989e-05, "loss": 2.3444, "step": 17540 }, { "epoch": 48.91364902506964, "grad_norm": 1.3726614971919417, "learning_rate": 1.7603175740256895e-05, "loss": 2.3635, "step": 17560 }, { "epoch": 48.969359331476326, "grad_norm": 1.4758519191463757, "learning_rate": 1.752969237642755e-05, "loss": 2.3672, "step": 17580 }, { "epoch": 49.02506963788301, "grad_norm": 1.4221040255565391, "learning_rate": 1.745638087162368e-05, "loss": 2.3578, "step": 17600 }, { "epoch": 49.080779944289695, "grad_norm": 1.3119388898351543, "learning_rate": 1.7383241730414324e-05, "loss": 2.3895, "step": 17620 }, { "epoch": 49.136490250696376, "grad_norm": 1.3268973264123183, "learning_rate": 1.7310275456182212e-05, "loss": 2.3383, "step": 17640 }, { "epoch": 49.192200557103064, "grad_norm": 1.3097397298281048, "learning_rate": 1.72374825511203e-05, "loss": 2.3623, "step": 17660 }, { "epoch": 49.24791086350975, "grad_norm": 1.1921177397839031, "learning_rate": 1.716486351622835e-05, "loss": 2.351, "step": 17680 }, { "epoch": 49.30362116991643, "grad_norm": 1.3660254674411707, "learning_rate": 1.709241885130941e-05, "loss": 2.3643, "step": 17700 }, { "epoch": 49.35933147632312, "grad_norm": 1.6307520440880778, "learning_rate": 1.7020149054966462e-05, "loss": 2.3624, "step": 17720 }, { "epoch": 49.41504178272981, "grad_norm": 1.4329708911636774, "learning_rate": 1.694805462459894e-05, "loss": 2.3278, "step": 17740 }, { "epoch": 49.47075208913649, "grad_norm": 1.516855939329352, "learning_rate": 1.6876136056399307e-05, "loss": 2.3734, "step": 17760 }, { "epoch": 49.52646239554318, "grad_norm": 1.2457026011328314, "learning_rate": 1.6804393845349665e-05, "loss": 2.3626, "step": 17780 }, { "epoch": 49.58217270194986, "grad_norm": 1.7297834390596316, "learning_rate": 1.6732828485218297e-05, "loss": 2.3713, "step": 17800 }, { "epoch": 49.637883008356546, "grad_norm": 1.429662136545568, "learning_rate": 1.6661440468556335e-05, "loss": 2.3455, "step": 17820 }, { "epoch": 49.693593314763234, "grad_norm": 1.26740617213478, "learning_rate": 1.6590230286694328e-05, "loss": 2.3659, "step": 17840 }, { "epoch": 49.749303621169915, "grad_norm": 1.681389697921109, "learning_rate": 1.651919842973888e-05, "loss": 2.3445, "step": 17860 }, { "epoch": 49.8050139275766, "grad_norm": 1.441881294284973, "learning_rate": 1.6448345386569248e-05, "loss": 2.3834, "step": 17880 }, { "epoch": 49.860724233983284, "grad_norm": 1.341854199720449, "learning_rate": 1.637767164483401e-05, "loss": 2.3699, "step": 17900 }, { "epoch": 49.91643454038997, "grad_norm": 1.252618865244937, "learning_rate": 1.6307177690947698e-05, "loss": 2.3635, "step": 17920 }, { "epoch": 49.97214484679666, "grad_norm": 1.226307845622635, "learning_rate": 1.6236864010087446e-05, "loss": 2.3485, "step": 17940 }, { "epoch": 50.02785515320334, "grad_norm": 1.3356553843028374, "learning_rate": 1.616673108618965e-05, "loss": 2.3578, "step": 17960 }, { "epoch": 50.08356545961003, "grad_norm": 1.3848996607905597, "learning_rate": 1.6096779401946624e-05, "loss": 2.3504, "step": 17980 }, { "epoch": 50.139275766016716, "grad_norm": 1.498723321509667, "learning_rate": 1.6027009438803323e-05, "loss": 2.3496, "step": 18000 }, { "epoch": 50.1949860724234, "grad_norm": 1.5191100216493636, "learning_rate": 1.595742167695398e-05, "loss": 2.3461, "step": 18020 }, { "epoch": 50.250696378830085, "grad_norm": 1.2559077943231471, "learning_rate": 1.5888016595338836e-05, "loss": 2.371, "step": 18040 }, { "epoch": 50.306406685236766, "grad_norm": 1.3899128057224512, "learning_rate": 1.5818794671640822e-05, "loss": 2.349, "step": 18060 }, { "epoch": 50.362116991643454, "grad_norm": 1.354712886418284, "learning_rate": 1.574975638228226e-05, "loss": 2.3709, "step": 18080 }, { "epoch": 50.41782729805014, "grad_norm": 1.3042322626317018, "learning_rate": 1.5680902202421623e-05, "loss": 2.3456, "step": 18100 }, { "epoch": 50.47353760445682, "grad_norm": 1.3508893425795137, "learning_rate": 1.5612232605950247e-05, "loss": 2.3353, "step": 18120 }, { "epoch": 50.52924791086351, "grad_norm": 1.4691874741171684, "learning_rate": 1.554374806548906e-05, "loss": 2.3336, "step": 18140 }, { "epoch": 50.5849582172702, "grad_norm": 3.0455576787450673, "learning_rate": 1.5475449052385337e-05, "loss": 2.3218, "step": 18160 }, { "epoch": 50.64066852367688, "grad_norm": 1.4402674578269268, "learning_rate": 1.540733603670942e-05, "loss": 2.3372, "step": 18180 }, { "epoch": 50.69637883008357, "grad_norm": 1.4795688476423643, "learning_rate": 1.5339409487251585e-05, "loss": 2.341, "step": 18200 }, { "epoch": 50.75208913649025, "grad_norm": 1.4070357885010039, "learning_rate": 1.5271669871518705e-05, "loss": 2.3241, "step": 18220 }, { "epoch": 50.807799442896936, "grad_norm": 1.3031362800514608, "learning_rate": 1.5204117655731085e-05, "loss": 2.3621, "step": 18240 }, { "epoch": 50.863509749303624, "grad_norm": 1.434766758940882, "learning_rate": 1.5136753304819218e-05, "loss": 2.3302, "step": 18260 }, { "epoch": 50.919220055710305, "grad_norm": 1.274818587770635, "learning_rate": 1.5069577282420647e-05, "loss": 2.3465, "step": 18280 }, { "epoch": 50.97493036211699, "grad_norm": 1.2730001227993584, "learning_rate": 1.500259005087672e-05, "loss": 2.3294, "step": 18300 }, { "epoch": 51.030640668523674, "grad_norm": 1.3369943012249967, "learning_rate": 1.493579207122943e-05, "loss": 2.3577, "step": 18320 }, { "epoch": 51.08635097493036, "grad_norm": 1.255851793002455, "learning_rate": 1.4869183803218242e-05, "loss": 2.3442, "step": 18340 }, { "epoch": 51.14206128133705, "grad_norm": 1.465379611467944, "learning_rate": 1.4802765705276894e-05, "loss": 2.3361, "step": 18360 }, { "epoch": 51.19777158774373, "grad_norm": 1.598753189094513, "learning_rate": 1.4736538234530309e-05, "loss": 2.3488, "step": 18380 }, { "epoch": 51.25348189415042, "grad_norm": 1.3758579974135776, "learning_rate": 1.4670501846791401e-05, "loss": 2.341, "step": 18400 }, { "epoch": 51.309192200557106, "grad_norm": 1.3023167947311844, "learning_rate": 1.4604656996557936e-05, "loss": 2.3496, "step": 18420 }, { "epoch": 51.36490250696379, "grad_norm": 1.2931564385198864, "learning_rate": 1.4539004137009436e-05, "loss": 2.3394, "step": 18440 }, { "epoch": 51.420612813370475, "grad_norm": 1.6620919275534023, "learning_rate": 1.4473543720004015e-05, "loss": 2.3285, "step": 18460 }, { "epoch": 51.476323119777156, "grad_norm": 1.3917496152973825, "learning_rate": 1.4408276196075313e-05, "loss": 2.3486, "step": 18480 }, { "epoch": 51.532033426183844, "grad_norm": 1.3562772472225384, "learning_rate": 1.4343202014429376e-05, "loss": 2.3323, "step": 18500 }, { "epoch": 51.58774373259053, "grad_norm": 1.2319819791798092, "learning_rate": 1.4278321622941556e-05, "loss": 2.3439, "step": 18520 }, { "epoch": 51.64345403899721, "grad_norm": 1.5358845964171328, "learning_rate": 1.4213635468153446e-05, "loss": 2.3307, "step": 18540 }, { "epoch": 51.6991643454039, "grad_norm": 1.292648527837981, "learning_rate": 1.4149143995269799e-05, "loss": 2.3303, "step": 18560 }, { "epoch": 51.75487465181058, "grad_norm": 1.9631233257274625, "learning_rate": 1.4084847648155449e-05, "loss": 2.3382, "step": 18580 }, { "epoch": 51.81058495821727, "grad_norm": 1.6354418847695984, "learning_rate": 1.4020746869332296e-05, "loss": 2.3761, "step": 18600 }, { "epoch": 51.86629526462396, "grad_norm": 1.4408996452028136, "learning_rate": 1.3956842099976191e-05, "loss": 2.3899, "step": 18620 }, { "epoch": 51.92200557103064, "grad_norm": 1.3154420223017438, "learning_rate": 1.3893133779913992e-05, "loss": 2.3267, "step": 18640 }, { "epoch": 51.977715877437326, "grad_norm": 1.2664206876617758, "learning_rate": 1.382962234762045e-05, "loss": 2.3145, "step": 18660 }, { "epoch": 52.033426183844014, "grad_norm": 1.461002841812497, "learning_rate": 1.3766308240215257e-05, "loss": 2.337, "step": 18680 }, { "epoch": 52.089136490250695, "grad_norm": 1.3350308045413666, "learning_rate": 1.3703191893460002e-05, "loss": 2.3553, "step": 18700 }, { "epoch": 52.14484679665738, "grad_norm": 1.4095843649708175, "learning_rate": 1.364027374175515e-05, "loss": 2.3408, "step": 18720 }, { "epoch": 52.200557103064064, "grad_norm": 1.8553789055534144, "learning_rate": 1.357755421813712e-05, "loss": 2.3513, "step": 18740 }, { "epoch": 52.25626740947075, "grad_norm": 1.517906600566457, "learning_rate": 1.3515033754275249e-05, "loss": 2.3512, "step": 18760 }, { "epoch": 52.31197771587744, "grad_norm": 1.3004637489061956, "learning_rate": 1.3452712780468846e-05, "loss": 2.3344, "step": 18780 }, { "epoch": 52.36768802228412, "grad_norm": 1.6081005585159005, "learning_rate": 1.3390591725644231e-05, "loss": 2.3714, "step": 18800 }, { "epoch": 52.42339832869081, "grad_norm": 1.3820776705600462, "learning_rate": 1.3328671017351728e-05, "loss": 2.3472, "step": 18820 }, { "epoch": 52.4791086350975, "grad_norm": 1.741399862442912, "learning_rate": 1.3266951081762823e-05, "loss": 2.3318, "step": 18840 }, { "epoch": 52.53481894150418, "grad_norm": 1.6610982616432777, "learning_rate": 1.320543234366714e-05, "loss": 2.3564, "step": 18860 }, { "epoch": 52.590529247910865, "grad_norm": 1.534678472008335, "learning_rate": 1.3144115226469601e-05, "loss": 2.3453, "step": 18880 }, { "epoch": 52.646239554317546, "grad_norm": 1.6155419457685751, "learning_rate": 1.3083000152187406e-05, "loss": 2.3193, "step": 18900 }, { "epoch": 52.701949860724234, "grad_norm": 1.3933017087549446, "learning_rate": 1.3022087541447226e-05, "loss": 2.3263, "step": 18920 }, { "epoch": 52.75766016713092, "grad_norm": 1.4752643448356435, "learning_rate": 1.2961377813482258e-05, "loss": 2.3198, "step": 18940 }, { "epoch": 52.8133704735376, "grad_norm": 1.4046488544858395, "learning_rate": 1.2900871386129355e-05, "loss": 2.3076, "step": 18960 }, { "epoch": 52.86908077994429, "grad_norm": 1.3496360542086223, "learning_rate": 1.2840568675826145e-05, "loss": 2.3298, "step": 18980 }, { "epoch": 52.92479108635097, "grad_norm": 1.441744999480469, "learning_rate": 1.2780470097608155e-05, "loss": 2.3579, "step": 19000 }, { "epoch": 52.98050139275766, "grad_norm": 1.441039976490415, "learning_rate": 1.272057606510598e-05, "loss": 2.3408, "step": 19020 }, { "epoch": 53.03621169916435, "grad_norm": 1.394791214620375, "learning_rate": 1.2660886990542415e-05, "loss": 2.3151, "step": 19040 }, { "epoch": 53.09192200557103, "grad_norm": 1.315868085708281, "learning_rate": 1.2601403284729635e-05, "loss": 2.3304, "step": 19060 }, { "epoch": 53.147632311977716, "grad_norm": 1.3975934028921921, "learning_rate": 1.2542125357066354e-05, "loss": 2.3314, "step": 19080 }, { "epoch": 53.203342618384404, "grad_norm": 1.3231737653504019, "learning_rate": 1.2483053615534986e-05, "loss": 2.3419, "step": 19100 }, { "epoch": 53.259052924791085, "grad_norm": 1.4696480235764173, "learning_rate": 1.2424188466698894e-05, "loss": 2.327, "step": 19120 }, { "epoch": 53.31476323119777, "grad_norm": 1.369796074017674, "learning_rate": 1.2365530315699543e-05, "loss": 2.2887, "step": 19140 }, { "epoch": 53.370473537604454, "grad_norm": 1.318991041965293, "learning_rate": 1.2307079566253733e-05, "loss": 2.3359, "step": 19160 }, { "epoch": 53.42618384401114, "grad_norm": 1.3653752917448936, "learning_rate": 1.2248836620650818e-05, "loss": 2.3091, "step": 19180 }, { "epoch": 53.48189415041783, "grad_norm": 1.3620553290863062, "learning_rate": 1.219080187974993e-05, "loss": 2.333, "step": 19200 }, { "epoch": 53.53760445682451, "grad_norm": 1.521530969044834, "learning_rate": 1.2132975742977222e-05, "loss": 2.3477, "step": 19220 }, { "epoch": 53.5933147632312, "grad_norm": 1.3604837849834415, "learning_rate": 1.2075358608323133e-05, "loss": 2.3018, "step": 19240 }, { "epoch": 53.64902506963789, "grad_norm": 1.3342516336240242, "learning_rate": 1.2017950872339636e-05, "loss": 2.3189, "step": 19260 }, { "epoch": 53.70473537604457, "grad_norm": 1.388890930917694, "learning_rate": 1.1960752930137489e-05, "loss": 2.3289, "step": 19280 }, { "epoch": 53.760445682451255, "grad_norm": 1.3001131964873058, "learning_rate": 1.1903765175383552e-05, "loss": 2.2918, "step": 19300 }, { "epoch": 53.816155988857936, "grad_norm": 1.3807429463362486, "learning_rate": 1.1846988000298073e-05, "loss": 2.2947, "step": 19320 }, { "epoch": 53.871866295264624, "grad_norm": 1.3500489069634025, "learning_rate": 1.1790421795651973e-05, "loss": 2.3193, "step": 19340 }, { "epoch": 53.92757660167131, "grad_norm": 1.440888239581446, "learning_rate": 1.1734066950764138e-05, "loss": 2.3342, "step": 19360 }, { "epoch": 53.98328690807799, "grad_norm": 1.5457618066269658, "learning_rate": 1.1677923853498792e-05, "loss": 2.2843, "step": 19380 }, { "epoch": 54.03899721448468, "grad_norm": 1.4026171429295824, "learning_rate": 1.162199289026279e-05, "loss": 2.2993, "step": 19400 }, { "epoch": 54.09470752089136, "grad_norm": 1.5830085773681513, "learning_rate": 1.156627444600296e-05, "loss": 2.3154, "step": 19420 }, { "epoch": 54.15041782729805, "grad_norm": 1.4881826390608948, "learning_rate": 1.151076890420348e-05, "loss": 2.3147, "step": 19440 }, { "epoch": 54.20612813370474, "grad_norm": 1.3551091744705666, "learning_rate": 1.1455476646883177e-05, "loss": 2.3427, "step": 19460 }, { "epoch": 54.26183844011142, "grad_norm": 1.4419537324222909, "learning_rate": 1.1400398054592988e-05, "loss": 2.3253, "step": 19480 }, { "epoch": 54.317548746518106, "grad_norm": 1.3708788026201257, "learning_rate": 1.1345533506413266e-05, "loss": 2.2869, "step": 19500 }, { "epoch": 54.373259052924794, "grad_norm": 1.474440189127514, "learning_rate": 1.1290883379951205e-05, "loss": 2.3095, "step": 19520 }, { "epoch": 54.428969359331475, "grad_norm": 1.3454680397423404, "learning_rate": 1.1236448051338234e-05, "loss": 2.2888, "step": 19540 }, { "epoch": 54.48467966573816, "grad_norm": 1.429931802787514, "learning_rate": 1.1182227895227435e-05, "loss": 2.3356, "step": 19560 }, { "epoch": 54.540389972144844, "grad_norm": 1.5782912526289399, "learning_rate": 1.112822328479094e-05, "loss": 2.3116, "step": 19580 }, { "epoch": 54.59610027855153, "grad_norm": 1.4640262618715514, "learning_rate": 1.1074434591717396e-05, "loss": 2.3333, "step": 19600 }, { "epoch": 54.65181058495822, "grad_norm": 1.7289128171607941, "learning_rate": 1.102086218620939e-05, "loss": 2.2807, "step": 19620 }, { "epoch": 54.7075208913649, "grad_norm": 1.4358145943314486, "learning_rate": 1.0967506436980888e-05, "loss": 2.3362, "step": 19640 }, { "epoch": 54.76323119777159, "grad_norm": 1.2782981684370716, "learning_rate": 1.0914367711254726e-05, "loss": 2.3087, "step": 19660 }, { "epoch": 54.81894150417827, "grad_norm": 1.4574671055158, "learning_rate": 1.0861446374760058e-05, "loss": 2.329, "step": 19680 }, { "epoch": 54.87465181058496, "grad_norm": 1.6398070121291626, "learning_rate": 1.0808742791729863e-05, "loss": 2.3005, "step": 19700 }, { "epoch": 54.930362116991645, "grad_norm": 1.3682990675605438, "learning_rate": 1.075625732489842e-05, "loss": 2.3105, "step": 19720 }, { "epoch": 54.986072423398326, "grad_norm": 1.4113101451622823, "learning_rate": 1.0703990335498795e-05, "loss": 2.3004, "step": 19740 }, { "epoch": 55.041782729805014, "grad_norm": 1.3747716130043024, "learning_rate": 1.0651942183260405e-05, "loss": 2.3123, "step": 19760 }, { "epoch": 55.0974930362117, "grad_norm": 1.5773531144976136, "learning_rate": 1.0600113226406483e-05, "loss": 2.31, "step": 19780 }, { "epoch": 55.15320334261838, "grad_norm": 1.636015923365525, "learning_rate": 1.0548503821651675e-05, "loss": 2.2963, "step": 19800 }, { "epoch": 55.20891364902507, "grad_norm": 1.9126127423976698, "learning_rate": 1.0497114324199536e-05, "loss": 2.3125, "step": 19820 }, { "epoch": 55.26462395543175, "grad_norm": 1.3810199319505396, "learning_rate": 1.0445945087740083e-05, "loss": 2.2836, "step": 19840 }, { "epoch": 55.32033426183844, "grad_norm": 1.655152735076459, "learning_rate": 1.0394996464447398e-05, "loss": 2.3183, "step": 19860 }, { "epoch": 55.37604456824513, "grad_norm": 1.7280728341318472, "learning_rate": 1.0344268804977195e-05, "loss": 2.3056, "step": 19880 }, { "epoch": 55.43175487465181, "grad_norm": 1.3354142969390423, "learning_rate": 1.029376245846439e-05, "loss": 2.2894, "step": 19900 }, { "epoch": 55.4874651810585, "grad_norm": 1.6222487674242974, "learning_rate": 1.024347777252068e-05, "loss": 2.3073, "step": 19920 }, { "epoch": 55.543175487465184, "grad_norm": 1.454409209087223, "learning_rate": 1.0193415093232206e-05, "loss": 2.3023, "step": 19940 }, { "epoch": 55.598885793871865, "grad_norm": 1.5315652454207556, "learning_rate": 1.0143574765157128e-05, "loss": 2.3427, "step": 19960 }, { "epoch": 55.65459610027855, "grad_norm": 1.441290570573882, "learning_rate": 1.0093957131323262e-05, "loss": 2.3211, "step": 19980 }, { "epoch": 55.710306406685234, "grad_norm": 1.6552345609147763, "learning_rate": 1.004456253322574e-05, "loss": 2.3032, "step": 20000 }, { "epoch": 55.76601671309192, "grad_norm": 1.4344844434843587, "learning_rate": 9.995391310824615e-06, "loss": 2.32, "step": 20020 }, { "epoch": 55.82172701949861, "grad_norm": 1.5149262550896996, "learning_rate": 9.946443802542573e-06, "loss": 2.3054, "step": 20040 }, { "epoch": 55.87743732590529, "grad_norm": 1.4983617276276844, "learning_rate": 9.89772034526257e-06, "loss": 2.2887, "step": 20060 }, { "epoch": 55.93314763231198, "grad_norm": 1.3214738473117364, "learning_rate": 9.849221274325526e-06, "loss": 2.3222, "step": 20080 }, { "epoch": 55.98885793871866, "grad_norm": 1.477089337511352, "learning_rate": 9.800946923528015e-06, "loss": 2.2982, "step": 20100 }, { "epoch": 56.04456824512535, "grad_norm": 1.4526733288769058, "learning_rate": 9.752897625119957e-06, "loss": 2.2978, "step": 20120 }, { "epoch": 56.100278551532035, "grad_norm": 1.4020841003024251, "learning_rate": 9.705073709802343e-06, "loss": 2.2945, "step": 20140 }, { "epoch": 56.155988857938716, "grad_norm": 1.5600664300784186, "learning_rate": 9.657475506724974e-06, "loss": 2.2782, "step": 20160 }, { "epoch": 56.211699164345404, "grad_norm": 1.8810092843791293, "learning_rate": 9.610103343484164e-06, "loss": 2.3072, "step": 20180 }, { "epoch": 56.26740947075209, "grad_norm": 1.5355656388936216, "learning_rate": 9.562957546120497e-06, "loss": 2.2978, "step": 20200 }, { "epoch": 56.32311977715877, "grad_norm": 1.49909865084026, "learning_rate": 9.51603843911659e-06, "loss": 2.3092, "step": 20220 }, { "epoch": 56.37883008356546, "grad_norm": 1.5161221850342854, "learning_rate": 9.469346345394869e-06, "loss": 2.2818, "step": 20240 }, { "epoch": 56.43454038997214, "grad_norm": 1.7615731834241355, "learning_rate": 9.422881586315314e-06, "loss": 2.3084, "step": 20260 }, { "epoch": 56.49025069637883, "grad_norm": 1.329887631910666, "learning_rate": 9.376644481673266e-06, "loss": 2.3056, "step": 20280 }, { "epoch": 56.54596100278552, "grad_norm": 1.4720910620951293, "learning_rate": 9.33063534969724e-06, "loss": 2.3108, "step": 20300 }, { "epoch": 56.6016713091922, "grad_norm": 1.3281512249998089, "learning_rate": 9.284854507046706e-06, "loss": 2.2901, "step": 20320 }, { "epoch": 56.65738161559889, "grad_norm": 1.3508495829729492, "learning_rate": 9.239302268809946e-06, "loss": 2.3169, "step": 20340 }, { "epoch": 56.713091922005574, "grad_norm": 1.4459681060448604, "learning_rate": 9.19397894850185e-06, "loss": 2.2935, "step": 20360 }, { "epoch": 56.768802228412255, "grad_norm": 1.435466464580322, "learning_rate": 9.148884858061761e-06, "loss": 2.297, "step": 20380 }, { "epoch": 56.82451253481894, "grad_norm": 1.4747023153570098, "learning_rate": 9.10402030785136e-06, "loss": 2.2758, "step": 20400 }, { "epoch": 56.880222841225624, "grad_norm": 1.732004184834518, "learning_rate": 9.059385606652494e-06, "loss": 2.2663, "step": 20420 }, { "epoch": 56.93593314763231, "grad_norm": 1.627581542112412, "learning_rate": 9.014981061665082e-06, "loss": 2.3057, "step": 20440 }, { "epoch": 56.991643454039, "grad_norm": 1.557984274560907, "learning_rate": 8.970806978504978e-06, "loss": 2.3203, "step": 20460 }, { "epoch": 57.04735376044568, "grad_norm": 1.3370492439725272, "learning_rate": 8.926863661201858e-06, "loss": 2.2901, "step": 20480 }, { "epoch": 57.10306406685237, "grad_norm": 1.5834661112813444, "learning_rate": 8.883151412197163e-06, "loss": 2.3148, "step": 20500 }, { "epoch": 57.15877437325905, "grad_norm": 1.444140494560892, "learning_rate": 8.839670532341993e-06, "loss": 2.2811, "step": 20520 }, { "epoch": 57.21448467966574, "grad_norm": 1.2924659150251059, "learning_rate": 8.796421320895056e-06, "loss": 2.2812, "step": 20540 }, { "epoch": 57.270194986072426, "grad_norm": 1.278167875022471, "learning_rate": 8.753404075520562e-06, "loss": 2.2695, "step": 20560 }, { "epoch": 57.325905292479106, "grad_norm": 1.489794984401024, "learning_rate": 8.710619092286228e-06, "loss": 2.2812, "step": 20580 }, { "epoch": 57.381615598885794, "grad_norm": 1.4707110829209729, "learning_rate": 8.668066665661217e-06, "loss": 2.2903, "step": 20600 }, { "epoch": 57.43732590529248, "grad_norm": 1.4687558384788093, "learning_rate": 8.625747088514107e-06, "loss": 2.306, "step": 20620 }, { "epoch": 57.49303621169916, "grad_norm": 1.4060321766361261, "learning_rate": 8.583660652110897e-06, "loss": 2.3054, "step": 20640 }, { "epoch": 57.54874651810585, "grad_norm": 1.416314142016587, "learning_rate": 8.541807646112959e-06, "loss": 2.2776, "step": 20660 }, { "epoch": 57.60445682451253, "grad_norm": 1.528612849317557, "learning_rate": 8.50018835857509e-06, "loss": 2.2615, "step": 20680 }, { "epoch": 57.66016713091922, "grad_norm": 1.5253351674209896, "learning_rate": 8.45880307594351e-06, "loss": 2.2641, "step": 20700 }, { "epoch": 57.71587743732591, "grad_norm": 1.4358983735975828, "learning_rate": 8.417652083053896e-06, "loss": 2.2722, "step": 20720 }, { "epoch": 57.77158774373259, "grad_norm": 1.7734798200078705, "learning_rate": 8.376735663129412e-06, "loss": 2.3084, "step": 20740 }, { "epoch": 57.82729805013928, "grad_norm": 1.5869547851700487, "learning_rate": 8.336054097778755e-06, "loss": 2.2899, "step": 20760 }, { "epoch": 57.88300835654596, "grad_norm": 1.4546940753793316, "learning_rate": 8.295607666994244e-06, "loss": 2.3095, "step": 20780 }, { "epoch": 57.938718662952645, "grad_norm": 2.102508648107544, "learning_rate": 8.255396649149872e-06, "loss": 2.2591, "step": 20800 }, { "epoch": 57.99442896935933, "grad_norm": 1.7539480811989963, "learning_rate": 8.215421320999385e-06, "loss": 2.2713, "step": 20820 }, { "epoch": 58.050139275766014, "grad_norm": 2.230779439808303, "learning_rate": 8.175681957674403e-06, "loss": 2.3016, "step": 20840 }, { "epoch": 58.1058495821727, "grad_norm": 1.382319191320228, "learning_rate": 8.136178832682491e-06, "loss": 2.3041, "step": 20860 }, { "epoch": 58.16155988857939, "grad_norm": 1.5265747955874778, "learning_rate": 8.096912217905309e-06, "loss": 2.2702, "step": 20880 }, { "epoch": 58.21727019498607, "grad_norm": 1.4460542045577416, "learning_rate": 8.057882383596717e-06, "loss": 2.3015, "step": 20900 }, { "epoch": 58.27298050139276, "grad_norm": 1.4566734252474305, "learning_rate": 8.019089598380943e-06, "loss": 2.2889, "step": 20920 }, { "epoch": 58.32869080779944, "grad_norm": 1.2969700785128098, "learning_rate": 7.98053412925069e-06, "loss": 2.3123, "step": 20940 }, { "epoch": 58.38440111420613, "grad_norm": 1.4381747219027274, "learning_rate": 7.942216241565335e-06, "loss": 2.2903, "step": 20960 }, { "epoch": 58.440111420612816, "grad_norm": 1.41845772591463, "learning_rate": 7.904136199049108e-06, "loss": 2.2915, "step": 20980 }, { "epoch": 58.4958217270195, "grad_norm": 1.7118356239586723, "learning_rate": 7.866294263789243e-06, "loss": 2.272, "step": 21000 }, { "epoch": 58.551532033426184, "grad_norm": 1.6802721584331735, "learning_rate": 7.828690696234207e-06, "loss": 2.2831, "step": 21020 }, { "epoch": 58.60724233983287, "grad_norm": 1.3312453916466178, "learning_rate": 7.791325755191866e-06, "loss": 2.3042, "step": 21040 }, { "epoch": 58.66295264623955, "grad_norm": 1.4400016262848356, "learning_rate": 7.754199697827755e-06, "loss": 2.2708, "step": 21060 }, { "epoch": 58.71866295264624, "grad_norm": 1.4653785109530788, "learning_rate": 7.717312779663285e-06, "loss": 2.298, "step": 21080 }, { "epoch": 58.77437325905292, "grad_norm": 1.4617048241574984, "learning_rate": 7.680665254573972e-06, "loss": 2.295, "step": 21100 }, { "epoch": 58.83008356545961, "grad_norm": 1.5216063506001387, "learning_rate": 7.644257374787696e-06, "loss": 2.276, "step": 21120 }, { "epoch": 58.8857938718663, "grad_norm": 1.3935611767924123, "learning_rate": 7.6080893908829835e-06, "loss": 2.2758, "step": 21140 }, { "epoch": 58.94150417827298, "grad_norm": 1.4853858842912901, "learning_rate": 7.572161551787261e-06, "loss": 2.2871, "step": 21160 }, { "epoch": 58.99721448467967, "grad_norm": 1.5467142471984074, "learning_rate": 7.536474104775158e-06, "loss": 2.2848, "step": 21180 }, { "epoch": 59.05292479108635, "grad_norm": 1.5612739193627336, "learning_rate": 7.501027295466781e-06, "loss": 2.2918, "step": 21200 }, { "epoch": 59.108635097493035, "grad_norm": 1.5107647532729618, "learning_rate": 7.4658213678260586e-06, "loss": 2.2938, "step": 21220 }, { "epoch": 59.16434540389972, "grad_norm": 1.613803688667171, "learning_rate": 7.430856564159026e-06, "loss": 2.2624, "step": 21240 }, { "epoch": 59.220055710306404, "grad_norm": 1.4075510840449976, "learning_rate": 7.396133125112186e-06, "loss": 2.2882, "step": 21260 }, { "epoch": 59.27576601671309, "grad_norm": 1.4680721335227742, "learning_rate": 7.361651289670837e-06, "loss": 2.2772, "step": 21280 }, { "epoch": 59.33147632311978, "grad_norm": 1.9100448192464394, "learning_rate": 7.327411295157427e-06, "loss": 2.2552, "step": 21300 }, { "epoch": 59.38718662952646, "grad_norm": 1.6058890472494596, "learning_rate": 7.293413377229926e-06, "loss": 2.2458, "step": 21320 }, { "epoch": 59.44289693593315, "grad_norm": 1.760861350098991, "learning_rate": 7.259657769880218e-06, "loss": 2.2921, "step": 21340 }, { "epoch": 59.49860724233983, "grad_norm": 1.4588818247613144, "learning_rate": 7.226144705432453e-06, "loss": 2.2647, "step": 21360 }, { "epoch": 59.55431754874652, "grad_norm": 1.347496064491126, "learning_rate": 7.192874414541492e-06, "loss": 2.3212, "step": 21380 }, { "epoch": 59.610027855153206, "grad_norm": 1.4180100417862518, "learning_rate": 7.159847126191279e-06, "loss": 2.2922, "step": 21400 }, { "epoch": 59.66573816155989, "grad_norm": 1.3383217284211308, "learning_rate": 7.127063067693305e-06, "loss": 2.2642, "step": 21420 }, { "epoch": 59.721448467966574, "grad_norm": 1.5431150296701466, "learning_rate": 7.094522464685003e-06, "loss": 2.2798, "step": 21440 }, { "epoch": 59.77715877437326, "grad_norm": 1.453049719160441, "learning_rate": 7.062225541128232e-06, "loss": 2.2882, "step": 21460 }, { "epoch": 59.83286908077994, "grad_norm": 1.5504386381902358, "learning_rate": 7.030172519307708e-06, "loss": 2.2702, "step": 21480 }, { "epoch": 59.88857938718663, "grad_norm": 1.3068595652128718, "learning_rate": 6.998363619829485e-06, "loss": 2.2867, "step": 21500 }, { "epoch": 59.94428969359331, "grad_norm": 1.5921797096923227, "learning_rate": 6.966799061619429e-06, "loss": 2.3073, "step": 21520 }, { "epoch": 60.0, "grad_norm": 1.4377189989333592, "learning_rate": 6.935479061921752e-06, "loss": 2.2524, "step": 21540 }, { "epoch": 60.05571030640669, "grad_norm": 1.4965660553834361, "learning_rate": 6.904403836297449e-06, "loss": 2.2908, "step": 21560 }, { "epoch": 60.11142061281337, "grad_norm": 1.5433152079814891, "learning_rate": 6.873573598622855e-06, "loss": 2.3, "step": 21580 }, { "epoch": 60.16713091922006, "grad_norm": 1.532824089241608, "learning_rate": 6.842988561088175e-06, "loss": 2.2503, "step": 21600 }, { "epoch": 60.22284122562674, "grad_norm": 1.404540197652475, "learning_rate": 6.81264893419601e-06, "loss": 2.2671, "step": 21620 }, { "epoch": 60.278551532033426, "grad_norm": 2.3304693566638277, "learning_rate": 6.782554926759919e-06, "loss": 2.272, "step": 21640 }, { "epoch": 60.33426183844011, "grad_norm": 1.4431191588807148, "learning_rate": 6.752706745902972e-06, "loss": 2.2741, "step": 21660 }, { "epoch": 60.389972144846794, "grad_norm": 1.7541038462058614, "learning_rate": 6.723104597056326e-06, "loss": 2.2679, "step": 21680 }, { "epoch": 60.44568245125348, "grad_norm": 1.5107298093918222, "learning_rate": 6.693748683957818e-06, "loss": 2.2439, "step": 21700 }, { "epoch": 60.50139275766017, "grad_norm": 1.8531551454644686, "learning_rate": 6.664639208650558e-06, "loss": 2.3011, "step": 21720 }, { "epoch": 60.55710306406685, "grad_norm": 1.7982354405211904, "learning_rate": 6.635776371481545e-06, "loss": 2.2564, "step": 21740 }, { "epoch": 60.61281337047354, "grad_norm": 1.4660071270340647, "learning_rate": 6.607160371100274e-06, "loss": 2.2991, "step": 21760 }, { "epoch": 60.66852367688022, "grad_norm": 1.6108738101591895, "learning_rate": 6.578791404457377e-06, "loss": 2.2712, "step": 21780 }, { "epoch": 60.72423398328691, "grad_norm": 1.7280681944263292, "learning_rate": 6.550669666803269e-06, "loss": 2.2645, "step": 21800 }, { "epoch": 60.779944289693596, "grad_norm": 1.634845479160207, "learning_rate": 6.522795351686807e-06, "loss": 2.264, "step": 21820 }, { "epoch": 60.83565459610028, "grad_norm": 1.4425486098001479, "learning_rate": 6.495168650953954e-06, "loss": 2.2848, "step": 21840 }, { "epoch": 60.891364902506965, "grad_norm": 1.4822789289824396, "learning_rate": 6.467789754746452e-06, "loss": 2.2683, "step": 21860 }, { "epoch": 60.94707520891365, "grad_norm": 1.732381751649639, "learning_rate": 6.440658851500523e-06, "loss": 2.2965, "step": 21880 }, { "epoch": 61.00278551532033, "grad_norm": 1.5491944453547546, "learning_rate": 6.413776127945568e-06, "loss": 2.2874, "step": 21900 }, { "epoch": 61.05849582172702, "grad_norm": 1.3257669016068976, "learning_rate": 6.3871417691028895e-06, "loss": 2.2499, "step": 21920 }, { "epoch": 61.1142061281337, "grad_norm": 1.9195662310637283, "learning_rate": 6.360755958284388e-06, "loss": 2.2535, "step": 21940 }, { "epoch": 61.16991643454039, "grad_norm": 1.5972372792438843, "learning_rate": 6.334618877091354e-06, "loss": 2.2632, "step": 21960 }, { "epoch": 61.22562674094708, "grad_norm": 1.3893966806690632, "learning_rate": 6.308730705413165e-06, "loss": 2.2583, "step": 21980 }, { "epoch": 61.28133704735376, "grad_norm": 1.3230497193349502, "learning_rate": 6.283091621426083e-06, "loss": 2.2836, "step": 22000 }, { "epoch": 61.33704735376045, "grad_norm": 1.3952354521448391, "learning_rate": 6.257701801592015e-06, "loss": 2.257, "step": 22020 }, { "epoch": 61.39275766016713, "grad_norm": 1.554543762365429, "learning_rate": 6.232561420657287e-06, "loss": 2.2712, "step": 22040 }, { "epoch": 61.448467966573816, "grad_norm": 1.5605932348002485, "learning_rate": 6.207670651651461e-06, "loss": 2.2724, "step": 22060 }, { "epoch": 61.5041782729805, "grad_norm": 1.3976211106358032, "learning_rate": 6.183029665886133e-06, "loss": 2.2473, "step": 22080 }, { "epoch": 61.559888579387184, "grad_norm": 1.8546230069148926, "learning_rate": 6.158638632953763e-06, "loss": 2.2717, "step": 22100 }, { "epoch": 61.61559888579387, "grad_norm": 1.89636102470963, "learning_rate": 6.134497720726502e-06, "loss": 2.2812, "step": 22120 }, { "epoch": 61.67130919220056, "grad_norm": 1.5811862192223516, "learning_rate": 6.110607095355023e-06, "loss": 2.2526, "step": 22140 }, { "epoch": 61.72701949860724, "grad_norm": 1.4824595544381087, "learning_rate": 6.0869669212674075e-06, "loss": 2.2745, "step": 22160 }, { "epoch": 61.78272980501393, "grad_norm": 1.457324859716249, "learning_rate": 6.063577361167978e-06, "loss": 2.2999, "step": 22180 }, { "epoch": 61.83844011142061, "grad_norm": 1.7057172055075098, "learning_rate": 6.040438576036232e-06, "loss": 2.2332, "step": 22200 }, { "epoch": 61.8941504178273, "grad_norm": 1.4146229574598475, "learning_rate": 6.0175507251256545e-06, "loss": 2.2701, "step": 22220 }, { "epoch": 61.949860724233986, "grad_norm": 1.583083000184335, "learning_rate": 5.994913965962701e-06, "loss": 2.2528, "step": 22240 }, { "epoch": 62.00557103064067, "grad_norm": 1.4267458097429977, "learning_rate": 5.972528454345661e-06, "loss": 2.2459, "step": 22260 }, { "epoch": 62.061281337047355, "grad_norm": 1.868289697809984, "learning_rate": 5.950394344343613e-06, "loss": 2.2553, "step": 22280 }, { "epoch": 62.116991643454035, "grad_norm": 1.6100749427479117, "learning_rate": 5.928511788295353e-06, "loss": 2.258, "step": 22300 }, { "epoch": 62.17270194986072, "grad_norm": 2.033401841218128, "learning_rate": 5.906880936808346e-06, "loss": 2.2656, "step": 22320 }, { "epoch": 62.22841225626741, "grad_norm": 1.5336132691384432, "learning_rate": 5.8855019387576895e-06, "loss": 2.2713, "step": 22340 }, { "epoch": 62.28412256267409, "grad_norm": 1.4006290924081595, "learning_rate": 5.864374941285097e-06, "loss": 2.273, "step": 22360 }, { "epoch": 62.33983286908078, "grad_norm": 1.5727786325700963, "learning_rate": 5.843500089797875e-06, "loss": 2.2698, "step": 22380 }, { "epoch": 62.39554317548747, "grad_norm": 1.5471505670773764, "learning_rate": 5.822877527967931e-06, "loss": 2.2366, "step": 22400 }, { "epoch": 62.45125348189415, "grad_norm": 1.7387583369728248, "learning_rate": 5.802507397730769e-06, "loss": 2.2517, "step": 22420 }, { "epoch": 62.50696378830084, "grad_norm": 1.34648038991986, "learning_rate": 5.782389839284539e-06, "loss": 2.2792, "step": 22440 }, { "epoch": 62.56267409470752, "grad_norm": 1.4257642559426869, "learning_rate": 5.76252499108904e-06, "loss": 2.2639, "step": 22460 }, { "epoch": 62.618384401114206, "grad_norm": 1.4992603072132409, "learning_rate": 5.7429129898647996e-06, "loss": 2.2469, "step": 22480 }, { "epoch": 62.674094707520894, "grad_norm": 1.3812098236775807, "learning_rate": 5.723553970592111e-06, "loss": 2.2778, "step": 22500 }, { "epoch": 62.729805013927574, "grad_norm": 1.5792548079682394, "learning_rate": 5.704448066510095e-06, "loss": 2.267, "step": 22520 }, { "epoch": 62.78551532033426, "grad_norm": 1.3978392074739474, "learning_rate": 5.6855954091158275e-06, "loss": 2.2949, "step": 22540 }, { "epoch": 62.84122562674095, "grad_norm": 1.323364408912749, "learning_rate": 5.666996128163389e-06, "loss": 2.239, "step": 22560 }, { "epoch": 62.89693593314763, "grad_norm": 1.47299942786999, "learning_rate": 5.648650351662984e-06, "loss": 2.2428, "step": 22580 }, { "epoch": 62.95264623955432, "grad_norm": 1.5746917164193233, "learning_rate": 5.630558205880067e-06, "loss": 2.2717, "step": 22600 }, { "epoch": 63.008356545961, "grad_norm": 1.3453881918350201, "learning_rate": 5.612719815334472e-06, "loss": 2.2605, "step": 22620 }, { "epoch": 63.06406685236769, "grad_norm": 1.5732476592031857, "learning_rate": 5.595135302799554e-06, "loss": 2.2981, "step": 22640 }, { "epoch": 63.119777158774376, "grad_norm": 1.4933682090479892, "learning_rate": 5.577804789301342e-06, "loss": 2.2629, "step": 22660 }, { "epoch": 63.17548746518106, "grad_norm": 2.277146814514644, "learning_rate": 5.560728394117715e-06, "loss": 2.2708, "step": 22680 }, { "epoch": 63.231197771587745, "grad_norm": 1.272383107286924, "learning_rate": 5.543906234777552e-06, "loss": 2.2573, "step": 22700 }, { "epoch": 63.286908077994426, "grad_norm": 1.5182549490627633, "learning_rate": 5.527338427059974e-06, "loss": 2.2316, "step": 22720 }, { "epoch": 63.34261838440111, "grad_norm": 1.7891996636535088, "learning_rate": 5.511025084993495e-06, "loss": 2.2441, "step": 22740 }, { "epoch": 63.3983286908078, "grad_norm": 1.5802790373457376, "learning_rate": 5.494966320855273e-06, "loss": 2.2617, "step": 22760 }, { "epoch": 63.45403899721448, "grad_norm": 1.6073072729662374, "learning_rate": 5.479162245170319e-06, "loss": 2.2458, "step": 22780 }, { "epoch": 63.50974930362117, "grad_norm": 1.5697619339543767, "learning_rate": 5.4636129667107414e-06, "loss": 2.2971, "step": 22800 }, { "epoch": 63.56545961002786, "grad_norm": 1.3744571764690732, "learning_rate": 5.448318592495002e-06, "loss": 2.2844, "step": 22820 }, { "epoch": 63.62116991643454, "grad_norm": 1.6689901020657363, "learning_rate": 5.433279227787173e-06, "loss": 2.2517, "step": 22840 }, { "epoch": 63.67688022284123, "grad_norm": 1.382942198241601, "learning_rate": 5.418494976096209e-06, "loss": 2.26, "step": 22860 }, { "epoch": 63.73259052924791, "grad_norm": 1.5877846358641807, "learning_rate": 5.403965939175251e-06, "loss": 2.2572, "step": 22880 }, { "epoch": 63.788300835654596, "grad_norm": 1.816527139540087, "learning_rate": 5.389692217020904e-06, "loss": 2.2546, "step": 22900 }, { "epoch": 63.844011142061284, "grad_norm": 1.7654697501524792, "learning_rate": 5.375673907872574e-06, "loss": 2.2418, "step": 22920 }, { "epoch": 63.899721448467965, "grad_norm": 1.5888334127364063, "learning_rate": 5.36191110821176e-06, "loss": 2.2664, "step": 22940 }, { "epoch": 63.95543175487465, "grad_norm": 1.4670614525113759, "learning_rate": 5.348403912761424e-06, "loss": 2.2343, "step": 22960 }, { "epoch": 64.01114206128133, "grad_norm": 1.3543942876062933, "learning_rate": 5.335152414485308e-06, "loss": 2.2503, "step": 22980 }, { "epoch": 64.06685236768803, "grad_norm": 1.4563546735325645, "learning_rate": 5.32215670458733e-06, "loss": 2.2304, "step": 23000 }, { "epoch": 64.12256267409471, "grad_norm": 1.622565484000199, "learning_rate": 5.309416872510913e-06, "loss": 2.2452, "step": 23020 }, { "epoch": 64.17827298050139, "grad_norm": 1.4460308265765018, "learning_rate": 5.296933005938412e-06, "loss": 2.2938, "step": 23040 }, { "epoch": 64.23398328690807, "grad_norm": 1.4440564061553423, "learning_rate": 5.284705190790466e-06, "loss": 2.2453, "step": 23060 }, { "epoch": 64.28969359331477, "grad_norm": 1.4478148572192913, "learning_rate": 5.272733511225455e-06, "loss": 2.2343, "step": 23080 }, { "epoch": 64.34540389972145, "grad_norm": 1.4180812216938306, "learning_rate": 5.261018049638886e-06, "loss": 2.2665, "step": 23100 }, { "epoch": 64.40111420612813, "grad_norm": 1.5831915297512447, "learning_rate": 5.24955888666284e-06, "loss": 2.2539, "step": 23120 }, { "epoch": 64.45682451253482, "grad_norm": 1.6700085502977315, "learning_rate": 5.238356101165407e-06, "loss": 2.2677, "step": 23140 }, { "epoch": 64.5125348189415, "grad_norm": 1.4281340861580372, "learning_rate": 5.227409770250158e-06, "loss": 2.2693, "step": 23160 }, { "epoch": 64.56824512534818, "grad_norm": 1.474080653934136, "learning_rate": 5.216719969255597e-06, "loss": 2.2576, "step": 23180 }, { "epoch": 64.62395543175488, "grad_norm": 1.5546948660357771, "learning_rate": 5.206286771754661e-06, "loss": 2.2718, "step": 23200 }, { "epoch": 64.67966573816156, "grad_norm": 1.5619037183872753, "learning_rate": 5.196110249554205e-06, "loss": 2.2617, "step": 23220 }, { "epoch": 64.73537604456824, "grad_norm": 1.4612049949245505, "learning_rate": 5.186190472694495e-06, "loss": 2.2531, "step": 23240 }, { "epoch": 64.79108635097494, "grad_norm": 1.4463084271801094, "learning_rate": 5.176527509448752e-06, "loss": 2.2492, "step": 23260 }, { "epoch": 64.84679665738162, "grad_norm": 1.52054949971835, "learning_rate": 5.167121426322663e-06, "loss": 2.265, "step": 23280 }, { "epoch": 64.9025069637883, "grad_norm": 1.6137336460770288, "learning_rate": 5.157972288053923e-06, "loss": 2.2761, "step": 23300 }, { "epoch": 64.958217270195, "grad_norm": 1.5343563001642067, "learning_rate": 5.1490801576118046e-06, "loss": 2.2589, "step": 23320 }, { "epoch": 65.01392757660167, "grad_norm": 1.4067304900789732, "learning_rate": 5.140445096196706e-06, "loss": 2.2344, "step": 23340 }, { "epoch": 65.06963788300835, "grad_norm": 1.6266281172023442, "learning_rate": 5.132067163239744e-06, "loss": 2.2327, "step": 23360 }, { "epoch": 65.12534818941504, "grad_norm": 1.6114614480347964, "learning_rate": 5.123946416402338e-06, "loss": 2.2252, "step": 23380 }, { "epoch": 65.18105849582173, "grad_norm": 1.4381934112576613, "learning_rate": 5.116082911575816e-06, "loss": 2.2376, "step": 23400 }, { "epoch": 65.23676880222841, "grad_norm": 1.6265621183224508, "learning_rate": 5.108476702881032e-06, "loss": 2.2575, "step": 23420 }, { "epoch": 65.29247910863509, "grad_norm": 1.6332251534091453, "learning_rate": 5.101127842667981e-06, "loss": 2.2482, "step": 23440 }, { "epoch": 65.34818941504179, "grad_norm": 1.4703516792242604, "learning_rate": 5.094036381515459e-06, "loss": 2.2636, "step": 23460 }, { "epoch": 65.40389972144847, "grad_norm": 1.828744349221896, "learning_rate": 5.087202368230689e-06, "loss": 2.2676, "step": 23480 }, { "epoch": 65.45961002785515, "grad_norm": 1.5728681116117378, "learning_rate": 5.080625849849016e-06, "loss": 2.2408, "step": 23500 }, { "epoch": 65.51532033426184, "grad_norm": 1.3646586794266737, "learning_rate": 5.074306871633561e-06, "loss": 2.2594, "step": 23520 }, { "epoch": 65.57103064066852, "grad_norm": 1.344312658230311, "learning_rate": 5.068245477074914e-06, "loss": 2.2548, "step": 23540 }, { "epoch": 65.6267409470752, "grad_norm": 2.3009044106769543, "learning_rate": 5.062441707890833e-06, "loss": 2.2515, "step": 23560 }, { "epoch": 65.6824512534819, "grad_norm": 1.5477394057524316, "learning_rate": 5.056895604025971e-06, "loss": 2.2286, "step": 23580 }, { "epoch": 65.73816155988858, "grad_norm": 1.473509209538851, "learning_rate": 5.051607203651582e-06, "loss": 2.2558, "step": 23600 }, { "epoch": 65.79387186629526, "grad_norm": 1.6163262255244608, "learning_rate": 5.046576543165266e-06, "loss": 2.2587, "step": 23620 }, { "epoch": 65.84958217270194, "grad_norm": 1.7522873848064437, "learning_rate": 5.041803657190727e-06, "loss": 2.262, "step": 23640 }, { "epoch": 65.90529247910864, "grad_norm": 1.9510070766562828, "learning_rate": 5.037288578577515e-06, "loss": 2.2731, "step": 23660 }, { "epoch": 65.96100278551532, "grad_norm": 1.3249968235869283, "learning_rate": 5.033031338400824e-06, "loss": 2.2357, "step": 23680 }, { "epoch": 66.016713091922, "grad_norm": 1.8089068811586904, "learning_rate": 5.0290319659612565e-06, "loss": 2.2264, "step": 23700 }, { "epoch": 66.0724233983287, "grad_norm": 1.6823052379660255, "learning_rate": 5.0252904887846365e-06, "loss": 2.2241, "step": 23720 }, { "epoch": 66.12813370473538, "grad_norm": 1.4348819340116656, "learning_rate": 5.02180693262181e-06, "loss": 2.2448, "step": 23740 }, { "epoch": 66.18384401114206, "grad_norm": 1.48816291038319, "learning_rate": 5.01858132144848e-06, "loss": 2.2445, "step": 23760 }, { "epoch": 66.23955431754875, "grad_norm": 1.4921612956391412, "learning_rate": 5.015613677465031e-06, "loss": 2.2608, "step": 23780 }, { "epoch": 66.29526462395543, "grad_norm": 1.8304620275041354, "learning_rate": 5.0129040210963695e-06, "loss": 2.2599, "step": 23800 }, { "epoch": 66.35097493036211, "grad_norm": 1.3873830650491625, "learning_rate": 5.010452370991807e-06, "loss": 2.2506, "step": 23820 }, { "epoch": 66.40668523676881, "grad_norm": 1.4260103007082212, "learning_rate": 5.008258744024913e-06, "loss": 2.2474, "step": 23840 }, { "epoch": 66.46239554317549, "grad_norm": 1.5475790370983857, "learning_rate": 5.006323155293398e-06, "loss": 2.2718, "step": 23860 }, { "epoch": 66.51810584958217, "grad_norm": 1.6513082696882795, "learning_rate": 5.004645618119022e-06, "loss": 2.2305, "step": 23880 }, { "epoch": 66.57381615598885, "grad_norm": 1.3518749191666553, "learning_rate": 5.0032261440475e-06, "loss": 2.2475, "step": 23900 }, { "epoch": 66.62952646239555, "grad_norm": 1.4415217716791036, "learning_rate": 5.0020647428484e-06, "loss": 2.2413, "step": 23920 }, { "epoch": 66.68523676880223, "grad_norm": 1.463396039264776, "learning_rate": 5.001161422515119e-06, "loss": 2.2409, "step": 23940 }, { "epoch": 66.74094707520891, "grad_norm": 1.6094576785573045, "learning_rate": 5.000516189264787e-06, "loss": 2.2368, "step": 23960 }, { "epoch": 66.7966573816156, "grad_norm": 1.8634319644864112, "learning_rate": 5.000129047538239e-06, "loss": 2.2534, "step": 23980 }, { "epoch": 66.85236768802228, "grad_norm": 1.5147891109103364, "learning_rate": 5e-06, "loss": 2.2525, "step": 24000 } ], "logging_steps": 20, "max_steps": 24000, "num_input_tokens_seen": 0, "num_train_epochs": 67, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4048509763584000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }