{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03451197479245361, "eval_steps": 500, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.3139968490567015e-05, "grad_norm": 36.896514892578125, "learning_rate": 3.3333333333333333e-06, "loss": 2.8457, "step": 50 }, { "epoch": 8.627993698113403e-05, "grad_norm": 35.37440490722656, "learning_rate": 6.666666666666667e-06, "loss": 2.1361, "step": 100 }, { "epoch": 0.00012941990547170104, "grad_norm": 31.632505416870117, "learning_rate": 1e-05, "loss": 0.777, "step": 150 }, { "epoch": 0.00017255987396226806, "grad_norm": 0.4134848415851593, "learning_rate": 1.3333333333333333e-05, "loss": 0.4916, "step": 200 }, { "epoch": 0.00021569984245283508, "grad_norm": 37.35564422607422, "learning_rate": 1.6666666666666667e-05, "loss": 0.381, "step": 250 }, { "epoch": 0.0002588398109434021, "grad_norm": 45.536712646484375, "learning_rate": 2e-05, "loss": 0.2516, "step": 300 }, { "epoch": 0.0003019797794339691, "grad_norm": 19.42644500732422, "learning_rate": 1.9999136977245545e-05, "loss": 0.2801, "step": 350 }, { "epoch": 0.0003451197479245361, "grad_norm": 54.93406295776367, "learning_rate": 1.9998273954491085e-05, "loss": 0.1867, "step": 400 }, { "epoch": 0.00038825971641510314, "grad_norm": 41.097774505615234, "learning_rate": 1.9997410931736628e-05, "loss": 0.1246, "step": 450 }, { "epoch": 0.00043139968490567016, "grad_norm": 0.38864001631736755, "learning_rate": 1.9996547908982168e-05, "loss": 0.1049, "step": 500 }, { "epoch": 0.0004745396533962372, "grad_norm": 0.041049182415008545, "learning_rate": 1.999568488622771e-05, "loss": 0.2315, "step": 550 }, { "epoch": 0.0005176796218868042, "grad_norm": 0.002712072106078267, "learning_rate": 1.9994821863473255e-05, "loss": 0.1082, "step": 600 }, { "epoch": 0.0005608195903773712, "grad_norm": 2.7014479201170616e-05, "learning_rate": 1.9993958840718798e-05, "loss": 0.0578, "step": 650 }, { "epoch": 0.0006039595588679382, "grad_norm": 1.4746110439300537, "learning_rate": 1.9993095817964338e-05, "loss": 0.2376, "step": 700 }, { "epoch": 0.0006470995273585052, "grad_norm": 28.263187408447266, "learning_rate": 1.999223279520988e-05, "loss": 0.0866, "step": 750 }, { "epoch": 0.0006902394958490722, "grad_norm": 11.379521369934082, "learning_rate": 1.999136977245542e-05, "loss": 0.0782, "step": 800 }, { "epoch": 0.0007333794643396393, "grad_norm": 0.0013383959885686636, "learning_rate": 1.9990506749700965e-05, "loss": 0.1511, "step": 850 }, { "epoch": 0.0007765194328302063, "grad_norm": 6.122570991516113, "learning_rate": 1.9989643726946505e-05, "loss": 0.0885, "step": 900 }, { "epoch": 0.0008196594013207733, "grad_norm": 0.15970896184444427, "learning_rate": 1.9988780704192048e-05, "loss": 0.1027, "step": 950 }, { "epoch": 0.0008627993698113403, "grad_norm": 0.19297116994857788, "learning_rate": 1.998791768143759e-05, "loss": 0.07, "step": 1000 }, { "epoch": 0.0009059393383019073, "grad_norm": 0.00016763704479672015, "learning_rate": 1.998705465868313e-05, "loss": 0.1106, "step": 1050 }, { "epoch": 0.0009490793067924744, "grad_norm": 0.0003569670661818236, "learning_rate": 1.9986191635928675e-05, "loss": 0.0801, "step": 1100 }, { "epoch": 0.0009922192752830413, "grad_norm": 34.73747253417969, "learning_rate": 1.9985328613174218e-05, "loss": 0.152, "step": 1150 }, { "epoch": 0.0010353592437736083, "grad_norm": 0.5465057492256165, "learning_rate": 1.9984465590419758e-05, "loss": 0.082, "step": 1200 }, { "epoch": 0.0010784992122641753, "grad_norm": 0.0005398567882366478, "learning_rate": 1.99836025676653e-05, "loss": 0.1238, "step": 1250 }, { "epoch": 0.0011216391807547423, "grad_norm": 28.343412399291992, "learning_rate": 1.9982739544910845e-05, "loss": 0.079, "step": 1300 }, { "epoch": 0.0011647791492453094, "grad_norm": 0.18938343226909637, "learning_rate": 1.9981876522156385e-05, "loss": 0.0968, "step": 1350 }, { "epoch": 0.0012079191177358764, "grad_norm": 18.69659996032715, "learning_rate": 1.9981013499401928e-05, "loss": 0.152, "step": 1400 }, { "epoch": 0.0012510590862264434, "grad_norm": 26.612380981445312, "learning_rate": 1.9980150476647468e-05, "loss": 0.0549, "step": 1450 }, { "epoch": 0.0012941990547170104, "grad_norm": 0.0005812590825371444, "learning_rate": 1.997928745389301e-05, "loss": 0.0668, "step": 1500 }, { "epoch": 0.0013373390232075775, "grad_norm": 0.15176478028297424, "learning_rate": 1.997842443113855e-05, "loss": 0.1209, "step": 1550 }, { "epoch": 0.0013804789916981445, "grad_norm": 32.04401779174805, "learning_rate": 1.9977561408384094e-05, "loss": 0.1198, "step": 1600 }, { "epoch": 0.0014236189601887115, "grad_norm": 0.6346271634101868, "learning_rate": 1.9976698385629638e-05, "loss": 0.0893, "step": 1650 }, { "epoch": 0.0014667589286792785, "grad_norm": 40.96885681152344, "learning_rate": 1.997583536287518e-05, "loss": 0.1393, "step": 1700 }, { "epoch": 0.0015098988971698455, "grad_norm": 0.29022184014320374, "learning_rate": 1.997497234012072e-05, "loss": 0.0832, "step": 1750 }, { "epoch": 0.0015530388656604126, "grad_norm": 0.6716536283493042, "learning_rate": 1.9974109317366264e-05, "loss": 0.0558, "step": 1800 }, { "epoch": 0.0015961788341509796, "grad_norm": 0.19002307951450348, "learning_rate": 1.9973246294611804e-05, "loss": 0.0881, "step": 1850 }, { "epoch": 0.0016393188026415466, "grad_norm": 0.24587740004062653, "learning_rate": 1.9972383271857348e-05, "loss": 0.0776, "step": 1900 }, { "epoch": 0.0016824587711321136, "grad_norm": 28.058324813842773, "learning_rate": 1.9971520249102888e-05, "loss": 0.0907, "step": 1950 }, { "epoch": 0.0017255987396226807, "grad_norm": 80.17859649658203, "learning_rate": 1.997065722634843e-05, "loss": 0.0566, "step": 2000 }, { "epoch": 0.0017687387081132477, "grad_norm": 0.020453251898288727, "learning_rate": 1.9969794203593974e-05, "loss": 0.066, "step": 2050 }, { "epoch": 0.0018118786766038147, "grad_norm": 1.5788724340382032e-05, "learning_rate": 1.9968931180839514e-05, "loss": 0.1197, "step": 2100 }, { "epoch": 0.0018550186450943817, "grad_norm": 0.008944077417254448, "learning_rate": 1.9968068158085058e-05, "loss": 0.0495, "step": 2150 }, { "epoch": 0.0018981586135849487, "grad_norm": 3.117482719972031e-06, "learning_rate": 1.99672051353306e-05, "loss": 0.0969, "step": 2200 }, { "epoch": 0.0019412985820755155, "grad_norm": 4.803666114807129, "learning_rate": 1.996634211257614e-05, "loss": 0.0534, "step": 2250 }, { "epoch": 0.0019844385505660826, "grad_norm": 2.6723075279733166e-05, "learning_rate": 1.9965479089821684e-05, "loss": 0.0841, "step": 2300 }, { "epoch": 0.0020275785190566496, "grad_norm": 0.0005806431290693581, "learning_rate": 1.9964616067067228e-05, "loss": 0.1267, "step": 2350 }, { "epoch": 0.0020707184875472166, "grad_norm": 6.426816253224388e-05, "learning_rate": 1.9963753044312767e-05, "loss": 0.0404, "step": 2400 }, { "epoch": 0.0021138584560377836, "grad_norm": 5.425294876098633, "learning_rate": 1.996289002155831e-05, "loss": 0.1306, "step": 2450 }, { "epoch": 0.0021569984245283507, "grad_norm": 0.5509458780288696, "learning_rate": 1.996202699880385e-05, "loss": 0.1097, "step": 2500 }, { "epoch": 0.0022001383930189177, "grad_norm": 1.9030728992674995e-08, "learning_rate": 1.9961163976049394e-05, "loss": 0.0954, "step": 2550 }, { "epoch": 0.0022432783615094847, "grad_norm": 5.162133693695068, "learning_rate": 1.9960300953294934e-05, "loss": 0.0495, "step": 2600 }, { "epoch": 0.0022864183300000517, "grad_norm": 0.001043809694238007, "learning_rate": 1.9959437930540477e-05, "loss": 0.0578, "step": 2650 }, { "epoch": 0.0023295582984906187, "grad_norm": 10.08859634399414, "learning_rate": 1.995857490778602e-05, "loss": 0.0479, "step": 2700 }, { "epoch": 0.0023726982669811858, "grad_norm": 0.00013425115321297199, "learning_rate": 1.9957711885031564e-05, "loss": 0.0999, "step": 2750 }, { "epoch": 0.002415838235471753, "grad_norm": 0.5551994442939758, "learning_rate": 1.9956848862277104e-05, "loss": 0.0828, "step": 2800 }, { "epoch": 0.00245897820396232, "grad_norm": 1.9412257671356201, "learning_rate": 1.9955985839522647e-05, "loss": 0.0374, "step": 2850 }, { "epoch": 0.002502118172452887, "grad_norm": 0.2069123089313507, "learning_rate": 1.9955122816768187e-05, "loss": 0.0201, "step": 2900 }, { "epoch": 0.002545258140943454, "grad_norm": 1.6855838111951016e-05, "learning_rate": 1.995425979401373e-05, "loss": 0.0395, "step": 2950 }, { "epoch": 0.002588398109434021, "grad_norm": 4.1953666141125723e-07, "learning_rate": 1.9953396771259274e-05, "loss": 0.081, "step": 3000 }, { "epoch": 0.002631538077924588, "grad_norm": 29.5993709564209, "learning_rate": 1.9952533748504814e-05, "loss": 0.0553, "step": 3050 }, { "epoch": 0.002674678046415155, "grad_norm": 3.231801031233772e-07, "learning_rate": 1.9951670725750357e-05, "loss": 0.153, "step": 3100 }, { "epoch": 0.002717818014905722, "grad_norm": 2.516810655593872, "learning_rate": 1.9950807702995897e-05, "loss": 0.0492, "step": 3150 }, { "epoch": 0.002760957983396289, "grad_norm": 1.1921870708465576, "learning_rate": 1.994994468024144e-05, "loss": 0.0816, "step": 3200 }, { "epoch": 0.002804097951886856, "grad_norm": 2.2311925888061523, "learning_rate": 1.9949081657486984e-05, "loss": 0.0785, "step": 3250 }, { "epoch": 0.002847237920377423, "grad_norm": 6.03306652919855e-05, "learning_rate": 1.9948218634732527e-05, "loss": 0.0706, "step": 3300 }, { "epoch": 0.00289037788886799, "grad_norm": 0.014764097519218922, "learning_rate": 1.9947355611978067e-05, "loss": 0.0731, "step": 3350 }, { "epoch": 0.002933517857358557, "grad_norm": 0.007481596898287535, "learning_rate": 1.994649258922361e-05, "loss": 0.0535, "step": 3400 }, { "epoch": 0.002976657825849124, "grad_norm": 0.35124772787094116, "learning_rate": 1.994562956646915e-05, "loss": 0.056, "step": 3450 }, { "epoch": 0.003019797794339691, "grad_norm": 2.102785583701916e-05, "learning_rate": 1.9944766543714694e-05, "loss": 0.0412, "step": 3500 }, { "epoch": 0.003062937762830258, "grad_norm": 11.827704429626465, "learning_rate": 1.9943903520960234e-05, "loss": 0.0194, "step": 3550 }, { "epoch": 0.003106077731320825, "grad_norm": 0.0012801631819456816, "learning_rate": 1.9943040498205777e-05, "loss": 0.0573, "step": 3600 }, { "epoch": 0.003149217699811392, "grad_norm": 6.006156905158377e-09, "learning_rate": 1.994217747545132e-05, "loss": 0.0459, "step": 3650 }, { "epoch": 0.003192357668301959, "grad_norm": 2.7759302412277975e-08, "learning_rate": 1.994131445269686e-05, "loss": 0.1287, "step": 3700 }, { "epoch": 0.003235497636792526, "grad_norm": 2.103457186208857e-09, "learning_rate": 1.9940451429942404e-05, "loss": 0.0164, "step": 3750 }, { "epoch": 0.0032786376052830932, "grad_norm": 2.2541730981817665e-10, "learning_rate": 1.9939588407187947e-05, "loss": 0.0747, "step": 3800 }, { "epoch": 0.0033217775737736602, "grad_norm": 5.149080607225187e-05, "learning_rate": 1.9938725384433487e-05, "loss": 0.0055, "step": 3850 }, { "epoch": 0.0033649175422642273, "grad_norm": 1.0809308290481567, "learning_rate": 1.993786236167903e-05, "loss": 0.0665, "step": 3900 }, { "epoch": 0.0034080575107547943, "grad_norm": 40.65428924560547, "learning_rate": 1.9936999338924574e-05, "loss": 0.0494, "step": 3950 }, { "epoch": 0.0034511974792453613, "grad_norm": 2.0113883018493652, "learning_rate": 1.9936136316170113e-05, "loss": 0.0212, "step": 4000 }, { "epoch": 0.0034943374477359283, "grad_norm": 4.40586519241333, "learning_rate": 1.9935273293415657e-05, "loss": 0.0988, "step": 4050 }, { "epoch": 0.0035374774162264954, "grad_norm": 1.0736999684013426e-05, "learning_rate": 1.9934410270661197e-05, "loss": 0.093, "step": 4100 }, { "epoch": 0.0035806173847170624, "grad_norm": 8.809935820863757e-07, "learning_rate": 1.993354724790674e-05, "loss": 0.0384, "step": 4150 }, { "epoch": 0.0036237573532076294, "grad_norm": 2.5714776515960693, "learning_rate": 1.993268422515228e-05, "loss": 0.0904, "step": 4200 }, { "epoch": 0.0036668973216981964, "grad_norm": 1.5415873022561755e-09, "learning_rate": 1.9931821202397823e-05, "loss": 0.0201, "step": 4250 }, { "epoch": 0.0037100372901887634, "grad_norm": 0.0013566080015152693, "learning_rate": 1.9930958179643367e-05, "loss": 0.116, "step": 4300 }, { "epoch": 0.0037531772586793305, "grad_norm": 0.036448314785957336, "learning_rate": 1.993009515688891e-05, "loss": 0.0538, "step": 4350 }, { "epoch": 0.0037963172271698975, "grad_norm": 1.335322380065918, "learning_rate": 1.992923213413445e-05, "loss": 0.0343, "step": 4400 }, { "epoch": 0.0038394571956604645, "grad_norm": 0.001166568254120648, "learning_rate": 1.9928369111379993e-05, "loss": 0.0827, "step": 4450 }, { "epoch": 0.003882597164151031, "grad_norm": 2.504633656030819e-08, "learning_rate": 1.9927506088625533e-05, "loss": 0.0871, "step": 4500 }, { "epoch": 0.0039257371326415985, "grad_norm": 1.2820944903069176e-05, "learning_rate": 1.9926643065871077e-05, "loss": 0.0427, "step": 4550 }, { "epoch": 0.003968877101132165, "grad_norm": 0.0003728326119016856, "learning_rate": 1.9925780043116617e-05, "loss": 0.0448, "step": 4600 }, { "epoch": 0.004012017069622733, "grad_norm": 0.1788995862007141, "learning_rate": 1.992491702036216e-05, "loss": 0.0341, "step": 4650 }, { "epoch": 0.004055157038113299, "grad_norm": 1.2131690709793475e-05, "learning_rate": 1.9924053997607703e-05, "loss": 0.0456, "step": 4700 }, { "epoch": 0.004098297006603867, "grad_norm": 0.09960448741912842, "learning_rate": 1.9923190974853243e-05, "loss": 0.0219, "step": 4750 }, { "epoch": 0.004141436975094433, "grad_norm": 0.00010674689110601321, "learning_rate": 1.9922327952098786e-05, "loss": 0.0585, "step": 4800 }, { "epoch": 0.004184576943585001, "grad_norm": 12.699185371398926, "learning_rate": 1.992146492934433e-05, "loss": 0.1029, "step": 4850 }, { "epoch": 0.004227716912075567, "grad_norm": 3.298513320260099e-06, "learning_rate": 1.9920601906589873e-05, "loss": 0.0596, "step": 4900 }, { "epoch": 0.004270856880566135, "grad_norm": 7.301036021090113e-06, "learning_rate": 1.9919738883835413e-05, "loss": 0.0433, "step": 4950 }, { "epoch": 0.004313996849056701, "grad_norm": 1.848353167588357e-05, "learning_rate": 1.9918875861080956e-05, "loss": 0.0439, "step": 5000 }, { "epoch": 0.004357136817547269, "grad_norm": 3.848089909297414e-05, "learning_rate": 1.9918012838326496e-05, "loss": 0.1067, "step": 5050 }, { "epoch": 0.004400276786037835, "grad_norm": 5.0859394832514226e-05, "learning_rate": 1.991714981557204e-05, "loss": 0.0592, "step": 5100 }, { "epoch": 0.004443416754528403, "grad_norm": 3.35551449097693e-05, "learning_rate": 1.991628679281758e-05, "loss": 0.1294, "step": 5150 }, { "epoch": 0.004486556723018969, "grad_norm": 0.4632960259914398, "learning_rate": 1.9915423770063123e-05, "loss": 0.0896, "step": 5200 }, { "epoch": 0.004529696691509537, "grad_norm": 26.527536392211914, "learning_rate": 1.9914560747308663e-05, "loss": 0.0405, "step": 5250 }, { "epoch": 0.0045728366600001034, "grad_norm": 1.0410542017780244e-05, "learning_rate": 1.9913697724554206e-05, "loss": 0.0509, "step": 5300 }, { "epoch": 0.004615976628490671, "grad_norm": 29.268795013427734, "learning_rate": 1.991283470179975e-05, "loss": 0.0698, "step": 5350 }, { "epoch": 0.0046591165969812375, "grad_norm": 4.8836263886187226e-05, "learning_rate": 1.9911971679045293e-05, "loss": 0.0155, "step": 5400 }, { "epoch": 0.004702256565471805, "grad_norm": 0.228873610496521, "learning_rate": 1.9911108656290833e-05, "loss": 0.0254, "step": 5450 }, { "epoch": 0.0047453965339623715, "grad_norm": 0.5368197560310364, "learning_rate": 1.9910245633536376e-05, "loss": 0.0429, "step": 5500 }, { "epoch": 0.004788536502452939, "grad_norm": 2.2967957193031907e-05, "learning_rate": 1.9909382610781916e-05, "loss": 0.048, "step": 5550 }, { "epoch": 0.004831676470943506, "grad_norm": 0.20427367091178894, "learning_rate": 1.990851958802746e-05, "loss": 0.0611, "step": 5600 }, { "epoch": 0.004874816439434073, "grad_norm": 9.368510246276855, "learning_rate": 1.9907656565273003e-05, "loss": 0.0462, "step": 5650 }, { "epoch": 0.00491795640792464, "grad_norm": 0.08957739174365997, "learning_rate": 1.9906793542518543e-05, "loss": 0.063, "step": 5700 }, { "epoch": 0.004961096376415207, "grad_norm": 0.0012034045066684484, "learning_rate": 1.9905930519764086e-05, "loss": 0.0335, "step": 5750 }, { "epoch": 0.005004236344905774, "grad_norm": 0.02072218433022499, "learning_rate": 1.9905067497009626e-05, "loss": 0.0799, "step": 5800 }, { "epoch": 0.005047376313396341, "grad_norm": 0.008446129970252514, "learning_rate": 1.990420447425517e-05, "loss": 0.0395, "step": 5850 }, { "epoch": 0.005090516281886908, "grad_norm": 47.39201736450195, "learning_rate": 1.9903341451500713e-05, "loss": 0.0857, "step": 5900 }, { "epoch": 0.005133656250377474, "grad_norm": 4.237736720824614e-05, "learning_rate": 1.9902478428746256e-05, "loss": 0.1098, "step": 5950 }, { "epoch": 0.005176796218868042, "grad_norm": 3.733102630576468e-06, "learning_rate": 1.9901615405991796e-05, "loss": 0.0516, "step": 6000 }, { "epoch": 0.005219936187358608, "grad_norm": 0.0014495301293209195, "learning_rate": 1.990075238323734e-05, "loss": 0.009, "step": 6050 }, { "epoch": 0.005263076155849176, "grad_norm": 1.5238803143802215e-06, "learning_rate": 1.989988936048288e-05, "loss": 0.065, "step": 6100 }, { "epoch": 0.005306216124339742, "grad_norm": 3.455934120211168e-06, "learning_rate": 1.9899026337728423e-05, "loss": 0.0879, "step": 6150 }, { "epoch": 0.00534935609283031, "grad_norm": 1.4700952988278004e-07, "learning_rate": 1.9898163314973963e-05, "loss": 0.047, "step": 6200 }, { "epoch": 0.005392496061320876, "grad_norm": 0.3679034411907196, "learning_rate": 1.9897300292219506e-05, "loss": 0.0449, "step": 6250 }, { "epoch": 0.005435636029811444, "grad_norm": 0.8546851873397827, "learning_rate": 1.989643726946505e-05, "loss": 0.0829, "step": 6300 }, { "epoch": 0.0054787759983020105, "grad_norm": 0.003740283427760005, "learning_rate": 1.989557424671059e-05, "loss": 0.0324, "step": 6350 }, { "epoch": 0.005521915966792578, "grad_norm": 0.11098367720842361, "learning_rate": 1.9894711223956133e-05, "loss": 0.0848, "step": 6400 }, { "epoch": 0.0055650559352831445, "grad_norm": 6.6278211363624e-08, "learning_rate": 1.9893848201201676e-05, "loss": 0.0153, "step": 6450 }, { "epoch": 0.005608195903773712, "grad_norm": 8.399548079296437e-08, "learning_rate": 1.9892985178447216e-05, "loss": 0.0509, "step": 6500 }, { "epoch": 0.0056513358722642786, "grad_norm": 0.010032990016043186, "learning_rate": 1.989212215569276e-05, "loss": 0.0894, "step": 6550 }, { "epoch": 0.005694475840754846, "grad_norm": 3.270921524745063e-06, "learning_rate": 1.9891259132938302e-05, "loss": 0.0484, "step": 6600 }, { "epoch": 0.005737615809245413, "grad_norm": 4.165988445281982, "learning_rate": 1.9890396110183842e-05, "loss": 0.0521, "step": 6650 }, { "epoch": 0.00578075577773598, "grad_norm": 0.16357873380184174, "learning_rate": 1.9889533087429386e-05, "loss": 0.0437, "step": 6700 }, { "epoch": 0.005823895746226547, "grad_norm": 1.4861450381431496e-07, "learning_rate": 1.9888670064674926e-05, "loss": 0.0291, "step": 6750 }, { "epoch": 0.005867035714717114, "grad_norm": 0.000343196967151016, "learning_rate": 1.988780704192047e-05, "loss": 0.0642, "step": 6800 }, { "epoch": 0.005910175683207681, "grad_norm": 1.720488944556564e-05, "learning_rate": 1.988694401916601e-05, "loss": 0.0981, "step": 6850 }, { "epoch": 0.005953315651698248, "grad_norm": 0.05200350657105446, "learning_rate": 1.9886080996411552e-05, "loss": 0.0184, "step": 6900 }, { "epoch": 0.005996455620188815, "grad_norm": 23.398279190063477, "learning_rate": 1.9885217973657096e-05, "loss": 0.049, "step": 6950 }, { "epoch": 0.006039595588679382, "grad_norm": 5.3464435040950775e-05, "learning_rate": 1.988435495090264e-05, "loss": 0.0248, "step": 7000 }, { "epoch": 0.006082735557169949, "grad_norm": 0.01494416780769825, "learning_rate": 1.988349192814818e-05, "loss": 0.0294, "step": 7050 }, { "epoch": 0.006125875525660516, "grad_norm": 6.322508852463216e-05, "learning_rate": 1.9882628905393722e-05, "loss": 0.0075, "step": 7100 }, { "epoch": 0.006169015494151083, "grad_norm": 0.007586951367557049, "learning_rate": 1.9881765882639262e-05, "loss": 0.052, "step": 7150 }, { "epoch": 0.00621215546264165, "grad_norm": 2.4987362873263308e-11, "learning_rate": 1.9880902859884806e-05, "loss": 0.0137, "step": 7200 }, { "epoch": 0.006255295431132217, "grad_norm": 8.16138744354248, "learning_rate": 1.9880039837130345e-05, "loss": 0.109, "step": 7250 }, { "epoch": 0.006298435399622784, "grad_norm": 0.002273560268804431, "learning_rate": 1.987917681437589e-05, "loss": 0.0204, "step": 7300 }, { "epoch": 0.006341575368113351, "grad_norm": 0.00022486828675027937, "learning_rate": 1.9878313791621432e-05, "loss": 0.0764, "step": 7350 }, { "epoch": 0.006384715336603918, "grad_norm": 0.00014589431521017104, "learning_rate": 1.9877450768866972e-05, "loss": 0.0407, "step": 7400 }, { "epoch": 0.006427855305094485, "grad_norm": 0.0005719369510188699, "learning_rate": 1.9876587746112515e-05, "loss": 0.0648, "step": 7450 }, { "epoch": 0.006470995273585052, "grad_norm": 3.020178610313451e-06, "learning_rate": 1.987572472335806e-05, "loss": 0.0525, "step": 7500 }, { "epoch": 0.006514135242075619, "grad_norm": 4.380962934646959e-07, "learning_rate": 1.9874861700603602e-05, "loss": 0.0392, "step": 7550 }, { "epoch": 0.0065572752105661864, "grad_norm": 1.5524530681432225e-05, "learning_rate": 1.9873998677849142e-05, "loss": 0.047, "step": 7600 }, { "epoch": 0.006600415179056753, "grad_norm": 1.0878498869715258e-05, "learning_rate": 1.9873135655094685e-05, "loss": 0.0453, "step": 7650 }, { "epoch": 0.0066435551475473205, "grad_norm": 10.473357200622559, "learning_rate": 1.9872272632340225e-05, "loss": 0.0621, "step": 7700 }, { "epoch": 0.006686695116037887, "grad_norm": 0.05818796157836914, "learning_rate": 1.987140960958577e-05, "loss": 0.0403, "step": 7750 }, { "epoch": 0.0067298350845284545, "grad_norm": 3.924364833096661e-09, "learning_rate": 1.987054658683131e-05, "loss": 0.0317, "step": 7800 }, { "epoch": 0.006772975053019021, "grad_norm": 3.545212848621304e-06, "learning_rate": 1.9869683564076852e-05, "loss": 0.0633, "step": 7850 }, { "epoch": 0.006816115021509589, "grad_norm": 0.17004750669002533, "learning_rate": 1.9868820541322392e-05, "loss": 0.0147, "step": 7900 }, { "epoch": 0.006859254990000155, "grad_norm": 5.974680243525654e-05, "learning_rate": 1.9867957518567935e-05, "loss": 0.0579, "step": 7950 }, { "epoch": 0.006902394958490723, "grad_norm": 3.9863412126806e-08, "learning_rate": 1.986709449581348e-05, "loss": 0.0248, "step": 8000 }, { "epoch": 0.006945534926981289, "grad_norm": 0.8195998668670654, "learning_rate": 1.9866231473059022e-05, "loss": 0.1239, "step": 8050 }, { "epoch": 0.006988674895471857, "grad_norm": 0.0003940521564800292, "learning_rate": 1.9865368450304562e-05, "loss": 0.0727, "step": 8100 }, { "epoch": 0.007031814863962423, "grad_norm": 0.0001462361979065463, "learning_rate": 1.9864505427550105e-05, "loss": 0.0264, "step": 8150 }, { "epoch": 0.007074954832452991, "grad_norm": 4.075237214351546e-08, "learning_rate": 1.9863642404795645e-05, "loss": 0.0154, "step": 8200 }, { "epoch": 0.007118094800943557, "grad_norm": 5.172235432837624e-06, "learning_rate": 1.986277938204119e-05, "loss": 0.0256, "step": 8250 }, { "epoch": 0.007161234769434125, "grad_norm": 0.0007250295020639896, "learning_rate": 1.9861916359286732e-05, "loss": 0.0325, "step": 8300 }, { "epoch": 0.007204374737924691, "grad_norm": 8.068302154541016, "learning_rate": 1.9861053336532272e-05, "loss": 0.032, "step": 8350 }, { "epoch": 0.007247514706415259, "grad_norm": 11.65196704864502, "learning_rate": 1.9860190313777815e-05, "loss": 0.006, "step": 8400 }, { "epoch": 0.007290654674905825, "grad_norm": 1.6602513808194885e-09, "learning_rate": 1.9859327291023355e-05, "loss": 0.0565, "step": 8450 }, { "epoch": 0.007333794643396393, "grad_norm": 0.22325988113880157, "learning_rate": 1.98584642682689e-05, "loss": 0.0493, "step": 8500 }, { "epoch": 0.007376934611886959, "grad_norm": 0.0023358704056590796, "learning_rate": 1.985760124551444e-05, "loss": 0.0061, "step": 8550 }, { "epoch": 0.007420074580377527, "grad_norm": 1.97016873926259e-07, "learning_rate": 1.9856738222759985e-05, "loss": 0.0704, "step": 8600 }, { "epoch": 0.0074632145488680935, "grad_norm": 0.0003019660944119096, "learning_rate": 1.9855875200005525e-05, "loss": 0.0156, "step": 8650 }, { "epoch": 0.007506354517358661, "grad_norm": 0.014269077219069004, "learning_rate": 1.9855012177251068e-05, "loss": 0.0606, "step": 8700 }, { "epoch": 0.0075494944858492275, "grad_norm": 0.010774667374789715, "learning_rate": 1.9854149154496608e-05, "loss": 0.0129, "step": 8750 }, { "epoch": 0.007592634454339795, "grad_norm": 14.643841743469238, "learning_rate": 1.985328613174215e-05, "loss": 0.0368, "step": 8800 }, { "epoch": 0.0076357744228303616, "grad_norm": 0.004390218295156956, "learning_rate": 1.985242310898769e-05, "loss": 0.0193, "step": 8850 }, { "epoch": 0.007678914391320929, "grad_norm": 0.00026494322810322046, "learning_rate": 1.9851560086233235e-05, "loss": 0.0377, "step": 8900 }, { "epoch": 0.007722054359811496, "grad_norm": 0.3454723656177521, "learning_rate": 1.9850697063478778e-05, "loss": 0.0271, "step": 8950 }, { "epoch": 0.007765194328302062, "grad_norm": 1.240284319692364e-07, "learning_rate": 1.9849834040724318e-05, "loss": 0.0312, "step": 9000 }, { "epoch": 0.00780833429679263, "grad_norm": 0.0001445577945560217, "learning_rate": 1.984897101796986e-05, "loss": 0.0305, "step": 9050 }, { "epoch": 0.007851474265283197, "grad_norm": 4.175523482530252e-09, "learning_rate": 1.9848107995215405e-05, "loss": 0.0979, "step": 9100 }, { "epoch": 0.007894614233773764, "grad_norm": 0.035435471683740616, "learning_rate": 1.9847244972460945e-05, "loss": 0.0098, "step": 9150 }, { "epoch": 0.00793775420226433, "grad_norm": 26.931116104125977, "learning_rate": 1.9846381949706488e-05, "loss": 0.0262, "step": 9200 }, { "epoch": 0.007980894170754897, "grad_norm": 8.122495273710229e-06, "learning_rate": 1.984551892695203e-05, "loss": 0.0602, "step": 9250 }, { "epoch": 0.008024034139245465, "grad_norm": 2.0076650411593455e-11, "learning_rate": 1.984465590419757e-05, "loss": 0.0514, "step": 9300 }, { "epoch": 0.008067174107736032, "grad_norm": 2.9286837843756075e-08, "learning_rate": 1.9843792881443115e-05, "loss": 0.0657, "step": 9350 }, { "epoch": 0.008110314076226598, "grad_norm": 1.5581694841384888, "learning_rate": 1.9842929858688655e-05, "loss": 0.0335, "step": 9400 }, { "epoch": 0.008153454044717165, "grad_norm": 3.392365144350151e-08, "learning_rate": 1.9842066835934198e-05, "loss": 0.0701, "step": 9450 }, { "epoch": 0.008196594013207733, "grad_norm": 0.03891870751976967, "learning_rate": 1.9841203813179738e-05, "loss": 0.1115, "step": 9500 }, { "epoch": 0.0082397339816983, "grad_norm": 5.497531890869141, "learning_rate": 1.984034079042528e-05, "loss": 0.0065, "step": 9550 }, { "epoch": 0.008282873950188866, "grad_norm": 0.0006867619813419878, "learning_rate": 1.9839477767670825e-05, "loss": 0.0231, "step": 9600 }, { "epoch": 0.008326013918679433, "grad_norm": 0.000866669462993741, "learning_rate": 1.9838614744916368e-05, "loss": 0.0234, "step": 9650 }, { "epoch": 0.008369153887170001, "grad_norm": 0.061681024730205536, "learning_rate": 1.9837751722161908e-05, "loss": 0.0371, "step": 9700 }, { "epoch": 0.008412293855660568, "grad_norm": 7.284898515536042e-07, "learning_rate": 1.983688869940745e-05, "loss": 0.039, "step": 9750 }, { "epoch": 0.008455433824151135, "grad_norm": 5.737701980201848e-10, "learning_rate": 1.983602567665299e-05, "loss": 0.0145, "step": 9800 }, { "epoch": 0.008498573792641701, "grad_norm": 6.553115099450224e-07, "learning_rate": 1.9835162653898534e-05, "loss": 0.024, "step": 9850 }, { "epoch": 0.00854171376113227, "grad_norm": 7.23102075994575e-08, "learning_rate": 1.9834299631144074e-05, "loss": 0.0458, "step": 9900 }, { "epoch": 0.008584853729622836, "grad_norm": 5.95320443608216e-06, "learning_rate": 1.9833436608389618e-05, "loss": 0.0918, "step": 9950 }, { "epoch": 0.008627993698113403, "grad_norm": 7.1469521571998484e-06, "learning_rate": 1.983257358563516e-05, "loss": 0.0287, "step": 10000 }, { "epoch": 0.00867113366660397, "grad_norm": 0.00036231454578228295, "learning_rate": 1.98317105628807e-05, "loss": 0.0284, "step": 10050 }, { "epoch": 0.008714273635094538, "grad_norm": 8.159648132277653e-05, "learning_rate": 1.9830847540126244e-05, "loss": 0.0589, "step": 10100 }, { "epoch": 0.008757413603585104, "grad_norm": 0.0002320503263035789, "learning_rate": 1.9829984517371788e-05, "loss": 0.0296, "step": 10150 }, { "epoch": 0.00880055357207567, "grad_norm": 0.001181815518066287, "learning_rate": 1.982912149461733e-05, "loss": 0.0244, "step": 10200 }, { "epoch": 0.008843693540566237, "grad_norm": 2.497093198883249e-09, "learning_rate": 1.982825847186287e-05, "loss": 0.0337, "step": 10250 }, { "epoch": 0.008886833509056806, "grad_norm": 0.00030890764901414514, "learning_rate": 1.9827395449108414e-05, "loss": 0.033, "step": 10300 }, { "epoch": 0.008929973477547372, "grad_norm": 24.577367782592773, "learning_rate": 1.9826532426353954e-05, "loss": 0.0135, "step": 10350 }, { "epoch": 0.008973113446037939, "grad_norm": 1.9483505487442017, "learning_rate": 1.9825669403599498e-05, "loss": 0.0299, "step": 10400 }, { "epoch": 0.009016253414528505, "grad_norm": 0.0004972516908310354, "learning_rate": 1.9824806380845038e-05, "loss": 0.0107, "step": 10450 }, { "epoch": 0.009059393383019074, "grad_norm": 1.4932817649082608e-08, "learning_rate": 1.982394335809058e-05, "loss": 0.0567, "step": 10500 }, { "epoch": 0.00910253335150964, "grad_norm": 0.004500082693994045, "learning_rate": 1.982308033533612e-05, "loss": 0.0163, "step": 10550 }, { "epoch": 0.009145673320000207, "grad_norm": 6.5830713538161945e-06, "learning_rate": 1.9822217312581664e-05, "loss": 0.0359, "step": 10600 }, { "epoch": 0.009188813288490773, "grad_norm": 0.09253023564815521, "learning_rate": 1.9821354289827207e-05, "loss": 0.0179, "step": 10650 }, { "epoch": 0.009231953256981342, "grad_norm": 0.004253961145877838, "learning_rate": 1.982049126707275e-05, "loss": 0.0433, "step": 10700 }, { "epoch": 0.009275093225471908, "grad_norm": 0.0014189484063535929, "learning_rate": 1.981962824431829e-05, "loss": 0.0467, "step": 10750 }, { "epoch": 0.009318233193962475, "grad_norm": 0.0005026152357459068, "learning_rate": 1.9818765221563834e-05, "loss": 0.0137, "step": 10800 }, { "epoch": 0.009361373162453042, "grad_norm": 0.003253827104344964, "learning_rate": 1.9817902198809374e-05, "loss": 0.0135, "step": 10850 }, { "epoch": 0.00940451313094361, "grad_norm": 0.5753559470176697, "learning_rate": 1.9817039176054917e-05, "loss": 0.0707, "step": 10900 }, { "epoch": 0.009447653099434176, "grad_norm": 2.7666785626934143e-06, "learning_rate": 1.981617615330046e-05, "loss": 0.0042, "step": 10950 }, { "epoch": 0.009490793067924743, "grad_norm": 0.0010713053634390235, "learning_rate": 1.9815313130546e-05, "loss": 0.0248, "step": 11000 }, { "epoch": 0.00953393303641531, "grad_norm": 0.00012337288353592157, "learning_rate": 1.9814450107791544e-05, "loss": 0.0101, "step": 11050 }, { "epoch": 0.009577073004905878, "grad_norm": 2.0991153704130738e-08, "learning_rate": 1.9813587085037084e-05, "loss": 0.0319, "step": 11100 }, { "epoch": 0.009620212973396445, "grad_norm": 0.0001735202531563118, "learning_rate": 1.9812724062282627e-05, "loss": 0.0065, "step": 11150 }, { "epoch": 0.009663352941887011, "grad_norm": 0.0007401935290545225, "learning_rate": 1.981186103952817e-05, "loss": 0.0184, "step": 11200 }, { "epoch": 0.009706492910377578, "grad_norm": 5.382436825129844e-07, "learning_rate": 1.9810998016773714e-05, "loss": 0.0766, "step": 11250 }, { "epoch": 0.009749632878868146, "grad_norm": 5.5672944654361345e-06, "learning_rate": 1.9810134994019254e-05, "loss": 0.0082, "step": 11300 }, { "epoch": 0.009792772847358713, "grad_norm": 2.2267850852131232e-07, "learning_rate": 1.9809271971264797e-05, "loss": 0.019, "step": 11350 }, { "epoch": 0.00983591281584928, "grad_norm": 0.23477919399738312, "learning_rate": 1.9808408948510337e-05, "loss": 0.0295, "step": 11400 }, { "epoch": 0.009879052784339846, "grad_norm": 9.228908304237393e-09, "learning_rate": 1.980754592575588e-05, "loss": 0.0575, "step": 11450 }, { "epoch": 0.009922192752830414, "grad_norm": 0.00020697819127235562, "learning_rate": 1.980668290300142e-05, "loss": 0.0269, "step": 11500 }, { "epoch": 0.00996533272132098, "grad_norm": 0.19181561470031738, "learning_rate": 1.9805819880246964e-05, "loss": 0.0093, "step": 11550 }, { "epoch": 0.010008472689811547, "grad_norm": 3.362165080034174e-05, "learning_rate": 1.9804956857492507e-05, "loss": 0.0373, "step": 11600 }, { "epoch": 0.010051612658302114, "grad_norm": 0.3552068769931793, "learning_rate": 1.9804093834738047e-05, "loss": 0.0599, "step": 11650 }, { "epoch": 0.010094752626792682, "grad_norm": 1.6512422007508576e-05, "learning_rate": 1.980323081198359e-05, "loss": 0.0255, "step": 11700 }, { "epoch": 0.010137892595283249, "grad_norm": 6.555333614349365, "learning_rate": 1.9802367789229134e-05, "loss": 0.0162, "step": 11750 }, { "epoch": 0.010181032563773815, "grad_norm": 2.48828387260437, "learning_rate": 1.9801504766474674e-05, "loss": 0.0377, "step": 11800 }, { "epoch": 0.010224172532264382, "grad_norm": 0.005198315717279911, "learning_rate": 1.9800641743720217e-05, "loss": 0.0071, "step": 11850 }, { "epoch": 0.010267312500754949, "grad_norm": 0.0014223635662347078, "learning_rate": 1.979977872096576e-05, "loss": 0.0286, "step": 11900 }, { "epoch": 0.010310452469245517, "grad_norm": 1.555037556499883e-06, "learning_rate": 1.97989156982113e-05, "loss": 0.0429, "step": 11950 }, { "epoch": 0.010353592437736083, "grad_norm": 7.9471330642700195, "learning_rate": 1.9798052675456844e-05, "loss": 0.0401, "step": 12000 }, { "epoch": 0.01039673240622665, "grad_norm": 0.0001056401088135317, "learning_rate": 1.9797189652702384e-05, "loss": 0.008, "step": 12050 }, { "epoch": 0.010439872374717217, "grad_norm": 3.85151979571674e-05, "learning_rate": 1.9796326629947927e-05, "loss": 0.0309, "step": 12100 }, { "epoch": 0.010483012343207785, "grad_norm": 16.898605346679688, "learning_rate": 1.9795463607193467e-05, "loss": 0.0234, "step": 12150 }, { "epoch": 0.010526152311698352, "grad_norm": 3.0313758170308347e-09, "learning_rate": 1.979460058443901e-05, "loss": 0.0237, "step": 12200 }, { "epoch": 0.010569292280188918, "grad_norm": 0.0001202192361233756, "learning_rate": 1.9793737561684553e-05, "loss": 0.026, "step": 12250 }, { "epoch": 0.010612432248679485, "grad_norm": 1.240387376144625e-10, "learning_rate": 1.9792874538930097e-05, "loss": 0.0527, "step": 12300 }, { "epoch": 0.010655572217170053, "grad_norm": 11.890090942382812, "learning_rate": 1.9792011516175637e-05, "loss": 0.0577, "step": 12350 }, { "epoch": 0.01069871218566062, "grad_norm": 2.300609958183486e-05, "learning_rate": 1.979114849342118e-05, "loss": 0.0413, "step": 12400 }, { "epoch": 0.010741852154151186, "grad_norm": 3.607681719586253e-05, "learning_rate": 1.979028547066672e-05, "loss": 0.056, "step": 12450 }, { "epoch": 0.010784992122641753, "grad_norm": 0.007184322457760572, "learning_rate": 1.9789422447912263e-05, "loss": 0.0356, "step": 12500 }, { "epoch": 0.010828132091132321, "grad_norm": 0.03649460896849632, "learning_rate": 1.9788559425157807e-05, "loss": 0.0653, "step": 12550 }, { "epoch": 0.010871272059622888, "grad_norm": 2.2537233235198073e-05, "learning_rate": 1.9787696402403347e-05, "loss": 0.0298, "step": 12600 }, { "epoch": 0.010914412028113454, "grad_norm": 0.012440712191164494, "learning_rate": 1.978683337964889e-05, "loss": 0.0027, "step": 12650 }, { "epoch": 0.010957551996604021, "grad_norm": 0.0001454145967727527, "learning_rate": 1.978597035689443e-05, "loss": 0.0428, "step": 12700 }, { "epoch": 0.01100069196509459, "grad_norm": 4.73512305754209e-11, "learning_rate": 1.9785107334139973e-05, "loss": 0.0266, "step": 12750 }, { "epoch": 0.011043831933585156, "grad_norm": 0.4899098873138428, "learning_rate": 1.9784244311385517e-05, "loss": 0.0423, "step": 12800 }, { "epoch": 0.011086971902075722, "grad_norm": 3.1542436772724614e-05, "learning_rate": 1.978338128863106e-05, "loss": 0.0201, "step": 12850 }, { "epoch": 0.011130111870566289, "grad_norm": 2.1234811242720752e-08, "learning_rate": 1.97825182658766e-05, "loss": 0.0602, "step": 12900 }, { "epoch": 0.011173251839056857, "grad_norm": 2.4936113174334196e-09, "learning_rate": 1.9781655243122143e-05, "loss": 0.0289, "step": 12950 }, { "epoch": 0.011216391807547424, "grad_norm": 7.835155884095002e-07, "learning_rate": 1.9780792220367683e-05, "loss": 0.0126, "step": 13000 }, { "epoch": 0.01125953177603799, "grad_norm": 3.1845395369600737e-06, "learning_rate": 1.9779929197613227e-05, "loss": 0.0133, "step": 13050 }, { "epoch": 0.011302671744528557, "grad_norm": 6.416823072896705e-09, "learning_rate": 1.9779066174858766e-05, "loss": 0.0413, "step": 13100 }, { "epoch": 0.011345811713019125, "grad_norm": 6.80740213394165, "learning_rate": 1.977820315210431e-05, "loss": 0.0443, "step": 13150 }, { "epoch": 0.011388951681509692, "grad_norm": 0.012771312147378922, "learning_rate": 1.977734012934985e-05, "loss": 0.0383, "step": 13200 }, { "epoch": 0.011432091650000259, "grad_norm": 0.0008403750252909958, "learning_rate": 1.9776477106595393e-05, "loss": 0.0434, "step": 13250 }, { "epoch": 0.011475231618490825, "grad_norm": 1.2084444761276245, "learning_rate": 1.9775614083840936e-05, "loss": 0.0066, "step": 13300 }, { "epoch": 0.011518371586981394, "grad_norm": 9.330961781017777e-09, "learning_rate": 1.977475106108648e-05, "loss": 0.0325, "step": 13350 }, { "epoch": 0.01156151155547196, "grad_norm": 0.00011164277384523302, "learning_rate": 1.977388803833202e-05, "loss": 0.0956, "step": 13400 }, { "epoch": 0.011604651523962527, "grad_norm": 2.7169560326001374e-06, "learning_rate": 1.9773025015577563e-05, "loss": 0.0207, "step": 13450 }, { "epoch": 0.011647791492453093, "grad_norm": 0.0006356360972858965, "learning_rate": 1.9772161992823103e-05, "loss": 0.0045, "step": 13500 }, { "epoch": 0.011690931460943662, "grad_norm": 6.926347850821912e-05, "learning_rate": 1.9771298970068646e-05, "loss": 0.0176, "step": 13550 }, { "epoch": 0.011734071429434228, "grad_norm": 0.00017402067896910012, "learning_rate": 1.977043594731419e-05, "loss": 0.0284, "step": 13600 }, { "epoch": 0.011777211397924795, "grad_norm": 1.069779334561538e-10, "learning_rate": 1.976957292455973e-05, "loss": 0.0292, "step": 13650 }, { "epoch": 0.011820351366415361, "grad_norm": 0.17523643374443054, "learning_rate": 1.9768709901805273e-05, "loss": 0.0273, "step": 13700 }, { "epoch": 0.01186349133490593, "grad_norm": 9.821783065795898, "learning_rate": 1.9767846879050813e-05, "loss": 0.0706, "step": 13750 }, { "epoch": 0.011906631303396496, "grad_norm": 1.8948287561215693e-07, "learning_rate": 1.9766983856296356e-05, "loss": 0.0165, "step": 13800 }, { "epoch": 0.011949771271887063, "grad_norm": 5.998489086778136e-06, "learning_rate": 1.97661208335419e-05, "loss": 0.0372, "step": 13850 }, { "epoch": 0.01199291124037763, "grad_norm": 1.5009301900863647, "learning_rate": 1.9765257810787443e-05, "loss": 0.0387, "step": 13900 }, { "epoch": 0.012036051208868198, "grad_norm": 5.223755650263229e-09, "learning_rate": 1.9764394788032983e-05, "loss": 0.0828, "step": 13950 }, { "epoch": 0.012079191177358764, "grad_norm": 0.1192856878042221, "learning_rate": 1.9763531765278526e-05, "loss": 0.0286, "step": 14000 }, { "epoch": 0.012122331145849331, "grad_norm": 5.815771601191955e-06, "learning_rate": 1.9762668742524066e-05, "loss": 0.0541, "step": 14050 }, { "epoch": 0.012165471114339898, "grad_norm": 11.029925346374512, "learning_rate": 1.976180571976961e-05, "loss": 0.039, "step": 14100 }, { "epoch": 0.012208611082830466, "grad_norm": 0.00015492299280595034, "learning_rate": 1.976094269701515e-05, "loss": 0.0354, "step": 14150 }, { "epoch": 0.012251751051321032, "grad_norm": 4.5061292439640965e-06, "learning_rate": 1.9760079674260693e-05, "loss": 0.0364, "step": 14200 }, { "epoch": 0.012294891019811599, "grad_norm": 0.45702916383743286, "learning_rate": 1.9759216651506236e-05, "loss": 0.0313, "step": 14250 }, { "epoch": 0.012338030988302166, "grad_norm": 1.0066764311034149e-08, "learning_rate": 1.9758353628751776e-05, "loss": 0.0344, "step": 14300 }, { "epoch": 0.012381170956792734, "grad_norm": 7.227523610708886e-07, "learning_rate": 1.975749060599732e-05, "loss": 0.0351, "step": 14350 }, { "epoch": 0.0124243109252833, "grad_norm": 5.080125653478262e-09, "learning_rate": 1.9756627583242863e-05, "loss": 0.0136, "step": 14400 }, { "epoch": 0.012467450893773867, "grad_norm": 0.016180645674467087, "learning_rate": 1.9755764560488403e-05, "loss": 0.0407, "step": 14450 }, { "epoch": 0.012510590862264434, "grad_norm": 0.061310265213251114, "learning_rate": 1.9754901537733946e-05, "loss": 0.01, "step": 14500 }, { "epoch": 0.012553730830755002, "grad_norm": 7.248584552144166e-06, "learning_rate": 1.975403851497949e-05, "loss": 0.0178, "step": 14550 }, { "epoch": 0.012596870799245569, "grad_norm": 7.203379154205322, "learning_rate": 1.975317549222503e-05, "loss": 0.062, "step": 14600 }, { "epoch": 0.012640010767736135, "grad_norm": 1.126842835219577e-05, "learning_rate": 1.9752312469470573e-05, "loss": 0.0173, "step": 14650 }, { "epoch": 0.012683150736226702, "grad_norm": 0.0011432298924773932, "learning_rate": 1.9751449446716112e-05, "loss": 0.0923, "step": 14700 }, { "epoch": 0.01272629070471727, "grad_norm": 1.9043671954932506e-06, "learning_rate": 1.9750586423961656e-05, "loss": 0.047, "step": 14750 }, { "epoch": 0.012769430673207837, "grad_norm": 0.20942749083042145, "learning_rate": 1.9749723401207196e-05, "loss": 0.0156, "step": 14800 }, { "epoch": 0.012812570641698403, "grad_norm": 1.760947014872727e-07, "learning_rate": 1.974886037845274e-05, "loss": 0.0481, "step": 14850 }, { "epoch": 0.01285571061018897, "grad_norm": 2.280950639033108e-06, "learning_rate": 1.9747997355698282e-05, "loss": 0.0073, "step": 14900 }, { "epoch": 0.012898850578679536, "grad_norm": 0.019771773368120193, "learning_rate": 1.9747134332943826e-05, "loss": 0.0109, "step": 14950 }, { "epoch": 0.012941990547170105, "grad_norm": 0.7483711838722229, "learning_rate": 1.9746271310189366e-05, "loss": 0.0025, "step": 15000 }, { "epoch": 0.012985130515660671, "grad_norm": 0.0011623813770711422, "learning_rate": 1.974540828743491e-05, "loss": 0.0097, "step": 15050 }, { "epoch": 0.013028270484151238, "grad_norm": 0.00023206142941489816, "learning_rate": 1.974454526468045e-05, "loss": 0.0211, "step": 15100 }, { "epoch": 0.013071410452641805, "grad_norm": 8.044224841796677e-07, "learning_rate": 1.9743682241925992e-05, "loss": 0.0678, "step": 15150 }, { "epoch": 0.013114550421132373, "grad_norm": 2.867023241037714e-08, "learning_rate": 1.9742819219171536e-05, "loss": 0.0332, "step": 15200 }, { "epoch": 0.01315769038962294, "grad_norm": 3.529981640326696e-08, "learning_rate": 1.9741956196417076e-05, "loss": 0.0811, "step": 15250 }, { "epoch": 0.013200830358113506, "grad_norm": 3.617996844695881e-05, "learning_rate": 1.974109317366262e-05, "loss": 0.0281, "step": 15300 }, { "epoch": 0.013243970326604073, "grad_norm": 0.0002957701508421451, "learning_rate": 1.974023015090816e-05, "loss": 0.0005, "step": 15350 }, { "epoch": 0.013287110295094641, "grad_norm": 0.1449277251958847, "learning_rate": 1.9739367128153702e-05, "loss": 0.026, "step": 15400 }, { "epoch": 0.013330250263585208, "grad_norm": 3.980770713063464e-10, "learning_rate": 1.9738504105399246e-05, "loss": 0.0121, "step": 15450 }, { "epoch": 0.013373390232075774, "grad_norm": 6.5806302629312086e-09, "learning_rate": 1.973764108264479e-05, "loss": 0.0131, "step": 15500 }, { "epoch": 0.01341653020056634, "grad_norm": 10.989927291870117, "learning_rate": 1.973677805989033e-05, "loss": 0.0375, "step": 15550 }, { "epoch": 0.013459670169056909, "grad_norm": 0.028256021440029144, "learning_rate": 1.9735915037135872e-05, "loss": 0.005, "step": 15600 }, { "epoch": 0.013502810137547476, "grad_norm": 3.129288234049454e-05, "learning_rate": 1.9735052014381412e-05, "loss": 0.0145, "step": 15650 }, { "epoch": 0.013545950106038042, "grad_norm": 1.9001586970546214e-09, "learning_rate": 1.9734188991626955e-05, "loss": 0.0662, "step": 15700 }, { "epoch": 0.013589090074528609, "grad_norm": 2.575715734565165e-05, "learning_rate": 1.9733325968872495e-05, "loss": 0.0169, "step": 15750 }, { "epoch": 0.013632230043019177, "grad_norm": 0.00017570947238709778, "learning_rate": 1.973246294611804e-05, "loss": 0.094, "step": 15800 }, { "epoch": 0.013675370011509744, "grad_norm": 2.5118701563187074e-10, "learning_rate": 1.973159992336358e-05, "loss": 0.0255, "step": 15850 }, { "epoch": 0.01371850998000031, "grad_norm": 4.180213952764689e-09, "learning_rate": 1.9730736900609122e-05, "loss": 0.0447, "step": 15900 }, { "epoch": 0.013761649948490877, "grad_norm": 9.289252744792975e-08, "learning_rate": 1.9729873877854665e-05, "loss": 0.0102, "step": 15950 }, { "epoch": 0.013804789916981445, "grad_norm": 9.842972659157567e-09, "learning_rate": 1.972901085510021e-05, "loss": 0.0263, "step": 16000 }, { "epoch": 0.013847929885472012, "grad_norm": 5.562032222747803, "learning_rate": 1.972814783234575e-05, "loss": 0.0495, "step": 16050 }, { "epoch": 0.013891069853962578, "grad_norm": 0.07973814755678177, "learning_rate": 1.9727284809591292e-05, "loss": 0.0315, "step": 16100 }, { "epoch": 0.013934209822453145, "grad_norm": 4.2199195604553097e-07, "learning_rate": 1.9726421786836832e-05, "loss": 0.0532, "step": 16150 }, { "epoch": 0.013977349790943713, "grad_norm": 0.00012909203360322863, "learning_rate": 1.9725558764082375e-05, "loss": 0.0331, "step": 16200 }, { "epoch": 0.01402048975943428, "grad_norm": 0.023724447935819626, "learning_rate": 1.972469574132792e-05, "loss": 0.0243, "step": 16250 }, { "epoch": 0.014063629727924847, "grad_norm": 7.801064384693746e-06, "learning_rate": 1.972383271857346e-05, "loss": 0.0729, "step": 16300 }, { "epoch": 0.014106769696415413, "grad_norm": 0.00012842965952586383, "learning_rate": 1.9722969695819002e-05, "loss": 0.0076, "step": 16350 }, { "epoch": 0.014149909664905981, "grad_norm": 3.283237148821172e-08, "learning_rate": 1.9722106673064542e-05, "loss": 0.0496, "step": 16400 }, { "epoch": 0.014193049633396548, "grad_norm": 5.932063174007851e-10, "learning_rate": 1.9721243650310085e-05, "loss": 0.0793, "step": 16450 }, { "epoch": 0.014236189601887115, "grad_norm": 0.07802320271730423, "learning_rate": 1.972038062755563e-05, "loss": 0.0157, "step": 16500 }, { "epoch": 0.014279329570377681, "grad_norm": 1.4036957907137548e-07, "learning_rate": 1.9719517604801172e-05, "loss": 0.0221, "step": 16550 }, { "epoch": 0.01432246953886825, "grad_norm": 3.236153389707397e-08, "learning_rate": 1.9718654582046712e-05, "loss": 0.0001, "step": 16600 }, { "epoch": 0.014365609507358816, "grad_norm": 3.180664539337158, "learning_rate": 1.9717791559292255e-05, "loss": 0.0684, "step": 16650 }, { "epoch": 0.014408749475849383, "grad_norm": 6.371417839545757e-06, "learning_rate": 1.9716928536537795e-05, "loss": 0.006, "step": 16700 }, { "epoch": 0.01445188944433995, "grad_norm": 6.981757906032726e-06, "learning_rate": 1.971606551378334e-05, "loss": 0.0743, "step": 16750 }, { "epoch": 0.014495029412830518, "grad_norm": 0.9886574745178223, "learning_rate": 1.9715202491028878e-05, "loss": 0.0285, "step": 16800 }, { "epoch": 0.014538169381321084, "grad_norm": 36.159725189208984, "learning_rate": 1.971433946827442e-05, "loss": 0.0159, "step": 16850 }, { "epoch": 0.01458130934981165, "grad_norm": 4.837416648864746, "learning_rate": 1.9713476445519965e-05, "loss": 0.0125, "step": 16900 }, { "epoch": 0.014624449318302217, "grad_norm": 4.346982677816413e-06, "learning_rate": 1.9712613422765505e-05, "loss": 0.0152, "step": 16950 }, { "epoch": 0.014667589286792786, "grad_norm": 8.429530962139609e-10, "learning_rate": 1.9711750400011048e-05, "loss": 0.0035, "step": 17000 }, { "epoch": 0.014710729255283352, "grad_norm": 1.2465453437471297e-05, "learning_rate": 1.971088737725659e-05, "loss": 0.0067, "step": 17050 }, { "epoch": 0.014753869223773919, "grad_norm": 6.187327699080925e-07, "learning_rate": 1.971002435450213e-05, "loss": 0.0159, "step": 17100 }, { "epoch": 0.014797009192264485, "grad_norm": 0.316834419965744, "learning_rate": 1.9709161331747675e-05, "loss": 0.0063, "step": 17150 }, { "epoch": 0.014840149160755054, "grad_norm": 0.00014671437384095043, "learning_rate": 1.9708298308993218e-05, "loss": 0.0784, "step": 17200 }, { "epoch": 0.01488328912924562, "grad_norm": 1.0954934737128497e-08, "learning_rate": 1.9707435286238758e-05, "loss": 0.0238, "step": 17250 }, { "epoch": 0.014926429097736187, "grad_norm": 0.5361968278884888, "learning_rate": 1.97065722634843e-05, "loss": 0.0195, "step": 17300 }, { "epoch": 0.014969569066226754, "grad_norm": 3.5330817699432373, "learning_rate": 1.970570924072984e-05, "loss": 0.0445, "step": 17350 }, { "epoch": 0.015012709034717322, "grad_norm": 0.0001147388611570932, "learning_rate": 1.9704846217975385e-05, "loss": 0.0534, "step": 17400 }, { "epoch": 0.015055849003207888, "grad_norm": 1.4025573237541611e-11, "learning_rate": 1.9703983195220925e-05, "loss": 0.0443, "step": 17450 }, { "epoch": 0.015098988971698455, "grad_norm": 0.0013008522801101208, "learning_rate": 1.9703120172466468e-05, "loss": 0.0301, "step": 17500 }, { "epoch": 0.015142128940189022, "grad_norm": 4.471134662628174, "learning_rate": 1.970225714971201e-05, "loss": 0.0301, "step": 17550 }, { "epoch": 0.01518526890867959, "grad_norm": 6.183355708344607e-06, "learning_rate": 1.9701394126957555e-05, "loss": 0.0369, "step": 17600 }, { "epoch": 0.015228408877170157, "grad_norm": 7.665110751986504e-05, "learning_rate": 1.9700531104203095e-05, "loss": 0.0056, "step": 17650 }, { "epoch": 0.015271548845660723, "grad_norm": 2.106353521347046, "learning_rate": 1.9699668081448638e-05, "loss": 0.0272, "step": 17700 }, { "epoch": 0.01531468881415129, "grad_norm": 3.855154488974222e-07, "learning_rate": 1.9698805058694178e-05, "loss": 0.0352, "step": 17750 }, { "epoch": 0.015357828782641858, "grad_norm": 17.341279983520508, "learning_rate": 1.969794203593972e-05, "loss": 0.0465, "step": 17800 }, { "epoch": 0.015400968751132425, "grad_norm": 9.402146679349244e-05, "learning_rate": 1.9697079013185265e-05, "loss": 0.0183, "step": 17850 }, { "epoch": 0.015444108719622991, "grad_norm": 0.00015307770809158683, "learning_rate": 1.9696215990430805e-05, "loss": 0.0291, "step": 17900 }, { "epoch": 0.015487248688113558, "grad_norm": 3.735563609552628e-07, "learning_rate": 1.9695352967676348e-05, "loss": 0.0019, "step": 17950 }, { "epoch": 0.015530388656604124, "grad_norm": 5.729863187298179e-05, "learning_rate": 1.9694489944921888e-05, "loss": 0.0045, "step": 18000 }, { "epoch": 0.015573528625094693, "grad_norm": 2.0717274562542798e-09, "learning_rate": 1.969362692216743e-05, "loss": 0.018, "step": 18050 }, { "epoch": 0.01561666859358526, "grad_norm": 12.531591415405273, "learning_rate": 1.9692763899412974e-05, "loss": 0.0242, "step": 18100 }, { "epoch": 0.015659808562075828, "grad_norm": 3.573931508071837e-06, "learning_rate": 1.9691900876658518e-05, "loss": 0.0556, "step": 18150 }, { "epoch": 0.015702948530566394, "grad_norm": 1.851037545463896e-08, "learning_rate": 1.9691037853904058e-05, "loss": 0.0141, "step": 18200 }, { "epoch": 0.01574608849905696, "grad_norm": 3.601686694310047e-05, "learning_rate": 1.96901748311496e-05, "loss": 0.0541, "step": 18250 }, { "epoch": 0.015789228467547527, "grad_norm": 0.05700366199016571, "learning_rate": 1.968931180839514e-05, "loss": 0.0286, "step": 18300 }, { "epoch": 0.015832368436038094, "grad_norm": 3.566603901106191e-09, "learning_rate": 1.9688448785640684e-05, "loss": 0.0091, "step": 18350 }, { "epoch": 0.01587550840452866, "grad_norm": 0.00013142921670805663, "learning_rate": 1.9687585762886224e-05, "loss": 0.0254, "step": 18400 }, { "epoch": 0.015918648373019227, "grad_norm": 20.01519775390625, "learning_rate": 1.9686722740131768e-05, "loss": 0.0771, "step": 18450 }, { "epoch": 0.015961788341509794, "grad_norm": 0.1688498556613922, "learning_rate": 1.9685859717377308e-05, "loss": 0.0183, "step": 18500 }, { "epoch": 0.016004928310000364, "grad_norm": 0.030780350789427757, "learning_rate": 1.968499669462285e-05, "loss": 0.0251, "step": 18550 }, { "epoch": 0.01604806827849093, "grad_norm": 0.002585780341178179, "learning_rate": 1.9684133671868394e-05, "loss": 0.0718, "step": 18600 }, { "epoch": 0.016091208246981497, "grad_norm": 4.36324262409471e-05, "learning_rate": 1.9683270649113938e-05, "loss": 0.0203, "step": 18650 }, { "epoch": 0.016134348215472064, "grad_norm": 1.3234290463515208e-07, "learning_rate": 1.9682407626359478e-05, "loss": 0.0136, "step": 18700 }, { "epoch": 0.01617748818396263, "grad_norm": 1.555231143868241e-08, "learning_rate": 1.968154460360502e-05, "loss": 0.037, "step": 18750 }, { "epoch": 0.016220628152453197, "grad_norm": 2.4578237116656965e-06, "learning_rate": 1.968068158085056e-05, "loss": 0.0045, "step": 18800 }, { "epoch": 0.016263768120943763, "grad_norm": 0.009525042027235031, "learning_rate": 1.9679818558096104e-05, "loss": 0.0084, "step": 18850 }, { "epoch": 0.01630690808943433, "grad_norm": 22.186767578125, "learning_rate": 1.9678955535341647e-05, "loss": 0.0316, "step": 18900 }, { "epoch": 0.0163500480579249, "grad_norm": 6.056162419554312e-06, "learning_rate": 1.9678092512587187e-05, "loss": 0.0085, "step": 18950 }, { "epoch": 0.016393188026415467, "grad_norm": 1.4418605198684986e-09, "learning_rate": 1.967722948983273e-05, "loss": 0.0181, "step": 19000 }, { "epoch": 0.016436327994906033, "grad_norm": 5.71908742585947e-09, "learning_rate": 1.967636646707827e-05, "loss": 0.0073, "step": 19050 }, { "epoch": 0.0164794679633966, "grad_norm": 6.593646517671914e-09, "learning_rate": 1.9675503444323817e-05, "loss": 0.0685, "step": 19100 }, { "epoch": 0.016522607931887166, "grad_norm": 2.7447922229766846, "learning_rate": 1.9674640421569357e-05, "loss": 0.0368, "step": 19150 }, { "epoch": 0.016565747900377733, "grad_norm": 3.157795136488062e-09, "learning_rate": 1.96737773988149e-05, "loss": 0.0653, "step": 19200 }, { "epoch": 0.0166088878688683, "grad_norm": 6.913658580742776e-06, "learning_rate": 1.967291437606044e-05, "loss": 0.053, "step": 19250 }, { "epoch": 0.016652027837358866, "grad_norm": 3.1019378639030037e-06, "learning_rate": 1.9672051353305984e-05, "loss": 0.0392, "step": 19300 }, { "epoch": 0.016695167805849436, "grad_norm": 0.00028862591716460884, "learning_rate": 1.9671188330551524e-05, "loss": 0.0031, "step": 19350 }, { "epoch": 0.016738307774340003, "grad_norm": 2.975168058583222e-07, "learning_rate": 1.9670325307797067e-05, "loss": 0.0142, "step": 19400 }, { "epoch": 0.01678144774283057, "grad_norm": 6.055047379049938e-07, "learning_rate": 1.9669462285042607e-05, "loss": 0.0294, "step": 19450 }, { "epoch": 0.016824587711321136, "grad_norm": 0.0006536454311572015, "learning_rate": 1.966859926228815e-05, "loss": 0.0411, "step": 19500 }, { "epoch": 0.016867727679811702, "grad_norm": 0.0043412791565060616, "learning_rate": 1.9667736239533694e-05, "loss": 0.0477, "step": 19550 }, { "epoch": 0.01691086764830227, "grad_norm": 0.08467547595500946, "learning_rate": 1.9666873216779234e-05, "loss": 0.0041, "step": 19600 }, { "epoch": 0.016954007616792836, "grad_norm": 5.161958824828616e-07, "learning_rate": 1.9666010194024777e-05, "loss": 0.0828, "step": 19650 }, { "epoch": 0.016997147585283402, "grad_norm": 0.03497151657938957, "learning_rate": 1.966514717127032e-05, "loss": 0.0095, "step": 19700 }, { "epoch": 0.017040287553773972, "grad_norm": 0.0004174104833509773, "learning_rate": 1.966428414851586e-05, "loss": 0.0206, "step": 19750 }, { "epoch": 0.01708342752226454, "grad_norm": 0.00030457283719442785, "learning_rate": 1.9663421125761404e-05, "loss": 0.0092, "step": 19800 }, { "epoch": 0.017126567490755105, "grad_norm": 0.0026671765372157097, "learning_rate": 1.9662558103006947e-05, "loss": 0.0083, "step": 19850 }, { "epoch": 0.017169707459245672, "grad_norm": 20.56145668029785, "learning_rate": 1.9661695080252487e-05, "loss": 0.0259, "step": 19900 }, { "epoch": 0.01721284742773624, "grad_norm": 2.7404727006796747e-05, "learning_rate": 1.966083205749803e-05, "loss": 0.0165, "step": 19950 }, { "epoch": 0.017255987396226805, "grad_norm": 4.9371454480251487e-08, "learning_rate": 1.965996903474357e-05, "loss": 0.0121, "step": 20000 }, { "epoch": 0.017299127364717372, "grad_norm": 0.011251527816057205, "learning_rate": 1.9659106011989114e-05, "loss": 0.0033, "step": 20050 }, { "epoch": 0.01734226733320794, "grad_norm": 3.3487244088803436e-09, "learning_rate": 1.9658242989234654e-05, "loss": 0.0319, "step": 20100 }, { "epoch": 0.01738540730169851, "grad_norm": 0.00034460489405319095, "learning_rate": 1.9657379966480197e-05, "loss": 0.0215, "step": 20150 }, { "epoch": 0.017428547270189075, "grad_norm": 8.861123319547914e-07, "learning_rate": 1.965651694372574e-05, "loss": 0.0507, "step": 20200 }, { "epoch": 0.01747168723867964, "grad_norm": 0.0008550824131816626, "learning_rate": 1.9655653920971284e-05, "loss": 0.0499, "step": 20250 }, { "epoch": 0.017514827207170208, "grad_norm": 1.3901036766128527e-07, "learning_rate": 1.9654790898216824e-05, "loss": 0.0462, "step": 20300 }, { "epoch": 0.017557967175660775, "grad_norm": 0.06260337680578232, "learning_rate": 1.9653927875462367e-05, "loss": 0.0499, "step": 20350 }, { "epoch": 0.01760110714415134, "grad_norm": 1.0717659648662448e-07, "learning_rate": 1.9653064852707907e-05, "loss": 0.0155, "step": 20400 }, { "epoch": 0.017644247112641908, "grad_norm": 7.46982475874347e-09, "learning_rate": 1.965220182995345e-05, "loss": 0.0605, "step": 20450 }, { "epoch": 0.017687387081132475, "grad_norm": 2.092070280923508e-05, "learning_rate": 1.9651338807198994e-05, "loss": 0.002, "step": 20500 }, { "epoch": 0.017730527049623045, "grad_norm": 8.422440259892028e-06, "learning_rate": 1.9650475784444533e-05, "loss": 0.0041, "step": 20550 }, { "epoch": 0.01777366701811361, "grad_norm": 0.2332431972026825, "learning_rate": 1.9649612761690077e-05, "loss": 0.004, "step": 20600 }, { "epoch": 0.017816806986604178, "grad_norm": 4.870547076762932e-09, "learning_rate": 1.9648749738935617e-05, "loss": 0.0452, "step": 20650 }, { "epoch": 0.017859946955094744, "grad_norm": 5.206494506637682e-07, "learning_rate": 1.964788671618116e-05, "loss": 0.0045, "step": 20700 }, { "epoch": 0.01790308692358531, "grad_norm": 2.1451814973261207e-06, "learning_rate": 1.9647023693426703e-05, "loss": 0.006, "step": 20750 }, { "epoch": 0.017946226892075878, "grad_norm": 8.108095244097058e-06, "learning_rate": 1.9646160670672247e-05, "loss": 0.0345, "step": 20800 }, { "epoch": 0.017989366860566444, "grad_norm": 0.025016743689775467, "learning_rate": 1.9645297647917787e-05, "loss": 0.0532, "step": 20850 }, { "epoch": 0.01803250682905701, "grad_norm": 5.400533609645208e-06, "learning_rate": 1.964443462516333e-05, "loss": 0.0021, "step": 20900 }, { "epoch": 0.018075646797547577, "grad_norm": 2.8619383556360845e-06, "learning_rate": 1.964357160240887e-05, "loss": 0.0362, "step": 20950 }, { "epoch": 0.018118786766038147, "grad_norm": 3.4743165969848633, "learning_rate": 1.9642708579654413e-05, "loss": 0.0136, "step": 21000 }, { "epoch": 0.018161926734528714, "grad_norm": 59.8224983215332, "learning_rate": 1.9641845556899953e-05, "loss": 0.014, "step": 21050 }, { "epoch": 0.01820506670301928, "grad_norm": 8.128851186484098e-05, "learning_rate": 1.9640982534145497e-05, "loss": 0.0295, "step": 21100 }, { "epoch": 0.018248206671509847, "grad_norm": 3.548375752870925e-05, "learning_rate": 1.9640119511391037e-05, "loss": 0.0286, "step": 21150 }, { "epoch": 0.018291346640000414, "grad_norm": 0.0468142107129097, "learning_rate": 1.963925648863658e-05, "loss": 0.0525, "step": 21200 }, { "epoch": 0.01833448660849098, "grad_norm": 4.863815320277354e-06, "learning_rate": 1.9638393465882123e-05, "loss": 0.0063, "step": 21250 }, { "epoch": 0.018377626576981547, "grad_norm": 9.059208938566599e-10, "learning_rate": 1.9637530443127667e-05, "loss": 0.0217, "step": 21300 }, { "epoch": 0.018420766545472114, "grad_norm": 0.9207327365875244, "learning_rate": 1.9636667420373206e-05, "loss": 0.0054, "step": 21350 }, { "epoch": 0.018463906513962684, "grad_norm": 0.00036540269502438605, "learning_rate": 1.963580439761875e-05, "loss": 0.0188, "step": 21400 }, { "epoch": 0.01850704648245325, "grad_norm": 0.00022348039783537388, "learning_rate": 1.963494137486429e-05, "loss": 0.0025, "step": 21450 }, { "epoch": 0.018550186450943817, "grad_norm": 0.27767083048820496, "learning_rate": 1.9634078352109833e-05, "loss": 0.018, "step": 21500 }, { "epoch": 0.018593326419434383, "grad_norm": 0.022822152823209763, "learning_rate": 1.9633215329355376e-05, "loss": 0.0569, "step": 21550 }, { "epoch": 0.01863646638792495, "grad_norm": 0.00016692353528924286, "learning_rate": 1.9632352306600916e-05, "loss": 0.0227, "step": 21600 }, { "epoch": 0.018679606356415517, "grad_norm": 0.5533714890480042, "learning_rate": 1.963148928384646e-05, "loss": 0.0112, "step": 21650 }, { "epoch": 0.018722746324906083, "grad_norm": 0.030804995447397232, "learning_rate": 1.9630626261092e-05, "loss": 0.0083, "step": 21700 }, { "epoch": 0.01876588629339665, "grad_norm": 1.79214639501879e-05, "learning_rate": 1.9629763238337546e-05, "loss": 0.0079, "step": 21750 }, { "epoch": 0.01880902626188722, "grad_norm": 1.6093619492618672e-10, "learning_rate": 1.9628900215583086e-05, "loss": 0.0156, "step": 21800 }, { "epoch": 0.018852166230377786, "grad_norm": 0.005034497939050198, "learning_rate": 1.962803719282863e-05, "loss": 0.0623, "step": 21850 }, { "epoch": 0.018895306198868353, "grad_norm": 0.017401648685336113, "learning_rate": 1.962717417007417e-05, "loss": 0.0258, "step": 21900 }, { "epoch": 0.01893844616735892, "grad_norm": 2.2319347858428955, "learning_rate": 1.9626311147319713e-05, "loss": 0.0201, "step": 21950 }, { "epoch": 0.018981586135849486, "grad_norm": 8.550871825718787e-06, "learning_rate": 1.9625448124565253e-05, "loss": 0.009, "step": 22000 }, { "epoch": 0.019024726104340053, "grad_norm": 1.8346406704949914e-06, "learning_rate": 1.9624585101810796e-05, "loss": 0.0806, "step": 22050 }, { "epoch": 0.01906786607283062, "grad_norm": 12.84133243560791, "learning_rate": 1.9623722079056336e-05, "loss": 0.0097, "step": 22100 }, { "epoch": 0.019111006041321186, "grad_norm": 9.22921472579219e-09, "learning_rate": 1.962285905630188e-05, "loss": 0.0185, "step": 22150 }, { "epoch": 0.019154146009811756, "grad_norm": 1.6999269723892212, "learning_rate": 1.9621996033547423e-05, "loss": 0.0145, "step": 22200 }, { "epoch": 0.019197285978302323, "grad_norm": 2.9266016483306885, "learning_rate": 1.9621133010792963e-05, "loss": 0.0214, "step": 22250 }, { "epoch": 0.01924042594679289, "grad_norm": 0.13319005072116852, "learning_rate": 1.9620269988038506e-05, "loss": 0.0542, "step": 22300 }, { "epoch": 0.019283565915283456, "grad_norm": 1.2659254934987985e-05, "learning_rate": 1.961940696528405e-05, "loss": 0.0059, "step": 22350 }, { "epoch": 0.019326705883774022, "grad_norm": 2.33125811064383e-05, "learning_rate": 1.961854394252959e-05, "loss": 0.0414, "step": 22400 }, { "epoch": 0.01936984585226459, "grad_norm": 0.008146941661834717, "learning_rate": 1.9617680919775133e-05, "loss": 0.0129, "step": 22450 }, { "epoch": 0.019412985820755155, "grad_norm": 4.2442545236554e-05, "learning_rate": 1.9616817897020676e-05, "loss": 0.0237, "step": 22500 }, { "epoch": 0.019456125789245722, "grad_norm": 6.483288217395966e-08, "learning_rate": 1.9615954874266216e-05, "loss": 0.0036, "step": 22550 }, { "epoch": 0.019499265757736292, "grad_norm": 0.025942707434296608, "learning_rate": 1.961509185151176e-05, "loss": 0.0233, "step": 22600 }, { "epoch": 0.01954240572622686, "grad_norm": 0.004933039657771587, "learning_rate": 1.96142288287573e-05, "loss": 0.0279, "step": 22650 }, { "epoch": 0.019585545694717425, "grad_norm": 9.285894102262215e-12, "learning_rate": 1.9613365806002843e-05, "loss": 0.0137, "step": 22700 }, { "epoch": 0.019628685663207992, "grad_norm": 17.506160736083984, "learning_rate": 1.9612502783248383e-05, "loss": 0.0106, "step": 22750 }, { "epoch": 0.01967182563169856, "grad_norm": 1.2982255270799214e-07, "learning_rate": 1.9611639760493926e-05, "loss": 0.0066, "step": 22800 }, { "epoch": 0.019714965600189125, "grad_norm": 3.575518903176089e-08, "learning_rate": 1.961077673773947e-05, "loss": 0.0187, "step": 22850 }, { "epoch": 0.01975810556867969, "grad_norm": 0.04352926090359688, "learning_rate": 1.9609913714985013e-05, "loss": 0.0127, "step": 22900 }, { "epoch": 0.019801245537170258, "grad_norm": 15.828106880187988, "learning_rate": 1.9609050692230552e-05, "loss": 0.0234, "step": 22950 }, { "epoch": 0.01984438550566083, "grad_norm": 2.8101124982526926e-08, "learning_rate": 1.9608187669476096e-05, "loss": 0.0406, "step": 23000 }, { "epoch": 0.019887525474151395, "grad_norm": 1.5754636478959583e-05, "learning_rate": 1.9607324646721636e-05, "loss": 0.013, "step": 23050 }, { "epoch": 0.01993066544264196, "grad_norm": 0.00016132810560520738, "learning_rate": 1.960646162396718e-05, "loss": 0.0661, "step": 23100 }, { "epoch": 0.019973805411132528, "grad_norm": 0.009830374270677567, "learning_rate": 1.9605598601212722e-05, "loss": 0.0001, "step": 23150 }, { "epoch": 0.020016945379623095, "grad_norm": 1.5961271415676492e-08, "learning_rate": 1.9604735578458262e-05, "loss": 0.0001, "step": 23200 }, { "epoch": 0.02006008534811366, "grad_norm": 0.6032620668411255, "learning_rate": 1.9603872555703806e-05, "loss": 0.057, "step": 23250 }, { "epoch": 0.020103225316604228, "grad_norm": 0.0007053284207358956, "learning_rate": 1.9603009532949346e-05, "loss": 0.0328, "step": 23300 }, { "epoch": 0.020146365285094794, "grad_norm": 0.00022471090778708458, "learning_rate": 1.960214651019489e-05, "loss": 0.0176, "step": 23350 }, { "epoch": 0.020189505253585364, "grad_norm": 3.784521595662227e-06, "learning_rate": 1.9601283487440432e-05, "loss": 0.0342, "step": 23400 }, { "epoch": 0.02023264522207593, "grad_norm": 0.0002926274319179356, "learning_rate": 1.9600420464685976e-05, "loss": 0.0622, "step": 23450 }, { "epoch": 0.020275785190566498, "grad_norm": 0.0005665869684889913, "learning_rate": 1.9599557441931516e-05, "loss": 0.0319, "step": 23500 }, { "epoch": 0.020318925159057064, "grad_norm": 0.0020943868439644575, "learning_rate": 1.959869441917706e-05, "loss": 0.0437, "step": 23550 }, { "epoch": 0.02036206512754763, "grad_norm": 0.007852623239159584, "learning_rate": 1.95978313964226e-05, "loss": 0.0002, "step": 23600 }, { "epoch": 0.020405205096038197, "grad_norm": 1.9628392457962036, "learning_rate": 1.9596968373668142e-05, "loss": 0.006, "step": 23650 }, { "epoch": 0.020448345064528764, "grad_norm": 2.0241428533296357e-09, "learning_rate": 1.9596105350913682e-05, "loss": 0.0491, "step": 23700 }, { "epoch": 0.02049148503301933, "grad_norm": 1.5093628569218254e-09, "learning_rate": 1.9595242328159226e-05, "loss": 0.0218, "step": 23750 }, { "epoch": 0.020534625001509897, "grad_norm": 0.013457014225423336, "learning_rate": 1.9594379305404765e-05, "loss": 0.0246, "step": 23800 }, { "epoch": 0.020577764970000467, "grad_norm": 2.7149107495461067e-07, "learning_rate": 1.959351628265031e-05, "loss": 0.0309, "step": 23850 }, { "epoch": 0.020620904938491034, "grad_norm": 2.928385534062272e-09, "learning_rate": 1.9592653259895852e-05, "loss": 0.0243, "step": 23900 }, { "epoch": 0.0206640449069816, "grad_norm": 0.0007422782364301383, "learning_rate": 1.9591790237141395e-05, "loss": 0.0483, "step": 23950 }, { "epoch": 0.020707184875472167, "grad_norm": 3.9503233892901335e-06, "learning_rate": 1.9590927214386935e-05, "loss": 0.0281, "step": 24000 }, { "epoch": 0.020750324843962734, "grad_norm": 0.07909461110830307, "learning_rate": 1.959006419163248e-05, "loss": 0.0137, "step": 24050 }, { "epoch": 0.0207934648124533, "grad_norm": 1.3648401853139092e-10, "learning_rate": 1.958920116887802e-05, "loss": 0.046, "step": 24100 }, { "epoch": 0.020836604780943867, "grad_norm": 1.791205619383618e-07, "learning_rate": 1.9588338146123562e-05, "loss": 0.0303, "step": 24150 }, { "epoch": 0.020879744749434433, "grad_norm": 3.758560573885461e-09, "learning_rate": 1.9587475123369105e-05, "loss": 0.0029, "step": 24200 }, { "epoch": 0.020922884717925003, "grad_norm": 2.0997137362144258e-10, "learning_rate": 1.9586612100614645e-05, "loss": 0.0431, "step": 24250 }, { "epoch": 0.02096602468641557, "grad_norm": 4.752119064331055, "learning_rate": 1.958574907786019e-05, "loss": 0.0253, "step": 24300 }, { "epoch": 0.021009164654906137, "grad_norm": 0.004993764217942953, "learning_rate": 1.958488605510573e-05, "loss": 0.0292, "step": 24350 }, { "epoch": 0.021052304623396703, "grad_norm": 1.2806524729569446e-09, "learning_rate": 1.9584023032351275e-05, "loss": 0.0538, "step": 24400 }, { "epoch": 0.02109544459188727, "grad_norm": 6.973591126779866e-08, "learning_rate": 1.9583160009596815e-05, "loss": 0.0272, "step": 24450 }, { "epoch": 0.021138584560377836, "grad_norm": 0.042537808418273926, "learning_rate": 1.958229698684236e-05, "loss": 0.046, "step": 24500 }, { "epoch": 0.021181724528868403, "grad_norm": 0.0006602337816730142, "learning_rate": 1.95814339640879e-05, "loss": 0.024, "step": 24550 }, { "epoch": 0.02122486449735897, "grad_norm": 22.432666778564453, "learning_rate": 1.9580570941333442e-05, "loss": 0.0484, "step": 24600 }, { "epoch": 0.02126800446584954, "grad_norm": 0.024881912395358086, "learning_rate": 1.9579707918578982e-05, "loss": 0.0061, "step": 24650 }, { "epoch": 0.021311144434340106, "grad_norm": 9.876566764432937e-06, "learning_rate": 1.9578844895824525e-05, "loss": 0.033, "step": 24700 }, { "epoch": 0.021354284402830673, "grad_norm": 1.04228820418939e-05, "learning_rate": 1.9577981873070065e-05, "loss": 0.0246, "step": 24750 }, { "epoch": 0.02139742437132124, "grad_norm": 4.033939262626518e-07, "learning_rate": 1.957711885031561e-05, "loss": 0.0273, "step": 24800 }, { "epoch": 0.021440564339811806, "grad_norm": 1.8699473002925515e-05, "learning_rate": 1.9576255827561152e-05, "loss": 0.0404, "step": 24850 }, { "epoch": 0.021483704308302373, "grad_norm": 7.583350480899753e-08, "learning_rate": 1.957539280480669e-05, "loss": 0.0265, "step": 24900 }, { "epoch": 0.02152684427679294, "grad_norm": 0.02612815983593464, "learning_rate": 1.9574529782052235e-05, "loss": 0.0219, "step": 24950 }, { "epoch": 0.021569984245283506, "grad_norm": 5.127071176502795e-07, "learning_rate": 1.957366675929778e-05, "loss": 0.0609, "step": 25000 }, { "epoch": 0.021613124213774076, "grad_norm": 0.00036468004691414535, "learning_rate": 1.957280373654332e-05, "loss": 0.0173, "step": 25050 }, { "epoch": 0.021656264182264642, "grad_norm": 4.805618573300308e-06, "learning_rate": 1.957194071378886e-05, "loss": 0.0478, "step": 25100 }, { "epoch": 0.02169940415075521, "grad_norm": 0.003498099045827985, "learning_rate": 1.9571077691034405e-05, "loss": 0.0422, "step": 25150 }, { "epoch": 0.021742544119245776, "grad_norm": 3.893982466252055e-06, "learning_rate": 1.9570214668279945e-05, "loss": 0.024, "step": 25200 }, { "epoch": 0.021785684087736342, "grad_norm": 6.174719402451956e-08, "learning_rate": 1.9569351645525488e-05, "loss": 0.0448, "step": 25250 }, { "epoch": 0.02182882405622691, "grad_norm": 0.8544023633003235, "learning_rate": 1.9568488622771028e-05, "loss": 0.009, "step": 25300 }, { "epoch": 0.021871964024717475, "grad_norm": 1.8829781822660152e-07, "learning_rate": 1.956762560001657e-05, "loss": 0.0059, "step": 25350 }, { "epoch": 0.021915103993208042, "grad_norm": 1.7753800420905463e-06, "learning_rate": 1.956676257726211e-05, "loss": 0.0614, "step": 25400 }, { "epoch": 0.021958243961698612, "grad_norm": 2.652618924514627e-08, "learning_rate": 1.9565899554507655e-05, "loss": 0.0184, "step": 25450 }, { "epoch": 0.02200138393018918, "grad_norm": 0.33340388536453247, "learning_rate": 1.9565036531753198e-05, "loss": 0.0166, "step": 25500 }, { "epoch": 0.022044523898679745, "grad_norm": 0.40569502115249634, "learning_rate": 1.956417350899874e-05, "loss": 0.0179, "step": 25550 }, { "epoch": 0.02208766386717031, "grad_norm": 0.00011573725350899622, "learning_rate": 1.956331048624428e-05, "loss": 0.0006, "step": 25600 }, { "epoch": 0.02213080383566088, "grad_norm": 2.554327238613041e-06, "learning_rate": 1.9562447463489825e-05, "loss": 0.0402, "step": 25650 }, { "epoch": 0.022173943804151445, "grad_norm": 8.304319010221661e-08, "learning_rate": 1.9561584440735365e-05, "loss": 0.0363, "step": 25700 }, { "epoch": 0.02221708377264201, "grad_norm": 1.8539299873054915e-08, "learning_rate": 1.9560721417980908e-05, "loss": 0.0042, "step": 25750 }, { "epoch": 0.022260223741132578, "grad_norm": 0.043552886694669724, "learning_rate": 1.955985839522645e-05, "loss": 0.0358, "step": 25800 }, { "epoch": 0.022303363709623148, "grad_norm": 0.00025480103795416653, "learning_rate": 1.955899537247199e-05, "loss": 0.0349, "step": 25850 }, { "epoch": 0.022346503678113715, "grad_norm": 0.0006263578543439507, "learning_rate": 1.9558132349717535e-05, "loss": 0.0184, "step": 25900 }, { "epoch": 0.02238964364660428, "grad_norm": 2.677586793899536, "learning_rate": 1.9557269326963075e-05, "loss": 0.0667, "step": 25950 }, { "epoch": 0.022432783615094848, "grad_norm": 0.6284056305885315, "learning_rate": 1.9556406304208618e-05, "loss": 0.0061, "step": 26000 }, { "epoch": 0.022475923583585414, "grad_norm": 0.01573588326573372, "learning_rate": 1.955554328145416e-05, "loss": 0.0515, "step": 26050 }, { "epoch": 0.02251906355207598, "grad_norm": 9.318134289060254e-06, "learning_rate": 1.9554680258699705e-05, "loss": 0.0231, "step": 26100 }, { "epoch": 0.022562203520566548, "grad_norm": 3.892751294642949e-07, "learning_rate": 1.9553817235945245e-05, "loss": 0.004, "step": 26150 }, { "epoch": 0.022605343489057114, "grad_norm": 0.0010842111660167575, "learning_rate": 1.9552954213190788e-05, "loss": 0.0568, "step": 26200 }, { "epoch": 0.022648483457547684, "grad_norm": 0.021115347743034363, "learning_rate": 1.9552091190436328e-05, "loss": 0.0711, "step": 26250 }, { "epoch": 0.02269162342603825, "grad_norm": 0.07015379518270493, "learning_rate": 1.955122816768187e-05, "loss": 0.0305, "step": 26300 }, { "epoch": 0.022734763394528817, "grad_norm": 3.8024263631086797e-05, "learning_rate": 1.955036514492741e-05, "loss": 0.0309, "step": 26350 }, { "epoch": 0.022777903363019384, "grad_norm": 0.0043113697320222855, "learning_rate": 1.9549502122172954e-05, "loss": 0.0066, "step": 26400 }, { "epoch": 0.02282104333150995, "grad_norm": 0.007588895037770271, "learning_rate": 1.9548639099418494e-05, "loss": 0.0242, "step": 26450 }, { "epoch": 0.022864183300000517, "grad_norm": 1.8674474954605103, "learning_rate": 1.9547776076664038e-05, "loss": 0.0163, "step": 26500 }, { "epoch": 0.022907323268491084, "grad_norm": 4.954452991485596, "learning_rate": 1.954691305390958e-05, "loss": 0.0368, "step": 26550 }, { "epoch": 0.02295046323698165, "grad_norm": 0.0024081666488200426, "learning_rate": 1.9546050031155124e-05, "loss": 0.0255, "step": 26600 }, { "epoch": 0.02299360320547222, "grad_norm": 0.4166341722011566, "learning_rate": 1.9545187008400664e-05, "loss": 0.0331, "step": 26650 }, { "epoch": 0.023036743173962787, "grad_norm": 0.00036967426422052085, "learning_rate": 1.9544323985646208e-05, "loss": 0.0282, "step": 26700 }, { "epoch": 0.023079883142453354, "grad_norm": 1.1294196688993452e-08, "learning_rate": 1.954346096289175e-05, "loss": 0.0293, "step": 26750 }, { "epoch": 0.02312302311094392, "grad_norm": 24.33706283569336, "learning_rate": 1.954259794013729e-05, "loss": 0.0475, "step": 26800 }, { "epoch": 0.023166163079434487, "grad_norm": 1.3493994366342577e-08, "learning_rate": 1.9541734917382834e-05, "loss": 0.0045, "step": 26850 }, { "epoch": 0.023209303047925053, "grad_norm": 6.673410098301247e-05, "learning_rate": 1.9540871894628374e-05, "loss": 0.0059, "step": 26900 }, { "epoch": 0.02325244301641562, "grad_norm": 0.0014361342182382941, "learning_rate": 1.9540008871873918e-05, "loss": 0.0002, "step": 26950 }, { "epoch": 0.023295582984906187, "grad_norm": 3.2534658908843994, "learning_rate": 1.9539145849119458e-05, "loss": 0.0329, "step": 27000 }, { "epoch": 0.023338722953396753, "grad_norm": 0.0029180857818573713, "learning_rate": 1.9538282826365004e-05, "loss": 0.0007, "step": 27050 }, { "epoch": 0.023381862921887323, "grad_norm": 7.010048866271973, "learning_rate": 1.9537419803610544e-05, "loss": 0.0473, "step": 27100 }, { "epoch": 0.02342500289037789, "grad_norm": 0.5129420757293701, "learning_rate": 1.9536556780856088e-05, "loss": 0.0312, "step": 27150 }, { "epoch": 0.023468142858868456, "grad_norm": 0.008801298215985298, "learning_rate": 1.9535693758101627e-05, "loss": 0.0025, "step": 27200 }, { "epoch": 0.023511282827359023, "grad_norm": 7.381456001986919e-10, "learning_rate": 1.953483073534717e-05, "loss": 0.0399, "step": 27250 }, { "epoch": 0.02355442279584959, "grad_norm": 0.015433188527822495, "learning_rate": 1.953396771259271e-05, "loss": 0.0248, "step": 27300 }, { "epoch": 0.023597562764340156, "grad_norm": 3.086728572845459, "learning_rate": 1.9533104689838254e-05, "loss": 0.0124, "step": 27350 }, { "epoch": 0.023640702732830723, "grad_norm": 5.318460255532287e-11, "learning_rate": 1.9532241667083794e-05, "loss": 0.0259, "step": 27400 }, { "epoch": 0.02368384270132129, "grad_norm": 0.0015008836053311825, "learning_rate": 1.9531378644329337e-05, "loss": 0.0128, "step": 27450 }, { "epoch": 0.02372698266981186, "grad_norm": 4.6280136302812025e-05, "learning_rate": 1.953051562157488e-05, "loss": 0.0005, "step": 27500 }, { "epoch": 0.023770122638302426, "grad_norm": 4.795760560227791e-06, "learning_rate": 1.952965259882042e-05, "loss": 0.0155, "step": 27550 }, { "epoch": 0.023813262606792993, "grad_norm": 0.19684414565563202, "learning_rate": 1.9528789576065964e-05, "loss": 0.0042, "step": 27600 }, { "epoch": 0.02385640257528356, "grad_norm": 1.0629539559658951e-07, "learning_rate": 1.9527926553311507e-05, "loss": 0.0097, "step": 27650 }, { "epoch": 0.023899542543774126, "grad_norm": 1.161576043684498e-11, "learning_rate": 1.9527063530557047e-05, "loss": 0.0009, "step": 27700 }, { "epoch": 0.023942682512264692, "grad_norm": 1.7004417318666754e-10, "learning_rate": 1.952620050780259e-05, "loss": 0.0324, "step": 27750 }, { "epoch": 0.02398582248075526, "grad_norm": 3.243289393140003e-05, "learning_rate": 1.9525337485048134e-05, "loss": 0.0768, "step": 27800 }, { "epoch": 0.024028962449245826, "grad_norm": 0.0029646342154592276, "learning_rate": 1.9524474462293674e-05, "loss": 0.0142, "step": 27850 }, { "epoch": 0.024072102417736396, "grad_norm": 0.0012051378143951297, "learning_rate": 1.9523611439539217e-05, "loss": 0.0147, "step": 27900 }, { "epoch": 0.024115242386226962, "grad_norm": 1.3464485164149664e-05, "learning_rate": 1.9522748416784757e-05, "loss": 0.0023, "step": 27950 }, { "epoch": 0.02415838235471753, "grad_norm": 0.0002646016946528107, "learning_rate": 1.95218853940303e-05, "loss": 0.0098, "step": 28000 }, { "epoch": 0.024201522323208095, "grad_norm": 6.006689727655612e-05, "learning_rate": 1.952102237127584e-05, "loss": 0.0052, "step": 28050 }, { "epoch": 0.024244662291698662, "grad_norm": 31.13625717163086, "learning_rate": 1.9520159348521384e-05, "loss": 0.0298, "step": 28100 }, { "epoch": 0.02428780226018923, "grad_norm": 0.00010399877646705136, "learning_rate": 1.9519296325766927e-05, "loss": 0.0512, "step": 28150 }, { "epoch": 0.024330942228679795, "grad_norm": 9.850235755948233e-07, "learning_rate": 1.951843330301247e-05, "loss": 0.052, "step": 28200 }, { "epoch": 0.02437408219717036, "grad_norm": 3.698731597978622e-05, "learning_rate": 1.951757028025801e-05, "loss": 0.0017, "step": 28250 }, { "epoch": 0.02441722216566093, "grad_norm": 0.04309392347931862, "learning_rate": 1.9516707257503554e-05, "loss": 0.0385, "step": 28300 }, { "epoch": 0.0244603621341515, "grad_norm": 9.081038115255069e-06, "learning_rate": 1.9515844234749094e-05, "loss": 0.0418, "step": 28350 }, { "epoch": 0.024503502102642065, "grad_norm": 0.48385998606681824, "learning_rate": 1.9514981211994637e-05, "loss": 0.0207, "step": 28400 }, { "epoch": 0.02454664207113263, "grad_norm": 1.9165490527939255e-07, "learning_rate": 1.951411818924018e-05, "loss": 0.0206, "step": 28450 }, { "epoch": 0.024589782039623198, "grad_norm": 1.4679693776997738e-05, "learning_rate": 1.951325516648572e-05, "loss": 0.0346, "step": 28500 }, { "epoch": 0.024632922008113765, "grad_norm": 0.11278124898672104, "learning_rate": 1.9512392143731264e-05, "loss": 0.0081, "step": 28550 }, { "epoch": 0.02467606197660433, "grad_norm": 9.307966024607595e-07, "learning_rate": 1.9511529120976804e-05, "loss": 0.012, "step": 28600 }, { "epoch": 0.024719201945094898, "grad_norm": 0.00027512782253324986, "learning_rate": 1.9510666098222347e-05, "loss": 0.0032, "step": 28650 }, { "epoch": 0.024762341913585468, "grad_norm": 0.11172260344028473, "learning_rate": 1.950980307546789e-05, "loss": 0.0032, "step": 28700 }, { "epoch": 0.024805481882076034, "grad_norm": 2.1106679923832417e-06, "learning_rate": 1.9508940052713434e-05, "loss": 0.0339, "step": 28750 }, { "epoch": 0.0248486218505666, "grad_norm": 0.028800344094634056, "learning_rate": 1.9508077029958973e-05, "loss": 0.0278, "step": 28800 }, { "epoch": 0.024891761819057168, "grad_norm": 1.757417521730531e-06, "learning_rate": 1.9507214007204517e-05, "loss": 0.0164, "step": 28850 }, { "epoch": 0.024934901787547734, "grad_norm": 4.451398893934311e-08, "learning_rate": 1.9506350984450057e-05, "loss": 0.009, "step": 28900 }, { "epoch": 0.0249780417560383, "grad_norm": 4.7023010552038613e-07, "learning_rate": 1.95054879616956e-05, "loss": 0.01, "step": 28950 }, { "epoch": 0.025021181724528867, "grad_norm": 0.3000449538230896, "learning_rate": 1.950462493894114e-05, "loss": 0.0208, "step": 29000 }, { "epoch": 0.025064321693019434, "grad_norm": 2.5534254746162333e-06, "learning_rate": 1.9503761916186683e-05, "loss": 0.0113, "step": 29050 }, { "epoch": 0.025107461661510004, "grad_norm": 1.051041209620962e-07, "learning_rate": 1.9502898893432223e-05, "loss": 0.0286, "step": 29100 }, { "epoch": 0.02515060163000057, "grad_norm": 0.11379561573266983, "learning_rate": 1.9502035870677767e-05, "loss": 0.0273, "step": 29150 }, { "epoch": 0.025193741598491137, "grad_norm": 4.075488391208637e-09, "learning_rate": 1.950117284792331e-05, "loss": 0.001, "step": 29200 }, { "epoch": 0.025236881566981704, "grad_norm": 6.561435283991557e-10, "learning_rate": 1.9500309825168853e-05, "loss": 0.0002, "step": 29250 }, { "epoch": 0.02528002153547227, "grad_norm": 2.523017644882202, "learning_rate": 1.9499446802414393e-05, "loss": 0.0078, "step": 29300 }, { "epoch": 0.025323161503962837, "grad_norm": 1.13604746729834e-05, "learning_rate": 1.9498583779659937e-05, "loss": 0.0009, "step": 29350 }, { "epoch": 0.025366301472453404, "grad_norm": 0.00017209288489539176, "learning_rate": 1.949772075690548e-05, "loss": 0.0157, "step": 29400 }, { "epoch": 0.02540944144094397, "grad_norm": 0.00011601659207371995, "learning_rate": 1.949685773415102e-05, "loss": 0.045, "step": 29450 }, { "epoch": 0.02545258140943454, "grad_norm": 22.94985580444336, "learning_rate": 1.9495994711396563e-05, "loss": 0.0269, "step": 29500 }, { "epoch": 0.025495721377925107, "grad_norm": 2.782198776918321e-11, "learning_rate": 1.9495131688642103e-05, "loss": 0.0025, "step": 29550 }, { "epoch": 0.025538861346415673, "grad_norm": 2.155955371563323e-05, "learning_rate": 1.9494268665887646e-05, "loss": 0.0452, "step": 29600 }, { "epoch": 0.02558200131490624, "grad_norm": 8.12989310361445e-06, "learning_rate": 1.9493405643133186e-05, "loss": 0.019, "step": 29650 }, { "epoch": 0.025625141283396807, "grad_norm": 0.000956275500357151, "learning_rate": 1.9492542620378733e-05, "loss": 0.0122, "step": 29700 }, { "epoch": 0.025668281251887373, "grad_norm": 3.834348838438473e-09, "learning_rate": 1.9491679597624273e-05, "loss": 0.0255, "step": 29750 }, { "epoch": 0.02571142122037794, "grad_norm": 0.16173326969146729, "learning_rate": 1.9490816574869816e-05, "loss": 0.0382, "step": 29800 }, { "epoch": 0.025754561188868506, "grad_norm": 0.0008912076009437442, "learning_rate": 1.9489953552115356e-05, "loss": 0.029, "step": 29850 }, { "epoch": 0.025797701157359073, "grad_norm": 1.517190213462527e-07, "learning_rate": 1.94890905293609e-05, "loss": 0.0413, "step": 29900 }, { "epoch": 0.025840841125849643, "grad_norm": 6.658311946239337e-08, "learning_rate": 1.948822750660644e-05, "loss": 0.0142, "step": 29950 }, { "epoch": 0.02588398109434021, "grad_norm": 0.0003508856752887368, "learning_rate": 1.9487364483851983e-05, "loss": 0.0079, "step": 30000 }, { "epoch": 0.025927121062830776, "grad_norm": 0.026366397738456726, "learning_rate": 1.9486501461097523e-05, "loss": 0.0381, "step": 30050 }, { "epoch": 0.025970261031321343, "grad_norm": 5.6284894943237305, "learning_rate": 1.9485638438343066e-05, "loss": 0.0618, "step": 30100 }, { "epoch": 0.02601340099981191, "grad_norm": 0.00824633240699768, "learning_rate": 1.948477541558861e-05, "loss": 0.0173, "step": 30150 }, { "epoch": 0.026056540968302476, "grad_norm": 0.0007174229249358177, "learning_rate": 1.948391239283415e-05, "loss": 0.0199, "step": 30200 }, { "epoch": 0.026099680936793043, "grad_norm": 0.02488381415605545, "learning_rate": 1.9483049370079693e-05, "loss": 0.0312, "step": 30250 }, { "epoch": 0.02614282090528361, "grad_norm": 1.9344063997268677, "learning_rate": 1.9482186347325236e-05, "loss": 0.0357, "step": 30300 }, { "epoch": 0.02618596087377418, "grad_norm": 3.485973834991455, "learning_rate": 1.9481323324570776e-05, "loss": 0.0266, "step": 30350 }, { "epoch": 0.026229100842264746, "grad_norm": 6.07471008606808e-07, "learning_rate": 1.948046030181632e-05, "loss": 0.0308, "step": 30400 }, { "epoch": 0.026272240810755312, "grad_norm": 7.532801760135044e-08, "learning_rate": 1.9479597279061863e-05, "loss": 0.047, "step": 30450 }, { "epoch": 0.02631538077924588, "grad_norm": 0.0005202541360631585, "learning_rate": 1.9478734256307403e-05, "loss": 0.0239, "step": 30500 }, { "epoch": 0.026358520747736446, "grad_norm": 26.940954208374023, "learning_rate": 1.9477871233552946e-05, "loss": 0.0709, "step": 30550 }, { "epoch": 0.026401660716227012, "grad_norm": 7.362630470575393e-12, "learning_rate": 1.9477008210798486e-05, "loss": 0.0117, "step": 30600 }, { "epoch": 0.02644480068471758, "grad_norm": 1.2002854418824427e-05, "learning_rate": 1.947614518804403e-05, "loss": 0.0392, "step": 30650 }, { "epoch": 0.026487940653208145, "grad_norm": 0.4743211269378662, "learning_rate": 1.947528216528957e-05, "loss": 0.0461, "step": 30700 }, { "epoch": 0.026531080621698715, "grad_norm": 5.520277568393794e-10, "learning_rate": 1.9474419142535113e-05, "loss": 0.002, "step": 30750 }, { "epoch": 0.026574220590189282, "grad_norm": 5.655643420254819e-08, "learning_rate": 1.9473556119780656e-05, "loss": 0.002, "step": 30800 }, { "epoch": 0.02661736055867985, "grad_norm": 3.0585747481381986e-06, "learning_rate": 1.94726930970262e-05, "loss": 0.0208, "step": 30850 }, { "epoch": 0.026660500527170415, "grad_norm": 0.00038789489190094173, "learning_rate": 1.947183007427174e-05, "loss": 0.0221, "step": 30900 }, { "epoch": 0.02670364049566098, "grad_norm": 0.006069442722946405, "learning_rate": 1.9470967051517283e-05, "loss": 0.0332, "step": 30950 }, { "epoch": 0.02674678046415155, "grad_norm": 2.1992854204455625e-09, "learning_rate": 1.9470104028762823e-05, "loss": 0.0133, "step": 31000 }, { "epoch": 0.026789920432642115, "grad_norm": 0.0005120674031786621, "learning_rate": 1.9469241006008366e-05, "loss": 0.0549, "step": 31050 }, { "epoch": 0.02683306040113268, "grad_norm": 3.589123298297636e-05, "learning_rate": 1.946837798325391e-05, "loss": 0.0172, "step": 31100 }, { "epoch": 0.02687620036962325, "grad_norm": 4.615823812059716e-08, "learning_rate": 1.946751496049945e-05, "loss": 0.013, "step": 31150 }, { "epoch": 0.026919340338113818, "grad_norm": 7.231820475794848e-09, "learning_rate": 1.9466651937744993e-05, "loss": 0.0037, "step": 31200 }, { "epoch": 0.026962480306604385, "grad_norm": 5.052131157867734e-09, "learning_rate": 1.9465788914990532e-05, "loss": 0.0491, "step": 31250 }, { "epoch": 0.02700562027509495, "grad_norm": 0.00010309406206943095, "learning_rate": 1.9464925892236076e-05, "loss": 0.0028, "step": 31300 }, { "epoch": 0.027048760243585518, "grad_norm": 5.6031745771178976e-05, "learning_rate": 1.946406286948162e-05, "loss": 0.015, "step": 31350 }, { "epoch": 0.027091900212076084, "grad_norm": 0.00024476449470967054, "learning_rate": 1.9463199846727162e-05, "loss": 0.0154, "step": 31400 }, { "epoch": 0.02713504018056665, "grad_norm": 2.0063467331965512e-07, "learning_rate": 1.9462336823972702e-05, "loss": 0.0212, "step": 31450 }, { "epoch": 0.027178180149057218, "grad_norm": 6.659844075329602e-05, "learning_rate": 1.9461473801218246e-05, "loss": 0.0216, "step": 31500 }, { "epoch": 0.027221320117547788, "grad_norm": 4.053091470268555e-05, "learning_rate": 1.9460610778463786e-05, "loss": 0.026, "step": 31550 }, { "epoch": 0.027264460086038354, "grad_norm": 2.6744512382492758e-08, "learning_rate": 1.945974775570933e-05, "loss": 0.0284, "step": 31600 }, { "epoch": 0.02730760005452892, "grad_norm": 0.1950395703315735, "learning_rate": 1.945888473295487e-05, "loss": 0.0064, "step": 31650 }, { "epoch": 0.027350740023019487, "grad_norm": 41.71430587768555, "learning_rate": 1.9458021710200412e-05, "loss": 0.0379, "step": 31700 }, { "epoch": 0.027393879991510054, "grad_norm": 2.8257717943347416e-08, "learning_rate": 1.9457158687445952e-05, "loss": 0.0263, "step": 31750 }, { "epoch": 0.02743701996000062, "grad_norm": 0.002763712080195546, "learning_rate": 1.9456295664691496e-05, "loss": 0.0189, "step": 31800 }, { "epoch": 0.027480159928491187, "grad_norm": 1.0972726061098115e-09, "learning_rate": 1.945543264193704e-05, "loss": 0.0156, "step": 31850 }, { "epoch": 0.027523299896981754, "grad_norm": 0.0012834984809160233, "learning_rate": 1.9454569619182582e-05, "loss": 0.0076, "step": 31900 }, { "epoch": 0.027566439865472324, "grad_norm": 2.497445628080186e-08, "learning_rate": 1.9453706596428122e-05, "loss": 0.0209, "step": 31950 }, { "epoch": 0.02760957983396289, "grad_norm": 23.704517364501953, "learning_rate": 1.9452843573673666e-05, "loss": 0.0603, "step": 32000 }, { "epoch": 0.027652719802453457, "grad_norm": 0.0009068456711247563, "learning_rate": 1.945198055091921e-05, "loss": 0.035, "step": 32050 }, { "epoch": 0.027695859770944024, "grad_norm": 5.298162460327148, "learning_rate": 1.945111752816475e-05, "loss": 0.0053, "step": 32100 }, { "epoch": 0.02773899973943459, "grad_norm": 0.017380917444825172, "learning_rate": 1.9450254505410292e-05, "loss": 0.0471, "step": 32150 }, { "epoch": 0.027782139707925157, "grad_norm": 0.02581915073096752, "learning_rate": 1.9449391482655832e-05, "loss": 0.0396, "step": 32200 }, { "epoch": 0.027825279676415723, "grad_norm": 1.437704066908907e-09, "learning_rate": 1.9448528459901375e-05, "loss": 0.0283, "step": 32250 }, { "epoch": 0.02786841964490629, "grad_norm": 1.0882466483508324e-08, "learning_rate": 1.9447665437146915e-05, "loss": 0.0066, "step": 32300 }, { "epoch": 0.02791155961339686, "grad_norm": 5.027173122229556e-11, "learning_rate": 1.9446802414392462e-05, "loss": 0.0082, "step": 32350 }, { "epoch": 0.027954699581887427, "grad_norm": 4.071168899536133, "learning_rate": 1.9445939391638002e-05, "loss": 0.0217, "step": 32400 }, { "epoch": 0.027997839550377993, "grad_norm": 0.0017136979149654508, "learning_rate": 1.9445076368883545e-05, "loss": 0.0665, "step": 32450 }, { "epoch": 0.02804097951886856, "grad_norm": 1.7071112301536573e-09, "learning_rate": 1.9444213346129085e-05, "loss": 0.0283, "step": 32500 }, { "epoch": 0.028084119487359126, "grad_norm": 2.8745741897928667e-10, "learning_rate": 1.944335032337463e-05, "loss": 0.0165, "step": 32550 }, { "epoch": 0.028127259455849693, "grad_norm": 0.06553611904382706, "learning_rate": 1.944248730062017e-05, "loss": 0.0039, "step": 32600 }, { "epoch": 0.02817039942434026, "grad_norm": 1.2114237506466452e-05, "learning_rate": 1.9441624277865712e-05, "loss": 0.0053, "step": 32650 }, { "epoch": 0.028213539392830826, "grad_norm": 5.977819910185644e-06, "learning_rate": 1.9440761255111252e-05, "loss": 0.016, "step": 32700 }, { "epoch": 0.028256679361321393, "grad_norm": 0.00414885301142931, "learning_rate": 1.9439898232356795e-05, "loss": 0.0064, "step": 32750 }, { "epoch": 0.028299819329811963, "grad_norm": 0.001667422242462635, "learning_rate": 1.943903520960234e-05, "loss": 0.0013, "step": 32800 }, { "epoch": 0.02834295929830253, "grad_norm": 1.7196412045450415e-06, "learning_rate": 1.943817218684788e-05, "loss": 0.0022, "step": 32850 }, { "epoch": 0.028386099266793096, "grad_norm": 4.220390792397666e-07, "learning_rate": 1.9437309164093422e-05, "loss": 0.0278, "step": 32900 }, { "epoch": 0.028429239235283663, "grad_norm": 8.6249691833018e-09, "learning_rate": 1.9436446141338965e-05, "loss": 0.0155, "step": 32950 }, { "epoch": 0.02847237920377423, "grad_norm": 21.435453414916992, "learning_rate": 1.9435583118584505e-05, "loss": 0.0234, "step": 33000 }, { "epoch": 0.028515519172264796, "grad_norm": 9.135671461990569e-06, "learning_rate": 1.943472009583005e-05, "loss": 0.0028, "step": 33050 }, { "epoch": 0.028558659140755362, "grad_norm": 1.085790088950489e-07, "learning_rate": 1.9433857073075592e-05, "loss": 0.0189, "step": 33100 }, { "epoch": 0.02860179910924593, "grad_norm": 1.0733113288879395, "learning_rate": 1.9432994050321132e-05, "loss": 0.0188, "step": 33150 }, { "epoch": 0.0286449390777365, "grad_norm": 5.325038046066766e-07, "learning_rate": 1.9432131027566675e-05, "loss": 0.0025, "step": 33200 }, { "epoch": 0.028688079046227066, "grad_norm": 0.001730454503558576, "learning_rate": 1.9431268004812215e-05, "loss": 0.0429, "step": 33250 }, { "epoch": 0.028731219014717632, "grad_norm": 0.03524341806769371, "learning_rate": 1.943040498205776e-05, "loss": 0.0147, "step": 33300 }, { "epoch": 0.0287743589832082, "grad_norm": 8.027368769703003e-10, "learning_rate": 1.9429541959303298e-05, "loss": 0.0074, "step": 33350 }, { "epoch": 0.028817498951698765, "grad_norm": 2.603889299734874e-07, "learning_rate": 1.9428678936548845e-05, "loss": 0.0015, "step": 33400 }, { "epoch": 0.028860638920189332, "grad_norm": 12.171298027038574, "learning_rate": 1.9427815913794385e-05, "loss": 0.0188, "step": 33450 }, { "epoch": 0.0289037788886799, "grad_norm": 3.4058632536471123e-06, "learning_rate": 1.9426952891039928e-05, "loss": 0.0529, "step": 33500 }, { "epoch": 0.028946918857170465, "grad_norm": 17.399200439453125, "learning_rate": 1.9426089868285468e-05, "loss": 0.0294, "step": 33550 }, { "epoch": 0.028990058825661035, "grad_norm": 0.011678768321871758, "learning_rate": 1.942522684553101e-05, "loss": 0.0211, "step": 33600 }, { "epoch": 0.029033198794151602, "grad_norm": 2.466938212819514e-06, "learning_rate": 1.942436382277655e-05, "loss": 0.03, "step": 33650 }, { "epoch": 0.02907633876264217, "grad_norm": 3.6094334986136456e-12, "learning_rate": 1.9423500800022095e-05, "loss": 0.0381, "step": 33700 }, { "epoch": 0.029119478731132735, "grad_norm": 0.08116328716278076, "learning_rate": 1.9422637777267638e-05, "loss": 0.0016, "step": 33750 }, { "epoch": 0.0291626186996233, "grad_norm": 0.2594936788082123, "learning_rate": 1.9421774754513178e-05, "loss": 0.0145, "step": 33800 }, { "epoch": 0.029205758668113868, "grad_norm": 1.6326714103342965e-05, "learning_rate": 1.942091173175872e-05, "loss": 0.014, "step": 33850 }, { "epoch": 0.029248898636604435, "grad_norm": 6.704578368044167e-07, "learning_rate": 1.942004870900426e-05, "loss": 0.0138, "step": 33900 }, { "epoch": 0.029292038605095, "grad_norm": 1.600632737464025e-09, "learning_rate": 1.9419185686249805e-05, "loss": 0.0044, "step": 33950 }, { "epoch": 0.02933517857358557, "grad_norm": 2.9473580070771277e-05, "learning_rate": 1.9418322663495348e-05, "loss": 0.0209, "step": 34000 }, { "epoch": 0.029378318542076138, "grad_norm": 0.013792168349027634, "learning_rate": 1.941745964074089e-05, "loss": 0.009, "step": 34050 }, { "epoch": 0.029421458510566705, "grad_norm": 1.5911604123175493e-07, "learning_rate": 1.941659661798643e-05, "loss": 0.0272, "step": 34100 }, { "epoch": 0.02946459847905727, "grad_norm": 0.29515737295150757, "learning_rate": 1.9415733595231975e-05, "loss": 0.0595, "step": 34150 }, { "epoch": 0.029507738447547838, "grad_norm": 2.744394862475019e-07, "learning_rate": 1.9414870572477515e-05, "loss": 0.046, "step": 34200 }, { "epoch": 0.029550878416038404, "grad_norm": 0.028887495398521423, "learning_rate": 1.9414007549723058e-05, "loss": 0.0014, "step": 34250 }, { "epoch": 0.02959401838452897, "grad_norm": 1.5995985449990258e-05, "learning_rate": 1.9413144526968598e-05, "loss": 0.0072, "step": 34300 }, { "epoch": 0.029637158353019537, "grad_norm": 1.774524207576178e-05, "learning_rate": 1.941228150421414e-05, "loss": 0.0072, "step": 34350 }, { "epoch": 0.029680298321510108, "grad_norm": 3.840292084333896e-09, "learning_rate": 1.9411418481459685e-05, "loss": 0.015, "step": 34400 }, { "epoch": 0.029723438290000674, "grad_norm": 4.855828592553735e-06, "learning_rate": 1.9410555458705225e-05, "loss": 0.0101, "step": 34450 }, { "epoch": 0.02976657825849124, "grad_norm": 5.043638229370117, "learning_rate": 1.9409692435950768e-05, "loss": 0.0598, "step": 34500 }, { "epoch": 0.029809718226981807, "grad_norm": 3.365451473058556e-09, "learning_rate": 1.940882941319631e-05, "loss": 0.012, "step": 34550 }, { "epoch": 0.029852858195472374, "grad_norm": 2.5963392999983625e-06, "learning_rate": 1.940796639044185e-05, "loss": 0.0195, "step": 34600 }, { "epoch": 0.02989599816396294, "grad_norm": 0.0003348338359501213, "learning_rate": 1.9407103367687394e-05, "loss": 0.0289, "step": 34650 }, { "epoch": 0.029939138132453507, "grad_norm": 6.386066436767578, "learning_rate": 1.9406240344932938e-05, "loss": 0.0308, "step": 34700 }, { "epoch": 0.029982278100944074, "grad_norm": 0.00012195282033644617, "learning_rate": 1.9405377322178478e-05, "loss": 0.0522, "step": 34750 }, { "epoch": 0.030025418069434644, "grad_norm": 0.0025203858967870474, "learning_rate": 1.940451429942402e-05, "loss": 0.0275, "step": 34800 }, { "epoch": 0.03006855803792521, "grad_norm": 4.238718820381848e-10, "learning_rate": 1.940365127666956e-05, "loss": 0.0164, "step": 34850 }, { "epoch": 0.030111698006415777, "grad_norm": 1.477847000330712e-08, "learning_rate": 1.9402788253915104e-05, "loss": 0.0227, "step": 34900 }, { "epoch": 0.030154837974906343, "grad_norm": 8.416482621953492e-09, "learning_rate": 1.9401925231160644e-05, "loss": 0.0379, "step": 34950 }, { "epoch": 0.03019797794339691, "grad_norm": 2.9379866646195296e-06, "learning_rate": 1.940106220840619e-05, "loss": 0.0449, "step": 35000 }, { "epoch": 0.030241117911887477, "grad_norm": 13.662910461425781, "learning_rate": 1.940019918565173e-05, "loss": 0.0245, "step": 35050 }, { "epoch": 0.030284257880378043, "grad_norm": 2.694193881325191e-06, "learning_rate": 1.9399336162897274e-05, "loss": 0.0231, "step": 35100 }, { "epoch": 0.03032739784886861, "grad_norm": 19.55348014831543, "learning_rate": 1.9398473140142814e-05, "loss": 0.0253, "step": 35150 }, { "epoch": 0.03037053781735918, "grad_norm": 7.588599970631549e-09, "learning_rate": 1.9397610117388358e-05, "loss": 0.026, "step": 35200 }, { "epoch": 0.030413677785849746, "grad_norm": 6.923779882761494e-10, "learning_rate": 1.9396747094633898e-05, "loss": 0.008, "step": 35250 }, { "epoch": 0.030456817754340313, "grad_norm": 5.178381456971692e-07, "learning_rate": 1.939588407187944e-05, "loss": 0.0512, "step": 35300 }, { "epoch": 0.03049995772283088, "grad_norm": 3.179905760930524e-08, "learning_rate": 1.939502104912498e-05, "loss": 0.0314, "step": 35350 }, { "epoch": 0.030543097691321446, "grad_norm": 0.00010464258957654238, "learning_rate": 1.9394158026370524e-05, "loss": 0.0015, "step": 35400 }, { "epoch": 0.030586237659812013, "grad_norm": 11.300006866455078, "learning_rate": 1.9393295003616067e-05, "loss": 0.0298, "step": 35450 }, { "epoch": 0.03062937762830258, "grad_norm": 1.0112120918392975e-07, "learning_rate": 1.9392431980861607e-05, "loss": 0.0235, "step": 35500 }, { "epoch": 0.030672517596793146, "grad_norm": 0.0002930278715211898, "learning_rate": 1.939156895810715e-05, "loss": 0.0422, "step": 35550 }, { "epoch": 0.030715657565283716, "grad_norm": 3.265949146680214e-07, "learning_rate": 1.9390705935352694e-05, "loss": 0.0453, "step": 35600 }, { "epoch": 0.030758797533774283, "grad_norm": 0.01071107853204012, "learning_rate": 1.9389842912598234e-05, "loss": 0.0088, "step": 35650 }, { "epoch": 0.03080193750226485, "grad_norm": 2.198061288538611e-09, "learning_rate": 1.9388979889843777e-05, "loss": 0.0344, "step": 35700 }, { "epoch": 0.030845077470755416, "grad_norm": 2.0178050874619657e-07, "learning_rate": 1.938811686708932e-05, "loss": 0.0112, "step": 35750 }, { "epoch": 0.030888217439245982, "grad_norm": 0.03751551732420921, "learning_rate": 1.938725384433486e-05, "loss": 0.0112, "step": 35800 }, { "epoch": 0.03093135740773655, "grad_norm": 0.00011108023318229243, "learning_rate": 1.9386390821580404e-05, "loss": 0.0275, "step": 35850 }, { "epoch": 0.030974497376227116, "grad_norm": 1.5553026644354873e-09, "learning_rate": 1.9385527798825944e-05, "loss": 0.0118, "step": 35900 }, { "epoch": 0.031017637344717682, "grad_norm": 2.6839693418878596e-06, "learning_rate": 1.9384664776071487e-05, "loss": 0.0054, "step": 35950 }, { "epoch": 0.03106077731320825, "grad_norm": 2.178272318076324e-08, "learning_rate": 1.9383801753317027e-05, "loss": 0.0331, "step": 36000 }, { "epoch": 0.03110391728169882, "grad_norm": 2.3207785204704123e-07, "learning_rate": 1.9382938730562574e-05, "loss": 0.0102, "step": 36050 }, { "epoch": 0.031147057250189385, "grad_norm": 1.738131345518923e-07, "learning_rate": 1.9382075707808114e-05, "loss": 0.0588, "step": 36100 }, { "epoch": 0.031190197218679952, "grad_norm": 0.019147371873259544, "learning_rate": 1.9381212685053657e-05, "loss": 0.043, "step": 36150 }, { "epoch": 0.03123333718717052, "grad_norm": 0.0022545859683305025, "learning_rate": 1.9380349662299197e-05, "loss": 0.0191, "step": 36200 }, { "epoch": 0.031276477155661085, "grad_norm": 0.00014786762767471373, "learning_rate": 1.937948663954474e-05, "loss": 0.0016, "step": 36250 }, { "epoch": 0.031319617124151655, "grad_norm": 1.8323513018003723e-07, "learning_rate": 1.937862361679028e-05, "loss": 0.0007, "step": 36300 }, { "epoch": 0.03136275709264222, "grad_norm": 15.16702651977539, "learning_rate": 1.9377760594035824e-05, "loss": 0.0363, "step": 36350 }, { "epoch": 0.03140589706113279, "grad_norm": 0.061391185969114304, "learning_rate": 1.9376897571281367e-05, "loss": 0.0393, "step": 36400 }, { "epoch": 0.03144903702962335, "grad_norm": 0.0035098083317279816, "learning_rate": 1.9376034548526907e-05, "loss": 0.0147, "step": 36450 }, { "epoch": 0.03149217699811392, "grad_norm": 0.06623140722513199, "learning_rate": 1.937517152577245e-05, "loss": 0.0543, "step": 36500 }, { "epoch": 0.031535316966604485, "grad_norm": 8.011748832359444e-06, "learning_rate": 1.937430850301799e-05, "loss": 0.0447, "step": 36550 }, { "epoch": 0.031578456935095055, "grad_norm": 2.976227278850274e-06, "learning_rate": 1.9373445480263534e-05, "loss": 0.0238, "step": 36600 }, { "epoch": 0.031621596903585625, "grad_norm": 4.54370677971383e-07, "learning_rate": 1.9372582457509077e-05, "loss": 0.0282, "step": 36650 }, { "epoch": 0.03166473687207619, "grad_norm": 1.2593355247503268e-09, "learning_rate": 1.937171943475462e-05, "loss": 0.0475, "step": 36700 }, { "epoch": 0.03170787684056676, "grad_norm": 0.0001775699929567054, "learning_rate": 1.937085641200016e-05, "loss": 0.0005, "step": 36750 }, { "epoch": 0.03175101680905732, "grad_norm": 1.9041050336454646e-07, "learning_rate": 1.9369993389245704e-05, "loss": 0.0008, "step": 36800 }, { "epoch": 0.03179415677754789, "grad_norm": 0.0002166083868360147, "learning_rate": 1.9369130366491244e-05, "loss": 0.0064, "step": 36850 }, { "epoch": 0.031837296746038454, "grad_norm": 2.4730157921482032e-09, "learning_rate": 1.9368267343736787e-05, "loss": 0.0747, "step": 36900 }, { "epoch": 0.031880436714529024, "grad_norm": 6.864386705274228e-06, "learning_rate": 1.9367404320982327e-05, "loss": 0.0022, "step": 36950 }, { "epoch": 0.03192357668301959, "grad_norm": 2.638907517393818e-06, "learning_rate": 1.936654129822787e-05, "loss": 0.0239, "step": 37000 }, { "epoch": 0.03196671665151016, "grad_norm": 8.631070522824302e-05, "learning_rate": 1.9365678275473413e-05, "loss": 0.0191, "step": 37050 }, { "epoch": 0.03200985662000073, "grad_norm": 14.52698802947998, "learning_rate": 1.9364815252718953e-05, "loss": 0.0188, "step": 37100 }, { "epoch": 0.03205299658849129, "grad_norm": 0.07407932728528976, "learning_rate": 1.9363952229964497e-05, "loss": 0.0136, "step": 37150 }, { "epoch": 0.03209613655698186, "grad_norm": 0.002848062664270401, "learning_rate": 1.936308920721004e-05, "loss": 0.0451, "step": 37200 }, { "epoch": 0.032139276525472424, "grad_norm": 2.2414766931433405e-07, "learning_rate": 1.936222618445558e-05, "loss": 0.0395, "step": 37250 }, { "epoch": 0.032182416493962994, "grad_norm": 5.524349830920983e-07, "learning_rate": 1.9361363161701123e-05, "loss": 0.0468, "step": 37300 }, { "epoch": 0.03222555646245356, "grad_norm": 2.2004120182828046e-05, "learning_rate": 1.9360500138946667e-05, "loss": 0.0599, "step": 37350 }, { "epoch": 0.03226869643094413, "grad_norm": 5.064206831661977e-08, "learning_rate": 1.9359637116192207e-05, "loss": 0.0191, "step": 37400 }, { "epoch": 0.0323118363994347, "grad_norm": 5.038096060161479e-05, "learning_rate": 1.935877409343775e-05, "loss": 0.0094, "step": 37450 }, { "epoch": 0.03235497636792526, "grad_norm": 0.002139901742339134, "learning_rate": 1.935791107068329e-05, "loss": 0.0026, "step": 37500 }, { "epoch": 0.03239811633641583, "grad_norm": 0.025793571025133133, "learning_rate": 1.9357048047928833e-05, "loss": 0.0503, "step": 37550 }, { "epoch": 0.03244125630490639, "grad_norm": 1.497374176979065, "learning_rate": 1.9356185025174373e-05, "loss": 0.0239, "step": 37600 }, { "epoch": 0.032484396273396964, "grad_norm": 9.68094241216022e-07, "learning_rate": 1.935532200241992e-05, "loss": 0.0362, "step": 37650 }, { "epoch": 0.03252753624188753, "grad_norm": 7.437192266479542e-07, "learning_rate": 1.935445897966546e-05, "loss": 0.0174, "step": 37700 }, { "epoch": 0.0325706762103781, "grad_norm": 1.591896947594762e-09, "learning_rate": 1.9353595956911003e-05, "loss": 0.0253, "step": 37750 }, { "epoch": 0.03261381617886866, "grad_norm": 14.039113998413086, "learning_rate": 1.9352732934156543e-05, "loss": 0.0201, "step": 37800 }, { "epoch": 0.03265695614735923, "grad_norm": 2.0073053747182712e-05, "learning_rate": 1.9351869911402087e-05, "loss": 0.043, "step": 37850 }, { "epoch": 0.0327000961158498, "grad_norm": 1.3844499768822516e-08, "learning_rate": 1.9351006888647626e-05, "loss": 0.007, "step": 37900 }, { "epoch": 0.03274323608434036, "grad_norm": 0.02289557084441185, "learning_rate": 1.935014386589317e-05, "loss": 0.0268, "step": 37950 }, { "epoch": 0.03278637605283093, "grad_norm": 2.7390053766729316e-11, "learning_rate": 1.934928084313871e-05, "loss": 0.0141, "step": 38000 }, { "epoch": 0.032829516021321496, "grad_norm": 2.0595265937117802e-07, "learning_rate": 1.9348417820384253e-05, "loss": 0.1202, "step": 38050 }, { "epoch": 0.032872655989812066, "grad_norm": 0.00014018621004652232, "learning_rate": 1.9347554797629796e-05, "loss": 0.0277, "step": 38100 }, { "epoch": 0.03291579595830263, "grad_norm": 9.558748570270836e-05, "learning_rate": 1.9346691774875336e-05, "loss": 0.0498, "step": 38150 }, { "epoch": 0.0329589359267932, "grad_norm": 2.20267253325801e-07, "learning_rate": 1.934582875212088e-05, "loss": 0.0357, "step": 38200 }, { "epoch": 0.03300207589528377, "grad_norm": 0.002117832424119115, "learning_rate": 1.9344965729366423e-05, "loss": 0.0478, "step": 38250 }, { "epoch": 0.03304521586377433, "grad_norm": 0.0015125697245821357, "learning_rate": 1.9344102706611963e-05, "loss": 0.0049, "step": 38300 }, { "epoch": 0.0330883558322649, "grad_norm": 0.001929111429490149, "learning_rate": 1.9343239683857506e-05, "loss": 0.0321, "step": 38350 }, { "epoch": 0.033131495800755466, "grad_norm": 14.052818298339844, "learning_rate": 1.934237666110305e-05, "loss": 0.0126, "step": 38400 }, { "epoch": 0.033174635769246036, "grad_norm": 0.04780351743102074, "learning_rate": 1.934151363834859e-05, "loss": 0.0145, "step": 38450 }, { "epoch": 0.0332177757377366, "grad_norm": 1.625859908926941e-07, "learning_rate": 1.9340650615594133e-05, "loss": 0.0006, "step": 38500 }, { "epoch": 0.03326091570622717, "grad_norm": 4.171390173723921e-06, "learning_rate": 1.9339787592839673e-05, "loss": 0.0052, "step": 38550 }, { "epoch": 0.03330405567471773, "grad_norm": 9.933991532307118e-05, "learning_rate": 1.9338924570085216e-05, "loss": 0.0149, "step": 38600 }, { "epoch": 0.0333471956432083, "grad_norm": 5.527433510899016e-10, "learning_rate": 1.9338061547330756e-05, "loss": 0.0569, "step": 38650 }, { "epoch": 0.03339033561169887, "grad_norm": 1.7711924149566016e-09, "learning_rate": 1.9337198524576303e-05, "loss": 0.0089, "step": 38700 }, { "epoch": 0.033433475580189435, "grad_norm": 5.876652497960322e-09, "learning_rate": 1.9336335501821843e-05, "loss": 0.0412, "step": 38750 }, { "epoch": 0.033476615548680005, "grad_norm": 1.2611899375915527, "learning_rate": 1.9335472479067386e-05, "loss": 0.0057, "step": 38800 }, { "epoch": 0.03351975551717057, "grad_norm": 0.00011541438288986683, "learning_rate": 1.9334609456312926e-05, "loss": 0.0264, "step": 38850 }, { "epoch": 0.03356289548566114, "grad_norm": 0.7902683019638062, "learning_rate": 1.933374643355847e-05, "loss": 0.0269, "step": 38900 }, { "epoch": 0.0336060354541517, "grad_norm": 1.6534098904230632e-05, "learning_rate": 1.933288341080401e-05, "loss": 0.0041, "step": 38950 }, { "epoch": 0.03364917542264227, "grad_norm": 0.029098449274897575, "learning_rate": 1.9332020388049553e-05, "loss": 0.0208, "step": 39000 }, { "epoch": 0.033692315391132835, "grad_norm": 0.0004794780688825995, "learning_rate": 1.9331157365295096e-05, "loss": 0.0595, "step": 39050 }, { "epoch": 0.033735455359623405, "grad_norm": 16.320070266723633, "learning_rate": 1.9330294342540636e-05, "loss": 0.0735, "step": 39100 }, { "epoch": 0.033778595328113975, "grad_norm": 3.635158840609165e-09, "learning_rate": 1.932943131978618e-05, "loss": 0.0164, "step": 39150 }, { "epoch": 0.03382173529660454, "grad_norm": 1.406357796440716e-06, "learning_rate": 1.932856829703172e-05, "loss": 0.0237, "step": 39200 }, { "epoch": 0.03386487526509511, "grad_norm": 0.05031180754303932, "learning_rate": 1.9327705274277263e-05, "loss": 0.0264, "step": 39250 }, { "epoch": 0.03390801523358567, "grad_norm": 0.022205352783203125, "learning_rate": 1.9326842251522806e-05, "loss": 0.0076, "step": 39300 }, { "epoch": 0.03395115520207624, "grad_norm": 3.1427214707946405e-05, "learning_rate": 1.932597922876835e-05, "loss": 0.0093, "step": 39350 }, { "epoch": 0.033994295170566804, "grad_norm": 0.0015017461264505982, "learning_rate": 1.932511620601389e-05, "loss": 0.0016, "step": 39400 }, { "epoch": 0.034037435139057375, "grad_norm": 3.3295341483885466e-10, "learning_rate": 1.9324253183259433e-05, "loss": 0.0297, "step": 39450 }, { "epoch": 0.034080575107547945, "grad_norm": 1.431539747853705e-10, "learning_rate": 1.9323390160504972e-05, "loss": 0.008, "step": 39500 }, { "epoch": 0.03412371507603851, "grad_norm": 9.472168188695562e-11, "learning_rate": 1.9322527137750516e-05, "loss": 0.0526, "step": 39550 }, { "epoch": 0.03416685504452908, "grad_norm": 1.1010347078510563e-09, "learning_rate": 1.9321664114996056e-05, "loss": 0.0438, "step": 39600 }, { "epoch": 0.03420999501301964, "grad_norm": 0.0038324242923408747, "learning_rate": 1.93208010922416e-05, "loss": 0.0068, "step": 39650 }, { "epoch": 0.03425313498151021, "grad_norm": 1.2454121067762003e-10, "learning_rate": 1.9319938069487142e-05, "loss": 0.0105, "step": 39700 }, { "epoch": 0.034296274950000774, "grad_norm": 3.0910987103283105e-09, "learning_rate": 1.9319075046732682e-05, "loss": 0.003, "step": 39750 }, { "epoch": 0.034339414918491344, "grad_norm": 0.20711366832256317, "learning_rate": 1.9318212023978226e-05, "loss": 0.0072, "step": 39800 }, { "epoch": 0.03438255488698191, "grad_norm": 0.0013983896933495998, "learning_rate": 1.931734900122377e-05, "loss": 0.0357, "step": 39850 }, { "epoch": 0.03442569485547248, "grad_norm": 1.195646859741828e-06, "learning_rate": 1.931648597846931e-05, "loss": 0.0252, "step": 39900 }, { "epoch": 0.03446883482396305, "grad_norm": 0.0007419702014885843, "learning_rate": 1.9315622955714852e-05, "loss": 0.0203, "step": 39950 }, { "epoch": 0.03451197479245361, "grad_norm": 1.9399341908865608e-05, "learning_rate": 1.9314759932960396e-05, "loss": 0.0273, "step": 40000 } ], "logging_steps": 50, "max_steps": 1159018, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }