{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 23.42834830144475, "eval_steps": 500, "global_step": 300000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007809449433814916, "grad_norm": 1.4658414125442505, "learning_rate": 2.0000000000000003e-06, "loss": 3.3302, "step": 100 }, { "epoch": 0.015618898867629832, "grad_norm": 1.1347604990005493, "learning_rate": 4.000000000000001e-06, "loss": 2.9274, "step": 200 }, { "epoch": 0.02342834830144475, "grad_norm": 1.399733543395996, "learning_rate": 6e-06, "loss": 2.7919, "step": 300 }, { "epoch": 0.031237797735259663, "grad_norm": 0.8615509271621704, "learning_rate": 8.000000000000001e-06, "loss": 2.7518, "step": 400 }, { "epoch": 0.03904724716907458, "grad_norm": 1.3015568256378174, "learning_rate": 1e-05, "loss": 2.7169, "step": 500 }, { "epoch": 0.0468566966028895, "grad_norm": 1.1654436588287354, "learning_rate": 1.2e-05, "loss": 2.6892, "step": 600 }, { "epoch": 0.05466614603670441, "grad_norm": 1.9601017236709595, "learning_rate": 1.4e-05, "loss": 2.6643, "step": 700 }, { "epoch": 0.06247559547051933, "grad_norm": 1.8341138362884521, "learning_rate": 1.6000000000000003e-05, "loss": 2.6494, "step": 800 }, { "epoch": 0.07028504490433425, "grad_norm": 1.137752890586853, "learning_rate": 1.8e-05, "loss": 2.6348, "step": 900 }, { "epoch": 0.07809449433814916, "grad_norm": 1.324123501777649, "learning_rate": 2e-05, "loss": 2.6144, "step": 1000 }, { "epoch": 0.08590394377196407, "grad_norm": 1.3983900547027588, "learning_rate": 1.9998436889409928e-05, "loss": 2.598, "step": 1100 }, { "epoch": 0.093713393205779, "grad_norm": 1.4354324340820312, "learning_rate": 1.9996873778819854e-05, "loss": 2.5849, "step": 1200 }, { "epoch": 0.10152284263959391, "grad_norm": 1.3982596397399902, "learning_rate": 1.999531066822978e-05, "loss": 2.5705, "step": 1300 }, { "epoch": 0.10933229207340882, "grad_norm": 1.9624176025390625, "learning_rate": 1.9993747557639706e-05, "loss": 2.5423, "step": 1400 }, { "epoch": 0.11714174150722374, "grad_norm": 1.2586028575897217, "learning_rate": 1.9992184447049628e-05, "loss": 2.5274, "step": 1500 }, { "epoch": 0.12495119094103865, "grad_norm": 1.7528750896453857, "learning_rate": 1.9990621336459558e-05, "loss": 2.476, "step": 1600 }, { "epoch": 0.13276064037485358, "grad_norm": 1.7756469249725342, "learning_rate": 1.9989058225869484e-05, "loss": 2.4652, "step": 1700 }, { "epoch": 0.1405700898086685, "grad_norm": 1.7584046125411987, "learning_rate": 1.9987495115279406e-05, "loss": 2.4154, "step": 1800 }, { "epoch": 0.1483795392424834, "grad_norm": 1.531298041343689, "learning_rate": 1.9985932004689332e-05, "loss": 2.3776, "step": 1900 }, { "epoch": 0.15618898867629832, "grad_norm": 3.2747278213500977, "learning_rate": 1.998436889409926e-05, "loss": 2.3426, "step": 2000 }, { "epoch": 0.16399843811011325, "grad_norm": 1.5152462720870972, "learning_rate": 1.9982805783509184e-05, "loss": 2.3311, "step": 2100 }, { "epoch": 0.17180788754392814, "grad_norm": 3.3594422340393066, "learning_rate": 1.998124267291911e-05, "loss": 2.2926, "step": 2200 }, { "epoch": 0.17961733697774307, "grad_norm": 1.8210939168930054, "learning_rate": 1.9979679562329036e-05, "loss": 2.2611, "step": 2300 }, { "epoch": 0.187426786411558, "grad_norm": 1.4974194765090942, "learning_rate": 1.9978116451738962e-05, "loss": 2.2428, "step": 2400 }, { "epoch": 0.1952362358453729, "grad_norm": 2.1313698291778564, "learning_rate": 1.997655334114889e-05, "loss": 2.2123, "step": 2500 }, { "epoch": 0.20304568527918782, "grad_norm": 2.3703184127807617, "learning_rate": 1.9974990230558814e-05, "loss": 2.168, "step": 2600 }, { "epoch": 0.21085513471300274, "grad_norm": 2.0529401302337646, "learning_rate": 1.997342711996874e-05, "loss": 2.1258, "step": 2700 }, { "epoch": 0.21866458414681764, "grad_norm": 1.7897934913635254, "learning_rate": 1.9971864009378666e-05, "loss": 2.0991, "step": 2800 }, { "epoch": 0.22647403358063256, "grad_norm": 4.2363386154174805, "learning_rate": 1.997030089878859e-05, "loss": 2.0567, "step": 2900 }, { "epoch": 0.23428348301444749, "grad_norm": 4.278024673461914, "learning_rate": 1.9968737788198515e-05, "loss": 2.0318, "step": 3000 }, { "epoch": 0.2420929324482624, "grad_norm": 2.8552427291870117, "learning_rate": 1.9967174677608444e-05, "loss": 1.9907, "step": 3100 }, { "epoch": 0.2499023818820773, "grad_norm": 3.2664151191711426, "learning_rate": 1.9965611567018367e-05, "loss": 1.9173, "step": 3200 }, { "epoch": 0.2577118313158922, "grad_norm": 3.365929365158081, "learning_rate": 1.9964048456428293e-05, "loss": 1.8902, "step": 3300 }, { "epoch": 0.26552128074970716, "grad_norm": 2.942408800125122, "learning_rate": 1.996248534583822e-05, "loss": 1.8659, "step": 3400 }, { "epoch": 0.27333073018352205, "grad_norm": 2.636049747467041, "learning_rate": 1.9960922235248145e-05, "loss": 1.8498, "step": 3500 }, { "epoch": 0.281140179617337, "grad_norm": 2.335026502609253, "learning_rate": 1.995935912465807e-05, "loss": 1.7999, "step": 3600 }, { "epoch": 0.2889496290511519, "grad_norm": 3.5235695838928223, "learning_rate": 1.9957796014067997e-05, "loss": 1.7668, "step": 3700 }, { "epoch": 0.2967590784849668, "grad_norm": 3.085439682006836, "learning_rate": 1.9956232903477923e-05, "loss": 1.7068, "step": 3800 }, { "epoch": 0.30456852791878175, "grad_norm": 2.5960071086883545, "learning_rate": 1.995466979288785e-05, "loss": 1.6931, "step": 3900 }, { "epoch": 0.31237797735259665, "grad_norm": 1.9345512390136719, "learning_rate": 1.9953106682297775e-05, "loss": 1.6449, "step": 4000 }, { "epoch": 0.32018742678641154, "grad_norm": 2.4250986576080322, "learning_rate": 1.9951543571707698e-05, "loss": 1.6116, "step": 4100 }, { "epoch": 0.3279968762202265, "grad_norm": 2.81168794631958, "learning_rate": 1.9949980461117627e-05, "loss": 1.5684, "step": 4200 }, { "epoch": 0.3358063256540414, "grad_norm": 3.2198565006256104, "learning_rate": 1.994841735052755e-05, "loss": 1.5581, "step": 4300 }, { "epoch": 0.3436157750878563, "grad_norm": 2.893419027328491, "learning_rate": 1.9946854239937476e-05, "loss": 1.5005, "step": 4400 }, { "epoch": 0.35142522452167124, "grad_norm": 2.161217212677002, "learning_rate": 1.9945291129347405e-05, "loss": 1.5025, "step": 4500 }, { "epoch": 0.35923467395548614, "grad_norm": 2.2876691818237305, "learning_rate": 1.9943728018757328e-05, "loss": 1.4733, "step": 4600 }, { "epoch": 0.36704412338930104, "grad_norm": 2.560692548751831, "learning_rate": 1.9942164908167254e-05, "loss": 1.4473, "step": 4700 }, { "epoch": 0.374853572823116, "grad_norm": 2.703275680541992, "learning_rate": 1.994060179757718e-05, "loss": 1.4112, "step": 4800 }, { "epoch": 0.3826630222569309, "grad_norm": 2.5784506797790527, "learning_rate": 1.9939038686987106e-05, "loss": 1.4035, "step": 4900 }, { "epoch": 0.3904724716907458, "grad_norm": 2.6702706813812256, "learning_rate": 1.9937475576397032e-05, "loss": 1.3877, "step": 5000 }, { "epoch": 0.39828192112456073, "grad_norm": 2.5400278568267822, "learning_rate": 1.9935912465806958e-05, "loss": 1.3443, "step": 5100 }, { "epoch": 0.40609137055837563, "grad_norm": 2.167583703994751, "learning_rate": 1.9934349355216884e-05, "loss": 1.3422, "step": 5200 }, { "epoch": 0.4139008199921905, "grad_norm": 3.1610641479492188, "learning_rate": 1.993278624462681e-05, "loss": 1.3036, "step": 5300 }, { "epoch": 0.4217102694260055, "grad_norm": 2.5936696529388428, "learning_rate": 1.9931223134036736e-05, "loss": 1.2936, "step": 5400 }, { "epoch": 0.4295197188598204, "grad_norm": 2.189955234527588, "learning_rate": 1.992966002344666e-05, "loss": 1.2722, "step": 5500 }, { "epoch": 0.4373291682936353, "grad_norm": 2.7976956367492676, "learning_rate": 1.9928096912856588e-05, "loss": 1.2519, "step": 5600 }, { "epoch": 0.4451386177274502, "grad_norm": 2.2419660091400146, "learning_rate": 1.992653380226651e-05, "loss": 1.2393, "step": 5700 }, { "epoch": 0.4529480671612651, "grad_norm": 2.1277241706848145, "learning_rate": 1.9924970691676437e-05, "loss": 1.2274, "step": 5800 }, { "epoch": 0.4607575165950801, "grad_norm": 3.7499144077301025, "learning_rate": 1.9923407581086363e-05, "loss": 1.2307, "step": 5900 }, { "epoch": 0.46856696602889497, "grad_norm": 1.9480825662612915, "learning_rate": 1.992184447049629e-05, "loss": 1.2134, "step": 6000 }, { "epoch": 0.47637641546270987, "grad_norm": 2.120570659637451, "learning_rate": 1.9920281359906215e-05, "loss": 1.2117, "step": 6100 }, { "epoch": 0.4841858648965248, "grad_norm": 2.7811381816864014, "learning_rate": 1.991871824931614e-05, "loss": 1.1805, "step": 6200 }, { "epoch": 0.4919953143303397, "grad_norm": 1.9131306409835815, "learning_rate": 1.9917155138726067e-05, "loss": 1.181, "step": 6300 }, { "epoch": 0.4998047637641546, "grad_norm": 1.955204963684082, "learning_rate": 1.9915592028135993e-05, "loss": 1.1594, "step": 6400 }, { "epoch": 0.5076142131979695, "grad_norm": 2.049238920211792, "learning_rate": 1.991402891754592e-05, "loss": 1.16, "step": 6500 }, { "epoch": 0.5154236626317844, "grad_norm": 2.059785842895508, "learning_rate": 1.991246580695584e-05, "loss": 1.1147, "step": 6600 }, { "epoch": 0.5232331120655994, "grad_norm": 2.1609349250793457, "learning_rate": 1.991090269636577e-05, "loss": 1.1218, "step": 6700 }, { "epoch": 0.5310425614994143, "grad_norm": 2.1594114303588867, "learning_rate": 1.9909339585775697e-05, "loss": 1.1246, "step": 6800 }, { "epoch": 0.5388520109332292, "grad_norm": 2.033703565597534, "learning_rate": 1.990777647518562e-05, "loss": 1.0668, "step": 6900 }, { "epoch": 0.5466614603670441, "grad_norm": 2.054765224456787, "learning_rate": 1.9906213364595545e-05, "loss": 1.104, "step": 7000 }, { "epoch": 0.554470909800859, "grad_norm": 2.6854045391082764, "learning_rate": 1.9904650254005475e-05, "loss": 1.0963, "step": 7100 }, { "epoch": 0.562280359234674, "grad_norm": 2.482316255569458, "learning_rate": 1.9903087143415397e-05, "loss": 1.0836, "step": 7200 }, { "epoch": 0.5700898086684889, "grad_norm": 1.9816139936447144, "learning_rate": 1.9901524032825323e-05, "loss": 1.0585, "step": 7300 }, { "epoch": 0.5778992581023038, "grad_norm": 2.2517287731170654, "learning_rate": 1.989996092223525e-05, "loss": 1.0604, "step": 7400 }, { "epoch": 0.5857087075361187, "grad_norm": 1.8760857582092285, "learning_rate": 1.9898397811645175e-05, "loss": 1.0492, "step": 7500 }, { "epoch": 0.5935181569699336, "grad_norm": 2.0815093517303467, "learning_rate": 1.98968347010551e-05, "loss": 1.0513, "step": 7600 }, { "epoch": 0.6013276064037485, "grad_norm": 2.0560126304626465, "learning_rate": 1.9895271590465027e-05, "loss": 1.0356, "step": 7700 }, { "epoch": 0.6091370558375635, "grad_norm": 1.6335766315460205, "learning_rate": 1.9893708479874953e-05, "loss": 1.0171, "step": 7800 }, { "epoch": 0.6169465052713784, "grad_norm": 2.0025687217712402, "learning_rate": 1.989214536928488e-05, "loss": 1.005, "step": 7900 }, { "epoch": 0.6247559547051933, "grad_norm": 2.0700294971466064, "learning_rate": 1.9890582258694805e-05, "loss": 1.0341, "step": 8000 }, { "epoch": 0.6325654041390082, "grad_norm": 1.6764856576919556, "learning_rate": 1.9889019148104728e-05, "loss": 1.0041, "step": 8100 }, { "epoch": 0.6403748535728231, "grad_norm": 1.821441411972046, "learning_rate": 1.9887456037514657e-05, "loss": 1.0123, "step": 8200 }, { "epoch": 0.648184303006638, "grad_norm": 1.8293089866638184, "learning_rate": 1.988589292692458e-05, "loss": 0.9997, "step": 8300 }, { "epoch": 0.655993752440453, "grad_norm": 1.7432034015655518, "learning_rate": 1.9884329816334506e-05, "loss": 0.9687, "step": 8400 }, { "epoch": 0.6638032018742679, "grad_norm": 1.683962345123291, "learning_rate": 1.9882766705744436e-05, "loss": 0.9589, "step": 8500 }, { "epoch": 0.6716126513080828, "grad_norm": 2.0143861770629883, "learning_rate": 1.9881203595154358e-05, "loss": 0.9536, "step": 8600 }, { "epoch": 0.6794221007418977, "grad_norm": 1.668605923652649, "learning_rate": 1.9879640484564284e-05, "loss": 0.9362, "step": 8700 }, { "epoch": 0.6872315501757126, "grad_norm": 2.569770574569702, "learning_rate": 1.987807737397421e-05, "loss": 0.9344, "step": 8800 }, { "epoch": 0.6950409996095275, "grad_norm": 2.044370412826538, "learning_rate": 1.9876514263384136e-05, "loss": 0.9277, "step": 8900 }, { "epoch": 0.7028504490433425, "grad_norm": 1.6726328134536743, "learning_rate": 1.9874951152794062e-05, "loss": 0.938, "step": 9000 }, { "epoch": 0.7106598984771574, "grad_norm": 1.9856268167495728, "learning_rate": 1.9873388042203988e-05, "loss": 0.9366, "step": 9100 }, { "epoch": 0.7184693479109723, "grad_norm": 2.2362923622131348, "learning_rate": 1.987182493161391e-05, "loss": 0.9515, "step": 9200 }, { "epoch": 0.7262787973447872, "grad_norm": 1.8397703170776367, "learning_rate": 1.987027745212974e-05, "loss": 0.9332, "step": 9300 }, { "epoch": 0.7340882467786021, "grad_norm": 1.7469147443771362, "learning_rate": 1.9868714341539666e-05, "loss": 0.9441, "step": 9400 }, { "epoch": 0.7418976962124171, "grad_norm": 2.095268726348877, "learning_rate": 1.9867151230949592e-05, "loss": 0.9272, "step": 9500 }, { "epoch": 0.749707145646232, "grad_norm": 1.8756574392318726, "learning_rate": 1.9865588120359518e-05, "loss": 0.8917, "step": 9600 }, { "epoch": 0.7575165950800469, "grad_norm": 1.924744725227356, "learning_rate": 1.9864025009769444e-05, "loss": 0.9079, "step": 9700 }, { "epoch": 0.7653260445138618, "grad_norm": 1.5487234592437744, "learning_rate": 1.9862461899179367e-05, "loss": 0.9002, "step": 9800 }, { "epoch": 0.7731354939476767, "grad_norm": 1.5049303770065308, "learning_rate": 1.9860898788589293e-05, "loss": 0.9027, "step": 9900 }, { "epoch": 0.7809449433814916, "grad_norm": 1.5578070878982544, "learning_rate": 1.9859335677999222e-05, "loss": 0.8865, "step": 10000 }, { "epoch": 0.7887543928153066, "grad_norm": 1.7090140581130981, "learning_rate": 1.9857772567409145e-05, "loss": 0.8829, "step": 10100 }, { "epoch": 0.7965638422491215, "grad_norm": 1.9182331562042236, "learning_rate": 1.985620945681907e-05, "loss": 0.8718, "step": 10200 }, { "epoch": 0.8043732916829364, "grad_norm": 1.6985232830047607, "learning_rate": 1.9854646346228997e-05, "loss": 0.875, "step": 10300 }, { "epoch": 0.8121827411167513, "grad_norm": 1.786824107170105, "learning_rate": 1.9853083235638923e-05, "loss": 0.8721, "step": 10400 }, { "epoch": 0.8199921905505662, "grad_norm": 1.4861210584640503, "learning_rate": 1.985152012504885e-05, "loss": 0.867, "step": 10500 }, { "epoch": 0.827801639984381, "grad_norm": 1.886149287223816, "learning_rate": 1.9849957014458775e-05, "loss": 0.856, "step": 10600 }, { "epoch": 0.8356110894181961, "grad_norm": 2.148075819015503, "learning_rate": 1.98483939038687e-05, "loss": 0.8496, "step": 10700 }, { "epoch": 0.843420538852011, "grad_norm": 1.4386200904846191, "learning_rate": 1.9846830793278627e-05, "loss": 0.8558, "step": 10800 }, { "epoch": 0.8512299882858259, "grad_norm": 1.9120664596557617, "learning_rate": 1.9845267682688553e-05, "loss": 0.8482, "step": 10900 }, { "epoch": 0.8590394377196408, "grad_norm": 1.9040182828903198, "learning_rate": 1.9843704572098476e-05, "loss": 0.8572, "step": 11000 }, { "epoch": 0.8668488871534556, "grad_norm": 2.2053062915802, "learning_rate": 1.9842141461508405e-05, "loss": 0.8764, "step": 11100 }, { "epoch": 0.8746583365872705, "grad_norm": 1.398203730583191, "learning_rate": 1.9840578350918328e-05, "loss": 0.8626, "step": 11200 }, { "epoch": 0.8824677860210856, "grad_norm": 1.7013752460479736, "learning_rate": 1.9839015240328254e-05, "loss": 0.8232, "step": 11300 }, { "epoch": 0.8902772354549005, "grad_norm": 1.5767678022384644, "learning_rate": 1.9837467760844083e-05, "loss": 0.8624, "step": 11400 }, { "epoch": 0.8980866848887153, "grad_norm": 1.8870518207550049, "learning_rate": 1.983590465025401e-05, "loss": 0.8552, "step": 11500 }, { "epoch": 0.9058961343225302, "grad_norm": 1.8952587842941284, "learning_rate": 1.983434153966393e-05, "loss": 0.8352, "step": 11600 }, { "epoch": 0.9137055837563451, "grad_norm": 2.1782963275909424, "learning_rate": 1.9832778429073858e-05, "loss": 0.8335, "step": 11700 }, { "epoch": 0.9215150331901601, "grad_norm": 1.6963413953781128, "learning_rate": 1.9831215318483784e-05, "loss": 0.8253, "step": 11800 }, { "epoch": 0.929324482623975, "grad_norm": 1.7919764518737793, "learning_rate": 1.982965220789371e-05, "loss": 0.8031, "step": 11900 }, { "epoch": 0.9371339320577899, "grad_norm": 1.9020565748214722, "learning_rate": 1.9828089097303636e-05, "loss": 0.8086, "step": 12000 }, { "epoch": 0.9449433814916048, "grad_norm": 1.617715835571289, "learning_rate": 1.982652598671356e-05, "loss": 0.807, "step": 12100 }, { "epoch": 0.9527528309254197, "grad_norm": 1.8193297386169434, "learning_rate": 1.9824962876123488e-05, "loss": 0.7864, "step": 12200 }, { "epoch": 0.9605622803592346, "grad_norm": 2.011845111846924, "learning_rate": 1.9823399765533414e-05, "loss": 0.8058, "step": 12300 }, { "epoch": 0.9683717297930496, "grad_norm": 2.0425360202789307, "learning_rate": 1.982183665494334e-05, "loss": 0.8175, "step": 12400 }, { "epoch": 0.9761811792268645, "grad_norm": 1.93047297000885, "learning_rate": 1.9820273544353266e-05, "loss": 0.7862, "step": 12500 }, { "epoch": 0.9839906286606794, "grad_norm": 2.4548439979553223, "learning_rate": 1.981871043376319e-05, "loss": 0.7839, "step": 12600 }, { "epoch": 0.9918000780944943, "grad_norm": 1.7791589498519897, "learning_rate": 1.9817147323173118e-05, "loss": 0.7722, "step": 12700 }, { "epoch": 0.9996095275283092, "grad_norm": 1.955427885055542, "learning_rate": 1.981558421258304e-05, "loss": 0.8095, "step": 12800 }, { "epoch": 1.0074189769621242, "grad_norm": 1.8196287155151367, "learning_rate": 1.981402110199297e-05, "loss": 0.7911, "step": 12900 }, { "epoch": 1.015228426395939, "grad_norm": 2.349574565887451, "learning_rate": 1.9812457991402892e-05, "loss": 0.803, "step": 13000 }, { "epoch": 1.023037875829754, "grad_norm": 1.7875525951385498, "learning_rate": 1.981089488081282e-05, "loss": 0.7797, "step": 13100 }, { "epoch": 1.0308473252635688, "grad_norm": 1.403671145439148, "learning_rate": 1.9809331770222744e-05, "loss": 0.7832, "step": 13200 }, { "epoch": 1.0386567746973838, "grad_norm": 1.5299042463302612, "learning_rate": 1.980776865963267e-05, "loss": 0.7699, "step": 13300 }, { "epoch": 1.0464662241311988, "grad_norm": 1.6570796966552734, "learning_rate": 1.9806205549042596e-05, "loss": 0.751, "step": 13400 }, { "epoch": 1.0542756735650136, "grad_norm": 2.0295419692993164, "learning_rate": 1.9804658069558422e-05, "loss": 0.7585, "step": 13500 }, { "epoch": 1.0620851229988286, "grad_norm": 1.412665843963623, "learning_rate": 1.9803094958968348e-05, "loss": 0.7887, "step": 13600 }, { "epoch": 1.0698945724326434, "grad_norm": 1.495368480682373, "learning_rate": 1.9801531848378274e-05, "loss": 0.7608, "step": 13700 }, { "epoch": 1.0777040218664584, "grad_norm": 1.5675642490386963, "learning_rate": 1.97999687377882e-05, "loss": 0.7448, "step": 13800 }, { "epoch": 1.0855134713002734, "grad_norm": 1.5208722352981567, "learning_rate": 1.9798405627198126e-05, "loss": 0.76, "step": 13900 }, { "epoch": 1.0933229207340882, "grad_norm": 1.5352216958999634, "learning_rate": 1.9796842516608052e-05, "loss": 0.7692, "step": 14000 }, { "epoch": 1.1011323701679032, "grad_norm": 1.8058335781097412, "learning_rate": 1.979527940601798e-05, "loss": 0.7488, "step": 14100 }, { "epoch": 1.108941819601718, "grad_norm": 1.7374639511108398, "learning_rate": 1.9793716295427904e-05, "loss": 0.7553, "step": 14200 }, { "epoch": 1.116751269035533, "grad_norm": 1.797935128211975, "learning_rate": 1.9792153184837827e-05, "loss": 0.7523, "step": 14300 }, { "epoch": 1.1245607184693478, "grad_norm": 1.7061059474945068, "learning_rate": 1.9790590074247756e-05, "loss": 0.7549, "step": 14400 }, { "epoch": 1.1323701679031628, "grad_norm": 1.8086135387420654, "learning_rate": 1.978902696365768e-05, "loss": 0.7544, "step": 14500 }, { "epoch": 1.1401796173369778, "grad_norm": 1.6291933059692383, "learning_rate": 1.9787463853067605e-05, "loss": 0.7407, "step": 14600 }, { "epoch": 1.1479890667707926, "grad_norm": 1.5078767538070679, "learning_rate": 1.9785900742477534e-05, "loss": 0.7087, "step": 14700 }, { "epoch": 1.1557985162046076, "grad_norm": 1.7376632690429688, "learning_rate": 1.9784337631887457e-05, "loss": 0.7432, "step": 14800 }, { "epoch": 1.1636079656384224, "grad_norm": 1.4681905508041382, "learning_rate": 1.9782774521297383e-05, "loss": 0.7228, "step": 14900 }, { "epoch": 1.1714174150722374, "grad_norm": 1.805963397026062, "learning_rate": 1.978121141070731e-05, "loss": 0.7408, "step": 15000 }, { "epoch": 1.1792268645060524, "grad_norm": 2.167956590652466, "learning_rate": 1.9779648300117235e-05, "loss": 0.7365, "step": 15100 }, { "epoch": 1.1870363139398672, "grad_norm": 1.7935293912887573, "learning_rate": 1.977808518952716e-05, "loss": 0.7361, "step": 15200 }, { "epoch": 1.1948457633736822, "grad_norm": 1.7757160663604736, "learning_rate": 1.9776522078937087e-05, "loss": 0.7229, "step": 15300 }, { "epoch": 1.202655212807497, "grad_norm": 2.04471755027771, "learning_rate": 1.977495896834701e-05, "loss": 0.7133, "step": 15400 }, { "epoch": 1.210464662241312, "grad_norm": 2.1509175300598145, "learning_rate": 1.977341148886284e-05, "loss": 0.7316, "step": 15500 }, { "epoch": 1.218274111675127, "grad_norm": 1.8622750043869019, "learning_rate": 1.9771848378272765e-05, "loss": 0.7517, "step": 15600 }, { "epoch": 1.2260835611089418, "grad_norm": 1.8098951578140259, "learning_rate": 1.977028526768269e-05, "loss": 0.695, "step": 15700 }, { "epoch": 1.2338930105427568, "grad_norm": 1.8343333005905151, "learning_rate": 1.9768722157092617e-05, "loss": 0.7092, "step": 15800 }, { "epoch": 1.2417024599765716, "grad_norm": 1.716015100479126, "learning_rate": 1.9767159046502543e-05, "loss": 0.6997, "step": 15900 }, { "epoch": 1.2495119094103866, "grad_norm": 1.668656349182129, "learning_rate": 1.9765595935912466e-05, "loss": 0.7041, "step": 16000 }, { "epoch": 1.2573213588442016, "grad_norm": 1.7509514093399048, "learning_rate": 1.9764032825322392e-05, "loss": 0.702, "step": 16100 }, { "epoch": 1.2651308082780164, "grad_norm": 1.7006629705429077, "learning_rate": 1.976246971473232e-05, "loss": 0.7071, "step": 16200 }, { "epoch": 1.2729402577118314, "grad_norm": 1.8491188287734985, "learning_rate": 1.9760906604142244e-05, "loss": 0.6678, "step": 16300 }, { "epoch": 1.2807497071456462, "grad_norm": 1.7705504894256592, "learning_rate": 1.975934349355217e-05, "loss": 0.688, "step": 16400 }, { "epoch": 1.2885591565794612, "grad_norm": 1.4014639854431152, "learning_rate": 1.9757780382962096e-05, "loss": 0.7038, "step": 16500 }, { "epoch": 1.2963686060132762, "grad_norm": 1.8170675039291382, "learning_rate": 1.9756217272372022e-05, "loss": 0.6787, "step": 16600 }, { "epoch": 1.304178055447091, "grad_norm": 1.4013011455535889, "learning_rate": 1.9754654161781948e-05, "loss": 0.6648, "step": 16700 }, { "epoch": 1.3119875048809058, "grad_norm": 1.41355299949646, "learning_rate": 1.9753091051191874e-05, "loss": 0.6895, "step": 16800 }, { "epoch": 1.3197969543147208, "grad_norm": 1.4750763177871704, "learning_rate": 1.97515279406018e-05, "loss": 0.6762, "step": 16900 }, { "epoch": 1.3276064037485358, "grad_norm": 1.596587896347046, "learning_rate": 1.9749964830011726e-05, "loss": 0.652, "step": 17000 }, { "epoch": 1.3354158531823506, "grad_norm": 1.7418196201324463, "learning_rate": 1.9748401719421652e-05, "loss": 0.6843, "step": 17100 }, { "epoch": 1.3432253026161656, "grad_norm": 1.8203010559082031, "learning_rate": 1.9746838608831574e-05, "loss": 0.6897, "step": 17200 }, { "epoch": 1.3510347520499804, "grad_norm": 1.56990647315979, "learning_rate": 1.9745275498241504e-05, "loss": 0.6736, "step": 17300 }, { "epoch": 1.3588442014837954, "grad_norm": 1.767688512802124, "learning_rate": 1.9743712387651426e-05, "loss": 0.6803, "step": 17400 }, { "epoch": 1.3666536509176104, "grad_norm": 1.43065345287323, "learning_rate": 1.9742149277061352e-05, "loss": 0.6546, "step": 17500 }, { "epoch": 1.3744631003514252, "grad_norm": 1.8510737419128418, "learning_rate": 1.974060179757718e-05, "loss": 0.6697, "step": 17600 }, { "epoch": 1.3822725497852402, "grad_norm": 1.7096539735794067, "learning_rate": 1.9739038686987108e-05, "loss": 0.653, "step": 17700 }, { "epoch": 1.390081999219055, "grad_norm": 1.6283639669418335, "learning_rate": 1.973747557639703e-05, "loss": 0.6663, "step": 17800 }, { "epoch": 1.39789144865287, "grad_norm": 1.794142723083496, "learning_rate": 1.9735912465806956e-05, "loss": 0.6689, "step": 17900 }, { "epoch": 1.405700898086685, "grad_norm": 2.0754244327545166, "learning_rate": 1.9734349355216882e-05, "loss": 0.6647, "step": 18000 }, { "epoch": 1.4135103475204998, "grad_norm": 1.6919443607330322, "learning_rate": 1.973278624462681e-05, "loss": 0.6597, "step": 18100 }, { "epoch": 1.4213197969543148, "grad_norm": 1.7112871408462524, "learning_rate": 1.9731223134036734e-05, "loss": 0.6515, "step": 18200 }, { "epoch": 1.4291292463881295, "grad_norm": 1.552016258239746, "learning_rate": 1.972966002344666e-05, "loss": 0.6585, "step": 18300 }, { "epoch": 1.4369386958219446, "grad_norm": 1.7915266752243042, "learning_rate": 1.9728096912856586e-05, "loss": 0.6549, "step": 18400 }, { "epoch": 1.4447481452557596, "grad_norm": 1.4755462408065796, "learning_rate": 1.9726533802266513e-05, "loss": 0.6448, "step": 18500 }, { "epoch": 1.4525575946895743, "grad_norm": 1.3579930067062378, "learning_rate": 1.972497069167644e-05, "loss": 0.6372, "step": 18600 }, { "epoch": 1.4603670441233894, "grad_norm": 2.1790480613708496, "learning_rate": 1.9723407581086365e-05, "loss": 0.6555, "step": 18700 }, { "epoch": 1.4681764935572041, "grad_norm": 1.4425419569015503, "learning_rate": 1.972184447049629e-05, "loss": 0.6531, "step": 18800 }, { "epoch": 1.4759859429910192, "grad_norm": 1.9316322803497314, "learning_rate": 1.9720281359906213e-05, "loss": 0.6457, "step": 18900 }, { "epoch": 1.4837953924248342, "grad_norm": 1.3997584581375122, "learning_rate": 1.971871824931614e-05, "loss": 0.6336, "step": 19000 }, { "epoch": 1.491604841858649, "grad_norm": 1.6301729679107666, "learning_rate": 1.971715513872607e-05, "loss": 0.6544, "step": 19100 }, { "epoch": 1.499414291292464, "grad_norm": 1.620894193649292, "learning_rate": 1.971559202813599e-05, "loss": 0.6339, "step": 19200 }, { "epoch": 1.5072237407262787, "grad_norm": 1.5146082639694214, "learning_rate": 1.9714028917545917e-05, "loss": 0.6573, "step": 19300 }, { "epoch": 1.5150331901600937, "grad_norm": 1.775248646736145, "learning_rate": 1.9712465806955843e-05, "loss": 0.6256, "step": 19400 }, { "epoch": 1.5228426395939088, "grad_norm": 1.5098567008972168, "learning_rate": 1.971090269636577e-05, "loss": 0.6453, "step": 19500 }, { "epoch": 1.5306520890277235, "grad_norm": 1.3362939357757568, "learning_rate": 1.9709339585775695e-05, "loss": 0.6157, "step": 19600 }, { "epoch": 1.5384615384615383, "grad_norm": 1.5511094331741333, "learning_rate": 1.970777647518562e-05, "loss": 0.662, "step": 19700 }, { "epoch": 1.5462709878953533, "grad_norm": 1.2386195659637451, "learning_rate": 1.9706228995701447e-05, "loss": 0.6227, "step": 19800 }, { "epoch": 1.5540804373291683, "grad_norm": 1.674869179725647, "learning_rate": 1.9704665885111373e-05, "loss": 0.6722, "step": 19900 }, { "epoch": 1.5618898867629833, "grad_norm": 2.106680393218994, "learning_rate": 1.97031027745213e-05, "loss": 0.6314, "step": 20000 }, { "epoch": 1.5696993361967981, "grad_norm": 1.7660146951675415, "learning_rate": 1.9701539663931225e-05, "loss": 0.6404, "step": 20100 }, { "epoch": 1.577508785630613, "grad_norm": 1.62801992893219, "learning_rate": 1.969997655334115e-05, "loss": 0.6167, "step": 20200 }, { "epoch": 1.585318235064428, "grad_norm": 1.5587072372436523, "learning_rate": 1.9698413442751077e-05, "loss": 0.5937, "step": 20300 }, { "epoch": 1.593127684498243, "grad_norm": 1.4757510423660278, "learning_rate": 1.9696850332161003e-05, "loss": 0.6012, "step": 20400 }, { "epoch": 1.600937133932058, "grad_norm": 1.6553717851638794, "learning_rate": 1.9695287221570926e-05, "loss": 0.6101, "step": 20500 }, { "epoch": 1.6087465833658727, "grad_norm": 1.7269705533981323, "learning_rate": 1.9693724110980855e-05, "loss": 0.6116, "step": 20600 }, { "epoch": 1.6165560327996875, "grad_norm": 1.8082709312438965, "learning_rate": 1.9692161000390778e-05, "loss": 0.6118, "step": 20700 }, { "epoch": 1.6243654822335025, "grad_norm": 1.6342484951019287, "learning_rate": 1.9690597889800704e-05, "loss": 0.6136, "step": 20800 }, { "epoch": 1.6321749316673175, "grad_norm": 1.4139142036437988, "learning_rate": 1.9689034779210633e-05, "loss": 0.5913, "step": 20900 }, { "epoch": 1.6399843811011323, "grad_norm": 1.691498041152954, "learning_rate": 1.9687471668620556e-05, "loss": 0.6069, "step": 21000 }, { "epoch": 1.6477938305349473, "grad_norm": 1.6393285989761353, "learning_rate": 1.9685908558030482e-05, "loss": 0.6143, "step": 21100 }, { "epoch": 1.655603279968762, "grad_norm": 1.4533542394638062, "learning_rate": 1.9684345447440408e-05, "loss": 0.6041, "step": 21200 }, { "epoch": 1.6634127294025771, "grad_norm": 1.7511471509933472, "learning_rate": 1.9682782336850334e-05, "loss": 0.6059, "step": 21300 }, { "epoch": 1.6712221788363921, "grad_norm": 1.6999716758728027, "learning_rate": 1.968121922626026e-05, "loss": 0.6234, "step": 21400 }, { "epoch": 1.679031628270207, "grad_norm": 1.5730124711990356, "learning_rate": 1.9679656115670186e-05, "loss": 0.609, "step": 21500 }, { "epoch": 1.686841077704022, "grad_norm": 1.657443642616272, "learning_rate": 1.967809300508011e-05, "loss": 0.612, "step": 21600 }, { "epoch": 1.6946505271378367, "grad_norm": 1.9014359712600708, "learning_rate": 1.9676529894490038e-05, "loss": 0.5932, "step": 21700 }, { "epoch": 1.7024599765716517, "grad_norm": 1.1905044317245483, "learning_rate": 1.9674966783899964e-05, "loss": 0.5974, "step": 21800 }, { "epoch": 1.7102694260054667, "grad_norm": 1.6017484664916992, "learning_rate": 1.9673403673309887e-05, "loss": 0.5872, "step": 21900 }, { "epoch": 1.7180788754392815, "grad_norm": 1.674900770187378, "learning_rate": 1.9671856193825716e-05, "loss": 0.6008, "step": 22000 }, { "epoch": 1.7258883248730963, "grad_norm": 1.4483686685562134, "learning_rate": 1.9670293083235642e-05, "loss": 0.5826, "step": 22100 }, { "epoch": 1.7336977743069113, "grad_norm": 1.7324950695037842, "learning_rate": 1.9668729972645565e-05, "loss": 0.6, "step": 22200 }, { "epoch": 1.7415072237407263, "grad_norm": 2.0077033042907715, "learning_rate": 1.966716686205549e-05, "loss": 0.5906, "step": 22300 }, { "epoch": 1.7493166731745413, "grad_norm": 1.6235421895980835, "learning_rate": 1.966560375146542e-05, "loss": 0.6161, "step": 22400 }, { "epoch": 1.757126122608356, "grad_norm": 1.6211363077163696, "learning_rate": 1.9664040640875343e-05, "loss": 0.584, "step": 22500 }, { "epoch": 1.7649355720421709, "grad_norm": 1.4874013662338257, "learning_rate": 1.966247753028527e-05, "loss": 0.5883, "step": 22600 }, { "epoch": 1.772745021475986, "grad_norm": 1.7945176362991333, "learning_rate": 1.9660914419695195e-05, "loss": 0.5939, "step": 22700 }, { "epoch": 1.780554470909801, "grad_norm": 1.5537645816802979, "learning_rate": 1.965935130910512e-05, "loss": 0.5916, "step": 22800 }, { "epoch": 1.788363920343616, "grad_norm": 1.5249923467636108, "learning_rate": 1.9657788198515047e-05, "loss": 0.5751, "step": 22900 }, { "epoch": 1.7961733697774307, "grad_norm": 1.017974615097046, "learning_rate": 1.9656225087924973e-05, "loss": 0.5799, "step": 23000 }, { "epoch": 1.8039828192112455, "grad_norm": 1.6119155883789062, "learning_rate": 1.96546619773349e-05, "loss": 0.5852, "step": 23100 }, { "epoch": 1.8117922686450605, "grad_norm": 1.5619168281555176, "learning_rate": 1.9653098866744825e-05, "loss": 0.581, "step": 23200 }, { "epoch": 1.8196017180788755, "grad_norm": 1.7065399885177612, "learning_rate": 1.965153575615475e-05, "loss": 0.5917, "step": 23300 }, { "epoch": 1.8274111675126905, "grad_norm": 1.5742697715759277, "learning_rate": 1.9649972645564673e-05, "loss": 0.5745, "step": 23400 }, { "epoch": 1.8352206169465053, "grad_norm": 1.9307541847229004, "learning_rate": 1.9648409534974603e-05, "loss": 0.5748, "step": 23500 }, { "epoch": 1.84303006638032, "grad_norm": 1.4289742708206177, "learning_rate": 1.9646846424384525e-05, "loss": 0.5657, "step": 23600 }, { "epoch": 1.850839515814135, "grad_norm": 1.5857402086257935, "learning_rate": 1.964528331379445e-05, "loss": 0.5535, "step": 23700 }, { "epoch": 1.85864896524795, "grad_norm": 1.8342182636260986, "learning_rate": 1.9643720203204377e-05, "loss": 0.5758, "step": 23800 }, { "epoch": 1.866458414681765, "grad_norm": 1.7719389200210571, "learning_rate": 1.9642157092614303e-05, "loss": 0.5925, "step": 23900 }, { "epoch": 1.8742678641155799, "grad_norm": 1.4628547430038452, "learning_rate": 1.964059398202423e-05, "loss": 0.5815, "step": 24000 }, { "epoch": 1.8820773135493947, "grad_norm": 1.7836227416992188, "learning_rate": 1.9639030871434155e-05, "loss": 0.5608, "step": 24100 }, { "epoch": 1.8898867629832097, "grad_norm": 1.4340027570724487, "learning_rate": 1.963748339194998e-05, "loss": 0.5669, "step": 24200 }, { "epoch": 1.8976962124170247, "grad_norm": 1.8303942680358887, "learning_rate": 1.9635920281359907e-05, "loss": 0.5532, "step": 24300 }, { "epoch": 1.9055056618508395, "grad_norm": 1.656308650970459, "learning_rate": 1.9634357170769833e-05, "loss": 0.5686, "step": 24400 }, { "epoch": 1.9133151112846545, "grad_norm": 1.7295633554458618, "learning_rate": 1.963279406017976e-05, "loss": 0.5715, "step": 24500 }, { "epoch": 1.9211245607184693, "grad_norm": 1.872109293937683, "learning_rate": 1.9631230949589685e-05, "loss": 0.55, "step": 24600 }, { "epoch": 1.9289340101522843, "grad_norm": 1.568379282951355, "learning_rate": 1.962966783899961e-05, "loss": 0.5669, "step": 24700 }, { "epoch": 1.9367434595860993, "grad_norm": 1.252161979675293, "learning_rate": 1.9628104728409537e-05, "loss": 0.5588, "step": 24800 }, { "epoch": 1.944552909019914, "grad_norm": 1.4522193670272827, "learning_rate": 1.9626541617819463e-05, "loss": 0.5669, "step": 24900 }, { "epoch": 1.9523623584537289, "grad_norm": 1.6285183429718018, "learning_rate": 1.962497850722939e-05, "loss": 0.5381, "step": 25000 }, { "epoch": 1.9601718078875439, "grad_norm": 1.4675999879837036, "learning_rate": 1.9623415396639312e-05, "loss": 0.5656, "step": 25100 }, { "epoch": 1.9679812573213589, "grad_norm": 1.4689419269561768, "learning_rate": 1.9621852286049238e-05, "loss": 0.5677, "step": 25200 }, { "epoch": 1.9757907067551739, "grad_norm": 1.5669220685958862, "learning_rate": 1.9620289175459167e-05, "loss": 0.5549, "step": 25300 }, { "epoch": 1.9836001561889887, "grad_norm": 1.3576539754867554, "learning_rate": 1.961872606486909e-05, "loss": 0.5409, "step": 25400 }, { "epoch": 1.9914096056228034, "grad_norm": 1.6081891059875488, "learning_rate": 1.9617162954279016e-05, "loss": 0.5733, "step": 25500 }, { "epoch": 1.9992190550566185, "grad_norm": 1.2406030893325806, "learning_rate": 1.9615599843688942e-05, "loss": 0.5461, "step": 25600 }, { "epoch": 2.0070285044904335, "grad_norm": 1.4401236772537231, "learning_rate": 1.9614036733098868e-05, "loss": 0.5613, "step": 25700 }, { "epoch": 2.0148379539242485, "grad_norm": 1.7388012409210205, "learning_rate": 1.9612473622508794e-05, "loss": 0.5468, "step": 25800 }, { "epoch": 2.0226474033580635, "grad_norm": 1.2448303699493408, "learning_rate": 1.961091051191872e-05, "loss": 0.5399, "step": 25900 }, { "epoch": 2.030456852791878, "grad_norm": 1.4686857461929321, "learning_rate": 1.9609347401328646e-05, "loss": 0.5671, "step": 26000 }, { "epoch": 2.038266302225693, "grad_norm": 1.6793551445007324, "learning_rate": 1.9607784290738572e-05, "loss": 0.557, "step": 26100 }, { "epoch": 2.046075751659508, "grad_norm": 1.5726957321166992, "learning_rate": 1.9606221180148498e-05, "loss": 0.5456, "step": 26200 }, { "epoch": 2.053885201093323, "grad_norm": 1.5355794429779053, "learning_rate": 1.960465806955842e-05, "loss": 0.5424, "step": 26300 }, { "epoch": 2.0616946505271376, "grad_norm": 1.3061555624008179, "learning_rate": 1.960309495896835e-05, "loss": 0.531, "step": 26400 }, { "epoch": 2.0695040999609526, "grad_norm": 1.2583160400390625, "learning_rate": 1.9601531848378276e-05, "loss": 0.5421, "step": 26500 }, { "epoch": 2.0773135493947676, "grad_norm": 1.578881025314331, "learning_rate": 1.95999687377882e-05, "loss": 0.5453, "step": 26600 }, { "epoch": 2.0851229988285827, "grad_norm": 1.240598201751709, "learning_rate": 1.9598405627198125e-05, "loss": 0.5509, "step": 26700 }, { "epoch": 2.0929324482623977, "grad_norm": 1.6285960674285889, "learning_rate": 1.959684251660805e-05, "loss": 0.5509, "step": 26800 }, { "epoch": 2.1007418976962122, "grad_norm": 1.7065869569778442, "learning_rate": 1.9595279406017977e-05, "loss": 0.5367, "step": 26900 }, { "epoch": 2.1085513471300272, "grad_norm": 1.1276403665542603, "learning_rate": 1.9593731926533803e-05, "loss": 0.5394, "step": 27000 }, { "epoch": 2.1163607965638422, "grad_norm": 1.3322559595108032, "learning_rate": 1.9592168815943732e-05, "loss": 0.5393, "step": 27100 }, { "epoch": 2.1241702459976572, "grad_norm": 1.2126073837280273, "learning_rate": 1.9590605705353655e-05, "loss": 0.5299, "step": 27200 }, { "epoch": 2.1319796954314723, "grad_norm": 1.2466312646865845, "learning_rate": 1.958904259476358e-05, "loss": 0.5389, "step": 27300 }, { "epoch": 2.139789144865287, "grad_norm": 1.2818732261657715, "learning_rate": 1.9587479484173507e-05, "loss": 0.5344, "step": 27400 }, { "epoch": 2.147598594299102, "grad_norm": 1.2986412048339844, "learning_rate": 1.9585916373583433e-05, "loss": 0.5332, "step": 27500 }, { "epoch": 2.155408043732917, "grad_norm": 1.1982731819152832, "learning_rate": 1.958435326299336e-05, "loss": 0.5448, "step": 27600 }, { "epoch": 2.163217493166732, "grad_norm": 1.0930027961730957, "learning_rate": 1.9582790152403285e-05, "loss": 0.5396, "step": 27700 }, { "epoch": 2.171026942600547, "grad_norm": 1.175718069076538, "learning_rate": 1.9581227041813208e-05, "loss": 0.5245, "step": 27800 }, { "epoch": 2.1788363920343614, "grad_norm": 1.9274697303771973, "learning_rate": 1.9579663931223137e-05, "loss": 0.5346, "step": 27900 }, { "epoch": 2.1866458414681764, "grad_norm": 1.5411278009414673, "learning_rate": 1.9578100820633063e-05, "loss": 0.5537, "step": 28000 }, { "epoch": 2.1944552909019914, "grad_norm": 1.6423187255859375, "learning_rate": 1.9576537710042986e-05, "loss": 0.526, "step": 28100 }, { "epoch": 2.2022647403358064, "grad_norm": 1.412419319152832, "learning_rate": 1.9574974599452915e-05, "loss": 0.5279, "step": 28200 }, { "epoch": 2.2100741897696214, "grad_norm": 1.2464630603790283, "learning_rate": 1.9573411488862838e-05, "loss": 0.5505, "step": 28300 }, { "epoch": 2.217883639203436, "grad_norm": 1.537739872932434, "learning_rate": 1.9571848378272764e-05, "loss": 0.5296, "step": 28400 }, { "epoch": 2.225693088637251, "grad_norm": 1.4067381620407104, "learning_rate": 1.957030089878859e-05, "loss": 0.5252, "step": 28500 }, { "epoch": 2.233502538071066, "grad_norm": 1.662540078163147, "learning_rate": 1.956873778819852e-05, "loss": 0.5329, "step": 28600 }, { "epoch": 2.241311987504881, "grad_norm": 1.4438475370407104, "learning_rate": 1.956717467760844e-05, "loss": 0.5289, "step": 28700 }, { "epoch": 2.2491214369386956, "grad_norm": 1.293503999710083, "learning_rate": 1.9565611567018368e-05, "loss": 0.522, "step": 28800 }, { "epoch": 2.2569308863725106, "grad_norm": 1.2435001134872437, "learning_rate": 1.9564048456428294e-05, "loss": 0.534, "step": 28900 }, { "epoch": 2.2647403358063256, "grad_norm": 1.3798662424087524, "learning_rate": 1.956248534583822e-05, "loss": 0.5188, "step": 29000 }, { "epoch": 2.2725497852401406, "grad_norm": 1.6525225639343262, "learning_rate": 1.9560922235248146e-05, "loss": 0.5463, "step": 29100 }, { "epoch": 2.2803592346739556, "grad_norm": 1.3094666004180908, "learning_rate": 1.955935912465807e-05, "loss": 0.5279, "step": 29200 }, { "epoch": 2.28816868410777, "grad_norm": 1.3461250066757202, "learning_rate": 1.9557796014067998e-05, "loss": 0.5428, "step": 29300 }, { "epoch": 2.295978133541585, "grad_norm": 1.3624392747879028, "learning_rate": 1.9556232903477924e-05, "loss": 0.5061, "step": 29400 }, { "epoch": 2.3037875829754, "grad_norm": 1.327601671218872, "learning_rate": 1.955466979288785e-05, "loss": 0.5279, "step": 29500 }, { "epoch": 2.311597032409215, "grad_norm": 1.3067333698272705, "learning_rate": 1.9553106682297772e-05, "loss": 0.5338, "step": 29600 }, { "epoch": 2.31940648184303, "grad_norm": 1.455754041671753, "learning_rate": 1.95515435717077e-05, "loss": 0.5233, "step": 29700 }, { "epoch": 2.327215931276845, "grad_norm": 1.3276084661483765, "learning_rate": 1.9549980461117624e-05, "loss": 0.5119, "step": 29800 }, { "epoch": 2.33502538071066, "grad_norm": 1.1605360507965088, "learning_rate": 1.954841735052755e-05, "loss": 0.5075, "step": 29900 }, { "epoch": 2.342834830144475, "grad_norm": 1.316475749015808, "learning_rate": 1.9546854239937476e-05, "loss": 0.5072, "step": 30000 }, { "epoch": 2.35064427957829, "grad_norm": 1.1585702896118164, "learning_rate": 1.9545291129347402e-05, "loss": 0.5042, "step": 30100 }, { "epoch": 2.358453729012105, "grad_norm": 1.2547882795333862, "learning_rate": 1.9543728018757328e-05, "loss": 0.5152, "step": 30200 }, { "epoch": 2.3662631784459194, "grad_norm": 1.4408245086669922, "learning_rate": 1.9542164908167254e-05, "loss": 0.5051, "step": 30300 }, { "epoch": 2.3740726278797344, "grad_norm": 1.256428837776184, "learning_rate": 1.954060179757718e-05, "loss": 0.5213, "step": 30400 }, { "epoch": 2.3818820773135494, "grad_norm": 1.3025598526000977, "learning_rate": 1.9539038686987106e-05, "loss": 0.5218, "step": 30500 }, { "epoch": 2.3896915267473644, "grad_norm": 1.343064546585083, "learning_rate": 1.9537475576397032e-05, "loss": 0.5128, "step": 30600 }, { "epoch": 2.3975009761811794, "grad_norm": 1.4012939929962158, "learning_rate": 1.9535912465806955e-05, "loss": 0.5071, "step": 30700 }, { "epoch": 2.405310425614994, "grad_norm": 1.2314071655273438, "learning_rate": 1.9534349355216884e-05, "loss": 0.5171, "step": 30800 }, { "epoch": 2.413119875048809, "grad_norm": 1.502560019493103, "learning_rate": 1.953278624462681e-05, "loss": 0.5123, "step": 30900 }, { "epoch": 2.420929324482624, "grad_norm": 1.1332653760910034, "learning_rate": 1.9531223134036733e-05, "loss": 0.5016, "step": 31000 }, { "epoch": 2.428738773916439, "grad_norm": 1.2297821044921875, "learning_rate": 1.952966002344666e-05, "loss": 0.5217, "step": 31100 }, { "epoch": 2.436548223350254, "grad_norm": 1.3758752346038818, "learning_rate": 1.9528096912856585e-05, "loss": 0.5096, "step": 31200 }, { "epoch": 2.4443576727840686, "grad_norm": 1.2562918663024902, "learning_rate": 1.952653380226651e-05, "loss": 0.5033, "step": 31300 }, { "epoch": 2.4521671222178836, "grad_norm": 1.238236904144287, "learning_rate": 1.9524970691676437e-05, "loss": 0.5022, "step": 31400 }, { "epoch": 2.4599765716516986, "grad_norm": 1.1074494123458862, "learning_rate": 1.9523407581086363e-05, "loss": 0.5028, "step": 31500 }, { "epoch": 2.4677860210855136, "grad_norm": 1.2257527112960815, "learning_rate": 1.952184447049629e-05, "loss": 0.5175, "step": 31600 }, { "epoch": 2.4755954705193286, "grad_norm": 1.3490757942199707, "learning_rate": 1.9520281359906215e-05, "loss": 0.5097, "step": 31700 }, { "epoch": 2.483404919953143, "grad_norm": 1.2227071523666382, "learning_rate": 1.951871824931614e-05, "loss": 0.5318, "step": 31800 }, { "epoch": 2.491214369386958, "grad_norm": 1.5109745264053345, "learning_rate": 1.9517155138726067e-05, "loss": 0.5092, "step": 31900 }, { "epoch": 2.499023818820773, "grad_norm": 1.3931546211242676, "learning_rate": 1.9515592028135993e-05, "loss": 0.5014, "step": 32000 }, { "epoch": 2.506833268254588, "grad_norm": 1.350993275642395, "learning_rate": 1.951402891754592e-05, "loss": 0.4969, "step": 32100 }, { "epoch": 2.514642717688403, "grad_norm": 1.1680783033370972, "learning_rate": 1.9512465806955845e-05, "loss": 0.5155, "step": 32200 }, { "epoch": 2.5224521671222178, "grad_norm": 1.8324127197265625, "learning_rate": 1.951090269636577e-05, "loss": 0.5005, "step": 32300 }, { "epoch": 2.5302616165560328, "grad_norm": 1.244734525680542, "learning_rate": 1.9509339585775694e-05, "loss": 0.5115, "step": 32400 }, { "epoch": 2.5380710659898478, "grad_norm": 1.1397197246551514, "learning_rate": 1.950777647518562e-05, "loss": 0.4764, "step": 32500 }, { "epoch": 2.545880515423663, "grad_norm": 1.2140659093856812, "learning_rate": 1.950621336459555e-05, "loss": 0.5141, "step": 32600 }, { "epoch": 2.553689964857478, "grad_norm": 1.2659382820129395, "learning_rate": 1.9504665885111375e-05, "loss": 0.5002, "step": 32700 }, { "epoch": 2.5614994142912924, "grad_norm": 1.4882584810256958, "learning_rate": 1.9503102774521298e-05, "loss": 0.5061, "step": 32800 }, { "epoch": 2.5693088637251074, "grad_norm": 1.4329360723495483, "learning_rate": 1.9501539663931224e-05, "loss": 0.5204, "step": 32900 }, { "epoch": 2.5771183131589224, "grad_norm": 1.341886281967163, "learning_rate": 1.949997655334115e-05, "loss": 0.5008, "step": 33000 }, { "epoch": 2.5849277625927374, "grad_norm": 1.3239928483963013, "learning_rate": 1.9498413442751076e-05, "loss": 0.4847, "step": 33100 }, { "epoch": 2.5927372120265524, "grad_norm": 1.525991678237915, "learning_rate": 1.9496850332161002e-05, "loss": 0.4975, "step": 33200 }, { "epoch": 2.600546661460367, "grad_norm": 1.2678552865982056, "learning_rate": 1.9495287221570928e-05, "loss": 0.4888, "step": 33300 }, { "epoch": 2.608356110894182, "grad_norm": 1.1323553323745728, "learning_rate": 1.9493724110980854e-05, "loss": 0.5244, "step": 33400 }, { "epoch": 2.616165560327997, "grad_norm": 1.548802137374878, "learning_rate": 1.949216100039078e-05, "loss": 0.4921, "step": 33500 }, { "epoch": 2.6239750097618115, "grad_norm": 1.1082913875579834, "learning_rate": 1.9490597889800706e-05, "loss": 0.492, "step": 33600 }, { "epoch": 2.631784459195627, "grad_norm": 1.1769174337387085, "learning_rate": 1.9489034779210632e-05, "loss": 0.4915, "step": 33700 }, { "epoch": 2.6395939086294415, "grad_norm": 1.846003770828247, "learning_rate": 1.9487471668620558e-05, "loss": 0.5014, "step": 33800 }, { "epoch": 2.6474033580632566, "grad_norm": 1.341156244277954, "learning_rate": 1.948590855803048e-05, "loss": 0.5003, "step": 33900 }, { "epoch": 2.6552128074970716, "grad_norm": 1.592016577720642, "learning_rate": 1.9484345447440406e-05, "loss": 0.5017, "step": 34000 }, { "epoch": 2.663022256930886, "grad_norm": 1.320487380027771, "learning_rate": 1.9482782336850336e-05, "loss": 0.4892, "step": 34100 }, { "epoch": 2.670831706364701, "grad_norm": 1.2302286624908447, "learning_rate": 1.948121922626026e-05, "loss": 0.4963, "step": 34200 }, { "epoch": 2.678641155798516, "grad_norm": 1.1193184852600098, "learning_rate": 1.9479656115670185e-05, "loss": 0.4925, "step": 34300 }, { "epoch": 2.686450605232331, "grad_norm": 1.0363550186157227, "learning_rate": 1.947809300508011e-05, "loss": 0.4986, "step": 34400 }, { "epoch": 2.694260054666146, "grad_norm": 1.32547926902771, "learning_rate": 1.9476529894490037e-05, "loss": 0.475, "step": 34500 }, { "epoch": 2.7020695040999607, "grad_norm": 1.1739405393600464, "learning_rate": 1.9474966783899963e-05, "loss": 0.5149, "step": 34600 }, { "epoch": 2.7098789535337757, "grad_norm": 1.2484989166259766, "learning_rate": 1.947340367330989e-05, "loss": 0.4906, "step": 34700 }, { "epoch": 2.7176884029675907, "grad_norm": 1.2752107381820679, "learning_rate": 1.9471840562719815e-05, "loss": 0.4871, "step": 34800 }, { "epoch": 2.7254978524014057, "grad_norm": 1.3706623315811157, "learning_rate": 1.947027745212974e-05, "loss": 0.4849, "step": 34900 }, { "epoch": 2.7333073018352207, "grad_norm": 1.2365776300430298, "learning_rate": 1.9468714341539667e-05, "loss": 0.495, "step": 35000 }, { "epoch": 2.7411167512690353, "grad_norm": 1.2424877882003784, "learning_rate": 1.946715123094959e-05, "loss": 0.4832, "step": 35100 }, { "epoch": 2.7489262007028503, "grad_norm": 1.2801834344863892, "learning_rate": 1.946558812035952e-05, "loss": 0.4754, "step": 35200 }, { "epoch": 2.7567356501366653, "grad_norm": 1.2843778133392334, "learning_rate": 1.946402500976944e-05, "loss": 0.4698, "step": 35300 }, { "epoch": 2.7645450995704803, "grad_norm": 1.2793940305709839, "learning_rate": 1.9462461899179367e-05, "loss": 0.4825, "step": 35400 }, { "epoch": 2.7723545490042953, "grad_norm": 1.1678388118743896, "learning_rate": 1.9460898788589297e-05, "loss": 0.4886, "step": 35500 }, { "epoch": 2.78016399843811, "grad_norm": 0.9187774658203125, "learning_rate": 1.945933567799922e-05, "loss": 0.4884, "step": 35600 }, { "epoch": 2.787973447871925, "grad_norm": 1.246982216835022, "learning_rate": 1.9457772567409145e-05, "loss": 0.4809, "step": 35700 }, { "epoch": 2.79578289730574, "grad_norm": 1.5185933113098145, "learning_rate": 1.945620945681907e-05, "loss": 0.4741, "step": 35800 }, { "epoch": 2.803592346739555, "grad_norm": 1.198704481124878, "learning_rate": 1.9454646346228997e-05, "loss": 0.4753, "step": 35900 }, { "epoch": 2.81140179617337, "grad_norm": 1.0310161113739014, "learning_rate": 1.9453098866744823e-05, "loss": 0.4826, "step": 36000 }, { "epoch": 2.8192112456071845, "grad_norm": 1.30093514919281, "learning_rate": 1.945153575615475e-05, "loss": 0.4695, "step": 36100 }, { "epoch": 2.8270206950409995, "grad_norm": 1.268122673034668, "learning_rate": 1.9449972645564675e-05, "loss": 0.4944, "step": 36200 }, { "epoch": 2.8348301444748145, "grad_norm": 1.3303180932998657, "learning_rate": 1.94484095349746e-05, "loss": 0.4773, "step": 36300 }, { "epoch": 2.8426395939086295, "grad_norm": 1.1513735055923462, "learning_rate": 1.9446846424384527e-05, "loss": 0.4672, "step": 36400 }, { "epoch": 2.8504490433424445, "grad_norm": 1.1249432563781738, "learning_rate": 1.9445283313794453e-05, "loss": 0.496, "step": 36500 }, { "epoch": 2.858258492776259, "grad_norm": 1.6745643615722656, "learning_rate": 1.944372020320438e-05, "loss": 0.465, "step": 36600 }, { "epoch": 2.866067942210074, "grad_norm": 1.2362009286880493, "learning_rate": 1.9442157092614305e-05, "loss": 0.4788, "step": 36700 }, { "epoch": 2.873877391643889, "grad_norm": 1.3232982158660889, "learning_rate": 1.9440593982024228e-05, "loss": 0.4748, "step": 36800 }, { "epoch": 2.881686841077704, "grad_norm": 1.561949610710144, "learning_rate": 1.9439030871434154e-05, "loss": 0.4854, "step": 36900 }, { "epoch": 2.889496290511519, "grad_norm": 1.1138705015182495, "learning_rate": 1.9437467760844083e-05, "loss": 0.47, "step": 37000 }, { "epoch": 2.8973057399453337, "grad_norm": 1.2688237428665161, "learning_rate": 1.9435904650254006e-05, "loss": 0.4795, "step": 37100 }, { "epoch": 2.9051151893791487, "grad_norm": 1.3343580961227417, "learning_rate": 1.9434341539663932e-05, "loss": 0.4688, "step": 37200 }, { "epoch": 2.9129246388129637, "grad_norm": 1.3715122938156128, "learning_rate": 1.9432778429073858e-05, "loss": 0.4846, "step": 37300 }, { "epoch": 2.9207340882467787, "grad_norm": 1.0764130353927612, "learning_rate": 1.9431215318483784e-05, "loss": 0.4589, "step": 37400 }, { "epoch": 2.9285435376805937, "grad_norm": 1.2759395837783813, "learning_rate": 1.942965220789371e-05, "loss": 0.4878, "step": 37500 }, { "epoch": 2.9363529871144083, "grad_norm": 1.2582157850265503, "learning_rate": 1.9428089097303636e-05, "loss": 0.4744, "step": 37600 }, { "epoch": 2.9441624365482233, "grad_norm": 1.045397400856018, "learning_rate": 1.9426525986713562e-05, "loss": 0.4836, "step": 37700 }, { "epoch": 2.9519718859820383, "grad_norm": 1.208304524421692, "learning_rate": 1.9424962876123488e-05, "loss": 0.4806, "step": 37800 }, { "epoch": 2.9597813354158533, "grad_norm": 1.1622886657714844, "learning_rate": 1.9423399765533414e-05, "loss": 0.4873, "step": 37900 }, { "epoch": 2.9675907848496683, "grad_norm": 1.1261911392211914, "learning_rate": 1.9421836654943337e-05, "loss": 0.4802, "step": 38000 }, { "epoch": 2.975400234283483, "grad_norm": 1.0438706874847412, "learning_rate": 1.9420273544353266e-05, "loss": 0.4859, "step": 38100 }, { "epoch": 2.983209683717298, "grad_norm": 1.171201467514038, "learning_rate": 1.9418710433763192e-05, "loss": 0.4822, "step": 38200 }, { "epoch": 2.991019133151113, "grad_norm": 0.9960982799530029, "learning_rate": 1.9417147323173115e-05, "loss": 0.4654, "step": 38300 }, { "epoch": 2.998828582584928, "grad_norm": 1.3679864406585693, "learning_rate": 1.9415584212583044e-05, "loss": 0.4717, "step": 38400 }, { "epoch": 3.0066380320187425, "grad_norm": 1.395412564277649, "learning_rate": 1.9414021101992967e-05, "loss": 0.4774, "step": 38500 }, { "epoch": 3.0144474814525575, "grad_norm": 0.8553999662399292, "learning_rate": 1.9412457991402893e-05, "loss": 0.4585, "step": 38600 }, { "epoch": 3.0222569308863725, "grad_norm": 1.151389718055725, "learning_rate": 1.941089488081282e-05, "loss": 0.4901, "step": 38700 }, { "epoch": 3.0300663803201875, "grad_norm": 1.1624094247817993, "learning_rate": 1.9409331770222745e-05, "loss": 0.4848, "step": 38800 }, { "epoch": 3.0378758297540025, "grad_norm": 1.263749599456787, "learning_rate": 1.940778429073857e-05, "loss": 0.4683, "step": 38900 }, { "epoch": 3.045685279187817, "grad_norm": 1.2082384824752808, "learning_rate": 1.9406221180148497e-05, "loss": 0.4698, "step": 39000 }, { "epoch": 3.053494728621632, "grad_norm": 1.1123636960983276, "learning_rate": 1.9404658069558423e-05, "loss": 0.4646, "step": 39100 }, { "epoch": 3.061304178055447, "grad_norm": 1.2531005144119263, "learning_rate": 1.940309495896835e-05, "loss": 0.4601, "step": 39200 }, { "epoch": 3.069113627489262, "grad_norm": 1.287048578262329, "learning_rate": 1.9401531848378275e-05, "loss": 0.4497, "step": 39300 }, { "epoch": 3.076923076923077, "grad_norm": 0.9623894691467285, "learning_rate": 1.93999687377882e-05, "loss": 0.4635, "step": 39400 }, { "epoch": 3.0847325263568917, "grad_norm": 1.3144161701202393, "learning_rate": 1.9398405627198127e-05, "loss": 0.4593, "step": 39500 }, { "epoch": 3.0925419757907067, "grad_norm": 1.362608551979065, "learning_rate": 1.9396842516608053e-05, "loss": 0.4626, "step": 39600 }, { "epoch": 3.1003514252245217, "grad_norm": 1.1824091672897339, "learning_rate": 1.939527940601798e-05, "loss": 0.4717, "step": 39700 }, { "epoch": 3.1081608746583367, "grad_norm": 1.4997671842575073, "learning_rate": 1.93937162954279e-05, "loss": 0.486, "step": 39800 }, { "epoch": 3.1159703240921517, "grad_norm": 1.056342363357544, "learning_rate": 1.939215318483783e-05, "loss": 0.4927, "step": 39900 }, { "epoch": 3.1237797735259663, "grad_norm": 1.4581704139709473, "learning_rate": 1.9390590074247753e-05, "loss": 0.4802, "step": 40000 }, { "epoch": 3.1315892229597813, "grad_norm": 1.2977889776229858, "learning_rate": 1.938902696365768e-05, "loss": 0.4667, "step": 40100 }, { "epoch": 3.1393986723935963, "grad_norm": 1.1020759344100952, "learning_rate": 1.9387463853067605e-05, "loss": 0.4625, "step": 40200 }, { "epoch": 3.1472081218274113, "grad_norm": 1.1267211437225342, "learning_rate": 1.938590074247753e-05, "loss": 0.465, "step": 40300 }, { "epoch": 3.1550175712612263, "grad_norm": 0.9519909024238586, "learning_rate": 1.9384337631887457e-05, "loss": 0.4798, "step": 40400 }, { "epoch": 3.162827020695041, "grad_norm": 1.512546420097351, "learning_rate": 1.9382774521297383e-05, "loss": 0.4498, "step": 40500 }, { "epoch": 3.170636470128856, "grad_norm": 1.2842134237289429, "learning_rate": 1.938121141070731e-05, "loss": 0.4742, "step": 40600 }, { "epoch": 3.178445919562671, "grad_norm": 1.0086536407470703, "learning_rate": 1.9379648300117236e-05, "loss": 0.4602, "step": 40700 }, { "epoch": 3.186255368996486, "grad_norm": 1.716693639755249, "learning_rate": 1.937808518952716e-05, "loss": 0.462, "step": 40800 }, { "epoch": 3.1940648184303004, "grad_norm": 0.9176514744758606, "learning_rate": 1.9376522078937084e-05, "loss": 0.4531, "step": 40900 }, { "epoch": 3.2018742678641154, "grad_norm": 0.9721403121948242, "learning_rate": 1.9374958968347014e-05, "loss": 0.4488, "step": 41000 }, { "epoch": 3.2096837172979304, "grad_norm": 1.0320054292678833, "learning_rate": 1.937339585775694e-05, "loss": 0.4632, "step": 41100 }, { "epoch": 3.2174931667317455, "grad_norm": 0.9565618634223938, "learning_rate": 1.9371832747166862e-05, "loss": 0.4686, "step": 41200 }, { "epoch": 3.2253026161655605, "grad_norm": 1.0439300537109375, "learning_rate": 1.9370269636576788e-05, "loss": 0.4771, "step": 41300 }, { "epoch": 3.233112065599375, "grad_norm": 1.02463960647583, "learning_rate": 1.9368706525986714e-05, "loss": 0.471, "step": 41400 }, { "epoch": 3.24092151503319, "grad_norm": 1.3478844165802002, "learning_rate": 1.936714341539664e-05, "loss": 0.4579, "step": 41500 }, { "epoch": 3.248730964467005, "grad_norm": 0.9918316006660461, "learning_rate": 1.9365580304806566e-05, "loss": 0.4668, "step": 41600 }, { "epoch": 3.25654041390082, "grad_norm": 1.500217080116272, "learning_rate": 1.9364017194216492e-05, "loss": 0.4498, "step": 41700 }, { "epoch": 3.264349863334635, "grad_norm": 1.0742114782333374, "learning_rate": 1.9362454083626418e-05, "loss": 0.4726, "step": 41800 }, { "epoch": 3.2721593127684496, "grad_norm": 1.3920074701309204, "learning_rate": 1.9360890973036344e-05, "loss": 0.4719, "step": 41900 }, { "epoch": 3.2799687622022646, "grad_norm": 1.1826322078704834, "learning_rate": 1.935932786244627e-05, "loss": 0.4584, "step": 42000 }, { "epoch": 3.2877782116360796, "grad_norm": 1.1376311779022217, "learning_rate": 1.9357764751856196e-05, "loss": 0.4741, "step": 42100 }, { "epoch": 3.2955876610698946, "grad_norm": 1.0839388370513916, "learning_rate": 1.9356201641266122e-05, "loss": 0.4769, "step": 42200 }, { "epoch": 3.3033971105037097, "grad_norm": 1.0490782260894775, "learning_rate": 1.9354638530676045e-05, "loss": 0.4557, "step": 42300 }, { "epoch": 3.311206559937524, "grad_norm": 1.1325418949127197, "learning_rate": 1.935309105119187e-05, "loss": 0.4628, "step": 42400 }, { "epoch": 3.3190160093713392, "grad_norm": 1.1212817430496216, "learning_rate": 1.93515279406018e-05, "loss": 0.4486, "step": 42500 }, { "epoch": 3.3268254588051542, "grad_norm": 1.111585259437561, "learning_rate": 1.9349964830011726e-05, "loss": 0.4647, "step": 42600 }, { "epoch": 3.3346349082389692, "grad_norm": 1.1315809488296509, "learning_rate": 1.934840171942165e-05, "loss": 0.4648, "step": 42700 }, { "epoch": 3.3424443576727842, "grad_norm": 0.9654698371887207, "learning_rate": 1.9346838608831578e-05, "loss": 0.4461, "step": 42800 }, { "epoch": 3.350253807106599, "grad_norm": 1.0180907249450684, "learning_rate": 1.93452754982415e-05, "loss": 0.472, "step": 42900 }, { "epoch": 3.358063256540414, "grad_norm": 1.1459476947784424, "learning_rate": 1.9343712387651427e-05, "loss": 0.4524, "step": 43000 }, { "epoch": 3.365872705974229, "grad_norm": 1.1465004682540894, "learning_rate": 1.9342149277061353e-05, "loss": 0.4503, "step": 43100 }, { "epoch": 3.373682155408044, "grad_norm": 1.0197675228118896, "learning_rate": 1.934058616647128e-05, "loss": 0.4585, "step": 43200 }, { "epoch": 3.381491604841859, "grad_norm": 1.0962787866592407, "learning_rate": 1.9339023055881205e-05, "loss": 0.4454, "step": 43300 }, { "epoch": 3.3893010542756734, "grad_norm": 1.0584628582000732, "learning_rate": 1.933745994529113e-05, "loss": 0.4589, "step": 43400 }, { "epoch": 3.3971105037094884, "grad_norm": 1.0412591695785522, "learning_rate": 1.9335896834701057e-05, "loss": 0.4513, "step": 43500 }, { "epoch": 3.4049199531433034, "grad_norm": 1.2135701179504395, "learning_rate": 1.9334333724110983e-05, "loss": 0.4675, "step": 43600 }, { "epoch": 3.4127294025771184, "grad_norm": 0.9547072052955627, "learning_rate": 1.933277061352091e-05, "loss": 0.4435, "step": 43700 }, { "epoch": 3.4205388520109334, "grad_norm": 1.005250096321106, "learning_rate": 1.933120750293083e-05, "loss": 0.4595, "step": 43800 }, { "epoch": 3.428348301444748, "grad_norm": 1.306046724319458, "learning_rate": 1.932964439234076e-05, "loss": 0.4549, "step": 43900 }, { "epoch": 3.436157750878563, "grad_norm": 0.9959258437156677, "learning_rate": 1.9328081281750687e-05, "loss": 0.4821, "step": 44000 }, { "epoch": 3.443967200312378, "grad_norm": 1.0681926012039185, "learning_rate": 1.932651817116061e-05, "loss": 0.4334, "step": 44100 }, { "epoch": 3.451776649746193, "grad_norm": 0.9525073766708374, "learning_rate": 1.9324955060570536e-05, "loss": 0.455, "step": 44200 }, { "epoch": 3.459586099180008, "grad_norm": 1.1635737419128418, "learning_rate": 1.9323391949980465e-05, "loss": 0.471, "step": 44300 }, { "epoch": 3.4673955486138226, "grad_norm": 1.1268336772918701, "learning_rate": 1.932184447049629e-05, "loss": 0.4546, "step": 44400 }, { "epoch": 3.4752049980476376, "grad_norm": 1.1383503675460815, "learning_rate": 1.9320281359906214e-05, "loss": 0.4734, "step": 44500 }, { "epoch": 3.4830144474814526, "grad_norm": 1.0616774559020996, "learning_rate": 1.931871824931614e-05, "loss": 0.4611, "step": 44600 }, { "epoch": 3.4908238969152676, "grad_norm": 1.338844895362854, "learning_rate": 1.9317155138726066e-05, "loss": 0.4489, "step": 44700 }, { "epoch": 3.4986333463490826, "grad_norm": 1.0978337526321411, "learning_rate": 1.931559202813599e-05, "loss": 0.4713, "step": 44800 }, { "epoch": 3.506442795782897, "grad_norm": 0.8954633474349976, "learning_rate": 1.9314028917545918e-05, "loss": 0.4383, "step": 44900 }, { "epoch": 3.514252245216712, "grad_norm": 1.0776824951171875, "learning_rate": 1.9312465806955844e-05, "loss": 0.4559, "step": 45000 }, { "epoch": 3.522061694650527, "grad_norm": 0.9826775789260864, "learning_rate": 1.931090269636577e-05, "loss": 0.4526, "step": 45100 }, { "epoch": 3.529871144084342, "grad_norm": 1.1166000366210938, "learning_rate": 1.9309339585775696e-05, "loss": 0.4463, "step": 45200 }, { "epoch": 3.5376805935181572, "grad_norm": 1.129669189453125, "learning_rate": 1.930777647518562e-05, "loss": 0.4594, "step": 45300 }, { "epoch": 3.545490042951972, "grad_norm": 1.1088656187057495, "learning_rate": 1.9306213364595548e-05, "loss": 0.4402, "step": 45400 }, { "epoch": 3.553299492385787, "grad_norm": 1.5019007921218872, "learning_rate": 1.9304650254005474e-05, "loss": 0.4482, "step": 45500 }, { "epoch": 3.561108941819602, "grad_norm": 1.105352520942688, "learning_rate": 1.9303087143415396e-05, "loss": 0.4357, "step": 45600 }, { "epoch": 3.5689183912534164, "grad_norm": 1.3878651857376099, "learning_rate": 1.9301524032825326e-05, "loss": 0.4601, "step": 45700 }, { "epoch": 3.576727840687232, "grad_norm": 1.047351360321045, "learning_rate": 1.9299960922235252e-05, "loss": 0.454, "step": 45800 }, { "epoch": 3.5845372901210464, "grad_norm": 1.0843867063522339, "learning_rate": 1.9298397811645174e-05, "loss": 0.4197, "step": 45900 }, { "epoch": 3.5923467395548614, "grad_norm": 1.1075481176376343, "learning_rate": 1.92968347010551e-05, "loss": 0.4518, "step": 46000 }, { "epoch": 3.6001561889886764, "grad_norm": 0.9046174883842468, "learning_rate": 1.9295271590465026e-05, "loss": 0.4434, "step": 46100 }, { "epoch": 3.607965638422491, "grad_norm": 0.9517740607261658, "learning_rate": 1.9293708479874952e-05, "loss": 0.4397, "step": 46200 }, { "epoch": 3.615775087856306, "grad_norm": 1.0598924160003662, "learning_rate": 1.929214536928488e-05, "loss": 0.463, "step": 46300 }, { "epoch": 3.623584537290121, "grad_norm": 1.3215514421463013, "learning_rate": 1.9290582258694804e-05, "loss": 0.4501, "step": 46400 }, { "epoch": 3.631393986723936, "grad_norm": 1.129428744316101, "learning_rate": 1.928901914810473e-05, "loss": 0.4326, "step": 46500 }, { "epoch": 3.639203436157751, "grad_norm": 1.0689501762390137, "learning_rate": 1.9287456037514656e-05, "loss": 0.4447, "step": 46600 }, { "epoch": 3.6470128855915656, "grad_norm": 1.0962802171707153, "learning_rate": 1.9285908558030482e-05, "loss": 0.4665, "step": 46700 }, { "epoch": 3.6548223350253806, "grad_norm": 1.0971689224243164, "learning_rate": 1.928434544744041e-05, "loss": 0.4526, "step": 46800 }, { "epoch": 3.6626317844591956, "grad_norm": 1.1643929481506348, "learning_rate": 1.9282782336850334e-05, "loss": 0.4605, "step": 46900 }, { "epoch": 3.6704412338930106, "grad_norm": 1.1027824878692627, "learning_rate": 1.928121922626026e-05, "loss": 0.4458, "step": 47000 }, { "epoch": 3.6782506833268256, "grad_norm": 0.9542461037635803, "learning_rate": 1.9279656115670183e-05, "loss": 0.4488, "step": 47100 }, { "epoch": 3.68606013276064, "grad_norm": 0.8931864500045776, "learning_rate": 1.9278093005080112e-05, "loss": 0.4552, "step": 47200 }, { "epoch": 3.693869582194455, "grad_norm": 1.299230933189392, "learning_rate": 1.927652989449004e-05, "loss": 0.4332, "step": 47300 }, { "epoch": 3.70167903162827, "grad_norm": 1.2159420251846313, "learning_rate": 1.927496678389996e-05, "loss": 0.4354, "step": 47400 }, { "epoch": 3.709488481062085, "grad_norm": 0.9442591071128845, "learning_rate": 1.9273403673309887e-05, "loss": 0.4399, "step": 47500 }, { "epoch": 3.7172979304959, "grad_norm": 1.3856900930404663, "learning_rate": 1.9271840562719813e-05, "loss": 0.4596, "step": 47600 }, { "epoch": 3.7251073799297147, "grad_norm": 1.2507699728012085, "learning_rate": 1.927027745212974e-05, "loss": 0.4502, "step": 47700 }, { "epoch": 3.7329168293635298, "grad_norm": 1.292219877243042, "learning_rate": 1.9268714341539665e-05, "loss": 0.4494, "step": 47800 }, { "epoch": 3.7407262787973448, "grad_norm": 1.3267557621002197, "learning_rate": 1.926715123094959e-05, "loss": 0.4407, "step": 47900 }, { "epoch": 3.7485357282311598, "grad_norm": 0.9994024634361267, "learning_rate": 1.9265588120359517e-05, "loss": 0.4449, "step": 48000 }, { "epoch": 3.7563451776649748, "grad_norm": 1.1877665519714355, "learning_rate": 1.9264025009769443e-05, "loss": 0.4261, "step": 48100 }, { "epoch": 3.7641546270987893, "grad_norm": 0.9004372954368591, "learning_rate": 1.926246189917937e-05, "loss": 0.4261, "step": 48200 }, { "epoch": 3.7719640765326043, "grad_norm": 1.0032011270523071, "learning_rate": 1.9260898788589295e-05, "loss": 0.4336, "step": 48300 }, { "epoch": 3.7797735259664194, "grad_norm": 1.331635594367981, "learning_rate": 1.925933567799922e-05, "loss": 0.4563, "step": 48400 }, { "epoch": 3.7875829754002344, "grad_norm": 1.1291660070419312, "learning_rate": 1.9257772567409144e-05, "loss": 0.4512, "step": 48500 }, { "epoch": 3.7953924248340494, "grad_norm": 1.0733696222305298, "learning_rate": 1.925620945681907e-05, "loss": 0.4293, "step": 48600 }, { "epoch": 3.803201874267864, "grad_norm": 0.9246060252189636, "learning_rate": 1.9254646346229e-05, "loss": 0.443, "step": 48700 }, { "epoch": 3.811011323701679, "grad_norm": 1.1063892841339111, "learning_rate": 1.9253083235638922e-05, "loss": 0.4324, "step": 48800 }, { "epoch": 3.818820773135494, "grad_norm": 0.9335746765136719, "learning_rate": 1.9251520125048848e-05, "loss": 0.4299, "step": 48900 }, { "epoch": 3.826630222569309, "grad_norm": 1.143466591835022, "learning_rate": 1.9249957014458774e-05, "loss": 0.443, "step": 49000 }, { "epoch": 3.834439672003124, "grad_norm": 1.0343248844146729, "learning_rate": 1.92484095349746e-05, "loss": 0.4352, "step": 49100 }, { "epoch": 3.8422491214369385, "grad_norm": 1.3340160846710205, "learning_rate": 1.9246846424384526e-05, "loss": 0.4252, "step": 49200 }, { "epoch": 3.8500585708707535, "grad_norm": 1.1612764596939087, "learning_rate": 1.9245283313794452e-05, "loss": 0.447, "step": 49300 }, { "epoch": 3.8578680203045685, "grad_norm": 1.130889654159546, "learning_rate": 1.9243720203204378e-05, "loss": 0.4377, "step": 49400 }, { "epoch": 3.8656774697383836, "grad_norm": 1.0333083868026733, "learning_rate": 1.9242157092614304e-05, "loss": 0.4557, "step": 49500 }, { "epoch": 3.8734869191721986, "grad_norm": 1.0891958475112915, "learning_rate": 1.924059398202423e-05, "loss": 0.4402, "step": 49600 }, { "epoch": 3.881296368606013, "grad_norm": 1.0473707914352417, "learning_rate": 1.9239030871434156e-05, "loss": 0.4289, "step": 49700 }, { "epoch": 3.889105818039828, "grad_norm": 0.858305037021637, "learning_rate": 1.9237467760844082e-05, "loss": 0.432, "step": 49800 }, { "epoch": 3.896915267473643, "grad_norm": 1.3434786796569824, "learning_rate": 1.9235904650254008e-05, "loss": 0.4346, "step": 49900 }, { "epoch": 3.904724716907458, "grad_norm": 0.7991245985031128, "learning_rate": 1.923434153966393e-05, "loss": 0.426, "step": 50000 }, { "epoch": 3.912534166341273, "grad_norm": 1.0419330596923828, "learning_rate": 1.923277842907386e-05, "loss": 0.4337, "step": 50100 }, { "epoch": 3.9203436157750877, "grad_norm": 1.0657131671905518, "learning_rate": 1.9231215318483786e-05, "loss": 0.4148, "step": 50200 }, { "epoch": 3.9281530652089027, "grad_norm": 1.0321459770202637, "learning_rate": 1.922965220789371e-05, "loss": 0.4394, "step": 50300 }, { "epoch": 3.9359625146427177, "grad_norm": 0.9495915174484253, "learning_rate": 1.9228089097303635e-05, "loss": 0.4381, "step": 50400 }, { "epoch": 3.9437719640765327, "grad_norm": 1.3790180683135986, "learning_rate": 1.9226525986713564e-05, "loss": 0.4385, "step": 50500 }, { "epoch": 3.9515814135103478, "grad_norm": 0.9733704328536987, "learning_rate": 1.9224962876123487e-05, "loss": 0.4173, "step": 50600 }, { "epoch": 3.9593908629441623, "grad_norm": 0.8968336582183838, "learning_rate": 1.9223399765533413e-05, "loss": 0.435, "step": 50700 }, { "epoch": 3.9672003123779773, "grad_norm": 1.1181347370147705, "learning_rate": 1.922183665494334e-05, "loss": 0.4388, "step": 50800 }, { "epoch": 3.9750097618117923, "grad_norm": 1.0089054107666016, "learning_rate": 1.9220273544353265e-05, "loss": 0.414, "step": 50900 }, { "epoch": 3.9828192112456073, "grad_norm": 0.9809916019439697, "learning_rate": 1.921871043376319e-05, "loss": 0.429, "step": 51000 }, { "epoch": 3.9906286606794223, "grad_norm": 0.9226115942001343, "learning_rate": 1.9217147323173117e-05, "loss": 0.4262, "step": 51100 }, { "epoch": 3.998438110113237, "grad_norm": 1.1190813779830933, "learning_rate": 1.9215584212583043e-05, "loss": 0.4287, "step": 51200 }, { "epoch": 4.006247559547052, "grad_norm": 1.146544098854065, "learning_rate": 1.921402110199297e-05, "loss": 0.4334, "step": 51300 }, { "epoch": 4.014057008980867, "grad_norm": 1.093766450881958, "learning_rate": 1.9212457991402895e-05, "loss": 0.4355, "step": 51400 }, { "epoch": 4.0218664584146815, "grad_norm": 0.8110470771789551, "learning_rate": 1.9210894880812817e-05, "loss": 0.4542, "step": 51500 }, { "epoch": 4.029675907848497, "grad_norm": 1.1870598793029785, "learning_rate": 1.9209331770222747e-05, "loss": 0.4247, "step": 51600 }, { "epoch": 4.0374853572823115, "grad_norm": 1.3483214378356934, "learning_rate": 1.920776865963267e-05, "loss": 0.4188, "step": 51700 }, { "epoch": 4.045294806716127, "grad_norm": 1.1394708156585693, "learning_rate": 1.9206205549042595e-05, "loss": 0.4341, "step": 51800 }, { "epoch": 4.0531042561499415, "grad_norm": 1.03669273853302, "learning_rate": 1.9204642438452525e-05, "loss": 0.4325, "step": 51900 }, { "epoch": 4.060913705583756, "grad_norm": 0.9215898513793945, "learning_rate": 1.9203079327862447e-05, "loss": 0.4259, "step": 52000 }, { "epoch": 4.0687231550175715, "grad_norm": 0.8863016963005066, "learning_rate": 1.9201516217272373e-05, "loss": 0.4215, "step": 52100 }, { "epoch": 4.076532604451386, "grad_norm": 0.865557074546814, "learning_rate": 1.91999531066823e-05, "loss": 0.4338, "step": 52200 }, { "epoch": 4.084342053885201, "grad_norm": 1.240858793258667, "learning_rate": 1.9198389996092225e-05, "loss": 0.4294, "step": 52300 }, { "epoch": 4.092151503319016, "grad_norm": 0.9957290887832642, "learning_rate": 1.919682688550215e-05, "loss": 0.4239, "step": 52400 }, { "epoch": 4.099960952752831, "grad_norm": 1.122232437133789, "learning_rate": 1.9195263774912077e-05, "loss": 0.4312, "step": 52500 }, { "epoch": 4.107770402186646, "grad_norm": 0.9833566546440125, "learning_rate": 1.9193700664322e-05, "loss": 0.4427, "step": 52600 }, { "epoch": 4.115579851620461, "grad_norm": 1.0275732278823853, "learning_rate": 1.919213755373193e-05, "loss": 0.4188, "step": 52700 }, { "epoch": 4.123389301054275, "grad_norm": 1.186841368675232, "learning_rate": 1.9190574443141855e-05, "loss": 0.423, "step": 52800 }, { "epoch": 4.131198750488091, "grad_norm": 1.288432240486145, "learning_rate": 1.9189011332551778e-05, "loss": 0.4483, "step": 52900 }, { "epoch": 4.139008199921905, "grad_norm": 1.2151869535446167, "learning_rate": 1.9187448221961707e-05, "loss": 0.4206, "step": 53000 }, { "epoch": 4.146817649355721, "grad_norm": 1.2180672883987427, "learning_rate": 1.9185900742477533e-05, "loss": 0.4006, "step": 53100 }, { "epoch": 4.154627098789535, "grad_norm": 0.9600439071655273, "learning_rate": 1.9184337631887456e-05, "loss": 0.4299, "step": 53200 }, { "epoch": 4.16243654822335, "grad_norm": 1.0519211292266846, "learning_rate": 1.9182774521297382e-05, "loss": 0.4221, "step": 53300 }, { "epoch": 4.170245997657165, "grad_norm": 0.9762826561927795, "learning_rate": 1.918121141070731e-05, "loss": 0.4052, "step": 53400 }, { "epoch": 4.17805544709098, "grad_norm": 0.9231967329978943, "learning_rate": 1.9179648300117234e-05, "loss": 0.4403, "step": 53500 }, { "epoch": 4.185864896524795, "grad_norm": 0.8770660161972046, "learning_rate": 1.917808518952716e-05, "loss": 0.45, "step": 53600 }, { "epoch": 4.19367434595861, "grad_norm": 1.1238151788711548, "learning_rate": 1.9176522078937086e-05, "loss": 0.414, "step": 53700 }, { "epoch": 4.2014837953924244, "grad_norm": 0.861791729927063, "learning_rate": 1.9174958968347012e-05, "loss": 0.4102, "step": 53800 }, { "epoch": 4.20929324482624, "grad_norm": 0.9705322980880737, "learning_rate": 1.9173395857756938e-05, "loss": 0.4106, "step": 53900 }, { "epoch": 4.2171026942600545, "grad_norm": 1.0542993545532227, "learning_rate": 1.9171832747166864e-05, "loss": 0.4256, "step": 54000 }, { "epoch": 4.22491214369387, "grad_norm": 1.1293755769729614, "learning_rate": 1.917026963657679e-05, "loss": 0.4192, "step": 54100 }, { "epoch": 4.2327215931276845, "grad_norm": 0.7894850969314575, "learning_rate": 1.9168706525986716e-05, "loss": 0.4187, "step": 54200 }, { "epoch": 4.240531042561499, "grad_norm": 1.1279304027557373, "learning_rate": 1.9167143415396642e-05, "loss": 0.4217, "step": 54300 }, { "epoch": 4.2483404919953145, "grad_norm": 1.1187465190887451, "learning_rate": 1.9165580304806565e-05, "loss": 0.4197, "step": 54400 }, { "epoch": 4.256149941429129, "grad_norm": 1.210397720336914, "learning_rate": 1.9164017194216494e-05, "loss": 0.416, "step": 54500 }, { "epoch": 4.2639593908629445, "grad_norm": 1.1013455390930176, "learning_rate": 1.9162454083626417e-05, "loss": 0.4312, "step": 54600 }, { "epoch": 4.271768840296759, "grad_norm": 1.0917813777923584, "learning_rate": 1.9160890973036343e-05, "loss": 0.4348, "step": 54700 }, { "epoch": 4.279578289730574, "grad_norm": 0.9799680113792419, "learning_rate": 1.915932786244627e-05, "loss": 0.4237, "step": 54800 }, { "epoch": 4.287387739164389, "grad_norm": 0.9628735780715942, "learning_rate": 1.9157780382962098e-05, "loss": 0.4282, "step": 54900 }, { "epoch": 4.295197188598204, "grad_norm": 0.9904158711433411, "learning_rate": 1.915621727237202e-05, "loss": 0.4273, "step": 55000 }, { "epoch": 4.303006638032019, "grad_norm": 0.8235137462615967, "learning_rate": 1.9154654161781947e-05, "loss": 0.4283, "step": 55100 }, { "epoch": 4.310816087465834, "grad_norm": 1.1564571857452393, "learning_rate": 1.9153091051191873e-05, "loss": 0.4078, "step": 55200 }, { "epoch": 4.318625536899648, "grad_norm": 1.199800729751587, "learning_rate": 1.91515279406018e-05, "loss": 0.4033, "step": 55300 }, { "epoch": 4.326434986333464, "grad_norm": 1.005346417427063, "learning_rate": 1.9149964830011725e-05, "loss": 0.4121, "step": 55400 }, { "epoch": 4.334244435767278, "grad_norm": 0.8400962948799133, "learning_rate": 1.914840171942165e-05, "loss": 0.4183, "step": 55500 }, { "epoch": 4.342053885201094, "grad_norm": 1.3714483976364136, "learning_rate": 1.9146838608831577e-05, "loss": 0.4228, "step": 55600 }, { "epoch": 4.349863334634908, "grad_norm": 1.2525608539581299, "learning_rate": 1.9145275498241503e-05, "loss": 0.4165, "step": 55700 }, { "epoch": 4.357672784068723, "grad_norm": 1.116113305091858, "learning_rate": 1.914371238765143e-05, "loss": 0.4192, "step": 55800 }, { "epoch": 4.365482233502538, "grad_norm": 1.3345171213150024, "learning_rate": 1.914214927706135e-05, "loss": 0.4127, "step": 55900 }, { "epoch": 4.373291682936353, "grad_norm": 0.7660952210426331, "learning_rate": 1.914058616647128e-05, "loss": 0.4297, "step": 56000 }, { "epoch": 4.381101132370168, "grad_norm": 0.9481973648071289, "learning_rate": 1.9139023055881203e-05, "loss": 0.4216, "step": 56100 }, { "epoch": 4.388910581803983, "grad_norm": 0.9404019117355347, "learning_rate": 1.913745994529113e-05, "loss": 0.4238, "step": 56200 }, { "epoch": 4.396720031237797, "grad_norm": 0.907920241355896, "learning_rate": 1.913589683470106e-05, "loss": 0.3997, "step": 56300 }, { "epoch": 4.404529480671613, "grad_norm": 0.9752848744392395, "learning_rate": 1.913433372411098e-05, "loss": 0.4146, "step": 56400 }, { "epoch": 4.412338930105427, "grad_norm": 1.210908055305481, "learning_rate": 1.9132770613520908e-05, "loss": 0.426, "step": 56500 }, { "epoch": 4.420148379539243, "grad_norm": 0.8827472925186157, "learning_rate": 1.9131207502930834e-05, "loss": 0.4177, "step": 56600 }, { "epoch": 4.4279578289730575, "grad_norm": 1.1287732124328613, "learning_rate": 1.912964439234076e-05, "loss": 0.4143, "step": 56700 }, { "epoch": 4.435767278406872, "grad_norm": 1.011299729347229, "learning_rate": 1.9128081281750686e-05, "loss": 0.4078, "step": 56800 }, { "epoch": 4.4435767278406875, "grad_norm": 1.2453038692474365, "learning_rate": 1.912651817116061e-05, "loss": 0.4167, "step": 56900 }, { "epoch": 4.451386177274502, "grad_norm": 0.992863118648529, "learning_rate": 1.9124955060570534e-05, "loss": 0.4245, "step": 57000 }, { "epoch": 4.459195626708317, "grad_norm": 1.1472619771957397, "learning_rate": 1.9123391949980464e-05, "loss": 0.4201, "step": 57100 }, { "epoch": 4.467005076142132, "grad_norm": 1.3278522491455078, "learning_rate": 1.912182883939039e-05, "loss": 0.4184, "step": 57200 }, { "epoch": 4.474814525575947, "grad_norm": 1.2207483053207397, "learning_rate": 1.9120265728800312e-05, "loss": 0.4184, "step": 57300 }, { "epoch": 4.482623975009762, "grad_norm": 1.0354204177856445, "learning_rate": 1.911870261821024e-05, "loss": 0.4112, "step": 57400 }, { "epoch": 4.490433424443577, "grad_norm": 0.8041611909866333, "learning_rate": 1.9117139507620168e-05, "loss": 0.4127, "step": 57500 }, { "epoch": 4.498242873877391, "grad_norm": 0.8567083477973938, "learning_rate": 1.911557639703009e-05, "loss": 0.414, "step": 57600 }, { "epoch": 4.506052323311207, "grad_norm": 1.2674700021743774, "learning_rate": 1.9114013286440016e-05, "loss": 0.4051, "step": 57700 }, { "epoch": 4.513861772745021, "grad_norm": 1.1298909187316895, "learning_rate": 1.9112450175849942e-05, "loss": 0.4047, "step": 57800 }, { "epoch": 4.521671222178837, "grad_norm": 0.9413766264915466, "learning_rate": 1.9110887065259868e-05, "loss": 0.4156, "step": 57900 }, { "epoch": 4.529480671612651, "grad_norm": 1.1707350015640259, "learning_rate": 1.9109323954669794e-05, "loss": 0.4269, "step": 58000 }, { "epoch": 4.537290121046466, "grad_norm": 0.9936490654945374, "learning_rate": 1.910777647518562e-05, "loss": 0.416, "step": 58100 }, { "epoch": 4.545099570480281, "grad_norm": 1.0722568035125732, "learning_rate": 1.9106213364595546e-05, "loss": 0.3986, "step": 58200 }, { "epoch": 4.552909019914096, "grad_norm": 1.2025909423828125, "learning_rate": 1.9104650254005472e-05, "loss": 0.4097, "step": 58300 }, { "epoch": 4.560718469347911, "grad_norm": 0.8958162069320679, "learning_rate": 1.9103087143415398e-05, "loss": 0.4047, "step": 58400 }, { "epoch": 4.568527918781726, "grad_norm": 0.9446201324462891, "learning_rate": 1.9101524032825324e-05, "loss": 0.4083, "step": 58500 }, { "epoch": 4.57633736821554, "grad_norm": 1.0519663095474243, "learning_rate": 1.909996092223525e-05, "loss": 0.4263, "step": 58600 }, { "epoch": 4.584146817649356, "grad_norm": 0.8739796876907349, "learning_rate": 1.9098397811645176e-05, "loss": 0.4055, "step": 58700 }, { "epoch": 4.59195626708317, "grad_norm": 0.9819687604904175, "learning_rate": 1.90968347010551e-05, "loss": 0.4025, "step": 58800 }, { "epoch": 4.599765716516986, "grad_norm": 1.321071743965149, "learning_rate": 1.9095271590465028e-05, "loss": 0.413, "step": 58900 }, { "epoch": 4.6075751659508, "grad_norm": 0.8105387091636658, "learning_rate": 1.9093708479874954e-05, "loss": 0.3976, "step": 59000 }, { "epoch": 4.615384615384615, "grad_norm": 1.2726750373840332, "learning_rate": 1.9092145369284877e-05, "loss": 0.4165, "step": 59100 }, { "epoch": 4.62319406481843, "grad_norm": 1.0112985372543335, "learning_rate": 1.9090582258694806e-05, "loss": 0.3968, "step": 59200 }, { "epoch": 4.631003514252245, "grad_norm": 0.9318651556968689, "learning_rate": 1.908901914810473e-05, "loss": 0.398, "step": 59300 }, { "epoch": 4.63881296368606, "grad_norm": 1.057499647140503, "learning_rate": 1.9087456037514655e-05, "loss": 0.4177, "step": 59400 }, { "epoch": 4.646622413119875, "grad_norm": 1.0527663230895996, "learning_rate": 1.908589292692458e-05, "loss": 0.4014, "step": 59500 }, { "epoch": 4.65443186255369, "grad_norm": 1.1206157207489014, "learning_rate": 1.9084329816334507e-05, "loss": 0.4022, "step": 59600 }, { "epoch": 4.662241311987505, "grad_norm": 0.9441333413124084, "learning_rate": 1.9082766705744433e-05, "loss": 0.4002, "step": 59700 }, { "epoch": 4.67005076142132, "grad_norm": 1.1934523582458496, "learning_rate": 1.908120359515436e-05, "loss": 0.4106, "step": 59800 }, { "epoch": 4.677860210855135, "grad_norm": 1.1480247974395752, "learning_rate": 1.9079640484564285e-05, "loss": 0.4116, "step": 59900 }, { "epoch": 4.68566966028895, "grad_norm": 0.8538499474525452, "learning_rate": 1.907807737397421e-05, "loss": 0.3956, "step": 60000 }, { "epoch": 4.693479109722764, "grad_norm": 0.9278829097747803, "learning_rate": 1.9076514263384137e-05, "loss": 0.4261, "step": 60100 }, { "epoch": 4.70128855915658, "grad_norm": 1.076007604598999, "learning_rate": 1.907495115279406e-05, "loss": 0.4061, "step": 60200 }, { "epoch": 4.709098008590394, "grad_norm": 1.2330677509307861, "learning_rate": 1.907338804220399e-05, "loss": 0.3983, "step": 60300 }, { "epoch": 4.71690745802421, "grad_norm": 1.0433940887451172, "learning_rate": 1.9071824931613915e-05, "loss": 0.4078, "step": 60400 }, { "epoch": 4.724716907458024, "grad_norm": 1.1503841876983643, "learning_rate": 1.9070261821023838e-05, "loss": 0.3977, "step": 60500 }, { "epoch": 4.732526356891839, "grad_norm": 0.9216443300247192, "learning_rate": 1.9068698710433764e-05, "loss": 0.4013, "step": 60600 }, { "epoch": 4.740335806325654, "grad_norm": 0.9420239329338074, "learning_rate": 1.906713559984369e-05, "loss": 0.4145, "step": 60700 }, { "epoch": 4.748145255759469, "grad_norm": 0.8641262650489807, "learning_rate": 1.9065572489253616e-05, "loss": 0.4111, "step": 60800 }, { "epoch": 4.755954705193284, "grad_norm": 1.179482340812683, "learning_rate": 1.9064009378663542e-05, "loss": 0.4103, "step": 60900 }, { "epoch": 4.763764154627099, "grad_norm": 1.0619276762008667, "learning_rate": 1.9062446268073468e-05, "loss": 0.4037, "step": 61000 }, { "epoch": 4.771573604060913, "grad_norm": 1.2168949842453003, "learning_rate": 1.9060898788589294e-05, "loss": 0.4115, "step": 61100 }, { "epoch": 4.779383053494729, "grad_norm": 1.133819580078125, "learning_rate": 1.905933567799922e-05, "loss": 0.4129, "step": 61200 }, { "epoch": 4.787192502928543, "grad_norm": 1.0034329891204834, "learning_rate": 1.9057772567409146e-05, "loss": 0.3981, "step": 61300 }, { "epoch": 4.795001952362359, "grad_norm": 1.039372205734253, "learning_rate": 1.9056209456819072e-05, "loss": 0.4006, "step": 61400 }, { "epoch": 4.802811401796173, "grad_norm": 1.1082069873809814, "learning_rate": 1.9054646346228998e-05, "loss": 0.4117, "step": 61500 }, { "epoch": 4.810620851229988, "grad_norm": 0.8162183165550232, "learning_rate": 1.9053083235638924e-05, "loss": 0.3847, "step": 61600 }, { "epoch": 4.818430300663803, "grad_norm": 1.176859736442566, "learning_rate": 1.9051520125048846e-05, "loss": 0.3877, "step": 61700 }, { "epoch": 4.826239750097618, "grad_norm": 1.084212303161621, "learning_rate": 1.9049957014458776e-05, "loss": 0.4127, "step": 61800 }, { "epoch": 4.834049199531433, "grad_norm": 1.1191595792770386, "learning_rate": 1.9048393903868702e-05, "loss": 0.4004, "step": 61900 }, { "epoch": 4.841858648965248, "grad_norm": 0.9039832949638367, "learning_rate": 1.9046830793278624e-05, "loss": 0.4057, "step": 62000 }, { "epoch": 4.8496680983990625, "grad_norm": 0.9490616917610168, "learning_rate": 1.904526768268855e-05, "loss": 0.4005, "step": 62100 }, { "epoch": 4.857477547832878, "grad_norm": 1.0080968141555786, "learning_rate": 1.9043704572098476e-05, "loss": 0.404, "step": 62200 }, { "epoch": 4.865286997266693, "grad_norm": 0.9084405303001404, "learning_rate": 1.9042141461508402e-05, "loss": 0.3969, "step": 62300 }, { "epoch": 4.873096446700508, "grad_norm": 1.123801589012146, "learning_rate": 1.904057835091833e-05, "loss": 0.3956, "step": 62400 }, { "epoch": 4.880905896134323, "grad_norm": 1.3912336826324463, "learning_rate": 1.9039015240328254e-05, "loss": 0.4006, "step": 62500 }, { "epoch": 4.888715345568137, "grad_norm": 1.3924329280853271, "learning_rate": 1.903745212973818e-05, "loss": 0.3966, "step": 62600 }, { "epoch": 4.896524795001953, "grad_norm": 1.1651110649108887, "learning_rate": 1.9035889019148106e-05, "loss": 0.3903, "step": 62700 }, { "epoch": 4.904334244435767, "grad_norm": 0.9102842807769775, "learning_rate": 1.9034325908558032e-05, "loss": 0.3986, "step": 62800 }, { "epoch": 4.912143693869583, "grad_norm": 1.1338361501693726, "learning_rate": 1.903276279796796e-05, "loss": 0.3836, "step": 62900 }, { "epoch": 4.919953143303397, "grad_norm": 0.7827601432800293, "learning_rate": 1.9031215318483784e-05, "loss": 0.3988, "step": 63000 }, { "epoch": 4.927762592737212, "grad_norm": 1.0492647886276245, "learning_rate": 1.902965220789371e-05, "loss": 0.397, "step": 63100 }, { "epoch": 4.935572042171027, "grad_norm": 1.2427724599838257, "learning_rate": 1.9028089097303633e-05, "loss": 0.3907, "step": 63200 }, { "epoch": 4.943381491604842, "grad_norm": 0.8890752792358398, "learning_rate": 1.9026525986713562e-05, "loss": 0.4093, "step": 63300 }, { "epoch": 4.951190941038657, "grad_norm": 1.0128096342086792, "learning_rate": 1.902496287612349e-05, "loss": 0.4066, "step": 63400 }, { "epoch": 4.959000390472472, "grad_norm": 1.2015756368637085, "learning_rate": 1.902339976553341e-05, "loss": 0.3898, "step": 63500 }, { "epoch": 4.966809839906286, "grad_norm": 1.039713740348816, "learning_rate": 1.902183665494334e-05, "loss": 0.3945, "step": 63600 }, { "epoch": 4.974619289340102, "grad_norm": 0.9324501156806946, "learning_rate": 1.9020273544353267e-05, "loss": 0.3965, "step": 63700 }, { "epoch": 4.982428738773916, "grad_norm": 0.9096100330352783, "learning_rate": 1.901871043376319e-05, "loss": 0.3995, "step": 63800 }, { "epoch": 4.990238188207732, "grad_norm": 1.202704906463623, "learning_rate": 1.9017147323173115e-05, "loss": 0.3879, "step": 63900 }, { "epoch": 4.998047637641546, "grad_norm": 1.168053150177002, "learning_rate": 1.901558421258304e-05, "loss": 0.4114, "step": 64000 }, { "epoch": 5.005857087075361, "grad_norm": 1.0179202556610107, "learning_rate": 1.9014021101992967e-05, "loss": 0.3913, "step": 64100 }, { "epoch": 5.013666536509176, "grad_norm": 0.8013337850570679, "learning_rate": 1.9012457991402893e-05, "loss": 0.3921, "step": 64200 }, { "epoch": 5.021475985942991, "grad_norm": 0.8554266095161438, "learning_rate": 1.901089488081282e-05, "loss": 0.3974, "step": 64300 }, { "epoch": 5.0292854353768055, "grad_norm": 1.1443554162979126, "learning_rate": 1.9009331770222745e-05, "loss": 0.393, "step": 64400 }, { "epoch": 5.037094884810621, "grad_norm": 0.8980423212051392, "learning_rate": 1.900776865963267e-05, "loss": 0.3925, "step": 64500 }, { "epoch": 5.0449043342444355, "grad_norm": 1.084808111190796, "learning_rate": 1.9006205549042597e-05, "loss": 0.381, "step": 64600 }, { "epoch": 5.052713783678251, "grad_norm": 1.0049347877502441, "learning_rate": 1.9004642438452523e-05, "loss": 0.3963, "step": 64700 }, { "epoch": 5.0605232331120655, "grad_norm": 1.215223789215088, "learning_rate": 1.900307932786245e-05, "loss": 0.3895, "step": 64800 }, { "epoch": 5.06833268254588, "grad_norm": 1.2979555130004883, "learning_rate": 1.9001516217272372e-05, "loss": 0.3983, "step": 64900 }, { "epoch": 5.0761421319796955, "grad_norm": 1.1935662031173706, "learning_rate": 1.8999953106682298e-05, "loss": 0.3923, "step": 65000 }, { "epoch": 5.08395158141351, "grad_norm": 0.8865647912025452, "learning_rate": 1.8998389996092227e-05, "loss": 0.3984, "step": 65100 }, { "epoch": 5.091761030847326, "grad_norm": 1.161111831665039, "learning_rate": 1.899682688550215e-05, "loss": 0.3947, "step": 65200 }, { "epoch": 5.09957048028114, "grad_norm": 1.0658845901489258, "learning_rate": 1.8995263774912076e-05, "loss": 0.3911, "step": 65300 }, { "epoch": 5.107379929714955, "grad_norm": 0.9132626056671143, "learning_rate": 1.8993700664322002e-05, "loss": 0.3786, "step": 65400 }, { "epoch": 5.11518937914877, "grad_norm": 1.2132585048675537, "learning_rate": 1.8992137553731928e-05, "loss": 0.4043, "step": 65500 }, { "epoch": 5.122998828582585, "grad_norm": 1.0975048542022705, "learning_rate": 1.8990574443141854e-05, "loss": 0.3757, "step": 65600 }, { "epoch": 5.1308082780164, "grad_norm": 1.0072981119155884, "learning_rate": 1.898901133255178e-05, "loss": 0.393, "step": 65700 }, { "epoch": 5.138617727450215, "grad_norm": 1.1201173067092896, "learning_rate": 1.8987448221961706e-05, "loss": 0.3813, "step": 65800 }, { "epoch": 5.146427176884029, "grad_norm": 0.9941774010658264, "learning_rate": 1.8985885111371632e-05, "loss": 0.4047, "step": 65900 }, { "epoch": 5.154236626317845, "grad_norm": 1.2513539791107178, "learning_rate": 1.8984322000781558e-05, "loss": 0.409, "step": 66000 }, { "epoch": 5.162046075751659, "grad_norm": 1.330694556236267, "learning_rate": 1.898275889019148e-05, "loss": 0.3809, "step": 66100 }, { "epoch": 5.169855525185475, "grad_norm": 0.9888590574264526, "learning_rate": 1.898119577960141e-05, "loss": 0.3902, "step": 66200 }, { "epoch": 5.177664974619289, "grad_norm": 1.3481968641281128, "learning_rate": 1.8979632669011333e-05, "loss": 0.3861, "step": 66300 }, { "epoch": 5.185474424053104, "grad_norm": 0.9936544299125671, "learning_rate": 1.897806955842126e-05, "loss": 0.3852, "step": 66400 }, { "epoch": 5.193283873486919, "grad_norm": 1.215280294418335, "learning_rate": 1.8976506447831188e-05, "loss": 0.392, "step": 66500 }, { "epoch": 5.201093322920734, "grad_norm": 0.976831316947937, "learning_rate": 1.897494333724111e-05, "loss": 0.3908, "step": 66600 }, { "epoch": 5.208902772354549, "grad_norm": 1.195337176322937, "learning_rate": 1.8973380226651037e-05, "loss": 0.377, "step": 66700 }, { "epoch": 5.216712221788364, "grad_norm": 0.9364585876464844, "learning_rate": 1.8971817116060963e-05, "loss": 0.3947, "step": 66800 }, { "epoch": 5.2245216712221785, "grad_norm": 0.8873243927955627, "learning_rate": 1.897025400547089e-05, "loss": 0.3978, "step": 66900 }, { "epoch": 5.232331120655994, "grad_norm": 0.9752253293991089, "learning_rate": 1.8968690894880815e-05, "loss": 0.3854, "step": 67000 }, { "epoch": 5.2401405700898085, "grad_norm": 0.9990689754486084, "learning_rate": 1.896712778429074e-05, "loss": 0.381, "step": 67100 }, { "epoch": 5.247950019523624, "grad_norm": 1.0154149532318115, "learning_rate": 1.8965564673700663e-05, "loss": 0.3824, "step": 67200 }, { "epoch": 5.2557594689574385, "grad_norm": 1.0053505897521973, "learning_rate": 1.8964001563110593e-05, "loss": 0.389, "step": 67300 }, { "epoch": 5.263568918391253, "grad_norm": 1.27951979637146, "learning_rate": 1.896243845252052e-05, "loss": 0.3934, "step": 67400 }, { "epoch": 5.2713783678250685, "grad_norm": 1.0939563512802124, "learning_rate": 1.896087534193044e-05, "loss": 0.3824, "step": 67500 }, { "epoch": 5.279187817258883, "grad_norm": 1.247258186340332, "learning_rate": 1.895931223134037e-05, "loss": 0.3741, "step": 67600 }, { "epoch": 5.2869972666926985, "grad_norm": 0.9081011414527893, "learning_rate": 1.8957749120750293e-05, "loss": 0.3925, "step": 67700 }, { "epoch": 5.294806716126513, "grad_norm": 1.0813019275665283, "learning_rate": 1.895618601016022e-05, "loss": 0.3797, "step": 67800 }, { "epoch": 5.302616165560328, "grad_norm": 1.2039158344268799, "learning_rate": 1.8954622899570145e-05, "loss": 0.3963, "step": 67900 }, { "epoch": 5.310425614994143, "grad_norm": 1.1755808591842651, "learning_rate": 1.895305978898007e-05, "loss": 0.3943, "step": 68000 }, { "epoch": 5.318235064427958, "grad_norm": 1.0200046300888062, "learning_rate": 1.8951496678389997e-05, "loss": 0.3553, "step": 68100 }, { "epoch": 5.326044513861773, "grad_norm": 1.0487428903579712, "learning_rate": 1.8949933567799923e-05, "loss": 0.3786, "step": 68200 }, { "epoch": 5.333853963295588, "grad_norm": 0.8831790685653687, "learning_rate": 1.894837045720985e-05, "loss": 0.4052, "step": 68300 }, { "epoch": 5.341663412729402, "grad_norm": 1.000813603401184, "learning_rate": 1.8946807346619775e-05, "loss": 0.3923, "step": 68400 }, { "epoch": 5.349472862163218, "grad_norm": 1.2624138593673706, "learning_rate": 1.89452442360297e-05, "loss": 0.3763, "step": 68500 }, { "epoch": 5.357282311597032, "grad_norm": 1.0321928262710571, "learning_rate": 1.8943696756545527e-05, "loss": 0.3906, "step": 68600 }, { "epoch": 5.365091761030848, "grad_norm": 0.8170016407966614, "learning_rate": 1.8942133645955453e-05, "loss": 0.3677, "step": 68700 }, { "epoch": 5.372901210464662, "grad_norm": 1.0419316291809082, "learning_rate": 1.894057053536538e-05, "loss": 0.3936, "step": 68800 }, { "epoch": 5.380710659898477, "grad_norm": 1.0884121656417847, "learning_rate": 1.8939007424775305e-05, "loss": 0.3662, "step": 68900 }, { "epoch": 5.388520109332292, "grad_norm": 1.6693960428237915, "learning_rate": 1.8937444314185228e-05, "loss": 0.3795, "step": 69000 }, { "epoch": 5.396329558766107, "grad_norm": 0.8068119883537292, "learning_rate": 1.8935881203595157e-05, "loss": 0.3701, "step": 69100 }, { "epoch": 5.404139008199921, "grad_norm": 1.0311602354049683, "learning_rate": 1.8934318093005083e-05, "loss": 0.3643, "step": 69200 }, { "epoch": 5.411948457633737, "grad_norm": 0.9586812853813171, "learning_rate": 1.8932754982415006e-05, "loss": 0.3765, "step": 69300 }, { "epoch": 5.4197579070675515, "grad_norm": 1.1380528211593628, "learning_rate": 1.8931191871824932e-05, "loss": 0.3875, "step": 69400 }, { "epoch": 5.427567356501367, "grad_norm": 0.8221355080604553, "learning_rate": 1.8929628761234858e-05, "loss": 0.3686, "step": 69500 }, { "epoch": 5.4353768059351815, "grad_norm": 1.1208763122558594, "learning_rate": 1.8928065650644784e-05, "loss": 0.3808, "step": 69600 }, { "epoch": 5.443186255368996, "grad_norm": 0.9474813342094421, "learning_rate": 1.892650254005471e-05, "loss": 0.3809, "step": 69700 }, { "epoch": 5.4509957048028115, "grad_norm": 1.1498290300369263, "learning_rate": 1.8924939429464636e-05, "loss": 0.3921, "step": 69800 }, { "epoch": 5.458805154236626, "grad_norm": 1.1263091564178467, "learning_rate": 1.8923376318874562e-05, "loss": 0.3893, "step": 69900 }, { "epoch": 5.4666146036704415, "grad_norm": 1.2363600730895996, "learning_rate": 1.8921813208284488e-05, "loss": 0.3848, "step": 70000 }, { "epoch": 5.474424053104256, "grad_norm": 0.9119688868522644, "learning_rate": 1.8920250097694414e-05, "loss": 0.3778, "step": 70100 }, { "epoch": 5.482233502538071, "grad_norm": 1.0323618650436401, "learning_rate": 1.891868698710434e-05, "loss": 0.3777, "step": 70200 }, { "epoch": 5.490042951971886, "grad_norm": 1.2179909944534302, "learning_rate": 1.8917123876514266e-05, "loss": 0.3782, "step": 70300 }, { "epoch": 5.497852401405701, "grad_norm": 1.1143534183502197, "learning_rate": 1.891556076592419e-05, "loss": 0.3746, "step": 70400 }, { "epoch": 5.505661850839516, "grad_norm": 0.9338456392288208, "learning_rate": 1.8913997655334115e-05, "loss": 0.3636, "step": 70500 }, { "epoch": 5.513471300273331, "grad_norm": 1.06593918800354, "learning_rate": 1.8912434544744044e-05, "loss": 0.3818, "step": 70600 }, { "epoch": 5.521280749707145, "grad_norm": 1.015339970588684, "learning_rate": 1.8910871434153967e-05, "loss": 0.376, "step": 70700 }, { "epoch": 5.529090199140961, "grad_norm": 1.016461968421936, "learning_rate": 1.8909308323563893e-05, "loss": 0.3759, "step": 70800 }, { "epoch": 5.536899648574775, "grad_norm": 0.9616903066635132, "learning_rate": 1.890774521297382e-05, "loss": 0.3663, "step": 70900 }, { "epoch": 5.544709098008591, "grad_norm": 1.0220727920532227, "learning_rate": 1.8906182102383745e-05, "loss": 0.385, "step": 71000 }, { "epoch": 5.552518547442405, "grad_norm": 1.0261396169662476, "learning_rate": 1.890461899179367e-05, "loss": 0.3859, "step": 71100 }, { "epoch": 5.56032799687622, "grad_norm": 0.9234685897827148, "learning_rate": 1.8903055881203597e-05, "loss": 0.3743, "step": 71200 }, { "epoch": 5.568137446310035, "grad_norm": 1.1062054634094238, "learning_rate": 1.8901492770613523e-05, "loss": 0.3909, "step": 71300 }, { "epoch": 5.57594689574385, "grad_norm": 0.9592029452323914, "learning_rate": 1.889992966002345e-05, "loss": 0.3788, "step": 71400 }, { "epoch": 5.583756345177665, "grad_norm": 1.0946000814437866, "learning_rate": 1.8898366549433375e-05, "loss": 0.392, "step": 71500 }, { "epoch": 5.59156579461148, "grad_norm": 0.8922383189201355, "learning_rate": 1.88968034388433e-05, "loss": 0.3674, "step": 71600 }, { "epoch": 5.599375244045294, "grad_norm": 1.0789119005203247, "learning_rate": 1.8895240328253227e-05, "loss": 0.3652, "step": 71700 }, { "epoch": 5.60718469347911, "grad_norm": 1.132554292678833, "learning_rate": 1.889367721766315e-05, "loss": 0.3624, "step": 71800 }, { "epoch": 5.614994142912924, "grad_norm": 0.995639979839325, "learning_rate": 1.8892114107073076e-05, "loss": 0.3604, "step": 71900 }, { "epoch": 5.62280359234674, "grad_norm": 0.9848433136940002, "learning_rate": 1.8890550996483005e-05, "loss": 0.3648, "step": 72000 }, { "epoch": 5.630613041780554, "grad_norm": 1.3442597389221191, "learning_rate": 1.8888987885892928e-05, "loss": 0.3723, "step": 72100 }, { "epoch": 5.638422491214369, "grad_norm": 1.3660664558410645, "learning_rate": 1.8887424775302854e-05, "loss": 0.3633, "step": 72200 }, { "epoch": 5.6462319406481845, "grad_norm": 1.0371010303497314, "learning_rate": 1.888586166471278e-05, "loss": 0.3651, "step": 72300 }, { "epoch": 5.654041390081999, "grad_norm": 1.0066925287246704, "learning_rate": 1.8884298554122706e-05, "loss": 0.379, "step": 72400 }, { "epoch": 5.6618508395158145, "grad_norm": 1.2986984252929688, "learning_rate": 1.888273544353263e-05, "loss": 0.376, "step": 72500 }, { "epoch": 5.669660288949629, "grad_norm": 0.9885306358337402, "learning_rate": 1.8881187964048458e-05, "loss": 0.3772, "step": 72600 }, { "epoch": 5.677469738383444, "grad_norm": 0.8843153119087219, "learning_rate": 1.8879624853458384e-05, "loss": 0.376, "step": 72700 }, { "epoch": 5.685279187817259, "grad_norm": 1.0636779069900513, "learning_rate": 1.887806174286831e-05, "loss": 0.3726, "step": 72800 }, { "epoch": 5.693088637251074, "grad_norm": 0.9811561107635498, "learning_rate": 1.8876498632278236e-05, "loss": 0.3711, "step": 72900 }, { "epoch": 5.700898086684889, "grad_norm": 1.421799659729004, "learning_rate": 1.887493552168816e-05, "loss": 0.3763, "step": 73000 }, { "epoch": 5.708707536118704, "grad_norm": 1.3165417909622192, "learning_rate": 1.8873372411098088e-05, "loss": 0.3686, "step": 73100 }, { "epoch": 5.716516985552518, "grad_norm": 0.988136887550354, "learning_rate": 1.8871809300508014e-05, "loss": 0.3796, "step": 73200 }, { "epoch": 5.724326434986334, "grad_norm": 0.8610849976539612, "learning_rate": 1.8870246189917936e-05, "loss": 0.3652, "step": 73300 }, { "epoch": 5.732135884420148, "grad_norm": 1.2296196222305298, "learning_rate": 1.8868683079327862e-05, "loss": 0.3879, "step": 73400 }, { "epoch": 5.739945333853964, "grad_norm": 1.0265551805496216, "learning_rate": 1.8867119968737792e-05, "loss": 0.3806, "step": 73500 }, { "epoch": 5.747754783287778, "grad_norm": 0.9798378944396973, "learning_rate": 1.8865556858147714e-05, "loss": 0.3747, "step": 73600 }, { "epoch": 5.755564232721593, "grad_norm": 1.0334645509719849, "learning_rate": 1.886399374755764e-05, "loss": 0.3625, "step": 73700 }, { "epoch": 5.763373682155408, "grad_norm": 0.8765638470649719, "learning_rate": 1.886244626807347e-05, "loss": 0.3622, "step": 73800 }, { "epoch": 5.771183131589223, "grad_norm": 0.7884292602539062, "learning_rate": 1.8860883157483392e-05, "loss": 0.3673, "step": 73900 }, { "epoch": 5.778992581023038, "grad_norm": 1.13102388381958, "learning_rate": 1.8859320046893318e-05, "loss": 0.3713, "step": 74000 }, { "epoch": 5.786802030456853, "grad_norm": 1.0099087953567505, "learning_rate": 1.8857756936303244e-05, "loss": 0.3593, "step": 74100 }, { "epoch": 5.794611479890667, "grad_norm": 1.127163290977478, "learning_rate": 1.885619382571317e-05, "loss": 0.3801, "step": 74200 }, { "epoch": 5.802420929324483, "grad_norm": 1.2953687906265259, "learning_rate": 1.8854630715123096e-05, "loss": 0.3882, "step": 74300 }, { "epoch": 5.810230378758297, "grad_norm": 1.0948154926300049, "learning_rate": 1.8853067604533022e-05, "loss": 0.3704, "step": 74400 }, { "epoch": 5.818039828192113, "grad_norm": 1.1528011560440063, "learning_rate": 1.885150449394295e-05, "loss": 0.3705, "step": 74500 }, { "epoch": 5.825849277625927, "grad_norm": 1.071085810661316, "learning_rate": 1.8849941383352874e-05, "loss": 0.3593, "step": 74600 }, { "epoch": 5.833658727059742, "grad_norm": 0.8655598163604736, "learning_rate": 1.88483782727628e-05, "loss": 0.3638, "step": 74700 }, { "epoch": 5.841468176493557, "grad_norm": 0.6879515051841736, "learning_rate": 1.8846815162172723e-05, "loss": 0.3688, "step": 74800 }, { "epoch": 5.849277625927372, "grad_norm": 1.0214784145355225, "learning_rate": 1.8845252051582652e-05, "loss": 0.3496, "step": 74900 }, { "epoch": 5.8570870753611874, "grad_norm": 1.1457356214523315, "learning_rate": 1.884368894099258e-05, "loss": 0.3651, "step": 75000 }, { "epoch": 5.864896524795002, "grad_norm": 0.903660774230957, "learning_rate": 1.88421258304025e-05, "loss": 0.3762, "step": 75100 }, { "epoch": 5.872705974228817, "grad_norm": 0.9821125864982605, "learning_rate": 1.8840562719812427e-05, "loss": 0.3595, "step": 75200 }, { "epoch": 5.880515423662632, "grad_norm": 0.7476074695587158, "learning_rate": 1.8838999609222356e-05, "loss": 0.3775, "step": 75300 }, { "epoch": 5.888324873096447, "grad_norm": 1.0855532884597778, "learning_rate": 1.883743649863228e-05, "loss": 0.3659, "step": 75400 }, { "epoch": 5.896134322530262, "grad_norm": 0.9092568755149841, "learning_rate": 1.8835873388042205e-05, "loss": 0.3722, "step": 75500 }, { "epoch": 5.903943771964077, "grad_norm": 1.1002038717269897, "learning_rate": 1.883431027745213e-05, "loss": 0.3722, "step": 75600 }, { "epoch": 5.911753221397891, "grad_norm": 1.1699854135513306, "learning_rate": 1.8832747166862057e-05, "loss": 0.3671, "step": 75700 }, { "epoch": 5.919562670831707, "grad_norm": 0.9368227124214172, "learning_rate": 1.8831184056271983e-05, "loss": 0.3739, "step": 75800 }, { "epoch": 5.927372120265521, "grad_norm": 0.9039379358291626, "learning_rate": 1.882962094568191e-05, "loss": 0.3704, "step": 75900 }, { "epoch": 5.935181569699337, "grad_norm": 1.0861670970916748, "learning_rate": 1.8828057835091835e-05, "loss": 0.361, "step": 76000 }, { "epoch": 5.942991019133151, "grad_norm": 0.8491230607032776, "learning_rate": 1.882649472450176e-05, "loss": 0.3603, "step": 76100 }, { "epoch": 5.950800468566966, "grad_norm": 0.9066652059555054, "learning_rate": 1.8824931613911687e-05, "loss": 0.3529, "step": 76200 }, { "epoch": 5.958609918000781, "grad_norm": 1.0582003593444824, "learning_rate": 1.882336850332161e-05, "loss": 0.355, "step": 76300 }, { "epoch": 5.966419367434596, "grad_norm": 0.9515270590782166, "learning_rate": 1.882180539273154e-05, "loss": 0.3641, "step": 76400 }, { "epoch": 5.974228816868411, "grad_norm": 0.9640957117080688, "learning_rate": 1.8820242282141462e-05, "loss": 0.3628, "step": 76500 }, { "epoch": 5.982038266302226, "grad_norm": 1.3416815996170044, "learning_rate": 1.8818679171551388e-05, "loss": 0.3406, "step": 76600 }, { "epoch": 5.98984771573604, "grad_norm": 0.8273581266403198, "learning_rate": 1.8817131692067214e-05, "loss": 0.3539, "step": 76700 }, { "epoch": 5.997657165169856, "grad_norm": 0.769110918045044, "learning_rate": 1.8815568581477143e-05, "loss": 0.3748, "step": 76800 }, { "epoch": 6.00546661460367, "grad_norm": 1.0953476428985596, "learning_rate": 1.8814005470887066e-05, "loss": 0.358, "step": 76900 }, { "epoch": 6.013276064037485, "grad_norm": 1.1051218509674072, "learning_rate": 1.8812442360296992e-05, "loss": 0.3608, "step": 77000 }, { "epoch": 6.0210855134713, "grad_norm": 0.9927105903625488, "learning_rate": 1.8810879249706918e-05, "loss": 0.3529, "step": 77100 }, { "epoch": 6.028894962905115, "grad_norm": 1.1750438213348389, "learning_rate": 1.8809316139116844e-05, "loss": 0.362, "step": 77200 }, { "epoch": 6.03670441233893, "grad_norm": 0.902812123298645, "learning_rate": 1.880775302852677e-05, "loss": 0.3606, "step": 77300 }, { "epoch": 6.044513861772745, "grad_norm": 0.9053332209587097, "learning_rate": 1.8806189917936696e-05, "loss": 0.3667, "step": 77400 }, { "epoch": 6.0523233112065595, "grad_norm": 1.2229048013687134, "learning_rate": 1.8804626807346622e-05, "loss": 0.3665, "step": 77500 }, { "epoch": 6.060132760640375, "grad_norm": 0.9037619829177856, "learning_rate": 1.8803063696756548e-05, "loss": 0.3526, "step": 77600 }, { "epoch": 6.0679422100741895, "grad_norm": 1.2268940210342407, "learning_rate": 1.8801500586166474e-05, "loss": 0.3654, "step": 77700 }, { "epoch": 6.075751659508005, "grad_norm": 1.1816377639770508, "learning_rate": 1.8799937475576396e-05, "loss": 0.3538, "step": 77800 }, { "epoch": 6.08356110894182, "grad_norm": 1.0596685409545898, "learning_rate": 1.8798374364986326e-05, "loss": 0.3378, "step": 77900 }, { "epoch": 6.091370558375634, "grad_norm": 0.9817838668823242, "learning_rate": 1.879681125439625e-05, "loss": 0.3522, "step": 78000 }, { "epoch": 6.09918000780945, "grad_norm": 0.9671791791915894, "learning_rate": 1.8795248143806175e-05, "loss": 0.3695, "step": 78100 }, { "epoch": 6.106989457243264, "grad_norm": 0.9326332211494446, "learning_rate": 1.8793685033216104e-05, "loss": 0.3712, "step": 78200 }, { "epoch": 6.11479890667708, "grad_norm": 0.8805835247039795, "learning_rate": 1.8792121922626027e-05, "loss": 0.3694, "step": 78300 }, { "epoch": 6.122608356110894, "grad_norm": 0.9570161700248718, "learning_rate": 1.8790558812035953e-05, "loss": 0.3509, "step": 78400 }, { "epoch": 6.130417805544709, "grad_norm": 0.7668564319610596, "learning_rate": 1.878899570144588e-05, "loss": 0.3582, "step": 78500 }, { "epoch": 6.138227254978524, "grad_norm": 1.0919289588928223, "learning_rate": 1.8787432590855805e-05, "loss": 0.3492, "step": 78600 }, { "epoch": 6.146036704412339, "grad_norm": 1.092902421951294, "learning_rate": 1.878586948026573e-05, "loss": 0.354, "step": 78700 }, { "epoch": 6.153846153846154, "grad_norm": 1.1573113203048706, "learning_rate": 1.8784306369675657e-05, "loss": 0.3553, "step": 78800 }, { "epoch": 6.161655603279969, "grad_norm": 1.2066175937652588, "learning_rate": 1.8782743259085583e-05, "loss": 0.338, "step": 78900 }, { "epoch": 6.169465052713783, "grad_norm": 0.9877097010612488, "learning_rate": 1.878118014849551e-05, "loss": 0.3546, "step": 79000 }, { "epoch": 6.177274502147599, "grad_norm": 1.2525135278701782, "learning_rate": 1.8779632669011335e-05, "loss": 0.3601, "step": 79100 }, { "epoch": 6.185083951581413, "grad_norm": 1.107706069946289, "learning_rate": 1.877806955842126e-05, "loss": 0.3737, "step": 79200 }, { "epoch": 6.192893401015229, "grad_norm": 1.1282185316085815, "learning_rate": 1.8776506447831187e-05, "loss": 0.3628, "step": 79300 }, { "epoch": 6.200702850449043, "grad_norm": 1.0635756254196167, "learning_rate": 1.8774943337241113e-05, "loss": 0.3492, "step": 79400 }, { "epoch": 6.208512299882858, "grad_norm": 1.1150610446929932, "learning_rate": 1.8773380226651035e-05, "loss": 0.3581, "step": 79500 }, { "epoch": 6.216321749316673, "grad_norm": 1.0393744707107544, "learning_rate": 1.877181711606096e-05, "loss": 0.3519, "step": 79600 }, { "epoch": 6.224131198750488, "grad_norm": 1.285361647605896, "learning_rate": 1.877025400547089e-05, "loss": 0.3533, "step": 79700 }, { "epoch": 6.231940648184303, "grad_norm": 1.00641930103302, "learning_rate": 1.8768690894880813e-05, "loss": 0.3625, "step": 79800 }, { "epoch": 6.239750097618118, "grad_norm": 1.0835678577423096, "learning_rate": 1.876712778429074e-05, "loss": 0.353, "step": 79900 }, { "epoch": 6.2475595470519325, "grad_norm": 1.0340200662612915, "learning_rate": 1.8765564673700665e-05, "loss": 0.3632, "step": 80000 }, { "epoch": 6.255368996485748, "grad_norm": 0.8271421790122986, "learning_rate": 1.876400156311059e-05, "loss": 0.3527, "step": 80100 }, { "epoch": 6.2631784459195625, "grad_norm": 1.2229000329971313, "learning_rate": 1.8762438452520517e-05, "loss": 0.3622, "step": 80200 }, { "epoch": 6.270987895353378, "grad_norm": 1.1166315078735352, "learning_rate": 1.8760875341930443e-05, "loss": 0.348, "step": 80300 }, { "epoch": 6.2787973447871925, "grad_norm": 0.9366671442985535, "learning_rate": 1.875931223134037e-05, "loss": 0.343, "step": 80400 }, { "epoch": 6.286606794221007, "grad_norm": 0.799643874168396, "learning_rate": 1.8757749120750295e-05, "loss": 0.3418, "step": 80500 }, { "epoch": 6.2944162436548226, "grad_norm": 1.2833951711654663, "learning_rate": 1.875618601016022e-05, "loss": 0.3575, "step": 80600 }, { "epoch": 6.302225693088637, "grad_norm": 1.0523055791854858, "learning_rate": 1.8754622899570144e-05, "loss": 0.3582, "step": 80700 }, { "epoch": 6.310035142522453, "grad_norm": 0.8948925733566284, "learning_rate": 1.8753059788980073e-05, "loss": 0.3533, "step": 80800 }, { "epoch": 6.317844591956267, "grad_norm": 0.8339662551879883, "learning_rate": 1.8751496678389996e-05, "loss": 0.3484, "step": 80900 }, { "epoch": 6.325654041390082, "grad_norm": 1.0875813961029053, "learning_rate": 1.8749933567799922e-05, "loss": 0.3504, "step": 81000 }, { "epoch": 6.333463490823897, "grad_norm": 0.8726950883865356, "learning_rate": 1.874837045720985e-05, "loss": 0.3666, "step": 81100 }, { "epoch": 6.341272940257712, "grad_norm": 0.8508704900741577, "learning_rate": 1.8746807346619774e-05, "loss": 0.3587, "step": 81200 }, { "epoch": 6.349082389691527, "grad_norm": 1.074478030204773, "learning_rate": 1.87452442360297e-05, "loss": 0.3668, "step": 81300 }, { "epoch": 6.356891839125342, "grad_norm": 0.9604992866516113, "learning_rate": 1.8743681125439626e-05, "loss": 0.3254, "step": 81400 }, { "epoch": 6.364701288559156, "grad_norm": 1.0126805305480957, "learning_rate": 1.8742118014849552e-05, "loss": 0.3337, "step": 81500 }, { "epoch": 6.372510737992972, "grad_norm": 1.1412858963012695, "learning_rate": 1.8740554904259478e-05, "loss": 0.3438, "step": 81600 }, { "epoch": 6.380320187426786, "grad_norm": 0.9028871655464172, "learning_rate": 1.8738991793669404e-05, "loss": 0.3522, "step": 81700 }, { "epoch": 6.388129636860601, "grad_norm": 1.0400549173355103, "learning_rate": 1.873742868307933e-05, "loss": 0.3572, "step": 81800 }, { "epoch": 6.395939086294416, "grad_norm": 1.0760631561279297, "learning_rate": 1.8735865572489256e-05, "loss": 0.3318, "step": 81900 }, { "epoch": 6.403748535728231, "grad_norm": 1.1983120441436768, "learning_rate": 1.8734302461899182e-05, "loss": 0.3498, "step": 82000 }, { "epoch": 6.411557985162046, "grad_norm": 1.2458655834197998, "learning_rate": 1.8732739351309105e-05, "loss": 0.3351, "step": 82100 }, { "epoch": 6.419367434595861, "grad_norm": 0.966160237789154, "learning_rate": 1.8731176240719034e-05, "loss": 0.3362, "step": 82200 }, { "epoch": 6.4271768840296755, "grad_norm": 1.1860681772232056, "learning_rate": 1.872961313012896e-05, "loss": 0.3535, "step": 82300 }, { "epoch": 6.434986333463491, "grad_norm": 1.1215561628341675, "learning_rate": 1.8728050019538883e-05, "loss": 0.3474, "step": 82400 }, { "epoch": 6.4427957828973055, "grad_norm": 1.008365273475647, "learning_rate": 1.872648690894881e-05, "loss": 0.3378, "step": 82500 }, { "epoch": 6.450605232331121, "grad_norm": 1.243260145187378, "learning_rate": 1.8724923798358735e-05, "loss": 0.3516, "step": 82600 }, { "epoch": 6.4584146817649355, "grad_norm": 0.759389340877533, "learning_rate": 1.872336068776866e-05, "loss": 0.3533, "step": 82700 }, { "epoch": 6.46622413119875, "grad_norm": 1.0799552202224731, "learning_rate": 1.8721797577178587e-05, "loss": 0.3433, "step": 82800 }, { "epoch": 6.4740335806325655, "grad_norm": 1.1244173049926758, "learning_rate": 1.8720234466588513e-05, "loss": 0.3394, "step": 82900 }, { "epoch": 6.48184303006638, "grad_norm": 1.2120684385299683, "learning_rate": 1.871868698710434e-05, "loss": 0.3589, "step": 83000 }, { "epoch": 6.4896524795001955, "grad_norm": 0.8704874515533447, "learning_rate": 1.8717123876514265e-05, "loss": 0.3509, "step": 83100 }, { "epoch": 6.49746192893401, "grad_norm": 0.8718099594116211, "learning_rate": 1.871556076592419e-05, "loss": 0.3298, "step": 83200 }, { "epoch": 6.505271378367825, "grad_norm": 1.084702968597412, "learning_rate": 1.8713997655334117e-05, "loss": 0.3418, "step": 83300 }, { "epoch": 6.51308082780164, "grad_norm": 1.082650065422058, "learning_rate": 1.8712434544744043e-05, "loss": 0.3347, "step": 83400 }, { "epoch": 6.520890277235455, "grad_norm": 0.9728855490684509, "learning_rate": 1.871087143415397e-05, "loss": 0.347, "step": 83500 }, { "epoch": 6.52869972666927, "grad_norm": 0.9495226144790649, "learning_rate": 1.870930832356389e-05, "loss": 0.3525, "step": 83600 }, { "epoch": 6.536509176103085, "grad_norm": 1.1406092643737793, "learning_rate": 1.870774521297382e-05, "loss": 0.3496, "step": 83700 }, { "epoch": 6.544318625536899, "grad_norm": 1.1625540256500244, "learning_rate": 1.8706182102383747e-05, "loss": 0.3548, "step": 83800 }, { "epoch": 6.552128074970715, "grad_norm": 1.0920753479003906, "learning_rate": 1.870461899179367e-05, "loss": 0.3521, "step": 83900 }, { "epoch": 6.559937524404529, "grad_norm": 1.1663368940353394, "learning_rate": 1.8703055881203595e-05, "loss": 0.3556, "step": 84000 }, { "epoch": 6.567746973838345, "grad_norm": 0.8520887494087219, "learning_rate": 1.870149277061352e-05, "loss": 0.3228, "step": 84100 }, { "epoch": 6.575556423272159, "grad_norm": 0.9098398089408875, "learning_rate": 1.8699929660023447e-05, "loss": 0.3316, "step": 84200 }, { "epoch": 6.583365872705974, "grad_norm": 1.0251222848892212, "learning_rate": 1.8698366549433373e-05, "loss": 0.332, "step": 84300 }, { "epoch": 6.591175322139789, "grad_norm": 1.014219880104065, "learning_rate": 1.86968034388433e-05, "loss": 0.3621, "step": 84400 }, { "epoch": 6.598984771573604, "grad_norm": 0.9549919366836548, "learning_rate": 1.8695240328253226e-05, "loss": 0.3337, "step": 84500 }, { "epoch": 6.606794221007419, "grad_norm": 1.220858097076416, "learning_rate": 1.869367721766315e-05, "loss": 0.3476, "step": 84600 }, { "epoch": 6.614603670441234, "grad_norm": 1.041111946105957, "learning_rate": 1.8692114107073078e-05, "loss": 0.3322, "step": 84700 }, { "epoch": 6.622413119875048, "grad_norm": 1.009423017501831, "learning_rate": 1.8690550996483004e-05, "loss": 0.3638, "step": 84800 }, { "epoch": 6.630222569308864, "grad_norm": 0.9405770897865295, "learning_rate": 1.868898788589293e-05, "loss": 0.3493, "step": 84900 }, { "epoch": 6.6380320187426785, "grad_norm": 1.2056901454925537, "learning_rate": 1.8687424775302852e-05, "loss": 0.3566, "step": 85000 }, { "epoch": 6.645841468176494, "grad_norm": 0.9662819504737854, "learning_rate": 1.8685861664712778e-05, "loss": 0.3394, "step": 85100 }, { "epoch": 6.6536509176103085, "grad_norm": 1.293601155281067, "learning_rate": 1.8684298554122708e-05, "loss": 0.3166, "step": 85200 }, { "epoch": 6.661460367044123, "grad_norm": 0.8332231640815735, "learning_rate": 1.868273544353263e-05, "loss": 0.3384, "step": 85300 }, { "epoch": 6.6692698164779385, "grad_norm": 1.139176368713379, "learning_rate": 1.8681172332942556e-05, "loss": 0.3368, "step": 85400 }, { "epoch": 6.677079265911753, "grad_norm": 1.0874099731445312, "learning_rate": 1.8679609222352482e-05, "loss": 0.3493, "step": 85500 }, { "epoch": 6.6848887153455685, "grad_norm": 1.0646657943725586, "learning_rate": 1.8678046111762408e-05, "loss": 0.3465, "step": 85600 }, { "epoch": 6.692698164779383, "grad_norm": 0.8554918169975281, "learning_rate": 1.8676483001172334e-05, "loss": 0.3462, "step": 85700 }, { "epoch": 6.700507614213198, "grad_norm": 1.0210607051849365, "learning_rate": 1.867491989058226e-05, "loss": 0.341, "step": 85800 }, { "epoch": 6.708317063647013, "grad_norm": 0.9455381035804749, "learning_rate": 1.8673356779992186e-05, "loss": 0.3463, "step": 85900 }, { "epoch": 6.716126513080828, "grad_norm": 0.9599422216415405, "learning_rate": 1.8671793669402112e-05, "loss": 0.3549, "step": 86000 }, { "epoch": 6.723935962514643, "grad_norm": 0.9664759635925293, "learning_rate": 1.8670230558812038e-05, "loss": 0.3192, "step": 86100 }, { "epoch": 6.731745411948458, "grad_norm": 0.8134546279907227, "learning_rate": 1.8668683079327864e-05, "loss": 0.3343, "step": 86200 }, { "epoch": 6.739554861382272, "grad_norm": 1.147776484489441, "learning_rate": 1.866711996873779e-05, "loss": 0.3519, "step": 86300 }, { "epoch": 6.747364310816088, "grad_norm": 1.2324354648590088, "learning_rate": 1.8665556858147716e-05, "loss": 0.3353, "step": 86400 }, { "epoch": 6.755173760249902, "grad_norm": 1.0100579261779785, "learning_rate": 1.866399374755764e-05, "loss": 0.3516, "step": 86500 }, { "epoch": 6.762983209683718, "grad_norm": 1.2178475856781006, "learning_rate": 1.8662430636967568e-05, "loss": 0.3475, "step": 86600 }, { "epoch": 6.770792659117532, "grad_norm": 1.2060697078704834, "learning_rate": 1.8660867526377494e-05, "loss": 0.3363, "step": 86700 }, { "epoch": 6.778602108551347, "grad_norm": 0.8602226376533508, "learning_rate": 1.8659304415787417e-05, "loss": 0.3435, "step": 86800 }, { "epoch": 6.786411557985162, "grad_norm": 1.0413028001785278, "learning_rate": 1.8657741305197343e-05, "loss": 0.3349, "step": 86900 }, { "epoch": 6.794221007418977, "grad_norm": 0.7829965353012085, "learning_rate": 1.8656178194607272e-05, "loss": 0.3344, "step": 87000 }, { "epoch": 6.802030456852792, "grad_norm": 1.249098777770996, "learning_rate": 1.8654615084017195e-05, "loss": 0.3507, "step": 87100 }, { "epoch": 6.809839906286607, "grad_norm": 1.135725975036621, "learning_rate": 1.865305197342712e-05, "loss": 0.3376, "step": 87200 }, { "epoch": 6.817649355720421, "grad_norm": 0.9486784934997559, "learning_rate": 1.8651488862837047e-05, "loss": 0.3389, "step": 87300 }, { "epoch": 6.825458805154237, "grad_norm": 1.0629407167434692, "learning_rate": 1.8649925752246973e-05, "loss": 0.3379, "step": 87400 }, { "epoch": 6.833268254588051, "grad_norm": 1.0095854997634888, "learning_rate": 1.86483626416569e-05, "loss": 0.3235, "step": 87500 }, { "epoch": 6.841077704021867, "grad_norm": 0.8875960111618042, "learning_rate": 1.8646799531066825e-05, "loss": 0.3326, "step": 87600 }, { "epoch": 6.848887153455681, "grad_norm": 1.103423833847046, "learning_rate": 1.864523642047675e-05, "loss": 0.3309, "step": 87700 }, { "epoch": 6.856696602889496, "grad_norm": 0.9338161945343018, "learning_rate": 1.8643673309886677e-05, "loss": 0.3386, "step": 87800 }, { "epoch": 6.8645060523233115, "grad_norm": 1.395498514175415, "learning_rate": 1.8642110199296603e-05, "loss": 0.3509, "step": 87900 }, { "epoch": 6.872315501757126, "grad_norm": 0.8791838884353638, "learning_rate": 1.8640547088706526e-05, "loss": 0.3465, "step": 88000 }, { "epoch": 6.8801249511909415, "grad_norm": 1.1168427467346191, "learning_rate": 1.8638983978116455e-05, "loss": 0.3387, "step": 88100 }, { "epoch": 6.887934400624756, "grad_norm": 0.8936362266540527, "learning_rate": 1.8637420867526378e-05, "loss": 0.3504, "step": 88200 }, { "epoch": 6.895743850058571, "grad_norm": 0.7844634056091309, "learning_rate": 1.8635857756936304e-05, "loss": 0.3163, "step": 88300 }, { "epoch": 6.903553299492386, "grad_norm": 1.28450608253479, "learning_rate": 1.8634294646346233e-05, "loss": 0.3283, "step": 88400 }, { "epoch": 6.911362748926201, "grad_norm": 1.0096105337142944, "learning_rate": 1.8632731535756156e-05, "loss": 0.329, "step": 88500 }, { "epoch": 6.919172198360016, "grad_norm": 1.2057095766067505, "learning_rate": 1.8631168425166082e-05, "loss": 0.3366, "step": 88600 }, { "epoch": 6.926981647793831, "grad_norm": 1.0522247552871704, "learning_rate": 1.8629605314576008e-05, "loss": 0.3173, "step": 88700 }, { "epoch": 6.934791097227645, "grad_norm": 0.8380939364433289, "learning_rate": 1.8628042203985934e-05, "loss": 0.3254, "step": 88800 }, { "epoch": 6.942600546661461, "grad_norm": 1.1132458448410034, "learning_rate": 1.862647909339586e-05, "loss": 0.3349, "step": 88900 }, { "epoch": 6.950409996095275, "grad_norm": 1.0412805080413818, "learning_rate": 1.8624915982805786e-05, "loss": 0.3446, "step": 89000 }, { "epoch": 6.958219445529091, "grad_norm": 0.7790032625198364, "learning_rate": 1.862335287221571e-05, "loss": 0.3393, "step": 89100 }, { "epoch": 6.966028894962905, "grad_norm": 1.0535982847213745, "learning_rate": 1.8621789761625638e-05, "loss": 0.3344, "step": 89200 }, { "epoch": 6.97383834439672, "grad_norm": 0.9829431176185608, "learning_rate": 1.8620226651035564e-05, "loss": 0.3285, "step": 89300 }, { "epoch": 6.981647793830535, "grad_norm": 1.049845576286316, "learning_rate": 1.8618663540445486e-05, "loss": 0.3249, "step": 89400 }, { "epoch": 6.98945724326435, "grad_norm": 1.01344633102417, "learning_rate": 1.8617100429855416e-05, "loss": 0.3284, "step": 89500 }, { "epoch": 6.997266692698165, "grad_norm": 1.1877566576004028, "learning_rate": 1.861553731926534e-05, "loss": 0.3347, "step": 89600 }, { "epoch": 7.00507614213198, "grad_norm": 0.8533827662467957, "learning_rate": 1.8613974208675264e-05, "loss": 0.3322, "step": 89700 }, { "epoch": 7.012885591565794, "grad_norm": 1.1142549514770508, "learning_rate": 1.861241109808519e-05, "loss": 0.3229, "step": 89800 }, { "epoch": 7.02069504099961, "grad_norm": 1.085334062576294, "learning_rate": 1.8610847987495116e-05, "loss": 0.3199, "step": 89900 }, { "epoch": 7.028504490433424, "grad_norm": 0.9643715023994446, "learning_rate": 1.8609300508010942e-05, "loss": 0.3139, "step": 90000 }, { "epoch": 7.036313939867239, "grad_norm": 1.189081072807312, "learning_rate": 1.860773739742087e-05, "loss": 0.3324, "step": 90100 }, { "epoch": 7.044123389301054, "grad_norm": 1.0931801795959473, "learning_rate": 1.8606174286830794e-05, "loss": 0.3309, "step": 90200 }, { "epoch": 7.051932838734869, "grad_norm": 0.9425420761108398, "learning_rate": 1.860461117624072e-05, "loss": 0.3358, "step": 90300 }, { "epoch": 7.059742288168684, "grad_norm": 1.2118514776229858, "learning_rate": 1.8603048065650646e-05, "loss": 0.3362, "step": 90400 }, { "epoch": 7.067551737602499, "grad_norm": 1.2366812229156494, "learning_rate": 1.8601484955060572e-05, "loss": 0.3395, "step": 90500 }, { "epoch": 7.075361187036314, "grad_norm": 1.0428937673568726, "learning_rate": 1.85999218444705e-05, "loss": 0.3192, "step": 90600 }, { "epoch": 7.083170636470129, "grad_norm": 1.0269099473953247, "learning_rate": 1.8598358733880424e-05, "loss": 0.3263, "step": 90700 }, { "epoch": 7.090980085903944, "grad_norm": 1.0814200639724731, "learning_rate": 1.859679562329035e-05, "loss": 0.3409, "step": 90800 }, { "epoch": 7.098789535337759, "grad_norm": 0.7768912315368652, "learning_rate": 1.8595232512700273e-05, "loss": 0.3195, "step": 90900 }, { "epoch": 7.106598984771574, "grad_norm": 0.9416346549987793, "learning_rate": 1.8593669402110203e-05, "loss": 0.3266, "step": 91000 }, { "epoch": 7.114408434205388, "grad_norm": 0.9913092255592346, "learning_rate": 1.8592106291520125e-05, "loss": 0.3319, "step": 91100 }, { "epoch": 7.122217883639204, "grad_norm": 0.7331252694129944, "learning_rate": 1.859054318093005e-05, "loss": 0.3202, "step": 91200 }, { "epoch": 7.130027333073018, "grad_norm": 1.1714645624160767, "learning_rate": 1.8588980070339977e-05, "loss": 0.3296, "step": 91300 }, { "epoch": 7.137836782506834, "grad_norm": 1.0094726085662842, "learning_rate": 1.8587416959749903e-05, "loss": 0.3268, "step": 91400 }, { "epoch": 7.145646231940648, "grad_norm": 0.8272337913513184, "learning_rate": 1.858585384915983e-05, "loss": 0.3258, "step": 91500 }, { "epoch": 7.153455681374463, "grad_norm": 1.1478444337844849, "learning_rate": 1.8584290738569755e-05, "loss": 0.3277, "step": 91600 }, { "epoch": 7.161265130808278, "grad_norm": 0.9589486718177795, "learning_rate": 1.858272762797968e-05, "loss": 0.3435, "step": 91700 }, { "epoch": 7.169074580242093, "grad_norm": 1.0916614532470703, "learning_rate": 1.8581164517389607e-05, "loss": 0.3318, "step": 91800 }, { "epoch": 7.176884029675908, "grad_norm": 1.128203272819519, "learning_rate": 1.8579601406799533e-05, "loss": 0.3209, "step": 91900 }, { "epoch": 7.184693479109723, "grad_norm": 1.1626170873641968, "learning_rate": 1.8578038296209456e-05, "loss": 0.3309, "step": 92000 }, { "epoch": 7.192502928543537, "grad_norm": 1.0426710844039917, "learning_rate": 1.8576475185619385e-05, "loss": 0.3235, "step": 92100 }, { "epoch": 7.200312377977353, "grad_norm": 0.883540689945221, "learning_rate": 1.857491207502931e-05, "loss": 0.3275, "step": 92200 }, { "epoch": 7.208121827411167, "grad_norm": 0.9239016771316528, "learning_rate": 1.8573348964439234e-05, "loss": 0.3173, "step": 92300 }, { "epoch": 7.215931276844983, "grad_norm": 0.8947902917861938, "learning_rate": 1.8571785853849163e-05, "loss": 0.3289, "step": 92400 }, { "epoch": 7.223740726278797, "grad_norm": 0.8909947276115417, "learning_rate": 1.8570222743259086e-05, "loss": 0.3144, "step": 92500 }, { "epoch": 7.231550175712612, "grad_norm": 0.850845217704773, "learning_rate": 1.8568659632669012e-05, "loss": 0.3152, "step": 92600 }, { "epoch": 7.239359625146427, "grad_norm": 0.9659631848335266, "learning_rate": 1.8567096522078938e-05, "loss": 0.3259, "step": 92700 }, { "epoch": 7.247169074580242, "grad_norm": 1.1609432697296143, "learning_rate": 1.8565533411488864e-05, "loss": 0.3176, "step": 92800 }, { "epoch": 7.254978524014057, "grad_norm": 0.9202740788459778, "learning_rate": 1.856398593200469e-05, "loss": 0.3212, "step": 92900 }, { "epoch": 7.262787973447872, "grad_norm": 0.9573159217834473, "learning_rate": 1.8562422821414616e-05, "loss": 0.3226, "step": 93000 }, { "epoch": 7.2705974228816865, "grad_norm": 1.138262152671814, "learning_rate": 1.8560859710824542e-05, "loss": 0.3298, "step": 93100 }, { "epoch": 7.278406872315502, "grad_norm": 0.9519045352935791, "learning_rate": 1.8559296600234468e-05, "loss": 0.3148, "step": 93200 }, { "epoch": 7.2862163217493165, "grad_norm": 0.788692057132721, "learning_rate": 1.8557733489644394e-05, "loss": 0.3212, "step": 93300 }, { "epoch": 7.294025771183132, "grad_norm": 0.8305114507675171, "learning_rate": 1.855617037905432e-05, "loss": 0.3302, "step": 93400 }, { "epoch": 7.301835220616947, "grad_norm": 1.0999449491500854, "learning_rate": 1.8554607268464246e-05, "loss": 0.3182, "step": 93500 }, { "epoch": 7.309644670050761, "grad_norm": 0.9967383742332458, "learning_rate": 1.8553044157874172e-05, "loss": 0.3218, "step": 93600 }, { "epoch": 7.317454119484577, "grad_norm": 1.140713095664978, "learning_rate": 1.8551481047284098e-05, "loss": 0.3228, "step": 93700 }, { "epoch": 7.325263568918391, "grad_norm": 0.8884949684143066, "learning_rate": 1.854991793669402e-05, "loss": 0.3213, "step": 93800 }, { "epoch": 7.333073018352206, "grad_norm": 0.9016627073287964, "learning_rate": 1.854835482610395e-05, "loss": 0.3157, "step": 93900 }, { "epoch": 7.340882467786021, "grad_norm": 1.2928720712661743, "learning_rate": 1.8546791715513876e-05, "loss": 0.3176, "step": 94000 }, { "epoch": 7.348691917219836, "grad_norm": 0.8581827282905579, "learning_rate": 1.85452286049238e-05, "loss": 0.3101, "step": 94100 }, { "epoch": 7.356501366653651, "grad_norm": 1.1784394979476929, "learning_rate": 1.8543665494333725e-05, "loss": 0.3254, "step": 94200 }, { "epoch": 7.364310816087466, "grad_norm": 0.8839449882507324, "learning_rate": 1.854210238374365e-05, "loss": 0.3264, "step": 94300 }, { "epoch": 7.37212026552128, "grad_norm": 0.9890450835227966, "learning_rate": 1.8540539273153577e-05, "loss": 0.3142, "step": 94400 }, { "epoch": 7.379929714955096, "grad_norm": 0.9410362839698792, "learning_rate": 1.8538976162563503e-05, "loss": 0.3207, "step": 94500 }, { "epoch": 7.38773916438891, "grad_norm": 1.012739658355713, "learning_rate": 1.853741305197343e-05, "loss": 0.3253, "step": 94600 }, { "epoch": 7.395548613822726, "grad_norm": 0.9576935172080994, "learning_rate": 1.8535849941383355e-05, "loss": 0.3324, "step": 94700 }, { "epoch": 7.40335806325654, "grad_norm": 1.124345064163208, "learning_rate": 1.853428683079328e-05, "loss": 0.3242, "step": 94800 }, { "epoch": 7.411167512690355, "grad_norm": 0.7819204330444336, "learning_rate": 1.8532723720203207e-05, "loss": 0.3031, "step": 94900 }, { "epoch": 7.41897696212417, "grad_norm": 0.9620487689971924, "learning_rate": 1.8531160609613133e-05, "loss": 0.3085, "step": 95000 }, { "epoch": 7.426786411557985, "grad_norm": 0.7977539300918579, "learning_rate": 1.852959749902306e-05, "loss": 0.3227, "step": 95100 }, { "epoch": 7.4345958609918, "grad_norm": 0.9288976788520813, "learning_rate": 1.852803438843298e-05, "loss": 0.3251, "step": 95200 }, { "epoch": 7.442405310425615, "grad_norm": 0.9492650628089905, "learning_rate": 1.8526471277842907e-05, "loss": 0.3198, "step": 95300 }, { "epoch": 7.4502147598594295, "grad_norm": 1.0884491205215454, "learning_rate": 1.8524908167252837e-05, "loss": 0.3117, "step": 95400 }, { "epoch": 7.458024209293245, "grad_norm": 0.930963397026062, "learning_rate": 1.852334505666276e-05, "loss": 0.3086, "step": 95500 }, { "epoch": 7.4658336587270595, "grad_norm": 1.0428942441940308, "learning_rate": 1.8521781946072685e-05, "loss": 0.3087, "step": 95600 }, { "epoch": 7.473643108160875, "grad_norm": 0.8642585277557373, "learning_rate": 1.852021883548261e-05, "loss": 0.3091, "step": 95700 }, { "epoch": 7.4814525575946895, "grad_norm": 1.0365341901779175, "learning_rate": 1.8518655724892537e-05, "loss": 0.3322, "step": 95800 }, { "epoch": 7.489262007028504, "grad_norm": 0.8407796025276184, "learning_rate": 1.8517092614302463e-05, "loss": 0.3247, "step": 95900 }, { "epoch": 7.4970714564623195, "grad_norm": 0.8818190097808838, "learning_rate": 1.851552950371239e-05, "loss": 0.3111, "step": 96000 }, { "epoch": 7.504880905896134, "grad_norm": 0.9547135233879089, "learning_rate": 1.8513966393122315e-05, "loss": 0.3062, "step": 96100 }, { "epoch": 7.5126903553299496, "grad_norm": 0.9618895053863525, "learning_rate": 1.851240328253224e-05, "loss": 0.3244, "step": 96200 }, { "epoch": 7.520499804763764, "grad_norm": 0.9813030958175659, "learning_rate": 1.8510840171942167e-05, "loss": 0.3359, "step": 96300 }, { "epoch": 7.528309254197579, "grad_norm": 0.9328567385673523, "learning_rate": 1.850927706135209e-05, "loss": 0.313, "step": 96400 }, { "epoch": 7.536118703631394, "grad_norm": 1.0434361696243286, "learning_rate": 1.850771395076202e-05, "loss": 0.3194, "step": 96500 }, { "epoch": 7.543928153065209, "grad_norm": 0.9884262681007385, "learning_rate": 1.8506150840171942e-05, "loss": 0.3088, "step": 96600 }, { "epoch": 7.551737602499024, "grad_norm": 0.937465250492096, "learning_rate": 1.8504587729581868e-05, "loss": 0.3063, "step": 96700 }, { "epoch": 7.559547051932839, "grad_norm": 0.8354927897453308, "learning_rate": 1.8503024618991798e-05, "loss": 0.3261, "step": 96800 }, { "epoch": 7.567356501366653, "grad_norm": 1.2076160907745361, "learning_rate": 1.8501477139507623e-05, "loss": 0.3162, "step": 96900 }, { "epoch": 7.575165950800469, "grad_norm": 1.0042390823364258, "learning_rate": 1.8499914028917546e-05, "loss": 0.3118, "step": 97000 }, { "epoch": 7.582975400234283, "grad_norm": 0.9606279134750366, "learning_rate": 1.8498350918327472e-05, "loss": 0.3282, "step": 97100 }, { "epoch": 7.590784849668099, "grad_norm": 1.2073897123336792, "learning_rate": 1.8496787807737398e-05, "loss": 0.3096, "step": 97200 }, { "epoch": 7.598594299101913, "grad_norm": 0.9388468861579895, "learning_rate": 1.8495224697147324e-05, "loss": 0.316, "step": 97300 }, { "epoch": 7.606403748535728, "grad_norm": 1.0033023357391357, "learning_rate": 1.849366158655725e-05, "loss": 0.3065, "step": 97400 }, { "epoch": 7.614213197969543, "grad_norm": 0.9127291440963745, "learning_rate": 1.8492098475967176e-05, "loss": 0.324, "step": 97500 }, { "epoch": 7.622022647403358, "grad_norm": 0.8793255090713501, "learning_rate": 1.8490535365377102e-05, "loss": 0.331, "step": 97600 }, { "epoch": 7.629832096837173, "grad_norm": 1.0385085344314575, "learning_rate": 1.8488972254787028e-05, "loss": 0.3232, "step": 97700 }, { "epoch": 7.637641546270988, "grad_norm": 1.1897807121276855, "learning_rate": 1.8487409144196954e-05, "loss": 0.3064, "step": 97800 }, { "epoch": 7.6454509957048025, "grad_norm": 1.2724990844726562, "learning_rate": 1.848584603360688e-05, "loss": 0.3102, "step": 97900 }, { "epoch": 7.653260445138618, "grad_norm": 1.2787495851516724, "learning_rate": 1.8484282923016806e-05, "loss": 0.3117, "step": 98000 }, { "epoch": 7.6610698945724325, "grad_norm": 1.0985214710235596, "learning_rate": 1.848271981242673e-05, "loss": 0.3148, "step": 98100 }, { "epoch": 7.668879344006248, "grad_norm": 1.0185761451721191, "learning_rate": 1.8481156701836655e-05, "loss": 0.3212, "step": 98200 }, { "epoch": 7.6766887934400625, "grad_norm": 1.177228569984436, "learning_rate": 1.8479593591246584e-05, "loss": 0.3168, "step": 98300 }, { "epoch": 7.684498242873877, "grad_norm": 1.2005599737167358, "learning_rate": 1.8478030480656507e-05, "loss": 0.3032, "step": 98400 }, { "epoch": 7.6923076923076925, "grad_norm": 1.4760136604309082, "learning_rate": 1.8476467370066433e-05, "loss": 0.3087, "step": 98500 }, { "epoch": 7.700117141741507, "grad_norm": 0.9583701491355896, "learning_rate": 1.8474904259476362e-05, "loss": 0.3163, "step": 98600 }, { "epoch": 7.7079265911753225, "grad_norm": 0.9487422704696655, "learning_rate": 1.8473341148886285e-05, "loss": 0.3173, "step": 98700 }, { "epoch": 7.715736040609137, "grad_norm": 0.8585200905799866, "learning_rate": 1.847177803829621e-05, "loss": 0.3133, "step": 98800 }, { "epoch": 7.723545490042952, "grad_norm": 1.1684235334396362, "learning_rate": 1.8470214927706137e-05, "loss": 0.2952, "step": 98900 }, { "epoch": 7.731354939476767, "grad_norm": 1.0295666456222534, "learning_rate": 1.8468651817116063e-05, "loss": 0.3248, "step": 99000 }, { "epoch": 7.739164388910582, "grad_norm": 0.7475218176841736, "learning_rate": 1.846708870652599e-05, "loss": 0.2987, "step": 99100 }, { "epoch": 7.746973838344397, "grad_norm": 1.1041820049285889, "learning_rate": 1.8465525595935915e-05, "loss": 0.3112, "step": 99200 }, { "epoch": 7.754783287778212, "grad_norm": 1.0141644477844238, "learning_rate": 1.8463962485345838e-05, "loss": 0.3154, "step": 99300 }, { "epoch": 7.762592737212026, "grad_norm": 0.7786453366279602, "learning_rate": 1.8462399374755767e-05, "loss": 0.327, "step": 99400 }, { "epoch": 7.770402186645842, "grad_norm": 0.8506142497062683, "learning_rate": 1.8460836264165693e-05, "loss": 0.3138, "step": 99500 }, { "epoch": 7.778211636079656, "grad_norm": 1.0492260456085205, "learning_rate": 1.8459273153575616e-05, "loss": 0.3114, "step": 99600 }, { "epoch": 7.786021085513472, "grad_norm": 0.9019497036933899, "learning_rate": 1.8457710042985545e-05, "loss": 0.3185, "step": 99700 }, { "epoch": 7.793830534947286, "grad_norm": 0.855957567691803, "learning_rate": 1.8456146932395468e-05, "loss": 0.3273, "step": 99800 }, { "epoch": 7.801639984381101, "grad_norm": 1.1977174282073975, "learning_rate": 1.8454583821805394e-05, "loss": 0.3095, "step": 99900 }, { "epoch": 7.809449433814916, "grad_norm": 0.9468150734901428, "learning_rate": 1.845302071121532e-05, "loss": 0.3043, "step": 100000 }, { "epoch": 7.817258883248731, "grad_norm": 0.957493007183075, "learning_rate": 1.8451457600625246e-05, "loss": 0.3072, "step": 100100 }, { "epoch": 7.825068332682546, "grad_norm": 1.0043883323669434, "learning_rate": 1.844989449003517e-05, "loss": 0.3127, "step": 100200 }, { "epoch": 7.832877782116361, "grad_norm": 0.893930971622467, "learning_rate": 1.8448331379445098e-05, "loss": 0.3014, "step": 100300 }, { "epoch": 7.840687231550175, "grad_norm": 0.8573713302612305, "learning_rate": 1.8446768268855024e-05, "loss": 0.3015, "step": 100400 }, { "epoch": 7.848496680983991, "grad_norm": 1.0177993774414062, "learning_rate": 1.844520515826495e-05, "loss": 0.302, "step": 100500 }, { "epoch": 7.8563061304178055, "grad_norm": 1.0929206609725952, "learning_rate": 1.8443642047674876e-05, "loss": 0.3181, "step": 100600 }, { "epoch": 7.864115579851621, "grad_norm": 1.2086570262908936, "learning_rate": 1.84420789370848e-05, "loss": 0.3131, "step": 100700 }, { "epoch": 7.8719250292854355, "grad_norm": 0.721596896648407, "learning_rate": 1.8440515826494728e-05, "loss": 0.2946, "step": 100800 }, { "epoch": 7.87973447871925, "grad_norm": 1.0397862195968628, "learning_rate": 1.8438952715904654e-05, "loss": 0.2931, "step": 100900 }, { "epoch": 7.8875439281530655, "grad_norm": 1.013625144958496, "learning_rate": 1.843740523642048e-05, "loss": 0.3084, "step": 101000 }, { "epoch": 7.89535337758688, "grad_norm": 0.9805079102516174, "learning_rate": 1.8435842125830402e-05, "loss": 0.2953, "step": 101100 }, { "epoch": 7.9031628270206955, "grad_norm": 1.1424782276153564, "learning_rate": 1.843427901524033e-05, "loss": 0.3009, "step": 101200 }, { "epoch": 7.91097227645451, "grad_norm": 1.0221184492111206, "learning_rate": 1.8432715904650254e-05, "loss": 0.3158, "step": 101300 }, { "epoch": 7.918781725888325, "grad_norm": 0.9535076022148132, "learning_rate": 1.843115279406018e-05, "loss": 0.3021, "step": 101400 }, { "epoch": 7.92659117532214, "grad_norm": 0.9413054585456848, "learning_rate": 1.8429589683470106e-05, "loss": 0.2997, "step": 101500 }, { "epoch": 7.934400624755955, "grad_norm": 1.0579475164413452, "learning_rate": 1.8428026572880032e-05, "loss": 0.3204, "step": 101600 }, { "epoch": 7.94221007418977, "grad_norm": 0.66800856590271, "learning_rate": 1.842646346228996e-05, "loss": 0.315, "step": 101700 }, { "epoch": 7.950019523623585, "grad_norm": 0.8284004926681519, "learning_rate": 1.8424900351699884e-05, "loss": 0.2963, "step": 101800 }, { "epoch": 7.957828973057399, "grad_norm": 1.1670104265213013, "learning_rate": 1.842333724110981e-05, "loss": 0.3119, "step": 101900 }, { "epoch": 7.965638422491215, "grad_norm": 0.9361162185668945, "learning_rate": 1.8421774130519736e-05, "loss": 0.3082, "step": 102000 }, { "epoch": 7.973447871925029, "grad_norm": 0.9291674494743347, "learning_rate": 1.8420226651035562e-05, "loss": 0.3086, "step": 102100 }, { "epoch": 7.981257321358845, "grad_norm": 1.239319086074829, "learning_rate": 1.841866354044549e-05, "loss": 0.3112, "step": 102200 }, { "epoch": 7.989066770792659, "grad_norm": 0.6871031522750854, "learning_rate": 1.8417100429855414e-05, "loss": 0.3217, "step": 102300 }, { "epoch": 7.996876220226474, "grad_norm": 1.024958848953247, "learning_rate": 1.841553731926534e-05, "loss": 0.2894, "step": 102400 }, { "epoch": 8.004685669660288, "grad_norm": 1.070359230041504, "learning_rate": 1.8413974208675266e-05, "loss": 0.3107, "step": 102500 }, { "epoch": 8.012495119094105, "grad_norm": 1.0423635244369507, "learning_rate": 1.841241109808519e-05, "loss": 0.292, "step": 102600 }, { "epoch": 8.02030456852792, "grad_norm": 0.7128071188926697, "learning_rate": 1.841084798749512e-05, "loss": 0.3138, "step": 102700 }, { "epoch": 8.028114017961734, "grad_norm": 1.5792732238769531, "learning_rate": 1.840928487690504e-05, "loss": 0.2974, "step": 102800 }, { "epoch": 8.035923467395548, "grad_norm": 0.9122816324234009, "learning_rate": 1.8407721766314967e-05, "loss": 0.3096, "step": 102900 }, { "epoch": 8.043732916829363, "grad_norm": 1.035617709159851, "learning_rate": 1.8406158655724896e-05, "loss": 0.3126, "step": 103000 }, { "epoch": 8.05154236626318, "grad_norm": 0.964963436126709, "learning_rate": 1.840459554513482e-05, "loss": 0.3008, "step": 103100 }, { "epoch": 8.059351815696994, "grad_norm": 1.0072320699691772, "learning_rate": 1.8403032434544745e-05, "loss": 0.3006, "step": 103200 }, { "epoch": 8.067161265130808, "grad_norm": 1.2636690139770508, "learning_rate": 1.840146932395467e-05, "loss": 0.3081, "step": 103300 }, { "epoch": 8.074970714564623, "grad_norm": 1.1620123386383057, "learning_rate": 1.8399906213364597e-05, "loss": 0.2962, "step": 103400 }, { "epoch": 8.082780163998438, "grad_norm": 0.9352577328681946, "learning_rate": 1.8398343102774523e-05, "loss": 0.3156, "step": 103500 }, { "epoch": 8.090589613432254, "grad_norm": 1.1235134601593018, "learning_rate": 1.839677999218445e-05, "loss": 0.3037, "step": 103600 }, { "epoch": 8.098399062866068, "grad_norm": 1.2459105253219604, "learning_rate": 1.8395216881594372e-05, "loss": 0.2965, "step": 103700 }, { "epoch": 8.106208512299883, "grad_norm": 0.9999929666519165, "learning_rate": 1.83936537710043e-05, "loss": 0.3131, "step": 103800 }, { "epoch": 8.114017961733698, "grad_norm": 1.092516541481018, "learning_rate": 1.8392090660414227e-05, "loss": 0.3015, "step": 103900 }, { "epoch": 8.121827411167512, "grad_norm": 0.8040266036987305, "learning_rate": 1.839052754982415e-05, "loss": 0.3021, "step": 104000 }, { "epoch": 8.129636860601327, "grad_norm": 1.031531810760498, "learning_rate": 1.838896443923408e-05, "loss": 0.2925, "step": 104100 }, { "epoch": 8.137446310035143, "grad_norm": 1.1790035963058472, "learning_rate": 1.8387401328644002e-05, "loss": 0.2968, "step": 104200 }, { "epoch": 8.145255759468958, "grad_norm": 1.1958867311477661, "learning_rate": 1.8385838218053928e-05, "loss": 0.2985, "step": 104300 }, { "epoch": 8.153065208902772, "grad_norm": 0.9311303496360779, "learning_rate": 1.8384275107463854e-05, "loss": 0.3128, "step": 104400 }, { "epoch": 8.160874658336587, "grad_norm": 1.0499194860458374, "learning_rate": 1.838271199687378e-05, "loss": 0.2911, "step": 104500 }, { "epoch": 8.168684107770401, "grad_norm": 0.8166439533233643, "learning_rate": 1.8381148886283706e-05, "loss": 0.3072, "step": 104600 }, { "epoch": 8.176493557204218, "grad_norm": 0.937285840511322, "learning_rate": 1.8379585775693632e-05, "loss": 0.3066, "step": 104700 }, { "epoch": 8.184303006638032, "grad_norm": 0.9809720516204834, "learning_rate": 1.8378022665103558e-05, "loss": 0.2994, "step": 104800 }, { "epoch": 8.192112456071847, "grad_norm": 0.9872479438781738, "learning_rate": 1.8376459554513484e-05, "loss": 0.2972, "step": 104900 }, { "epoch": 8.199921905505661, "grad_norm": 0.8918797373771667, "learning_rate": 1.837489644392341e-05, "loss": 0.3038, "step": 105000 }, { "epoch": 8.207731354939476, "grad_norm": 1.048528790473938, "learning_rate": 1.8373333333333332e-05, "loss": 0.289, "step": 105100 }, { "epoch": 8.215540804373292, "grad_norm": 1.0492016077041626, "learning_rate": 1.8371770222743262e-05, "loss": 0.3092, "step": 105200 }, { "epoch": 8.223350253807107, "grad_norm": 1.2988113164901733, "learning_rate": 1.8370207112153188e-05, "loss": 0.2949, "step": 105300 }, { "epoch": 8.231159703240921, "grad_norm": 1.2381683588027954, "learning_rate": 1.836864400156311e-05, "loss": 0.3013, "step": 105400 }, { "epoch": 8.238969152674736, "grad_norm": 0.741263210773468, "learning_rate": 1.8367080890973037e-05, "loss": 0.3026, "step": 105500 }, { "epoch": 8.24677860210855, "grad_norm": 1.1652076244354248, "learning_rate": 1.8365517780382966e-05, "loss": 0.302, "step": 105600 }, { "epoch": 8.254588051542367, "grad_norm": 1.1444916725158691, "learning_rate": 1.836395466979289e-05, "loss": 0.2928, "step": 105700 }, { "epoch": 8.262397500976181, "grad_norm": 1.168922781944275, "learning_rate": 1.8362391559202815e-05, "loss": 0.2959, "step": 105800 }, { "epoch": 8.270206950409996, "grad_norm": 0.9830915331840515, "learning_rate": 1.836082844861274e-05, "loss": 0.2915, "step": 105900 }, { "epoch": 8.27801639984381, "grad_norm": 0.680469810962677, "learning_rate": 1.8359265338022667e-05, "loss": 0.2909, "step": 106000 }, { "epoch": 8.285825849277625, "grad_norm": 0.8958842754364014, "learning_rate": 1.8357717858538493e-05, "loss": 0.3001, "step": 106100 }, { "epoch": 8.293635298711441, "grad_norm": 1.0888712406158447, "learning_rate": 1.835615474794842e-05, "loss": 0.2953, "step": 106200 }, { "epoch": 8.301444748145256, "grad_norm": 0.760229229927063, "learning_rate": 1.8354591637358345e-05, "loss": 0.3228, "step": 106300 }, { "epoch": 8.30925419757907, "grad_norm": 1.0346572399139404, "learning_rate": 1.835302852676827e-05, "loss": 0.2856, "step": 106400 }, { "epoch": 8.317063647012885, "grad_norm": 1.0129973888397217, "learning_rate": 1.8351465416178197e-05, "loss": 0.2907, "step": 106500 }, { "epoch": 8.3248730964467, "grad_norm": 1.3134441375732422, "learning_rate": 1.8349902305588123e-05, "loss": 0.2839, "step": 106600 }, { "epoch": 8.332682545880516, "grad_norm": 0.9875175356864929, "learning_rate": 1.834833919499805e-05, "loss": 0.2926, "step": 106700 }, { "epoch": 8.34049199531433, "grad_norm": 0.8269121646881104, "learning_rate": 1.8346776084407975e-05, "loss": 0.2854, "step": 106800 }, { "epoch": 8.348301444748145, "grad_norm": 0.9428029656410217, "learning_rate": 1.8345212973817897e-05, "loss": 0.29, "step": 106900 }, { "epoch": 8.35611089418196, "grad_norm": 1.0320848226547241, "learning_rate": 1.8343649863227827e-05, "loss": 0.2924, "step": 107000 }, { "epoch": 8.363920343615774, "grad_norm": 0.9099970459938049, "learning_rate": 1.8342086752637753e-05, "loss": 0.2848, "step": 107100 }, { "epoch": 8.37172979304959, "grad_norm": 0.7163190245628357, "learning_rate": 1.8340523642047675e-05, "loss": 0.285, "step": 107200 }, { "epoch": 8.379539242483405, "grad_norm": 0.9897966384887695, "learning_rate": 1.83389605314576e-05, "loss": 0.3034, "step": 107300 }, { "epoch": 8.38734869191722, "grad_norm": 1.2372212409973145, "learning_rate": 1.8337397420867527e-05, "loss": 0.2962, "step": 107400 }, { "epoch": 8.395158141351034, "grad_norm": 0.8196184635162354, "learning_rate": 1.8335834310277453e-05, "loss": 0.292, "step": 107500 }, { "epoch": 8.402967590784849, "grad_norm": 1.124740481376648, "learning_rate": 1.833427119968738e-05, "loss": 0.2962, "step": 107600 }, { "epoch": 8.410777040218665, "grad_norm": 0.8833318948745728, "learning_rate": 1.8332708089097305e-05, "loss": 0.2948, "step": 107700 }, { "epoch": 8.41858648965248, "grad_norm": 1.2437323331832886, "learning_rate": 1.833114497850723e-05, "loss": 0.2957, "step": 107800 }, { "epoch": 8.426395939086294, "grad_norm": 1.064677119255066, "learning_rate": 1.8329581867917157e-05, "loss": 0.2919, "step": 107900 }, { "epoch": 8.434205388520109, "grad_norm": 1.100700855255127, "learning_rate": 1.8328018757327083e-05, "loss": 0.3, "step": 108000 }, { "epoch": 8.442014837953923, "grad_norm": 1.1101109981536865, "learning_rate": 1.832647127784291e-05, "loss": 0.2884, "step": 108100 }, { "epoch": 8.44982428738774, "grad_norm": 0.9055554866790771, "learning_rate": 1.8324908167252835e-05, "loss": 0.3045, "step": 108200 }, { "epoch": 8.457633736821554, "grad_norm": 0.8915761709213257, "learning_rate": 1.832334505666276e-05, "loss": 0.2876, "step": 108300 }, { "epoch": 8.465443186255369, "grad_norm": 0.9400941133499146, "learning_rate": 1.8321781946072684e-05, "loss": 0.3045, "step": 108400 }, { "epoch": 8.473252635689184, "grad_norm": 0.7043224573135376, "learning_rate": 1.8320218835482613e-05, "loss": 0.2925, "step": 108500 }, { "epoch": 8.481062085122998, "grad_norm": 0.9574306011199951, "learning_rate": 1.831865572489254e-05, "loss": 0.2878, "step": 108600 }, { "epoch": 8.488871534556814, "grad_norm": 1.167900800704956, "learning_rate": 1.8317092614302462e-05, "loss": 0.3014, "step": 108700 }, { "epoch": 8.496680983990629, "grad_norm": 1.2867276668548584, "learning_rate": 1.8315529503712388e-05, "loss": 0.2855, "step": 108800 }, { "epoch": 8.504490433424444, "grad_norm": 1.1072494983673096, "learning_rate": 1.8313966393122314e-05, "loss": 0.3038, "step": 108900 }, { "epoch": 8.512299882858258, "grad_norm": 0.8459449410438538, "learning_rate": 1.831240328253224e-05, "loss": 0.2881, "step": 109000 }, { "epoch": 8.520109332292073, "grad_norm": 0.8919705152511597, "learning_rate": 1.8310840171942166e-05, "loss": 0.2915, "step": 109100 }, { "epoch": 8.527918781725889, "grad_norm": 0.7803249359130859, "learning_rate": 1.8309277061352092e-05, "loss": 0.2912, "step": 109200 }, { "epoch": 8.535728231159704, "grad_norm": 0.7071219682693481, "learning_rate": 1.8307713950762018e-05, "loss": 0.2875, "step": 109300 }, { "epoch": 8.543537680593518, "grad_norm": 0.9213855266571045, "learning_rate": 1.8306150840171944e-05, "loss": 0.2899, "step": 109400 }, { "epoch": 8.551347130027333, "grad_norm": 1.0583925247192383, "learning_rate": 1.830458772958187e-05, "loss": 0.2948, "step": 109500 }, { "epoch": 8.559156579461147, "grad_norm": 1.0667381286621094, "learning_rate": 1.8303024618991796e-05, "loss": 0.2927, "step": 109600 }, { "epoch": 8.566966028894964, "grad_norm": 1.162601351737976, "learning_rate": 1.8301461508401722e-05, "loss": 0.2878, "step": 109700 }, { "epoch": 8.574775478328778, "grad_norm": 0.99709153175354, "learning_rate": 1.8299898397811645e-05, "loss": 0.3006, "step": 109800 }, { "epoch": 8.582584927762593, "grad_norm": 0.9036930203437805, "learning_rate": 1.829833528722157e-05, "loss": 0.2799, "step": 109900 }, { "epoch": 8.590394377196407, "grad_norm": 0.9895430207252502, "learning_rate": 1.82967721766315e-05, "loss": 0.2827, "step": 110000 }, { "epoch": 8.598203826630222, "grad_norm": 0.8556926846504211, "learning_rate": 1.8295224697147326e-05, "loss": 0.2903, "step": 110100 }, { "epoch": 8.606013276064038, "grad_norm": 0.8513884544372559, "learning_rate": 1.829366158655725e-05, "loss": 0.2738, "step": 110200 }, { "epoch": 8.613822725497853, "grad_norm": 0.8311988711357117, "learning_rate": 1.8292098475967178e-05, "loss": 0.3023, "step": 110300 }, { "epoch": 8.621632174931667, "grad_norm": 0.8456495404243469, "learning_rate": 1.82905353653771e-05, "loss": 0.2901, "step": 110400 }, { "epoch": 8.629441624365482, "grad_norm": 1.003395676612854, "learning_rate": 1.8288972254787027e-05, "loss": 0.293, "step": 110500 }, { "epoch": 8.637251073799296, "grad_norm": 0.8753073811531067, "learning_rate": 1.8287409144196953e-05, "loss": 0.2865, "step": 110600 }, { "epoch": 8.645060523233113, "grad_norm": 0.9070360660552979, "learning_rate": 1.828584603360688e-05, "loss": 0.2995, "step": 110700 }, { "epoch": 8.652869972666927, "grad_norm": 0.7955684065818787, "learning_rate": 1.8284282923016805e-05, "loss": 0.2987, "step": 110800 }, { "epoch": 8.660679422100742, "grad_norm": 0.9900552034378052, "learning_rate": 1.828271981242673e-05, "loss": 0.2906, "step": 110900 }, { "epoch": 8.668488871534556, "grad_norm": 0.9677467346191406, "learning_rate": 1.8281156701836657e-05, "loss": 0.2975, "step": 111000 }, { "epoch": 8.676298320968371, "grad_norm": 0.9279794096946716, "learning_rate": 1.8279593591246583e-05, "loss": 0.2958, "step": 111100 }, { "epoch": 8.684107770402187, "grad_norm": 0.8239137530326843, "learning_rate": 1.827803048065651e-05, "loss": 0.2784, "step": 111200 }, { "epoch": 8.691917219836002, "grad_norm": 0.8737657070159912, "learning_rate": 1.827646737006643e-05, "loss": 0.2907, "step": 111300 }, { "epoch": 8.699726669269817, "grad_norm": 0.9607727527618408, "learning_rate": 1.827490425947636e-05, "loss": 0.2861, "step": 111400 }, { "epoch": 8.707536118703631, "grad_norm": 0.8190209865570068, "learning_rate": 1.8273341148886287e-05, "loss": 0.2808, "step": 111500 }, { "epoch": 8.715345568137446, "grad_norm": 1.3692559003829956, "learning_rate": 1.827177803829621e-05, "loss": 0.2828, "step": 111600 }, { "epoch": 8.723155017571262, "grad_norm": 1.252812385559082, "learning_rate": 1.8270214927706135e-05, "loss": 0.2787, "step": 111700 }, { "epoch": 8.730964467005077, "grad_norm": 0.7284253835678101, "learning_rate": 1.8268651817116065e-05, "loss": 0.2856, "step": 111800 }, { "epoch": 8.738773916438891, "grad_norm": 0.8836492300033569, "learning_rate": 1.8267088706525987e-05, "loss": 0.2945, "step": 111900 }, { "epoch": 8.746583365872706, "grad_norm": 0.9553095102310181, "learning_rate": 1.8265525595935913e-05, "loss": 0.2696, "step": 112000 }, { "epoch": 8.75439281530652, "grad_norm": 1.0329238176345825, "learning_rate": 1.826397811645174e-05, "loss": 0.295, "step": 112100 }, { "epoch": 8.762202264740337, "grad_norm": 1.124981164932251, "learning_rate": 1.8262415005861665e-05, "loss": 0.2748, "step": 112200 }, { "epoch": 8.770011714174151, "grad_norm": 0.8739539980888367, "learning_rate": 1.826085189527159e-05, "loss": 0.2861, "step": 112300 }, { "epoch": 8.777821163607966, "grad_norm": 1.0567717552185059, "learning_rate": 1.8259288784681517e-05, "loss": 0.2877, "step": 112400 }, { "epoch": 8.78563061304178, "grad_norm": 1.0307221412658691, "learning_rate": 1.8257725674091443e-05, "loss": 0.2805, "step": 112500 }, { "epoch": 8.793440062475595, "grad_norm": 1.2705262899398804, "learning_rate": 1.825616256350137e-05, "loss": 0.2724, "step": 112600 }, { "epoch": 8.801249511909411, "grad_norm": 1.0293917655944824, "learning_rate": 1.8254599452911295e-05, "loss": 0.284, "step": 112700 }, { "epoch": 8.809058961343226, "grad_norm": 0.9187902808189392, "learning_rate": 1.825303634232122e-05, "loss": 0.2893, "step": 112800 }, { "epoch": 8.81686841077704, "grad_norm": 0.8527992963790894, "learning_rate": 1.8251473231731147e-05, "loss": 0.2885, "step": 112900 }, { "epoch": 8.824677860210855, "grad_norm": 0.8219836950302124, "learning_rate": 1.8249910121141073e-05, "loss": 0.2937, "step": 113000 }, { "epoch": 8.83248730964467, "grad_norm": 0.9678224325180054, "learning_rate": 1.8248347010550996e-05, "loss": 0.2807, "step": 113100 }, { "epoch": 8.840296759078486, "grad_norm": 0.6939859390258789, "learning_rate": 1.8246783899960926e-05, "loss": 0.2776, "step": 113200 }, { "epoch": 8.8481062085123, "grad_norm": 1.0932164192199707, "learning_rate": 1.824522078937085e-05, "loss": 0.2823, "step": 113300 }, { "epoch": 8.855915657946115, "grad_norm": 1.029359221458435, "learning_rate": 1.8243657678780774e-05, "loss": 0.2863, "step": 113400 }, { "epoch": 8.86372510737993, "grad_norm": 1.0178298950195312, "learning_rate": 1.82420945681907e-05, "loss": 0.2841, "step": 113500 }, { "epoch": 8.871534556813744, "grad_norm": 0.8735611438751221, "learning_rate": 1.8240531457600626e-05, "loss": 0.2868, "step": 113600 }, { "epoch": 8.87934400624756, "grad_norm": 1.1830412149429321, "learning_rate": 1.8238968347010552e-05, "loss": 0.2819, "step": 113700 }, { "epoch": 8.887153455681375, "grad_norm": 0.7258915305137634, "learning_rate": 1.8237405236420478e-05, "loss": 0.2976, "step": 113800 }, { "epoch": 8.89496290511519, "grad_norm": 0.8901984095573425, "learning_rate": 1.8235842125830404e-05, "loss": 0.2907, "step": 113900 }, { "epoch": 8.902772354549004, "grad_norm": 1.0432909727096558, "learning_rate": 1.823427901524033e-05, "loss": 0.2937, "step": 114000 }, { "epoch": 8.910581803982819, "grad_norm": 0.7205260992050171, "learning_rate": 1.8232731535756156e-05, "loss": 0.283, "step": 114100 }, { "epoch": 8.918391253416633, "grad_norm": 1.2185769081115723, "learning_rate": 1.8231168425166082e-05, "loss": 0.2751, "step": 114200 }, { "epoch": 8.92620070285045, "grad_norm": 0.9317169189453125, "learning_rate": 1.8229605314576008e-05, "loss": 0.2906, "step": 114300 }, { "epoch": 8.934010152284264, "grad_norm": 0.8162646293640137, "learning_rate": 1.8228042203985934e-05, "loss": 0.2802, "step": 114400 }, { "epoch": 8.941819601718079, "grad_norm": 0.9121643900871277, "learning_rate": 1.822647909339586e-05, "loss": 0.2794, "step": 114500 }, { "epoch": 8.949629051151893, "grad_norm": 1.0515434741973877, "learning_rate": 1.8224915982805783e-05, "loss": 0.2851, "step": 114600 }, { "epoch": 8.95743850058571, "grad_norm": 0.882593035697937, "learning_rate": 1.8223352872215712e-05, "loss": 0.284, "step": 114700 }, { "epoch": 8.965247950019524, "grad_norm": 0.7872332334518433, "learning_rate": 1.8221789761625638e-05, "loss": 0.2809, "step": 114800 }, { "epoch": 8.973057399453339, "grad_norm": 0.9008090496063232, "learning_rate": 1.8220242282141464e-05, "loss": 0.2774, "step": 114900 }, { "epoch": 8.980866848887153, "grad_norm": 1.128659963607788, "learning_rate": 1.8218679171551387e-05, "loss": 0.2834, "step": 115000 }, { "epoch": 8.988676298320968, "grad_norm": 1.0850447416305542, "learning_rate": 1.8217116060961316e-05, "loss": 0.3057, "step": 115100 }, { "epoch": 8.996485747754782, "grad_norm": 0.9458226561546326, "learning_rate": 1.821555295037124e-05, "loss": 0.2831, "step": 115200 }, { "epoch": 9.004295197188599, "grad_norm": 0.9536977410316467, "learning_rate": 1.8213989839781165e-05, "loss": 0.2831, "step": 115300 }, { "epoch": 9.012104646622413, "grad_norm": 0.9741133451461792, "learning_rate": 1.8212426729191094e-05, "loss": 0.2802, "step": 115400 }, { "epoch": 9.019914096056228, "grad_norm": 0.9699240326881409, "learning_rate": 1.8210863618601017e-05, "loss": 0.2891, "step": 115500 }, { "epoch": 9.027723545490042, "grad_norm": 1.16199791431427, "learning_rate": 1.8209300508010943e-05, "loss": 0.281, "step": 115600 }, { "epoch": 9.035532994923859, "grad_norm": 1.1076624393463135, "learning_rate": 1.820773739742087e-05, "loss": 0.29, "step": 115700 }, { "epoch": 9.043342444357673, "grad_norm": 1.11338210105896, "learning_rate": 1.8206174286830795e-05, "loss": 0.275, "step": 115800 }, { "epoch": 9.051151893791488, "grad_norm": 0.8460277318954468, "learning_rate": 1.820461117624072e-05, "loss": 0.2915, "step": 115900 }, { "epoch": 9.058961343225302, "grad_norm": 1.0888310670852661, "learning_rate": 1.8203048065650647e-05, "loss": 0.2719, "step": 116000 }, { "epoch": 9.066770792659117, "grad_norm": 0.9920503497123718, "learning_rate": 1.820148495506057e-05, "loss": 0.2754, "step": 116100 }, { "epoch": 9.074580242092932, "grad_norm": 1.0043495893478394, "learning_rate": 1.81999218444705e-05, "loss": 0.2768, "step": 116200 }, { "epoch": 9.082389691526748, "grad_norm": 0.9468898177146912, "learning_rate": 1.8198358733880425e-05, "loss": 0.2795, "step": 116300 }, { "epoch": 9.090199140960562, "grad_norm": 1.0771790742874146, "learning_rate": 1.8196795623290348e-05, "loss": 0.2752, "step": 116400 }, { "epoch": 9.098008590394377, "grad_norm": 0.934396505355835, "learning_rate": 1.8195232512700277e-05, "loss": 0.2834, "step": 116500 }, { "epoch": 9.105818039828192, "grad_norm": 1.111622929573059, "learning_rate": 1.81936694021102e-05, "loss": 0.2709, "step": 116600 }, { "epoch": 9.113627489262006, "grad_norm": 0.8829967379570007, "learning_rate": 1.8192106291520126e-05, "loss": 0.2861, "step": 116700 }, { "epoch": 9.121436938695823, "grad_norm": 1.1684046983718872, "learning_rate": 1.819054318093005e-05, "loss": 0.2725, "step": 116800 }, { "epoch": 9.129246388129637, "grad_norm": 1.0102686882019043, "learning_rate": 1.8188980070339978e-05, "loss": 0.2787, "step": 116900 }, { "epoch": 9.137055837563452, "grad_norm": 1.0709645748138428, "learning_rate": 1.8187416959749904e-05, "loss": 0.2674, "step": 117000 }, { "epoch": 9.144865286997266, "grad_norm": 0.8841697573661804, "learning_rate": 1.818585384915983e-05, "loss": 0.2832, "step": 117100 }, { "epoch": 9.15267473643108, "grad_norm": 0.8411263823509216, "learning_rate": 1.8184290738569756e-05, "loss": 0.2818, "step": 117200 }, { "epoch": 9.160484185864897, "grad_norm": 0.6763750314712524, "learning_rate": 1.818272762797968e-05, "loss": 0.2828, "step": 117300 }, { "epoch": 9.168293635298712, "grad_norm": 0.8106026649475098, "learning_rate": 1.8181164517389608e-05, "loss": 0.2889, "step": 117400 }, { "epoch": 9.176103084732526, "grad_norm": 0.906958818435669, "learning_rate": 1.817960140679953e-05, "loss": 0.2772, "step": 117500 }, { "epoch": 9.18391253416634, "grad_norm": 1.0904810428619385, "learning_rate": 1.817803829620946e-05, "loss": 0.2767, "step": 117600 }, { "epoch": 9.191721983600155, "grad_norm": 0.9980121850967407, "learning_rate": 1.8176475185619386e-05, "loss": 0.2752, "step": 117700 }, { "epoch": 9.199531433033972, "grad_norm": 0.8572126626968384, "learning_rate": 1.8174912075029308e-05, "loss": 0.2696, "step": 117800 }, { "epoch": 9.207340882467786, "grad_norm": 1.0560619831085205, "learning_rate": 1.8173348964439234e-05, "loss": 0.2627, "step": 117900 }, { "epoch": 9.2151503319016, "grad_norm": 1.0500401258468628, "learning_rate": 1.8171785853849164e-05, "loss": 0.2803, "step": 118000 }, { "epoch": 9.222959781335415, "grad_norm": 0.8881191611289978, "learning_rate": 1.8170222743259086e-05, "loss": 0.2831, "step": 118100 }, { "epoch": 9.23076923076923, "grad_norm": 1.4876117706298828, "learning_rate": 1.8168659632669012e-05, "loss": 0.2717, "step": 118200 }, { "epoch": 9.238578680203046, "grad_norm": 0.8553842902183533, "learning_rate": 1.816709652207894e-05, "loss": 0.2829, "step": 118300 }, { "epoch": 9.24638812963686, "grad_norm": 1.1154588460922241, "learning_rate": 1.8165533411488864e-05, "loss": 0.2709, "step": 118400 }, { "epoch": 9.254197579070675, "grad_norm": 1.1607916355133057, "learning_rate": 1.816397030089879e-05, "loss": 0.2714, "step": 118500 }, { "epoch": 9.26200702850449, "grad_norm": 0.883974552154541, "learning_rate": 1.8162407190308716e-05, "loss": 0.2758, "step": 118600 }, { "epoch": 9.269816477938305, "grad_norm": 1.1841872930526733, "learning_rate": 1.8160844079718642e-05, "loss": 0.2844, "step": 118700 }, { "epoch": 9.27762592737212, "grad_norm": 1.1815747022628784, "learning_rate": 1.815928096912857e-05, "loss": 0.2651, "step": 118800 }, { "epoch": 9.285435376805935, "grad_norm": 0.8993953466415405, "learning_rate": 1.8157733489644394e-05, "loss": 0.2802, "step": 118900 }, { "epoch": 9.29324482623975, "grad_norm": 0.8702039122581482, "learning_rate": 1.815617037905432e-05, "loss": 0.2751, "step": 119000 }, { "epoch": 9.301054275673565, "grad_norm": 0.8321089148521423, "learning_rate": 1.8154607268464246e-05, "loss": 0.2758, "step": 119100 }, { "epoch": 9.30886372510738, "grad_norm": 0.7785543203353882, "learning_rate": 1.8153044157874172e-05, "loss": 0.2671, "step": 119200 }, { "epoch": 9.316673174541195, "grad_norm": 0.8350294828414917, "learning_rate": 1.8151481047284095e-05, "loss": 0.2699, "step": 119300 }, { "epoch": 9.32448262397501, "grad_norm": 0.933214008808136, "learning_rate": 1.8149917936694024e-05, "loss": 0.2731, "step": 119400 }, { "epoch": 9.332292073408825, "grad_norm": 0.780940592288971, "learning_rate": 1.814835482610395e-05, "loss": 0.2872, "step": 119500 }, { "epoch": 9.34010152284264, "grad_norm": 0.8094989061355591, "learning_rate": 1.8146791715513873e-05, "loss": 0.2684, "step": 119600 }, { "epoch": 9.347910972276454, "grad_norm": 0.949965238571167, "learning_rate": 1.81452286049238e-05, "loss": 0.2742, "step": 119700 }, { "epoch": 9.35572042171027, "grad_norm": 1.011487364768982, "learning_rate": 1.8143665494333725e-05, "loss": 0.2753, "step": 119800 }, { "epoch": 9.363529871144085, "grad_norm": 1.0791568756103516, "learning_rate": 1.814210238374365e-05, "loss": 0.2705, "step": 119900 }, { "epoch": 9.3713393205779, "grad_norm": 0.9083512425422668, "learning_rate": 1.8140539273153577e-05, "loss": 0.2659, "step": 120000 }, { "epoch": 9.379148770011714, "grad_norm": 1.0209741592407227, "learning_rate": 1.8138976162563503e-05, "loss": 0.2728, "step": 120100 }, { "epoch": 9.386958219445528, "grad_norm": 1.060758113861084, "learning_rate": 1.813741305197343e-05, "loss": 0.2747, "step": 120200 }, { "epoch": 9.394767668879345, "grad_norm": 1.1624435186386108, "learning_rate": 1.8135849941383355e-05, "loss": 0.282, "step": 120300 }, { "epoch": 9.40257711831316, "grad_norm": 1.0269403457641602, "learning_rate": 1.813428683079328e-05, "loss": 0.2676, "step": 120400 }, { "epoch": 9.410386567746974, "grad_norm": 1.0639585256576538, "learning_rate": 1.8132723720203207e-05, "loss": 0.2608, "step": 120500 }, { "epoch": 9.418196017180788, "grad_norm": 1.1121162176132202, "learning_rate": 1.8131160609613133e-05, "loss": 0.2669, "step": 120600 }, { "epoch": 9.426005466614603, "grad_norm": 0.9323787689208984, "learning_rate": 1.8129597499023056e-05, "loss": 0.2798, "step": 120700 }, { "epoch": 9.43381491604842, "grad_norm": 0.9078027606010437, "learning_rate": 1.8128034388432982e-05, "loss": 0.2694, "step": 120800 }, { "epoch": 9.441624365482234, "grad_norm": 0.9647188186645508, "learning_rate": 1.812648690894881e-05, "loss": 0.2674, "step": 120900 }, { "epoch": 9.449433814916048, "grad_norm": 1.0538146495819092, "learning_rate": 1.8124923798358737e-05, "loss": 0.2733, "step": 121000 }, { "epoch": 9.457243264349863, "grad_norm": 1.0025720596313477, "learning_rate": 1.812336068776866e-05, "loss": 0.2694, "step": 121100 }, { "epoch": 9.465052713783678, "grad_norm": 0.8698012232780457, "learning_rate": 1.8121797577178586e-05, "loss": 0.2683, "step": 121200 }, { "epoch": 9.472862163217494, "grad_norm": 1.078913927078247, "learning_rate": 1.8120234466588512e-05, "loss": 0.2643, "step": 121300 }, { "epoch": 9.480671612651308, "grad_norm": 0.9054146409034729, "learning_rate": 1.8118671355998438e-05, "loss": 0.274, "step": 121400 }, { "epoch": 9.488481062085123, "grad_norm": 0.9421964883804321, "learning_rate": 1.8117108245408364e-05, "loss": 0.2739, "step": 121500 }, { "epoch": 9.496290511518938, "grad_norm": 0.9728415608406067, "learning_rate": 1.811554513481829e-05, "loss": 0.2685, "step": 121600 }, { "epoch": 9.504099960952752, "grad_norm": 0.9228818416595459, "learning_rate": 1.8113982024228216e-05, "loss": 0.2874, "step": 121700 }, { "epoch": 9.511909410386568, "grad_norm": 0.8778666853904724, "learning_rate": 1.8112418913638142e-05, "loss": 0.278, "step": 121800 }, { "epoch": 9.519718859820383, "grad_norm": 1.1950221061706543, "learning_rate": 1.8110855803048068e-05, "loss": 0.2749, "step": 121900 }, { "epoch": 9.527528309254198, "grad_norm": 0.9403451085090637, "learning_rate": 1.8109292692457994e-05, "loss": 0.2646, "step": 122000 }, { "epoch": 9.535337758688012, "grad_norm": 1.0735374689102173, "learning_rate": 1.810772958186792e-05, "loss": 0.2834, "step": 122100 }, { "epoch": 9.543147208121827, "grad_norm": 1.1062880754470825, "learning_rate": 1.8106166471277842e-05, "loss": 0.268, "step": 122200 }, { "epoch": 9.550956657555643, "grad_norm": 1.203642725944519, "learning_rate": 1.810460336068777e-05, "loss": 0.2595, "step": 122300 }, { "epoch": 9.558766106989458, "grad_norm": 1.0009891986846924, "learning_rate": 1.8103040250097698e-05, "loss": 0.2604, "step": 122400 }, { "epoch": 9.566575556423272, "grad_norm": 1.1903247833251953, "learning_rate": 1.810147713950762e-05, "loss": 0.2539, "step": 122500 }, { "epoch": 9.574385005857087, "grad_norm": 1.1798654794692993, "learning_rate": 1.8099914028917547e-05, "loss": 0.2679, "step": 122600 }, { "epoch": 9.582194455290901, "grad_norm": 0.955449104309082, "learning_rate": 1.8098350918327473e-05, "loss": 0.2736, "step": 122700 }, { "epoch": 9.590003904724718, "grad_norm": 1.1760761737823486, "learning_rate": 1.80967878077374e-05, "loss": 0.2714, "step": 122800 }, { "epoch": 9.597813354158532, "grad_norm": 0.9086267352104187, "learning_rate": 1.8095224697147325e-05, "loss": 0.2799, "step": 122900 }, { "epoch": 9.605622803592347, "grad_norm": Infinity, "learning_rate": 1.809367721766315e-05, "loss": 0.2775, "step": 123000 }, { "epoch": 9.613432253026161, "grad_norm": 0.7868393659591675, "learning_rate": 1.8092114107073076e-05, "loss": 0.268, "step": 123100 }, { "epoch": 9.621241702459976, "grad_norm": 1.0325120687484741, "learning_rate": 1.8090550996483003e-05, "loss": 0.2671, "step": 123200 }, { "epoch": 9.629051151893792, "grad_norm": 1.0042729377746582, "learning_rate": 1.808898788589293e-05, "loss": 0.2615, "step": 123300 }, { "epoch": 9.636860601327607, "grad_norm": 0.8402108550071716, "learning_rate": 1.8087424775302855e-05, "loss": 0.2646, "step": 123400 }, { "epoch": 9.644670050761421, "grad_norm": 0.9658582210540771, "learning_rate": 1.808586166471278e-05, "loss": 0.2814, "step": 123500 }, { "epoch": 9.652479500195236, "grad_norm": 1.088170051574707, "learning_rate": 1.8084298554122707e-05, "loss": 0.2695, "step": 123600 }, { "epoch": 9.66028894962905, "grad_norm": 0.8987926840782166, "learning_rate": 1.808273544353263e-05, "loss": 0.2708, "step": 123700 }, { "epoch": 9.668098399062867, "grad_norm": 1.143585205078125, "learning_rate": 1.808117233294256e-05, "loss": 0.2656, "step": 123800 }, { "epoch": 9.675907848496681, "grad_norm": 0.8075404763221741, "learning_rate": 1.8079609222352485e-05, "loss": 0.2737, "step": 123900 }, { "epoch": 9.683717297930496, "grad_norm": 1.1423933506011963, "learning_rate": 1.8078046111762407e-05, "loss": 0.2677, "step": 124000 }, { "epoch": 9.69152674736431, "grad_norm": 0.9777726531028748, "learning_rate": 1.8076483001172333e-05, "loss": 0.2803, "step": 124100 }, { "epoch": 9.699336196798125, "grad_norm": 0.8643308877944946, "learning_rate": 1.8074919890582263e-05, "loss": 0.2613, "step": 124200 }, { "epoch": 9.707145646231941, "grad_norm": 1.1980983018875122, "learning_rate": 1.8073356779992185e-05, "loss": 0.2604, "step": 124300 }, { "epoch": 9.714955095665756, "grad_norm": 0.6800851821899414, "learning_rate": 1.807179366940211e-05, "loss": 0.2618, "step": 124400 }, { "epoch": 9.72276454509957, "grad_norm": 0.8524096608161926, "learning_rate": 1.8070230558812037e-05, "loss": 0.2582, "step": 124500 }, { "epoch": 9.730573994533385, "grad_norm": 0.8670868873596191, "learning_rate": 1.8068667448221963e-05, "loss": 0.2632, "step": 124600 }, { "epoch": 9.7383834439672, "grad_norm": 0.720768928527832, "learning_rate": 1.806710433763189e-05, "loss": 0.2668, "step": 124700 }, { "epoch": 9.746192893401016, "grad_norm": 0.7876792550086975, "learning_rate": 1.8065541227041815e-05, "loss": 0.2724, "step": 124800 }, { "epoch": 9.75400234283483, "grad_norm": 1.1576100587844849, "learning_rate": 1.806397811645174e-05, "loss": 0.2633, "step": 124900 }, { "epoch": 9.761811792268645, "grad_norm": 1.0780067443847656, "learning_rate": 1.8062415005861667e-05, "loss": 0.2709, "step": 125000 }, { "epoch": 9.76962124170246, "grad_norm": 1.1646336317062378, "learning_rate": 1.8060867526377493e-05, "loss": 0.2716, "step": 125100 }, { "epoch": 9.777430691136274, "grad_norm": 1.0148494243621826, "learning_rate": 1.8059304415787416e-05, "loss": 0.2769, "step": 125200 }, { "epoch": 9.78524014057009, "grad_norm": 0.8026964068412781, "learning_rate": 1.8057741305197345e-05, "loss": 0.2738, "step": 125300 }, { "epoch": 9.793049590003905, "grad_norm": 1.0151106119155884, "learning_rate": 1.805617819460727e-05, "loss": 0.2685, "step": 125400 }, { "epoch": 9.80085903943772, "grad_norm": 0.9404942989349365, "learning_rate": 1.8054615084017194e-05, "loss": 0.2673, "step": 125500 }, { "epoch": 9.808668488871534, "grad_norm": 1.1046905517578125, "learning_rate": 1.805305197342712e-05, "loss": 0.2684, "step": 125600 }, { "epoch": 9.816477938305349, "grad_norm": 1.1244338750839233, "learning_rate": 1.805148886283705e-05, "loss": 0.2641, "step": 125700 }, { "epoch": 9.824287387739165, "grad_norm": 0.7566142678260803, "learning_rate": 1.8049925752246972e-05, "loss": 0.2593, "step": 125800 }, { "epoch": 9.83209683717298, "grad_norm": 1.0991520881652832, "learning_rate": 1.8048362641656898e-05, "loss": 0.2736, "step": 125900 }, { "epoch": 9.839906286606794, "grad_norm": 1.0630158185958862, "learning_rate": 1.8046799531066824e-05, "loss": 0.2714, "step": 126000 }, { "epoch": 9.847715736040609, "grad_norm": 1.0158815383911133, "learning_rate": 1.804523642047675e-05, "loss": 0.2661, "step": 126100 }, { "epoch": 9.855525185474423, "grad_norm": 1.246567964553833, "learning_rate": 1.8043673309886676e-05, "loss": 0.25, "step": 126200 }, { "epoch": 9.863334634908238, "grad_norm": 0.8306715488433838, "learning_rate": 1.8042110199296602e-05, "loss": 0.2761, "step": 126300 }, { "epoch": 9.871144084342054, "grad_norm": 0.8288384079933167, "learning_rate": 1.8040547088706528e-05, "loss": 0.2628, "step": 126400 }, { "epoch": 9.878953533775869, "grad_norm": 1.0349608659744263, "learning_rate": 1.8038983978116454e-05, "loss": 0.2686, "step": 126500 }, { "epoch": 9.886762983209683, "grad_norm": 0.935551106929779, "learning_rate": 1.803742086752638e-05, "loss": 0.2656, "step": 126600 }, { "epoch": 9.894572432643498, "grad_norm": 0.8951859474182129, "learning_rate": 1.8035857756936306e-05, "loss": 0.2597, "step": 126700 }, { "epoch": 9.902381882077314, "grad_norm": 0.9531537890434265, "learning_rate": 1.8034294646346232e-05, "loss": 0.2674, "step": 126800 }, { "epoch": 9.910191331511129, "grad_norm": 0.5789433121681213, "learning_rate": 1.8032731535756155e-05, "loss": 0.2628, "step": 126900 }, { "epoch": 9.918000780944944, "grad_norm": 1.147503137588501, "learning_rate": 1.803116842516608e-05, "loss": 0.2733, "step": 127000 }, { "epoch": 9.925810230378758, "grad_norm": 0.8560211658477783, "learning_rate": 1.802962094568191e-05, "loss": 0.2698, "step": 127100 }, { "epoch": 9.933619679812573, "grad_norm": 0.7029670476913452, "learning_rate": 1.8028057835091836e-05, "loss": 0.271, "step": 127200 }, { "epoch": 9.941429129246387, "grad_norm": 0.9978414177894592, "learning_rate": 1.802649472450176e-05, "loss": 0.2517, "step": 127300 }, { "epoch": 9.949238578680204, "grad_norm": 0.905877411365509, "learning_rate": 1.8024931613911685e-05, "loss": 0.2783, "step": 127400 }, { "epoch": 9.957048028114018, "grad_norm": 1.013178825378418, "learning_rate": 1.802336850332161e-05, "loss": 0.2556, "step": 127500 }, { "epoch": 9.964857477547833, "grad_norm": 0.9851475954055786, "learning_rate": 1.8021805392731537e-05, "loss": 0.2659, "step": 127600 }, { "epoch": 9.972666926981647, "grad_norm": 0.981611967086792, "learning_rate": 1.8020242282141463e-05, "loss": 0.2682, "step": 127700 }, { "epoch": 9.980476376415464, "grad_norm": 0.7983320355415344, "learning_rate": 1.801867917155139e-05, "loss": 0.259, "step": 127800 }, { "epoch": 9.988285825849278, "grad_norm": 0.9113990068435669, "learning_rate": 1.8017116060961315e-05, "loss": 0.2643, "step": 127900 }, { "epoch": 9.996095275283093, "grad_norm": 1.0225112438201904, "learning_rate": 1.801555295037124e-05, "loss": 0.2597, "step": 128000 }, { "epoch": 10.003904724716907, "grad_norm": 1.1934940814971924, "learning_rate": 1.8013989839781167e-05, "loss": 0.2494, "step": 128100 }, { "epoch": 10.011714174150722, "grad_norm": 0.9636716246604919, "learning_rate": 1.8012426729191093e-05, "loss": 0.2621, "step": 128200 }, { "epoch": 10.019523623584536, "grad_norm": 1.4203592538833618, "learning_rate": 1.801086361860102e-05, "loss": 0.2604, "step": 128300 }, { "epoch": 10.027333073018353, "grad_norm": 0.8860512375831604, "learning_rate": 1.8009316139116845e-05, "loss": 0.2538, "step": 128400 }, { "epoch": 10.035142522452167, "grad_norm": 1.068411111831665, "learning_rate": 1.8007753028526767e-05, "loss": 0.262, "step": 128500 }, { "epoch": 10.042951971885982, "grad_norm": 0.9057846665382385, "learning_rate": 1.8006189917936697e-05, "loss": 0.2496, "step": 128600 }, { "epoch": 10.050761421319796, "grad_norm": 1.0000461339950562, "learning_rate": 1.8004626807346623e-05, "loss": 0.2585, "step": 128700 }, { "epoch": 10.058570870753611, "grad_norm": 0.8489338159561157, "learning_rate": 1.8003063696756545e-05, "loss": 0.2641, "step": 128800 }, { "epoch": 10.066380320187427, "grad_norm": 0.8931035399436951, "learning_rate": 1.8001500586166475e-05, "loss": 0.2557, "step": 128900 }, { "epoch": 10.074189769621242, "grad_norm": 0.9536027312278748, "learning_rate": 1.7999937475576397e-05, "loss": 0.255, "step": 129000 }, { "epoch": 10.081999219055056, "grad_norm": 0.8968120217323303, "learning_rate": 1.7998374364986323e-05, "loss": 0.2539, "step": 129100 }, { "epoch": 10.089808668488871, "grad_norm": 1.1486092805862427, "learning_rate": 1.799681125439625e-05, "loss": 0.2475, "step": 129200 }, { "epoch": 10.097618117922686, "grad_norm": 0.9897533655166626, "learning_rate": 1.7995248143806175e-05, "loss": 0.2471, "step": 129300 }, { "epoch": 10.105427567356502, "grad_norm": 0.7280323505401611, "learning_rate": 1.79936850332161e-05, "loss": 0.2584, "step": 129400 }, { "epoch": 10.113237016790317, "grad_norm": 0.8905358910560608, "learning_rate": 1.7992121922626027e-05, "loss": 0.2608, "step": 129500 }, { "epoch": 10.121046466224131, "grad_norm": 1.0517534017562866, "learning_rate": 1.7990558812035953e-05, "loss": 0.2589, "step": 129600 }, { "epoch": 10.128855915657946, "grad_norm": 1.541553020477295, "learning_rate": 1.798899570144588e-05, "loss": 0.2621, "step": 129700 }, { "epoch": 10.13666536509176, "grad_norm": 0.8779164552688599, "learning_rate": 1.7987432590855805e-05, "loss": 0.2536, "step": 129800 }, { "epoch": 10.144474814525577, "grad_norm": 1.1161167621612549, "learning_rate": 1.7985869480265728e-05, "loss": 0.2489, "step": 129900 }, { "epoch": 10.152284263959391, "grad_norm": 0.999082088470459, "learning_rate": 1.7984306369675657e-05, "loss": 0.2666, "step": 130000 }, { "epoch": 10.160093713393206, "grad_norm": 0.9592772722244263, "learning_rate": 1.7982743259085583e-05, "loss": 0.26, "step": 130100 }, { "epoch": 10.16790316282702, "grad_norm": 1.0480682849884033, "learning_rate": 1.7981180148495506e-05, "loss": 0.256, "step": 130200 }, { "epoch": 10.175712612260835, "grad_norm": 1.0796922445297241, "learning_rate": 1.7979617037905432e-05, "loss": 0.2634, "step": 130300 }, { "epoch": 10.183522061694651, "grad_norm": 0.8052628040313721, "learning_rate": 1.7978053927315358e-05, "loss": 0.2636, "step": 130400 }, { "epoch": 10.191331511128466, "grad_norm": 0.8707953095436096, "learning_rate": 1.7976490816725284e-05, "loss": 0.2551, "step": 130500 }, { "epoch": 10.19914096056228, "grad_norm": 1.1583307981491089, "learning_rate": 1.797492770613521e-05, "loss": 0.2669, "step": 130600 }, { "epoch": 10.206950409996095, "grad_norm": 0.8590611219406128, "learning_rate": 1.7973364595545136e-05, "loss": 0.2606, "step": 130700 }, { "epoch": 10.21475985942991, "grad_norm": 1.029362678527832, "learning_rate": 1.7971801484955062e-05, "loss": 0.2657, "step": 130800 }, { "epoch": 10.222569308863726, "grad_norm": 1.1454167366027832, "learning_rate": 1.7970238374364988e-05, "loss": 0.2579, "step": 130900 }, { "epoch": 10.23037875829754, "grad_norm": 0.6559708714485168, "learning_rate": 1.7968675263774914e-05, "loss": 0.2578, "step": 131000 }, { "epoch": 10.238188207731355, "grad_norm": 1.1231006383895874, "learning_rate": 1.796711215318484e-05, "loss": 0.2589, "step": 131100 }, { "epoch": 10.24599765716517, "grad_norm": 1.1264257431030273, "learning_rate": 1.7965549042594766e-05, "loss": 0.2489, "step": 131200 }, { "epoch": 10.253807106598984, "grad_norm": 0.8126028180122375, "learning_rate": 1.7963985932004692e-05, "loss": 0.2643, "step": 131300 }, { "epoch": 10.2616165560328, "grad_norm": 0.8900143504142761, "learning_rate": 1.7962422821414615e-05, "loss": 0.2777, "step": 131400 }, { "epoch": 10.269426005466615, "grad_norm": 0.972007691860199, "learning_rate": 1.7960859710824544e-05, "loss": 0.2707, "step": 131500 }, { "epoch": 10.27723545490043, "grad_norm": 0.8504611253738403, "learning_rate": 1.7959296600234467e-05, "loss": 0.2457, "step": 131600 }, { "epoch": 10.285044904334244, "grad_norm": 0.9626019597053528, "learning_rate": 1.7957733489644393e-05, "loss": 0.2512, "step": 131700 }, { "epoch": 10.292854353768059, "grad_norm": 1.2120001316070557, "learning_rate": 1.795617037905432e-05, "loss": 0.2579, "step": 131800 }, { "epoch": 10.300663803201875, "grad_norm": 1.2339860200881958, "learning_rate": 1.7954607268464245e-05, "loss": 0.2589, "step": 131900 }, { "epoch": 10.30847325263569, "grad_norm": 0.876400887966156, "learning_rate": 1.795304415787417e-05, "loss": 0.2439, "step": 132000 }, { "epoch": 10.316282702069504, "grad_norm": 0.7515790462493896, "learning_rate": 1.7951481047284097e-05, "loss": 0.2542, "step": 132100 }, { "epoch": 10.324092151503319, "grad_norm": 0.7441714406013489, "learning_rate": 1.7949917936694023e-05, "loss": 0.2592, "step": 132200 }, { "epoch": 10.331901600937133, "grad_norm": 0.9107796549797058, "learning_rate": 1.794837045720985e-05, "loss": 0.2662, "step": 132300 }, { "epoch": 10.33971105037095, "grad_norm": 1.1548948287963867, "learning_rate": 1.7946807346619775e-05, "loss": 0.2604, "step": 132400 }, { "epoch": 10.347520499804764, "grad_norm": 1.0243403911590576, "learning_rate": 1.79452442360297e-05, "loss": 0.273, "step": 132500 }, { "epoch": 10.355329949238579, "grad_norm": 1.4229692220687866, "learning_rate": 1.7943681125439627e-05, "loss": 0.2552, "step": 132600 }, { "epoch": 10.363139398672393, "grad_norm": 1.00844407081604, "learning_rate": 1.7942118014849553e-05, "loss": 0.2385, "step": 132700 }, { "epoch": 10.370948848106208, "grad_norm": 0.9128909707069397, "learning_rate": 1.794055490425948e-05, "loss": 0.2597, "step": 132800 }, { "epoch": 10.378758297540024, "grad_norm": 1.053162932395935, "learning_rate": 1.7938991793669405e-05, "loss": 0.2497, "step": 132900 }, { "epoch": 10.386567746973839, "grad_norm": 0.9146475195884705, "learning_rate": 1.793742868307933e-05, "loss": 0.25, "step": 133000 }, { "epoch": 10.394377196407653, "grad_norm": 0.998521089553833, "learning_rate": 1.7935865572489254e-05, "loss": 0.2478, "step": 133100 }, { "epoch": 10.402186645841468, "grad_norm": 0.9431730508804321, "learning_rate": 1.793430246189918e-05, "loss": 0.2467, "step": 133200 }, { "epoch": 10.409996095275282, "grad_norm": 0.9027838110923767, "learning_rate": 1.793273935130911e-05, "loss": 0.2531, "step": 133300 }, { "epoch": 10.417805544709099, "grad_norm": 0.9306374192237854, "learning_rate": 1.793117624071903e-05, "loss": 0.2593, "step": 133400 }, { "epoch": 10.425614994142913, "grad_norm": 0.8729743361473083, "learning_rate": 1.7929613130128958e-05, "loss": 0.257, "step": 133500 }, { "epoch": 10.433424443576728, "grad_norm": 0.7633653283119202, "learning_rate": 1.7928050019538884e-05, "loss": 0.2629, "step": 133600 }, { "epoch": 10.441233893010542, "grad_norm": 0.7634978890419006, "learning_rate": 1.792648690894881e-05, "loss": 0.2724, "step": 133700 }, { "epoch": 10.449043342444357, "grad_norm": 1.1082350015640259, "learning_rate": 1.7924923798358736e-05, "loss": 0.2519, "step": 133800 }, { "epoch": 10.456852791878173, "grad_norm": 0.8378466963768005, "learning_rate": 1.792336068776866e-05, "loss": 0.244, "step": 133900 }, { "epoch": 10.464662241311988, "grad_norm": 0.8386598825454712, "learning_rate": 1.7921797577178588e-05, "loss": 0.2544, "step": 134000 }, { "epoch": 10.472471690745802, "grad_norm": 0.7771602869033813, "learning_rate": 1.7920234466588514e-05, "loss": 0.2537, "step": 134100 }, { "epoch": 10.480281140179617, "grad_norm": 0.8470435738563538, "learning_rate": 1.791867135599844e-05, "loss": 0.2453, "step": 134200 }, { "epoch": 10.488090589613432, "grad_norm": 1.166733980178833, "learning_rate": 1.7917108245408362e-05, "loss": 0.2624, "step": 134300 }, { "epoch": 10.495900039047248, "grad_norm": 1.1435627937316895, "learning_rate": 1.7915545134818292e-05, "loss": 0.2653, "step": 134400 }, { "epoch": 10.503709488481062, "grad_norm": 0.8292436003684998, "learning_rate": 1.7913982024228214e-05, "loss": 0.2568, "step": 134500 }, { "epoch": 10.511518937914877, "grad_norm": 0.7752034068107605, "learning_rate": 1.791241891363814e-05, "loss": 0.2396, "step": 134600 }, { "epoch": 10.519328387348692, "grad_norm": 1.128818154335022, "learning_rate": 1.7910855803048066e-05, "loss": 0.2465, "step": 134700 }, { "epoch": 10.527137836782506, "grad_norm": 0.9121723771095276, "learning_rate": 1.7909292692457992e-05, "loss": 0.2469, "step": 134800 }, { "epoch": 10.534947286216322, "grad_norm": 1.0522806644439697, "learning_rate": 1.790772958186792e-05, "loss": 0.25, "step": 134900 }, { "epoch": 10.542756735650137, "grad_norm": 1.0877002477645874, "learning_rate": 1.7906166471277844e-05, "loss": 0.2525, "step": 135000 }, { "epoch": 10.550566185083952, "grad_norm": 0.8708367943763733, "learning_rate": 1.790460336068777e-05, "loss": 0.2418, "step": 135100 }, { "epoch": 10.558375634517766, "grad_norm": 0.8987772464752197, "learning_rate": 1.7903040250097696e-05, "loss": 0.2457, "step": 135200 }, { "epoch": 10.56618508395158, "grad_norm": 0.9944095015525818, "learning_rate": 1.7901477139507622e-05, "loss": 0.2359, "step": 135300 }, { "epoch": 10.573994533385397, "grad_norm": 1.073951005935669, "learning_rate": 1.7899914028917545e-05, "loss": 0.2547, "step": 135400 }, { "epoch": 10.581803982819212, "grad_norm": 0.8596273064613342, "learning_rate": 1.7898350918327474e-05, "loss": 0.2459, "step": 135500 }, { "epoch": 10.589613432253026, "grad_norm": 1.2592207193374634, "learning_rate": 1.78967878077374e-05, "loss": 0.2539, "step": 135600 }, { "epoch": 10.59742288168684, "grad_norm": 1.2251505851745605, "learning_rate": 1.7895224697147323e-05, "loss": 0.2545, "step": 135700 }, { "epoch": 10.605232331120655, "grad_norm": 0.7919961810112, "learning_rate": 1.789366158655725e-05, "loss": 0.2587, "step": 135800 }, { "epoch": 10.613041780554472, "grad_norm": 1.192502737045288, "learning_rate": 1.7892098475967175e-05, "loss": 0.257, "step": 135900 }, { "epoch": 10.620851229988286, "grad_norm": 0.9787290692329407, "learning_rate": 1.78905353653771e-05, "loss": 0.2534, "step": 136000 }, { "epoch": 10.6286606794221, "grad_norm": 0.7784271240234375, "learning_rate": 1.7888972254787027e-05, "loss": 0.2656, "step": 136100 }, { "epoch": 10.636470128855915, "grad_norm": 1.072007656097412, "learning_rate": 1.7887409144196953e-05, "loss": 0.2426, "step": 136200 }, { "epoch": 10.64427957828973, "grad_norm": 1.229270577430725, "learning_rate": 1.788586166471278e-05, "loss": 0.2495, "step": 136300 }, { "epoch": 10.652089027723546, "grad_norm": 1.0051836967468262, "learning_rate": 1.7884298554122705e-05, "loss": 0.2453, "step": 136400 }, { "epoch": 10.65989847715736, "grad_norm": 0.931196391582489, "learning_rate": 1.788273544353263e-05, "loss": 0.2483, "step": 136500 }, { "epoch": 10.667707926591175, "grad_norm": 0.9106378555297852, "learning_rate": 1.7881187964048457e-05, "loss": 0.2523, "step": 136600 }, { "epoch": 10.67551737602499, "grad_norm": 1.0450843572616577, "learning_rate": 1.7879624853458383e-05, "loss": 0.2583, "step": 136700 }, { "epoch": 10.683326825458805, "grad_norm": 0.7723289132118225, "learning_rate": 1.787806174286831e-05, "loss": 0.2607, "step": 136800 }, { "epoch": 10.69113627489262, "grad_norm": 1.0050076246261597, "learning_rate": 1.7876498632278235e-05, "loss": 0.2432, "step": 136900 }, { "epoch": 10.698945724326435, "grad_norm": 0.8825809359550476, "learning_rate": 1.787493552168816e-05, "loss": 0.238, "step": 137000 }, { "epoch": 10.70675517376025, "grad_norm": 0.7401546239852905, "learning_rate": 1.7873372411098087e-05, "loss": 0.2603, "step": 137100 }, { "epoch": 10.714564623194065, "grad_norm": 0.8159899115562439, "learning_rate": 1.7871809300508013e-05, "loss": 0.2428, "step": 137200 }, { "epoch": 10.72237407262788, "grad_norm": 1.0040541887283325, "learning_rate": 1.787024618991794e-05, "loss": 0.2422, "step": 137300 }, { "epoch": 10.730183522061695, "grad_norm": 0.9106484651565552, "learning_rate": 1.7868683079327865e-05, "loss": 0.2496, "step": 137400 }, { "epoch": 10.73799297149551, "grad_norm": 0.9485523700714111, "learning_rate": 1.7867119968737788e-05, "loss": 0.2501, "step": 137500 }, { "epoch": 10.745802420929325, "grad_norm": 0.9653137922286987, "learning_rate": 1.7865556858147714e-05, "loss": 0.2425, "step": 137600 }, { "epoch": 10.75361187036314, "grad_norm": 0.8802712559700012, "learning_rate": 1.7863993747557643e-05, "loss": 0.2391, "step": 137700 }, { "epoch": 10.761421319796954, "grad_norm": 0.8589837551116943, "learning_rate": 1.7862430636967566e-05, "loss": 0.2562, "step": 137800 }, { "epoch": 10.76923076923077, "grad_norm": 0.8044468760490417, "learning_rate": 1.7860867526377492e-05, "loss": 0.2682, "step": 137900 }, { "epoch": 10.777040218664585, "grad_norm": 1.045089840888977, "learning_rate": 1.7859304415787418e-05, "loss": 0.2435, "step": 138000 }, { "epoch": 10.7848496680984, "grad_norm": 0.9740383625030518, "learning_rate": 1.7857741305197344e-05, "loss": 0.2574, "step": 138100 }, { "epoch": 10.792659117532214, "grad_norm": 1.025576114654541, "learning_rate": 1.785617819460727e-05, "loss": 0.2448, "step": 138200 }, { "epoch": 10.800468566966028, "grad_norm": 0.7664403915405273, "learning_rate": 1.7854615084017196e-05, "loss": 0.244, "step": 138300 }, { "epoch": 10.808278016399843, "grad_norm": 1.039302945137024, "learning_rate": 1.7853051973427122e-05, "loss": 0.2411, "step": 138400 }, { "epoch": 10.81608746583366, "grad_norm": 0.9581201076507568, "learning_rate": 1.7851488862837048e-05, "loss": 0.2496, "step": 138500 }, { "epoch": 10.823896915267474, "grad_norm": 1.1588441133499146, "learning_rate": 1.7849925752246974e-05, "loss": 0.2526, "step": 138600 }, { "epoch": 10.831706364701288, "grad_norm": 0.9728406667709351, "learning_rate": 1.7848362641656896e-05, "loss": 0.244, "step": 138700 }, { "epoch": 10.839515814135103, "grad_norm": 0.947182834148407, "learning_rate": 1.7846799531066826e-05, "loss": 0.2561, "step": 138800 }, { "epoch": 10.84732526356892, "grad_norm": 0.9559268951416016, "learning_rate": 1.7845236420476752e-05, "loss": 0.2389, "step": 138900 }, { "epoch": 10.855134713002734, "grad_norm": 0.9298532009124756, "learning_rate": 1.7843673309886675e-05, "loss": 0.231, "step": 139000 }, { "epoch": 10.862944162436548, "grad_norm": 0.9832693338394165, "learning_rate": 1.78421101992966e-05, "loss": 0.2272, "step": 139100 }, { "epoch": 10.870753611870363, "grad_norm": 1.2468117475509644, "learning_rate": 1.7840547088706527e-05, "loss": 0.2529, "step": 139200 }, { "epoch": 10.878563061304177, "grad_norm": 1.0330675840377808, "learning_rate": 1.7838983978116453e-05, "loss": 0.2426, "step": 139300 }, { "epoch": 10.886372510737992, "grad_norm": 0.9902207255363464, "learning_rate": 1.783742086752638e-05, "loss": 0.2557, "step": 139400 }, { "epoch": 10.894181960171808, "grad_norm": 0.8359582424163818, "learning_rate": 1.7835857756936305e-05, "loss": 0.2588, "step": 139500 }, { "epoch": 10.901991409605623, "grad_norm": 1.1092885732650757, "learning_rate": 1.783429464634623e-05, "loss": 0.2634, "step": 139600 }, { "epoch": 10.909800859039438, "grad_norm": 0.9002270698547363, "learning_rate": 1.7832731535756157e-05, "loss": 0.2545, "step": 139700 }, { "epoch": 10.917610308473252, "grad_norm": 1.0879848003387451, "learning_rate": 1.7831168425166083e-05, "loss": 0.2438, "step": 139800 }, { "epoch": 10.925419757907068, "grad_norm": 0.9094911217689514, "learning_rate": 1.782960531457601e-05, "loss": 0.2451, "step": 139900 }, { "epoch": 10.933229207340883, "grad_norm": 1.1810535192489624, "learning_rate": 1.7828042203985935e-05, "loss": 0.2501, "step": 140000 }, { "epoch": 10.941038656774698, "grad_norm": 1.110302209854126, "learning_rate": 1.7826479093395857e-05, "loss": 0.2465, "step": 140100 }, { "epoch": 10.948848106208512, "grad_norm": 0.8959664702415466, "learning_rate": 1.7824915982805787e-05, "loss": 0.245, "step": 140200 }, { "epoch": 10.956657555642327, "grad_norm": 0.7523890137672424, "learning_rate": 1.7823352872215713e-05, "loss": 0.2524, "step": 140300 }, { "epoch": 10.964467005076141, "grad_norm": 1.1155420541763306, "learning_rate": 1.7821789761625635e-05, "loss": 0.2492, "step": 140400 }, { "epoch": 10.972276454509958, "grad_norm": 1.041972041130066, "learning_rate": 1.782022665103556e-05, "loss": 0.2459, "step": 140500 }, { "epoch": 10.980085903943772, "grad_norm": 0.8934460282325745, "learning_rate": 1.781867917155139e-05, "loss": 0.2555, "step": 140600 }, { "epoch": 10.987895353377587, "grad_norm": 0.8889949917793274, "learning_rate": 1.7817116060961313e-05, "loss": 0.2457, "step": 140700 }, { "epoch": 10.995704802811401, "grad_norm": 1.3171417713165283, "learning_rate": 1.781555295037124e-05, "loss": 0.2467, "step": 140800 }, { "epoch": 11.003514252245216, "grad_norm": 0.8319826126098633, "learning_rate": 1.7813989839781165e-05, "loss": 0.2418, "step": 140900 }, { "epoch": 11.011323701679032, "grad_norm": 1.0765905380249023, "learning_rate": 1.781242672919109e-05, "loss": 0.2403, "step": 141000 }, { "epoch": 11.019133151112847, "grad_norm": 0.8536299467086792, "learning_rate": 1.7810863618601017e-05, "loss": 0.2442, "step": 141100 }, { "epoch": 11.026942600546661, "grad_norm": 1.1073857545852661, "learning_rate": 1.7809300508010943e-05, "loss": 0.2436, "step": 141200 }, { "epoch": 11.034752049980476, "grad_norm": 0.9137203097343445, "learning_rate": 1.780775302852677e-05, "loss": 0.2357, "step": 141300 }, { "epoch": 11.04256149941429, "grad_norm": 0.9700175523757935, "learning_rate": 1.7806189917936695e-05, "loss": 0.2408, "step": 141400 }, { "epoch": 11.050370948848107, "grad_norm": 1.1891883611679077, "learning_rate": 1.780462680734662e-05, "loss": 0.2532, "step": 141500 }, { "epoch": 11.058180398281921, "grad_norm": 0.9186915159225464, "learning_rate": 1.7803063696756547e-05, "loss": 0.2541, "step": 141600 }, { "epoch": 11.065989847715736, "grad_norm": 0.8762014508247375, "learning_rate": 1.7801500586166473e-05, "loss": 0.2365, "step": 141700 }, { "epoch": 11.07379929714955, "grad_norm": 0.8541187644004822, "learning_rate": 1.77999374755764e-05, "loss": 0.2457, "step": 141800 }, { "epoch": 11.081608746583365, "grad_norm": 0.8798760771751404, "learning_rate": 1.7798374364986325e-05, "loss": 0.2507, "step": 141900 }, { "epoch": 11.089418196017181, "grad_norm": 1.1054275035858154, "learning_rate": 1.7796811254396248e-05, "loss": 0.2491, "step": 142000 }, { "epoch": 11.097227645450996, "grad_norm": 0.9322449564933777, "learning_rate": 1.7795248143806177e-05, "loss": 0.2398, "step": 142100 }, { "epoch": 11.10503709488481, "grad_norm": 1.0157818794250488, "learning_rate": 1.77936850332161e-05, "loss": 0.2338, "step": 142200 }, { "epoch": 11.112846544318625, "grad_norm": 1.0045311450958252, "learning_rate": 1.7792121922626026e-05, "loss": 0.2486, "step": 142300 }, { "epoch": 11.12065599375244, "grad_norm": 0.9703744649887085, "learning_rate": 1.7790558812035955e-05, "loss": 0.2449, "step": 142400 }, { "epoch": 11.128465443186256, "grad_norm": 0.8165519833564758, "learning_rate": 1.7788995701445878e-05, "loss": 0.238, "step": 142500 }, { "epoch": 11.13627489262007, "grad_norm": 0.9721464514732361, "learning_rate": 1.7787432590855804e-05, "loss": 0.2351, "step": 142600 }, { "epoch": 11.144084342053885, "grad_norm": 0.9640051126480103, "learning_rate": 1.778586948026573e-05, "loss": 0.2364, "step": 142700 }, { "epoch": 11.1518937914877, "grad_norm": 1.012770652770996, "learning_rate": 1.7784306369675656e-05, "loss": 0.2442, "step": 142800 }, { "epoch": 11.159703240921514, "grad_norm": 1.0033341646194458, "learning_rate": 1.7782743259085582e-05, "loss": 0.2447, "step": 142900 }, { "epoch": 11.16751269035533, "grad_norm": 0.6418032050132751, "learning_rate": 1.7781180148495508e-05, "loss": 0.2418, "step": 143000 }, { "epoch": 11.175322139789145, "grad_norm": 1.1134809255599976, "learning_rate": 1.777961703790543e-05, "loss": 0.2442, "step": 143100 }, { "epoch": 11.18313158922296, "grad_norm": 1.0568909645080566, "learning_rate": 1.777805392731536e-05, "loss": 0.2387, "step": 143200 }, { "epoch": 11.190941038656774, "grad_norm": 0.8510876893997192, "learning_rate": 1.7776490816725286e-05, "loss": 0.2355, "step": 143300 }, { "epoch": 11.198750488090589, "grad_norm": 0.6918273568153381, "learning_rate": 1.777492770613521e-05, "loss": 0.2376, "step": 143400 }, { "epoch": 11.206559937524405, "grad_norm": 0.8617496490478516, "learning_rate": 1.7773364595545138e-05, "loss": 0.2387, "step": 143500 }, { "epoch": 11.21436938695822, "grad_norm": 0.9683915972709656, "learning_rate": 1.7771801484955064e-05, "loss": 0.2368, "step": 143600 }, { "epoch": 11.222178836392034, "grad_norm": 1.0224312543869019, "learning_rate": 1.7770238374364987e-05, "loss": 0.2441, "step": 143700 }, { "epoch": 11.229988285825849, "grad_norm": 0.7473081350326538, "learning_rate": 1.7768675263774913e-05, "loss": 0.2516, "step": 143800 }, { "epoch": 11.237797735259663, "grad_norm": 0.9434145092964172, "learning_rate": 1.776711215318484e-05, "loss": 0.2267, "step": 143900 }, { "epoch": 11.24560718469348, "grad_norm": 0.887402355670929, "learning_rate": 1.7765549042594765e-05, "loss": 0.2336, "step": 144000 }, { "epoch": 11.253416634127294, "grad_norm": 0.9610289931297302, "learning_rate": 1.776398593200469e-05, "loss": 0.2362, "step": 144100 }, { "epoch": 11.261226083561109, "grad_norm": 0.8524101972579956, "learning_rate": 1.7762422821414617e-05, "loss": 0.2372, "step": 144200 }, { "epoch": 11.269035532994923, "grad_norm": 1.01228666305542, "learning_rate": 1.7760859710824543e-05, "loss": 0.226, "step": 144300 }, { "epoch": 11.276844982428738, "grad_norm": 1.0873188972473145, "learning_rate": 1.775929660023447e-05, "loss": 0.2393, "step": 144400 }, { "epoch": 11.284654431862554, "grad_norm": 0.9294605851173401, "learning_rate": 1.7757733489644395e-05, "loss": 0.2348, "step": 144500 }, { "epoch": 11.292463881296369, "grad_norm": 1.0935219526290894, "learning_rate": 1.775617037905432e-05, "loss": 0.2337, "step": 144600 }, { "epoch": 11.300273330730183, "grad_norm": 1.033736228942871, "learning_rate": 1.7754607268464247e-05, "loss": 0.248, "step": 144700 }, { "epoch": 11.308082780163998, "grad_norm": 0.8749229311943054, "learning_rate": 1.775304415787417e-05, "loss": 0.2486, "step": 144800 }, { "epoch": 11.315892229597813, "grad_norm": 0.7569709420204163, "learning_rate": 1.7751481047284095e-05, "loss": 0.242, "step": 144900 }, { "epoch": 11.323701679031629, "grad_norm": 1.029752492904663, "learning_rate": 1.7749917936694025e-05, "loss": 0.2516, "step": 145000 }, { "epoch": 11.331511128465444, "grad_norm": 1.0709694623947144, "learning_rate": 1.7748354826103947e-05, "loss": 0.2427, "step": 145100 }, { "epoch": 11.339320577899258, "grad_norm": 0.7665264010429382, "learning_rate": 1.7746791715513873e-05, "loss": 0.245, "step": 145200 }, { "epoch": 11.347130027333073, "grad_norm": 0.823897659778595, "learning_rate": 1.77452442360297e-05, "loss": 0.2383, "step": 145300 }, { "epoch": 11.354939476766887, "grad_norm": 0.9198905229568481, "learning_rate": 1.7743681125439625e-05, "loss": 0.2336, "step": 145400 }, { "epoch": 11.362748926200704, "grad_norm": 0.7961851358413696, "learning_rate": 1.774211801484955e-05, "loss": 0.2378, "step": 145500 }, { "epoch": 11.370558375634518, "grad_norm": 0.7704211473464966, "learning_rate": 1.7740554904259477e-05, "loss": 0.2373, "step": 145600 }, { "epoch": 11.378367825068333, "grad_norm": 1.017962098121643, "learning_rate": 1.7738991793669403e-05, "loss": 0.239, "step": 145700 }, { "epoch": 11.386177274502147, "grad_norm": 1.4405622482299805, "learning_rate": 1.773742868307933e-05, "loss": 0.2452, "step": 145800 }, { "epoch": 11.393986723935962, "grad_norm": 0.8968992233276367, "learning_rate": 1.7735865572489255e-05, "loss": 0.2443, "step": 145900 }, { "epoch": 11.401796173369778, "grad_norm": 0.847322404384613, "learning_rate": 1.773430246189918e-05, "loss": 0.2405, "step": 146000 }, { "epoch": 11.409605622803593, "grad_norm": 0.9945650696754456, "learning_rate": 1.7732739351309108e-05, "loss": 0.246, "step": 146100 }, { "epoch": 11.417415072237407, "grad_norm": 0.9590684771537781, "learning_rate": 1.7731176240719034e-05, "loss": 0.2509, "step": 146200 }, { "epoch": 11.425224521671222, "grad_norm": 0.7392657995223999, "learning_rate": 1.7729613130128956e-05, "loss": 0.2412, "step": 146300 }, { "epoch": 11.433033971105036, "grad_norm": 0.9596700668334961, "learning_rate": 1.7728050019538886e-05, "loss": 0.2443, "step": 146400 }, { "epoch": 11.440843420538853, "grad_norm": 1.1228785514831543, "learning_rate": 1.772648690894881e-05, "loss": 0.2394, "step": 146500 }, { "epoch": 11.448652869972667, "grad_norm": 1.0421438217163086, "learning_rate": 1.7724923798358734e-05, "loss": 0.2499, "step": 146600 }, { "epoch": 11.456462319406482, "grad_norm": 0.8273714780807495, "learning_rate": 1.772336068776866e-05, "loss": 0.2457, "step": 146700 }, { "epoch": 11.464271768840296, "grad_norm": 0.8781546950340271, "learning_rate": 1.7721797577178586e-05, "loss": 0.2337, "step": 146800 }, { "epoch": 11.472081218274111, "grad_norm": 0.7646591067314148, "learning_rate": 1.7720234466588512e-05, "loss": 0.2365, "step": 146900 }, { "epoch": 11.479890667707927, "grad_norm": 1.0014855861663818, "learning_rate": 1.7718671355998438e-05, "loss": 0.2363, "step": 147000 }, { "epoch": 11.487700117141742, "grad_norm": 1.2053076028823853, "learning_rate": 1.7717108245408364e-05, "loss": 0.2348, "step": 147100 }, { "epoch": 11.495509566575556, "grad_norm": 1.017988920211792, "learning_rate": 1.771554513481829e-05, "loss": 0.2365, "step": 147200 }, { "epoch": 11.503319016009371, "grad_norm": 1.1997560262680054, "learning_rate": 1.7713997655334116e-05, "loss": 0.2441, "step": 147300 }, { "epoch": 11.511128465443186, "grad_norm": 0.6547896265983582, "learning_rate": 1.7712434544744042e-05, "loss": 0.2363, "step": 147400 }, { "epoch": 11.518937914877002, "grad_norm": 0.9807236790657043, "learning_rate": 1.7710871434153968e-05, "loss": 0.2358, "step": 147500 }, { "epoch": 11.526747364310816, "grad_norm": 0.893150269985199, "learning_rate": 1.7709308323563894e-05, "loss": 0.2243, "step": 147600 }, { "epoch": 11.534556813744631, "grad_norm": 1.184472680091858, "learning_rate": 1.770774521297382e-05, "loss": 0.2397, "step": 147700 }, { "epoch": 11.542366263178446, "grad_norm": 1.0924776792526245, "learning_rate": 1.7706182102383743e-05, "loss": 0.2301, "step": 147800 }, { "epoch": 11.55017571261226, "grad_norm": 0.9261873960494995, "learning_rate": 1.7704618991793672e-05, "loss": 0.2368, "step": 147900 }, { "epoch": 11.557985162046077, "grad_norm": 1.007124662399292, "learning_rate": 1.7703055881203598e-05, "loss": 0.2295, "step": 148000 }, { "epoch": 11.565794611479891, "grad_norm": 0.9083753228187561, "learning_rate": 1.770149277061352e-05, "loss": 0.2501, "step": 148100 }, { "epoch": 11.573604060913706, "grad_norm": 0.8021575212478638, "learning_rate": 1.7699929660023447e-05, "loss": 0.2419, "step": 148200 }, { "epoch": 11.58141351034752, "grad_norm": 0.8178331851959229, "learning_rate": 1.7698366549433373e-05, "loss": 0.2295, "step": 148300 }, { "epoch": 11.589222959781335, "grad_norm": 0.9790313243865967, "learning_rate": 1.76968034388433e-05, "loss": 0.2404, "step": 148400 }, { "epoch": 11.597032409215151, "grad_norm": 0.8207682967185974, "learning_rate": 1.7695240328253225e-05, "loss": 0.2338, "step": 148500 }, { "epoch": 11.604841858648966, "grad_norm": 0.7306053638458252, "learning_rate": 1.769367721766315e-05, "loss": 0.2351, "step": 148600 }, { "epoch": 11.61265130808278, "grad_norm": 0.8284056186676025, "learning_rate": 1.7692114107073077e-05, "loss": 0.2373, "step": 148700 }, { "epoch": 11.620460757516595, "grad_norm": 0.7813366055488586, "learning_rate": 1.7690550996483003e-05, "loss": 0.2343, "step": 148800 }, { "epoch": 11.62827020695041, "grad_norm": 1.0725852251052856, "learning_rate": 1.768898788589293e-05, "loss": 0.2278, "step": 148900 }, { "epoch": 11.636079656384226, "grad_norm": 1.0066187381744385, "learning_rate": 1.7687424775302855e-05, "loss": 0.2341, "step": 149000 }, { "epoch": 11.64388910581804, "grad_norm": 0.8271299600601196, "learning_rate": 1.768586166471278e-05, "loss": 0.2365, "step": 149100 }, { "epoch": 11.651698555251855, "grad_norm": 1.159485936164856, "learning_rate": 1.7684298554122704e-05, "loss": 0.2262, "step": 149200 }, { "epoch": 11.65950800468567, "grad_norm": 0.973098874092102, "learning_rate": 1.768275107463853e-05, "loss": 0.2344, "step": 149300 }, { "epoch": 11.667317454119484, "grad_norm": 0.9480162262916565, "learning_rate": 1.768118796404846e-05, "loss": 0.2338, "step": 149400 }, { "epoch": 11.6751269035533, "grad_norm": 0.5876905918121338, "learning_rate": 1.7679624853458385e-05, "loss": 0.2377, "step": 149500 }, { "epoch": 11.682936352987115, "grad_norm": 1.1765408515930176, "learning_rate": 1.7678061742868308e-05, "loss": 0.2384, "step": 149600 }, { "epoch": 11.69074580242093, "grad_norm": 1.0857717990875244, "learning_rate": 1.7676498632278237e-05, "loss": 0.2382, "step": 149700 }, { "epoch": 11.698555251854744, "grad_norm": 0.811991810798645, "learning_rate": 1.7674935521688163e-05, "loss": 0.2457, "step": 149800 }, { "epoch": 11.706364701288559, "grad_norm": 0.9020510315895081, "learning_rate": 1.7673372411098086e-05, "loss": 0.2152, "step": 149900 }, { "epoch": 11.714174150722375, "grad_norm": 0.80827397108078, "learning_rate": 1.767180930050801e-05, "loss": 0.2355, "step": 150000 }, { "epoch": 11.72198360015619, "grad_norm": 1.0584274530410767, "learning_rate": 1.7670246189917938e-05, "loss": 0.2241, "step": 150100 }, { "epoch": 11.729793049590004, "grad_norm": 0.8974628448486328, "learning_rate": 1.7668683079327864e-05, "loss": 0.2401, "step": 150200 }, { "epoch": 11.737602499023819, "grad_norm": 0.8113837838172913, "learning_rate": 1.766711996873779e-05, "loss": 0.2476, "step": 150300 }, { "epoch": 11.745411948457633, "grad_norm": 0.7186545729637146, "learning_rate": 1.7665556858147716e-05, "loss": 0.2435, "step": 150400 }, { "epoch": 11.75322139789145, "grad_norm": 1.1256885528564453, "learning_rate": 1.766399374755764e-05, "loss": 0.2272, "step": 150500 }, { "epoch": 11.761030847325264, "grad_norm": 0.9910681843757629, "learning_rate": 1.7662430636967568e-05, "loss": 0.2294, "step": 150600 }, { "epoch": 11.768840296759079, "grad_norm": 1.2361962795257568, "learning_rate": 1.7660867526377494e-05, "loss": 0.2423, "step": 150700 }, { "epoch": 11.776649746192893, "grad_norm": 1.0611270666122437, "learning_rate": 1.765930441578742e-05, "loss": 0.2278, "step": 150800 }, { "epoch": 11.784459195626708, "grad_norm": 0.6779769062995911, "learning_rate": 1.7657741305197346e-05, "loss": 0.2232, "step": 150900 }, { "epoch": 11.792268645060524, "grad_norm": 1.2255175113677979, "learning_rate": 1.765617819460727e-05, "loss": 0.2333, "step": 151000 }, { "epoch": 11.800078094494339, "grad_norm": 1.2015628814697266, "learning_rate": 1.7654615084017194e-05, "loss": 0.2352, "step": 151100 }, { "epoch": 11.807887543928153, "grad_norm": 0.9914200305938721, "learning_rate": 1.7653051973427124e-05, "loss": 0.2326, "step": 151200 }, { "epoch": 11.815696993361968, "grad_norm": 0.9092264175415039, "learning_rate": 1.765150449394295e-05, "loss": 0.2251, "step": 151300 }, { "epoch": 11.823506442795782, "grad_norm": 0.6066728234291077, "learning_rate": 1.7649941383352872e-05, "loss": 0.2274, "step": 151400 }, { "epoch": 11.831315892229597, "grad_norm": 1.1375062465667725, "learning_rate": 1.76483782727628e-05, "loss": 0.231, "step": 151500 }, { "epoch": 11.839125341663413, "grad_norm": 0.839840292930603, "learning_rate": 1.7646815162172724e-05, "loss": 0.2291, "step": 151600 }, { "epoch": 11.846934791097228, "grad_norm": 0.7539154887199402, "learning_rate": 1.764525205158265e-05, "loss": 0.2194, "step": 151700 }, { "epoch": 11.854744240531042, "grad_norm": 1.1252955198287964, "learning_rate": 1.7643688940992576e-05, "loss": 0.2275, "step": 151800 }, { "epoch": 11.862553689964857, "grad_norm": 0.7381853461265564, "learning_rate": 1.7642125830402502e-05, "loss": 0.2287, "step": 151900 }, { "epoch": 11.870363139398673, "grad_norm": 1.1151509284973145, "learning_rate": 1.764056271981243e-05, "loss": 0.2281, "step": 152000 }, { "epoch": 11.878172588832488, "grad_norm": 0.8168774247169495, "learning_rate": 1.7638999609222354e-05, "loss": 0.2151, "step": 152100 }, { "epoch": 11.885982038266302, "grad_norm": 1.0729039907455444, "learning_rate": 1.763743649863228e-05, "loss": 0.2257, "step": 152200 }, { "epoch": 11.893791487700117, "grad_norm": 0.9500818848609924, "learning_rate": 1.7635873388042206e-05, "loss": 0.2189, "step": 152300 }, { "epoch": 11.901600937133932, "grad_norm": 1.022155523300171, "learning_rate": 1.7634310277452132e-05, "loss": 0.2375, "step": 152400 }, { "epoch": 11.909410386567746, "grad_norm": 1.272755742073059, "learning_rate": 1.7632747166862055e-05, "loss": 0.2313, "step": 152500 }, { "epoch": 11.917219836001562, "grad_norm": 0.7790395617485046, "learning_rate": 1.763118405627198e-05, "loss": 0.2234, "step": 152600 }, { "epoch": 11.925029285435377, "grad_norm": 0.9795846343040466, "learning_rate": 1.762962094568191e-05, "loss": 0.2301, "step": 152700 }, { "epoch": 11.932838734869192, "grad_norm": 0.6616389155387878, "learning_rate": 1.7628057835091833e-05, "loss": 0.2275, "step": 152800 }, { "epoch": 11.940648184303006, "grad_norm": 0.7695783376693726, "learning_rate": 1.762649472450176e-05, "loss": 0.2323, "step": 152900 }, { "epoch": 11.948457633736822, "grad_norm": 0.9902390241622925, "learning_rate": 1.7624931613911685e-05, "loss": 0.2335, "step": 153000 }, { "epoch": 11.956267083170637, "grad_norm": 0.9866804480552673, "learning_rate": 1.762336850332161e-05, "loss": 0.2354, "step": 153100 }, { "epoch": 11.964076532604452, "grad_norm": 0.9206321835517883, "learning_rate": 1.7621805392731537e-05, "loss": 0.2289, "step": 153200 }, { "epoch": 11.971885982038266, "grad_norm": 1.0868662595748901, "learning_rate": 1.7620242282141463e-05, "loss": 0.2192, "step": 153300 }, { "epoch": 11.97969543147208, "grad_norm": 1.080969214439392, "learning_rate": 1.761869480265729e-05, "loss": 0.2268, "step": 153400 }, { "epoch": 11.987504880905895, "grad_norm": 0.9495891332626343, "learning_rate": 1.7617131692067215e-05, "loss": 0.2248, "step": 153500 }, { "epoch": 11.995314330339712, "grad_norm": 0.6562321782112122, "learning_rate": 1.761556858147714e-05, "loss": 0.2279, "step": 153600 }, { "epoch": 12.003123779773526, "grad_norm": 0.9093583822250366, "learning_rate": 1.7614005470887067e-05, "loss": 0.2216, "step": 153700 }, { "epoch": 12.01093322920734, "grad_norm": 0.8424447178840637, "learning_rate": 1.7612442360296993e-05, "loss": 0.2312, "step": 153800 }, { "epoch": 12.018742678641155, "grad_norm": 0.9770123362541199, "learning_rate": 1.761087924970692e-05, "loss": 0.2283, "step": 153900 }, { "epoch": 12.02655212807497, "grad_norm": 1.1255728006362915, "learning_rate": 1.7609316139116842e-05, "loss": 0.2344, "step": 154000 }, { "epoch": 12.034361577508786, "grad_norm": 1.158980369567871, "learning_rate": 1.760775302852677e-05, "loss": 0.2402, "step": 154100 }, { "epoch": 12.0421710269426, "grad_norm": 0.8753023147583008, "learning_rate": 1.7606189917936697e-05, "loss": 0.2283, "step": 154200 }, { "epoch": 12.049980476376415, "grad_norm": 1.185396671295166, "learning_rate": 1.760462680734662e-05, "loss": 0.2365, "step": 154300 }, { "epoch": 12.05778992581023, "grad_norm": 0.9084982872009277, "learning_rate": 1.7603063696756546e-05, "loss": 0.2124, "step": 154400 }, { "epoch": 12.065599375244044, "grad_norm": 0.947121262550354, "learning_rate": 1.7601500586166472e-05, "loss": 0.2329, "step": 154500 }, { "epoch": 12.07340882467786, "grad_norm": 0.9727124571800232, "learning_rate": 1.7599937475576398e-05, "loss": 0.2247, "step": 154600 }, { "epoch": 12.081218274111675, "grad_norm": 0.99271559715271, "learning_rate": 1.7598374364986324e-05, "loss": 0.221, "step": 154700 }, { "epoch": 12.08902772354549, "grad_norm": 1.1732423305511475, "learning_rate": 1.759681125439625e-05, "loss": 0.2129, "step": 154800 }, { "epoch": 12.096837172979304, "grad_norm": 0.9106228351593018, "learning_rate": 1.7595248143806176e-05, "loss": 0.2274, "step": 154900 }, { "epoch": 12.104646622413119, "grad_norm": 1.0971242189407349, "learning_rate": 1.7593685033216102e-05, "loss": 0.2309, "step": 155000 }, { "epoch": 12.112456071846935, "grad_norm": 1.2554278373718262, "learning_rate": 1.7592121922626028e-05, "loss": 0.2387, "step": 155100 }, { "epoch": 12.12026552128075, "grad_norm": 0.8720566034317017, "learning_rate": 1.7590558812035954e-05, "loss": 0.2336, "step": 155200 }, { "epoch": 12.128074970714565, "grad_norm": 0.8539800643920898, "learning_rate": 1.758899570144588e-05, "loss": 0.2296, "step": 155300 }, { "epoch": 12.135884420148379, "grad_norm": 0.9018562436103821, "learning_rate": 1.7587448221961706e-05, "loss": 0.2304, "step": 155400 }, { "epoch": 12.143693869582194, "grad_norm": 1.048981785774231, "learning_rate": 1.758588511137163e-05, "loss": 0.2176, "step": 155500 }, { "epoch": 12.15150331901601, "grad_norm": 1.3624159097671509, "learning_rate": 1.7584322000781558e-05, "loss": 0.231, "step": 155600 }, { "epoch": 12.159312768449825, "grad_norm": 0.9861006736755371, "learning_rate": 1.7582758890191484e-05, "loss": 0.2276, "step": 155700 }, { "epoch": 12.16712221788364, "grad_norm": 0.971916139125824, "learning_rate": 1.7581195779601406e-05, "loss": 0.2258, "step": 155800 }, { "epoch": 12.174931667317454, "grad_norm": 1.1522843837738037, "learning_rate": 1.7579632669011336e-05, "loss": 0.2222, "step": 155900 }, { "epoch": 12.182741116751268, "grad_norm": 1.2297567129135132, "learning_rate": 1.757806955842126e-05, "loss": 0.2258, "step": 156000 }, { "epoch": 12.190550566185085, "grad_norm": 0.9804288148880005, "learning_rate": 1.7576506447831185e-05, "loss": 0.2233, "step": 156100 }, { "epoch": 12.1983600156189, "grad_norm": 0.9976232647895813, "learning_rate": 1.757494333724111e-05, "loss": 0.2233, "step": 156200 }, { "epoch": 12.206169465052714, "grad_norm": 0.7635705471038818, "learning_rate": 1.7573380226651037e-05, "loss": 0.2098, "step": 156300 }, { "epoch": 12.213978914486528, "grad_norm": 0.8271148800849915, "learning_rate": 1.7571817116060963e-05, "loss": 0.2262, "step": 156400 }, { "epoch": 12.221788363920343, "grad_norm": 0.7868706583976746, "learning_rate": 1.757025400547089e-05, "loss": 0.2346, "step": 156500 }, { "epoch": 12.22959781335416, "grad_norm": 1.131128191947937, "learning_rate": 1.7568690894880815e-05, "loss": 0.2226, "step": 156600 }, { "epoch": 12.237407262787974, "grad_norm": 1.0516031980514526, "learning_rate": 1.756712778429074e-05, "loss": 0.2196, "step": 156700 }, { "epoch": 12.245216712221788, "grad_norm": 0.8659607172012329, "learning_rate": 1.7565564673700667e-05, "loss": 0.2246, "step": 156800 }, { "epoch": 12.253026161655603, "grad_norm": 0.9379199147224426, "learning_rate": 1.7564001563110593e-05, "loss": 0.2272, "step": 156900 }, { "epoch": 12.260835611089417, "grad_norm": 1.2084906101226807, "learning_rate": 1.756243845252052e-05, "loss": 0.2239, "step": 157000 }, { "epoch": 12.268645060523234, "grad_norm": 1.0007749795913696, "learning_rate": 1.7560875341930445e-05, "loss": 0.2239, "step": 157100 }, { "epoch": 12.276454509957048, "grad_norm": 0.93492591381073, "learning_rate": 1.7559312231340367e-05, "loss": 0.2309, "step": 157200 }, { "epoch": 12.284263959390863, "grad_norm": 1.4212570190429688, "learning_rate": 1.7557749120750293e-05, "loss": 0.2209, "step": 157300 }, { "epoch": 12.292073408824677, "grad_norm": 1.0967302322387695, "learning_rate": 1.7556186010160223e-05, "loss": 0.2219, "step": 157400 }, { "epoch": 12.299882858258492, "grad_norm": 0.8859778046607971, "learning_rate": 1.755463853067605e-05, "loss": 0.2353, "step": 157500 }, { "epoch": 12.307692307692308, "grad_norm": 1.1520426273345947, "learning_rate": 1.755307542008597e-05, "loss": 0.2236, "step": 157600 }, { "epoch": 12.315501757126123, "grad_norm": 0.8958144187927246, "learning_rate": 1.7551512309495897e-05, "loss": 0.2347, "step": 157700 }, { "epoch": 12.323311206559938, "grad_norm": 0.8936724066734314, "learning_rate": 1.7549949198905823e-05, "loss": 0.2358, "step": 157800 }, { "epoch": 12.331120655993752, "grad_norm": 1.011566400527954, "learning_rate": 1.754838608831575e-05, "loss": 0.2255, "step": 157900 }, { "epoch": 12.338930105427567, "grad_norm": 0.8371241092681885, "learning_rate": 1.7546822977725675e-05, "loss": 0.2191, "step": 158000 }, { "epoch": 12.346739554861383, "grad_norm": 0.837230920791626, "learning_rate": 1.75452598671356e-05, "loss": 0.2384, "step": 158100 }, { "epoch": 12.354549004295198, "grad_norm": 0.9873660802841187, "learning_rate": 1.7543696756545527e-05, "loss": 0.2239, "step": 158200 }, { "epoch": 12.362358453729012, "grad_norm": 1.1262012720108032, "learning_rate": 1.7542133645955453e-05, "loss": 0.2148, "step": 158300 }, { "epoch": 12.370167903162827, "grad_norm": 0.9833094477653503, "learning_rate": 1.754057053536538e-05, "loss": 0.2182, "step": 158400 }, { "epoch": 12.377977352596641, "grad_norm": 0.7416759729385376, "learning_rate": 1.7539007424775305e-05, "loss": 0.2274, "step": 158500 }, { "epoch": 12.385786802030458, "grad_norm": 1.1120948791503906, "learning_rate": 1.753744431418523e-05, "loss": 0.2239, "step": 158600 }, { "epoch": 12.393596251464272, "grad_norm": 0.8697503805160522, "learning_rate": 1.7535881203595154e-05, "loss": 0.228, "step": 158700 }, { "epoch": 12.401405700898087, "grad_norm": 0.9854649901390076, "learning_rate": 1.753431809300508e-05, "loss": 0.223, "step": 158800 }, { "epoch": 12.409215150331901, "grad_norm": 0.6563554406166077, "learning_rate": 1.753275498241501e-05, "loss": 0.2192, "step": 158900 }, { "epoch": 12.417024599765716, "grad_norm": 0.928612232208252, "learning_rate": 1.7531191871824932e-05, "loss": 0.233, "step": 159000 }, { "epoch": 12.424834049199532, "grad_norm": 0.8887084722518921, "learning_rate": 1.7529628761234858e-05, "loss": 0.2284, "step": 159100 }, { "epoch": 12.432643498633347, "grad_norm": 1.3052536249160767, "learning_rate": 1.7528065650644784e-05, "loss": 0.224, "step": 159200 }, { "epoch": 12.440452948067161, "grad_norm": 0.9533888697624207, "learning_rate": 1.752650254005471e-05, "loss": 0.2255, "step": 159300 }, { "epoch": 12.448262397500976, "grad_norm": 0.9055050611495972, "learning_rate": 1.7524939429464636e-05, "loss": 0.2192, "step": 159400 }, { "epoch": 12.45607184693479, "grad_norm": 0.9132809638977051, "learning_rate": 1.7523391949980462e-05, "loss": 0.236, "step": 159500 }, { "epoch": 12.463881296368607, "grad_norm": 1.0307021141052246, "learning_rate": 1.7521828839390388e-05, "loss": 0.2168, "step": 159600 }, { "epoch": 12.471690745802421, "grad_norm": 0.8152646422386169, "learning_rate": 1.7520265728800314e-05, "loss": 0.2167, "step": 159700 }, { "epoch": 12.479500195236236, "grad_norm": 0.7328994870185852, "learning_rate": 1.751870261821024e-05, "loss": 0.217, "step": 159800 }, { "epoch": 12.48730964467005, "grad_norm": 0.9359160661697388, "learning_rate": 1.7517139507620166e-05, "loss": 0.2174, "step": 159900 }, { "epoch": 12.495119094103865, "grad_norm": 0.7891941666603088, "learning_rate": 1.7515576397030092e-05, "loss": 0.2179, "step": 160000 }, { "epoch": 12.502928543537681, "grad_norm": 1.0698615312576294, "learning_rate": 1.7514013286440018e-05, "loss": 0.224, "step": 160100 }, { "epoch": 12.510737992971496, "grad_norm": 0.9958947896957397, "learning_rate": 1.751245017584994e-05, "loss": 0.227, "step": 160200 }, { "epoch": 12.51854744240531, "grad_norm": 0.9366388320922852, "learning_rate": 1.751088706525987e-05, "loss": 0.2188, "step": 160300 }, { "epoch": 12.526356891839125, "grad_norm": 0.9255606532096863, "learning_rate": 1.7509323954669796e-05, "loss": 0.216, "step": 160400 }, { "epoch": 12.53416634127294, "grad_norm": 0.9308393001556396, "learning_rate": 1.750776084407972e-05, "loss": 0.2189, "step": 160500 }, { "epoch": 12.541975790706756, "grad_norm": 0.9493924975395203, "learning_rate": 1.7506197733489645e-05, "loss": 0.2261, "step": 160600 }, { "epoch": 12.54978524014057, "grad_norm": 1.0675753355026245, "learning_rate": 1.750463462289957e-05, "loss": 0.2227, "step": 160700 }, { "epoch": 12.557594689574385, "grad_norm": 0.8917664289474487, "learning_rate": 1.7503071512309497e-05, "loss": 0.2234, "step": 160800 }, { "epoch": 12.5654041390082, "grad_norm": 0.7320645451545715, "learning_rate": 1.7501508401719423e-05, "loss": 0.2232, "step": 160900 }, { "epoch": 12.573213588442014, "grad_norm": 0.8894788026809692, "learning_rate": 1.749994529112935e-05, "loss": 0.2298, "step": 161000 }, { "epoch": 12.58102303787583, "grad_norm": 1.3193320035934448, "learning_rate": 1.7498382180539275e-05, "loss": 0.2328, "step": 161100 }, { "epoch": 12.588832487309645, "grad_norm": 1.031163215637207, "learning_rate": 1.74968190699492e-05, "loss": 0.2238, "step": 161200 }, { "epoch": 12.59664193674346, "grad_norm": 1.3573243618011475, "learning_rate": 1.7495255959359127e-05, "loss": 0.219, "step": 161300 }, { "epoch": 12.604451386177274, "grad_norm": 1.072387456893921, "learning_rate": 1.7493692848769053e-05, "loss": 0.2245, "step": 161400 }, { "epoch": 12.612260835611089, "grad_norm": 1.095932960510254, "learning_rate": 1.749214536928488e-05, "loss": 0.2147, "step": 161500 }, { "epoch": 12.620070285044905, "grad_norm": 0.8156642913818359, "learning_rate": 1.7490582258694805e-05, "loss": 0.2201, "step": 161600 }, { "epoch": 12.62787973447872, "grad_norm": 1.1821234226226807, "learning_rate": 1.7489019148104727e-05, "loss": 0.2088, "step": 161700 }, { "epoch": 12.635689183912534, "grad_norm": 0.9735209345817566, "learning_rate": 1.7487456037514657e-05, "loss": 0.2199, "step": 161800 }, { "epoch": 12.643498633346349, "grad_norm": 1.0844048261642456, "learning_rate": 1.7485892926924583e-05, "loss": 0.2186, "step": 161900 }, { "epoch": 12.651308082780163, "grad_norm": 0.933223307132721, "learning_rate": 1.7484329816334505e-05, "loss": 0.2125, "step": 162000 }, { "epoch": 12.65911753221398, "grad_norm": 1.1654493808746338, "learning_rate": 1.7482766705744435e-05, "loss": 0.2174, "step": 162100 }, { "epoch": 12.666926981647794, "grad_norm": 1.048599362373352, "learning_rate": 1.7481203595154357e-05, "loss": 0.2116, "step": 162200 }, { "epoch": 12.674736431081609, "grad_norm": 0.8715877532958984, "learning_rate": 1.7479640484564283e-05, "loss": 0.2133, "step": 162300 }, { "epoch": 12.682545880515423, "grad_norm": 1.186110019683838, "learning_rate": 1.747807737397421e-05, "loss": 0.2212, "step": 162400 }, { "epoch": 12.690355329949238, "grad_norm": 0.9320825338363647, "learning_rate": 1.7476514263384135e-05, "loss": 0.2197, "step": 162500 }, { "epoch": 12.698164779383054, "grad_norm": 1.2670738697052002, "learning_rate": 1.747495115279406e-05, "loss": 0.2136, "step": 162600 }, { "epoch": 12.705974228816869, "grad_norm": 0.9668679237365723, "learning_rate": 1.7473388042203987e-05, "loss": 0.2211, "step": 162700 }, { "epoch": 12.713783678250683, "grad_norm": 0.7561825513839722, "learning_rate": 1.7471824931613913e-05, "loss": 0.2226, "step": 162800 }, { "epoch": 12.721593127684498, "grad_norm": 0.6604050397872925, "learning_rate": 1.747026182102384e-05, "loss": 0.2041, "step": 162900 }, { "epoch": 12.729402577118313, "grad_norm": 1.0108921527862549, "learning_rate": 1.7468698710433765e-05, "loss": 0.2174, "step": 163000 }, { "epoch": 12.737212026552129, "grad_norm": 0.9948078989982605, "learning_rate": 1.7467135599843688e-05, "loss": 0.2297, "step": 163100 }, { "epoch": 12.745021475985943, "grad_norm": 1.0243088006973267, "learning_rate": 1.7465572489253618e-05, "loss": 0.2356, "step": 163200 }, { "epoch": 12.752830925419758, "grad_norm": 0.9389221668243408, "learning_rate": 1.7464009378663544e-05, "loss": 0.2211, "step": 163300 }, { "epoch": 12.760640374853573, "grad_norm": 0.6568807363510132, "learning_rate": 1.7462446268073466e-05, "loss": 0.2166, "step": 163400 }, { "epoch": 12.768449824287387, "grad_norm": 1.0748472213745117, "learning_rate": 1.7460883157483392e-05, "loss": 0.2107, "step": 163500 }, { "epoch": 12.776259273721202, "grad_norm": 0.8790175318717957, "learning_rate": 1.745932004689332e-05, "loss": 0.219, "step": 163600 }, { "epoch": 12.784068723155018, "grad_norm": 1.250709891319275, "learning_rate": 1.7457772567409147e-05, "loss": 0.2128, "step": 163700 }, { "epoch": 12.791878172588833, "grad_norm": 0.8646677732467651, "learning_rate": 1.745620945681907e-05, "loss": 0.2218, "step": 163800 }, { "epoch": 12.799687622022647, "grad_norm": 1.0469748973846436, "learning_rate": 1.7454646346228996e-05, "loss": 0.2282, "step": 163900 }, { "epoch": 12.807497071456462, "grad_norm": 1.2025775909423828, "learning_rate": 1.7453083235638922e-05, "loss": 0.2204, "step": 164000 }, { "epoch": 12.815306520890278, "grad_norm": 1.0595488548278809, "learning_rate": 1.7451520125048848e-05, "loss": 0.2186, "step": 164100 }, { "epoch": 12.823115970324093, "grad_norm": 0.8982226848602295, "learning_rate": 1.7449957014458774e-05, "loss": 0.2278, "step": 164200 }, { "epoch": 12.830925419757907, "grad_norm": 0.9295059442520142, "learning_rate": 1.74483939038687e-05, "loss": 0.2219, "step": 164300 }, { "epoch": 12.838734869191722, "grad_norm": 0.8940107822418213, "learning_rate": 1.7446830793278626e-05, "loss": 0.2205, "step": 164400 }, { "epoch": 12.846544318625536, "grad_norm": 0.9545173048973083, "learning_rate": 1.7445267682688552e-05, "loss": 0.2123, "step": 164500 }, { "epoch": 12.854353768059351, "grad_norm": 0.8928143382072449, "learning_rate": 1.7443704572098478e-05, "loss": 0.2182, "step": 164600 }, { "epoch": 12.862163217493167, "grad_norm": 0.9669292569160461, "learning_rate": 1.7442141461508404e-05, "loss": 0.2178, "step": 164700 }, { "epoch": 12.869972666926982, "grad_norm": 0.951994776725769, "learning_rate": 1.744057835091833e-05, "loss": 0.2152, "step": 164800 }, { "epoch": 12.877782116360796, "grad_norm": 0.8024113774299622, "learning_rate": 1.7439015240328253e-05, "loss": 0.2106, "step": 164900 }, { "epoch": 12.885591565794611, "grad_norm": 0.8423495292663574, "learning_rate": 1.743745212973818e-05, "loss": 0.2171, "step": 165000 }, { "epoch": 12.893401015228427, "grad_norm": 0.7901012897491455, "learning_rate": 1.7435889019148108e-05, "loss": 0.2083, "step": 165100 }, { "epoch": 12.901210464662242, "grad_norm": 0.8986498713493347, "learning_rate": 1.743432590855803e-05, "loss": 0.2101, "step": 165200 }, { "epoch": 12.909019914096056, "grad_norm": 1.0257128477096558, "learning_rate": 1.7432762797967957e-05, "loss": 0.2046, "step": 165300 }, { "epoch": 12.916829363529871, "grad_norm": 0.7271884083747864, "learning_rate": 1.7431199687377883e-05, "loss": 0.214, "step": 165400 }, { "epoch": 12.924638812963686, "grad_norm": 0.9488785266876221, "learning_rate": 1.742963657678781e-05, "loss": 0.2165, "step": 165500 }, { "epoch": 12.9324482623975, "grad_norm": 0.999997615814209, "learning_rate": 1.7428073466197735e-05, "loss": 0.2113, "step": 165600 }, { "epoch": 12.940257711831316, "grad_norm": 0.7397522926330566, "learning_rate": 1.742652598671356e-05, "loss": 0.2191, "step": 165700 }, { "epoch": 12.948067161265131, "grad_norm": 1.1001430749893188, "learning_rate": 1.7424962876123487e-05, "loss": 0.2209, "step": 165800 }, { "epoch": 12.955876610698946, "grad_norm": 1.1696312427520752, "learning_rate": 1.7423399765533413e-05, "loss": 0.2134, "step": 165900 }, { "epoch": 12.96368606013276, "grad_norm": 0.941423237323761, "learning_rate": 1.742183665494334e-05, "loss": 0.2208, "step": 166000 }, { "epoch": 12.971495509566576, "grad_norm": 0.99091637134552, "learning_rate": 1.7420273544353265e-05, "loss": 0.2233, "step": 166100 }, { "epoch": 12.979304959000391, "grad_norm": 0.9629716277122498, "learning_rate": 1.741871043376319e-05, "loss": 0.2074, "step": 166200 }, { "epoch": 12.987114408434206, "grad_norm": 0.8541843295097351, "learning_rate": 1.7417147323173117e-05, "loss": 0.2205, "step": 166300 }, { "epoch": 12.99492385786802, "grad_norm": 0.7128260135650635, "learning_rate": 1.741558421258304e-05, "loss": 0.2142, "step": 166400 }, { "epoch": 13.002733307301835, "grad_norm": 0.8291605114936829, "learning_rate": 1.741402110199297e-05, "loss": 0.2069, "step": 166500 }, { "epoch": 13.01054275673565, "grad_norm": 1.0823742151260376, "learning_rate": 1.7412457991402895e-05, "loss": 0.2098, "step": 166600 }, { "epoch": 13.018352206169466, "grad_norm": 1.073767900466919, "learning_rate": 1.7410894880812818e-05, "loss": 0.2164, "step": 166700 }, { "epoch": 13.02616165560328, "grad_norm": 0.6933673024177551, "learning_rate": 1.7409331770222744e-05, "loss": 0.2101, "step": 166800 }, { "epoch": 13.033971105037095, "grad_norm": 0.9099040031433105, "learning_rate": 1.740776865963267e-05, "loss": 0.2063, "step": 166900 }, { "epoch": 13.04178055447091, "grad_norm": 0.7376136183738708, "learning_rate": 1.7406205549042596e-05, "loss": 0.2168, "step": 167000 }, { "epoch": 13.049590003904724, "grad_norm": 0.9267070293426514, "learning_rate": 1.740464243845252e-05, "loss": 0.2157, "step": 167100 }, { "epoch": 13.05739945333854, "grad_norm": 1.0121701955795288, "learning_rate": 1.7403079327862448e-05, "loss": 0.2148, "step": 167200 }, { "epoch": 13.065208902772355, "grad_norm": 1.0252717733383179, "learning_rate": 1.7401516217272374e-05, "loss": 0.2113, "step": 167300 }, { "epoch": 13.07301835220617, "grad_norm": 1.3679596185684204, "learning_rate": 1.73999531066823e-05, "loss": 0.2163, "step": 167400 }, { "epoch": 13.080827801639984, "grad_norm": 0.8503546118736267, "learning_rate": 1.7398389996092226e-05, "loss": 0.1982, "step": 167500 }, { "epoch": 13.088637251073798, "grad_norm": 1.2549128532409668, "learning_rate": 1.739682688550215e-05, "loss": 0.2172, "step": 167600 }, { "epoch": 13.096446700507615, "grad_norm": 1.0882325172424316, "learning_rate": 1.7395279406017978e-05, "loss": 0.2159, "step": 167700 }, { "epoch": 13.10425614994143, "grad_norm": 0.9475435018539429, "learning_rate": 1.7393716295427904e-05, "loss": 0.2076, "step": 167800 }, { "epoch": 13.112065599375244, "grad_norm": 0.6309915781021118, "learning_rate": 1.7392153184837826e-05, "loss": 0.2173, "step": 167900 }, { "epoch": 13.119875048809059, "grad_norm": 0.9528407454490662, "learning_rate": 1.7390590074247756e-05, "loss": 0.2087, "step": 168000 }, { "epoch": 13.127684498242873, "grad_norm": 1.0685718059539795, "learning_rate": 1.738902696365768e-05, "loss": 0.2115, "step": 168100 }, { "epoch": 13.13549394767669, "grad_norm": 0.7884103655815125, "learning_rate": 1.7387463853067604e-05, "loss": 0.2054, "step": 168200 }, { "epoch": 13.143303397110504, "grad_norm": 0.9388546347618103, "learning_rate": 1.7385900742477534e-05, "loss": 0.2126, "step": 168300 }, { "epoch": 13.151112846544319, "grad_norm": 0.8609205484390259, "learning_rate": 1.7384337631887456e-05, "loss": 0.2131, "step": 168400 }, { "epoch": 13.158922295978133, "grad_norm": 0.9092925190925598, "learning_rate": 1.7382774521297382e-05, "loss": 0.2202, "step": 168500 }, { "epoch": 13.166731745411948, "grad_norm": 1.0586522817611694, "learning_rate": 1.738121141070731e-05, "loss": 0.2048, "step": 168600 }, { "epoch": 13.174541194845764, "grad_norm": 1.3472049236297607, "learning_rate": 1.7379648300117234e-05, "loss": 0.2166, "step": 168700 }, { "epoch": 13.182350644279579, "grad_norm": 0.927558183670044, "learning_rate": 1.737808518952716e-05, "loss": 0.2226, "step": 168800 }, { "epoch": 13.190160093713393, "grad_norm": 0.9628226161003113, "learning_rate": 1.7376522078937086e-05, "loss": 0.2175, "step": 168900 }, { "epoch": 13.197969543147208, "grad_norm": 1.1431093215942383, "learning_rate": 1.7374958968347012e-05, "loss": 0.2094, "step": 169000 }, { "epoch": 13.205778992581022, "grad_norm": 1.1498020887374878, "learning_rate": 1.737339585775694e-05, "loss": 0.2116, "step": 169100 }, { "epoch": 13.213588442014839, "grad_norm": 0.9198805093765259, "learning_rate": 1.7371832747166864e-05, "loss": 0.2072, "step": 169200 }, { "epoch": 13.221397891448653, "grad_norm": 1.065612554550171, "learning_rate": 1.7370269636576787e-05, "loss": 0.2149, "step": 169300 }, { "epoch": 13.229207340882468, "grad_norm": 0.9671112895011902, "learning_rate": 1.7368706525986716e-05, "loss": 0.2111, "step": 169400 }, { "epoch": 13.237016790316282, "grad_norm": 0.7491472363471985, "learning_rate": 1.7367143415396642e-05, "loss": 0.2146, "step": 169500 }, { "epoch": 13.244826239750097, "grad_norm": 0.9440492391586304, "learning_rate": 1.7365580304806565e-05, "loss": 0.2085, "step": 169600 }, { "epoch": 13.252635689183913, "grad_norm": 0.8738968372344971, "learning_rate": 1.736403282532239e-05, "loss": 0.2155, "step": 169700 }, { "epoch": 13.260445138617728, "grad_norm": 0.9270493984222412, "learning_rate": 1.736246971473232e-05, "loss": 0.203, "step": 169800 }, { "epoch": 13.268254588051542, "grad_norm": 0.8990687131881714, "learning_rate": 1.7360906604142246e-05, "loss": 0.2134, "step": 169900 }, { "epoch": 13.276064037485357, "grad_norm": 0.8390234112739563, "learning_rate": 1.735934349355217e-05, "loss": 0.2118, "step": 170000 }, { "epoch": 13.283873486919171, "grad_norm": 0.7005640864372253, "learning_rate": 1.7357780382962095e-05, "loss": 0.2078, "step": 170100 }, { "epoch": 13.291682936352988, "grad_norm": 0.888175904750824, "learning_rate": 1.735621727237202e-05, "loss": 0.2136, "step": 170200 }, { "epoch": 13.299492385786802, "grad_norm": 0.9876678586006165, "learning_rate": 1.7354654161781947e-05, "loss": 0.2139, "step": 170300 }, { "epoch": 13.307301835220617, "grad_norm": 1.1229948997497559, "learning_rate": 1.7353091051191873e-05, "loss": 0.2026, "step": 170400 }, { "epoch": 13.315111284654432, "grad_norm": 0.7431442737579346, "learning_rate": 1.73515279406018e-05, "loss": 0.2194, "step": 170500 }, { "epoch": 13.322920734088246, "grad_norm": 0.8044176697731018, "learning_rate": 1.7349964830011725e-05, "loss": 0.228, "step": 170600 }, { "epoch": 13.330730183522062, "grad_norm": 0.8355295658111572, "learning_rate": 1.734840171942165e-05, "loss": 0.2122, "step": 170700 }, { "epoch": 13.338539632955877, "grad_norm": 1.1031625270843506, "learning_rate": 1.7346838608831577e-05, "loss": 0.2214, "step": 170800 }, { "epoch": 13.346349082389692, "grad_norm": 1.0112760066986084, "learning_rate": 1.7345275498241503e-05, "loss": 0.2044, "step": 170900 }, { "epoch": 13.354158531823506, "grad_norm": 0.8478233218193054, "learning_rate": 1.734371238765143e-05, "loss": 0.2099, "step": 171000 }, { "epoch": 13.36196798125732, "grad_norm": 0.5524217486381531, "learning_rate": 1.7342149277061352e-05, "loss": 0.2, "step": 171100 }, { "epoch": 13.369777430691137, "grad_norm": 1.041663646697998, "learning_rate": 1.7340586166471278e-05, "loss": 0.2155, "step": 171200 }, { "epoch": 13.377586880124952, "grad_norm": 0.8660856485366821, "learning_rate": 1.7339023055881207e-05, "loss": 0.211, "step": 171300 }, { "epoch": 13.385396329558766, "grad_norm": 1.2220951318740845, "learning_rate": 1.733745994529113e-05, "loss": 0.2081, "step": 171400 }, { "epoch": 13.39320577899258, "grad_norm": 0.9403126835823059, "learning_rate": 1.7335896834701056e-05, "loss": 0.2196, "step": 171500 }, { "epoch": 13.401015228426395, "grad_norm": 0.7787653803825378, "learning_rate": 1.7334333724110982e-05, "loss": 0.1999, "step": 171600 }, { "epoch": 13.408824677860212, "grad_norm": 0.9129199981689453, "learning_rate": 1.7332770613520908e-05, "loss": 0.1964, "step": 171700 }, { "epoch": 13.416634127294026, "grad_norm": 0.8007563352584839, "learning_rate": 1.7331223134036734e-05, "loss": 0.208, "step": 171800 }, { "epoch": 13.42444357672784, "grad_norm": 0.8194550275802612, "learning_rate": 1.732966002344666e-05, "loss": 0.2093, "step": 171900 }, { "epoch": 13.432253026161655, "grad_norm": 1.066786289215088, "learning_rate": 1.7328096912856586e-05, "loss": 0.2164, "step": 172000 }, { "epoch": 13.44006247559547, "grad_norm": 1.1544183492660522, "learning_rate": 1.7326533802266512e-05, "loss": 0.2145, "step": 172100 }, { "epoch": 13.447871925029286, "grad_norm": 0.9140012860298157, "learning_rate": 1.7324970691676438e-05, "loss": 0.2127, "step": 172200 }, { "epoch": 13.4556813744631, "grad_norm": 1.0456069707870483, "learning_rate": 1.7323407581086364e-05, "loss": 0.2083, "step": 172300 }, { "epoch": 13.463490823896915, "grad_norm": 1.0170204639434814, "learning_rate": 1.732184447049629e-05, "loss": 0.2024, "step": 172400 }, { "epoch": 13.47130027333073, "grad_norm": 0.8396112322807312, "learning_rate": 1.7320281359906216e-05, "loss": 0.2161, "step": 172500 }, { "epoch": 13.479109722764544, "grad_norm": 1.0795584917068481, "learning_rate": 1.731871824931614e-05, "loss": 0.2088, "step": 172600 }, { "epoch": 13.48691917219836, "grad_norm": 1.0431060791015625, "learning_rate": 1.7317155138726068e-05, "loss": 0.2095, "step": 172700 }, { "epoch": 13.494728621632175, "grad_norm": 0.9751796722412109, "learning_rate": 1.7315592028135994e-05, "loss": 0.2017, "step": 172800 }, { "epoch": 13.50253807106599, "grad_norm": 0.9349498152732849, "learning_rate": 1.7314028917545916e-05, "loss": 0.2115, "step": 172900 }, { "epoch": 13.510347520499804, "grad_norm": 0.7299902439117432, "learning_rate": 1.7312465806955842e-05, "loss": 0.2053, "step": 173000 }, { "epoch": 13.518156969933619, "grad_norm": 0.9909356832504272, "learning_rate": 1.731090269636577e-05, "loss": 0.2127, "step": 173100 }, { "epoch": 13.525966419367435, "grad_norm": 0.812975287437439, "learning_rate": 1.7309339585775694e-05, "loss": 0.2135, "step": 173200 }, { "epoch": 13.53377586880125, "grad_norm": 0.9539850354194641, "learning_rate": 1.730777647518562e-05, "loss": 0.2078, "step": 173300 }, { "epoch": 13.541585318235065, "grad_norm": 0.760218620300293, "learning_rate": 1.7306213364595547e-05, "loss": 0.2059, "step": 173400 }, { "epoch": 13.549394767668879, "grad_norm": 0.8979002237319946, "learning_rate": 1.7304650254005473e-05, "loss": 0.2146, "step": 173500 }, { "epoch": 13.557204217102694, "grad_norm": 1.020674228668213, "learning_rate": 1.73030871434154e-05, "loss": 0.2135, "step": 173600 }, { "epoch": 13.56501366653651, "grad_norm": 0.8978680968284607, "learning_rate": 1.7301524032825325e-05, "loss": 0.2031, "step": 173700 }, { "epoch": 13.572823115970325, "grad_norm": 0.8209431767463684, "learning_rate": 1.729996092223525e-05, "loss": 0.1995, "step": 173800 }, { "epoch": 13.580632565404139, "grad_norm": 0.8961414098739624, "learning_rate": 1.7298413442751077e-05, "loss": 0.221, "step": 173900 }, { "epoch": 13.588442014837954, "grad_norm": 0.9937573671340942, "learning_rate": 1.7296850332161003e-05, "loss": 0.2204, "step": 174000 }, { "epoch": 13.596251464271768, "grad_norm": 0.9728763103485107, "learning_rate": 1.7295287221570925e-05, "loss": 0.2122, "step": 174100 }, { "epoch": 13.604060913705585, "grad_norm": 0.9363887310028076, "learning_rate": 1.7293724110980855e-05, "loss": 0.209, "step": 174200 }, { "epoch": 13.6118703631394, "grad_norm": 0.652712881565094, "learning_rate": 1.729216100039078e-05, "loss": 0.2265, "step": 174300 }, { "epoch": 13.619679812573214, "grad_norm": 1.1150403022766113, "learning_rate": 1.7290597889800703e-05, "loss": 0.2013, "step": 174400 }, { "epoch": 13.627489262007028, "grad_norm": 0.8084608912467957, "learning_rate": 1.7289034779210633e-05, "loss": 0.2158, "step": 174500 }, { "epoch": 13.635298711440843, "grad_norm": 1.0631225109100342, "learning_rate": 1.7287471668620555e-05, "loss": 0.2073, "step": 174600 }, { "epoch": 13.64310816087466, "grad_norm": 0.631264328956604, "learning_rate": 1.728590855803048e-05, "loss": 0.2201, "step": 174700 }, { "epoch": 13.650917610308474, "grad_norm": 0.6314346194267273, "learning_rate": 1.7284345447440407e-05, "loss": 0.2015, "step": 174800 }, { "epoch": 13.658727059742288, "grad_norm": 0.964005708694458, "learning_rate": 1.7282782336850333e-05, "loss": 0.2046, "step": 174900 }, { "epoch": 13.666536509176103, "grad_norm": 0.6954252123832703, "learning_rate": 1.728121922626026e-05, "loss": 0.2129, "step": 175000 }, { "epoch": 13.674345958609917, "grad_norm": 0.9582391381263733, "learning_rate": 1.7279656115670185e-05, "loss": 0.2107, "step": 175100 }, { "epoch": 13.682155408043734, "grad_norm": 1.00692617893219, "learning_rate": 1.727809300508011e-05, "loss": 0.2055, "step": 175200 }, { "epoch": 13.689964857477548, "grad_norm": 0.786759078502655, "learning_rate": 1.7276529894490037e-05, "loss": 0.2138, "step": 175300 }, { "epoch": 13.697774306911363, "grad_norm": 0.9668805003166199, "learning_rate": 1.7274966783899963e-05, "loss": 0.2095, "step": 175400 }, { "epoch": 13.705583756345177, "grad_norm": 0.9312834143638611, "learning_rate": 1.7273403673309886e-05, "loss": 0.2122, "step": 175500 }, { "epoch": 13.713393205778992, "grad_norm": 0.785740852355957, "learning_rate": 1.7271840562719815e-05, "loss": 0.2113, "step": 175600 }, { "epoch": 13.721202655212807, "grad_norm": 0.8165653347969055, "learning_rate": 1.727027745212974e-05, "loss": 0.2043, "step": 175700 }, { "epoch": 13.729012104646623, "grad_norm": 0.9161164164543152, "learning_rate": 1.7268714341539664e-05, "loss": 0.2037, "step": 175800 }, { "epoch": 13.736821554080437, "grad_norm": 0.8624919652938843, "learning_rate": 1.726716686205549e-05, "loss": 0.216, "step": 175900 }, { "epoch": 13.744631003514252, "grad_norm": 1.1940569877624512, "learning_rate": 1.726560375146542e-05, "loss": 0.2025, "step": 176000 }, { "epoch": 13.752440452948067, "grad_norm": 0.9143908023834229, "learning_rate": 1.7264040640875345e-05, "loss": 0.2085, "step": 176100 }, { "epoch": 13.760249902381883, "grad_norm": 0.8099220395088196, "learning_rate": 1.7262477530285268e-05, "loss": 0.2092, "step": 176200 }, { "epoch": 13.768059351815698, "grad_norm": 0.8948672413825989, "learning_rate": 1.7260914419695194e-05, "loss": 0.2063, "step": 176300 }, { "epoch": 13.775868801249512, "grad_norm": 0.9727224707603455, "learning_rate": 1.725935130910512e-05, "loss": 0.2146, "step": 176400 }, { "epoch": 13.783678250683327, "grad_norm": 1.0335994958877563, "learning_rate": 1.7257788198515046e-05, "loss": 0.206, "step": 176500 }, { "epoch": 13.791487700117141, "grad_norm": 0.9940019845962524, "learning_rate": 1.7256225087924972e-05, "loss": 0.2095, "step": 176600 }, { "epoch": 13.799297149550956, "grad_norm": 0.8024458289146423, "learning_rate": 1.7254661977334898e-05, "loss": 0.2072, "step": 176700 }, { "epoch": 13.807106598984772, "grad_norm": 0.9615973830223083, "learning_rate": 1.7253098866744824e-05, "loss": 0.2028, "step": 176800 }, { "epoch": 13.814916048418587, "grad_norm": 0.8130218386650085, "learning_rate": 1.725153575615475e-05, "loss": 0.2063, "step": 176900 }, { "epoch": 13.822725497852401, "grad_norm": 0.9325103759765625, "learning_rate": 1.7249972645564676e-05, "loss": 0.1951, "step": 177000 }, { "epoch": 13.830534947286216, "grad_norm": 0.9893394112586975, "learning_rate": 1.7248409534974602e-05, "loss": 0.198, "step": 177100 }, { "epoch": 13.838344396720032, "grad_norm": 0.9419919848442078, "learning_rate": 1.7246846424384528e-05, "loss": 0.2198, "step": 177200 }, { "epoch": 13.846153846153847, "grad_norm": 0.9249516129493713, "learning_rate": 1.724528331379445e-05, "loss": 0.2036, "step": 177300 }, { "epoch": 13.853963295587661, "grad_norm": 0.9058451056480408, "learning_rate": 1.7243720203204377e-05, "loss": 0.218, "step": 177400 }, { "epoch": 13.861772745021476, "grad_norm": 0.9602999091148376, "learning_rate": 1.7242157092614306e-05, "loss": 0.2005, "step": 177500 }, { "epoch": 13.86958219445529, "grad_norm": 0.8785629868507385, "learning_rate": 1.724059398202423e-05, "loss": 0.2048, "step": 177600 }, { "epoch": 13.877391643889105, "grad_norm": 0.8119620084762573, "learning_rate": 1.7239030871434155e-05, "loss": 0.2025, "step": 177700 }, { "epoch": 13.885201093322921, "grad_norm": 0.8947235941886902, "learning_rate": 1.723746776084408e-05, "loss": 0.2083, "step": 177800 }, { "epoch": 13.893010542756736, "grad_norm": 0.908216655254364, "learning_rate": 1.7235920281359907e-05, "loss": 0.2049, "step": 177900 }, { "epoch": 13.90081999219055, "grad_norm": 0.875116229057312, "learning_rate": 1.7234357170769833e-05, "loss": 0.202, "step": 178000 }, { "epoch": 13.908629441624365, "grad_norm": 0.7890712022781372, "learning_rate": 1.723279406017976e-05, "loss": 0.1965, "step": 178100 }, { "epoch": 13.916438891058181, "grad_norm": 1.0504424571990967, "learning_rate": 1.7231230949589685e-05, "loss": 0.2124, "step": 178200 }, { "epoch": 13.924248340491996, "grad_norm": 0.8207637667655945, "learning_rate": 1.722966783899961e-05, "loss": 0.1953, "step": 178300 }, { "epoch": 13.93205778992581, "grad_norm": 0.7486147284507751, "learning_rate": 1.7228104728409537e-05, "loss": 0.2018, "step": 178400 }, { "epoch": 13.939867239359625, "grad_norm": 0.9399036169052124, "learning_rate": 1.7226541617819463e-05, "loss": 0.206, "step": 178500 }, { "epoch": 13.94767668879344, "grad_norm": 0.7904515862464905, "learning_rate": 1.722497850722939e-05, "loss": 0.2035, "step": 178600 }, { "epoch": 13.955486138227254, "grad_norm": 0.8712892532348633, "learning_rate": 1.7223415396639315e-05, "loss": 0.2155, "step": 178700 }, { "epoch": 13.96329558766107, "grad_norm": 0.901816189289093, "learning_rate": 1.7221852286049237e-05, "loss": 0.2029, "step": 178800 }, { "epoch": 13.971105037094885, "grad_norm": 1.0784202814102173, "learning_rate": 1.7220289175459167e-05, "loss": 0.2075, "step": 178900 }, { "epoch": 13.9789144865287, "grad_norm": 0.973260223865509, "learning_rate": 1.7218726064869093e-05, "loss": 0.2054, "step": 179000 }, { "epoch": 13.986723935962514, "grad_norm": 0.6385024785995483, "learning_rate": 1.7217162954279015e-05, "loss": 0.2023, "step": 179100 }, { "epoch": 13.994533385396329, "grad_norm": 1.0969829559326172, "learning_rate": 1.721559984368894e-05, "loss": 0.2022, "step": 179200 }, { "epoch": 14.002342834830145, "grad_norm": 1.21580171585083, "learning_rate": 1.7214036733098867e-05, "loss": 0.201, "step": 179300 }, { "epoch": 14.01015228426396, "grad_norm": 0.9006642699241638, "learning_rate": 1.7212473622508793e-05, "loss": 0.2046, "step": 179400 }, { "epoch": 14.017961733697774, "grad_norm": 0.5619805455207825, "learning_rate": 1.721091051191872e-05, "loss": 0.2018, "step": 179500 }, { "epoch": 14.025771183131589, "grad_norm": 0.9529440999031067, "learning_rate": 1.7209347401328645e-05, "loss": 0.2052, "step": 179600 }, { "epoch": 14.033580632565403, "grad_norm": 0.8338792324066162, "learning_rate": 1.720778429073857e-05, "loss": 0.2022, "step": 179700 }, { "epoch": 14.04139008199922, "grad_norm": 0.8016043901443481, "learning_rate": 1.7206221180148497e-05, "loss": 0.2054, "step": 179800 }, { "epoch": 14.049199531433034, "grad_norm": 1.037266731262207, "learning_rate": 1.7204673700664323e-05, "loss": 0.2003, "step": 179900 }, { "epoch": 14.057008980866849, "grad_norm": 1.0760704278945923, "learning_rate": 1.720311059007425e-05, "loss": 0.2057, "step": 180000 }, { "epoch": 14.064818430300663, "grad_norm": 0.7043364644050598, "learning_rate": 1.7201547479484175e-05, "loss": 0.1971, "step": 180100 }, { "epoch": 14.072627879734478, "grad_norm": 0.9022557139396667, "learning_rate": 1.71999843688941e-05, "loss": 0.2057, "step": 180200 }, { "epoch": 14.080437329168294, "grad_norm": 0.88816899061203, "learning_rate": 1.7198421258304024e-05, "loss": 0.2054, "step": 180300 }, { "epoch": 14.088246778602109, "grad_norm": 0.8379324674606323, "learning_rate": 1.7196858147713953e-05, "loss": 0.2056, "step": 180400 }, { "epoch": 14.096056228035923, "grad_norm": 0.9925566911697388, "learning_rate": 1.719529503712388e-05, "loss": 0.2089, "step": 180500 }, { "epoch": 14.103865677469738, "grad_norm": 0.9591396450996399, "learning_rate": 1.7193731926533802e-05, "loss": 0.1993, "step": 180600 }, { "epoch": 14.111675126903553, "grad_norm": 1.2258329391479492, "learning_rate": 1.7192168815943728e-05, "loss": 0.2083, "step": 180700 }, { "epoch": 14.119484576337369, "grad_norm": 0.7245174050331116, "learning_rate": 1.7190605705353654e-05, "loss": 0.2006, "step": 180800 }, { "epoch": 14.127294025771183, "grad_norm": 0.8938778042793274, "learning_rate": 1.718904259476358e-05, "loss": 0.1986, "step": 180900 }, { "epoch": 14.135103475204998, "grad_norm": 0.8464726805686951, "learning_rate": 1.7187479484173506e-05, "loss": 0.1904, "step": 181000 }, { "epoch": 14.142912924638813, "grad_norm": 0.721854567527771, "learning_rate": 1.7185916373583432e-05, "loss": 0.1963, "step": 181100 }, { "epoch": 14.150722374072627, "grad_norm": 0.9772906303405762, "learning_rate": 1.7184353262993358e-05, "loss": 0.2054, "step": 181200 }, { "epoch": 14.158531823506443, "grad_norm": 1.0603522062301636, "learning_rate": 1.7182790152403284e-05, "loss": 0.2031, "step": 181300 }, { "epoch": 14.166341272940258, "grad_norm": 0.8204702734947205, "learning_rate": 1.718122704181321e-05, "loss": 0.2184, "step": 181400 }, { "epoch": 14.174150722374073, "grad_norm": 0.8588528633117676, "learning_rate": 1.7179663931223136e-05, "loss": 0.1977, "step": 181500 }, { "epoch": 14.181960171807887, "grad_norm": 1.0079927444458008, "learning_rate": 1.7178100820633062e-05, "loss": 0.2032, "step": 181600 }, { "epoch": 14.189769621241702, "grad_norm": 0.7643899321556091, "learning_rate": 1.7176537710042985e-05, "loss": 0.1924, "step": 181700 }, { "epoch": 14.197579070675518, "grad_norm": 1.1137841939926147, "learning_rate": 1.7174974599452914e-05, "loss": 0.1964, "step": 181800 }, { "epoch": 14.205388520109333, "grad_norm": 1.1593997478485107, "learning_rate": 1.717342711996874e-05, "loss": 0.1979, "step": 181900 }, { "epoch": 14.213197969543147, "grad_norm": 0.9710358381271362, "learning_rate": 1.7171864009378666e-05, "loss": 0.2021, "step": 182000 }, { "epoch": 14.221007418976962, "grad_norm": 0.9757130742073059, "learning_rate": 1.717030089878859e-05, "loss": 0.2107, "step": 182100 }, { "epoch": 14.228816868410776, "grad_norm": 0.9044745564460754, "learning_rate": 1.7168737788198518e-05, "loss": 0.2032, "step": 182200 }, { "epoch": 14.236626317844593, "grad_norm": 0.7436084747314453, "learning_rate": 1.716717467760844e-05, "loss": 0.1946, "step": 182300 }, { "epoch": 14.244435767278407, "grad_norm": 0.9272249341011047, "learning_rate": 1.7165611567018367e-05, "loss": 0.1938, "step": 182400 }, { "epoch": 14.252245216712222, "grad_norm": 0.9946134090423584, "learning_rate": 1.7164048456428293e-05, "loss": 0.2012, "step": 182500 }, { "epoch": 14.260054666146036, "grad_norm": 0.8200010657310486, "learning_rate": 1.716248534583822e-05, "loss": 0.2056, "step": 182600 }, { "epoch": 14.267864115579851, "grad_norm": 0.9531083703041077, "learning_rate": 1.7160922235248145e-05, "loss": 0.2077, "step": 182700 }, { "epoch": 14.275673565013667, "grad_norm": 1.0950546264648438, "learning_rate": 1.715935912465807e-05, "loss": 0.1928, "step": 182800 }, { "epoch": 14.283483014447482, "grad_norm": 0.957729160785675, "learning_rate": 1.7157796014067997e-05, "loss": 0.2, "step": 182900 }, { "epoch": 14.291292463881296, "grad_norm": 0.8982499837875366, "learning_rate": 1.7156232903477923e-05, "loss": 0.1979, "step": 183000 }, { "epoch": 14.299101913315111, "grad_norm": 1.290502905845642, "learning_rate": 1.715466979288785e-05, "loss": 0.1959, "step": 183100 }, { "epoch": 14.306911362748925, "grad_norm": 1.0289325714111328, "learning_rate": 1.7153106682297775e-05, "loss": 0.205, "step": 183200 }, { "epoch": 14.314720812182742, "grad_norm": 0.93171626329422, "learning_rate": 1.71515435717077e-05, "loss": 0.1993, "step": 183300 }, { "epoch": 14.322530261616556, "grad_norm": 0.9225603938102722, "learning_rate": 1.7149980461117627e-05, "loss": 0.2059, "step": 183400 }, { "epoch": 14.330339711050371, "grad_norm": 0.908424973487854, "learning_rate": 1.714841735052755e-05, "loss": 0.2049, "step": 183500 }, { "epoch": 14.338149160484186, "grad_norm": 0.7969884872436523, "learning_rate": 1.7146854239937476e-05, "loss": 0.1964, "step": 183600 }, { "epoch": 14.345958609918, "grad_norm": 1.1689947843551636, "learning_rate": 1.7145291129347405e-05, "loss": 0.1955, "step": 183700 }, { "epoch": 14.353768059351816, "grad_norm": 1.039355754852295, "learning_rate": 1.7143728018757328e-05, "loss": 0.2021, "step": 183800 }, { "epoch": 14.361577508785631, "grad_norm": 0.923857569694519, "learning_rate": 1.7142164908167254e-05, "loss": 0.1974, "step": 183900 }, { "epoch": 14.369386958219446, "grad_norm": 0.8760836124420166, "learning_rate": 1.7140617428683083e-05, "loss": 0.2004, "step": 184000 }, { "epoch": 14.37719640765326, "grad_norm": 0.9584679007530212, "learning_rate": 1.7139054318093006e-05, "loss": 0.1964, "step": 184100 }, { "epoch": 14.385005857087075, "grad_norm": 0.7809962630271912, "learning_rate": 1.713749120750293e-05, "loss": 0.1946, "step": 184200 }, { "epoch": 14.392815306520891, "grad_norm": 0.8375005125999451, "learning_rate": 1.7135928096912858e-05, "loss": 0.1983, "step": 184300 }, { "epoch": 14.400624755954706, "grad_norm": 0.942611038684845, "learning_rate": 1.7134364986322784e-05, "loss": 0.2109, "step": 184400 }, { "epoch": 14.40843420538852, "grad_norm": 0.8951541185379028, "learning_rate": 1.713280187573271e-05, "loss": 0.203, "step": 184500 }, { "epoch": 14.416243654822335, "grad_norm": 1.221139669418335, "learning_rate": 1.7131238765142636e-05, "loss": 0.1992, "step": 184600 }, { "epoch": 14.42405310425615, "grad_norm": 0.6856141090393066, "learning_rate": 1.712967565455256e-05, "loss": 0.1991, "step": 184700 }, { "epoch": 14.431862553689966, "grad_norm": 1.0590518712997437, "learning_rate": 1.7128112543962488e-05, "loss": 0.193, "step": 184800 }, { "epoch": 14.43967200312378, "grad_norm": 0.8135849237442017, "learning_rate": 1.7126549433372414e-05, "loss": 0.2112, "step": 184900 }, { "epoch": 14.447481452557595, "grad_norm": 0.9077997803688049, "learning_rate": 1.7124986322782336e-05, "loss": 0.1936, "step": 185000 }, { "epoch": 14.45529090199141, "grad_norm": 0.9828291535377502, "learning_rate": 1.7123423212192266e-05, "loss": 0.1972, "step": 185100 }, { "epoch": 14.463100351425224, "grad_norm": 1.111656904220581, "learning_rate": 1.712186010160219e-05, "loss": 0.1999, "step": 185200 }, { "epoch": 14.47090980085904, "grad_norm": 0.9280322790145874, "learning_rate": 1.7120296991012114e-05, "loss": 0.2053, "step": 185300 }, { "epoch": 14.478719250292855, "grad_norm": 1.0524592399597168, "learning_rate": 1.711873388042204e-05, "loss": 0.2082, "step": 185400 }, { "epoch": 14.48652869972667, "grad_norm": 1.1530081033706665, "learning_rate": 1.7117170769831966e-05, "loss": 0.1953, "step": 185500 }, { "epoch": 14.494338149160484, "grad_norm": 0.7839354276657104, "learning_rate": 1.7115607659241892e-05, "loss": 0.1838, "step": 185600 }, { "epoch": 14.502147598594298, "grad_norm": 1.0498613119125366, "learning_rate": 1.711404454865182e-05, "loss": 0.2024, "step": 185700 }, { "epoch": 14.509957048028115, "grad_norm": 0.8028302192687988, "learning_rate": 1.7112481438061744e-05, "loss": 0.1993, "step": 185800 }, { "epoch": 14.51776649746193, "grad_norm": 1.0131007432937622, "learning_rate": 1.711091832747167e-05, "loss": 0.1948, "step": 185900 }, { "epoch": 14.525575946895744, "grad_norm": 0.9612122774124146, "learning_rate": 1.7109370847987496e-05, "loss": 0.1921, "step": 186000 }, { "epoch": 14.533385396329559, "grad_norm": 0.8743976950645447, "learning_rate": 1.7107807737397422e-05, "loss": 0.2054, "step": 186100 }, { "epoch": 14.541194845763373, "grad_norm": 1.2099506855010986, "learning_rate": 1.7106244626807348e-05, "loss": 0.2025, "step": 186200 }, { "epoch": 14.54900429519719, "grad_norm": 0.9520013928413391, "learning_rate": 1.7104681516217274e-05, "loss": 0.1943, "step": 186300 }, { "epoch": 14.556813744631004, "grad_norm": 0.7041392922401428, "learning_rate": 1.71031184056272e-05, "loss": 0.2021, "step": 186400 }, { "epoch": 14.564623194064819, "grad_norm": 1.4426240921020508, "learning_rate": 1.7101555295037123e-05, "loss": 0.1956, "step": 186500 }, { "epoch": 14.572432643498633, "grad_norm": 0.8749873042106628, "learning_rate": 1.7099992184447052e-05, "loss": 0.2027, "step": 186600 }, { "epoch": 14.580242092932448, "grad_norm": 0.7697212100028992, "learning_rate": 1.709842907385698e-05, "loss": 0.2058, "step": 186700 }, { "epoch": 14.588051542366264, "grad_norm": 0.8453297019004822, "learning_rate": 1.70968659632669e-05, "loss": 0.2138, "step": 186800 }, { "epoch": 14.595860991800079, "grad_norm": 1.1992191076278687, "learning_rate": 1.7095302852676827e-05, "loss": 0.1904, "step": 186900 }, { "epoch": 14.603670441233893, "grad_norm": 0.8397727608680725, "learning_rate": 1.7093739742086753e-05, "loss": 0.204, "step": 187000 }, { "epoch": 14.611479890667708, "grad_norm": 0.867508053779602, "learning_rate": 1.709217663149668e-05, "loss": 0.2018, "step": 187100 }, { "epoch": 14.619289340101522, "grad_norm": 0.9640269875526428, "learning_rate": 1.7090613520906605e-05, "loss": 0.1919, "step": 187200 }, { "epoch": 14.627098789535339, "grad_norm": 1.1995270252227783, "learning_rate": 1.708905041031653e-05, "loss": 0.1937, "step": 187300 }, { "epoch": 14.634908238969153, "grad_norm": 0.936285138130188, "learning_rate": 1.7087487299726457e-05, "loss": 0.2013, "step": 187400 }, { "epoch": 14.642717688402968, "grad_norm": 0.9428242444992065, "learning_rate": 1.7085924189136383e-05, "loss": 0.1901, "step": 187500 }, { "epoch": 14.650527137836782, "grad_norm": 1.123363733291626, "learning_rate": 1.708436107854631e-05, "loss": 0.2002, "step": 187600 }, { "epoch": 14.658336587270597, "grad_norm": 0.7815497517585754, "learning_rate": 1.7082797967956235e-05, "loss": 0.1928, "step": 187700 }, { "epoch": 14.666146036704411, "grad_norm": 0.9222931861877441, "learning_rate": 1.708123485736616e-05, "loss": 0.1962, "step": 187800 }, { "epoch": 14.673955486138228, "grad_norm": 0.9944746494293213, "learning_rate": 1.7079671746776084e-05, "loss": 0.1985, "step": 187900 }, { "epoch": 14.681764935572042, "grad_norm": 0.9492562413215637, "learning_rate": 1.707812426729191e-05, "loss": 0.1878, "step": 188000 }, { "epoch": 14.689574385005857, "grad_norm": 1.0256750583648682, "learning_rate": 1.707656115670184e-05, "loss": 0.194, "step": 188100 }, { "epoch": 14.697383834439671, "grad_norm": 0.9843342900276184, "learning_rate": 1.7074998046111765e-05, "loss": 0.2038, "step": 188200 }, { "epoch": 14.705193283873488, "grad_norm": 0.6537313461303711, "learning_rate": 1.7073434935521688e-05, "loss": 0.1972, "step": 188300 }, { "epoch": 14.713002733307302, "grad_norm": 1.145023226737976, "learning_rate": 1.7071871824931617e-05, "loss": 0.1866, "step": 188400 }, { "epoch": 14.720812182741117, "grad_norm": 0.9992174506187439, "learning_rate": 1.707030871434154e-05, "loss": 0.1941, "step": 188500 }, { "epoch": 14.728621632174931, "grad_norm": 0.9242602586746216, "learning_rate": 1.7068745603751466e-05, "loss": 0.1966, "step": 188600 }, { "epoch": 14.736431081608746, "grad_norm": 0.7435315847396851, "learning_rate": 1.7067182493161392e-05, "loss": 0.2018, "step": 188700 }, { "epoch": 14.74424053104256, "grad_norm": 0.8198279142379761, "learning_rate": 1.7065619382571318e-05, "loss": 0.1911, "step": 188800 }, { "epoch": 14.752049980476377, "grad_norm": 0.9536947011947632, "learning_rate": 1.7064056271981244e-05, "loss": 0.1947, "step": 188900 }, { "epoch": 14.759859429910192, "grad_norm": 0.814950704574585, "learning_rate": 1.706249316139117e-05, "loss": 0.1992, "step": 189000 }, { "epoch": 14.767668879344006, "grad_norm": 1.2029240131378174, "learning_rate": 1.7060930050801096e-05, "loss": 0.1897, "step": 189100 }, { "epoch": 14.77547832877782, "grad_norm": 0.8989683985710144, "learning_rate": 1.7059366940211022e-05, "loss": 0.1917, "step": 189200 }, { "epoch": 14.783287778211637, "grad_norm": 0.981456995010376, "learning_rate": 1.7057803829620948e-05, "loss": 0.2043, "step": 189300 }, { "epoch": 14.791097227645452, "grad_norm": 0.7692901492118835, "learning_rate": 1.705624071903087e-05, "loss": 0.1942, "step": 189400 }, { "epoch": 14.798906677079266, "grad_norm": 0.8410061001777649, "learning_rate": 1.70546776084408e-05, "loss": 0.1871, "step": 189500 }, { "epoch": 14.80671612651308, "grad_norm": 0.7750579714775085, "learning_rate": 1.7053114497850726e-05, "loss": 0.1978, "step": 189600 }, { "epoch": 14.814525575946895, "grad_norm": 0.9206308722496033, "learning_rate": 1.705155138726065e-05, "loss": 0.1869, "step": 189700 }, { "epoch": 14.82233502538071, "grad_norm": 0.9680977463722229, "learning_rate": 1.7049988276670574e-05, "loss": 0.1953, "step": 189800 }, { "epoch": 14.830144474814526, "grad_norm": 0.9864129424095154, "learning_rate": 1.7048425166080504e-05, "loss": 0.1878, "step": 189900 }, { "epoch": 14.83795392424834, "grad_norm": 0.8863809108734131, "learning_rate": 1.704687768659633e-05, "loss": 0.1923, "step": 190000 }, { "epoch": 14.845763373682155, "grad_norm": 1.3199231624603271, "learning_rate": 1.7045314576006252e-05, "loss": 0.2056, "step": 190100 }, { "epoch": 14.85357282311597, "grad_norm": 0.9699838161468506, "learning_rate": 1.7043751465416182e-05, "loss": 0.1922, "step": 190200 }, { "epoch": 14.861382272549786, "grad_norm": 0.7888826727867126, "learning_rate": 1.7042188354826104e-05, "loss": 0.1952, "step": 190300 }, { "epoch": 14.8691917219836, "grad_norm": 0.8985480070114136, "learning_rate": 1.704062524423603e-05, "loss": 0.1912, "step": 190400 }, { "epoch": 14.877001171417415, "grad_norm": 1.0374826192855835, "learning_rate": 1.7039062133645956e-05, "loss": 0.2057, "step": 190500 }, { "epoch": 14.88481062085123, "grad_norm": 0.8498507738113403, "learning_rate": 1.7037499023055882e-05, "loss": 0.2004, "step": 190600 }, { "epoch": 14.892620070285044, "grad_norm": 0.9233710169792175, "learning_rate": 1.703593591246581e-05, "loss": 0.1951, "step": 190700 }, { "epoch": 14.900429519718859, "grad_norm": 1.052475929260254, "learning_rate": 1.7034372801875734e-05, "loss": 0.2044, "step": 190800 }, { "epoch": 14.908238969152675, "grad_norm": 0.9808698892593384, "learning_rate": 1.703280969128566e-05, "loss": 0.191, "step": 190900 }, { "epoch": 14.91604841858649, "grad_norm": 0.8547319769859314, "learning_rate": 1.7031246580695586e-05, "loss": 0.1909, "step": 191000 }, { "epoch": 14.923857868020304, "grad_norm": 0.8447765707969666, "learning_rate": 1.7029683470105513e-05, "loss": 0.1879, "step": 191100 }, { "epoch": 14.931667317454119, "grad_norm": 0.9389594197273254, "learning_rate": 1.7028120359515435e-05, "loss": 0.1917, "step": 191200 }, { "epoch": 14.939476766887934, "grad_norm": 0.9502831101417542, "learning_rate": 1.7026557248925365e-05, "loss": 0.1895, "step": 191300 }, { "epoch": 14.94728621632175, "grad_norm": 0.8974219560623169, "learning_rate": 1.702499413833529e-05, "loss": 0.1938, "step": 191400 }, { "epoch": 14.955095665755564, "grad_norm": 1.042904019355774, "learning_rate": 1.7023431027745213e-05, "loss": 0.1981, "step": 191500 }, { "epoch": 14.962905115189379, "grad_norm": 0.8570128679275513, "learning_rate": 1.702186791715514e-05, "loss": 0.1908, "step": 191600 }, { "epoch": 14.970714564623194, "grad_norm": 0.8636656999588013, "learning_rate": 1.7020304806565065e-05, "loss": 0.1876, "step": 191700 }, { "epoch": 14.978524014057008, "grad_norm": 0.8689895868301392, "learning_rate": 1.701874169597499e-05, "loss": 0.1942, "step": 191800 }, { "epoch": 14.986333463490825, "grad_norm": 1.3911195993423462, "learning_rate": 1.7017178585384917e-05, "loss": 0.2062, "step": 191900 }, { "epoch": 14.994142912924639, "grad_norm": 0.7825437784194946, "learning_rate": 1.7015631105900743e-05, "loss": 0.2081, "step": 192000 }, { "epoch": 15.001952362358454, "grad_norm": 0.9370694160461426, "learning_rate": 1.701406799531067e-05, "loss": 0.2039, "step": 192100 }, { "epoch": 15.009761811792268, "grad_norm": 0.9433199763298035, "learning_rate": 1.7012504884720595e-05, "loss": 0.1972, "step": 192200 }, { "epoch": 15.017571261226083, "grad_norm": 1.1181074380874634, "learning_rate": 1.701094177413052e-05, "loss": 0.1896, "step": 192300 }, { "epoch": 15.025380710659899, "grad_norm": 0.685670018196106, "learning_rate": 1.7009378663540447e-05, "loss": 0.1903, "step": 192400 }, { "epoch": 15.033190160093714, "grad_norm": 0.7437558770179749, "learning_rate": 1.7007815552950373e-05, "loss": 0.1866, "step": 192500 }, { "epoch": 15.040999609527528, "grad_norm": 0.9175603985786438, "learning_rate": 1.70062524423603e-05, "loss": 0.1859, "step": 192600 }, { "epoch": 15.048809058961343, "grad_norm": 1.106664776802063, "learning_rate": 1.7004689331770222e-05, "loss": 0.1957, "step": 192700 }, { "epoch": 15.056618508395157, "grad_norm": 0.7231187224388123, "learning_rate": 1.700312622118015e-05, "loss": 0.186, "step": 192800 }, { "epoch": 15.064427957828974, "grad_norm": 1.1075918674468994, "learning_rate": 1.7001563110590077e-05, "loss": 0.196, "step": 192900 }, { "epoch": 15.072237407262788, "grad_norm": 0.8646638989448547, "learning_rate": 1.7e-05, "loss": 0.1928, "step": 193000 }, { "epoch": 15.080046856696603, "grad_norm": 0.9719217419624329, "learning_rate": 1.6998436889409926e-05, "loss": 0.1936, "step": 193100 }, { "epoch": 15.087856306130417, "grad_norm": 0.595931887626648, "learning_rate": 1.6996873778819852e-05, "loss": 0.1797, "step": 193200 }, { "epoch": 15.095665755564232, "grad_norm": 0.7941077947616577, "learning_rate": 1.6995310668229778e-05, "loss": 0.1986, "step": 193300 }, { "epoch": 15.103475204998048, "grad_norm": 0.7684542536735535, "learning_rate": 1.6993747557639704e-05, "loss": 0.1966, "step": 193400 }, { "epoch": 15.111284654431863, "grad_norm": 1.0312563180923462, "learning_rate": 1.699218444704963e-05, "loss": 0.1971, "step": 193500 }, { "epoch": 15.119094103865677, "grad_norm": 0.7021324038505554, "learning_rate": 1.6990621336459556e-05, "loss": 0.1925, "step": 193600 }, { "epoch": 15.126903553299492, "grad_norm": 0.6282637119293213, "learning_rate": 1.6989058225869482e-05, "loss": 0.1907, "step": 193700 }, { "epoch": 15.134713002733307, "grad_norm": 0.6020660996437073, "learning_rate": 1.6987495115279408e-05, "loss": 0.1925, "step": 193800 }, { "epoch": 15.142522452167123, "grad_norm": 1.0062716007232666, "learning_rate": 1.6985932004689334e-05, "loss": 0.1955, "step": 193900 }, { "epoch": 15.150331901600937, "grad_norm": 1.0643500089645386, "learning_rate": 1.698436889409926e-05, "loss": 0.1974, "step": 194000 }, { "epoch": 15.158141351034752, "grad_norm": 0.835570752620697, "learning_rate": 1.6982821414615086e-05, "loss": 0.1945, "step": 194100 }, { "epoch": 15.165950800468567, "grad_norm": 1.1955751180648804, "learning_rate": 1.698125830402501e-05, "loss": 0.1965, "step": 194200 }, { "epoch": 15.173760249902381, "grad_norm": 0.7210449576377869, "learning_rate": 1.6979695193434938e-05, "loss": 0.1951, "step": 194300 }, { "epoch": 15.181569699336197, "grad_norm": 1.0025458335876465, "learning_rate": 1.6978132082844864e-05, "loss": 0.1913, "step": 194400 }, { "epoch": 15.189379148770012, "grad_norm": 0.8779587745666504, "learning_rate": 1.6976568972254787e-05, "loss": 0.1953, "step": 194500 }, { "epoch": 15.197188598203827, "grad_norm": 1.1459087133407593, "learning_rate": 1.6975005861664716e-05, "loss": 0.1958, "step": 194600 }, { "epoch": 15.204998047637641, "grad_norm": 1.0809831619262695, "learning_rate": 1.697344275107464e-05, "loss": 0.1961, "step": 194700 }, { "epoch": 15.212807497071456, "grad_norm": 1.0169117450714111, "learning_rate": 1.6971879640484565e-05, "loss": 0.1916, "step": 194800 }, { "epoch": 15.220616946505272, "grad_norm": 0.9966555833816528, "learning_rate": 1.697031652989449e-05, "loss": 0.1955, "step": 194900 }, { "epoch": 15.228426395939087, "grad_norm": 1.2357121706008911, "learning_rate": 1.6968753419304417e-05, "loss": 0.1911, "step": 195000 }, { "epoch": 15.236235845372901, "grad_norm": 0.9304054975509644, "learning_rate": 1.6967190308714343e-05, "loss": 0.1904, "step": 195100 }, { "epoch": 15.244045294806716, "grad_norm": 1.0154582262039185, "learning_rate": 1.696562719812427e-05, "loss": 0.1935, "step": 195200 }, { "epoch": 15.25185474424053, "grad_norm": 0.8713698983192444, "learning_rate": 1.6964064087534195e-05, "loss": 0.1957, "step": 195300 }, { "epoch": 15.259664193674347, "grad_norm": 0.9780009984970093, "learning_rate": 1.696250097694412e-05, "loss": 0.1858, "step": 195400 }, { "epoch": 15.267473643108161, "grad_norm": 0.7178473472595215, "learning_rate": 1.6960937866354047e-05, "loss": 0.1956, "step": 195500 }, { "epoch": 15.275283092541976, "grad_norm": 0.9540956616401672, "learning_rate": 1.695937475576397e-05, "loss": 0.1889, "step": 195600 }, { "epoch": 15.28309254197579, "grad_norm": 1.0060138702392578, "learning_rate": 1.69578116451739e-05, "loss": 0.1937, "step": 195700 }, { "epoch": 15.290901991409605, "grad_norm": 0.9597862362861633, "learning_rate": 1.6956248534583825e-05, "loss": 0.1859, "step": 195800 }, { "epoch": 15.298711440843421, "grad_norm": 1.0637125968933105, "learning_rate": 1.6954685423993747e-05, "loss": 0.1903, "step": 195900 }, { "epoch": 15.306520890277236, "grad_norm": 0.8320383429527283, "learning_rate": 1.6953122313403673e-05, "loss": 0.1934, "step": 196000 }, { "epoch": 15.31433033971105, "grad_norm": 0.9052497148513794, "learning_rate": 1.6951574833919503e-05, "loss": 0.2025, "step": 196100 }, { "epoch": 15.322139789144865, "grad_norm": 0.7524051070213318, "learning_rate": 1.695001172332943e-05, "loss": 0.1882, "step": 196200 }, { "epoch": 15.32994923857868, "grad_norm": 0.7284848093986511, "learning_rate": 1.694844861273935e-05, "loss": 0.185, "step": 196300 }, { "epoch": 15.337758688012496, "grad_norm": 0.8714132905006409, "learning_rate": 1.694688550214928e-05, "loss": 0.1933, "step": 196400 }, { "epoch": 15.34556813744631, "grad_norm": 0.8674778938293457, "learning_rate": 1.6945322391559203e-05, "loss": 0.1916, "step": 196500 }, { "epoch": 15.353377586880125, "grad_norm": 0.9107986688613892, "learning_rate": 1.694375928096913e-05, "loss": 0.1928, "step": 196600 }, { "epoch": 15.36118703631394, "grad_norm": 0.8382946252822876, "learning_rate": 1.6942196170379055e-05, "loss": 0.1948, "step": 196700 }, { "epoch": 15.368996485747754, "grad_norm": 0.5965157151222229, "learning_rate": 1.694063305978898e-05, "loss": 0.1866, "step": 196800 }, { "epoch": 15.37680593518157, "grad_norm": 0.9926332831382751, "learning_rate": 1.6939069949198907e-05, "loss": 0.1858, "step": 196900 }, { "epoch": 15.384615384615385, "grad_norm": 0.9674046635627747, "learning_rate": 1.6937506838608833e-05, "loss": 0.1919, "step": 197000 }, { "epoch": 15.3924248340492, "grad_norm": 0.763822078704834, "learning_rate": 1.693594372801876e-05, "loss": 0.1946, "step": 197100 }, { "epoch": 15.400234283483014, "grad_norm": 0.9984347224235535, "learning_rate": 1.6934380617428685e-05, "loss": 0.2008, "step": 197200 }, { "epoch": 15.408043732916829, "grad_norm": 1.0560930967330933, "learning_rate": 1.693281750683861e-05, "loss": 0.1935, "step": 197300 }, { "epoch": 15.415853182350645, "grad_norm": 0.8882468938827515, "learning_rate": 1.6931254396248534e-05, "loss": 0.1827, "step": 197400 }, { "epoch": 15.42366263178446, "grad_norm": 1.249205470085144, "learning_rate": 1.6929691285658463e-05, "loss": 0.1931, "step": 197500 }, { "epoch": 15.431472081218274, "grad_norm": 0.8984086513519287, "learning_rate": 1.692812817506839e-05, "loss": 0.1892, "step": 197600 }, { "epoch": 15.439281530652089, "grad_norm": 0.9370852112770081, "learning_rate": 1.6926565064478312e-05, "loss": 0.1809, "step": 197700 }, { "epoch": 15.447090980085903, "grad_norm": 0.8891327977180481, "learning_rate": 1.6925001953888238e-05, "loss": 0.1887, "step": 197800 }, { "epoch": 15.45490042951972, "grad_norm": 1.0804502964019775, "learning_rate": 1.6923438843298164e-05, "loss": 0.1868, "step": 197900 }, { "epoch": 15.462709878953534, "grad_norm": 0.6988115906715393, "learning_rate": 1.692187573270809e-05, "loss": 0.1892, "step": 198000 }, { "epoch": 15.470519328387349, "grad_norm": 1.01301908493042, "learning_rate": 1.6920312622118016e-05, "loss": 0.1906, "step": 198100 }, { "epoch": 15.478328777821163, "grad_norm": 1.0333824157714844, "learning_rate": 1.6918765142633842e-05, "loss": 0.1905, "step": 198200 }, { "epoch": 15.486138227254978, "grad_norm": 0.6864656209945679, "learning_rate": 1.6917202032043768e-05, "loss": 0.19, "step": 198300 }, { "epoch": 15.493947676688794, "grad_norm": 0.8507186770439148, "learning_rate": 1.6915638921453694e-05, "loss": 0.1948, "step": 198400 }, { "epoch": 15.501757126122609, "grad_norm": 1.074942708015442, "learning_rate": 1.691407581086362e-05, "loss": 0.1867, "step": 198500 }, { "epoch": 15.509566575556423, "grad_norm": 0.9293534755706787, "learning_rate": 1.6912512700273546e-05, "loss": 0.1924, "step": 198600 }, { "epoch": 15.517376024990238, "grad_norm": 1.0201472043991089, "learning_rate": 1.6910949589683472e-05, "loss": 0.1916, "step": 198700 }, { "epoch": 15.525185474424053, "grad_norm": 1.2007817029953003, "learning_rate": 1.6909386479093398e-05, "loss": 0.1869, "step": 198800 }, { "epoch": 15.532994923857869, "grad_norm": 0.812961757183075, "learning_rate": 1.690782336850332e-05, "loss": 0.1869, "step": 198900 }, { "epoch": 15.540804373291683, "grad_norm": 0.7858532071113586, "learning_rate": 1.690626025791325e-05, "loss": 0.1929, "step": 199000 }, { "epoch": 15.548613822725498, "grad_norm": 0.7535479664802551, "learning_rate": 1.6904697147323176e-05, "loss": 0.1882, "step": 199100 }, { "epoch": 15.556423272159313, "grad_norm": 0.8674836158752441, "learning_rate": 1.69031340367331e-05, "loss": 0.1833, "step": 199200 }, { "epoch": 15.564232721593127, "grad_norm": 1.0248985290527344, "learning_rate": 1.6901570926143025e-05, "loss": 0.1901, "step": 199300 }, { "epoch": 15.572042171026943, "grad_norm": 0.8296997547149658, "learning_rate": 1.690000781555295e-05, "loss": 0.1891, "step": 199400 }, { "epoch": 15.579851620460758, "grad_norm": 0.8192151188850403, "learning_rate": 1.6898444704962877e-05, "loss": 0.189, "step": 199500 }, { "epoch": 15.587661069894573, "grad_norm": 0.864629864692688, "learning_rate": 1.6896881594372803e-05, "loss": 0.181, "step": 199600 }, { "epoch": 15.595470519328387, "grad_norm": 0.6454740166664124, "learning_rate": 1.689531848378273e-05, "loss": 0.1823, "step": 199700 }, { "epoch": 15.603279968762202, "grad_norm": 0.9795613884925842, "learning_rate": 1.6893755373192655e-05, "loss": 0.193, "step": 199800 }, { "epoch": 15.611089418196016, "grad_norm": 0.735085666179657, "learning_rate": 1.689219226260258e-05, "loss": 0.186, "step": 199900 }, { "epoch": 15.618898867629833, "grad_norm": 0.9269001483917236, "learning_rate": 1.6890629152012507e-05, "loss": 0.1859, "step": 200000 }, { "epoch": 15.626708317063647, "grad_norm": 0.864848256111145, "learning_rate": 1.6889066041422433e-05, "loss": 0.1839, "step": 200100 }, { "epoch": 15.634517766497462, "grad_norm": 0.8201810717582703, "learning_rate": 1.688751856193826e-05, "loss": 0.1902, "step": 200200 }, { "epoch": 15.642327215931276, "grad_norm": 0.8653948307037354, "learning_rate": 1.6885955451348185e-05, "loss": 0.1925, "step": 200300 }, { "epoch": 15.650136665365093, "grad_norm": 0.950314462184906, "learning_rate": 1.6884392340758107e-05, "loss": 0.1864, "step": 200400 }, { "epoch": 15.657946114798907, "grad_norm": 0.7774307727813721, "learning_rate": 1.6882829230168037e-05, "loss": 0.1735, "step": 200500 }, { "epoch": 15.665755564232722, "grad_norm": 0.800393283367157, "learning_rate": 1.6881266119577963e-05, "loss": 0.1881, "step": 200600 }, { "epoch": 15.673565013666536, "grad_norm": 0.9827004671096802, "learning_rate": 1.6879703008987885e-05, "loss": 0.186, "step": 200700 }, { "epoch": 15.68137446310035, "grad_norm": 0.6259004473686218, "learning_rate": 1.6878139898397815e-05, "loss": 0.1809, "step": 200800 }, { "epoch": 15.689183912534165, "grad_norm": 0.8524276614189148, "learning_rate": 1.6876576787807737e-05, "loss": 0.1913, "step": 200900 }, { "epoch": 15.696993361967982, "grad_norm": 0.9546549320220947, "learning_rate": 1.6875013677217663e-05, "loss": 0.1881, "step": 201000 }, { "epoch": 15.704802811401796, "grad_norm": 0.999563455581665, "learning_rate": 1.687345056662759e-05, "loss": 0.1822, "step": 201100 }, { "epoch": 15.712612260835611, "grad_norm": 0.8454039096832275, "learning_rate": 1.6871887456037516e-05, "loss": 0.188, "step": 201200 }, { "epoch": 15.720421710269425, "grad_norm": 0.767012894153595, "learning_rate": 1.687032434544744e-05, "loss": 0.1828, "step": 201300 }, { "epoch": 15.728231159703242, "grad_norm": 0.9323714971542358, "learning_rate": 1.6868761234857368e-05, "loss": 0.1872, "step": 201400 }, { "epoch": 15.736040609137056, "grad_norm": 1.0684683322906494, "learning_rate": 1.6867198124267294e-05, "loss": 0.1968, "step": 201500 }, { "epoch": 15.743850058570871, "grad_norm": 0.6585600972175598, "learning_rate": 1.686563501367722e-05, "loss": 0.1872, "step": 201600 }, { "epoch": 15.751659508004686, "grad_norm": 0.9620580673217773, "learning_rate": 1.6864071903087146e-05, "loss": 0.18, "step": 201700 }, { "epoch": 15.7594689574385, "grad_norm": 0.7878828644752502, "learning_rate": 1.6862508792497068e-05, "loss": 0.1847, "step": 201800 }, { "epoch": 15.767278406872315, "grad_norm": 0.7668275237083435, "learning_rate": 1.6860945681906998e-05, "loss": 0.1934, "step": 201900 }, { "epoch": 15.775087856306131, "grad_norm": 1.0003772974014282, "learning_rate": 1.6859382571316924e-05, "loss": 0.1816, "step": 202000 }, { "epoch": 15.782897305739946, "grad_norm": 0.8344219923019409, "learning_rate": 1.6857819460726846e-05, "loss": 0.1894, "step": 202100 }, { "epoch": 15.79070675517376, "grad_norm": 0.8241024613380432, "learning_rate": 1.6856271981242672e-05, "loss": 0.1842, "step": 202200 }, { "epoch": 15.798516204607575, "grad_norm": 1.0600364208221436, "learning_rate": 1.68547088706526e-05, "loss": 0.1839, "step": 202300 }, { "epoch": 15.806325654041391, "grad_norm": 0.6510144472122192, "learning_rate": 1.6853145760062528e-05, "loss": 0.1823, "step": 202400 }, { "epoch": 15.814135103475206, "grad_norm": 0.9536733627319336, "learning_rate": 1.685158264947245e-05, "loss": 0.1819, "step": 202500 }, { "epoch": 15.82194455290902, "grad_norm": 0.8948224782943726, "learning_rate": 1.685001953888238e-05, "loss": 0.183, "step": 202600 }, { "epoch": 15.829754002342835, "grad_norm": 0.9985434412956238, "learning_rate": 1.6848456428292302e-05, "loss": 0.1886, "step": 202700 }, { "epoch": 15.83756345177665, "grad_norm": 0.801413893699646, "learning_rate": 1.6846893317702228e-05, "loss": 0.184, "step": 202800 }, { "epoch": 15.845372901210464, "grad_norm": 1.120006799697876, "learning_rate": 1.6845330207112154e-05, "loss": 0.1748, "step": 202900 }, { "epoch": 15.85318235064428, "grad_norm": 1.0911275148391724, "learning_rate": 1.684376709652208e-05, "loss": 0.1966, "step": 203000 }, { "epoch": 15.860991800078095, "grad_norm": 1.0338274240493774, "learning_rate": 1.6842203985932006e-05, "loss": 0.1914, "step": 203100 }, { "epoch": 15.86880124951191, "grad_norm": 0.7905099391937256, "learning_rate": 1.6840640875341932e-05, "loss": 0.1753, "step": 203200 }, { "epoch": 15.876610698945724, "grad_norm": 0.838280200958252, "learning_rate": 1.6839077764751858e-05, "loss": 0.1838, "step": 203300 }, { "epoch": 15.884420148379538, "grad_norm": 1.0954149961471558, "learning_rate": 1.6837514654161784e-05, "loss": 0.1761, "step": 203400 }, { "epoch": 15.892229597813355, "grad_norm": 0.5330013632774353, "learning_rate": 1.683595154357171e-05, "loss": 0.1758, "step": 203500 }, { "epoch": 15.90003904724717, "grad_norm": 0.8652849197387695, "learning_rate": 1.6834388432981633e-05, "loss": 0.183, "step": 203600 }, { "epoch": 15.907848496680984, "grad_norm": 0.7471824288368225, "learning_rate": 1.6832825322391562e-05, "loss": 0.1854, "step": 203700 }, { "epoch": 15.915657946114798, "grad_norm": 0.820147693157196, "learning_rate": 1.683126221180149e-05, "loss": 0.1741, "step": 203800 }, { "epoch": 15.923467395548613, "grad_norm": 1.0590057373046875, "learning_rate": 1.682969910121141e-05, "loss": 0.1749, "step": 203900 }, { "epoch": 15.93127684498243, "grad_norm": 1.0665738582611084, "learning_rate": 1.6828135990621337e-05, "loss": 0.1846, "step": 204000 }, { "epoch": 15.939086294416244, "grad_norm": 1.073426604270935, "learning_rate": 1.6826572880031263e-05, "loss": 0.1861, "step": 204100 }, { "epoch": 15.946895743850058, "grad_norm": 0.9139155745506287, "learning_rate": 1.682502540054709e-05, "loss": 0.1734, "step": 204200 }, { "epoch": 15.954705193283873, "grad_norm": 1.1288511753082275, "learning_rate": 1.6823462289957015e-05, "loss": 0.1766, "step": 204300 }, { "epoch": 15.962514642717688, "grad_norm": 0.9638312458992004, "learning_rate": 1.682189917936694e-05, "loss": 0.1737, "step": 204400 }, { "epoch": 15.970324092151504, "grad_norm": 1.0422431230545044, "learning_rate": 1.6820336068776867e-05, "loss": 0.1909, "step": 204500 }, { "epoch": 15.978133541585319, "grad_norm": 0.6258545517921448, "learning_rate": 1.6818772958186793e-05, "loss": 0.1854, "step": 204600 }, { "epoch": 15.985942991019133, "grad_norm": 1.0568209886550903, "learning_rate": 1.681720984759672e-05, "loss": 0.1826, "step": 204700 }, { "epoch": 15.993752440452948, "grad_norm": 1.0256654024124146, "learning_rate": 1.6815646737006645e-05, "loss": 0.177, "step": 204800 }, { "epoch": 16.001561889886762, "grad_norm": 1.0408920049667358, "learning_rate": 1.681408362641657e-05, "loss": 0.1895, "step": 204900 }, { "epoch": 16.009371339320577, "grad_norm": 0.9039410948753357, "learning_rate": 1.6812520515826497e-05, "loss": 0.1888, "step": 205000 }, { "epoch": 16.01718078875439, "grad_norm": 1.0113240480422974, "learning_rate": 1.681095740523642e-05, "loss": 0.1748, "step": 205100 }, { "epoch": 16.02499023818821, "grad_norm": 1.0972598791122437, "learning_rate": 1.680939429464635e-05, "loss": 0.1885, "step": 205200 }, { "epoch": 16.032799687622024, "grad_norm": 0.8362821936607361, "learning_rate": 1.6807831184056275e-05, "loss": 0.1777, "step": 205300 }, { "epoch": 16.04060913705584, "grad_norm": 0.8148047924041748, "learning_rate": 1.6806268073466198e-05, "loss": 0.1815, "step": 205400 }, { "epoch": 16.048418586489653, "grad_norm": 0.920591413974762, "learning_rate": 1.6804704962876124e-05, "loss": 0.1944, "step": 205500 }, { "epoch": 16.056228035923468, "grad_norm": 1.1818751096725464, "learning_rate": 1.680314185228605e-05, "loss": 0.1754, "step": 205600 }, { "epoch": 16.064037485357282, "grad_norm": 1.2708854675292969, "learning_rate": 1.6801578741695976e-05, "loss": 0.1768, "step": 205700 }, { "epoch": 16.071846934791097, "grad_norm": 0.9716994762420654, "learning_rate": 1.6800015631105902e-05, "loss": 0.1886, "step": 205800 }, { "epoch": 16.07965638422491, "grad_norm": 0.87205570936203, "learning_rate": 1.6798452520515828e-05, "loss": 0.1782, "step": 205900 }, { "epoch": 16.087465833658726, "grad_norm": 0.929098904132843, "learning_rate": 1.6796889409925754e-05, "loss": 0.1844, "step": 206000 }, { "epoch": 16.09527528309254, "grad_norm": 1.0598945617675781, "learning_rate": 1.679532629933568e-05, "loss": 0.1811, "step": 206100 }, { "epoch": 16.10308473252636, "grad_norm": 1.0059045553207397, "learning_rate": 1.6793763188745606e-05, "loss": 0.1788, "step": 206200 }, { "epoch": 16.110894181960173, "grad_norm": 0.6536023616790771, "learning_rate": 1.679221570926143e-05, "loss": 0.1854, "step": 206300 }, { "epoch": 16.118703631393988, "grad_norm": 0.8467875123023987, "learning_rate": 1.6790652598671358e-05, "loss": 0.18, "step": 206400 }, { "epoch": 16.126513080827802, "grad_norm": 0.8544393181800842, "learning_rate": 1.6789089488081284e-05, "loss": 0.182, "step": 206500 }, { "epoch": 16.134322530261617, "grad_norm": 1.1084660291671753, "learning_rate": 1.6787526377491206e-05, "loss": 0.1844, "step": 206600 }, { "epoch": 16.14213197969543, "grad_norm": 0.9038600325584412, "learning_rate": 1.6785963266901136e-05, "loss": 0.1836, "step": 206700 }, { "epoch": 16.149941429129246, "grad_norm": 0.8956557512283325, "learning_rate": 1.6784400156311062e-05, "loss": 0.1841, "step": 206800 }, { "epoch": 16.15775087856306, "grad_norm": 0.8761757612228394, "learning_rate": 1.6782837045720984e-05, "loss": 0.1794, "step": 206900 }, { "epoch": 16.165560327996875, "grad_norm": 1.2372992038726807, "learning_rate": 1.6781273935130914e-05, "loss": 0.1868, "step": 207000 }, { "epoch": 16.17336977743069, "grad_norm": 0.9002154469490051, "learning_rate": 1.6779710824540836e-05, "loss": 0.1806, "step": 207100 }, { "epoch": 16.181179226864508, "grad_norm": 0.7399206161499023, "learning_rate": 1.6778147713950762e-05, "loss": 0.1719, "step": 207200 }, { "epoch": 16.188988676298322, "grad_norm": 1.0187625885009766, "learning_rate": 1.677658460336069e-05, "loss": 0.1844, "step": 207300 }, { "epoch": 16.196798125732137, "grad_norm": 0.6338691711425781, "learning_rate": 1.6775021492770614e-05, "loss": 0.181, "step": 207400 }, { "epoch": 16.20460757516595, "grad_norm": 0.9440446496009827, "learning_rate": 1.677345838218054e-05, "loss": 0.1856, "step": 207500 }, { "epoch": 16.212417024599766, "grad_norm": 1.0803288221359253, "learning_rate": 1.6771895271590466e-05, "loss": 0.1803, "step": 207600 }, { "epoch": 16.22022647403358, "grad_norm": 1.3593189716339111, "learning_rate": 1.6770332161000392e-05, "loss": 0.1751, "step": 207700 }, { "epoch": 16.228035923467395, "grad_norm": 1.1652122735977173, "learning_rate": 1.676876905041032e-05, "loss": 0.1874, "step": 207800 }, { "epoch": 16.23584537290121, "grad_norm": 0.8041174411773682, "learning_rate": 1.6767205939820244e-05, "loss": 0.1798, "step": 207900 }, { "epoch": 16.243654822335024, "grad_norm": 0.8075453639030457, "learning_rate": 1.6765642829230167e-05, "loss": 0.1845, "step": 208000 }, { "epoch": 16.25146427176884, "grad_norm": 0.95871502161026, "learning_rate": 1.6764079718640096e-05, "loss": 0.1667, "step": 208100 }, { "epoch": 16.259273721202653, "grad_norm": 0.9648413062095642, "learning_rate": 1.6762516608050023e-05, "loss": 0.1865, "step": 208200 }, { "epoch": 16.26708317063647, "grad_norm": 0.8995979428291321, "learning_rate": 1.676096912856585e-05, "loss": 0.1746, "step": 208300 }, { "epoch": 16.274892620070286, "grad_norm": 0.8468859195709229, "learning_rate": 1.675940601797577e-05, "loss": 0.1706, "step": 208400 }, { "epoch": 16.2827020695041, "grad_norm": 0.9065866470336914, "learning_rate": 1.67578429073857e-05, "loss": 0.1778, "step": 208500 }, { "epoch": 16.290511518937915, "grad_norm": 0.6135996580123901, "learning_rate": 1.6756279796795623e-05, "loss": 0.1808, "step": 208600 }, { "epoch": 16.29832096837173, "grad_norm": 1.0263526439666748, "learning_rate": 1.675471668620555e-05, "loss": 0.1773, "step": 208700 }, { "epoch": 16.306130417805544, "grad_norm": 1.000637173652649, "learning_rate": 1.6753153575615475e-05, "loss": 0.1792, "step": 208800 }, { "epoch": 16.31393986723936, "grad_norm": 0.948738694190979, "learning_rate": 1.67515904650254e-05, "loss": 0.1876, "step": 208900 }, { "epoch": 16.321749316673174, "grad_norm": 0.8848404288291931, "learning_rate": 1.6750027354435327e-05, "loss": 0.1773, "step": 209000 }, { "epoch": 16.329558766106988, "grad_norm": 0.7282323241233826, "learning_rate": 1.6748464243845253e-05, "loss": 0.1773, "step": 209100 }, { "epoch": 16.337368215540803, "grad_norm": 0.9083189368247986, "learning_rate": 1.674690113325518e-05, "loss": 0.1864, "step": 209200 }, { "epoch": 16.34517766497462, "grad_norm": 0.9564442038536072, "learning_rate": 1.6745338022665105e-05, "loss": 0.1816, "step": 209300 }, { "epoch": 16.352987114408435, "grad_norm": 0.5036829710006714, "learning_rate": 1.674377491207503e-05, "loss": 0.1756, "step": 209400 }, { "epoch": 16.36079656384225, "grad_norm": 0.7878613471984863, "learning_rate": 1.6742211801484957e-05, "loss": 0.176, "step": 209500 }, { "epoch": 16.368606013276064, "grad_norm": 0.7844358682632446, "learning_rate": 1.6740648690894883e-05, "loss": 0.1837, "step": 209600 }, { "epoch": 16.37641546270988, "grad_norm": 0.9099976420402527, "learning_rate": 1.673908558030481e-05, "loss": 0.18, "step": 209700 }, { "epoch": 16.384224912143694, "grad_norm": 0.8219439387321472, "learning_rate": 1.6737522469714732e-05, "loss": 0.1724, "step": 209800 }, { "epoch": 16.392034361577508, "grad_norm": 0.978882372379303, "learning_rate": 1.673595935912466e-05, "loss": 0.1762, "step": 209900 }, { "epoch": 16.399843811011323, "grad_norm": 1.052987813949585, "learning_rate": 1.6734396248534587e-05, "loss": 0.1811, "step": 210000 }, { "epoch": 16.407653260445137, "grad_norm": 1.045796275138855, "learning_rate": 1.673283313794451e-05, "loss": 0.1786, "step": 210100 }, { "epoch": 16.415462709878952, "grad_norm": 1.1169161796569824, "learning_rate": 1.6731270027354436e-05, "loss": 0.1822, "step": 210200 }, { "epoch": 16.42327215931277, "grad_norm": 0.7530735731124878, "learning_rate": 1.6729722547870265e-05, "loss": 0.1862, "step": 210300 }, { "epoch": 16.431081608746585, "grad_norm": 1.0057936906814575, "learning_rate": 1.6728159437280188e-05, "loss": 0.1614, "step": 210400 }, { "epoch": 16.4388910581804, "grad_norm": 0.8210418820381165, "learning_rate": 1.6726596326690114e-05, "loss": 0.1786, "step": 210500 }, { "epoch": 16.446700507614214, "grad_norm": 0.8076565265655518, "learning_rate": 1.672503321610004e-05, "loss": 0.1791, "step": 210600 }, { "epoch": 16.454509957048028, "grad_norm": 0.9307219386100769, "learning_rate": 1.6723470105509966e-05, "loss": 0.1802, "step": 210700 }, { "epoch": 16.462319406481843, "grad_norm": 0.9993940591812134, "learning_rate": 1.6721906994919892e-05, "loss": 0.1721, "step": 210800 }, { "epoch": 16.470128855915657, "grad_norm": 0.8416149616241455, "learning_rate": 1.6720343884329818e-05, "loss": 0.1896, "step": 210900 }, { "epoch": 16.477938305349472, "grad_norm": 0.7258075475692749, "learning_rate": 1.6718780773739744e-05, "loss": 0.1809, "step": 211000 }, { "epoch": 16.485747754783286, "grad_norm": 0.8920261263847351, "learning_rate": 1.671721766314967e-05, "loss": 0.1816, "step": 211100 }, { "epoch": 16.4935572042171, "grad_norm": 0.8794561624526978, "learning_rate": 1.6715654552559596e-05, "loss": 0.1832, "step": 211200 }, { "epoch": 16.50136665365092, "grad_norm": 1.1431734561920166, "learning_rate": 1.671409144196952e-05, "loss": 0.1826, "step": 211300 }, { "epoch": 16.509176103084734, "grad_norm": 0.7752969861030579, "learning_rate": 1.6712528331379448e-05, "loss": 0.1742, "step": 211400 }, { "epoch": 16.51698555251855, "grad_norm": 1.030426263809204, "learning_rate": 1.6710965220789374e-05, "loss": 0.1773, "step": 211500 }, { "epoch": 16.524795001952363, "grad_norm": 0.7855979204177856, "learning_rate": 1.6709402110199297e-05, "loss": 0.1857, "step": 211600 }, { "epoch": 16.532604451386177, "grad_norm": 0.9296749830245972, "learning_rate": 1.6707838999609223e-05, "loss": 0.17, "step": 211700 }, { "epoch": 16.540413900819992, "grad_norm": 0.8138403296470642, "learning_rate": 1.670627588901915e-05, "loss": 0.1834, "step": 211800 }, { "epoch": 16.548223350253807, "grad_norm": 0.9249143004417419, "learning_rate": 1.6704712778429075e-05, "loss": 0.1748, "step": 211900 }, { "epoch": 16.55603279968762, "grad_norm": 0.922657310962677, "learning_rate": 1.6703149667839e-05, "loss": 0.1728, "step": 212000 }, { "epoch": 16.563842249121436, "grad_norm": 0.9151207804679871, "learning_rate": 1.6701586557248927e-05, "loss": 0.1787, "step": 212100 }, { "epoch": 16.57165169855525, "grad_norm": 0.8385941982269287, "learning_rate": 1.6700023446658853e-05, "loss": 0.1883, "step": 212200 }, { "epoch": 16.57946114798907, "grad_norm": 0.724240779876709, "learning_rate": 1.669847596717468e-05, "loss": 0.1833, "step": 212300 }, { "epoch": 16.587270597422883, "grad_norm": 0.6641395092010498, "learning_rate": 1.6696912856584605e-05, "loss": 0.1761, "step": 212400 }, { "epoch": 16.595080046856697, "grad_norm": 0.885064423084259, "learning_rate": 1.669534974599453e-05, "loss": 0.1839, "step": 212500 }, { "epoch": 16.602889496290512, "grad_norm": 0.705733597278595, "learning_rate": 1.6693786635404457e-05, "loss": 0.1681, "step": 212600 }, { "epoch": 16.610698945724327, "grad_norm": 0.9132718443870544, "learning_rate": 1.6692223524814383e-05, "loss": 0.1828, "step": 212700 }, { "epoch": 16.61850839515814, "grad_norm": 0.7742204070091248, "learning_rate": 1.6690660414224305e-05, "loss": 0.1885, "step": 212800 }, { "epoch": 16.626317844591956, "grad_norm": 1.0476394891738892, "learning_rate": 1.6689097303634235e-05, "loss": 0.1667, "step": 212900 }, { "epoch": 16.63412729402577, "grad_norm": 0.7489650249481201, "learning_rate": 1.668753419304416e-05, "loss": 0.1742, "step": 213000 }, { "epoch": 16.641936743459585, "grad_norm": 0.9502554535865784, "learning_rate": 1.6685971082454083e-05, "loss": 0.1723, "step": 213100 }, { "epoch": 16.6497461928934, "grad_norm": 0.9613061547279358, "learning_rate": 1.6684407971864013e-05, "loss": 0.1768, "step": 213200 }, { "epoch": 16.657555642327218, "grad_norm": 0.7885509729385376, "learning_rate": 1.6682844861273935e-05, "loss": 0.17, "step": 213300 }, { "epoch": 16.665365091761032, "grad_norm": 1.1856578588485718, "learning_rate": 1.668128175068386e-05, "loss": 0.178, "step": 213400 }, { "epoch": 16.673174541194847, "grad_norm": 0.7488318681716919, "learning_rate": 1.6679718640093787e-05, "loss": 0.1802, "step": 213500 }, { "epoch": 16.68098399062866, "grad_norm": 0.6175550222396851, "learning_rate": 1.6678155529503713e-05, "loss": 0.1726, "step": 213600 }, { "epoch": 16.688793440062476, "grad_norm": 0.8155626058578491, "learning_rate": 1.667659241891364e-05, "loss": 0.1868, "step": 213700 }, { "epoch": 16.69660288949629, "grad_norm": 0.8481264114379883, "learning_rate": 1.6675029308323565e-05, "loss": 0.1724, "step": 213800 }, { "epoch": 16.704412338930105, "grad_norm": 0.971606969833374, "learning_rate": 1.667346619773349e-05, "loss": 0.1812, "step": 213900 }, { "epoch": 16.71222178836392, "grad_norm": 0.7704633474349976, "learning_rate": 1.6671903087143417e-05, "loss": 0.1814, "step": 214000 }, { "epoch": 16.720031237797734, "grad_norm": 0.8130826950073242, "learning_rate": 1.6670339976553343e-05, "loss": 0.172, "step": 214100 }, { "epoch": 16.72784068723155, "grad_norm": 0.7058693170547485, "learning_rate": 1.6668776865963266e-05, "loss": 0.1729, "step": 214200 }, { "epoch": 16.735650136665367, "grad_norm": 0.7548621892929077, "learning_rate": 1.6667213755373195e-05, "loss": 0.1753, "step": 214300 }, { "epoch": 16.74345958609918, "grad_norm": 0.9197354316711426, "learning_rate": 1.666565064478312e-05, "loss": 0.1739, "step": 214400 }, { "epoch": 16.751269035532996, "grad_norm": 0.9533059000968933, "learning_rate": 1.6664087534193044e-05, "loss": 0.1778, "step": 214500 }, { "epoch": 16.75907848496681, "grad_norm": 0.8951109051704407, "learning_rate": 1.666254005470887e-05, "loss": 0.19, "step": 214600 }, { "epoch": 16.766887934400625, "grad_norm": 0.8666356801986694, "learning_rate": 1.66609769441188e-05, "loss": 0.1868, "step": 214700 }, { "epoch": 16.77469738383444, "grad_norm": 1.1076565980911255, "learning_rate": 1.6659413833528722e-05, "loss": 0.1754, "step": 214800 }, { "epoch": 16.782506833268254, "grad_norm": 1.198845624923706, "learning_rate": 1.6657850722938648e-05, "loss": 0.1803, "step": 214900 }, { "epoch": 16.79031628270207, "grad_norm": 0.5751312971115112, "learning_rate": 1.6656287612348574e-05, "loss": 0.1789, "step": 215000 }, { "epoch": 16.798125732135883, "grad_norm": 1.1437292098999023, "learning_rate": 1.66547245017585e-05, "loss": 0.1772, "step": 215100 }, { "epoch": 16.805935181569698, "grad_norm": 1.0913738012313843, "learning_rate": 1.6653161391168426e-05, "loss": 0.1808, "step": 215200 }, { "epoch": 16.813744631003516, "grad_norm": 0.9207703471183777, "learning_rate": 1.6651598280578352e-05, "loss": 0.1661, "step": 215300 }, { "epoch": 16.82155408043733, "grad_norm": 1.096644401550293, "learning_rate": 1.6650035169988278e-05, "loss": 0.1802, "step": 215400 }, { "epoch": 16.829363529871145, "grad_norm": 0.9608086347579956, "learning_rate": 1.6648472059398204e-05, "loss": 0.1852, "step": 215500 }, { "epoch": 16.83717297930496, "grad_norm": 1.0263370275497437, "learning_rate": 1.664690894880813e-05, "loss": 0.177, "step": 215600 }, { "epoch": 16.844982428738774, "grad_norm": 0.8369292616844177, "learning_rate": 1.6645345838218053e-05, "loss": 0.1728, "step": 215700 }, { "epoch": 16.85279187817259, "grad_norm": 0.8427988290786743, "learning_rate": 1.6643782727627982e-05, "loss": 0.1819, "step": 215800 }, { "epoch": 16.860601327606403, "grad_norm": 0.9546705484390259, "learning_rate": 1.6642219617037908e-05, "loss": 0.1756, "step": 215900 }, { "epoch": 16.868410777040218, "grad_norm": 1.0135685205459595, "learning_rate": 1.664065650644783e-05, "loss": 0.1806, "step": 216000 }, { "epoch": 16.876220226474032, "grad_norm": 0.8881898522377014, "learning_rate": 1.663909339585776e-05, "loss": 0.1771, "step": 216100 }, { "epoch": 16.884029675907847, "grad_norm": 0.8927045464515686, "learning_rate": 1.6637530285267686e-05, "loss": 0.1792, "step": 216200 }, { "epoch": 16.891839125341665, "grad_norm": 0.633324146270752, "learning_rate": 1.663596717467761e-05, "loss": 0.1758, "step": 216300 }, { "epoch": 16.89964857477548, "grad_norm": 0.867351233959198, "learning_rate": 1.6634404064087535e-05, "loss": 0.1787, "step": 216400 }, { "epoch": 16.907458024209294, "grad_norm": 1.1435070037841797, "learning_rate": 1.663284095349746e-05, "loss": 0.1741, "step": 216500 }, { "epoch": 16.91526747364311, "grad_norm": 1.1241178512573242, "learning_rate": 1.6631293474013287e-05, "loss": 0.1757, "step": 216600 }, { "epoch": 16.923076923076923, "grad_norm": 0.608675479888916, "learning_rate": 1.6629730363423213e-05, "loss": 0.1842, "step": 216700 }, { "epoch": 16.930886372510738, "grad_norm": 0.7797546982765198, "learning_rate": 1.662816725283314e-05, "loss": 0.1745, "step": 216800 }, { "epoch": 16.938695821944552, "grad_norm": 0.689860999584198, "learning_rate": 1.6626604142243065e-05, "loss": 0.1765, "step": 216900 }, { "epoch": 16.946505271378367, "grad_norm": 1.104783535003662, "learning_rate": 1.662504103165299e-05, "loss": 0.1813, "step": 217000 }, { "epoch": 16.95431472081218, "grad_norm": 0.9937470555305481, "learning_rate": 1.6623477921062917e-05, "loss": 0.1767, "step": 217100 }, { "epoch": 16.962124170245996, "grad_norm": 0.9059083461761475, "learning_rate": 1.6621914810472843e-05, "loss": 0.1801, "step": 217200 }, { "epoch": 16.969933619679814, "grad_norm": 0.7685569524765015, "learning_rate": 1.662035169988277e-05, "loss": 0.1765, "step": 217300 }, { "epoch": 16.97774306911363, "grad_norm": 1.1561412811279297, "learning_rate": 1.6618788589292695e-05, "loss": 0.1784, "step": 217400 }, { "epoch": 16.985552518547443, "grad_norm": 0.7909867763519287, "learning_rate": 1.6617225478702617e-05, "loss": 0.1706, "step": 217500 }, { "epoch": 16.993361967981258, "grad_norm": 0.8139914870262146, "learning_rate": 1.6615662368112547e-05, "loss": 0.1748, "step": 217600 }, { "epoch": 17.001171417415073, "grad_norm": 0.8759558200836182, "learning_rate": 1.6614099257522473e-05, "loss": 0.176, "step": 217700 }, { "epoch": 17.008980866848887, "grad_norm": 0.780949592590332, "learning_rate": 1.6612536146932395e-05, "loss": 0.1765, "step": 217800 }, { "epoch": 17.0167903162827, "grad_norm": 0.7488613128662109, "learning_rate": 1.661097303634232e-05, "loss": 0.1795, "step": 217900 }, { "epoch": 17.024599765716516, "grad_norm": 0.6278981566429138, "learning_rate": 1.6609409925752247e-05, "loss": 0.1747, "step": 218000 }, { "epoch": 17.03240921515033, "grad_norm": 0.8410722613334656, "learning_rate": 1.6607846815162173e-05, "loss": 0.1784, "step": 218100 }, { "epoch": 17.040218664584145, "grad_norm": 0.9053663611412048, "learning_rate": 1.66062837045721e-05, "loss": 0.1696, "step": 218200 }, { "epoch": 17.048028114017963, "grad_norm": 1.0183720588684082, "learning_rate": 1.6604720593982026e-05, "loss": 0.1764, "step": 218300 }, { "epoch": 17.055837563451778, "grad_norm": 0.8127830624580383, "learning_rate": 1.660315748339195e-05, "loss": 0.18, "step": 218400 }, { "epoch": 17.063647012885593, "grad_norm": 0.9182298183441162, "learning_rate": 1.6601594372801878e-05, "loss": 0.1695, "step": 218500 }, { "epoch": 17.071456462319407, "grad_norm": 0.981143057346344, "learning_rate": 1.6600031262211804e-05, "loss": 0.1675, "step": 218600 }, { "epoch": 17.07926591175322, "grad_norm": 0.8018271923065186, "learning_rate": 1.659846815162173e-05, "loss": 0.1697, "step": 218700 }, { "epoch": 17.087075361187036, "grad_norm": 0.923195481300354, "learning_rate": 1.6596920672137555e-05, "loss": 0.1764, "step": 218800 }, { "epoch": 17.09488481062085, "grad_norm": 1.013973593711853, "learning_rate": 1.659535756154748e-05, "loss": 0.1772, "step": 218900 }, { "epoch": 17.102694260054665, "grad_norm": 0.7282880544662476, "learning_rate": 1.6593794450957404e-05, "loss": 0.1688, "step": 219000 }, { "epoch": 17.11050370948848, "grad_norm": 0.8870101571083069, "learning_rate": 1.6592231340367334e-05, "loss": 0.1776, "step": 219100 }, { "epoch": 17.118313158922295, "grad_norm": 0.7469478845596313, "learning_rate": 1.659066822977726e-05, "loss": 0.1751, "step": 219200 }, { "epoch": 17.126122608356113, "grad_norm": 1.4116019010543823, "learning_rate": 1.6589105119187182e-05, "loss": 0.1711, "step": 219300 }, { "epoch": 17.133932057789927, "grad_norm": 0.9935173988342285, "learning_rate": 1.658754200859711e-05, "loss": 0.1722, "step": 219400 }, { "epoch": 17.141741507223742, "grad_norm": 0.865006685256958, "learning_rate": 1.6585978898007034e-05, "loss": 0.1719, "step": 219500 }, { "epoch": 17.149550956657556, "grad_norm": 0.891258180141449, "learning_rate": 1.658441578741696e-05, "loss": 0.1738, "step": 219600 }, { "epoch": 17.15736040609137, "grad_norm": 0.9739299416542053, "learning_rate": 1.6582852676826886e-05, "loss": 0.1767, "step": 219700 }, { "epoch": 17.165169855525185, "grad_norm": 1.140121340751648, "learning_rate": 1.6581289566236812e-05, "loss": 0.1784, "step": 219800 }, { "epoch": 17.172979304959, "grad_norm": 0.9761224389076233, "learning_rate": 1.6579726455646738e-05, "loss": 0.1694, "step": 219900 }, { "epoch": 17.180788754392815, "grad_norm": 0.8532379269599915, "learning_rate": 1.6578163345056664e-05, "loss": 0.1752, "step": 220000 }, { "epoch": 17.18859820382663, "grad_norm": 0.6662416458129883, "learning_rate": 1.657660023446659e-05, "loss": 0.167, "step": 220100 }, { "epoch": 17.196407653260444, "grad_norm": 1.090410828590393, "learning_rate": 1.6575037123876516e-05, "loss": 0.177, "step": 220200 }, { "epoch": 17.20421710269426, "grad_norm": 0.9860948920249939, "learning_rate": 1.6573474013286442e-05, "loss": 0.1716, "step": 220300 }, { "epoch": 17.212026552128076, "grad_norm": 0.8205731511116028, "learning_rate": 1.6571910902696365e-05, "loss": 0.1708, "step": 220400 }, { "epoch": 17.21983600156189, "grad_norm": 0.636326014995575, "learning_rate": 1.6570347792106294e-05, "loss": 0.1775, "step": 220500 }, { "epoch": 17.227645450995706, "grad_norm": 1.1403089761734009, "learning_rate": 1.656878468151622e-05, "loss": 0.1818, "step": 220600 }, { "epoch": 17.23545490042952, "grad_norm": 0.9462027549743652, "learning_rate": 1.6567221570926143e-05, "loss": 0.1717, "step": 220700 }, { "epoch": 17.243264349863335, "grad_norm": 0.8314661979675293, "learning_rate": 1.656565846033607e-05, "loss": 0.1801, "step": 220800 }, { "epoch": 17.25107379929715, "grad_norm": 1.00192391872406, "learning_rate": 1.6564110980851898e-05, "loss": 0.1745, "step": 220900 }, { "epoch": 17.258883248730964, "grad_norm": 1.1850578784942627, "learning_rate": 1.656254787026182e-05, "loss": 0.1785, "step": 221000 }, { "epoch": 17.26669269816478, "grad_norm": 0.5880224108695984, "learning_rate": 1.6560984759671747e-05, "loss": 0.1702, "step": 221100 }, { "epoch": 17.274502147598593, "grad_norm": 0.8309512734413147, "learning_rate": 1.6559421649081673e-05, "loss": 0.1808, "step": 221200 }, { "epoch": 17.282311597032407, "grad_norm": 0.7761743664741516, "learning_rate": 1.65578585384916e-05, "loss": 0.1764, "step": 221300 }, { "epoch": 17.290121046466226, "grad_norm": 0.9220330715179443, "learning_rate": 1.6556295427901525e-05, "loss": 0.1684, "step": 221400 }, { "epoch": 17.29793049590004, "grad_norm": 1.2165334224700928, "learning_rate": 1.655473231731145e-05, "loss": 0.1791, "step": 221500 }, { "epoch": 17.305739945333855, "grad_norm": 0.9512267112731934, "learning_rate": 1.6553169206721377e-05, "loss": 0.1687, "step": 221600 }, { "epoch": 17.31354939476767, "grad_norm": 1.0094327926635742, "learning_rate": 1.6551606096131303e-05, "loss": 0.1698, "step": 221700 }, { "epoch": 17.321358844201484, "grad_norm": 0.8161134123802185, "learning_rate": 1.655004298554123e-05, "loss": 0.1693, "step": 221800 }, { "epoch": 17.3291682936353, "grad_norm": 0.7945289611816406, "learning_rate": 1.654847987495115e-05, "loss": 0.1821, "step": 221900 }, { "epoch": 17.336977743069113, "grad_norm": 0.9875785708427429, "learning_rate": 1.654691676436108e-05, "loss": 0.1699, "step": 222000 }, { "epoch": 17.344787192502928, "grad_norm": 1.033823847770691, "learning_rate": 1.6545353653771007e-05, "loss": 0.173, "step": 222100 }, { "epoch": 17.352596641936742, "grad_norm": 0.7107641696929932, "learning_rate": 1.654379054318093e-05, "loss": 0.171, "step": 222200 }, { "epoch": 17.360406091370557, "grad_norm": 0.6887959837913513, "learning_rate": 1.6542227432590856e-05, "loss": 0.17, "step": 222300 }, { "epoch": 17.368215540804375, "grad_norm": 1.2243207693099976, "learning_rate": 1.6540664322000785e-05, "loss": 0.1704, "step": 222400 }, { "epoch": 17.37602499023819, "grad_norm": 0.7111543416976929, "learning_rate": 1.6539101211410708e-05, "loss": 0.172, "step": 222500 }, { "epoch": 17.383834439672004, "grad_norm": 1.120431661605835, "learning_rate": 1.6537538100820634e-05, "loss": 0.179, "step": 222600 }, { "epoch": 17.39164388910582, "grad_norm": 1.0118180513381958, "learning_rate": 1.653597499023056e-05, "loss": 0.1721, "step": 222700 }, { "epoch": 17.399453338539633, "grad_norm": 0.9543727040290833, "learning_rate": 1.6534411879640486e-05, "loss": 0.1648, "step": 222800 }, { "epoch": 17.407262787973448, "grad_norm": 1.019272804260254, "learning_rate": 1.6532848769050412e-05, "loss": 0.1761, "step": 222900 }, { "epoch": 17.415072237407262, "grad_norm": 0.8520439863204956, "learning_rate": 1.6531301289566238e-05, "loss": 0.1668, "step": 223000 }, { "epoch": 17.422881686841077, "grad_norm": 0.9222218990325928, "learning_rate": 1.6529738178976164e-05, "loss": 0.1787, "step": 223100 }, { "epoch": 17.43069113627489, "grad_norm": 0.8735765218734741, "learning_rate": 1.652817506838609e-05, "loss": 0.1759, "step": 223200 }, { "epoch": 17.438500585708706, "grad_norm": 1.4969502687454224, "learning_rate": 1.6526611957796016e-05, "loss": 0.1734, "step": 223300 }, { "epoch": 17.446310035142524, "grad_norm": 0.671943187713623, "learning_rate": 1.652504884720594e-05, "loss": 0.1759, "step": 223400 }, { "epoch": 17.45411948457634, "grad_norm": 0.890876829624176, "learning_rate": 1.6523485736615868e-05, "loss": 0.1652, "step": 223500 }, { "epoch": 17.461928934010153, "grad_norm": 0.6937717795372009, "learning_rate": 1.6521922626025794e-05, "loss": 0.163, "step": 223600 }, { "epoch": 17.469738383443968, "grad_norm": 0.827542781829834, "learning_rate": 1.6520359515435716e-05, "loss": 0.176, "step": 223700 }, { "epoch": 17.477547832877782, "grad_norm": 0.9324260354042053, "learning_rate": 1.6518796404845646e-05, "loss": 0.1692, "step": 223800 }, { "epoch": 17.485357282311597, "grad_norm": 0.8106001615524292, "learning_rate": 1.6517233294255572e-05, "loss": 0.1728, "step": 223900 }, { "epoch": 17.49316673174541, "grad_norm": 1.4987694025039673, "learning_rate": 1.6515670183665494e-05, "loss": 0.1747, "step": 224000 }, { "epoch": 17.500976181179226, "grad_norm": 0.955620288848877, "learning_rate": 1.651410707307542e-05, "loss": 0.1772, "step": 224100 }, { "epoch": 17.50878563061304, "grad_norm": 0.8453149199485779, "learning_rate": 1.6512543962485346e-05, "loss": 0.1774, "step": 224200 }, { "epoch": 17.516595080046855, "grad_norm": 0.646578311920166, "learning_rate": 1.6510980851895272e-05, "loss": 0.1662, "step": 224300 }, { "epoch": 17.524404529480673, "grad_norm": 1.0147279500961304, "learning_rate": 1.65094177413052e-05, "loss": 0.1761, "step": 224400 }, { "epoch": 17.532213978914488, "grad_norm": 1.0600621700286865, "learning_rate": 1.6507854630715124e-05, "loss": 0.1601, "step": 224500 }, { "epoch": 17.540023428348302, "grad_norm": 1.0022188425064087, "learning_rate": 1.650629152012505e-05, "loss": 0.1677, "step": 224600 }, { "epoch": 17.547832877782117, "grad_norm": 0.8853134512901306, "learning_rate": 1.6504728409534976e-05, "loss": 0.1757, "step": 224700 }, { "epoch": 17.55564232721593, "grad_norm": 0.8376500606536865, "learning_rate": 1.6503165298944902e-05, "loss": 0.1777, "step": 224800 }, { "epoch": 17.563451776649746, "grad_norm": 1.0630090236663818, "learning_rate": 1.650160218835483e-05, "loss": 0.1755, "step": 224900 }, { "epoch": 17.57126122608356, "grad_norm": 0.9697504639625549, "learning_rate": 1.6500054708870654e-05, "loss": 0.173, "step": 225000 }, { "epoch": 17.579070675517375, "grad_norm": 1.2545424699783325, "learning_rate": 1.649849159828058e-05, "loss": 0.1761, "step": 225100 }, { "epoch": 17.58688012495119, "grad_norm": 0.8433966636657715, "learning_rate": 1.6496928487690503e-05, "loss": 0.1701, "step": 225200 }, { "epoch": 17.594689574385004, "grad_norm": 1.4100017547607422, "learning_rate": 1.6495365377100432e-05, "loss": 0.166, "step": 225300 }, { "epoch": 17.602499023818822, "grad_norm": 1.0186513662338257, "learning_rate": 1.649380226651036e-05, "loss": 0.1715, "step": 225400 }, { "epoch": 17.610308473252637, "grad_norm": 0.8596687912940979, "learning_rate": 1.649223915592028e-05, "loss": 0.1686, "step": 225500 }, { "epoch": 17.61811792268645, "grad_norm": 0.9129221439361572, "learning_rate": 1.649067604533021e-05, "loss": 0.168, "step": 225600 }, { "epoch": 17.625927372120266, "grad_norm": 1.0046827793121338, "learning_rate": 1.6489112934740133e-05, "loss": 0.1796, "step": 225700 }, { "epoch": 17.63373682155408, "grad_norm": 0.8824147582054138, "learning_rate": 1.648754982415006e-05, "loss": 0.1545, "step": 225800 }, { "epoch": 17.641546270987895, "grad_norm": 0.9992631077766418, "learning_rate": 1.6485986713559985e-05, "loss": 0.1662, "step": 225900 }, { "epoch": 17.64935572042171, "grad_norm": 0.931957483291626, "learning_rate": 1.648442360296991e-05, "loss": 0.1631, "step": 226000 }, { "epoch": 17.657165169855524, "grad_norm": 0.8518832325935364, "learning_rate": 1.6482860492379837e-05, "loss": 0.1639, "step": 226100 }, { "epoch": 17.66497461928934, "grad_norm": 0.9512225985527039, "learning_rate": 1.6481297381789763e-05, "loss": 0.1745, "step": 226200 }, { "epoch": 17.672784068723153, "grad_norm": 0.6881412267684937, "learning_rate": 1.647973427119969e-05, "loss": 0.1649, "step": 226300 }, { "epoch": 17.68059351815697, "grad_norm": 1.2182663679122925, "learning_rate": 1.6478171160609615e-05, "loss": 0.1712, "step": 226400 }, { "epoch": 17.688402967590786, "grad_norm": 0.7335953116416931, "learning_rate": 1.647660805001954e-05, "loss": 0.1709, "step": 226500 }, { "epoch": 17.6962124170246, "grad_norm": 0.7070138454437256, "learning_rate": 1.6475044939429464e-05, "loss": 0.1826, "step": 226600 }, { "epoch": 17.704021866458415, "grad_norm": 1.0415022373199463, "learning_rate": 1.6473481828839393e-05, "loss": 0.1709, "step": 226700 }, { "epoch": 17.71183131589223, "grad_norm": 1.1055572032928467, "learning_rate": 1.647191871824932e-05, "loss": 0.1656, "step": 226800 }, { "epoch": 17.719640765326044, "grad_norm": 0.989414393901825, "learning_rate": 1.6470355607659242e-05, "loss": 0.1657, "step": 226900 }, { "epoch": 17.72745021475986, "grad_norm": 1.3034842014312744, "learning_rate": 1.6468808128175068e-05, "loss": 0.159, "step": 227000 }, { "epoch": 17.735259664193674, "grad_norm": 0.7893176078796387, "learning_rate": 1.6467245017584997e-05, "loss": 0.1735, "step": 227100 }, { "epoch": 17.743069113627488, "grad_norm": 0.9313830733299255, "learning_rate": 1.646568190699492e-05, "loss": 0.1645, "step": 227200 }, { "epoch": 17.750878563061303, "grad_norm": 1.0136443376541138, "learning_rate": 1.6464118796404846e-05, "loss": 0.167, "step": 227300 }, { "epoch": 17.75868801249512, "grad_norm": 0.9547396898269653, "learning_rate": 1.6462555685814772e-05, "loss": 0.1681, "step": 227400 }, { "epoch": 17.766497461928935, "grad_norm": 1.1903905868530273, "learning_rate": 1.6460992575224698e-05, "loss": 0.1724, "step": 227500 }, { "epoch": 17.77430691136275, "grad_norm": 0.9577434659004211, "learning_rate": 1.6459429464634624e-05, "loss": 0.1614, "step": 227600 }, { "epoch": 17.782116360796564, "grad_norm": 0.9196197390556335, "learning_rate": 1.645786635404455e-05, "loss": 0.1749, "step": 227700 }, { "epoch": 17.78992581023038, "grad_norm": 0.8257341980934143, "learning_rate": 1.6456303243454476e-05, "loss": 0.1644, "step": 227800 }, { "epoch": 17.797735259664194, "grad_norm": 1.012330412864685, "learning_rate": 1.6454740132864402e-05, "loss": 0.1614, "step": 227900 }, { "epoch": 17.805544709098008, "grad_norm": 0.7949550747871399, "learning_rate": 1.6453177022274328e-05, "loss": 0.1682, "step": 228000 }, { "epoch": 17.813354158531823, "grad_norm": 0.9207527041435242, "learning_rate": 1.645161391168425e-05, "loss": 0.1636, "step": 228100 }, { "epoch": 17.821163607965637, "grad_norm": 0.742048978805542, "learning_rate": 1.645005080109418e-05, "loss": 0.1577, "step": 228200 }, { "epoch": 17.828973057399452, "grad_norm": 1.159746766090393, "learning_rate": 1.6448487690504106e-05, "loss": 0.171, "step": 228300 }, { "epoch": 17.83678250683327, "grad_norm": 0.996147871017456, "learning_rate": 1.644692457991403e-05, "loss": 0.1661, "step": 228400 }, { "epoch": 17.844591956267084, "grad_norm": 0.8791880011558533, "learning_rate": 1.6445361469323955e-05, "loss": 0.1665, "step": 228500 }, { "epoch": 17.8524014057009, "grad_norm": 0.823704183101654, "learning_rate": 1.6443798358733884e-05, "loss": 0.1632, "step": 228600 }, { "epoch": 17.860210855134714, "grad_norm": 0.6628607511520386, "learning_rate": 1.6442235248143807e-05, "loss": 0.1698, "step": 228700 }, { "epoch": 17.868020304568528, "grad_norm": 0.931252658367157, "learning_rate": 1.6440672137553733e-05, "loss": 0.1741, "step": 228800 }, { "epoch": 17.875829754002343, "grad_norm": 0.8795222640037537, "learning_rate": 1.643910902696366e-05, "loss": 0.1704, "step": 228900 }, { "epoch": 17.883639203436157, "grad_norm": 0.7636140584945679, "learning_rate": 1.6437545916373585e-05, "loss": 0.1631, "step": 229000 }, { "epoch": 17.891448652869972, "grad_norm": 0.8694945573806763, "learning_rate": 1.643598280578351e-05, "loss": 0.1684, "step": 229100 }, { "epoch": 17.899258102303786, "grad_norm": 0.8268353343009949, "learning_rate": 1.6434435326299337e-05, "loss": 0.1762, "step": 229200 }, { "epoch": 17.9070675517376, "grad_norm": 0.9662846326828003, "learning_rate": 1.6432872215709263e-05, "loss": 0.1773, "step": 229300 }, { "epoch": 17.91487700117142, "grad_norm": 0.6733610033988953, "learning_rate": 1.643130910511919e-05, "loss": 0.1744, "step": 229400 }, { "epoch": 17.922686450605234, "grad_norm": 0.9901676177978516, "learning_rate": 1.6429745994529115e-05, "loss": 0.1633, "step": 229500 }, { "epoch": 17.93049590003905, "grad_norm": 0.8217541575431824, "learning_rate": 1.642818288393904e-05, "loss": 0.1585, "step": 229600 }, { "epoch": 17.938305349472863, "grad_norm": 1.0147716999053955, "learning_rate": 1.6426619773348967e-05, "loss": 0.166, "step": 229700 }, { "epoch": 17.946114798906677, "grad_norm": 0.9158501029014587, "learning_rate": 1.6425056662758893e-05, "loss": 0.1677, "step": 229800 }, { "epoch": 17.953924248340492, "grad_norm": 0.7738738059997559, "learning_rate": 1.6423493552168815e-05, "loss": 0.1697, "step": 229900 }, { "epoch": 17.961733697774307, "grad_norm": 0.766834557056427, "learning_rate": 1.6421930441578745e-05, "loss": 0.1657, "step": 230000 }, { "epoch": 17.96954314720812, "grad_norm": 1.1993781328201294, "learning_rate": 1.642036733098867e-05, "loss": 0.1699, "step": 230100 }, { "epoch": 17.977352596641936, "grad_norm": 0.7733214497566223, "learning_rate": 1.6418804220398593e-05, "loss": 0.1666, "step": 230200 }, { "epoch": 17.98516204607575, "grad_norm": 0.8509901762008667, "learning_rate": 1.641724110980852e-05, "loss": 0.1682, "step": 230300 }, { "epoch": 17.992971495509565, "grad_norm": 0.775961697101593, "learning_rate": 1.6415677999218445e-05, "loss": 0.1595, "step": 230400 }, { "epoch": 18.000780944943383, "grad_norm": 0.824718177318573, "learning_rate": 1.641411488862837e-05, "loss": 0.1674, "step": 230500 }, { "epoch": 18.008590394377197, "grad_norm": 0.5882493257522583, "learning_rate": 1.6412551778038297e-05, "loss": 0.1658, "step": 230600 }, { "epoch": 18.016399843811012, "grad_norm": 0.9169065356254578, "learning_rate": 1.6410988667448223e-05, "loss": 0.1612, "step": 230700 }, { "epoch": 18.024209293244827, "grad_norm": 0.9664234519004822, "learning_rate": 1.640942555685815e-05, "loss": 0.1662, "step": 230800 }, { "epoch": 18.03201874267864, "grad_norm": 0.6878706812858582, "learning_rate": 1.6407862446268075e-05, "loss": 0.1701, "step": 230900 }, { "epoch": 18.039828192112456, "grad_norm": 1.0365025997161865, "learning_rate": 1.6406299335678e-05, "loss": 0.1694, "step": 231000 }, { "epoch": 18.04763764154627, "grad_norm": 0.8461370468139648, "learning_rate": 1.6404736225087927e-05, "loss": 0.1723, "step": 231100 }, { "epoch": 18.055447090980085, "grad_norm": 0.8166918158531189, "learning_rate": 1.6403188745603753e-05, "loss": 0.1703, "step": 231200 }, { "epoch": 18.0632565404139, "grad_norm": 0.9236775636672974, "learning_rate": 1.640162563501368e-05, "loss": 0.1714, "step": 231300 }, { "epoch": 18.071065989847718, "grad_norm": 1.0226106643676758, "learning_rate": 1.6400062524423602e-05, "loss": 0.171, "step": 231400 }, { "epoch": 18.078875439281532, "grad_norm": 0.6999830603599548, "learning_rate": 1.639849941383353e-05, "loss": 0.1631, "step": 231500 }, { "epoch": 18.086684888715347, "grad_norm": 0.7705897092819214, "learning_rate": 1.6396936303243457e-05, "loss": 0.1755, "step": 231600 }, { "epoch": 18.09449433814916, "grad_norm": 0.8058556914329529, "learning_rate": 1.639537319265338e-05, "loss": 0.1762, "step": 231700 }, { "epoch": 18.102303787582976, "grad_norm": 1.0926358699798584, "learning_rate": 1.639381008206331e-05, "loss": 0.1622, "step": 231800 }, { "epoch": 18.11011323701679, "grad_norm": 0.9904764294624329, "learning_rate": 1.6392246971473232e-05, "loss": 0.1731, "step": 231900 }, { "epoch": 18.117922686450605, "grad_norm": 0.8789817690849304, "learning_rate": 1.6390683860883158e-05, "loss": 0.1664, "step": 232000 }, { "epoch": 18.12573213588442, "grad_norm": 0.8609523773193359, "learning_rate": 1.6389120750293084e-05, "loss": 0.1774, "step": 232100 }, { "epoch": 18.133541585318234, "grad_norm": 0.929350733757019, "learning_rate": 1.638755763970301e-05, "loss": 0.1655, "step": 232200 }, { "epoch": 18.14135103475205, "grad_norm": 1.0220617055892944, "learning_rate": 1.6385994529112936e-05, "loss": 0.1588, "step": 232300 }, { "epoch": 18.149160484185863, "grad_norm": 0.9037083983421326, "learning_rate": 1.6384431418522862e-05, "loss": 0.1674, "step": 232400 }, { "epoch": 18.15696993361968, "grad_norm": 1.134727120399475, "learning_rate": 1.6382868307932788e-05, "loss": 0.1771, "step": 232500 }, { "epoch": 18.164779383053496, "grad_norm": 0.8640575408935547, "learning_rate": 1.6381305197342714e-05, "loss": 0.1626, "step": 232600 }, { "epoch": 18.17258883248731, "grad_norm": 1.0782209634780884, "learning_rate": 1.637974208675264e-05, "loss": 0.1618, "step": 232700 }, { "epoch": 18.180398281921125, "grad_norm": 0.8964031338691711, "learning_rate": 1.6378178976162563e-05, "loss": 0.163, "step": 232800 }, { "epoch": 18.18820773135494, "grad_norm": 0.8061158657073975, "learning_rate": 1.6376615865572492e-05, "loss": 0.1672, "step": 232900 }, { "epoch": 18.196017180788754, "grad_norm": 0.7686555981636047, "learning_rate": 1.6375052754982418e-05, "loss": 0.1631, "step": 233000 }, { "epoch": 18.20382663022257, "grad_norm": 0.8305268883705139, "learning_rate": 1.637348964439234e-05, "loss": 0.1595, "step": 233100 }, { "epoch": 18.211636079656383, "grad_norm": 0.7960754036903381, "learning_rate": 1.6371942164908167e-05, "loss": 0.1761, "step": 233200 }, { "epoch": 18.219445529090198, "grad_norm": 0.9720467329025269, "learning_rate": 1.6370379054318096e-05, "loss": 0.1647, "step": 233300 }, { "epoch": 18.227254978524012, "grad_norm": 0.7741464376449585, "learning_rate": 1.636881594372802e-05, "loss": 0.1609, "step": 233400 }, { "epoch": 18.23506442795783, "grad_norm": 0.7279603481292725, "learning_rate": 1.6367252833137945e-05, "loss": 0.1622, "step": 233500 }, { "epoch": 18.242873877391645, "grad_norm": 1.0035083293914795, "learning_rate": 1.636568972254787e-05, "loss": 0.1674, "step": 233600 }, { "epoch": 18.25068332682546, "grad_norm": 0.6845473647117615, "learning_rate": 1.6364126611957797e-05, "loss": 0.176, "step": 233700 }, { "epoch": 18.258492776259274, "grad_norm": 0.8481450080871582, "learning_rate": 1.6362563501367723e-05, "loss": 0.1642, "step": 233800 }, { "epoch": 18.26630222569309, "grad_norm": 0.8318843245506287, "learning_rate": 1.636100039077765e-05, "loss": 0.1614, "step": 233900 }, { "epoch": 18.274111675126903, "grad_norm": 0.7613986134529114, "learning_rate": 1.6359437280187575e-05, "loss": 0.1693, "step": 234000 }, { "epoch": 18.281921124560718, "grad_norm": 0.8641628623008728, "learning_rate": 1.63578741695975e-05, "loss": 0.1605, "step": 234100 }, { "epoch": 18.289730573994532, "grad_norm": 0.7250987887382507, "learning_rate": 1.6356311059007427e-05, "loss": 0.157, "step": 234200 }, { "epoch": 18.297540023428347, "grad_norm": 0.6812145709991455, "learning_rate": 1.635474794841735e-05, "loss": 0.1586, "step": 234300 }, { "epoch": 18.30534947286216, "grad_norm": 0.8246647119522095, "learning_rate": 1.635318483782728e-05, "loss": 0.1649, "step": 234400 }, { "epoch": 18.31315892229598, "grad_norm": 0.8091587424278259, "learning_rate": 1.6351621727237205e-05, "loss": 0.1661, "step": 234500 }, { "epoch": 18.320968371729794, "grad_norm": 0.8854368329048157, "learning_rate": 1.6350058616647127e-05, "loss": 0.1656, "step": 234600 }, { "epoch": 18.32877782116361, "grad_norm": 0.8019761443138123, "learning_rate": 1.6348495506057053e-05, "loss": 0.1645, "step": 234700 }, { "epoch": 18.336587270597423, "grad_norm": 0.9290599226951599, "learning_rate": 1.6346932395466983e-05, "loss": 0.167, "step": 234800 }, { "epoch": 18.344396720031238, "grad_norm": 1.1367971897125244, "learning_rate": 1.6345369284876905e-05, "loss": 0.1552, "step": 234900 }, { "epoch": 18.352206169465052, "grad_norm": 0.8118088841438293, "learning_rate": 1.634380617428683e-05, "loss": 0.1651, "step": 235000 }, { "epoch": 18.360015618898867, "grad_norm": 1.0581218004226685, "learning_rate": 1.6342243063696757e-05, "loss": 0.1642, "step": 235100 }, { "epoch": 18.36782506833268, "grad_norm": 0.7852098941802979, "learning_rate": 1.6340695584212583e-05, "loss": 0.1621, "step": 235200 }, { "epoch": 18.375634517766496, "grad_norm": 0.8468132019042969, "learning_rate": 1.633913247362251e-05, "loss": 0.1517, "step": 235300 }, { "epoch": 18.38344396720031, "grad_norm": 0.9250136613845825, "learning_rate": 1.6337569363032435e-05, "loss": 0.1654, "step": 235400 }, { "epoch": 18.39125341663413, "grad_norm": 0.8608556389808655, "learning_rate": 1.633600625244236e-05, "loss": 0.1651, "step": 235500 }, { "epoch": 18.399062866067943, "grad_norm": 0.9423208236694336, "learning_rate": 1.6334443141852287e-05, "loss": 0.1694, "step": 235600 }, { "epoch": 18.406872315501758, "grad_norm": 1.0843865871429443, "learning_rate": 1.6332880031262213e-05, "loss": 0.1657, "step": 235700 }, { "epoch": 18.414681764935573, "grad_norm": 0.6090073585510254, "learning_rate": 1.633131692067214e-05, "loss": 0.1601, "step": 235800 }, { "epoch": 18.422491214369387, "grad_norm": 0.8230893015861511, "learning_rate": 1.6329753810082065e-05, "loss": 0.1647, "step": 235900 }, { "epoch": 18.4303006638032, "grad_norm": 0.7741209268569946, "learning_rate": 1.632819069949199e-05, "loss": 0.1622, "step": 236000 }, { "epoch": 18.438110113237016, "grad_norm": 0.8756573796272278, "learning_rate": 1.6326627588901914e-05, "loss": 0.1655, "step": 236100 }, { "epoch": 18.44591956267083, "grad_norm": 1.0780562162399292, "learning_rate": 1.6325064478311844e-05, "loss": 0.1695, "step": 236200 }, { "epoch": 18.453729012104645, "grad_norm": 1.0400506258010864, "learning_rate": 1.632350136772177e-05, "loss": 0.166, "step": 236300 }, { "epoch": 18.46153846153846, "grad_norm": 1.2005187273025513, "learning_rate": 1.6321938257131692e-05, "loss": 0.1739, "step": 236400 }, { "epoch": 18.469347910972278, "grad_norm": 0.978459358215332, "learning_rate": 1.6320375146541618e-05, "loss": 0.1632, "step": 236500 }, { "epoch": 18.477157360406093, "grad_norm": 0.8813901543617249, "learning_rate": 1.6318812035951544e-05, "loss": 0.167, "step": 236600 }, { "epoch": 18.484966809839907, "grad_norm": 0.8949891924858093, "learning_rate": 1.631724892536147e-05, "loss": 0.1684, "step": 236700 }, { "epoch": 18.49277625927372, "grad_norm": 0.8135596513748169, "learning_rate": 1.6315685814771396e-05, "loss": 0.159, "step": 236800 }, { "epoch": 18.500585708707536, "grad_norm": 0.739747166633606, "learning_rate": 1.6314122704181322e-05, "loss": 0.1664, "step": 236900 }, { "epoch": 18.50839515814135, "grad_norm": 0.9250356554985046, "learning_rate": 1.6312559593591248e-05, "loss": 0.1581, "step": 237000 }, { "epoch": 18.516204607575165, "grad_norm": 1.1903828382492065, "learning_rate": 1.6310996483001174e-05, "loss": 0.16, "step": 237100 }, { "epoch": 18.52401405700898, "grad_norm": 0.8911426663398743, "learning_rate": 1.63094333724111e-05, "loss": 0.1595, "step": 237200 }, { "epoch": 18.531823506442795, "grad_norm": 0.7672592401504517, "learning_rate": 1.6307885892926926e-05, "loss": 0.1678, "step": 237300 }, { "epoch": 18.53963295587661, "grad_norm": 0.8545451760292053, "learning_rate": 1.6306322782336852e-05, "loss": 0.1595, "step": 237400 }, { "epoch": 18.547442405310427, "grad_norm": 1.0210609436035156, "learning_rate": 1.6304759671746778e-05, "loss": 0.1622, "step": 237500 }, { "epoch": 18.55525185474424, "grad_norm": 0.6720797419548035, "learning_rate": 1.63031965611567e-05, "loss": 0.1748, "step": 237600 }, { "epoch": 18.563061304178056, "grad_norm": 1.0264678001403809, "learning_rate": 1.630163345056663e-05, "loss": 0.1624, "step": 237700 }, { "epoch": 18.57087075361187, "grad_norm": 1.1977595090866089, "learning_rate": 1.6300070339976556e-05, "loss": 0.1536, "step": 237800 }, { "epoch": 18.578680203045685, "grad_norm": 0.7712870836257935, "learning_rate": 1.629850722938648e-05, "loss": 0.1679, "step": 237900 }, { "epoch": 18.5864896524795, "grad_norm": 0.8661043643951416, "learning_rate": 1.6296944118796408e-05, "loss": 0.1581, "step": 238000 }, { "epoch": 18.594299101913315, "grad_norm": 0.8155229687690735, "learning_rate": 1.629538100820633e-05, "loss": 0.1616, "step": 238100 }, { "epoch": 18.60210855134713, "grad_norm": 1.1041407585144043, "learning_rate": 1.6293817897616257e-05, "loss": 0.1591, "step": 238200 }, { "epoch": 18.609918000780944, "grad_norm": 0.8246005773544312, "learning_rate": 1.6292254787026183e-05, "loss": 0.1606, "step": 238300 }, { "epoch": 18.61772745021476, "grad_norm": 0.8949695229530334, "learning_rate": 1.629069167643611e-05, "loss": 0.1591, "step": 238400 }, { "epoch": 18.625536899648576, "grad_norm": 1.056926965713501, "learning_rate": 1.6289128565846035e-05, "loss": 0.1557, "step": 238500 }, { "epoch": 18.63334634908239, "grad_norm": 0.9467104077339172, "learning_rate": 1.628756545525596e-05, "loss": 0.1654, "step": 238600 }, { "epoch": 18.641155798516206, "grad_norm": 0.8194491863250732, "learning_rate": 1.6286002344665887e-05, "loss": 0.1629, "step": 238700 }, { "epoch": 18.64896524795002, "grad_norm": 0.8596170544624329, "learning_rate": 1.6284439234075813e-05, "loss": 0.1506, "step": 238800 }, { "epoch": 18.656774697383835, "grad_norm": 0.8728044629096985, "learning_rate": 1.628287612348574e-05, "loss": 0.1677, "step": 238900 }, { "epoch": 18.66458414681765, "grad_norm": 0.7847612500190735, "learning_rate": 1.628131301289566e-05, "loss": 0.1625, "step": 239000 }, { "epoch": 18.672393596251464, "grad_norm": 0.9129346609115601, "learning_rate": 1.627974990230559e-05, "loss": 0.1501, "step": 239100 }, { "epoch": 18.68020304568528, "grad_norm": 0.6726586818695068, "learning_rate": 1.6278186791715517e-05, "loss": 0.1712, "step": 239200 }, { "epoch": 18.688012495119093, "grad_norm": 1.2120012044906616, "learning_rate": 1.6276639312231343e-05, "loss": 0.175, "step": 239300 }, { "epoch": 18.695821944552907, "grad_norm": 0.7493081092834473, "learning_rate": 1.6275076201641266e-05, "loss": 0.1604, "step": 239400 }, { "epoch": 18.703631393986726, "grad_norm": 1.1694607734680176, "learning_rate": 1.6273513091051195e-05, "loss": 0.1605, "step": 239500 }, { "epoch": 18.71144084342054, "grad_norm": 0.7457549571990967, "learning_rate": 1.6271949980461118e-05, "loss": 0.1635, "step": 239600 }, { "epoch": 18.719250292854355, "grad_norm": 1.2835865020751953, "learning_rate": 1.6270386869871044e-05, "loss": 0.1624, "step": 239700 }, { "epoch": 18.72705974228817, "grad_norm": 0.6410244703292847, "learning_rate": 1.626882375928097e-05, "loss": 0.165, "step": 239800 }, { "epoch": 18.734869191721984, "grad_norm": 0.8569579124450684, "learning_rate": 1.6267260648690896e-05, "loss": 0.1606, "step": 239900 }, { "epoch": 18.7426786411558, "grad_norm": 0.994696319103241, "learning_rate": 1.626569753810082e-05, "loss": 0.1603, "step": 240000 }, { "epoch": 18.750488090589613, "grad_norm": 0.7902507185935974, "learning_rate": 1.6264134427510748e-05, "loss": 0.1638, "step": 240100 }, { "epoch": 18.758297540023428, "grad_norm": 0.7056221961975098, "learning_rate": 1.6262571316920674e-05, "loss": 0.162, "step": 240200 }, { "epoch": 18.766106989457242, "grad_norm": 0.7884091138839722, "learning_rate": 1.62610082063306e-05, "loss": 0.1557, "step": 240300 }, { "epoch": 18.773916438891057, "grad_norm": 0.782772958278656, "learning_rate": 1.6259445095740526e-05, "loss": 0.1711, "step": 240400 }, { "epoch": 18.781725888324875, "grad_norm": 0.9223241209983826, "learning_rate": 1.6257881985150448e-05, "loss": 0.1638, "step": 240500 }, { "epoch": 18.78953533775869, "grad_norm": 0.6795259714126587, "learning_rate": 1.6256318874560378e-05, "loss": 0.1583, "step": 240600 }, { "epoch": 18.797344787192504, "grad_norm": 0.917625367641449, "learning_rate": 1.6254755763970304e-05, "loss": 0.1596, "step": 240700 }, { "epoch": 18.80515423662632, "grad_norm": 0.6310639977455139, "learning_rate": 1.6253192653380226e-05, "loss": 0.1593, "step": 240800 }, { "epoch": 18.812963686060133, "grad_norm": 0.914989173412323, "learning_rate": 1.6251629542790152e-05, "loss": 0.1614, "step": 240900 }, { "epoch": 18.820773135493948, "grad_norm": 0.8099808692932129, "learning_rate": 1.6250066432200082e-05, "loss": 0.1567, "step": 241000 }, { "epoch": 18.828582584927762, "grad_norm": 0.812492311000824, "learning_rate": 1.6248503321610004e-05, "loss": 0.1642, "step": 241100 }, { "epoch": 18.836392034361577, "grad_norm": 1.177435040473938, "learning_rate": 1.624694021101993e-05, "loss": 0.172, "step": 241200 }, { "epoch": 18.84420148379539, "grad_norm": 0.8142099380493164, "learning_rate": 1.624539273153576e-05, "loss": 0.1616, "step": 241300 }, { "epoch": 18.852010933229206, "grad_norm": 0.7500884532928467, "learning_rate": 1.6243829620945682e-05, "loss": 0.1631, "step": 241400 }, { "epoch": 18.859820382663024, "grad_norm": 0.670933187007904, "learning_rate": 1.624226651035561e-05, "loss": 0.1509, "step": 241500 }, { "epoch": 18.86762983209684, "grad_norm": 0.8267391920089722, "learning_rate": 1.6240703399765534e-05, "loss": 0.1634, "step": 241600 }, { "epoch": 18.875439281530653, "grad_norm": 0.8886703848838806, "learning_rate": 1.623914028917546e-05, "loss": 0.1546, "step": 241700 }, { "epoch": 18.883248730964468, "grad_norm": 0.6933555603027344, "learning_rate": 1.6237577178585386e-05, "loss": 0.1683, "step": 241800 }, { "epoch": 18.891058180398282, "grad_norm": 1.125935435295105, "learning_rate": 1.6236014067995312e-05, "loss": 0.1701, "step": 241900 }, { "epoch": 18.898867629832097, "grad_norm": 0.9346698522567749, "learning_rate": 1.623445095740524e-05, "loss": 0.1591, "step": 242000 }, { "epoch": 18.90667707926591, "grad_norm": 1.0114465951919556, "learning_rate": 1.6232887846815164e-05, "loss": 0.1556, "step": 242100 }, { "epoch": 18.914486528699726, "grad_norm": 1.0575963258743286, "learning_rate": 1.623132473622509e-05, "loss": 0.1669, "step": 242200 }, { "epoch": 18.92229597813354, "grad_norm": 1.0619617700576782, "learning_rate": 1.6229761625635013e-05, "loss": 0.1678, "step": 242300 }, { "epoch": 18.930105427567355, "grad_norm": 0.8855568170547485, "learning_rate": 1.6228198515044942e-05, "loss": 0.1565, "step": 242400 }, { "epoch": 18.93791487700117, "grad_norm": 0.8218313455581665, "learning_rate": 1.622663540445487e-05, "loss": 0.1583, "step": 242500 }, { "epoch": 18.945724326434988, "grad_norm": 0.7416090965270996, "learning_rate": 1.622507229386479e-05, "loss": 0.164, "step": 242600 }, { "epoch": 18.953533775868802, "grad_norm": 1.218461513519287, "learning_rate": 1.6223509183274717e-05, "loss": 0.1572, "step": 242700 }, { "epoch": 18.961343225302617, "grad_norm": 0.6875521540641785, "learning_rate": 1.6221946072684643e-05, "loss": 0.1533, "step": 242800 }, { "epoch": 18.96915267473643, "grad_norm": 0.8290515542030334, "learning_rate": 1.622038296209457e-05, "loss": 0.1607, "step": 242900 }, { "epoch": 18.976962124170246, "grad_norm": 1.013169765472412, "learning_rate": 1.6218819851504495e-05, "loss": 0.1732, "step": 243000 }, { "epoch": 18.98477157360406, "grad_norm": 0.86373370885849, "learning_rate": 1.621725674091442e-05, "loss": 0.1696, "step": 243100 }, { "epoch": 18.992581023037875, "grad_norm": 0.6753338575363159, "learning_rate": 1.6215693630324347e-05, "loss": 0.1519, "step": 243200 }, { "epoch": 19.00039047247169, "grad_norm": 0.7721323370933533, "learning_rate": 1.6214130519734273e-05, "loss": 0.1585, "step": 243300 }, { "epoch": 19.008199921905504, "grad_norm": 1.0071583986282349, "learning_rate": 1.62125830402501e-05, "loss": 0.1591, "step": 243400 }, { "epoch": 19.016009371339322, "grad_norm": 0.9117600917816162, "learning_rate": 1.6211019929660025e-05, "loss": 0.1604, "step": 243500 }, { "epoch": 19.023818820773137, "grad_norm": 0.8695400953292847, "learning_rate": 1.620945681906995e-05, "loss": 0.1517, "step": 243600 }, { "epoch": 19.03162827020695, "grad_norm": 0.7234436273574829, "learning_rate": 1.6207893708479877e-05, "loss": 0.1581, "step": 243700 }, { "epoch": 19.039437719640766, "grad_norm": 0.7975166440010071, "learning_rate": 1.62063305978898e-05, "loss": 0.1518, "step": 243800 }, { "epoch": 19.04724716907458, "grad_norm": 1.156203031539917, "learning_rate": 1.620476748729973e-05, "loss": 0.1564, "step": 243900 }, { "epoch": 19.055056618508395, "grad_norm": 0.9850621223449707, "learning_rate": 1.6203204376709655e-05, "loss": 0.1479, "step": 244000 }, { "epoch": 19.06286606794221, "grad_norm": 0.7504763007164001, "learning_rate": 1.6201641266119578e-05, "loss": 0.1555, "step": 244100 }, { "epoch": 19.070675517376024, "grad_norm": 0.8326386213302612, "learning_rate": 1.6200078155529507e-05, "loss": 0.154, "step": 244200 }, { "epoch": 19.07848496680984, "grad_norm": 1.0703577995300293, "learning_rate": 1.619851504493943e-05, "loss": 0.1604, "step": 244300 }, { "epoch": 19.086294416243653, "grad_norm": 1.010055422782898, "learning_rate": 1.6196951934349356e-05, "loss": 0.1693, "step": 244400 }, { "epoch": 19.094103865677468, "grad_norm": 0.8883320093154907, "learning_rate": 1.6195388823759282e-05, "loss": 0.1547, "step": 244500 }, { "epoch": 19.101913315111286, "grad_norm": 0.8305409550666809, "learning_rate": 1.6193825713169208e-05, "loss": 0.1679, "step": 244600 }, { "epoch": 19.1097227645451, "grad_norm": 0.7261273860931396, "learning_rate": 1.6192262602579134e-05, "loss": 0.1602, "step": 244700 }, { "epoch": 19.117532213978915, "grad_norm": 0.8075242638587952, "learning_rate": 1.619069949198906e-05, "loss": 0.1612, "step": 244800 }, { "epoch": 19.12534166341273, "grad_norm": 1.0095022916793823, "learning_rate": 1.6189136381398986e-05, "loss": 0.1636, "step": 244900 }, { "epoch": 19.133151112846544, "grad_norm": 0.7797873020172119, "learning_rate": 1.6187573270808912e-05, "loss": 0.1508, "step": 245000 }, { "epoch": 19.14096056228036, "grad_norm": 0.7069320678710938, "learning_rate": 1.6186010160218838e-05, "loss": 0.1545, "step": 245100 }, { "epoch": 19.148770011714173, "grad_norm": 0.7221994996070862, "learning_rate": 1.618444704962876e-05, "loss": 0.1483, "step": 245200 }, { "epoch": 19.156579461147988, "grad_norm": 0.9450590014457703, "learning_rate": 1.618288393903869e-05, "loss": 0.1672, "step": 245300 }, { "epoch": 19.164388910581803, "grad_norm": 0.7580748200416565, "learning_rate": 1.6181336459554516e-05, "loss": 0.1685, "step": 245400 }, { "epoch": 19.172198360015617, "grad_norm": 0.9807697534561157, "learning_rate": 1.6179773348964442e-05, "loss": 0.1631, "step": 245500 }, { "epoch": 19.180007809449435, "grad_norm": 0.7284419536590576, "learning_rate": 1.6178210238374364e-05, "loss": 0.1618, "step": 245600 }, { "epoch": 19.18781725888325, "grad_norm": 1.0209381580352783, "learning_rate": 1.6176647127784294e-05, "loss": 0.1584, "step": 245700 }, { "epoch": 19.195626708317064, "grad_norm": 0.763217031955719, "learning_rate": 1.6175084017194216e-05, "loss": 0.1601, "step": 245800 }, { "epoch": 19.20343615775088, "grad_norm": 0.7209915518760681, "learning_rate": 1.6173520906604142e-05, "loss": 0.1591, "step": 245900 }, { "epoch": 19.211245607184694, "grad_norm": 0.8466573357582092, "learning_rate": 1.617195779601407e-05, "loss": 0.161, "step": 246000 }, { "epoch": 19.219055056618508, "grad_norm": 0.739590048789978, "learning_rate": 1.6170394685423995e-05, "loss": 0.1595, "step": 246100 }, { "epoch": 19.226864506052323, "grad_norm": 0.8309229612350464, "learning_rate": 1.616883157483392e-05, "loss": 0.1539, "step": 246200 }, { "epoch": 19.234673955486137, "grad_norm": 0.9524125456809998, "learning_rate": 1.6167268464243847e-05, "loss": 0.1574, "step": 246300 }, { "epoch": 19.242483404919952, "grad_norm": 0.79534512758255, "learning_rate": 1.6165705353653773e-05, "loss": 0.1537, "step": 246400 }, { "epoch": 19.250292854353766, "grad_norm": 0.698271632194519, "learning_rate": 1.61641422430637e-05, "loss": 0.1624, "step": 246500 }, { "epoch": 19.258102303787584, "grad_norm": 1.0599291324615479, "learning_rate": 1.6162579132473625e-05, "loss": 0.1646, "step": 246600 }, { "epoch": 19.2659117532214, "grad_norm": 0.7981654405593872, "learning_rate": 1.6161016021883547e-05, "loss": 0.1572, "step": 246700 }, { "epoch": 19.273721202655214, "grad_norm": 1.2136812210083008, "learning_rate": 1.6159452911293477e-05, "loss": 0.1589, "step": 246800 }, { "epoch": 19.281530652089028, "grad_norm": 0.8294757604598999, "learning_rate": 1.6157889800703403e-05, "loss": 0.1539, "step": 246900 }, { "epoch": 19.289340101522843, "grad_norm": 0.8389180898666382, "learning_rate": 1.6156326690113325e-05, "loss": 0.1599, "step": 247000 }, { "epoch": 19.297149550956657, "grad_norm": 0.7896692752838135, "learning_rate": 1.615476357952325e-05, "loss": 0.1527, "step": 247100 }, { "epoch": 19.304959000390472, "grad_norm": 0.541652262210846, "learning_rate": 1.615320046893318e-05, "loss": 0.155, "step": 247200 }, { "epoch": 19.312768449824286, "grad_norm": 0.7445521354675293, "learning_rate": 1.6151637358343103e-05, "loss": 0.163, "step": 247300 }, { "epoch": 19.3205778992581, "grad_norm": 0.8510544896125793, "learning_rate": 1.615007424775303e-05, "loss": 0.1594, "step": 247400 }, { "epoch": 19.328387348691916, "grad_norm": 0.760508120059967, "learning_rate": 1.614852676826886e-05, "loss": 0.1582, "step": 247500 }, { "epoch": 19.336196798125734, "grad_norm": 0.9797661304473877, "learning_rate": 1.614696365767878e-05, "loss": 0.1573, "step": 247600 }, { "epoch": 19.34400624755955, "grad_norm": 1.1147596836090088, "learning_rate": 1.6145400547088707e-05, "loss": 0.1613, "step": 247700 }, { "epoch": 19.351815696993363, "grad_norm": 0.8399510979652405, "learning_rate": 1.6143837436498633e-05, "loss": 0.1545, "step": 247800 }, { "epoch": 19.359625146427177, "grad_norm": 0.9175117015838623, "learning_rate": 1.614227432590856e-05, "loss": 0.1512, "step": 247900 }, { "epoch": 19.367434595860992, "grad_norm": 0.8806548714637756, "learning_rate": 1.6140711215318485e-05, "loss": 0.15, "step": 248000 }, { "epoch": 19.375244045294806, "grad_norm": 0.8820730447769165, "learning_rate": 1.613914810472841e-05, "loss": 0.1625, "step": 248100 }, { "epoch": 19.38305349472862, "grad_norm": 0.5434184670448303, "learning_rate": 1.6137584994138334e-05, "loss": 0.1545, "step": 248200 }, { "epoch": 19.390862944162436, "grad_norm": 1.0711945295333862, "learning_rate": 1.6136021883548263e-05, "loss": 0.1525, "step": 248300 }, { "epoch": 19.39867239359625, "grad_norm": 0.8247241377830505, "learning_rate": 1.613445877295819e-05, "loss": 0.15, "step": 248400 }, { "epoch": 19.406481843030065, "grad_norm": 0.6772851943969727, "learning_rate": 1.6132895662368112e-05, "loss": 0.1561, "step": 248500 }, { "epoch": 19.414291292463883, "grad_norm": 1.05992591381073, "learning_rate": 1.613133255177804e-05, "loss": 0.1478, "step": 248600 }, { "epoch": 19.422100741897697, "grad_norm": 0.8491674065589905, "learning_rate": 1.6129769441187967e-05, "loss": 0.1558, "step": 248700 }, { "epoch": 19.429910191331512, "grad_norm": 0.7933894395828247, "learning_rate": 1.612820633059789e-05, "loss": 0.158, "step": 248800 }, { "epoch": 19.437719640765327, "grad_norm": 0.8349812030792236, "learning_rate": 1.6126643220007816e-05, "loss": 0.158, "step": 248900 }, { "epoch": 19.44552909019914, "grad_norm": 1.0554237365722656, "learning_rate": 1.6125080109417742e-05, "loss": 0.1579, "step": 249000 }, { "epoch": 19.453338539632956, "grad_norm": 0.6732720136642456, "learning_rate": 1.6123516998827668e-05, "loss": 0.1521, "step": 249100 }, { "epoch": 19.46114798906677, "grad_norm": 0.8966999053955078, "learning_rate": 1.6121953888237594e-05, "loss": 0.1633, "step": 249200 }, { "epoch": 19.468957438500585, "grad_norm": 1.0872215032577515, "learning_rate": 1.612039077764752e-05, "loss": 0.1567, "step": 249300 }, { "epoch": 19.4767668879344, "grad_norm": 0.893544614315033, "learning_rate": 1.6118827667057446e-05, "loss": 0.1555, "step": 249400 }, { "epoch": 19.484576337368214, "grad_norm": 1.0895719528198242, "learning_rate": 1.6117280187573272e-05, "loss": 0.164, "step": 249500 }, { "epoch": 19.492385786802032, "grad_norm": 0.6664323210716248, "learning_rate": 1.6115717076983198e-05, "loss": 0.1569, "step": 249600 }, { "epoch": 19.500195236235847, "grad_norm": 0.7812830805778503, "learning_rate": 1.6114153966393124e-05, "loss": 0.153, "step": 249700 }, { "epoch": 19.50800468566966, "grad_norm": 0.8430187702178955, "learning_rate": 1.611259085580305e-05, "loss": 0.1612, "step": 249800 }, { "epoch": 19.515814135103476, "grad_norm": 0.997885525226593, "learning_rate": 1.6111027745212976e-05, "loss": 0.1557, "step": 249900 }, { "epoch": 19.52362358453729, "grad_norm": 0.9310106635093689, "learning_rate": 1.61094646346229e-05, "loss": 0.1623, "step": 250000 }, { "epoch": 19.531433033971105, "grad_norm": 0.7472857236862183, "learning_rate": 1.6107901524032828e-05, "loss": 0.1553, "step": 250100 }, { "epoch": 19.53924248340492, "grad_norm": 0.7143159508705139, "learning_rate": 1.6106338413442754e-05, "loss": 0.1578, "step": 250200 }, { "epoch": 19.547051932838734, "grad_norm": 0.7594248056411743, "learning_rate": 1.6104775302852677e-05, "loss": 0.1588, "step": 250300 }, { "epoch": 19.55486138227255, "grad_norm": 0.8787967562675476, "learning_rate": 1.6103212192262603e-05, "loss": 0.1592, "step": 250400 }, { "epoch": 19.562670831706363, "grad_norm": 0.9230839610099792, "learning_rate": 1.610164908167253e-05, "loss": 0.1543, "step": 250500 }, { "epoch": 19.57048028114018, "grad_norm": 0.9485085606575012, "learning_rate": 1.6100085971082455e-05, "loss": 0.1472, "step": 250600 }, { "epoch": 19.578289730573996, "grad_norm": 0.8526518940925598, "learning_rate": 1.609852286049238e-05, "loss": 0.1576, "step": 250700 }, { "epoch": 19.58609918000781, "grad_norm": 0.8549910187721252, "learning_rate": 1.6096959749902307e-05, "loss": 0.1556, "step": 250800 }, { "epoch": 19.593908629441625, "grad_norm": 1.0488536357879639, "learning_rate": 1.6095396639312233e-05, "loss": 0.1595, "step": 250900 }, { "epoch": 19.60171807887544, "grad_norm": 1.1000605821609497, "learning_rate": 1.609383352872216e-05, "loss": 0.1479, "step": 251000 }, { "epoch": 19.609527528309254, "grad_norm": 0.8705799579620361, "learning_rate": 1.6092270418132085e-05, "loss": 0.1571, "step": 251100 }, { "epoch": 19.61733697774307, "grad_norm": 0.9906345009803772, "learning_rate": 1.609070730754201e-05, "loss": 0.1566, "step": 251200 }, { "epoch": 19.625146427176883, "grad_norm": 0.9738631248474121, "learning_rate": 1.6089144196951937e-05, "loss": 0.1615, "step": 251300 }, { "epoch": 19.632955876610698, "grad_norm": 0.7126808762550354, "learning_rate": 1.608758108636186e-05, "loss": 0.1561, "step": 251400 }, { "epoch": 19.640765326044512, "grad_norm": 0.8045843243598938, "learning_rate": 1.608603360687769e-05, "loss": 0.1612, "step": 251500 }, { "epoch": 19.64857477547833, "grad_norm": 0.812263011932373, "learning_rate": 1.6084470496287615e-05, "loss": 0.1588, "step": 251600 }, { "epoch": 19.656384224912145, "grad_norm": 0.9213880300521851, "learning_rate": 1.608290738569754e-05, "loss": 0.1548, "step": 251700 }, { "epoch": 19.66419367434596, "grad_norm": 1.1556429862976074, "learning_rate": 1.6081344275107463e-05, "loss": 0.1539, "step": 251800 }, { "epoch": 19.672003123779774, "grad_norm": 0.8295390605926514, "learning_rate": 1.6079781164517393e-05, "loss": 0.1552, "step": 251900 }, { "epoch": 19.67981257321359, "grad_norm": 1.0561972856521606, "learning_rate": 1.6078218053927315e-05, "loss": 0.1497, "step": 252000 }, { "epoch": 19.687622022647403, "grad_norm": 0.6519757509231567, "learning_rate": 1.607665494333724e-05, "loss": 0.1477, "step": 252100 }, { "epoch": 19.695431472081218, "grad_norm": 0.8574190735816956, "learning_rate": 1.6075091832747167e-05, "loss": 0.1609, "step": 252200 }, { "epoch": 19.703240921515032, "grad_norm": 0.885389506816864, "learning_rate": 1.6073528722157093e-05, "loss": 0.1584, "step": 252300 }, { "epoch": 19.711050370948847, "grad_norm": 0.9630519151687622, "learning_rate": 1.607196561156702e-05, "loss": 0.1517, "step": 252400 }, { "epoch": 19.71885982038266, "grad_norm": 0.7270082235336304, "learning_rate": 1.6070402500976945e-05, "loss": 0.1575, "step": 252500 }, { "epoch": 19.72666926981648, "grad_norm": 0.999950110912323, "learning_rate": 1.606883939038687e-05, "loss": 0.1598, "step": 252600 }, { "epoch": 19.734478719250294, "grad_norm": 0.7339434623718262, "learning_rate": 1.6067276279796797e-05, "loss": 0.1548, "step": 252700 }, { "epoch": 19.74228816868411, "grad_norm": 0.8402358889579773, "learning_rate": 1.6065713169206723e-05, "loss": 0.1545, "step": 252800 }, { "epoch": 19.750097618117923, "grad_norm": 1.0282187461853027, "learning_rate": 1.6064150058616646e-05, "loss": 0.1547, "step": 252900 }, { "epoch": 19.757907067551738, "grad_norm": 0.903782844543457, "learning_rate": 1.6062586948026575e-05, "loss": 0.1524, "step": 253000 }, { "epoch": 19.765716516985552, "grad_norm": 0.876445472240448, "learning_rate": 1.60610238374365e-05, "loss": 0.1548, "step": 253100 }, { "epoch": 19.773525966419367, "grad_norm": 0.7405554056167603, "learning_rate": 1.6059460726846424e-05, "loss": 0.1598, "step": 253200 }, { "epoch": 19.78133541585318, "grad_norm": 0.8406596183776855, "learning_rate": 1.605789761625635e-05, "loss": 0.151, "step": 253300 }, { "epoch": 19.789144865286996, "grad_norm": 0.8342940211296082, "learning_rate": 1.6056334505666276e-05, "loss": 0.1475, "step": 253400 }, { "epoch": 19.79695431472081, "grad_norm": 1.0519615411758423, "learning_rate": 1.6054771395076202e-05, "loss": 0.1625, "step": 253500 }, { "epoch": 19.80476376415463, "grad_norm": 0.903139054775238, "learning_rate": 1.6053223915592028e-05, "loss": 0.1609, "step": 253600 }, { "epoch": 19.812573213588443, "grad_norm": 1.150200605392456, "learning_rate": 1.6051660805001957e-05, "loss": 0.1601, "step": 253700 }, { "epoch": 19.820382663022258, "grad_norm": 1.1543763875961304, "learning_rate": 1.605009769441188e-05, "loss": 0.1615, "step": 253800 }, { "epoch": 19.828192112456072, "grad_norm": 0.739437997341156, "learning_rate": 1.6048534583821806e-05, "loss": 0.1514, "step": 253900 }, { "epoch": 19.836001561889887, "grad_norm": 1.1290647983551025, "learning_rate": 1.6046971473231732e-05, "loss": 0.1585, "step": 254000 }, { "epoch": 19.8438110113237, "grad_norm": 0.8145610690116882, "learning_rate": 1.6045408362641658e-05, "loss": 0.154, "step": 254100 }, { "epoch": 19.851620460757516, "grad_norm": 1.0215067863464355, "learning_rate": 1.6043845252051584e-05, "loss": 0.15, "step": 254200 }, { "epoch": 19.85942991019133, "grad_norm": 0.885907769203186, "learning_rate": 1.604228214146151e-05, "loss": 0.1463, "step": 254300 }, { "epoch": 19.867239359625145, "grad_norm": 1.0276118516921997, "learning_rate": 1.6040719030871433e-05, "loss": 0.1447, "step": 254400 }, { "epoch": 19.87504880905896, "grad_norm": 0.740425169467926, "learning_rate": 1.6039155920281362e-05, "loss": 0.1547, "step": 254500 }, { "epoch": 19.882858258492774, "grad_norm": 0.740181565284729, "learning_rate": 1.6037592809691288e-05, "loss": 0.1524, "step": 254600 }, { "epoch": 19.890667707926593, "grad_norm": 0.7419953346252441, "learning_rate": 1.603602969910121e-05, "loss": 0.1522, "step": 254700 }, { "epoch": 19.898477157360407, "grad_norm": 0.8773970603942871, "learning_rate": 1.603446658851114e-05, "loss": 0.1537, "step": 254800 }, { "epoch": 19.90628660679422, "grad_norm": 0.9431491494178772, "learning_rate": 1.6032903477921066e-05, "loss": 0.153, "step": 254900 }, { "epoch": 19.914096056228036, "grad_norm": 0.887361466884613, "learning_rate": 1.603134036733099e-05, "loss": 0.152, "step": 255000 }, { "epoch": 19.92190550566185, "grad_norm": 0.9748094081878662, "learning_rate": 1.6029777256740915e-05, "loss": 0.1502, "step": 255100 }, { "epoch": 19.929714955095665, "grad_norm": 0.7755160927772522, "learning_rate": 1.602821414615084e-05, "loss": 0.1517, "step": 255200 }, { "epoch": 19.93752440452948, "grad_norm": 0.7254877090454102, "learning_rate": 1.6026651035560767e-05, "loss": 0.1636, "step": 255300 }, { "epoch": 19.945333853963295, "grad_norm": 0.8369965553283691, "learning_rate": 1.6025087924970693e-05, "loss": 0.1463, "step": 255400 }, { "epoch": 19.95314330339711, "grad_norm": 0.999004602432251, "learning_rate": 1.602352481438062e-05, "loss": 0.1553, "step": 255500 }, { "epoch": 19.960952752830927, "grad_norm": 0.8538402915000916, "learning_rate": 1.6021961703790545e-05, "loss": 0.1498, "step": 255600 }, { "epoch": 19.96876220226474, "grad_norm": 0.8866280913352966, "learning_rate": 1.602041422430637e-05, "loss": 0.1537, "step": 255700 }, { "epoch": 19.976571651698556, "grad_norm": 0.6460148692131042, "learning_rate": 1.6018851113716297e-05, "loss": 0.1558, "step": 255800 }, { "epoch": 19.98438110113237, "grad_norm": 0.8335278630256653, "learning_rate": 1.6017288003126223e-05, "loss": 0.1565, "step": 255900 }, { "epoch": 19.992190550566185, "grad_norm": 0.9254976511001587, "learning_rate": 1.601572489253615e-05, "loss": 0.1594, "step": 256000 }, { "epoch": 20.0, "grad_norm": 0.9401211142539978, "learning_rate": 1.6014161781946075e-05, "loss": 0.1563, "step": 256100 }, { "epoch": 20.007809449433815, "grad_norm": 1.0900503396987915, "learning_rate": 1.6012598671355998e-05, "loss": 0.1537, "step": 256200 }, { "epoch": 20.01561889886763, "grad_norm": 0.7111983895301819, "learning_rate": 1.6011035560765927e-05, "loss": 0.1549, "step": 256300 }, { "epoch": 20.023428348301444, "grad_norm": 0.8802400827407837, "learning_rate": 1.6009472450175853e-05, "loss": 0.1535, "step": 256400 }, { "epoch": 20.03123779773526, "grad_norm": 0.9764599204063416, "learning_rate": 1.6007909339585776e-05, "loss": 0.1481, "step": 256500 }, { "epoch": 20.039047247169073, "grad_norm": 1.1582820415496826, "learning_rate": 1.60063462289957e-05, "loss": 0.1539, "step": 256600 }, { "epoch": 20.04685669660289, "grad_norm": 0.8898832201957703, "learning_rate": 1.6004783118405628e-05, "loss": 0.1494, "step": 256700 }, { "epoch": 20.054666146036705, "grad_norm": 0.9158037900924683, "learning_rate": 1.6003220007815554e-05, "loss": 0.1563, "step": 256800 }, { "epoch": 20.06247559547052, "grad_norm": 0.738610029220581, "learning_rate": 1.600165689722548e-05, "loss": 0.152, "step": 256900 }, { "epoch": 20.070285044904335, "grad_norm": 0.8221123218536377, "learning_rate": 1.6000093786635406e-05, "loss": 0.1594, "step": 257000 }, { "epoch": 20.07809449433815, "grad_norm": 1.2848199605941772, "learning_rate": 1.599853067604533e-05, "loss": 0.1506, "step": 257100 }, { "epoch": 20.085903943771964, "grad_norm": 1.2601646184921265, "learning_rate": 1.5996967565455258e-05, "loss": 0.1597, "step": 257200 }, { "epoch": 20.09371339320578, "grad_norm": 0.9480125308036804, "learning_rate": 1.5995404454865184e-05, "loss": 0.1527, "step": 257300 }, { "epoch": 20.101522842639593, "grad_norm": 0.9821155667304993, "learning_rate": 1.599384134427511e-05, "loss": 0.1552, "step": 257400 }, { "epoch": 20.109332292073407, "grad_norm": 1.0815869569778442, "learning_rate": 1.5992278233685036e-05, "loss": 0.1543, "step": 257500 }, { "epoch": 20.117141741507222, "grad_norm": 1.0338879823684692, "learning_rate": 1.5990715123094958e-05, "loss": 0.1537, "step": 257600 }, { "epoch": 20.12495119094104, "grad_norm": 1.0437591075897217, "learning_rate": 1.5989167643610784e-05, "loss": 0.1526, "step": 257700 }, { "epoch": 20.132760640374855, "grad_norm": 0.6990692019462585, "learning_rate": 1.5987604533020714e-05, "loss": 0.1583, "step": 257800 }, { "epoch": 20.14057008980867, "grad_norm": 0.9085626602172852, "learning_rate": 1.598604142243064e-05, "loss": 0.147, "step": 257900 }, { "epoch": 20.148379539242484, "grad_norm": 0.9155283570289612, "learning_rate": 1.5984478311840562e-05, "loss": 0.1508, "step": 258000 }, { "epoch": 20.1561889886763, "grad_norm": 0.67339026927948, "learning_rate": 1.598291520125049e-05, "loss": 0.1525, "step": 258100 }, { "epoch": 20.163998438110113, "grad_norm": 0.8980534672737122, "learning_rate": 1.5981352090660414e-05, "loss": 0.159, "step": 258200 }, { "epoch": 20.171807887543928, "grad_norm": 0.8323621153831482, "learning_rate": 1.597978898007034e-05, "loss": 0.1569, "step": 258300 }, { "epoch": 20.179617336977742, "grad_norm": 0.8660298585891724, "learning_rate": 1.5978225869480266e-05, "loss": 0.152, "step": 258400 }, { "epoch": 20.187426786411557, "grad_norm": 0.9316799640655518, "learning_rate": 1.5976662758890192e-05, "loss": 0.1665, "step": 258500 }, { "epoch": 20.19523623584537, "grad_norm": 0.7573386430740356, "learning_rate": 1.597509964830012e-05, "loss": 0.1464, "step": 258600 }, { "epoch": 20.20304568527919, "grad_norm": 1.2083278894424438, "learning_rate": 1.5973536537710044e-05, "loss": 0.1512, "step": 258700 }, { "epoch": 20.210855134713004, "grad_norm": 0.9421168565750122, "learning_rate": 1.597197342711997e-05, "loss": 0.1545, "step": 258800 }, { "epoch": 20.21866458414682, "grad_norm": 0.8368399739265442, "learning_rate": 1.5970410316529896e-05, "loss": 0.1436, "step": 258900 }, { "epoch": 20.226474033580633, "grad_norm": 0.8021383881568909, "learning_rate": 1.5968847205939822e-05, "loss": 0.1435, "step": 259000 }, { "epoch": 20.234283483014448, "grad_norm": 0.7664214968681335, "learning_rate": 1.5967284095349745e-05, "loss": 0.1475, "step": 259100 }, { "epoch": 20.242092932448262, "grad_norm": 1.0566761493682861, "learning_rate": 1.5965720984759674e-05, "loss": 0.1577, "step": 259200 }, { "epoch": 20.249902381882077, "grad_norm": 0.7286539077758789, "learning_rate": 1.59641578741696e-05, "loss": 0.1524, "step": 259300 }, { "epoch": 20.25771183131589, "grad_norm": 0.8399373292922974, "learning_rate": 1.5962594763579523e-05, "loss": 0.1542, "step": 259400 }, { "epoch": 20.265521280749706, "grad_norm": 1.2981783151626587, "learning_rate": 1.596103165298945e-05, "loss": 0.1465, "step": 259500 }, { "epoch": 20.27333073018352, "grad_norm": 1.052333950996399, "learning_rate": 1.5959468542399375e-05, "loss": 0.1514, "step": 259600 }, { "epoch": 20.28114017961734, "grad_norm": 1.0850844383239746, "learning_rate": 1.59579210629152e-05, "loss": 0.1449, "step": 259700 }, { "epoch": 20.288949629051153, "grad_norm": 1.1221493482589722, "learning_rate": 1.5956357952325127e-05, "loss": 0.1595, "step": 259800 }, { "epoch": 20.296759078484968, "grad_norm": 0.5353173613548279, "learning_rate": 1.5954794841735056e-05, "loss": 0.1454, "step": 259900 }, { "epoch": 20.304568527918782, "grad_norm": 1.2891056537628174, "learning_rate": 1.595323173114498e-05, "loss": 0.1558, "step": 260000 }, { "epoch": 20.312377977352597, "grad_norm": 0.6218847632408142, "learning_rate": 1.5951668620554905e-05, "loss": 0.1436, "step": 260100 }, { "epoch": 20.32018742678641, "grad_norm": 0.9923516511917114, "learning_rate": 1.595010550996483e-05, "loss": 0.1459, "step": 260200 }, { "epoch": 20.327996876220226, "grad_norm": 0.9337815046310425, "learning_rate": 1.5948542399374757e-05, "loss": 0.1478, "step": 260300 }, { "epoch": 20.33580632565404, "grad_norm": 0.8427096605300903, "learning_rate": 1.5946979288784683e-05, "loss": 0.1454, "step": 260400 }, { "epoch": 20.343615775087855, "grad_norm": 0.8702914118766785, "learning_rate": 1.594541617819461e-05, "loss": 0.1515, "step": 260500 }, { "epoch": 20.35142522452167, "grad_norm": 1.1068722009658813, "learning_rate": 1.594385306760453e-05, "loss": 0.1466, "step": 260600 }, { "epoch": 20.359234673955488, "grad_norm": 0.9992669224739075, "learning_rate": 1.594228995701446e-05, "loss": 0.1517, "step": 260700 }, { "epoch": 20.367044123389302, "grad_norm": 0.8575073480606079, "learning_rate": 1.5940726846424387e-05, "loss": 0.1492, "step": 260800 }, { "epoch": 20.374853572823117, "grad_norm": 0.9805818200111389, "learning_rate": 1.593916373583431e-05, "loss": 0.15, "step": 260900 }, { "epoch": 20.38266302225693, "grad_norm": 0.9054665565490723, "learning_rate": 1.593760062524424e-05, "loss": 0.1435, "step": 261000 }, { "epoch": 20.390472471690746, "grad_norm": 0.788601279258728, "learning_rate": 1.5936037514654165e-05, "loss": 0.1551, "step": 261100 }, { "epoch": 20.39828192112456, "grad_norm": 1.1899691820144653, "learning_rate": 1.5934474404064088e-05, "loss": 0.1545, "step": 261200 }, { "epoch": 20.406091370558375, "grad_norm": 0.8695967793464661, "learning_rate": 1.5932911293474014e-05, "loss": 0.1481, "step": 261300 }, { "epoch": 20.41390081999219, "grad_norm": 0.8515498042106628, "learning_rate": 1.593134818288394e-05, "loss": 0.1495, "step": 261400 }, { "epoch": 20.421710269426004, "grad_norm": 0.8685891628265381, "learning_rate": 1.5929785072293866e-05, "loss": 0.1488, "step": 261500 }, { "epoch": 20.42951971885982, "grad_norm": 0.8837761282920837, "learning_rate": 1.5928221961703792e-05, "loss": 0.1541, "step": 261600 }, { "epoch": 20.437329168293637, "grad_norm": 0.5444580912590027, "learning_rate": 1.5926674482219618e-05, "loss": 0.1462, "step": 261700 }, { "epoch": 20.44513861772745, "grad_norm": 0.7685499787330627, "learning_rate": 1.5925111371629544e-05, "loss": 0.1555, "step": 261800 }, { "epoch": 20.452948067161266, "grad_norm": 0.9541754126548767, "learning_rate": 1.592354826103947e-05, "loss": 0.1511, "step": 261900 }, { "epoch": 20.46075751659508, "grad_norm": 0.8788749575614929, "learning_rate": 1.5921985150449396e-05, "loss": 0.1598, "step": 262000 }, { "epoch": 20.468566966028895, "grad_norm": 0.9367106556892395, "learning_rate": 1.5920422039859322e-05, "loss": 0.1426, "step": 262100 }, { "epoch": 20.47637641546271, "grad_norm": 0.6599664688110352, "learning_rate": 1.5918858929269248e-05, "loss": 0.1476, "step": 262200 }, { "epoch": 20.484185864896524, "grad_norm": 0.9754122495651245, "learning_rate": 1.5917295818679174e-05, "loss": 0.156, "step": 262300 }, { "epoch": 20.49199531433034, "grad_norm": 0.921559751033783, "learning_rate": 1.5915732708089096e-05, "loss": 0.1504, "step": 262400 }, { "epoch": 20.499804763764153, "grad_norm": 0.5617296099662781, "learning_rate": 1.5914169597499026e-05, "loss": 0.1478, "step": 262500 }, { "epoch": 20.507614213197968, "grad_norm": 0.7331305146217346, "learning_rate": 1.5912606486908952e-05, "loss": 0.1414, "step": 262600 }, { "epoch": 20.515423662631786, "grad_norm": 0.9771094918251038, "learning_rate": 1.5911043376318874e-05, "loss": 0.1487, "step": 262700 }, { "epoch": 20.5232331120656, "grad_norm": 0.9460648894309998, "learning_rate": 1.59094802657288e-05, "loss": 0.1441, "step": 262800 }, { "epoch": 20.531042561499415, "grad_norm": 0.7630587220191956, "learning_rate": 1.5907917155138726e-05, "loss": 0.1521, "step": 262900 }, { "epoch": 20.53885201093323, "grad_norm": 1.0937858819961548, "learning_rate": 1.5906354044548652e-05, "loss": 0.1452, "step": 263000 }, { "epoch": 20.546661460367044, "grad_norm": 0.5268203020095825, "learning_rate": 1.590479093395858e-05, "loss": 0.1423, "step": 263100 }, { "epoch": 20.55447090980086, "grad_norm": 0.7118089199066162, "learning_rate": 1.5903227823368505e-05, "loss": 0.1515, "step": 263200 }, { "epoch": 20.562280359234673, "grad_norm": 0.9789298176765442, "learning_rate": 1.590166471277843e-05, "loss": 0.1476, "step": 263300 }, { "epoch": 20.570089808668488, "grad_norm": 0.9407501220703125, "learning_rate": 1.5900101602188357e-05, "loss": 0.1458, "step": 263400 }, { "epoch": 20.577899258102303, "grad_norm": 0.942206859588623, "learning_rate": 1.5898538491598283e-05, "loss": 0.1462, "step": 263500 }, { "epoch": 20.585708707536117, "grad_norm": 0.7750436067581177, "learning_rate": 1.589697538100821e-05, "loss": 0.149, "step": 263600 }, { "epoch": 20.593518156969935, "grad_norm": 1.2150155305862427, "learning_rate": 1.5895412270418135e-05, "loss": 0.1492, "step": 263700 }, { "epoch": 20.60132760640375, "grad_norm": 0.917884349822998, "learning_rate": 1.5893849159828057e-05, "loss": 0.1482, "step": 263800 }, { "epoch": 20.609137055837564, "grad_norm": 0.9767228960990906, "learning_rate": 1.5892286049237983e-05, "loss": 0.1459, "step": 263900 }, { "epoch": 20.61694650527138, "grad_norm": 0.6677699685096741, "learning_rate": 1.5890738569753813e-05, "loss": 0.1505, "step": 264000 }, { "epoch": 20.624755954705194, "grad_norm": 0.9703248739242554, "learning_rate": 1.588917545916374e-05, "loss": 0.1502, "step": 264100 }, { "epoch": 20.632565404139008, "grad_norm": 0.7702843546867371, "learning_rate": 1.588761234857366e-05, "loss": 0.1514, "step": 264200 }, { "epoch": 20.640374853572823, "grad_norm": 0.4829925298690796, "learning_rate": 1.588604923798359e-05, "loss": 0.1551, "step": 264300 }, { "epoch": 20.648184303006637, "grad_norm": 1.015032172203064, "learning_rate": 1.5884486127393513e-05, "loss": 0.1573, "step": 264400 }, { "epoch": 20.65599375244045, "grad_norm": 0.7382018566131592, "learning_rate": 1.588292301680344e-05, "loss": 0.1468, "step": 264500 }, { "epoch": 20.663803201874266, "grad_norm": 0.8831744194030762, "learning_rate": 1.5881359906213365e-05, "loss": 0.1488, "step": 264600 }, { "epoch": 20.671612651308084, "grad_norm": 0.983797013759613, "learning_rate": 1.587979679562329e-05, "loss": 0.149, "step": 264700 }, { "epoch": 20.6794221007419, "grad_norm": 0.9039027094841003, "learning_rate": 1.5878233685033217e-05, "loss": 0.1453, "step": 264800 }, { "epoch": 20.687231550175714, "grad_norm": 0.7768588662147522, "learning_rate": 1.5876670574443143e-05, "loss": 0.1425, "step": 264900 }, { "epoch": 20.695040999609528, "grad_norm": 0.8414096832275391, "learning_rate": 1.587510746385307e-05, "loss": 0.147, "step": 265000 }, { "epoch": 20.702850449043343, "grad_norm": 0.981864869594574, "learning_rate": 1.5873544353262995e-05, "loss": 0.1482, "step": 265100 }, { "epoch": 20.710659898477157, "grad_norm": 0.6834481358528137, "learning_rate": 1.587198124267292e-05, "loss": 0.1507, "step": 265200 }, { "epoch": 20.718469347910972, "grad_norm": 1.1916412115097046, "learning_rate": 1.5870418132082844e-05, "loss": 0.156, "step": 265300 }, { "epoch": 20.726278797344786, "grad_norm": 0.7448387742042542, "learning_rate": 1.5868855021492773e-05, "loss": 0.1463, "step": 265400 }, { "epoch": 20.7340882467786, "grad_norm": 0.8105716705322266, "learning_rate": 1.58672919109027e-05, "loss": 0.1498, "step": 265500 }, { "epoch": 20.741897696212416, "grad_norm": 0.7603853940963745, "learning_rate": 1.5865728800312622e-05, "loss": 0.1453, "step": 265600 }, { "epoch": 20.749707145646234, "grad_norm": 0.9040424227714539, "learning_rate": 1.5864165689722548e-05, "loss": 0.153, "step": 265700 }, { "epoch": 20.757516595080048, "grad_norm": 1.0994298458099365, "learning_rate": 1.5862602579132474e-05, "loss": 0.1534, "step": 265800 }, { "epoch": 20.765326044513863, "grad_norm": 0.6235753297805786, "learning_rate": 1.58610394685424e-05, "loss": 0.1476, "step": 265900 }, { "epoch": 20.773135493947677, "grad_norm": 0.9457157254219055, "learning_rate": 1.5859491989058226e-05, "loss": 0.147, "step": 266000 }, { "epoch": 20.780944943381492, "grad_norm": 1.1702708005905151, "learning_rate": 1.5857928878468155e-05, "loss": 0.1493, "step": 266100 }, { "epoch": 20.788754392815306, "grad_norm": 0.9484367966651917, "learning_rate": 1.5856365767878078e-05, "loss": 0.1423, "step": 266200 }, { "epoch": 20.79656384224912, "grad_norm": 0.8259855508804321, "learning_rate": 1.5854802657288004e-05, "loss": 0.1465, "step": 266300 }, { "epoch": 20.804373291682936, "grad_norm": 0.8354877233505249, "learning_rate": 1.585323954669793e-05, "loss": 0.1403, "step": 266400 }, { "epoch": 20.81218274111675, "grad_norm": 0.9899166226387024, "learning_rate": 1.5851676436107856e-05, "loss": 0.1575, "step": 266500 }, { "epoch": 20.819992190550565, "grad_norm": 1.1090909242630005, "learning_rate": 1.5850113325517782e-05, "loss": 0.1496, "step": 266600 }, { "epoch": 20.82780163998438, "grad_norm": 0.7917970418930054, "learning_rate": 1.5848550214927708e-05, "loss": 0.1525, "step": 266700 }, { "epoch": 20.835611089418197, "grad_norm": 0.7563430666923523, "learning_rate": 1.584698710433763e-05, "loss": 0.1441, "step": 266800 }, { "epoch": 20.843420538852012, "grad_norm": 0.8265455365180969, "learning_rate": 1.584542399374756e-05, "loss": 0.1462, "step": 266900 }, { "epoch": 20.851229988285827, "grad_norm": 0.6629912257194519, "learning_rate": 1.5843860883157486e-05, "loss": 0.1469, "step": 267000 }, { "epoch": 20.85903943771964, "grad_norm": 1.1149176359176636, "learning_rate": 1.584229777256741e-05, "loss": 0.1514, "step": 267100 }, { "epoch": 20.866848887153456, "grad_norm": 0.7931187748908997, "learning_rate": 1.5840734661977338e-05, "loss": 0.1478, "step": 267200 }, { "epoch": 20.87465833658727, "grad_norm": 0.49144962430000305, "learning_rate": 1.5839171551387264e-05, "loss": 0.1559, "step": 267300 }, { "epoch": 20.882467786021085, "grad_norm": 1.0094081163406372, "learning_rate": 1.5837608440797187e-05, "loss": 0.1516, "step": 267400 }, { "epoch": 20.8902772354549, "grad_norm": 0.905548632144928, "learning_rate": 1.5836045330207113e-05, "loss": 0.1448, "step": 267500 }, { "epoch": 20.898086684888714, "grad_norm": 0.7649174332618713, "learning_rate": 1.583448221961704e-05, "loss": 0.1521, "step": 267600 }, { "epoch": 20.905896134322532, "grad_norm": 0.6280129551887512, "learning_rate": 1.5832919109026965e-05, "loss": 0.1458, "step": 267700 }, { "epoch": 20.913705583756347, "grad_norm": 0.8115954995155334, "learning_rate": 1.583135599843689e-05, "loss": 0.1587, "step": 267800 }, { "epoch": 20.92151503319016, "grad_norm": 1.0527704954147339, "learning_rate": 1.5829792887846817e-05, "loss": 0.1528, "step": 267900 }, { "epoch": 20.929324482623976, "grad_norm": 0.943759024143219, "learning_rate": 1.5828245408362643e-05, "loss": 0.1505, "step": 268000 }, { "epoch": 20.93713393205779, "grad_norm": 0.8028054237365723, "learning_rate": 1.582668229777257e-05, "loss": 0.1493, "step": 268100 }, { "epoch": 20.944943381491605, "grad_norm": 0.9871835708618164, "learning_rate": 1.5825119187182495e-05, "loss": 0.1489, "step": 268200 }, { "epoch": 20.95275283092542, "grad_norm": 0.9000875353813171, "learning_rate": 1.582355607659242e-05, "loss": 0.1488, "step": 268300 }, { "epoch": 20.960562280359234, "grad_norm": 0.8018443584442139, "learning_rate": 1.5821992966002347e-05, "loss": 0.1485, "step": 268400 }, { "epoch": 20.96837172979305, "grad_norm": 1.2284111976623535, "learning_rate": 1.5820429855412273e-05, "loss": 0.1506, "step": 268500 }, { "epoch": 20.976181179226863, "grad_norm": 0.5926699042320251, "learning_rate": 1.5818866744822195e-05, "loss": 0.1468, "step": 268600 }, { "epoch": 20.983990628660678, "grad_norm": 0.8168689012527466, "learning_rate": 1.5817303634232125e-05, "loss": 0.1497, "step": 268700 }, { "epoch": 20.991800078094496, "grad_norm": 0.9076529741287231, "learning_rate": 1.581574052364205e-05, "loss": 0.147, "step": 268800 }, { "epoch": 20.99960952752831, "grad_norm": 1.1257812976837158, "learning_rate": 1.5814177413051973e-05, "loss": 0.152, "step": 268900 }, { "epoch": 21.007418976962125, "grad_norm": 0.6348904967308044, "learning_rate": 1.58126143024619e-05, "loss": 0.1503, "step": 269000 }, { "epoch": 21.01522842639594, "grad_norm": 0.6594538688659668, "learning_rate": 1.5811051191871825e-05, "loss": 0.145, "step": 269100 }, { "epoch": 21.023037875829754, "grad_norm": 0.9883918762207031, "learning_rate": 1.580948808128175e-05, "loss": 0.1465, "step": 269200 }, { "epoch": 21.03084732526357, "grad_norm": 0.9725099802017212, "learning_rate": 1.5807924970691677e-05, "loss": 0.1514, "step": 269300 }, { "epoch": 21.038656774697383, "grad_norm": 0.8018738031387329, "learning_rate": 1.5806361860101603e-05, "loss": 0.1557, "step": 269400 }, { "epoch": 21.046466224131198, "grad_norm": 0.71611487865448, "learning_rate": 1.580479874951153e-05, "loss": 0.1453, "step": 269500 }, { "epoch": 21.054275673565012, "grad_norm": 0.8069080114364624, "learning_rate": 1.5803235638921455e-05, "loss": 0.1493, "step": 269600 }, { "epoch": 21.062085122998827, "grad_norm": 1.025117039680481, "learning_rate": 1.580167252833138e-05, "loss": 0.1398, "step": 269700 }, { "epoch": 21.069894572432645, "grad_norm": 0.8417700529098511, "learning_rate": 1.5800109417741307e-05, "loss": 0.1514, "step": 269800 }, { "epoch": 21.07770402186646, "grad_norm": 0.6837599873542786, "learning_rate": 1.5798546307151233e-05, "loss": 0.1453, "step": 269900 }, { "epoch": 21.085513471300274, "grad_norm": 0.7681900858879089, "learning_rate": 1.5796983196561156e-05, "loss": 0.1502, "step": 270000 }, { "epoch": 21.09332292073409, "grad_norm": 1.1430526971817017, "learning_rate": 1.5795435717076982e-05, "loss": 0.1446, "step": 270100 }, { "epoch": 21.101132370167903, "grad_norm": 0.6700018048286438, "learning_rate": 1.579387260648691e-05, "loss": 0.1482, "step": 270200 }, { "epoch": 21.108941819601718, "grad_norm": 0.48721635341644287, "learning_rate": 1.5792309495896837e-05, "loss": 0.1471, "step": 270300 }, { "epoch": 21.116751269035532, "grad_norm": 1.0522655248641968, "learning_rate": 1.579074638530676e-05, "loss": 0.1468, "step": 270400 }, { "epoch": 21.124560718469347, "grad_norm": 0.5169057250022888, "learning_rate": 1.578918327471669e-05, "loss": 0.1473, "step": 270500 }, { "epoch": 21.13237016790316, "grad_norm": 0.614720344543457, "learning_rate": 1.5787620164126612e-05, "loss": 0.1415, "step": 270600 }, { "epoch": 21.140179617336976, "grad_norm": 1.132947564125061, "learning_rate": 1.5786057053536538e-05, "loss": 0.142, "step": 270700 }, { "epoch": 21.147989066770794, "grad_norm": 1.131731390953064, "learning_rate": 1.5784493942946464e-05, "loss": 0.1433, "step": 270800 }, { "epoch": 21.15579851620461, "grad_norm": 0.8054686784744263, "learning_rate": 1.578293083235639e-05, "loss": 0.1434, "step": 270900 }, { "epoch": 21.163607965638423, "grad_norm": 0.8654218912124634, "learning_rate": 1.5781367721766316e-05, "loss": 0.1398, "step": 271000 }, { "epoch": 21.171417415072238, "grad_norm": 0.7513204216957092, "learning_rate": 1.5779804611176242e-05, "loss": 0.1375, "step": 271100 }, { "epoch": 21.179226864506052, "grad_norm": 0.8772755265235901, "learning_rate": 1.5778241500586168e-05, "loss": 0.143, "step": 271200 }, { "epoch": 21.187036313939867, "grad_norm": 0.661716103553772, "learning_rate": 1.5776678389996094e-05, "loss": 0.1414, "step": 271300 }, { "epoch": 21.19484576337368, "grad_norm": 1.0139799118041992, "learning_rate": 1.577511527940602e-05, "loss": 0.148, "step": 271400 }, { "epoch": 21.202655212807496, "grad_norm": 1.010338544845581, "learning_rate": 1.5773552168815943e-05, "loss": 0.1527, "step": 271500 }, { "epoch": 21.21046466224131, "grad_norm": 1.147233247756958, "learning_rate": 1.5771989058225872e-05, "loss": 0.14, "step": 271600 }, { "epoch": 21.218274111675125, "grad_norm": 1.077331304550171, "learning_rate": 1.5770425947635798e-05, "loss": 0.1498, "step": 271700 }, { "epoch": 21.226083561108943, "grad_norm": 0.8089535236358643, "learning_rate": 1.576886283704572e-05, "loss": 0.1503, "step": 271800 }, { "epoch": 21.233893010542758, "grad_norm": 0.8551170825958252, "learning_rate": 1.5767299726455647e-05, "loss": 0.1423, "step": 271900 }, { "epoch": 21.241702459976572, "grad_norm": 0.8253236413002014, "learning_rate": 1.5765736615865573e-05, "loss": 0.1452, "step": 272000 }, { "epoch": 21.249511909410387, "grad_norm": 0.9153646230697632, "learning_rate": 1.57641891363814e-05, "loss": 0.1483, "step": 272100 }, { "epoch": 21.2573213588442, "grad_norm": 0.7935351729393005, "learning_rate": 1.5762626025791325e-05, "loss": 0.1425, "step": 272200 }, { "epoch": 21.265130808278016, "grad_norm": 0.7212216258049011, "learning_rate": 1.5761062915201254e-05, "loss": 0.1475, "step": 272300 }, { "epoch": 21.27294025771183, "grad_norm": 0.6691550612449646, "learning_rate": 1.5759499804611177e-05, "loss": 0.1344, "step": 272400 }, { "epoch": 21.280749707145645, "grad_norm": 0.9414501786231995, "learning_rate": 1.5757936694021103e-05, "loss": 0.1445, "step": 272500 }, { "epoch": 21.28855915657946, "grad_norm": 0.8180404901504517, "learning_rate": 1.575637358343103e-05, "loss": 0.1462, "step": 272600 }, { "epoch": 21.296368606013274, "grad_norm": 0.8017256259918213, "learning_rate": 1.5754810472840955e-05, "loss": 0.1483, "step": 272700 }, { "epoch": 21.304178055447093, "grad_norm": 0.8329930901527405, "learning_rate": 1.575324736225088e-05, "loss": 0.1408, "step": 272800 }, { "epoch": 21.311987504880907, "grad_norm": 1.1513885259628296, "learning_rate": 1.5751684251660807e-05, "loss": 0.1509, "step": 272900 }, { "epoch": 21.31979695431472, "grad_norm": 0.7862344980239868, "learning_rate": 1.575012114107073e-05, "loss": 0.1452, "step": 273000 }, { "epoch": 21.327606403748536, "grad_norm": 0.8844183087348938, "learning_rate": 1.574855803048066e-05, "loss": 0.1455, "step": 273100 }, { "epoch": 21.33541585318235, "grad_norm": 0.8400126695632935, "learning_rate": 1.5746994919890585e-05, "loss": 0.1447, "step": 273200 }, { "epoch": 21.343225302616165, "grad_norm": 0.8862954378128052, "learning_rate": 1.5745431809300508e-05, "loss": 0.1322, "step": 273300 }, { "epoch": 21.35103475204998, "grad_norm": 0.8712252378463745, "learning_rate": 1.5743868698710437e-05, "loss": 0.1483, "step": 273400 }, { "epoch": 21.358844201483794, "grad_norm": 0.8062180876731873, "learning_rate": 1.5742305588120363e-05, "loss": 0.145, "step": 273500 }, { "epoch": 21.36665365091761, "grad_norm": 0.9273768663406372, "learning_rate": 1.5740742477530286e-05, "loss": 0.141, "step": 273600 }, { "epoch": 21.374463100351424, "grad_norm": 0.6157429218292236, "learning_rate": 1.573917936694021e-05, "loss": 0.1433, "step": 273700 }, { "epoch": 21.38227254978524, "grad_norm": 0.5980425477027893, "learning_rate": 1.5737616256350138e-05, "loss": 0.1445, "step": 273800 }, { "epoch": 21.390081999219056, "grad_norm": 0.8572303652763367, "learning_rate": 1.5736053145760064e-05, "loss": 0.1446, "step": 273900 }, { "epoch": 21.39789144865287, "grad_norm": 1.1001536846160889, "learning_rate": 1.573449003516999e-05, "loss": 0.1496, "step": 274000 }, { "epoch": 21.405700898086685, "grad_norm": 0.6422202587127686, "learning_rate": 1.5732942555685816e-05, "loss": 0.1492, "step": 274100 }, { "epoch": 21.4135103475205, "grad_norm": 0.9524158835411072, "learning_rate": 1.573137944509574e-05, "loss": 0.1382, "step": 274200 }, { "epoch": 21.421319796954315, "grad_norm": 0.8874322175979614, "learning_rate": 1.5729816334505668e-05, "loss": 0.1547, "step": 274300 }, { "epoch": 21.42912924638813, "grad_norm": 0.8682050704956055, "learning_rate": 1.5728253223915594e-05, "loss": 0.1414, "step": 274400 }, { "epoch": 21.436938695821944, "grad_norm": 0.7201287150382996, "learning_rate": 1.572669011332552e-05, "loss": 0.144, "step": 274500 }, { "epoch": 21.44474814525576, "grad_norm": 0.8117372989654541, "learning_rate": 1.5725127002735446e-05, "loss": 0.1452, "step": 274600 }, { "epoch": 21.452557594689573, "grad_norm": 0.8349194526672363, "learning_rate": 1.572356389214537e-05, "loss": 0.1467, "step": 274700 }, { "epoch": 21.46036704412339, "grad_norm": 0.6320531368255615, "learning_rate": 1.5722000781555294e-05, "loss": 0.144, "step": 274800 }, { "epoch": 21.468176493557205, "grad_norm": 0.9218801856040955, "learning_rate": 1.5720437670965224e-05, "loss": 0.1433, "step": 274900 }, { "epoch": 21.47598594299102, "grad_norm": 1.0286614894866943, "learning_rate": 1.571887456037515e-05, "loss": 0.1446, "step": 275000 }, { "epoch": 21.483795392424835, "grad_norm": 0.8825885653495789, "learning_rate": 1.5717311449785072e-05, "loss": 0.1411, "step": 275100 }, { "epoch": 21.49160484185865, "grad_norm": 0.9197112917900085, "learning_rate": 1.5715748339194998e-05, "loss": 0.1406, "step": 275200 }, { "epoch": 21.499414291292464, "grad_norm": 1.0009756088256836, "learning_rate": 1.5714185228604924e-05, "loss": 0.1452, "step": 275300 }, { "epoch": 21.50722374072628, "grad_norm": 1.0206668376922607, "learning_rate": 1.571262211801485e-05, "loss": 0.1446, "step": 275400 }, { "epoch": 21.515033190160093, "grad_norm": 0.8309497833251953, "learning_rate": 1.5711059007424776e-05, "loss": 0.1419, "step": 275500 }, { "epoch": 21.522842639593907, "grad_norm": 0.8372001647949219, "learning_rate": 1.5709495896834702e-05, "loss": 0.1401, "step": 275600 }, { "epoch": 21.530652089027722, "grad_norm": 0.7666497826576233, "learning_rate": 1.570793278624463e-05, "loss": 0.1295, "step": 275700 }, { "epoch": 21.53846153846154, "grad_norm": 0.7983540892601013, "learning_rate": 1.5706369675654554e-05, "loss": 0.1456, "step": 275800 }, { "epoch": 21.546270987895355, "grad_norm": 0.8200308084487915, "learning_rate": 1.570480656506448e-05, "loss": 0.1499, "step": 275900 }, { "epoch": 21.55408043732917, "grad_norm": 1.095491886138916, "learning_rate": 1.5703243454474406e-05, "loss": 0.1378, "step": 276000 }, { "epoch": 21.561889886762984, "grad_norm": 0.8706907033920288, "learning_rate": 1.5701695974990232e-05, "loss": 0.1529, "step": 276100 }, { "epoch": 21.5696993361968, "grad_norm": 0.7600975036621094, "learning_rate": 1.5700132864400158e-05, "loss": 0.1424, "step": 276200 }, { "epoch": 21.577508785630613, "grad_norm": 0.8851431608200073, "learning_rate": 1.569856975381008e-05, "loss": 0.1512, "step": 276300 }, { "epoch": 21.585318235064427, "grad_norm": 0.9802671670913696, "learning_rate": 1.569700664322001e-05, "loss": 0.1418, "step": 276400 }, { "epoch": 21.593127684498242, "grad_norm": 0.8661850690841675, "learning_rate": 1.5695443532629936e-05, "loss": 0.1484, "step": 276500 }, { "epoch": 21.600937133932057, "grad_norm": 0.8756442666053772, "learning_rate": 1.569388042203986e-05, "loss": 0.1385, "step": 276600 }, { "epoch": 21.60874658336587, "grad_norm": 1.033715009689331, "learning_rate": 1.569231731144979e-05, "loss": 0.1436, "step": 276700 }, { "epoch": 21.61655603279969, "grad_norm": 0.6875880360603333, "learning_rate": 1.569075420085971e-05, "loss": 0.1468, "step": 276800 }, { "epoch": 21.624365482233504, "grad_norm": 0.9872573018074036, "learning_rate": 1.5689191090269637e-05, "loss": 0.1393, "step": 276900 }, { "epoch": 21.63217493166732, "grad_norm": 0.6981116533279419, "learning_rate": 1.5687627979679563e-05, "loss": 0.1386, "step": 277000 }, { "epoch": 21.639984381101133, "grad_norm": 0.9638392925262451, "learning_rate": 1.568606486908949e-05, "loss": 0.1482, "step": 277100 }, { "epoch": 21.647793830534948, "grad_norm": 0.8693450093269348, "learning_rate": 1.5684501758499415e-05, "loss": 0.1388, "step": 277200 }, { "epoch": 21.655603279968762, "grad_norm": 0.8939495086669922, "learning_rate": 1.568293864790934e-05, "loss": 0.1428, "step": 277300 }, { "epoch": 21.663412729402577, "grad_norm": 0.989825427532196, "learning_rate": 1.5681375537319267e-05, "loss": 0.1459, "step": 277400 }, { "epoch": 21.67122217883639, "grad_norm": 0.9190754294395447, "learning_rate": 1.5679812426729193e-05, "loss": 0.1465, "step": 277500 }, { "epoch": 21.679031628270206, "grad_norm": 0.9194713234901428, "learning_rate": 1.567824931613912e-05, "loss": 0.1494, "step": 277600 }, { "epoch": 21.68684107770402, "grad_norm": 0.9386900067329407, "learning_rate": 1.567668620554904e-05, "loss": 0.144, "step": 277700 }, { "epoch": 21.69465052713784, "grad_norm": 0.7496020197868347, "learning_rate": 1.567512309495897e-05, "loss": 0.1438, "step": 277800 }, { "epoch": 21.702459976571653, "grad_norm": 0.7539072036743164, "learning_rate": 1.5673559984368897e-05, "loss": 0.147, "step": 277900 }, { "epoch": 21.710269426005468, "grad_norm": 0.8383206129074097, "learning_rate": 1.567199687377882e-05, "loss": 0.1464, "step": 278000 }, { "epoch": 21.718078875439282, "grad_norm": 0.9453973770141602, "learning_rate": 1.5670433763188746e-05, "loss": 0.1429, "step": 278100 }, { "epoch": 21.725888324873097, "grad_norm": 0.8745108842849731, "learning_rate": 1.5668886283704575e-05, "loss": 0.1422, "step": 278200 }, { "epoch": 21.73369777430691, "grad_norm": 0.8260504007339478, "learning_rate": 1.5667323173114498e-05, "loss": 0.1438, "step": 278300 }, { "epoch": 21.741507223740726, "grad_norm": 1.0408035516738892, "learning_rate": 1.5665760062524424e-05, "loss": 0.1328, "step": 278400 }, { "epoch": 21.74931667317454, "grad_norm": 0.7690845727920532, "learning_rate": 1.566419695193435e-05, "loss": 0.1367, "step": 278500 }, { "epoch": 21.757126122608355, "grad_norm": 0.9121866226196289, "learning_rate": 1.5662633841344276e-05, "loss": 0.1474, "step": 278600 }, { "epoch": 21.76493557204217, "grad_norm": 1.1322033405303955, "learning_rate": 1.5661070730754202e-05, "loss": 0.1411, "step": 278700 }, { "epoch": 21.772745021475984, "grad_norm": 1.1597561836242676, "learning_rate": 1.5659523251270028e-05, "loss": 0.1428, "step": 278800 }, { "epoch": 21.780554470909802, "grad_norm": 0.9378727078437805, "learning_rate": 1.5657960140679954e-05, "loss": 0.151, "step": 278900 }, { "epoch": 21.788363920343617, "grad_norm": 0.7633863091468811, "learning_rate": 1.565639703008988e-05, "loss": 0.1456, "step": 279000 }, { "epoch": 21.79617336977743, "grad_norm": 0.6599911451339722, "learning_rate": 1.5654833919499806e-05, "loss": 0.1436, "step": 279100 }, { "epoch": 21.803982819211246, "grad_norm": 0.8178460597991943, "learning_rate": 1.5653270808909732e-05, "loss": 0.1426, "step": 279200 }, { "epoch": 21.81179226864506, "grad_norm": 0.8605934977531433, "learning_rate": 1.5651707698319658e-05, "loss": 0.1444, "step": 279300 }, { "epoch": 21.819601718078875, "grad_norm": 0.898315966129303, "learning_rate": 1.5650144587729584e-05, "loss": 0.1348, "step": 279400 }, { "epoch": 21.82741116751269, "grad_norm": 1.0550199747085571, "learning_rate": 1.564858147713951e-05, "loss": 0.1317, "step": 279500 }, { "epoch": 21.835220616946504, "grad_norm": 0.9702891707420349, "learning_rate": 1.5647018366549436e-05, "loss": 0.1412, "step": 279600 }, { "epoch": 21.84303006638032, "grad_norm": 0.9088001251220703, "learning_rate": 1.5645455255959362e-05, "loss": 0.14, "step": 279700 }, { "epoch": 21.850839515814137, "grad_norm": 0.9616572856903076, "learning_rate": 1.5643892145369284e-05, "loss": 0.1338, "step": 279800 }, { "epoch": 21.85864896524795, "grad_norm": 0.8163248300552368, "learning_rate": 1.564232903477921e-05, "loss": 0.1398, "step": 279900 }, { "epoch": 21.866458414681766, "grad_norm": 0.8260008096694946, "learning_rate": 1.564076592418914e-05, "loss": 0.1384, "step": 280000 }, { "epoch": 21.87426786411558, "grad_norm": 1.126587152481079, "learning_rate": 1.5639202813599062e-05, "loss": 0.1439, "step": 280100 }, { "epoch": 21.882077313549395, "grad_norm": 0.8658336997032166, "learning_rate": 1.563763970300899e-05, "loss": 0.1465, "step": 280200 }, { "epoch": 21.88988676298321, "grad_norm": 0.8323200345039368, "learning_rate": 1.5636076592418914e-05, "loss": 0.1516, "step": 280300 }, { "epoch": 21.897696212417024, "grad_norm": 0.99912029504776, "learning_rate": 1.563451348182884e-05, "loss": 0.1352, "step": 280400 }, { "epoch": 21.90550566185084, "grad_norm": 0.9015044569969177, "learning_rate": 1.5632950371238766e-05, "loss": 0.1428, "step": 280500 }, { "epoch": 21.913315111284653, "grad_norm": 0.7489168643951416, "learning_rate": 1.5631387260648692e-05, "loss": 0.1373, "step": 280600 }, { "epoch": 21.921124560718468, "grad_norm": 0.9612414836883545, "learning_rate": 1.562982415005862e-05, "loss": 0.1411, "step": 280700 }, { "epoch": 21.928934010152282, "grad_norm": 0.8368463516235352, "learning_rate": 1.5628261039468544e-05, "loss": 0.1441, "step": 280800 }, { "epoch": 21.9367434595861, "grad_norm": 0.7885879874229431, "learning_rate": 1.562669792887847e-05, "loss": 0.1297, "step": 280900 }, { "epoch": 21.944552909019915, "grad_norm": 0.882032036781311, "learning_rate": 1.5625134818288393e-05, "loss": 0.1425, "step": 281000 }, { "epoch": 21.95236235845373, "grad_norm": 0.7525109648704529, "learning_rate": 1.5623571707698323e-05, "loss": 0.1387, "step": 281100 }, { "epoch": 21.960171807887544, "grad_norm": 0.7078118920326233, "learning_rate": 1.562200859710825e-05, "loss": 0.1427, "step": 281200 }, { "epoch": 21.96798125732136, "grad_norm": 1.0537186861038208, "learning_rate": 1.562044548651817e-05, "loss": 0.1395, "step": 281300 }, { "epoch": 21.975790706755173, "grad_norm": 0.9437336921691895, "learning_rate": 1.5618882375928097e-05, "loss": 0.1357, "step": 281400 }, { "epoch": 21.983600156188988, "grad_norm": 0.6707448363304138, "learning_rate": 1.5617319265338023e-05, "loss": 0.1432, "step": 281500 }, { "epoch": 21.991409605622803, "grad_norm": 1.0596201419830322, "learning_rate": 1.561575615474795e-05, "loss": 0.1426, "step": 281600 }, { "epoch": 21.999219055056617, "grad_norm": 0.85732102394104, "learning_rate": 1.5614193044157875e-05, "loss": 0.1459, "step": 281700 }, { "epoch": 22.00702850449043, "grad_norm": 0.6178754568099976, "learning_rate": 1.56126299335678e-05, "loss": 0.142, "step": 281800 }, { "epoch": 22.01483795392425, "grad_norm": 0.7230113744735718, "learning_rate": 1.5611066822977727e-05, "loss": 0.1409, "step": 281900 }, { "epoch": 22.022647403358064, "grad_norm": 0.7485736608505249, "learning_rate": 1.5609503712387653e-05, "loss": 0.133, "step": 282000 }, { "epoch": 22.03045685279188, "grad_norm": 0.6739600300788879, "learning_rate": 1.560794060179758e-05, "loss": 0.1363, "step": 282100 }, { "epoch": 22.038266302225693, "grad_norm": 0.7293041944503784, "learning_rate": 1.5606377491207505e-05, "loss": 0.1479, "step": 282200 }, { "epoch": 22.046075751659508, "grad_norm": 0.722092866897583, "learning_rate": 1.560481438061743e-05, "loss": 0.137, "step": 282300 }, { "epoch": 22.053885201093323, "grad_norm": 0.8747148513793945, "learning_rate": 1.5603251270027354e-05, "loss": 0.1345, "step": 282400 }, { "epoch": 22.061694650527137, "grad_norm": 0.969915509223938, "learning_rate": 1.560168815943728e-05, "loss": 0.1381, "step": 282500 }, { "epoch": 22.06950409996095, "grad_norm": 0.8908195495605469, "learning_rate": 1.560012504884721e-05, "loss": 0.1342, "step": 282600 }, { "epoch": 22.077313549394766, "grad_norm": 0.9030640721321106, "learning_rate": 1.5598561938257132e-05, "loss": 0.1343, "step": 282700 }, { "epoch": 22.08512299882858, "grad_norm": 0.7707045078277588, "learning_rate": 1.5597014458772958e-05, "loss": 0.1375, "step": 282800 }, { "epoch": 22.0929324482624, "grad_norm": 0.7158626914024353, "learning_rate": 1.5595451348182887e-05, "loss": 0.1391, "step": 282900 }, { "epoch": 22.100741897696214, "grad_norm": 0.8736804723739624, "learning_rate": 1.559388823759281e-05, "loss": 0.1356, "step": 283000 }, { "epoch": 22.108551347130028, "grad_norm": 0.8259661793708801, "learning_rate": 1.5592325127002736e-05, "loss": 0.1417, "step": 283100 }, { "epoch": 22.116360796563843, "grad_norm": 0.7664586305618286, "learning_rate": 1.5590762016412662e-05, "loss": 0.1404, "step": 283200 }, { "epoch": 22.124170245997657, "grad_norm": 0.7634907960891724, "learning_rate": 1.5589198905822588e-05, "loss": 0.1408, "step": 283300 }, { "epoch": 22.131979695431472, "grad_norm": 0.7122980952262878, "learning_rate": 1.5587635795232514e-05, "loss": 0.1359, "step": 283400 }, { "epoch": 22.139789144865286, "grad_norm": 1.002042531967163, "learning_rate": 1.558607268464244e-05, "loss": 0.1345, "step": 283500 }, { "epoch": 22.1475985942991, "grad_norm": 0.675338089466095, "learning_rate": 1.5584509574052366e-05, "loss": 0.1415, "step": 283600 }, { "epoch": 22.155408043732916, "grad_norm": 1.0163286924362183, "learning_rate": 1.5582946463462292e-05, "loss": 0.141, "step": 283700 }, { "epoch": 22.16321749316673, "grad_norm": 0.7089374661445618, "learning_rate": 1.5581383352872218e-05, "loss": 0.1497, "step": 283800 }, { "epoch": 22.171026942600548, "grad_norm": 0.6143225431442261, "learning_rate": 1.557982024228214e-05, "loss": 0.1443, "step": 283900 }, { "epoch": 22.178836392034363, "grad_norm": 1.1019307374954224, "learning_rate": 1.557825713169207e-05, "loss": 0.1351, "step": 284000 }, { "epoch": 22.186645841468177, "grad_norm": 0.901965856552124, "learning_rate": 1.5576694021101996e-05, "loss": 0.1405, "step": 284100 }, { "epoch": 22.194455290901992, "grad_norm": 0.5475614070892334, "learning_rate": 1.557513091051192e-05, "loss": 0.1338, "step": 284200 }, { "epoch": 22.202264740335806, "grad_norm": 0.8850200176239014, "learning_rate": 1.5573567799921845e-05, "loss": 0.1438, "step": 284300 }, { "epoch": 22.21007418976962, "grad_norm": 1.0332914590835571, "learning_rate": 1.557200468933177e-05, "loss": 0.1353, "step": 284400 }, { "epoch": 22.217883639203436, "grad_norm": 0.8510705828666687, "learning_rate": 1.5570441578741697e-05, "loss": 0.1396, "step": 284500 }, { "epoch": 22.22569308863725, "grad_norm": 0.9297281503677368, "learning_rate": 1.5568878468151623e-05, "loss": 0.142, "step": 284600 }, { "epoch": 22.233502538071065, "grad_norm": 1.048340916633606, "learning_rate": 1.556731535756155e-05, "loss": 0.1349, "step": 284700 }, { "epoch": 22.24131198750488, "grad_norm": 0.9004128575325012, "learning_rate": 1.5565767878077375e-05, "loss": 0.1369, "step": 284800 }, { "epoch": 22.249121436938697, "grad_norm": 0.7570412755012512, "learning_rate": 1.55642047674873e-05, "loss": 0.1367, "step": 284900 }, { "epoch": 22.256930886372512, "grad_norm": 1.0669686794281006, "learning_rate": 1.5562641656897227e-05, "loss": 0.1483, "step": 285000 }, { "epoch": 22.264740335806326, "grad_norm": 0.7067891955375671, "learning_rate": 1.5561078546307153e-05, "loss": 0.1474, "step": 285100 }, { "epoch": 22.27254978524014, "grad_norm": 0.7027773857116699, "learning_rate": 1.555951543571708e-05, "loss": 0.1402, "step": 285200 }, { "epoch": 22.280359234673956, "grad_norm": 0.7071043848991394, "learning_rate": 1.5557952325127005e-05, "loss": 0.1376, "step": 285300 }, { "epoch": 22.28816868410777, "grad_norm": 0.8070594668388367, "learning_rate": 1.5556389214536927e-05, "loss": 0.1365, "step": 285400 }, { "epoch": 22.295978133541585, "grad_norm": 0.5632590651512146, "learning_rate": 1.5554826103946857e-05, "loss": 0.1352, "step": 285500 }, { "epoch": 22.3037875829754, "grad_norm": 0.7632501125335693, "learning_rate": 1.5553262993356783e-05, "loss": 0.1374, "step": 285600 }, { "epoch": 22.311597032409214, "grad_norm": 0.9150585532188416, "learning_rate": 1.5551699882766705e-05, "loss": 0.1369, "step": 285700 }, { "epoch": 22.31940648184303, "grad_norm": 0.5579971671104431, "learning_rate": 1.5550136772176635e-05, "loss": 0.1367, "step": 285800 }, { "epoch": 22.327215931276847, "grad_norm": 0.9822412133216858, "learning_rate": 1.5548573661586557e-05, "loss": 0.1387, "step": 285900 }, { "epoch": 22.33502538071066, "grad_norm": 0.6855137944221497, "learning_rate": 1.5547010550996483e-05, "loss": 0.1364, "step": 286000 }, { "epoch": 22.342834830144476, "grad_norm": 0.592866063117981, "learning_rate": 1.554544744040641e-05, "loss": 0.145, "step": 286100 }, { "epoch": 22.35064427957829, "grad_norm": 0.7260706424713135, "learning_rate": 1.5543884329816335e-05, "loss": 0.1404, "step": 286200 }, { "epoch": 22.358453729012105, "grad_norm": 0.7422342896461487, "learning_rate": 1.554232121922626e-05, "loss": 0.1384, "step": 286300 }, { "epoch": 22.36626317844592, "grad_norm": 0.8567727208137512, "learning_rate": 1.5540758108636187e-05, "loss": 0.1349, "step": 286400 }, { "epoch": 22.374072627879734, "grad_norm": 0.9445796608924866, "learning_rate": 1.5539194998046113e-05, "loss": 0.1358, "step": 286500 }, { "epoch": 22.38188207731355, "grad_norm": 1.007031798362732, "learning_rate": 1.553763188745604e-05, "loss": 0.1367, "step": 286600 }, { "epoch": 22.389691526747363, "grad_norm": 0.663798987865448, "learning_rate": 1.5536068776865965e-05, "loss": 0.1381, "step": 286700 }, { "epoch": 22.397500976181178, "grad_norm": 0.9678874611854553, "learning_rate": 1.553452129738179e-05, "loss": 0.1298, "step": 286800 }, { "epoch": 22.405310425614996, "grad_norm": 0.6478586196899414, "learning_rate": 1.5532958186791717e-05, "loss": 0.1412, "step": 286900 }, { "epoch": 22.41311987504881, "grad_norm": 0.9851172566413879, "learning_rate": 1.5531395076201643e-05, "loss": 0.1384, "step": 287000 }, { "epoch": 22.420929324482625, "grad_norm": 0.925794243812561, "learning_rate": 1.552983196561157e-05, "loss": 0.1407, "step": 287100 }, { "epoch": 22.42873877391644, "grad_norm": 0.7711319923400879, "learning_rate": 1.5528268855021492e-05, "loss": 0.1469, "step": 287200 }, { "epoch": 22.436548223350254, "grad_norm": 0.8208619356155396, "learning_rate": 1.552670574443142e-05, "loss": 0.1439, "step": 287300 }, { "epoch": 22.44435767278407, "grad_norm": 0.9265621304512024, "learning_rate": 1.5525142633841347e-05, "loss": 0.1356, "step": 287400 }, { "epoch": 22.452167122217883, "grad_norm": 0.6884872317314148, "learning_rate": 1.552357952325127e-05, "loss": 0.136, "step": 287500 }, { "epoch": 22.459976571651698, "grad_norm": 0.899463415145874, "learning_rate": 1.5522016412661196e-05, "loss": 0.1399, "step": 287600 }, { "epoch": 22.467786021085512, "grad_norm": 0.9107911586761475, "learning_rate": 1.5520453302071122e-05, "loss": 0.1357, "step": 287700 }, { "epoch": 22.475595470519327, "grad_norm": 0.9815389513969421, "learning_rate": 1.5518890191481048e-05, "loss": 0.14, "step": 287800 }, { "epoch": 22.483404919953145, "grad_norm": 0.8975253701210022, "learning_rate": 1.5517327080890974e-05, "loss": 0.1393, "step": 287900 }, { "epoch": 22.49121436938696, "grad_norm": 0.8066114187240601, "learning_rate": 1.55157639703009e-05, "loss": 0.1336, "step": 288000 }, { "epoch": 22.499023818820774, "grad_norm": 0.7992839813232422, "learning_rate": 1.5514200859710826e-05, "loss": 0.132, "step": 288100 }, { "epoch": 22.50683326825459, "grad_norm": 0.8005449771881104, "learning_rate": 1.5512637749120752e-05, "loss": 0.1399, "step": 288200 }, { "epoch": 22.514642717688403, "grad_norm": 0.7647144198417664, "learning_rate": 1.5511074638530678e-05, "loss": 0.1372, "step": 288300 }, { "epoch": 22.522452167122218, "grad_norm": 0.8365885019302368, "learning_rate": 1.5509511527940604e-05, "loss": 0.14, "step": 288400 }, { "epoch": 22.530261616556032, "grad_norm": 0.9638312458992004, "learning_rate": 1.550794841735053e-05, "loss": 0.1315, "step": 288500 }, { "epoch": 22.538071065989847, "grad_norm": 0.7517139911651611, "learning_rate": 1.5506385306760453e-05, "loss": 0.1413, "step": 288600 }, { "epoch": 22.54588051542366, "grad_norm": 0.8458108901977539, "learning_rate": 1.550482219617038e-05, "loss": 0.1393, "step": 288700 }, { "epoch": 22.553689964857476, "grad_norm": 0.7653716802597046, "learning_rate": 1.5503259085580308e-05, "loss": 0.1404, "step": 288800 }, { "epoch": 22.561499414291294, "grad_norm": 0.631316065788269, "learning_rate": 1.550169597499023e-05, "loss": 0.133, "step": 288900 }, { "epoch": 22.56930886372511, "grad_norm": 1.053382158279419, "learning_rate": 1.5500148495506057e-05, "loss": 0.1336, "step": 289000 }, { "epoch": 22.577118313158923, "grad_norm": 1.0804376602172852, "learning_rate": 1.5498585384915986e-05, "loss": 0.1325, "step": 289100 }, { "epoch": 22.584927762592738, "grad_norm": 0.7826420664787292, "learning_rate": 1.549702227432591e-05, "loss": 0.1439, "step": 289200 }, { "epoch": 22.592737212026552, "grad_norm": 0.9446529150009155, "learning_rate": 1.5495459163735835e-05, "loss": 0.1418, "step": 289300 }, { "epoch": 22.600546661460367, "grad_norm": 0.9799430966377258, "learning_rate": 1.549389605314576e-05, "loss": 0.131, "step": 289400 }, { "epoch": 22.60835611089418, "grad_norm": 0.7131752371788025, "learning_rate": 1.5492332942555687e-05, "loss": 0.1356, "step": 289500 }, { "epoch": 22.616165560327996, "grad_norm": 0.7591296434402466, "learning_rate": 1.5490769831965613e-05, "loss": 0.1302, "step": 289600 }, { "epoch": 22.62397500976181, "grad_norm": 0.7944523096084595, "learning_rate": 1.548920672137554e-05, "loss": 0.1381, "step": 289700 }, { "epoch": 22.631784459195625, "grad_norm": 0.7745599150657654, "learning_rate": 1.5487643610785465e-05, "loss": 0.1378, "step": 289800 }, { "epoch": 22.639593908629443, "grad_norm": 0.5822691917419434, "learning_rate": 1.548608050019539e-05, "loss": 0.1412, "step": 289900 }, { "epoch": 22.647403358063258, "grad_norm": 0.8896892666816711, "learning_rate": 1.5484517389605317e-05, "loss": 0.1324, "step": 290000 }, { "epoch": 22.655212807497072, "grad_norm": 0.9473217725753784, "learning_rate": 1.548295427901524e-05, "loss": 0.1372, "step": 290100 }, { "epoch": 22.663022256930887, "grad_norm": 0.6952362656593323, "learning_rate": 1.548139116842517e-05, "loss": 0.1369, "step": 290200 }, { "epoch": 22.6708317063647, "grad_norm": 0.5139284133911133, "learning_rate": 1.5479828057835095e-05, "loss": 0.1305, "step": 290300 }, { "epoch": 22.678641155798516, "grad_norm": 0.8785337805747986, "learning_rate": 1.5478264947245018e-05, "loss": 0.1378, "step": 290400 }, { "epoch": 22.68645060523233, "grad_norm": 1.0016475915908813, "learning_rate": 1.5476701836654944e-05, "loss": 0.1411, "step": 290500 }, { "epoch": 22.694260054666145, "grad_norm": 0.8744795918464661, "learning_rate": 1.547513872606487e-05, "loss": 0.1342, "step": 290600 }, { "epoch": 22.70206950409996, "grad_norm": 0.9555802345275879, "learning_rate": 1.5473575615474796e-05, "loss": 0.1388, "step": 290700 }, { "epoch": 22.709878953533774, "grad_norm": 0.7396284341812134, "learning_rate": 1.547201250488472e-05, "loss": 0.1375, "step": 290800 }, { "epoch": 22.71768840296759, "grad_norm": 0.872858464717865, "learning_rate": 1.5470449394294648e-05, "loss": 0.1316, "step": 290900 }, { "epoch": 22.725497852401407, "grad_norm": 0.8516215682029724, "learning_rate": 1.5468901914810474e-05, "loss": 0.133, "step": 291000 }, { "epoch": 22.73330730183522, "grad_norm": 0.7716134190559387, "learning_rate": 1.54673388042204e-05, "loss": 0.1353, "step": 291100 }, { "epoch": 22.741116751269036, "grad_norm": 1.0244024991989136, "learning_rate": 1.5465775693630326e-05, "loss": 0.1388, "step": 291200 }, { "epoch": 22.74892620070285, "grad_norm": 0.9649984836578369, "learning_rate": 1.546421258304025e-05, "loss": 0.1372, "step": 291300 }, { "epoch": 22.756735650136665, "grad_norm": 0.7406355738639832, "learning_rate": 1.5462649472450178e-05, "loss": 0.1393, "step": 291400 }, { "epoch": 22.76454509957048, "grad_norm": 0.9723506569862366, "learning_rate": 1.5461086361860104e-05, "loss": 0.1381, "step": 291500 }, { "epoch": 22.772354549004294, "grad_norm": 0.9101099371910095, "learning_rate": 1.5459523251270026e-05, "loss": 0.1427, "step": 291600 }, { "epoch": 22.78016399843811, "grad_norm": 0.8881165981292725, "learning_rate": 1.5457960140679956e-05, "loss": 0.1262, "step": 291700 }, { "epoch": 22.787973447871924, "grad_norm": 0.8255956172943115, "learning_rate": 1.545639703008988e-05, "loss": 0.1344, "step": 291800 }, { "epoch": 22.79578289730574, "grad_norm": 0.8563169240951538, "learning_rate": 1.5454833919499804e-05, "loss": 0.1417, "step": 291900 }, { "epoch": 22.803592346739556, "grad_norm": 0.8315694332122803, "learning_rate": 1.545327080890973e-05, "loss": 0.1317, "step": 292000 }, { "epoch": 22.81140179617337, "grad_norm": 0.9142987132072449, "learning_rate": 1.5451707698319656e-05, "loss": 0.1405, "step": 292100 }, { "epoch": 22.819211245607185, "grad_norm": 0.7362247109413147, "learning_rate": 1.5450144587729582e-05, "loss": 0.1409, "step": 292200 }, { "epoch": 22.827020695041, "grad_norm": 0.6340514421463013, "learning_rate": 1.5448581477139508e-05, "loss": 0.1348, "step": 292300 }, { "epoch": 22.834830144474815, "grad_norm": 0.8943261504173279, "learning_rate": 1.5447018366549434e-05, "loss": 0.1318, "step": 292400 }, { "epoch": 22.84263959390863, "grad_norm": 0.596819281578064, "learning_rate": 1.544545525595936e-05, "loss": 0.1394, "step": 292500 }, { "epoch": 22.850449043342444, "grad_norm": 0.9999971985816956, "learning_rate": 1.5443892145369286e-05, "loss": 0.1382, "step": 292600 }, { "epoch": 22.858258492776258, "grad_norm": 0.6040255427360535, "learning_rate": 1.5442329034779212e-05, "loss": 0.1326, "step": 292700 }, { "epoch": 22.866067942210073, "grad_norm": 1.0905107259750366, "learning_rate": 1.544076592418914e-05, "loss": 0.1316, "step": 292800 }, { "epoch": 22.873877391643887, "grad_norm": 0.8896177411079407, "learning_rate": 1.5439202813599064e-05, "loss": 0.1349, "step": 292900 }, { "epoch": 22.881686841077705, "grad_norm": 0.9923672676086426, "learning_rate": 1.5437639703008987e-05, "loss": 0.1318, "step": 293000 }, { "epoch": 22.88949629051152, "grad_norm": 0.9534666538238525, "learning_rate": 1.5436092223524816e-05, "loss": 0.1391, "step": 293100 }, { "epoch": 22.897305739945335, "grad_norm": 0.6288168430328369, "learning_rate": 1.5434529112934742e-05, "loss": 0.1423, "step": 293200 }, { "epoch": 22.90511518937915, "grad_norm": 0.9616836905479431, "learning_rate": 1.5432966002344668e-05, "loss": 0.1415, "step": 293300 }, { "epoch": 22.912924638812964, "grad_norm": 0.5409935712814331, "learning_rate": 1.543140289175459e-05, "loss": 0.133, "step": 293400 }, { "epoch": 22.92073408824678, "grad_norm": 0.8281419277191162, "learning_rate": 1.542983978116452e-05, "loss": 0.1354, "step": 293500 }, { "epoch": 22.928543537680593, "grad_norm": 0.8489418029785156, "learning_rate": 1.5428276670574446e-05, "loss": 0.1317, "step": 293600 }, { "epoch": 22.936352987114407, "grad_norm": 0.729698657989502, "learning_rate": 1.542671355998437e-05, "loss": 0.1385, "step": 293700 }, { "epoch": 22.944162436548222, "grad_norm": 0.6333410143852234, "learning_rate": 1.5425150449394295e-05, "loss": 0.135, "step": 293800 }, { "epoch": 22.95197188598204, "grad_norm": 0.7198565006256104, "learning_rate": 1.542358733880422e-05, "loss": 0.1333, "step": 293900 }, { "epoch": 22.959781335415855, "grad_norm": 0.9172605872154236, "learning_rate": 1.5422024228214147e-05, "loss": 0.1343, "step": 294000 }, { "epoch": 22.96759078484967, "grad_norm": 0.7993659973144531, "learning_rate": 1.5420461117624073e-05, "loss": 0.1392, "step": 294100 }, { "epoch": 22.975400234283484, "grad_norm": 1.051163673400879, "learning_rate": 1.5418898007034e-05, "loss": 0.1458, "step": 294200 }, { "epoch": 22.9832096837173, "grad_norm": 0.7416157722473145, "learning_rate": 1.5417334896443925e-05, "loss": 0.1334, "step": 294300 }, { "epoch": 22.991019133151113, "grad_norm": 1.0222550630569458, "learning_rate": 1.541577178585385e-05, "loss": 0.1387, "step": 294400 }, { "epoch": 22.998828582584927, "grad_norm": 0.7647424936294556, "learning_rate": 1.5414208675263777e-05, "loss": 0.1434, "step": 294500 }, { "epoch": 23.006638032018742, "grad_norm": 0.6978480219841003, "learning_rate": 1.5412645564673703e-05, "loss": 0.1371, "step": 294600 }, { "epoch": 23.014447481452557, "grad_norm": 0.8882784247398376, "learning_rate": 1.541108245408363e-05, "loss": 0.135, "step": 294700 }, { "epoch": 23.02225693088637, "grad_norm": 0.8186953067779541, "learning_rate": 1.540951934349355e-05, "loss": 0.1364, "step": 294800 }, { "epoch": 23.030066380320186, "grad_norm": 0.738304078578949, "learning_rate": 1.5407956232903478e-05, "loss": 0.1401, "step": 294900 }, { "epoch": 23.037875829754004, "grad_norm": 0.8697826266288757, "learning_rate": 1.5406393122313407e-05, "loss": 0.131, "step": 295000 }, { "epoch": 23.04568527918782, "grad_norm": 0.9213365316390991, "learning_rate": 1.5404845642829233e-05, "loss": 0.1318, "step": 295100 }, { "epoch": 23.053494728621633, "grad_norm": 0.856341540813446, "learning_rate": 1.5403282532239156e-05, "loss": 0.1332, "step": 295200 }, { "epoch": 23.061304178055448, "grad_norm": 0.9739687442779541, "learning_rate": 1.5401735052754985e-05, "loss": 0.1352, "step": 295300 }, { "epoch": 23.069113627489262, "grad_norm": 0.6837376356124878, "learning_rate": 1.540017194216491e-05, "loss": 0.1331, "step": 295400 }, { "epoch": 23.076923076923077, "grad_norm": 0.7577429413795471, "learning_rate": 1.5398608831574834e-05, "loss": 0.1404, "step": 295500 }, { "epoch": 23.08473252635689, "grad_norm": 0.8737211227416992, "learning_rate": 1.539704572098476e-05, "loss": 0.1317, "step": 295600 }, { "epoch": 23.092541975790706, "grad_norm": 1.0972391366958618, "learning_rate": 1.539548261039469e-05, "loss": 0.1308, "step": 295700 }, { "epoch": 23.10035142522452, "grad_norm": 0.8688048720359802, "learning_rate": 1.539391949980461e-05, "loss": 0.1316, "step": 295800 }, { "epoch": 23.108160874658335, "grad_norm": 0.6488736271858215, "learning_rate": 1.5392356389214538e-05, "loss": 0.1259, "step": 295900 }, { "epoch": 23.115970324092153, "grad_norm": 0.8195211291313171, "learning_rate": 1.5390793278624464e-05, "loss": 0.1336, "step": 296000 }, { "epoch": 23.123779773525968, "grad_norm": 0.7433906197547913, "learning_rate": 1.538923016803439e-05, "loss": 0.1328, "step": 296100 }, { "epoch": 23.131589222959782, "grad_norm": 1.1779958009719849, "learning_rate": 1.5387667057444316e-05, "loss": 0.1335, "step": 296200 }, { "epoch": 23.139398672393597, "grad_norm": 0.7165812849998474, "learning_rate": 1.5386103946854242e-05, "loss": 0.1291, "step": 296300 }, { "epoch": 23.14720812182741, "grad_norm": 0.8701615929603577, "learning_rate": 1.5384540836264168e-05, "loss": 0.1317, "step": 296400 }, { "epoch": 23.155017571261226, "grad_norm": 1.0225144624710083, "learning_rate": 1.5382977725674094e-05, "loss": 0.1426, "step": 296500 }, { "epoch": 23.16282702069504, "grad_norm": 0.9554911255836487, "learning_rate": 1.538141461508402e-05, "loss": 0.1428, "step": 296600 }, { "epoch": 23.170636470128855, "grad_norm": 0.9963477849960327, "learning_rate": 1.5379851504493942e-05, "loss": 0.139, "step": 296700 }, { "epoch": 23.17844591956267, "grad_norm": 0.8080196380615234, "learning_rate": 1.5378288393903872e-05, "loss": 0.1344, "step": 296800 }, { "epoch": 23.186255368996484, "grad_norm": 1.0430487394332886, "learning_rate": 1.5376725283313794e-05, "loss": 0.1315, "step": 296900 }, { "epoch": 23.194064818430302, "grad_norm": 1.0507380962371826, "learning_rate": 1.537516217272372e-05, "loss": 0.1314, "step": 297000 }, { "epoch": 23.201874267864117, "grad_norm": 0.8849628567695618, "learning_rate": 1.5373599062133646e-05, "loss": 0.1315, "step": 297100 }, { "epoch": 23.20968371729793, "grad_norm": 0.7833200693130493, "learning_rate": 1.5372035951543572e-05, "loss": 0.1405, "step": 297200 }, { "epoch": 23.217493166731746, "grad_norm": 1.1596750020980835, "learning_rate": 1.53704728409535e-05, "loss": 0.134, "step": 297300 }, { "epoch": 23.22530261616556, "grad_norm": 1.0101959705352783, "learning_rate": 1.5368909730363424e-05, "loss": 0.1334, "step": 297400 }, { "epoch": 23.233112065599375, "grad_norm": 0.7190399169921875, "learning_rate": 1.536734661977335e-05, "loss": 0.1302, "step": 297500 }, { "epoch": 23.24092151503319, "grad_norm": 0.890193521976471, "learning_rate": 1.5365783509183276e-05, "loss": 0.1232, "step": 297600 }, { "epoch": 23.248730964467004, "grad_norm": 0.8231490850448608, "learning_rate": 1.5364220398593202e-05, "loss": 0.13, "step": 297700 }, { "epoch": 23.25654041390082, "grad_norm": 1.3083713054656982, "learning_rate": 1.5362657288003125e-05, "loss": 0.1337, "step": 297800 }, { "epoch": 23.264349863334633, "grad_norm": 0.8749645352363586, "learning_rate": 1.5361094177413054e-05, "loss": 0.1425, "step": 297900 }, { "epoch": 23.27215931276845, "grad_norm": 0.8127243518829346, "learning_rate": 1.535953106682298e-05, "loss": 0.13, "step": 298000 }, { "epoch": 23.279968762202266, "grad_norm": 0.7600269913673401, "learning_rate": 1.5357967956232903e-05, "loss": 0.1277, "step": 298100 }, { "epoch": 23.28777821163608, "grad_norm": 0.8609147667884827, "learning_rate": 1.535640484564283e-05, "loss": 0.1247, "step": 298200 }, { "epoch": 23.295587661069895, "grad_norm": 0.6120932102203369, "learning_rate": 1.5354841735052755e-05, "loss": 0.1262, "step": 298300 }, { "epoch": 23.30339711050371, "grad_norm": 0.6841028928756714, "learning_rate": 1.535327862446268e-05, "loss": 0.1273, "step": 298400 }, { "epoch": 23.311206559937524, "grad_norm": 0.9684091806411743, "learning_rate": 1.5351715513872607e-05, "loss": 0.131, "step": 298500 }, { "epoch": 23.31901600937134, "grad_norm": 0.8668888807296753, "learning_rate": 1.5350152403282533e-05, "loss": 0.1295, "step": 298600 }, { "epoch": 23.326825458805153, "grad_norm": 0.8056829571723938, "learning_rate": 1.534858929269246e-05, "loss": 0.1406, "step": 298700 }, { "epoch": 23.334634908238968, "grad_norm": 0.9638793468475342, "learning_rate": 1.5347026182102385e-05, "loss": 0.1263, "step": 298800 }, { "epoch": 23.342444357672782, "grad_norm": 0.7716854810714722, "learning_rate": 1.534546307151231e-05, "loss": 0.1359, "step": 298900 }, { "epoch": 23.3502538071066, "grad_norm": 0.930443525314331, "learning_rate": 1.5343899960922237e-05, "loss": 0.1367, "step": 299000 }, { "epoch": 23.358063256540415, "grad_norm": 1.0026413202285767, "learning_rate": 1.5342336850332163e-05, "loss": 0.1345, "step": 299100 }, { "epoch": 23.36587270597423, "grad_norm": 0.6678506731987, "learning_rate": 1.5340773739742086e-05, "loss": 0.1351, "step": 299200 }, { "epoch": 23.373682155408044, "grad_norm": 1.0416382551193237, "learning_rate": 1.5339210629152015e-05, "loss": 0.1386, "step": 299300 }, { "epoch": 23.38149160484186, "grad_norm": 0.9412092566490173, "learning_rate": 1.533764751856194e-05, "loss": 0.1309, "step": 299400 }, { "epoch": 23.389301054275673, "grad_norm": 0.7648351788520813, "learning_rate": 1.5336084407971864e-05, "loss": 0.1328, "step": 299500 }, { "epoch": 23.397110503709488, "grad_norm": 0.6699008941650391, "learning_rate": 1.533452129738179e-05, "loss": 0.1357, "step": 299600 }, { "epoch": 23.404919953143303, "grad_norm": 0.7237615585327148, "learning_rate": 1.533297381789762e-05, "loss": 0.129, "step": 299700 }, { "epoch": 23.412729402577117, "grad_norm": 1.0570127964019775, "learning_rate": 1.5331410707307545e-05, "loss": 0.133, "step": 299800 }, { "epoch": 23.42053885201093, "grad_norm": 0.845137894153595, "learning_rate": 1.5329847596717468e-05, "loss": 0.1344, "step": 299900 }, { "epoch": 23.42834830144475, "grad_norm": 0.8534473180770874, "learning_rate": 1.5328284486127394e-05, "loss": 0.1348, "step": 300000 } ], "logging_steps": 100, "max_steps": 1280500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.98964279526528e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }