diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 23.42834830144475, + "eval_steps": 500, + "global_step": 300000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007809449433814916, + "grad_norm": 1.4658414125442505, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.3302, + "step": 100 + }, + { + "epoch": 0.015618898867629832, + "grad_norm": 1.1347604990005493, + "learning_rate": 4.000000000000001e-06, + "loss": 2.9274, + "step": 200 + }, + { + "epoch": 0.02342834830144475, + "grad_norm": 1.399733543395996, + "learning_rate": 6e-06, + "loss": 2.7919, + "step": 300 + }, + { + "epoch": 0.031237797735259663, + "grad_norm": 0.8615509271621704, + "learning_rate": 8.000000000000001e-06, + "loss": 2.7518, + "step": 400 + }, + { + "epoch": 0.03904724716907458, + "grad_norm": 1.3015568256378174, + "learning_rate": 1e-05, + "loss": 2.7169, + "step": 500 + }, + { + "epoch": 0.0468566966028895, + "grad_norm": 1.1654436588287354, + "learning_rate": 1.2e-05, + "loss": 2.6892, + "step": 600 + }, + { + "epoch": 0.05466614603670441, + "grad_norm": 1.9601017236709595, + "learning_rate": 1.4e-05, + "loss": 2.6643, + "step": 700 + }, + { + "epoch": 0.06247559547051933, + "grad_norm": 1.8341138362884521, + "learning_rate": 1.6000000000000003e-05, + "loss": 2.6494, + "step": 800 + }, + { + "epoch": 0.07028504490433425, + "grad_norm": 1.137752890586853, + "learning_rate": 1.8e-05, + "loss": 2.6348, + "step": 900 + }, + { + "epoch": 0.07809449433814916, + "grad_norm": 1.324123501777649, + "learning_rate": 2e-05, + "loss": 2.6144, + "step": 1000 + }, + { + "epoch": 0.08590394377196407, + "grad_norm": 1.3983900547027588, + "learning_rate": 1.9998436889409928e-05, + "loss": 2.598, + "step": 1100 + }, + { + "epoch": 0.093713393205779, + "grad_norm": 1.4354324340820312, + "learning_rate": 1.9996873778819854e-05, + "loss": 2.5849, + "step": 1200 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 1.3982596397399902, + "learning_rate": 1.999531066822978e-05, + "loss": 2.5705, + "step": 1300 + }, + { + "epoch": 0.10933229207340882, + "grad_norm": 1.9624176025390625, + "learning_rate": 1.9993747557639706e-05, + "loss": 2.5423, + "step": 1400 + }, + { + "epoch": 0.11714174150722374, + "grad_norm": 1.2586028575897217, + "learning_rate": 1.9992184447049628e-05, + "loss": 2.5274, + "step": 1500 + }, + { + "epoch": 0.12495119094103865, + "grad_norm": 1.7528750896453857, + "learning_rate": 1.9990621336459558e-05, + "loss": 2.476, + "step": 1600 + }, + { + "epoch": 0.13276064037485358, + "grad_norm": 1.7756469249725342, + "learning_rate": 1.9989058225869484e-05, + "loss": 2.4652, + "step": 1700 + }, + { + "epoch": 0.1405700898086685, + "grad_norm": 1.7584046125411987, + "learning_rate": 1.9987495115279406e-05, + "loss": 2.4154, + "step": 1800 + }, + { + "epoch": 0.1483795392424834, + "grad_norm": 1.531298041343689, + "learning_rate": 1.9985932004689332e-05, + "loss": 2.3776, + "step": 1900 + }, + { + "epoch": 0.15618898867629832, + "grad_norm": 3.2747278213500977, + "learning_rate": 1.998436889409926e-05, + "loss": 2.3426, + "step": 2000 + }, + { + "epoch": 0.16399843811011325, + "grad_norm": 1.5152462720870972, + "learning_rate": 1.9982805783509184e-05, + "loss": 2.3311, + "step": 2100 + }, + { + "epoch": 0.17180788754392814, + "grad_norm": 3.3594422340393066, + "learning_rate": 1.998124267291911e-05, + "loss": 2.2926, + "step": 2200 + }, + { + "epoch": 0.17961733697774307, + "grad_norm": 1.8210939168930054, + "learning_rate": 1.9979679562329036e-05, + "loss": 2.2611, + "step": 2300 + }, + { + "epoch": 0.187426786411558, + "grad_norm": 1.4974194765090942, + "learning_rate": 1.9978116451738962e-05, + "loss": 2.2428, + "step": 2400 + }, + { + "epoch": 0.1952362358453729, + "grad_norm": 2.1313698291778564, + "learning_rate": 1.997655334114889e-05, + "loss": 2.2123, + "step": 2500 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 2.3703184127807617, + "learning_rate": 1.9974990230558814e-05, + "loss": 2.168, + "step": 2600 + }, + { + "epoch": 0.21085513471300274, + "grad_norm": 2.0529401302337646, + "learning_rate": 1.997342711996874e-05, + "loss": 2.1258, + "step": 2700 + }, + { + "epoch": 0.21866458414681764, + "grad_norm": 1.7897934913635254, + "learning_rate": 1.9971864009378666e-05, + "loss": 2.0991, + "step": 2800 + }, + { + "epoch": 0.22647403358063256, + "grad_norm": 4.2363386154174805, + "learning_rate": 1.997030089878859e-05, + "loss": 2.0567, + "step": 2900 + }, + { + "epoch": 0.23428348301444749, + "grad_norm": 4.278024673461914, + "learning_rate": 1.9968737788198515e-05, + "loss": 2.0318, + "step": 3000 + }, + { + "epoch": 0.2420929324482624, + "grad_norm": 2.8552427291870117, + "learning_rate": 1.9967174677608444e-05, + "loss": 1.9907, + "step": 3100 + }, + { + "epoch": 0.2499023818820773, + "grad_norm": 3.2664151191711426, + "learning_rate": 1.9965611567018367e-05, + "loss": 1.9173, + "step": 3200 + }, + { + "epoch": 0.2577118313158922, + "grad_norm": 3.365929365158081, + "learning_rate": 1.9964048456428293e-05, + "loss": 1.8902, + "step": 3300 + }, + { + "epoch": 0.26552128074970716, + "grad_norm": 2.942408800125122, + "learning_rate": 1.996248534583822e-05, + "loss": 1.8659, + "step": 3400 + }, + { + "epoch": 0.27333073018352205, + "grad_norm": 2.636049747467041, + "learning_rate": 1.9960922235248145e-05, + "loss": 1.8498, + "step": 3500 + }, + { + "epoch": 0.281140179617337, + "grad_norm": 2.335026502609253, + "learning_rate": 1.995935912465807e-05, + "loss": 1.7999, + "step": 3600 + }, + { + "epoch": 0.2889496290511519, + "grad_norm": 3.5235695838928223, + "learning_rate": 1.9957796014067997e-05, + "loss": 1.7668, + "step": 3700 + }, + { + "epoch": 0.2967590784849668, + "grad_norm": 3.085439682006836, + "learning_rate": 1.9956232903477923e-05, + "loss": 1.7068, + "step": 3800 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 2.5960071086883545, + "learning_rate": 1.995466979288785e-05, + "loss": 1.6931, + "step": 3900 + }, + { + "epoch": 0.31237797735259665, + "grad_norm": 1.9345512390136719, + "learning_rate": 1.9953106682297775e-05, + "loss": 1.6449, + "step": 4000 + }, + { + "epoch": 0.32018742678641154, + "grad_norm": 2.4250986576080322, + "learning_rate": 1.9951543571707698e-05, + "loss": 1.6116, + "step": 4100 + }, + { + "epoch": 0.3279968762202265, + "grad_norm": 2.81168794631958, + "learning_rate": 1.9949980461117627e-05, + "loss": 1.5684, + "step": 4200 + }, + { + "epoch": 0.3358063256540414, + "grad_norm": 3.2198565006256104, + "learning_rate": 1.994841735052755e-05, + "loss": 1.5581, + "step": 4300 + }, + { + "epoch": 0.3436157750878563, + "grad_norm": 2.893419027328491, + "learning_rate": 1.9946854239937476e-05, + "loss": 1.5005, + "step": 4400 + }, + { + "epoch": 0.35142522452167124, + "grad_norm": 2.161217212677002, + "learning_rate": 1.9945291129347405e-05, + "loss": 1.5025, + "step": 4500 + }, + { + "epoch": 0.35923467395548614, + "grad_norm": 2.2876691818237305, + "learning_rate": 1.9943728018757328e-05, + "loss": 1.4733, + "step": 4600 + }, + { + "epoch": 0.36704412338930104, + "grad_norm": 2.560692548751831, + "learning_rate": 1.9942164908167254e-05, + "loss": 1.4473, + "step": 4700 + }, + { + "epoch": 0.374853572823116, + "grad_norm": 2.703275680541992, + "learning_rate": 1.994060179757718e-05, + "loss": 1.4112, + "step": 4800 + }, + { + "epoch": 0.3826630222569309, + "grad_norm": 2.5784506797790527, + "learning_rate": 1.9939038686987106e-05, + "loss": 1.4035, + "step": 4900 + }, + { + "epoch": 0.3904724716907458, + "grad_norm": 2.6702706813812256, + "learning_rate": 1.9937475576397032e-05, + "loss": 1.3877, + "step": 5000 + }, + { + "epoch": 0.39828192112456073, + "grad_norm": 2.5400278568267822, + "learning_rate": 1.9935912465806958e-05, + "loss": 1.3443, + "step": 5100 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 2.167583703994751, + "learning_rate": 1.9934349355216884e-05, + "loss": 1.3422, + "step": 5200 + }, + { + "epoch": 0.4139008199921905, + "grad_norm": 3.1610641479492188, + "learning_rate": 1.993278624462681e-05, + "loss": 1.3036, + "step": 5300 + }, + { + "epoch": 0.4217102694260055, + "grad_norm": 2.5936696529388428, + "learning_rate": 1.9931223134036736e-05, + "loss": 1.2936, + "step": 5400 + }, + { + "epoch": 0.4295197188598204, + "grad_norm": 2.189955234527588, + "learning_rate": 1.992966002344666e-05, + "loss": 1.2722, + "step": 5500 + }, + { + "epoch": 0.4373291682936353, + "grad_norm": 2.7976956367492676, + "learning_rate": 1.9928096912856588e-05, + "loss": 1.2519, + "step": 5600 + }, + { + "epoch": 0.4451386177274502, + "grad_norm": 2.2419660091400146, + "learning_rate": 1.992653380226651e-05, + "loss": 1.2393, + "step": 5700 + }, + { + "epoch": 0.4529480671612651, + "grad_norm": 2.1277241706848145, + "learning_rate": 1.9924970691676437e-05, + "loss": 1.2274, + "step": 5800 + }, + { + "epoch": 0.4607575165950801, + "grad_norm": 3.7499144077301025, + "learning_rate": 1.9923407581086363e-05, + "loss": 1.2307, + "step": 5900 + }, + { + "epoch": 0.46856696602889497, + "grad_norm": 1.9480825662612915, + "learning_rate": 1.992184447049629e-05, + "loss": 1.2134, + "step": 6000 + }, + { + "epoch": 0.47637641546270987, + "grad_norm": 2.120570659637451, + "learning_rate": 1.9920281359906215e-05, + "loss": 1.2117, + "step": 6100 + }, + { + "epoch": 0.4841858648965248, + "grad_norm": 2.7811381816864014, + "learning_rate": 1.991871824931614e-05, + "loss": 1.1805, + "step": 6200 + }, + { + "epoch": 0.4919953143303397, + "grad_norm": 1.9131306409835815, + "learning_rate": 1.9917155138726067e-05, + "loss": 1.181, + "step": 6300 + }, + { + "epoch": 0.4998047637641546, + "grad_norm": 1.955204963684082, + "learning_rate": 1.9915592028135993e-05, + "loss": 1.1594, + "step": 6400 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 2.049238920211792, + "learning_rate": 1.991402891754592e-05, + "loss": 1.16, + "step": 6500 + }, + { + "epoch": 0.5154236626317844, + "grad_norm": 2.059785842895508, + "learning_rate": 1.991246580695584e-05, + "loss": 1.1147, + "step": 6600 + }, + { + "epoch": 0.5232331120655994, + "grad_norm": 2.1609349250793457, + "learning_rate": 1.991090269636577e-05, + "loss": 1.1218, + "step": 6700 + }, + { + "epoch": 0.5310425614994143, + "grad_norm": 2.1594114303588867, + "learning_rate": 1.9909339585775697e-05, + "loss": 1.1246, + "step": 6800 + }, + { + "epoch": 0.5388520109332292, + "grad_norm": 2.033703565597534, + "learning_rate": 1.990777647518562e-05, + "loss": 1.0668, + "step": 6900 + }, + { + "epoch": 0.5466614603670441, + "grad_norm": 2.054765224456787, + "learning_rate": 1.9906213364595545e-05, + "loss": 1.104, + "step": 7000 + }, + { + "epoch": 0.554470909800859, + "grad_norm": 2.6854045391082764, + "learning_rate": 1.9904650254005475e-05, + "loss": 1.0963, + "step": 7100 + }, + { + "epoch": 0.562280359234674, + "grad_norm": 2.482316255569458, + "learning_rate": 1.9903087143415397e-05, + "loss": 1.0836, + "step": 7200 + }, + { + "epoch": 0.5700898086684889, + "grad_norm": 1.9816139936447144, + "learning_rate": 1.9901524032825323e-05, + "loss": 1.0585, + "step": 7300 + }, + { + "epoch": 0.5778992581023038, + "grad_norm": 2.2517287731170654, + "learning_rate": 1.989996092223525e-05, + "loss": 1.0604, + "step": 7400 + }, + { + "epoch": 0.5857087075361187, + "grad_norm": 1.8760857582092285, + "learning_rate": 1.9898397811645175e-05, + "loss": 1.0492, + "step": 7500 + }, + { + "epoch": 0.5935181569699336, + "grad_norm": 2.0815093517303467, + "learning_rate": 1.98968347010551e-05, + "loss": 1.0513, + "step": 7600 + }, + { + "epoch": 0.6013276064037485, + "grad_norm": 2.0560126304626465, + "learning_rate": 1.9895271590465027e-05, + "loss": 1.0356, + "step": 7700 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 1.6335766315460205, + "learning_rate": 1.9893708479874953e-05, + "loss": 1.0171, + "step": 7800 + }, + { + "epoch": 0.6169465052713784, + "grad_norm": 2.0025687217712402, + "learning_rate": 1.989214536928488e-05, + "loss": 1.005, + "step": 7900 + }, + { + "epoch": 0.6247559547051933, + "grad_norm": 2.0700294971466064, + "learning_rate": 1.9890582258694805e-05, + "loss": 1.0341, + "step": 8000 + }, + { + "epoch": 0.6325654041390082, + "grad_norm": 1.6764856576919556, + "learning_rate": 1.9889019148104728e-05, + "loss": 1.0041, + "step": 8100 + }, + { + "epoch": 0.6403748535728231, + "grad_norm": 1.821441411972046, + "learning_rate": 1.9887456037514657e-05, + "loss": 1.0123, + "step": 8200 + }, + { + "epoch": 0.648184303006638, + "grad_norm": 1.8293089866638184, + "learning_rate": 1.988589292692458e-05, + "loss": 0.9997, + "step": 8300 + }, + { + "epoch": 0.655993752440453, + "grad_norm": 1.7432034015655518, + "learning_rate": 1.9884329816334506e-05, + "loss": 0.9687, + "step": 8400 + }, + { + "epoch": 0.6638032018742679, + "grad_norm": 1.683962345123291, + "learning_rate": 1.9882766705744436e-05, + "loss": 0.9589, + "step": 8500 + }, + { + "epoch": 0.6716126513080828, + "grad_norm": 2.0143861770629883, + "learning_rate": 1.9881203595154358e-05, + "loss": 0.9536, + "step": 8600 + }, + { + "epoch": 0.6794221007418977, + "grad_norm": 1.668605923652649, + "learning_rate": 1.9879640484564284e-05, + "loss": 0.9362, + "step": 8700 + }, + { + "epoch": 0.6872315501757126, + "grad_norm": 2.569770574569702, + "learning_rate": 1.987807737397421e-05, + "loss": 0.9344, + "step": 8800 + }, + { + "epoch": 0.6950409996095275, + "grad_norm": 2.044370412826538, + "learning_rate": 1.9876514263384136e-05, + "loss": 0.9277, + "step": 8900 + }, + { + "epoch": 0.7028504490433425, + "grad_norm": 1.6726328134536743, + "learning_rate": 1.9874951152794062e-05, + "loss": 0.938, + "step": 9000 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 1.9856268167495728, + "learning_rate": 1.9873388042203988e-05, + "loss": 0.9366, + "step": 9100 + }, + { + "epoch": 0.7184693479109723, + "grad_norm": 2.2362923622131348, + "learning_rate": 1.987182493161391e-05, + "loss": 0.9515, + "step": 9200 + }, + { + "epoch": 0.7262787973447872, + "grad_norm": 1.8397703170776367, + "learning_rate": 1.987027745212974e-05, + "loss": 0.9332, + "step": 9300 + }, + { + "epoch": 0.7340882467786021, + "grad_norm": 1.7469147443771362, + "learning_rate": 1.9868714341539666e-05, + "loss": 0.9441, + "step": 9400 + }, + { + "epoch": 0.7418976962124171, + "grad_norm": 2.095268726348877, + "learning_rate": 1.9867151230949592e-05, + "loss": 0.9272, + "step": 9500 + }, + { + "epoch": 0.749707145646232, + "grad_norm": 1.8756574392318726, + "learning_rate": 1.9865588120359518e-05, + "loss": 0.8917, + "step": 9600 + }, + { + "epoch": 0.7575165950800469, + "grad_norm": 1.924744725227356, + "learning_rate": 1.9864025009769444e-05, + "loss": 0.9079, + "step": 9700 + }, + { + "epoch": 0.7653260445138618, + "grad_norm": 1.5487234592437744, + "learning_rate": 1.9862461899179367e-05, + "loss": 0.9002, + "step": 9800 + }, + { + "epoch": 0.7731354939476767, + "grad_norm": 1.5049303770065308, + "learning_rate": 1.9860898788589293e-05, + "loss": 0.9027, + "step": 9900 + }, + { + "epoch": 0.7809449433814916, + "grad_norm": 1.5578070878982544, + "learning_rate": 1.9859335677999222e-05, + "loss": 0.8865, + "step": 10000 + }, + { + "epoch": 0.7887543928153066, + "grad_norm": 1.7090140581130981, + "learning_rate": 1.9857772567409145e-05, + "loss": 0.8829, + "step": 10100 + }, + { + "epoch": 0.7965638422491215, + "grad_norm": 1.9182331562042236, + "learning_rate": 1.985620945681907e-05, + "loss": 0.8718, + "step": 10200 + }, + { + "epoch": 0.8043732916829364, + "grad_norm": 1.6985232830047607, + "learning_rate": 1.9854646346228997e-05, + "loss": 0.875, + "step": 10300 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 1.786824107170105, + "learning_rate": 1.9853083235638923e-05, + "loss": 0.8721, + "step": 10400 + }, + { + "epoch": 0.8199921905505662, + "grad_norm": 1.4861210584640503, + "learning_rate": 1.985152012504885e-05, + "loss": 0.867, + "step": 10500 + }, + { + "epoch": 0.827801639984381, + "grad_norm": 1.886149287223816, + "learning_rate": 1.9849957014458775e-05, + "loss": 0.856, + "step": 10600 + }, + { + "epoch": 0.8356110894181961, + "grad_norm": 2.148075819015503, + "learning_rate": 1.98483939038687e-05, + "loss": 0.8496, + "step": 10700 + }, + { + "epoch": 0.843420538852011, + "grad_norm": 1.4386200904846191, + "learning_rate": 1.9846830793278627e-05, + "loss": 0.8558, + "step": 10800 + }, + { + "epoch": 0.8512299882858259, + "grad_norm": 1.9120664596557617, + "learning_rate": 1.9845267682688553e-05, + "loss": 0.8482, + "step": 10900 + }, + { + "epoch": 0.8590394377196408, + "grad_norm": 1.9040182828903198, + "learning_rate": 1.9843704572098476e-05, + "loss": 0.8572, + "step": 11000 + }, + { + "epoch": 0.8668488871534556, + "grad_norm": 2.2053062915802, + "learning_rate": 1.9842141461508405e-05, + "loss": 0.8764, + "step": 11100 + }, + { + "epoch": 0.8746583365872705, + "grad_norm": 1.398203730583191, + "learning_rate": 1.9840578350918328e-05, + "loss": 0.8626, + "step": 11200 + }, + { + "epoch": 0.8824677860210856, + "grad_norm": 1.7013752460479736, + "learning_rate": 1.9839015240328254e-05, + "loss": 0.8232, + "step": 11300 + }, + { + "epoch": 0.8902772354549005, + "grad_norm": 1.5767678022384644, + "learning_rate": 1.9837467760844083e-05, + "loss": 0.8624, + "step": 11400 + }, + { + "epoch": 0.8980866848887153, + "grad_norm": 1.8870518207550049, + "learning_rate": 1.983590465025401e-05, + "loss": 0.8552, + "step": 11500 + }, + { + "epoch": 0.9058961343225302, + "grad_norm": 1.8952587842941284, + "learning_rate": 1.983434153966393e-05, + "loss": 0.8352, + "step": 11600 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 2.1782963275909424, + "learning_rate": 1.9832778429073858e-05, + "loss": 0.8335, + "step": 11700 + }, + { + "epoch": 0.9215150331901601, + "grad_norm": 1.6963413953781128, + "learning_rate": 1.9831215318483784e-05, + "loss": 0.8253, + "step": 11800 + }, + { + "epoch": 0.929324482623975, + "grad_norm": 1.7919764518737793, + "learning_rate": 1.982965220789371e-05, + "loss": 0.8031, + "step": 11900 + }, + { + "epoch": 0.9371339320577899, + "grad_norm": 1.9020565748214722, + "learning_rate": 1.9828089097303636e-05, + "loss": 0.8086, + "step": 12000 + }, + { + "epoch": 0.9449433814916048, + "grad_norm": 1.617715835571289, + "learning_rate": 1.982652598671356e-05, + "loss": 0.807, + "step": 12100 + }, + { + "epoch": 0.9527528309254197, + "grad_norm": 1.8193297386169434, + "learning_rate": 1.9824962876123488e-05, + "loss": 0.7864, + "step": 12200 + }, + { + "epoch": 0.9605622803592346, + "grad_norm": 2.011845111846924, + "learning_rate": 1.9823399765533414e-05, + "loss": 0.8058, + "step": 12300 + }, + { + "epoch": 0.9683717297930496, + "grad_norm": 2.0425360202789307, + "learning_rate": 1.982183665494334e-05, + "loss": 0.8175, + "step": 12400 + }, + { + "epoch": 0.9761811792268645, + "grad_norm": 1.93047297000885, + "learning_rate": 1.9820273544353266e-05, + "loss": 0.7862, + "step": 12500 + }, + { + "epoch": 0.9839906286606794, + "grad_norm": 2.4548439979553223, + "learning_rate": 1.981871043376319e-05, + "loss": 0.7839, + "step": 12600 + }, + { + "epoch": 0.9918000780944943, + "grad_norm": 1.7791589498519897, + "learning_rate": 1.9817147323173118e-05, + "loss": 0.7722, + "step": 12700 + }, + { + "epoch": 0.9996095275283092, + "grad_norm": 1.955427885055542, + "learning_rate": 1.981558421258304e-05, + "loss": 0.8095, + "step": 12800 + }, + { + "epoch": 1.0074189769621242, + "grad_norm": 1.8196287155151367, + "learning_rate": 1.981402110199297e-05, + "loss": 0.7911, + "step": 12900 + }, + { + "epoch": 1.015228426395939, + "grad_norm": 2.349574565887451, + "learning_rate": 1.9812457991402892e-05, + "loss": 0.803, + "step": 13000 + }, + { + "epoch": 1.023037875829754, + "grad_norm": 1.7875525951385498, + "learning_rate": 1.981089488081282e-05, + "loss": 0.7797, + "step": 13100 + }, + { + "epoch": 1.0308473252635688, + "grad_norm": 1.403671145439148, + "learning_rate": 1.9809331770222744e-05, + "loss": 0.7832, + "step": 13200 + }, + { + "epoch": 1.0386567746973838, + "grad_norm": 1.5299042463302612, + "learning_rate": 1.980776865963267e-05, + "loss": 0.7699, + "step": 13300 + }, + { + "epoch": 1.0464662241311988, + "grad_norm": 1.6570796966552734, + "learning_rate": 1.9806205549042596e-05, + "loss": 0.751, + "step": 13400 + }, + { + "epoch": 1.0542756735650136, + "grad_norm": 2.0295419692993164, + "learning_rate": 1.9804658069558422e-05, + "loss": 0.7585, + "step": 13500 + }, + { + "epoch": 1.0620851229988286, + "grad_norm": 1.412665843963623, + "learning_rate": 1.9803094958968348e-05, + "loss": 0.7887, + "step": 13600 + }, + { + "epoch": 1.0698945724326434, + "grad_norm": 1.495368480682373, + "learning_rate": 1.9801531848378274e-05, + "loss": 0.7608, + "step": 13700 + }, + { + "epoch": 1.0777040218664584, + "grad_norm": 1.5675642490386963, + "learning_rate": 1.97999687377882e-05, + "loss": 0.7448, + "step": 13800 + }, + { + "epoch": 1.0855134713002734, + "grad_norm": 1.5208722352981567, + "learning_rate": 1.9798405627198126e-05, + "loss": 0.76, + "step": 13900 + }, + { + "epoch": 1.0933229207340882, + "grad_norm": 1.5352216958999634, + "learning_rate": 1.9796842516608052e-05, + "loss": 0.7692, + "step": 14000 + }, + { + "epoch": 1.1011323701679032, + "grad_norm": 1.8058335781097412, + "learning_rate": 1.979527940601798e-05, + "loss": 0.7488, + "step": 14100 + }, + { + "epoch": 1.108941819601718, + "grad_norm": 1.7374639511108398, + "learning_rate": 1.9793716295427904e-05, + "loss": 0.7553, + "step": 14200 + }, + { + "epoch": 1.116751269035533, + "grad_norm": 1.797935128211975, + "learning_rate": 1.9792153184837827e-05, + "loss": 0.7523, + "step": 14300 + }, + { + "epoch": 1.1245607184693478, + "grad_norm": 1.7061059474945068, + "learning_rate": 1.9790590074247756e-05, + "loss": 0.7549, + "step": 14400 + }, + { + "epoch": 1.1323701679031628, + "grad_norm": 1.8086135387420654, + "learning_rate": 1.978902696365768e-05, + "loss": 0.7544, + "step": 14500 + }, + { + "epoch": 1.1401796173369778, + "grad_norm": 1.6291933059692383, + "learning_rate": 1.9787463853067605e-05, + "loss": 0.7407, + "step": 14600 + }, + { + "epoch": 1.1479890667707926, + "grad_norm": 1.5078767538070679, + "learning_rate": 1.9785900742477534e-05, + "loss": 0.7087, + "step": 14700 + }, + { + "epoch": 1.1557985162046076, + "grad_norm": 1.7376632690429688, + "learning_rate": 1.9784337631887457e-05, + "loss": 0.7432, + "step": 14800 + }, + { + "epoch": 1.1636079656384224, + "grad_norm": 1.4681905508041382, + "learning_rate": 1.9782774521297383e-05, + "loss": 0.7228, + "step": 14900 + }, + { + "epoch": 1.1714174150722374, + "grad_norm": 1.805963397026062, + "learning_rate": 1.978121141070731e-05, + "loss": 0.7408, + "step": 15000 + }, + { + "epoch": 1.1792268645060524, + "grad_norm": 2.167956590652466, + "learning_rate": 1.9779648300117235e-05, + "loss": 0.7365, + "step": 15100 + }, + { + "epoch": 1.1870363139398672, + "grad_norm": 1.7935293912887573, + "learning_rate": 1.977808518952716e-05, + "loss": 0.7361, + "step": 15200 + }, + { + "epoch": 1.1948457633736822, + "grad_norm": 1.7757160663604736, + "learning_rate": 1.9776522078937087e-05, + "loss": 0.7229, + "step": 15300 + }, + { + "epoch": 1.202655212807497, + "grad_norm": 2.04471755027771, + "learning_rate": 1.977495896834701e-05, + "loss": 0.7133, + "step": 15400 + }, + { + "epoch": 1.210464662241312, + "grad_norm": 2.1509175300598145, + "learning_rate": 1.977341148886284e-05, + "loss": 0.7316, + "step": 15500 + }, + { + "epoch": 1.218274111675127, + "grad_norm": 1.8622750043869019, + "learning_rate": 1.9771848378272765e-05, + "loss": 0.7517, + "step": 15600 + }, + { + "epoch": 1.2260835611089418, + "grad_norm": 1.8098951578140259, + "learning_rate": 1.977028526768269e-05, + "loss": 0.695, + "step": 15700 + }, + { + "epoch": 1.2338930105427568, + "grad_norm": 1.8343333005905151, + "learning_rate": 1.9768722157092617e-05, + "loss": 0.7092, + "step": 15800 + }, + { + "epoch": 1.2417024599765716, + "grad_norm": 1.716015100479126, + "learning_rate": 1.9767159046502543e-05, + "loss": 0.6997, + "step": 15900 + }, + { + "epoch": 1.2495119094103866, + "grad_norm": 1.668656349182129, + "learning_rate": 1.9765595935912466e-05, + "loss": 0.7041, + "step": 16000 + }, + { + "epoch": 1.2573213588442016, + "grad_norm": 1.7509514093399048, + "learning_rate": 1.9764032825322392e-05, + "loss": 0.702, + "step": 16100 + }, + { + "epoch": 1.2651308082780164, + "grad_norm": 1.7006629705429077, + "learning_rate": 1.976246971473232e-05, + "loss": 0.7071, + "step": 16200 + }, + { + "epoch": 1.2729402577118314, + "grad_norm": 1.8491188287734985, + "learning_rate": 1.9760906604142244e-05, + "loss": 0.6678, + "step": 16300 + }, + { + "epoch": 1.2807497071456462, + "grad_norm": 1.7705504894256592, + "learning_rate": 1.975934349355217e-05, + "loss": 0.688, + "step": 16400 + }, + { + "epoch": 1.2885591565794612, + "grad_norm": 1.4014639854431152, + "learning_rate": 1.9757780382962096e-05, + "loss": 0.7038, + "step": 16500 + }, + { + "epoch": 1.2963686060132762, + "grad_norm": 1.8170675039291382, + "learning_rate": 1.9756217272372022e-05, + "loss": 0.6787, + "step": 16600 + }, + { + "epoch": 1.304178055447091, + "grad_norm": 1.4013011455535889, + "learning_rate": 1.9754654161781948e-05, + "loss": 0.6648, + "step": 16700 + }, + { + "epoch": 1.3119875048809058, + "grad_norm": 1.41355299949646, + "learning_rate": 1.9753091051191874e-05, + "loss": 0.6895, + "step": 16800 + }, + { + "epoch": 1.3197969543147208, + "grad_norm": 1.4750763177871704, + "learning_rate": 1.97515279406018e-05, + "loss": 0.6762, + "step": 16900 + }, + { + "epoch": 1.3276064037485358, + "grad_norm": 1.596587896347046, + "learning_rate": 1.9749964830011726e-05, + "loss": 0.652, + "step": 17000 + }, + { + "epoch": 1.3354158531823506, + "grad_norm": 1.7418196201324463, + "learning_rate": 1.9748401719421652e-05, + "loss": 0.6843, + "step": 17100 + }, + { + "epoch": 1.3432253026161656, + "grad_norm": 1.8203010559082031, + "learning_rate": 1.9746838608831574e-05, + "loss": 0.6897, + "step": 17200 + }, + { + "epoch": 1.3510347520499804, + "grad_norm": 1.56990647315979, + "learning_rate": 1.9745275498241504e-05, + "loss": 0.6736, + "step": 17300 + }, + { + "epoch": 1.3588442014837954, + "grad_norm": 1.767688512802124, + "learning_rate": 1.9743712387651426e-05, + "loss": 0.6803, + "step": 17400 + }, + { + "epoch": 1.3666536509176104, + "grad_norm": 1.43065345287323, + "learning_rate": 1.9742149277061352e-05, + "loss": 0.6546, + "step": 17500 + }, + { + "epoch": 1.3744631003514252, + "grad_norm": 1.8510737419128418, + "learning_rate": 1.974060179757718e-05, + "loss": 0.6697, + "step": 17600 + }, + { + "epoch": 1.3822725497852402, + "grad_norm": 1.7096539735794067, + "learning_rate": 1.9739038686987108e-05, + "loss": 0.653, + "step": 17700 + }, + { + "epoch": 1.390081999219055, + "grad_norm": 1.6283639669418335, + "learning_rate": 1.973747557639703e-05, + "loss": 0.6663, + "step": 17800 + }, + { + "epoch": 1.39789144865287, + "grad_norm": 1.794142723083496, + "learning_rate": 1.9735912465806956e-05, + "loss": 0.6689, + "step": 17900 + }, + { + "epoch": 1.405700898086685, + "grad_norm": 2.0754244327545166, + "learning_rate": 1.9734349355216882e-05, + "loss": 0.6647, + "step": 18000 + }, + { + "epoch": 1.4135103475204998, + "grad_norm": 1.6919443607330322, + "learning_rate": 1.973278624462681e-05, + "loss": 0.6597, + "step": 18100 + }, + { + "epoch": 1.4213197969543148, + "grad_norm": 1.7112871408462524, + "learning_rate": 1.9731223134036734e-05, + "loss": 0.6515, + "step": 18200 + }, + { + "epoch": 1.4291292463881295, + "grad_norm": 1.552016258239746, + "learning_rate": 1.972966002344666e-05, + "loss": 0.6585, + "step": 18300 + }, + { + "epoch": 1.4369386958219446, + "grad_norm": 1.7915266752243042, + "learning_rate": 1.9728096912856586e-05, + "loss": 0.6549, + "step": 18400 + }, + { + "epoch": 1.4447481452557596, + "grad_norm": 1.4755462408065796, + "learning_rate": 1.9726533802266513e-05, + "loss": 0.6448, + "step": 18500 + }, + { + "epoch": 1.4525575946895743, + "grad_norm": 1.3579930067062378, + "learning_rate": 1.972497069167644e-05, + "loss": 0.6372, + "step": 18600 + }, + { + "epoch": 1.4603670441233894, + "grad_norm": 2.1790480613708496, + "learning_rate": 1.9723407581086365e-05, + "loss": 0.6555, + "step": 18700 + }, + { + "epoch": 1.4681764935572041, + "grad_norm": 1.4425419569015503, + "learning_rate": 1.972184447049629e-05, + "loss": 0.6531, + "step": 18800 + }, + { + "epoch": 1.4759859429910192, + "grad_norm": 1.9316322803497314, + "learning_rate": 1.9720281359906213e-05, + "loss": 0.6457, + "step": 18900 + }, + { + "epoch": 1.4837953924248342, + "grad_norm": 1.3997584581375122, + "learning_rate": 1.971871824931614e-05, + "loss": 0.6336, + "step": 19000 + }, + { + "epoch": 1.491604841858649, + "grad_norm": 1.6301729679107666, + "learning_rate": 1.971715513872607e-05, + "loss": 0.6544, + "step": 19100 + }, + { + "epoch": 1.499414291292464, + "grad_norm": 1.620894193649292, + "learning_rate": 1.971559202813599e-05, + "loss": 0.6339, + "step": 19200 + }, + { + "epoch": 1.5072237407262787, + "grad_norm": 1.5146082639694214, + "learning_rate": 1.9714028917545917e-05, + "loss": 0.6573, + "step": 19300 + }, + { + "epoch": 1.5150331901600937, + "grad_norm": 1.775248646736145, + "learning_rate": 1.9712465806955843e-05, + "loss": 0.6256, + "step": 19400 + }, + { + "epoch": 1.5228426395939088, + "grad_norm": 1.5098567008972168, + "learning_rate": 1.971090269636577e-05, + "loss": 0.6453, + "step": 19500 + }, + { + "epoch": 1.5306520890277235, + "grad_norm": 1.3362939357757568, + "learning_rate": 1.9709339585775695e-05, + "loss": 0.6157, + "step": 19600 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.5511094331741333, + "learning_rate": 1.970777647518562e-05, + "loss": 0.662, + "step": 19700 + }, + { + "epoch": 1.5462709878953533, + "grad_norm": 1.2386195659637451, + "learning_rate": 1.9706228995701447e-05, + "loss": 0.6227, + "step": 19800 + }, + { + "epoch": 1.5540804373291683, + "grad_norm": 1.674869179725647, + "learning_rate": 1.9704665885111373e-05, + "loss": 0.6722, + "step": 19900 + }, + { + "epoch": 1.5618898867629833, + "grad_norm": 2.106680393218994, + "learning_rate": 1.97031027745213e-05, + "loss": 0.6314, + "step": 20000 + }, + { + "epoch": 1.5696993361967981, + "grad_norm": 1.7660146951675415, + "learning_rate": 1.9701539663931225e-05, + "loss": 0.6404, + "step": 20100 + }, + { + "epoch": 1.577508785630613, + "grad_norm": 1.62801992893219, + "learning_rate": 1.969997655334115e-05, + "loss": 0.6167, + "step": 20200 + }, + { + "epoch": 1.585318235064428, + "grad_norm": 1.5587072372436523, + "learning_rate": 1.9698413442751077e-05, + "loss": 0.5937, + "step": 20300 + }, + { + "epoch": 1.593127684498243, + "grad_norm": 1.4757510423660278, + "learning_rate": 1.9696850332161003e-05, + "loss": 0.6012, + "step": 20400 + }, + { + "epoch": 1.600937133932058, + "grad_norm": 1.6553717851638794, + "learning_rate": 1.9695287221570926e-05, + "loss": 0.6101, + "step": 20500 + }, + { + "epoch": 1.6087465833658727, + "grad_norm": 1.7269705533981323, + "learning_rate": 1.9693724110980855e-05, + "loss": 0.6116, + "step": 20600 + }, + { + "epoch": 1.6165560327996875, + "grad_norm": 1.8082709312438965, + "learning_rate": 1.9692161000390778e-05, + "loss": 0.6118, + "step": 20700 + }, + { + "epoch": 1.6243654822335025, + "grad_norm": 1.6342484951019287, + "learning_rate": 1.9690597889800704e-05, + "loss": 0.6136, + "step": 20800 + }, + { + "epoch": 1.6321749316673175, + "grad_norm": 1.4139142036437988, + "learning_rate": 1.9689034779210633e-05, + "loss": 0.5913, + "step": 20900 + }, + { + "epoch": 1.6399843811011323, + "grad_norm": 1.691498041152954, + "learning_rate": 1.9687471668620556e-05, + "loss": 0.6069, + "step": 21000 + }, + { + "epoch": 1.6477938305349473, + "grad_norm": 1.6393285989761353, + "learning_rate": 1.9685908558030482e-05, + "loss": 0.6143, + "step": 21100 + }, + { + "epoch": 1.655603279968762, + "grad_norm": 1.4533542394638062, + "learning_rate": 1.9684345447440408e-05, + "loss": 0.6041, + "step": 21200 + }, + { + "epoch": 1.6634127294025771, + "grad_norm": 1.7511471509933472, + "learning_rate": 1.9682782336850334e-05, + "loss": 0.6059, + "step": 21300 + }, + { + "epoch": 1.6712221788363921, + "grad_norm": 1.6999716758728027, + "learning_rate": 1.968121922626026e-05, + "loss": 0.6234, + "step": 21400 + }, + { + "epoch": 1.679031628270207, + "grad_norm": 1.5730124711990356, + "learning_rate": 1.9679656115670186e-05, + "loss": 0.609, + "step": 21500 + }, + { + "epoch": 1.686841077704022, + "grad_norm": 1.657443642616272, + "learning_rate": 1.967809300508011e-05, + "loss": 0.612, + "step": 21600 + }, + { + "epoch": 1.6946505271378367, + "grad_norm": 1.9014359712600708, + "learning_rate": 1.9676529894490038e-05, + "loss": 0.5932, + "step": 21700 + }, + { + "epoch": 1.7024599765716517, + "grad_norm": 1.1905044317245483, + "learning_rate": 1.9674966783899964e-05, + "loss": 0.5974, + "step": 21800 + }, + { + "epoch": 1.7102694260054667, + "grad_norm": 1.6017484664916992, + "learning_rate": 1.9673403673309887e-05, + "loss": 0.5872, + "step": 21900 + }, + { + "epoch": 1.7180788754392815, + "grad_norm": 1.674900770187378, + "learning_rate": 1.9671856193825716e-05, + "loss": 0.6008, + "step": 22000 + }, + { + "epoch": 1.7258883248730963, + "grad_norm": 1.4483686685562134, + "learning_rate": 1.9670293083235642e-05, + "loss": 0.5826, + "step": 22100 + }, + { + "epoch": 1.7336977743069113, + "grad_norm": 1.7324950695037842, + "learning_rate": 1.9668729972645565e-05, + "loss": 0.6, + "step": 22200 + }, + { + "epoch": 1.7415072237407263, + "grad_norm": 2.0077033042907715, + "learning_rate": 1.966716686205549e-05, + "loss": 0.5906, + "step": 22300 + }, + { + "epoch": 1.7493166731745413, + "grad_norm": 1.6235421895980835, + "learning_rate": 1.966560375146542e-05, + "loss": 0.6161, + "step": 22400 + }, + { + "epoch": 1.757126122608356, + "grad_norm": 1.6211363077163696, + "learning_rate": 1.9664040640875343e-05, + "loss": 0.584, + "step": 22500 + }, + { + "epoch": 1.7649355720421709, + "grad_norm": 1.4874013662338257, + "learning_rate": 1.966247753028527e-05, + "loss": 0.5883, + "step": 22600 + }, + { + "epoch": 1.772745021475986, + "grad_norm": 1.7945176362991333, + "learning_rate": 1.9660914419695195e-05, + "loss": 0.5939, + "step": 22700 + }, + { + "epoch": 1.780554470909801, + "grad_norm": 1.5537645816802979, + "learning_rate": 1.965935130910512e-05, + "loss": 0.5916, + "step": 22800 + }, + { + "epoch": 1.788363920343616, + "grad_norm": 1.5249923467636108, + "learning_rate": 1.9657788198515047e-05, + "loss": 0.5751, + "step": 22900 + }, + { + "epoch": 1.7961733697774307, + "grad_norm": 1.017974615097046, + "learning_rate": 1.9656225087924973e-05, + "loss": 0.5799, + "step": 23000 + }, + { + "epoch": 1.8039828192112455, + "grad_norm": 1.6119155883789062, + "learning_rate": 1.96546619773349e-05, + "loss": 0.5852, + "step": 23100 + }, + { + "epoch": 1.8117922686450605, + "grad_norm": 1.5619168281555176, + "learning_rate": 1.9653098866744825e-05, + "loss": 0.581, + "step": 23200 + }, + { + "epoch": 1.8196017180788755, + "grad_norm": 1.7065399885177612, + "learning_rate": 1.965153575615475e-05, + "loss": 0.5917, + "step": 23300 + }, + { + "epoch": 1.8274111675126905, + "grad_norm": 1.5742697715759277, + "learning_rate": 1.9649972645564673e-05, + "loss": 0.5745, + "step": 23400 + }, + { + "epoch": 1.8352206169465053, + "grad_norm": 1.9307541847229004, + "learning_rate": 1.9648409534974603e-05, + "loss": 0.5748, + "step": 23500 + }, + { + "epoch": 1.84303006638032, + "grad_norm": 1.4289742708206177, + "learning_rate": 1.9646846424384525e-05, + "loss": 0.5657, + "step": 23600 + }, + { + "epoch": 1.850839515814135, + "grad_norm": 1.5857402086257935, + "learning_rate": 1.964528331379445e-05, + "loss": 0.5535, + "step": 23700 + }, + { + "epoch": 1.85864896524795, + "grad_norm": 1.8342182636260986, + "learning_rate": 1.9643720203204377e-05, + "loss": 0.5758, + "step": 23800 + }, + { + "epoch": 1.866458414681765, + "grad_norm": 1.7719389200210571, + "learning_rate": 1.9642157092614303e-05, + "loss": 0.5925, + "step": 23900 + }, + { + "epoch": 1.8742678641155799, + "grad_norm": 1.4628547430038452, + "learning_rate": 1.964059398202423e-05, + "loss": 0.5815, + "step": 24000 + }, + { + "epoch": 1.8820773135493947, + "grad_norm": 1.7836227416992188, + "learning_rate": 1.9639030871434155e-05, + "loss": 0.5608, + "step": 24100 + }, + { + "epoch": 1.8898867629832097, + "grad_norm": 1.4340027570724487, + "learning_rate": 1.963748339194998e-05, + "loss": 0.5669, + "step": 24200 + }, + { + "epoch": 1.8976962124170247, + "grad_norm": 1.8303942680358887, + "learning_rate": 1.9635920281359907e-05, + "loss": 0.5532, + "step": 24300 + }, + { + "epoch": 1.9055056618508395, + "grad_norm": 1.656308650970459, + "learning_rate": 1.9634357170769833e-05, + "loss": 0.5686, + "step": 24400 + }, + { + "epoch": 1.9133151112846545, + "grad_norm": 1.7295633554458618, + "learning_rate": 1.963279406017976e-05, + "loss": 0.5715, + "step": 24500 + }, + { + "epoch": 1.9211245607184693, + "grad_norm": 1.872109293937683, + "learning_rate": 1.9631230949589685e-05, + "loss": 0.55, + "step": 24600 + }, + { + "epoch": 1.9289340101522843, + "grad_norm": 1.568379282951355, + "learning_rate": 1.962966783899961e-05, + "loss": 0.5669, + "step": 24700 + }, + { + "epoch": 1.9367434595860993, + "grad_norm": 1.252161979675293, + "learning_rate": 1.9628104728409537e-05, + "loss": 0.5588, + "step": 24800 + }, + { + "epoch": 1.944552909019914, + "grad_norm": 1.4522193670272827, + "learning_rate": 1.9626541617819463e-05, + "loss": 0.5669, + "step": 24900 + }, + { + "epoch": 1.9523623584537289, + "grad_norm": 1.6285183429718018, + "learning_rate": 1.962497850722939e-05, + "loss": 0.5381, + "step": 25000 + }, + { + "epoch": 1.9601718078875439, + "grad_norm": 1.4675999879837036, + "learning_rate": 1.9623415396639312e-05, + "loss": 0.5656, + "step": 25100 + }, + { + "epoch": 1.9679812573213589, + "grad_norm": 1.4689419269561768, + "learning_rate": 1.9621852286049238e-05, + "loss": 0.5677, + "step": 25200 + }, + { + "epoch": 1.9757907067551739, + "grad_norm": 1.5669220685958862, + "learning_rate": 1.9620289175459167e-05, + "loss": 0.5549, + "step": 25300 + }, + { + "epoch": 1.9836001561889887, + "grad_norm": 1.3576539754867554, + "learning_rate": 1.961872606486909e-05, + "loss": 0.5409, + "step": 25400 + }, + { + "epoch": 1.9914096056228034, + "grad_norm": 1.6081891059875488, + "learning_rate": 1.9617162954279016e-05, + "loss": 0.5733, + "step": 25500 + }, + { + "epoch": 1.9992190550566185, + "grad_norm": 1.2406030893325806, + "learning_rate": 1.9615599843688942e-05, + "loss": 0.5461, + "step": 25600 + }, + { + "epoch": 2.0070285044904335, + "grad_norm": 1.4401236772537231, + "learning_rate": 1.9614036733098868e-05, + "loss": 0.5613, + "step": 25700 + }, + { + "epoch": 2.0148379539242485, + "grad_norm": 1.7388012409210205, + "learning_rate": 1.9612473622508794e-05, + "loss": 0.5468, + "step": 25800 + }, + { + "epoch": 2.0226474033580635, + "grad_norm": 1.2448303699493408, + "learning_rate": 1.961091051191872e-05, + "loss": 0.5399, + "step": 25900 + }, + { + "epoch": 2.030456852791878, + "grad_norm": 1.4686857461929321, + "learning_rate": 1.9609347401328646e-05, + "loss": 0.5671, + "step": 26000 + }, + { + "epoch": 2.038266302225693, + "grad_norm": 1.6793551445007324, + "learning_rate": 1.9607784290738572e-05, + "loss": 0.557, + "step": 26100 + }, + { + "epoch": 2.046075751659508, + "grad_norm": 1.5726957321166992, + "learning_rate": 1.9606221180148498e-05, + "loss": 0.5456, + "step": 26200 + }, + { + "epoch": 2.053885201093323, + "grad_norm": 1.5355794429779053, + "learning_rate": 1.960465806955842e-05, + "loss": 0.5424, + "step": 26300 + }, + { + "epoch": 2.0616946505271376, + "grad_norm": 1.3061555624008179, + "learning_rate": 1.960309495896835e-05, + "loss": 0.531, + "step": 26400 + }, + { + "epoch": 2.0695040999609526, + "grad_norm": 1.2583160400390625, + "learning_rate": 1.9601531848378276e-05, + "loss": 0.5421, + "step": 26500 + }, + { + "epoch": 2.0773135493947676, + "grad_norm": 1.578881025314331, + "learning_rate": 1.95999687377882e-05, + "loss": 0.5453, + "step": 26600 + }, + { + "epoch": 2.0851229988285827, + "grad_norm": 1.240598201751709, + "learning_rate": 1.9598405627198125e-05, + "loss": 0.5509, + "step": 26700 + }, + { + "epoch": 2.0929324482623977, + "grad_norm": 1.6285960674285889, + "learning_rate": 1.959684251660805e-05, + "loss": 0.5509, + "step": 26800 + }, + { + "epoch": 2.1007418976962122, + "grad_norm": 1.7065869569778442, + "learning_rate": 1.9595279406017977e-05, + "loss": 0.5367, + "step": 26900 + }, + { + "epoch": 2.1085513471300272, + "grad_norm": 1.1276403665542603, + "learning_rate": 1.9593731926533803e-05, + "loss": 0.5394, + "step": 27000 + }, + { + "epoch": 2.1163607965638422, + "grad_norm": 1.3322559595108032, + "learning_rate": 1.9592168815943732e-05, + "loss": 0.5393, + "step": 27100 + }, + { + "epoch": 2.1241702459976572, + "grad_norm": 1.2126073837280273, + "learning_rate": 1.9590605705353655e-05, + "loss": 0.5299, + "step": 27200 + }, + { + "epoch": 2.1319796954314723, + "grad_norm": 1.2466312646865845, + "learning_rate": 1.958904259476358e-05, + "loss": 0.5389, + "step": 27300 + }, + { + "epoch": 2.139789144865287, + "grad_norm": 1.2818732261657715, + "learning_rate": 1.9587479484173507e-05, + "loss": 0.5344, + "step": 27400 + }, + { + "epoch": 2.147598594299102, + "grad_norm": 1.2986412048339844, + "learning_rate": 1.9585916373583433e-05, + "loss": 0.5332, + "step": 27500 + }, + { + "epoch": 2.155408043732917, + "grad_norm": 1.1982731819152832, + "learning_rate": 1.958435326299336e-05, + "loss": 0.5448, + "step": 27600 + }, + { + "epoch": 2.163217493166732, + "grad_norm": 1.0930027961730957, + "learning_rate": 1.9582790152403285e-05, + "loss": 0.5396, + "step": 27700 + }, + { + "epoch": 2.171026942600547, + "grad_norm": 1.175718069076538, + "learning_rate": 1.9581227041813208e-05, + "loss": 0.5245, + "step": 27800 + }, + { + "epoch": 2.1788363920343614, + "grad_norm": 1.9274697303771973, + "learning_rate": 1.9579663931223137e-05, + "loss": 0.5346, + "step": 27900 + }, + { + "epoch": 2.1866458414681764, + "grad_norm": 1.5411278009414673, + "learning_rate": 1.9578100820633063e-05, + "loss": 0.5537, + "step": 28000 + }, + { + "epoch": 2.1944552909019914, + "grad_norm": 1.6423187255859375, + "learning_rate": 1.9576537710042986e-05, + "loss": 0.526, + "step": 28100 + }, + { + "epoch": 2.2022647403358064, + "grad_norm": 1.412419319152832, + "learning_rate": 1.9574974599452915e-05, + "loss": 0.5279, + "step": 28200 + }, + { + "epoch": 2.2100741897696214, + "grad_norm": 1.2464630603790283, + "learning_rate": 1.9573411488862838e-05, + "loss": 0.5505, + "step": 28300 + }, + { + "epoch": 2.217883639203436, + "grad_norm": 1.537739872932434, + "learning_rate": 1.9571848378272764e-05, + "loss": 0.5296, + "step": 28400 + }, + { + "epoch": 2.225693088637251, + "grad_norm": 1.4067381620407104, + "learning_rate": 1.957030089878859e-05, + "loss": 0.5252, + "step": 28500 + }, + { + "epoch": 2.233502538071066, + "grad_norm": 1.662540078163147, + "learning_rate": 1.956873778819852e-05, + "loss": 0.5329, + "step": 28600 + }, + { + "epoch": 2.241311987504881, + "grad_norm": 1.4438475370407104, + "learning_rate": 1.956717467760844e-05, + "loss": 0.5289, + "step": 28700 + }, + { + "epoch": 2.2491214369386956, + "grad_norm": 1.293503999710083, + "learning_rate": 1.9565611567018368e-05, + "loss": 0.522, + "step": 28800 + }, + { + "epoch": 2.2569308863725106, + "grad_norm": 1.2435001134872437, + "learning_rate": 1.9564048456428294e-05, + "loss": 0.534, + "step": 28900 + }, + { + "epoch": 2.2647403358063256, + "grad_norm": 1.3798662424087524, + "learning_rate": 1.956248534583822e-05, + "loss": 0.5188, + "step": 29000 + }, + { + "epoch": 2.2725497852401406, + "grad_norm": 1.6525225639343262, + "learning_rate": 1.9560922235248146e-05, + "loss": 0.5463, + "step": 29100 + }, + { + "epoch": 2.2803592346739556, + "grad_norm": 1.3094666004180908, + "learning_rate": 1.955935912465807e-05, + "loss": 0.5279, + "step": 29200 + }, + { + "epoch": 2.28816868410777, + "grad_norm": 1.3461250066757202, + "learning_rate": 1.9557796014067998e-05, + "loss": 0.5428, + "step": 29300 + }, + { + "epoch": 2.295978133541585, + "grad_norm": 1.3624392747879028, + "learning_rate": 1.9556232903477924e-05, + "loss": 0.5061, + "step": 29400 + }, + { + "epoch": 2.3037875829754, + "grad_norm": 1.327601671218872, + "learning_rate": 1.955466979288785e-05, + "loss": 0.5279, + "step": 29500 + }, + { + "epoch": 2.311597032409215, + "grad_norm": 1.3067333698272705, + "learning_rate": 1.9553106682297772e-05, + "loss": 0.5338, + "step": 29600 + }, + { + "epoch": 2.31940648184303, + "grad_norm": 1.455754041671753, + "learning_rate": 1.95515435717077e-05, + "loss": 0.5233, + "step": 29700 + }, + { + "epoch": 2.327215931276845, + "grad_norm": 1.3276084661483765, + "learning_rate": 1.9549980461117624e-05, + "loss": 0.5119, + "step": 29800 + }, + { + "epoch": 2.33502538071066, + "grad_norm": 1.1605360507965088, + "learning_rate": 1.954841735052755e-05, + "loss": 0.5075, + "step": 29900 + }, + { + "epoch": 2.342834830144475, + "grad_norm": 1.316475749015808, + "learning_rate": 1.9546854239937476e-05, + "loss": 0.5072, + "step": 30000 + }, + { + "epoch": 2.35064427957829, + "grad_norm": 1.1585702896118164, + "learning_rate": 1.9545291129347402e-05, + "loss": 0.5042, + "step": 30100 + }, + { + "epoch": 2.358453729012105, + "grad_norm": 1.2547882795333862, + "learning_rate": 1.9543728018757328e-05, + "loss": 0.5152, + "step": 30200 + }, + { + "epoch": 2.3662631784459194, + "grad_norm": 1.4408245086669922, + "learning_rate": 1.9542164908167254e-05, + "loss": 0.5051, + "step": 30300 + }, + { + "epoch": 2.3740726278797344, + "grad_norm": 1.256428837776184, + "learning_rate": 1.954060179757718e-05, + "loss": 0.5213, + "step": 30400 + }, + { + "epoch": 2.3818820773135494, + "grad_norm": 1.3025598526000977, + "learning_rate": 1.9539038686987106e-05, + "loss": 0.5218, + "step": 30500 + }, + { + "epoch": 2.3896915267473644, + "grad_norm": 1.343064546585083, + "learning_rate": 1.9537475576397032e-05, + "loss": 0.5128, + "step": 30600 + }, + { + "epoch": 2.3975009761811794, + "grad_norm": 1.4012939929962158, + "learning_rate": 1.9535912465806955e-05, + "loss": 0.5071, + "step": 30700 + }, + { + "epoch": 2.405310425614994, + "grad_norm": 1.2314071655273438, + "learning_rate": 1.9534349355216884e-05, + "loss": 0.5171, + "step": 30800 + }, + { + "epoch": 2.413119875048809, + "grad_norm": 1.502560019493103, + "learning_rate": 1.953278624462681e-05, + "loss": 0.5123, + "step": 30900 + }, + { + "epoch": 2.420929324482624, + "grad_norm": 1.1332653760910034, + "learning_rate": 1.9531223134036733e-05, + "loss": 0.5016, + "step": 31000 + }, + { + "epoch": 2.428738773916439, + "grad_norm": 1.2297821044921875, + "learning_rate": 1.952966002344666e-05, + "loss": 0.5217, + "step": 31100 + }, + { + "epoch": 2.436548223350254, + "grad_norm": 1.3758752346038818, + "learning_rate": 1.9528096912856585e-05, + "loss": 0.5096, + "step": 31200 + }, + { + "epoch": 2.4443576727840686, + "grad_norm": 1.2562918663024902, + "learning_rate": 1.952653380226651e-05, + "loss": 0.5033, + "step": 31300 + }, + { + "epoch": 2.4521671222178836, + "grad_norm": 1.238236904144287, + "learning_rate": 1.9524970691676437e-05, + "loss": 0.5022, + "step": 31400 + }, + { + "epoch": 2.4599765716516986, + "grad_norm": 1.1074494123458862, + "learning_rate": 1.9523407581086363e-05, + "loss": 0.5028, + "step": 31500 + }, + { + "epoch": 2.4677860210855136, + "grad_norm": 1.2257527112960815, + "learning_rate": 1.952184447049629e-05, + "loss": 0.5175, + "step": 31600 + }, + { + "epoch": 2.4755954705193286, + "grad_norm": 1.3490757942199707, + "learning_rate": 1.9520281359906215e-05, + "loss": 0.5097, + "step": 31700 + }, + { + "epoch": 2.483404919953143, + "grad_norm": 1.2227071523666382, + "learning_rate": 1.951871824931614e-05, + "loss": 0.5318, + "step": 31800 + }, + { + "epoch": 2.491214369386958, + "grad_norm": 1.5109745264053345, + "learning_rate": 1.9517155138726067e-05, + "loss": 0.5092, + "step": 31900 + }, + { + "epoch": 2.499023818820773, + "grad_norm": 1.3931546211242676, + "learning_rate": 1.9515592028135993e-05, + "loss": 0.5014, + "step": 32000 + }, + { + "epoch": 2.506833268254588, + "grad_norm": 1.350993275642395, + "learning_rate": 1.951402891754592e-05, + "loss": 0.4969, + "step": 32100 + }, + { + "epoch": 2.514642717688403, + "grad_norm": 1.1680783033370972, + "learning_rate": 1.9512465806955845e-05, + "loss": 0.5155, + "step": 32200 + }, + { + "epoch": 2.5224521671222178, + "grad_norm": 1.8324127197265625, + "learning_rate": 1.951090269636577e-05, + "loss": 0.5005, + "step": 32300 + }, + { + "epoch": 2.5302616165560328, + "grad_norm": 1.244734525680542, + "learning_rate": 1.9509339585775694e-05, + "loss": 0.5115, + "step": 32400 + }, + { + "epoch": 2.5380710659898478, + "grad_norm": 1.1397197246551514, + "learning_rate": 1.950777647518562e-05, + "loss": 0.4764, + "step": 32500 + }, + { + "epoch": 2.545880515423663, + "grad_norm": 1.2140659093856812, + "learning_rate": 1.950621336459555e-05, + "loss": 0.5141, + "step": 32600 + }, + { + "epoch": 2.553689964857478, + "grad_norm": 1.2659382820129395, + "learning_rate": 1.9504665885111375e-05, + "loss": 0.5002, + "step": 32700 + }, + { + "epoch": 2.5614994142912924, + "grad_norm": 1.4882584810256958, + "learning_rate": 1.9503102774521298e-05, + "loss": 0.5061, + "step": 32800 + }, + { + "epoch": 2.5693088637251074, + "grad_norm": 1.4329360723495483, + "learning_rate": 1.9501539663931224e-05, + "loss": 0.5204, + "step": 32900 + }, + { + "epoch": 2.5771183131589224, + "grad_norm": 1.341886281967163, + "learning_rate": 1.949997655334115e-05, + "loss": 0.5008, + "step": 33000 + }, + { + "epoch": 2.5849277625927374, + "grad_norm": 1.3239928483963013, + "learning_rate": 1.9498413442751076e-05, + "loss": 0.4847, + "step": 33100 + }, + { + "epoch": 2.5927372120265524, + "grad_norm": 1.525991678237915, + "learning_rate": 1.9496850332161002e-05, + "loss": 0.4975, + "step": 33200 + }, + { + "epoch": 2.600546661460367, + "grad_norm": 1.2678552865982056, + "learning_rate": 1.9495287221570928e-05, + "loss": 0.4888, + "step": 33300 + }, + { + "epoch": 2.608356110894182, + "grad_norm": 1.1323553323745728, + "learning_rate": 1.9493724110980854e-05, + "loss": 0.5244, + "step": 33400 + }, + { + "epoch": 2.616165560327997, + "grad_norm": 1.548802137374878, + "learning_rate": 1.949216100039078e-05, + "loss": 0.4921, + "step": 33500 + }, + { + "epoch": 2.6239750097618115, + "grad_norm": 1.1082913875579834, + "learning_rate": 1.9490597889800706e-05, + "loss": 0.492, + "step": 33600 + }, + { + "epoch": 2.631784459195627, + "grad_norm": 1.1769174337387085, + "learning_rate": 1.9489034779210632e-05, + "loss": 0.4915, + "step": 33700 + }, + { + "epoch": 2.6395939086294415, + "grad_norm": 1.846003770828247, + "learning_rate": 1.9487471668620558e-05, + "loss": 0.5014, + "step": 33800 + }, + { + "epoch": 2.6474033580632566, + "grad_norm": 1.341156244277954, + "learning_rate": 1.948590855803048e-05, + "loss": 0.5003, + "step": 33900 + }, + { + "epoch": 2.6552128074970716, + "grad_norm": 1.592016577720642, + "learning_rate": 1.9484345447440406e-05, + "loss": 0.5017, + "step": 34000 + }, + { + "epoch": 2.663022256930886, + "grad_norm": 1.320487380027771, + "learning_rate": 1.9482782336850336e-05, + "loss": 0.4892, + "step": 34100 + }, + { + "epoch": 2.670831706364701, + "grad_norm": 1.2302286624908447, + "learning_rate": 1.948121922626026e-05, + "loss": 0.4963, + "step": 34200 + }, + { + "epoch": 2.678641155798516, + "grad_norm": 1.1193184852600098, + "learning_rate": 1.9479656115670185e-05, + "loss": 0.4925, + "step": 34300 + }, + { + "epoch": 2.686450605232331, + "grad_norm": 1.0363550186157227, + "learning_rate": 1.947809300508011e-05, + "loss": 0.4986, + "step": 34400 + }, + { + "epoch": 2.694260054666146, + "grad_norm": 1.32547926902771, + "learning_rate": 1.9476529894490037e-05, + "loss": 0.475, + "step": 34500 + }, + { + "epoch": 2.7020695040999607, + "grad_norm": 1.1739405393600464, + "learning_rate": 1.9474966783899963e-05, + "loss": 0.5149, + "step": 34600 + }, + { + "epoch": 2.7098789535337757, + "grad_norm": 1.2484989166259766, + "learning_rate": 1.947340367330989e-05, + "loss": 0.4906, + "step": 34700 + }, + { + "epoch": 2.7176884029675907, + "grad_norm": 1.2752107381820679, + "learning_rate": 1.9471840562719815e-05, + "loss": 0.4871, + "step": 34800 + }, + { + "epoch": 2.7254978524014057, + "grad_norm": 1.3706623315811157, + "learning_rate": 1.947027745212974e-05, + "loss": 0.4849, + "step": 34900 + }, + { + "epoch": 2.7333073018352207, + "grad_norm": 1.2365776300430298, + "learning_rate": 1.9468714341539667e-05, + "loss": 0.495, + "step": 35000 + }, + { + "epoch": 2.7411167512690353, + "grad_norm": 1.2424877882003784, + "learning_rate": 1.946715123094959e-05, + "loss": 0.4832, + "step": 35100 + }, + { + "epoch": 2.7489262007028503, + "grad_norm": 1.2801834344863892, + "learning_rate": 1.946558812035952e-05, + "loss": 0.4754, + "step": 35200 + }, + { + "epoch": 2.7567356501366653, + "grad_norm": 1.2843778133392334, + "learning_rate": 1.946402500976944e-05, + "loss": 0.4698, + "step": 35300 + }, + { + "epoch": 2.7645450995704803, + "grad_norm": 1.2793940305709839, + "learning_rate": 1.9462461899179367e-05, + "loss": 0.4825, + "step": 35400 + }, + { + "epoch": 2.7723545490042953, + "grad_norm": 1.1678388118743896, + "learning_rate": 1.9460898788589297e-05, + "loss": 0.4886, + "step": 35500 + }, + { + "epoch": 2.78016399843811, + "grad_norm": 0.9187774658203125, + "learning_rate": 1.945933567799922e-05, + "loss": 0.4884, + "step": 35600 + }, + { + "epoch": 2.787973447871925, + "grad_norm": 1.246982216835022, + "learning_rate": 1.9457772567409145e-05, + "loss": 0.4809, + "step": 35700 + }, + { + "epoch": 2.79578289730574, + "grad_norm": 1.5185933113098145, + "learning_rate": 1.945620945681907e-05, + "loss": 0.4741, + "step": 35800 + }, + { + "epoch": 2.803592346739555, + "grad_norm": 1.198704481124878, + "learning_rate": 1.9454646346228997e-05, + "loss": 0.4753, + "step": 35900 + }, + { + "epoch": 2.81140179617337, + "grad_norm": 1.0310161113739014, + "learning_rate": 1.9453098866744823e-05, + "loss": 0.4826, + "step": 36000 + }, + { + "epoch": 2.8192112456071845, + "grad_norm": 1.30093514919281, + "learning_rate": 1.945153575615475e-05, + "loss": 0.4695, + "step": 36100 + }, + { + "epoch": 2.8270206950409995, + "grad_norm": 1.268122673034668, + "learning_rate": 1.9449972645564675e-05, + "loss": 0.4944, + "step": 36200 + }, + { + "epoch": 2.8348301444748145, + "grad_norm": 1.3303180932998657, + "learning_rate": 1.94484095349746e-05, + "loss": 0.4773, + "step": 36300 + }, + { + "epoch": 2.8426395939086295, + "grad_norm": 1.1513735055923462, + "learning_rate": 1.9446846424384527e-05, + "loss": 0.4672, + "step": 36400 + }, + { + "epoch": 2.8504490433424445, + "grad_norm": 1.1249432563781738, + "learning_rate": 1.9445283313794453e-05, + "loss": 0.496, + "step": 36500 + }, + { + "epoch": 2.858258492776259, + "grad_norm": 1.6745643615722656, + "learning_rate": 1.944372020320438e-05, + "loss": 0.465, + "step": 36600 + }, + { + "epoch": 2.866067942210074, + "grad_norm": 1.2362009286880493, + "learning_rate": 1.9442157092614305e-05, + "loss": 0.4788, + "step": 36700 + }, + { + "epoch": 2.873877391643889, + "grad_norm": 1.3232982158660889, + "learning_rate": 1.9440593982024228e-05, + "loss": 0.4748, + "step": 36800 + }, + { + "epoch": 2.881686841077704, + "grad_norm": 1.561949610710144, + "learning_rate": 1.9439030871434154e-05, + "loss": 0.4854, + "step": 36900 + }, + { + "epoch": 2.889496290511519, + "grad_norm": 1.1138705015182495, + "learning_rate": 1.9437467760844083e-05, + "loss": 0.47, + "step": 37000 + }, + { + "epoch": 2.8973057399453337, + "grad_norm": 1.2688237428665161, + "learning_rate": 1.9435904650254006e-05, + "loss": 0.4795, + "step": 37100 + }, + { + "epoch": 2.9051151893791487, + "grad_norm": 1.3343580961227417, + "learning_rate": 1.9434341539663932e-05, + "loss": 0.4688, + "step": 37200 + }, + { + "epoch": 2.9129246388129637, + "grad_norm": 1.3715122938156128, + "learning_rate": 1.9432778429073858e-05, + "loss": 0.4846, + "step": 37300 + }, + { + "epoch": 2.9207340882467787, + "grad_norm": 1.0764130353927612, + "learning_rate": 1.9431215318483784e-05, + "loss": 0.4589, + "step": 37400 + }, + { + "epoch": 2.9285435376805937, + "grad_norm": 1.2759395837783813, + "learning_rate": 1.942965220789371e-05, + "loss": 0.4878, + "step": 37500 + }, + { + "epoch": 2.9363529871144083, + "grad_norm": 1.2582157850265503, + "learning_rate": 1.9428089097303636e-05, + "loss": 0.4744, + "step": 37600 + }, + { + "epoch": 2.9441624365482233, + "grad_norm": 1.045397400856018, + "learning_rate": 1.9426525986713562e-05, + "loss": 0.4836, + "step": 37700 + }, + { + "epoch": 2.9519718859820383, + "grad_norm": 1.208304524421692, + "learning_rate": 1.9424962876123488e-05, + "loss": 0.4806, + "step": 37800 + }, + { + "epoch": 2.9597813354158533, + "grad_norm": 1.1622886657714844, + "learning_rate": 1.9423399765533414e-05, + "loss": 0.4873, + "step": 37900 + }, + { + "epoch": 2.9675907848496683, + "grad_norm": 1.1261911392211914, + "learning_rate": 1.9421836654943337e-05, + "loss": 0.4802, + "step": 38000 + }, + { + "epoch": 2.975400234283483, + "grad_norm": 1.0438706874847412, + "learning_rate": 1.9420273544353266e-05, + "loss": 0.4859, + "step": 38100 + }, + { + "epoch": 2.983209683717298, + "grad_norm": 1.171201467514038, + "learning_rate": 1.9418710433763192e-05, + "loss": 0.4822, + "step": 38200 + }, + { + "epoch": 2.991019133151113, + "grad_norm": 0.9960982799530029, + "learning_rate": 1.9417147323173115e-05, + "loss": 0.4654, + "step": 38300 + }, + { + "epoch": 2.998828582584928, + "grad_norm": 1.3679864406585693, + "learning_rate": 1.9415584212583044e-05, + "loss": 0.4717, + "step": 38400 + }, + { + "epoch": 3.0066380320187425, + "grad_norm": 1.395412564277649, + "learning_rate": 1.9414021101992967e-05, + "loss": 0.4774, + "step": 38500 + }, + { + "epoch": 3.0144474814525575, + "grad_norm": 0.8553999662399292, + "learning_rate": 1.9412457991402893e-05, + "loss": 0.4585, + "step": 38600 + }, + { + "epoch": 3.0222569308863725, + "grad_norm": 1.151389718055725, + "learning_rate": 1.941089488081282e-05, + "loss": 0.4901, + "step": 38700 + }, + { + "epoch": 3.0300663803201875, + "grad_norm": 1.1624094247817993, + "learning_rate": 1.9409331770222745e-05, + "loss": 0.4848, + "step": 38800 + }, + { + "epoch": 3.0378758297540025, + "grad_norm": 1.263749599456787, + "learning_rate": 1.940778429073857e-05, + "loss": 0.4683, + "step": 38900 + }, + { + "epoch": 3.045685279187817, + "grad_norm": 1.2082384824752808, + "learning_rate": 1.9406221180148497e-05, + "loss": 0.4698, + "step": 39000 + }, + { + "epoch": 3.053494728621632, + "grad_norm": 1.1123636960983276, + "learning_rate": 1.9404658069558423e-05, + "loss": 0.4646, + "step": 39100 + }, + { + "epoch": 3.061304178055447, + "grad_norm": 1.2531005144119263, + "learning_rate": 1.940309495896835e-05, + "loss": 0.4601, + "step": 39200 + }, + { + "epoch": 3.069113627489262, + "grad_norm": 1.287048578262329, + "learning_rate": 1.9401531848378275e-05, + "loss": 0.4497, + "step": 39300 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.9623894691467285, + "learning_rate": 1.93999687377882e-05, + "loss": 0.4635, + "step": 39400 + }, + { + "epoch": 3.0847325263568917, + "grad_norm": 1.3144161701202393, + "learning_rate": 1.9398405627198127e-05, + "loss": 0.4593, + "step": 39500 + }, + { + "epoch": 3.0925419757907067, + "grad_norm": 1.362608551979065, + "learning_rate": 1.9396842516608053e-05, + "loss": 0.4626, + "step": 39600 + }, + { + "epoch": 3.1003514252245217, + "grad_norm": 1.1824091672897339, + "learning_rate": 1.939527940601798e-05, + "loss": 0.4717, + "step": 39700 + }, + { + "epoch": 3.1081608746583367, + "grad_norm": 1.4997671842575073, + "learning_rate": 1.93937162954279e-05, + "loss": 0.486, + "step": 39800 + }, + { + "epoch": 3.1159703240921517, + "grad_norm": 1.056342363357544, + "learning_rate": 1.939215318483783e-05, + "loss": 0.4927, + "step": 39900 + }, + { + "epoch": 3.1237797735259663, + "grad_norm": 1.4581704139709473, + "learning_rate": 1.9390590074247753e-05, + "loss": 0.4802, + "step": 40000 + }, + { + "epoch": 3.1315892229597813, + "grad_norm": 1.2977889776229858, + "learning_rate": 1.938902696365768e-05, + "loss": 0.4667, + "step": 40100 + }, + { + "epoch": 3.1393986723935963, + "grad_norm": 1.1020759344100952, + "learning_rate": 1.9387463853067605e-05, + "loss": 0.4625, + "step": 40200 + }, + { + "epoch": 3.1472081218274113, + "grad_norm": 1.1267211437225342, + "learning_rate": 1.938590074247753e-05, + "loss": 0.465, + "step": 40300 + }, + { + "epoch": 3.1550175712612263, + "grad_norm": 0.9519909024238586, + "learning_rate": 1.9384337631887457e-05, + "loss": 0.4798, + "step": 40400 + }, + { + "epoch": 3.162827020695041, + "grad_norm": 1.512546420097351, + "learning_rate": 1.9382774521297383e-05, + "loss": 0.4498, + "step": 40500 + }, + { + "epoch": 3.170636470128856, + "grad_norm": 1.2842134237289429, + "learning_rate": 1.938121141070731e-05, + "loss": 0.4742, + "step": 40600 + }, + { + "epoch": 3.178445919562671, + "grad_norm": 1.0086536407470703, + "learning_rate": 1.9379648300117236e-05, + "loss": 0.4602, + "step": 40700 + }, + { + "epoch": 3.186255368996486, + "grad_norm": 1.716693639755249, + "learning_rate": 1.937808518952716e-05, + "loss": 0.462, + "step": 40800 + }, + { + "epoch": 3.1940648184303004, + "grad_norm": 0.9176514744758606, + "learning_rate": 1.9376522078937084e-05, + "loss": 0.4531, + "step": 40900 + }, + { + "epoch": 3.2018742678641154, + "grad_norm": 0.9721403121948242, + "learning_rate": 1.9374958968347014e-05, + "loss": 0.4488, + "step": 41000 + }, + { + "epoch": 3.2096837172979304, + "grad_norm": 1.0320054292678833, + "learning_rate": 1.937339585775694e-05, + "loss": 0.4632, + "step": 41100 + }, + { + "epoch": 3.2174931667317455, + "grad_norm": 0.9565618634223938, + "learning_rate": 1.9371832747166862e-05, + "loss": 0.4686, + "step": 41200 + }, + { + "epoch": 3.2253026161655605, + "grad_norm": 1.0439300537109375, + "learning_rate": 1.9370269636576788e-05, + "loss": 0.4771, + "step": 41300 + }, + { + "epoch": 3.233112065599375, + "grad_norm": 1.02463960647583, + "learning_rate": 1.9368706525986714e-05, + "loss": 0.471, + "step": 41400 + }, + { + "epoch": 3.24092151503319, + "grad_norm": 1.3478844165802002, + "learning_rate": 1.936714341539664e-05, + "loss": 0.4579, + "step": 41500 + }, + { + "epoch": 3.248730964467005, + "grad_norm": 0.9918316006660461, + "learning_rate": 1.9365580304806566e-05, + "loss": 0.4668, + "step": 41600 + }, + { + "epoch": 3.25654041390082, + "grad_norm": 1.500217080116272, + "learning_rate": 1.9364017194216492e-05, + "loss": 0.4498, + "step": 41700 + }, + { + "epoch": 3.264349863334635, + "grad_norm": 1.0742114782333374, + "learning_rate": 1.9362454083626418e-05, + "loss": 0.4726, + "step": 41800 + }, + { + "epoch": 3.2721593127684496, + "grad_norm": 1.3920074701309204, + "learning_rate": 1.9360890973036344e-05, + "loss": 0.4719, + "step": 41900 + }, + { + "epoch": 3.2799687622022646, + "grad_norm": 1.1826322078704834, + "learning_rate": 1.935932786244627e-05, + "loss": 0.4584, + "step": 42000 + }, + { + "epoch": 3.2877782116360796, + "grad_norm": 1.1376311779022217, + "learning_rate": 1.9357764751856196e-05, + "loss": 0.4741, + "step": 42100 + }, + { + "epoch": 3.2955876610698946, + "grad_norm": 1.0839388370513916, + "learning_rate": 1.9356201641266122e-05, + "loss": 0.4769, + "step": 42200 + }, + { + "epoch": 3.3033971105037097, + "grad_norm": 1.0490782260894775, + "learning_rate": 1.9354638530676045e-05, + "loss": 0.4557, + "step": 42300 + }, + { + "epoch": 3.311206559937524, + "grad_norm": 1.1325418949127197, + "learning_rate": 1.935309105119187e-05, + "loss": 0.4628, + "step": 42400 + }, + { + "epoch": 3.3190160093713392, + "grad_norm": 1.1212817430496216, + "learning_rate": 1.93515279406018e-05, + "loss": 0.4486, + "step": 42500 + }, + { + "epoch": 3.3268254588051542, + "grad_norm": 1.111585259437561, + "learning_rate": 1.9349964830011726e-05, + "loss": 0.4647, + "step": 42600 + }, + { + "epoch": 3.3346349082389692, + "grad_norm": 1.1315809488296509, + "learning_rate": 1.934840171942165e-05, + "loss": 0.4648, + "step": 42700 + }, + { + "epoch": 3.3424443576727842, + "grad_norm": 0.9654698371887207, + "learning_rate": 1.9346838608831578e-05, + "loss": 0.4461, + "step": 42800 + }, + { + "epoch": 3.350253807106599, + "grad_norm": 1.0180907249450684, + "learning_rate": 1.93452754982415e-05, + "loss": 0.472, + "step": 42900 + }, + { + "epoch": 3.358063256540414, + "grad_norm": 1.1459476947784424, + "learning_rate": 1.9343712387651427e-05, + "loss": 0.4524, + "step": 43000 + }, + { + "epoch": 3.365872705974229, + "grad_norm": 1.1465004682540894, + "learning_rate": 1.9342149277061353e-05, + "loss": 0.4503, + "step": 43100 + }, + { + "epoch": 3.373682155408044, + "grad_norm": 1.0197675228118896, + "learning_rate": 1.934058616647128e-05, + "loss": 0.4585, + "step": 43200 + }, + { + "epoch": 3.381491604841859, + "grad_norm": 1.0962787866592407, + "learning_rate": 1.9339023055881205e-05, + "loss": 0.4454, + "step": 43300 + }, + { + "epoch": 3.3893010542756734, + "grad_norm": 1.0584628582000732, + "learning_rate": 1.933745994529113e-05, + "loss": 0.4589, + "step": 43400 + }, + { + "epoch": 3.3971105037094884, + "grad_norm": 1.0412591695785522, + "learning_rate": 1.9335896834701057e-05, + "loss": 0.4513, + "step": 43500 + }, + { + "epoch": 3.4049199531433034, + "grad_norm": 1.2135701179504395, + "learning_rate": 1.9334333724110983e-05, + "loss": 0.4675, + "step": 43600 + }, + { + "epoch": 3.4127294025771184, + "grad_norm": 0.9547072052955627, + "learning_rate": 1.933277061352091e-05, + "loss": 0.4435, + "step": 43700 + }, + { + "epoch": 3.4205388520109334, + "grad_norm": 1.005250096321106, + "learning_rate": 1.933120750293083e-05, + "loss": 0.4595, + "step": 43800 + }, + { + "epoch": 3.428348301444748, + "grad_norm": 1.306046724319458, + "learning_rate": 1.932964439234076e-05, + "loss": 0.4549, + "step": 43900 + }, + { + "epoch": 3.436157750878563, + "grad_norm": 0.9959258437156677, + "learning_rate": 1.9328081281750687e-05, + "loss": 0.4821, + "step": 44000 + }, + { + "epoch": 3.443967200312378, + "grad_norm": 1.0681926012039185, + "learning_rate": 1.932651817116061e-05, + "loss": 0.4334, + "step": 44100 + }, + { + "epoch": 3.451776649746193, + "grad_norm": 0.9525073766708374, + "learning_rate": 1.9324955060570536e-05, + "loss": 0.455, + "step": 44200 + }, + { + "epoch": 3.459586099180008, + "grad_norm": 1.1635737419128418, + "learning_rate": 1.9323391949980465e-05, + "loss": 0.471, + "step": 44300 + }, + { + "epoch": 3.4673955486138226, + "grad_norm": 1.1268336772918701, + "learning_rate": 1.932184447049629e-05, + "loss": 0.4546, + "step": 44400 + }, + { + "epoch": 3.4752049980476376, + "grad_norm": 1.1383503675460815, + "learning_rate": 1.9320281359906214e-05, + "loss": 0.4734, + "step": 44500 + }, + { + "epoch": 3.4830144474814526, + "grad_norm": 1.0616774559020996, + "learning_rate": 1.931871824931614e-05, + "loss": 0.4611, + "step": 44600 + }, + { + "epoch": 3.4908238969152676, + "grad_norm": 1.338844895362854, + "learning_rate": 1.9317155138726066e-05, + "loss": 0.4489, + "step": 44700 + }, + { + "epoch": 3.4986333463490826, + "grad_norm": 1.0978337526321411, + "learning_rate": 1.931559202813599e-05, + "loss": 0.4713, + "step": 44800 + }, + { + "epoch": 3.506442795782897, + "grad_norm": 0.8954633474349976, + "learning_rate": 1.9314028917545918e-05, + "loss": 0.4383, + "step": 44900 + }, + { + "epoch": 3.514252245216712, + "grad_norm": 1.0776824951171875, + "learning_rate": 1.9312465806955844e-05, + "loss": 0.4559, + "step": 45000 + }, + { + "epoch": 3.522061694650527, + "grad_norm": 0.9826775789260864, + "learning_rate": 1.931090269636577e-05, + "loss": 0.4526, + "step": 45100 + }, + { + "epoch": 3.529871144084342, + "grad_norm": 1.1166000366210938, + "learning_rate": 1.9309339585775696e-05, + "loss": 0.4463, + "step": 45200 + }, + { + "epoch": 3.5376805935181572, + "grad_norm": 1.129669189453125, + "learning_rate": 1.930777647518562e-05, + "loss": 0.4594, + "step": 45300 + }, + { + "epoch": 3.545490042951972, + "grad_norm": 1.1088656187057495, + "learning_rate": 1.9306213364595548e-05, + "loss": 0.4402, + "step": 45400 + }, + { + "epoch": 3.553299492385787, + "grad_norm": 1.5019007921218872, + "learning_rate": 1.9304650254005474e-05, + "loss": 0.4482, + "step": 45500 + }, + { + "epoch": 3.561108941819602, + "grad_norm": 1.105352520942688, + "learning_rate": 1.9303087143415396e-05, + "loss": 0.4357, + "step": 45600 + }, + { + "epoch": 3.5689183912534164, + "grad_norm": 1.3878651857376099, + "learning_rate": 1.9301524032825326e-05, + "loss": 0.4601, + "step": 45700 + }, + { + "epoch": 3.576727840687232, + "grad_norm": 1.047351360321045, + "learning_rate": 1.9299960922235252e-05, + "loss": 0.454, + "step": 45800 + }, + { + "epoch": 3.5845372901210464, + "grad_norm": 1.0843867063522339, + "learning_rate": 1.9298397811645174e-05, + "loss": 0.4197, + "step": 45900 + }, + { + "epoch": 3.5923467395548614, + "grad_norm": 1.1075481176376343, + "learning_rate": 1.92968347010551e-05, + "loss": 0.4518, + "step": 46000 + }, + { + "epoch": 3.6001561889886764, + "grad_norm": 0.9046174883842468, + "learning_rate": 1.9295271590465026e-05, + "loss": 0.4434, + "step": 46100 + }, + { + "epoch": 3.607965638422491, + "grad_norm": 0.9517740607261658, + "learning_rate": 1.9293708479874952e-05, + "loss": 0.4397, + "step": 46200 + }, + { + "epoch": 3.615775087856306, + "grad_norm": 1.0598924160003662, + "learning_rate": 1.929214536928488e-05, + "loss": 0.463, + "step": 46300 + }, + { + "epoch": 3.623584537290121, + "grad_norm": 1.3215514421463013, + "learning_rate": 1.9290582258694804e-05, + "loss": 0.4501, + "step": 46400 + }, + { + "epoch": 3.631393986723936, + "grad_norm": 1.129428744316101, + "learning_rate": 1.928901914810473e-05, + "loss": 0.4326, + "step": 46500 + }, + { + "epoch": 3.639203436157751, + "grad_norm": 1.0689501762390137, + "learning_rate": 1.9287456037514656e-05, + "loss": 0.4447, + "step": 46600 + }, + { + "epoch": 3.6470128855915656, + "grad_norm": 1.0962802171707153, + "learning_rate": 1.9285908558030482e-05, + "loss": 0.4665, + "step": 46700 + }, + { + "epoch": 3.6548223350253806, + "grad_norm": 1.0971689224243164, + "learning_rate": 1.928434544744041e-05, + "loss": 0.4526, + "step": 46800 + }, + { + "epoch": 3.6626317844591956, + "grad_norm": 1.1643929481506348, + "learning_rate": 1.9282782336850334e-05, + "loss": 0.4605, + "step": 46900 + }, + { + "epoch": 3.6704412338930106, + "grad_norm": 1.1027824878692627, + "learning_rate": 1.928121922626026e-05, + "loss": 0.4458, + "step": 47000 + }, + { + "epoch": 3.6782506833268256, + "grad_norm": 0.9542461037635803, + "learning_rate": 1.9279656115670183e-05, + "loss": 0.4488, + "step": 47100 + }, + { + "epoch": 3.68606013276064, + "grad_norm": 0.8931864500045776, + "learning_rate": 1.9278093005080112e-05, + "loss": 0.4552, + "step": 47200 + }, + { + "epoch": 3.693869582194455, + "grad_norm": 1.299230933189392, + "learning_rate": 1.927652989449004e-05, + "loss": 0.4332, + "step": 47300 + }, + { + "epoch": 3.70167903162827, + "grad_norm": 1.2159420251846313, + "learning_rate": 1.927496678389996e-05, + "loss": 0.4354, + "step": 47400 + }, + { + "epoch": 3.709488481062085, + "grad_norm": 0.9442591071128845, + "learning_rate": 1.9273403673309887e-05, + "loss": 0.4399, + "step": 47500 + }, + { + "epoch": 3.7172979304959, + "grad_norm": 1.3856900930404663, + "learning_rate": 1.9271840562719813e-05, + "loss": 0.4596, + "step": 47600 + }, + { + "epoch": 3.7251073799297147, + "grad_norm": 1.2507699728012085, + "learning_rate": 1.927027745212974e-05, + "loss": 0.4502, + "step": 47700 + }, + { + "epoch": 3.7329168293635298, + "grad_norm": 1.292219877243042, + "learning_rate": 1.9268714341539665e-05, + "loss": 0.4494, + "step": 47800 + }, + { + "epoch": 3.7407262787973448, + "grad_norm": 1.3267557621002197, + "learning_rate": 1.926715123094959e-05, + "loss": 0.4407, + "step": 47900 + }, + { + "epoch": 3.7485357282311598, + "grad_norm": 0.9994024634361267, + "learning_rate": 1.9265588120359517e-05, + "loss": 0.4449, + "step": 48000 + }, + { + "epoch": 3.7563451776649748, + "grad_norm": 1.1877665519714355, + "learning_rate": 1.9264025009769443e-05, + "loss": 0.4261, + "step": 48100 + }, + { + "epoch": 3.7641546270987893, + "grad_norm": 0.9004372954368591, + "learning_rate": 1.926246189917937e-05, + "loss": 0.4261, + "step": 48200 + }, + { + "epoch": 3.7719640765326043, + "grad_norm": 1.0032011270523071, + "learning_rate": 1.9260898788589295e-05, + "loss": 0.4336, + "step": 48300 + }, + { + "epoch": 3.7797735259664194, + "grad_norm": 1.331635594367981, + "learning_rate": 1.925933567799922e-05, + "loss": 0.4563, + "step": 48400 + }, + { + "epoch": 3.7875829754002344, + "grad_norm": 1.1291660070419312, + "learning_rate": 1.9257772567409144e-05, + "loss": 0.4512, + "step": 48500 + }, + { + "epoch": 3.7953924248340494, + "grad_norm": 1.0733696222305298, + "learning_rate": 1.925620945681907e-05, + "loss": 0.4293, + "step": 48600 + }, + { + "epoch": 3.803201874267864, + "grad_norm": 0.9246060252189636, + "learning_rate": 1.9254646346229e-05, + "loss": 0.443, + "step": 48700 + }, + { + "epoch": 3.811011323701679, + "grad_norm": 1.1063892841339111, + "learning_rate": 1.9253083235638922e-05, + "loss": 0.4324, + "step": 48800 + }, + { + "epoch": 3.818820773135494, + "grad_norm": 0.9335746765136719, + "learning_rate": 1.9251520125048848e-05, + "loss": 0.4299, + "step": 48900 + }, + { + "epoch": 3.826630222569309, + "grad_norm": 1.143466591835022, + "learning_rate": 1.9249957014458774e-05, + "loss": 0.443, + "step": 49000 + }, + { + "epoch": 3.834439672003124, + "grad_norm": 1.0343248844146729, + "learning_rate": 1.92484095349746e-05, + "loss": 0.4352, + "step": 49100 + }, + { + "epoch": 3.8422491214369385, + "grad_norm": 1.3340160846710205, + "learning_rate": 1.9246846424384526e-05, + "loss": 0.4252, + "step": 49200 + }, + { + "epoch": 3.8500585708707535, + "grad_norm": 1.1612764596939087, + "learning_rate": 1.9245283313794452e-05, + "loss": 0.447, + "step": 49300 + }, + { + "epoch": 3.8578680203045685, + "grad_norm": 1.130889654159546, + "learning_rate": 1.9243720203204378e-05, + "loss": 0.4377, + "step": 49400 + }, + { + "epoch": 3.8656774697383836, + "grad_norm": 1.0333083868026733, + "learning_rate": 1.9242157092614304e-05, + "loss": 0.4557, + "step": 49500 + }, + { + "epoch": 3.8734869191721986, + "grad_norm": 1.0891958475112915, + "learning_rate": 1.924059398202423e-05, + "loss": 0.4402, + "step": 49600 + }, + { + "epoch": 3.881296368606013, + "grad_norm": 1.0473707914352417, + "learning_rate": 1.9239030871434156e-05, + "loss": 0.4289, + "step": 49700 + }, + { + "epoch": 3.889105818039828, + "grad_norm": 0.858305037021637, + "learning_rate": 1.9237467760844082e-05, + "loss": 0.432, + "step": 49800 + }, + { + "epoch": 3.896915267473643, + "grad_norm": 1.3434786796569824, + "learning_rate": 1.9235904650254008e-05, + "loss": 0.4346, + "step": 49900 + }, + { + "epoch": 3.904724716907458, + "grad_norm": 0.7991245985031128, + "learning_rate": 1.923434153966393e-05, + "loss": 0.426, + "step": 50000 + }, + { + "epoch": 3.912534166341273, + "grad_norm": 1.0419330596923828, + "learning_rate": 1.923277842907386e-05, + "loss": 0.4337, + "step": 50100 + }, + { + "epoch": 3.9203436157750877, + "grad_norm": 1.0657131671905518, + "learning_rate": 1.9231215318483786e-05, + "loss": 0.4148, + "step": 50200 + }, + { + "epoch": 3.9281530652089027, + "grad_norm": 1.0321459770202637, + "learning_rate": 1.922965220789371e-05, + "loss": 0.4394, + "step": 50300 + }, + { + "epoch": 3.9359625146427177, + "grad_norm": 0.9495915174484253, + "learning_rate": 1.9228089097303635e-05, + "loss": 0.4381, + "step": 50400 + }, + { + "epoch": 3.9437719640765327, + "grad_norm": 1.3790180683135986, + "learning_rate": 1.9226525986713564e-05, + "loss": 0.4385, + "step": 50500 + }, + { + "epoch": 3.9515814135103478, + "grad_norm": 0.9733704328536987, + "learning_rate": 1.9224962876123487e-05, + "loss": 0.4173, + "step": 50600 + }, + { + "epoch": 3.9593908629441623, + "grad_norm": 0.8968336582183838, + "learning_rate": 1.9223399765533413e-05, + "loss": 0.435, + "step": 50700 + }, + { + "epoch": 3.9672003123779773, + "grad_norm": 1.1181347370147705, + "learning_rate": 1.922183665494334e-05, + "loss": 0.4388, + "step": 50800 + }, + { + "epoch": 3.9750097618117923, + "grad_norm": 1.0089054107666016, + "learning_rate": 1.9220273544353265e-05, + "loss": 0.414, + "step": 50900 + }, + { + "epoch": 3.9828192112456073, + "grad_norm": 0.9809916019439697, + "learning_rate": 1.921871043376319e-05, + "loss": 0.429, + "step": 51000 + }, + { + "epoch": 3.9906286606794223, + "grad_norm": 0.9226115942001343, + "learning_rate": 1.9217147323173117e-05, + "loss": 0.4262, + "step": 51100 + }, + { + "epoch": 3.998438110113237, + "grad_norm": 1.1190813779830933, + "learning_rate": 1.9215584212583043e-05, + "loss": 0.4287, + "step": 51200 + }, + { + "epoch": 4.006247559547052, + "grad_norm": 1.146544098854065, + "learning_rate": 1.921402110199297e-05, + "loss": 0.4334, + "step": 51300 + }, + { + "epoch": 4.014057008980867, + "grad_norm": 1.093766450881958, + "learning_rate": 1.9212457991402895e-05, + "loss": 0.4355, + "step": 51400 + }, + { + "epoch": 4.0218664584146815, + "grad_norm": 0.8110470771789551, + "learning_rate": 1.9210894880812817e-05, + "loss": 0.4542, + "step": 51500 + }, + { + "epoch": 4.029675907848497, + "grad_norm": 1.1870598793029785, + "learning_rate": 1.9209331770222747e-05, + "loss": 0.4247, + "step": 51600 + }, + { + "epoch": 4.0374853572823115, + "grad_norm": 1.3483214378356934, + "learning_rate": 1.920776865963267e-05, + "loss": 0.4188, + "step": 51700 + }, + { + "epoch": 4.045294806716127, + "grad_norm": 1.1394708156585693, + "learning_rate": 1.9206205549042595e-05, + "loss": 0.4341, + "step": 51800 + }, + { + "epoch": 4.0531042561499415, + "grad_norm": 1.03669273853302, + "learning_rate": 1.9204642438452525e-05, + "loss": 0.4325, + "step": 51900 + }, + { + "epoch": 4.060913705583756, + "grad_norm": 0.9215898513793945, + "learning_rate": 1.9203079327862447e-05, + "loss": 0.4259, + "step": 52000 + }, + { + "epoch": 4.0687231550175715, + "grad_norm": 0.8863016963005066, + "learning_rate": 1.9201516217272373e-05, + "loss": 0.4215, + "step": 52100 + }, + { + "epoch": 4.076532604451386, + "grad_norm": 0.865557074546814, + "learning_rate": 1.91999531066823e-05, + "loss": 0.4338, + "step": 52200 + }, + { + "epoch": 4.084342053885201, + "grad_norm": 1.240858793258667, + "learning_rate": 1.9198389996092225e-05, + "loss": 0.4294, + "step": 52300 + }, + { + "epoch": 4.092151503319016, + "grad_norm": 0.9957290887832642, + "learning_rate": 1.919682688550215e-05, + "loss": 0.4239, + "step": 52400 + }, + { + "epoch": 4.099960952752831, + "grad_norm": 1.122232437133789, + "learning_rate": 1.9195263774912077e-05, + "loss": 0.4312, + "step": 52500 + }, + { + "epoch": 4.107770402186646, + "grad_norm": 0.9833566546440125, + "learning_rate": 1.9193700664322e-05, + "loss": 0.4427, + "step": 52600 + }, + { + "epoch": 4.115579851620461, + "grad_norm": 1.0275732278823853, + "learning_rate": 1.919213755373193e-05, + "loss": 0.4188, + "step": 52700 + }, + { + "epoch": 4.123389301054275, + "grad_norm": 1.186841368675232, + "learning_rate": 1.9190574443141855e-05, + "loss": 0.423, + "step": 52800 + }, + { + "epoch": 4.131198750488091, + "grad_norm": 1.288432240486145, + "learning_rate": 1.9189011332551778e-05, + "loss": 0.4483, + "step": 52900 + }, + { + "epoch": 4.139008199921905, + "grad_norm": 1.2151869535446167, + "learning_rate": 1.9187448221961707e-05, + "loss": 0.4206, + "step": 53000 + }, + { + "epoch": 4.146817649355721, + "grad_norm": 1.2180672883987427, + "learning_rate": 1.9185900742477533e-05, + "loss": 0.4006, + "step": 53100 + }, + { + "epoch": 4.154627098789535, + "grad_norm": 0.9600439071655273, + "learning_rate": 1.9184337631887456e-05, + "loss": 0.4299, + "step": 53200 + }, + { + "epoch": 4.16243654822335, + "grad_norm": 1.0519211292266846, + "learning_rate": 1.9182774521297382e-05, + "loss": 0.4221, + "step": 53300 + }, + { + "epoch": 4.170245997657165, + "grad_norm": 0.9762826561927795, + "learning_rate": 1.918121141070731e-05, + "loss": 0.4052, + "step": 53400 + }, + { + "epoch": 4.17805544709098, + "grad_norm": 0.9231967329978943, + "learning_rate": 1.9179648300117234e-05, + "loss": 0.4403, + "step": 53500 + }, + { + "epoch": 4.185864896524795, + "grad_norm": 0.8770660161972046, + "learning_rate": 1.917808518952716e-05, + "loss": 0.45, + "step": 53600 + }, + { + "epoch": 4.19367434595861, + "grad_norm": 1.1238151788711548, + "learning_rate": 1.9176522078937086e-05, + "loss": 0.414, + "step": 53700 + }, + { + "epoch": 4.2014837953924244, + "grad_norm": 0.861791729927063, + "learning_rate": 1.9174958968347012e-05, + "loss": 0.4102, + "step": 53800 + }, + { + "epoch": 4.20929324482624, + "grad_norm": 0.9705322980880737, + "learning_rate": 1.9173395857756938e-05, + "loss": 0.4106, + "step": 53900 + }, + { + "epoch": 4.2171026942600545, + "grad_norm": 1.0542993545532227, + "learning_rate": 1.9171832747166864e-05, + "loss": 0.4256, + "step": 54000 + }, + { + "epoch": 4.22491214369387, + "grad_norm": 1.1293755769729614, + "learning_rate": 1.917026963657679e-05, + "loss": 0.4192, + "step": 54100 + }, + { + "epoch": 4.2327215931276845, + "grad_norm": 0.7894850969314575, + "learning_rate": 1.9168706525986716e-05, + "loss": 0.4187, + "step": 54200 + }, + { + "epoch": 4.240531042561499, + "grad_norm": 1.1279304027557373, + "learning_rate": 1.9167143415396642e-05, + "loss": 0.4217, + "step": 54300 + }, + { + "epoch": 4.2483404919953145, + "grad_norm": 1.1187465190887451, + "learning_rate": 1.9165580304806565e-05, + "loss": 0.4197, + "step": 54400 + }, + { + "epoch": 4.256149941429129, + "grad_norm": 1.210397720336914, + "learning_rate": 1.9164017194216494e-05, + "loss": 0.416, + "step": 54500 + }, + { + "epoch": 4.2639593908629445, + "grad_norm": 1.1013455390930176, + "learning_rate": 1.9162454083626417e-05, + "loss": 0.4312, + "step": 54600 + }, + { + "epoch": 4.271768840296759, + "grad_norm": 1.0917813777923584, + "learning_rate": 1.9160890973036343e-05, + "loss": 0.4348, + "step": 54700 + }, + { + "epoch": 4.279578289730574, + "grad_norm": 0.9799680113792419, + "learning_rate": 1.915932786244627e-05, + "loss": 0.4237, + "step": 54800 + }, + { + "epoch": 4.287387739164389, + "grad_norm": 0.9628735780715942, + "learning_rate": 1.9157780382962098e-05, + "loss": 0.4282, + "step": 54900 + }, + { + "epoch": 4.295197188598204, + "grad_norm": 0.9904158711433411, + "learning_rate": 1.915621727237202e-05, + "loss": 0.4273, + "step": 55000 + }, + { + "epoch": 4.303006638032019, + "grad_norm": 0.8235137462615967, + "learning_rate": 1.9154654161781947e-05, + "loss": 0.4283, + "step": 55100 + }, + { + "epoch": 4.310816087465834, + "grad_norm": 1.1564571857452393, + "learning_rate": 1.9153091051191873e-05, + "loss": 0.4078, + "step": 55200 + }, + { + "epoch": 4.318625536899648, + "grad_norm": 1.199800729751587, + "learning_rate": 1.91515279406018e-05, + "loss": 0.4033, + "step": 55300 + }, + { + "epoch": 4.326434986333464, + "grad_norm": 1.005346417427063, + "learning_rate": 1.9149964830011725e-05, + "loss": 0.4121, + "step": 55400 + }, + { + "epoch": 4.334244435767278, + "grad_norm": 0.8400962948799133, + "learning_rate": 1.914840171942165e-05, + "loss": 0.4183, + "step": 55500 + }, + { + "epoch": 4.342053885201094, + "grad_norm": 1.3714483976364136, + "learning_rate": 1.9146838608831577e-05, + "loss": 0.4228, + "step": 55600 + }, + { + "epoch": 4.349863334634908, + "grad_norm": 1.2525608539581299, + "learning_rate": 1.9145275498241503e-05, + "loss": 0.4165, + "step": 55700 + }, + { + "epoch": 4.357672784068723, + "grad_norm": 1.116113305091858, + "learning_rate": 1.914371238765143e-05, + "loss": 0.4192, + "step": 55800 + }, + { + "epoch": 4.365482233502538, + "grad_norm": 1.3345171213150024, + "learning_rate": 1.914214927706135e-05, + "loss": 0.4127, + "step": 55900 + }, + { + "epoch": 4.373291682936353, + "grad_norm": 0.7660952210426331, + "learning_rate": 1.914058616647128e-05, + "loss": 0.4297, + "step": 56000 + }, + { + "epoch": 4.381101132370168, + "grad_norm": 0.9481973648071289, + "learning_rate": 1.9139023055881203e-05, + "loss": 0.4216, + "step": 56100 + }, + { + "epoch": 4.388910581803983, + "grad_norm": 0.9404019117355347, + "learning_rate": 1.913745994529113e-05, + "loss": 0.4238, + "step": 56200 + }, + { + "epoch": 4.396720031237797, + "grad_norm": 0.907920241355896, + "learning_rate": 1.913589683470106e-05, + "loss": 0.3997, + "step": 56300 + }, + { + "epoch": 4.404529480671613, + "grad_norm": 0.9752848744392395, + "learning_rate": 1.913433372411098e-05, + "loss": 0.4146, + "step": 56400 + }, + { + "epoch": 4.412338930105427, + "grad_norm": 1.210908055305481, + "learning_rate": 1.9132770613520908e-05, + "loss": 0.426, + "step": 56500 + }, + { + "epoch": 4.420148379539243, + "grad_norm": 0.8827472925186157, + "learning_rate": 1.9131207502930834e-05, + "loss": 0.4177, + "step": 56600 + }, + { + "epoch": 4.4279578289730575, + "grad_norm": 1.1287732124328613, + "learning_rate": 1.912964439234076e-05, + "loss": 0.4143, + "step": 56700 + }, + { + "epoch": 4.435767278406872, + "grad_norm": 1.011299729347229, + "learning_rate": 1.9128081281750686e-05, + "loss": 0.4078, + "step": 56800 + }, + { + "epoch": 4.4435767278406875, + "grad_norm": 1.2453038692474365, + "learning_rate": 1.912651817116061e-05, + "loss": 0.4167, + "step": 56900 + }, + { + "epoch": 4.451386177274502, + "grad_norm": 0.992863118648529, + "learning_rate": 1.9124955060570534e-05, + "loss": 0.4245, + "step": 57000 + }, + { + "epoch": 4.459195626708317, + "grad_norm": 1.1472619771957397, + "learning_rate": 1.9123391949980464e-05, + "loss": 0.4201, + "step": 57100 + }, + { + "epoch": 4.467005076142132, + "grad_norm": 1.3278522491455078, + "learning_rate": 1.912182883939039e-05, + "loss": 0.4184, + "step": 57200 + }, + { + "epoch": 4.474814525575947, + "grad_norm": 1.2207483053207397, + "learning_rate": 1.9120265728800312e-05, + "loss": 0.4184, + "step": 57300 + }, + { + "epoch": 4.482623975009762, + "grad_norm": 1.0354204177856445, + "learning_rate": 1.911870261821024e-05, + "loss": 0.4112, + "step": 57400 + }, + { + "epoch": 4.490433424443577, + "grad_norm": 0.8041611909866333, + "learning_rate": 1.9117139507620168e-05, + "loss": 0.4127, + "step": 57500 + }, + { + "epoch": 4.498242873877391, + "grad_norm": 0.8567083477973938, + "learning_rate": 1.911557639703009e-05, + "loss": 0.414, + "step": 57600 + }, + { + "epoch": 4.506052323311207, + "grad_norm": 1.2674700021743774, + "learning_rate": 1.9114013286440016e-05, + "loss": 0.4051, + "step": 57700 + }, + { + "epoch": 4.513861772745021, + "grad_norm": 1.1298909187316895, + "learning_rate": 1.9112450175849942e-05, + "loss": 0.4047, + "step": 57800 + }, + { + "epoch": 4.521671222178837, + "grad_norm": 0.9413766264915466, + "learning_rate": 1.9110887065259868e-05, + "loss": 0.4156, + "step": 57900 + }, + { + "epoch": 4.529480671612651, + "grad_norm": 1.1707350015640259, + "learning_rate": 1.9109323954669794e-05, + "loss": 0.4269, + "step": 58000 + }, + { + "epoch": 4.537290121046466, + "grad_norm": 0.9936490654945374, + "learning_rate": 1.910777647518562e-05, + "loss": 0.416, + "step": 58100 + }, + { + "epoch": 4.545099570480281, + "grad_norm": 1.0722568035125732, + "learning_rate": 1.9106213364595546e-05, + "loss": 0.3986, + "step": 58200 + }, + { + "epoch": 4.552909019914096, + "grad_norm": 1.2025909423828125, + "learning_rate": 1.9104650254005472e-05, + "loss": 0.4097, + "step": 58300 + }, + { + "epoch": 4.560718469347911, + "grad_norm": 0.8958162069320679, + "learning_rate": 1.9103087143415398e-05, + "loss": 0.4047, + "step": 58400 + }, + { + "epoch": 4.568527918781726, + "grad_norm": 0.9446201324462891, + "learning_rate": 1.9101524032825324e-05, + "loss": 0.4083, + "step": 58500 + }, + { + "epoch": 4.57633736821554, + "grad_norm": 1.0519663095474243, + "learning_rate": 1.909996092223525e-05, + "loss": 0.4263, + "step": 58600 + }, + { + "epoch": 4.584146817649356, + "grad_norm": 0.8739796876907349, + "learning_rate": 1.9098397811645176e-05, + "loss": 0.4055, + "step": 58700 + }, + { + "epoch": 4.59195626708317, + "grad_norm": 0.9819687604904175, + "learning_rate": 1.90968347010551e-05, + "loss": 0.4025, + "step": 58800 + }, + { + "epoch": 4.599765716516986, + "grad_norm": 1.321071743965149, + "learning_rate": 1.9095271590465028e-05, + "loss": 0.413, + "step": 58900 + }, + { + "epoch": 4.6075751659508, + "grad_norm": 0.8105387091636658, + "learning_rate": 1.9093708479874954e-05, + "loss": 0.3976, + "step": 59000 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 1.2726750373840332, + "learning_rate": 1.9092145369284877e-05, + "loss": 0.4165, + "step": 59100 + }, + { + "epoch": 4.62319406481843, + "grad_norm": 1.0112985372543335, + "learning_rate": 1.9090582258694806e-05, + "loss": 0.3968, + "step": 59200 + }, + { + "epoch": 4.631003514252245, + "grad_norm": 0.9318651556968689, + "learning_rate": 1.908901914810473e-05, + "loss": 0.398, + "step": 59300 + }, + { + "epoch": 4.63881296368606, + "grad_norm": 1.057499647140503, + "learning_rate": 1.9087456037514655e-05, + "loss": 0.4177, + "step": 59400 + }, + { + "epoch": 4.646622413119875, + "grad_norm": 1.0527663230895996, + "learning_rate": 1.908589292692458e-05, + "loss": 0.4014, + "step": 59500 + }, + { + "epoch": 4.65443186255369, + "grad_norm": 1.1206157207489014, + "learning_rate": 1.9084329816334507e-05, + "loss": 0.4022, + "step": 59600 + }, + { + "epoch": 4.662241311987505, + "grad_norm": 0.9441333413124084, + "learning_rate": 1.9082766705744433e-05, + "loss": 0.4002, + "step": 59700 + }, + { + "epoch": 4.67005076142132, + "grad_norm": 1.1934523582458496, + "learning_rate": 1.908120359515436e-05, + "loss": 0.4106, + "step": 59800 + }, + { + "epoch": 4.677860210855135, + "grad_norm": 1.1480247974395752, + "learning_rate": 1.9079640484564285e-05, + "loss": 0.4116, + "step": 59900 + }, + { + "epoch": 4.68566966028895, + "grad_norm": 0.8538499474525452, + "learning_rate": 1.907807737397421e-05, + "loss": 0.3956, + "step": 60000 + }, + { + "epoch": 4.693479109722764, + "grad_norm": 0.9278829097747803, + "learning_rate": 1.9076514263384137e-05, + "loss": 0.4261, + "step": 60100 + }, + { + "epoch": 4.70128855915658, + "grad_norm": 1.076007604598999, + "learning_rate": 1.907495115279406e-05, + "loss": 0.4061, + "step": 60200 + }, + { + "epoch": 4.709098008590394, + "grad_norm": 1.2330677509307861, + "learning_rate": 1.907338804220399e-05, + "loss": 0.3983, + "step": 60300 + }, + { + "epoch": 4.71690745802421, + "grad_norm": 1.0433940887451172, + "learning_rate": 1.9071824931613915e-05, + "loss": 0.4078, + "step": 60400 + }, + { + "epoch": 4.724716907458024, + "grad_norm": 1.1503841876983643, + "learning_rate": 1.9070261821023838e-05, + "loss": 0.3977, + "step": 60500 + }, + { + "epoch": 4.732526356891839, + "grad_norm": 0.9216443300247192, + "learning_rate": 1.9068698710433764e-05, + "loss": 0.4013, + "step": 60600 + }, + { + "epoch": 4.740335806325654, + "grad_norm": 0.9420239329338074, + "learning_rate": 1.906713559984369e-05, + "loss": 0.4145, + "step": 60700 + }, + { + "epoch": 4.748145255759469, + "grad_norm": 0.8641262650489807, + "learning_rate": 1.9065572489253616e-05, + "loss": 0.4111, + "step": 60800 + }, + { + "epoch": 4.755954705193284, + "grad_norm": 1.179482340812683, + "learning_rate": 1.9064009378663542e-05, + "loss": 0.4103, + "step": 60900 + }, + { + "epoch": 4.763764154627099, + "grad_norm": 1.0619276762008667, + "learning_rate": 1.9062446268073468e-05, + "loss": 0.4037, + "step": 61000 + }, + { + "epoch": 4.771573604060913, + "grad_norm": 1.2168949842453003, + "learning_rate": 1.9060898788589294e-05, + "loss": 0.4115, + "step": 61100 + }, + { + "epoch": 4.779383053494729, + "grad_norm": 1.133819580078125, + "learning_rate": 1.905933567799922e-05, + "loss": 0.4129, + "step": 61200 + }, + { + "epoch": 4.787192502928543, + "grad_norm": 1.0034329891204834, + "learning_rate": 1.9057772567409146e-05, + "loss": 0.3981, + "step": 61300 + }, + { + "epoch": 4.795001952362359, + "grad_norm": 1.039372205734253, + "learning_rate": 1.9056209456819072e-05, + "loss": 0.4006, + "step": 61400 + }, + { + "epoch": 4.802811401796173, + "grad_norm": 1.1082069873809814, + "learning_rate": 1.9054646346228998e-05, + "loss": 0.4117, + "step": 61500 + }, + { + "epoch": 4.810620851229988, + "grad_norm": 0.8162183165550232, + "learning_rate": 1.9053083235638924e-05, + "loss": 0.3847, + "step": 61600 + }, + { + "epoch": 4.818430300663803, + "grad_norm": 1.176859736442566, + "learning_rate": 1.9051520125048846e-05, + "loss": 0.3877, + "step": 61700 + }, + { + "epoch": 4.826239750097618, + "grad_norm": 1.084212303161621, + "learning_rate": 1.9049957014458776e-05, + "loss": 0.4127, + "step": 61800 + }, + { + "epoch": 4.834049199531433, + "grad_norm": 1.1191595792770386, + "learning_rate": 1.9048393903868702e-05, + "loss": 0.4004, + "step": 61900 + }, + { + "epoch": 4.841858648965248, + "grad_norm": 0.9039832949638367, + "learning_rate": 1.9046830793278624e-05, + "loss": 0.4057, + "step": 62000 + }, + { + "epoch": 4.8496680983990625, + "grad_norm": 0.9490616917610168, + "learning_rate": 1.904526768268855e-05, + "loss": 0.4005, + "step": 62100 + }, + { + "epoch": 4.857477547832878, + "grad_norm": 1.0080968141555786, + "learning_rate": 1.9043704572098476e-05, + "loss": 0.404, + "step": 62200 + }, + { + "epoch": 4.865286997266693, + "grad_norm": 0.9084405303001404, + "learning_rate": 1.9042141461508402e-05, + "loss": 0.3969, + "step": 62300 + }, + { + "epoch": 4.873096446700508, + "grad_norm": 1.123801589012146, + "learning_rate": 1.904057835091833e-05, + "loss": 0.3956, + "step": 62400 + }, + { + "epoch": 4.880905896134323, + "grad_norm": 1.3912336826324463, + "learning_rate": 1.9039015240328254e-05, + "loss": 0.4006, + "step": 62500 + }, + { + "epoch": 4.888715345568137, + "grad_norm": 1.3924329280853271, + "learning_rate": 1.903745212973818e-05, + "loss": 0.3966, + "step": 62600 + }, + { + "epoch": 4.896524795001953, + "grad_norm": 1.1651110649108887, + "learning_rate": 1.9035889019148106e-05, + "loss": 0.3903, + "step": 62700 + }, + { + "epoch": 4.904334244435767, + "grad_norm": 0.9102842807769775, + "learning_rate": 1.9034325908558032e-05, + "loss": 0.3986, + "step": 62800 + }, + { + "epoch": 4.912143693869583, + "grad_norm": 1.1338361501693726, + "learning_rate": 1.903276279796796e-05, + "loss": 0.3836, + "step": 62900 + }, + { + "epoch": 4.919953143303397, + "grad_norm": 0.7827601432800293, + "learning_rate": 1.9031215318483784e-05, + "loss": 0.3988, + "step": 63000 + }, + { + "epoch": 4.927762592737212, + "grad_norm": 1.0492647886276245, + "learning_rate": 1.902965220789371e-05, + "loss": 0.397, + "step": 63100 + }, + { + "epoch": 4.935572042171027, + "grad_norm": 1.2427724599838257, + "learning_rate": 1.9028089097303633e-05, + "loss": 0.3907, + "step": 63200 + }, + { + "epoch": 4.943381491604842, + "grad_norm": 0.8890752792358398, + "learning_rate": 1.9026525986713562e-05, + "loss": 0.4093, + "step": 63300 + }, + { + "epoch": 4.951190941038657, + "grad_norm": 1.0128096342086792, + "learning_rate": 1.902496287612349e-05, + "loss": 0.4066, + "step": 63400 + }, + { + "epoch": 4.959000390472472, + "grad_norm": 1.2015756368637085, + "learning_rate": 1.902339976553341e-05, + "loss": 0.3898, + "step": 63500 + }, + { + "epoch": 4.966809839906286, + "grad_norm": 1.039713740348816, + "learning_rate": 1.902183665494334e-05, + "loss": 0.3945, + "step": 63600 + }, + { + "epoch": 4.974619289340102, + "grad_norm": 0.9324501156806946, + "learning_rate": 1.9020273544353267e-05, + "loss": 0.3965, + "step": 63700 + }, + { + "epoch": 4.982428738773916, + "grad_norm": 0.9096100330352783, + "learning_rate": 1.901871043376319e-05, + "loss": 0.3995, + "step": 63800 + }, + { + "epoch": 4.990238188207732, + "grad_norm": 1.202704906463623, + "learning_rate": 1.9017147323173115e-05, + "loss": 0.3879, + "step": 63900 + }, + { + "epoch": 4.998047637641546, + "grad_norm": 1.168053150177002, + "learning_rate": 1.901558421258304e-05, + "loss": 0.4114, + "step": 64000 + }, + { + "epoch": 5.005857087075361, + "grad_norm": 1.0179202556610107, + "learning_rate": 1.9014021101992967e-05, + "loss": 0.3913, + "step": 64100 + }, + { + "epoch": 5.013666536509176, + "grad_norm": 0.8013337850570679, + "learning_rate": 1.9012457991402893e-05, + "loss": 0.3921, + "step": 64200 + }, + { + "epoch": 5.021475985942991, + "grad_norm": 0.8554266095161438, + "learning_rate": 1.901089488081282e-05, + "loss": 0.3974, + "step": 64300 + }, + { + "epoch": 5.0292854353768055, + "grad_norm": 1.1443554162979126, + "learning_rate": 1.9009331770222745e-05, + "loss": 0.393, + "step": 64400 + }, + { + "epoch": 5.037094884810621, + "grad_norm": 0.8980423212051392, + "learning_rate": 1.900776865963267e-05, + "loss": 0.3925, + "step": 64500 + }, + { + "epoch": 5.0449043342444355, + "grad_norm": 1.084808111190796, + "learning_rate": 1.9006205549042597e-05, + "loss": 0.381, + "step": 64600 + }, + { + "epoch": 5.052713783678251, + "grad_norm": 1.0049347877502441, + "learning_rate": 1.9004642438452523e-05, + "loss": 0.3963, + "step": 64700 + }, + { + "epoch": 5.0605232331120655, + "grad_norm": 1.215223789215088, + "learning_rate": 1.900307932786245e-05, + "loss": 0.3895, + "step": 64800 + }, + { + "epoch": 5.06833268254588, + "grad_norm": 1.2979555130004883, + "learning_rate": 1.9001516217272372e-05, + "loss": 0.3983, + "step": 64900 + }, + { + "epoch": 5.0761421319796955, + "grad_norm": 1.1935662031173706, + "learning_rate": 1.8999953106682298e-05, + "loss": 0.3923, + "step": 65000 + }, + { + "epoch": 5.08395158141351, + "grad_norm": 0.8865647912025452, + "learning_rate": 1.8998389996092227e-05, + "loss": 0.3984, + "step": 65100 + }, + { + "epoch": 5.091761030847326, + "grad_norm": 1.161111831665039, + "learning_rate": 1.899682688550215e-05, + "loss": 0.3947, + "step": 65200 + }, + { + "epoch": 5.09957048028114, + "grad_norm": 1.0658845901489258, + "learning_rate": 1.8995263774912076e-05, + "loss": 0.3911, + "step": 65300 + }, + { + "epoch": 5.107379929714955, + "grad_norm": 0.9132626056671143, + "learning_rate": 1.8993700664322002e-05, + "loss": 0.3786, + "step": 65400 + }, + { + "epoch": 5.11518937914877, + "grad_norm": 1.2132585048675537, + "learning_rate": 1.8992137553731928e-05, + "loss": 0.4043, + "step": 65500 + }, + { + "epoch": 5.122998828582585, + "grad_norm": 1.0975048542022705, + "learning_rate": 1.8990574443141854e-05, + "loss": 0.3757, + "step": 65600 + }, + { + "epoch": 5.1308082780164, + "grad_norm": 1.0072981119155884, + "learning_rate": 1.898901133255178e-05, + "loss": 0.393, + "step": 65700 + }, + { + "epoch": 5.138617727450215, + "grad_norm": 1.1201173067092896, + "learning_rate": 1.8987448221961706e-05, + "loss": 0.3813, + "step": 65800 + }, + { + "epoch": 5.146427176884029, + "grad_norm": 0.9941774010658264, + "learning_rate": 1.8985885111371632e-05, + "loss": 0.4047, + "step": 65900 + }, + { + "epoch": 5.154236626317845, + "grad_norm": 1.2513539791107178, + "learning_rate": 1.8984322000781558e-05, + "loss": 0.409, + "step": 66000 + }, + { + "epoch": 5.162046075751659, + "grad_norm": 1.330694556236267, + "learning_rate": 1.898275889019148e-05, + "loss": 0.3809, + "step": 66100 + }, + { + "epoch": 5.169855525185475, + "grad_norm": 0.9888590574264526, + "learning_rate": 1.898119577960141e-05, + "loss": 0.3902, + "step": 66200 + }, + { + "epoch": 5.177664974619289, + "grad_norm": 1.3481968641281128, + "learning_rate": 1.8979632669011333e-05, + "loss": 0.3861, + "step": 66300 + }, + { + "epoch": 5.185474424053104, + "grad_norm": 0.9936544299125671, + "learning_rate": 1.897806955842126e-05, + "loss": 0.3852, + "step": 66400 + }, + { + "epoch": 5.193283873486919, + "grad_norm": 1.215280294418335, + "learning_rate": 1.8976506447831188e-05, + "loss": 0.392, + "step": 66500 + }, + { + "epoch": 5.201093322920734, + "grad_norm": 0.976831316947937, + "learning_rate": 1.897494333724111e-05, + "loss": 0.3908, + "step": 66600 + }, + { + "epoch": 5.208902772354549, + "grad_norm": 1.195337176322937, + "learning_rate": 1.8973380226651037e-05, + "loss": 0.377, + "step": 66700 + }, + { + "epoch": 5.216712221788364, + "grad_norm": 0.9364585876464844, + "learning_rate": 1.8971817116060963e-05, + "loss": 0.3947, + "step": 66800 + }, + { + "epoch": 5.2245216712221785, + "grad_norm": 0.8873243927955627, + "learning_rate": 1.897025400547089e-05, + "loss": 0.3978, + "step": 66900 + }, + { + "epoch": 5.232331120655994, + "grad_norm": 0.9752253293991089, + "learning_rate": 1.8968690894880815e-05, + "loss": 0.3854, + "step": 67000 + }, + { + "epoch": 5.2401405700898085, + "grad_norm": 0.9990689754486084, + "learning_rate": 1.896712778429074e-05, + "loss": 0.381, + "step": 67100 + }, + { + "epoch": 5.247950019523624, + "grad_norm": 1.0154149532318115, + "learning_rate": 1.8965564673700663e-05, + "loss": 0.3824, + "step": 67200 + }, + { + "epoch": 5.2557594689574385, + "grad_norm": 1.0053505897521973, + "learning_rate": 1.8964001563110593e-05, + "loss": 0.389, + "step": 67300 + }, + { + "epoch": 5.263568918391253, + "grad_norm": 1.27951979637146, + "learning_rate": 1.896243845252052e-05, + "loss": 0.3934, + "step": 67400 + }, + { + "epoch": 5.2713783678250685, + "grad_norm": 1.0939563512802124, + "learning_rate": 1.896087534193044e-05, + "loss": 0.3824, + "step": 67500 + }, + { + "epoch": 5.279187817258883, + "grad_norm": 1.247258186340332, + "learning_rate": 1.895931223134037e-05, + "loss": 0.3741, + "step": 67600 + }, + { + "epoch": 5.2869972666926985, + "grad_norm": 0.9081011414527893, + "learning_rate": 1.8957749120750293e-05, + "loss": 0.3925, + "step": 67700 + }, + { + "epoch": 5.294806716126513, + "grad_norm": 1.0813019275665283, + "learning_rate": 1.895618601016022e-05, + "loss": 0.3797, + "step": 67800 + }, + { + "epoch": 5.302616165560328, + "grad_norm": 1.2039158344268799, + "learning_rate": 1.8954622899570145e-05, + "loss": 0.3963, + "step": 67900 + }, + { + "epoch": 5.310425614994143, + "grad_norm": 1.1755808591842651, + "learning_rate": 1.895305978898007e-05, + "loss": 0.3943, + "step": 68000 + }, + { + "epoch": 5.318235064427958, + "grad_norm": 1.0200046300888062, + "learning_rate": 1.8951496678389997e-05, + "loss": 0.3553, + "step": 68100 + }, + { + "epoch": 5.326044513861773, + "grad_norm": 1.0487428903579712, + "learning_rate": 1.8949933567799923e-05, + "loss": 0.3786, + "step": 68200 + }, + { + "epoch": 5.333853963295588, + "grad_norm": 0.8831790685653687, + "learning_rate": 1.894837045720985e-05, + "loss": 0.4052, + "step": 68300 + }, + { + "epoch": 5.341663412729402, + "grad_norm": 1.000813603401184, + "learning_rate": 1.8946807346619775e-05, + "loss": 0.3923, + "step": 68400 + }, + { + "epoch": 5.349472862163218, + "grad_norm": 1.2624138593673706, + "learning_rate": 1.89452442360297e-05, + "loss": 0.3763, + "step": 68500 + }, + { + "epoch": 5.357282311597032, + "grad_norm": 1.0321928262710571, + "learning_rate": 1.8943696756545527e-05, + "loss": 0.3906, + "step": 68600 + }, + { + "epoch": 5.365091761030848, + "grad_norm": 0.8170016407966614, + "learning_rate": 1.8942133645955453e-05, + "loss": 0.3677, + "step": 68700 + }, + { + "epoch": 5.372901210464662, + "grad_norm": 1.0419316291809082, + "learning_rate": 1.894057053536538e-05, + "loss": 0.3936, + "step": 68800 + }, + { + "epoch": 5.380710659898477, + "grad_norm": 1.0884121656417847, + "learning_rate": 1.8939007424775305e-05, + "loss": 0.3662, + "step": 68900 + }, + { + "epoch": 5.388520109332292, + "grad_norm": 1.6693960428237915, + "learning_rate": 1.8937444314185228e-05, + "loss": 0.3795, + "step": 69000 + }, + { + "epoch": 5.396329558766107, + "grad_norm": 0.8068119883537292, + "learning_rate": 1.8935881203595157e-05, + "loss": 0.3701, + "step": 69100 + }, + { + "epoch": 5.404139008199921, + "grad_norm": 1.0311602354049683, + "learning_rate": 1.8934318093005083e-05, + "loss": 0.3643, + "step": 69200 + }, + { + "epoch": 5.411948457633737, + "grad_norm": 0.9586812853813171, + "learning_rate": 1.8932754982415006e-05, + "loss": 0.3765, + "step": 69300 + }, + { + "epoch": 5.4197579070675515, + "grad_norm": 1.1380528211593628, + "learning_rate": 1.8931191871824932e-05, + "loss": 0.3875, + "step": 69400 + }, + { + "epoch": 5.427567356501367, + "grad_norm": 0.8221355080604553, + "learning_rate": 1.8929628761234858e-05, + "loss": 0.3686, + "step": 69500 + }, + { + "epoch": 5.4353768059351815, + "grad_norm": 1.1208763122558594, + "learning_rate": 1.8928065650644784e-05, + "loss": 0.3808, + "step": 69600 + }, + { + "epoch": 5.443186255368996, + "grad_norm": 0.9474813342094421, + "learning_rate": 1.892650254005471e-05, + "loss": 0.3809, + "step": 69700 + }, + { + "epoch": 5.4509957048028115, + "grad_norm": 1.1498290300369263, + "learning_rate": 1.8924939429464636e-05, + "loss": 0.3921, + "step": 69800 + }, + { + "epoch": 5.458805154236626, + "grad_norm": 1.1263091564178467, + "learning_rate": 1.8923376318874562e-05, + "loss": 0.3893, + "step": 69900 + }, + { + "epoch": 5.4666146036704415, + "grad_norm": 1.2363600730895996, + "learning_rate": 1.8921813208284488e-05, + "loss": 0.3848, + "step": 70000 + }, + { + "epoch": 5.474424053104256, + "grad_norm": 0.9119688868522644, + "learning_rate": 1.8920250097694414e-05, + "loss": 0.3778, + "step": 70100 + }, + { + "epoch": 5.482233502538071, + "grad_norm": 1.0323618650436401, + "learning_rate": 1.891868698710434e-05, + "loss": 0.3777, + "step": 70200 + }, + { + "epoch": 5.490042951971886, + "grad_norm": 1.2179909944534302, + "learning_rate": 1.8917123876514266e-05, + "loss": 0.3782, + "step": 70300 + }, + { + "epoch": 5.497852401405701, + "grad_norm": 1.1143534183502197, + "learning_rate": 1.891556076592419e-05, + "loss": 0.3746, + "step": 70400 + }, + { + "epoch": 5.505661850839516, + "grad_norm": 0.9338456392288208, + "learning_rate": 1.8913997655334115e-05, + "loss": 0.3636, + "step": 70500 + }, + { + "epoch": 5.513471300273331, + "grad_norm": 1.06593918800354, + "learning_rate": 1.8912434544744044e-05, + "loss": 0.3818, + "step": 70600 + }, + { + "epoch": 5.521280749707145, + "grad_norm": 1.015339970588684, + "learning_rate": 1.8910871434153967e-05, + "loss": 0.376, + "step": 70700 + }, + { + "epoch": 5.529090199140961, + "grad_norm": 1.016461968421936, + "learning_rate": 1.8909308323563893e-05, + "loss": 0.3759, + "step": 70800 + }, + { + "epoch": 5.536899648574775, + "grad_norm": 0.9616903066635132, + "learning_rate": 1.890774521297382e-05, + "loss": 0.3663, + "step": 70900 + }, + { + "epoch": 5.544709098008591, + "grad_norm": 1.0220727920532227, + "learning_rate": 1.8906182102383745e-05, + "loss": 0.385, + "step": 71000 + }, + { + "epoch": 5.552518547442405, + "grad_norm": 1.0261396169662476, + "learning_rate": 1.890461899179367e-05, + "loss": 0.3859, + "step": 71100 + }, + { + "epoch": 5.56032799687622, + "grad_norm": 0.9234685897827148, + "learning_rate": 1.8903055881203597e-05, + "loss": 0.3743, + "step": 71200 + }, + { + "epoch": 5.568137446310035, + "grad_norm": 1.1062054634094238, + "learning_rate": 1.8901492770613523e-05, + "loss": 0.3909, + "step": 71300 + }, + { + "epoch": 5.57594689574385, + "grad_norm": 0.9592029452323914, + "learning_rate": 1.889992966002345e-05, + "loss": 0.3788, + "step": 71400 + }, + { + "epoch": 5.583756345177665, + "grad_norm": 1.0946000814437866, + "learning_rate": 1.8898366549433375e-05, + "loss": 0.392, + "step": 71500 + }, + { + "epoch": 5.59156579461148, + "grad_norm": 0.8922383189201355, + "learning_rate": 1.88968034388433e-05, + "loss": 0.3674, + "step": 71600 + }, + { + "epoch": 5.599375244045294, + "grad_norm": 1.0789119005203247, + "learning_rate": 1.8895240328253227e-05, + "loss": 0.3652, + "step": 71700 + }, + { + "epoch": 5.60718469347911, + "grad_norm": 1.132554292678833, + "learning_rate": 1.889367721766315e-05, + "loss": 0.3624, + "step": 71800 + }, + { + "epoch": 5.614994142912924, + "grad_norm": 0.995639979839325, + "learning_rate": 1.8892114107073076e-05, + "loss": 0.3604, + "step": 71900 + }, + { + "epoch": 5.62280359234674, + "grad_norm": 0.9848433136940002, + "learning_rate": 1.8890550996483005e-05, + "loss": 0.3648, + "step": 72000 + }, + { + "epoch": 5.630613041780554, + "grad_norm": 1.3442597389221191, + "learning_rate": 1.8888987885892928e-05, + "loss": 0.3723, + "step": 72100 + }, + { + "epoch": 5.638422491214369, + "grad_norm": 1.3660664558410645, + "learning_rate": 1.8887424775302854e-05, + "loss": 0.3633, + "step": 72200 + }, + { + "epoch": 5.6462319406481845, + "grad_norm": 1.0371010303497314, + "learning_rate": 1.888586166471278e-05, + "loss": 0.3651, + "step": 72300 + }, + { + "epoch": 5.654041390081999, + "grad_norm": 1.0066925287246704, + "learning_rate": 1.8884298554122706e-05, + "loss": 0.379, + "step": 72400 + }, + { + "epoch": 5.6618508395158145, + "grad_norm": 1.2986984252929688, + "learning_rate": 1.888273544353263e-05, + "loss": 0.376, + "step": 72500 + }, + { + "epoch": 5.669660288949629, + "grad_norm": 0.9885306358337402, + "learning_rate": 1.8881187964048458e-05, + "loss": 0.3772, + "step": 72600 + }, + { + "epoch": 5.677469738383444, + "grad_norm": 0.8843153119087219, + "learning_rate": 1.8879624853458384e-05, + "loss": 0.376, + "step": 72700 + }, + { + "epoch": 5.685279187817259, + "grad_norm": 1.0636779069900513, + "learning_rate": 1.887806174286831e-05, + "loss": 0.3726, + "step": 72800 + }, + { + "epoch": 5.693088637251074, + "grad_norm": 0.9811561107635498, + "learning_rate": 1.8876498632278236e-05, + "loss": 0.3711, + "step": 72900 + }, + { + "epoch": 5.700898086684889, + "grad_norm": 1.421799659729004, + "learning_rate": 1.887493552168816e-05, + "loss": 0.3763, + "step": 73000 + }, + { + "epoch": 5.708707536118704, + "grad_norm": 1.3165417909622192, + "learning_rate": 1.8873372411098088e-05, + "loss": 0.3686, + "step": 73100 + }, + { + "epoch": 5.716516985552518, + "grad_norm": 0.988136887550354, + "learning_rate": 1.8871809300508014e-05, + "loss": 0.3796, + "step": 73200 + }, + { + "epoch": 5.724326434986334, + "grad_norm": 0.8610849976539612, + "learning_rate": 1.8870246189917936e-05, + "loss": 0.3652, + "step": 73300 + }, + { + "epoch": 5.732135884420148, + "grad_norm": 1.2296196222305298, + "learning_rate": 1.8868683079327862e-05, + "loss": 0.3879, + "step": 73400 + }, + { + "epoch": 5.739945333853964, + "grad_norm": 1.0265551805496216, + "learning_rate": 1.8867119968737792e-05, + "loss": 0.3806, + "step": 73500 + }, + { + "epoch": 5.747754783287778, + "grad_norm": 0.9798378944396973, + "learning_rate": 1.8865556858147714e-05, + "loss": 0.3747, + "step": 73600 + }, + { + "epoch": 5.755564232721593, + "grad_norm": 1.0334645509719849, + "learning_rate": 1.886399374755764e-05, + "loss": 0.3625, + "step": 73700 + }, + { + "epoch": 5.763373682155408, + "grad_norm": 0.8765638470649719, + "learning_rate": 1.886244626807347e-05, + "loss": 0.3622, + "step": 73800 + }, + { + "epoch": 5.771183131589223, + "grad_norm": 0.7884292602539062, + "learning_rate": 1.8860883157483392e-05, + "loss": 0.3673, + "step": 73900 + }, + { + "epoch": 5.778992581023038, + "grad_norm": 1.13102388381958, + "learning_rate": 1.8859320046893318e-05, + "loss": 0.3713, + "step": 74000 + }, + { + "epoch": 5.786802030456853, + "grad_norm": 1.0099087953567505, + "learning_rate": 1.8857756936303244e-05, + "loss": 0.3593, + "step": 74100 + }, + { + "epoch": 5.794611479890667, + "grad_norm": 1.127163290977478, + "learning_rate": 1.885619382571317e-05, + "loss": 0.3801, + "step": 74200 + }, + { + "epoch": 5.802420929324483, + "grad_norm": 1.2953687906265259, + "learning_rate": 1.8854630715123096e-05, + "loss": 0.3882, + "step": 74300 + }, + { + "epoch": 5.810230378758297, + "grad_norm": 1.0948154926300049, + "learning_rate": 1.8853067604533022e-05, + "loss": 0.3704, + "step": 74400 + }, + { + "epoch": 5.818039828192113, + "grad_norm": 1.1528011560440063, + "learning_rate": 1.885150449394295e-05, + "loss": 0.3705, + "step": 74500 + }, + { + "epoch": 5.825849277625927, + "grad_norm": 1.071085810661316, + "learning_rate": 1.8849941383352874e-05, + "loss": 0.3593, + "step": 74600 + }, + { + "epoch": 5.833658727059742, + "grad_norm": 0.8655598163604736, + "learning_rate": 1.88483782727628e-05, + "loss": 0.3638, + "step": 74700 + }, + { + "epoch": 5.841468176493557, + "grad_norm": 0.6879515051841736, + "learning_rate": 1.8846815162172723e-05, + "loss": 0.3688, + "step": 74800 + }, + { + "epoch": 5.849277625927372, + "grad_norm": 1.0214784145355225, + "learning_rate": 1.8845252051582652e-05, + "loss": 0.3496, + "step": 74900 + }, + { + "epoch": 5.8570870753611874, + "grad_norm": 1.1457356214523315, + "learning_rate": 1.884368894099258e-05, + "loss": 0.3651, + "step": 75000 + }, + { + "epoch": 5.864896524795002, + "grad_norm": 0.903660774230957, + "learning_rate": 1.88421258304025e-05, + "loss": 0.3762, + "step": 75100 + }, + { + "epoch": 5.872705974228817, + "grad_norm": 0.9821125864982605, + "learning_rate": 1.8840562719812427e-05, + "loss": 0.3595, + "step": 75200 + }, + { + "epoch": 5.880515423662632, + "grad_norm": 0.7476074695587158, + "learning_rate": 1.8838999609222356e-05, + "loss": 0.3775, + "step": 75300 + }, + { + "epoch": 5.888324873096447, + "grad_norm": 1.0855532884597778, + "learning_rate": 1.883743649863228e-05, + "loss": 0.3659, + "step": 75400 + }, + { + "epoch": 5.896134322530262, + "grad_norm": 0.9092568755149841, + "learning_rate": 1.8835873388042205e-05, + "loss": 0.3722, + "step": 75500 + }, + { + "epoch": 5.903943771964077, + "grad_norm": 1.1002038717269897, + "learning_rate": 1.883431027745213e-05, + "loss": 0.3722, + "step": 75600 + }, + { + "epoch": 5.911753221397891, + "grad_norm": 1.1699854135513306, + "learning_rate": 1.8832747166862057e-05, + "loss": 0.3671, + "step": 75700 + }, + { + "epoch": 5.919562670831707, + "grad_norm": 0.9368227124214172, + "learning_rate": 1.8831184056271983e-05, + "loss": 0.3739, + "step": 75800 + }, + { + "epoch": 5.927372120265521, + "grad_norm": 0.9039379358291626, + "learning_rate": 1.882962094568191e-05, + "loss": 0.3704, + "step": 75900 + }, + { + "epoch": 5.935181569699337, + "grad_norm": 1.0861670970916748, + "learning_rate": 1.8828057835091835e-05, + "loss": 0.361, + "step": 76000 + }, + { + "epoch": 5.942991019133151, + "grad_norm": 0.8491230607032776, + "learning_rate": 1.882649472450176e-05, + "loss": 0.3603, + "step": 76100 + }, + { + "epoch": 5.950800468566966, + "grad_norm": 0.9066652059555054, + "learning_rate": 1.8824931613911687e-05, + "loss": 0.3529, + "step": 76200 + }, + { + "epoch": 5.958609918000781, + "grad_norm": 1.0582003593444824, + "learning_rate": 1.882336850332161e-05, + "loss": 0.355, + "step": 76300 + }, + { + "epoch": 5.966419367434596, + "grad_norm": 0.9515270590782166, + "learning_rate": 1.882180539273154e-05, + "loss": 0.3641, + "step": 76400 + }, + { + "epoch": 5.974228816868411, + "grad_norm": 0.9640957117080688, + "learning_rate": 1.8820242282141462e-05, + "loss": 0.3628, + "step": 76500 + }, + { + "epoch": 5.982038266302226, + "grad_norm": 1.3416815996170044, + "learning_rate": 1.8818679171551388e-05, + "loss": 0.3406, + "step": 76600 + }, + { + "epoch": 5.98984771573604, + "grad_norm": 0.8273581266403198, + "learning_rate": 1.8817131692067214e-05, + "loss": 0.3539, + "step": 76700 + }, + { + "epoch": 5.997657165169856, + "grad_norm": 0.769110918045044, + "learning_rate": 1.8815568581477143e-05, + "loss": 0.3748, + "step": 76800 + }, + { + "epoch": 6.00546661460367, + "grad_norm": 1.0953476428985596, + "learning_rate": 1.8814005470887066e-05, + "loss": 0.358, + "step": 76900 + }, + { + "epoch": 6.013276064037485, + "grad_norm": 1.1051218509674072, + "learning_rate": 1.8812442360296992e-05, + "loss": 0.3608, + "step": 77000 + }, + { + "epoch": 6.0210855134713, + "grad_norm": 0.9927105903625488, + "learning_rate": 1.8810879249706918e-05, + "loss": 0.3529, + "step": 77100 + }, + { + "epoch": 6.028894962905115, + "grad_norm": 1.1750438213348389, + "learning_rate": 1.8809316139116844e-05, + "loss": 0.362, + "step": 77200 + }, + { + "epoch": 6.03670441233893, + "grad_norm": 0.902812123298645, + "learning_rate": 1.880775302852677e-05, + "loss": 0.3606, + "step": 77300 + }, + { + "epoch": 6.044513861772745, + "grad_norm": 0.9053332209587097, + "learning_rate": 1.8806189917936696e-05, + "loss": 0.3667, + "step": 77400 + }, + { + "epoch": 6.0523233112065595, + "grad_norm": 1.2229048013687134, + "learning_rate": 1.8804626807346622e-05, + "loss": 0.3665, + "step": 77500 + }, + { + "epoch": 6.060132760640375, + "grad_norm": 0.9037619829177856, + "learning_rate": 1.8803063696756548e-05, + "loss": 0.3526, + "step": 77600 + }, + { + "epoch": 6.0679422100741895, + "grad_norm": 1.2268940210342407, + "learning_rate": 1.8801500586166474e-05, + "loss": 0.3654, + "step": 77700 + }, + { + "epoch": 6.075751659508005, + "grad_norm": 1.1816377639770508, + "learning_rate": 1.8799937475576396e-05, + "loss": 0.3538, + "step": 77800 + }, + { + "epoch": 6.08356110894182, + "grad_norm": 1.0596685409545898, + "learning_rate": 1.8798374364986326e-05, + "loss": 0.3378, + "step": 77900 + }, + { + "epoch": 6.091370558375634, + "grad_norm": 0.9817838668823242, + "learning_rate": 1.879681125439625e-05, + "loss": 0.3522, + "step": 78000 + }, + { + "epoch": 6.09918000780945, + "grad_norm": 0.9671791791915894, + "learning_rate": 1.8795248143806175e-05, + "loss": 0.3695, + "step": 78100 + }, + { + "epoch": 6.106989457243264, + "grad_norm": 0.9326332211494446, + "learning_rate": 1.8793685033216104e-05, + "loss": 0.3712, + "step": 78200 + }, + { + "epoch": 6.11479890667708, + "grad_norm": 0.8805835247039795, + "learning_rate": 1.8792121922626027e-05, + "loss": 0.3694, + "step": 78300 + }, + { + "epoch": 6.122608356110894, + "grad_norm": 0.9570161700248718, + "learning_rate": 1.8790558812035953e-05, + "loss": 0.3509, + "step": 78400 + }, + { + "epoch": 6.130417805544709, + "grad_norm": 0.7668564319610596, + "learning_rate": 1.878899570144588e-05, + "loss": 0.3582, + "step": 78500 + }, + { + "epoch": 6.138227254978524, + "grad_norm": 1.0919289588928223, + "learning_rate": 1.8787432590855805e-05, + "loss": 0.3492, + "step": 78600 + }, + { + "epoch": 6.146036704412339, + "grad_norm": 1.092902421951294, + "learning_rate": 1.878586948026573e-05, + "loss": 0.354, + "step": 78700 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 1.1573113203048706, + "learning_rate": 1.8784306369675657e-05, + "loss": 0.3553, + "step": 78800 + }, + { + "epoch": 6.161655603279969, + "grad_norm": 1.2066175937652588, + "learning_rate": 1.8782743259085583e-05, + "loss": 0.338, + "step": 78900 + }, + { + "epoch": 6.169465052713783, + "grad_norm": 0.9877097010612488, + "learning_rate": 1.878118014849551e-05, + "loss": 0.3546, + "step": 79000 + }, + { + "epoch": 6.177274502147599, + "grad_norm": 1.2525135278701782, + "learning_rate": 1.8779632669011335e-05, + "loss": 0.3601, + "step": 79100 + }, + { + "epoch": 6.185083951581413, + "grad_norm": 1.107706069946289, + "learning_rate": 1.877806955842126e-05, + "loss": 0.3737, + "step": 79200 + }, + { + "epoch": 6.192893401015229, + "grad_norm": 1.1282185316085815, + "learning_rate": 1.8776506447831187e-05, + "loss": 0.3628, + "step": 79300 + }, + { + "epoch": 6.200702850449043, + "grad_norm": 1.0635756254196167, + "learning_rate": 1.8774943337241113e-05, + "loss": 0.3492, + "step": 79400 + }, + { + "epoch": 6.208512299882858, + "grad_norm": 1.1150610446929932, + "learning_rate": 1.8773380226651035e-05, + "loss": 0.3581, + "step": 79500 + }, + { + "epoch": 6.216321749316673, + "grad_norm": 1.0393744707107544, + "learning_rate": 1.877181711606096e-05, + "loss": 0.3519, + "step": 79600 + }, + { + "epoch": 6.224131198750488, + "grad_norm": 1.285361647605896, + "learning_rate": 1.877025400547089e-05, + "loss": 0.3533, + "step": 79700 + }, + { + "epoch": 6.231940648184303, + "grad_norm": 1.00641930103302, + "learning_rate": 1.8768690894880813e-05, + "loss": 0.3625, + "step": 79800 + }, + { + "epoch": 6.239750097618118, + "grad_norm": 1.0835678577423096, + "learning_rate": 1.876712778429074e-05, + "loss": 0.353, + "step": 79900 + }, + { + "epoch": 6.2475595470519325, + "grad_norm": 1.0340200662612915, + "learning_rate": 1.8765564673700665e-05, + "loss": 0.3632, + "step": 80000 + }, + { + "epoch": 6.255368996485748, + "grad_norm": 0.8271421790122986, + "learning_rate": 1.876400156311059e-05, + "loss": 0.3527, + "step": 80100 + }, + { + "epoch": 6.2631784459195625, + "grad_norm": 1.2229000329971313, + "learning_rate": 1.8762438452520517e-05, + "loss": 0.3622, + "step": 80200 + }, + { + "epoch": 6.270987895353378, + "grad_norm": 1.1166315078735352, + "learning_rate": 1.8760875341930443e-05, + "loss": 0.348, + "step": 80300 + }, + { + "epoch": 6.2787973447871925, + "grad_norm": 0.9366671442985535, + "learning_rate": 1.875931223134037e-05, + "loss": 0.343, + "step": 80400 + }, + { + "epoch": 6.286606794221007, + "grad_norm": 0.799643874168396, + "learning_rate": 1.8757749120750295e-05, + "loss": 0.3418, + "step": 80500 + }, + { + "epoch": 6.2944162436548226, + "grad_norm": 1.2833951711654663, + "learning_rate": 1.875618601016022e-05, + "loss": 0.3575, + "step": 80600 + }, + { + "epoch": 6.302225693088637, + "grad_norm": 1.0523055791854858, + "learning_rate": 1.8754622899570144e-05, + "loss": 0.3582, + "step": 80700 + }, + { + "epoch": 6.310035142522453, + "grad_norm": 0.8948925733566284, + "learning_rate": 1.8753059788980073e-05, + "loss": 0.3533, + "step": 80800 + }, + { + "epoch": 6.317844591956267, + "grad_norm": 0.8339662551879883, + "learning_rate": 1.8751496678389996e-05, + "loss": 0.3484, + "step": 80900 + }, + { + "epoch": 6.325654041390082, + "grad_norm": 1.0875813961029053, + "learning_rate": 1.8749933567799922e-05, + "loss": 0.3504, + "step": 81000 + }, + { + "epoch": 6.333463490823897, + "grad_norm": 0.8726950883865356, + "learning_rate": 1.874837045720985e-05, + "loss": 0.3666, + "step": 81100 + }, + { + "epoch": 6.341272940257712, + "grad_norm": 0.8508704900741577, + "learning_rate": 1.8746807346619774e-05, + "loss": 0.3587, + "step": 81200 + }, + { + "epoch": 6.349082389691527, + "grad_norm": 1.074478030204773, + "learning_rate": 1.87452442360297e-05, + "loss": 0.3668, + "step": 81300 + }, + { + "epoch": 6.356891839125342, + "grad_norm": 0.9604992866516113, + "learning_rate": 1.8743681125439626e-05, + "loss": 0.3254, + "step": 81400 + }, + { + "epoch": 6.364701288559156, + "grad_norm": 1.0126805305480957, + "learning_rate": 1.8742118014849552e-05, + "loss": 0.3337, + "step": 81500 + }, + { + "epoch": 6.372510737992972, + "grad_norm": 1.1412858963012695, + "learning_rate": 1.8740554904259478e-05, + "loss": 0.3438, + "step": 81600 + }, + { + "epoch": 6.380320187426786, + "grad_norm": 0.9028871655464172, + "learning_rate": 1.8738991793669404e-05, + "loss": 0.3522, + "step": 81700 + }, + { + "epoch": 6.388129636860601, + "grad_norm": 1.0400549173355103, + "learning_rate": 1.873742868307933e-05, + "loss": 0.3572, + "step": 81800 + }, + { + "epoch": 6.395939086294416, + "grad_norm": 1.0760631561279297, + "learning_rate": 1.8735865572489256e-05, + "loss": 0.3318, + "step": 81900 + }, + { + "epoch": 6.403748535728231, + "grad_norm": 1.1983120441436768, + "learning_rate": 1.8734302461899182e-05, + "loss": 0.3498, + "step": 82000 + }, + { + "epoch": 6.411557985162046, + "grad_norm": 1.2458655834197998, + "learning_rate": 1.8732739351309105e-05, + "loss": 0.3351, + "step": 82100 + }, + { + "epoch": 6.419367434595861, + "grad_norm": 0.966160237789154, + "learning_rate": 1.8731176240719034e-05, + "loss": 0.3362, + "step": 82200 + }, + { + "epoch": 6.4271768840296755, + "grad_norm": 1.1860681772232056, + "learning_rate": 1.872961313012896e-05, + "loss": 0.3535, + "step": 82300 + }, + { + "epoch": 6.434986333463491, + "grad_norm": 1.1215561628341675, + "learning_rate": 1.8728050019538883e-05, + "loss": 0.3474, + "step": 82400 + }, + { + "epoch": 6.4427957828973055, + "grad_norm": 1.008365273475647, + "learning_rate": 1.872648690894881e-05, + "loss": 0.3378, + "step": 82500 + }, + { + "epoch": 6.450605232331121, + "grad_norm": 1.243260145187378, + "learning_rate": 1.8724923798358735e-05, + "loss": 0.3516, + "step": 82600 + }, + { + "epoch": 6.4584146817649355, + "grad_norm": 0.759389340877533, + "learning_rate": 1.872336068776866e-05, + "loss": 0.3533, + "step": 82700 + }, + { + "epoch": 6.46622413119875, + "grad_norm": 1.0799552202224731, + "learning_rate": 1.8721797577178587e-05, + "loss": 0.3433, + "step": 82800 + }, + { + "epoch": 6.4740335806325655, + "grad_norm": 1.1244173049926758, + "learning_rate": 1.8720234466588513e-05, + "loss": 0.3394, + "step": 82900 + }, + { + "epoch": 6.48184303006638, + "grad_norm": 1.2120684385299683, + "learning_rate": 1.871868698710434e-05, + "loss": 0.3589, + "step": 83000 + }, + { + "epoch": 6.4896524795001955, + "grad_norm": 0.8704874515533447, + "learning_rate": 1.8717123876514265e-05, + "loss": 0.3509, + "step": 83100 + }, + { + "epoch": 6.49746192893401, + "grad_norm": 0.8718099594116211, + "learning_rate": 1.871556076592419e-05, + "loss": 0.3298, + "step": 83200 + }, + { + "epoch": 6.505271378367825, + "grad_norm": 1.084702968597412, + "learning_rate": 1.8713997655334117e-05, + "loss": 0.3418, + "step": 83300 + }, + { + "epoch": 6.51308082780164, + "grad_norm": 1.082650065422058, + "learning_rate": 1.8712434544744043e-05, + "loss": 0.3347, + "step": 83400 + }, + { + "epoch": 6.520890277235455, + "grad_norm": 0.9728855490684509, + "learning_rate": 1.871087143415397e-05, + "loss": 0.347, + "step": 83500 + }, + { + "epoch": 6.52869972666927, + "grad_norm": 0.9495226144790649, + "learning_rate": 1.870930832356389e-05, + "loss": 0.3525, + "step": 83600 + }, + { + "epoch": 6.536509176103085, + "grad_norm": 1.1406092643737793, + "learning_rate": 1.870774521297382e-05, + "loss": 0.3496, + "step": 83700 + }, + { + "epoch": 6.544318625536899, + "grad_norm": 1.1625540256500244, + "learning_rate": 1.8706182102383747e-05, + "loss": 0.3548, + "step": 83800 + }, + { + "epoch": 6.552128074970715, + "grad_norm": 1.0920753479003906, + "learning_rate": 1.870461899179367e-05, + "loss": 0.3521, + "step": 83900 + }, + { + "epoch": 6.559937524404529, + "grad_norm": 1.1663368940353394, + "learning_rate": 1.8703055881203595e-05, + "loss": 0.3556, + "step": 84000 + }, + { + "epoch": 6.567746973838345, + "grad_norm": 0.8520887494087219, + "learning_rate": 1.870149277061352e-05, + "loss": 0.3228, + "step": 84100 + }, + { + "epoch": 6.575556423272159, + "grad_norm": 0.9098398089408875, + "learning_rate": 1.8699929660023447e-05, + "loss": 0.3316, + "step": 84200 + }, + { + "epoch": 6.583365872705974, + "grad_norm": 1.0251222848892212, + "learning_rate": 1.8698366549433373e-05, + "loss": 0.332, + "step": 84300 + }, + { + "epoch": 6.591175322139789, + "grad_norm": 1.014219880104065, + "learning_rate": 1.86968034388433e-05, + "loss": 0.3621, + "step": 84400 + }, + { + "epoch": 6.598984771573604, + "grad_norm": 0.9549919366836548, + "learning_rate": 1.8695240328253226e-05, + "loss": 0.3337, + "step": 84500 + }, + { + "epoch": 6.606794221007419, + "grad_norm": 1.220858097076416, + "learning_rate": 1.869367721766315e-05, + "loss": 0.3476, + "step": 84600 + }, + { + "epoch": 6.614603670441234, + "grad_norm": 1.041111946105957, + "learning_rate": 1.8692114107073078e-05, + "loss": 0.3322, + "step": 84700 + }, + { + "epoch": 6.622413119875048, + "grad_norm": 1.009423017501831, + "learning_rate": 1.8690550996483004e-05, + "loss": 0.3638, + "step": 84800 + }, + { + "epoch": 6.630222569308864, + "grad_norm": 0.9405770897865295, + "learning_rate": 1.868898788589293e-05, + "loss": 0.3493, + "step": 84900 + }, + { + "epoch": 6.6380320187426785, + "grad_norm": 1.2056901454925537, + "learning_rate": 1.8687424775302852e-05, + "loss": 0.3566, + "step": 85000 + }, + { + "epoch": 6.645841468176494, + "grad_norm": 0.9662819504737854, + "learning_rate": 1.8685861664712778e-05, + "loss": 0.3394, + "step": 85100 + }, + { + "epoch": 6.6536509176103085, + "grad_norm": 1.293601155281067, + "learning_rate": 1.8684298554122708e-05, + "loss": 0.3166, + "step": 85200 + }, + { + "epoch": 6.661460367044123, + "grad_norm": 0.8332231640815735, + "learning_rate": 1.868273544353263e-05, + "loss": 0.3384, + "step": 85300 + }, + { + "epoch": 6.6692698164779385, + "grad_norm": 1.139176368713379, + "learning_rate": 1.8681172332942556e-05, + "loss": 0.3368, + "step": 85400 + }, + { + "epoch": 6.677079265911753, + "grad_norm": 1.0874099731445312, + "learning_rate": 1.8679609222352482e-05, + "loss": 0.3493, + "step": 85500 + }, + { + "epoch": 6.6848887153455685, + "grad_norm": 1.0646657943725586, + "learning_rate": 1.8678046111762408e-05, + "loss": 0.3465, + "step": 85600 + }, + { + "epoch": 6.692698164779383, + "grad_norm": 0.8554918169975281, + "learning_rate": 1.8676483001172334e-05, + "loss": 0.3462, + "step": 85700 + }, + { + "epoch": 6.700507614213198, + "grad_norm": 1.0210607051849365, + "learning_rate": 1.867491989058226e-05, + "loss": 0.341, + "step": 85800 + }, + { + "epoch": 6.708317063647013, + "grad_norm": 0.9455381035804749, + "learning_rate": 1.8673356779992186e-05, + "loss": 0.3463, + "step": 85900 + }, + { + "epoch": 6.716126513080828, + "grad_norm": 0.9599422216415405, + "learning_rate": 1.8671793669402112e-05, + "loss": 0.3549, + "step": 86000 + }, + { + "epoch": 6.723935962514643, + "grad_norm": 0.9664759635925293, + "learning_rate": 1.8670230558812038e-05, + "loss": 0.3192, + "step": 86100 + }, + { + "epoch": 6.731745411948458, + "grad_norm": 0.8134546279907227, + "learning_rate": 1.8668683079327864e-05, + "loss": 0.3343, + "step": 86200 + }, + { + "epoch": 6.739554861382272, + "grad_norm": 1.147776484489441, + "learning_rate": 1.866711996873779e-05, + "loss": 0.3519, + "step": 86300 + }, + { + "epoch": 6.747364310816088, + "grad_norm": 1.2324354648590088, + "learning_rate": 1.8665556858147716e-05, + "loss": 0.3353, + "step": 86400 + }, + { + "epoch": 6.755173760249902, + "grad_norm": 1.0100579261779785, + "learning_rate": 1.866399374755764e-05, + "loss": 0.3516, + "step": 86500 + }, + { + "epoch": 6.762983209683718, + "grad_norm": 1.2178475856781006, + "learning_rate": 1.8662430636967568e-05, + "loss": 0.3475, + "step": 86600 + }, + { + "epoch": 6.770792659117532, + "grad_norm": 1.2060697078704834, + "learning_rate": 1.8660867526377494e-05, + "loss": 0.3363, + "step": 86700 + }, + { + "epoch": 6.778602108551347, + "grad_norm": 0.8602226376533508, + "learning_rate": 1.8659304415787417e-05, + "loss": 0.3435, + "step": 86800 + }, + { + "epoch": 6.786411557985162, + "grad_norm": 1.0413028001785278, + "learning_rate": 1.8657741305197343e-05, + "loss": 0.3349, + "step": 86900 + }, + { + "epoch": 6.794221007418977, + "grad_norm": 0.7829965353012085, + "learning_rate": 1.8656178194607272e-05, + "loss": 0.3344, + "step": 87000 + }, + { + "epoch": 6.802030456852792, + "grad_norm": 1.249098777770996, + "learning_rate": 1.8654615084017195e-05, + "loss": 0.3507, + "step": 87100 + }, + { + "epoch": 6.809839906286607, + "grad_norm": 1.135725975036621, + "learning_rate": 1.865305197342712e-05, + "loss": 0.3376, + "step": 87200 + }, + { + "epoch": 6.817649355720421, + "grad_norm": 0.9486784934997559, + "learning_rate": 1.8651488862837047e-05, + "loss": 0.3389, + "step": 87300 + }, + { + "epoch": 6.825458805154237, + "grad_norm": 1.0629407167434692, + "learning_rate": 1.8649925752246973e-05, + "loss": 0.3379, + "step": 87400 + }, + { + "epoch": 6.833268254588051, + "grad_norm": 1.0095854997634888, + "learning_rate": 1.86483626416569e-05, + "loss": 0.3235, + "step": 87500 + }, + { + "epoch": 6.841077704021867, + "grad_norm": 0.8875960111618042, + "learning_rate": 1.8646799531066825e-05, + "loss": 0.3326, + "step": 87600 + }, + { + "epoch": 6.848887153455681, + "grad_norm": 1.103423833847046, + "learning_rate": 1.864523642047675e-05, + "loss": 0.3309, + "step": 87700 + }, + { + "epoch": 6.856696602889496, + "grad_norm": 0.9338161945343018, + "learning_rate": 1.8643673309886677e-05, + "loss": 0.3386, + "step": 87800 + }, + { + "epoch": 6.8645060523233115, + "grad_norm": 1.395498514175415, + "learning_rate": 1.8642110199296603e-05, + "loss": 0.3509, + "step": 87900 + }, + { + "epoch": 6.872315501757126, + "grad_norm": 0.8791838884353638, + "learning_rate": 1.8640547088706526e-05, + "loss": 0.3465, + "step": 88000 + }, + { + "epoch": 6.8801249511909415, + "grad_norm": 1.1168427467346191, + "learning_rate": 1.8638983978116455e-05, + "loss": 0.3387, + "step": 88100 + }, + { + "epoch": 6.887934400624756, + "grad_norm": 0.8936362266540527, + "learning_rate": 1.8637420867526378e-05, + "loss": 0.3504, + "step": 88200 + }, + { + "epoch": 6.895743850058571, + "grad_norm": 0.7844634056091309, + "learning_rate": 1.8635857756936304e-05, + "loss": 0.3163, + "step": 88300 + }, + { + "epoch": 6.903553299492386, + "grad_norm": 1.28450608253479, + "learning_rate": 1.8634294646346233e-05, + "loss": 0.3283, + "step": 88400 + }, + { + "epoch": 6.911362748926201, + "grad_norm": 1.0096105337142944, + "learning_rate": 1.8632731535756156e-05, + "loss": 0.329, + "step": 88500 + }, + { + "epoch": 6.919172198360016, + "grad_norm": 1.2057095766067505, + "learning_rate": 1.8631168425166082e-05, + "loss": 0.3366, + "step": 88600 + }, + { + "epoch": 6.926981647793831, + "grad_norm": 1.0522247552871704, + "learning_rate": 1.8629605314576008e-05, + "loss": 0.3173, + "step": 88700 + }, + { + "epoch": 6.934791097227645, + "grad_norm": 0.8380939364433289, + "learning_rate": 1.8628042203985934e-05, + "loss": 0.3254, + "step": 88800 + }, + { + "epoch": 6.942600546661461, + "grad_norm": 1.1132458448410034, + "learning_rate": 1.862647909339586e-05, + "loss": 0.3349, + "step": 88900 + }, + { + "epoch": 6.950409996095275, + "grad_norm": 1.0412805080413818, + "learning_rate": 1.8624915982805786e-05, + "loss": 0.3446, + "step": 89000 + }, + { + "epoch": 6.958219445529091, + "grad_norm": 0.7790032625198364, + "learning_rate": 1.862335287221571e-05, + "loss": 0.3393, + "step": 89100 + }, + { + "epoch": 6.966028894962905, + "grad_norm": 1.0535982847213745, + "learning_rate": 1.8621789761625638e-05, + "loss": 0.3344, + "step": 89200 + }, + { + "epoch": 6.97383834439672, + "grad_norm": 0.9829431176185608, + "learning_rate": 1.8620226651035564e-05, + "loss": 0.3285, + "step": 89300 + }, + { + "epoch": 6.981647793830535, + "grad_norm": 1.049845576286316, + "learning_rate": 1.8618663540445486e-05, + "loss": 0.3249, + "step": 89400 + }, + { + "epoch": 6.98945724326435, + "grad_norm": 1.01344633102417, + "learning_rate": 1.8617100429855416e-05, + "loss": 0.3284, + "step": 89500 + }, + { + "epoch": 6.997266692698165, + "grad_norm": 1.1877566576004028, + "learning_rate": 1.861553731926534e-05, + "loss": 0.3347, + "step": 89600 + }, + { + "epoch": 7.00507614213198, + "grad_norm": 0.8533827662467957, + "learning_rate": 1.8613974208675264e-05, + "loss": 0.3322, + "step": 89700 + }, + { + "epoch": 7.012885591565794, + "grad_norm": 1.1142549514770508, + "learning_rate": 1.861241109808519e-05, + "loss": 0.3229, + "step": 89800 + }, + { + "epoch": 7.02069504099961, + "grad_norm": 1.085334062576294, + "learning_rate": 1.8610847987495116e-05, + "loss": 0.3199, + "step": 89900 + }, + { + "epoch": 7.028504490433424, + "grad_norm": 0.9643715023994446, + "learning_rate": 1.8609300508010942e-05, + "loss": 0.3139, + "step": 90000 + }, + { + "epoch": 7.036313939867239, + "grad_norm": 1.189081072807312, + "learning_rate": 1.860773739742087e-05, + "loss": 0.3324, + "step": 90100 + }, + { + "epoch": 7.044123389301054, + "grad_norm": 1.0931801795959473, + "learning_rate": 1.8606174286830794e-05, + "loss": 0.3309, + "step": 90200 + }, + { + "epoch": 7.051932838734869, + "grad_norm": 0.9425420761108398, + "learning_rate": 1.860461117624072e-05, + "loss": 0.3358, + "step": 90300 + }, + { + "epoch": 7.059742288168684, + "grad_norm": 1.2118514776229858, + "learning_rate": 1.8603048065650646e-05, + "loss": 0.3362, + "step": 90400 + }, + { + "epoch": 7.067551737602499, + "grad_norm": 1.2366812229156494, + "learning_rate": 1.8601484955060572e-05, + "loss": 0.3395, + "step": 90500 + }, + { + "epoch": 7.075361187036314, + "grad_norm": 1.0428937673568726, + "learning_rate": 1.85999218444705e-05, + "loss": 0.3192, + "step": 90600 + }, + { + "epoch": 7.083170636470129, + "grad_norm": 1.0269099473953247, + "learning_rate": 1.8598358733880424e-05, + "loss": 0.3263, + "step": 90700 + }, + { + "epoch": 7.090980085903944, + "grad_norm": 1.0814200639724731, + "learning_rate": 1.859679562329035e-05, + "loss": 0.3409, + "step": 90800 + }, + { + "epoch": 7.098789535337759, + "grad_norm": 0.7768912315368652, + "learning_rate": 1.8595232512700273e-05, + "loss": 0.3195, + "step": 90900 + }, + { + "epoch": 7.106598984771574, + "grad_norm": 0.9416346549987793, + "learning_rate": 1.8593669402110203e-05, + "loss": 0.3266, + "step": 91000 + }, + { + "epoch": 7.114408434205388, + "grad_norm": 0.9913092255592346, + "learning_rate": 1.8592106291520125e-05, + "loss": 0.3319, + "step": 91100 + }, + { + "epoch": 7.122217883639204, + "grad_norm": 0.7331252694129944, + "learning_rate": 1.859054318093005e-05, + "loss": 0.3202, + "step": 91200 + }, + { + "epoch": 7.130027333073018, + "grad_norm": 1.1714645624160767, + "learning_rate": 1.8588980070339977e-05, + "loss": 0.3296, + "step": 91300 + }, + { + "epoch": 7.137836782506834, + "grad_norm": 1.0094726085662842, + "learning_rate": 1.8587416959749903e-05, + "loss": 0.3268, + "step": 91400 + }, + { + "epoch": 7.145646231940648, + "grad_norm": 0.8272337913513184, + "learning_rate": 1.858585384915983e-05, + "loss": 0.3258, + "step": 91500 + }, + { + "epoch": 7.153455681374463, + "grad_norm": 1.1478444337844849, + "learning_rate": 1.8584290738569755e-05, + "loss": 0.3277, + "step": 91600 + }, + { + "epoch": 7.161265130808278, + "grad_norm": 0.9589486718177795, + "learning_rate": 1.858272762797968e-05, + "loss": 0.3435, + "step": 91700 + }, + { + "epoch": 7.169074580242093, + "grad_norm": 1.0916614532470703, + "learning_rate": 1.8581164517389607e-05, + "loss": 0.3318, + "step": 91800 + }, + { + "epoch": 7.176884029675908, + "grad_norm": 1.128203272819519, + "learning_rate": 1.8579601406799533e-05, + "loss": 0.3209, + "step": 91900 + }, + { + "epoch": 7.184693479109723, + "grad_norm": 1.1626170873641968, + "learning_rate": 1.8578038296209456e-05, + "loss": 0.3309, + "step": 92000 + }, + { + "epoch": 7.192502928543537, + "grad_norm": 1.0426710844039917, + "learning_rate": 1.8576475185619385e-05, + "loss": 0.3235, + "step": 92100 + }, + { + "epoch": 7.200312377977353, + "grad_norm": 0.883540689945221, + "learning_rate": 1.857491207502931e-05, + "loss": 0.3275, + "step": 92200 + }, + { + "epoch": 7.208121827411167, + "grad_norm": 0.9239016771316528, + "learning_rate": 1.8573348964439234e-05, + "loss": 0.3173, + "step": 92300 + }, + { + "epoch": 7.215931276844983, + "grad_norm": 0.8947902917861938, + "learning_rate": 1.8571785853849163e-05, + "loss": 0.3289, + "step": 92400 + }, + { + "epoch": 7.223740726278797, + "grad_norm": 0.8909947276115417, + "learning_rate": 1.8570222743259086e-05, + "loss": 0.3144, + "step": 92500 + }, + { + "epoch": 7.231550175712612, + "grad_norm": 0.850845217704773, + "learning_rate": 1.8568659632669012e-05, + "loss": 0.3152, + "step": 92600 + }, + { + "epoch": 7.239359625146427, + "grad_norm": 0.9659631848335266, + "learning_rate": 1.8567096522078938e-05, + "loss": 0.3259, + "step": 92700 + }, + { + "epoch": 7.247169074580242, + "grad_norm": 1.1609432697296143, + "learning_rate": 1.8565533411488864e-05, + "loss": 0.3176, + "step": 92800 + }, + { + "epoch": 7.254978524014057, + "grad_norm": 0.9202740788459778, + "learning_rate": 1.856398593200469e-05, + "loss": 0.3212, + "step": 92900 + }, + { + "epoch": 7.262787973447872, + "grad_norm": 0.9573159217834473, + "learning_rate": 1.8562422821414616e-05, + "loss": 0.3226, + "step": 93000 + }, + { + "epoch": 7.2705974228816865, + "grad_norm": 1.138262152671814, + "learning_rate": 1.8560859710824542e-05, + "loss": 0.3298, + "step": 93100 + }, + { + "epoch": 7.278406872315502, + "grad_norm": 0.9519045352935791, + "learning_rate": 1.8559296600234468e-05, + "loss": 0.3148, + "step": 93200 + }, + { + "epoch": 7.2862163217493165, + "grad_norm": 0.788692057132721, + "learning_rate": 1.8557733489644394e-05, + "loss": 0.3212, + "step": 93300 + }, + { + "epoch": 7.294025771183132, + "grad_norm": 0.8305114507675171, + "learning_rate": 1.855617037905432e-05, + "loss": 0.3302, + "step": 93400 + }, + { + "epoch": 7.301835220616947, + "grad_norm": 1.0999449491500854, + "learning_rate": 1.8554607268464246e-05, + "loss": 0.3182, + "step": 93500 + }, + { + "epoch": 7.309644670050761, + "grad_norm": 0.9967383742332458, + "learning_rate": 1.8553044157874172e-05, + "loss": 0.3218, + "step": 93600 + }, + { + "epoch": 7.317454119484577, + "grad_norm": 1.140713095664978, + "learning_rate": 1.8551481047284098e-05, + "loss": 0.3228, + "step": 93700 + }, + { + "epoch": 7.325263568918391, + "grad_norm": 0.8884949684143066, + "learning_rate": 1.854991793669402e-05, + "loss": 0.3213, + "step": 93800 + }, + { + "epoch": 7.333073018352206, + "grad_norm": 0.9016627073287964, + "learning_rate": 1.854835482610395e-05, + "loss": 0.3157, + "step": 93900 + }, + { + "epoch": 7.340882467786021, + "grad_norm": 1.2928720712661743, + "learning_rate": 1.8546791715513876e-05, + "loss": 0.3176, + "step": 94000 + }, + { + "epoch": 7.348691917219836, + "grad_norm": 0.8581827282905579, + "learning_rate": 1.85452286049238e-05, + "loss": 0.3101, + "step": 94100 + }, + { + "epoch": 7.356501366653651, + "grad_norm": 1.1784394979476929, + "learning_rate": 1.8543665494333725e-05, + "loss": 0.3254, + "step": 94200 + }, + { + "epoch": 7.364310816087466, + "grad_norm": 0.8839449882507324, + "learning_rate": 1.854210238374365e-05, + "loss": 0.3264, + "step": 94300 + }, + { + "epoch": 7.37212026552128, + "grad_norm": 0.9890450835227966, + "learning_rate": 1.8540539273153577e-05, + "loss": 0.3142, + "step": 94400 + }, + { + "epoch": 7.379929714955096, + "grad_norm": 0.9410362839698792, + "learning_rate": 1.8538976162563503e-05, + "loss": 0.3207, + "step": 94500 + }, + { + "epoch": 7.38773916438891, + "grad_norm": 1.012739658355713, + "learning_rate": 1.853741305197343e-05, + "loss": 0.3253, + "step": 94600 + }, + { + "epoch": 7.395548613822726, + "grad_norm": 0.9576935172080994, + "learning_rate": 1.8535849941383355e-05, + "loss": 0.3324, + "step": 94700 + }, + { + "epoch": 7.40335806325654, + "grad_norm": 1.124345064163208, + "learning_rate": 1.853428683079328e-05, + "loss": 0.3242, + "step": 94800 + }, + { + "epoch": 7.411167512690355, + "grad_norm": 0.7819204330444336, + "learning_rate": 1.8532723720203207e-05, + "loss": 0.3031, + "step": 94900 + }, + { + "epoch": 7.41897696212417, + "grad_norm": 0.9620487689971924, + "learning_rate": 1.8531160609613133e-05, + "loss": 0.3085, + "step": 95000 + }, + { + "epoch": 7.426786411557985, + "grad_norm": 0.7977539300918579, + "learning_rate": 1.852959749902306e-05, + "loss": 0.3227, + "step": 95100 + }, + { + "epoch": 7.4345958609918, + "grad_norm": 0.9288976788520813, + "learning_rate": 1.852803438843298e-05, + "loss": 0.3251, + "step": 95200 + }, + { + "epoch": 7.442405310425615, + "grad_norm": 0.9492650628089905, + "learning_rate": 1.8526471277842907e-05, + "loss": 0.3198, + "step": 95300 + }, + { + "epoch": 7.4502147598594295, + "grad_norm": 1.0884491205215454, + "learning_rate": 1.8524908167252837e-05, + "loss": 0.3117, + "step": 95400 + }, + { + "epoch": 7.458024209293245, + "grad_norm": 0.930963397026062, + "learning_rate": 1.852334505666276e-05, + "loss": 0.3086, + "step": 95500 + }, + { + "epoch": 7.4658336587270595, + "grad_norm": 1.0428942441940308, + "learning_rate": 1.8521781946072685e-05, + "loss": 0.3087, + "step": 95600 + }, + { + "epoch": 7.473643108160875, + "grad_norm": 0.8642585277557373, + "learning_rate": 1.852021883548261e-05, + "loss": 0.3091, + "step": 95700 + }, + { + "epoch": 7.4814525575946895, + "grad_norm": 1.0365341901779175, + "learning_rate": 1.8518655724892537e-05, + "loss": 0.3322, + "step": 95800 + }, + { + "epoch": 7.489262007028504, + "grad_norm": 0.8407796025276184, + "learning_rate": 1.8517092614302463e-05, + "loss": 0.3247, + "step": 95900 + }, + { + "epoch": 7.4970714564623195, + "grad_norm": 0.8818190097808838, + "learning_rate": 1.851552950371239e-05, + "loss": 0.3111, + "step": 96000 + }, + { + "epoch": 7.504880905896134, + "grad_norm": 0.9547135233879089, + "learning_rate": 1.8513966393122315e-05, + "loss": 0.3062, + "step": 96100 + }, + { + "epoch": 7.5126903553299496, + "grad_norm": 0.9618895053863525, + "learning_rate": 1.851240328253224e-05, + "loss": 0.3244, + "step": 96200 + }, + { + "epoch": 7.520499804763764, + "grad_norm": 0.9813030958175659, + "learning_rate": 1.8510840171942167e-05, + "loss": 0.3359, + "step": 96300 + }, + { + "epoch": 7.528309254197579, + "grad_norm": 0.9328567385673523, + "learning_rate": 1.850927706135209e-05, + "loss": 0.313, + "step": 96400 + }, + { + "epoch": 7.536118703631394, + "grad_norm": 1.0434361696243286, + "learning_rate": 1.850771395076202e-05, + "loss": 0.3194, + "step": 96500 + }, + { + "epoch": 7.543928153065209, + "grad_norm": 0.9884262681007385, + "learning_rate": 1.8506150840171942e-05, + "loss": 0.3088, + "step": 96600 + }, + { + "epoch": 7.551737602499024, + "grad_norm": 0.937465250492096, + "learning_rate": 1.8504587729581868e-05, + "loss": 0.3063, + "step": 96700 + }, + { + "epoch": 7.559547051932839, + "grad_norm": 0.8354927897453308, + "learning_rate": 1.8503024618991798e-05, + "loss": 0.3261, + "step": 96800 + }, + { + "epoch": 7.567356501366653, + "grad_norm": 1.2076160907745361, + "learning_rate": 1.8501477139507623e-05, + "loss": 0.3162, + "step": 96900 + }, + { + "epoch": 7.575165950800469, + "grad_norm": 1.0042390823364258, + "learning_rate": 1.8499914028917546e-05, + "loss": 0.3118, + "step": 97000 + }, + { + "epoch": 7.582975400234283, + "grad_norm": 0.9606279134750366, + "learning_rate": 1.8498350918327472e-05, + "loss": 0.3282, + "step": 97100 + }, + { + "epoch": 7.590784849668099, + "grad_norm": 1.2073897123336792, + "learning_rate": 1.8496787807737398e-05, + "loss": 0.3096, + "step": 97200 + }, + { + "epoch": 7.598594299101913, + "grad_norm": 0.9388468861579895, + "learning_rate": 1.8495224697147324e-05, + "loss": 0.316, + "step": 97300 + }, + { + "epoch": 7.606403748535728, + "grad_norm": 1.0033023357391357, + "learning_rate": 1.849366158655725e-05, + "loss": 0.3065, + "step": 97400 + }, + { + "epoch": 7.614213197969543, + "grad_norm": 0.9127291440963745, + "learning_rate": 1.8492098475967176e-05, + "loss": 0.324, + "step": 97500 + }, + { + "epoch": 7.622022647403358, + "grad_norm": 0.8793255090713501, + "learning_rate": 1.8490535365377102e-05, + "loss": 0.331, + "step": 97600 + }, + { + "epoch": 7.629832096837173, + "grad_norm": 1.0385085344314575, + "learning_rate": 1.8488972254787028e-05, + "loss": 0.3232, + "step": 97700 + }, + { + "epoch": 7.637641546270988, + "grad_norm": 1.1897807121276855, + "learning_rate": 1.8487409144196954e-05, + "loss": 0.3064, + "step": 97800 + }, + { + "epoch": 7.6454509957048025, + "grad_norm": 1.2724990844726562, + "learning_rate": 1.848584603360688e-05, + "loss": 0.3102, + "step": 97900 + }, + { + "epoch": 7.653260445138618, + "grad_norm": 1.2787495851516724, + "learning_rate": 1.8484282923016806e-05, + "loss": 0.3117, + "step": 98000 + }, + { + "epoch": 7.6610698945724325, + "grad_norm": 1.0985214710235596, + "learning_rate": 1.848271981242673e-05, + "loss": 0.3148, + "step": 98100 + }, + { + "epoch": 7.668879344006248, + "grad_norm": 1.0185761451721191, + "learning_rate": 1.8481156701836655e-05, + "loss": 0.3212, + "step": 98200 + }, + { + "epoch": 7.6766887934400625, + "grad_norm": 1.177228569984436, + "learning_rate": 1.8479593591246584e-05, + "loss": 0.3168, + "step": 98300 + }, + { + "epoch": 7.684498242873877, + "grad_norm": 1.2005599737167358, + "learning_rate": 1.8478030480656507e-05, + "loss": 0.3032, + "step": 98400 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 1.4760136604309082, + "learning_rate": 1.8476467370066433e-05, + "loss": 0.3087, + "step": 98500 + }, + { + "epoch": 7.700117141741507, + "grad_norm": 0.9583701491355896, + "learning_rate": 1.8474904259476362e-05, + "loss": 0.3163, + "step": 98600 + }, + { + "epoch": 7.7079265911753225, + "grad_norm": 0.9487422704696655, + "learning_rate": 1.8473341148886285e-05, + "loss": 0.3173, + "step": 98700 + }, + { + "epoch": 7.715736040609137, + "grad_norm": 0.8585200905799866, + "learning_rate": 1.847177803829621e-05, + "loss": 0.3133, + "step": 98800 + }, + { + "epoch": 7.723545490042952, + "grad_norm": 1.1684235334396362, + "learning_rate": 1.8470214927706137e-05, + "loss": 0.2952, + "step": 98900 + }, + { + "epoch": 7.731354939476767, + "grad_norm": 1.0295666456222534, + "learning_rate": 1.8468651817116063e-05, + "loss": 0.3248, + "step": 99000 + }, + { + "epoch": 7.739164388910582, + "grad_norm": 0.7475218176841736, + "learning_rate": 1.846708870652599e-05, + "loss": 0.2987, + "step": 99100 + }, + { + "epoch": 7.746973838344397, + "grad_norm": 1.1041820049285889, + "learning_rate": 1.8465525595935915e-05, + "loss": 0.3112, + "step": 99200 + }, + { + "epoch": 7.754783287778212, + "grad_norm": 1.0141644477844238, + "learning_rate": 1.8463962485345838e-05, + "loss": 0.3154, + "step": 99300 + }, + { + "epoch": 7.762592737212026, + "grad_norm": 0.7786453366279602, + "learning_rate": 1.8462399374755767e-05, + "loss": 0.327, + "step": 99400 + }, + { + "epoch": 7.770402186645842, + "grad_norm": 0.8506142497062683, + "learning_rate": 1.8460836264165693e-05, + "loss": 0.3138, + "step": 99500 + }, + { + "epoch": 7.778211636079656, + "grad_norm": 1.0492260456085205, + "learning_rate": 1.8459273153575616e-05, + "loss": 0.3114, + "step": 99600 + }, + { + "epoch": 7.786021085513472, + "grad_norm": 0.9019497036933899, + "learning_rate": 1.8457710042985545e-05, + "loss": 0.3185, + "step": 99700 + }, + { + "epoch": 7.793830534947286, + "grad_norm": 0.855957567691803, + "learning_rate": 1.8456146932395468e-05, + "loss": 0.3273, + "step": 99800 + }, + { + "epoch": 7.801639984381101, + "grad_norm": 1.1977174282073975, + "learning_rate": 1.8454583821805394e-05, + "loss": 0.3095, + "step": 99900 + }, + { + "epoch": 7.809449433814916, + "grad_norm": 0.9468150734901428, + "learning_rate": 1.845302071121532e-05, + "loss": 0.3043, + "step": 100000 + }, + { + "epoch": 7.817258883248731, + "grad_norm": 0.957493007183075, + "learning_rate": 1.8451457600625246e-05, + "loss": 0.3072, + "step": 100100 + }, + { + "epoch": 7.825068332682546, + "grad_norm": 1.0043883323669434, + "learning_rate": 1.844989449003517e-05, + "loss": 0.3127, + "step": 100200 + }, + { + "epoch": 7.832877782116361, + "grad_norm": 0.893930971622467, + "learning_rate": 1.8448331379445098e-05, + "loss": 0.3014, + "step": 100300 + }, + { + "epoch": 7.840687231550175, + "grad_norm": 0.8573713302612305, + "learning_rate": 1.8446768268855024e-05, + "loss": 0.3015, + "step": 100400 + }, + { + "epoch": 7.848496680983991, + "grad_norm": 1.0177993774414062, + "learning_rate": 1.844520515826495e-05, + "loss": 0.302, + "step": 100500 + }, + { + "epoch": 7.8563061304178055, + "grad_norm": 1.0929206609725952, + "learning_rate": 1.8443642047674876e-05, + "loss": 0.3181, + "step": 100600 + }, + { + "epoch": 7.864115579851621, + "grad_norm": 1.2086570262908936, + "learning_rate": 1.84420789370848e-05, + "loss": 0.3131, + "step": 100700 + }, + { + "epoch": 7.8719250292854355, + "grad_norm": 0.721596896648407, + "learning_rate": 1.8440515826494728e-05, + "loss": 0.2946, + "step": 100800 + }, + { + "epoch": 7.87973447871925, + "grad_norm": 1.0397862195968628, + "learning_rate": 1.8438952715904654e-05, + "loss": 0.2931, + "step": 100900 + }, + { + "epoch": 7.8875439281530655, + "grad_norm": 1.013625144958496, + "learning_rate": 1.843740523642048e-05, + "loss": 0.3084, + "step": 101000 + }, + { + "epoch": 7.89535337758688, + "grad_norm": 0.9805079102516174, + "learning_rate": 1.8435842125830402e-05, + "loss": 0.2953, + "step": 101100 + }, + { + "epoch": 7.9031628270206955, + "grad_norm": 1.1424782276153564, + "learning_rate": 1.843427901524033e-05, + "loss": 0.3009, + "step": 101200 + }, + { + "epoch": 7.91097227645451, + "grad_norm": 1.0221184492111206, + "learning_rate": 1.8432715904650254e-05, + "loss": 0.3158, + "step": 101300 + }, + { + "epoch": 7.918781725888325, + "grad_norm": 0.9535076022148132, + "learning_rate": 1.843115279406018e-05, + "loss": 0.3021, + "step": 101400 + }, + { + "epoch": 7.92659117532214, + "grad_norm": 0.9413054585456848, + "learning_rate": 1.8429589683470106e-05, + "loss": 0.2997, + "step": 101500 + }, + { + "epoch": 7.934400624755955, + "grad_norm": 1.0579475164413452, + "learning_rate": 1.8428026572880032e-05, + "loss": 0.3204, + "step": 101600 + }, + { + "epoch": 7.94221007418977, + "grad_norm": 0.66800856590271, + "learning_rate": 1.842646346228996e-05, + "loss": 0.315, + "step": 101700 + }, + { + "epoch": 7.950019523623585, + "grad_norm": 0.8284004926681519, + "learning_rate": 1.8424900351699884e-05, + "loss": 0.2963, + "step": 101800 + }, + { + "epoch": 7.957828973057399, + "grad_norm": 1.1670104265213013, + "learning_rate": 1.842333724110981e-05, + "loss": 0.3119, + "step": 101900 + }, + { + "epoch": 7.965638422491215, + "grad_norm": 0.9361162185668945, + "learning_rate": 1.8421774130519736e-05, + "loss": 0.3082, + "step": 102000 + }, + { + "epoch": 7.973447871925029, + "grad_norm": 0.9291674494743347, + "learning_rate": 1.8420226651035562e-05, + "loss": 0.3086, + "step": 102100 + }, + { + "epoch": 7.981257321358845, + "grad_norm": 1.239319086074829, + "learning_rate": 1.841866354044549e-05, + "loss": 0.3112, + "step": 102200 + }, + { + "epoch": 7.989066770792659, + "grad_norm": 0.6871031522750854, + "learning_rate": 1.8417100429855414e-05, + "loss": 0.3217, + "step": 102300 + }, + { + "epoch": 7.996876220226474, + "grad_norm": 1.024958848953247, + "learning_rate": 1.841553731926534e-05, + "loss": 0.2894, + "step": 102400 + }, + { + "epoch": 8.004685669660288, + "grad_norm": 1.070359230041504, + "learning_rate": 1.8413974208675266e-05, + "loss": 0.3107, + "step": 102500 + }, + { + "epoch": 8.012495119094105, + "grad_norm": 1.0423635244369507, + "learning_rate": 1.841241109808519e-05, + "loss": 0.292, + "step": 102600 + }, + { + "epoch": 8.02030456852792, + "grad_norm": 0.7128071188926697, + "learning_rate": 1.841084798749512e-05, + "loss": 0.3138, + "step": 102700 + }, + { + "epoch": 8.028114017961734, + "grad_norm": 1.5792732238769531, + "learning_rate": 1.840928487690504e-05, + "loss": 0.2974, + "step": 102800 + }, + { + "epoch": 8.035923467395548, + "grad_norm": 0.9122816324234009, + "learning_rate": 1.8407721766314967e-05, + "loss": 0.3096, + "step": 102900 + }, + { + "epoch": 8.043732916829363, + "grad_norm": 1.035617709159851, + "learning_rate": 1.8406158655724896e-05, + "loss": 0.3126, + "step": 103000 + }, + { + "epoch": 8.05154236626318, + "grad_norm": 0.964963436126709, + "learning_rate": 1.840459554513482e-05, + "loss": 0.3008, + "step": 103100 + }, + { + "epoch": 8.059351815696994, + "grad_norm": 1.0072320699691772, + "learning_rate": 1.8403032434544745e-05, + "loss": 0.3006, + "step": 103200 + }, + { + "epoch": 8.067161265130808, + "grad_norm": 1.2636690139770508, + "learning_rate": 1.840146932395467e-05, + "loss": 0.3081, + "step": 103300 + }, + { + "epoch": 8.074970714564623, + "grad_norm": 1.1620123386383057, + "learning_rate": 1.8399906213364597e-05, + "loss": 0.2962, + "step": 103400 + }, + { + "epoch": 8.082780163998438, + "grad_norm": 0.9352577328681946, + "learning_rate": 1.8398343102774523e-05, + "loss": 0.3156, + "step": 103500 + }, + { + "epoch": 8.090589613432254, + "grad_norm": 1.1235134601593018, + "learning_rate": 1.839677999218445e-05, + "loss": 0.3037, + "step": 103600 + }, + { + "epoch": 8.098399062866068, + "grad_norm": 1.2459105253219604, + "learning_rate": 1.8395216881594372e-05, + "loss": 0.2965, + "step": 103700 + }, + { + "epoch": 8.106208512299883, + "grad_norm": 0.9999929666519165, + "learning_rate": 1.83936537710043e-05, + "loss": 0.3131, + "step": 103800 + }, + { + "epoch": 8.114017961733698, + "grad_norm": 1.092516541481018, + "learning_rate": 1.8392090660414227e-05, + "loss": 0.3015, + "step": 103900 + }, + { + "epoch": 8.121827411167512, + "grad_norm": 0.8040266036987305, + "learning_rate": 1.839052754982415e-05, + "loss": 0.3021, + "step": 104000 + }, + { + "epoch": 8.129636860601327, + "grad_norm": 1.031531810760498, + "learning_rate": 1.838896443923408e-05, + "loss": 0.2925, + "step": 104100 + }, + { + "epoch": 8.137446310035143, + "grad_norm": 1.1790035963058472, + "learning_rate": 1.8387401328644002e-05, + "loss": 0.2968, + "step": 104200 + }, + { + "epoch": 8.145255759468958, + "grad_norm": 1.1958867311477661, + "learning_rate": 1.8385838218053928e-05, + "loss": 0.2985, + "step": 104300 + }, + { + "epoch": 8.153065208902772, + "grad_norm": 0.9311303496360779, + "learning_rate": 1.8384275107463854e-05, + "loss": 0.3128, + "step": 104400 + }, + { + "epoch": 8.160874658336587, + "grad_norm": 1.0499194860458374, + "learning_rate": 1.838271199687378e-05, + "loss": 0.2911, + "step": 104500 + }, + { + "epoch": 8.168684107770401, + "grad_norm": 0.8166439533233643, + "learning_rate": 1.8381148886283706e-05, + "loss": 0.3072, + "step": 104600 + }, + { + "epoch": 8.176493557204218, + "grad_norm": 0.937285840511322, + "learning_rate": 1.8379585775693632e-05, + "loss": 0.3066, + "step": 104700 + }, + { + "epoch": 8.184303006638032, + "grad_norm": 0.9809720516204834, + "learning_rate": 1.8378022665103558e-05, + "loss": 0.2994, + "step": 104800 + }, + { + "epoch": 8.192112456071847, + "grad_norm": 0.9872479438781738, + "learning_rate": 1.8376459554513484e-05, + "loss": 0.2972, + "step": 104900 + }, + { + "epoch": 8.199921905505661, + "grad_norm": 0.8918797373771667, + "learning_rate": 1.837489644392341e-05, + "loss": 0.3038, + "step": 105000 + }, + { + "epoch": 8.207731354939476, + "grad_norm": 1.048528790473938, + "learning_rate": 1.8373333333333332e-05, + "loss": 0.289, + "step": 105100 + }, + { + "epoch": 8.215540804373292, + "grad_norm": 1.0492016077041626, + "learning_rate": 1.8371770222743262e-05, + "loss": 0.3092, + "step": 105200 + }, + { + "epoch": 8.223350253807107, + "grad_norm": 1.2988113164901733, + "learning_rate": 1.8370207112153188e-05, + "loss": 0.2949, + "step": 105300 + }, + { + "epoch": 8.231159703240921, + "grad_norm": 1.2381683588027954, + "learning_rate": 1.836864400156311e-05, + "loss": 0.3013, + "step": 105400 + }, + { + "epoch": 8.238969152674736, + "grad_norm": 0.741263210773468, + "learning_rate": 1.8367080890973037e-05, + "loss": 0.3026, + "step": 105500 + }, + { + "epoch": 8.24677860210855, + "grad_norm": 1.1652076244354248, + "learning_rate": 1.8365517780382966e-05, + "loss": 0.302, + "step": 105600 + }, + { + "epoch": 8.254588051542367, + "grad_norm": 1.1444916725158691, + "learning_rate": 1.836395466979289e-05, + "loss": 0.2928, + "step": 105700 + }, + { + "epoch": 8.262397500976181, + "grad_norm": 1.168922781944275, + "learning_rate": 1.8362391559202815e-05, + "loss": 0.2959, + "step": 105800 + }, + { + "epoch": 8.270206950409996, + "grad_norm": 0.9830915331840515, + "learning_rate": 1.836082844861274e-05, + "loss": 0.2915, + "step": 105900 + }, + { + "epoch": 8.27801639984381, + "grad_norm": 0.680469810962677, + "learning_rate": 1.8359265338022667e-05, + "loss": 0.2909, + "step": 106000 + }, + { + "epoch": 8.285825849277625, + "grad_norm": 0.8958842754364014, + "learning_rate": 1.8357717858538493e-05, + "loss": 0.3001, + "step": 106100 + }, + { + "epoch": 8.293635298711441, + "grad_norm": 1.0888712406158447, + "learning_rate": 1.835615474794842e-05, + "loss": 0.2953, + "step": 106200 + }, + { + "epoch": 8.301444748145256, + "grad_norm": 0.760229229927063, + "learning_rate": 1.8354591637358345e-05, + "loss": 0.3228, + "step": 106300 + }, + { + "epoch": 8.30925419757907, + "grad_norm": 1.0346572399139404, + "learning_rate": 1.835302852676827e-05, + "loss": 0.2856, + "step": 106400 + }, + { + "epoch": 8.317063647012885, + "grad_norm": 1.0129973888397217, + "learning_rate": 1.8351465416178197e-05, + "loss": 0.2907, + "step": 106500 + }, + { + "epoch": 8.3248730964467, + "grad_norm": 1.3134441375732422, + "learning_rate": 1.8349902305588123e-05, + "loss": 0.2839, + "step": 106600 + }, + { + "epoch": 8.332682545880516, + "grad_norm": 0.9875175356864929, + "learning_rate": 1.834833919499805e-05, + "loss": 0.2926, + "step": 106700 + }, + { + "epoch": 8.34049199531433, + "grad_norm": 0.8269121646881104, + "learning_rate": 1.8346776084407975e-05, + "loss": 0.2854, + "step": 106800 + }, + { + "epoch": 8.348301444748145, + "grad_norm": 0.9428029656410217, + "learning_rate": 1.8345212973817897e-05, + "loss": 0.29, + "step": 106900 + }, + { + "epoch": 8.35611089418196, + "grad_norm": 1.0320848226547241, + "learning_rate": 1.8343649863227827e-05, + "loss": 0.2924, + "step": 107000 + }, + { + "epoch": 8.363920343615774, + "grad_norm": 0.9099970459938049, + "learning_rate": 1.8342086752637753e-05, + "loss": 0.2848, + "step": 107100 + }, + { + "epoch": 8.37172979304959, + "grad_norm": 0.7163190245628357, + "learning_rate": 1.8340523642047675e-05, + "loss": 0.285, + "step": 107200 + }, + { + "epoch": 8.379539242483405, + "grad_norm": 0.9897966384887695, + "learning_rate": 1.83389605314576e-05, + "loss": 0.3034, + "step": 107300 + }, + { + "epoch": 8.38734869191722, + "grad_norm": 1.2372212409973145, + "learning_rate": 1.8337397420867527e-05, + "loss": 0.2962, + "step": 107400 + }, + { + "epoch": 8.395158141351034, + "grad_norm": 0.8196184635162354, + "learning_rate": 1.8335834310277453e-05, + "loss": 0.292, + "step": 107500 + }, + { + "epoch": 8.402967590784849, + "grad_norm": 1.124740481376648, + "learning_rate": 1.833427119968738e-05, + "loss": 0.2962, + "step": 107600 + }, + { + "epoch": 8.410777040218665, + "grad_norm": 0.8833318948745728, + "learning_rate": 1.8332708089097305e-05, + "loss": 0.2948, + "step": 107700 + }, + { + "epoch": 8.41858648965248, + "grad_norm": 1.2437323331832886, + "learning_rate": 1.833114497850723e-05, + "loss": 0.2957, + "step": 107800 + }, + { + "epoch": 8.426395939086294, + "grad_norm": 1.064677119255066, + "learning_rate": 1.8329581867917157e-05, + "loss": 0.2919, + "step": 107900 + }, + { + "epoch": 8.434205388520109, + "grad_norm": 1.100700855255127, + "learning_rate": 1.8328018757327083e-05, + "loss": 0.3, + "step": 108000 + }, + { + "epoch": 8.442014837953923, + "grad_norm": 1.1101109981536865, + "learning_rate": 1.832647127784291e-05, + "loss": 0.2884, + "step": 108100 + }, + { + "epoch": 8.44982428738774, + "grad_norm": 0.9055554866790771, + "learning_rate": 1.8324908167252835e-05, + "loss": 0.3045, + "step": 108200 + }, + { + "epoch": 8.457633736821554, + "grad_norm": 0.8915761709213257, + "learning_rate": 1.832334505666276e-05, + "loss": 0.2876, + "step": 108300 + }, + { + "epoch": 8.465443186255369, + "grad_norm": 0.9400941133499146, + "learning_rate": 1.8321781946072684e-05, + "loss": 0.3045, + "step": 108400 + }, + { + "epoch": 8.473252635689184, + "grad_norm": 0.7043224573135376, + "learning_rate": 1.8320218835482613e-05, + "loss": 0.2925, + "step": 108500 + }, + { + "epoch": 8.481062085122998, + "grad_norm": 0.9574306011199951, + "learning_rate": 1.831865572489254e-05, + "loss": 0.2878, + "step": 108600 + }, + { + "epoch": 8.488871534556814, + "grad_norm": 1.167900800704956, + "learning_rate": 1.8317092614302462e-05, + "loss": 0.3014, + "step": 108700 + }, + { + "epoch": 8.496680983990629, + "grad_norm": 1.2867276668548584, + "learning_rate": 1.8315529503712388e-05, + "loss": 0.2855, + "step": 108800 + }, + { + "epoch": 8.504490433424444, + "grad_norm": 1.1072494983673096, + "learning_rate": 1.8313966393122314e-05, + "loss": 0.3038, + "step": 108900 + }, + { + "epoch": 8.512299882858258, + "grad_norm": 0.8459449410438538, + "learning_rate": 1.831240328253224e-05, + "loss": 0.2881, + "step": 109000 + }, + { + "epoch": 8.520109332292073, + "grad_norm": 0.8919705152511597, + "learning_rate": 1.8310840171942166e-05, + "loss": 0.2915, + "step": 109100 + }, + { + "epoch": 8.527918781725889, + "grad_norm": 0.7803249359130859, + "learning_rate": 1.8309277061352092e-05, + "loss": 0.2912, + "step": 109200 + }, + { + "epoch": 8.535728231159704, + "grad_norm": 0.7071219682693481, + "learning_rate": 1.8307713950762018e-05, + "loss": 0.2875, + "step": 109300 + }, + { + "epoch": 8.543537680593518, + "grad_norm": 0.9213855266571045, + "learning_rate": 1.8306150840171944e-05, + "loss": 0.2899, + "step": 109400 + }, + { + "epoch": 8.551347130027333, + "grad_norm": 1.0583925247192383, + "learning_rate": 1.830458772958187e-05, + "loss": 0.2948, + "step": 109500 + }, + { + "epoch": 8.559156579461147, + "grad_norm": 1.0667381286621094, + "learning_rate": 1.8303024618991796e-05, + "loss": 0.2927, + "step": 109600 + }, + { + "epoch": 8.566966028894964, + "grad_norm": 1.162601351737976, + "learning_rate": 1.8301461508401722e-05, + "loss": 0.2878, + "step": 109700 + }, + { + "epoch": 8.574775478328778, + "grad_norm": 0.99709153175354, + "learning_rate": 1.8299898397811645e-05, + "loss": 0.3006, + "step": 109800 + }, + { + "epoch": 8.582584927762593, + "grad_norm": 0.9036930203437805, + "learning_rate": 1.829833528722157e-05, + "loss": 0.2799, + "step": 109900 + }, + { + "epoch": 8.590394377196407, + "grad_norm": 0.9895430207252502, + "learning_rate": 1.82967721766315e-05, + "loss": 0.2827, + "step": 110000 + }, + { + "epoch": 8.598203826630222, + "grad_norm": 0.8556926846504211, + "learning_rate": 1.8295224697147326e-05, + "loss": 0.2903, + "step": 110100 + }, + { + "epoch": 8.606013276064038, + "grad_norm": 0.8513884544372559, + "learning_rate": 1.829366158655725e-05, + "loss": 0.2738, + "step": 110200 + }, + { + "epoch": 8.613822725497853, + "grad_norm": 0.8311988711357117, + "learning_rate": 1.8292098475967178e-05, + "loss": 0.3023, + "step": 110300 + }, + { + "epoch": 8.621632174931667, + "grad_norm": 0.8456495404243469, + "learning_rate": 1.82905353653771e-05, + "loss": 0.2901, + "step": 110400 + }, + { + "epoch": 8.629441624365482, + "grad_norm": 1.003395676612854, + "learning_rate": 1.8288972254787027e-05, + "loss": 0.293, + "step": 110500 + }, + { + "epoch": 8.637251073799296, + "grad_norm": 0.8753073811531067, + "learning_rate": 1.8287409144196953e-05, + "loss": 0.2865, + "step": 110600 + }, + { + "epoch": 8.645060523233113, + "grad_norm": 0.9070360660552979, + "learning_rate": 1.828584603360688e-05, + "loss": 0.2995, + "step": 110700 + }, + { + "epoch": 8.652869972666927, + "grad_norm": 0.7955684065818787, + "learning_rate": 1.8284282923016805e-05, + "loss": 0.2987, + "step": 110800 + }, + { + "epoch": 8.660679422100742, + "grad_norm": 0.9900552034378052, + "learning_rate": 1.828271981242673e-05, + "loss": 0.2906, + "step": 110900 + }, + { + "epoch": 8.668488871534556, + "grad_norm": 0.9677467346191406, + "learning_rate": 1.8281156701836657e-05, + "loss": 0.2975, + "step": 111000 + }, + { + "epoch": 8.676298320968371, + "grad_norm": 0.9279794096946716, + "learning_rate": 1.8279593591246583e-05, + "loss": 0.2958, + "step": 111100 + }, + { + "epoch": 8.684107770402187, + "grad_norm": 0.8239137530326843, + "learning_rate": 1.827803048065651e-05, + "loss": 0.2784, + "step": 111200 + }, + { + "epoch": 8.691917219836002, + "grad_norm": 0.8737657070159912, + "learning_rate": 1.827646737006643e-05, + "loss": 0.2907, + "step": 111300 + }, + { + "epoch": 8.699726669269817, + "grad_norm": 0.9607727527618408, + "learning_rate": 1.827490425947636e-05, + "loss": 0.2861, + "step": 111400 + }, + { + "epoch": 8.707536118703631, + "grad_norm": 0.8190209865570068, + "learning_rate": 1.8273341148886287e-05, + "loss": 0.2808, + "step": 111500 + }, + { + "epoch": 8.715345568137446, + "grad_norm": 1.3692559003829956, + "learning_rate": 1.827177803829621e-05, + "loss": 0.2828, + "step": 111600 + }, + { + "epoch": 8.723155017571262, + "grad_norm": 1.252812385559082, + "learning_rate": 1.8270214927706135e-05, + "loss": 0.2787, + "step": 111700 + }, + { + "epoch": 8.730964467005077, + "grad_norm": 0.7284253835678101, + "learning_rate": 1.8268651817116065e-05, + "loss": 0.2856, + "step": 111800 + }, + { + "epoch": 8.738773916438891, + "grad_norm": 0.8836492300033569, + "learning_rate": 1.8267088706525987e-05, + "loss": 0.2945, + "step": 111900 + }, + { + "epoch": 8.746583365872706, + "grad_norm": 0.9553095102310181, + "learning_rate": 1.8265525595935913e-05, + "loss": 0.2696, + "step": 112000 + }, + { + "epoch": 8.75439281530652, + "grad_norm": 1.0329238176345825, + "learning_rate": 1.826397811645174e-05, + "loss": 0.295, + "step": 112100 + }, + { + "epoch": 8.762202264740337, + "grad_norm": 1.124981164932251, + "learning_rate": 1.8262415005861665e-05, + "loss": 0.2748, + "step": 112200 + }, + { + "epoch": 8.770011714174151, + "grad_norm": 0.8739539980888367, + "learning_rate": 1.826085189527159e-05, + "loss": 0.2861, + "step": 112300 + }, + { + "epoch": 8.777821163607966, + "grad_norm": 1.0567717552185059, + "learning_rate": 1.8259288784681517e-05, + "loss": 0.2877, + "step": 112400 + }, + { + "epoch": 8.78563061304178, + "grad_norm": 1.0307221412658691, + "learning_rate": 1.8257725674091443e-05, + "loss": 0.2805, + "step": 112500 + }, + { + "epoch": 8.793440062475595, + "grad_norm": 1.2705262899398804, + "learning_rate": 1.825616256350137e-05, + "loss": 0.2724, + "step": 112600 + }, + { + "epoch": 8.801249511909411, + "grad_norm": 1.0293917655944824, + "learning_rate": 1.8254599452911295e-05, + "loss": 0.284, + "step": 112700 + }, + { + "epoch": 8.809058961343226, + "grad_norm": 0.9187902808189392, + "learning_rate": 1.825303634232122e-05, + "loss": 0.2893, + "step": 112800 + }, + { + "epoch": 8.81686841077704, + "grad_norm": 0.8527992963790894, + "learning_rate": 1.8251473231731147e-05, + "loss": 0.2885, + "step": 112900 + }, + { + "epoch": 8.824677860210855, + "grad_norm": 0.8219836950302124, + "learning_rate": 1.8249910121141073e-05, + "loss": 0.2937, + "step": 113000 + }, + { + "epoch": 8.83248730964467, + "grad_norm": 0.9678224325180054, + "learning_rate": 1.8248347010550996e-05, + "loss": 0.2807, + "step": 113100 + }, + { + "epoch": 8.840296759078486, + "grad_norm": 0.6939859390258789, + "learning_rate": 1.8246783899960926e-05, + "loss": 0.2776, + "step": 113200 + }, + { + "epoch": 8.8481062085123, + "grad_norm": 1.0932164192199707, + "learning_rate": 1.824522078937085e-05, + "loss": 0.2823, + "step": 113300 + }, + { + "epoch": 8.855915657946115, + "grad_norm": 1.029359221458435, + "learning_rate": 1.8243657678780774e-05, + "loss": 0.2863, + "step": 113400 + }, + { + "epoch": 8.86372510737993, + "grad_norm": 1.0178298950195312, + "learning_rate": 1.82420945681907e-05, + "loss": 0.2841, + "step": 113500 + }, + { + "epoch": 8.871534556813744, + "grad_norm": 0.8735611438751221, + "learning_rate": 1.8240531457600626e-05, + "loss": 0.2868, + "step": 113600 + }, + { + "epoch": 8.87934400624756, + "grad_norm": 1.1830412149429321, + "learning_rate": 1.8238968347010552e-05, + "loss": 0.2819, + "step": 113700 + }, + { + "epoch": 8.887153455681375, + "grad_norm": 0.7258915305137634, + "learning_rate": 1.8237405236420478e-05, + "loss": 0.2976, + "step": 113800 + }, + { + "epoch": 8.89496290511519, + "grad_norm": 0.8901984095573425, + "learning_rate": 1.8235842125830404e-05, + "loss": 0.2907, + "step": 113900 + }, + { + "epoch": 8.902772354549004, + "grad_norm": 1.0432909727096558, + "learning_rate": 1.823427901524033e-05, + "loss": 0.2937, + "step": 114000 + }, + { + "epoch": 8.910581803982819, + "grad_norm": 0.7205260992050171, + "learning_rate": 1.8232731535756156e-05, + "loss": 0.283, + "step": 114100 + }, + { + "epoch": 8.918391253416633, + "grad_norm": 1.2185769081115723, + "learning_rate": 1.8231168425166082e-05, + "loss": 0.2751, + "step": 114200 + }, + { + "epoch": 8.92620070285045, + "grad_norm": 0.9317169189453125, + "learning_rate": 1.8229605314576008e-05, + "loss": 0.2906, + "step": 114300 + }, + { + "epoch": 8.934010152284264, + "grad_norm": 0.8162646293640137, + "learning_rate": 1.8228042203985934e-05, + "loss": 0.2802, + "step": 114400 + }, + { + "epoch": 8.941819601718079, + "grad_norm": 0.9121643900871277, + "learning_rate": 1.822647909339586e-05, + "loss": 0.2794, + "step": 114500 + }, + { + "epoch": 8.949629051151893, + "grad_norm": 1.0515434741973877, + "learning_rate": 1.8224915982805783e-05, + "loss": 0.2851, + "step": 114600 + }, + { + "epoch": 8.95743850058571, + "grad_norm": 0.882593035697937, + "learning_rate": 1.8223352872215712e-05, + "loss": 0.284, + "step": 114700 + }, + { + "epoch": 8.965247950019524, + "grad_norm": 0.7872332334518433, + "learning_rate": 1.8221789761625638e-05, + "loss": 0.2809, + "step": 114800 + }, + { + "epoch": 8.973057399453339, + "grad_norm": 0.9008090496063232, + "learning_rate": 1.8220242282141464e-05, + "loss": 0.2774, + "step": 114900 + }, + { + "epoch": 8.980866848887153, + "grad_norm": 1.128659963607788, + "learning_rate": 1.8218679171551387e-05, + "loss": 0.2834, + "step": 115000 + }, + { + "epoch": 8.988676298320968, + "grad_norm": 1.0850447416305542, + "learning_rate": 1.8217116060961316e-05, + "loss": 0.3057, + "step": 115100 + }, + { + "epoch": 8.996485747754782, + "grad_norm": 0.9458226561546326, + "learning_rate": 1.821555295037124e-05, + "loss": 0.2831, + "step": 115200 + }, + { + "epoch": 9.004295197188599, + "grad_norm": 0.9536977410316467, + "learning_rate": 1.8213989839781165e-05, + "loss": 0.2831, + "step": 115300 + }, + { + "epoch": 9.012104646622413, + "grad_norm": 0.9741133451461792, + "learning_rate": 1.8212426729191094e-05, + "loss": 0.2802, + "step": 115400 + }, + { + "epoch": 9.019914096056228, + "grad_norm": 0.9699240326881409, + "learning_rate": 1.8210863618601017e-05, + "loss": 0.2891, + "step": 115500 + }, + { + "epoch": 9.027723545490042, + "grad_norm": 1.16199791431427, + "learning_rate": 1.8209300508010943e-05, + "loss": 0.281, + "step": 115600 + }, + { + "epoch": 9.035532994923859, + "grad_norm": 1.1076624393463135, + "learning_rate": 1.820773739742087e-05, + "loss": 0.29, + "step": 115700 + }, + { + "epoch": 9.043342444357673, + "grad_norm": 1.11338210105896, + "learning_rate": 1.8206174286830795e-05, + "loss": 0.275, + "step": 115800 + }, + { + "epoch": 9.051151893791488, + "grad_norm": 0.8460277318954468, + "learning_rate": 1.820461117624072e-05, + "loss": 0.2915, + "step": 115900 + }, + { + "epoch": 9.058961343225302, + "grad_norm": 1.0888310670852661, + "learning_rate": 1.8203048065650647e-05, + "loss": 0.2719, + "step": 116000 + }, + { + "epoch": 9.066770792659117, + "grad_norm": 0.9920503497123718, + "learning_rate": 1.820148495506057e-05, + "loss": 0.2754, + "step": 116100 + }, + { + "epoch": 9.074580242092932, + "grad_norm": 1.0043495893478394, + "learning_rate": 1.81999218444705e-05, + "loss": 0.2768, + "step": 116200 + }, + { + "epoch": 9.082389691526748, + "grad_norm": 0.9468898177146912, + "learning_rate": 1.8198358733880425e-05, + "loss": 0.2795, + "step": 116300 + }, + { + "epoch": 9.090199140960562, + "grad_norm": 1.0771790742874146, + "learning_rate": 1.8196795623290348e-05, + "loss": 0.2752, + "step": 116400 + }, + { + "epoch": 9.098008590394377, + "grad_norm": 0.934396505355835, + "learning_rate": 1.8195232512700277e-05, + "loss": 0.2834, + "step": 116500 + }, + { + "epoch": 9.105818039828192, + "grad_norm": 1.111622929573059, + "learning_rate": 1.81936694021102e-05, + "loss": 0.2709, + "step": 116600 + }, + { + "epoch": 9.113627489262006, + "grad_norm": 0.8829967379570007, + "learning_rate": 1.8192106291520126e-05, + "loss": 0.2861, + "step": 116700 + }, + { + "epoch": 9.121436938695823, + "grad_norm": 1.1684046983718872, + "learning_rate": 1.819054318093005e-05, + "loss": 0.2725, + "step": 116800 + }, + { + "epoch": 9.129246388129637, + "grad_norm": 1.0102686882019043, + "learning_rate": 1.8188980070339978e-05, + "loss": 0.2787, + "step": 116900 + }, + { + "epoch": 9.137055837563452, + "grad_norm": 1.0709645748138428, + "learning_rate": 1.8187416959749904e-05, + "loss": 0.2674, + "step": 117000 + }, + { + "epoch": 9.144865286997266, + "grad_norm": 0.8841697573661804, + "learning_rate": 1.818585384915983e-05, + "loss": 0.2832, + "step": 117100 + }, + { + "epoch": 9.15267473643108, + "grad_norm": 0.8411263823509216, + "learning_rate": 1.8184290738569756e-05, + "loss": 0.2818, + "step": 117200 + }, + { + "epoch": 9.160484185864897, + "grad_norm": 0.6763750314712524, + "learning_rate": 1.818272762797968e-05, + "loss": 0.2828, + "step": 117300 + }, + { + "epoch": 9.168293635298712, + "grad_norm": 0.8106026649475098, + "learning_rate": 1.8181164517389608e-05, + "loss": 0.2889, + "step": 117400 + }, + { + "epoch": 9.176103084732526, + "grad_norm": 0.906958818435669, + "learning_rate": 1.817960140679953e-05, + "loss": 0.2772, + "step": 117500 + }, + { + "epoch": 9.18391253416634, + "grad_norm": 1.0904810428619385, + "learning_rate": 1.817803829620946e-05, + "loss": 0.2767, + "step": 117600 + }, + { + "epoch": 9.191721983600155, + "grad_norm": 0.9980121850967407, + "learning_rate": 1.8176475185619386e-05, + "loss": 0.2752, + "step": 117700 + }, + { + "epoch": 9.199531433033972, + "grad_norm": 0.8572126626968384, + "learning_rate": 1.8174912075029308e-05, + "loss": 0.2696, + "step": 117800 + }, + { + "epoch": 9.207340882467786, + "grad_norm": 1.0560619831085205, + "learning_rate": 1.8173348964439234e-05, + "loss": 0.2627, + "step": 117900 + }, + { + "epoch": 9.2151503319016, + "grad_norm": 1.0500401258468628, + "learning_rate": 1.8171785853849164e-05, + "loss": 0.2803, + "step": 118000 + }, + { + "epoch": 9.222959781335415, + "grad_norm": 0.8881191611289978, + "learning_rate": 1.8170222743259086e-05, + "loss": 0.2831, + "step": 118100 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 1.4876117706298828, + "learning_rate": 1.8168659632669012e-05, + "loss": 0.2717, + "step": 118200 + }, + { + "epoch": 9.238578680203046, + "grad_norm": 0.8553842902183533, + "learning_rate": 1.816709652207894e-05, + "loss": 0.2829, + "step": 118300 + }, + { + "epoch": 9.24638812963686, + "grad_norm": 1.1154588460922241, + "learning_rate": 1.8165533411488864e-05, + "loss": 0.2709, + "step": 118400 + }, + { + "epoch": 9.254197579070675, + "grad_norm": 1.1607916355133057, + "learning_rate": 1.816397030089879e-05, + "loss": 0.2714, + "step": 118500 + }, + { + "epoch": 9.26200702850449, + "grad_norm": 0.883974552154541, + "learning_rate": 1.8162407190308716e-05, + "loss": 0.2758, + "step": 118600 + }, + { + "epoch": 9.269816477938305, + "grad_norm": 1.1841872930526733, + "learning_rate": 1.8160844079718642e-05, + "loss": 0.2844, + "step": 118700 + }, + { + "epoch": 9.27762592737212, + "grad_norm": 1.1815747022628784, + "learning_rate": 1.815928096912857e-05, + "loss": 0.2651, + "step": 118800 + }, + { + "epoch": 9.285435376805935, + "grad_norm": 0.8993953466415405, + "learning_rate": 1.8157733489644394e-05, + "loss": 0.2802, + "step": 118900 + }, + { + "epoch": 9.29324482623975, + "grad_norm": 0.8702039122581482, + "learning_rate": 1.815617037905432e-05, + "loss": 0.2751, + "step": 119000 + }, + { + "epoch": 9.301054275673565, + "grad_norm": 0.8321089148521423, + "learning_rate": 1.8154607268464246e-05, + "loss": 0.2758, + "step": 119100 + }, + { + "epoch": 9.30886372510738, + "grad_norm": 0.7785543203353882, + "learning_rate": 1.8153044157874172e-05, + "loss": 0.2671, + "step": 119200 + }, + { + "epoch": 9.316673174541195, + "grad_norm": 0.8350294828414917, + "learning_rate": 1.8151481047284095e-05, + "loss": 0.2699, + "step": 119300 + }, + { + "epoch": 9.32448262397501, + "grad_norm": 0.933214008808136, + "learning_rate": 1.8149917936694024e-05, + "loss": 0.2731, + "step": 119400 + }, + { + "epoch": 9.332292073408825, + "grad_norm": 0.780940592288971, + "learning_rate": 1.814835482610395e-05, + "loss": 0.2872, + "step": 119500 + }, + { + "epoch": 9.34010152284264, + "grad_norm": 0.8094989061355591, + "learning_rate": 1.8146791715513873e-05, + "loss": 0.2684, + "step": 119600 + }, + { + "epoch": 9.347910972276454, + "grad_norm": 0.949965238571167, + "learning_rate": 1.81452286049238e-05, + "loss": 0.2742, + "step": 119700 + }, + { + "epoch": 9.35572042171027, + "grad_norm": 1.011487364768982, + "learning_rate": 1.8143665494333725e-05, + "loss": 0.2753, + "step": 119800 + }, + { + "epoch": 9.363529871144085, + "grad_norm": 1.0791568756103516, + "learning_rate": 1.814210238374365e-05, + "loss": 0.2705, + "step": 119900 + }, + { + "epoch": 9.3713393205779, + "grad_norm": 0.9083512425422668, + "learning_rate": 1.8140539273153577e-05, + "loss": 0.2659, + "step": 120000 + }, + { + "epoch": 9.379148770011714, + "grad_norm": 1.0209741592407227, + "learning_rate": 1.8138976162563503e-05, + "loss": 0.2728, + "step": 120100 + }, + { + "epoch": 9.386958219445528, + "grad_norm": 1.060758113861084, + "learning_rate": 1.813741305197343e-05, + "loss": 0.2747, + "step": 120200 + }, + { + "epoch": 9.394767668879345, + "grad_norm": 1.1624435186386108, + "learning_rate": 1.8135849941383355e-05, + "loss": 0.282, + "step": 120300 + }, + { + "epoch": 9.40257711831316, + "grad_norm": 1.0269403457641602, + "learning_rate": 1.813428683079328e-05, + "loss": 0.2676, + "step": 120400 + }, + { + "epoch": 9.410386567746974, + "grad_norm": 1.0639585256576538, + "learning_rate": 1.8132723720203207e-05, + "loss": 0.2608, + "step": 120500 + }, + { + "epoch": 9.418196017180788, + "grad_norm": 1.1121162176132202, + "learning_rate": 1.8131160609613133e-05, + "loss": 0.2669, + "step": 120600 + }, + { + "epoch": 9.426005466614603, + "grad_norm": 0.9323787689208984, + "learning_rate": 1.8129597499023056e-05, + "loss": 0.2798, + "step": 120700 + }, + { + "epoch": 9.43381491604842, + "grad_norm": 0.9078027606010437, + "learning_rate": 1.8128034388432982e-05, + "loss": 0.2694, + "step": 120800 + }, + { + "epoch": 9.441624365482234, + "grad_norm": 0.9647188186645508, + "learning_rate": 1.812648690894881e-05, + "loss": 0.2674, + "step": 120900 + }, + { + "epoch": 9.449433814916048, + "grad_norm": 1.0538146495819092, + "learning_rate": 1.8124923798358737e-05, + "loss": 0.2733, + "step": 121000 + }, + { + "epoch": 9.457243264349863, + "grad_norm": 1.0025720596313477, + "learning_rate": 1.812336068776866e-05, + "loss": 0.2694, + "step": 121100 + }, + { + "epoch": 9.465052713783678, + "grad_norm": 0.8698012232780457, + "learning_rate": 1.8121797577178586e-05, + "loss": 0.2683, + "step": 121200 + }, + { + "epoch": 9.472862163217494, + "grad_norm": 1.078913927078247, + "learning_rate": 1.8120234466588512e-05, + "loss": 0.2643, + "step": 121300 + }, + { + "epoch": 9.480671612651308, + "grad_norm": 0.9054146409034729, + "learning_rate": 1.8118671355998438e-05, + "loss": 0.274, + "step": 121400 + }, + { + "epoch": 9.488481062085123, + "grad_norm": 0.9421964883804321, + "learning_rate": 1.8117108245408364e-05, + "loss": 0.2739, + "step": 121500 + }, + { + "epoch": 9.496290511518938, + "grad_norm": 0.9728415608406067, + "learning_rate": 1.811554513481829e-05, + "loss": 0.2685, + "step": 121600 + }, + { + "epoch": 9.504099960952752, + "grad_norm": 0.9228818416595459, + "learning_rate": 1.8113982024228216e-05, + "loss": 0.2874, + "step": 121700 + }, + { + "epoch": 9.511909410386568, + "grad_norm": 0.8778666853904724, + "learning_rate": 1.8112418913638142e-05, + "loss": 0.278, + "step": 121800 + }, + { + "epoch": 9.519718859820383, + "grad_norm": 1.1950221061706543, + "learning_rate": 1.8110855803048068e-05, + "loss": 0.2749, + "step": 121900 + }, + { + "epoch": 9.527528309254198, + "grad_norm": 0.9403451085090637, + "learning_rate": 1.8109292692457994e-05, + "loss": 0.2646, + "step": 122000 + }, + { + "epoch": 9.535337758688012, + "grad_norm": 1.0735374689102173, + "learning_rate": 1.810772958186792e-05, + "loss": 0.2834, + "step": 122100 + }, + { + "epoch": 9.543147208121827, + "grad_norm": 1.1062880754470825, + "learning_rate": 1.8106166471277842e-05, + "loss": 0.268, + "step": 122200 + }, + { + "epoch": 9.550956657555643, + "grad_norm": 1.203642725944519, + "learning_rate": 1.810460336068777e-05, + "loss": 0.2595, + "step": 122300 + }, + { + "epoch": 9.558766106989458, + "grad_norm": 1.0009891986846924, + "learning_rate": 1.8103040250097698e-05, + "loss": 0.2604, + "step": 122400 + }, + { + "epoch": 9.566575556423272, + "grad_norm": 1.1903247833251953, + "learning_rate": 1.810147713950762e-05, + "loss": 0.2539, + "step": 122500 + }, + { + "epoch": 9.574385005857087, + "grad_norm": 1.1798654794692993, + "learning_rate": 1.8099914028917547e-05, + "loss": 0.2679, + "step": 122600 + }, + { + "epoch": 9.582194455290901, + "grad_norm": 0.955449104309082, + "learning_rate": 1.8098350918327473e-05, + "loss": 0.2736, + "step": 122700 + }, + { + "epoch": 9.590003904724718, + "grad_norm": 1.1760761737823486, + "learning_rate": 1.80967878077374e-05, + "loss": 0.2714, + "step": 122800 + }, + { + "epoch": 9.597813354158532, + "grad_norm": 0.9086267352104187, + "learning_rate": 1.8095224697147325e-05, + "loss": 0.2799, + "step": 122900 + }, + { + "epoch": 9.605622803592347, + "grad_norm": Infinity, + "learning_rate": 1.809367721766315e-05, + "loss": 0.2775, + "step": 123000 + }, + { + "epoch": 9.613432253026161, + "grad_norm": 0.7868393659591675, + "learning_rate": 1.8092114107073076e-05, + "loss": 0.268, + "step": 123100 + }, + { + "epoch": 9.621241702459976, + "grad_norm": 1.0325120687484741, + "learning_rate": 1.8090550996483003e-05, + "loss": 0.2671, + "step": 123200 + }, + { + "epoch": 9.629051151893792, + "grad_norm": 1.0042729377746582, + "learning_rate": 1.808898788589293e-05, + "loss": 0.2615, + "step": 123300 + }, + { + "epoch": 9.636860601327607, + "grad_norm": 0.8402108550071716, + "learning_rate": 1.8087424775302855e-05, + "loss": 0.2646, + "step": 123400 + }, + { + "epoch": 9.644670050761421, + "grad_norm": 0.9658582210540771, + "learning_rate": 1.808586166471278e-05, + "loss": 0.2814, + "step": 123500 + }, + { + "epoch": 9.652479500195236, + "grad_norm": 1.088170051574707, + "learning_rate": 1.8084298554122707e-05, + "loss": 0.2695, + "step": 123600 + }, + { + "epoch": 9.66028894962905, + "grad_norm": 0.8987926840782166, + "learning_rate": 1.808273544353263e-05, + "loss": 0.2708, + "step": 123700 + }, + { + "epoch": 9.668098399062867, + "grad_norm": 1.143585205078125, + "learning_rate": 1.808117233294256e-05, + "loss": 0.2656, + "step": 123800 + }, + { + "epoch": 9.675907848496681, + "grad_norm": 0.8075404763221741, + "learning_rate": 1.8079609222352485e-05, + "loss": 0.2737, + "step": 123900 + }, + { + "epoch": 9.683717297930496, + "grad_norm": 1.1423933506011963, + "learning_rate": 1.8078046111762407e-05, + "loss": 0.2677, + "step": 124000 + }, + { + "epoch": 9.69152674736431, + "grad_norm": 0.9777726531028748, + "learning_rate": 1.8076483001172333e-05, + "loss": 0.2803, + "step": 124100 + }, + { + "epoch": 9.699336196798125, + "grad_norm": 0.8643308877944946, + "learning_rate": 1.8074919890582263e-05, + "loss": 0.2613, + "step": 124200 + }, + { + "epoch": 9.707145646231941, + "grad_norm": 1.1980983018875122, + "learning_rate": 1.8073356779992185e-05, + "loss": 0.2604, + "step": 124300 + }, + { + "epoch": 9.714955095665756, + "grad_norm": 0.6800851821899414, + "learning_rate": 1.807179366940211e-05, + "loss": 0.2618, + "step": 124400 + }, + { + "epoch": 9.72276454509957, + "grad_norm": 0.8524096608161926, + "learning_rate": 1.8070230558812037e-05, + "loss": 0.2582, + "step": 124500 + }, + { + "epoch": 9.730573994533385, + "grad_norm": 0.8670868873596191, + "learning_rate": 1.8068667448221963e-05, + "loss": 0.2632, + "step": 124600 + }, + { + "epoch": 9.7383834439672, + "grad_norm": 0.720768928527832, + "learning_rate": 1.806710433763189e-05, + "loss": 0.2668, + "step": 124700 + }, + { + "epoch": 9.746192893401016, + "grad_norm": 0.7876792550086975, + "learning_rate": 1.8065541227041815e-05, + "loss": 0.2724, + "step": 124800 + }, + { + "epoch": 9.75400234283483, + "grad_norm": 1.1576100587844849, + "learning_rate": 1.806397811645174e-05, + "loss": 0.2633, + "step": 124900 + }, + { + "epoch": 9.761811792268645, + "grad_norm": 1.0780067443847656, + "learning_rate": 1.8062415005861667e-05, + "loss": 0.2709, + "step": 125000 + }, + { + "epoch": 9.76962124170246, + "grad_norm": 1.1646336317062378, + "learning_rate": 1.8060867526377493e-05, + "loss": 0.2716, + "step": 125100 + }, + { + "epoch": 9.777430691136274, + "grad_norm": 1.0148494243621826, + "learning_rate": 1.8059304415787416e-05, + "loss": 0.2769, + "step": 125200 + }, + { + "epoch": 9.78524014057009, + "grad_norm": 0.8026964068412781, + "learning_rate": 1.8057741305197345e-05, + "loss": 0.2738, + "step": 125300 + }, + { + "epoch": 9.793049590003905, + "grad_norm": 1.0151106119155884, + "learning_rate": 1.805617819460727e-05, + "loss": 0.2685, + "step": 125400 + }, + { + "epoch": 9.80085903943772, + "grad_norm": 0.9404942989349365, + "learning_rate": 1.8054615084017194e-05, + "loss": 0.2673, + "step": 125500 + }, + { + "epoch": 9.808668488871534, + "grad_norm": 1.1046905517578125, + "learning_rate": 1.805305197342712e-05, + "loss": 0.2684, + "step": 125600 + }, + { + "epoch": 9.816477938305349, + "grad_norm": 1.1244338750839233, + "learning_rate": 1.805148886283705e-05, + "loss": 0.2641, + "step": 125700 + }, + { + "epoch": 9.824287387739165, + "grad_norm": 0.7566142678260803, + "learning_rate": 1.8049925752246972e-05, + "loss": 0.2593, + "step": 125800 + }, + { + "epoch": 9.83209683717298, + "grad_norm": 1.0991520881652832, + "learning_rate": 1.8048362641656898e-05, + "loss": 0.2736, + "step": 125900 + }, + { + "epoch": 9.839906286606794, + "grad_norm": 1.0630158185958862, + "learning_rate": 1.8046799531066824e-05, + "loss": 0.2714, + "step": 126000 + }, + { + "epoch": 9.847715736040609, + "grad_norm": 1.0158815383911133, + "learning_rate": 1.804523642047675e-05, + "loss": 0.2661, + "step": 126100 + }, + { + "epoch": 9.855525185474423, + "grad_norm": 1.246567964553833, + "learning_rate": 1.8043673309886676e-05, + "loss": 0.25, + "step": 126200 + }, + { + "epoch": 9.863334634908238, + "grad_norm": 0.8306715488433838, + "learning_rate": 1.8042110199296602e-05, + "loss": 0.2761, + "step": 126300 + }, + { + "epoch": 9.871144084342054, + "grad_norm": 0.8288384079933167, + "learning_rate": 1.8040547088706528e-05, + "loss": 0.2628, + "step": 126400 + }, + { + "epoch": 9.878953533775869, + "grad_norm": 1.0349608659744263, + "learning_rate": 1.8038983978116454e-05, + "loss": 0.2686, + "step": 126500 + }, + { + "epoch": 9.886762983209683, + "grad_norm": 0.935551106929779, + "learning_rate": 1.803742086752638e-05, + "loss": 0.2656, + "step": 126600 + }, + { + "epoch": 9.894572432643498, + "grad_norm": 0.8951859474182129, + "learning_rate": 1.8035857756936306e-05, + "loss": 0.2597, + "step": 126700 + }, + { + "epoch": 9.902381882077314, + "grad_norm": 0.9531537890434265, + "learning_rate": 1.8034294646346232e-05, + "loss": 0.2674, + "step": 126800 + }, + { + "epoch": 9.910191331511129, + "grad_norm": 0.5789433121681213, + "learning_rate": 1.8032731535756155e-05, + "loss": 0.2628, + "step": 126900 + }, + { + "epoch": 9.918000780944944, + "grad_norm": 1.147503137588501, + "learning_rate": 1.803116842516608e-05, + "loss": 0.2733, + "step": 127000 + }, + { + "epoch": 9.925810230378758, + "grad_norm": 0.8560211658477783, + "learning_rate": 1.802962094568191e-05, + "loss": 0.2698, + "step": 127100 + }, + { + "epoch": 9.933619679812573, + "grad_norm": 0.7029670476913452, + "learning_rate": 1.8028057835091836e-05, + "loss": 0.271, + "step": 127200 + }, + { + "epoch": 9.941429129246387, + "grad_norm": 0.9978414177894592, + "learning_rate": 1.802649472450176e-05, + "loss": 0.2517, + "step": 127300 + }, + { + "epoch": 9.949238578680204, + "grad_norm": 0.905877411365509, + "learning_rate": 1.8024931613911685e-05, + "loss": 0.2783, + "step": 127400 + }, + { + "epoch": 9.957048028114018, + "grad_norm": 1.013178825378418, + "learning_rate": 1.802336850332161e-05, + "loss": 0.2556, + "step": 127500 + }, + { + "epoch": 9.964857477547833, + "grad_norm": 0.9851475954055786, + "learning_rate": 1.8021805392731537e-05, + "loss": 0.2659, + "step": 127600 + }, + { + "epoch": 9.972666926981647, + "grad_norm": 0.981611967086792, + "learning_rate": 1.8020242282141463e-05, + "loss": 0.2682, + "step": 127700 + }, + { + "epoch": 9.980476376415464, + "grad_norm": 0.7983320355415344, + "learning_rate": 1.801867917155139e-05, + "loss": 0.259, + "step": 127800 + }, + { + "epoch": 9.988285825849278, + "grad_norm": 0.9113990068435669, + "learning_rate": 1.8017116060961315e-05, + "loss": 0.2643, + "step": 127900 + }, + { + "epoch": 9.996095275283093, + "grad_norm": 1.0225112438201904, + "learning_rate": 1.801555295037124e-05, + "loss": 0.2597, + "step": 128000 + }, + { + "epoch": 10.003904724716907, + "grad_norm": 1.1934940814971924, + "learning_rate": 1.8013989839781167e-05, + "loss": 0.2494, + "step": 128100 + }, + { + "epoch": 10.011714174150722, + "grad_norm": 0.9636716246604919, + "learning_rate": 1.8012426729191093e-05, + "loss": 0.2621, + "step": 128200 + }, + { + "epoch": 10.019523623584536, + "grad_norm": 1.4203592538833618, + "learning_rate": 1.801086361860102e-05, + "loss": 0.2604, + "step": 128300 + }, + { + "epoch": 10.027333073018353, + "grad_norm": 0.8860512375831604, + "learning_rate": 1.8009316139116845e-05, + "loss": 0.2538, + "step": 128400 + }, + { + "epoch": 10.035142522452167, + "grad_norm": 1.068411111831665, + "learning_rate": 1.8007753028526767e-05, + "loss": 0.262, + "step": 128500 + }, + { + "epoch": 10.042951971885982, + "grad_norm": 0.9057846665382385, + "learning_rate": 1.8006189917936697e-05, + "loss": 0.2496, + "step": 128600 + }, + { + "epoch": 10.050761421319796, + "grad_norm": 1.0000461339950562, + "learning_rate": 1.8004626807346623e-05, + "loss": 0.2585, + "step": 128700 + }, + { + "epoch": 10.058570870753611, + "grad_norm": 0.8489338159561157, + "learning_rate": 1.8003063696756545e-05, + "loss": 0.2641, + "step": 128800 + }, + { + "epoch": 10.066380320187427, + "grad_norm": 0.8931035399436951, + "learning_rate": 1.8001500586166475e-05, + "loss": 0.2557, + "step": 128900 + }, + { + "epoch": 10.074189769621242, + "grad_norm": 0.9536027312278748, + "learning_rate": 1.7999937475576397e-05, + "loss": 0.255, + "step": 129000 + }, + { + "epoch": 10.081999219055056, + "grad_norm": 0.8968120217323303, + "learning_rate": 1.7998374364986323e-05, + "loss": 0.2539, + "step": 129100 + }, + { + "epoch": 10.089808668488871, + "grad_norm": 1.1486092805862427, + "learning_rate": 1.799681125439625e-05, + "loss": 0.2475, + "step": 129200 + }, + { + "epoch": 10.097618117922686, + "grad_norm": 0.9897533655166626, + "learning_rate": 1.7995248143806175e-05, + "loss": 0.2471, + "step": 129300 + }, + { + "epoch": 10.105427567356502, + "grad_norm": 0.7280323505401611, + "learning_rate": 1.79936850332161e-05, + "loss": 0.2584, + "step": 129400 + }, + { + "epoch": 10.113237016790317, + "grad_norm": 0.8905358910560608, + "learning_rate": 1.7992121922626027e-05, + "loss": 0.2608, + "step": 129500 + }, + { + "epoch": 10.121046466224131, + "grad_norm": 1.0517534017562866, + "learning_rate": 1.7990558812035953e-05, + "loss": 0.2589, + "step": 129600 + }, + { + "epoch": 10.128855915657946, + "grad_norm": 1.541553020477295, + "learning_rate": 1.798899570144588e-05, + "loss": 0.2621, + "step": 129700 + }, + { + "epoch": 10.13666536509176, + "grad_norm": 0.8779164552688599, + "learning_rate": 1.7987432590855805e-05, + "loss": 0.2536, + "step": 129800 + }, + { + "epoch": 10.144474814525577, + "grad_norm": 1.1161167621612549, + "learning_rate": 1.7985869480265728e-05, + "loss": 0.2489, + "step": 129900 + }, + { + "epoch": 10.152284263959391, + "grad_norm": 0.999082088470459, + "learning_rate": 1.7984306369675657e-05, + "loss": 0.2666, + "step": 130000 + }, + { + "epoch": 10.160093713393206, + "grad_norm": 0.9592772722244263, + "learning_rate": 1.7982743259085583e-05, + "loss": 0.26, + "step": 130100 + }, + { + "epoch": 10.16790316282702, + "grad_norm": 1.0480682849884033, + "learning_rate": 1.7981180148495506e-05, + "loss": 0.256, + "step": 130200 + }, + { + "epoch": 10.175712612260835, + "grad_norm": 1.0796922445297241, + "learning_rate": 1.7979617037905432e-05, + "loss": 0.2634, + "step": 130300 + }, + { + "epoch": 10.183522061694651, + "grad_norm": 0.8052628040313721, + "learning_rate": 1.7978053927315358e-05, + "loss": 0.2636, + "step": 130400 + }, + { + "epoch": 10.191331511128466, + "grad_norm": 0.8707953095436096, + "learning_rate": 1.7976490816725284e-05, + "loss": 0.2551, + "step": 130500 + }, + { + "epoch": 10.19914096056228, + "grad_norm": 1.1583307981491089, + "learning_rate": 1.797492770613521e-05, + "loss": 0.2669, + "step": 130600 + }, + { + "epoch": 10.206950409996095, + "grad_norm": 0.8590611219406128, + "learning_rate": 1.7973364595545136e-05, + "loss": 0.2606, + "step": 130700 + }, + { + "epoch": 10.21475985942991, + "grad_norm": 1.029362678527832, + "learning_rate": 1.7971801484955062e-05, + "loss": 0.2657, + "step": 130800 + }, + { + "epoch": 10.222569308863726, + "grad_norm": 1.1454167366027832, + "learning_rate": 1.7970238374364988e-05, + "loss": 0.2579, + "step": 130900 + }, + { + "epoch": 10.23037875829754, + "grad_norm": 0.6559708714485168, + "learning_rate": 1.7968675263774914e-05, + "loss": 0.2578, + "step": 131000 + }, + { + "epoch": 10.238188207731355, + "grad_norm": 1.1231006383895874, + "learning_rate": 1.796711215318484e-05, + "loss": 0.2589, + "step": 131100 + }, + { + "epoch": 10.24599765716517, + "grad_norm": 1.1264257431030273, + "learning_rate": 1.7965549042594766e-05, + "loss": 0.2489, + "step": 131200 + }, + { + "epoch": 10.253807106598984, + "grad_norm": 0.8126028180122375, + "learning_rate": 1.7963985932004692e-05, + "loss": 0.2643, + "step": 131300 + }, + { + "epoch": 10.2616165560328, + "grad_norm": 0.8900143504142761, + "learning_rate": 1.7962422821414615e-05, + "loss": 0.2777, + "step": 131400 + }, + { + "epoch": 10.269426005466615, + "grad_norm": 0.972007691860199, + "learning_rate": 1.7960859710824544e-05, + "loss": 0.2707, + "step": 131500 + }, + { + "epoch": 10.27723545490043, + "grad_norm": 0.8504611253738403, + "learning_rate": 1.7959296600234467e-05, + "loss": 0.2457, + "step": 131600 + }, + { + "epoch": 10.285044904334244, + "grad_norm": 0.9626019597053528, + "learning_rate": 1.7957733489644393e-05, + "loss": 0.2512, + "step": 131700 + }, + { + "epoch": 10.292854353768059, + "grad_norm": 1.2120001316070557, + "learning_rate": 1.795617037905432e-05, + "loss": 0.2579, + "step": 131800 + }, + { + "epoch": 10.300663803201875, + "grad_norm": 1.2339860200881958, + "learning_rate": 1.7954607268464245e-05, + "loss": 0.2589, + "step": 131900 + }, + { + "epoch": 10.30847325263569, + "grad_norm": 0.876400887966156, + "learning_rate": 1.795304415787417e-05, + "loss": 0.2439, + "step": 132000 + }, + { + "epoch": 10.316282702069504, + "grad_norm": 0.7515790462493896, + "learning_rate": 1.7951481047284097e-05, + "loss": 0.2542, + "step": 132100 + }, + { + "epoch": 10.324092151503319, + "grad_norm": 0.7441714406013489, + "learning_rate": 1.7949917936694023e-05, + "loss": 0.2592, + "step": 132200 + }, + { + "epoch": 10.331901600937133, + "grad_norm": 0.9107796549797058, + "learning_rate": 1.794837045720985e-05, + "loss": 0.2662, + "step": 132300 + }, + { + "epoch": 10.33971105037095, + "grad_norm": 1.1548948287963867, + "learning_rate": 1.7946807346619775e-05, + "loss": 0.2604, + "step": 132400 + }, + { + "epoch": 10.347520499804764, + "grad_norm": 1.0243403911590576, + "learning_rate": 1.79452442360297e-05, + "loss": 0.273, + "step": 132500 + }, + { + "epoch": 10.355329949238579, + "grad_norm": 1.4229692220687866, + "learning_rate": 1.7943681125439627e-05, + "loss": 0.2552, + "step": 132600 + }, + { + "epoch": 10.363139398672393, + "grad_norm": 1.00844407081604, + "learning_rate": 1.7942118014849553e-05, + "loss": 0.2385, + "step": 132700 + }, + { + "epoch": 10.370948848106208, + "grad_norm": 0.9128909707069397, + "learning_rate": 1.794055490425948e-05, + "loss": 0.2597, + "step": 132800 + }, + { + "epoch": 10.378758297540024, + "grad_norm": 1.053162932395935, + "learning_rate": 1.7938991793669405e-05, + "loss": 0.2497, + "step": 132900 + }, + { + "epoch": 10.386567746973839, + "grad_norm": 0.9146475195884705, + "learning_rate": 1.793742868307933e-05, + "loss": 0.25, + "step": 133000 + }, + { + "epoch": 10.394377196407653, + "grad_norm": 0.998521089553833, + "learning_rate": 1.7935865572489254e-05, + "loss": 0.2478, + "step": 133100 + }, + { + "epoch": 10.402186645841468, + "grad_norm": 0.9431730508804321, + "learning_rate": 1.793430246189918e-05, + "loss": 0.2467, + "step": 133200 + }, + { + "epoch": 10.409996095275282, + "grad_norm": 0.9027838110923767, + "learning_rate": 1.793273935130911e-05, + "loss": 0.2531, + "step": 133300 + }, + { + "epoch": 10.417805544709099, + "grad_norm": 0.9306374192237854, + "learning_rate": 1.793117624071903e-05, + "loss": 0.2593, + "step": 133400 + }, + { + "epoch": 10.425614994142913, + "grad_norm": 0.8729743361473083, + "learning_rate": 1.7929613130128958e-05, + "loss": 0.257, + "step": 133500 + }, + { + "epoch": 10.433424443576728, + "grad_norm": 0.7633653283119202, + "learning_rate": 1.7928050019538884e-05, + "loss": 0.2629, + "step": 133600 + }, + { + "epoch": 10.441233893010542, + "grad_norm": 0.7634978890419006, + "learning_rate": 1.792648690894881e-05, + "loss": 0.2724, + "step": 133700 + }, + { + "epoch": 10.449043342444357, + "grad_norm": 1.1082350015640259, + "learning_rate": 1.7924923798358736e-05, + "loss": 0.2519, + "step": 133800 + }, + { + "epoch": 10.456852791878173, + "grad_norm": 0.8378466963768005, + "learning_rate": 1.792336068776866e-05, + "loss": 0.244, + "step": 133900 + }, + { + "epoch": 10.464662241311988, + "grad_norm": 0.8386598825454712, + "learning_rate": 1.7921797577178588e-05, + "loss": 0.2544, + "step": 134000 + }, + { + "epoch": 10.472471690745802, + "grad_norm": 0.7771602869033813, + "learning_rate": 1.7920234466588514e-05, + "loss": 0.2537, + "step": 134100 + }, + { + "epoch": 10.480281140179617, + "grad_norm": 0.8470435738563538, + "learning_rate": 1.791867135599844e-05, + "loss": 0.2453, + "step": 134200 + }, + { + "epoch": 10.488090589613432, + "grad_norm": 1.166733980178833, + "learning_rate": 1.7917108245408362e-05, + "loss": 0.2624, + "step": 134300 + }, + { + "epoch": 10.495900039047248, + "grad_norm": 1.1435627937316895, + "learning_rate": 1.7915545134818292e-05, + "loss": 0.2653, + "step": 134400 + }, + { + "epoch": 10.503709488481062, + "grad_norm": 0.8292436003684998, + "learning_rate": 1.7913982024228214e-05, + "loss": 0.2568, + "step": 134500 + }, + { + "epoch": 10.511518937914877, + "grad_norm": 0.7752034068107605, + "learning_rate": 1.791241891363814e-05, + "loss": 0.2396, + "step": 134600 + }, + { + "epoch": 10.519328387348692, + "grad_norm": 1.128818154335022, + "learning_rate": 1.7910855803048066e-05, + "loss": 0.2465, + "step": 134700 + }, + { + "epoch": 10.527137836782506, + "grad_norm": 0.9121723771095276, + "learning_rate": 1.7909292692457992e-05, + "loss": 0.2469, + "step": 134800 + }, + { + "epoch": 10.534947286216322, + "grad_norm": 1.0522806644439697, + "learning_rate": 1.790772958186792e-05, + "loss": 0.25, + "step": 134900 + }, + { + "epoch": 10.542756735650137, + "grad_norm": 1.0877002477645874, + "learning_rate": 1.7906166471277844e-05, + "loss": 0.2525, + "step": 135000 + }, + { + "epoch": 10.550566185083952, + "grad_norm": 0.8708367943763733, + "learning_rate": 1.790460336068777e-05, + "loss": 0.2418, + "step": 135100 + }, + { + "epoch": 10.558375634517766, + "grad_norm": 0.8987772464752197, + "learning_rate": 1.7903040250097696e-05, + "loss": 0.2457, + "step": 135200 + }, + { + "epoch": 10.56618508395158, + "grad_norm": 0.9944095015525818, + "learning_rate": 1.7901477139507622e-05, + "loss": 0.2359, + "step": 135300 + }, + { + "epoch": 10.573994533385397, + "grad_norm": 1.073951005935669, + "learning_rate": 1.7899914028917545e-05, + "loss": 0.2547, + "step": 135400 + }, + { + "epoch": 10.581803982819212, + "grad_norm": 0.8596273064613342, + "learning_rate": 1.7898350918327474e-05, + "loss": 0.2459, + "step": 135500 + }, + { + "epoch": 10.589613432253026, + "grad_norm": 1.2592207193374634, + "learning_rate": 1.78967878077374e-05, + "loss": 0.2539, + "step": 135600 + }, + { + "epoch": 10.59742288168684, + "grad_norm": 1.2251505851745605, + "learning_rate": 1.7895224697147323e-05, + "loss": 0.2545, + "step": 135700 + }, + { + "epoch": 10.605232331120655, + "grad_norm": 0.7919961810112, + "learning_rate": 1.789366158655725e-05, + "loss": 0.2587, + "step": 135800 + }, + { + "epoch": 10.613041780554472, + "grad_norm": 1.192502737045288, + "learning_rate": 1.7892098475967175e-05, + "loss": 0.257, + "step": 135900 + }, + { + "epoch": 10.620851229988286, + "grad_norm": 0.9787290692329407, + "learning_rate": 1.78905353653771e-05, + "loss": 0.2534, + "step": 136000 + }, + { + "epoch": 10.6286606794221, + "grad_norm": 0.7784271240234375, + "learning_rate": 1.7888972254787027e-05, + "loss": 0.2656, + "step": 136100 + }, + { + "epoch": 10.636470128855915, + "grad_norm": 1.072007656097412, + "learning_rate": 1.7887409144196953e-05, + "loss": 0.2426, + "step": 136200 + }, + { + "epoch": 10.64427957828973, + "grad_norm": 1.229270577430725, + "learning_rate": 1.788586166471278e-05, + "loss": 0.2495, + "step": 136300 + }, + { + "epoch": 10.652089027723546, + "grad_norm": 1.0051836967468262, + "learning_rate": 1.7884298554122705e-05, + "loss": 0.2453, + "step": 136400 + }, + { + "epoch": 10.65989847715736, + "grad_norm": 0.931196391582489, + "learning_rate": 1.788273544353263e-05, + "loss": 0.2483, + "step": 136500 + }, + { + "epoch": 10.667707926591175, + "grad_norm": 0.9106378555297852, + "learning_rate": 1.7881187964048457e-05, + "loss": 0.2523, + "step": 136600 + }, + { + "epoch": 10.67551737602499, + "grad_norm": 1.0450843572616577, + "learning_rate": 1.7879624853458383e-05, + "loss": 0.2583, + "step": 136700 + }, + { + "epoch": 10.683326825458805, + "grad_norm": 0.7723289132118225, + "learning_rate": 1.787806174286831e-05, + "loss": 0.2607, + "step": 136800 + }, + { + "epoch": 10.69113627489262, + "grad_norm": 1.0050076246261597, + "learning_rate": 1.7876498632278235e-05, + "loss": 0.2432, + "step": 136900 + }, + { + "epoch": 10.698945724326435, + "grad_norm": 0.8825809359550476, + "learning_rate": 1.787493552168816e-05, + "loss": 0.238, + "step": 137000 + }, + { + "epoch": 10.70675517376025, + "grad_norm": 0.7401546239852905, + "learning_rate": 1.7873372411098087e-05, + "loss": 0.2603, + "step": 137100 + }, + { + "epoch": 10.714564623194065, + "grad_norm": 0.8159899115562439, + "learning_rate": 1.7871809300508013e-05, + "loss": 0.2428, + "step": 137200 + }, + { + "epoch": 10.72237407262788, + "grad_norm": 1.0040541887283325, + "learning_rate": 1.787024618991794e-05, + "loss": 0.2422, + "step": 137300 + }, + { + "epoch": 10.730183522061695, + "grad_norm": 0.9106484651565552, + "learning_rate": 1.7868683079327865e-05, + "loss": 0.2496, + "step": 137400 + }, + { + "epoch": 10.73799297149551, + "grad_norm": 0.9485523700714111, + "learning_rate": 1.7867119968737788e-05, + "loss": 0.2501, + "step": 137500 + }, + { + "epoch": 10.745802420929325, + "grad_norm": 0.9653137922286987, + "learning_rate": 1.7865556858147714e-05, + "loss": 0.2425, + "step": 137600 + }, + { + "epoch": 10.75361187036314, + "grad_norm": 0.8802712559700012, + "learning_rate": 1.7863993747557643e-05, + "loss": 0.2391, + "step": 137700 + }, + { + "epoch": 10.761421319796954, + "grad_norm": 0.8589837551116943, + "learning_rate": 1.7862430636967566e-05, + "loss": 0.2562, + "step": 137800 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 0.8044468760490417, + "learning_rate": 1.7860867526377492e-05, + "loss": 0.2682, + "step": 137900 + }, + { + "epoch": 10.777040218664585, + "grad_norm": 1.045089840888977, + "learning_rate": 1.7859304415787418e-05, + "loss": 0.2435, + "step": 138000 + }, + { + "epoch": 10.7848496680984, + "grad_norm": 0.9740383625030518, + "learning_rate": 1.7857741305197344e-05, + "loss": 0.2574, + "step": 138100 + }, + { + "epoch": 10.792659117532214, + "grad_norm": 1.025576114654541, + "learning_rate": 1.785617819460727e-05, + "loss": 0.2448, + "step": 138200 + }, + { + "epoch": 10.800468566966028, + "grad_norm": 0.7664403915405273, + "learning_rate": 1.7854615084017196e-05, + "loss": 0.244, + "step": 138300 + }, + { + "epoch": 10.808278016399843, + "grad_norm": 1.039302945137024, + "learning_rate": 1.7853051973427122e-05, + "loss": 0.2411, + "step": 138400 + }, + { + "epoch": 10.81608746583366, + "grad_norm": 0.9581201076507568, + "learning_rate": 1.7851488862837048e-05, + "loss": 0.2496, + "step": 138500 + }, + { + "epoch": 10.823896915267474, + "grad_norm": 1.1588441133499146, + "learning_rate": 1.7849925752246974e-05, + "loss": 0.2526, + "step": 138600 + }, + { + "epoch": 10.831706364701288, + "grad_norm": 0.9728406667709351, + "learning_rate": 1.7848362641656896e-05, + "loss": 0.244, + "step": 138700 + }, + { + "epoch": 10.839515814135103, + "grad_norm": 0.947182834148407, + "learning_rate": 1.7846799531066826e-05, + "loss": 0.2561, + "step": 138800 + }, + { + "epoch": 10.84732526356892, + "grad_norm": 0.9559268951416016, + "learning_rate": 1.7845236420476752e-05, + "loss": 0.2389, + "step": 138900 + }, + { + "epoch": 10.855134713002734, + "grad_norm": 0.9298532009124756, + "learning_rate": 1.7843673309886675e-05, + "loss": 0.231, + "step": 139000 + }, + { + "epoch": 10.862944162436548, + "grad_norm": 0.9832693338394165, + "learning_rate": 1.78421101992966e-05, + "loss": 0.2272, + "step": 139100 + }, + { + "epoch": 10.870753611870363, + "grad_norm": 1.2468117475509644, + "learning_rate": 1.7840547088706527e-05, + "loss": 0.2529, + "step": 139200 + }, + { + "epoch": 10.878563061304177, + "grad_norm": 1.0330675840377808, + "learning_rate": 1.7838983978116453e-05, + "loss": 0.2426, + "step": 139300 + }, + { + "epoch": 10.886372510737992, + "grad_norm": 0.9902207255363464, + "learning_rate": 1.783742086752638e-05, + "loss": 0.2557, + "step": 139400 + }, + { + "epoch": 10.894181960171808, + "grad_norm": 0.8359582424163818, + "learning_rate": 1.7835857756936305e-05, + "loss": 0.2588, + "step": 139500 + }, + { + "epoch": 10.901991409605623, + "grad_norm": 1.1092885732650757, + "learning_rate": 1.783429464634623e-05, + "loss": 0.2634, + "step": 139600 + }, + { + "epoch": 10.909800859039438, + "grad_norm": 0.9002270698547363, + "learning_rate": 1.7832731535756157e-05, + "loss": 0.2545, + "step": 139700 + }, + { + "epoch": 10.917610308473252, + "grad_norm": 1.0879848003387451, + "learning_rate": 1.7831168425166083e-05, + "loss": 0.2438, + "step": 139800 + }, + { + "epoch": 10.925419757907068, + "grad_norm": 0.9094911217689514, + "learning_rate": 1.782960531457601e-05, + "loss": 0.2451, + "step": 139900 + }, + { + "epoch": 10.933229207340883, + "grad_norm": 1.1810535192489624, + "learning_rate": 1.7828042203985935e-05, + "loss": 0.2501, + "step": 140000 + }, + { + "epoch": 10.941038656774698, + "grad_norm": 1.110302209854126, + "learning_rate": 1.7826479093395857e-05, + "loss": 0.2465, + "step": 140100 + }, + { + "epoch": 10.948848106208512, + "grad_norm": 0.8959664702415466, + "learning_rate": 1.7824915982805787e-05, + "loss": 0.245, + "step": 140200 + }, + { + "epoch": 10.956657555642327, + "grad_norm": 0.7523890137672424, + "learning_rate": 1.7823352872215713e-05, + "loss": 0.2524, + "step": 140300 + }, + { + "epoch": 10.964467005076141, + "grad_norm": 1.1155420541763306, + "learning_rate": 1.7821789761625635e-05, + "loss": 0.2492, + "step": 140400 + }, + { + "epoch": 10.972276454509958, + "grad_norm": 1.041972041130066, + "learning_rate": 1.782022665103556e-05, + "loss": 0.2459, + "step": 140500 + }, + { + "epoch": 10.980085903943772, + "grad_norm": 0.8934460282325745, + "learning_rate": 1.781867917155139e-05, + "loss": 0.2555, + "step": 140600 + }, + { + "epoch": 10.987895353377587, + "grad_norm": 0.8889949917793274, + "learning_rate": 1.7817116060961313e-05, + "loss": 0.2457, + "step": 140700 + }, + { + "epoch": 10.995704802811401, + "grad_norm": 1.3171417713165283, + "learning_rate": 1.781555295037124e-05, + "loss": 0.2467, + "step": 140800 + }, + { + "epoch": 11.003514252245216, + "grad_norm": 0.8319826126098633, + "learning_rate": 1.7813989839781165e-05, + "loss": 0.2418, + "step": 140900 + }, + { + "epoch": 11.011323701679032, + "grad_norm": 1.0765905380249023, + "learning_rate": 1.781242672919109e-05, + "loss": 0.2403, + "step": 141000 + }, + { + "epoch": 11.019133151112847, + "grad_norm": 0.8536299467086792, + "learning_rate": 1.7810863618601017e-05, + "loss": 0.2442, + "step": 141100 + }, + { + "epoch": 11.026942600546661, + "grad_norm": 1.1073857545852661, + "learning_rate": 1.7809300508010943e-05, + "loss": 0.2436, + "step": 141200 + }, + { + "epoch": 11.034752049980476, + "grad_norm": 0.9137203097343445, + "learning_rate": 1.780775302852677e-05, + "loss": 0.2357, + "step": 141300 + }, + { + "epoch": 11.04256149941429, + "grad_norm": 0.9700175523757935, + "learning_rate": 1.7806189917936695e-05, + "loss": 0.2408, + "step": 141400 + }, + { + "epoch": 11.050370948848107, + "grad_norm": 1.1891883611679077, + "learning_rate": 1.780462680734662e-05, + "loss": 0.2532, + "step": 141500 + }, + { + "epoch": 11.058180398281921, + "grad_norm": 0.9186915159225464, + "learning_rate": 1.7803063696756547e-05, + "loss": 0.2541, + "step": 141600 + }, + { + "epoch": 11.065989847715736, + "grad_norm": 0.8762014508247375, + "learning_rate": 1.7801500586166473e-05, + "loss": 0.2365, + "step": 141700 + }, + { + "epoch": 11.07379929714955, + "grad_norm": 0.8541187644004822, + "learning_rate": 1.77999374755764e-05, + "loss": 0.2457, + "step": 141800 + }, + { + "epoch": 11.081608746583365, + "grad_norm": 0.8798760771751404, + "learning_rate": 1.7798374364986325e-05, + "loss": 0.2507, + "step": 141900 + }, + { + "epoch": 11.089418196017181, + "grad_norm": 1.1054275035858154, + "learning_rate": 1.7796811254396248e-05, + "loss": 0.2491, + "step": 142000 + }, + { + "epoch": 11.097227645450996, + "grad_norm": 0.9322449564933777, + "learning_rate": 1.7795248143806177e-05, + "loss": 0.2398, + "step": 142100 + }, + { + "epoch": 11.10503709488481, + "grad_norm": 1.0157818794250488, + "learning_rate": 1.77936850332161e-05, + "loss": 0.2338, + "step": 142200 + }, + { + "epoch": 11.112846544318625, + "grad_norm": 1.0045311450958252, + "learning_rate": 1.7792121922626026e-05, + "loss": 0.2486, + "step": 142300 + }, + { + "epoch": 11.12065599375244, + "grad_norm": 0.9703744649887085, + "learning_rate": 1.7790558812035955e-05, + "loss": 0.2449, + "step": 142400 + }, + { + "epoch": 11.128465443186256, + "grad_norm": 0.8165519833564758, + "learning_rate": 1.7788995701445878e-05, + "loss": 0.238, + "step": 142500 + }, + { + "epoch": 11.13627489262007, + "grad_norm": 0.9721464514732361, + "learning_rate": 1.7787432590855804e-05, + "loss": 0.2351, + "step": 142600 + }, + { + "epoch": 11.144084342053885, + "grad_norm": 0.9640051126480103, + "learning_rate": 1.778586948026573e-05, + "loss": 0.2364, + "step": 142700 + }, + { + "epoch": 11.1518937914877, + "grad_norm": 1.012770652770996, + "learning_rate": 1.7784306369675656e-05, + "loss": 0.2442, + "step": 142800 + }, + { + "epoch": 11.159703240921514, + "grad_norm": 1.0033341646194458, + "learning_rate": 1.7782743259085582e-05, + "loss": 0.2447, + "step": 142900 + }, + { + "epoch": 11.16751269035533, + "grad_norm": 0.6418032050132751, + "learning_rate": 1.7781180148495508e-05, + "loss": 0.2418, + "step": 143000 + }, + { + "epoch": 11.175322139789145, + "grad_norm": 1.1134809255599976, + "learning_rate": 1.777961703790543e-05, + "loss": 0.2442, + "step": 143100 + }, + { + "epoch": 11.18313158922296, + "grad_norm": 1.0568909645080566, + "learning_rate": 1.777805392731536e-05, + "loss": 0.2387, + "step": 143200 + }, + { + "epoch": 11.190941038656774, + "grad_norm": 0.8510876893997192, + "learning_rate": 1.7776490816725286e-05, + "loss": 0.2355, + "step": 143300 + }, + { + "epoch": 11.198750488090589, + "grad_norm": 0.6918273568153381, + "learning_rate": 1.777492770613521e-05, + "loss": 0.2376, + "step": 143400 + }, + { + "epoch": 11.206559937524405, + "grad_norm": 0.8617496490478516, + "learning_rate": 1.7773364595545138e-05, + "loss": 0.2387, + "step": 143500 + }, + { + "epoch": 11.21436938695822, + "grad_norm": 0.9683915972709656, + "learning_rate": 1.7771801484955064e-05, + "loss": 0.2368, + "step": 143600 + }, + { + "epoch": 11.222178836392034, + "grad_norm": 1.0224312543869019, + "learning_rate": 1.7770238374364987e-05, + "loss": 0.2441, + "step": 143700 + }, + { + "epoch": 11.229988285825849, + "grad_norm": 0.7473081350326538, + "learning_rate": 1.7768675263774913e-05, + "loss": 0.2516, + "step": 143800 + }, + { + "epoch": 11.237797735259663, + "grad_norm": 0.9434145092964172, + "learning_rate": 1.776711215318484e-05, + "loss": 0.2267, + "step": 143900 + }, + { + "epoch": 11.24560718469348, + "grad_norm": 0.887402355670929, + "learning_rate": 1.7765549042594765e-05, + "loss": 0.2336, + "step": 144000 + }, + { + "epoch": 11.253416634127294, + "grad_norm": 0.9610289931297302, + "learning_rate": 1.776398593200469e-05, + "loss": 0.2362, + "step": 144100 + }, + { + "epoch": 11.261226083561109, + "grad_norm": 0.8524101972579956, + "learning_rate": 1.7762422821414617e-05, + "loss": 0.2372, + "step": 144200 + }, + { + "epoch": 11.269035532994923, + "grad_norm": 1.01228666305542, + "learning_rate": 1.7760859710824543e-05, + "loss": 0.226, + "step": 144300 + }, + { + "epoch": 11.276844982428738, + "grad_norm": 1.0873188972473145, + "learning_rate": 1.775929660023447e-05, + "loss": 0.2393, + "step": 144400 + }, + { + "epoch": 11.284654431862554, + "grad_norm": 0.9294605851173401, + "learning_rate": 1.7757733489644395e-05, + "loss": 0.2348, + "step": 144500 + }, + { + "epoch": 11.292463881296369, + "grad_norm": 1.0935219526290894, + "learning_rate": 1.775617037905432e-05, + "loss": 0.2337, + "step": 144600 + }, + { + "epoch": 11.300273330730183, + "grad_norm": 1.033736228942871, + "learning_rate": 1.7754607268464247e-05, + "loss": 0.248, + "step": 144700 + }, + { + "epoch": 11.308082780163998, + "grad_norm": 0.8749229311943054, + "learning_rate": 1.775304415787417e-05, + "loss": 0.2486, + "step": 144800 + }, + { + "epoch": 11.315892229597813, + "grad_norm": 0.7569709420204163, + "learning_rate": 1.7751481047284095e-05, + "loss": 0.242, + "step": 144900 + }, + { + "epoch": 11.323701679031629, + "grad_norm": 1.029752492904663, + "learning_rate": 1.7749917936694025e-05, + "loss": 0.2516, + "step": 145000 + }, + { + "epoch": 11.331511128465444, + "grad_norm": 1.0709694623947144, + "learning_rate": 1.7748354826103947e-05, + "loss": 0.2427, + "step": 145100 + }, + { + "epoch": 11.339320577899258, + "grad_norm": 0.7665264010429382, + "learning_rate": 1.7746791715513873e-05, + "loss": 0.245, + "step": 145200 + }, + { + "epoch": 11.347130027333073, + "grad_norm": 0.823897659778595, + "learning_rate": 1.77452442360297e-05, + "loss": 0.2383, + "step": 145300 + }, + { + "epoch": 11.354939476766887, + "grad_norm": 0.9198905229568481, + "learning_rate": 1.7743681125439625e-05, + "loss": 0.2336, + "step": 145400 + }, + { + "epoch": 11.362748926200704, + "grad_norm": 0.7961851358413696, + "learning_rate": 1.774211801484955e-05, + "loss": 0.2378, + "step": 145500 + }, + { + "epoch": 11.370558375634518, + "grad_norm": 0.7704211473464966, + "learning_rate": 1.7740554904259477e-05, + "loss": 0.2373, + "step": 145600 + }, + { + "epoch": 11.378367825068333, + "grad_norm": 1.017962098121643, + "learning_rate": 1.7738991793669403e-05, + "loss": 0.239, + "step": 145700 + }, + { + "epoch": 11.386177274502147, + "grad_norm": 1.4405622482299805, + "learning_rate": 1.773742868307933e-05, + "loss": 0.2452, + "step": 145800 + }, + { + "epoch": 11.393986723935962, + "grad_norm": 0.8968992233276367, + "learning_rate": 1.7735865572489255e-05, + "loss": 0.2443, + "step": 145900 + }, + { + "epoch": 11.401796173369778, + "grad_norm": 0.847322404384613, + "learning_rate": 1.773430246189918e-05, + "loss": 0.2405, + "step": 146000 + }, + { + "epoch": 11.409605622803593, + "grad_norm": 0.9945650696754456, + "learning_rate": 1.7732739351309108e-05, + "loss": 0.246, + "step": 146100 + }, + { + "epoch": 11.417415072237407, + "grad_norm": 0.9590684771537781, + "learning_rate": 1.7731176240719034e-05, + "loss": 0.2509, + "step": 146200 + }, + { + "epoch": 11.425224521671222, + "grad_norm": 0.7392657995223999, + "learning_rate": 1.7729613130128956e-05, + "loss": 0.2412, + "step": 146300 + }, + { + "epoch": 11.433033971105036, + "grad_norm": 0.9596700668334961, + "learning_rate": 1.7728050019538886e-05, + "loss": 0.2443, + "step": 146400 + }, + { + "epoch": 11.440843420538853, + "grad_norm": 1.1228785514831543, + "learning_rate": 1.772648690894881e-05, + "loss": 0.2394, + "step": 146500 + }, + { + "epoch": 11.448652869972667, + "grad_norm": 1.0421438217163086, + "learning_rate": 1.7724923798358734e-05, + "loss": 0.2499, + "step": 146600 + }, + { + "epoch": 11.456462319406482, + "grad_norm": 0.8273714780807495, + "learning_rate": 1.772336068776866e-05, + "loss": 0.2457, + "step": 146700 + }, + { + "epoch": 11.464271768840296, + "grad_norm": 0.8781546950340271, + "learning_rate": 1.7721797577178586e-05, + "loss": 0.2337, + "step": 146800 + }, + { + "epoch": 11.472081218274111, + "grad_norm": 0.7646591067314148, + "learning_rate": 1.7720234466588512e-05, + "loss": 0.2365, + "step": 146900 + }, + { + "epoch": 11.479890667707927, + "grad_norm": 1.0014855861663818, + "learning_rate": 1.7718671355998438e-05, + "loss": 0.2363, + "step": 147000 + }, + { + "epoch": 11.487700117141742, + "grad_norm": 1.2053076028823853, + "learning_rate": 1.7717108245408364e-05, + "loss": 0.2348, + "step": 147100 + }, + { + "epoch": 11.495509566575556, + "grad_norm": 1.017988920211792, + "learning_rate": 1.771554513481829e-05, + "loss": 0.2365, + "step": 147200 + }, + { + "epoch": 11.503319016009371, + "grad_norm": 1.1997560262680054, + "learning_rate": 1.7713997655334116e-05, + "loss": 0.2441, + "step": 147300 + }, + { + "epoch": 11.511128465443186, + "grad_norm": 0.6547896265983582, + "learning_rate": 1.7712434544744042e-05, + "loss": 0.2363, + "step": 147400 + }, + { + "epoch": 11.518937914877002, + "grad_norm": 0.9807236790657043, + "learning_rate": 1.7710871434153968e-05, + "loss": 0.2358, + "step": 147500 + }, + { + "epoch": 11.526747364310816, + "grad_norm": 0.893150269985199, + "learning_rate": 1.7709308323563894e-05, + "loss": 0.2243, + "step": 147600 + }, + { + "epoch": 11.534556813744631, + "grad_norm": 1.184472680091858, + "learning_rate": 1.770774521297382e-05, + "loss": 0.2397, + "step": 147700 + }, + { + "epoch": 11.542366263178446, + "grad_norm": 1.0924776792526245, + "learning_rate": 1.7706182102383743e-05, + "loss": 0.2301, + "step": 147800 + }, + { + "epoch": 11.55017571261226, + "grad_norm": 0.9261873960494995, + "learning_rate": 1.7704618991793672e-05, + "loss": 0.2368, + "step": 147900 + }, + { + "epoch": 11.557985162046077, + "grad_norm": 1.007124662399292, + "learning_rate": 1.7703055881203598e-05, + "loss": 0.2295, + "step": 148000 + }, + { + "epoch": 11.565794611479891, + "grad_norm": 0.9083753228187561, + "learning_rate": 1.770149277061352e-05, + "loss": 0.2501, + "step": 148100 + }, + { + "epoch": 11.573604060913706, + "grad_norm": 0.8021575212478638, + "learning_rate": 1.7699929660023447e-05, + "loss": 0.2419, + "step": 148200 + }, + { + "epoch": 11.58141351034752, + "grad_norm": 0.8178331851959229, + "learning_rate": 1.7698366549433373e-05, + "loss": 0.2295, + "step": 148300 + }, + { + "epoch": 11.589222959781335, + "grad_norm": 0.9790313243865967, + "learning_rate": 1.76968034388433e-05, + "loss": 0.2404, + "step": 148400 + }, + { + "epoch": 11.597032409215151, + "grad_norm": 0.8207682967185974, + "learning_rate": 1.7695240328253225e-05, + "loss": 0.2338, + "step": 148500 + }, + { + "epoch": 11.604841858648966, + "grad_norm": 0.7306053638458252, + "learning_rate": 1.769367721766315e-05, + "loss": 0.2351, + "step": 148600 + }, + { + "epoch": 11.61265130808278, + "grad_norm": 0.8284056186676025, + "learning_rate": 1.7692114107073077e-05, + "loss": 0.2373, + "step": 148700 + }, + { + "epoch": 11.620460757516595, + "grad_norm": 0.7813366055488586, + "learning_rate": 1.7690550996483003e-05, + "loss": 0.2343, + "step": 148800 + }, + { + "epoch": 11.62827020695041, + "grad_norm": 1.0725852251052856, + "learning_rate": 1.768898788589293e-05, + "loss": 0.2278, + "step": 148900 + }, + { + "epoch": 11.636079656384226, + "grad_norm": 1.0066187381744385, + "learning_rate": 1.7687424775302855e-05, + "loss": 0.2341, + "step": 149000 + }, + { + "epoch": 11.64388910581804, + "grad_norm": 0.8271299600601196, + "learning_rate": 1.768586166471278e-05, + "loss": 0.2365, + "step": 149100 + }, + { + "epoch": 11.651698555251855, + "grad_norm": 1.159485936164856, + "learning_rate": 1.7684298554122704e-05, + "loss": 0.2262, + "step": 149200 + }, + { + "epoch": 11.65950800468567, + "grad_norm": 0.973098874092102, + "learning_rate": 1.768275107463853e-05, + "loss": 0.2344, + "step": 149300 + }, + { + "epoch": 11.667317454119484, + "grad_norm": 0.9480162262916565, + "learning_rate": 1.768118796404846e-05, + "loss": 0.2338, + "step": 149400 + }, + { + "epoch": 11.6751269035533, + "grad_norm": 0.5876905918121338, + "learning_rate": 1.7679624853458385e-05, + "loss": 0.2377, + "step": 149500 + }, + { + "epoch": 11.682936352987115, + "grad_norm": 1.1765408515930176, + "learning_rate": 1.7678061742868308e-05, + "loss": 0.2384, + "step": 149600 + }, + { + "epoch": 11.69074580242093, + "grad_norm": 1.0857717990875244, + "learning_rate": 1.7676498632278237e-05, + "loss": 0.2382, + "step": 149700 + }, + { + "epoch": 11.698555251854744, + "grad_norm": 0.811991810798645, + "learning_rate": 1.7674935521688163e-05, + "loss": 0.2457, + "step": 149800 + }, + { + "epoch": 11.706364701288559, + "grad_norm": 0.9020510315895081, + "learning_rate": 1.7673372411098086e-05, + "loss": 0.2152, + "step": 149900 + }, + { + "epoch": 11.714174150722375, + "grad_norm": 0.80827397108078, + "learning_rate": 1.767180930050801e-05, + "loss": 0.2355, + "step": 150000 + }, + { + "epoch": 11.72198360015619, + "grad_norm": 1.0584274530410767, + "learning_rate": 1.7670246189917938e-05, + "loss": 0.2241, + "step": 150100 + }, + { + "epoch": 11.729793049590004, + "grad_norm": 0.8974628448486328, + "learning_rate": 1.7668683079327864e-05, + "loss": 0.2401, + "step": 150200 + }, + { + "epoch": 11.737602499023819, + "grad_norm": 0.8113837838172913, + "learning_rate": 1.766711996873779e-05, + "loss": 0.2476, + "step": 150300 + }, + { + "epoch": 11.745411948457633, + "grad_norm": 0.7186545729637146, + "learning_rate": 1.7665556858147716e-05, + "loss": 0.2435, + "step": 150400 + }, + { + "epoch": 11.75322139789145, + "grad_norm": 1.1256885528564453, + "learning_rate": 1.766399374755764e-05, + "loss": 0.2272, + "step": 150500 + }, + { + "epoch": 11.761030847325264, + "grad_norm": 0.9910681843757629, + "learning_rate": 1.7662430636967568e-05, + "loss": 0.2294, + "step": 150600 + }, + { + "epoch": 11.768840296759079, + "grad_norm": 1.2361962795257568, + "learning_rate": 1.7660867526377494e-05, + "loss": 0.2423, + "step": 150700 + }, + { + "epoch": 11.776649746192893, + "grad_norm": 1.0611270666122437, + "learning_rate": 1.765930441578742e-05, + "loss": 0.2278, + "step": 150800 + }, + { + "epoch": 11.784459195626708, + "grad_norm": 0.6779769062995911, + "learning_rate": 1.7657741305197346e-05, + "loss": 0.2232, + "step": 150900 + }, + { + "epoch": 11.792268645060524, + "grad_norm": 1.2255175113677979, + "learning_rate": 1.765617819460727e-05, + "loss": 0.2333, + "step": 151000 + }, + { + "epoch": 11.800078094494339, + "grad_norm": 1.2015628814697266, + "learning_rate": 1.7654615084017194e-05, + "loss": 0.2352, + "step": 151100 + }, + { + "epoch": 11.807887543928153, + "grad_norm": 0.9914200305938721, + "learning_rate": 1.7653051973427124e-05, + "loss": 0.2326, + "step": 151200 + }, + { + "epoch": 11.815696993361968, + "grad_norm": 0.9092264175415039, + "learning_rate": 1.765150449394295e-05, + "loss": 0.2251, + "step": 151300 + }, + { + "epoch": 11.823506442795782, + "grad_norm": 0.6066728234291077, + "learning_rate": 1.7649941383352872e-05, + "loss": 0.2274, + "step": 151400 + }, + { + "epoch": 11.831315892229597, + "grad_norm": 1.1375062465667725, + "learning_rate": 1.76483782727628e-05, + "loss": 0.231, + "step": 151500 + }, + { + "epoch": 11.839125341663413, + "grad_norm": 0.839840292930603, + "learning_rate": 1.7646815162172724e-05, + "loss": 0.2291, + "step": 151600 + }, + { + "epoch": 11.846934791097228, + "grad_norm": 0.7539154887199402, + "learning_rate": 1.764525205158265e-05, + "loss": 0.2194, + "step": 151700 + }, + { + "epoch": 11.854744240531042, + "grad_norm": 1.1252955198287964, + "learning_rate": 1.7643688940992576e-05, + "loss": 0.2275, + "step": 151800 + }, + { + "epoch": 11.862553689964857, + "grad_norm": 0.7381853461265564, + "learning_rate": 1.7642125830402502e-05, + "loss": 0.2287, + "step": 151900 + }, + { + "epoch": 11.870363139398673, + "grad_norm": 1.1151509284973145, + "learning_rate": 1.764056271981243e-05, + "loss": 0.2281, + "step": 152000 + }, + { + "epoch": 11.878172588832488, + "grad_norm": 0.8168774247169495, + "learning_rate": 1.7638999609222354e-05, + "loss": 0.2151, + "step": 152100 + }, + { + "epoch": 11.885982038266302, + "grad_norm": 1.0729039907455444, + "learning_rate": 1.763743649863228e-05, + "loss": 0.2257, + "step": 152200 + }, + { + "epoch": 11.893791487700117, + "grad_norm": 0.9500818848609924, + "learning_rate": 1.7635873388042206e-05, + "loss": 0.2189, + "step": 152300 + }, + { + "epoch": 11.901600937133932, + "grad_norm": 1.022155523300171, + "learning_rate": 1.7634310277452132e-05, + "loss": 0.2375, + "step": 152400 + }, + { + "epoch": 11.909410386567746, + "grad_norm": 1.272755742073059, + "learning_rate": 1.7632747166862055e-05, + "loss": 0.2313, + "step": 152500 + }, + { + "epoch": 11.917219836001562, + "grad_norm": 0.7790395617485046, + "learning_rate": 1.763118405627198e-05, + "loss": 0.2234, + "step": 152600 + }, + { + "epoch": 11.925029285435377, + "grad_norm": 0.9795846343040466, + "learning_rate": 1.762962094568191e-05, + "loss": 0.2301, + "step": 152700 + }, + { + "epoch": 11.932838734869192, + "grad_norm": 0.6616389155387878, + "learning_rate": 1.7628057835091833e-05, + "loss": 0.2275, + "step": 152800 + }, + { + "epoch": 11.940648184303006, + "grad_norm": 0.7695783376693726, + "learning_rate": 1.762649472450176e-05, + "loss": 0.2323, + "step": 152900 + }, + { + "epoch": 11.948457633736822, + "grad_norm": 0.9902390241622925, + "learning_rate": 1.7624931613911685e-05, + "loss": 0.2335, + "step": 153000 + }, + { + "epoch": 11.956267083170637, + "grad_norm": 0.9866804480552673, + "learning_rate": 1.762336850332161e-05, + "loss": 0.2354, + "step": 153100 + }, + { + "epoch": 11.964076532604452, + "grad_norm": 0.9206321835517883, + "learning_rate": 1.7621805392731537e-05, + "loss": 0.2289, + "step": 153200 + }, + { + "epoch": 11.971885982038266, + "grad_norm": 1.0868662595748901, + "learning_rate": 1.7620242282141463e-05, + "loss": 0.2192, + "step": 153300 + }, + { + "epoch": 11.97969543147208, + "grad_norm": 1.080969214439392, + "learning_rate": 1.761869480265729e-05, + "loss": 0.2268, + "step": 153400 + }, + { + "epoch": 11.987504880905895, + "grad_norm": 0.9495891332626343, + "learning_rate": 1.7617131692067215e-05, + "loss": 0.2248, + "step": 153500 + }, + { + "epoch": 11.995314330339712, + "grad_norm": 0.6562321782112122, + "learning_rate": 1.761556858147714e-05, + "loss": 0.2279, + "step": 153600 + }, + { + "epoch": 12.003123779773526, + "grad_norm": 0.9093583822250366, + "learning_rate": 1.7614005470887067e-05, + "loss": 0.2216, + "step": 153700 + }, + { + "epoch": 12.01093322920734, + "grad_norm": 0.8424447178840637, + "learning_rate": 1.7612442360296993e-05, + "loss": 0.2312, + "step": 153800 + }, + { + "epoch": 12.018742678641155, + "grad_norm": 0.9770123362541199, + "learning_rate": 1.761087924970692e-05, + "loss": 0.2283, + "step": 153900 + }, + { + "epoch": 12.02655212807497, + "grad_norm": 1.1255728006362915, + "learning_rate": 1.7609316139116842e-05, + "loss": 0.2344, + "step": 154000 + }, + { + "epoch": 12.034361577508786, + "grad_norm": 1.158980369567871, + "learning_rate": 1.760775302852677e-05, + "loss": 0.2402, + "step": 154100 + }, + { + "epoch": 12.0421710269426, + "grad_norm": 0.8753023147583008, + "learning_rate": 1.7606189917936697e-05, + "loss": 0.2283, + "step": 154200 + }, + { + "epoch": 12.049980476376415, + "grad_norm": 1.185396671295166, + "learning_rate": 1.760462680734662e-05, + "loss": 0.2365, + "step": 154300 + }, + { + "epoch": 12.05778992581023, + "grad_norm": 0.9084982872009277, + "learning_rate": 1.7603063696756546e-05, + "loss": 0.2124, + "step": 154400 + }, + { + "epoch": 12.065599375244044, + "grad_norm": 0.947121262550354, + "learning_rate": 1.7601500586166472e-05, + "loss": 0.2329, + "step": 154500 + }, + { + "epoch": 12.07340882467786, + "grad_norm": 0.9727124571800232, + "learning_rate": 1.7599937475576398e-05, + "loss": 0.2247, + "step": 154600 + }, + { + "epoch": 12.081218274111675, + "grad_norm": 0.99271559715271, + "learning_rate": 1.7598374364986324e-05, + "loss": 0.221, + "step": 154700 + }, + { + "epoch": 12.08902772354549, + "grad_norm": 1.1732423305511475, + "learning_rate": 1.759681125439625e-05, + "loss": 0.2129, + "step": 154800 + }, + { + "epoch": 12.096837172979304, + "grad_norm": 0.9106228351593018, + "learning_rate": 1.7595248143806176e-05, + "loss": 0.2274, + "step": 154900 + }, + { + "epoch": 12.104646622413119, + "grad_norm": 1.0971242189407349, + "learning_rate": 1.7593685033216102e-05, + "loss": 0.2309, + "step": 155000 + }, + { + "epoch": 12.112456071846935, + "grad_norm": 1.2554278373718262, + "learning_rate": 1.7592121922626028e-05, + "loss": 0.2387, + "step": 155100 + }, + { + "epoch": 12.12026552128075, + "grad_norm": 0.8720566034317017, + "learning_rate": 1.7590558812035954e-05, + "loss": 0.2336, + "step": 155200 + }, + { + "epoch": 12.128074970714565, + "grad_norm": 0.8539800643920898, + "learning_rate": 1.758899570144588e-05, + "loss": 0.2296, + "step": 155300 + }, + { + "epoch": 12.135884420148379, + "grad_norm": 0.9018562436103821, + "learning_rate": 1.7587448221961706e-05, + "loss": 0.2304, + "step": 155400 + }, + { + "epoch": 12.143693869582194, + "grad_norm": 1.048981785774231, + "learning_rate": 1.758588511137163e-05, + "loss": 0.2176, + "step": 155500 + }, + { + "epoch": 12.15150331901601, + "grad_norm": 1.3624159097671509, + "learning_rate": 1.7584322000781558e-05, + "loss": 0.231, + "step": 155600 + }, + { + "epoch": 12.159312768449825, + "grad_norm": 0.9861006736755371, + "learning_rate": 1.7582758890191484e-05, + "loss": 0.2276, + "step": 155700 + }, + { + "epoch": 12.16712221788364, + "grad_norm": 0.971916139125824, + "learning_rate": 1.7581195779601406e-05, + "loss": 0.2258, + "step": 155800 + }, + { + "epoch": 12.174931667317454, + "grad_norm": 1.1522843837738037, + "learning_rate": 1.7579632669011336e-05, + "loss": 0.2222, + "step": 155900 + }, + { + "epoch": 12.182741116751268, + "grad_norm": 1.2297567129135132, + "learning_rate": 1.757806955842126e-05, + "loss": 0.2258, + "step": 156000 + }, + { + "epoch": 12.190550566185085, + "grad_norm": 0.9804288148880005, + "learning_rate": 1.7576506447831185e-05, + "loss": 0.2233, + "step": 156100 + }, + { + "epoch": 12.1983600156189, + "grad_norm": 0.9976232647895813, + "learning_rate": 1.757494333724111e-05, + "loss": 0.2233, + "step": 156200 + }, + { + "epoch": 12.206169465052714, + "grad_norm": 0.7635705471038818, + "learning_rate": 1.7573380226651037e-05, + "loss": 0.2098, + "step": 156300 + }, + { + "epoch": 12.213978914486528, + "grad_norm": 0.8271148800849915, + "learning_rate": 1.7571817116060963e-05, + "loss": 0.2262, + "step": 156400 + }, + { + "epoch": 12.221788363920343, + "grad_norm": 0.7868706583976746, + "learning_rate": 1.757025400547089e-05, + "loss": 0.2346, + "step": 156500 + }, + { + "epoch": 12.22959781335416, + "grad_norm": 1.131128191947937, + "learning_rate": 1.7568690894880815e-05, + "loss": 0.2226, + "step": 156600 + }, + { + "epoch": 12.237407262787974, + "grad_norm": 1.0516031980514526, + "learning_rate": 1.756712778429074e-05, + "loss": 0.2196, + "step": 156700 + }, + { + "epoch": 12.245216712221788, + "grad_norm": 0.8659607172012329, + "learning_rate": 1.7565564673700667e-05, + "loss": 0.2246, + "step": 156800 + }, + { + "epoch": 12.253026161655603, + "grad_norm": 0.9379199147224426, + "learning_rate": 1.7564001563110593e-05, + "loss": 0.2272, + "step": 156900 + }, + { + "epoch": 12.260835611089417, + "grad_norm": 1.2084906101226807, + "learning_rate": 1.756243845252052e-05, + "loss": 0.2239, + "step": 157000 + }, + { + "epoch": 12.268645060523234, + "grad_norm": 1.0007749795913696, + "learning_rate": 1.7560875341930445e-05, + "loss": 0.2239, + "step": 157100 + }, + { + "epoch": 12.276454509957048, + "grad_norm": 0.93492591381073, + "learning_rate": 1.7559312231340367e-05, + "loss": 0.2309, + "step": 157200 + }, + { + "epoch": 12.284263959390863, + "grad_norm": 1.4212570190429688, + "learning_rate": 1.7557749120750293e-05, + "loss": 0.2209, + "step": 157300 + }, + { + "epoch": 12.292073408824677, + "grad_norm": 1.0967302322387695, + "learning_rate": 1.7556186010160223e-05, + "loss": 0.2219, + "step": 157400 + }, + { + "epoch": 12.299882858258492, + "grad_norm": 0.8859778046607971, + "learning_rate": 1.755463853067605e-05, + "loss": 0.2353, + "step": 157500 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 1.1520426273345947, + "learning_rate": 1.755307542008597e-05, + "loss": 0.2236, + "step": 157600 + }, + { + "epoch": 12.315501757126123, + "grad_norm": 0.8958144187927246, + "learning_rate": 1.7551512309495897e-05, + "loss": 0.2347, + "step": 157700 + }, + { + "epoch": 12.323311206559938, + "grad_norm": 0.8936724066734314, + "learning_rate": 1.7549949198905823e-05, + "loss": 0.2358, + "step": 157800 + }, + { + "epoch": 12.331120655993752, + "grad_norm": 1.011566400527954, + "learning_rate": 1.754838608831575e-05, + "loss": 0.2255, + "step": 157900 + }, + { + "epoch": 12.338930105427567, + "grad_norm": 0.8371241092681885, + "learning_rate": 1.7546822977725675e-05, + "loss": 0.2191, + "step": 158000 + }, + { + "epoch": 12.346739554861383, + "grad_norm": 0.837230920791626, + "learning_rate": 1.75452598671356e-05, + "loss": 0.2384, + "step": 158100 + }, + { + "epoch": 12.354549004295198, + "grad_norm": 0.9873660802841187, + "learning_rate": 1.7543696756545527e-05, + "loss": 0.2239, + "step": 158200 + }, + { + "epoch": 12.362358453729012, + "grad_norm": 1.1262012720108032, + "learning_rate": 1.7542133645955453e-05, + "loss": 0.2148, + "step": 158300 + }, + { + "epoch": 12.370167903162827, + "grad_norm": 0.9833094477653503, + "learning_rate": 1.754057053536538e-05, + "loss": 0.2182, + "step": 158400 + }, + { + "epoch": 12.377977352596641, + "grad_norm": 0.7416759729385376, + "learning_rate": 1.7539007424775305e-05, + "loss": 0.2274, + "step": 158500 + }, + { + "epoch": 12.385786802030458, + "grad_norm": 1.1120948791503906, + "learning_rate": 1.753744431418523e-05, + "loss": 0.2239, + "step": 158600 + }, + { + "epoch": 12.393596251464272, + "grad_norm": 0.8697503805160522, + "learning_rate": 1.7535881203595154e-05, + "loss": 0.228, + "step": 158700 + }, + { + "epoch": 12.401405700898087, + "grad_norm": 0.9854649901390076, + "learning_rate": 1.753431809300508e-05, + "loss": 0.223, + "step": 158800 + }, + { + "epoch": 12.409215150331901, + "grad_norm": 0.6563554406166077, + "learning_rate": 1.753275498241501e-05, + "loss": 0.2192, + "step": 158900 + }, + { + "epoch": 12.417024599765716, + "grad_norm": 0.928612232208252, + "learning_rate": 1.7531191871824932e-05, + "loss": 0.233, + "step": 159000 + }, + { + "epoch": 12.424834049199532, + "grad_norm": 0.8887084722518921, + "learning_rate": 1.7529628761234858e-05, + "loss": 0.2284, + "step": 159100 + }, + { + "epoch": 12.432643498633347, + "grad_norm": 1.3052536249160767, + "learning_rate": 1.7528065650644784e-05, + "loss": 0.224, + "step": 159200 + }, + { + "epoch": 12.440452948067161, + "grad_norm": 0.9533888697624207, + "learning_rate": 1.752650254005471e-05, + "loss": 0.2255, + "step": 159300 + }, + { + "epoch": 12.448262397500976, + "grad_norm": 0.9055050611495972, + "learning_rate": 1.7524939429464636e-05, + "loss": 0.2192, + "step": 159400 + }, + { + "epoch": 12.45607184693479, + "grad_norm": 0.9132809638977051, + "learning_rate": 1.7523391949980462e-05, + "loss": 0.236, + "step": 159500 + }, + { + "epoch": 12.463881296368607, + "grad_norm": 1.0307021141052246, + "learning_rate": 1.7521828839390388e-05, + "loss": 0.2168, + "step": 159600 + }, + { + "epoch": 12.471690745802421, + "grad_norm": 0.8152646422386169, + "learning_rate": 1.7520265728800314e-05, + "loss": 0.2167, + "step": 159700 + }, + { + "epoch": 12.479500195236236, + "grad_norm": 0.7328994870185852, + "learning_rate": 1.751870261821024e-05, + "loss": 0.217, + "step": 159800 + }, + { + "epoch": 12.48730964467005, + "grad_norm": 0.9359160661697388, + "learning_rate": 1.7517139507620166e-05, + "loss": 0.2174, + "step": 159900 + }, + { + "epoch": 12.495119094103865, + "grad_norm": 0.7891941666603088, + "learning_rate": 1.7515576397030092e-05, + "loss": 0.2179, + "step": 160000 + }, + { + "epoch": 12.502928543537681, + "grad_norm": 1.0698615312576294, + "learning_rate": 1.7514013286440018e-05, + "loss": 0.224, + "step": 160100 + }, + { + "epoch": 12.510737992971496, + "grad_norm": 0.9958947896957397, + "learning_rate": 1.751245017584994e-05, + "loss": 0.227, + "step": 160200 + }, + { + "epoch": 12.51854744240531, + "grad_norm": 0.9366388320922852, + "learning_rate": 1.751088706525987e-05, + "loss": 0.2188, + "step": 160300 + }, + { + "epoch": 12.526356891839125, + "grad_norm": 0.9255606532096863, + "learning_rate": 1.7509323954669796e-05, + "loss": 0.216, + "step": 160400 + }, + { + "epoch": 12.53416634127294, + "grad_norm": 0.9308393001556396, + "learning_rate": 1.750776084407972e-05, + "loss": 0.2189, + "step": 160500 + }, + { + "epoch": 12.541975790706756, + "grad_norm": 0.9493924975395203, + "learning_rate": 1.7506197733489645e-05, + "loss": 0.2261, + "step": 160600 + }, + { + "epoch": 12.54978524014057, + "grad_norm": 1.0675753355026245, + "learning_rate": 1.750463462289957e-05, + "loss": 0.2227, + "step": 160700 + }, + { + "epoch": 12.557594689574385, + "grad_norm": 0.8917664289474487, + "learning_rate": 1.7503071512309497e-05, + "loss": 0.2234, + "step": 160800 + }, + { + "epoch": 12.5654041390082, + "grad_norm": 0.7320645451545715, + "learning_rate": 1.7501508401719423e-05, + "loss": 0.2232, + "step": 160900 + }, + { + "epoch": 12.573213588442014, + "grad_norm": 0.8894788026809692, + "learning_rate": 1.749994529112935e-05, + "loss": 0.2298, + "step": 161000 + }, + { + "epoch": 12.58102303787583, + "grad_norm": 1.3193320035934448, + "learning_rate": 1.7498382180539275e-05, + "loss": 0.2328, + "step": 161100 + }, + { + "epoch": 12.588832487309645, + "grad_norm": 1.031163215637207, + "learning_rate": 1.74968190699492e-05, + "loss": 0.2238, + "step": 161200 + }, + { + "epoch": 12.59664193674346, + "grad_norm": 1.3573243618011475, + "learning_rate": 1.7495255959359127e-05, + "loss": 0.219, + "step": 161300 + }, + { + "epoch": 12.604451386177274, + "grad_norm": 1.072387456893921, + "learning_rate": 1.7493692848769053e-05, + "loss": 0.2245, + "step": 161400 + }, + { + "epoch": 12.612260835611089, + "grad_norm": 1.095932960510254, + "learning_rate": 1.749214536928488e-05, + "loss": 0.2147, + "step": 161500 + }, + { + "epoch": 12.620070285044905, + "grad_norm": 0.8156642913818359, + "learning_rate": 1.7490582258694805e-05, + "loss": 0.2201, + "step": 161600 + }, + { + "epoch": 12.62787973447872, + "grad_norm": 1.1821234226226807, + "learning_rate": 1.7489019148104727e-05, + "loss": 0.2088, + "step": 161700 + }, + { + "epoch": 12.635689183912534, + "grad_norm": 0.9735209345817566, + "learning_rate": 1.7487456037514657e-05, + "loss": 0.2199, + "step": 161800 + }, + { + "epoch": 12.643498633346349, + "grad_norm": 1.0844048261642456, + "learning_rate": 1.7485892926924583e-05, + "loss": 0.2186, + "step": 161900 + }, + { + "epoch": 12.651308082780163, + "grad_norm": 0.933223307132721, + "learning_rate": 1.7484329816334505e-05, + "loss": 0.2125, + "step": 162000 + }, + { + "epoch": 12.65911753221398, + "grad_norm": 1.1654493808746338, + "learning_rate": 1.7482766705744435e-05, + "loss": 0.2174, + "step": 162100 + }, + { + "epoch": 12.666926981647794, + "grad_norm": 1.048599362373352, + "learning_rate": 1.7481203595154357e-05, + "loss": 0.2116, + "step": 162200 + }, + { + "epoch": 12.674736431081609, + "grad_norm": 0.8715877532958984, + "learning_rate": 1.7479640484564283e-05, + "loss": 0.2133, + "step": 162300 + }, + { + "epoch": 12.682545880515423, + "grad_norm": 1.186110019683838, + "learning_rate": 1.747807737397421e-05, + "loss": 0.2212, + "step": 162400 + }, + { + "epoch": 12.690355329949238, + "grad_norm": 0.9320825338363647, + "learning_rate": 1.7476514263384135e-05, + "loss": 0.2197, + "step": 162500 + }, + { + "epoch": 12.698164779383054, + "grad_norm": 1.2670738697052002, + "learning_rate": 1.747495115279406e-05, + "loss": 0.2136, + "step": 162600 + }, + { + "epoch": 12.705974228816869, + "grad_norm": 0.9668679237365723, + "learning_rate": 1.7473388042203987e-05, + "loss": 0.2211, + "step": 162700 + }, + { + "epoch": 12.713783678250683, + "grad_norm": 0.7561825513839722, + "learning_rate": 1.7471824931613913e-05, + "loss": 0.2226, + "step": 162800 + }, + { + "epoch": 12.721593127684498, + "grad_norm": 0.6604050397872925, + "learning_rate": 1.747026182102384e-05, + "loss": 0.2041, + "step": 162900 + }, + { + "epoch": 12.729402577118313, + "grad_norm": 1.0108921527862549, + "learning_rate": 1.7468698710433765e-05, + "loss": 0.2174, + "step": 163000 + }, + { + "epoch": 12.737212026552129, + "grad_norm": 0.9948078989982605, + "learning_rate": 1.7467135599843688e-05, + "loss": 0.2297, + "step": 163100 + }, + { + "epoch": 12.745021475985943, + "grad_norm": 1.0243088006973267, + "learning_rate": 1.7465572489253618e-05, + "loss": 0.2356, + "step": 163200 + }, + { + "epoch": 12.752830925419758, + "grad_norm": 0.9389221668243408, + "learning_rate": 1.7464009378663544e-05, + "loss": 0.2211, + "step": 163300 + }, + { + "epoch": 12.760640374853573, + "grad_norm": 0.6568807363510132, + "learning_rate": 1.7462446268073466e-05, + "loss": 0.2166, + "step": 163400 + }, + { + "epoch": 12.768449824287387, + "grad_norm": 1.0748472213745117, + "learning_rate": 1.7460883157483392e-05, + "loss": 0.2107, + "step": 163500 + }, + { + "epoch": 12.776259273721202, + "grad_norm": 0.8790175318717957, + "learning_rate": 1.745932004689332e-05, + "loss": 0.219, + "step": 163600 + }, + { + "epoch": 12.784068723155018, + "grad_norm": 1.250709891319275, + "learning_rate": 1.7457772567409147e-05, + "loss": 0.2128, + "step": 163700 + }, + { + "epoch": 12.791878172588833, + "grad_norm": 0.8646677732467651, + "learning_rate": 1.745620945681907e-05, + "loss": 0.2218, + "step": 163800 + }, + { + "epoch": 12.799687622022647, + "grad_norm": 1.0469748973846436, + "learning_rate": 1.7454646346228996e-05, + "loss": 0.2282, + "step": 163900 + }, + { + "epoch": 12.807497071456462, + "grad_norm": 1.2025775909423828, + "learning_rate": 1.7453083235638922e-05, + "loss": 0.2204, + "step": 164000 + }, + { + "epoch": 12.815306520890278, + "grad_norm": 1.0595488548278809, + "learning_rate": 1.7451520125048848e-05, + "loss": 0.2186, + "step": 164100 + }, + { + "epoch": 12.823115970324093, + "grad_norm": 0.8982226848602295, + "learning_rate": 1.7449957014458774e-05, + "loss": 0.2278, + "step": 164200 + }, + { + "epoch": 12.830925419757907, + "grad_norm": 0.9295059442520142, + "learning_rate": 1.74483939038687e-05, + "loss": 0.2219, + "step": 164300 + }, + { + "epoch": 12.838734869191722, + "grad_norm": 0.8940107822418213, + "learning_rate": 1.7446830793278626e-05, + "loss": 0.2205, + "step": 164400 + }, + { + "epoch": 12.846544318625536, + "grad_norm": 0.9545173048973083, + "learning_rate": 1.7445267682688552e-05, + "loss": 0.2123, + "step": 164500 + }, + { + "epoch": 12.854353768059351, + "grad_norm": 0.8928143382072449, + "learning_rate": 1.7443704572098478e-05, + "loss": 0.2182, + "step": 164600 + }, + { + "epoch": 12.862163217493167, + "grad_norm": 0.9669292569160461, + "learning_rate": 1.7442141461508404e-05, + "loss": 0.2178, + "step": 164700 + }, + { + "epoch": 12.869972666926982, + "grad_norm": 0.951994776725769, + "learning_rate": 1.744057835091833e-05, + "loss": 0.2152, + "step": 164800 + }, + { + "epoch": 12.877782116360796, + "grad_norm": 0.8024113774299622, + "learning_rate": 1.7439015240328253e-05, + "loss": 0.2106, + "step": 164900 + }, + { + "epoch": 12.885591565794611, + "grad_norm": 0.8423495292663574, + "learning_rate": 1.743745212973818e-05, + "loss": 0.2171, + "step": 165000 + }, + { + "epoch": 12.893401015228427, + "grad_norm": 0.7901012897491455, + "learning_rate": 1.7435889019148108e-05, + "loss": 0.2083, + "step": 165100 + }, + { + "epoch": 12.901210464662242, + "grad_norm": 0.8986498713493347, + "learning_rate": 1.743432590855803e-05, + "loss": 0.2101, + "step": 165200 + }, + { + "epoch": 12.909019914096056, + "grad_norm": 1.0257128477096558, + "learning_rate": 1.7432762797967957e-05, + "loss": 0.2046, + "step": 165300 + }, + { + "epoch": 12.916829363529871, + "grad_norm": 0.7271884083747864, + "learning_rate": 1.7431199687377883e-05, + "loss": 0.214, + "step": 165400 + }, + { + "epoch": 12.924638812963686, + "grad_norm": 0.9488785266876221, + "learning_rate": 1.742963657678781e-05, + "loss": 0.2165, + "step": 165500 + }, + { + "epoch": 12.9324482623975, + "grad_norm": 0.999997615814209, + "learning_rate": 1.7428073466197735e-05, + "loss": 0.2113, + "step": 165600 + }, + { + "epoch": 12.940257711831316, + "grad_norm": 0.7397522926330566, + "learning_rate": 1.742652598671356e-05, + "loss": 0.2191, + "step": 165700 + }, + { + "epoch": 12.948067161265131, + "grad_norm": 1.1001430749893188, + "learning_rate": 1.7424962876123487e-05, + "loss": 0.2209, + "step": 165800 + }, + { + "epoch": 12.955876610698946, + "grad_norm": 1.1696312427520752, + "learning_rate": 1.7423399765533413e-05, + "loss": 0.2134, + "step": 165900 + }, + { + "epoch": 12.96368606013276, + "grad_norm": 0.941423237323761, + "learning_rate": 1.742183665494334e-05, + "loss": 0.2208, + "step": 166000 + }, + { + "epoch": 12.971495509566576, + "grad_norm": 0.99091637134552, + "learning_rate": 1.7420273544353265e-05, + "loss": 0.2233, + "step": 166100 + }, + { + "epoch": 12.979304959000391, + "grad_norm": 0.9629716277122498, + "learning_rate": 1.741871043376319e-05, + "loss": 0.2074, + "step": 166200 + }, + { + "epoch": 12.987114408434206, + "grad_norm": 0.8541843295097351, + "learning_rate": 1.7417147323173117e-05, + "loss": 0.2205, + "step": 166300 + }, + { + "epoch": 12.99492385786802, + "grad_norm": 0.7128260135650635, + "learning_rate": 1.741558421258304e-05, + "loss": 0.2142, + "step": 166400 + }, + { + "epoch": 13.002733307301835, + "grad_norm": 0.8291605114936829, + "learning_rate": 1.741402110199297e-05, + "loss": 0.2069, + "step": 166500 + }, + { + "epoch": 13.01054275673565, + "grad_norm": 1.0823742151260376, + "learning_rate": 1.7412457991402895e-05, + "loss": 0.2098, + "step": 166600 + }, + { + "epoch": 13.018352206169466, + "grad_norm": 1.073767900466919, + "learning_rate": 1.7410894880812818e-05, + "loss": 0.2164, + "step": 166700 + }, + { + "epoch": 13.02616165560328, + "grad_norm": 0.6933673024177551, + "learning_rate": 1.7409331770222744e-05, + "loss": 0.2101, + "step": 166800 + }, + { + "epoch": 13.033971105037095, + "grad_norm": 0.9099040031433105, + "learning_rate": 1.740776865963267e-05, + "loss": 0.2063, + "step": 166900 + }, + { + "epoch": 13.04178055447091, + "grad_norm": 0.7376136183738708, + "learning_rate": 1.7406205549042596e-05, + "loss": 0.2168, + "step": 167000 + }, + { + "epoch": 13.049590003904724, + "grad_norm": 0.9267070293426514, + "learning_rate": 1.740464243845252e-05, + "loss": 0.2157, + "step": 167100 + }, + { + "epoch": 13.05739945333854, + "grad_norm": 1.0121701955795288, + "learning_rate": 1.7403079327862448e-05, + "loss": 0.2148, + "step": 167200 + }, + { + "epoch": 13.065208902772355, + "grad_norm": 1.0252717733383179, + "learning_rate": 1.7401516217272374e-05, + "loss": 0.2113, + "step": 167300 + }, + { + "epoch": 13.07301835220617, + "grad_norm": 1.3679596185684204, + "learning_rate": 1.73999531066823e-05, + "loss": 0.2163, + "step": 167400 + }, + { + "epoch": 13.080827801639984, + "grad_norm": 0.8503546118736267, + "learning_rate": 1.7398389996092226e-05, + "loss": 0.1982, + "step": 167500 + }, + { + "epoch": 13.088637251073798, + "grad_norm": 1.2549128532409668, + "learning_rate": 1.739682688550215e-05, + "loss": 0.2172, + "step": 167600 + }, + { + "epoch": 13.096446700507615, + "grad_norm": 1.0882325172424316, + "learning_rate": 1.7395279406017978e-05, + "loss": 0.2159, + "step": 167700 + }, + { + "epoch": 13.10425614994143, + "grad_norm": 0.9475435018539429, + "learning_rate": 1.7393716295427904e-05, + "loss": 0.2076, + "step": 167800 + }, + { + "epoch": 13.112065599375244, + "grad_norm": 0.6309915781021118, + "learning_rate": 1.7392153184837826e-05, + "loss": 0.2173, + "step": 167900 + }, + { + "epoch": 13.119875048809059, + "grad_norm": 0.9528407454490662, + "learning_rate": 1.7390590074247756e-05, + "loss": 0.2087, + "step": 168000 + }, + { + "epoch": 13.127684498242873, + "grad_norm": 1.0685718059539795, + "learning_rate": 1.738902696365768e-05, + "loss": 0.2115, + "step": 168100 + }, + { + "epoch": 13.13549394767669, + "grad_norm": 0.7884103655815125, + "learning_rate": 1.7387463853067604e-05, + "loss": 0.2054, + "step": 168200 + }, + { + "epoch": 13.143303397110504, + "grad_norm": 0.9388546347618103, + "learning_rate": 1.7385900742477534e-05, + "loss": 0.2126, + "step": 168300 + }, + { + "epoch": 13.151112846544319, + "grad_norm": 0.8609205484390259, + "learning_rate": 1.7384337631887456e-05, + "loss": 0.2131, + "step": 168400 + }, + { + "epoch": 13.158922295978133, + "grad_norm": 0.9092925190925598, + "learning_rate": 1.7382774521297382e-05, + "loss": 0.2202, + "step": 168500 + }, + { + "epoch": 13.166731745411948, + "grad_norm": 1.0586522817611694, + "learning_rate": 1.738121141070731e-05, + "loss": 0.2048, + "step": 168600 + }, + { + "epoch": 13.174541194845764, + "grad_norm": 1.3472049236297607, + "learning_rate": 1.7379648300117234e-05, + "loss": 0.2166, + "step": 168700 + }, + { + "epoch": 13.182350644279579, + "grad_norm": 0.927558183670044, + "learning_rate": 1.737808518952716e-05, + "loss": 0.2226, + "step": 168800 + }, + { + "epoch": 13.190160093713393, + "grad_norm": 0.9628226161003113, + "learning_rate": 1.7376522078937086e-05, + "loss": 0.2175, + "step": 168900 + }, + { + "epoch": 13.197969543147208, + "grad_norm": 1.1431093215942383, + "learning_rate": 1.7374958968347012e-05, + "loss": 0.2094, + "step": 169000 + }, + { + "epoch": 13.205778992581022, + "grad_norm": 1.1498020887374878, + "learning_rate": 1.737339585775694e-05, + "loss": 0.2116, + "step": 169100 + }, + { + "epoch": 13.213588442014839, + "grad_norm": 0.9198805093765259, + "learning_rate": 1.7371832747166864e-05, + "loss": 0.2072, + "step": 169200 + }, + { + "epoch": 13.221397891448653, + "grad_norm": 1.065612554550171, + "learning_rate": 1.7370269636576787e-05, + "loss": 0.2149, + "step": 169300 + }, + { + "epoch": 13.229207340882468, + "grad_norm": 0.9671112895011902, + "learning_rate": 1.7368706525986716e-05, + "loss": 0.2111, + "step": 169400 + }, + { + "epoch": 13.237016790316282, + "grad_norm": 0.7491472363471985, + "learning_rate": 1.7367143415396642e-05, + "loss": 0.2146, + "step": 169500 + }, + { + "epoch": 13.244826239750097, + "grad_norm": 0.9440492391586304, + "learning_rate": 1.7365580304806565e-05, + "loss": 0.2085, + "step": 169600 + }, + { + "epoch": 13.252635689183913, + "grad_norm": 0.8738968372344971, + "learning_rate": 1.736403282532239e-05, + "loss": 0.2155, + "step": 169700 + }, + { + "epoch": 13.260445138617728, + "grad_norm": 0.9270493984222412, + "learning_rate": 1.736246971473232e-05, + "loss": 0.203, + "step": 169800 + }, + { + "epoch": 13.268254588051542, + "grad_norm": 0.8990687131881714, + "learning_rate": 1.7360906604142246e-05, + "loss": 0.2134, + "step": 169900 + }, + { + "epoch": 13.276064037485357, + "grad_norm": 0.8390234112739563, + "learning_rate": 1.735934349355217e-05, + "loss": 0.2118, + "step": 170000 + }, + { + "epoch": 13.283873486919171, + "grad_norm": 0.7005640864372253, + "learning_rate": 1.7357780382962095e-05, + "loss": 0.2078, + "step": 170100 + }, + { + "epoch": 13.291682936352988, + "grad_norm": 0.888175904750824, + "learning_rate": 1.735621727237202e-05, + "loss": 0.2136, + "step": 170200 + }, + { + "epoch": 13.299492385786802, + "grad_norm": 0.9876678586006165, + "learning_rate": 1.7354654161781947e-05, + "loss": 0.2139, + "step": 170300 + }, + { + "epoch": 13.307301835220617, + "grad_norm": 1.1229948997497559, + "learning_rate": 1.7353091051191873e-05, + "loss": 0.2026, + "step": 170400 + }, + { + "epoch": 13.315111284654432, + "grad_norm": 0.7431442737579346, + "learning_rate": 1.73515279406018e-05, + "loss": 0.2194, + "step": 170500 + }, + { + "epoch": 13.322920734088246, + "grad_norm": 0.8044176697731018, + "learning_rate": 1.7349964830011725e-05, + "loss": 0.228, + "step": 170600 + }, + { + "epoch": 13.330730183522062, + "grad_norm": 0.8355295658111572, + "learning_rate": 1.734840171942165e-05, + "loss": 0.2122, + "step": 170700 + }, + { + "epoch": 13.338539632955877, + "grad_norm": 1.1031625270843506, + "learning_rate": 1.7346838608831577e-05, + "loss": 0.2214, + "step": 170800 + }, + { + "epoch": 13.346349082389692, + "grad_norm": 1.0112760066986084, + "learning_rate": 1.7345275498241503e-05, + "loss": 0.2044, + "step": 170900 + }, + { + "epoch": 13.354158531823506, + "grad_norm": 0.8478233218193054, + "learning_rate": 1.734371238765143e-05, + "loss": 0.2099, + "step": 171000 + }, + { + "epoch": 13.36196798125732, + "grad_norm": 0.5524217486381531, + "learning_rate": 1.7342149277061352e-05, + "loss": 0.2, + "step": 171100 + }, + { + "epoch": 13.369777430691137, + "grad_norm": 1.041663646697998, + "learning_rate": 1.7340586166471278e-05, + "loss": 0.2155, + "step": 171200 + }, + { + "epoch": 13.377586880124952, + "grad_norm": 0.8660856485366821, + "learning_rate": 1.7339023055881207e-05, + "loss": 0.211, + "step": 171300 + }, + { + "epoch": 13.385396329558766, + "grad_norm": 1.2220951318740845, + "learning_rate": 1.733745994529113e-05, + "loss": 0.2081, + "step": 171400 + }, + { + "epoch": 13.39320577899258, + "grad_norm": 0.9403126835823059, + "learning_rate": 1.7335896834701056e-05, + "loss": 0.2196, + "step": 171500 + }, + { + "epoch": 13.401015228426395, + "grad_norm": 0.7787653803825378, + "learning_rate": 1.7334333724110982e-05, + "loss": 0.1999, + "step": 171600 + }, + { + "epoch": 13.408824677860212, + "grad_norm": 0.9129199981689453, + "learning_rate": 1.7332770613520908e-05, + "loss": 0.1964, + "step": 171700 + }, + { + "epoch": 13.416634127294026, + "grad_norm": 0.8007563352584839, + "learning_rate": 1.7331223134036734e-05, + "loss": 0.208, + "step": 171800 + }, + { + "epoch": 13.42444357672784, + "grad_norm": 0.8194550275802612, + "learning_rate": 1.732966002344666e-05, + "loss": 0.2093, + "step": 171900 + }, + { + "epoch": 13.432253026161655, + "grad_norm": 1.066786289215088, + "learning_rate": 1.7328096912856586e-05, + "loss": 0.2164, + "step": 172000 + }, + { + "epoch": 13.44006247559547, + "grad_norm": 1.1544183492660522, + "learning_rate": 1.7326533802266512e-05, + "loss": 0.2145, + "step": 172100 + }, + { + "epoch": 13.447871925029286, + "grad_norm": 0.9140012860298157, + "learning_rate": 1.7324970691676438e-05, + "loss": 0.2127, + "step": 172200 + }, + { + "epoch": 13.4556813744631, + "grad_norm": 1.0456069707870483, + "learning_rate": 1.7323407581086364e-05, + "loss": 0.2083, + "step": 172300 + }, + { + "epoch": 13.463490823896915, + "grad_norm": 1.0170204639434814, + "learning_rate": 1.732184447049629e-05, + "loss": 0.2024, + "step": 172400 + }, + { + "epoch": 13.47130027333073, + "grad_norm": 0.8396112322807312, + "learning_rate": 1.7320281359906216e-05, + "loss": 0.2161, + "step": 172500 + }, + { + "epoch": 13.479109722764544, + "grad_norm": 1.0795584917068481, + "learning_rate": 1.731871824931614e-05, + "loss": 0.2088, + "step": 172600 + }, + { + "epoch": 13.48691917219836, + "grad_norm": 1.0431060791015625, + "learning_rate": 1.7317155138726068e-05, + "loss": 0.2095, + "step": 172700 + }, + { + "epoch": 13.494728621632175, + "grad_norm": 0.9751796722412109, + "learning_rate": 1.7315592028135994e-05, + "loss": 0.2017, + "step": 172800 + }, + { + "epoch": 13.50253807106599, + "grad_norm": 0.9349498152732849, + "learning_rate": 1.7314028917545916e-05, + "loss": 0.2115, + "step": 172900 + }, + { + "epoch": 13.510347520499804, + "grad_norm": 0.7299902439117432, + "learning_rate": 1.7312465806955842e-05, + "loss": 0.2053, + "step": 173000 + }, + { + "epoch": 13.518156969933619, + "grad_norm": 0.9909356832504272, + "learning_rate": 1.731090269636577e-05, + "loss": 0.2127, + "step": 173100 + }, + { + "epoch": 13.525966419367435, + "grad_norm": 0.812975287437439, + "learning_rate": 1.7309339585775694e-05, + "loss": 0.2135, + "step": 173200 + }, + { + "epoch": 13.53377586880125, + "grad_norm": 0.9539850354194641, + "learning_rate": 1.730777647518562e-05, + "loss": 0.2078, + "step": 173300 + }, + { + "epoch": 13.541585318235065, + "grad_norm": 0.760218620300293, + "learning_rate": 1.7306213364595547e-05, + "loss": 0.2059, + "step": 173400 + }, + { + "epoch": 13.549394767668879, + "grad_norm": 0.8979002237319946, + "learning_rate": 1.7304650254005473e-05, + "loss": 0.2146, + "step": 173500 + }, + { + "epoch": 13.557204217102694, + "grad_norm": 1.020674228668213, + "learning_rate": 1.73030871434154e-05, + "loss": 0.2135, + "step": 173600 + }, + { + "epoch": 13.56501366653651, + "grad_norm": 0.8978680968284607, + "learning_rate": 1.7301524032825325e-05, + "loss": 0.2031, + "step": 173700 + }, + { + "epoch": 13.572823115970325, + "grad_norm": 0.8209431767463684, + "learning_rate": 1.729996092223525e-05, + "loss": 0.1995, + "step": 173800 + }, + { + "epoch": 13.580632565404139, + "grad_norm": 0.8961414098739624, + "learning_rate": 1.7298413442751077e-05, + "loss": 0.221, + "step": 173900 + }, + { + "epoch": 13.588442014837954, + "grad_norm": 0.9937573671340942, + "learning_rate": 1.7296850332161003e-05, + "loss": 0.2204, + "step": 174000 + }, + { + "epoch": 13.596251464271768, + "grad_norm": 0.9728763103485107, + "learning_rate": 1.7295287221570925e-05, + "loss": 0.2122, + "step": 174100 + }, + { + "epoch": 13.604060913705585, + "grad_norm": 0.9363887310028076, + "learning_rate": 1.7293724110980855e-05, + "loss": 0.209, + "step": 174200 + }, + { + "epoch": 13.6118703631394, + "grad_norm": 0.652712881565094, + "learning_rate": 1.729216100039078e-05, + "loss": 0.2265, + "step": 174300 + }, + { + "epoch": 13.619679812573214, + "grad_norm": 1.1150403022766113, + "learning_rate": 1.7290597889800703e-05, + "loss": 0.2013, + "step": 174400 + }, + { + "epoch": 13.627489262007028, + "grad_norm": 0.8084608912467957, + "learning_rate": 1.7289034779210633e-05, + "loss": 0.2158, + "step": 174500 + }, + { + "epoch": 13.635298711440843, + "grad_norm": 1.0631225109100342, + "learning_rate": 1.7287471668620555e-05, + "loss": 0.2073, + "step": 174600 + }, + { + "epoch": 13.64310816087466, + "grad_norm": 0.631264328956604, + "learning_rate": 1.728590855803048e-05, + "loss": 0.2201, + "step": 174700 + }, + { + "epoch": 13.650917610308474, + "grad_norm": 0.6314346194267273, + "learning_rate": 1.7284345447440407e-05, + "loss": 0.2015, + "step": 174800 + }, + { + "epoch": 13.658727059742288, + "grad_norm": 0.964005708694458, + "learning_rate": 1.7282782336850333e-05, + "loss": 0.2046, + "step": 174900 + }, + { + "epoch": 13.666536509176103, + "grad_norm": 0.6954252123832703, + "learning_rate": 1.728121922626026e-05, + "loss": 0.2129, + "step": 175000 + }, + { + "epoch": 13.674345958609917, + "grad_norm": 0.9582391381263733, + "learning_rate": 1.7279656115670185e-05, + "loss": 0.2107, + "step": 175100 + }, + { + "epoch": 13.682155408043734, + "grad_norm": 1.00692617893219, + "learning_rate": 1.727809300508011e-05, + "loss": 0.2055, + "step": 175200 + }, + { + "epoch": 13.689964857477548, + "grad_norm": 0.786759078502655, + "learning_rate": 1.7276529894490037e-05, + "loss": 0.2138, + "step": 175300 + }, + { + "epoch": 13.697774306911363, + "grad_norm": 0.9668805003166199, + "learning_rate": 1.7274966783899963e-05, + "loss": 0.2095, + "step": 175400 + }, + { + "epoch": 13.705583756345177, + "grad_norm": 0.9312834143638611, + "learning_rate": 1.7273403673309886e-05, + "loss": 0.2122, + "step": 175500 + }, + { + "epoch": 13.713393205778992, + "grad_norm": 0.785740852355957, + "learning_rate": 1.7271840562719815e-05, + "loss": 0.2113, + "step": 175600 + }, + { + "epoch": 13.721202655212807, + "grad_norm": 0.8165653347969055, + "learning_rate": 1.727027745212974e-05, + "loss": 0.2043, + "step": 175700 + }, + { + "epoch": 13.729012104646623, + "grad_norm": 0.9161164164543152, + "learning_rate": 1.7268714341539664e-05, + "loss": 0.2037, + "step": 175800 + }, + { + "epoch": 13.736821554080437, + "grad_norm": 0.8624919652938843, + "learning_rate": 1.726716686205549e-05, + "loss": 0.216, + "step": 175900 + }, + { + "epoch": 13.744631003514252, + "grad_norm": 1.1940569877624512, + "learning_rate": 1.726560375146542e-05, + "loss": 0.2025, + "step": 176000 + }, + { + "epoch": 13.752440452948067, + "grad_norm": 0.9143908023834229, + "learning_rate": 1.7264040640875345e-05, + "loss": 0.2085, + "step": 176100 + }, + { + "epoch": 13.760249902381883, + "grad_norm": 0.8099220395088196, + "learning_rate": 1.7262477530285268e-05, + "loss": 0.2092, + "step": 176200 + }, + { + "epoch": 13.768059351815698, + "grad_norm": 0.8948672413825989, + "learning_rate": 1.7260914419695194e-05, + "loss": 0.2063, + "step": 176300 + }, + { + "epoch": 13.775868801249512, + "grad_norm": 0.9727224707603455, + "learning_rate": 1.725935130910512e-05, + "loss": 0.2146, + "step": 176400 + }, + { + "epoch": 13.783678250683327, + "grad_norm": 1.0335994958877563, + "learning_rate": 1.7257788198515046e-05, + "loss": 0.206, + "step": 176500 + }, + { + "epoch": 13.791487700117141, + "grad_norm": 0.9940019845962524, + "learning_rate": 1.7256225087924972e-05, + "loss": 0.2095, + "step": 176600 + }, + { + "epoch": 13.799297149550956, + "grad_norm": 0.8024458289146423, + "learning_rate": 1.7254661977334898e-05, + "loss": 0.2072, + "step": 176700 + }, + { + "epoch": 13.807106598984772, + "grad_norm": 0.9615973830223083, + "learning_rate": 1.7253098866744824e-05, + "loss": 0.2028, + "step": 176800 + }, + { + "epoch": 13.814916048418587, + "grad_norm": 0.8130218386650085, + "learning_rate": 1.725153575615475e-05, + "loss": 0.2063, + "step": 176900 + }, + { + "epoch": 13.822725497852401, + "grad_norm": 0.9325103759765625, + "learning_rate": 1.7249972645564676e-05, + "loss": 0.1951, + "step": 177000 + }, + { + "epoch": 13.830534947286216, + "grad_norm": 0.9893394112586975, + "learning_rate": 1.7248409534974602e-05, + "loss": 0.198, + "step": 177100 + }, + { + "epoch": 13.838344396720032, + "grad_norm": 0.9419919848442078, + "learning_rate": 1.7246846424384528e-05, + "loss": 0.2198, + "step": 177200 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 0.9249516129493713, + "learning_rate": 1.724528331379445e-05, + "loss": 0.2036, + "step": 177300 + }, + { + "epoch": 13.853963295587661, + "grad_norm": 0.9058451056480408, + "learning_rate": 1.7243720203204377e-05, + "loss": 0.218, + "step": 177400 + }, + { + "epoch": 13.861772745021476, + "grad_norm": 0.9602999091148376, + "learning_rate": 1.7242157092614306e-05, + "loss": 0.2005, + "step": 177500 + }, + { + "epoch": 13.86958219445529, + "grad_norm": 0.8785629868507385, + "learning_rate": 1.724059398202423e-05, + "loss": 0.2048, + "step": 177600 + }, + { + "epoch": 13.877391643889105, + "grad_norm": 0.8119620084762573, + "learning_rate": 1.7239030871434155e-05, + "loss": 0.2025, + "step": 177700 + }, + { + "epoch": 13.885201093322921, + "grad_norm": 0.8947235941886902, + "learning_rate": 1.723746776084408e-05, + "loss": 0.2083, + "step": 177800 + }, + { + "epoch": 13.893010542756736, + "grad_norm": 0.908216655254364, + "learning_rate": 1.7235920281359907e-05, + "loss": 0.2049, + "step": 177900 + }, + { + "epoch": 13.90081999219055, + "grad_norm": 0.875116229057312, + "learning_rate": 1.7234357170769833e-05, + "loss": 0.202, + "step": 178000 + }, + { + "epoch": 13.908629441624365, + "grad_norm": 0.7890712022781372, + "learning_rate": 1.723279406017976e-05, + "loss": 0.1965, + "step": 178100 + }, + { + "epoch": 13.916438891058181, + "grad_norm": 1.0504424571990967, + "learning_rate": 1.7231230949589685e-05, + "loss": 0.2124, + "step": 178200 + }, + { + "epoch": 13.924248340491996, + "grad_norm": 0.8207637667655945, + "learning_rate": 1.722966783899961e-05, + "loss": 0.1953, + "step": 178300 + }, + { + "epoch": 13.93205778992581, + "grad_norm": 0.7486147284507751, + "learning_rate": 1.7228104728409537e-05, + "loss": 0.2018, + "step": 178400 + }, + { + "epoch": 13.939867239359625, + "grad_norm": 0.9399036169052124, + "learning_rate": 1.7226541617819463e-05, + "loss": 0.206, + "step": 178500 + }, + { + "epoch": 13.94767668879344, + "grad_norm": 0.7904515862464905, + "learning_rate": 1.722497850722939e-05, + "loss": 0.2035, + "step": 178600 + }, + { + "epoch": 13.955486138227254, + "grad_norm": 0.8712892532348633, + "learning_rate": 1.7223415396639315e-05, + "loss": 0.2155, + "step": 178700 + }, + { + "epoch": 13.96329558766107, + "grad_norm": 0.901816189289093, + "learning_rate": 1.7221852286049237e-05, + "loss": 0.2029, + "step": 178800 + }, + { + "epoch": 13.971105037094885, + "grad_norm": 1.0784202814102173, + "learning_rate": 1.7220289175459167e-05, + "loss": 0.2075, + "step": 178900 + }, + { + "epoch": 13.9789144865287, + "grad_norm": 0.973260223865509, + "learning_rate": 1.7218726064869093e-05, + "loss": 0.2054, + "step": 179000 + }, + { + "epoch": 13.986723935962514, + "grad_norm": 0.6385024785995483, + "learning_rate": 1.7217162954279015e-05, + "loss": 0.2023, + "step": 179100 + }, + { + "epoch": 13.994533385396329, + "grad_norm": 1.0969829559326172, + "learning_rate": 1.721559984368894e-05, + "loss": 0.2022, + "step": 179200 + }, + { + "epoch": 14.002342834830145, + "grad_norm": 1.21580171585083, + "learning_rate": 1.7214036733098867e-05, + "loss": 0.201, + "step": 179300 + }, + { + "epoch": 14.01015228426396, + "grad_norm": 0.9006642699241638, + "learning_rate": 1.7212473622508793e-05, + "loss": 0.2046, + "step": 179400 + }, + { + "epoch": 14.017961733697774, + "grad_norm": 0.5619805455207825, + "learning_rate": 1.721091051191872e-05, + "loss": 0.2018, + "step": 179500 + }, + { + "epoch": 14.025771183131589, + "grad_norm": 0.9529440999031067, + "learning_rate": 1.7209347401328645e-05, + "loss": 0.2052, + "step": 179600 + }, + { + "epoch": 14.033580632565403, + "grad_norm": 0.8338792324066162, + "learning_rate": 1.720778429073857e-05, + "loss": 0.2022, + "step": 179700 + }, + { + "epoch": 14.04139008199922, + "grad_norm": 0.8016043901443481, + "learning_rate": 1.7206221180148497e-05, + "loss": 0.2054, + "step": 179800 + }, + { + "epoch": 14.049199531433034, + "grad_norm": 1.037266731262207, + "learning_rate": 1.7204673700664323e-05, + "loss": 0.2003, + "step": 179900 + }, + { + "epoch": 14.057008980866849, + "grad_norm": 1.0760704278945923, + "learning_rate": 1.720311059007425e-05, + "loss": 0.2057, + "step": 180000 + }, + { + "epoch": 14.064818430300663, + "grad_norm": 0.7043364644050598, + "learning_rate": 1.7201547479484175e-05, + "loss": 0.1971, + "step": 180100 + }, + { + "epoch": 14.072627879734478, + "grad_norm": 0.9022557139396667, + "learning_rate": 1.71999843688941e-05, + "loss": 0.2057, + "step": 180200 + }, + { + "epoch": 14.080437329168294, + "grad_norm": 0.88816899061203, + "learning_rate": 1.7198421258304024e-05, + "loss": 0.2054, + "step": 180300 + }, + { + "epoch": 14.088246778602109, + "grad_norm": 0.8379324674606323, + "learning_rate": 1.7196858147713953e-05, + "loss": 0.2056, + "step": 180400 + }, + { + "epoch": 14.096056228035923, + "grad_norm": 0.9925566911697388, + "learning_rate": 1.719529503712388e-05, + "loss": 0.2089, + "step": 180500 + }, + { + "epoch": 14.103865677469738, + "grad_norm": 0.9591396450996399, + "learning_rate": 1.7193731926533802e-05, + "loss": 0.1993, + "step": 180600 + }, + { + "epoch": 14.111675126903553, + "grad_norm": 1.2258329391479492, + "learning_rate": 1.7192168815943728e-05, + "loss": 0.2083, + "step": 180700 + }, + { + "epoch": 14.119484576337369, + "grad_norm": 0.7245174050331116, + "learning_rate": 1.7190605705353654e-05, + "loss": 0.2006, + "step": 180800 + }, + { + "epoch": 14.127294025771183, + "grad_norm": 0.8938778042793274, + "learning_rate": 1.718904259476358e-05, + "loss": 0.1986, + "step": 180900 + }, + { + "epoch": 14.135103475204998, + "grad_norm": 0.8464726805686951, + "learning_rate": 1.7187479484173506e-05, + "loss": 0.1904, + "step": 181000 + }, + { + "epoch": 14.142912924638813, + "grad_norm": 0.721854567527771, + "learning_rate": 1.7185916373583432e-05, + "loss": 0.1963, + "step": 181100 + }, + { + "epoch": 14.150722374072627, + "grad_norm": 0.9772906303405762, + "learning_rate": 1.7184353262993358e-05, + "loss": 0.2054, + "step": 181200 + }, + { + "epoch": 14.158531823506443, + "grad_norm": 1.0603522062301636, + "learning_rate": 1.7182790152403284e-05, + "loss": 0.2031, + "step": 181300 + }, + { + "epoch": 14.166341272940258, + "grad_norm": 0.8204702734947205, + "learning_rate": 1.718122704181321e-05, + "loss": 0.2184, + "step": 181400 + }, + { + "epoch": 14.174150722374073, + "grad_norm": 0.8588528633117676, + "learning_rate": 1.7179663931223136e-05, + "loss": 0.1977, + "step": 181500 + }, + { + "epoch": 14.181960171807887, + "grad_norm": 1.0079927444458008, + "learning_rate": 1.7178100820633062e-05, + "loss": 0.2032, + "step": 181600 + }, + { + "epoch": 14.189769621241702, + "grad_norm": 0.7643899321556091, + "learning_rate": 1.7176537710042985e-05, + "loss": 0.1924, + "step": 181700 + }, + { + "epoch": 14.197579070675518, + "grad_norm": 1.1137841939926147, + "learning_rate": 1.7174974599452914e-05, + "loss": 0.1964, + "step": 181800 + }, + { + "epoch": 14.205388520109333, + "grad_norm": 1.1593997478485107, + "learning_rate": 1.717342711996874e-05, + "loss": 0.1979, + "step": 181900 + }, + { + "epoch": 14.213197969543147, + "grad_norm": 0.9710358381271362, + "learning_rate": 1.7171864009378666e-05, + "loss": 0.2021, + "step": 182000 + }, + { + "epoch": 14.221007418976962, + "grad_norm": 0.9757130742073059, + "learning_rate": 1.717030089878859e-05, + "loss": 0.2107, + "step": 182100 + }, + { + "epoch": 14.228816868410776, + "grad_norm": 0.9044745564460754, + "learning_rate": 1.7168737788198518e-05, + "loss": 0.2032, + "step": 182200 + }, + { + "epoch": 14.236626317844593, + "grad_norm": 0.7436084747314453, + "learning_rate": 1.716717467760844e-05, + "loss": 0.1946, + "step": 182300 + }, + { + "epoch": 14.244435767278407, + "grad_norm": 0.9272249341011047, + "learning_rate": 1.7165611567018367e-05, + "loss": 0.1938, + "step": 182400 + }, + { + "epoch": 14.252245216712222, + "grad_norm": 0.9946134090423584, + "learning_rate": 1.7164048456428293e-05, + "loss": 0.2012, + "step": 182500 + }, + { + "epoch": 14.260054666146036, + "grad_norm": 0.8200010657310486, + "learning_rate": 1.716248534583822e-05, + "loss": 0.2056, + "step": 182600 + }, + { + "epoch": 14.267864115579851, + "grad_norm": 0.9531083703041077, + "learning_rate": 1.7160922235248145e-05, + "loss": 0.2077, + "step": 182700 + }, + { + "epoch": 14.275673565013667, + "grad_norm": 1.0950546264648438, + "learning_rate": 1.715935912465807e-05, + "loss": 0.1928, + "step": 182800 + }, + { + "epoch": 14.283483014447482, + "grad_norm": 0.957729160785675, + "learning_rate": 1.7157796014067997e-05, + "loss": 0.2, + "step": 182900 + }, + { + "epoch": 14.291292463881296, + "grad_norm": 0.8982499837875366, + "learning_rate": 1.7156232903477923e-05, + "loss": 0.1979, + "step": 183000 + }, + { + "epoch": 14.299101913315111, + "grad_norm": 1.290502905845642, + "learning_rate": 1.715466979288785e-05, + "loss": 0.1959, + "step": 183100 + }, + { + "epoch": 14.306911362748925, + "grad_norm": 1.0289325714111328, + "learning_rate": 1.7153106682297775e-05, + "loss": 0.205, + "step": 183200 + }, + { + "epoch": 14.314720812182742, + "grad_norm": 0.93171626329422, + "learning_rate": 1.71515435717077e-05, + "loss": 0.1993, + "step": 183300 + }, + { + "epoch": 14.322530261616556, + "grad_norm": 0.9225603938102722, + "learning_rate": 1.7149980461117627e-05, + "loss": 0.2059, + "step": 183400 + }, + { + "epoch": 14.330339711050371, + "grad_norm": 0.908424973487854, + "learning_rate": 1.714841735052755e-05, + "loss": 0.2049, + "step": 183500 + }, + { + "epoch": 14.338149160484186, + "grad_norm": 0.7969884872436523, + "learning_rate": 1.7146854239937476e-05, + "loss": 0.1964, + "step": 183600 + }, + { + "epoch": 14.345958609918, + "grad_norm": 1.1689947843551636, + "learning_rate": 1.7145291129347405e-05, + "loss": 0.1955, + "step": 183700 + }, + { + "epoch": 14.353768059351816, + "grad_norm": 1.039355754852295, + "learning_rate": 1.7143728018757328e-05, + "loss": 0.2021, + "step": 183800 + }, + { + "epoch": 14.361577508785631, + "grad_norm": 0.923857569694519, + "learning_rate": 1.7142164908167254e-05, + "loss": 0.1974, + "step": 183900 + }, + { + "epoch": 14.369386958219446, + "grad_norm": 0.8760836124420166, + "learning_rate": 1.7140617428683083e-05, + "loss": 0.2004, + "step": 184000 + }, + { + "epoch": 14.37719640765326, + "grad_norm": 0.9584679007530212, + "learning_rate": 1.7139054318093006e-05, + "loss": 0.1964, + "step": 184100 + }, + { + "epoch": 14.385005857087075, + "grad_norm": 0.7809962630271912, + "learning_rate": 1.713749120750293e-05, + "loss": 0.1946, + "step": 184200 + }, + { + "epoch": 14.392815306520891, + "grad_norm": 0.8375005125999451, + "learning_rate": 1.7135928096912858e-05, + "loss": 0.1983, + "step": 184300 + }, + { + "epoch": 14.400624755954706, + "grad_norm": 0.942611038684845, + "learning_rate": 1.7134364986322784e-05, + "loss": 0.2109, + "step": 184400 + }, + { + "epoch": 14.40843420538852, + "grad_norm": 0.8951541185379028, + "learning_rate": 1.713280187573271e-05, + "loss": 0.203, + "step": 184500 + }, + { + "epoch": 14.416243654822335, + "grad_norm": 1.221139669418335, + "learning_rate": 1.7131238765142636e-05, + "loss": 0.1992, + "step": 184600 + }, + { + "epoch": 14.42405310425615, + "grad_norm": 0.6856141090393066, + "learning_rate": 1.712967565455256e-05, + "loss": 0.1991, + "step": 184700 + }, + { + "epoch": 14.431862553689966, + "grad_norm": 1.0590518712997437, + "learning_rate": 1.7128112543962488e-05, + "loss": 0.193, + "step": 184800 + }, + { + "epoch": 14.43967200312378, + "grad_norm": 0.8135849237442017, + "learning_rate": 1.7126549433372414e-05, + "loss": 0.2112, + "step": 184900 + }, + { + "epoch": 14.447481452557595, + "grad_norm": 0.9077997803688049, + "learning_rate": 1.7124986322782336e-05, + "loss": 0.1936, + "step": 185000 + }, + { + "epoch": 14.45529090199141, + "grad_norm": 0.9828291535377502, + "learning_rate": 1.7123423212192266e-05, + "loss": 0.1972, + "step": 185100 + }, + { + "epoch": 14.463100351425224, + "grad_norm": 1.111656904220581, + "learning_rate": 1.712186010160219e-05, + "loss": 0.1999, + "step": 185200 + }, + { + "epoch": 14.47090980085904, + "grad_norm": 0.9280322790145874, + "learning_rate": 1.7120296991012114e-05, + "loss": 0.2053, + "step": 185300 + }, + { + "epoch": 14.478719250292855, + "grad_norm": 1.0524592399597168, + "learning_rate": 1.711873388042204e-05, + "loss": 0.2082, + "step": 185400 + }, + { + "epoch": 14.48652869972667, + "grad_norm": 1.1530081033706665, + "learning_rate": 1.7117170769831966e-05, + "loss": 0.1953, + "step": 185500 + }, + { + "epoch": 14.494338149160484, + "grad_norm": 0.7839354276657104, + "learning_rate": 1.7115607659241892e-05, + "loss": 0.1838, + "step": 185600 + }, + { + "epoch": 14.502147598594298, + "grad_norm": 1.0498613119125366, + "learning_rate": 1.711404454865182e-05, + "loss": 0.2024, + "step": 185700 + }, + { + "epoch": 14.509957048028115, + "grad_norm": 0.8028302192687988, + "learning_rate": 1.7112481438061744e-05, + "loss": 0.1993, + "step": 185800 + }, + { + "epoch": 14.51776649746193, + "grad_norm": 1.0131007432937622, + "learning_rate": 1.711091832747167e-05, + "loss": 0.1948, + "step": 185900 + }, + { + "epoch": 14.525575946895744, + "grad_norm": 0.9612122774124146, + "learning_rate": 1.7109370847987496e-05, + "loss": 0.1921, + "step": 186000 + }, + { + "epoch": 14.533385396329559, + "grad_norm": 0.8743976950645447, + "learning_rate": 1.7107807737397422e-05, + "loss": 0.2054, + "step": 186100 + }, + { + "epoch": 14.541194845763373, + "grad_norm": 1.2099506855010986, + "learning_rate": 1.7106244626807348e-05, + "loss": 0.2025, + "step": 186200 + }, + { + "epoch": 14.54900429519719, + "grad_norm": 0.9520013928413391, + "learning_rate": 1.7104681516217274e-05, + "loss": 0.1943, + "step": 186300 + }, + { + "epoch": 14.556813744631004, + "grad_norm": 0.7041392922401428, + "learning_rate": 1.71031184056272e-05, + "loss": 0.2021, + "step": 186400 + }, + { + "epoch": 14.564623194064819, + "grad_norm": 1.4426240921020508, + "learning_rate": 1.7101555295037123e-05, + "loss": 0.1956, + "step": 186500 + }, + { + "epoch": 14.572432643498633, + "grad_norm": 0.8749873042106628, + "learning_rate": 1.7099992184447052e-05, + "loss": 0.2027, + "step": 186600 + }, + { + "epoch": 14.580242092932448, + "grad_norm": 0.7697212100028992, + "learning_rate": 1.709842907385698e-05, + "loss": 0.2058, + "step": 186700 + }, + { + "epoch": 14.588051542366264, + "grad_norm": 0.8453297019004822, + "learning_rate": 1.70968659632669e-05, + "loss": 0.2138, + "step": 186800 + }, + { + "epoch": 14.595860991800079, + "grad_norm": 1.1992191076278687, + "learning_rate": 1.7095302852676827e-05, + "loss": 0.1904, + "step": 186900 + }, + { + "epoch": 14.603670441233893, + "grad_norm": 0.8397727608680725, + "learning_rate": 1.7093739742086753e-05, + "loss": 0.204, + "step": 187000 + }, + { + "epoch": 14.611479890667708, + "grad_norm": 0.867508053779602, + "learning_rate": 1.709217663149668e-05, + "loss": 0.2018, + "step": 187100 + }, + { + "epoch": 14.619289340101522, + "grad_norm": 0.9640269875526428, + "learning_rate": 1.7090613520906605e-05, + "loss": 0.1919, + "step": 187200 + }, + { + "epoch": 14.627098789535339, + "grad_norm": 1.1995270252227783, + "learning_rate": 1.708905041031653e-05, + "loss": 0.1937, + "step": 187300 + }, + { + "epoch": 14.634908238969153, + "grad_norm": 0.936285138130188, + "learning_rate": 1.7087487299726457e-05, + "loss": 0.2013, + "step": 187400 + }, + { + "epoch": 14.642717688402968, + "grad_norm": 0.9428242444992065, + "learning_rate": 1.7085924189136383e-05, + "loss": 0.1901, + "step": 187500 + }, + { + "epoch": 14.650527137836782, + "grad_norm": 1.123363733291626, + "learning_rate": 1.708436107854631e-05, + "loss": 0.2002, + "step": 187600 + }, + { + "epoch": 14.658336587270597, + "grad_norm": 0.7815497517585754, + "learning_rate": 1.7082797967956235e-05, + "loss": 0.1928, + "step": 187700 + }, + { + "epoch": 14.666146036704411, + "grad_norm": 0.9222931861877441, + "learning_rate": 1.708123485736616e-05, + "loss": 0.1962, + "step": 187800 + }, + { + "epoch": 14.673955486138228, + "grad_norm": 0.9944746494293213, + "learning_rate": 1.7079671746776084e-05, + "loss": 0.1985, + "step": 187900 + }, + { + "epoch": 14.681764935572042, + "grad_norm": 0.9492562413215637, + "learning_rate": 1.707812426729191e-05, + "loss": 0.1878, + "step": 188000 + }, + { + "epoch": 14.689574385005857, + "grad_norm": 1.0256750583648682, + "learning_rate": 1.707656115670184e-05, + "loss": 0.194, + "step": 188100 + }, + { + "epoch": 14.697383834439671, + "grad_norm": 0.9843342900276184, + "learning_rate": 1.7074998046111765e-05, + "loss": 0.2038, + "step": 188200 + }, + { + "epoch": 14.705193283873488, + "grad_norm": 0.6537313461303711, + "learning_rate": 1.7073434935521688e-05, + "loss": 0.1972, + "step": 188300 + }, + { + "epoch": 14.713002733307302, + "grad_norm": 1.145023226737976, + "learning_rate": 1.7071871824931617e-05, + "loss": 0.1866, + "step": 188400 + }, + { + "epoch": 14.720812182741117, + "grad_norm": 0.9992174506187439, + "learning_rate": 1.707030871434154e-05, + "loss": 0.1941, + "step": 188500 + }, + { + "epoch": 14.728621632174931, + "grad_norm": 0.9242602586746216, + "learning_rate": 1.7068745603751466e-05, + "loss": 0.1966, + "step": 188600 + }, + { + "epoch": 14.736431081608746, + "grad_norm": 0.7435315847396851, + "learning_rate": 1.7067182493161392e-05, + "loss": 0.2018, + "step": 188700 + }, + { + "epoch": 14.74424053104256, + "grad_norm": 0.8198279142379761, + "learning_rate": 1.7065619382571318e-05, + "loss": 0.1911, + "step": 188800 + }, + { + "epoch": 14.752049980476377, + "grad_norm": 0.9536947011947632, + "learning_rate": 1.7064056271981244e-05, + "loss": 0.1947, + "step": 188900 + }, + { + "epoch": 14.759859429910192, + "grad_norm": 0.814950704574585, + "learning_rate": 1.706249316139117e-05, + "loss": 0.1992, + "step": 189000 + }, + { + "epoch": 14.767668879344006, + "grad_norm": 1.2029240131378174, + "learning_rate": 1.7060930050801096e-05, + "loss": 0.1897, + "step": 189100 + }, + { + "epoch": 14.77547832877782, + "grad_norm": 0.8989683985710144, + "learning_rate": 1.7059366940211022e-05, + "loss": 0.1917, + "step": 189200 + }, + { + "epoch": 14.783287778211637, + "grad_norm": 0.981456995010376, + "learning_rate": 1.7057803829620948e-05, + "loss": 0.2043, + "step": 189300 + }, + { + "epoch": 14.791097227645452, + "grad_norm": 0.7692901492118835, + "learning_rate": 1.705624071903087e-05, + "loss": 0.1942, + "step": 189400 + }, + { + "epoch": 14.798906677079266, + "grad_norm": 0.8410061001777649, + "learning_rate": 1.70546776084408e-05, + "loss": 0.1871, + "step": 189500 + }, + { + "epoch": 14.80671612651308, + "grad_norm": 0.7750579714775085, + "learning_rate": 1.7053114497850726e-05, + "loss": 0.1978, + "step": 189600 + }, + { + "epoch": 14.814525575946895, + "grad_norm": 0.9206308722496033, + "learning_rate": 1.705155138726065e-05, + "loss": 0.1869, + "step": 189700 + }, + { + "epoch": 14.82233502538071, + "grad_norm": 0.9680977463722229, + "learning_rate": 1.7049988276670574e-05, + "loss": 0.1953, + "step": 189800 + }, + { + "epoch": 14.830144474814526, + "grad_norm": 0.9864129424095154, + "learning_rate": 1.7048425166080504e-05, + "loss": 0.1878, + "step": 189900 + }, + { + "epoch": 14.83795392424834, + "grad_norm": 0.8863809108734131, + "learning_rate": 1.704687768659633e-05, + "loss": 0.1923, + "step": 190000 + }, + { + "epoch": 14.845763373682155, + "grad_norm": 1.3199231624603271, + "learning_rate": 1.7045314576006252e-05, + "loss": 0.2056, + "step": 190100 + }, + { + "epoch": 14.85357282311597, + "grad_norm": 0.9699838161468506, + "learning_rate": 1.7043751465416182e-05, + "loss": 0.1922, + "step": 190200 + }, + { + "epoch": 14.861382272549786, + "grad_norm": 0.7888826727867126, + "learning_rate": 1.7042188354826104e-05, + "loss": 0.1952, + "step": 190300 + }, + { + "epoch": 14.8691917219836, + "grad_norm": 0.8985480070114136, + "learning_rate": 1.704062524423603e-05, + "loss": 0.1912, + "step": 190400 + }, + { + "epoch": 14.877001171417415, + "grad_norm": 1.0374826192855835, + "learning_rate": 1.7039062133645956e-05, + "loss": 0.2057, + "step": 190500 + }, + { + "epoch": 14.88481062085123, + "grad_norm": 0.8498507738113403, + "learning_rate": 1.7037499023055882e-05, + "loss": 0.2004, + "step": 190600 + }, + { + "epoch": 14.892620070285044, + "grad_norm": 0.9233710169792175, + "learning_rate": 1.703593591246581e-05, + "loss": 0.1951, + "step": 190700 + }, + { + "epoch": 14.900429519718859, + "grad_norm": 1.052475929260254, + "learning_rate": 1.7034372801875734e-05, + "loss": 0.2044, + "step": 190800 + }, + { + "epoch": 14.908238969152675, + "grad_norm": 0.9808698892593384, + "learning_rate": 1.703280969128566e-05, + "loss": 0.191, + "step": 190900 + }, + { + "epoch": 14.91604841858649, + "grad_norm": 0.8547319769859314, + "learning_rate": 1.7031246580695586e-05, + "loss": 0.1909, + "step": 191000 + }, + { + "epoch": 14.923857868020304, + "grad_norm": 0.8447765707969666, + "learning_rate": 1.7029683470105513e-05, + "loss": 0.1879, + "step": 191100 + }, + { + "epoch": 14.931667317454119, + "grad_norm": 0.9389594197273254, + "learning_rate": 1.7028120359515435e-05, + "loss": 0.1917, + "step": 191200 + }, + { + "epoch": 14.939476766887934, + "grad_norm": 0.9502831101417542, + "learning_rate": 1.7026557248925365e-05, + "loss": 0.1895, + "step": 191300 + }, + { + "epoch": 14.94728621632175, + "grad_norm": 0.8974219560623169, + "learning_rate": 1.702499413833529e-05, + "loss": 0.1938, + "step": 191400 + }, + { + "epoch": 14.955095665755564, + "grad_norm": 1.042904019355774, + "learning_rate": 1.7023431027745213e-05, + "loss": 0.1981, + "step": 191500 + }, + { + "epoch": 14.962905115189379, + "grad_norm": 0.8570128679275513, + "learning_rate": 1.702186791715514e-05, + "loss": 0.1908, + "step": 191600 + }, + { + "epoch": 14.970714564623194, + "grad_norm": 0.8636656999588013, + "learning_rate": 1.7020304806565065e-05, + "loss": 0.1876, + "step": 191700 + }, + { + "epoch": 14.978524014057008, + "grad_norm": 0.8689895868301392, + "learning_rate": 1.701874169597499e-05, + "loss": 0.1942, + "step": 191800 + }, + { + "epoch": 14.986333463490825, + "grad_norm": 1.3911195993423462, + "learning_rate": 1.7017178585384917e-05, + "loss": 0.2062, + "step": 191900 + }, + { + "epoch": 14.994142912924639, + "grad_norm": 0.7825437784194946, + "learning_rate": 1.7015631105900743e-05, + "loss": 0.2081, + "step": 192000 + }, + { + "epoch": 15.001952362358454, + "grad_norm": 0.9370694160461426, + "learning_rate": 1.701406799531067e-05, + "loss": 0.2039, + "step": 192100 + }, + { + "epoch": 15.009761811792268, + "grad_norm": 0.9433199763298035, + "learning_rate": 1.7012504884720595e-05, + "loss": 0.1972, + "step": 192200 + }, + { + "epoch": 15.017571261226083, + "grad_norm": 1.1181074380874634, + "learning_rate": 1.701094177413052e-05, + "loss": 0.1896, + "step": 192300 + }, + { + "epoch": 15.025380710659899, + "grad_norm": 0.685670018196106, + "learning_rate": 1.7009378663540447e-05, + "loss": 0.1903, + "step": 192400 + }, + { + "epoch": 15.033190160093714, + "grad_norm": 0.7437558770179749, + "learning_rate": 1.7007815552950373e-05, + "loss": 0.1866, + "step": 192500 + }, + { + "epoch": 15.040999609527528, + "grad_norm": 0.9175603985786438, + "learning_rate": 1.70062524423603e-05, + "loss": 0.1859, + "step": 192600 + }, + { + "epoch": 15.048809058961343, + "grad_norm": 1.106664776802063, + "learning_rate": 1.7004689331770222e-05, + "loss": 0.1957, + "step": 192700 + }, + { + "epoch": 15.056618508395157, + "grad_norm": 0.7231187224388123, + "learning_rate": 1.700312622118015e-05, + "loss": 0.186, + "step": 192800 + }, + { + "epoch": 15.064427957828974, + "grad_norm": 1.1075918674468994, + "learning_rate": 1.7001563110590077e-05, + "loss": 0.196, + "step": 192900 + }, + { + "epoch": 15.072237407262788, + "grad_norm": 0.8646638989448547, + "learning_rate": 1.7e-05, + "loss": 0.1928, + "step": 193000 + }, + { + "epoch": 15.080046856696603, + "grad_norm": 0.9719217419624329, + "learning_rate": 1.6998436889409926e-05, + "loss": 0.1936, + "step": 193100 + }, + { + "epoch": 15.087856306130417, + "grad_norm": 0.595931887626648, + "learning_rate": 1.6996873778819852e-05, + "loss": 0.1797, + "step": 193200 + }, + { + "epoch": 15.095665755564232, + "grad_norm": 0.7941077947616577, + "learning_rate": 1.6995310668229778e-05, + "loss": 0.1986, + "step": 193300 + }, + { + "epoch": 15.103475204998048, + "grad_norm": 0.7684542536735535, + "learning_rate": 1.6993747557639704e-05, + "loss": 0.1966, + "step": 193400 + }, + { + "epoch": 15.111284654431863, + "grad_norm": 1.0312563180923462, + "learning_rate": 1.699218444704963e-05, + "loss": 0.1971, + "step": 193500 + }, + { + "epoch": 15.119094103865677, + "grad_norm": 0.7021324038505554, + "learning_rate": 1.6990621336459556e-05, + "loss": 0.1925, + "step": 193600 + }, + { + "epoch": 15.126903553299492, + "grad_norm": 0.6282637119293213, + "learning_rate": 1.6989058225869482e-05, + "loss": 0.1907, + "step": 193700 + }, + { + "epoch": 15.134713002733307, + "grad_norm": 0.6020660996437073, + "learning_rate": 1.6987495115279408e-05, + "loss": 0.1925, + "step": 193800 + }, + { + "epoch": 15.142522452167123, + "grad_norm": 1.0062716007232666, + "learning_rate": 1.6985932004689334e-05, + "loss": 0.1955, + "step": 193900 + }, + { + "epoch": 15.150331901600937, + "grad_norm": 1.0643500089645386, + "learning_rate": 1.698436889409926e-05, + "loss": 0.1974, + "step": 194000 + }, + { + "epoch": 15.158141351034752, + "grad_norm": 0.835570752620697, + "learning_rate": 1.6982821414615086e-05, + "loss": 0.1945, + "step": 194100 + }, + { + "epoch": 15.165950800468567, + "grad_norm": 1.1955751180648804, + "learning_rate": 1.698125830402501e-05, + "loss": 0.1965, + "step": 194200 + }, + { + "epoch": 15.173760249902381, + "grad_norm": 0.7210449576377869, + "learning_rate": 1.6979695193434938e-05, + "loss": 0.1951, + "step": 194300 + }, + { + "epoch": 15.181569699336197, + "grad_norm": 1.0025458335876465, + "learning_rate": 1.6978132082844864e-05, + "loss": 0.1913, + "step": 194400 + }, + { + "epoch": 15.189379148770012, + "grad_norm": 0.8779587745666504, + "learning_rate": 1.6976568972254787e-05, + "loss": 0.1953, + "step": 194500 + }, + { + "epoch": 15.197188598203827, + "grad_norm": 1.1459087133407593, + "learning_rate": 1.6975005861664716e-05, + "loss": 0.1958, + "step": 194600 + }, + { + "epoch": 15.204998047637641, + "grad_norm": 1.0809831619262695, + "learning_rate": 1.697344275107464e-05, + "loss": 0.1961, + "step": 194700 + }, + { + "epoch": 15.212807497071456, + "grad_norm": 1.0169117450714111, + "learning_rate": 1.6971879640484565e-05, + "loss": 0.1916, + "step": 194800 + }, + { + "epoch": 15.220616946505272, + "grad_norm": 0.9966555833816528, + "learning_rate": 1.697031652989449e-05, + "loss": 0.1955, + "step": 194900 + }, + { + "epoch": 15.228426395939087, + "grad_norm": 1.2357121706008911, + "learning_rate": 1.6968753419304417e-05, + "loss": 0.1911, + "step": 195000 + }, + { + "epoch": 15.236235845372901, + "grad_norm": 0.9304054975509644, + "learning_rate": 1.6967190308714343e-05, + "loss": 0.1904, + "step": 195100 + }, + { + "epoch": 15.244045294806716, + "grad_norm": 1.0154582262039185, + "learning_rate": 1.696562719812427e-05, + "loss": 0.1935, + "step": 195200 + }, + { + "epoch": 15.25185474424053, + "grad_norm": 0.8713698983192444, + "learning_rate": 1.6964064087534195e-05, + "loss": 0.1957, + "step": 195300 + }, + { + "epoch": 15.259664193674347, + "grad_norm": 0.9780009984970093, + "learning_rate": 1.696250097694412e-05, + "loss": 0.1858, + "step": 195400 + }, + { + "epoch": 15.267473643108161, + "grad_norm": 0.7178473472595215, + "learning_rate": 1.6960937866354047e-05, + "loss": 0.1956, + "step": 195500 + }, + { + "epoch": 15.275283092541976, + "grad_norm": 0.9540956616401672, + "learning_rate": 1.695937475576397e-05, + "loss": 0.1889, + "step": 195600 + }, + { + "epoch": 15.28309254197579, + "grad_norm": 1.0060138702392578, + "learning_rate": 1.69578116451739e-05, + "loss": 0.1937, + "step": 195700 + }, + { + "epoch": 15.290901991409605, + "grad_norm": 0.9597862362861633, + "learning_rate": 1.6956248534583825e-05, + "loss": 0.1859, + "step": 195800 + }, + { + "epoch": 15.298711440843421, + "grad_norm": 1.0637125968933105, + "learning_rate": 1.6954685423993747e-05, + "loss": 0.1903, + "step": 195900 + }, + { + "epoch": 15.306520890277236, + "grad_norm": 0.8320383429527283, + "learning_rate": 1.6953122313403673e-05, + "loss": 0.1934, + "step": 196000 + }, + { + "epoch": 15.31433033971105, + "grad_norm": 0.9052497148513794, + "learning_rate": 1.6951574833919503e-05, + "loss": 0.2025, + "step": 196100 + }, + { + "epoch": 15.322139789144865, + "grad_norm": 0.7524051070213318, + "learning_rate": 1.695001172332943e-05, + "loss": 0.1882, + "step": 196200 + }, + { + "epoch": 15.32994923857868, + "grad_norm": 0.7284848093986511, + "learning_rate": 1.694844861273935e-05, + "loss": 0.185, + "step": 196300 + }, + { + "epoch": 15.337758688012496, + "grad_norm": 0.8714132905006409, + "learning_rate": 1.694688550214928e-05, + "loss": 0.1933, + "step": 196400 + }, + { + "epoch": 15.34556813744631, + "grad_norm": 0.8674778938293457, + "learning_rate": 1.6945322391559203e-05, + "loss": 0.1916, + "step": 196500 + }, + { + "epoch": 15.353377586880125, + "grad_norm": 0.9107986688613892, + "learning_rate": 1.694375928096913e-05, + "loss": 0.1928, + "step": 196600 + }, + { + "epoch": 15.36118703631394, + "grad_norm": 0.8382946252822876, + "learning_rate": 1.6942196170379055e-05, + "loss": 0.1948, + "step": 196700 + }, + { + "epoch": 15.368996485747754, + "grad_norm": 0.5965157151222229, + "learning_rate": 1.694063305978898e-05, + "loss": 0.1866, + "step": 196800 + }, + { + "epoch": 15.37680593518157, + "grad_norm": 0.9926332831382751, + "learning_rate": 1.6939069949198907e-05, + "loss": 0.1858, + "step": 196900 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 0.9674046635627747, + "learning_rate": 1.6937506838608833e-05, + "loss": 0.1919, + "step": 197000 + }, + { + "epoch": 15.3924248340492, + "grad_norm": 0.763822078704834, + "learning_rate": 1.693594372801876e-05, + "loss": 0.1946, + "step": 197100 + }, + { + "epoch": 15.400234283483014, + "grad_norm": 0.9984347224235535, + "learning_rate": 1.6934380617428685e-05, + "loss": 0.2008, + "step": 197200 + }, + { + "epoch": 15.408043732916829, + "grad_norm": 1.0560930967330933, + "learning_rate": 1.693281750683861e-05, + "loss": 0.1935, + "step": 197300 + }, + { + "epoch": 15.415853182350645, + "grad_norm": 0.8882468938827515, + "learning_rate": 1.6931254396248534e-05, + "loss": 0.1827, + "step": 197400 + }, + { + "epoch": 15.42366263178446, + "grad_norm": 1.249205470085144, + "learning_rate": 1.6929691285658463e-05, + "loss": 0.1931, + "step": 197500 + }, + { + "epoch": 15.431472081218274, + "grad_norm": 0.8984086513519287, + "learning_rate": 1.692812817506839e-05, + "loss": 0.1892, + "step": 197600 + }, + { + "epoch": 15.439281530652089, + "grad_norm": 0.9370852112770081, + "learning_rate": 1.6926565064478312e-05, + "loss": 0.1809, + "step": 197700 + }, + { + "epoch": 15.447090980085903, + "grad_norm": 0.8891327977180481, + "learning_rate": 1.6925001953888238e-05, + "loss": 0.1887, + "step": 197800 + }, + { + "epoch": 15.45490042951972, + "grad_norm": 1.0804502964019775, + "learning_rate": 1.6923438843298164e-05, + "loss": 0.1868, + "step": 197900 + }, + { + "epoch": 15.462709878953534, + "grad_norm": 0.6988115906715393, + "learning_rate": 1.692187573270809e-05, + "loss": 0.1892, + "step": 198000 + }, + { + "epoch": 15.470519328387349, + "grad_norm": 1.01301908493042, + "learning_rate": 1.6920312622118016e-05, + "loss": 0.1906, + "step": 198100 + }, + { + "epoch": 15.478328777821163, + "grad_norm": 1.0333824157714844, + "learning_rate": 1.6918765142633842e-05, + "loss": 0.1905, + "step": 198200 + }, + { + "epoch": 15.486138227254978, + "grad_norm": 0.6864656209945679, + "learning_rate": 1.6917202032043768e-05, + "loss": 0.19, + "step": 198300 + }, + { + "epoch": 15.493947676688794, + "grad_norm": 0.8507186770439148, + "learning_rate": 1.6915638921453694e-05, + "loss": 0.1948, + "step": 198400 + }, + { + "epoch": 15.501757126122609, + "grad_norm": 1.074942708015442, + "learning_rate": 1.691407581086362e-05, + "loss": 0.1867, + "step": 198500 + }, + { + "epoch": 15.509566575556423, + "grad_norm": 0.9293534755706787, + "learning_rate": 1.6912512700273546e-05, + "loss": 0.1924, + "step": 198600 + }, + { + "epoch": 15.517376024990238, + "grad_norm": 1.0201472043991089, + "learning_rate": 1.6910949589683472e-05, + "loss": 0.1916, + "step": 198700 + }, + { + "epoch": 15.525185474424053, + "grad_norm": 1.2007817029953003, + "learning_rate": 1.6909386479093398e-05, + "loss": 0.1869, + "step": 198800 + }, + { + "epoch": 15.532994923857869, + "grad_norm": 0.812961757183075, + "learning_rate": 1.690782336850332e-05, + "loss": 0.1869, + "step": 198900 + }, + { + "epoch": 15.540804373291683, + "grad_norm": 0.7858532071113586, + "learning_rate": 1.690626025791325e-05, + "loss": 0.1929, + "step": 199000 + }, + { + "epoch": 15.548613822725498, + "grad_norm": 0.7535479664802551, + "learning_rate": 1.6904697147323176e-05, + "loss": 0.1882, + "step": 199100 + }, + { + "epoch": 15.556423272159313, + "grad_norm": 0.8674836158752441, + "learning_rate": 1.69031340367331e-05, + "loss": 0.1833, + "step": 199200 + }, + { + "epoch": 15.564232721593127, + "grad_norm": 1.0248985290527344, + "learning_rate": 1.6901570926143025e-05, + "loss": 0.1901, + "step": 199300 + }, + { + "epoch": 15.572042171026943, + "grad_norm": 0.8296997547149658, + "learning_rate": 1.690000781555295e-05, + "loss": 0.1891, + "step": 199400 + }, + { + "epoch": 15.579851620460758, + "grad_norm": 0.8192151188850403, + "learning_rate": 1.6898444704962877e-05, + "loss": 0.189, + "step": 199500 + }, + { + "epoch": 15.587661069894573, + "grad_norm": 0.864629864692688, + "learning_rate": 1.6896881594372803e-05, + "loss": 0.181, + "step": 199600 + }, + { + "epoch": 15.595470519328387, + "grad_norm": 0.6454740166664124, + "learning_rate": 1.689531848378273e-05, + "loss": 0.1823, + "step": 199700 + }, + { + "epoch": 15.603279968762202, + "grad_norm": 0.9795613884925842, + "learning_rate": 1.6893755373192655e-05, + "loss": 0.193, + "step": 199800 + }, + { + "epoch": 15.611089418196016, + "grad_norm": 0.735085666179657, + "learning_rate": 1.689219226260258e-05, + "loss": 0.186, + "step": 199900 + }, + { + "epoch": 15.618898867629833, + "grad_norm": 0.9269001483917236, + "learning_rate": 1.6890629152012507e-05, + "loss": 0.1859, + "step": 200000 + }, + { + "epoch": 15.626708317063647, + "grad_norm": 0.864848256111145, + "learning_rate": 1.6889066041422433e-05, + "loss": 0.1839, + "step": 200100 + }, + { + "epoch": 15.634517766497462, + "grad_norm": 0.8201810717582703, + "learning_rate": 1.688751856193826e-05, + "loss": 0.1902, + "step": 200200 + }, + { + "epoch": 15.642327215931276, + "grad_norm": 0.8653948307037354, + "learning_rate": 1.6885955451348185e-05, + "loss": 0.1925, + "step": 200300 + }, + { + "epoch": 15.650136665365093, + "grad_norm": 0.950314462184906, + "learning_rate": 1.6884392340758107e-05, + "loss": 0.1864, + "step": 200400 + }, + { + "epoch": 15.657946114798907, + "grad_norm": 0.7774307727813721, + "learning_rate": 1.6882829230168037e-05, + "loss": 0.1735, + "step": 200500 + }, + { + "epoch": 15.665755564232722, + "grad_norm": 0.800393283367157, + "learning_rate": 1.6881266119577963e-05, + "loss": 0.1881, + "step": 200600 + }, + { + "epoch": 15.673565013666536, + "grad_norm": 0.9827004671096802, + "learning_rate": 1.6879703008987885e-05, + "loss": 0.186, + "step": 200700 + }, + { + "epoch": 15.68137446310035, + "grad_norm": 0.6259004473686218, + "learning_rate": 1.6878139898397815e-05, + "loss": 0.1809, + "step": 200800 + }, + { + "epoch": 15.689183912534165, + "grad_norm": 0.8524276614189148, + "learning_rate": 1.6876576787807737e-05, + "loss": 0.1913, + "step": 200900 + }, + { + "epoch": 15.696993361967982, + "grad_norm": 0.9546549320220947, + "learning_rate": 1.6875013677217663e-05, + "loss": 0.1881, + "step": 201000 + }, + { + "epoch": 15.704802811401796, + "grad_norm": 0.999563455581665, + "learning_rate": 1.687345056662759e-05, + "loss": 0.1822, + "step": 201100 + }, + { + "epoch": 15.712612260835611, + "grad_norm": 0.8454039096832275, + "learning_rate": 1.6871887456037516e-05, + "loss": 0.188, + "step": 201200 + }, + { + "epoch": 15.720421710269425, + "grad_norm": 0.767012894153595, + "learning_rate": 1.687032434544744e-05, + "loss": 0.1828, + "step": 201300 + }, + { + "epoch": 15.728231159703242, + "grad_norm": 0.9323714971542358, + "learning_rate": 1.6868761234857368e-05, + "loss": 0.1872, + "step": 201400 + }, + { + "epoch": 15.736040609137056, + "grad_norm": 1.0684683322906494, + "learning_rate": 1.6867198124267294e-05, + "loss": 0.1968, + "step": 201500 + }, + { + "epoch": 15.743850058570871, + "grad_norm": 0.6585600972175598, + "learning_rate": 1.686563501367722e-05, + "loss": 0.1872, + "step": 201600 + }, + { + "epoch": 15.751659508004686, + "grad_norm": 0.9620580673217773, + "learning_rate": 1.6864071903087146e-05, + "loss": 0.18, + "step": 201700 + }, + { + "epoch": 15.7594689574385, + "grad_norm": 0.7878828644752502, + "learning_rate": 1.6862508792497068e-05, + "loss": 0.1847, + "step": 201800 + }, + { + "epoch": 15.767278406872315, + "grad_norm": 0.7668275237083435, + "learning_rate": 1.6860945681906998e-05, + "loss": 0.1934, + "step": 201900 + }, + { + "epoch": 15.775087856306131, + "grad_norm": 1.0003772974014282, + "learning_rate": 1.6859382571316924e-05, + "loss": 0.1816, + "step": 202000 + }, + { + "epoch": 15.782897305739946, + "grad_norm": 0.8344219923019409, + "learning_rate": 1.6857819460726846e-05, + "loss": 0.1894, + "step": 202100 + }, + { + "epoch": 15.79070675517376, + "grad_norm": 0.8241024613380432, + "learning_rate": 1.6856271981242672e-05, + "loss": 0.1842, + "step": 202200 + }, + { + "epoch": 15.798516204607575, + "grad_norm": 1.0600364208221436, + "learning_rate": 1.68547088706526e-05, + "loss": 0.1839, + "step": 202300 + }, + { + "epoch": 15.806325654041391, + "grad_norm": 0.6510144472122192, + "learning_rate": 1.6853145760062528e-05, + "loss": 0.1823, + "step": 202400 + }, + { + "epoch": 15.814135103475206, + "grad_norm": 0.9536733627319336, + "learning_rate": 1.685158264947245e-05, + "loss": 0.1819, + "step": 202500 + }, + { + "epoch": 15.82194455290902, + "grad_norm": 0.8948224782943726, + "learning_rate": 1.685001953888238e-05, + "loss": 0.183, + "step": 202600 + }, + { + "epoch": 15.829754002342835, + "grad_norm": 0.9985434412956238, + "learning_rate": 1.6848456428292302e-05, + "loss": 0.1886, + "step": 202700 + }, + { + "epoch": 15.83756345177665, + "grad_norm": 0.801413893699646, + "learning_rate": 1.6846893317702228e-05, + "loss": 0.184, + "step": 202800 + }, + { + "epoch": 15.845372901210464, + "grad_norm": 1.120006799697876, + "learning_rate": 1.6845330207112154e-05, + "loss": 0.1748, + "step": 202900 + }, + { + "epoch": 15.85318235064428, + "grad_norm": 1.0911275148391724, + "learning_rate": 1.684376709652208e-05, + "loss": 0.1966, + "step": 203000 + }, + { + "epoch": 15.860991800078095, + "grad_norm": 1.0338274240493774, + "learning_rate": 1.6842203985932006e-05, + "loss": 0.1914, + "step": 203100 + }, + { + "epoch": 15.86880124951191, + "grad_norm": 0.7905099391937256, + "learning_rate": 1.6840640875341932e-05, + "loss": 0.1753, + "step": 203200 + }, + { + "epoch": 15.876610698945724, + "grad_norm": 0.838280200958252, + "learning_rate": 1.6839077764751858e-05, + "loss": 0.1838, + "step": 203300 + }, + { + "epoch": 15.884420148379538, + "grad_norm": 1.0954149961471558, + "learning_rate": 1.6837514654161784e-05, + "loss": 0.1761, + "step": 203400 + }, + { + "epoch": 15.892229597813355, + "grad_norm": 0.5330013632774353, + "learning_rate": 1.683595154357171e-05, + "loss": 0.1758, + "step": 203500 + }, + { + "epoch": 15.90003904724717, + "grad_norm": 0.8652849197387695, + "learning_rate": 1.6834388432981633e-05, + "loss": 0.183, + "step": 203600 + }, + { + "epoch": 15.907848496680984, + "grad_norm": 0.7471824288368225, + "learning_rate": 1.6832825322391562e-05, + "loss": 0.1854, + "step": 203700 + }, + { + "epoch": 15.915657946114798, + "grad_norm": 0.820147693157196, + "learning_rate": 1.683126221180149e-05, + "loss": 0.1741, + "step": 203800 + }, + { + "epoch": 15.923467395548613, + "grad_norm": 1.0590057373046875, + "learning_rate": 1.682969910121141e-05, + "loss": 0.1749, + "step": 203900 + }, + { + "epoch": 15.93127684498243, + "grad_norm": 1.0665738582611084, + "learning_rate": 1.6828135990621337e-05, + "loss": 0.1846, + "step": 204000 + }, + { + "epoch": 15.939086294416244, + "grad_norm": 1.073426604270935, + "learning_rate": 1.6826572880031263e-05, + "loss": 0.1861, + "step": 204100 + }, + { + "epoch": 15.946895743850058, + "grad_norm": 0.9139155745506287, + "learning_rate": 1.682502540054709e-05, + "loss": 0.1734, + "step": 204200 + }, + { + "epoch": 15.954705193283873, + "grad_norm": 1.1288511753082275, + "learning_rate": 1.6823462289957015e-05, + "loss": 0.1766, + "step": 204300 + }, + { + "epoch": 15.962514642717688, + "grad_norm": 0.9638312458992004, + "learning_rate": 1.682189917936694e-05, + "loss": 0.1737, + "step": 204400 + }, + { + "epoch": 15.970324092151504, + "grad_norm": 1.0422431230545044, + "learning_rate": 1.6820336068776867e-05, + "loss": 0.1909, + "step": 204500 + }, + { + "epoch": 15.978133541585319, + "grad_norm": 0.6258545517921448, + "learning_rate": 1.6818772958186793e-05, + "loss": 0.1854, + "step": 204600 + }, + { + "epoch": 15.985942991019133, + "grad_norm": 1.0568209886550903, + "learning_rate": 1.681720984759672e-05, + "loss": 0.1826, + "step": 204700 + }, + { + "epoch": 15.993752440452948, + "grad_norm": 1.0256654024124146, + "learning_rate": 1.6815646737006645e-05, + "loss": 0.177, + "step": 204800 + }, + { + "epoch": 16.001561889886762, + "grad_norm": 1.0408920049667358, + "learning_rate": 1.681408362641657e-05, + "loss": 0.1895, + "step": 204900 + }, + { + "epoch": 16.009371339320577, + "grad_norm": 0.9039410948753357, + "learning_rate": 1.6812520515826497e-05, + "loss": 0.1888, + "step": 205000 + }, + { + "epoch": 16.01718078875439, + "grad_norm": 1.0113240480422974, + "learning_rate": 1.681095740523642e-05, + "loss": 0.1748, + "step": 205100 + }, + { + "epoch": 16.02499023818821, + "grad_norm": 1.0972598791122437, + "learning_rate": 1.680939429464635e-05, + "loss": 0.1885, + "step": 205200 + }, + { + "epoch": 16.032799687622024, + "grad_norm": 0.8362821936607361, + "learning_rate": 1.6807831184056275e-05, + "loss": 0.1777, + "step": 205300 + }, + { + "epoch": 16.04060913705584, + "grad_norm": 0.8148047924041748, + "learning_rate": 1.6806268073466198e-05, + "loss": 0.1815, + "step": 205400 + }, + { + "epoch": 16.048418586489653, + "grad_norm": 0.920591413974762, + "learning_rate": 1.6804704962876124e-05, + "loss": 0.1944, + "step": 205500 + }, + { + "epoch": 16.056228035923468, + "grad_norm": 1.1818751096725464, + "learning_rate": 1.680314185228605e-05, + "loss": 0.1754, + "step": 205600 + }, + { + "epoch": 16.064037485357282, + "grad_norm": 1.2708854675292969, + "learning_rate": 1.6801578741695976e-05, + "loss": 0.1768, + "step": 205700 + }, + { + "epoch": 16.071846934791097, + "grad_norm": 0.9716994762420654, + "learning_rate": 1.6800015631105902e-05, + "loss": 0.1886, + "step": 205800 + }, + { + "epoch": 16.07965638422491, + "grad_norm": 0.87205570936203, + "learning_rate": 1.6798452520515828e-05, + "loss": 0.1782, + "step": 205900 + }, + { + "epoch": 16.087465833658726, + "grad_norm": 0.929098904132843, + "learning_rate": 1.6796889409925754e-05, + "loss": 0.1844, + "step": 206000 + }, + { + "epoch": 16.09527528309254, + "grad_norm": 1.0598945617675781, + "learning_rate": 1.679532629933568e-05, + "loss": 0.1811, + "step": 206100 + }, + { + "epoch": 16.10308473252636, + "grad_norm": 1.0059045553207397, + "learning_rate": 1.6793763188745606e-05, + "loss": 0.1788, + "step": 206200 + }, + { + "epoch": 16.110894181960173, + "grad_norm": 0.6536023616790771, + "learning_rate": 1.679221570926143e-05, + "loss": 0.1854, + "step": 206300 + }, + { + "epoch": 16.118703631393988, + "grad_norm": 0.8467875123023987, + "learning_rate": 1.6790652598671358e-05, + "loss": 0.18, + "step": 206400 + }, + { + "epoch": 16.126513080827802, + "grad_norm": 0.8544393181800842, + "learning_rate": 1.6789089488081284e-05, + "loss": 0.182, + "step": 206500 + }, + { + "epoch": 16.134322530261617, + "grad_norm": 1.1084660291671753, + "learning_rate": 1.6787526377491206e-05, + "loss": 0.1844, + "step": 206600 + }, + { + "epoch": 16.14213197969543, + "grad_norm": 0.9038600325584412, + "learning_rate": 1.6785963266901136e-05, + "loss": 0.1836, + "step": 206700 + }, + { + "epoch": 16.149941429129246, + "grad_norm": 0.8956557512283325, + "learning_rate": 1.6784400156311062e-05, + "loss": 0.1841, + "step": 206800 + }, + { + "epoch": 16.15775087856306, + "grad_norm": 0.8761757612228394, + "learning_rate": 1.6782837045720984e-05, + "loss": 0.1794, + "step": 206900 + }, + { + "epoch": 16.165560327996875, + "grad_norm": 1.2372992038726807, + "learning_rate": 1.6781273935130914e-05, + "loss": 0.1868, + "step": 207000 + }, + { + "epoch": 16.17336977743069, + "grad_norm": 0.9002154469490051, + "learning_rate": 1.6779710824540836e-05, + "loss": 0.1806, + "step": 207100 + }, + { + "epoch": 16.181179226864508, + "grad_norm": 0.7399206161499023, + "learning_rate": 1.6778147713950762e-05, + "loss": 0.1719, + "step": 207200 + }, + { + "epoch": 16.188988676298322, + "grad_norm": 1.0187625885009766, + "learning_rate": 1.677658460336069e-05, + "loss": 0.1844, + "step": 207300 + }, + { + "epoch": 16.196798125732137, + "grad_norm": 0.6338691711425781, + "learning_rate": 1.6775021492770614e-05, + "loss": 0.181, + "step": 207400 + }, + { + "epoch": 16.20460757516595, + "grad_norm": 0.9440446496009827, + "learning_rate": 1.677345838218054e-05, + "loss": 0.1856, + "step": 207500 + }, + { + "epoch": 16.212417024599766, + "grad_norm": 1.0803288221359253, + "learning_rate": 1.6771895271590466e-05, + "loss": 0.1803, + "step": 207600 + }, + { + "epoch": 16.22022647403358, + "grad_norm": 1.3593189716339111, + "learning_rate": 1.6770332161000392e-05, + "loss": 0.1751, + "step": 207700 + }, + { + "epoch": 16.228035923467395, + "grad_norm": 1.1652122735977173, + "learning_rate": 1.676876905041032e-05, + "loss": 0.1874, + "step": 207800 + }, + { + "epoch": 16.23584537290121, + "grad_norm": 0.8041174411773682, + "learning_rate": 1.6767205939820244e-05, + "loss": 0.1798, + "step": 207900 + }, + { + "epoch": 16.243654822335024, + "grad_norm": 0.8075453639030457, + "learning_rate": 1.6765642829230167e-05, + "loss": 0.1845, + "step": 208000 + }, + { + "epoch": 16.25146427176884, + "grad_norm": 0.95871502161026, + "learning_rate": 1.6764079718640096e-05, + "loss": 0.1667, + "step": 208100 + }, + { + "epoch": 16.259273721202653, + "grad_norm": 0.9648413062095642, + "learning_rate": 1.6762516608050023e-05, + "loss": 0.1865, + "step": 208200 + }, + { + "epoch": 16.26708317063647, + "grad_norm": 0.8995979428291321, + "learning_rate": 1.676096912856585e-05, + "loss": 0.1746, + "step": 208300 + }, + { + "epoch": 16.274892620070286, + "grad_norm": 0.8468859195709229, + "learning_rate": 1.675940601797577e-05, + "loss": 0.1706, + "step": 208400 + }, + { + "epoch": 16.2827020695041, + "grad_norm": 0.9065866470336914, + "learning_rate": 1.67578429073857e-05, + "loss": 0.1778, + "step": 208500 + }, + { + "epoch": 16.290511518937915, + "grad_norm": 0.6135996580123901, + "learning_rate": 1.6756279796795623e-05, + "loss": 0.1808, + "step": 208600 + }, + { + "epoch": 16.29832096837173, + "grad_norm": 1.0263526439666748, + "learning_rate": 1.675471668620555e-05, + "loss": 0.1773, + "step": 208700 + }, + { + "epoch": 16.306130417805544, + "grad_norm": 1.000637173652649, + "learning_rate": 1.6753153575615475e-05, + "loss": 0.1792, + "step": 208800 + }, + { + "epoch": 16.31393986723936, + "grad_norm": 0.948738694190979, + "learning_rate": 1.67515904650254e-05, + "loss": 0.1876, + "step": 208900 + }, + { + "epoch": 16.321749316673174, + "grad_norm": 0.8848404288291931, + "learning_rate": 1.6750027354435327e-05, + "loss": 0.1773, + "step": 209000 + }, + { + "epoch": 16.329558766106988, + "grad_norm": 0.7282323241233826, + "learning_rate": 1.6748464243845253e-05, + "loss": 0.1773, + "step": 209100 + }, + { + "epoch": 16.337368215540803, + "grad_norm": 0.9083189368247986, + "learning_rate": 1.674690113325518e-05, + "loss": 0.1864, + "step": 209200 + }, + { + "epoch": 16.34517766497462, + "grad_norm": 0.9564442038536072, + "learning_rate": 1.6745338022665105e-05, + "loss": 0.1816, + "step": 209300 + }, + { + "epoch": 16.352987114408435, + "grad_norm": 0.5036829710006714, + "learning_rate": 1.674377491207503e-05, + "loss": 0.1756, + "step": 209400 + }, + { + "epoch": 16.36079656384225, + "grad_norm": 0.7878613471984863, + "learning_rate": 1.6742211801484957e-05, + "loss": 0.176, + "step": 209500 + }, + { + "epoch": 16.368606013276064, + "grad_norm": 0.7844358682632446, + "learning_rate": 1.6740648690894883e-05, + "loss": 0.1837, + "step": 209600 + }, + { + "epoch": 16.37641546270988, + "grad_norm": 0.9099976420402527, + "learning_rate": 1.673908558030481e-05, + "loss": 0.18, + "step": 209700 + }, + { + "epoch": 16.384224912143694, + "grad_norm": 0.8219439387321472, + "learning_rate": 1.6737522469714732e-05, + "loss": 0.1724, + "step": 209800 + }, + { + "epoch": 16.392034361577508, + "grad_norm": 0.978882372379303, + "learning_rate": 1.673595935912466e-05, + "loss": 0.1762, + "step": 209900 + }, + { + "epoch": 16.399843811011323, + "grad_norm": 1.052987813949585, + "learning_rate": 1.6734396248534587e-05, + "loss": 0.1811, + "step": 210000 + }, + { + "epoch": 16.407653260445137, + "grad_norm": 1.045796275138855, + "learning_rate": 1.673283313794451e-05, + "loss": 0.1786, + "step": 210100 + }, + { + "epoch": 16.415462709878952, + "grad_norm": 1.1169161796569824, + "learning_rate": 1.6731270027354436e-05, + "loss": 0.1822, + "step": 210200 + }, + { + "epoch": 16.42327215931277, + "grad_norm": 0.7530735731124878, + "learning_rate": 1.6729722547870265e-05, + "loss": 0.1862, + "step": 210300 + }, + { + "epoch": 16.431081608746585, + "grad_norm": 1.0057936906814575, + "learning_rate": 1.6728159437280188e-05, + "loss": 0.1614, + "step": 210400 + }, + { + "epoch": 16.4388910581804, + "grad_norm": 0.8210418820381165, + "learning_rate": 1.6726596326690114e-05, + "loss": 0.1786, + "step": 210500 + }, + { + "epoch": 16.446700507614214, + "grad_norm": 0.8076565265655518, + "learning_rate": 1.672503321610004e-05, + "loss": 0.1791, + "step": 210600 + }, + { + "epoch": 16.454509957048028, + "grad_norm": 0.9307219386100769, + "learning_rate": 1.6723470105509966e-05, + "loss": 0.1802, + "step": 210700 + }, + { + "epoch": 16.462319406481843, + "grad_norm": 0.9993940591812134, + "learning_rate": 1.6721906994919892e-05, + "loss": 0.1721, + "step": 210800 + }, + { + "epoch": 16.470128855915657, + "grad_norm": 0.8416149616241455, + "learning_rate": 1.6720343884329818e-05, + "loss": 0.1896, + "step": 210900 + }, + { + "epoch": 16.477938305349472, + "grad_norm": 0.7258075475692749, + "learning_rate": 1.6718780773739744e-05, + "loss": 0.1809, + "step": 211000 + }, + { + "epoch": 16.485747754783286, + "grad_norm": 0.8920261263847351, + "learning_rate": 1.671721766314967e-05, + "loss": 0.1816, + "step": 211100 + }, + { + "epoch": 16.4935572042171, + "grad_norm": 0.8794561624526978, + "learning_rate": 1.6715654552559596e-05, + "loss": 0.1832, + "step": 211200 + }, + { + "epoch": 16.50136665365092, + "grad_norm": 1.1431734561920166, + "learning_rate": 1.671409144196952e-05, + "loss": 0.1826, + "step": 211300 + }, + { + "epoch": 16.509176103084734, + "grad_norm": 0.7752969861030579, + "learning_rate": 1.6712528331379448e-05, + "loss": 0.1742, + "step": 211400 + }, + { + "epoch": 16.51698555251855, + "grad_norm": 1.030426263809204, + "learning_rate": 1.6710965220789374e-05, + "loss": 0.1773, + "step": 211500 + }, + { + "epoch": 16.524795001952363, + "grad_norm": 0.7855979204177856, + "learning_rate": 1.6709402110199297e-05, + "loss": 0.1857, + "step": 211600 + }, + { + "epoch": 16.532604451386177, + "grad_norm": 0.9296749830245972, + "learning_rate": 1.6707838999609223e-05, + "loss": 0.17, + "step": 211700 + }, + { + "epoch": 16.540413900819992, + "grad_norm": 0.8138403296470642, + "learning_rate": 1.670627588901915e-05, + "loss": 0.1834, + "step": 211800 + }, + { + "epoch": 16.548223350253807, + "grad_norm": 0.9249143004417419, + "learning_rate": 1.6704712778429075e-05, + "loss": 0.1748, + "step": 211900 + }, + { + "epoch": 16.55603279968762, + "grad_norm": 0.922657310962677, + "learning_rate": 1.6703149667839e-05, + "loss": 0.1728, + "step": 212000 + }, + { + "epoch": 16.563842249121436, + "grad_norm": 0.9151207804679871, + "learning_rate": 1.6701586557248927e-05, + "loss": 0.1787, + "step": 212100 + }, + { + "epoch": 16.57165169855525, + "grad_norm": 0.8385941982269287, + "learning_rate": 1.6700023446658853e-05, + "loss": 0.1883, + "step": 212200 + }, + { + "epoch": 16.57946114798907, + "grad_norm": 0.724240779876709, + "learning_rate": 1.669847596717468e-05, + "loss": 0.1833, + "step": 212300 + }, + { + "epoch": 16.587270597422883, + "grad_norm": 0.6641395092010498, + "learning_rate": 1.6696912856584605e-05, + "loss": 0.1761, + "step": 212400 + }, + { + "epoch": 16.595080046856697, + "grad_norm": 0.885064423084259, + "learning_rate": 1.669534974599453e-05, + "loss": 0.1839, + "step": 212500 + }, + { + "epoch": 16.602889496290512, + "grad_norm": 0.705733597278595, + "learning_rate": 1.6693786635404457e-05, + "loss": 0.1681, + "step": 212600 + }, + { + "epoch": 16.610698945724327, + "grad_norm": 0.9132718443870544, + "learning_rate": 1.6692223524814383e-05, + "loss": 0.1828, + "step": 212700 + }, + { + "epoch": 16.61850839515814, + "grad_norm": 0.7742204070091248, + "learning_rate": 1.6690660414224305e-05, + "loss": 0.1885, + "step": 212800 + }, + { + "epoch": 16.626317844591956, + "grad_norm": 1.0476394891738892, + "learning_rate": 1.6689097303634235e-05, + "loss": 0.1667, + "step": 212900 + }, + { + "epoch": 16.63412729402577, + "grad_norm": 0.7489650249481201, + "learning_rate": 1.668753419304416e-05, + "loss": 0.1742, + "step": 213000 + }, + { + "epoch": 16.641936743459585, + "grad_norm": 0.9502554535865784, + "learning_rate": 1.6685971082454083e-05, + "loss": 0.1723, + "step": 213100 + }, + { + "epoch": 16.6497461928934, + "grad_norm": 0.9613061547279358, + "learning_rate": 1.6684407971864013e-05, + "loss": 0.1768, + "step": 213200 + }, + { + "epoch": 16.657555642327218, + "grad_norm": 0.7885509729385376, + "learning_rate": 1.6682844861273935e-05, + "loss": 0.17, + "step": 213300 + }, + { + "epoch": 16.665365091761032, + "grad_norm": 1.1856578588485718, + "learning_rate": 1.668128175068386e-05, + "loss": 0.178, + "step": 213400 + }, + { + "epoch": 16.673174541194847, + "grad_norm": 0.7488318681716919, + "learning_rate": 1.6679718640093787e-05, + "loss": 0.1802, + "step": 213500 + }, + { + "epoch": 16.68098399062866, + "grad_norm": 0.6175550222396851, + "learning_rate": 1.6678155529503713e-05, + "loss": 0.1726, + "step": 213600 + }, + { + "epoch": 16.688793440062476, + "grad_norm": 0.8155626058578491, + "learning_rate": 1.667659241891364e-05, + "loss": 0.1868, + "step": 213700 + }, + { + "epoch": 16.69660288949629, + "grad_norm": 0.8481264114379883, + "learning_rate": 1.6675029308323565e-05, + "loss": 0.1724, + "step": 213800 + }, + { + "epoch": 16.704412338930105, + "grad_norm": 0.971606969833374, + "learning_rate": 1.667346619773349e-05, + "loss": 0.1812, + "step": 213900 + }, + { + "epoch": 16.71222178836392, + "grad_norm": 0.7704633474349976, + "learning_rate": 1.6671903087143417e-05, + "loss": 0.1814, + "step": 214000 + }, + { + "epoch": 16.720031237797734, + "grad_norm": 0.8130826950073242, + "learning_rate": 1.6670339976553343e-05, + "loss": 0.172, + "step": 214100 + }, + { + "epoch": 16.72784068723155, + "grad_norm": 0.7058693170547485, + "learning_rate": 1.6668776865963266e-05, + "loss": 0.1729, + "step": 214200 + }, + { + "epoch": 16.735650136665367, + "grad_norm": 0.7548621892929077, + "learning_rate": 1.6667213755373195e-05, + "loss": 0.1753, + "step": 214300 + }, + { + "epoch": 16.74345958609918, + "grad_norm": 0.9197354316711426, + "learning_rate": 1.666565064478312e-05, + "loss": 0.1739, + "step": 214400 + }, + { + "epoch": 16.751269035532996, + "grad_norm": 0.9533059000968933, + "learning_rate": 1.6664087534193044e-05, + "loss": 0.1778, + "step": 214500 + }, + { + "epoch": 16.75907848496681, + "grad_norm": 0.8951109051704407, + "learning_rate": 1.666254005470887e-05, + "loss": 0.19, + "step": 214600 + }, + { + "epoch": 16.766887934400625, + "grad_norm": 0.8666356801986694, + "learning_rate": 1.66609769441188e-05, + "loss": 0.1868, + "step": 214700 + }, + { + "epoch": 16.77469738383444, + "grad_norm": 1.1076565980911255, + "learning_rate": 1.6659413833528722e-05, + "loss": 0.1754, + "step": 214800 + }, + { + "epoch": 16.782506833268254, + "grad_norm": 1.198845624923706, + "learning_rate": 1.6657850722938648e-05, + "loss": 0.1803, + "step": 214900 + }, + { + "epoch": 16.79031628270207, + "grad_norm": 0.5751312971115112, + "learning_rate": 1.6656287612348574e-05, + "loss": 0.1789, + "step": 215000 + }, + { + "epoch": 16.798125732135883, + "grad_norm": 1.1437292098999023, + "learning_rate": 1.66547245017585e-05, + "loss": 0.1772, + "step": 215100 + }, + { + "epoch": 16.805935181569698, + "grad_norm": 1.0913738012313843, + "learning_rate": 1.6653161391168426e-05, + "loss": 0.1808, + "step": 215200 + }, + { + "epoch": 16.813744631003516, + "grad_norm": 0.9207703471183777, + "learning_rate": 1.6651598280578352e-05, + "loss": 0.1661, + "step": 215300 + }, + { + "epoch": 16.82155408043733, + "grad_norm": 1.096644401550293, + "learning_rate": 1.6650035169988278e-05, + "loss": 0.1802, + "step": 215400 + }, + { + "epoch": 16.829363529871145, + "grad_norm": 0.9608086347579956, + "learning_rate": 1.6648472059398204e-05, + "loss": 0.1852, + "step": 215500 + }, + { + "epoch": 16.83717297930496, + "grad_norm": 1.0263370275497437, + "learning_rate": 1.664690894880813e-05, + "loss": 0.177, + "step": 215600 + }, + { + "epoch": 16.844982428738774, + "grad_norm": 0.8369292616844177, + "learning_rate": 1.6645345838218053e-05, + "loss": 0.1728, + "step": 215700 + }, + { + "epoch": 16.85279187817259, + "grad_norm": 0.8427988290786743, + "learning_rate": 1.6643782727627982e-05, + "loss": 0.1819, + "step": 215800 + }, + { + "epoch": 16.860601327606403, + "grad_norm": 0.9546705484390259, + "learning_rate": 1.6642219617037908e-05, + "loss": 0.1756, + "step": 215900 + }, + { + "epoch": 16.868410777040218, + "grad_norm": 1.0135685205459595, + "learning_rate": 1.664065650644783e-05, + "loss": 0.1806, + "step": 216000 + }, + { + "epoch": 16.876220226474032, + "grad_norm": 0.8881898522377014, + "learning_rate": 1.663909339585776e-05, + "loss": 0.1771, + "step": 216100 + }, + { + "epoch": 16.884029675907847, + "grad_norm": 0.8927045464515686, + "learning_rate": 1.6637530285267686e-05, + "loss": 0.1792, + "step": 216200 + }, + { + "epoch": 16.891839125341665, + "grad_norm": 0.633324146270752, + "learning_rate": 1.663596717467761e-05, + "loss": 0.1758, + "step": 216300 + }, + { + "epoch": 16.89964857477548, + "grad_norm": 0.867351233959198, + "learning_rate": 1.6634404064087535e-05, + "loss": 0.1787, + "step": 216400 + }, + { + "epoch": 16.907458024209294, + "grad_norm": 1.1435070037841797, + "learning_rate": 1.663284095349746e-05, + "loss": 0.1741, + "step": 216500 + }, + { + "epoch": 16.91526747364311, + "grad_norm": 1.1241178512573242, + "learning_rate": 1.6631293474013287e-05, + "loss": 0.1757, + "step": 216600 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 0.608675479888916, + "learning_rate": 1.6629730363423213e-05, + "loss": 0.1842, + "step": 216700 + }, + { + "epoch": 16.930886372510738, + "grad_norm": 0.7797546982765198, + "learning_rate": 1.662816725283314e-05, + "loss": 0.1745, + "step": 216800 + }, + { + "epoch": 16.938695821944552, + "grad_norm": 0.689860999584198, + "learning_rate": 1.6626604142243065e-05, + "loss": 0.1765, + "step": 216900 + }, + { + "epoch": 16.946505271378367, + "grad_norm": 1.104783535003662, + "learning_rate": 1.662504103165299e-05, + "loss": 0.1813, + "step": 217000 + }, + { + "epoch": 16.95431472081218, + "grad_norm": 0.9937470555305481, + "learning_rate": 1.6623477921062917e-05, + "loss": 0.1767, + "step": 217100 + }, + { + "epoch": 16.962124170245996, + "grad_norm": 0.9059083461761475, + "learning_rate": 1.6621914810472843e-05, + "loss": 0.1801, + "step": 217200 + }, + { + "epoch": 16.969933619679814, + "grad_norm": 0.7685569524765015, + "learning_rate": 1.662035169988277e-05, + "loss": 0.1765, + "step": 217300 + }, + { + "epoch": 16.97774306911363, + "grad_norm": 1.1561412811279297, + "learning_rate": 1.6618788589292695e-05, + "loss": 0.1784, + "step": 217400 + }, + { + "epoch": 16.985552518547443, + "grad_norm": 0.7909867763519287, + "learning_rate": 1.6617225478702617e-05, + "loss": 0.1706, + "step": 217500 + }, + { + "epoch": 16.993361967981258, + "grad_norm": 0.8139914870262146, + "learning_rate": 1.6615662368112547e-05, + "loss": 0.1748, + "step": 217600 + }, + { + "epoch": 17.001171417415073, + "grad_norm": 0.8759558200836182, + "learning_rate": 1.6614099257522473e-05, + "loss": 0.176, + "step": 217700 + }, + { + "epoch": 17.008980866848887, + "grad_norm": 0.780949592590332, + "learning_rate": 1.6612536146932395e-05, + "loss": 0.1765, + "step": 217800 + }, + { + "epoch": 17.0167903162827, + "grad_norm": 0.7488613128662109, + "learning_rate": 1.661097303634232e-05, + "loss": 0.1795, + "step": 217900 + }, + { + "epoch": 17.024599765716516, + "grad_norm": 0.6278981566429138, + "learning_rate": 1.6609409925752247e-05, + "loss": 0.1747, + "step": 218000 + }, + { + "epoch": 17.03240921515033, + "grad_norm": 0.8410722613334656, + "learning_rate": 1.6607846815162173e-05, + "loss": 0.1784, + "step": 218100 + }, + { + "epoch": 17.040218664584145, + "grad_norm": 0.9053663611412048, + "learning_rate": 1.66062837045721e-05, + "loss": 0.1696, + "step": 218200 + }, + { + "epoch": 17.048028114017963, + "grad_norm": 1.0183720588684082, + "learning_rate": 1.6604720593982026e-05, + "loss": 0.1764, + "step": 218300 + }, + { + "epoch": 17.055837563451778, + "grad_norm": 0.8127830624580383, + "learning_rate": 1.660315748339195e-05, + "loss": 0.18, + "step": 218400 + }, + { + "epoch": 17.063647012885593, + "grad_norm": 0.9182298183441162, + "learning_rate": 1.6601594372801878e-05, + "loss": 0.1695, + "step": 218500 + }, + { + "epoch": 17.071456462319407, + "grad_norm": 0.981143057346344, + "learning_rate": 1.6600031262211804e-05, + "loss": 0.1675, + "step": 218600 + }, + { + "epoch": 17.07926591175322, + "grad_norm": 0.8018271923065186, + "learning_rate": 1.659846815162173e-05, + "loss": 0.1697, + "step": 218700 + }, + { + "epoch": 17.087075361187036, + "grad_norm": 0.923195481300354, + "learning_rate": 1.6596920672137555e-05, + "loss": 0.1764, + "step": 218800 + }, + { + "epoch": 17.09488481062085, + "grad_norm": 1.013973593711853, + "learning_rate": 1.659535756154748e-05, + "loss": 0.1772, + "step": 218900 + }, + { + "epoch": 17.102694260054665, + "grad_norm": 0.7282880544662476, + "learning_rate": 1.6593794450957404e-05, + "loss": 0.1688, + "step": 219000 + }, + { + "epoch": 17.11050370948848, + "grad_norm": 0.8870101571083069, + "learning_rate": 1.6592231340367334e-05, + "loss": 0.1776, + "step": 219100 + }, + { + "epoch": 17.118313158922295, + "grad_norm": 0.7469478845596313, + "learning_rate": 1.659066822977726e-05, + "loss": 0.1751, + "step": 219200 + }, + { + "epoch": 17.126122608356113, + "grad_norm": 1.4116019010543823, + "learning_rate": 1.6589105119187182e-05, + "loss": 0.1711, + "step": 219300 + }, + { + "epoch": 17.133932057789927, + "grad_norm": 0.9935173988342285, + "learning_rate": 1.658754200859711e-05, + "loss": 0.1722, + "step": 219400 + }, + { + "epoch": 17.141741507223742, + "grad_norm": 0.865006685256958, + "learning_rate": 1.6585978898007034e-05, + "loss": 0.1719, + "step": 219500 + }, + { + "epoch": 17.149550956657556, + "grad_norm": 0.891258180141449, + "learning_rate": 1.658441578741696e-05, + "loss": 0.1738, + "step": 219600 + }, + { + "epoch": 17.15736040609137, + "grad_norm": 0.9739299416542053, + "learning_rate": 1.6582852676826886e-05, + "loss": 0.1767, + "step": 219700 + }, + { + "epoch": 17.165169855525185, + "grad_norm": 1.140121340751648, + "learning_rate": 1.6581289566236812e-05, + "loss": 0.1784, + "step": 219800 + }, + { + "epoch": 17.172979304959, + "grad_norm": 0.9761224389076233, + "learning_rate": 1.6579726455646738e-05, + "loss": 0.1694, + "step": 219900 + }, + { + "epoch": 17.180788754392815, + "grad_norm": 0.8532379269599915, + "learning_rate": 1.6578163345056664e-05, + "loss": 0.1752, + "step": 220000 + }, + { + "epoch": 17.18859820382663, + "grad_norm": 0.6662416458129883, + "learning_rate": 1.657660023446659e-05, + "loss": 0.167, + "step": 220100 + }, + { + "epoch": 17.196407653260444, + "grad_norm": 1.090410828590393, + "learning_rate": 1.6575037123876516e-05, + "loss": 0.177, + "step": 220200 + }, + { + "epoch": 17.20421710269426, + "grad_norm": 0.9860948920249939, + "learning_rate": 1.6573474013286442e-05, + "loss": 0.1716, + "step": 220300 + }, + { + "epoch": 17.212026552128076, + "grad_norm": 0.8205731511116028, + "learning_rate": 1.6571910902696365e-05, + "loss": 0.1708, + "step": 220400 + }, + { + "epoch": 17.21983600156189, + "grad_norm": 0.636326014995575, + "learning_rate": 1.6570347792106294e-05, + "loss": 0.1775, + "step": 220500 + }, + { + "epoch": 17.227645450995706, + "grad_norm": 1.1403089761734009, + "learning_rate": 1.656878468151622e-05, + "loss": 0.1818, + "step": 220600 + }, + { + "epoch": 17.23545490042952, + "grad_norm": 0.9462027549743652, + "learning_rate": 1.6567221570926143e-05, + "loss": 0.1717, + "step": 220700 + }, + { + "epoch": 17.243264349863335, + "grad_norm": 0.8314661979675293, + "learning_rate": 1.656565846033607e-05, + "loss": 0.1801, + "step": 220800 + }, + { + "epoch": 17.25107379929715, + "grad_norm": 1.00192391872406, + "learning_rate": 1.6564110980851898e-05, + "loss": 0.1745, + "step": 220900 + }, + { + "epoch": 17.258883248730964, + "grad_norm": 1.1850578784942627, + "learning_rate": 1.656254787026182e-05, + "loss": 0.1785, + "step": 221000 + }, + { + "epoch": 17.26669269816478, + "grad_norm": 0.5880224108695984, + "learning_rate": 1.6560984759671747e-05, + "loss": 0.1702, + "step": 221100 + }, + { + "epoch": 17.274502147598593, + "grad_norm": 0.8309512734413147, + "learning_rate": 1.6559421649081673e-05, + "loss": 0.1808, + "step": 221200 + }, + { + "epoch": 17.282311597032407, + "grad_norm": 0.7761743664741516, + "learning_rate": 1.65578585384916e-05, + "loss": 0.1764, + "step": 221300 + }, + { + "epoch": 17.290121046466226, + "grad_norm": 0.9220330715179443, + "learning_rate": 1.6556295427901525e-05, + "loss": 0.1684, + "step": 221400 + }, + { + "epoch": 17.29793049590004, + "grad_norm": 1.2165334224700928, + "learning_rate": 1.655473231731145e-05, + "loss": 0.1791, + "step": 221500 + }, + { + "epoch": 17.305739945333855, + "grad_norm": 0.9512267112731934, + "learning_rate": 1.6553169206721377e-05, + "loss": 0.1687, + "step": 221600 + }, + { + "epoch": 17.31354939476767, + "grad_norm": 1.0094327926635742, + "learning_rate": 1.6551606096131303e-05, + "loss": 0.1698, + "step": 221700 + }, + { + "epoch": 17.321358844201484, + "grad_norm": 0.8161134123802185, + "learning_rate": 1.655004298554123e-05, + "loss": 0.1693, + "step": 221800 + }, + { + "epoch": 17.3291682936353, + "grad_norm": 0.7945289611816406, + "learning_rate": 1.654847987495115e-05, + "loss": 0.1821, + "step": 221900 + }, + { + "epoch": 17.336977743069113, + "grad_norm": 0.9875785708427429, + "learning_rate": 1.654691676436108e-05, + "loss": 0.1699, + "step": 222000 + }, + { + "epoch": 17.344787192502928, + "grad_norm": 1.033823847770691, + "learning_rate": 1.6545353653771007e-05, + "loss": 0.173, + "step": 222100 + }, + { + "epoch": 17.352596641936742, + "grad_norm": 0.7107641696929932, + "learning_rate": 1.654379054318093e-05, + "loss": 0.171, + "step": 222200 + }, + { + "epoch": 17.360406091370557, + "grad_norm": 0.6887959837913513, + "learning_rate": 1.6542227432590856e-05, + "loss": 0.17, + "step": 222300 + }, + { + "epoch": 17.368215540804375, + "grad_norm": 1.2243207693099976, + "learning_rate": 1.6540664322000785e-05, + "loss": 0.1704, + "step": 222400 + }, + { + "epoch": 17.37602499023819, + "grad_norm": 0.7111543416976929, + "learning_rate": 1.6539101211410708e-05, + "loss": 0.172, + "step": 222500 + }, + { + "epoch": 17.383834439672004, + "grad_norm": 1.120431661605835, + "learning_rate": 1.6537538100820634e-05, + "loss": 0.179, + "step": 222600 + }, + { + "epoch": 17.39164388910582, + "grad_norm": 1.0118180513381958, + "learning_rate": 1.653597499023056e-05, + "loss": 0.1721, + "step": 222700 + }, + { + "epoch": 17.399453338539633, + "grad_norm": 0.9543727040290833, + "learning_rate": 1.6534411879640486e-05, + "loss": 0.1648, + "step": 222800 + }, + { + "epoch": 17.407262787973448, + "grad_norm": 1.019272804260254, + "learning_rate": 1.6532848769050412e-05, + "loss": 0.1761, + "step": 222900 + }, + { + "epoch": 17.415072237407262, + "grad_norm": 0.8520439863204956, + "learning_rate": 1.6531301289566238e-05, + "loss": 0.1668, + "step": 223000 + }, + { + "epoch": 17.422881686841077, + "grad_norm": 0.9222218990325928, + "learning_rate": 1.6529738178976164e-05, + "loss": 0.1787, + "step": 223100 + }, + { + "epoch": 17.43069113627489, + "grad_norm": 0.8735765218734741, + "learning_rate": 1.652817506838609e-05, + "loss": 0.1759, + "step": 223200 + }, + { + "epoch": 17.438500585708706, + "grad_norm": 1.4969502687454224, + "learning_rate": 1.6526611957796016e-05, + "loss": 0.1734, + "step": 223300 + }, + { + "epoch": 17.446310035142524, + "grad_norm": 0.671943187713623, + "learning_rate": 1.652504884720594e-05, + "loss": 0.1759, + "step": 223400 + }, + { + "epoch": 17.45411948457634, + "grad_norm": 0.890876829624176, + "learning_rate": 1.6523485736615868e-05, + "loss": 0.1652, + "step": 223500 + }, + { + "epoch": 17.461928934010153, + "grad_norm": 0.6937717795372009, + "learning_rate": 1.6521922626025794e-05, + "loss": 0.163, + "step": 223600 + }, + { + "epoch": 17.469738383443968, + "grad_norm": 0.827542781829834, + "learning_rate": 1.6520359515435716e-05, + "loss": 0.176, + "step": 223700 + }, + { + "epoch": 17.477547832877782, + "grad_norm": 0.9324260354042053, + "learning_rate": 1.6518796404845646e-05, + "loss": 0.1692, + "step": 223800 + }, + { + "epoch": 17.485357282311597, + "grad_norm": 0.8106001615524292, + "learning_rate": 1.6517233294255572e-05, + "loss": 0.1728, + "step": 223900 + }, + { + "epoch": 17.49316673174541, + "grad_norm": 1.4987694025039673, + "learning_rate": 1.6515670183665494e-05, + "loss": 0.1747, + "step": 224000 + }, + { + "epoch": 17.500976181179226, + "grad_norm": 0.955620288848877, + "learning_rate": 1.651410707307542e-05, + "loss": 0.1772, + "step": 224100 + }, + { + "epoch": 17.50878563061304, + "grad_norm": 0.8453149199485779, + "learning_rate": 1.6512543962485346e-05, + "loss": 0.1774, + "step": 224200 + }, + { + "epoch": 17.516595080046855, + "grad_norm": 0.646578311920166, + "learning_rate": 1.6510980851895272e-05, + "loss": 0.1662, + "step": 224300 + }, + { + "epoch": 17.524404529480673, + "grad_norm": 1.0147279500961304, + "learning_rate": 1.65094177413052e-05, + "loss": 0.1761, + "step": 224400 + }, + { + "epoch": 17.532213978914488, + "grad_norm": 1.0600621700286865, + "learning_rate": 1.6507854630715124e-05, + "loss": 0.1601, + "step": 224500 + }, + { + "epoch": 17.540023428348302, + "grad_norm": 1.0022188425064087, + "learning_rate": 1.650629152012505e-05, + "loss": 0.1677, + "step": 224600 + }, + { + "epoch": 17.547832877782117, + "grad_norm": 0.8853134512901306, + "learning_rate": 1.6504728409534976e-05, + "loss": 0.1757, + "step": 224700 + }, + { + "epoch": 17.55564232721593, + "grad_norm": 0.8376500606536865, + "learning_rate": 1.6503165298944902e-05, + "loss": 0.1777, + "step": 224800 + }, + { + "epoch": 17.563451776649746, + "grad_norm": 1.0630090236663818, + "learning_rate": 1.650160218835483e-05, + "loss": 0.1755, + "step": 224900 + }, + { + "epoch": 17.57126122608356, + "grad_norm": 0.9697504639625549, + "learning_rate": 1.6500054708870654e-05, + "loss": 0.173, + "step": 225000 + }, + { + "epoch": 17.579070675517375, + "grad_norm": 1.2545424699783325, + "learning_rate": 1.649849159828058e-05, + "loss": 0.1761, + "step": 225100 + }, + { + "epoch": 17.58688012495119, + "grad_norm": 0.8433966636657715, + "learning_rate": 1.6496928487690503e-05, + "loss": 0.1701, + "step": 225200 + }, + { + "epoch": 17.594689574385004, + "grad_norm": 1.4100017547607422, + "learning_rate": 1.6495365377100432e-05, + "loss": 0.166, + "step": 225300 + }, + { + "epoch": 17.602499023818822, + "grad_norm": 1.0186513662338257, + "learning_rate": 1.649380226651036e-05, + "loss": 0.1715, + "step": 225400 + }, + { + "epoch": 17.610308473252637, + "grad_norm": 0.8596687912940979, + "learning_rate": 1.649223915592028e-05, + "loss": 0.1686, + "step": 225500 + }, + { + "epoch": 17.61811792268645, + "grad_norm": 0.9129221439361572, + "learning_rate": 1.649067604533021e-05, + "loss": 0.168, + "step": 225600 + }, + { + "epoch": 17.625927372120266, + "grad_norm": 1.0046827793121338, + "learning_rate": 1.6489112934740133e-05, + "loss": 0.1796, + "step": 225700 + }, + { + "epoch": 17.63373682155408, + "grad_norm": 0.8824147582054138, + "learning_rate": 1.648754982415006e-05, + "loss": 0.1545, + "step": 225800 + }, + { + "epoch": 17.641546270987895, + "grad_norm": 0.9992631077766418, + "learning_rate": 1.6485986713559985e-05, + "loss": 0.1662, + "step": 225900 + }, + { + "epoch": 17.64935572042171, + "grad_norm": 0.931957483291626, + "learning_rate": 1.648442360296991e-05, + "loss": 0.1631, + "step": 226000 + }, + { + "epoch": 17.657165169855524, + "grad_norm": 0.8518832325935364, + "learning_rate": 1.6482860492379837e-05, + "loss": 0.1639, + "step": 226100 + }, + { + "epoch": 17.66497461928934, + "grad_norm": 0.9512225985527039, + "learning_rate": 1.6481297381789763e-05, + "loss": 0.1745, + "step": 226200 + }, + { + "epoch": 17.672784068723153, + "grad_norm": 0.6881412267684937, + "learning_rate": 1.647973427119969e-05, + "loss": 0.1649, + "step": 226300 + }, + { + "epoch": 17.68059351815697, + "grad_norm": 1.2182663679122925, + "learning_rate": 1.6478171160609615e-05, + "loss": 0.1712, + "step": 226400 + }, + { + "epoch": 17.688402967590786, + "grad_norm": 0.7335953116416931, + "learning_rate": 1.647660805001954e-05, + "loss": 0.1709, + "step": 226500 + }, + { + "epoch": 17.6962124170246, + "grad_norm": 0.7070138454437256, + "learning_rate": 1.6475044939429464e-05, + "loss": 0.1826, + "step": 226600 + }, + { + "epoch": 17.704021866458415, + "grad_norm": 1.0415022373199463, + "learning_rate": 1.6473481828839393e-05, + "loss": 0.1709, + "step": 226700 + }, + { + "epoch": 17.71183131589223, + "grad_norm": 1.1055572032928467, + "learning_rate": 1.647191871824932e-05, + "loss": 0.1656, + "step": 226800 + }, + { + "epoch": 17.719640765326044, + "grad_norm": 0.989414393901825, + "learning_rate": 1.6470355607659242e-05, + "loss": 0.1657, + "step": 226900 + }, + { + "epoch": 17.72745021475986, + "grad_norm": 1.3034842014312744, + "learning_rate": 1.6468808128175068e-05, + "loss": 0.159, + "step": 227000 + }, + { + "epoch": 17.735259664193674, + "grad_norm": 0.7893176078796387, + "learning_rate": 1.6467245017584997e-05, + "loss": 0.1735, + "step": 227100 + }, + { + "epoch": 17.743069113627488, + "grad_norm": 0.9313830733299255, + "learning_rate": 1.646568190699492e-05, + "loss": 0.1645, + "step": 227200 + }, + { + "epoch": 17.750878563061303, + "grad_norm": 1.0136443376541138, + "learning_rate": 1.6464118796404846e-05, + "loss": 0.167, + "step": 227300 + }, + { + "epoch": 17.75868801249512, + "grad_norm": 0.9547396898269653, + "learning_rate": 1.6462555685814772e-05, + "loss": 0.1681, + "step": 227400 + }, + { + "epoch": 17.766497461928935, + "grad_norm": 1.1903905868530273, + "learning_rate": 1.6460992575224698e-05, + "loss": 0.1724, + "step": 227500 + }, + { + "epoch": 17.77430691136275, + "grad_norm": 0.9577434659004211, + "learning_rate": 1.6459429464634624e-05, + "loss": 0.1614, + "step": 227600 + }, + { + "epoch": 17.782116360796564, + "grad_norm": 0.9196197390556335, + "learning_rate": 1.645786635404455e-05, + "loss": 0.1749, + "step": 227700 + }, + { + "epoch": 17.78992581023038, + "grad_norm": 0.8257341980934143, + "learning_rate": 1.6456303243454476e-05, + "loss": 0.1644, + "step": 227800 + }, + { + "epoch": 17.797735259664194, + "grad_norm": 1.012330412864685, + "learning_rate": 1.6454740132864402e-05, + "loss": 0.1614, + "step": 227900 + }, + { + "epoch": 17.805544709098008, + "grad_norm": 0.7949550747871399, + "learning_rate": 1.6453177022274328e-05, + "loss": 0.1682, + "step": 228000 + }, + { + "epoch": 17.813354158531823, + "grad_norm": 0.9207527041435242, + "learning_rate": 1.645161391168425e-05, + "loss": 0.1636, + "step": 228100 + }, + { + "epoch": 17.821163607965637, + "grad_norm": 0.742048978805542, + "learning_rate": 1.645005080109418e-05, + "loss": 0.1577, + "step": 228200 + }, + { + "epoch": 17.828973057399452, + "grad_norm": 1.159746766090393, + "learning_rate": 1.6448487690504106e-05, + "loss": 0.171, + "step": 228300 + }, + { + "epoch": 17.83678250683327, + "grad_norm": 0.996147871017456, + "learning_rate": 1.644692457991403e-05, + "loss": 0.1661, + "step": 228400 + }, + { + "epoch": 17.844591956267084, + "grad_norm": 0.8791880011558533, + "learning_rate": 1.6445361469323955e-05, + "loss": 0.1665, + "step": 228500 + }, + { + "epoch": 17.8524014057009, + "grad_norm": 0.823704183101654, + "learning_rate": 1.6443798358733884e-05, + "loss": 0.1632, + "step": 228600 + }, + { + "epoch": 17.860210855134714, + "grad_norm": 0.6628607511520386, + "learning_rate": 1.6442235248143807e-05, + "loss": 0.1698, + "step": 228700 + }, + { + "epoch": 17.868020304568528, + "grad_norm": 0.931252658367157, + "learning_rate": 1.6440672137553733e-05, + "loss": 0.1741, + "step": 228800 + }, + { + "epoch": 17.875829754002343, + "grad_norm": 0.8795222640037537, + "learning_rate": 1.643910902696366e-05, + "loss": 0.1704, + "step": 228900 + }, + { + "epoch": 17.883639203436157, + "grad_norm": 0.7636140584945679, + "learning_rate": 1.6437545916373585e-05, + "loss": 0.1631, + "step": 229000 + }, + { + "epoch": 17.891448652869972, + "grad_norm": 0.8694945573806763, + "learning_rate": 1.643598280578351e-05, + "loss": 0.1684, + "step": 229100 + }, + { + "epoch": 17.899258102303786, + "grad_norm": 0.8268353343009949, + "learning_rate": 1.6434435326299337e-05, + "loss": 0.1762, + "step": 229200 + }, + { + "epoch": 17.9070675517376, + "grad_norm": 0.9662846326828003, + "learning_rate": 1.6432872215709263e-05, + "loss": 0.1773, + "step": 229300 + }, + { + "epoch": 17.91487700117142, + "grad_norm": 0.6733610033988953, + "learning_rate": 1.643130910511919e-05, + "loss": 0.1744, + "step": 229400 + }, + { + "epoch": 17.922686450605234, + "grad_norm": 0.9901676177978516, + "learning_rate": 1.6429745994529115e-05, + "loss": 0.1633, + "step": 229500 + }, + { + "epoch": 17.93049590003905, + "grad_norm": 0.8217541575431824, + "learning_rate": 1.642818288393904e-05, + "loss": 0.1585, + "step": 229600 + }, + { + "epoch": 17.938305349472863, + "grad_norm": 1.0147716999053955, + "learning_rate": 1.6426619773348967e-05, + "loss": 0.166, + "step": 229700 + }, + { + "epoch": 17.946114798906677, + "grad_norm": 0.9158501029014587, + "learning_rate": 1.6425056662758893e-05, + "loss": 0.1677, + "step": 229800 + }, + { + "epoch": 17.953924248340492, + "grad_norm": 0.7738738059997559, + "learning_rate": 1.6423493552168815e-05, + "loss": 0.1697, + "step": 229900 + }, + { + "epoch": 17.961733697774307, + "grad_norm": 0.766834557056427, + "learning_rate": 1.6421930441578745e-05, + "loss": 0.1657, + "step": 230000 + }, + { + "epoch": 17.96954314720812, + "grad_norm": 1.1993781328201294, + "learning_rate": 1.642036733098867e-05, + "loss": 0.1699, + "step": 230100 + }, + { + "epoch": 17.977352596641936, + "grad_norm": 0.7733214497566223, + "learning_rate": 1.6418804220398593e-05, + "loss": 0.1666, + "step": 230200 + }, + { + "epoch": 17.98516204607575, + "grad_norm": 0.8509901762008667, + "learning_rate": 1.641724110980852e-05, + "loss": 0.1682, + "step": 230300 + }, + { + "epoch": 17.992971495509565, + "grad_norm": 0.775961697101593, + "learning_rate": 1.6415677999218445e-05, + "loss": 0.1595, + "step": 230400 + }, + { + "epoch": 18.000780944943383, + "grad_norm": 0.824718177318573, + "learning_rate": 1.641411488862837e-05, + "loss": 0.1674, + "step": 230500 + }, + { + "epoch": 18.008590394377197, + "grad_norm": 0.5882493257522583, + "learning_rate": 1.6412551778038297e-05, + "loss": 0.1658, + "step": 230600 + }, + { + "epoch": 18.016399843811012, + "grad_norm": 0.9169065356254578, + "learning_rate": 1.6410988667448223e-05, + "loss": 0.1612, + "step": 230700 + }, + { + "epoch": 18.024209293244827, + "grad_norm": 0.9664234519004822, + "learning_rate": 1.640942555685815e-05, + "loss": 0.1662, + "step": 230800 + }, + { + "epoch": 18.03201874267864, + "grad_norm": 0.6878706812858582, + "learning_rate": 1.6407862446268075e-05, + "loss": 0.1701, + "step": 230900 + }, + { + "epoch": 18.039828192112456, + "grad_norm": 1.0365025997161865, + "learning_rate": 1.6406299335678e-05, + "loss": 0.1694, + "step": 231000 + }, + { + "epoch": 18.04763764154627, + "grad_norm": 0.8461370468139648, + "learning_rate": 1.6404736225087927e-05, + "loss": 0.1723, + "step": 231100 + }, + { + "epoch": 18.055447090980085, + "grad_norm": 0.8166918158531189, + "learning_rate": 1.6403188745603753e-05, + "loss": 0.1703, + "step": 231200 + }, + { + "epoch": 18.0632565404139, + "grad_norm": 0.9236775636672974, + "learning_rate": 1.640162563501368e-05, + "loss": 0.1714, + "step": 231300 + }, + { + "epoch": 18.071065989847718, + "grad_norm": 1.0226106643676758, + "learning_rate": 1.6400062524423602e-05, + "loss": 0.171, + "step": 231400 + }, + { + "epoch": 18.078875439281532, + "grad_norm": 0.6999830603599548, + "learning_rate": 1.639849941383353e-05, + "loss": 0.1631, + "step": 231500 + }, + { + "epoch": 18.086684888715347, + "grad_norm": 0.7705897092819214, + "learning_rate": 1.6396936303243457e-05, + "loss": 0.1755, + "step": 231600 + }, + { + "epoch": 18.09449433814916, + "grad_norm": 0.8058556914329529, + "learning_rate": 1.639537319265338e-05, + "loss": 0.1762, + "step": 231700 + }, + { + "epoch": 18.102303787582976, + "grad_norm": 1.0926358699798584, + "learning_rate": 1.639381008206331e-05, + "loss": 0.1622, + "step": 231800 + }, + { + "epoch": 18.11011323701679, + "grad_norm": 0.9904764294624329, + "learning_rate": 1.6392246971473232e-05, + "loss": 0.1731, + "step": 231900 + }, + { + "epoch": 18.117922686450605, + "grad_norm": 0.8789817690849304, + "learning_rate": 1.6390683860883158e-05, + "loss": 0.1664, + "step": 232000 + }, + { + "epoch": 18.12573213588442, + "grad_norm": 0.8609523773193359, + "learning_rate": 1.6389120750293084e-05, + "loss": 0.1774, + "step": 232100 + }, + { + "epoch": 18.133541585318234, + "grad_norm": 0.929350733757019, + "learning_rate": 1.638755763970301e-05, + "loss": 0.1655, + "step": 232200 + }, + { + "epoch": 18.14135103475205, + "grad_norm": 1.0220617055892944, + "learning_rate": 1.6385994529112936e-05, + "loss": 0.1588, + "step": 232300 + }, + { + "epoch": 18.149160484185863, + "grad_norm": 0.9037083983421326, + "learning_rate": 1.6384431418522862e-05, + "loss": 0.1674, + "step": 232400 + }, + { + "epoch": 18.15696993361968, + "grad_norm": 1.134727120399475, + "learning_rate": 1.6382868307932788e-05, + "loss": 0.1771, + "step": 232500 + }, + { + "epoch": 18.164779383053496, + "grad_norm": 0.8640575408935547, + "learning_rate": 1.6381305197342714e-05, + "loss": 0.1626, + "step": 232600 + }, + { + "epoch": 18.17258883248731, + "grad_norm": 1.0782209634780884, + "learning_rate": 1.637974208675264e-05, + "loss": 0.1618, + "step": 232700 + }, + { + "epoch": 18.180398281921125, + "grad_norm": 0.8964031338691711, + "learning_rate": 1.6378178976162563e-05, + "loss": 0.163, + "step": 232800 + }, + { + "epoch": 18.18820773135494, + "grad_norm": 0.8061158657073975, + "learning_rate": 1.6376615865572492e-05, + "loss": 0.1672, + "step": 232900 + }, + { + "epoch": 18.196017180788754, + "grad_norm": 0.7686555981636047, + "learning_rate": 1.6375052754982418e-05, + "loss": 0.1631, + "step": 233000 + }, + { + "epoch": 18.20382663022257, + "grad_norm": 0.8305268883705139, + "learning_rate": 1.637348964439234e-05, + "loss": 0.1595, + "step": 233100 + }, + { + "epoch": 18.211636079656383, + "grad_norm": 0.7960754036903381, + "learning_rate": 1.6371942164908167e-05, + "loss": 0.1761, + "step": 233200 + }, + { + "epoch": 18.219445529090198, + "grad_norm": 0.9720467329025269, + "learning_rate": 1.6370379054318096e-05, + "loss": 0.1647, + "step": 233300 + }, + { + "epoch": 18.227254978524012, + "grad_norm": 0.7741464376449585, + "learning_rate": 1.636881594372802e-05, + "loss": 0.1609, + "step": 233400 + }, + { + "epoch": 18.23506442795783, + "grad_norm": 0.7279603481292725, + "learning_rate": 1.6367252833137945e-05, + "loss": 0.1622, + "step": 233500 + }, + { + "epoch": 18.242873877391645, + "grad_norm": 1.0035083293914795, + "learning_rate": 1.636568972254787e-05, + "loss": 0.1674, + "step": 233600 + }, + { + "epoch": 18.25068332682546, + "grad_norm": 0.6845473647117615, + "learning_rate": 1.6364126611957797e-05, + "loss": 0.176, + "step": 233700 + }, + { + "epoch": 18.258492776259274, + "grad_norm": 0.8481450080871582, + "learning_rate": 1.6362563501367723e-05, + "loss": 0.1642, + "step": 233800 + }, + { + "epoch": 18.26630222569309, + "grad_norm": 0.8318843245506287, + "learning_rate": 1.636100039077765e-05, + "loss": 0.1614, + "step": 233900 + }, + { + "epoch": 18.274111675126903, + "grad_norm": 0.7613986134529114, + "learning_rate": 1.6359437280187575e-05, + "loss": 0.1693, + "step": 234000 + }, + { + "epoch": 18.281921124560718, + "grad_norm": 0.8641628623008728, + "learning_rate": 1.63578741695975e-05, + "loss": 0.1605, + "step": 234100 + }, + { + "epoch": 18.289730573994532, + "grad_norm": 0.7250987887382507, + "learning_rate": 1.6356311059007427e-05, + "loss": 0.157, + "step": 234200 + }, + { + "epoch": 18.297540023428347, + "grad_norm": 0.6812145709991455, + "learning_rate": 1.635474794841735e-05, + "loss": 0.1586, + "step": 234300 + }, + { + "epoch": 18.30534947286216, + "grad_norm": 0.8246647119522095, + "learning_rate": 1.635318483782728e-05, + "loss": 0.1649, + "step": 234400 + }, + { + "epoch": 18.31315892229598, + "grad_norm": 0.8091587424278259, + "learning_rate": 1.6351621727237205e-05, + "loss": 0.1661, + "step": 234500 + }, + { + "epoch": 18.320968371729794, + "grad_norm": 0.8854368329048157, + "learning_rate": 1.6350058616647127e-05, + "loss": 0.1656, + "step": 234600 + }, + { + "epoch": 18.32877782116361, + "grad_norm": 0.8019761443138123, + "learning_rate": 1.6348495506057053e-05, + "loss": 0.1645, + "step": 234700 + }, + { + "epoch": 18.336587270597423, + "grad_norm": 0.9290599226951599, + "learning_rate": 1.6346932395466983e-05, + "loss": 0.167, + "step": 234800 + }, + { + "epoch": 18.344396720031238, + "grad_norm": 1.1367971897125244, + "learning_rate": 1.6345369284876905e-05, + "loss": 0.1552, + "step": 234900 + }, + { + "epoch": 18.352206169465052, + "grad_norm": 0.8118088841438293, + "learning_rate": 1.634380617428683e-05, + "loss": 0.1651, + "step": 235000 + }, + { + "epoch": 18.360015618898867, + "grad_norm": 1.0581218004226685, + "learning_rate": 1.6342243063696757e-05, + "loss": 0.1642, + "step": 235100 + }, + { + "epoch": 18.36782506833268, + "grad_norm": 0.7852098941802979, + "learning_rate": 1.6340695584212583e-05, + "loss": 0.1621, + "step": 235200 + }, + { + "epoch": 18.375634517766496, + "grad_norm": 0.8468132019042969, + "learning_rate": 1.633913247362251e-05, + "loss": 0.1517, + "step": 235300 + }, + { + "epoch": 18.38344396720031, + "grad_norm": 0.9250136613845825, + "learning_rate": 1.6337569363032435e-05, + "loss": 0.1654, + "step": 235400 + }, + { + "epoch": 18.39125341663413, + "grad_norm": 0.8608556389808655, + "learning_rate": 1.633600625244236e-05, + "loss": 0.1651, + "step": 235500 + }, + { + "epoch": 18.399062866067943, + "grad_norm": 0.9423208236694336, + "learning_rate": 1.6334443141852287e-05, + "loss": 0.1694, + "step": 235600 + }, + { + "epoch": 18.406872315501758, + "grad_norm": 1.0843865871429443, + "learning_rate": 1.6332880031262213e-05, + "loss": 0.1657, + "step": 235700 + }, + { + "epoch": 18.414681764935573, + "grad_norm": 0.6090073585510254, + "learning_rate": 1.633131692067214e-05, + "loss": 0.1601, + "step": 235800 + }, + { + "epoch": 18.422491214369387, + "grad_norm": 0.8230893015861511, + "learning_rate": 1.6329753810082065e-05, + "loss": 0.1647, + "step": 235900 + }, + { + "epoch": 18.4303006638032, + "grad_norm": 0.7741209268569946, + "learning_rate": 1.632819069949199e-05, + "loss": 0.1622, + "step": 236000 + }, + { + "epoch": 18.438110113237016, + "grad_norm": 0.8756573796272278, + "learning_rate": 1.6326627588901914e-05, + "loss": 0.1655, + "step": 236100 + }, + { + "epoch": 18.44591956267083, + "grad_norm": 1.0780562162399292, + "learning_rate": 1.6325064478311844e-05, + "loss": 0.1695, + "step": 236200 + }, + { + "epoch": 18.453729012104645, + "grad_norm": 1.0400506258010864, + "learning_rate": 1.632350136772177e-05, + "loss": 0.166, + "step": 236300 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 1.2005187273025513, + "learning_rate": 1.6321938257131692e-05, + "loss": 0.1739, + "step": 236400 + }, + { + "epoch": 18.469347910972278, + "grad_norm": 0.978459358215332, + "learning_rate": 1.6320375146541618e-05, + "loss": 0.1632, + "step": 236500 + }, + { + "epoch": 18.477157360406093, + "grad_norm": 0.8813901543617249, + "learning_rate": 1.6318812035951544e-05, + "loss": 0.167, + "step": 236600 + }, + { + "epoch": 18.484966809839907, + "grad_norm": 0.8949891924858093, + "learning_rate": 1.631724892536147e-05, + "loss": 0.1684, + "step": 236700 + }, + { + "epoch": 18.49277625927372, + "grad_norm": 0.8135596513748169, + "learning_rate": 1.6315685814771396e-05, + "loss": 0.159, + "step": 236800 + }, + { + "epoch": 18.500585708707536, + "grad_norm": 0.739747166633606, + "learning_rate": 1.6314122704181322e-05, + "loss": 0.1664, + "step": 236900 + }, + { + "epoch": 18.50839515814135, + "grad_norm": 0.9250356554985046, + "learning_rate": 1.6312559593591248e-05, + "loss": 0.1581, + "step": 237000 + }, + { + "epoch": 18.516204607575165, + "grad_norm": 1.1903828382492065, + "learning_rate": 1.6310996483001174e-05, + "loss": 0.16, + "step": 237100 + }, + { + "epoch": 18.52401405700898, + "grad_norm": 0.8911426663398743, + "learning_rate": 1.63094333724111e-05, + "loss": 0.1595, + "step": 237200 + }, + { + "epoch": 18.531823506442795, + "grad_norm": 0.7672592401504517, + "learning_rate": 1.6307885892926926e-05, + "loss": 0.1678, + "step": 237300 + }, + { + "epoch": 18.53963295587661, + "grad_norm": 0.8545451760292053, + "learning_rate": 1.6306322782336852e-05, + "loss": 0.1595, + "step": 237400 + }, + { + "epoch": 18.547442405310427, + "grad_norm": 1.0210609436035156, + "learning_rate": 1.6304759671746778e-05, + "loss": 0.1622, + "step": 237500 + }, + { + "epoch": 18.55525185474424, + "grad_norm": 0.6720797419548035, + "learning_rate": 1.63031965611567e-05, + "loss": 0.1748, + "step": 237600 + }, + { + "epoch": 18.563061304178056, + "grad_norm": 1.0264678001403809, + "learning_rate": 1.630163345056663e-05, + "loss": 0.1624, + "step": 237700 + }, + { + "epoch": 18.57087075361187, + "grad_norm": 1.1977595090866089, + "learning_rate": 1.6300070339976556e-05, + "loss": 0.1536, + "step": 237800 + }, + { + "epoch": 18.578680203045685, + "grad_norm": 0.7712870836257935, + "learning_rate": 1.629850722938648e-05, + "loss": 0.1679, + "step": 237900 + }, + { + "epoch": 18.5864896524795, + "grad_norm": 0.8661043643951416, + "learning_rate": 1.6296944118796408e-05, + "loss": 0.1581, + "step": 238000 + }, + { + "epoch": 18.594299101913315, + "grad_norm": 0.8155229687690735, + "learning_rate": 1.629538100820633e-05, + "loss": 0.1616, + "step": 238100 + }, + { + "epoch": 18.60210855134713, + "grad_norm": 1.1041407585144043, + "learning_rate": 1.6293817897616257e-05, + "loss": 0.1591, + "step": 238200 + }, + { + "epoch": 18.609918000780944, + "grad_norm": 0.8246005773544312, + "learning_rate": 1.6292254787026183e-05, + "loss": 0.1606, + "step": 238300 + }, + { + "epoch": 18.61772745021476, + "grad_norm": 0.8949695229530334, + "learning_rate": 1.629069167643611e-05, + "loss": 0.1591, + "step": 238400 + }, + { + "epoch": 18.625536899648576, + "grad_norm": 1.056926965713501, + "learning_rate": 1.6289128565846035e-05, + "loss": 0.1557, + "step": 238500 + }, + { + "epoch": 18.63334634908239, + "grad_norm": 0.9467104077339172, + "learning_rate": 1.628756545525596e-05, + "loss": 0.1654, + "step": 238600 + }, + { + "epoch": 18.641155798516206, + "grad_norm": 0.8194491863250732, + "learning_rate": 1.6286002344665887e-05, + "loss": 0.1629, + "step": 238700 + }, + { + "epoch": 18.64896524795002, + "grad_norm": 0.8596170544624329, + "learning_rate": 1.6284439234075813e-05, + "loss": 0.1506, + "step": 238800 + }, + { + "epoch": 18.656774697383835, + "grad_norm": 0.8728044629096985, + "learning_rate": 1.628287612348574e-05, + "loss": 0.1677, + "step": 238900 + }, + { + "epoch": 18.66458414681765, + "grad_norm": 0.7847612500190735, + "learning_rate": 1.628131301289566e-05, + "loss": 0.1625, + "step": 239000 + }, + { + "epoch": 18.672393596251464, + "grad_norm": 0.9129346609115601, + "learning_rate": 1.627974990230559e-05, + "loss": 0.1501, + "step": 239100 + }, + { + "epoch": 18.68020304568528, + "grad_norm": 0.6726586818695068, + "learning_rate": 1.6278186791715517e-05, + "loss": 0.1712, + "step": 239200 + }, + { + "epoch": 18.688012495119093, + "grad_norm": 1.2120012044906616, + "learning_rate": 1.6276639312231343e-05, + "loss": 0.175, + "step": 239300 + }, + { + "epoch": 18.695821944552907, + "grad_norm": 0.7493081092834473, + "learning_rate": 1.6275076201641266e-05, + "loss": 0.1604, + "step": 239400 + }, + { + "epoch": 18.703631393986726, + "grad_norm": 1.1694607734680176, + "learning_rate": 1.6273513091051195e-05, + "loss": 0.1605, + "step": 239500 + }, + { + "epoch": 18.71144084342054, + "grad_norm": 0.7457549571990967, + "learning_rate": 1.6271949980461118e-05, + "loss": 0.1635, + "step": 239600 + }, + { + "epoch": 18.719250292854355, + "grad_norm": 1.2835865020751953, + "learning_rate": 1.6270386869871044e-05, + "loss": 0.1624, + "step": 239700 + }, + { + "epoch": 18.72705974228817, + "grad_norm": 0.6410244703292847, + "learning_rate": 1.626882375928097e-05, + "loss": 0.165, + "step": 239800 + }, + { + "epoch": 18.734869191721984, + "grad_norm": 0.8569579124450684, + "learning_rate": 1.6267260648690896e-05, + "loss": 0.1606, + "step": 239900 + }, + { + "epoch": 18.7426786411558, + "grad_norm": 0.994696319103241, + "learning_rate": 1.626569753810082e-05, + "loss": 0.1603, + "step": 240000 + }, + { + "epoch": 18.750488090589613, + "grad_norm": 0.7902507185935974, + "learning_rate": 1.6264134427510748e-05, + "loss": 0.1638, + "step": 240100 + }, + { + "epoch": 18.758297540023428, + "grad_norm": 0.7056221961975098, + "learning_rate": 1.6262571316920674e-05, + "loss": 0.162, + "step": 240200 + }, + { + "epoch": 18.766106989457242, + "grad_norm": 0.7884091138839722, + "learning_rate": 1.62610082063306e-05, + "loss": 0.1557, + "step": 240300 + }, + { + "epoch": 18.773916438891057, + "grad_norm": 0.782772958278656, + "learning_rate": 1.6259445095740526e-05, + "loss": 0.1711, + "step": 240400 + }, + { + "epoch": 18.781725888324875, + "grad_norm": 0.9223241209983826, + "learning_rate": 1.6257881985150448e-05, + "loss": 0.1638, + "step": 240500 + }, + { + "epoch": 18.78953533775869, + "grad_norm": 0.6795259714126587, + "learning_rate": 1.6256318874560378e-05, + "loss": 0.1583, + "step": 240600 + }, + { + "epoch": 18.797344787192504, + "grad_norm": 0.917625367641449, + "learning_rate": 1.6254755763970304e-05, + "loss": 0.1596, + "step": 240700 + }, + { + "epoch": 18.80515423662632, + "grad_norm": 0.6310639977455139, + "learning_rate": 1.6253192653380226e-05, + "loss": 0.1593, + "step": 240800 + }, + { + "epoch": 18.812963686060133, + "grad_norm": 0.914989173412323, + "learning_rate": 1.6251629542790152e-05, + "loss": 0.1614, + "step": 240900 + }, + { + "epoch": 18.820773135493948, + "grad_norm": 0.8099808692932129, + "learning_rate": 1.6250066432200082e-05, + "loss": 0.1567, + "step": 241000 + }, + { + "epoch": 18.828582584927762, + "grad_norm": 0.812492311000824, + "learning_rate": 1.6248503321610004e-05, + "loss": 0.1642, + "step": 241100 + }, + { + "epoch": 18.836392034361577, + "grad_norm": 1.177435040473938, + "learning_rate": 1.624694021101993e-05, + "loss": 0.172, + "step": 241200 + }, + { + "epoch": 18.84420148379539, + "grad_norm": 0.8142099380493164, + "learning_rate": 1.624539273153576e-05, + "loss": 0.1616, + "step": 241300 + }, + { + "epoch": 18.852010933229206, + "grad_norm": 0.7500884532928467, + "learning_rate": 1.6243829620945682e-05, + "loss": 0.1631, + "step": 241400 + }, + { + "epoch": 18.859820382663024, + "grad_norm": 0.670933187007904, + "learning_rate": 1.624226651035561e-05, + "loss": 0.1509, + "step": 241500 + }, + { + "epoch": 18.86762983209684, + "grad_norm": 0.8267391920089722, + "learning_rate": 1.6240703399765534e-05, + "loss": 0.1634, + "step": 241600 + }, + { + "epoch": 18.875439281530653, + "grad_norm": 0.8886703848838806, + "learning_rate": 1.623914028917546e-05, + "loss": 0.1546, + "step": 241700 + }, + { + "epoch": 18.883248730964468, + "grad_norm": 0.6933555603027344, + "learning_rate": 1.6237577178585386e-05, + "loss": 0.1683, + "step": 241800 + }, + { + "epoch": 18.891058180398282, + "grad_norm": 1.125935435295105, + "learning_rate": 1.6236014067995312e-05, + "loss": 0.1701, + "step": 241900 + }, + { + "epoch": 18.898867629832097, + "grad_norm": 0.9346698522567749, + "learning_rate": 1.623445095740524e-05, + "loss": 0.1591, + "step": 242000 + }, + { + "epoch": 18.90667707926591, + "grad_norm": 1.0114465951919556, + "learning_rate": 1.6232887846815164e-05, + "loss": 0.1556, + "step": 242100 + }, + { + "epoch": 18.914486528699726, + "grad_norm": 1.0575963258743286, + "learning_rate": 1.623132473622509e-05, + "loss": 0.1669, + "step": 242200 + }, + { + "epoch": 18.92229597813354, + "grad_norm": 1.0619617700576782, + "learning_rate": 1.6229761625635013e-05, + "loss": 0.1678, + "step": 242300 + }, + { + "epoch": 18.930105427567355, + "grad_norm": 0.8855568170547485, + "learning_rate": 1.6228198515044942e-05, + "loss": 0.1565, + "step": 242400 + }, + { + "epoch": 18.93791487700117, + "grad_norm": 0.8218313455581665, + "learning_rate": 1.622663540445487e-05, + "loss": 0.1583, + "step": 242500 + }, + { + "epoch": 18.945724326434988, + "grad_norm": 0.7416090965270996, + "learning_rate": 1.622507229386479e-05, + "loss": 0.164, + "step": 242600 + }, + { + "epoch": 18.953533775868802, + "grad_norm": 1.218461513519287, + "learning_rate": 1.6223509183274717e-05, + "loss": 0.1572, + "step": 242700 + }, + { + "epoch": 18.961343225302617, + "grad_norm": 0.6875521540641785, + "learning_rate": 1.6221946072684643e-05, + "loss": 0.1533, + "step": 242800 + }, + { + "epoch": 18.96915267473643, + "grad_norm": 0.8290515542030334, + "learning_rate": 1.622038296209457e-05, + "loss": 0.1607, + "step": 242900 + }, + { + "epoch": 18.976962124170246, + "grad_norm": 1.013169765472412, + "learning_rate": 1.6218819851504495e-05, + "loss": 0.1732, + "step": 243000 + }, + { + "epoch": 18.98477157360406, + "grad_norm": 0.86373370885849, + "learning_rate": 1.621725674091442e-05, + "loss": 0.1696, + "step": 243100 + }, + { + "epoch": 18.992581023037875, + "grad_norm": 0.6753338575363159, + "learning_rate": 1.6215693630324347e-05, + "loss": 0.1519, + "step": 243200 + }, + { + "epoch": 19.00039047247169, + "grad_norm": 0.7721323370933533, + "learning_rate": 1.6214130519734273e-05, + "loss": 0.1585, + "step": 243300 + }, + { + "epoch": 19.008199921905504, + "grad_norm": 1.0071583986282349, + "learning_rate": 1.62125830402501e-05, + "loss": 0.1591, + "step": 243400 + }, + { + "epoch": 19.016009371339322, + "grad_norm": 0.9117600917816162, + "learning_rate": 1.6211019929660025e-05, + "loss": 0.1604, + "step": 243500 + }, + { + "epoch": 19.023818820773137, + "grad_norm": 0.8695400953292847, + "learning_rate": 1.620945681906995e-05, + "loss": 0.1517, + "step": 243600 + }, + { + "epoch": 19.03162827020695, + "grad_norm": 0.7234436273574829, + "learning_rate": 1.6207893708479877e-05, + "loss": 0.1581, + "step": 243700 + }, + { + "epoch": 19.039437719640766, + "grad_norm": 0.7975166440010071, + "learning_rate": 1.62063305978898e-05, + "loss": 0.1518, + "step": 243800 + }, + { + "epoch": 19.04724716907458, + "grad_norm": 1.156203031539917, + "learning_rate": 1.620476748729973e-05, + "loss": 0.1564, + "step": 243900 + }, + { + "epoch": 19.055056618508395, + "grad_norm": 0.9850621223449707, + "learning_rate": 1.6203204376709655e-05, + "loss": 0.1479, + "step": 244000 + }, + { + "epoch": 19.06286606794221, + "grad_norm": 0.7504763007164001, + "learning_rate": 1.6201641266119578e-05, + "loss": 0.1555, + "step": 244100 + }, + { + "epoch": 19.070675517376024, + "grad_norm": 0.8326386213302612, + "learning_rate": 1.6200078155529507e-05, + "loss": 0.154, + "step": 244200 + }, + { + "epoch": 19.07848496680984, + "grad_norm": 1.0703577995300293, + "learning_rate": 1.619851504493943e-05, + "loss": 0.1604, + "step": 244300 + }, + { + "epoch": 19.086294416243653, + "grad_norm": 1.010055422782898, + "learning_rate": 1.6196951934349356e-05, + "loss": 0.1693, + "step": 244400 + }, + { + "epoch": 19.094103865677468, + "grad_norm": 0.8883320093154907, + "learning_rate": 1.6195388823759282e-05, + "loss": 0.1547, + "step": 244500 + }, + { + "epoch": 19.101913315111286, + "grad_norm": 0.8305409550666809, + "learning_rate": 1.6193825713169208e-05, + "loss": 0.1679, + "step": 244600 + }, + { + "epoch": 19.1097227645451, + "grad_norm": 0.7261273860931396, + "learning_rate": 1.6192262602579134e-05, + "loss": 0.1602, + "step": 244700 + }, + { + "epoch": 19.117532213978915, + "grad_norm": 0.8075242638587952, + "learning_rate": 1.619069949198906e-05, + "loss": 0.1612, + "step": 244800 + }, + { + "epoch": 19.12534166341273, + "grad_norm": 1.0095022916793823, + "learning_rate": 1.6189136381398986e-05, + "loss": 0.1636, + "step": 244900 + }, + { + "epoch": 19.133151112846544, + "grad_norm": 0.7797873020172119, + "learning_rate": 1.6187573270808912e-05, + "loss": 0.1508, + "step": 245000 + }, + { + "epoch": 19.14096056228036, + "grad_norm": 0.7069320678710938, + "learning_rate": 1.6186010160218838e-05, + "loss": 0.1545, + "step": 245100 + }, + { + "epoch": 19.148770011714173, + "grad_norm": 0.7221994996070862, + "learning_rate": 1.618444704962876e-05, + "loss": 0.1483, + "step": 245200 + }, + { + "epoch": 19.156579461147988, + "grad_norm": 0.9450590014457703, + "learning_rate": 1.618288393903869e-05, + "loss": 0.1672, + "step": 245300 + }, + { + "epoch": 19.164388910581803, + "grad_norm": 0.7580748200416565, + "learning_rate": 1.6181336459554516e-05, + "loss": 0.1685, + "step": 245400 + }, + { + "epoch": 19.172198360015617, + "grad_norm": 0.9807697534561157, + "learning_rate": 1.6179773348964442e-05, + "loss": 0.1631, + "step": 245500 + }, + { + "epoch": 19.180007809449435, + "grad_norm": 0.7284419536590576, + "learning_rate": 1.6178210238374364e-05, + "loss": 0.1618, + "step": 245600 + }, + { + "epoch": 19.18781725888325, + "grad_norm": 1.0209381580352783, + "learning_rate": 1.6176647127784294e-05, + "loss": 0.1584, + "step": 245700 + }, + { + "epoch": 19.195626708317064, + "grad_norm": 0.763217031955719, + "learning_rate": 1.6175084017194216e-05, + "loss": 0.1601, + "step": 245800 + }, + { + "epoch": 19.20343615775088, + "grad_norm": 0.7209915518760681, + "learning_rate": 1.6173520906604142e-05, + "loss": 0.1591, + "step": 245900 + }, + { + "epoch": 19.211245607184694, + "grad_norm": 0.8466573357582092, + "learning_rate": 1.617195779601407e-05, + "loss": 0.161, + "step": 246000 + }, + { + "epoch": 19.219055056618508, + "grad_norm": 0.739590048789978, + "learning_rate": 1.6170394685423995e-05, + "loss": 0.1595, + "step": 246100 + }, + { + "epoch": 19.226864506052323, + "grad_norm": 0.8309229612350464, + "learning_rate": 1.616883157483392e-05, + "loss": 0.1539, + "step": 246200 + }, + { + "epoch": 19.234673955486137, + "grad_norm": 0.9524125456809998, + "learning_rate": 1.6167268464243847e-05, + "loss": 0.1574, + "step": 246300 + }, + { + "epoch": 19.242483404919952, + "grad_norm": 0.79534512758255, + "learning_rate": 1.6165705353653773e-05, + "loss": 0.1537, + "step": 246400 + }, + { + "epoch": 19.250292854353766, + "grad_norm": 0.698271632194519, + "learning_rate": 1.61641422430637e-05, + "loss": 0.1624, + "step": 246500 + }, + { + "epoch": 19.258102303787584, + "grad_norm": 1.0599291324615479, + "learning_rate": 1.6162579132473625e-05, + "loss": 0.1646, + "step": 246600 + }, + { + "epoch": 19.2659117532214, + "grad_norm": 0.7981654405593872, + "learning_rate": 1.6161016021883547e-05, + "loss": 0.1572, + "step": 246700 + }, + { + "epoch": 19.273721202655214, + "grad_norm": 1.2136812210083008, + "learning_rate": 1.6159452911293477e-05, + "loss": 0.1589, + "step": 246800 + }, + { + "epoch": 19.281530652089028, + "grad_norm": 0.8294757604598999, + "learning_rate": 1.6157889800703403e-05, + "loss": 0.1539, + "step": 246900 + }, + { + "epoch": 19.289340101522843, + "grad_norm": 0.8389180898666382, + "learning_rate": 1.6156326690113325e-05, + "loss": 0.1599, + "step": 247000 + }, + { + "epoch": 19.297149550956657, + "grad_norm": 0.7896692752838135, + "learning_rate": 1.615476357952325e-05, + "loss": 0.1527, + "step": 247100 + }, + { + "epoch": 19.304959000390472, + "grad_norm": 0.541652262210846, + "learning_rate": 1.615320046893318e-05, + "loss": 0.155, + "step": 247200 + }, + { + "epoch": 19.312768449824286, + "grad_norm": 0.7445521354675293, + "learning_rate": 1.6151637358343103e-05, + "loss": 0.163, + "step": 247300 + }, + { + "epoch": 19.3205778992581, + "grad_norm": 0.8510544896125793, + "learning_rate": 1.615007424775303e-05, + "loss": 0.1594, + "step": 247400 + }, + { + "epoch": 19.328387348691916, + "grad_norm": 0.760508120059967, + "learning_rate": 1.614852676826886e-05, + "loss": 0.1582, + "step": 247500 + }, + { + "epoch": 19.336196798125734, + "grad_norm": 0.9797661304473877, + "learning_rate": 1.614696365767878e-05, + "loss": 0.1573, + "step": 247600 + }, + { + "epoch": 19.34400624755955, + "grad_norm": 1.1147596836090088, + "learning_rate": 1.6145400547088707e-05, + "loss": 0.1613, + "step": 247700 + }, + { + "epoch": 19.351815696993363, + "grad_norm": 0.8399510979652405, + "learning_rate": 1.6143837436498633e-05, + "loss": 0.1545, + "step": 247800 + }, + { + "epoch": 19.359625146427177, + "grad_norm": 0.9175117015838623, + "learning_rate": 1.614227432590856e-05, + "loss": 0.1512, + "step": 247900 + }, + { + "epoch": 19.367434595860992, + "grad_norm": 0.8806548714637756, + "learning_rate": 1.6140711215318485e-05, + "loss": 0.15, + "step": 248000 + }, + { + "epoch": 19.375244045294806, + "grad_norm": 0.8820730447769165, + "learning_rate": 1.613914810472841e-05, + "loss": 0.1625, + "step": 248100 + }, + { + "epoch": 19.38305349472862, + "grad_norm": 0.5434184670448303, + "learning_rate": 1.6137584994138334e-05, + "loss": 0.1545, + "step": 248200 + }, + { + "epoch": 19.390862944162436, + "grad_norm": 1.0711945295333862, + "learning_rate": 1.6136021883548263e-05, + "loss": 0.1525, + "step": 248300 + }, + { + "epoch": 19.39867239359625, + "grad_norm": 0.8247241377830505, + "learning_rate": 1.613445877295819e-05, + "loss": 0.15, + "step": 248400 + }, + { + "epoch": 19.406481843030065, + "grad_norm": 0.6772851943969727, + "learning_rate": 1.6132895662368112e-05, + "loss": 0.1561, + "step": 248500 + }, + { + "epoch": 19.414291292463883, + "grad_norm": 1.05992591381073, + "learning_rate": 1.613133255177804e-05, + "loss": 0.1478, + "step": 248600 + }, + { + "epoch": 19.422100741897697, + "grad_norm": 0.8491674065589905, + "learning_rate": 1.6129769441187967e-05, + "loss": 0.1558, + "step": 248700 + }, + { + "epoch": 19.429910191331512, + "grad_norm": 0.7933894395828247, + "learning_rate": 1.612820633059789e-05, + "loss": 0.158, + "step": 248800 + }, + { + "epoch": 19.437719640765327, + "grad_norm": 0.8349812030792236, + "learning_rate": 1.6126643220007816e-05, + "loss": 0.158, + "step": 248900 + }, + { + "epoch": 19.44552909019914, + "grad_norm": 1.0554237365722656, + "learning_rate": 1.6125080109417742e-05, + "loss": 0.1579, + "step": 249000 + }, + { + "epoch": 19.453338539632956, + "grad_norm": 0.6732720136642456, + "learning_rate": 1.6123516998827668e-05, + "loss": 0.1521, + "step": 249100 + }, + { + "epoch": 19.46114798906677, + "grad_norm": 0.8966999053955078, + "learning_rate": 1.6121953888237594e-05, + "loss": 0.1633, + "step": 249200 + }, + { + "epoch": 19.468957438500585, + "grad_norm": 1.0872215032577515, + "learning_rate": 1.612039077764752e-05, + "loss": 0.1567, + "step": 249300 + }, + { + "epoch": 19.4767668879344, + "grad_norm": 0.893544614315033, + "learning_rate": 1.6118827667057446e-05, + "loss": 0.1555, + "step": 249400 + }, + { + "epoch": 19.484576337368214, + "grad_norm": 1.0895719528198242, + "learning_rate": 1.6117280187573272e-05, + "loss": 0.164, + "step": 249500 + }, + { + "epoch": 19.492385786802032, + "grad_norm": 0.6664323210716248, + "learning_rate": 1.6115717076983198e-05, + "loss": 0.1569, + "step": 249600 + }, + { + "epoch": 19.500195236235847, + "grad_norm": 0.7812830805778503, + "learning_rate": 1.6114153966393124e-05, + "loss": 0.153, + "step": 249700 + }, + { + "epoch": 19.50800468566966, + "grad_norm": 0.8430187702178955, + "learning_rate": 1.611259085580305e-05, + "loss": 0.1612, + "step": 249800 + }, + { + "epoch": 19.515814135103476, + "grad_norm": 0.997885525226593, + "learning_rate": 1.6111027745212976e-05, + "loss": 0.1557, + "step": 249900 + }, + { + "epoch": 19.52362358453729, + "grad_norm": 0.9310106635093689, + "learning_rate": 1.61094646346229e-05, + "loss": 0.1623, + "step": 250000 + }, + { + "epoch": 19.531433033971105, + "grad_norm": 0.7472857236862183, + "learning_rate": 1.6107901524032828e-05, + "loss": 0.1553, + "step": 250100 + }, + { + "epoch": 19.53924248340492, + "grad_norm": 0.7143159508705139, + "learning_rate": 1.6106338413442754e-05, + "loss": 0.1578, + "step": 250200 + }, + { + "epoch": 19.547051932838734, + "grad_norm": 0.7594248056411743, + "learning_rate": 1.6104775302852677e-05, + "loss": 0.1588, + "step": 250300 + }, + { + "epoch": 19.55486138227255, + "grad_norm": 0.8787967562675476, + "learning_rate": 1.6103212192262603e-05, + "loss": 0.1592, + "step": 250400 + }, + { + "epoch": 19.562670831706363, + "grad_norm": 0.9230839610099792, + "learning_rate": 1.610164908167253e-05, + "loss": 0.1543, + "step": 250500 + }, + { + "epoch": 19.57048028114018, + "grad_norm": 0.9485085606575012, + "learning_rate": 1.6100085971082455e-05, + "loss": 0.1472, + "step": 250600 + }, + { + "epoch": 19.578289730573996, + "grad_norm": 0.8526518940925598, + "learning_rate": 1.609852286049238e-05, + "loss": 0.1576, + "step": 250700 + }, + { + "epoch": 19.58609918000781, + "grad_norm": 0.8549910187721252, + "learning_rate": 1.6096959749902307e-05, + "loss": 0.1556, + "step": 250800 + }, + { + "epoch": 19.593908629441625, + "grad_norm": 1.0488536357879639, + "learning_rate": 1.6095396639312233e-05, + "loss": 0.1595, + "step": 250900 + }, + { + "epoch": 19.60171807887544, + "grad_norm": 1.1000605821609497, + "learning_rate": 1.609383352872216e-05, + "loss": 0.1479, + "step": 251000 + }, + { + "epoch": 19.609527528309254, + "grad_norm": 0.8705799579620361, + "learning_rate": 1.6092270418132085e-05, + "loss": 0.1571, + "step": 251100 + }, + { + "epoch": 19.61733697774307, + "grad_norm": 0.9906345009803772, + "learning_rate": 1.609070730754201e-05, + "loss": 0.1566, + "step": 251200 + }, + { + "epoch": 19.625146427176883, + "grad_norm": 0.9738631248474121, + "learning_rate": 1.6089144196951937e-05, + "loss": 0.1615, + "step": 251300 + }, + { + "epoch": 19.632955876610698, + "grad_norm": 0.7126808762550354, + "learning_rate": 1.608758108636186e-05, + "loss": 0.1561, + "step": 251400 + }, + { + "epoch": 19.640765326044512, + "grad_norm": 0.8045843243598938, + "learning_rate": 1.608603360687769e-05, + "loss": 0.1612, + "step": 251500 + }, + { + "epoch": 19.64857477547833, + "grad_norm": 0.812263011932373, + "learning_rate": 1.6084470496287615e-05, + "loss": 0.1588, + "step": 251600 + }, + { + "epoch": 19.656384224912145, + "grad_norm": 0.9213880300521851, + "learning_rate": 1.608290738569754e-05, + "loss": 0.1548, + "step": 251700 + }, + { + "epoch": 19.66419367434596, + "grad_norm": 1.1556429862976074, + "learning_rate": 1.6081344275107463e-05, + "loss": 0.1539, + "step": 251800 + }, + { + "epoch": 19.672003123779774, + "grad_norm": 0.8295390605926514, + "learning_rate": 1.6079781164517393e-05, + "loss": 0.1552, + "step": 251900 + }, + { + "epoch": 19.67981257321359, + "grad_norm": 1.0561972856521606, + "learning_rate": 1.6078218053927315e-05, + "loss": 0.1497, + "step": 252000 + }, + { + "epoch": 19.687622022647403, + "grad_norm": 0.6519757509231567, + "learning_rate": 1.607665494333724e-05, + "loss": 0.1477, + "step": 252100 + }, + { + "epoch": 19.695431472081218, + "grad_norm": 0.8574190735816956, + "learning_rate": 1.6075091832747167e-05, + "loss": 0.1609, + "step": 252200 + }, + { + "epoch": 19.703240921515032, + "grad_norm": 0.885389506816864, + "learning_rate": 1.6073528722157093e-05, + "loss": 0.1584, + "step": 252300 + }, + { + "epoch": 19.711050370948847, + "grad_norm": 0.9630519151687622, + "learning_rate": 1.607196561156702e-05, + "loss": 0.1517, + "step": 252400 + }, + { + "epoch": 19.71885982038266, + "grad_norm": 0.7270082235336304, + "learning_rate": 1.6070402500976945e-05, + "loss": 0.1575, + "step": 252500 + }, + { + "epoch": 19.72666926981648, + "grad_norm": 0.999950110912323, + "learning_rate": 1.606883939038687e-05, + "loss": 0.1598, + "step": 252600 + }, + { + "epoch": 19.734478719250294, + "grad_norm": 0.7339434623718262, + "learning_rate": 1.6067276279796797e-05, + "loss": 0.1548, + "step": 252700 + }, + { + "epoch": 19.74228816868411, + "grad_norm": 0.8402358889579773, + "learning_rate": 1.6065713169206723e-05, + "loss": 0.1545, + "step": 252800 + }, + { + "epoch": 19.750097618117923, + "grad_norm": 1.0282187461853027, + "learning_rate": 1.6064150058616646e-05, + "loss": 0.1547, + "step": 252900 + }, + { + "epoch": 19.757907067551738, + "grad_norm": 0.903782844543457, + "learning_rate": 1.6062586948026575e-05, + "loss": 0.1524, + "step": 253000 + }, + { + "epoch": 19.765716516985552, + "grad_norm": 0.876445472240448, + "learning_rate": 1.60610238374365e-05, + "loss": 0.1548, + "step": 253100 + }, + { + "epoch": 19.773525966419367, + "grad_norm": 0.7405554056167603, + "learning_rate": 1.6059460726846424e-05, + "loss": 0.1598, + "step": 253200 + }, + { + "epoch": 19.78133541585318, + "grad_norm": 0.8406596183776855, + "learning_rate": 1.605789761625635e-05, + "loss": 0.151, + "step": 253300 + }, + { + "epoch": 19.789144865286996, + "grad_norm": 0.8342940211296082, + "learning_rate": 1.6056334505666276e-05, + "loss": 0.1475, + "step": 253400 + }, + { + "epoch": 19.79695431472081, + "grad_norm": 1.0519615411758423, + "learning_rate": 1.6054771395076202e-05, + "loss": 0.1625, + "step": 253500 + }, + { + "epoch": 19.80476376415463, + "grad_norm": 0.903139054775238, + "learning_rate": 1.6053223915592028e-05, + "loss": 0.1609, + "step": 253600 + }, + { + "epoch": 19.812573213588443, + "grad_norm": 1.150200605392456, + "learning_rate": 1.6051660805001957e-05, + "loss": 0.1601, + "step": 253700 + }, + { + "epoch": 19.820382663022258, + "grad_norm": 1.1543763875961304, + "learning_rate": 1.605009769441188e-05, + "loss": 0.1615, + "step": 253800 + }, + { + "epoch": 19.828192112456072, + "grad_norm": 0.739437997341156, + "learning_rate": 1.6048534583821806e-05, + "loss": 0.1514, + "step": 253900 + }, + { + "epoch": 19.836001561889887, + "grad_norm": 1.1290647983551025, + "learning_rate": 1.6046971473231732e-05, + "loss": 0.1585, + "step": 254000 + }, + { + "epoch": 19.8438110113237, + "grad_norm": 0.8145610690116882, + "learning_rate": 1.6045408362641658e-05, + "loss": 0.154, + "step": 254100 + }, + { + "epoch": 19.851620460757516, + "grad_norm": 1.0215067863464355, + "learning_rate": 1.6043845252051584e-05, + "loss": 0.15, + "step": 254200 + }, + { + "epoch": 19.85942991019133, + "grad_norm": 0.885907769203186, + "learning_rate": 1.604228214146151e-05, + "loss": 0.1463, + "step": 254300 + }, + { + "epoch": 19.867239359625145, + "grad_norm": 1.0276118516921997, + "learning_rate": 1.6040719030871433e-05, + "loss": 0.1447, + "step": 254400 + }, + { + "epoch": 19.87504880905896, + "grad_norm": 0.740425169467926, + "learning_rate": 1.6039155920281362e-05, + "loss": 0.1547, + "step": 254500 + }, + { + "epoch": 19.882858258492774, + "grad_norm": 0.740181565284729, + "learning_rate": 1.6037592809691288e-05, + "loss": 0.1524, + "step": 254600 + }, + { + "epoch": 19.890667707926593, + "grad_norm": 0.7419953346252441, + "learning_rate": 1.603602969910121e-05, + "loss": 0.1522, + "step": 254700 + }, + { + "epoch": 19.898477157360407, + "grad_norm": 0.8773970603942871, + "learning_rate": 1.603446658851114e-05, + "loss": 0.1537, + "step": 254800 + }, + { + "epoch": 19.90628660679422, + "grad_norm": 0.9431491494178772, + "learning_rate": 1.6032903477921066e-05, + "loss": 0.153, + "step": 254900 + }, + { + "epoch": 19.914096056228036, + "grad_norm": 0.887361466884613, + "learning_rate": 1.603134036733099e-05, + "loss": 0.152, + "step": 255000 + }, + { + "epoch": 19.92190550566185, + "grad_norm": 0.9748094081878662, + "learning_rate": 1.6029777256740915e-05, + "loss": 0.1502, + "step": 255100 + }, + { + "epoch": 19.929714955095665, + "grad_norm": 0.7755160927772522, + "learning_rate": 1.602821414615084e-05, + "loss": 0.1517, + "step": 255200 + }, + { + "epoch": 19.93752440452948, + "grad_norm": 0.7254877090454102, + "learning_rate": 1.6026651035560767e-05, + "loss": 0.1636, + "step": 255300 + }, + { + "epoch": 19.945333853963295, + "grad_norm": 0.8369965553283691, + "learning_rate": 1.6025087924970693e-05, + "loss": 0.1463, + "step": 255400 + }, + { + "epoch": 19.95314330339711, + "grad_norm": 0.999004602432251, + "learning_rate": 1.602352481438062e-05, + "loss": 0.1553, + "step": 255500 + }, + { + "epoch": 19.960952752830927, + "grad_norm": 0.8538402915000916, + "learning_rate": 1.6021961703790545e-05, + "loss": 0.1498, + "step": 255600 + }, + { + "epoch": 19.96876220226474, + "grad_norm": 0.8866280913352966, + "learning_rate": 1.602041422430637e-05, + "loss": 0.1537, + "step": 255700 + }, + { + "epoch": 19.976571651698556, + "grad_norm": 0.6460148692131042, + "learning_rate": 1.6018851113716297e-05, + "loss": 0.1558, + "step": 255800 + }, + { + "epoch": 19.98438110113237, + "grad_norm": 0.8335278630256653, + "learning_rate": 1.6017288003126223e-05, + "loss": 0.1565, + "step": 255900 + }, + { + "epoch": 19.992190550566185, + "grad_norm": 0.9254976511001587, + "learning_rate": 1.601572489253615e-05, + "loss": 0.1594, + "step": 256000 + }, + { + "epoch": 20.0, + "grad_norm": 0.9401211142539978, + "learning_rate": 1.6014161781946075e-05, + "loss": 0.1563, + "step": 256100 + }, + { + "epoch": 20.007809449433815, + "grad_norm": 1.0900503396987915, + "learning_rate": 1.6012598671355998e-05, + "loss": 0.1537, + "step": 256200 + }, + { + "epoch": 20.01561889886763, + "grad_norm": 0.7111983895301819, + "learning_rate": 1.6011035560765927e-05, + "loss": 0.1549, + "step": 256300 + }, + { + "epoch": 20.023428348301444, + "grad_norm": 0.8802400827407837, + "learning_rate": 1.6009472450175853e-05, + "loss": 0.1535, + "step": 256400 + }, + { + "epoch": 20.03123779773526, + "grad_norm": 0.9764599204063416, + "learning_rate": 1.6007909339585776e-05, + "loss": 0.1481, + "step": 256500 + }, + { + "epoch": 20.039047247169073, + "grad_norm": 1.1582820415496826, + "learning_rate": 1.60063462289957e-05, + "loss": 0.1539, + "step": 256600 + }, + { + "epoch": 20.04685669660289, + "grad_norm": 0.8898832201957703, + "learning_rate": 1.6004783118405628e-05, + "loss": 0.1494, + "step": 256700 + }, + { + "epoch": 20.054666146036705, + "grad_norm": 0.9158037900924683, + "learning_rate": 1.6003220007815554e-05, + "loss": 0.1563, + "step": 256800 + }, + { + "epoch": 20.06247559547052, + "grad_norm": 0.738610029220581, + "learning_rate": 1.600165689722548e-05, + "loss": 0.152, + "step": 256900 + }, + { + "epoch": 20.070285044904335, + "grad_norm": 0.8221123218536377, + "learning_rate": 1.6000093786635406e-05, + "loss": 0.1594, + "step": 257000 + }, + { + "epoch": 20.07809449433815, + "grad_norm": 1.2848199605941772, + "learning_rate": 1.599853067604533e-05, + "loss": 0.1506, + "step": 257100 + }, + { + "epoch": 20.085903943771964, + "grad_norm": 1.2601646184921265, + "learning_rate": 1.5996967565455258e-05, + "loss": 0.1597, + "step": 257200 + }, + { + "epoch": 20.09371339320578, + "grad_norm": 0.9480125308036804, + "learning_rate": 1.5995404454865184e-05, + "loss": 0.1527, + "step": 257300 + }, + { + "epoch": 20.101522842639593, + "grad_norm": 0.9821155667304993, + "learning_rate": 1.599384134427511e-05, + "loss": 0.1552, + "step": 257400 + }, + { + "epoch": 20.109332292073407, + "grad_norm": 1.0815869569778442, + "learning_rate": 1.5992278233685036e-05, + "loss": 0.1543, + "step": 257500 + }, + { + "epoch": 20.117141741507222, + "grad_norm": 1.0338879823684692, + "learning_rate": 1.5990715123094958e-05, + "loss": 0.1537, + "step": 257600 + }, + { + "epoch": 20.12495119094104, + "grad_norm": 1.0437591075897217, + "learning_rate": 1.5989167643610784e-05, + "loss": 0.1526, + "step": 257700 + }, + { + "epoch": 20.132760640374855, + "grad_norm": 0.6990692019462585, + "learning_rate": 1.5987604533020714e-05, + "loss": 0.1583, + "step": 257800 + }, + { + "epoch": 20.14057008980867, + "grad_norm": 0.9085626602172852, + "learning_rate": 1.598604142243064e-05, + "loss": 0.147, + "step": 257900 + }, + { + "epoch": 20.148379539242484, + "grad_norm": 0.9155283570289612, + "learning_rate": 1.5984478311840562e-05, + "loss": 0.1508, + "step": 258000 + }, + { + "epoch": 20.1561889886763, + "grad_norm": 0.67339026927948, + "learning_rate": 1.598291520125049e-05, + "loss": 0.1525, + "step": 258100 + }, + { + "epoch": 20.163998438110113, + "grad_norm": 0.8980534672737122, + "learning_rate": 1.5981352090660414e-05, + "loss": 0.159, + "step": 258200 + }, + { + "epoch": 20.171807887543928, + "grad_norm": 0.8323621153831482, + "learning_rate": 1.597978898007034e-05, + "loss": 0.1569, + "step": 258300 + }, + { + "epoch": 20.179617336977742, + "grad_norm": 0.8660298585891724, + "learning_rate": 1.5978225869480266e-05, + "loss": 0.152, + "step": 258400 + }, + { + "epoch": 20.187426786411557, + "grad_norm": 0.9316799640655518, + "learning_rate": 1.5976662758890192e-05, + "loss": 0.1665, + "step": 258500 + }, + { + "epoch": 20.19523623584537, + "grad_norm": 0.7573386430740356, + "learning_rate": 1.597509964830012e-05, + "loss": 0.1464, + "step": 258600 + }, + { + "epoch": 20.20304568527919, + "grad_norm": 1.2083278894424438, + "learning_rate": 1.5973536537710044e-05, + "loss": 0.1512, + "step": 258700 + }, + { + "epoch": 20.210855134713004, + "grad_norm": 0.9421168565750122, + "learning_rate": 1.597197342711997e-05, + "loss": 0.1545, + "step": 258800 + }, + { + "epoch": 20.21866458414682, + "grad_norm": 0.8368399739265442, + "learning_rate": 1.5970410316529896e-05, + "loss": 0.1436, + "step": 258900 + }, + { + "epoch": 20.226474033580633, + "grad_norm": 0.8021383881568909, + "learning_rate": 1.5968847205939822e-05, + "loss": 0.1435, + "step": 259000 + }, + { + "epoch": 20.234283483014448, + "grad_norm": 0.7664214968681335, + "learning_rate": 1.5967284095349745e-05, + "loss": 0.1475, + "step": 259100 + }, + { + "epoch": 20.242092932448262, + "grad_norm": 1.0566761493682861, + "learning_rate": 1.5965720984759674e-05, + "loss": 0.1577, + "step": 259200 + }, + { + "epoch": 20.249902381882077, + "grad_norm": 0.7286539077758789, + "learning_rate": 1.59641578741696e-05, + "loss": 0.1524, + "step": 259300 + }, + { + "epoch": 20.25771183131589, + "grad_norm": 0.8399373292922974, + "learning_rate": 1.5962594763579523e-05, + "loss": 0.1542, + "step": 259400 + }, + { + "epoch": 20.265521280749706, + "grad_norm": 1.2981783151626587, + "learning_rate": 1.596103165298945e-05, + "loss": 0.1465, + "step": 259500 + }, + { + "epoch": 20.27333073018352, + "grad_norm": 1.052333950996399, + "learning_rate": 1.5959468542399375e-05, + "loss": 0.1514, + "step": 259600 + }, + { + "epoch": 20.28114017961734, + "grad_norm": 1.0850844383239746, + "learning_rate": 1.59579210629152e-05, + "loss": 0.1449, + "step": 259700 + }, + { + "epoch": 20.288949629051153, + "grad_norm": 1.1221493482589722, + "learning_rate": 1.5956357952325127e-05, + "loss": 0.1595, + "step": 259800 + }, + { + "epoch": 20.296759078484968, + "grad_norm": 0.5353173613548279, + "learning_rate": 1.5954794841735056e-05, + "loss": 0.1454, + "step": 259900 + }, + { + "epoch": 20.304568527918782, + "grad_norm": 1.2891056537628174, + "learning_rate": 1.595323173114498e-05, + "loss": 0.1558, + "step": 260000 + }, + { + "epoch": 20.312377977352597, + "grad_norm": 0.6218847632408142, + "learning_rate": 1.5951668620554905e-05, + "loss": 0.1436, + "step": 260100 + }, + { + "epoch": 20.32018742678641, + "grad_norm": 0.9923516511917114, + "learning_rate": 1.595010550996483e-05, + "loss": 0.1459, + "step": 260200 + }, + { + "epoch": 20.327996876220226, + "grad_norm": 0.9337815046310425, + "learning_rate": 1.5948542399374757e-05, + "loss": 0.1478, + "step": 260300 + }, + { + "epoch": 20.33580632565404, + "grad_norm": 0.8427096605300903, + "learning_rate": 1.5946979288784683e-05, + "loss": 0.1454, + "step": 260400 + }, + { + "epoch": 20.343615775087855, + "grad_norm": 0.8702914118766785, + "learning_rate": 1.594541617819461e-05, + "loss": 0.1515, + "step": 260500 + }, + { + "epoch": 20.35142522452167, + "grad_norm": 1.1068722009658813, + "learning_rate": 1.594385306760453e-05, + "loss": 0.1466, + "step": 260600 + }, + { + "epoch": 20.359234673955488, + "grad_norm": 0.9992669224739075, + "learning_rate": 1.594228995701446e-05, + "loss": 0.1517, + "step": 260700 + }, + { + "epoch": 20.367044123389302, + "grad_norm": 0.8575073480606079, + "learning_rate": 1.5940726846424387e-05, + "loss": 0.1492, + "step": 260800 + }, + { + "epoch": 20.374853572823117, + "grad_norm": 0.9805818200111389, + "learning_rate": 1.593916373583431e-05, + "loss": 0.15, + "step": 260900 + }, + { + "epoch": 20.38266302225693, + "grad_norm": 0.9054665565490723, + "learning_rate": 1.593760062524424e-05, + "loss": 0.1435, + "step": 261000 + }, + { + "epoch": 20.390472471690746, + "grad_norm": 0.788601279258728, + "learning_rate": 1.5936037514654165e-05, + "loss": 0.1551, + "step": 261100 + }, + { + "epoch": 20.39828192112456, + "grad_norm": 1.1899691820144653, + "learning_rate": 1.5934474404064088e-05, + "loss": 0.1545, + "step": 261200 + }, + { + "epoch": 20.406091370558375, + "grad_norm": 0.8695967793464661, + "learning_rate": 1.5932911293474014e-05, + "loss": 0.1481, + "step": 261300 + }, + { + "epoch": 20.41390081999219, + "grad_norm": 0.8515498042106628, + "learning_rate": 1.593134818288394e-05, + "loss": 0.1495, + "step": 261400 + }, + { + "epoch": 20.421710269426004, + "grad_norm": 0.8685891628265381, + "learning_rate": 1.5929785072293866e-05, + "loss": 0.1488, + "step": 261500 + }, + { + "epoch": 20.42951971885982, + "grad_norm": 0.8837761282920837, + "learning_rate": 1.5928221961703792e-05, + "loss": 0.1541, + "step": 261600 + }, + { + "epoch": 20.437329168293637, + "grad_norm": 0.5444580912590027, + "learning_rate": 1.5926674482219618e-05, + "loss": 0.1462, + "step": 261700 + }, + { + "epoch": 20.44513861772745, + "grad_norm": 0.7685499787330627, + "learning_rate": 1.5925111371629544e-05, + "loss": 0.1555, + "step": 261800 + }, + { + "epoch": 20.452948067161266, + "grad_norm": 0.9541754126548767, + "learning_rate": 1.592354826103947e-05, + "loss": 0.1511, + "step": 261900 + }, + { + "epoch": 20.46075751659508, + "grad_norm": 0.8788749575614929, + "learning_rate": 1.5921985150449396e-05, + "loss": 0.1598, + "step": 262000 + }, + { + "epoch": 20.468566966028895, + "grad_norm": 0.9367106556892395, + "learning_rate": 1.5920422039859322e-05, + "loss": 0.1426, + "step": 262100 + }, + { + "epoch": 20.47637641546271, + "grad_norm": 0.6599664688110352, + "learning_rate": 1.5918858929269248e-05, + "loss": 0.1476, + "step": 262200 + }, + { + "epoch": 20.484185864896524, + "grad_norm": 0.9754122495651245, + "learning_rate": 1.5917295818679174e-05, + "loss": 0.156, + "step": 262300 + }, + { + "epoch": 20.49199531433034, + "grad_norm": 0.921559751033783, + "learning_rate": 1.5915732708089096e-05, + "loss": 0.1504, + "step": 262400 + }, + { + "epoch": 20.499804763764153, + "grad_norm": 0.5617296099662781, + "learning_rate": 1.5914169597499026e-05, + "loss": 0.1478, + "step": 262500 + }, + { + "epoch": 20.507614213197968, + "grad_norm": 0.7331305146217346, + "learning_rate": 1.5912606486908952e-05, + "loss": 0.1414, + "step": 262600 + }, + { + "epoch": 20.515423662631786, + "grad_norm": 0.9771094918251038, + "learning_rate": 1.5911043376318874e-05, + "loss": 0.1487, + "step": 262700 + }, + { + "epoch": 20.5232331120656, + "grad_norm": 0.9460648894309998, + "learning_rate": 1.59094802657288e-05, + "loss": 0.1441, + "step": 262800 + }, + { + "epoch": 20.531042561499415, + "grad_norm": 0.7630587220191956, + "learning_rate": 1.5907917155138726e-05, + "loss": 0.1521, + "step": 262900 + }, + { + "epoch": 20.53885201093323, + "grad_norm": 1.0937858819961548, + "learning_rate": 1.5906354044548652e-05, + "loss": 0.1452, + "step": 263000 + }, + { + "epoch": 20.546661460367044, + "grad_norm": 0.5268203020095825, + "learning_rate": 1.590479093395858e-05, + "loss": 0.1423, + "step": 263100 + }, + { + "epoch": 20.55447090980086, + "grad_norm": 0.7118089199066162, + "learning_rate": 1.5903227823368505e-05, + "loss": 0.1515, + "step": 263200 + }, + { + "epoch": 20.562280359234673, + "grad_norm": 0.9789298176765442, + "learning_rate": 1.590166471277843e-05, + "loss": 0.1476, + "step": 263300 + }, + { + "epoch": 20.570089808668488, + "grad_norm": 0.9407501220703125, + "learning_rate": 1.5900101602188357e-05, + "loss": 0.1458, + "step": 263400 + }, + { + "epoch": 20.577899258102303, + "grad_norm": 0.942206859588623, + "learning_rate": 1.5898538491598283e-05, + "loss": 0.1462, + "step": 263500 + }, + { + "epoch": 20.585708707536117, + "grad_norm": 0.7750436067581177, + "learning_rate": 1.589697538100821e-05, + "loss": 0.149, + "step": 263600 + }, + { + "epoch": 20.593518156969935, + "grad_norm": 1.2150155305862427, + "learning_rate": 1.5895412270418135e-05, + "loss": 0.1492, + "step": 263700 + }, + { + "epoch": 20.60132760640375, + "grad_norm": 0.917884349822998, + "learning_rate": 1.5893849159828057e-05, + "loss": 0.1482, + "step": 263800 + }, + { + "epoch": 20.609137055837564, + "grad_norm": 0.9767228960990906, + "learning_rate": 1.5892286049237983e-05, + "loss": 0.1459, + "step": 263900 + }, + { + "epoch": 20.61694650527138, + "grad_norm": 0.6677699685096741, + "learning_rate": 1.5890738569753813e-05, + "loss": 0.1505, + "step": 264000 + }, + { + "epoch": 20.624755954705194, + "grad_norm": 0.9703248739242554, + "learning_rate": 1.588917545916374e-05, + "loss": 0.1502, + "step": 264100 + }, + { + "epoch": 20.632565404139008, + "grad_norm": 0.7702843546867371, + "learning_rate": 1.588761234857366e-05, + "loss": 0.1514, + "step": 264200 + }, + { + "epoch": 20.640374853572823, + "grad_norm": 0.4829925298690796, + "learning_rate": 1.588604923798359e-05, + "loss": 0.1551, + "step": 264300 + }, + { + "epoch": 20.648184303006637, + "grad_norm": 1.015032172203064, + "learning_rate": 1.5884486127393513e-05, + "loss": 0.1573, + "step": 264400 + }, + { + "epoch": 20.65599375244045, + "grad_norm": 0.7382018566131592, + "learning_rate": 1.588292301680344e-05, + "loss": 0.1468, + "step": 264500 + }, + { + "epoch": 20.663803201874266, + "grad_norm": 0.8831744194030762, + "learning_rate": 1.5881359906213365e-05, + "loss": 0.1488, + "step": 264600 + }, + { + "epoch": 20.671612651308084, + "grad_norm": 0.983797013759613, + "learning_rate": 1.587979679562329e-05, + "loss": 0.149, + "step": 264700 + }, + { + "epoch": 20.6794221007419, + "grad_norm": 0.9039027094841003, + "learning_rate": 1.5878233685033217e-05, + "loss": 0.1453, + "step": 264800 + }, + { + "epoch": 20.687231550175714, + "grad_norm": 0.7768588662147522, + "learning_rate": 1.5876670574443143e-05, + "loss": 0.1425, + "step": 264900 + }, + { + "epoch": 20.695040999609528, + "grad_norm": 0.8414096832275391, + "learning_rate": 1.587510746385307e-05, + "loss": 0.147, + "step": 265000 + }, + { + "epoch": 20.702850449043343, + "grad_norm": 0.981864869594574, + "learning_rate": 1.5873544353262995e-05, + "loss": 0.1482, + "step": 265100 + }, + { + "epoch": 20.710659898477157, + "grad_norm": 0.6834481358528137, + "learning_rate": 1.587198124267292e-05, + "loss": 0.1507, + "step": 265200 + }, + { + "epoch": 20.718469347910972, + "grad_norm": 1.1916412115097046, + "learning_rate": 1.5870418132082844e-05, + "loss": 0.156, + "step": 265300 + }, + { + "epoch": 20.726278797344786, + "grad_norm": 0.7448387742042542, + "learning_rate": 1.5868855021492773e-05, + "loss": 0.1463, + "step": 265400 + }, + { + "epoch": 20.7340882467786, + "grad_norm": 0.8105716705322266, + "learning_rate": 1.58672919109027e-05, + "loss": 0.1498, + "step": 265500 + }, + { + "epoch": 20.741897696212416, + "grad_norm": 0.7603853940963745, + "learning_rate": 1.5865728800312622e-05, + "loss": 0.1453, + "step": 265600 + }, + { + "epoch": 20.749707145646234, + "grad_norm": 0.9040424227714539, + "learning_rate": 1.5864165689722548e-05, + "loss": 0.153, + "step": 265700 + }, + { + "epoch": 20.757516595080048, + "grad_norm": 1.0994298458099365, + "learning_rate": 1.5862602579132474e-05, + "loss": 0.1534, + "step": 265800 + }, + { + "epoch": 20.765326044513863, + "grad_norm": 0.6235753297805786, + "learning_rate": 1.58610394685424e-05, + "loss": 0.1476, + "step": 265900 + }, + { + "epoch": 20.773135493947677, + "grad_norm": 0.9457157254219055, + "learning_rate": 1.5859491989058226e-05, + "loss": 0.147, + "step": 266000 + }, + { + "epoch": 20.780944943381492, + "grad_norm": 1.1702708005905151, + "learning_rate": 1.5857928878468155e-05, + "loss": 0.1493, + "step": 266100 + }, + { + "epoch": 20.788754392815306, + "grad_norm": 0.9484367966651917, + "learning_rate": 1.5856365767878078e-05, + "loss": 0.1423, + "step": 266200 + }, + { + "epoch": 20.79656384224912, + "grad_norm": 0.8259855508804321, + "learning_rate": 1.5854802657288004e-05, + "loss": 0.1465, + "step": 266300 + }, + { + "epoch": 20.804373291682936, + "grad_norm": 0.8354877233505249, + "learning_rate": 1.585323954669793e-05, + "loss": 0.1403, + "step": 266400 + }, + { + "epoch": 20.81218274111675, + "grad_norm": 0.9899166226387024, + "learning_rate": 1.5851676436107856e-05, + "loss": 0.1575, + "step": 266500 + }, + { + "epoch": 20.819992190550565, + "grad_norm": 1.1090909242630005, + "learning_rate": 1.5850113325517782e-05, + "loss": 0.1496, + "step": 266600 + }, + { + "epoch": 20.82780163998438, + "grad_norm": 0.7917970418930054, + "learning_rate": 1.5848550214927708e-05, + "loss": 0.1525, + "step": 266700 + }, + { + "epoch": 20.835611089418197, + "grad_norm": 0.7563430666923523, + "learning_rate": 1.584698710433763e-05, + "loss": 0.1441, + "step": 266800 + }, + { + "epoch": 20.843420538852012, + "grad_norm": 0.8265455365180969, + "learning_rate": 1.584542399374756e-05, + "loss": 0.1462, + "step": 266900 + }, + { + "epoch": 20.851229988285827, + "grad_norm": 0.6629912257194519, + "learning_rate": 1.5843860883157486e-05, + "loss": 0.1469, + "step": 267000 + }, + { + "epoch": 20.85903943771964, + "grad_norm": 1.1149176359176636, + "learning_rate": 1.584229777256741e-05, + "loss": 0.1514, + "step": 267100 + }, + { + "epoch": 20.866848887153456, + "grad_norm": 0.7931187748908997, + "learning_rate": 1.5840734661977338e-05, + "loss": 0.1478, + "step": 267200 + }, + { + "epoch": 20.87465833658727, + "grad_norm": 0.49144962430000305, + "learning_rate": 1.5839171551387264e-05, + "loss": 0.1559, + "step": 267300 + }, + { + "epoch": 20.882467786021085, + "grad_norm": 1.0094081163406372, + "learning_rate": 1.5837608440797187e-05, + "loss": 0.1516, + "step": 267400 + }, + { + "epoch": 20.8902772354549, + "grad_norm": 0.905548632144928, + "learning_rate": 1.5836045330207113e-05, + "loss": 0.1448, + "step": 267500 + }, + { + "epoch": 20.898086684888714, + "grad_norm": 0.7649174332618713, + "learning_rate": 1.583448221961704e-05, + "loss": 0.1521, + "step": 267600 + }, + { + "epoch": 20.905896134322532, + "grad_norm": 0.6280129551887512, + "learning_rate": 1.5832919109026965e-05, + "loss": 0.1458, + "step": 267700 + }, + { + "epoch": 20.913705583756347, + "grad_norm": 0.8115954995155334, + "learning_rate": 1.583135599843689e-05, + "loss": 0.1587, + "step": 267800 + }, + { + "epoch": 20.92151503319016, + "grad_norm": 1.0527704954147339, + "learning_rate": 1.5829792887846817e-05, + "loss": 0.1528, + "step": 267900 + }, + { + "epoch": 20.929324482623976, + "grad_norm": 0.943759024143219, + "learning_rate": 1.5828245408362643e-05, + "loss": 0.1505, + "step": 268000 + }, + { + "epoch": 20.93713393205779, + "grad_norm": 0.8028054237365723, + "learning_rate": 1.582668229777257e-05, + "loss": 0.1493, + "step": 268100 + }, + { + "epoch": 20.944943381491605, + "grad_norm": 0.9871835708618164, + "learning_rate": 1.5825119187182495e-05, + "loss": 0.1489, + "step": 268200 + }, + { + "epoch": 20.95275283092542, + "grad_norm": 0.9000875353813171, + "learning_rate": 1.582355607659242e-05, + "loss": 0.1488, + "step": 268300 + }, + { + "epoch": 20.960562280359234, + "grad_norm": 0.8018443584442139, + "learning_rate": 1.5821992966002347e-05, + "loss": 0.1485, + "step": 268400 + }, + { + "epoch": 20.96837172979305, + "grad_norm": 1.2284111976623535, + "learning_rate": 1.5820429855412273e-05, + "loss": 0.1506, + "step": 268500 + }, + { + "epoch": 20.976181179226863, + "grad_norm": 0.5926699042320251, + "learning_rate": 1.5818866744822195e-05, + "loss": 0.1468, + "step": 268600 + }, + { + "epoch": 20.983990628660678, + "grad_norm": 0.8168689012527466, + "learning_rate": 1.5817303634232125e-05, + "loss": 0.1497, + "step": 268700 + }, + { + "epoch": 20.991800078094496, + "grad_norm": 0.9076529741287231, + "learning_rate": 1.581574052364205e-05, + "loss": 0.147, + "step": 268800 + }, + { + "epoch": 20.99960952752831, + "grad_norm": 1.1257812976837158, + "learning_rate": 1.5814177413051973e-05, + "loss": 0.152, + "step": 268900 + }, + { + "epoch": 21.007418976962125, + "grad_norm": 0.6348904967308044, + "learning_rate": 1.58126143024619e-05, + "loss": 0.1503, + "step": 269000 + }, + { + "epoch": 21.01522842639594, + "grad_norm": 0.6594538688659668, + "learning_rate": 1.5811051191871825e-05, + "loss": 0.145, + "step": 269100 + }, + { + "epoch": 21.023037875829754, + "grad_norm": 0.9883918762207031, + "learning_rate": 1.580948808128175e-05, + "loss": 0.1465, + "step": 269200 + }, + { + "epoch": 21.03084732526357, + "grad_norm": 0.9725099802017212, + "learning_rate": 1.5807924970691677e-05, + "loss": 0.1514, + "step": 269300 + }, + { + "epoch": 21.038656774697383, + "grad_norm": 0.8018738031387329, + "learning_rate": 1.5806361860101603e-05, + "loss": 0.1557, + "step": 269400 + }, + { + "epoch": 21.046466224131198, + "grad_norm": 0.71611487865448, + "learning_rate": 1.580479874951153e-05, + "loss": 0.1453, + "step": 269500 + }, + { + "epoch": 21.054275673565012, + "grad_norm": 0.8069080114364624, + "learning_rate": 1.5803235638921455e-05, + "loss": 0.1493, + "step": 269600 + }, + { + "epoch": 21.062085122998827, + "grad_norm": 1.025117039680481, + "learning_rate": 1.580167252833138e-05, + "loss": 0.1398, + "step": 269700 + }, + { + "epoch": 21.069894572432645, + "grad_norm": 0.8417700529098511, + "learning_rate": 1.5800109417741307e-05, + "loss": 0.1514, + "step": 269800 + }, + { + "epoch": 21.07770402186646, + "grad_norm": 0.6837599873542786, + "learning_rate": 1.5798546307151233e-05, + "loss": 0.1453, + "step": 269900 + }, + { + "epoch": 21.085513471300274, + "grad_norm": 0.7681900858879089, + "learning_rate": 1.5796983196561156e-05, + "loss": 0.1502, + "step": 270000 + }, + { + "epoch": 21.09332292073409, + "grad_norm": 1.1430526971817017, + "learning_rate": 1.5795435717076982e-05, + "loss": 0.1446, + "step": 270100 + }, + { + "epoch": 21.101132370167903, + "grad_norm": 0.6700018048286438, + "learning_rate": 1.579387260648691e-05, + "loss": 0.1482, + "step": 270200 + }, + { + "epoch": 21.108941819601718, + "grad_norm": 0.48721635341644287, + "learning_rate": 1.5792309495896837e-05, + "loss": 0.1471, + "step": 270300 + }, + { + "epoch": 21.116751269035532, + "grad_norm": 1.0522655248641968, + "learning_rate": 1.579074638530676e-05, + "loss": 0.1468, + "step": 270400 + }, + { + "epoch": 21.124560718469347, + "grad_norm": 0.5169057250022888, + "learning_rate": 1.578918327471669e-05, + "loss": 0.1473, + "step": 270500 + }, + { + "epoch": 21.13237016790316, + "grad_norm": 0.614720344543457, + "learning_rate": 1.5787620164126612e-05, + "loss": 0.1415, + "step": 270600 + }, + { + "epoch": 21.140179617336976, + "grad_norm": 1.132947564125061, + "learning_rate": 1.5786057053536538e-05, + "loss": 0.142, + "step": 270700 + }, + { + "epoch": 21.147989066770794, + "grad_norm": 1.131731390953064, + "learning_rate": 1.5784493942946464e-05, + "loss": 0.1433, + "step": 270800 + }, + { + "epoch": 21.15579851620461, + "grad_norm": 0.8054686784744263, + "learning_rate": 1.578293083235639e-05, + "loss": 0.1434, + "step": 270900 + }, + { + "epoch": 21.163607965638423, + "grad_norm": 0.8654218912124634, + "learning_rate": 1.5781367721766316e-05, + "loss": 0.1398, + "step": 271000 + }, + { + "epoch": 21.171417415072238, + "grad_norm": 0.7513204216957092, + "learning_rate": 1.5779804611176242e-05, + "loss": 0.1375, + "step": 271100 + }, + { + "epoch": 21.179226864506052, + "grad_norm": 0.8772755265235901, + "learning_rate": 1.5778241500586168e-05, + "loss": 0.143, + "step": 271200 + }, + { + "epoch": 21.187036313939867, + "grad_norm": 0.661716103553772, + "learning_rate": 1.5776678389996094e-05, + "loss": 0.1414, + "step": 271300 + }, + { + "epoch": 21.19484576337368, + "grad_norm": 1.0139799118041992, + "learning_rate": 1.577511527940602e-05, + "loss": 0.148, + "step": 271400 + }, + { + "epoch": 21.202655212807496, + "grad_norm": 1.010338544845581, + "learning_rate": 1.5773552168815943e-05, + "loss": 0.1527, + "step": 271500 + }, + { + "epoch": 21.21046466224131, + "grad_norm": 1.147233247756958, + "learning_rate": 1.5771989058225872e-05, + "loss": 0.14, + "step": 271600 + }, + { + "epoch": 21.218274111675125, + "grad_norm": 1.077331304550171, + "learning_rate": 1.5770425947635798e-05, + "loss": 0.1498, + "step": 271700 + }, + { + "epoch": 21.226083561108943, + "grad_norm": 0.8089535236358643, + "learning_rate": 1.576886283704572e-05, + "loss": 0.1503, + "step": 271800 + }, + { + "epoch": 21.233893010542758, + "grad_norm": 0.8551170825958252, + "learning_rate": 1.5767299726455647e-05, + "loss": 0.1423, + "step": 271900 + }, + { + "epoch": 21.241702459976572, + "grad_norm": 0.8253236413002014, + "learning_rate": 1.5765736615865573e-05, + "loss": 0.1452, + "step": 272000 + }, + { + "epoch": 21.249511909410387, + "grad_norm": 0.9153646230697632, + "learning_rate": 1.57641891363814e-05, + "loss": 0.1483, + "step": 272100 + }, + { + "epoch": 21.2573213588442, + "grad_norm": 0.7935351729393005, + "learning_rate": 1.5762626025791325e-05, + "loss": 0.1425, + "step": 272200 + }, + { + "epoch": 21.265130808278016, + "grad_norm": 0.7212216258049011, + "learning_rate": 1.5761062915201254e-05, + "loss": 0.1475, + "step": 272300 + }, + { + "epoch": 21.27294025771183, + "grad_norm": 0.6691550612449646, + "learning_rate": 1.5759499804611177e-05, + "loss": 0.1344, + "step": 272400 + }, + { + "epoch": 21.280749707145645, + "grad_norm": 0.9414501786231995, + "learning_rate": 1.5757936694021103e-05, + "loss": 0.1445, + "step": 272500 + }, + { + "epoch": 21.28855915657946, + "grad_norm": 0.8180404901504517, + "learning_rate": 1.575637358343103e-05, + "loss": 0.1462, + "step": 272600 + }, + { + "epoch": 21.296368606013274, + "grad_norm": 0.8017256259918213, + "learning_rate": 1.5754810472840955e-05, + "loss": 0.1483, + "step": 272700 + }, + { + "epoch": 21.304178055447093, + "grad_norm": 0.8329930901527405, + "learning_rate": 1.575324736225088e-05, + "loss": 0.1408, + "step": 272800 + }, + { + "epoch": 21.311987504880907, + "grad_norm": 1.1513885259628296, + "learning_rate": 1.5751684251660807e-05, + "loss": 0.1509, + "step": 272900 + }, + { + "epoch": 21.31979695431472, + "grad_norm": 0.7862344980239868, + "learning_rate": 1.575012114107073e-05, + "loss": 0.1452, + "step": 273000 + }, + { + "epoch": 21.327606403748536, + "grad_norm": 0.8844183087348938, + "learning_rate": 1.574855803048066e-05, + "loss": 0.1455, + "step": 273100 + }, + { + "epoch": 21.33541585318235, + "grad_norm": 0.8400126695632935, + "learning_rate": 1.5746994919890585e-05, + "loss": 0.1447, + "step": 273200 + }, + { + "epoch": 21.343225302616165, + "grad_norm": 0.8862954378128052, + "learning_rate": 1.5745431809300508e-05, + "loss": 0.1322, + "step": 273300 + }, + { + "epoch": 21.35103475204998, + "grad_norm": 0.8712252378463745, + "learning_rate": 1.5743868698710437e-05, + "loss": 0.1483, + "step": 273400 + }, + { + "epoch": 21.358844201483794, + "grad_norm": 0.8062180876731873, + "learning_rate": 1.5742305588120363e-05, + "loss": 0.145, + "step": 273500 + }, + { + "epoch": 21.36665365091761, + "grad_norm": 0.9273768663406372, + "learning_rate": 1.5740742477530286e-05, + "loss": 0.141, + "step": 273600 + }, + { + "epoch": 21.374463100351424, + "grad_norm": 0.6157429218292236, + "learning_rate": 1.573917936694021e-05, + "loss": 0.1433, + "step": 273700 + }, + { + "epoch": 21.38227254978524, + "grad_norm": 0.5980425477027893, + "learning_rate": 1.5737616256350138e-05, + "loss": 0.1445, + "step": 273800 + }, + { + "epoch": 21.390081999219056, + "grad_norm": 0.8572303652763367, + "learning_rate": 1.5736053145760064e-05, + "loss": 0.1446, + "step": 273900 + }, + { + "epoch": 21.39789144865287, + "grad_norm": 1.1001536846160889, + "learning_rate": 1.573449003516999e-05, + "loss": 0.1496, + "step": 274000 + }, + { + "epoch": 21.405700898086685, + "grad_norm": 0.6422202587127686, + "learning_rate": 1.5732942555685816e-05, + "loss": 0.1492, + "step": 274100 + }, + { + "epoch": 21.4135103475205, + "grad_norm": 0.9524158835411072, + "learning_rate": 1.573137944509574e-05, + "loss": 0.1382, + "step": 274200 + }, + { + "epoch": 21.421319796954315, + "grad_norm": 0.8874322175979614, + "learning_rate": 1.5729816334505668e-05, + "loss": 0.1547, + "step": 274300 + }, + { + "epoch": 21.42912924638813, + "grad_norm": 0.8682050704956055, + "learning_rate": 1.5728253223915594e-05, + "loss": 0.1414, + "step": 274400 + }, + { + "epoch": 21.436938695821944, + "grad_norm": 0.7201287150382996, + "learning_rate": 1.572669011332552e-05, + "loss": 0.144, + "step": 274500 + }, + { + "epoch": 21.44474814525576, + "grad_norm": 0.8117372989654541, + "learning_rate": 1.5725127002735446e-05, + "loss": 0.1452, + "step": 274600 + }, + { + "epoch": 21.452557594689573, + "grad_norm": 0.8349194526672363, + "learning_rate": 1.572356389214537e-05, + "loss": 0.1467, + "step": 274700 + }, + { + "epoch": 21.46036704412339, + "grad_norm": 0.6320531368255615, + "learning_rate": 1.5722000781555294e-05, + "loss": 0.144, + "step": 274800 + }, + { + "epoch": 21.468176493557205, + "grad_norm": 0.9218801856040955, + "learning_rate": 1.5720437670965224e-05, + "loss": 0.1433, + "step": 274900 + }, + { + "epoch": 21.47598594299102, + "grad_norm": 1.0286614894866943, + "learning_rate": 1.571887456037515e-05, + "loss": 0.1446, + "step": 275000 + }, + { + "epoch": 21.483795392424835, + "grad_norm": 0.8825885653495789, + "learning_rate": 1.5717311449785072e-05, + "loss": 0.1411, + "step": 275100 + }, + { + "epoch": 21.49160484185865, + "grad_norm": 0.9197112917900085, + "learning_rate": 1.5715748339194998e-05, + "loss": 0.1406, + "step": 275200 + }, + { + "epoch": 21.499414291292464, + "grad_norm": 1.0009756088256836, + "learning_rate": 1.5714185228604924e-05, + "loss": 0.1452, + "step": 275300 + }, + { + "epoch": 21.50722374072628, + "grad_norm": 1.0206668376922607, + "learning_rate": 1.571262211801485e-05, + "loss": 0.1446, + "step": 275400 + }, + { + "epoch": 21.515033190160093, + "grad_norm": 0.8309497833251953, + "learning_rate": 1.5711059007424776e-05, + "loss": 0.1419, + "step": 275500 + }, + { + "epoch": 21.522842639593907, + "grad_norm": 0.8372001647949219, + "learning_rate": 1.5709495896834702e-05, + "loss": 0.1401, + "step": 275600 + }, + { + "epoch": 21.530652089027722, + "grad_norm": 0.7666497826576233, + "learning_rate": 1.570793278624463e-05, + "loss": 0.1295, + "step": 275700 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 0.7983540892601013, + "learning_rate": 1.5706369675654554e-05, + "loss": 0.1456, + "step": 275800 + }, + { + "epoch": 21.546270987895355, + "grad_norm": 0.8200308084487915, + "learning_rate": 1.570480656506448e-05, + "loss": 0.1499, + "step": 275900 + }, + { + "epoch": 21.55408043732917, + "grad_norm": 1.095491886138916, + "learning_rate": 1.5703243454474406e-05, + "loss": 0.1378, + "step": 276000 + }, + { + "epoch": 21.561889886762984, + "grad_norm": 0.8706907033920288, + "learning_rate": 1.5701695974990232e-05, + "loss": 0.1529, + "step": 276100 + }, + { + "epoch": 21.5696993361968, + "grad_norm": 0.7600975036621094, + "learning_rate": 1.5700132864400158e-05, + "loss": 0.1424, + "step": 276200 + }, + { + "epoch": 21.577508785630613, + "grad_norm": 0.8851431608200073, + "learning_rate": 1.569856975381008e-05, + "loss": 0.1512, + "step": 276300 + }, + { + "epoch": 21.585318235064427, + "grad_norm": 0.9802671670913696, + "learning_rate": 1.569700664322001e-05, + "loss": 0.1418, + "step": 276400 + }, + { + "epoch": 21.593127684498242, + "grad_norm": 0.8661850690841675, + "learning_rate": 1.5695443532629936e-05, + "loss": 0.1484, + "step": 276500 + }, + { + "epoch": 21.600937133932057, + "grad_norm": 0.8756442666053772, + "learning_rate": 1.569388042203986e-05, + "loss": 0.1385, + "step": 276600 + }, + { + "epoch": 21.60874658336587, + "grad_norm": 1.033715009689331, + "learning_rate": 1.569231731144979e-05, + "loss": 0.1436, + "step": 276700 + }, + { + "epoch": 21.61655603279969, + "grad_norm": 0.6875880360603333, + "learning_rate": 1.569075420085971e-05, + "loss": 0.1468, + "step": 276800 + }, + { + "epoch": 21.624365482233504, + "grad_norm": 0.9872573018074036, + "learning_rate": 1.5689191090269637e-05, + "loss": 0.1393, + "step": 276900 + }, + { + "epoch": 21.63217493166732, + "grad_norm": 0.6981116533279419, + "learning_rate": 1.5687627979679563e-05, + "loss": 0.1386, + "step": 277000 + }, + { + "epoch": 21.639984381101133, + "grad_norm": 0.9638392925262451, + "learning_rate": 1.568606486908949e-05, + "loss": 0.1482, + "step": 277100 + }, + { + "epoch": 21.647793830534948, + "grad_norm": 0.8693450093269348, + "learning_rate": 1.5684501758499415e-05, + "loss": 0.1388, + "step": 277200 + }, + { + "epoch": 21.655603279968762, + "grad_norm": 0.8939495086669922, + "learning_rate": 1.568293864790934e-05, + "loss": 0.1428, + "step": 277300 + }, + { + "epoch": 21.663412729402577, + "grad_norm": 0.989825427532196, + "learning_rate": 1.5681375537319267e-05, + "loss": 0.1459, + "step": 277400 + }, + { + "epoch": 21.67122217883639, + "grad_norm": 0.9190754294395447, + "learning_rate": 1.5679812426729193e-05, + "loss": 0.1465, + "step": 277500 + }, + { + "epoch": 21.679031628270206, + "grad_norm": 0.9194713234901428, + "learning_rate": 1.567824931613912e-05, + "loss": 0.1494, + "step": 277600 + }, + { + "epoch": 21.68684107770402, + "grad_norm": 0.9386900067329407, + "learning_rate": 1.567668620554904e-05, + "loss": 0.144, + "step": 277700 + }, + { + "epoch": 21.69465052713784, + "grad_norm": 0.7496020197868347, + "learning_rate": 1.567512309495897e-05, + "loss": 0.1438, + "step": 277800 + }, + { + "epoch": 21.702459976571653, + "grad_norm": 0.7539072036743164, + "learning_rate": 1.5673559984368897e-05, + "loss": 0.147, + "step": 277900 + }, + { + "epoch": 21.710269426005468, + "grad_norm": 0.8383206129074097, + "learning_rate": 1.567199687377882e-05, + "loss": 0.1464, + "step": 278000 + }, + { + "epoch": 21.718078875439282, + "grad_norm": 0.9453973770141602, + "learning_rate": 1.5670433763188746e-05, + "loss": 0.1429, + "step": 278100 + }, + { + "epoch": 21.725888324873097, + "grad_norm": 0.8745108842849731, + "learning_rate": 1.5668886283704575e-05, + "loss": 0.1422, + "step": 278200 + }, + { + "epoch": 21.73369777430691, + "grad_norm": 0.8260504007339478, + "learning_rate": 1.5667323173114498e-05, + "loss": 0.1438, + "step": 278300 + }, + { + "epoch": 21.741507223740726, + "grad_norm": 1.0408035516738892, + "learning_rate": 1.5665760062524424e-05, + "loss": 0.1328, + "step": 278400 + }, + { + "epoch": 21.74931667317454, + "grad_norm": 0.7690845727920532, + "learning_rate": 1.566419695193435e-05, + "loss": 0.1367, + "step": 278500 + }, + { + "epoch": 21.757126122608355, + "grad_norm": 0.9121866226196289, + "learning_rate": 1.5662633841344276e-05, + "loss": 0.1474, + "step": 278600 + }, + { + "epoch": 21.76493557204217, + "grad_norm": 1.1322033405303955, + "learning_rate": 1.5661070730754202e-05, + "loss": 0.1411, + "step": 278700 + }, + { + "epoch": 21.772745021475984, + "grad_norm": 1.1597561836242676, + "learning_rate": 1.5659523251270028e-05, + "loss": 0.1428, + "step": 278800 + }, + { + "epoch": 21.780554470909802, + "grad_norm": 0.9378727078437805, + "learning_rate": 1.5657960140679954e-05, + "loss": 0.151, + "step": 278900 + }, + { + "epoch": 21.788363920343617, + "grad_norm": 0.7633863091468811, + "learning_rate": 1.565639703008988e-05, + "loss": 0.1456, + "step": 279000 + }, + { + "epoch": 21.79617336977743, + "grad_norm": 0.6599911451339722, + "learning_rate": 1.5654833919499806e-05, + "loss": 0.1436, + "step": 279100 + }, + { + "epoch": 21.803982819211246, + "grad_norm": 0.8178460597991943, + "learning_rate": 1.5653270808909732e-05, + "loss": 0.1426, + "step": 279200 + }, + { + "epoch": 21.81179226864506, + "grad_norm": 0.8605934977531433, + "learning_rate": 1.5651707698319658e-05, + "loss": 0.1444, + "step": 279300 + }, + { + "epoch": 21.819601718078875, + "grad_norm": 0.898315966129303, + "learning_rate": 1.5650144587729584e-05, + "loss": 0.1348, + "step": 279400 + }, + { + "epoch": 21.82741116751269, + "grad_norm": 1.0550199747085571, + "learning_rate": 1.564858147713951e-05, + "loss": 0.1317, + "step": 279500 + }, + { + "epoch": 21.835220616946504, + "grad_norm": 0.9702891707420349, + "learning_rate": 1.5647018366549436e-05, + "loss": 0.1412, + "step": 279600 + }, + { + "epoch": 21.84303006638032, + "grad_norm": 0.9088001251220703, + "learning_rate": 1.5645455255959362e-05, + "loss": 0.14, + "step": 279700 + }, + { + "epoch": 21.850839515814137, + "grad_norm": 0.9616572856903076, + "learning_rate": 1.5643892145369284e-05, + "loss": 0.1338, + "step": 279800 + }, + { + "epoch": 21.85864896524795, + "grad_norm": 0.8163248300552368, + "learning_rate": 1.564232903477921e-05, + "loss": 0.1398, + "step": 279900 + }, + { + "epoch": 21.866458414681766, + "grad_norm": 0.8260008096694946, + "learning_rate": 1.564076592418914e-05, + "loss": 0.1384, + "step": 280000 + }, + { + "epoch": 21.87426786411558, + "grad_norm": 1.126587152481079, + "learning_rate": 1.5639202813599062e-05, + "loss": 0.1439, + "step": 280100 + }, + { + "epoch": 21.882077313549395, + "grad_norm": 0.8658336997032166, + "learning_rate": 1.563763970300899e-05, + "loss": 0.1465, + "step": 280200 + }, + { + "epoch": 21.88988676298321, + "grad_norm": 0.8323200345039368, + "learning_rate": 1.5636076592418914e-05, + "loss": 0.1516, + "step": 280300 + }, + { + "epoch": 21.897696212417024, + "grad_norm": 0.99912029504776, + "learning_rate": 1.563451348182884e-05, + "loss": 0.1352, + "step": 280400 + }, + { + "epoch": 21.90550566185084, + "grad_norm": 0.9015044569969177, + "learning_rate": 1.5632950371238766e-05, + "loss": 0.1428, + "step": 280500 + }, + { + "epoch": 21.913315111284653, + "grad_norm": 0.7489168643951416, + "learning_rate": 1.5631387260648692e-05, + "loss": 0.1373, + "step": 280600 + }, + { + "epoch": 21.921124560718468, + "grad_norm": 0.9612414836883545, + "learning_rate": 1.562982415005862e-05, + "loss": 0.1411, + "step": 280700 + }, + { + "epoch": 21.928934010152282, + "grad_norm": 0.8368463516235352, + "learning_rate": 1.5628261039468544e-05, + "loss": 0.1441, + "step": 280800 + }, + { + "epoch": 21.9367434595861, + "grad_norm": 0.7885879874229431, + "learning_rate": 1.562669792887847e-05, + "loss": 0.1297, + "step": 280900 + }, + { + "epoch": 21.944552909019915, + "grad_norm": 0.882032036781311, + "learning_rate": 1.5625134818288393e-05, + "loss": 0.1425, + "step": 281000 + }, + { + "epoch": 21.95236235845373, + "grad_norm": 0.7525109648704529, + "learning_rate": 1.5623571707698323e-05, + "loss": 0.1387, + "step": 281100 + }, + { + "epoch": 21.960171807887544, + "grad_norm": 0.7078118920326233, + "learning_rate": 1.562200859710825e-05, + "loss": 0.1427, + "step": 281200 + }, + { + "epoch": 21.96798125732136, + "grad_norm": 1.0537186861038208, + "learning_rate": 1.562044548651817e-05, + "loss": 0.1395, + "step": 281300 + }, + { + "epoch": 21.975790706755173, + "grad_norm": 0.9437336921691895, + "learning_rate": 1.5618882375928097e-05, + "loss": 0.1357, + "step": 281400 + }, + { + "epoch": 21.983600156188988, + "grad_norm": 0.6707448363304138, + "learning_rate": 1.5617319265338023e-05, + "loss": 0.1432, + "step": 281500 + }, + { + "epoch": 21.991409605622803, + "grad_norm": 1.0596201419830322, + "learning_rate": 1.561575615474795e-05, + "loss": 0.1426, + "step": 281600 + }, + { + "epoch": 21.999219055056617, + "grad_norm": 0.85732102394104, + "learning_rate": 1.5614193044157875e-05, + "loss": 0.1459, + "step": 281700 + }, + { + "epoch": 22.00702850449043, + "grad_norm": 0.6178754568099976, + "learning_rate": 1.56126299335678e-05, + "loss": 0.142, + "step": 281800 + }, + { + "epoch": 22.01483795392425, + "grad_norm": 0.7230113744735718, + "learning_rate": 1.5611066822977727e-05, + "loss": 0.1409, + "step": 281900 + }, + { + "epoch": 22.022647403358064, + "grad_norm": 0.7485736608505249, + "learning_rate": 1.5609503712387653e-05, + "loss": 0.133, + "step": 282000 + }, + { + "epoch": 22.03045685279188, + "grad_norm": 0.6739600300788879, + "learning_rate": 1.560794060179758e-05, + "loss": 0.1363, + "step": 282100 + }, + { + "epoch": 22.038266302225693, + "grad_norm": 0.7293041944503784, + "learning_rate": 1.5606377491207505e-05, + "loss": 0.1479, + "step": 282200 + }, + { + "epoch": 22.046075751659508, + "grad_norm": 0.722092866897583, + "learning_rate": 1.560481438061743e-05, + "loss": 0.137, + "step": 282300 + }, + { + "epoch": 22.053885201093323, + "grad_norm": 0.8747148513793945, + "learning_rate": 1.5603251270027354e-05, + "loss": 0.1345, + "step": 282400 + }, + { + "epoch": 22.061694650527137, + "grad_norm": 0.969915509223938, + "learning_rate": 1.560168815943728e-05, + "loss": 0.1381, + "step": 282500 + }, + { + "epoch": 22.06950409996095, + "grad_norm": 0.8908195495605469, + "learning_rate": 1.560012504884721e-05, + "loss": 0.1342, + "step": 282600 + }, + { + "epoch": 22.077313549394766, + "grad_norm": 0.9030640721321106, + "learning_rate": 1.5598561938257132e-05, + "loss": 0.1343, + "step": 282700 + }, + { + "epoch": 22.08512299882858, + "grad_norm": 0.7707045078277588, + "learning_rate": 1.5597014458772958e-05, + "loss": 0.1375, + "step": 282800 + }, + { + "epoch": 22.0929324482624, + "grad_norm": 0.7158626914024353, + "learning_rate": 1.5595451348182887e-05, + "loss": 0.1391, + "step": 282900 + }, + { + "epoch": 22.100741897696214, + "grad_norm": 0.8736804723739624, + "learning_rate": 1.559388823759281e-05, + "loss": 0.1356, + "step": 283000 + }, + { + "epoch": 22.108551347130028, + "grad_norm": 0.8259661793708801, + "learning_rate": 1.5592325127002736e-05, + "loss": 0.1417, + "step": 283100 + }, + { + "epoch": 22.116360796563843, + "grad_norm": 0.7664586305618286, + "learning_rate": 1.5590762016412662e-05, + "loss": 0.1404, + "step": 283200 + }, + { + "epoch": 22.124170245997657, + "grad_norm": 0.7634907960891724, + "learning_rate": 1.5589198905822588e-05, + "loss": 0.1408, + "step": 283300 + }, + { + "epoch": 22.131979695431472, + "grad_norm": 0.7122980952262878, + "learning_rate": 1.5587635795232514e-05, + "loss": 0.1359, + "step": 283400 + }, + { + "epoch": 22.139789144865286, + "grad_norm": 1.002042531967163, + "learning_rate": 1.558607268464244e-05, + "loss": 0.1345, + "step": 283500 + }, + { + "epoch": 22.1475985942991, + "grad_norm": 0.675338089466095, + "learning_rate": 1.5584509574052366e-05, + "loss": 0.1415, + "step": 283600 + }, + { + "epoch": 22.155408043732916, + "grad_norm": 1.0163286924362183, + "learning_rate": 1.5582946463462292e-05, + "loss": 0.141, + "step": 283700 + }, + { + "epoch": 22.16321749316673, + "grad_norm": 0.7089374661445618, + "learning_rate": 1.5581383352872218e-05, + "loss": 0.1497, + "step": 283800 + }, + { + "epoch": 22.171026942600548, + "grad_norm": 0.6143225431442261, + "learning_rate": 1.557982024228214e-05, + "loss": 0.1443, + "step": 283900 + }, + { + "epoch": 22.178836392034363, + "grad_norm": 1.1019307374954224, + "learning_rate": 1.557825713169207e-05, + "loss": 0.1351, + "step": 284000 + }, + { + "epoch": 22.186645841468177, + "grad_norm": 0.901965856552124, + "learning_rate": 1.5576694021101996e-05, + "loss": 0.1405, + "step": 284100 + }, + { + "epoch": 22.194455290901992, + "grad_norm": 0.5475614070892334, + "learning_rate": 1.557513091051192e-05, + "loss": 0.1338, + "step": 284200 + }, + { + "epoch": 22.202264740335806, + "grad_norm": 0.8850200176239014, + "learning_rate": 1.5573567799921845e-05, + "loss": 0.1438, + "step": 284300 + }, + { + "epoch": 22.21007418976962, + "grad_norm": 1.0332914590835571, + "learning_rate": 1.557200468933177e-05, + "loss": 0.1353, + "step": 284400 + }, + { + "epoch": 22.217883639203436, + "grad_norm": 0.8510705828666687, + "learning_rate": 1.5570441578741697e-05, + "loss": 0.1396, + "step": 284500 + }, + { + "epoch": 22.22569308863725, + "grad_norm": 0.9297281503677368, + "learning_rate": 1.5568878468151623e-05, + "loss": 0.142, + "step": 284600 + }, + { + "epoch": 22.233502538071065, + "grad_norm": 1.048340916633606, + "learning_rate": 1.556731535756155e-05, + "loss": 0.1349, + "step": 284700 + }, + { + "epoch": 22.24131198750488, + "grad_norm": 0.9004128575325012, + "learning_rate": 1.5565767878077375e-05, + "loss": 0.1369, + "step": 284800 + }, + { + "epoch": 22.249121436938697, + "grad_norm": 0.7570412755012512, + "learning_rate": 1.55642047674873e-05, + "loss": 0.1367, + "step": 284900 + }, + { + "epoch": 22.256930886372512, + "grad_norm": 1.0669686794281006, + "learning_rate": 1.5562641656897227e-05, + "loss": 0.1483, + "step": 285000 + }, + { + "epoch": 22.264740335806326, + "grad_norm": 0.7067891955375671, + "learning_rate": 1.5561078546307153e-05, + "loss": 0.1474, + "step": 285100 + }, + { + "epoch": 22.27254978524014, + "grad_norm": 0.7027773857116699, + "learning_rate": 1.555951543571708e-05, + "loss": 0.1402, + "step": 285200 + }, + { + "epoch": 22.280359234673956, + "grad_norm": 0.7071043848991394, + "learning_rate": 1.5557952325127005e-05, + "loss": 0.1376, + "step": 285300 + }, + { + "epoch": 22.28816868410777, + "grad_norm": 0.8070594668388367, + "learning_rate": 1.5556389214536927e-05, + "loss": 0.1365, + "step": 285400 + }, + { + "epoch": 22.295978133541585, + "grad_norm": 0.5632590651512146, + "learning_rate": 1.5554826103946857e-05, + "loss": 0.1352, + "step": 285500 + }, + { + "epoch": 22.3037875829754, + "grad_norm": 0.7632501125335693, + "learning_rate": 1.5553262993356783e-05, + "loss": 0.1374, + "step": 285600 + }, + { + "epoch": 22.311597032409214, + "grad_norm": 0.9150585532188416, + "learning_rate": 1.5551699882766705e-05, + "loss": 0.1369, + "step": 285700 + }, + { + "epoch": 22.31940648184303, + "grad_norm": 0.5579971671104431, + "learning_rate": 1.5550136772176635e-05, + "loss": 0.1367, + "step": 285800 + }, + { + "epoch": 22.327215931276847, + "grad_norm": 0.9822412133216858, + "learning_rate": 1.5548573661586557e-05, + "loss": 0.1387, + "step": 285900 + }, + { + "epoch": 22.33502538071066, + "grad_norm": 0.6855137944221497, + "learning_rate": 1.5547010550996483e-05, + "loss": 0.1364, + "step": 286000 + }, + { + "epoch": 22.342834830144476, + "grad_norm": 0.592866063117981, + "learning_rate": 1.554544744040641e-05, + "loss": 0.145, + "step": 286100 + }, + { + "epoch": 22.35064427957829, + "grad_norm": 0.7260706424713135, + "learning_rate": 1.5543884329816335e-05, + "loss": 0.1404, + "step": 286200 + }, + { + "epoch": 22.358453729012105, + "grad_norm": 0.7422342896461487, + "learning_rate": 1.554232121922626e-05, + "loss": 0.1384, + "step": 286300 + }, + { + "epoch": 22.36626317844592, + "grad_norm": 0.8567727208137512, + "learning_rate": 1.5540758108636187e-05, + "loss": 0.1349, + "step": 286400 + }, + { + "epoch": 22.374072627879734, + "grad_norm": 0.9445796608924866, + "learning_rate": 1.5539194998046113e-05, + "loss": 0.1358, + "step": 286500 + }, + { + "epoch": 22.38188207731355, + "grad_norm": 1.007031798362732, + "learning_rate": 1.553763188745604e-05, + "loss": 0.1367, + "step": 286600 + }, + { + "epoch": 22.389691526747363, + "grad_norm": 0.663798987865448, + "learning_rate": 1.5536068776865965e-05, + "loss": 0.1381, + "step": 286700 + }, + { + "epoch": 22.397500976181178, + "grad_norm": 0.9678874611854553, + "learning_rate": 1.553452129738179e-05, + "loss": 0.1298, + "step": 286800 + }, + { + "epoch": 22.405310425614996, + "grad_norm": 0.6478586196899414, + "learning_rate": 1.5532958186791717e-05, + "loss": 0.1412, + "step": 286900 + }, + { + "epoch": 22.41311987504881, + "grad_norm": 0.9851172566413879, + "learning_rate": 1.5531395076201643e-05, + "loss": 0.1384, + "step": 287000 + }, + { + "epoch": 22.420929324482625, + "grad_norm": 0.925794243812561, + "learning_rate": 1.552983196561157e-05, + "loss": 0.1407, + "step": 287100 + }, + { + "epoch": 22.42873877391644, + "grad_norm": 0.7711319923400879, + "learning_rate": 1.5528268855021492e-05, + "loss": 0.1469, + "step": 287200 + }, + { + "epoch": 22.436548223350254, + "grad_norm": 0.8208619356155396, + "learning_rate": 1.552670574443142e-05, + "loss": 0.1439, + "step": 287300 + }, + { + "epoch": 22.44435767278407, + "grad_norm": 0.9265621304512024, + "learning_rate": 1.5525142633841347e-05, + "loss": 0.1356, + "step": 287400 + }, + { + "epoch": 22.452167122217883, + "grad_norm": 0.6884872317314148, + "learning_rate": 1.552357952325127e-05, + "loss": 0.136, + "step": 287500 + }, + { + "epoch": 22.459976571651698, + "grad_norm": 0.899463415145874, + "learning_rate": 1.5522016412661196e-05, + "loss": 0.1399, + "step": 287600 + }, + { + "epoch": 22.467786021085512, + "grad_norm": 0.9107911586761475, + "learning_rate": 1.5520453302071122e-05, + "loss": 0.1357, + "step": 287700 + }, + { + "epoch": 22.475595470519327, + "grad_norm": 0.9815389513969421, + "learning_rate": 1.5518890191481048e-05, + "loss": 0.14, + "step": 287800 + }, + { + "epoch": 22.483404919953145, + "grad_norm": 0.8975253701210022, + "learning_rate": 1.5517327080890974e-05, + "loss": 0.1393, + "step": 287900 + }, + { + "epoch": 22.49121436938696, + "grad_norm": 0.8066114187240601, + "learning_rate": 1.55157639703009e-05, + "loss": 0.1336, + "step": 288000 + }, + { + "epoch": 22.499023818820774, + "grad_norm": 0.7992839813232422, + "learning_rate": 1.5514200859710826e-05, + "loss": 0.132, + "step": 288100 + }, + { + "epoch": 22.50683326825459, + "grad_norm": 0.8005449771881104, + "learning_rate": 1.5512637749120752e-05, + "loss": 0.1399, + "step": 288200 + }, + { + "epoch": 22.514642717688403, + "grad_norm": 0.7647144198417664, + "learning_rate": 1.5511074638530678e-05, + "loss": 0.1372, + "step": 288300 + }, + { + "epoch": 22.522452167122218, + "grad_norm": 0.8365885019302368, + "learning_rate": 1.5509511527940604e-05, + "loss": 0.14, + "step": 288400 + }, + { + "epoch": 22.530261616556032, + "grad_norm": 0.9638312458992004, + "learning_rate": 1.550794841735053e-05, + "loss": 0.1315, + "step": 288500 + }, + { + "epoch": 22.538071065989847, + "grad_norm": 0.7517139911651611, + "learning_rate": 1.5506385306760453e-05, + "loss": 0.1413, + "step": 288600 + }, + { + "epoch": 22.54588051542366, + "grad_norm": 0.8458108901977539, + "learning_rate": 1.550482219617038e-05, + "loss": 0.1393, + "step": 288700 + }, + { + "epoch": 22.553689964857476, + "grad_norm": 0.7653716802597046, + "learning_rate": 1.5503259085580308e-05, + "loss": 0.1404, + "step": 288800 + }, + { + "epoch": 22.561499414291294, + "grad_norm": 0.631316065788269, + "learning_rate": 1.550169597499023e-05, + "loss": 0.133, + "step": 288900 + }, + { + "epoch": 22.56930886372511, + "grad_norm": 1.053382158279419, + "learning_rate": 1.5500148495506057e-05, + "loss": 0.1336, + "step": 289000 + }, + { + "epoch": 22.577118313158923, + "grad_norm": 1.0804376602172852, + "learning_rate": 1.5498585384915986e-05, + "loss": 0.1325, + "step": 289100 + }, + { + "epoch": 22.584927762592738, + "grad_norm": 0.7826420664787292, + "learning_rate": 1.549702227432591e-05, + "loss": 0.1439, + "step": 289200 + }, + { + "epoch": 22.592737212026552, + "grad_norm": 0.9446529150009155, + "learning_rate": 1.5495459163735835e-05, + "loss": 0.1418, + "step": 289300 + }, + { + "epoch": 22.600546661460367, + "grad_norm": 0.9799430966377258, + "learning_rate": 1.549389605314576e-05, + "loss": 0.131, + "step": 289400 + }, + { + "epoch": 22.60835611089418, + "grad_norm": 0.7131752371788025, + "learning_rate": 1.5492332942555687e-05, + "loss": 0.1356, + "step": 289500 + }, + { + "epoch": 22.616165560327996, + "grad_norm": 0.7591296434402466, + "learning_rate": 1.5490769831965613e-05, + "loss": 0.1302, + "step": 289600 + }, + { + "epoch": 22.62397500976181, + "grad_norm": 0.7944523096084595, + "learning_rate": 1.548920672137554e-05, + "loss": 0.1381, + "step": 289700 + }, + { + "epoch": 22.631784459195625, + "grad_norm": 0.7745599150657654, + "learning_rate": 1.5487643610785465e-05, + "loss": 0.1378, + "step": 289800 + }, + { + "epoch": 22.639593908629443, + "grad_norm": 0.5822691917419434, + "learning_rate": 1.548608050019539e-05, + "loss": 0.1412, + "step": 289900 + }, + { + "epoch": 22.647403358063258, + "grad_norm": 0.8896892666816711, + "learning_rate": 1.5484517389605317e-05, + "loss": 0.1324, + "step": 290000 + }, + { + "epoch": 22.655212807497072, + "grad_norm": 0.9473217725753784, + "learning_rate": 1.548295427901524e-05, + "loss": 0.1372, + "step": 290100 + }, + { + "epoch": 22.663022256930887, + "grad_norm": 0.6952362656593323, + "learning_rate": 1.548139116842517e-05, + "loss": 0.1369, + "step": 290200 + }, + { + "epoch": 22.6708317063647, + "grad_norm": 0.5139284133911133, + "learning_rate": 1.5479828057835095e-05, + "loss": 0.1305, + "step": 290300 + }, + { + "epoch": 22.678641155798516, + "grad_norm": 0.8785337805747986, + "learning_rate": 1.5478264947245018e-05, + "loss": 0.1378, + "step": 290400 + }, + { + "epoch": 22.68645060523233, + "grad_norm": 1.0016475915908813, + "learning_rate": 1.5476701836654944e-05, + "loss": 0.1411, + "step": 290500 + }, + { + "epoch": 22.694260054666145, + "grad_norm": 0.8744795918464661, + "learning_rate": 1.547513872606487e-05, + "loss": 0.1342, + "step": 290600 + }, + { + "epoch": 22.70206950409996, + "grad_norm": 0.9555802345275879, + "learning_rate": 1.5473575615474796e-05, + "loss": 0.1388, + "step": 290700 + }, + { + "epoch": 22.709878953533774, + "grad_norm": 0.7396284341812134, + "learning_rate": 1.547201250488472e-05, + "loss": 0.1375, + "step": 290800 + }, + { + "epoch": 22.71768840296759, + "grad_norm": 0.872858464717865, + "learning_rate": 1.5470449394294648e-05, + "loss": 0.1316, + "step": 290900 + }, + { + "epoch": 22.725497852401407, + "grad_norm": 0.8516215682029724, + "learning_rate": 1.5468901914810474e-05, + "loss": 0.133, + "step": 291000 + }, + { + "epoch": 22.73330730183522, + "grad_norm": 0.7716134190559387, + "learning_rate": 1.54673388042204e-05, + "loss": 0.1353, + "step": 291100 + }, + { + "epoch": 22.741116751269036, + "grad_norm": 1.0244024991989136, + "learning_rate": 1.5465775693630326e-05, + "loss": 0.1388, + "step": 291200 + }, + { + "epoch": 22.74892620070285, + "grad_norm": 0.9649984836578369, + "learning_rate": 1.546421258304025e-05, + "loss": 0.1372, + "step": 291300 + }, + { + "epoch": 22.756735650136665, + "grad_norm": 0.7406355738639832, + "learning_rate": 1.5462649472450178e-05, + "loss": 0.1393, + "step": 291400 + }, + { + "epoch": 22.76454509957048, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.5461086361860104e-05, + "loss": 0.1381, + "step": 291500 + }, + { + "epoch": 22.772354549004294, + "grad_norm": 0.9101099371910095, + "learning_rate": 1.5459523251270026e-05, + "loss": 0.1427, + "step": 291600 + }, + { + "epoch": 22.78016399843811, + "grad_norm": 0.8881165981292725, + "learning_rate": 1.5457960140679956e-05, + "loss": 0.1262, + "step": 291700 + }, + { + "epoch": 22.787973447871924, + "grad_norm": 0.8255956172943115, + "learning_rate": 1.545639703008988e-05, + "loss": 0.1344, + "step": 291800 + }, + { + "epoch": 22.79578289730574, + "grad_norm": 0.8563169240951538, + "learning_rate": 1.5454833919499804e-05, + "loss": 0.1417, + "step": 291900 + }, + { + "epoch": 22.803592346739556, + "grad_norm": 0.8315694332122803, + "learning_rate": 1.545327080890973e-05, + "loss": 0.1317, + "step": 292000 + }, + { + "epoch": 22.81140179617337, + "grad_norm": 0.9142987132072449, + "learning_rate": 1.5451707698319656e-05, + "loss": 0.1405, + "step": 292100 + }, + { + "epoch": 22.819211245607185, + "grad_norm": 0.7362247109413147, + "learning_rate": 1.5450144587729582e-05, + "loss": 0.1409, + "step": 292200 + }, + { + "epoch": 22.827020695041, + "grad_norm": 0.6340514421463013, + "learning_rate": 1.5448581477139508e-05, + "loss": 0.1348, + "step": 292300 + }, + { + "epoch": 22.834830144474815, + "grad_norm": 0.8943261504173279, + "learning_rate": 1.5447018366549434e-05, + "loss": 0.1318, + "step": 292400 + }, + { + "epoch": 22.84263959390863, + "grad_norm": 0.596819281578064, + "learning_rate": 1.544545525595936e-05, + "loss": 0.1394, + "step": 292500 + }, + { + "epoch": 22.850449043342444, + "grad_norm": 0.9999971985816956, + "learning_rate": 1.5443892145369286e-05, + "loss": 0.1382, + "step": 292600 + }, + { + "epoch": 22.858258492776258, + "grad_norm": 0.6040255427360535, + "learning_rate": 1.5442329034779212e-05, + "loss": 0.1326, + "step": 292700 + }, + { + "epoch": 22.866067942210073, + "grad_norm": 1.0905107259750366, + "learning_rate": 1.544076592418914e-05, + "loss": 0.1316, + "step": 292800 + }, + { + "epoch": 22.873877391643887, + "grad_norm": 0.8896177411079407, + "learning_rate": 1.5439202813599064e-05, + "loss": 0.1349, + "step": 292900 + }, + { + "epoch": 22.881686841077705, + "grad_norm": 0.9923672676086426, + "learning_rate": 1.5437639703008987e-05, + "loss": 0.1318, + "step": 293000 + }, + { + "epoch": 22.88949629051152, + "grad_norm": 0.9534666538238525, + "learning_rate": 1.5436092223524816e-05, + "loss": 0.1391, + "step": 293100 + }, + { + "epoch": 22.897305739945335, + "grad_norm": 0.6288168430328369, + "learning_rate": 1.5434529112934742e-05, + "loss": 0.1423, + "step": 293200 + }, + { + "epoch": 22.90511518937915, + "grad_norm": 0.9616836905479431, + "learning_rate": 1.5432966002344668e-05, + "loss": 0.1415, + "step": 293300 + }, + { + "epoch": 22.912924638812964, + "grad_norm": 0.5409935712814331, + "learning_rate": 1.543140289175459e-05, + "loss": 0.133, + "step": 293400 + }, + { + "epoch": 22.92073408824678, + "grad_norm": 0.8281419277191162, + "learning_rate": 1.542983978116452e-05, + "loss": 0.1354, + "step": 293500 + }, + { + "epoch": 22.928543537680593, + "grad_norm": 0.8489418029785156, + "learning_rate": 1.5428276670574446e-05, + "loss": 0.1317, + "step": 293600 + }, + { + "epoch": 22.936352987114407, + "grad_norm": 0.729698657989502, + "learning_rate": 1.542671355998437e-05, + "loss": 0.1385, + "step": 293700 + }, + { + "epoch": 22.944162436548222, + "grad_norm": 0.6333410143852234, + "learning_rate": 1.5425150449394295e-05, + "loss": 0.135, + "step": 293800 + }, + { + "epoch": 22.95197188598204, + "grad_norm": 0.7198565006256104, + "learning_rate": 1.542358733880422e-05, + "loss": 0.1333, + "step": 293900 + }, + { + "epoch": 22.959781335415855, + "grad_norm": 0.9172605872154236, + "learning_rate": 1.5422024228214147e-05, + "loss": 0.1343, + "step": 294000 + }, + { + "epoch": 22.96759078484967, + "grad_norm": 0.7993659973144531, + "learning_rate": 1.5420461117624073e-05, + "loss": 0.1392, + "step": 294100 + }, + { + "epoch": 22.975400234283484, + "grad_norm": 1.051163673400879, + "learning_rate": 1.5418898007034e-05, + "loss": 0.1458, + "step": 294200 + }, + { + "epoch": 22.9832096837173, + "grad_norm": 0.7416157722473145, + "learning_rate": 1.5417334896443925e-05, + "loss": 0.1334, + "step": 294300 + }, + { + "epoch": 22.991019133151113, + "grad_norm": 1.0222550630569458, + "learning_rate": 1.541577178585385e-05, + "loss": 0.1387, + "step": 294400 + }, + { + "epoch": 22.998828582584927, + "grad_norm": 0.7647424936294556, + "learning_rate": 1.5414208675263777e-05, + "loss": 0.1434, + "step": 294500 + }, + { + "epoch": 23.006638032018742, + "grad_norm": 0.6978480219841003, + "learning_rate": 1.5412645564673703e-05, + "loss": 0.1371, + "step": 294600 + }, + { + "epoch": 23.014447481452557, + "grad_norm": 0.8882784247398376, + "learning_rate": 1.541108245408363e-05, + "loss": 0.135, + "step": 294700 + }, + { + "epoch": 23.02225693088637, + "grad_norm": 0.8186953067779541, + "learning_rate": 1.540951934349355e-05, + "loss": 0.1364, + "step": 294800 + }, + { + "epoch": 23.030066380320186, + "grad_norm": 0.738304078578949, + "learning_rate": 1.5407956232903478e-05, + "loss": 0.1401, + "step": 294900 + }, + { + "epoch": 23.037875829754004, + "grad_norm": 0.8697826266288757, + "learning_rate": 1.5406393122313407e-05, + "loss": 0.131, + "step": 295000 + }, + { + "epoch": 23.04568527918782, + "grad_norm": 0.9213365316390991, + "learning_rate": 1.5404845642829233e-05, + "loss": 0.1318, + "step": 295100 + }, + { + "epoch": 23.053494728621633, + "grad_norm": 0.856341540813446, + "learning_rate": 1.5403282532239156e-05, + "loss": 0.1332, + "step": 295200 + }, + { + "epoch": 23.061304178055448, + "grad_norm": 0.9739687442779541, + "learning_rate": 1.5401735052754985e-05, + "loss": 0.1352, + "step": 295300 + }, + { + "epoch": 23.069113627489262, + "grad_norm": 0.6837376356124878, + "learning_rate": 1.540017194216491e-05, + "loss": 0.1331, + "step": 295400 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 0.7577429413795471, + "learning_rate": 1.5398608831574834e-05, + "loss": 0.1404, + "step": 295500 + }, + { + "epoch": 23.08473252635689, + "grad_norm": 0.8737211227416992, + "learning_rate": 1.539704572098476e-05, + "loss": 0.1317, + "step": 295600 + }, + { + "epoch": 23.092541975790706, + "grad_norm": 1.0972391366958618, + "learning_rate": 1.539548261039469e-05, + "loss": 0.1308, + "step": 295700 + }, + { + "epoch": 23.10035142522452, + "grad_norm": 0.8688048720359802, + "learning_rate": 1.539391949980461e-05, + "loss": 0.1316, + "step": 295800 + }, + { + "epoch": 23.108160874658335, + "grad_norm": 0.6488736271858215, + "learning_rate": 1.5392356389214538e-05, + "loss": 0.1259, + "step": 295900 + }, + { + "epoch": 23.115970324092153, + "grad_norm": 0.8195211291313171, + "learning_rate": 1.5390793278624464e-05, + "loss": 0.1336, + "step": 296000 + }, + { + "epoch": 23.123779773525968, + "grad_norm": 0.7433906197547913, + "learning_rate": 1.538923016803439e-05, + "loss": 0.1328, + "step": 296100 + }, + { + "epoch": 23.131589222959782, + "grad_norm": 1.1779958009719849, + "learning_rate": 1.5387667057444316e-05, + "loss": 0.1335, + "step": 296200 + }, + { + "epoch": 23.139398672393597, + "grad_norm": 0.7165812849998474, + "learning_rate": 1.5386103946854242e-05, + "loss": 0.1291, + "step": 296300 + }, + { + "epoch": 23.14720812182741, + "grad_norm": 0.8701615929603577, + "learning_rate": 1.5384540836264168e-05, + "loss": 0.1317, + "step": 296400 + }, + { + "epoch": 23.155017571261226, + "grad_norm": 1.0225144624710083, + "learning_rate": 1.5382977725674094e-05, + "loss": 0.1426, + "step": 296500 + }, + { + "epoch": 23.16282702069504, + "grad_norm": 0.9554911255836487, + "learning_rate": 1.538141461508402e-05, + "loss": 0.1428, + "step": 296600 + }, + { + "epoch": 23.170636470128855, + "grad_norm": 0.9963477849960327, + "learning_rate": 1.5379851504493942e-05, + "loss": 0.139, + "step": 296700 + }, + { + "epoch": 23.17844591956267, + "grad_norm": 0.8080196380615234, + "learning_rate": 1.5378288393903872e-05, + "loss": 0.1344, + "step": 296800 + }, + { + "epoch": 23.186255368996484, + "grad_norm": 1.0430487394332886, + "learning_rate": 1.5376725283313794e-05, + "loss": 0.1315, + "step": 296900 + }, + { + "epoch": 23.194064818430302, + "grad_norm": 1.0507380962371826, + "learning_rate": 1.537516217272372e-05, + "loss": 0.1314, + "step": 297000 + }, + { + "epoch": 23.201874267864117, + "grad_norm": 0.8849628567695618, + "learning_rate": 1.5373599062133646e-05, + "loss": 0.1315, + "step": 297100 + }, + { + "epoch": 23.20968371729793, + "grad_norm": 0.7833200693130493, + "learning_rate": 1.5372035951543572e-05, + "loss": 0.1405, + "step": 297200 + }, + { + "epoch": 23.217493166731746, + "grad_norm": 1.1596750020980835, + "learning_rate": 1.53704728409535e-05, + "loss": 0.134, + "step": 297300 + }, + { + "epoch": 23.22530261616556, + "grad_norm": 1.0101959705352783, + "learning_rate": 1.5368909730363424e-05, + "loss": 0.1334, + "step": 297400 + }, + { + "epoch": 23.233112065599375, + "grad_norm": 0.7190399169921875, + "learning_rate": 1.536734661977335e-05, + "loss": 0.1302, + "step": 297500 + }, + { + "epoch": 23.24092151503319, + "grad_norm": 0.890193521976471, + "learning_rate": 1.5365783509183276e-05, + "loss": 0.1232, + "step": 297600 + }, + { + "epoch": 23.248730964467004, + "grad_norm": 0.8231490850448608, + "learning_rate": 1.5364220398593202e-05, + "loss": 0.13, + "step": 297700 + }, + { + "epoch": 23.25654041390082, + "grad_norm": 1.3083713054656982, + "learning_rate": 1.5362657288003125e-05, + "loss": 0.1337, + "step": 297800 + }, + { + "epoch": 23.264349863334633, + "grad_norm": 0.8749645352363586, + "learning_rate": 1.5361094177413054e-05, + "loss": 0.1425, + "step": 297900 + }, + { + "epoch": 23.27215931276845, + "grad_norm": 0.8127243518829346, + "learning_rate": 1.535953106682298e-05, + "loss": 0.13, + "step": 298000 + }, + { + "epoch": 23.279968762202266, + "grad_norm": 0.7600269913673401, + "learning_rate": 1.5357967956232903e-05, + "loss": 0.1277, + "step": 298100 + }, + { + "epoch": 23.28777821163608, + "grad_norm": 0.8609147667884827, + "learning_rate": 1.535640484564283e-05, + "loss": 0.1247, + "step": 298200 + }, + { + "epoch": 23.295587661069895, + "grad_norm": 0.6120932102203369, + "learning_rate": 1.5354841735052755e-05, + "loss": 0.1262, + "step": 298300 + }, + { + "epoch": 23.30339711050371, + "grad_norm": 0.6841028928756714, + "learning_rate": 1.535327862446268e-05, + "loss": 0.1273, + "step": 298400 + }, + { + "epoch": 23.311206559937524, + "grad_norm": 0.9684091806411743, + "learning_rate": 1.5351715513872607e-05, + "loss": 0.131, + "step": 298500 + }, + { + "epoch": 23.31901600937134, + "grad_norm": 0.8668888807296753, + "learning_rate": 1.5350152403282533e-05, + "loss": 0.1295, + "step": 298600 + }, + { + "epoch": 23.326825458805153, + "grad_norm": 0.8056829571723938, + "learning_rate": 1.534858929269246e-05, + "loss": 0.1406, + "step": 298700 + }, + { + "epoch": 23.334634908238968, + "grad_norm": 0.9638793468475342, + "learning_rate": 1.5347026182102385e-05, + "loss": 0.1263, + "step": 298800 + }, + { + "epoch": 23.342444357672782, + "grad_norm": 0.7716854810714722, + "learning_rate": 1.534546307151231e-05, + "loss": 0.1359, + "step": 298900 + }, + { + "epoch": 23.3502538071066, + "grad_norm": 0.930443525314331, + "learning_rate": 1.5343899960922237e-05, + "loss": 0.1367, + "step": 299000 + }, + { + "epoch": 23.358063256540415, + "grad_norm": 1.0026413202285767, + "learning_rate": 1.5342336850332163e-05, + "loss": 0.1345, + "step": 299100 + }, + { + "epoch": 23.36587270597423, + "grad_norm": 0.6678506731987, + "learning_rate": 1.5340773739742086e-05, + "loss": 0.1351, + "step": 299200 + }, + { + "epoch": 23.373682155408044, + "grad_norm": 1.0416382551193237, + "learning_rate": 1.5339210629152015e-05, + "loss": 0.1386, + "step": 299300 + }, + { + "epoch": 23.38149160484186, + "grad_norm": 0.9412092566490173, + "learning_rate": 1.533764751856194e-05, + "loss": 0.1309, + "step": 299400 + }, + { + "epoch": 23.389301054275673, + "grad_norm": 0.7648351788520813, + "learning_rate": 1.5336084407971864e-05, + "loss": 0.1328, + "step": 299500 + }, + { + "epoch": 23.397110503709488, + "grad_norm": 0.6699008941650391, + "learning_rate": 1.533452129738179e-05, + "loss": 0.1357, + "step": 299600 + }, + { + "epoch": 23.404919953143303, + "grad_norm": 0.7237615585327148, + "learning_rate": 1.533297381789762e-05, + "loss": 0.129, + "step": 299700 + }, + { + "epoch": 23.412729402577117, + "grad_norm": 1.0570127964019775, + "learning_rate": 1.5331410707307545e-05, + "loss": 0.133, + "step": 299800 + }, + { + "epoch": 23.42053885201093, + "grad_norm": 0.845137894153595, + "learning_rate": 1.5329847596717468e-05, + "loss": 0.1344, + "step": 299900 + }, + { + "epoch": 23.42834830144475, + "grad_norm": 0.8534473180770874, + "learning_rate": 1.5328284486127394e-05, + "loss": 0.1348, + "step": 300000 + } + ], + "logging_steps": 100, + "max_steps": 1280500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.98964279526528e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}