{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.9988942130482865, "eval_steps": 500, "global_step": 20345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002457304337142155, "grad_norm": 0.285283696562915, "learning_rate": 2.457002457002457e-07, "loss": 0.7971, "step": 10 }, { "epoch": 0.00491460867428431, "grad_norm": 0.2921522667654233, "learning_rate": 4.914004914004914e-07, "loss": 0.8074, "step": 20 }, { "epoch": 0.007371913011426466, "grad_norm": 0.31280161515343075, "learning_rate": 7.371007371007371e-07, "loss": 0.7997, "step": 30 }, { "epoch": 0.00982921734856862, "grad_norm": 0.3370149755824867, "learning_rate": 9.828009828009828e-07, "loss": 0.8018, "step": 40 }, { "epoch": 0.012286521685710775, "grad_norm": 0.3033390004437635, "learning_rate": 1.2285012285012285e-06, "loss": 0.8166, "step": 50 }, { "epoch": 0.014743826022852931, "grad_norm": 0.3712919971264193, "learning_rate": 1.4742014742014743e-06, "loss": 0.8132, "step": 60 }, { "epoch": 0.017201130359995084, "grad_norm": 0.31321758823466445, "learning_rate": 1.71990171990172e-06, "loss": 0.8184, "step": 70 }, { "epoch": 0.01965843469713724, "grad_norm": 0.37516879011862714, "learning_rate": 1.9656019656019657e-06, "loss": 0.8294, "step": 80 }, { "epoch": 0.022115739034279394, "grad_norm": 0.32289871046183183, "learning_rate": 2.211302211302211e-06, "loss": 0.8294, "step": 90 }, { "epoch": 0.02457304337142155, "grad_norm": 0.338168927309645, "learning_rate": 2.457002457002457e-06, "loss": 0.7895, "step": 100 }, { "epoch": 0.027030347708563704, "grad_norm": 0.35536905336773267, "learning_rate": 2.702702702702703e-06, "loss": 0.7911, "step": 110 }, { "epoch": 0.029487652045705862, "grad_norm": 0.33240119439091104, "learning_rate": 2.9484029484029485e-06, "loss": 0.7548, "step": 120 }, { "epoch": 0.031944956382848014, "grad_norm": 0.26170341536833136, "learning_rate": 3.1941031941031944e-06, "loss": 0.7708, "step": 130 }, { "epoch": 0.03440226071999017, "grad_norm": 0.20842847682799195, "learning_rate": 3.43980343980344e-06, "loss": 0.7768, "step": 140 }, { "epoch": 0.036859565057132324, "grad_norm": 0.16918469914077636, "learning_rate": 3.6855036855036854e-06, "loss": 0.7595, "step": 150 }, { "epoch": 0.03931686939427448, "grad_norm": 0.15469461581116478, "learning_rate": 3.931203931203931e-06, "loss": 0.7447, "step": 160 }, { "epoch": 0.04177417373141663, "grad_norm": 0.15006212075675995, "learning_rate": 4.176904176904177e-06, "loss": 0.7556, "step": 170 }, { "epoch": 0.04423147806855879, "grad_norm": 0.1352258435985501, "learning_rate": 4.422604422604422e-06, "loss": 0.7441, "step": 180 }, { "epoch": 0.04668878240570094, "grad_norm": 0.1290665246165999, "learning_rate": 4.668304668304669e-06, "loss": 0.7235, "step": 190 }, { "epoch": 0.0491460867428431, "grad_norm": 0.11555636205518427, "learning_rate": 4.914004914004914e-06, "loss": 0.7027, "step": 200 }, { "epoch": 0.05160339107998525, "grad_norm": 0.13425317356259797, "learning_rate": 5.15970515970516e-06, "loss": 0.7044, "step": 210 }, { "epoch": 0.05406069541712741, "grad_norm": 0.1179890262847447, "learning_rate": 5.405405405405406e-06, "loss": 0.7277, "step": 220 }, { "epoch": 0.05651799975426957, "grad_norm": 0.1154047478396338, "learning_rate": 5.6511056511056515e-06, "loss": 0.7222, "step": 230 }, { "epoch": 0.058975304091411725, "grad_norm": 0.11243808804099786, "learning_rate": 5.896805896805897e-06, "loss": 0.71, "step": 240 }, { "epoch": 0.06143260842855388, "grad_norm": 0.1029399728773226, "learning_rate": 6.142506142506143e-06, "loss": 0.718, "step": 250 }, { "epoch": 0.06388991276569603, "grad_norm": 0.1193206080163564, "learning_rate": 6.388206388206389e-06, "loss": 0.6742, "step": 260 }, { "epoch": 0.06634721710283818, "grad_norm": 0.10628085212313074, "learning_rate": 6.6339066339066335e-06, "loss": 0.6829, "step": 270 }, { "epoch": 0.06880452143998034, "grad_norm": 0.10262438845570923, "learning_rate": 6.87960687960688e-06, "loss": 0.6673, "step": 280 }, { "epoch": 0.07126182577712249, "grad_norm": 0.09225054185955571, "learning_rate": 7.125307125307126e-06, "loss": 0.6702, "step": 290 }, { "epoch": 0.07371913011426465, "grad_norm": 0.12953379688538696, "learning_rate": 7.371007371007371e-06, "loss": 0.6668, "step": 300 }, { "epoch": 0.0761764344514068, "grad_norm": 0.10396609524400965, "learning_rate": 7.616707616707617e-06, "loss": 0.6659, "step": 310 }, { "epoch": 0.07863373878854896, "grad_norm": 0.1217952706413849, "learning_rate": 7.862407862407863e-06, "loss": 0.6787, "step": 320 }, { "epoch": 0.08109104312569111, "grad_norm": 0.11421835568050573, "learning_rate": 8.108108108108109e-06, "loss": 0.6646, "step": 330 }, { "epoch": 0.08354834746283327, "grad_norm": 0.08674641249218844, "learning_rate": 8.353808353808354e-06, "loss": 0.6683, "step": 340 }, { "epoch": 0.08600565179997542, "grad_norm": 0.085566584015851, "learning_rate": 8.5995085995086e-06, "loss": 0.6664, "step": 350 }, { "epoch": 0.08846295613711758, "grad_norm": 0.09682344166889431, "learning_rate": 8.845208845208845e-06, "loss": 0.6545, "step": 360 }, { "epoch": 0.09092026047425973, "grad_norm": 0.10126575286385862, "learning_rate": 9.090909090909091e-06, "loss": 0.6358, "step": 370 }, { "epoch": 0.09337756481140189, "grad_norm": 0.09362155896342508, "learning_rate": 9.336609336609337e-06, "loss": 0.646, "step": 380 }, { "epoch": 0.09583486914854404, "grad_norm": 0.08635432526667908, "learning_rate": 9.582309582309584e-06, "loss": 0.6331, "step": 390 }, { "epoch": 0.0982921734856862, "grad_norm": 0.11202705784768645, "learning_rate": 9.828009828009828e-06, "loss": 0.6118, "step": 400 }, { "epoch": 0.10074947782282835, "grad_norm": 0.1072217744733489, "learning_rate": 1.0073710073710075e-05, "loss": 0.5937, "step": 410 }, { "epoch": 0.1032067821599705, "grad_norm": 0.11393990310743093, "learning_rate": 1.031941031941032e-05, "loss": 0.6374, "step": 420 }, { "epoch": 0.10566408649711266, "grad_norm": 0.09847171653602577, "learning_rate": 1.0565110565110566e-05, "loss": 0.5999, "step": 430 }, { "epoch": 0.10812139083425482, "grad_norm": 0.11517872111658645, "learning_rate": 1.0810810810810812e-05, "loss": 0.6212, "step": 440 }, { "epoch": 0.11057869517139697, "grad_norm": 0.10178303276537287, "learning_rate": 1.1056511056511057e-05, "loss": 0.6221, "step": 450 }, { "epoch": 0.11303599950853914, "grad_norm": 0.10496134270991121, "learning_rate": 1.1302211302211303e-05, "loss": 0.5962, "step": 460 }, { "epoch": 0.1154933038456813, "grad_norm": 0.10989949759147273, "learning_rate": 1.1547911547911548e-05, "loss": 0.6065, "step": 470 }, { "epoch": 0.11795060818282345, "grad_norm": 0.11989291161714485, "learning_rate": 1.1793611793611794e-05, "loss": 0.6166, "step": 480 }, { "epoch": 0.1204079125199656, "grad_norm": 0.13634046844599296, "learning_rate": 1.2039312039312039e-05, "loss": 0.6175, "step": 490 }, { "epoch": 0.12286521685710776, "grad_norm": 0.11769647832040009, "learning_rate": 1.2285012285012287e-05, "loss": 0.5712, "step": 500 }, { "epoch": 0.12532252119424991, "grad_norm": 0.12529744446227853, "learning_rate": 1.2530712530712533e-05, "loss": 0.608, "step": 510 }, { "epoch": 0.12777982553139206, "grad_norm": 0.11293096033059141, "learning_rate": 1.2776412776412778e-05, "loss": 0.5888, "step": 520 }, { "epoch": 0.13023712986853422, "grad_norm": 0.10558319421094586, "learning_rate": 1.3022113022113022e-05, "loss": 0.5986, "step": 530 }, { "epoch": 0.13269443420567636, "grad_norm": 0.1276470058331539, "learning_rate": 1.3267813267813267e-05, "loss": 0.6028, "step": 540 }, { "epoch": 0.13515173854281853, "grad_norm": 0.11691435053453154, "learning_rate": 1.3513513513513515e-05, "loss": 0.5908, "step": 550 }, { "epoch": 0.13760904287996067, "grad_norm": 0.11031785753729674, "learning_rate": 1.375921375921376e-05, "loss": 0.6009, "step": 560 }, { "epoch": 0.14006634721710284, "grad_norm": 0.11247717051699746, "learning_rate": 1.4004914004914004e-05, "loss": 0.5752, "step": 570 }, { "epoch": 0.14252365155424498, "grad_norm": 0.11262303106540275, "learning_rate": 1.4250614250614252e-05, "loss": 0.5978, "step": 580 }, { "epoch": 0.14498095589138715, "grad_norm": 0.12981549150414357, "learning_rate": 1.4496314496314497e-05, "loss": 0.5764, "step": 590 }, { "epoch": 0.1474382602285293, "grad_norm": 0.12196249260795661, "learning_rate": 1.4742014742014742e-05, "loss": 0.5638, "step": 600 }, { "epoch": 0.14989556456567146, "grad_norm": 0.14389357388926616, "learning_rate": 1.4987714987714988e-05, "loss": 0.5583, "step": 610 }, { "epoch": 0.1523528689028136, "grad_norm": 0.13493140763801018, "learning_rate": 1.5233415233415234e-05, "loss": 0.5692, "step": 620 }, { "epoch": 0.15481017323995577, "grad_norm": 0.11799766416329001, "learning_rate": 1.547911547911548e-05, "loss": 0.574, "step": 630 }, { "epoch": 0.1572674775770979, "grad_norm": 0.13778936740698278, "learning_rate": 1.5724815724815725e-05, "loss": 0.5659, "step": 640 }, { "epoch": 0.15972478191424008, "grad_norm": 0.1295247945439516, "learning_rate": 1.5970515970515972e-05, "loss": 0.5712, "step": 650 }, { "epoch": 0.16218208625138222, "grad_norm": 0.12292281534510974, "learning_rate": 1.6216216216216218e-05, "loss": 0.5841, "step": 660 }, { "epoch": 0.1646393905885244, "grad_norm": 0.13396128778746164, "learning_rate": 1.6461916461916464e-05, "loss": 0.5587, "step": 670 }, { "epoch": 0.16709669492566653, "grad_norm": 0.1385303574504802, "learning_rate": 1.6707616707616707e-05, "loss": 0.5516, "step": 680 }, { "epoch": 0.1695539992628087, "grad_norm": 0.12174445099989228, "learning_rate": 1.6953316953316954e-05, "loss": 0.5549, "step": 690 }, { "epoch": 0.17201130359995084, "grad_norm": 0.14928713859091242, "learning_rate": 1.71990171990172e-05, "loss": 0.5689, "step": 700 }, { "epoch": 0.174468607937093, "grad_norm": 0.11779901331190067, "learning_rate": 1.7444717444717446e-05, "loss": 0.5641, "step": 710 }, { "epoch": 0.17692591227423515, "grad_norm": 0.16327872360386755, "learning_rate": 1.769041769041769e-05, "loss": 0.5615, "step": 720 }, { "epoch": 0.17938321661137732, "grad_norm": 0.13303710883623834, "learning_rate": 1.793611793611794e-05, "loss": 0.5467, "step": 730 }, { "epoch": 0.18184052094851946, "grad_norm": 0.14783287233529716, "learning_rate": 1.8181818181818182e-05, "loss": 0.5537, "step": 740 }, { "epoch": 0.18429782528566163, "grad_norm": 0.1426201426693433, "learning_rate": 1.842751842751843e-05, "loss": 0.5744, "step": 750 }, { "epoch": 0.18675512962280377, "grad_norm": 0.1422007121328193, "learning_rate": 1.8673218673218675e-05, "loss": 0.5803, "step": 760 }, { "epoch": 0.18921243395994594, "grad_norm": 0.1467764506639497, "learning_rate": 1.891891891891892e-05, "loss": 0.5765, "step": 770 }, { "epoch": 0.19166973829708808, "grad_norm": 0.15738136945495151, "learning_rate": 1.9164619164619167e-05, "loss": 0.5531, "step": 780 }, { "epoch": 0.19412704263423025, "grad_norm": 0.14845531443726892, "learning_rate": 1.941031941031941e-05, "loss": 0.5555, "step": 790 }, { "epoch": 0.1965843469713724, "grad_norm": 0.15347644666496915, "learning_rate": 1.9656019656019657e-05, "loss": 0.5594, "step": 800 }, { "epoch": 0.19904165130851456, "grad_norm": 0.16037738358748618, "learning_rate": 1.9901719901719903e-05, "loss": 0.5625, "step": 810 }, { "epoch": 0.2014989556456567, "grad_norm": 0.1509235030179032, "learning_rate": 2.014742014742015e-05, "loss": 0.5894, "step": 820 }, { "epoch": 0.20395625998279887, "grad_norm": 0.14830227446684158, "learning_rate": 2.0393120393120392e-05, "loss": 0.5661, "step": 830 }, { "epoch": 0.206413564319941, "grad_norm": 0.1572316134126635, "learning_rate": 2.063882063882064e-05, "loss": 0.5538, "step": 840 }, { "epoch": 0.20887086865708318, "grad_norm": 0.14732239775272546, "learning_rate": 2.0884520884520885e-05, "loss": 0.5458, "step": 850 }, { "epoch": 0.21132817299422532, "grad_norm": 0.1220008396510658, "learning_rate": 2.113022113022113e-05, "loss": 0.5522, "step": 860 }, { "epoch": 0.2137854773313675, "grad_norm": 0.16370268820026745, "learning_rate": 2.1375921375921378e-05, "loss": 0.5554, "step": 870 }, { "epoch": 0.21624278166850963, "grad_norm": 0.1560196936157587, "learning_rate": 2.1621621621621624e-05, "loss": 0.5598, "step": 880 }, { "epoch": 0.2187000860056518, "grad_norm": 0.14525540493668182, "learning_rate": 2.186732186732187e-05, "loss": 0.5394, "step": 890 }, { "epoch": 0.22115739034279394, "grad_norm": 0.15153641038210222, "learning_rate": 2.2113022113022113e-05, "loss": 0.5529, "step": 900 }, { "epoch": 0.2236146946799361, "grad_norm": 0.18345710939999, "learning_rate": 2.235872235872236e-05, "loss": 0.5477, "step": 910 }, { "epoch": 0.22607199901707828, "grad_norm": 0.18013988510346432, "learning_rate": 2.2604422604422606e-05, "loss": 0.5429, "step": 920 }, { "epoch": 0.22852930335422042, "grad_norm": 0.15543830116993207, "learning_rate": 2.2850122850122852e-05, "loss": 0.5533, "step": 930 }, { "epoch": 0.2309866076913626, "grad_norm": 0.16577417915128642, "learning_rate": 2.3095823095823095e-05, "loss": 0.5648, "step": 940 }, { "epoch": 0.23344391202850473, "grad_norm": 0.1657685129078517, "learning_rate": 2.3341523341523342e-05, "loss": 0.5407, "step": 950 }, { "epoch": 0.2359012163656469, "grad_norm": 0.16103255077486922, "learning_rate": 2.3587223587223588e-05, "loss": 0.5344, "step": 960 }, { "epoch": 0.23835852070278904, "grad_norm": 0.16281313964998417, "learning_rate": 2.3832923832923834e-05, "loss": 0.5433, "step": 970 }, { "epoch": 0.2408158250399312, "grad_norm": 0.18428028251708076, "learning_rate": 2.4078624078624077e-05, "loss": 0.5609, "step": 980 }, { "epoch": 0.24327312937707335, "grad_norm": 0.19593731345221804, "learning_rate": 2.4324324324324327e-05, "loss": 0.563, "step": 990 }, { "epoch": 0.24573043371421552, "grad_norm": 0.17293647293235315, "learning_rate": 2.4570024570024573e-05, "loss": 0.5441, "step": 1000 }, { "epoch": 0.24818773805135766, "grad_norm": 0.1498494858717953, "learning_rate": 2.4815724815724816e-05, "loss": 0.5449, "step": 1010 }, { "epoch": 0.25064504238849983, "grad_norm": 0.14330244684310947, "learning_rate": 2.5061425061425066e-05, "loss": 0.5268, "step": 1020 }, { "epoch": 0.25310234672564197, "grad_norm": 0.18515286254532334, "learning_rate": 2.5307125307125306e-05, "loss": 0.5337, "step": 1030 }, { "epoch": 0.2555596510627841, "grad_norm": 0.17539240043630094, "learning_rate": 2.5552825552825555e-05, "loss": 0.5446, "step": 1040 }, { "epoch": 0.2580169553999263, "grad_norm": 0.17988828403504115, "learning_rate": 2.5798525798525802e-05, "loss": 0.5348, "step": 1050 }, { "epoch": 0.26047425973706845, "grad_norm": 0.19201441361949384, "learning_rate": 2.6044226044226045e-05, "loss": 0.5509, "step": 1060 }, { "epoch": 0.2629315640742106, "grad_norm": 0.19078868278704236, "learning_rate": 2.628992628992629e-05, "loss": 0.5466, "step": 1070 }, { "epoch": 0.26538886841135273, "grad_norm": 0.18113121541669838, "learning_rate": 2.6535626535626534e-05, "loss": 0.5381, "step": 1080 }, { "epoch": 0.2678461727484949, "grad_norm": 0.16645216874896743, "learning_rate": 2.678132678132678e-05, "loss": 0.5291, "step": 1090 }, { "epoch": 0.27030347708563707, "grad_norm": 0.1826794355797869, "learning_rate": 2.702702702702703e-05, "loss": 0.5208, "step": 1100 }, { "epoch": 0.2727607814227792, "grad_norm": 0.17801539598794783, "learning_rate": 2.7272727272727273e-05, "loss": 0.5495, "step": 1110 }, { "epoch": 0.27521808575992135, "grad_norm": 0.19289818168337372, "learning_rate": 2.751842751842752e-05, "loss": 0.5288, "step": 1120 }, { "epoch": 0.27767539009706355, "grad_norm": 0.1664016411629758, "learning_rate": 2.776412776412777e-05, "loss": 0.5422, "step": 1130 }, { "epoch": 0.2801326944342057, "grad_norm": 0.20707094208173624, "learning_rate": 2.800982800982801e-05, "loss": 0.5557, "step": 1140 }, { "epoch": 0.28258999877134783, "grad_norm": 0.1831812793359572, "learning_rate": 2.825552825552826e-05, "loss": 0.5414, "step": 1150 }, { "epoch": 0.28504730310848997, "grad_norm": 0.18551678355890616, "learning_rate": 2.8501228501228505e-05, "loss": 0.526, "step": 1160 }, { "epoch": 0.28750460744563217, "grad_norm": 0.19902772923521764, "learning_rate": 2.8746928746928748e-05, "loss": 0.529, "step": 1170 }, { "epoch": 0.2899619117827743, "grad_norm": 0.20133605718148306, "learning_rate": 2.8992628992628994e-05, "loss": 0.5578, "step": 1180 }, { "epoch": 0.29241921611991645, "grad_norm": 0.19899676751912754, "learning_rate": 2.9238329238329237e-05, "loss": 0.5417, "step": 1190 }, { "epoch": 0.2948765204570586, "grad_norm": 0.18178765104192152, "learning_rate": 2.9484029484029483e-05, "loss": 0.5367, "step": 1200 }, { "epoch": 0.2973338247942008, "grad_norm": 0.18782089100973998, "learning_rate": 2.9729729729729733e-05, "loss": 0.529, "step": 1210 }, { "epoch": 0.2997911291313429, "grad_norm": 0.18000473671727077, "learning_rate": 2.9975429975429976e-05, "loss": 0.5108, "step": 1220 }, { "epoch": 0.30224843346848507, "grad_norm": 0.17875759061289231, "learning_rate": 3.0221130221130222e-05, "loss": 0.5391, "step": 1230 }, { "epoch": 0.3047057378056272, "grad_norm": 0.18176083566839848, "learning_rate": 3.046683046683047e-05, "loss": 0.5472, "step": 1240 }, { "epoch": 0.3071630421427694, "grad_norm": 0.20536941155207303, "learning_rate": 3.071253071253071e-05, "loss": 0.5378, "step": 1250 }, { "epoch": 0.30962034647991155, "grad_norm": 0.18558658846811613, "learning_rate": 3.095823095823096e-05, "loss": 0.5364, "step": 1260 }, { "epoch": 0.3120776508170537, "grad_norm": 0.20143910717254088, "learning_rate": 3.120393120393121e-05, "loss": 0.542, "step": 1270 }, { "epoch": 0.3145349551541958, "grad_norm": 0.20789811879294776, "learning_rate": 3.144963144963145e-05, "loss": 0.5166, "step": 1280 }, { "epoch": 0.316992259491338, "grad_norm": 0.175249702855173, "learning_rate": 3.16953316953317e-05, "loss": 0.532, "step": 1290 }, { "epoch": 0.31944956382848017, "grad_norm": 0.2235057911371926, "learning_rate": 3.1941031941031943e-05, "loss": 0.5218, "step": 1300 }, { "epoch": 0.3219068681656223, "grad_norm": 0.16708951572010586, "learning_rate": 3.218673218673219e-05, "loss": 0.5341, "step": 1310 }, { "epoch": 0.32436417250276445, "grad_norm": 0.19749449073107928, "learning_rate": 3.2432432432432436e-05, "loss": 0.5443, "step": 1320 }, { "epoch": 0.32682147683990664, "grad_norm": 0.20299715729218815, "learning_rate": 3.2678132678132676e-05, "loss": 0.5308, "step": 1330 }, { "epoch": 0.3292787811770488, "grad_norm": 0.2190891544394821, "learning_rate": 3.292383292383293e-05, "loss": 0.5163, "step": 1340 }, { "epoch": 0.3317360855141909, "grad_norm": 0.23691694898995339, "learning_rate": 3.3169533169533175e-05, "loss": 0.5307, "step": 1350 }, { "epoch": 0.33419338985133307, "grad_norm": 0.1817007736989202, "learning_rate": 3.3415233415233415e-05, "loss": 0.5241, "step": 1360 }, { "epoch": 0.33665069418847526, "grad_norm": 0.21238762356642577, "learning_rate": 3.366093366093366e-05, "loss": 0.517, "step": 1370 }, { "epoch": 0.3391079985256174, "grad_norm": 0.2241067732170629, "learning_rate": 3.390663390663391e-05, "loss": 0.5325, "step": 1380 }, { "epoch": 0.34156530286275955, "grad_norm": 0.16588569746053264, "learning_rate": 3.4152334152334154e-05, "loss": 0.5327, "step": 1390 }, { "epoch": 0.3440226071999017, "grad_norm": 0.18274664837668758, "learning_rate": 3.43980343980344e-05, "loss": 0.5279, "step": 1400 }, { "epoch": 0.3464799115370439, "grad_norm": 0.21762941385363407, "learning_rate": 3.4643734643734647e-05, "loss": 0.5346, "step": 1410 }, { "epoch": 0.348937215874186, "grad_norm": 0.20941495926240525, "learning_rate": 3.488943488943489e-05, "loss": 0.5291, "step": 1420 }, { "epoch": 0.35139452021132817, "grad_norm": 0.19183490102139417, "learning_rate": 3.513513513513514e-05, "loss": 0.5194, "step": 1430 }, { "epoch": 0.3538518245484703, "grad_norm": 0.20581130680856005, "learning_rate": 3.538083538083538e-05, "loss": 0.5284, "step": 1440 }, { "epoch": 0.3563091288856125, "grad_norm": 0.1929885335054331, "learning_rate": 3.562653562653563e-05, "loss": 0.5291, "step": 1450 }, { "epoch": 0.35876643322275464, "grad_norm": 0.21345649149340465, "learning_rate": 3.587223587223588e-05, "loss": 0.5334, "step": 1460 }, { "epoch": 0.3612237375598968, "grad_norm": 0.2145035802249372, "learning_rate": 3.611793611793612e-05, "loss": 0.5252, "step": 1470 }, { "epoch": 0.3636810418970389, "grad_norm": 0.18224020154138426, "learning_rate": 3.6363636363636364e-05, "loss": 0.5313, "step": 1480 }, { "epoch": 0.3661383462341811, "grad_norm": 0.20856063798459196, "learning_rate": 3.660933660933661e-05, "loss": 0.5187, "step": 1490 }, { "epoch": 0.36859565057132326, "grad_norm": 0.22181399366201102, "learning_rate": 3.685503685503686e-05, "loss": 0.5079, "step": 1500 }, { "epoch": 0.3710529549084654, "grad_norm": 0.20407778938572704, "learning_rate": 3.71007371007371e-05, "loss": 0.521, "step": 1510 }, { "epoch": 0.37351025924560755, "grad_norm": 0.23712462648254148, "learning_rate": 3.734643734643735e-05, "loss": 0.5171, "step": 1520 }, { "epoch": 0.37596756358274974, "grad_norm": 0.2711402035682359, "learning_rate": 3.7592137592137596e-05, "loss": 0.5048, "step": 1530 }, { "epoch": 0.3784248679198919, "grad_norm": 0.24274019346329545, "learning_rate": 3.783783783783784e-05, "loss": 0.5182, "step": 1540 }, { "epoch": 0.380882172257034, "grad_norm": 0.26731740223466915, "learning_rate": 3.808353808353808e-05, "loss": 0.5194, "step": 1550 }, { "epoch": 0.38333947659417617, "grad_norm": 0.22055703973026103, "learning_rate": 3.8329238329238335e-05, "loss": 0.5343, "step": 1560 }, { "epoch": 0.38579678093131836, "grad_norm": 0.2063960798897839, "learning_rate": 3.857493857493858e-05, "loss": 0.5151, "step": 1570 }, { "epoch": 0.3882540852684605, "grad_norm": 0.2194155648506203, "learning_rate": 3.882063882063882e-05, "loss": 0.5179, "step": 1580 }, { "epoch": 0.39071138960560264, "grad_norm": 0.21960779105238923, "learning_rate": 3.906633906633907e-05, "loss": 0.5366, "step": 1590 }, { "epoch": 0.3931686939427448, "grad_norm": 0.19441641706332527, "learning_rate": 3.9312039312039314e-05, "loss": 0.5141, "step": 1600 }, { "epoch": 0.395625998279887, "grad_norm": 0.20642602638661328, "learning_rate": 3.955773955773956e-05, "loss": 0.52, "step": 1610 }, { "epoch": 0.3980833026170291, "grad_norm": 0.25184292049942375, "learning_rate": 3.9803439803439806e-05, "loss": 0.5285, "step": 1620 }, { "epoch": 0.40054060695417126, "grad_norm": 0.22672130310186506, "learning_rate": 4.004914004914005e-05, "loss": 0.5191, "step": 1630 }, { "epoch": 0.4029979112913134, "grad_norm": 0.21590805408754327, "learning_rate": 4.02948402948403e-05, "loss": 0.5119, "step": 1640 }, { "epoch": 0.4054552156284556, "grad_norm": 0.28007695217572004, "learning_rate": 4.0540540540540545e-05, "loss": 0.527, "step": 1650 }, { "epoch": 0.40791251996559774, "grad_norm": 0.22873880090369952, "learning_rate": 4.0786240786240785e-05, "loss": 0.5288, "step": 1660 }, { "epoch": 0.4103698243027399, "grad_norm": 0.24697278067641193, "learning_rate": 4.103194103194104e-05, "loss": 0.521, "step": 1670 }, { "epoch": 0.412827128639882, "grad_norm": 0.23869814822867966, "learning_rate": 4.127764127764128e-05, "loss": 0.5208, "step": 1680 }, { "epoch": 0.4152844329770242, "grad_norm": 0.2003048754724827, "learning_rate": 4.1523341523341524e-05, "loss": 0.5144, "step": 1690 }, { "epoch": 0.41774173731416636, "grad_norm": 0.24280914684025612, "learning_rate": 4.176904176904177e-05, "loss": 0.5316, "step": 1700 }, { "epoch": 0.4201990416513085, "grad_norm": 0.19977154791698154, "learning_rate": 4.2014742014742017e-05, "loss": 0.5228, "step": 1710 }, { "epoch": 0.42265634598845064, "grad_norm": 0.22726167397555366, "learning_rate": 4.226044226044226e-05, "loss": 0.541, "step": 1720 }, { "epoch": 0.42511365032559284, "grad_norm": 0.24202178652923947, "learning_rate": 4.250614250614251e-05, "loss": 0.5024, "step": 1730 }, { "epoch": 0.427570954662735, "grad_norm": 0.20653187632761189, "learning_rate": 4.2751842751842756e-05, "loss": 0.5067, "step": 1740 }, { "epoch": 0.4300282589998771, "grad_norm": 0.22484598364847194, "learning_rate": 4.2997542997543e-05, "loss": 0.5213, "step": 1750 }, { "epoch": 0.43248556333701926, "grad_norm": 0.22474531238513018, "learning_rate": 4.324324324324325e-05, "loss": 0.5244, "step": 1760 }, { "epoch": 0.43494286767416146, "grad_norm": 0.23898679970071965, "learning_rate": 4.348894348894349e-05, "loss": 0.5188, "step": 1770 }, { "epoch": 0.4374001720113036, "grad_norm": 0.20727579041254537, "learning_rate": 4.373464373464374e-05, "loss": 0.5177, "step": 1780 }, { "epoch": 0.43985747634844574, "grad_norm": 0.17781799055410283, "learning_rate": 4.398034398034398e-05, "loss": 0.5114, "step": 1790 }, { "epoch": 0.4423147806855879, "grad_norm": 0.2799732125996814, "learning_rate": 4.422604422604423e-05, "loss": 0.5016, "step": 1800 }, { "epoch": 0.4447720850227301, "grad_norm": 0.24820971111154308, "learning_rate": 4.447174447174447e-05, "loss": 0.5354, "step": 1810 }, { "epoch": 0.4472293893598722, "grad_norm": 0.23992195196663207, "learning_rate": 4.471744471744472e-05, "loss": 0.5291, "step": 1820 }, { "epoch": 0.44968669369701436, "grad_norm": 0.2450283971245491, "learning_rate": 4.4963144963144966e-05, "loss": 0.5172, "step": 1830 }, { "epoch": 0.45214399803415656, "grad_norm": 0.22941785503600065, "learning_rate": 4.520884520884521e-05, "loss": 0.5138, "step": 1840 }, { "epoch": 0.4546013023712987, "grad_norm": 0.21965615056709337, "learning_rate": 4.545454545454546e-05, "loss": 0.5295, "step": 1850 }, { "epoch": 0.45705860670844084, "grad_norm": 0.23747350591081395, "learning_rate": 4.5700245700245705e-05, "loss": 0.526, "step": 1860 }, { "epoch": 0.459515911045583, "grad_norm": 0.24570560907421246, "learning_rate": 4.594594594594595e-05, "loss": 0.5006, "step": 1870 }, { "epoch": 0.4619732153827252, "grad_norm": 0.20139897477413185, "learning_rate": 4.619164619164619e-05, "loss": 0.5101, "step": 1880 }, { "epoch": 0.4644305197198673, "grad_norm": 0.22380516690848093, "learning_rate": 4.6437346437346444e-05, "loss": 0.5237, "step": 1890 }, { "epoch": 0.46688782405700946, "grad_norm": 0.2352934583427217, "learning_rate": 4.6683046683046684e-05, "loss": 0.5233, "step": 1900 }, { "epoch": 0.4693451283941516, "grad_norm": 0.23300978185707835, "learning_rate": 4.692874692874693e-05, "loss": 0.5188, "step": 1910 }, { "epoch": 0.4718024327312938, "grad_norm": 0.221040743390929, "learning_rate": 4.7174447174447176e-05, "loss": 0.5173, "step": 1920 }, { "epoch": 0.47425973706843594, "grad_norm": 0.22575332977219328, "learning_rate": 4.742014742014742e-05, "loss": 0.5177, "step": 1930 }, { "epoch": 0.4767170414055781, "grad_norm": 0.18898079051576847, "learning_rate": 4.766584766584767e-05, "loss": 0.5217, "step": 1940 }, { "epoch": 0.4791743457427202, "grad_norm": 0.229447011685531, "learning_rate": 4.7911547911547915e-05, "loss": 0.5058, "step": 1950 }, { "epoch": 0.4816316500798624, "grad_norm": 0.21714926306524987, "learning_rate": 4.8157248157248155e-05, "loss": 0.5097, "step": 1960 }, { "epoch": 0.48408895441700456, "grad_norm": 0.22190283362696686, "learning_rate": 4.840294840294841e-05, "loss": 0.5077, "step": 1970 }, { "epoch": 0.4865462587541467, "grad_norm": 0.2868145938299506, "learning_rate": 4.8648648648648654e-05, "loss": 0.5346, "step": 1980 }, { "epoch": 0.48900356309128884, "grad_norm": 0.25048353733237727, "learning_rate": 4.8894348894348894e-05, "loss": 0.5246, "step": 1990 }, { "epoch": 0.49146086742843104, "grad_norm": 0.23418241321881647, "learning_rate": 4.914004914004915e-05, "loss": 0.5278, "step": 2000 }, { "epoch": 0.4939181717655732, "grad_norm": 0.22141195113592158, "learning_rate": 4.9385749385749387e-05, "loss": 0.5115, "step": 2010 }, { "epoch": 0.4963754761027153, "grad_norm": 0.19870360364704232, "learning_rate": 4.963144963144963e-05, "loss": 0.5429, "step": 2020 }, { "epoch": 0.49883278043985746, "grad_norm": 0.1996739776260852, "learning_rate": 4.987714987714988e-05, "loss": 0.5057, "step": 2030 }, { "epoch": 0.5012900847769997, "grad_norm": 0.26890457064454826, "learning_rate": 4.998634625887493e-05, "loss": 0.5109, "step": 2040 }, { "epoch": 0.5037473891141417, "grad_norm": 0.2776952742944889, "learning_rate": 4.99590387766248e-05, "loss": 0.4987, "step": 2050 }, { "epoch": 0.5062046934512839, "grad_norm": 0.21968671731393788, "learning_rate": 4.993173129437466e-05, "loss": 0.5053, "step": 2060 }, { "epoch": 0.5086619977884261, "grad_norm": 0.22365054821494762, "learning_rate": 4.990442381212452e-05, "loss": 0.5213, "step": 2070 }, { "epoch": 0.5111193021255682, "grad_norm": 0.26608749447717817, "learning_rate": 4.9877116329874394e-05, "loss": 0.5369, "step": 2080 }, { "epoch": 0.5135766064627104, "grad_norm": 0.2180152727254584, "learning_rate": 4.984980884762425e-05, "loss": 0.5227, "step": 2090 }, { "epoch": 0.5160339107998526, "grad_norm": 0.20987174928327534, "learning_rate": 4.9822501365374115e-05, "loss": 0.52, "step": 2100 }, { "epoch": 0.5184912151369947, "grad_norm": 0.21953079128291408, "learning_rate": 4.979519388312398e-05, "loss": 0.5157, "step": 2110 }, { "epoch": 0.5209485194741369, "grad_norm": 0.22853203340293846, "learning_rate": 4.976788640087384e-05, "loss": 0.513, "step": 2120 }, { "epoch": 0.523405823811279, "grad_norm": 0.24208536630000035, "learning_rate": 4.974057891862371e-05, "loss": 0.4965, "step": 2130 }, { "epoch": 0.5258631281484212, "grad_norm": 0.23286361091592817, "learning_rate": 4.971327143637357e-05, "loss": 0.519, "step": 2140 }, { "epoch": 0.5283204324855634, "grad_norm": 0.1919766128870602, "learning_rate": 4.9685963954123435e-05, "loss": 0.5146, "step": 2150 }, { "epoch": 0.5307777368227055, "grad_norm": 0.23118449104563832, "learning_rate": 4.965865647187329e-05, "loss": 0.5095, "step": 2160 }, { "epoch": 0.5332350411598477, "grad_norm": 0.23961483335916764, "learning_rate": 4.963134898962316e-05, "loss": 0.4943, "step": 2170 }, { "epoch": 0.5356923454969899, "grad_norm": 0.21866421214292348, "learning_rate": 4.960404150737302e-05, "loss": 0.5052, "step": 2180 }, { "epoch": 0.5381496498341319, "grad_norm": 0.22136014401719178, "learning_rate": 4.9576734025122884e-05, "loss": 0.4924, "step": 2190 }, { "epoch": 0.5406069541712741, "grad_norm": 0.21154206849159637, "learning_rate": 4.954942654287275e-05, "loss": 0.5069, "step": 2200 }, { "epoch": 0.5430642585084162, "grad_norm": 0.2543158646581087, "learning_rate": 4.952211906062261e-05, "loss": 0.5043, "step": 2210 }, { "epoch": 0.5455215628455584, "grad_norm": 0.21625638918344717, "learning_rate": 4.9494811578372476e-05, "loss": 0.493, "step": 2220 }, { "epoch": 0.5479788671827006, "grad_norm": 0.2000012813076822, "learning_rate": 4.946750409612234e-05, "loss": 0.5035, "step": 2230 }, { "epoch": 0.5504361715198427, "grad_norm": 0.2546940405574182, "learning_rate": 4.9440196613872204e-05, "loss": 0.5167, "step": 2240 }, { "epoch": 0.5528934758569849, "grad_norm": 0.26417834764844234, "learning_rate": 4.941288913162206e-05, "loss": 0.5165, "step": 2250 }, { "epoch": 0.5553507801941271, "grad_norm": 0.1955398353410745, "learning_rate": 4.938558164937193e-05, "loss": 0.5082, "step": 2260 }, { "epoch": 0.5578080845312692, "grad_norm": 0.23325168574550775, "learning_rate": 4.9358274167121796e-05, "loss": 0.5251, "step": 2270 }, { "epoch": 0.5602653888684114, "grad_norm": 0.2333449386191706, "learning_rate": 4.9330966684871653e-05, "loss": 0.5134, "step": 2280 }, { "epoch": 0.5627226932055535, "grad_norm": 0.2522354210169095, "learning_rate": 4.9303659202621524e-05, "loss": 0.495, "step": 2290 }, { "epoch": 0.5651799975426957, "grad_norm": 0.19356975140195612, "learning_rate": 4.927635172037138e-05, "loss": 0.5262, "step": 2300 }, { "epoch": 0.5676373018798379, "grad_norm": 0.201280556342862, "learning_rate": 4.924904423812125e-05, "loss": 0.4991, "step": 2310 }, { "epoch": 0.5700946062169799, "grad_norm": 0.19758493553454815, "learning_rate": 4.922173675587111e-05, "loss": 0.5284, "step": 2320 }, { "epoch": 0.5725519105541221, "grad_norm": 0.21272785233137384, "learning_rate": 4.9194429273620974e-05, "loss": 0.5087, "step": 2330 }, { "epoch": 0.5750092148912643, "grad_norm": 0.19765391874025637, "learning_rate": 4.916712179137084e-05, "loss": 0.5205, "step": 2340 }, { "epoch": 0.5774665192284064, "grad_norm": 0.19491967963961232, "learning_rate": 4.91398143091207e-05, "loss": 0.5037, "step": 2350 }, { "epoch": 0.5799238235655486, "grad_norm": 0.22323057818723008, "learning_rate": 4.9112506826870566e-05, "loss": 0.5222, "step": 2360 }, { "epoch": 0.5823811279026907, "grad_norm": 0.24017458513818432, "learning_rate": 4.908519934462043e-05, "loss": 0.487, "step": 2370 }, { "epoch": 0.5848384322398329, "grad_norm": 0.21791722079635228, "learning_rate": 4.9057891862370294e-05, "loss": 0.5147, "step": 2380 }, { "epoch": 0.5872957365769751, "grad_norm": 0.2082116852712141, "learning_rate": 4.903058438012015e-05, "loss": 0.498, "step": 2390 }, { "epoch": 0.5897530409141172, "grad_norm": 0.2505094499112228, "learning_rate": 4.900327689787002e-05, "loss": 0.5079, "step": 2400 }, { "epoch": 0.5922103452512594, "grad_norm": 0.2793806816434224, "learning_rate": 4.8975969415619886e-05, "loss": 0.5052, "step": 2410 }, { "epoch": 0.5946676495884016, "grad_norm": 0.2652536033198818, "learning_rate": 4.894866193336974e-05, "loss": 0.5056, "step": 2420 }, { "epoch": 0.5971249539255437, "grad_norm": 0.22634230966181795, "learning_rate": 4.8921354451119614e-05, "loss": 0.5033, "step": 2430 }, { "epoch": 0.5995822582626859, "grad_norm": 0.2730715574569027, "learning_rate": 4.889404696886947e-05, "loss": 0.5244, "step": 2440 }, { "epoch": 0.6020395625998279, "grad_norm": 0.22571582121005274, "learning_rate": 4.8866739486619335e-05, "loss": 0.5026, "step": 2450 }, { "epoch": 0.6044968669369701, "grad_norm": 0.18653938245851134, "learning_rate": 4.88394320043692e-05, "loss": 0.5233, "step": 2460 }, { "epoch": 0.6069541712741123, "grad_norm": 0.219208485263761, "learning_rate": 4.881212452211906e-05, "loss": 0.519, "step": 2470 }, { "epoch": 0.6094114756112544, "grad_norm": 0.22240577631837938, "learning_rate": 4.878481703986893e-05, "loss": 0.5067, "step": 2480 }, { "epoch": 0.6118687799483966, "grad_norm": 0.2247948084982268, "learning_rate": 4.875750955761879e-05, "loss": 0.5041, "step": 2490 }, { "epoch": 0.6143260842855388, "grad_norm": 0.249395419484343, "learning_rate": 4.8730202075368655e-05, "loss": 0.4998, "step": 2500 }, { "epoch": 0.6167833886226809, "grad_norm": 0.24641314043036824, "learning_rate": 4.870289459311851e-05, "loss": 0.5189, "step": 2510 }, { "epoch": 0.6192406929598231, "grad_norm": 0.21025973088779973, "learning_rate": 4.867558711086838e-05, "loss": 0.52, "step": 2520 }, { "epoch": 0.6216979972969652, "grad_norm": 0.2109588916541641, "learning_rate": 4.864827962861824e-05, "loss": 0.5244, "step": 2530 }, { "epoch": 0.6241553016341074, "grad_norm": 0.2607289550157468, "learning_rate": 4.8620972146368104e-05, "loss": 0.4969, "step": 2540 }, { "epoch": 0.6266126059712496, "grad_norm": 0.25093548637567686, "learning_rate": 4.8593664664117975e-05, "loss": 0.5205, "step": 2550 }, { "epoch": 0.6290699103083917, "grad_norm": 0.2316428256698885, "learning_rate": 4.856635718186783e-05, "loss": 0.5102, "step": 2560 }, { "epoch": 0.6315272146455339, "grad_norm": 0.25640883218802163, "learning_rate": 4.85390496996177e-05, "loss": 0.5091, "step": 2570 }, { "epoch": 0.633984518982676, "grad_norm": 0.20609021265879665, "learning_rate": 4.851174221736756e-05, "loss": 0.4886, "step": 2580 }, { "epoch": 0.6364418233198181, "grad_norm": 0.22932589845058854, "learning_rate": 4.8484434735117424e-05, "loss": 0.4927, "step": 2590 }, { "epoch": 0.6388991276569603, "grad_norm": 0.21129103003890498, "learning_rate": 4.845712725286729e-05, "loss": 0.4834, "step": 2600 }, { "epoch": 0.6413564319941024, "grad_norm": 0.22607038944209173, "learning_rate": 4.842981977061715e-05, "loss": 0.4958, "step": 2610 }, { "epoch": 0.6438137363312446, "grad_norm": 0.2215046559625124, "learning_rate": 4.8402512288367016e-05, "loss": 0.4923, "step": 2620 }, { "epoch": 0.6462710406683868, "grad_norm": 0.21879770030749482, "learning_rate": 4.837520480611688e-05, "loss": 0.5118, "step": 2630 }, { "epoch": 0.6487283450055289, "grad_norm": 0.21881982664659413, "learning_rate": 4.8347897323866744e-05, "loss": 0.5111, "step": 2640 }, { "epoch": 0.6511856493426711, "grad_norm": 0.24331286334101362, "learning_rate": 4.83205898416166e-05, "loss": 0.4917, "step": 2650 }, { "epoch": 0.6536429536798133, "grad_norm": 0.2175741081605323, "learning_rate": 4.829328235936647e-05, "loss": 0.5135, "step": 2660 }, { "epoch": 0.6561002580169554, "grad_norm": 0.2612562929657568, "learning_rate": 4.826597487711633e-05, "loss": 0.5359, "step": 2670 }, { "epoch": 0.6585575623540976, "grad_norm": 0.21375839679338737, "learning_rate": 4.823866739486619e-05, "loss": 0.5073, "step": 2680 }, { "epoch": 0.6610148666912397, "grad_norm": 0.23445523073299923, "learning_rate": 4.8211359912616064e-05, "loss": 0.517, "step": 2690 }, { "epoch": 0.6634721710283819, "grad_norm": 0.21861284063838463, "learning_rate": 4.818405243036592e-05, "loss": 0.4649, "step": 2700 }, { "epoch": 0.665929475365524, "grad_norm": 0.2160090171891385, "learning_rate": 4.8156744948115785e-05, "loss": 0.5066, "step": 2710 }, { "epoch": 0.6683867797026661, "grad_norm": 0.21770271316827652, "learning_rate": 4.812943746586565e-05, "loss": 0.5003, "step": 2720 }, { "epoch": 0.6708440840398083, "grad_norm": 0.2323077866647499, "learning_rate": 4.810212998361551e-05, "loss": 0.4908, "step": 2730 }, { "epoch": 0.6733013883769505, "grad_norm": 0.22070790479213248, "learning_rate": 4.807482250136538e-05, "loss": 0.5042, "step": 2740 }, { "epoch": 0.6757586927140926, "grad_norm": 0.19264522658161332, "learning_rate": 4.804751501911524e-05, "loss": 0.5074, "step": 2750 }, { "epoch": 0.6782159970512348, "grad_norm": 0.2057339253153092, "learning_rate": 4.8020207536865105e-05, "loss": 0.4787, "step": 2760 }, { "epoch": 0.680673301388377, "grad_norm": 0.23394100997269088, "learning_rate": 4.799290005461496e-05, "loss": 0.4908, "step": 2770 }, { "epoch": 0.6831306057255191, "grad_norm": 0.24059487234294308, "learning_rate": 4.796559257236483e-05, "loss": 0.4951, "step": 2780 }, { "epoch": 0.6855879100626613, "grad_norm": 0.26632711167416795, "learning_rate": 4.793828509011469e-05, "loss": 0.4917, "step": 2790 }, { "epoch": 0.6880452143998034, "grad_norm": 0.22016594342009094, "learning_rate": 4.7910977607864554e-05, "loss": 0.5135, "step": 2800 }, { "epoch": 0.6905025187369456, "grad_norm": 0.20697702051467373, "learning_rate": 4.788367012561442e-05, "loss": 0.515, "step": 2810 }, { "epoch": 0.6929598230740878, "grad_norm": 0.22515297268582504, "learning_rate": 4.785636264336428e-05, "loss": 0.4978, "step": 2820 }, { "epoch": 0.6954171274112299, "grad_norm": 0.19145769629604445, "learning_rate": 4.782905516111415e-05, "loss": 0.4876, "step": 2830 }, { "epoch": 0.697874431748372, "grad_norm": 0.22102617282240558, "learning_rate": 4.780174767886401e-05, "loss": 0.5002, "step": 2840 }, { "epoch": 0.7003317360855142, "grad_norm": 0.23232015942192108, "learning_rate": 4.7774440196613874e-05, "loss": 0.5165, "step": 2850 }, { "epoch": 0.7027890404226563, "grad_norm": 0.19967999340461975, "learning_rate": 4.774713271436374e-05, "loss": 0.5009, "step": 2860 }, { "epoch": 0.7052463447597985, "grad_norm": 0.2413552966785213, "learning_rate": 4.77198252321136e-05, "loss": 0.4999, "step": 2870 }, { "epoch": 0.7077036490969406, "grad_norm": 0.2596136751401026, "learning_rate": 4.7692517749863466e-05, "loss": 0.5086, "step": 2880 }, { "epoch": 0.7101609534340828, "grad_norm": 0.22758807171892, "learning_rate": 4.766521026761333e-05, "loss": 0.5159, "step": 2890 }, { "epoch": 0.712618257771225, "grad_norm": 0.2230803896182145, "learning_rate": 4.7637902785363194e-05, "loss": 0.5186, "step": 2900 }, { "epoch": 0.7150755621083671, "grad_norm": 0.20943975574403229, "learning_rate": 4.761059530311305e-05, "loss": 0.5087, "step": 2910 }, { "epoch": 0.7175328664455093, "grad_norm": 0.21424107746252266, "learning_rate": 4.758328782086292e-05, "loss": 0.4958, "step": 2920 }, { "epoch": 0.7199901707826515, "grad_norm": 0.2121272501620453, "learning_rate": 4.755598033861278e-05, "loss": 0.5092, "step": 2930 }, { "epoch": 0.7224474751197936, "grad_norm": 0.24934768605612778, "learning_rate": 4.7528672856362644e-05, "loss": 0.4975, "step": 2940 }, { "epoch": 0.7249047794569358, "grad_norm": 0.2551183649562764, "learning_rate": 4.750136537411251e-05, "loss": 0.4959, "step": 2950 }, { "epoch": 0.7273620837940779, "grad_norm": 0.2195958119982897, "learning_rate": 4.747405789186237e-05, "loss": 0.4941, "step": 2960 }, { "epoch": 0.72981938813122, "grad_norm": 0.2123700040914377, "learning_rate": 4.7446750409612236e-05, "loss": 0.5104, "step": 2970 }, { "epoch": 0.7322766924683622, "grad_norm": 0.21814319210125513, "learning_rate": 4.74194429273621e-05, "loss": 0.4952, "step": 2980 }, { "epoch": 0.7347339968055043, "grad_norm": 0.22576538815541397, "learning_rate": 4.7392135445111964e-05, "loss": 0.4869, "step": 2990 }, { "epoch": 0.7371913011426465, "grad_norm": 0.22757481561213871, "learning_rate": 4.736482796286182e-05, "loss": 0.5021, "step": 3000 }, { "epoch": 0.7396486054797887, "grad_norm": 0.2314588209995822, "learning_rate": 4.733752048061169e-05, "loss": 0.5095, "step": 3010 }, { "epoch": 0.7421059098169308, "grad_norm": 0.21291565765103015, "learning_rate": 4.7310212998361556e-05, "loss": 0.4953, "step": 3020 }, { "epoch": 0.744563214154073, "grad_norm": 0.22879142850722695, "learning_rate": 4.728290551611141e-05, "loss": 0.513, "step": 3030 }, { "epoch": 0.7470205184912151, "grad_norm": 0.211927788455519, "learning_rate": 4.7255598033861284e-05, "loss": 0.4995, "step": 3040 }, { "epoch": 0.7494778228283573, "grad_norm": 0.234019710677994, "learning_rate": 4.722829055161114e-05, "loss": 0.5185, "step": 3050 }, { "epoch": 0.7519351271654995, "grad_norm": 0.2154944124915675, "learning_rate": 4.7200983069361005e-05, "loss": 0.499, "step": 3060 }, { "epoch": 0.7543924315026416, "grad_norm": 0.22654266892129007, "learning_rate": 4.717367558711087e-05, "loss": 0.4943, "step": 3070 }, { "epoch": 0.7568497358397838, "grad_norm": 0.21011714175068505, "learning_rate": 4.714636810486073e-05, "loss": 0.5126, "step": 3080 }, { "epoch": 0.759307040176926, "grad_norm": 0.2547097551891561, "learning_rate": 4.71190606226106e-05, "loss": 0.4849, "step": 3090 }, { "epoch": 0.761764344514068, "grad_norm": 0.2452467415021898, "learning_rate": 4.709175314036046e-05, "loss": 0.5151, "step": 3100 }, { "epoch": 0.7642216488512102, "grad_norm": 0.23559209655988236, "learning_rate": 4.7064445658110325e-05, "loss": 0.4987, "step": 3110 }, { "epoch": 0.7666789531883523, "grad_norm": 0.20122098584290513, "learning_rate": 4.703713817586019e-05, "loss": 0.4964, "step": 3120 }, { "epoch": 0.7691362575254945, "grad_norm": 0.1988515331525375, "learning_rate": 4.700983069361005e-05, "loss": 0.4891, "step": 3130 }, { "epoch": 0.7715935618626367, "grad_norm": 0.23007696963491614, "learning_rate": 4.698252321135991e-05, "loss": 0.503, "step": 3140 }, { "epoch": 0.7740508661997788, "grad_norm": 0.17746080056789454, "learning_rate": 4.695521572910978e-05, "loss": 0.5093, "step": 3150 }, { "epoch": 0.776508170536921, "grad_norm": 0.22054868903015123, "learning_rate": 4.6927908246859645e-05, "loss": 0.4931, "step": 3160 }, { "epoch": 0.7789654748740632, "grad_norm": 0.220910234929321, "learning_rate": 4.69006007646095e-05, "loss": 0.4906, "step": 3170 }, { "epoch": 0.7814227792112053, "grad_norm": 0.21850398825794007, "learning_rate": 4.687329328235937e-05, "loss": 0.5096, "step": 3180 }, { "epoch": 0.7838800835483475, "grad_norm": 0.20319988901760286, "learning_rate": 4.684598580010923e-05, "loss": 0.4978, "step": 3190 }, { "epoch": 0.7863373878854896, "grad_norm": 0.22649918726773163, "learning_rate": 4.6818678317859094e-05, "loss": 0.5022, "step": 3200 }, { "epoch": 0.7887946922226318, "grad_norm": 0.1986128649267418, "learning_rate": 4.679137083560896e-05, "loss": 0.519, "step": 3210 }, { "epoch": 0.791251996559774, "grad_norm": 0.23341723521559332, "learning_rate": 4.676406335335882e-05, "loss": 0.498, "step": 3220 }, { "epoch": 0.793709300896916, "grad_norm": 0.20038090567265168, "learning_rate": 4.6736755871108686e-05, "loss": 0.4888, "step": 3230 }, { "epoch": 0.7961666052340582, "grad_norm": 0.24849256391639574, "learning_rate": 4.670944838885855e-05, "loss": 0.4983, "step": 3240 }, { "epoch": 0.7986239095712004, "grad_norm": 0.21639057431065642, "learning_rate": 4.6682140906608414e-05, "loss": 0.4927, "step": 3250 }, { "epoch": 0.8010812139083425, "grad_norm": 0.2218161696013767, "learning_rate": 4.665483342435827e-05, "loss": 0.5018, "step": 3260 }, { "epoch": 0.8035385182454847, "grad_norm": 0.21137563041948226, "learning_rate": 4.662752594210814e-05, "loss": 0.4799, "step": 3270 }, { "epoch": 0.8059958225826268, "grad_norm": 0.21590482255578294, "learning_rate": 4.6600218459858e-05, "loss": 0.5071, "step": 3280 }, { "epoch": 0.808453126919769, "grad_norm": 0.2036908909439029, "learning_rate": 4.6572910977607863e-05, "loss": 0.4826, "step": 3290 }, { "epoch": 0.8109104312569112, "grad_norm": 0.25658415869511264, "learning_rate": 4.6545603495357734e-05, "loss": 0.4958, "step": 3300 }, { "epoch": 0.8133677355940533, "grad_norm": 0.22000742531085704, "learning_rate": 4.651829601310759e-05, "loss": 0.5033, "step": 3310 }, { "epoch": 0.8158250399311955, "grad_norm": 0.196920622117492, "learning_rate": 4.6490988530857455e-05, "loss": 0.5002, "step": 3320 }, { "epoch": 0.8182823442683377, "grad_norm": 0.2328263260391484, "learning_rate": 4.646368104860732e-05, "loss": 0.5105, "step": 3330 }, { "epoch": 0.8207396486054798, "grad_norm": 0.20835400810927568, "learning_rate": 4.6436373566357183e-05, "loss": 0.492, "step": 3340 }, { "epoch": 0.823196952942622, "grad_norm": 0.21696077178254128, "learning_rate": 4.640906608410705e-05, "loss": 0.4885, "step": 3350 }, { "epoch": 0.825654257279764, "grad_norm": 0.20443406519593144, "learning_rate": 4.638175860185691e-05, "loss": 0.494, "step": 3360 }, { "epoch": 0.8281115616169062, "grad_norm": 0.18733611630835953, "learning_rate": 4.6354451119606775e-05, "loss": 0.5007, "step": 3370 }, { "epoch": 0.8305688659540484, "grad_norm": 0.18354473639273466, "learning_rate": 4.632714363735664e-05, "loss": 0.4992, "step": 3380 }, { "epoch": 0.8330261702911905, "grad_norm": 0.21596049257360814, "learning_rate": 4.6299836155106503e-05, "loss": 0.5193, "step": 3390 }, { "epoch": 0.8354834746283327, "grad_norm": 0.200491286141889, "learning_rate": 4.627252867285636e-05, "loss": 0.4979, "step": 3400 }, { "epoch": 0.8379407789654749, "grad_norm": 0.19630241939616574, "learning_rate": 4.624522119060623e-05, "loss": 0.4838, "step": 3410 }, { "epoch": 0.840398083302617, "grad_norm": 0.20537972546812255, "learning_rate": 4.621791370835609e-05, "loss": 0.5095, "step": 3420 }, { "epoch": 0.8428553876397592, "grad_norm": 0.1993843900153466, "learning_rate": 4.619060622610595e-05, "loss": 0.502, "step": 3430 }, { "epoch": 0.8453126919769013, "grad_norm": 0.20501337257690094, "learning_rate": 4.6163298743855823e-05, "loss": 0.4931, "step": 3440 }, { "epoch": 0.8477699963140435, "grad_norm": 0.2122139395792062, "learning_rate": 4.613599126160568e-05, "loss": 0.507, "step": 3450 }, { "epoch": 0.8502273006511857, "grad_norm": 0.1907063254484309, "learning_rate": 4.6108683779355545e-05, "loss": 0.5073, "step": 3460 }, { "epoch": 0.8526846049883278, "grad_norm": 0.18876959253267506, "learning_rate": 4.608137629710541e-05, "loss": 0.4977, "step": 3470 }, { "epoch": 0.85514190932547, "grad_norm": 0.2111464650943642, "learning_rate": 4.605406881485527e-05, "loss": 0.4764, "step": 3480 }, { "epoch": 0.8575992136626122, "grad_norm": 0.26061386335224984, "learning_rate": 4.602676133260514e-05, "loss": 0.4826, "step": 3490 }, { "epoch": 0.8600565179997542, "grad_norm": 0.22021575711613647, "learning_rate": 4.5999453850355e-05, "loss": 0.4885, "step": 3500 }, { "epoch": 0.8625138223368964, "grad_norm": 0.20376623165615398, "learning_rate": 4.5972146368104865e-05, "loss": 0.5028, "step": 3510 }, { "epoch": 0.8649711266740385, "grad_norm": 0.25969524571665265, "learning_rate": 4.594483888585472e-05, "loss": 0.486, "step": 3520 }, { "epoch": 0.8674284310111807, "grad_norm": 0.22473719895071698, "learning_rate": 4.591753140360459e-05, "loss": 0.4894, "step": 3530 }, { "epoch": 0.8698857353483229, "grad_norm": 0.1957179031036569, "learning_rate": 4.589022392135445e-05, "loss": 0.504, "step": 3540 }, { "epoch": 0.872343039685465, "grad_norm": 0.20545546894956657, "learning_rate": 4.5862916439104314e-05, "loss": 0.509, "step": 3550 }, { "epoch": 0.8748003440226072, "grad_norm": 0.24447669648649606, "learning_rate": 4.5835608956854185e-05, "loss": 0.5139, "step": 3560 }, { "epoch": 0.8772576483597494, "grad_norm": 0.22540719663815353, "learning_rate": 4.580830147460404e-05, "loss": 0.4706, "step": 3570 }, { "epoch": 0.8797149526968915, "grad_norm": 0.2139220831950044, "learning_rate": 4.5780993992353906e-05, "loss": 0.4841, "step": 3580 }, { "epoch": 0.8821722570340337, "grad_norm": 0.2253516307335816, "learning_rate": 4.575368651010377e-05, "loss": 0.4861, "step": 3590 }, { "epoch": 0.8846295613711758, "grad_norm": 0.2303080859730776, "learning_rate": 4.5726379027853634e-05, "loss": 0.4757, "step": 3600 }, { "epoch": 0.887086865708318, "grad_norm": 0.20782270178135376, "learning_rate": 4.56990715456035e-05, "loss": 0.4972, "step": 3610 }, { "epoch": 0.8895441700454602, "grad_norm": 0.19641608531165228, "learning_rate": 4.567176406335336e-05, "loss": 0.4994, "step": 3620 }, { "epoch": 0.8920014743826022, "grad_norm": 0.23625481642440613, "learning_rate": 4.5644456581103226e-05, "loss": 0.5016, "step": 3630 }, { "epoch": 0.8944587787197444, "grad_norm": 0.22761064959855556, "learning_rate": 4.561714909885309e-05, "loss": 0.4847, "step": 3640 }, { "epoch": 0.8969160830568866, "grad_norm": 0.1932569775456255, "learning_rate": 4.5589841616602954e-05, "loss": 0.4908, "step": 3650 }, { "epoch": 0.8993733873940287, "grad_norm": 0.23028091966785066, "learning_rate": 4.556253413435281e-05, "loss": 0.4913, "step": 3660 }, { "epoch": 0.9018306917311709, "grad_norm": 0.22487835410372534, "learning_rate": 4.553522665210268e-05, "loss": 0.4959, "step": 3670 }, { "epoch": 0.9042879960683131, "grad_norm": 0.2177947130878207, "learning_rate": 4.550791916985254e-05, "loss": 0.4958, "step": 3680 }, { "epoch": 0.9067453004054552, "grad_norm": 0.22296174130297752, "learning_rate": 4.54806116876024e-05, "loss": 0.4878, "step": 3690 }, { "epoch": 0.9092026047425974, "grad_norm": 0.2156582982971167, "learning_rate": 4.5453304205352274e-05, "loss": 0.5101, "step": 3700 }, { "epoch": 0.9116599090797395, "grad_norm": 0.22622851322717139, "learning_rate": 4.542599672310213e-05, "loss": 0.4887, "step": 3710 }, { "epoch": 0.9141172134168817, "grad_norm": 0.22243709074365253, "learning_rate": 4.5398689240851995e-05, "loss": 0.4978, "step": 3720 }, { "epoch": 0.9165745177540239, "grad_norm": 0.23754816828749, "learning_rate": 4.537138175860186e-05, "loss": 0.4874, "step": 3730 }, { "epoch": 0.919031822091166, "grad_norm": 0.2252203429839259, "learning_rate": 4.534407427635172e-05, "loss": 0.4885, "step": 3740 }, { "epoch": 0.9214891264283082, "grad_norm": 0.19478874803686486, "learning_rate": 4.531676679410159e-05, "loss": 0.4876, "step": 3750 }, { "epoch": 0.9239464307654504, "grad_norm": 0.21117771404016986, "learning_rate": 4.528945931185145e-05, "loss": 0.4776, "step": 3760 }, { "epoch": 0.9264037351025924, "grad_norm": 0.21970902291482003, "learning_rate": 4.5262151829601315e-05, "loss": 0.4994, "step": 3770 }, { "epoch": 0.9288610394397346, "grad_norm": 0.24593940912039802, "learning_rate": 4.523484434735117e-05, "loss": 0.4934, "step": 3780 }, { "epoch": 0.9313183437768767, "grad_norm": 0.2293866397485531, "learning_rate": 4.520753686510104e-05, "loss": 0.4899, "step": 3790 }, { "epoch": 0.9337756481140189, "grad_norm": 0.21816882692950798, "learning_rate": 4.51802293828509e-05, "loss": 0.4849, "step": 3800 }, { "epoch": 0.9362329524511611, "grad_norm": 0.20290655675432326, "learning_rate": 4.5152921900600764e-05, "loss": 0.5096, "step": 3810 }, { "epoch": 0.9386902567883032, "grad_norm": 0.19136040539460286, "learning_rate": 4.512561441835063e-05, "loss": 0.5024, "step": 3820 }, { "epoch": 0.9411475611254454, "grad_norm": 0.21601144033460024, "learning_rate": 4.509830693610049e-05, "loss": 0.4838, "step": 3830 }, { "epoch": 0.9436048654625876, "grad_norm": 0.20681036293841767, "learning_rate": 4.5070999453850356e-05, "loss": 0.4876, "step": 3840 }, { "epoch": 0.9460621697997297, "grad_norm": 0.19863673691619213, "learning_rate": 4.504369197160022e-05, "loss": 0.5062, "step": 3850 }, { "epoch": 0.9485194741368719, "grad_norm": 0.23572941508519224, "learning_rate": 4.5016384489350084e-05, "loss": 0.5021, "step": 3860 }, { "epoch": 0.950976778474014, "grad_norm": 0.22504953748260223, "learning_rate": 4.498907700709995e-05, "loss": 0.5054, "step": 3870 }, { "epoch": 0.9534340828111562, "grad_norm": 0.2082275151937166, "learning_rate": 4.496176952484981e-05, "loss": 0.491, "step": 3880 }, { "epoch": 0.9558913871482984, "grad_norm": 0.24292108778271831, "learning_rate": 4.4934462042599676e-05, "loss": 0.494, "step": 3890 }, { "epoch": 0.9583486914854404, "grad_norm": 0.20161316601865786, "learning_rate": 4.490715456034954e-05, "loss": 0.5004, "step": 3900 }, { "epoch": 0.9608059958225826, "grad_norm": 0.2011479184387663, "learning_rate": 4.4879847078099404e-05, "loss": 0.4817, "step": 3910 }, { "epoch": 0.9632633001597248, "grad_norm": 0.2158664243027108, "learning_rate": 4.485253959584926e-05, "loss": 0.4828, "step": 3920 }, { "epoch": 0.9657206044968669, "grad_norm": 0.22410731777401557, "learning_rate": 4.482523211359913e-05, "loss": 0.4943, "step": 3930 }, { "epoch": 0.9681779088340091, "grad_norm": 0.209456849623133, "learning_rate": 4.479792463134899e-05, "loss": 0.4777, "step": 3940 }, { "epoch": 0.9706352131711512, "grad_norm": 0.22307620131661046, "learning_rate": 4.4770617149098854e-05, "loss": 0.4856, "step": 3950 }, { "epoch": 0.9730925175082934, "grad_norm": 0.22727052158239805, "learning_rate": 4.474330966684872e-05, "loss": 0.5038, "step": 3960 }, { "epoch": 0.9755498218454356, "grad_norm": 0.18946233491987924, "learning_rate": 4.471600218459858e-05, "loss": 0.4937, "step": 3970 }, { "epoch": 0.9780071261825777, "grad_norm": 0.21590062101990748, "learning_rate": 4.4688694702348446e-05, "loss": 0.4931, "step": 3980 }, { "epoch": 0.9804644305197199, "grad_norm": 0.20700233972254015, "learning_rate": 4.466138722009831e-05, "loss": 0.4816, "step": 3990 }, { "epoch": 0.9829217348568621, "grad_norm": 0.19893583469865267, "learning_rate": 4.4634079737848174e-05, "loss": 0.5122, "step": 4000 }, { "epoch": 0.9853790391940042, "grad_norm": 0.22205197061646506, "learning_rate": 4.460677225559803e-05, "loss": 0.4881, "step": 4010 }, { "epoch": 0.9878363435311464, "grad_norm": 0.21836187545342464, "learning_rate": 4.45794647733479e-05, "loss": 0.4745, "step": 4020 }, { "epoch": 0.9902936478682884, "grad_norm": 0.21295485666185762, "learning_rate": 4.4552157291097766e-05, "loss": 0.5012, "step": 4030 }, { "epoch": 0.9927509522054306, "grad_norm": 0.22774552432148013, "learning_rate": 4.452484980884762e-05, "loss": 0.4984, "step": 4040 }, { "epoch": 0.9952082565425728, "grad_norm": 0.21849817796755186, "learning_rate": 4.4497542326597494e-05, "loss": 0.4879, "step": 4050 }, { "epoch": 0.9976655608797149, "grad_norm": 0.2406002421801494, "learning_rate": 4.447023484434735e-05, "loss": 0.4965, "step": 4060 }, { "epoch": 1.0, "grad_norm": 0.2040523699349269, "learning_rate": 4.4442927362097215e-05, "loss": 0.4637, "step": 4070 }, { "epoch": 1.002457304337142, "grad_norm": 0.250830301200396, "learning_rate": 4.441561987984708e-05, "loss": 0.4922, "step": 4080 }, { "epoch": 1.0049146086742844, "grad_norm": 0.21609337651713642, "learning_rate": 4.438831239759694e-05, "loss": 0.4935, "step": 4090 }, { "epoch": 1.0073719130114265, "grad_norm": 0.2196930334448974, "learning_rate": 4.436100491534681e-05, "loss": 0.4976, "step": 4100 }, { "epoch": 1.0098292173485686, "grad_norm": 0.25461196565137806, "learning_rate": 4.433369743309667e-05, "loss": 0.4741, "step": 4110 }, { "epoch": 1.0122865216857109, "grad_norm": 0.21410344158580394, "learning_rate": 4.4306389950846535e-05, "loss": 0.4694, "step": 4120 }, { "epoch": 1.014743826022853, "grad_norm": 0.21764426445688004, "learning_rate": 4.42790824685964e-05, "loss": 0.4836, "step": 4130 }, { "epoch": 1.017201130359995, "grad_norm": 0.2351627571188016, "learning_rate": 4.425177498634626e-05, "loss": 0.4792, "step": 4140 }, { "epoch": 1.0196584346971371, "grad_norm": 0.2211068945947347, "learning_rate": 4.422446750409612e-05, "loss": 0.4783, "step": 4150 }, { "epoch": 1.0221157390342794, "grad_norm": 0.2208756244706653, "learning_rate": 4.419716002184599e-05, "loss": 0.4905, "step": 4160 }, { "epoch": 1.0245730433714215, "grad_norm": 0.21795464075481713, "learning_rate": 4.4169852539595855e-05, "loss": 0.4877, "step": 4170 }, { "epoch": 1.0270303477085636, "grad_norm": 0.2528480992567245, "learning_rate": 4.414254505734571e-05, "loss": 0.4909, "step": 4180 }, { "epoch": 1.029487652045706, "grad_norm": 0.2298525061172174, "learning_rate": 4.411523757509558e-05, "loss": 0.4874, "step": 4190 }, { "epoch": 1.031944956382848, "grad_norm": 0.20861941177089396, "learning_rate": 4.408793009284544e-05, "loss": 0.4816, "step": 4200 }, { "epoch": 1.03440226071999, "grad_norm": 0.23475091720803895, "learning_rate": 4.4060622610595304e-05, "loss": 0.5013, "step": 4210 }, { "epoch": 1.0368595650571324, "grad_norm": 0.1863127112327997, "learning_rate": 4.403331512834517e-05, "loss": 0.4745, "step": 4220 }, { "epoch": 1.0393168693942745, "grad_norm": 0.18545564397578473, "learning_rate": 4.400600764609503e-05, "loss": 0.4832, "step": 4230 }, { "epoch": 1.0417741737314166, "grad_norm": 0.23340225466890202, "learning_rate": 4.3978700163844896e-05, "loss": 0.5149, "step": 4240 }, { "epoch": 1.0442314780685589, "grad_norm": 0.21696934158916054, "learning_rate": 4.395139268159476e-05, "loss": 0.4863, "step": 4250 }, { "epoch": 1.046688782405701, "grad_norm": 0.2081775372902454, "learning_rate": 4.3924085199344624e-05, "loss": 0.4777, "step": 4260 }, { "epoch": 1.049146086742843, "grad_norm": 0.2415870266971161, "learning_rate": 4.389677771709448e-05, "loss": 0.5065, "step": 4270 }, { "epoch": 1.0516033910799854, "grad_norm": 0.24420810728889544, "learning_rate": 4.386947023484435e-05, "loss": 0.4879, "step": 4280 }, { "epoch": 1.0540606954171274, "grad_norm": 0.20365558490914865, "learning_rate": 4.384216275259421e-05, "loss": 0.4952, "step": 4290 }, { "epoch": 1.0565179997542695, "grad_norm": 0.2663413721146928, "learning_rate": 4.3814855270344073e-05, "loss": 0.5, "step": 4300 }, { "epoch": 1.0589753040914118, "grad_norm": 0.21711739405390704, "learning_rate": 4.3787547788093944e-05, "loss": 0.5043, "step": 4310 }, { "epoch": 1.061432608428554, "grad_norm": 0.23295502865775047, "learning_rate": 4.37602403058438e-05, "loss": 0.4837, "step": 4320 }, { "epoch": 1.063889912765696, "grad_norm": 0.2080276530149086, "learning_rate": 4.3732932823593665e-05, "loss": 0.4991, "step": 4330 }, { "epoch": 1.066347217102838, "grad_norm": 0.22917120025858467, "learning_rate": 4.370562534134353e-05, "loss": 0.4934, "step": 4340 }, { "epoch": 1.0688045214399804, "grad_norm": 0.24264426298353975, "learning_rate": 4.3678317859093393e-05, "loss": 0.488, "step": 4350 }, { "epoch": 1.0712618257771225, "grad_norm": 0.23240505604283956, "learning_rate": 4.365101037684326e-05, "loss": 0.4963, "step": 4360 }, { "epoch": 1.0737191301142646, "grad_norm": 0.2536612345126809, "learning_rate": 4.362370289459312e-05, "loss": 0.4827, "step": 4370 }, { "epoch": 1.0761764344514069, "grad_norm": 0.2476970761379682, "learning_rate": 4.3596395412342985e-05, "loss": 0.5014, "step": 4380 }, { "epoch": 1.078633738788549, "grad_norm": 0.2345081176663945, "learning_rate": 4.356908793009285e-05, "loss": 0.4892, "step": 4390 }, { "epoch": 1.081091043125691, "grad_norm": 0.2502712691973593, "learning_rate": 4.3541780447842713e-05, "loss": 0.4944, "step": 4400 }, { "epoch": 1.0835483474628334, "grad_norm": 0.2201814785253552, "learning_rate": 4.351447296559257e-05, "loss": 0.4812, "step": 4410 }, { "epoch": 1.0860056517999754, "grad_norm": 0.2178532831980987, "learning_rate": 4.348716548334244e-05, "loss": 0.4892, "step": 4420 }, { "epoch": 1.0884629561371175, "grad_norm": 0.23549482854091758, "learning_rate": 4.34598580010923e-05, "loss": 0.4756, "step": 4430 }, { "epoch": 1.0909202604742598, "grad_norm": 0.2053248466113806, "learning_rate": 4.343255051884216e-05, "loss": 0.4751, "step": 4440 }, { "epoch": 1.093377564811402, "grad_norm": 0.2305928911751541, "learning_rate": 4.3405243036592033e-05, "loss": 0.478, "step": 4450 }, { "epoch": 1.095834869148544, "grad_norm": 0.21888483707367837, "learning_rate": 4.337793555434189e-05, "loss": 0.4909, "step": 4460 }, { "epoch": 1.098292173485686, "grad_norm": 0.23364167397024577, "learning_rate": 4.3350628072091755e-05, "loss": 0.4924, "step": 4470 }, { "epoch": 1.1007494778228284, "grad_norm": 0.24326222834435987, "learning_rate": 4.332332058984162e-05, "loss": 0.4712, "step": 4480 }, { "epoch": 1.1032067821599705, "grad_norm": 0.20146800617703484, "learning_rate": 4.329601310759148e-05, "loss": 0.4716, "step": 4490 }, { "epoch": 1.1056640864971126, "grad_norm": 0.19223715860469356, "learning_rate": 4.326870562534135e-05, "loss": 0.4909, "step": 4500 }, { "epoch": 1.1081213908342549, "grad_norm": 0.20474575146035973, "learning_rate": 4.324139814309121e-05, "loss": 0.4834, "step": 4510 }, { "epoch": 1.110578695171397, "grad_norm": 0.22718439775706498, "learning_rate": 4.3214090660841075e-05, "loss": 0.4988, "step": 4520 }, { "epoch": 1.113035999508539, "grad_norm": 0.21336158602752442, "learning_rate": 4.318678317859093e-05, "loss": 0.4777, "step": 4530 }, { "epoch": 1.1154933038456813, "grad_norm": 0.19478288577444003, "learning_rate": 4.31594756963408e-05, "loss": 0.4812, "step": 4540 }, { "epoch": 1.1179506081828234, "grad_norm": 0.22694167493838935, "learning_rate": 4.313216821409066e-05, "loss": 0.4919, "step": 4550 }, { "epoch": 1.1204079125199655, "grad_norm": 0.21018597419532883, "learning_rate": 4.3104860731840524e-05, "loss": 0.4587, "step": 4560 }, { "epoch": 1.1228652168571078, "grad_norm": 0.19274814677285118, "learning_rate": 4.307755324959039e-05, "loss": 0.5069, "step": 4570 }, { "epoch": 1.12532252119425, "grad_norm": 0.1943196663331846, "learning_rate": 4.305024576734025e-05, "loss": 0.4866, "step": 4580 }, { "epoch": 1.127779825531392, "grad_norm": 0.2371345405504217, "learning_rate": 4.3022938285090116e-05, "loss": 0.491, "step": 4590 }, { "epoch": 1.1302371298685343, "grad_norm": 0.19574506290642518, "learning_rate": 4.299563080283998e-05, "loss": 0.4952, "step": 4600 }, { "epoch": 1.1326944342056764, "grad_norm": 0.2435755034131264, "learning_rate": 4.2968323320589844e-05, "loss": 0.479, "step": 4610 }, { "epoch": 1.1351517385428185, "grad_norm": 0.21667064521784163, "learning_rate": 4.294101583833971e-05, "loss": 0.4989, "step": 4620 }, { "epoch": 1.1376090428799608, "grad_norm": 0.2036923190958465, "learning_rate": 4.291370835608957e-05, "loss": 0.4942, "step": 4630 }, { "epoch": 1.1400663472171029, "grad_norm": 0.230183676994977, "learning_rate": 4.2886400873839436e-05, "loss": 0.4874, "step": 4640 }, { "epoch": 1.142523651554245, "grad_norm": 0.2792211289326701, "learning_rate": 4.28590933915893e-05, "loss": 0.459, "step": 4650 }, { "epoch": 1.1449809558913873, "grad_norm": 0.22791782918093742, "learning_rate": 4.2831785909339164e-05, "loss": 0.4903, "step": 4660 }, { "epoch": 1.1474382602285293, "grad_norm": 0.22534470458605965, "learning_rate": 4.280447842708902e-05, "loss": 0.4654, "step": 4670 }, { "epoch": 1.1498955645656714, "grad_norm": 0.2288086188207275, "learning_rate": 4.277717094483889e-05, "loss": 0.5081, "step": 4680 }, { "epoch": 1.1523528689028135, "grad_norm": 0.207158051919178, "learning_rate": 4.274986346258875e-05, "loss": 0.4956, "step": 4690 }, { "epoch": 1.1548101732399558, "grad_norm": 0.23693106186566867, "learning_rate": 4.272255598033861e-05, "loss": 0.4805, "step": 4700 }, { "epoch": 1.157267477577098, "grad_norm": 0.21798018541060843, "learning_rate": 4.269524849808848e-05, "loss": 0.4704, "step": 4710 }, { "epoch": 1.15972478191424, "grad_norm": 0.21725555327866905, "learning_rate": 4.266794101583834e-05, "loss": 0.4973, "step": 4720 }, { "epoch": 1.1621820862513823, "grad_norm": 0.2244578715523494, "learning_rate": 4.2640633533588205e-05, "loss": 0.4931, "step": 4730 }, { "epoch": 1.1646393905885244, "grad_norm": 0.21071548646698912, "learning_rate": 4.261332605133807e-05, "loss": 0.5075, "step": 4740 }, { "epoch": 1.1670966949256665, "grad_norm": 0.20741794625476292, "learning_rate": 4.258601856908793e-05, "loss": 0.4837, "step": 4750 }, { "epoch": 1.1695539992628088, "grad_norm": 0.21579357562079096, "learning_rate": 4.255871108683779e-05, "loss": 0.4733, "step": 4760 }, { "epoch": 1.1720113035999509, "grad_norm": 0.23961305781872566, "learning_rate": 4.253140360458766e-05, "loss": 0.4864, "step": 4770 }, { "epoch": 1.174468607937093, "grad_norm": 0.20520997121828558, "learning_rate": 4.2504096122337525e-05, "loss": 0.4822, "step": 4780 }, { "epoch": 1.176925912274235, "grad_norm": 0.2144748248163639, "learning_rate": 4.247678864008738e-05, "loss": 0.4812, "step": 4790 }, { "epoch": 1.1793832166113773, "grad_norm": 0.21901487045600554, "learning_rate": 4.244948115783725e-05, "loss": 0.4947, "step": 4800 }, { "epoch": 1.1818405209485194, "grad_norm": 0.21205008974842163, "learning_rate": 4.242217367558711e-05, "loss": 0.4848, "step": 4810 }, { "epoch": 1.1842978252856615, "grad_norm": 0.21756749538287112, "learning_rate": 4.2394866193336974e-05, "loss": 0.471, "step": 4820 }, { "epoch": 1.1867551296228038, "grad_norm": 0.2254393600714676, "learning_rate": 4.236755871108684e-05, "loss": 0.4862, "step": 4830 }, { "epoch": 1.189212433959946, "grad_norm": 0.22051706247292133, "learning_rate": 4.23402512288367e-05, "loss": 0.4748, "step": 4840 }, { "epoch": 1.191669738297088, "grad_norm": 0.2397983998737101, "learning_rate": 4.2312943746586566e-05, "loss": 0.4937, "step": 4850 }, { "epoch": 1.1941270426342303, "grad_norm": 0.18415082627118753, "learning_rate": 4.228563626433643e-05, "loss": 0.4762, "step": 4860 }, { "epoch": 1.1965843469713724, "grad_norm": 0.23773944387799775, "learning_rate": 4.2258328782086294e-05, "loss": 0.4877, "step": 4870 }, { "epoch": 1.1990416513085145, "grad_norm": 0.1999274201764957, "learning_rate": 4.223102129983616e-05, "loss": 0.4756, "step": 4880 }, { "epoch": 1.2014989556456568, "grad_norm": 0.1982921596147605, "learning_rate": 4.220371381758602e-05, "loss": 0.4946, "step": 4890 }, { "epoch": 1.2039562599827989, "grad_norm": 0.2135209762605839, "learning_rate": 4.217640633533588e-05, "loss": 0.4943, "step": 4900 }, { "epoch": 1.206413564319941, "grad_norm": 0.22554449895425793, "learning_rate": 4.214909885308575e-05, "loss": 0.4836, "step": 4910 }, { "epoch": 1.2088708686570833, "grad_norm": 0.21150072013554155, "learning_rate": 4.2121791370835614e-05, "loss": 0.482, "step": 4920 }, { "epoch": 1.2113281729942253, "grad_norm": 0.26324166747416056, "learning_rate": 4.209448388858547e-05, "loss": 0.4852, "step": 4930 }, { "epoch": 1.2137854773313674, "grad_norm": 0.24533358812765088, "learning_rate": 4.206717640633534e-05, "loss": 0.4981, "step": 4940 }, { "epoch": 1.2162427816685097, "grad_norm": 0.22547233854048704, "learning_rate": 4.20398689240852e-05, "loss": 0.4729, "step": 4950 }, { "epoch": 1.2187000860056518, "grad_norm": 0.24835543372361046, "learning_rate": 4.2012561441835064e-05, "loss": 0.4881, "step": 4960 }, { "epoch": 1.221157390342794, "grad_norm": 0.2021606495811566, "learning_rate": 4.198525395958493e-05, "loss": 0.4907, "step": 4970 }, { "epoch": 1.2236146946799362, "grad_norm": 0.2173859850842382, "learning_rate": 4.195794647733479e-05, "loss": 0.4999, "step": 4980 }, { "epoch": 1.2260719990170783, "grad_norm": 0.2116581781405141, "learning_rate": 4.1930638995084656e-05, "loss": 0.4837, "step": 4990 }, { "epoch": 1.2285293033542204, "grad_norm": 0.2101329330950578, "learning_rate": 4.190333151283452e-05, "loss": 0.4977, "step": 5000 }, { "epoch": 1.2309866076913627, "grad_norm": 0.23875071017699484, "learning_rate": 4.1876024030584384e-05, "loss": 0.5034, "step": 5010 }, { "epoch": 1.2334439120285048, "grad_norm": 0.20992324254370834, "learning_rate": 4.184871654833424e-05, "loss": 0.4932, "step": 5020 }, { "epoch": 1.2359012163656469, "grad_norm": 0.21766640868581622, "learning_rate": 4.182140906608411e-05, "loss": 0.4927, "step": 5030 }, { "epoch": 1.238358520702789, "grad_norm": 0.210647709453403, "learning_rate": 4.179410158383397e-05, "loss": 0.4716, "step": 5040 }, { "epoch": 1.2408158250399313, "grad_norm": 0.19264560117560567, "learning_rate": 4.176679410158383e-05, "loss": 0.4869, "step": 5050 }, { "epoch": 1.2432731293770733, "grad_norm": 0.24153335773104279, "learning_rate": 4.1739486619333704e-05, "loss": 0.4772, "step": 5060 }, { "epoch": 1.2457304337142154, "grad_norm": 0.21879684224785917, "learning_rate": 4.171217913708356e-05, "loss": 0.485, "step": 5070 }, { "epoch": 1.2481877380513577, "grad_norm": 0.23833163024870652, "learning_rate": 4.1684871654833425e-05, "loss": 0.4701, "step": 5080 }, { "epoch": 1.2506450423884998, "grad_norm": 0.19470320987776987, "learning_rate": 4.165756417258329e-05, "loss": 0.489, "step": 5090 }, { "epoch": 1.253102346725642, "grad_norm": 0.23739283995848326, "learning_rate": 4.163025669033315e-05, "loss": 0.506, "step": 5100 }, { "epoch": 1.255559651062784, "grad_norm": 0.28168414546355774, "learning_rate": 4.160294920808302e-05, "loss": 0.49, "step": 5110 }, { "epoch": 1.2580169553999263, "grad_norm": 0.2612741242584854, "learning_rate": 4.157564172583288e-05, "loss": 0.4984, "step": 5120 }, { "epoch": 1.2604742597370684, "grad_norm": 0.22494226934128708, "learning_rate": 4.1548334243582745e-05, "loss": 0.507, "step": 5130 }, { "epoch": 1.2629315640742105, "grad_norm": 0.23841292697810973, "learning_rate": 4.152102676133261e-05, "loss": 0.4713, "step": 5140 }, { "epoch": 1.2653888684113528, "grad_norm": 0.18533505426710892, "learning_rate": 4.149371927908247e-05, "loss": 0.4662, "step": 5150 }, { "epoch": 1.2678461727484949, "grad_norm": 0.24310571967968417, "learning_rate": 4.146641179683233e-05, "loss": 0.4639, "step": 5160 }, { "epoch": 1.270303477085637, "grad_norm": 0.2281375534663682, "learning_rate": 4.14391043145822e-05, "loss": 0.483, "step": 5170 }, { "epoch": 1.2727607814227793, "grad_norm": 0.22196656707800144, "learning_rate": 4.1411796832332065e-05, "loss": 0.4809, "step": 5180 }, { "epoch": 1.2752180857599213, "grad_norm": 0.19361496248309532, "learning_rate": 4.138448935008192e-05, "loss": 0.4641, "step": 5190 }, { "epoch": 1.2776753900970634, "grad_norm": 0.20594495177334785, "learning_rate": 4.135718186783179e-05, "loss": 0.468, "step": 5200 }, { "epoch": 1.2801326944342057, "grad_norm": 0.21457113154143378, "learning_rate": 4.132987438558165e-05, "loss": 0.473, "step": 5210 }, { "epoch": 1.2825899987713478, "grad_norm": 0.22595611738103977, "learning_rate": 4.1302566903331514e-05, "loss": 0.4891, "step": 5220 }, { "epoch": 1.28504730310849, "grad_norm": 0.25262727315091843, "learning_rate": 4.127525942108138e-05, "loss": 0.4991, "step": 5230 }, { "epoch": 1.2875046074456322, "grad_norm": 0.23350469584840358, "learning_rate": 4.124795193883124e-05, "loss": 0.487, "step": 5240 }, { "epoch": 1.2899619117827743, "grad_norm": 0.1933062405601809, "learning_rate": 4.1220644456581106e-05, "loss": 0.4657, "step": 5250 }, { "epoch": 1.2924192161199164, "grad_norm": 0.21452553460953863, "learning_rate": 4.119333697433097e-05, "loss": 0.4873, "step": 5260 }, { "epoch": 1.2948765204570587, "grad_norm": 0.21200514048252667, "learning_rate": 4.1166029492080834e-05, "loss": 0.4926, "step": 5270 }, { "epoch": 1.2973338247942008, "grad_norm": 0.2112755127007217, "learning_rate": 4.113872200983069e-05, "loss": 0.4792, "step": 5280 }, { "epoch": 1.2997911291313429, "grad_norm": 0.19761262240275676, "learning_rate": 4.111141452758056e-05, "loss": 0.4749, "step": 5290 }, { "epoch": 1.3022484334684852, "grad_norm": 0.23737922250514296, "learning_rate": 4.108410704533042e-05, "loss": 0.4849, "step": 5300 }, { "epoch": 1.3047057378056273, "grad_norm": 0.21087045206198496, "learning_rate": 4.105679956308028e-05, "loss": 0.4895, "step": 5310 }, { "epoch": 1.3071630421427693, "grad_norm": 0.21371805843708308, "learning_rate": 4.1029492080830154e-05, "loss": 0.4792, "step": 5320 }, { "epoch": 1.3096203464799117, "grad_norm": 0.22713563198950856, "learning_rate": 4.100218459858001e-05, "loss": 0.47, "step": 5330 }, { "epoch": 1.3120776508170537, "grad_norm": 0.20515694998155343, "learning_rate": 4.0974877116329875e-05, "loss": 0.4863, "step": 5340 }, { "epoch": 1.3145349551541958, "grad_norm": 0.20126467027666253, "learning_rate": 4.094756963407974e-05, "loss": 0.4662, "step": 5350 }, { "epoch": 1.3169922594913381, "grad_norm": 0.25200989998738654, "learning_rate": 4.09202621518296e-05, "loss": 0.4824, "step": 5360 }, { "epoch": 1.3194495638284802, "grad_norm": 0.2095477475234895, "learning_rate": 4.089295466957947e-05, "loss": 0.4772, "step": 5370 }, { "epoch": 1.3219068681656223, "grad_norm": 0.21960610695985935, "learning_rate": 4.086564718732933e-05, "loss": 0.4862, "step": 5380 }, { "epoch": 1.3243641725027644, "grad_norm": 0.23066615346594657, "learning_rate": 4.0838339705079195e-05, "loss": 0.4749, "step": 5390 }, { "epoch": 1.3268214768399067, "grad_norm": 0.20303038060262418, "learning_rate": 4.081103222282906e-05, "loss": 0.485, "step": 5400 }, { "epoch": 1.3292787811770488, "grad_norm": 0.20839427966978574, "learning_rate": 4.078372474057892e-05, "loss": 0.4666, "step": 5410 }, { "epoch": 1.3317360855141909, "grad_norm": 0.2323079003043353, "learning_rate": 4.075641725832878e-05, "loss": 0.4695, "step": 5420 }, { "epoch": 1.334193389851333, "grad_norm": 0.22473649040685884, "learning_rate": 4.072910977607865e-05, "loss": 0.4821, "step": 5430 }, { "epoch": 1.3366506941884753, "grad_norm": 0.2117264318021387, "learning_rate": 4.070180229382851e-05, "loss": 0.4619, "step": 5440 }, { "epoch": 1.3391079985256173, "grad_norm": 0.21156010717299376, "learning_rate": 4.067449481157837e-05, "loss": 0.5038, "step": 5450 }, { "epoch": 1.3415653028627594, "grad_norm": 0.22958187448309003, "learning_rate": 4.064718732932824e-05, "loss": 0.487, "step": 5460 }, { "epoch": 1.3440226071999017, "grad_norm": 0.21822363221771598, "learning_rate": 4.06198798470781e-05, "loss": 0.4916, "step": 5470 }, { "epoch": 1.3464799115370438, "grad_norm": 0.22980982510420692, "learning_rate": 4.0592572364827965e-05, "loss": 0.4892, "step": 5480 }, { "epoch": 1.348937215874186, "grad_norm": 0.25526204526505863, "learning_rate": 4.056526488257783e-05, "loss": 0.4753, "step": 5490 }, { "epoch": 1.3513945202113282, "grad_norm": 0.19985296427944868, "learning_rate": 4.053795740032769e-05, "loss": 0.4943, "step": 5500 }, { "epoch": 1.3538518245484703, "grad_norm": 0.20654844117011495, "learning_rate": 4.0510649918077557e-05, "loss": 0.477, "step": 5510 }, { "epoch": 1.3563091288856124, "grad_norm": 0.20099202002763059, "learning_rate": 4.048334243582742e-05, "loss": 0.4923, "step": 5520 }, { "epoch": 1.3587664332227547, "grad_norm": 0.23680527037897012, "learning_rate": 4.0456034953577285e-05, "loss": 0.4777, "step": 5530 }, { "epoch": 1.3612237375598968, "grad_norm": 0.21499933621725614, "learning_rate": 4.042872747132714e-05, "loss": 0.4618, "step": 5540 }, { "epoch": 1.3636810418970389, "grad_norm": 0.19209146147978023, "learning_rate": 4.040141998907701e-05, "loss": 0.4773, "step": 5550 }, { "epoch": 1.3661383462341812, "grad_norm": 0.20642465476497432, "learning_rate": 4.037411250682687e-05, "loss": 0.4666, "step": 5560 }, { "epoch": 1.3685956505713233, "grad_norm": 0.2560023288388242, "learning_rate": 4.0346805024576734e-05, "loss": 0.4696, "step": 5570 }, { "epoch": 1.3710529549084653, "grad_norm": 0.2723662479124362, "learning_rate": 4.03194975423266e-05, "loss": 0.4832, "step": 5580 }, { "epoch": 1.3735102592456077, "grad_norm": 0.22530729491564552, "learning_rate": 4.029219006007646e-05, "loss": 0.5007, "step": 5590 }, { "epoch": 1.3759675635827497, "grad_norm": 0.21602069378359923, "learning_rate": 4.0264882577826326e-05, "loss": 0.4797, "step": 5600 }, { "epoch": 1.3784248679198918, "grad_norm": 0.2130739627529285, "learning_rate": 4.023757509557619e-05, "loss": 0.4759, "step": 5610 }, { "epoch": 1.3808821722570341, "grad_norm": 0.20184300818424514, "learning_rate": 4.0210267613326054e-05, "loss": 0.4926, "step": 5620 }, { "epoch": 1.3833394765941762, "grad_norm": 0.2518537434245742, "learning_rate": 4.018296013107591e-05, "loss": 0.4977, "step": 5630 }, { "epoch": 1.3857967809313183, "grad_norm": 0.25737231000272764, "learning_rate": 4.015565264882578e-05, "loss": 0.4885, "step": 5640 }, { "epoch": 1.3882540852684606, "grad_norm": 0.21142441942245663, "learning_rate": 4.0128345166575646e-05, "loss": 0.4761, "step": 5650 }, { "epoch": 1.3907113896056027, "grad_norm": 0.23343285527794622, "learning_rate": 4.010103768432551e-05, "loss": 0.4702, "step": 5660 }, { "epoch": 1.3931686939427448, "grad_norm": 0.1917905274601211, "learning_rate": 4.0073730202075374e-05, "loss": 0.4837, "step": 5670 }, { "epoch": 1.395625998279887, "grad_norm": 0.20055770758014324, "learning_rate": 4.004642271982523e-05, "loss": 0.4716, "step": 5680 }, { "epoch": 1.3980833026170292, "grad_norm": 0.21760874717675105, "learning_rate": 4.00191152375751e-05, "loss": 0.4706, "step": 5690 }, { "epoch": 1.4005406069541713, "grad_norm": 0.21561125364906, "learning_rate": 3.999180775532496e-05, "loss": 0.489, "step": 5700 }, { "epoch": 1.4029979112913133, "grad_norm": 0.21104957959697965, "learning_rate": 3.996450027307482e-05, "loss": 0.4818, "step": 5710 }, { "epoch": 1.4054552156284557, "grad_norm": 0.2277920989434151, "learning_rate": 3.993719279082469e-05, "loss": 0.4871, "step": 5720 }, { "epoch": 1.4079125199655977, "grad_norm": 0.20675901886533113, "learning_rate": 3.990988530857455e-05, "loss": 0.4805, "step": 5730 }, { "epoch": 1.4103698243027398, "grad_norm": 0.2231078178863371, "learning_rate": 3.9882577826324415e-05, "loss": 0.4661, "step": 5740 }, { "epoch": 1.412827128639882, "grad_norm": 0.27647825802307824, "learning_rate": 3.985527034407428e-05, "loss": 0.4744, "step": 5750 }, { "epoch": 1.4152844329770242, "grad_norm": 0.22387948598271537, "learning_rate": 3.982796286182414e-05, "loss": 0.4746, "step": 5760 }, { "epoch": 1.4177417373141663, "grad_norm": 0.2233483249506091, "learning_rate": 3.9800655379574e-05, "loss": 0.4953, "step": 5770 }, { "epoch": 1.4201990416513084, "grad_norm": 0.20843856370003688, "learning_rate": 3.977334789732387e-05, "loss": 0.4768, "step": 5780 }, { "epoch": 1.4226563459884507, "grad_norm": 0.21774959959771537, "learning_rate": 3.9746040415073735e-05, "loss": 0.4713, "step": 5790 }, { "epoch": 1.4251136503255928, "grad_norm": 0.24031506056750532, "learning_rate": 3.971873293282359e-05, "loss": 0.4888, "step": 5800 }, { "epoch": 1.4275709546627349, "grad_norm": 0.20594760610708304, "learning_rate": 3.969142545057346e-05, "loss": 0.4949, "step": 5810 }, { "epoch": 1.4300282589998772, "grad_norm": 0.237901661290486, "learning_rate": 3.966411796832332e-05, "loss": 0.4744, "step": 5820 }, { "epoch": 1.4324855633370193, "grad_norm": 0.25165673084683277, "learning_rate": 3.9636810486073184e-05, "loss": 0.5013, "step": 5830 }, { "epoch": 1.4349428676741613, "grad_norm": 0.24689022209660122, "learning_rate": 3.960950300382305e-05, "loss": 0.4937, "step": 5840 }, { "epoch": 1.4374001720113037, "grad_norm": 0.23271007252915837, "learning_rate": 3.958219552157291e-05, "loss": 0.4854, "step": 5850 }, { "epoch": 1.4398574763484457, "grad_norm": 0.2224675129034689, "learning_rate": 3.9554888039322776e-05, "loss": 0.4894, "step": 5860 }, { "epoch": 1.4423147806855878, "grad_norm": 0.1872052712167039, "learning_rate": 3.952758055707264e-05, "loss": 0.5014, "step": 5870 }, { "epoch": 1.4447720850227301, "grad_norm": 0.21476325658013848, "learning_rate": 3.9500273074822504e-05, "loss": 0.4891, "step": 5880 }, { "epoch": 1.4472293893598722, "grad_norm": 0.19246984583383078, "learning_rate": 3.947296559257236e-05, "loss": 0.483, "step": 5890 }, { "epoch": 1.4496866936970143, "grad_norm": 0.2188302246565097, "learning_rate": 3.944565811032223e-05, "loss": 0.4676, "step": 5900 }, { "epoch": 1.4521439980341566, "grad_norm": 0.27309786443275597, "learning_rate": 3.941835062807209e-05, "loss": 0.4882, "step": 5910 }, { "epoch": 1.4546013023712987, "grad_norm": 0.19853552661158066, "learning_rate": 3.939104314582196e-05, "loss": 0.4659, "step": 5920 }, { "epoch": 1.4570586067084408, "grad_norm": 0.20420341426260694, "learning_rate": 3.9363735663571824e-05, "loss": 0.4923, "step": 5930 }, { "epoch": 1.459515911045583, "grad_norm": 0.21917610189434603, "learning_rate": 3.933642818132168e-05, "loss": 0.4791, "step": 5940 }, { "epoch": 1.4619732153827252, "grad_norm": 0.2537938396185388, "learning_rate": 3.930912069907155e-05, "loss": 0.4773, "step": 5950 }, { "epoch": 1.4644305197198673, "grad_norm": 0.23079506960410506, "learning_rate": 3.928181321682141e-05, "loss": 0.5053, "step": 5960 }, { "epoch": 1.4668878240570096, "grad_norm": 0.2054945882004399, "learning_rate": 3.9254505734571274e-05, "loss": 0.4841, "step": 5970 }, { "epoch": 1.4693451283941517, "grad_norm": 0.2257469518778975, "learning_rate": 3.922719825232114e-05, "loss": 0.4905, "step": 5980 }, { "epoch": 1.4718024327312937, "grad_norm": 0.20086003700611832, "learning_rate": 3.9199890770071e-05, "loss": 0.4761, "step": 5990 }, { "epoch": 1.474259737068436, "grad_norm": 0.23254136425241836, "learning_rate": 3.9172583287820866e-05, "loss": 0.476, "step": 6000 }, { "epoch": 1.4767170414055781, "grad_norm": 0.20947738317730064, "learning_rate": 3.914527580557073e-05, "loss": 0.471, "step": 6010 }, { "epoch": 1.4791743457427202, "grad_norm": 0.2295091074746577, "learning_rate": 3.9117968323320594e-05, "loss": 0.4635, "step": 6020 }, { "epoch": 1.4816316500798625, "grad_norm": 0.21821241190861704, "learning_rate": 3.909066084107045e-05, "loss": 0.474, "step": 6030 }, { "epoch": 1.4840889544170046, "grad_norm": 0.21580453470649083, "learning_rate": 3.906335335882032e-05, "loss": 0.4674, "step": 6040 }, { "epoch": 1.4865462587541467, "grad_norm": 0.21031398216275315, "learning_rate": 3.903604587657018e-05, "loss": 0.4915, "step": 6050 }, { "epoch": 1.4890035630912888, "grad_norm": 0.19888635837037671, "learning_rate": 3.900873839432004e-05, "loss": 0.4766, "step": 6060 }, { "epoch": 1.491460867428431, "grad_norm": 0.20025747244543848, "learning_rate": 3.8981430912069914e-05, "loss": 0.4572, "step": 6070 }, { "epoch": 1.4939181717655732, "grad_norm": 0.24331894289871062, "learning_rate": 3.895412342981977e-05, "loss": 0.4817, "step": 6080 }, { "epoch": 1.4963754761027153, "grad_norm": 0.22515188053608076, "learning_rate": 3.8926815947569635e-05, "loss": 0.4924, "step": 6090 }, { "epoch": 1.4988327804398573, "grad_norm": 0.23018085564056992, "learning_rate": 3.88995084653195e-05, "loss": 0.483, "step": 6100 }, { "epoch": 1.5012900847769997, "grad_norm": 0.213635070843007, "learning_rate": 3.887220098306936e-05, "loss": 0.4794, "step": 6110 }, { "epoch": 1.5037473891141417, "grad_norm": 0.197614014347439, "learning_rate": 3.884489350081923e-05, "loss": 0.4691, "step": 6120 }, { "epoch": 1.5062046934512838, "grad_norm": 0.21669421104369374, "learning_rate": 3.881758601856909e-05, "loss": 0.4821, "step": 6130 }, { "epoch": 1.5086619977884261, "grad_norm": 0.20421165046589285, "learning_rate": 3.8790278536318955e-05, "loss": 0.4578, "step": 6140 }, { "epoch": 1.5111193021255682, "grad_norm": 0.22822846923727821, "learning_rate": 3.876297105406881e-05, "loss": 0.4707, "step": 6150 }, { "epoch": 1.5135766064627103, "grad_norm": 0.22924264116564155, "learning_rate": 3.873566357181868e-05, "loss": 0.4946, "step": 6160 }, { "epoch": 1.5160339107998526, "grad_norm": 0.24076077240741484, "learning_rate": 3.870835608956854e-05, "loss": 0.475, "step": 6170 }, { "epoch": 1.5184912151369947, "grad_norm": 0.21336037633759278, "learning_rate": 3.868104860731841e-05, "loss": 0.4615, "step": 6180 }, { "epoch": 1.5209485194741368, "grad_norm": 0.19852196228771005, "learning_rate": 3.865374112506827e-05, "loss": 0.4749, "step": 6190 }, { "epoch": 1.523405823811279, "grad_norm": 0.20797923750727224, "learning_rate": 3.862643364281813e-05, "loss": 0.4785, "step": 6200 }, { "epoch": 1.5258631281484212, "grad_norm": 0.2026243924211699, "learning_rate": 3.8599126160568e-05, "loss": 0.4958, "step": 6210 }, { "epoch": 1.5283204324855633, "grad_norm": 0.24389492824155284, "learning_rate": 3.857181867831786e-05, "loss": 0.4778, "step": 6220 }, { "epoch": 1.5307777368227056, "grad_norm": 0.2126519911133868, "learning_rate": 3.8544511196067724e-05, "loss": 0.4796, "step": 6230 }, { "epoch": 1.5332350411598477, "grad_norm": 0.20766068576592908, "learning_rate": 3.851720371381759e-05, "loss": 0.4678, "step": 6240 }, { "epoch": 1.5356923454969897, "grad_norm": 0.2149764455696779, "learning_rate": 3.848989623156745e-05, "loss": 0.4768, "step": 6250 }, { "epoch": 1.538149649834132, "grad_norm": 0.20468673270823337, "learning_rate": 3.8462588749317316e-05, "loss": 0.4827, "step": 6260 }, { "epoch": 1.5406069541712741, "grad_norm": 0.21146972691141805, "learning_rate": 3.843528126706718e-05, "loss": 0.4727, "step": 6270 }, { "epoch": 1.5430642585084162, "grad_norm": 0.23383493353262955, "learning_rate": 3.8407973784817044e-05, "loss": 0.4798, "step": 6280 }, { "epoch": 1.5455215628455585, "grad_norm": 0.20056543426250098, "learning_rate": 3.83806663025669e-05, "loss": 0.4542, "step": 6290 }, { "epoch": 1.5479788671827006, "grad_norm": 0.19061215672010873, "learning_rate": 3.835335882031677e-05, "loss": 0.4837, "step": 6300 }, { "epoch": 1.5504361715198427, "grad_norm": 0.21739423584431433, "learning_rate": 3.832605133806663e-05, "loss": 0.4916, "step": 6310 }, { "epoch": 1.552893475856985, "grad_norm": 0.2169822691443041, "learning_rate": 3.829874385581649e-05, "loss": 0.4839, "step": 6320 }, { "epoch": 1.555350780194127, "grad_norm": 0.20440998147955355, "learning_rate": 3.827143637356636e-05, "loss": 0.4821, "step": 6330 }, { "epoch": 1.5578080845312692, "grad_norm": 0.24760625833936567, "learning_rate": 3.824412889131622e-05, "loss": 0.4643, "step": 6340 }, { "epoch": 1.5602653888684115, "grad_norm": 0.2267468069184166, "learning_rate": 3.8216821409066085e-05, "loss": 0.4858, "step": 6350 }, { "epoch": 1.5627226932055533, "grad_norm": 0.19953840576834647, "learning_rate": 3.818951392681595e-05, "loss": 0.4909, "step": 6360 }, { "epoch": 1.5651799975426957, "grad_norm": 0.22432876755791925, "learning_rate": 3.816220644456581e-05, "loss": 0.4889, "step": 6370 }, { "epoch": 1.567637301879838, "grad_norm": 0.2919729156292652, "learning_rate": 3.813489896231567e-05, "loss": 0.475, "step": 6380 }, { "epoch": 1.5700946062169798, "grad_norm": 0.20455143235100304, "learning_rate": 3.810759148006554e-05, "loss": 0.4835, "step": 6390 }, { "epoch": 1.5725519105541221, "grad_norm": 0.20193932825339367, "learning_rate": 3.8080283997815405e-05, "loss": 0.4693, "step": 6400 }, { "epoch": 1.5750092148912644, "grad_norm": 0.2223773883898152, "learning_rate": 3.805297651556526e-05, "loss": 0.465, "step": 6410 }, { "epoch": 1.5774665192284063, "grad_norm": 0.2143857426826946, "learning_rate": 3.802566903331513e-05, "loss": 0.4606, "step": 6420 }, { "epoch": 1.5799238235655486, "grad_norm": 0.17969236474439015, "learning_rate": 3.799836155106499e-05, "loss": 0.4887, "step": 6430 }, { "epoch": 1.5823811279026907, "grad_norm": 0.20123526785297347, "learning_rate": 3.797105406881486e-05, "loss": 0.4729, "step": 6440 }, { "epoch": 1.5848384322398328, "grad_norm": 0.2074739778628687, "learning_rate": 3.794374658656472e-05, "loss": 0.4739, "step": 6450 }, { "epoch": 1.587295736576975, "grad_norm": 0.21834739800426722, "learning_rate": 3.791643910431458e-05, "loss": 0.4782, "step": 6460 }, { "epoch": 1.5897530409141172, "grad_norm": 0.19983196013079796, "learning_rate": 3.7889131622064447e-05, "loss": 0.4763, "step": 6470 }, { "epoch": 1.5922103452512593, "grad_norm": 0.22527419080506883, "learning_rate": 3.786182413981431e-05, "loss": 0.4593, "step": 6480 }, { "epoch": 1.5946676495884016, "grad_norm": 0.21254497072991352, "learning_rate": 3.7834516657564175e-05, "loss": 0.4749, "step": 6490 }, { "epoch": 1.5971249539255437, "grad_norm": 0.2048531868041732, "learning_rate": 3.780720917531404e-05, "loss": 0.472, "step": 6500 }, { "epoch": 1.5995822582626857, "grad_norm": 0.2201016821474272, "learning_rate": 3.77799016930639e-05, "loss": 0.4874, "step": 6510 }, { "epoch": 1.602039562599828, "grad_norm": 0.24866133598786955, "learning_rate": 3.775259421081376e-05, "loss": 0.467, "step": 6520 }, { "epoch": 1.6044968669369701, "grad_norm": 0.19785738015090398, "learning_rate": 3.772528672856363e-05, "loss": 0.4935, "step": 6530 }, { "epoch": 1.6069541712741122, "grad_norm": 0.21324763889653559, "learning_rate": 3.7697979246313495e-05, "loss": 0.4773, "step": 6540 }, { "epoch": 1.6094114756112545, "grad_norm": 0.22624764944978842, "learning_rate": 3.767067176406335e-05, "loss": 0.4571, "step": 6550 }, { "epoch": 1.6118687799483966, "grad_norm": 0.2053800135686612, "learning_rate": 3.764336428181322e-05, "loss": 0.48, "step": 6560 }, { "epoch": 1.6143260842855387, "grad_norm": 0.19778557213435322, "learning_rate": 3.761605679956308e-05, "loss": 0.468, "step": 6570 }, { "epoch": 1.616783388622681, "grad_norm": 0.23513440426005253, "learning_rate": 3.7588749317312944e-05, "loss": 0.4848, "step": 6580 }, { "epoch": 1.619240692959823, "grad_norm": 0.19831972044725713, "learning_rate": 3.756144183506281e-05, "loss": 0.4643, "step": 6590 }, { "epoch": 1.6216979972969652, "grad_norm": 0.21742588275182353, "learning_rate": 3.753413435281267e-05, "loss": 0.4624, "step": 6600 }, { "epoch": 1.6241553016341075, "grad_norm": 0.23495426256663005, "learning_rate": 3.7506826870562536e-05, "loss": 0.473, "step": 6610 }, { "epoch": 1.6266126059712496, "grad_norm": 0.21461901102779155, "learning_rate": 3.74795193883124e-05, "loss": 0.481, "step": 6620 }, { "epoch": 1.6290699103083917, "grad_norm": 0.20296132979377943, "learning_rate": 3.7452211906062264e-05, "loss": 0.4704, "step": 6630 }, { "epoch": 1.631527214645534, "grad_norm": 0.23677508949557954, "learning_rate": 3.742490442381212e-05, "loss": 0.4742, "step": 6640 }, { "epoch": 1.633984518982676, "grad_norm": 0.2021984405956069, "learning_rate": 3.739759694156199e-05, "loss": 0.4896, "step": 6650 }, { "epoch": 1.6364418233198181, "grad_norm": 0.22215738982475103, "learning_rate": 3.7370289459311856e-05, "loss": 0.4921, "step": 6660 }, { "epoch": 1.6388991276569604, "grad_norm": 0.20717383182174898, "learning_rate": 3.734298197706171e-05, "loss": 0.4917, "step": 6670 }, { "epoch": 1.6413564319941023, "grad_norm": 0.21542698938540128, "learning_rate": 3.7315674494811584e-05, "loss": 0.4767, "step": 6680 }, { "epoch": 1.6438137363312446, "grad_norm": 0.19339035120865958, "learning_rate": 3.728836701256144e-05, "loss": 0.4956, "step": 6690 }, { "epoch": 1.646271040668387, "grad_norm": 0.2297133813722935, "learning_rate": 3.726105953031131e-05, "loss": 0.4779, "step": 6700 }, { "epoch": 1.6487283450055288, "grad_norm": 0.22910319025533588, "learning_rate": 3.723375204806117e-05, "loss": 0.4773, "step": 6710 }, { "epoch": 1.651185649342671, "grad_norm": 0.2695521691456437, "learning_rate": 3.720644456581103e-05, "loss": 0.487, "step": 6720 }, { "epoch": 1.6536429536798134, "grad_norm": 0.2360533448166473, "learning_rate": 3.71791370835609e-05, "loss": 0.4953, "step": 6730 }, { "epoch": 1.6561002580169553, "grad_norm": 0.239710825943215, "learning_rate": 3.715182960131076e-05, "loss": 0.4747, "step": 6740 }, { "epoch": 1.6585575623540976, "grad_norm": 0.20558610617160553, "learning_rate": 3.7124522119060625e-05, "loss": 0.4733, "step": 6750 }, { "epoch": 1.6610148666912397, "grad_norm": 0.1968391603345054, "learning_rate": 3.709721463681049e-05, "loss": 0.486, "step": 6760 }, { "epoch": 1.6634721710283817, "grad_norm": 0.19015409443514636, "learning_rate": 3.706990715456035e-05, "loss": 0.4806, "step": 6770 }, { "epoch": 1.665929475365524, "grad_norm": 0.22700799882252767, "learning_rate": 3.704259967231021e-05, "loss": 0.4798, "step": 6780 }, { "epoch": 1.6683867797026661, "grad_norm": 0.21942276764363905, "learning_rate": 3.701529219006008e-05, "loss": 0.4718, "step": 6790 }, { "epoch": 1.6708440840398082, "grad_norm": 0.193404501826084, "learning_rate": 3.6987984707809945e-05, "loss": 0.4624, "step": 6800 }, { "epoch": 1.6733013883769505, "grad_norm": 0.2203070622014381, "learning_rate": 3.69606772255598e-05, "loss": 0.4887, "step": 6810 }, { "epoch": 1.6757586927140926, "grad_norm": 0.1954081147949961, "learning_rate": 3.693336974330967e-05, "loss": 0.4899, "step": 6820 }, { "epoch": 1.6782159970512347, "grad_norm": 0.20374103690875775, "learning_rate": 3.690606226105953e-05, "loss": 0.4615, "step": 6830 }, { "epoch": 1.680673301388377, "grad_norm": 0.20767558878971387, "learning_rate": 3.6878754778809394e-05, "loss": 0.4778, "step": 6840 }, { "epoch": 1.683130605725519, "grad_norm": 0.20351819570625204, "learning_rate": 3.685144729655926e-05, "loss": 0.4954, "step": 6850 }, { "epoch": 1.6855879100626612, "grad_norm": 0.21391887742072932, "learning_rate": 3.682413981430912e-05, "loss": 0.4718, "step": 6860 }, { "epoch": 1.6880452143998035, "grad_norm": 0.2370653601175896, "learning_rate": 3.6796832332058986e-05, "loss": 0.4682, "step": 6870 }, { "epoch": 1.6905025187369456, "grad_norm": 0.21750345465737086, "learning_rate": 3.676952484980885e-05, "loss": 0.5038, "step": 6880 }, { "epoch": 1.6929598230740877, "grad_norm": 0.21600699113771493, "learning_rate": 3.6742217367558714e-05, "loss": 0.4928, "step": 6890 }, { "epoch": 1.69541712741123, "grad_norm": 0.20419480364253356, "learning_rate": 3.671490988530857e-05, "loss": 0.4775, "step": 6900 }, { "epoch": 1.697874431748372, "grad_norm": 0.2349445968513915, "learning_rate": 3.668760240305844e-05, "loss": 0.4775, "step": 6910 }, { "epoch": 1.7003317360855141, "grad_norm": 0.2108958198230414, "learning_rate": 3.66602949208083e-05, "loss": 0.4682, "step": 6920 }, { "epoch": 1.7027890404226564, "grad_norm": 0.2161184236207077, "learning_rate": 3.6632987438558164e-05, "loss": 0.4913, "step": 6930 }, { "epoch": 1.7052463447597985, "grad_norm": 0.21716703698902884, "learning_rate": 3.6605679956308034e-05, "loss": 0.463, "step": 6940 }, { "epoch": 1.7077036490969406, "grad_norm": 0.18594388303970458, "learning_rate": 3.657837247405789e-05, "loss": 0.475, "step": 6950 }, { "epoch": 1.710160953434083, "grad_norm": 0.21887484169036558, "learning_rate": 3.655106499180776e-05, "loss": 0.4776, "step": 6960 }, { "epoch": 1.712618257771225, "grad_norm": 0.2042494153887751, "learning_rate": 3.652375750955762e-05, "loss": 0.4746, "step": 6970 }, { "epoch": 1.715075562108367, "grad_norm": 0.2398014585971733, "learning_rate": 3.6496450027307484e-05, "loss": 0.4686, "step": 6980 }, { "epoch": 1.7175328664455094, "grad_norm": 0.23318365182089182, "learning_rate": 3.646914254505735e-05, "loss": 0.4678, "step": 6990 }, { "epoch": 1.7199901707826515, "grad_norm": 0.21234139841579042, "learning_rate": 3.644183506280721e-05, "loss": 0.4749, "step": 7000 }, { "epoch": 1.7224474751197936, "grad_norm": 0.24065208089300041, "learning_rate": 3.6414527580557076e-05, "loss": 0.4916, "step": 7010 }, { "epoch": 1.7249047794569359, "grad_norm": 0.18763062924212043, "learning_rate": 3.638722009830694e-05, "loss": 0.4777, "step": 7020 }, { "epoch": 1.7273620837940777, "grad_norm": 0.1974023013198927, "learning_rate": 3.6359912616056804e-05, "loss": 0.4981, "step": 7030 }, { "epoch": 1.72981938813122, "grad_norm": 0.21971275568208828, "learning_rate": 3.633260513380666e-05, "loss": 0.4875, "step": 7040 }, { "epoch": 1.7322766924683624, "grad_norm": 0.2281212025278206, "learning_rate": 3.630529765155653e-05, "loss": 0.4721, "step": 7050 }, { "epoch": 1.7347339968055042, "grad_norm": 0.21559335506873292, "learning_rate": 3.627799016930639e-05, "loss": 0.4883, "step": 7060 }, { "epoch": 1.7371913011426465, "grad_norm": 0.20472457883149422, "learning_rate": 3.625068268705625e-05, "loss": 0.4742, "step": 7070 }, { "epoch": 1.7396486054797888, "grad_norm": 0.21690509020241866, "learning_rate": 3.6223375204806124e-05, "loss": 0.4703, "step": 7080 }, { "epoch": 1.7421059098169307, "grad_norm": 0.2288052027760249, "learning_rate": 3.619606772255598e-05, "loss": 0.4699, "step": 7090 }, { "epoch": 1.744563214154073, "grad_norm": 0.23415684306205015, "learning_rate": 3.6168760240305845e-05, "loss": 0.4901, "step": 7100 }, { "epoch": 1.747020518491215, "grad_norm": 0.19903307472897583, "learning_rate": 3.614145275805571e-05, "loss": 0.4985, "step": 7110 }, { "epoch": 1.7494778228283572, "grad_norm": 0.21487023846264766, "learning_rate": 3.611414527580557e-05, "loss": 0.4734, "step": 7120 }, { "epoch": 1.7519351271654995, "grad_norm": 0.21538362836565014, "learning_rate": 3.608683779355544e-05, "loss": 0.4692, "step": 7130 }, { "epoch": 1.7543924315026416, "grad_norm": 0.23821168537385645, "learning_rate": 3.60595303113053e-05, "loss": 0.4656, "step": 7140 }, { "epoch": 1.7568497358397837, "grad_norm": 0.20298312706484975, "learning_rate": 3.6032222829055165e-05, "loss": 0.4659, "step": 7150 }, { "epoch": 1.759307040176926, "grad_norm": 0.2157486790889547, "learning_rate": 3.600491534680502e-05, "loss": 0.4776, "step": 7160 }, { "epoch": 1.761764344514068, "grad_norm": 0.1945645829192793, "learning_rate": 3.597760786455489e-05, "loss": 0.5011, "step": 7170 }, { "epoch": 1.7642216488512101, "grad_norm": 0.21701310535206014, "learning_rate": 3.595030038230475e-05, "loss": 0.493, "step": 7180 }, { "epoch": 1.7666789531883524, "grad_norm": 0.22108994291291276, "learning_rate": 3.5922992900054614e-05, "loss": 0.4794, "step": 7190 }, { "epoch": 1.7691362575254945, "grad_norm": 0.21128387122109699, "learning_rate": 3.589568541780448e-05, "loss": 0.5022, "step": 7200 }, { "epoch": 1.7715935618626366, "grad_norm": 0.21465151688405962, "learning_rate": 3.586837793555434e-05, "loss": 0.4787, "step": 7210 }, { "epoch": 1.774050866199779, "grad_norm": 0.20377533601173636, "learning_rate": 3.584107045330421e-05, "loss": 0.4836, "step": 7220 }, { "epoch": 1.776508170536921, "grad_norm": 0.20532956083091775, "learning_rate": 3.581376297105407e-05, "loss": 0.4939, "step": 7230 }, { "epoch": 1.778965474874063, "grad_norm": 0.23605510419119913, "learning_rate": 3.5786455488803934e-05, "loss": 0.4539, "step": 7240 }, { "epoch": 1.7814227792112054, "grad_norm": 0.21122952555547478, "learning_rate": 3.57591480065538e-05, "loss": 0.4878, "step": 7250 }, { "epoch": 1.7838800835483475, "grad_norm": 0.20083452048280934, "learning_rate": 3.573184052430366e-05, "loss": 0.4878, "step": 7260 }, { "epoch": 1.7863373878854896, "grad_norm": 0.17680654275112798, "learning_rate": 3.5704533042053526e-05, "loss": 0.4552, "step": 7270 }, { "epoch": 1.7887946922226319, "grad_norm": 0.2148980002861629, "learning_rate": 3.567722555980339e-05, "loss": 0.4748, "step": 7280 }, { "epoch": 1.791251996559774, "grad_norm": 0.20134951318744335, "learning_rate": 3.5649918077553254e-05, "loss": 0.4661, "step": 7290 }, { "epoch": 1.793709300896916, "grad_norm": 0.19969656533600666, "learning_rate": 3.562261059530311e-05, "loss": 0.4738, "step": 7300 }, { "epoch": 1.7961666052340584, "grad_norm": 0.26508620771698177, "learning_rate": 3.559530311305298e-05, "loss": 0.4672, "step": 7310 }, { "epoch": 1.7986239095712004, "grad_norm": 0.22165547702412966, "learning_rate": 3.556799563080284e-05, "loss": 0.4806, "step": 7320 }, { "epoch": 1.8010812139083425, "grad_norm": 0.22897686089116842, "learning_rate": 3.55406881485527e-05, "loss": 0.4826, "step": 7330 }, { "epoch": 1.8035385182454848, "grad_norm": 0.209169909187072, "learning_rate": 3.551338066630257e-05, "loss": 0.4673, "step": 7340 }, { "epoch": 1.8059958225826267, "grad_norm": 0.2221673453197431, "learning_rate": 3.548607318405243e-05, "loss": 0.4788, "step": 7350 }, { "epoch": 1.808453126919769, "grad_norm": 0.2388076884521034, "learning_rate": 3.5458765701802295e-05, "loss": 0.4795, "step": 7360 }, { "epoch": 1.8109104312569113, "grad_norm": 0.22156428025619737, "learning_rate": 3.543145821955216e-05, "loss": 0.4786, "step": 7370 }, { "epoch": 1.8133677355940532, "grad_norm": 0.224185447837835, "learning_rate": 3.540415073730202e-05, "loss": 0.4485, "step": 7380 }, { "epoch": 1.8158250399311955, "grad_norm": 0.21988080392653925, "learning_rate": 3.537684325505188e-05, "loss": 0.4746, "step": 7390 }, { "epoch": 1.8182823442683378, "grad_norm": 0.23979560105990233, "learning_rate": 3.534953577280175e-05, "loss": 0.4899, "step": 7400 }, { "epoch": 1.8207396486054797, "grad_norm": 0.28884709139891124, "learning_rate": 3.5322228290551615e-05, "loss": 0.4759, "step": 7410 }, { "epoch": 1.823196952942622, "grad_norm": 0.23408957386182885, "learning_rate": 3.529492080830147e-05, "loss": 0.4681, "step": 7420 }, { "epoch": 1.825654257279764, "grad_norm": 0.24889134678384428, "learning_rate": 3.526761332605134e-05, "loss": 0.4811, "step": 7430 }, { "epoch": 1.8281115616169061, "grad_norm": 0.23083520719837955, "learning_rate": 3.52403058438012e-05, "loss": 0.4703, "step": 7440 }, { "epoch": 1.8305688659540484, "grad_norm": 0.20876676709812586, "learning_rate": 3.5212998361551064e-05, "loss": 0.4789, "step": 7450 }, { "epoch": 1.8330261702911905, "grad_norm": 0.25399619631129977, "learning_rate": 3.518569087930093e-05, "loss": 0.4857, "step": 7460 }, { "epoch": 1.8354834746283326, "grad_norm": 0.26340640502991636, "learning_rate": 3.515838339705079e-05, "loss": 0.4744, "step": 7470 }, { "epoch": 1.837940778965475, "grad_norm": 0.2108775945003158, "learning_rate": 3.5131075914800657e-05, "loss": 0.4746, "step": 7480 }, { "epoch": 1.840398083302617, "grad_norm": 0.25192733262280886, "learning_rate": 3.510376843255052e-05, "loss": 0.4739, "step": 7490 }, { "epoch": 1.842855387639759, "grad_norm": 0.2329160597430707, "learning_rate": 3.5076460950300385e-05, "loss": 0.4719, "step": 7500 }, { "epoch": 1.8453126919769014, "grad_norm": 0.23965945644465173, "learning_rate": 3.504915346805025e-05, "loss": 0.4604, "step": 7510 }, { "epoch": 1.8477699963140435, "grad_norm": 0.21135399298145785, "learning_rate": 3.502184598580011e-05, "loss": 0.4813, "step": 7520 }, { "epoch": 1.8502273006511856, "grad_norm": 0.22091821544277299, "learning_rate": 3.499453850354997e-05, "loss": 0.4797, "step": 7530 }, { "epoch": 1.8526846049883279, "grad_norm": 0.20372652424823276, "learning_rate": 3.496723102129984e-05, "loss": 0.4692, "step": 7540 }, { "epoch": 1.85514190932547, "grad_norm": 0.2265277501127676, "learning_rate": 3.4939923539049705e-05, "loss": 0.4636, "step": 7550 }, { "epoch": 1.857599213662612, "grad_norm": 0.20375885781510644, "learning_rate": 3.491261605679956e-05, "loss": 0.4649, "step": 7560 }, { "epoch": 1.8600565179997544, "grad_norm": 0.20909000085701768, "learning_rate": 3.488530857454943e-05, "loss": 0.4572, "step": 7570 }, { "epoch": 1.8625138223368964, "grad_norm": 0.21859938960234893, "learning_rate": 3.485800109229929e-05, "loss": 0.4851, "step": 7580 }, { "epoch": 1.8649711266740385, "grad_norm": 0.2198681009957251, "learning_rate": 3.4830693610049154e-05, "loss": 0.4907, "step": 7590 }, { "epoch": 1.8674284310111808, "grad_norm": 0.2436879157054554, "learning_rate": 3.480338612779902e-05, "loss": 0.475, "step": 7600 }, { "epoch": 1.869885735348323, "grad_norm": 0.219735862186119, "learning_rate": 3.477607864554888e-05, "loss": 0.482, "step": 7610 }, { "epoch": 1.872343039685465, "grad_norm": 0.22164953206835294, "learning_rate": 3.4748771163298746e-05, "loss": 0.4794, "step": 7620 }, { "epoch": 1.8748003440226073, "grad_norm": 0.20060267987675756, "learning_rate": 3.472146368104861e-05, "loss": 0.4831, "step": 7630 }, { "epoch": 1.8772576483597494, "grad_norm": 0.26025888294133664, "learning_rate": 3.4694156198798474e-05, "loss": 0.4735, "step": 7640 }, { "epoch": 1.8797149526968915, "grad_norm": 0.20868227810367368, "learning_rate": 3.466684871654833e-05, "loss": 0.4737, "step": 7650 }, { "epoch": 1.8821722570340338, "grad_norm": 0.2172622040028448, "learning_rate": 3.46395412342982e-05, "loss": 0.4834, "step": 7660 }, { "epoch": 1.8846295613711757, "grad_norm": 0.22565958817341328, "learning_rate": 3.461223375204806e-05, "loss": 0.4801, "step": 7670 }, { "epoch": 1.887086865708318, "grad_norm": 0.2490050325329352, "learning_rate": 3.458492626979792e-05, "loss": 0.4542, "step": 7680 }, { "epoch": 1.8895441700454603, "grad_norm": 0.20161434794137106, "learning_rate": 3.4557618787547794e-05, "loss": 0.4726, "step": 7690 }, { "epoch": 1.8920014743826021, "grad_norm": 0.20039558376780842, "learning_rate": 3.453031130529765e-05, "loss": 0.4688, "step": 7700 }, { "epoch": 1.8944587787197444, "grad_norm": 0.19357342685772788, "learning_rate": 3.450300382304752e-05, "loss": 0.4752, "step": 7710 }, { "epoch": 1.8969160830568867, "grad_norm": 0.20740171124608273, "learning_rate": 3.447569634079738e-05, "loss": 0.4804, "step": 7720 }, { "epoch": 1.8993733873940286, "grad_norm": 0.21533059075376107, "learning_rate": 3.444838885854724e-05, "loss": 0.4755, "step": 7730 }, { "epoch": 1.901830691731171, "grad_norm": 0.28163769717876636, "learning_rate": 3.442108137629711e-05, "loss": 0.4756, "step": 7740 }, { "epoch": 1.9042879960683132, "grad_norm": 0.22490234454583385, "learning_rate": 3.439377389404697e-05, "loss": 0.4679, "step": 7750 }, { "epoch": 1.906745300405455, "grad_norm": 0.2116885998264266, "learning_rate": 3.4366466411796835e-05, "loss": 0.4676, "step": 7760 }, { "epoch": 1.9092026047425974, "grad_norm": 0.2453989453817177, "learning_rate": 3.43391589295467e-05, "loss": 0.4843, "step": 7770 }, { "epoch": 1.9116599090797395, "grad_norm": 0.23209031315734022, "learning_rate": 3.431185144729656e-05, "loss": 0.4773, "step": 7780 }, { "epoch": 1.9141172134168816, "grad_norm": 0.21036220392649368, "learning_rate": 3.428454396504642e-05, "loss": 0.4837, "step": 7790 }, { "epoch": 1.9165745177540239, "grad_norm": 0.22466745262608806, "learning_rate": 3.425723648279629e-05, "loss": 0.4718, "step": 7800 }, { "epoch": 1.919031822091166, "grad_norm": 0.22554434020479877, "learning_rate": 3.422992900054615e-05, "loss": 0.4702, "step": 7810 }, { "epoch": 1.921489126428308, "grad_norm": 0.22389221318679592, "learning_rate": 3.420262151829601e-05, "loss": 0.4641, "step": 7820 }, { "epoch": 1.9239464307654504, "grad_norm": 0.1919679538949704, "learning_rate": 3.417531403604588e-05, "loss": 0.4693, "step": 7830 }, { "epoch": 1.9264037351025924, "grad_norm": 0.23515200230660738, "learning_rate": 3.414800655379574e-05, "loss": 0.4677, "step": 7840 }, { "epoch": 1.9288610394397345, "grad_norm": 0.21254118272191402, "learning_rate": 3.4120699071545604e-05, "loss": 0.4747, "step": 7850 }, { "epoch": 1.9313183437768768, "grad_norm": 0.22998881926455508, "learning_rate": 3.409339158929547e-05, "loss": 0.4664, "step": 7860 }, { "epoch": 1.933775648114019, "grad_norm": 0.18926664337012358, "learning_rate": 3.406608410704533e-05, "loss": 0.4748, "step": 7870 }, { "epoch": 1.936232952451161, "grad_norm": 0.23837006519034618, "learning_rate": 3.4038776624795196e-05, "loss": 0.461, "step": 7880 }, { "epoch": 1.9386902567883033, "grad_norm": 0.22828882867583755, "learning_rate": 3.401146914254506e-05, "loss": 0.486, "step": 7890 }, { "epoch": 1.9411475611254454, "grad_norm": 0.21050049172539043, "learning_rate": 3.3984161660294924e-05, "loss": 0.4708, "step": 7900 }, { "epoch": 1.9436048654625875, "grad_norm": 0.20076724824048311, "learning_rate": 3.395685417804478e-05, "loss": 0.475, "step": 7910 }, { "epoch": 1.9460621697997298, "grad_norm": 0.25024340099057396, "learning_rate": 3.392954669579465e-05, "loss": 0.4916, "step": 7920 }, { "epoch": 1.9485194741368719, "grad_norm": 0.2627881438733633, "learning_rate": 3.390223921354451e-05, "loss": 0.4655, "step": 7930 }, { "epoch": 1.950976778474014, "grad_norm": 0.21934332436756104, "learning_rate": 3.3874931731294373e-05, "loss": 0.4624, "step": 7940 }, { "epoch": 1.9534340828111563, "grad_norm": 0.23116936347162595, "learning_rate": 3.384762424904424e-05, "loss": 0.4672, "step": 7950 }, { "epoch": 1.9558913871482984, "grad_norm": 0.21232762866716418, "learning_rate": 3.38203167667941e-05, "loss": 0.4506, "step": 7960 }, { "epoch": 1.9583486914854404, "grad_norm": 0.19787982615147723, "learning_rate": 3.379300928454397e-05, "loss": 0.4792, "step": 7970 }, { "epoch": 1.9608059958225827, "grad_norm": 0.24197729071380544, "learning_rate": 3.376570180229383e-05, "loss": 0.4918, "step": 7980 }, { "epoch": 1.9632633001597248, "grad_norm": 0.2099746732084618, "learning_rate": 3.3738394320043693e-05, "loss": 0.4701, "step": 7990 }, { "epoch": 1.965720604496867, "grad_norm": 0.1841900252779867, "learning_rate": 3.371108683779356e-05, "loss": 0.468, "step": 8000 }, { "epoch": 1.9681779088340092, "grad_norm": 0.2331901318599321, "learning_rate": 3.368377935554342e-05, "loss": 0.4746, "step": 8010 }, { "epoch": 1.970635213171151, "grad_norm": 0.2744488475121063, "learning_rate": 3.3656471873293285e-05, "loss": 0.4477, "step": 8020 }, { "epoch": 1.9730925175082934, "grad_norm": 0.20306576525297193, "learning_rate": 3.362916439104315e-05, "loss": 0.459, "step": 8030 }, { "epoch": 1.9755498218454357, "grad_norm": 0.2119017510852611, "learning_rate": 3.3601856908793013e-05, "loss": 0.4544, "step": 8040 }, { "epoch": 1.9780071261825776, "grad_norm": 0.23014387116085533, "learning_rate": 3.357454942654287e-05, "loss": 0.4716, "step": 8050 }, { "epoch": 1.9804644305197199, "grad_norm": 0.2340341879576761, "learning_rate": 3.354724194429274e-05, "loss": 0.456, "step": 8060 }, { "epoch": 1.9829217348568622, "grad_norm": 0.24083755987264954, "learning_rate": 3.35199344620426e-05, "loss": 0.4606, "step": 8070 }, { "epoch": 1.985379039194004, "grad_norm": 0.18735862673811138, "learning_rate": 3.349262697979246e-05, "loss": 0.4792, "step": 8080 }, { "epoch": 1.9878363435311464, "grad_norm": 0.24240472300131474, "learning_rate": 3.346531949754233e-05, "loss": 0.4591, "step": 8090 }, { "epoch": 1.9902936478682884, "grad_norm": 0.22930744905575495, "learning_rate": 3.343801201529219e-05, "loss": 0.4782, "step": 8100 }, { "epoch": 1.9927509522054305, "grad_norm": 0.24123502437280436, "learning_rate": 3.3410704533042055e-05, "loss": 0.4569, "step": 8110 }, { "epoch": 1.9952082565425728, "grad_norm": 0.20179848965413416, "learning_rate": 3.338339705079192e-05, "loss": 0.4702, "step": 8120 }, { "epoch": 1.997665560879715, "grad_norm": 0.22950055617120527, "learning_rate": 3.335608956854178e-05, "loss": 0.4628, "step": 8130 }, { "epoch": 2.0, "grad_norm": 0.20588728961533437, "learning_rate": 3.332878208629164e-05, "loss": 0.4394, "step": 8140 }, { "epoch": 2.0024573043371423, "grad_norm": 0.23086870805885393, "learning_rate": 3.330147460404151e-05, "loss": 0.4826, "step": 8150 }, { "epoch": 2.004914608674284, "grad_norm": 0.22709568235983987, "learning_rate": 3.3274167121791375e-05, "loss": 0.4713, "step": 8160 }, { "epoch": 2.0073719130114265, "grad_norm": 0.23139305673853863, "learning_rate": 3.324685963954123e-05, "loss": 0.4459, "step": 8170 }, { "epoch": 2.009829217348569, "grad_norm": 0.24736410648429957, "learning_rate": 3.32195521572911e-05, "loss": 0.4778, "step": 8180 }, { "epoch": 2.0122865216857106, "grad_norm": 0.21589707704158886, "learning_rate": 3.319224467504096e-05, "loss": 0.4487, "step": 8190 }, { "epoch": 2.014743826022853, "grad_norm": 0.19322761672852884, "learning_rate": 3.3164937192790824e-05, "loss": 0.4778, "step": 8200 }, { "epoch": 2.0172011303599953, "grad_norm": 0.22736195148797173, "learning_rate": 3.313762971054069e-05, "loss": 0.4733, "step": 8210 }, { "epoch": 2.019658434697137, "grad_norm": 0.21385538270241763, "learning_rate": 3.311032222829055e-05, "loss": 0.4609, "step": 8220 }, { "epoch": 2.0221157390342794, "grad_norm": 0.2611256842383756, "learning_rate": 3.308301474604042e-05, "loss": 0.4734, "step": 8230 }, { "epoch": 2.0245730433714217, "grad_norm": 0.22575300212111182, "learning_rate": 3.305570726379028e-05, "loss": 0.4941, "step": 8240 }, { "epoch": 2.0270303477085636, "grad_norm": 0.2085009410284925, "learning_rate": 3.3028399781540144e-05, "loss": 0.459, "step": 8250 }, { "epoch": 2.029487652045706, "grad_norm": 0.21671010562663526, "learning_rate": 3.300109229929001e-05, "loss": 0.4626, "step": 8260 }, { "epoch": 2.031944956382848, "grad_norm": 0.20880373912345715, "learning_rate": 3.297378481703987e-05, "loss": 0.4566, "step": 8270 }, { "epoch": 2.03440226071999, "grad_norm": 0.205037677585801, "learning_rate": 3.2946477334789736e-05, "loss": 0.4601, "step": 8280 }, { "epoch": 2.0368595650571324, "grad_norm": 0.20759740411261993, "learning_rate": 3.29191698525396e-05, "loss": 0.4689, "step": 8290 }, { "epoch": 2.0393168693942743, "grad_norm": 0.23594208324320126, "learning_rate": 3.2891862370289464e-05, "loss": 0.4599, "step": 8300 }, { "epoch": 2.0417741737314166, "grad_norm": 0.1934270823286036, "learning_rate": 3.286455488803932e-05, "loss": 0.4471, "step": 8310 }, { "epoch": 2.044231478068559, "grad_norm": 0.20676222692200844, "learning_rate": 3.283724740578919e-05, "loss": 0.4533, "step": 8320 }, { "epoch": 2.0466887824057007, "grad_norm": 0.21050069056944368, "learning_rate": 3.280993992353905e-05, "loss": 0.4636, "step": 8330 }, { "epoch": 2.049146086742843, "grad_norm": 0.23113386716518325, "learning_rate": 3.278263244128891e-05, "loss": 0.4643, "step": 8340 }, { "epoch": 2.0516033910799854, "grad_norm": 0.2203559640403448, "learning_rate": 3.275532495903878e-05, "loss": 0.4404, "step": 8350 }, { "epoch": 2.054060695417127, "grad_norm": 0.24253330400401166, "learning_rate": 3.272801747678864e-05, "loss": 0.4707, "step": 8360 }, { "epoch": 2.0565179997542695, "grad_norm": 0.26865120063445547, "learning_rate": 3.2700709994538505e-05, "loss": 0.4733, "step": 8370 }, { "epoch": 2.058975304091412, "grad_norm": 0.23849421295870535, "learning_rate": 3.267340251228837e-05, "loss": 0.4722, "step": 8380 }, { "epoch": 2.0614326084285537, "grad_norm": 0.2127012590158884, "learning_rate": 3.264609503003823e-05, "loss": 0.464, "step": 8390 }, { "epoch": 2.063889912765696, "grad_norm": 0.22565548094443988, "learning_rate": 3.261878754778809e-05, "loss": 0.4892, "step": 8400 }, { "epoch": 2.0663472171028383, "grad_norm": 0.22351291790291297, "learning_rate": 3.259148006553796e-05, "loss": 0.4751, "step": 8410 }, { "epoch": 2.06880452143998, "grad_norm": 0.23644128013383833, "learning_rate": 3.2564172583287825e-05, "loss": 0.4525, "step": 8420 }, { "epoch": 2.0712618257771225, "grad_norm": 0.2221490797731559, "learning_rate": 3.253686510103768e-05, "loss": 0.4656, "step": 8430 }, { "epoch": 2.073719130114265, "grad_norm": 0.25913199816365295, "learning_rate": 3.250955761878755e-05, "loss": 0.4613, "step": 8440 }, { "epoch": 2.0761764344514066, "grad_norm": 0.19205625441994592, "learning_rate": 3.248225013653741e-05, "loss": 0.4707, "step": 8450 }, { "epoch": 2.078633738788549, "grad_norm": 0.22473275733791476, "learning_rate": 3.2454942654287274e-05, "loss": 0.482, "step": 8460 }, { "epoch": 2.0810910431256913, "grad_norm": 0.19444810135608193, "learning_rate": 3.242763517203714e-05, "loss": 0.481, "step": 8470 }, { "epoch": 2.083548347462833, "grad_norm": 0.2381185336812415, "learning_rate": 3.2400327689787e-05, "loss": 0.4763, "step": 8480 }, { "epoch": 2.0860056517999754, "grad_norm": 0.19809370497152642, "learning_rate": 3.2373020207536866e-05, "loss": 0.4551, "step": 8490 }, { "epoch": 2.0884629561371177, "grad_norm": 0.2379845579385787, "learning_rate": 3.234571272528673e-05, "loss": 0.4632, "step": 8500 }, { "epoch": 2.0909202604742596, "grad_norm": 0.22730410649199814, "learning_rate": 3.2318405243036594e-05, "loss": 0.4902, "step": 8510 }, { "epoch": 2.093377564811402, "grad_norm": 0.2423551277695343, "learning_rate": 3.229109776078646e-05, "loss": 0.4662, "step": 8520 }, { "epoch": 2.095834869148544, "grad_norm": 0.18973349881263868, "learning_rate": 3.226379027853632e-05, "loss": 0.481, "step": 8530 }, { "epoch": 2.098292173485686, "grad_norm": 0.20870961621760806, "learning_rate": 3.223648279628618e-05, "loss": 0.4645, "step": 8540 }, { "epoch": 2.1007494778228284, "grad_norm": 0.24156551311857213, "learning_rate": 3.220917531403605e-05, "loss": 0.467, "step": 8550 }, { "epoch": 2.1032067821599707, "grad_norm": 0.21751700440122063, "learning_rate": 3.2181867831785914e-05, "loss": 0.4665, "step": 8560 }, { "epoch": 2.1056640864971126, "grad_norm": 0.19559855422231587, "learning_rate": 3.215456034953577e-05, "loss": 0.4758, "step": 8570 }, { "epoch": 2.108121390834255, "grad_norm": 0.20907407929830205, "learning_rate": 3.212725286728564e-05, "loss": 0.4638, "step": 8580 }, { "epoch": 2.110578695171397, "grad_norm": 0.2411818030184255, "learning_rate": 3.20999453850355e-05, "loss": 0.4746, "step": 8590 }, { "epoch": 2.113035999508539, "grad_norm": 0.22971136573220866, "learning_rate": 3.2072637902785364e-05, "loss": 0.4859, "step": 8600 }, { "epoch": 2.1154933038456813, "grad_norm": 0.2195019147063572, "learning_rate": 3.204533042053523e-05, "loss": 0.4581, "step": 8610 }, { "epoch": 2.1179506081828237, "grad_norm": 0.2318521897943634, "learning_rate": 3.201802293828509e-05, "loss": 0.4805, "step": 8620 }, { "epoch": 2.1204079125199655, "grad_norm": 0.21445330158342926, "learning_rate": 3.1990715456034956e-05, "loss": 0.4571, "step": 8630 }, { "epoch": 2.122865216857108, "grad_norm": 0.2230108465476756, "learning_rate": 3.196340797378482e-05, "loss": 0.4889, "step": 8640 }, { "epoch": 2.12532252119425, "grad_norm": 0.24555033551374628, "learning_rate": 3.1936100491534684e-05, "loss": 0.4837, "step": 8650 }, { "epoch": 2.127779825531392, "grad_norm": 0.21327248036099916, "learning_rate": 3.190879300928454e-05, "loss": 0.4788, "step": 8660 }, { "epoch": 2.1302371298685343, "grad_norm": 0.23656428202183466, "learning_rate": 3.188148552703441e-05, "loss": 0.4749, "step": 8670 }, { "epoch": 2.132694434205676, "grad_norm": 0.20779050236132277, "learning_rate": 3.185417804478427e-05, "loss": 0.4709, "step": 8680 }, { "epoch": 2.1351517385428185, "grad_norm": 0.22731338205750304, "learning_rate": 3.182687056253413e-05, "loss": 0.4571, "step": 8690 }, { "epoch": 2.137609042879961, "grad_norm": 0.24425550320106074, "learning_rate": 3.1799563080284004e-05, "loss": 0.4736, "step": 8700 }, { "epoch": 2.1400663472171026, "grad_norm": 0.2843928570937864, "learning_rate": 3.177225559803386e-05, "loss": 0.4705, "step": 8710 }, { "epoch": 2.142523651554245, "grad_norm": 0.2497485611537917, "learning_rate": 3.1744948115783725e-05, "loss": 0.4639, "step": 8720 }, { "epoch": 2.1449809558913873, "grad_norm": 0.22012075592303645, "learning_rate": 3.171764063353359e-05, "loss": 0.4593, "step": 8730 }, { "epoch": 2.147438260228529, "grad_norm": 0.2150721196442227, "learning_rate": 3.169033315128345e-05, "loss": 0.4817, "step": 8740 }, { "epoch": 2.1498955645656714, "grad_norm": 0.20432822654418273, "learning_rate": 3.166302566903332e-05, "loss": 0.4624, "step": 8750 }, { "epoch": 2.1523528689028137, "grad_norm": 0.2261262491189348, "learning_rate": 3.163571818678318e-05, "loss": 0.4639, "step": 8760 }, { "epoch": 2.1548101732399556, "grad_norm": 0.2162828532464022, "learning_rate": 3.1608410704533045e-05, "loss": 0.4513, "step": 8770 }, { "epoch": 2.157267477577098, "grad_norm": 0.2390527594652659, "learning_rate": 3.158110322228291e-05, "loss": 0.4648, "step": 8780 }, { "epoch": 2.15972478191424, "grad_norm": 0.23120176994662123, "learning_rate": 3.155379574003277e-05, "loss": 0.4643, "step": 8790 }, { "epoch": 2.162182086251382, "grad_norm": 0.2209514345242446, "learning_rate": 3.152648825778263e-05, "loss": 0.4627, "step": 8800 }, { "epoch": 2.1646393905885244, "grad_norm": 0.2328910535805196, "learning_rate": 3.14991807755325e-05, "loss": 0.4601, "step": 8810 }, { "epoch": 2.1670966949256667, "grad_norm": 0.25452025162008945, "learning_rate": 3.147187329328236e-05, "loss": 0.4699, "step": 8820 }, { "epoch": 2.1695539992628086, "grad_norm": 0.24685942528624788, "learning_rate": 3.144456581103222e-05, "loss": 0.4623, "step": 8830 }, { "epoch": 2.172011303599951, "grad_norm": 0.22185880992745124, "learning_rate": 3.141725832878209e-05, "loss": 0.4648, "step": 8840 }, { "epoch": 2.174468607937093, "grad_norm": 0.24961685832636427, "learning_rate": 3.138995084653195e-05, "loss": 0.4643, "step": 8850 }, { "epoch": 2.176925912274235, "grad_norm": 0.29051771773402885, "learning_rate": 3.1362643364281814e-05, "loss": 0.4651, "step": 8860 }, { "epoch": 2.1793832166113773, "grad_norm": 0.1930245227310935, "learning_rate": 3.133533588203168e-05, "loss": 0.4497, "step": 8870 }, { "epoch": 2.1818405209485197, "grad_norm": 0.2330441066242711, "learning_rate": 3.130802839978154e-05, "loss": 0.4547, "step": 8880 }, { "epoch": 2.1842978252856615, "grad_norm": 0.19354956694012349, "learning_rate": 3.1280720917531406e-05, "loss": 0.4932, "step": 8890 }, { "epoch": 2.186755129622804, "grad_norm": 0.32146591934367, "learning_rate": 3.125341343528127e-05, "loss": 0.4749, "step": 8900 }, { "epoch": 2.189212433959946, "grad_norm": 0.1918056163482702, "learning_rate": 3.1226105953031134e-05, "loss": 0.4734, "step": 8910 }, { "epoch": 2.191669738297088, "grad_norm": 0.2504374008209726, "learning_rate": 3.119879847078099e-05, "loss": 0.4543, "step": 8920 }, { "epoch": 2.1941270426342303, "grad_norm": 0.22034573294029391, "learning_rate": 3.117149098853086e-05, "loss": 0.4664, "step": 8930 }, { "epoch": 2.196584346971372, "grad_norm": 0.22954568810077847, "learning_rate": 3.114418350628072e-05, "loss": 0.4481, "step": 8940 }, { "epoch": 2.1990416513085145, "grad_norm": 0.24372597391425624, "learning_rate": 3.1116876024030583e-05, "loss": 0.4689, "step": 8950 }, { "epoch": 2.201498955645657, "grad_norm": 0.19567739005969909, "learning_rate": 3.108956854178045e-05, "loss": 0.4811, "step": 8960 }, { "epoch": 2.2039562599827986, "grad_norm": 0.2091826633598359, "learning_rate": 3.106226105953031e-05, "loss": 0.48, "step": 8970 }, { "epoch": 2.206413564319941, "grad_norm": 0.237584233714104, "learning_rate": 3.1034953577280175e-05, "loss": 0.4451, "step": 8980 }, { "epoch": 2.2088708686570833, "grad_norm": 0.2247036154348432, "learning_rate": 3.100764609503004e-05, "loss": 0.4506, "step": 8990 }, { "epoch": 2.211328172994225, "grad_norm": 0.18637331552481487, "learning_rate": 3.0980338612779903e-05, "loss": 0.4677, "step": 9000 }, { "epoch": 2.2137854773313674, "grad_norm": 0.22504996860603724, "learning_rate": 3.095303113052977e-05, "loss": 0.4485, "step": 9010 }, { "epoch": 2.2162427816685097, "grad_norm": 0.2121031614146781, "learning_rate": 3.092572364827963e-05, "loss": 0.4581, "step": 9020 }, { "epoch": 2.2187000860056516, "grad_norm": 0.23963361422664128, "learning_rate": 3.0898416166029495e-05, "loss": 0.4673, "step": 9030 }, { "epoch": 2.221157390342794, "grad_norm": 0.2577432111249706, "learning_rate": 3.087110868377936e-05, "loss": 0.4656, "step": 9040 }, { "epoch": 2.223614694679936, "grad_norm": 0.22562848191397397, "learning_rate": 3.0843801201529223e-05, "loss": 0.4557, "step": 9050 }, { "epoch": 2.226071999017078, "grad_norm": 0.21326214787029954, "learning_rate": 3.081649371927908e-05, "loss": 0.4638, "step": 9060 }, { "epoch": 2.2285293033542204, "grad_norm": 0.21980175305882294, "learning_rate": 3.078918623702895e-05, "loss": 0.4605, "step": 9070 }, { "epoch": 2.2309866076913627, "grad_norm": 0.2533193633035986, "learning_rate": 3.076187875477881e-05, "loss": 0.4661, "step": 9080 }, { "epoch": 2.2334439120285046, "grad_norm": 0.2458582414821253, "learning_rate": 3.073457127252867e-05, "loss": 0.4782, "step": 9090 }, { "epoch": 2.235901216365647, "grad_norm": 0.2257782505034606, "learning_rate": 3.070726379027854e-05, "loss": 0.4866, "step": 9100 }, { "epoch": 2.238358520702789, "grad_norm": 0.21738303323542574, "learning_rate": 3.06799563080284e-05, "loss": 0.4749, "step": 9110 }, { "epoch": 2.240815825039931, "grad_norm": 0.21806581534473965, "learning_rate": 3.0652648825778265e-05, "loss": 0.4514, "step": 9120 }, { "epoch": 2.2432731293770733, "grad_norm": 0.2587906386369436, "learning_rate": 3.062534134352813e-05, "loss": 0.4876, "step": 9130 }, { "epoch": 2.2457304337142157, "grad_norm": 0.24091812703835033, "learning_rate": 3.059803386127799e-05, "loss": 0.4531, "step": 9140 }, { "epoch": 2.2481877380513575, "grad_norm": 0.24042307872931903, "learning_rate": 3.057072637902785e-05, "loss": 0.448, "step": 9150 }, { "epoch": 2.2506450423885, "grad_norm": 0.21313610942445735, "learning_rate": 3.054341889677772e-05, "loss": 0.452, "step": 9160 }, { "epoch": 2.253102346725642, "grad_norm": 0.2369059375134552, "learning_rate": 3.051611141452758e-05, "loss": 0.481, "step": 9170 }, { "epoch": 2.255559651062784, "grad_norm": 0.21260172967261756, "learning_rate": 3.0488803932277442e-05, "loss": 0.4845, "step": 9180 }, { "epoch": 2.2580169553999263, "grad_norm": 0.30855600666188415, "learning_rate": 3.046149645002731e-05, "loss": 0.4586, "step": 9190 }, { "epoch": 2.2604742597370686, "grad_norm": 0.18860025425663512, "learning_rate": 3.0434188967777173e-05, "loss": 0.4504, "step": 9200 }, { "epoch": 2.2629315640742105, "grad_norm": 0.21720829227587254, "learning_rate": 3.0406881485527034e-05, "loss": 0.4514, "step": 9210 }, { "epoch": 2.265388868411353, "grad_norm": 0.2557726188846286, "learning_rate": 3.03795740032769e-05, "loss": 0.4643, "step": 9220 }, { "epoch": 2.267846172748495, "grad_norm": 0.29226878286566216, "learning_rate": 3.0352266521026762e-05, "loss": 0.4542, "step": 9230 }, { "epoch": 2.270303477085637, "grad_norm": 0.23398871992018244, "learning_rate": 3.0324959038776623e-05, "loss": 0.4592, "step": 9240 }, { "epoch": 2.2727607814227793, "grad_norm": 0.23414275253738434, "learning_rate": 3.029765155652649e-05, "loss": 0.4753, "step": 9250 }, { "epoch": 2.2752180857599216, "grad_norm": 0.25091191842593047, "learning_rate": 3.027034407427635e-05, "loss": 0.4719, "step": 9260 }, { "epoch": 2.2776753900970634, "grad_norm": 0.22237303141350093, "learning_rate": 3.0243036592026218e-05, "loss": 0.4565, "step": 9270 }, { "epoch": 2.2801326944342057, "grad_norm": 0.21370467174740576, "learning_rate": 3.0215729109776082e-05, "loss": 0.4853, "step": 9280 }, { "epoch": 2.282589998771348, "grad_norm": 0.20113028089640425, "learning_rate": 3.0188421627525943e-05, "loss": 0.4605, "step": 9290 }, { "epoch": 2.28504730310849, "grad_norm": 0.245853440990544, "learning_rate": 3.016111414527581e-05, "loss": 0.4745, "step": 9300 }, { "epoch": 2.287504607445632, "grad_norm": 0.23134423127975007, "learning_rate": 3.013380666302567e-05, "loss": 0.4506, "step": 9310 }, { "epoch": 2.2899619117827745, "grad_norm": 0.24266010362961368, "learning_rate": 3.010649918077553e-05, "loss": 0.4756, "step": 9320 }, { "epoch": 2.2924192161199164, "grad_norm": 0.2295158068388005, "learning_rate": 3.00791916985254e-05, "loss": 0.4669, "step": 9330 }, { "epoch": 2.2948765204570587, "grad_norm": 0.2165293236636441, "learning_rate": 3.0051884216275263e-05, "loss": 0.4489, "step": 9340 }, { "epoch": 2.297333824794201, "grad_norm": 0.22548579187622828, "learning_rate": 3.0024576734025123e-05, "loss": 0.4867, "step": 9350 }, { "epoch": 2.299791129131343, "grad_norm": 0.2739797541917317, "learning_rate": 2.999726925177499e-05, "loss": 0.4708, "step": 9360 }, { "epoch": 2.302248433468485, "grad_norm": 0.21476551334090263, "learning_rate": 2.996996176952485e-05, "loss": 0.4688, "step": 9370 }, { "epoch": 2.304705737805627, "grad_norm": 0.25281759570567375, "learning_rate": 2.9942654287274712e-05, "loss": 0.4721, "step": 9380 }, { "epoch": 2.3071630421427693, "grad_norm": 0.25035484909205924, "learning_rate": 2.991534680502458e-05, "loss": 0.475, "step": 9390 }, { "epoch": 2.3096203464799117, "grad_norm": 0.21568600848096872, "learning_rate": 2.9888039322774443e-05, "loss": 0.4644, "step": 9400 }, { "epoch": 2.3120776508170535, "grad_norm": 0.2200552561947087, "learning_rate": 2.9860731840524304e-05, "loss": 0.4526, "step": 9410 }, { "epoch": 2.314534955154196, "grad_norm": 0.22308534970861327, "learning_rate": 2.983342435827417e-05, "loss": 0.4879, "step": 9420 }, { "epoch": 2.316992259491338, "grad_norm": 0.20769766258070438, "learning_rate": 2.9806116876024032e-05, "loss": 0.4752, "step": 9430 }, { "epoch": 2.31944956382848, "grad_norm": 0.2471483416247399, "learning_rate": 2.9778809393773892e-05, "loss": 0.4477, "step": 9440 }, { "epoch": 2.3219068681656223, "grad_norm": 0.209367934952951, "learning_rate": 2.975150191152376e-05, "loss": 0.4507, "step": 9450 }, { "epoch": 2.3243641725027646, "grad_norm": 0.26968972732209234, "learning_rate": 2.972419442927362e-05, "loss": 0.4695, "step": 9460 }, { "epoch": 2.3268214768399065, "grad_norm": 0.19739440872760514, "learning_rate": 2.9696886947023484e-05, "loss": 0.4597, "step": 9470 }, { "epoch": 2.329278781177049, "grad_norm": 0.23610647902946938, "learning_rate": 2.9669579464773352e-05, "loss": 0.4769, "step": 9480 }, { "epoch": 2.331736085514191, "grad_norm": 0.22147494900891346, "learning_rate": 2.9642271982523212e-05, "loss": 0.4614, "step": 9490 }, { "epoch": 2.334193389851333, "grad_norm": 0.23383897533724513, "learning_rate": 2.9614964500273073e-05, "loss": 0.469, "step": 9500 }, { "epoch": 2.3366506941884753, "grad_norm": 0.2498208680936285, "learning_rate": 2.958765701802294e-05, "loss": 0.4824, "step": 9510 }, { "epoch": 2.3391079985256176, "grad_norm": 0.24780944142755854, "learning_rate": 2.95603495357728e-05, "loss": 0.4669, "step": 9520 }, { "epoch": 2.3415653028627594, "grad_norm": 0.21992509731819107, "learning_rate": 2.953304205352267e-05, "loss": 0.4551, "step": 9530 }, { "epoch": 2.3440226071999017, "grad_norm": 0.24289843477047404, "learning_rate": 2.9505734571272532e-05, "loss": 0.4523, "step": 9540 }, { "epoch": 2.346479911537044, "grad_norm": 0.2326256979920201, "learning_rate": 2.9478427089022393e-05, "loss": 0.4738, "step": 9550 }, { "epoch": 2.348937215874186, "grad_norm": 0.21950156548829464, "learning_rate": 2.945111960677226e-05, "loss": 0.4472, "step": 9560 }, { "epoch": 2.351394520211328, "grad_norm": 0.20954559285772525, "learning_rate": 2.942381212452212e-05, "loss": 0.4713, "step": 9570 }, { "epoch": 2.35385182454847, "grad_norm": 0.2294463556276587, "learning_rate": 2.939650464227198e-05, "loss": 0.4556, "step": 9580 }, { "epoch": 2.3563091288856124, "grad_norm": 0.2923286985048196, "learning_rate": 2.936919716002185e-05, "loss": 0.4703, "step": 9590 }, { "epoch": 2.3587664332227547, "grad_norm": 0.20810026103645318, "learning_rate": 2.934188967777171e-05, "loss": 0.4479, "step": 9600 }, { "epoch": 2.3612237375598966, "grad_norm": 0.24700428166420418, "learning_rate": 2.9314582195521574e-05, "loss": 0.4574, "step": 9610 }, { "epoch": 2.363681041897039, "grad_norm": 0.24402529916774843, "learning_rate": 2.928727471327144e-05, "loss": 0.4826, "step": 9620 }, { "epoch": 2.366138346234181, "grad_norm": 0.23159576177204488, "learning_rate": 2.92599672310213e-05, "loss": 0.4594, "step": 9630 }, { "epoch": 2.368595650571323, "grad_norm": 0.2213444834536971, "learning_rate": 2.9232659748771162e-05, "loss": 0.4925, "step": 9640 }, { "epoch": 2.3710529549084653, "grad_norm": 0.2717283398246681, "learning_rate": 2.920535226652103e-05, "loss": 0.4652, "step": 9650 }, { "epoch": 2.3735102592456077, "grad_norm": 0.24211154232546514, "learning_rate": 2.917804478427089e-05, "loss": 0.4757, "step": 9660 }, { "epoch": 2.3759675635827495, "grad_norm": 0.21903272367338916, "learning_rate": 2.9150737302020754e-05, "loss": 0.4702, "step": 9670 }, { "epoch": 2.378424867919892, "grad_norm": 0.237405023806024, "learning_rate": 2.912342981977062e-05, "loss": 0.4727, "step": 9680 }, { "epoch": 2.380882172257034, "grad_norm": 0.20869704652231424, "learning_rate": 2.9096122337520482e-05, "loss": 0.4788, "step": 9690 }, { "epoch": 2.383339476594176, "grad_norm": 0.21180894287164603, "learning_rate": 2.9068814855270343e-05, "loss": 0.4753, "step": 9700 }, { "epoch": 2.3857967809313183, "grad_norm": 0.2217450174050269, "learning_rate": 2.904150737302021e-05, "loss": 0.4656, "step": 9710 }, { "epoch": 2.3882540852684606, "grad_norm": 0.20785577833570962, "learning_rate": 2.901419989077007e-05, "loss": 0.4719, "step": 9720 }, { "epoch": 2.3907113896056025, "grad_norm": 0.21387935754002607, "learning_rate": 2.8986892408519935e-05, "loss": 0.482, "step": 9730 }, { "epoch": 2.393168693942745, "grad_norm": 0.2280649808082625, "learning_rate": 2.89595849262698e-05, "loss": 0.4673, "step": 9740 }, { "epoch": 2.395625998279887, "grad_norm": 0.2038858331385471, "learning_rate": 2.8932277444019663e-05, "loss": 0.4723, "step": 9750 }, { "epoch": 2.398083302617029, "grad_norm": 0.22429922929715784, "learning_rate": 2.8904969961769524e-05, "loss": 0.4758, "step": 9760 }, { "epoch": 2.4005406069541713, "grad_norm": 0.23827283166815597, "learning_rate": 2.887766247951939e-05, "loss": 0.4537, "step": 9770 }, { "epoch": 2.4029979112913136, "grad_norm": 0.2191256436071045, "learning_rate": 2.885035499726925e-05, "loss": 0.4667, "step": 9780 }, { "epoch": 2.4054552156284554, "grad_norm": 0.20825703173254825, "learning_rate": 2.882304751501912e-05, "loss": 0.4679, "step": 9790 }, { "epoch": 2.4079125199655977, "grad_norm": 0.24953351689092793, "learning_rate": 2.879574003276898e-05, "loss": 0.4704, "step": 9800 }, { "epoch": 2.41036982430274, "grad_norm": 0.2626174389252187, "learning_rate": 2.8768432550518844e-05, "loss": 0.4534, "step": 9810 }, { "epoch": 2.412827128639882, "grad_norm": 0.21775028068831487, "learning_rate": 2.874112506826871e-05, "loss": 0.4512, "step": 9820 }, { "epoch": 2.415284432977024, "grad_norm": 0.22850122709656226, "learning_rate": 2.871381758601857e-05, "loss": 0.4824, "step": 9830 }, { "epoch": 2.4177417373141665, "grad_norm": 0.2163749632280222, "learning_rate": 2.8686510103768432e-05, "loss": 0.4579, "step": 9840 }, { "epoch": 2.4201990416513084, "grad_norm": 0.21932096873256862, "learning_rate": 2.86592026215183e-05, "loss": 0.465, "step": 9850 }, { "epoch": 2.4226563459884507, "grad_norm": 0.20647093357580845, "learning_rate": 2.863189513926816e-05, "loss": 0.4722, "step": 9860 }, { "epoch": 2.425113650325593, "grad_norm": 0.22502773427875142, "learning_rate": 2.8604587657018024e-05, "loss": 0.4577, "step": 9870 }, { "epoch": 2.427570954662735, "grad_norm": 0.21691710304908415, "learning_rate": 2.8577280174767888e-05, "loss": 0.4701, "step": 9880 }, { "epoch": 2.430028258999877, "grad_norm": 0.22124865601315866, "learning_rate": 2.8549972692517752e-05, "loss": 0.4834, "step": 9890 }, { "epoch": 2.4324855633370195, "grad_norm": 0.2237834724562597, "learning_rate": 2.8522665210267613e-05, "loss": 0.4739, "step": 9900 }, { "epoch": 2.4349428676741613, "grad_norm": 0.21320464566981032, "learning_rate": 2.849535772801748e-05, "loss": 0.4579, "step": 9910 }, { "epoch": 2.4374001720113037, "grad_norm": 0.18641182491970817, "learning_rate": 2.846805024576734e-05, "loss": 0.4709, "step": 9920 }, { "epoch": 2.439857476348446, "grad_norm": 0.23624947432742907, "learning_rate": 2.84407427635172e-05, "loss": 0.4509, "step": 9930 }, { "epoch": 2.442314780685588, "grad_norm": 0.2154399637239965, "learning_rate": 2.841343528126707e-05, "loss": 0.4791, "step": 9940 }, { "epoch": 2.44477208502273, "grad_norm": 0.21523347576040358, "learning_rate": 2.8386127799016933e-05, "loss": 0.4805, "step": 9950 }, { "epoch": 2.4472293893598724, "grad_norm": 0.24296455141804477, "learning_rate": 2.8358820316766793e-05, "loss": 0.4521, "step": 9960 }, { "epoch": 2.4496866936970143, "grad_norm": 0.17776166723488843, "learning_rate": 2.833151283451666e-05, "loss": 0.4677, "step": 9970 }, { "epoch": 2.4521439980341566, "grad_norm": 0.22172957175169605, "learning_rate": 2.830420535226652e-05, "loss": 0.4732, "step": 9980 }, { "epoch": 2.454601302371299, "grad_norm": 0.21809738340511675, "learning_rate": 2.8276897870016382e-05, "loss": 0.4556, "step": 9990 }, { "epoch": 2.457058606708441, "grad_norm": 0.20690007517958336, "learning_rate": 2.824959038776625e-05, "loss": 0.4657, "step": 10000 }, { "epoch": 2.459515911045583, "grad_norm": 0.22266070211480957, "learning_rate": 2.8222282905516113e-05, "loss": 0.4704, "step": 10010 }, { "epoch": 2.4619732153827254, "grad_norm": 0.19649618045383402, "learning_rate": 2.8194975423265974e-05, "loss": 0.4555, "step": 10020 }, { "epoch": 2.4644305197198673, "grad_norm": 0.2346442795135612, "learning_rate": 2.816766794101584e-05, "loss": 0.4705, "step": 10030 }, { "epoch": 2.4668878240570096, "grad_norm": 0.20052857734231383, "learning_rate": 2.8140360458765702e-05, "loss": 0.4712, "step": 10040 }, { "epoch": 2.4693451283941514, "grad_norm": 0.18357183952465753, "learning_rate": 2.811305297651557e-05, "loss": 0.4692, "step": 10050 }, { "epoch": 2.4718024327312937, "grad_norm": 0.220758730326293, "learning_rate": 2.808574549426543e-05, "loss": 0.4553, "step": 10060 }, { "epoch": 2.474259737068436, "grad_norm": 0.2237107247558613, "learning_rate": 2.8058438012015294e-05, "loss": 0.4598, "step": 10070 }, { "epoch": 2.476717041405578, "grad_norm": 0.2762488388294683, "learning_rate": 2.8031130529765158e-05, "loss": 0.4715, "step": 10080 }, { "epoch": 2.47917434574272, "grad_norm": 0.2071783536356249, "learning_rate": 2.8003823047515022e-05, "loss": 0.4738, "step": 10090 }, { "epoch": 2.4816316500798625, "grad_norm": 0.24421587828193858, "learning_rate": 2.7976515565264883e-05, "loss": 0.4577, "step": 10100 }, { "epoch": 2.4840889544170044, "grad_norm": 0.20951229601352675, "learning_rate": 2.794920808301475e-05, "loss": 0.4769, "step": 10110 }, { "epoch": 2.4865462587541467, "grad_norm": 0.21936863146217012, "learning_rate": 2.792190060076461e-05, "loss": 0.4715, "step": 10120 }, { "epoch": 2.489003563091289, "grad_norm": 0.21809188078942418, "learning_rate": 2.789459311851447e-05, "loss": 0.4775, "step": 10130 }, { "epoch": 2.491460867428431, "grad_norm": 0.22640145345122423, "learning_rate": 2.786728563626434e-05, "loss": 0.4647, "step": 10140 }, { "epoch": 2.493918171765573, "grad_norm": 0.2509651004134882, "learning_rate": 2.7839978154014203e-05, "loss": 0.4897, "step": 10150 }, { "epoch": 2.4963754761027155, "grad_norm": 0.21823193300723234, "learning_rate": 2.7812670671764063e-05, "loss": 0.4629, "step": 10160 }, { "epoch": 2.4988327804398573, "grad_norm": 0.2319218904874237, "learning_rate": 2.778536318951393e-05, "loss": 0.4639, "step": 10170 }, { "epoch": 2.5012900847769997, "grad_norm": 0.21999698649608096, "learning_rate": 2.775805570726379e-05, "loss": 0.4847, "step": 10180 }, { "epoch": 2.5037473891141415, "grad_norm": 0.21381092676617208, "learning_rate": 2.7730748225013652e-05, "loss": 0.4426, "step": 10190 }, { "epoch": 2.506204693451284, "grad_norm": 0.26988792973054065, "learning_rate": 2.770344074276352e-05, "loss": 0.4478, "step": 10200 }, { "epoch": 2.508661997788426, "grad_norm": 0.22915102532676257, "learning_rate": 2.7676133260513383e-05, "loss": 0.4574, "step": 10210 }, { "epoch": 2.511119302125568, "grad_norm": 0.25018050575650014, "learning_rate": 2.7648825778263244e-05, "loss": 0.4605, "step": 10220 }, { "epoch": 2.5135766064627103, "grad_norm": 0.24686072594090294, "learning_rate": 2.762151829601311e-05, "loss": 0.4681, "step": 10230 }, { "epoch": 2.5160339107998526, "grad_norm": 0.2529645320608986, "learning_rate": 2.7594210813762972e-05, "loss": 0.4719, "step": 10240 }, { "epoch": 2.5184912151369945, "grad_norm": 0.2600241665141402, "learning_rate": 2.7566903331512832e-05, "loss": 0.4816, "step": 10250 }, { "epoch": 2.520948519474137, "grad_norm": 0.23455308182830673, "learning_rate": 2.75395958492627e-05, "loss": 0.4855, "step": 10260 }, { "epoch": 2.523405823811279, "grad_norm": 0.2420418742415749, "learning_rate": 2.751228836701256e-05, "loss": 0.4481, "step": 10270 }, { "epoch": 2.525863128148421, "grad_norm": 0.2134395080173078, "learning_rate": 2.7484980884762424e-05, "loss": 0.4684, "step": 10280 }, { "epoch": 2.5283204324855633, "grad_norm": 0.22622536091407153, "learning_rate": 2.7457673402512292e-05, "loss": 0.4694, "step": 10290 }, { "epoch": 2.5307777368227056, "grad_norm": 0.2441588975798663, "learning_rate": 2.7430365920262152e-05, "loss": 0.4686, "step": 10300 }, { "epoch": 2.5332350411598474, "grad_norm": 0.20571943078847643, "learning_rate": 2.740305843801202e-05, "loss": 0.464, "step": 10310 }, { "epoch": 2.5356923454969897, "grad_norm": 0.23374607837285494, "learning_rate": 2.737575095576188e-05, "loss": 0.4664, "step": 10320 }, { "epoch": 2.538149649834132, "grad_norm": 0.2213006996242281, "learning_rate": 2.734844347351174e-05, "loss": 0.4658, "step": 10330 }, { "epoch": 2.540606954171274, "grad_norm": 0.2412367626417784, "learning_rate": 2.732113599126161e-05, "loss": 0.4767, "step": 10340 }, { "epoch": 2.543064258508416, "grad_norm": 0.23114277570559297, "learning_rate": 2.7293828509011472e-05, "loss": 0.4921, "step": 10350 }, { "epoch": 2.5455215628455585, "grad_norm": 0.21854656809759296, "learning_rate": 2.7266521026761333e-05, "loss": 0.4631, "step": 10360 }, { "epoch": 2.5479788671827004, "grad_norm": 0.23092469766860044, "learning_rate": 2.72392135445112e-05, "loss": 0.4581, "step": 10370 }, { "epoch": 2.5504361715198427, "grad_norm": 0.1845007041790498, "learning_rate": 2.721190606226106e-05, "loss": 0.4711, "step": 10380 }, { "epoch": 2.552893475856985, "grad_norm": 0.202452867891502, "learning_rate": 2.7184598580010922e-05, "loss": 0.4493, "step": 10390 }, { "epoch": 2.555350780194127, "grad_norm": 0.2196571239048975, "learning_rate": 2.715729109776079e-05, "loss": 0.4832, "step": 10400 }, { "epoch": 2.557808084531269, "grad_norm": 0.20375482187894864, "learning_rate": 2.712998361551065e-05, "loss": 0.4803, "step": 10410 }, { "epoch": 2.5602653888684115, "grad_norm": 0.24921466209782642, "learning_rate": 2.7102676133260514e-05, "loss": 0.4516, "step": 10420 }, { "epoch": 2.5627226932055533, "grad_norm": 0.19288419911506707, "learning_rate": 2.707536865101038e-05, "loss": 0.4898, "step": 10430 }, { "epoch": 2.5651799975426957, "grad_norm": 0.21646312564908032, "learning_rate": 2.7048061168760242e-05, "loss": 0.4602, "step": 10440 }, { "epoch": 2.567637301879838, "grad_norm": 0.2390376963043931, "learning_rate": 2.7020753686510102e-05, "loss": 0.4798, "step": 10450 }, { "epoch": 2.57009460621698, "grad_norm": 0.19881089859221437, "learning_rate": 2.699344620425997e-05, "loss": 0.4649, "step": 10460 }, { "epoch": 2.572551910554122, "grad_norm": 0.2301432266004415, "learning_rate": 2.696613872200983e-05, "loss": 0.4738, "step": 10470 }, { "epoch": 2.5750092148912644, "grad_norm": 0.2139526670454026, "learning_rate": 2.6938831239759694e-05, "loss": 0.4567, "step": 10480 }, { "epoch": 2.5774665192284063, "grad_norm": 0.19092657514377476, "learning_rate": 2.6911523757509562e-05, "loss": 0.4571, "step": 10490 }, { "epoch": 2.5799238235655486, "grad_norm": 0.22645174124588344, "learning_rate": 2.6884216275259422e-05, "loss": 0.4775, "step": 10500 }, { "epoch": 2.582381127902691, "grad_norm": 0.2139432266960397, "learning_rate": 2.6856908793009283e-05, "loss": 0.4645, "step": 10510 }, { "epoch": 2.584838432239833, "grad_norm": 0.21584072550069966, "learning_rate": 2.682960131075915e-05, "loss": 0.4626, "step": 10520 }, { "epoch": 2.587295736576975, "grad_norm": 0.26123316375957645, "learning_rate": 2.680229382850901e-05, "loss": 0.4521, "step": 10530 }, { "epoch": 2.5897530409141174, "grad_norm": 0.2445779637845101, "learning_rate": 2.6774986346258875e-05, "loss": 0.4617, "step": 10540 }, { "epoch": 2.5922103452512593, "grad_norm": 0.23686537022855808, "learning_rate": 2.674767886400874e-05, "loss": 0.4528, "step": 10550 }, { "epoch": 2.5946676495884016, "grad_norm": 0.2552385384054787, "learning_rate": 2.6720371381758603e-05, "loss": 0.4573, "step": 10560 }, { "epoch": 2.597124953925544, "grad_norm": 0.25545927418183295, "learning_rate": 2.669306389950847e-05, "loss": 0.4569, "step": 10570 }, { "epoch": 2.5995822582626857, "grad_norm": 0.23258003918795944, "learning_rate": 2.666575641725833e-05, "loss": 0.4652, "step": 10580 }, { "epoch": 2.602039562599828, "grad_norm": 0.20853980956710347, "learning_rate": 2.663844893500819e-05, "loss": 0.4483, "step": 10590 }, { "epoch": 2.6044968669369704, "grad_norm": 0.23950914878509066, "learning_rate": 2.661114145275806e-05, "loss": 0.4656, "step": 10600 }, { "epoch": 2.606954171274112, "grad_norm": 0.22897057001152737, "learning_rate": 2.658383397050792e-05, "loss": 0.4757, "step": 10610 }, { "epoch": 2.6094114756112545, "grad_norm": 0.21781206924212018, "learning_rate": 2.6556526488257784e-05, "loss": 0.4675, "step": 10620 }, { "epoch": 2.611868779948397, "grad_norm": 0.1905524058513491, "learning_rate": 2.652921900600765e-05, "loss": 0.4708, "step": 10630 }, { "epoch": 2.6143260842855387, "grad_norm": 0.22207840169630597, "learning_rate": 2.650191152375751e-05, "loss": 0.4553, "step": 10640 }, { "epoch": 2.616783388622681, "grad_norm": 0.2021740679263663, "learning_rate": 2.6474604041507372e-05, "loss": 0.4457, "step": 10650 }, { "epoch": 2.6192406929598233, "grad_norm": 0.22861214123753582, "learning_rate": 2.644729655925724e-05, "loss": 0.4593, "step": 10660 }, { "epoch": 2.621697997296965, "grad_norm": 0.25662799368408, "learning_rate": 2.64199890770071e-05, "loss": 0.4716, "step": 10670 }, { "epoch": 2.6241553016341075, "grad_norm": 0.2165868326117391, "learning_rate": 2.6392681594756964e-05, "loss": 0.4719, "step": 10680 }, { "epoch": 2.62661260597125, "grad_norm": 0.24212767734985405, "learning_rate": 2.6365374112506828e-05, "loss": 0.4688, "step": 10690 }, { "epoch": 2.6290699103083917, "grad_norm": 0.1960141004216827, "learning_rate": 2.6338066630256692e-05, "loss": 0.4779, "step": 10700 }, { "epoch": 2.631527214645534, "grad_norm": 0.2326514129470284, "learning_rate": 2.6310759148006553e-05, "loss": 0.4797, "step": 10710 }, { "epoch": 2.6339845189826763, "grad_norm": 0.20593800706104412, "learning_rate": 2.628345166575642e-05, "loss": 0.4355, "step": 10720 }, { "epoch": 2.636441823319818, "grad_norm": 0.20045390325856555, "learning_rate": 2.625614418350628e-05, "loss": 0.4545, "step": 10730 }, { "epoch": 2.6388991276569604, "grad_norm": 0.2255607179501345, "learning_rate": 2.622883670125614e-05, "loss": 0.4666, "step": 10740 }, { "epoch": 2.6413564319941023, "grad_norm": 0.261981776528225, "learning_rate": 2.620152921900601e-05, "loss": 0.4669, "step": 10750 }, { "epoch": 2.6438137363312446, "grad_norm": 0.2177009091413204, "learning_rate": 2.6174221736755873e-05, "loss": 0.457, "step": 10760 }, { "epoch": 2.646271040668387, "grad_norm": 0.23077375231931144, "learning_rate": 2.6146914254505733e-05, "loss": 0.4423, "step": 10770 }, { "epoch": 2.648728345005529, "grad_norm": 0.22651590587471418, "learning_rate": 2.61196067722556e-05, "loss": 0.4532, "step": 10780 }, { "epoch": 2.651185649342671, "grad_norm": 0.2109609900571008, "learning_rate": 2.609229929000546e-05, "loss": 0.466, "step": 10790 }, { "epoch": 2.6536429536798134, "grad_norm": 0.22906248684712918, "learning_rate": 2.6064991807755322e-05, "loss": 0.4792, "step": 10800 }, { "epoch": 2.6561002580169553, "grad_norm": 0.3504019249759813, "learning_rate": 2.603768432550519e-05, "loss": 0.4763, "step": 10810 }, { "epoch": 2.6585575623540976, "grad_norm": 0.2531891225283304, "learning_rate": 2.6010376843255053e-05, "loss": 0.4602, "step": 10820 }, { "epoch": 2.6610148666912394, "grad_norm": 0.22462378645769038, "learning_rate": 2.598306936100492e-05, "loss": 0.4729, "step": 10830 }, { "epoch": 2.6634721710283817, "grad_norm": 0.22586861893334653, "learning_rate": 2.595576187875478e-05, "loss": 0.4677, "step": 10840 }, { "epoch": 2.665929475365524, "grad_norm": 0.23706843666335933, "learning_rate": 2.5928454396504642e-05, "loss": 0.4581, "step": 10850 }, { "epoch": 2.668386779702666, "grad_norm": 0.2265373918236112, "learning_rate": 2.590114691425451e-05, "loss": 0.4673, "step": 10860 }, { "epoch": 2.670844084039808, "grad_norm": 0.23087856244351537, "learning_rate": 2.587383943200437e-05, "loss": 0.4579, "step": 10870 }, { "epoch": 2.6733013883769505, "grad_norm": 0.25859560503086265, "learning_rate": 2.5846531949754234e-05, "loss": 0.4729, "step": 10880 }, { "epoch": 2.6757586927140924, "grad_norm": 0.2364989776258722, "learning_rate": 2.5819224467504098e-05, "loss": 0.4725, "step": 10890 }, { "epoch": 2.6782159970512347, "grad_norm": 0.20948851813664335, "learning_rate": 2.5791916985253962e-05, "loss": 0.469, "step": 10900 }, { "epoch": 2.680673301388377, "grad_norm": 0.20607392261011773, "learning_rate": 2.5764609503003823e-05, "loss": 0.4582, "step": 10910 }, { "epoch": 2.683130605725519, "grad_norm": 0.228259039367775, "learning_rate": 2.573730202075369e-05, "loss": 0.4557, "step": 10920 }, { "epoch": 2.685587910062661, "grad_norm": 0.24602385919646583, "learning_rate": 2.570999453850355e-05, "loss": 0.4714, "step": 10930 }, { "epoch": 2.6880452143998035, "grad_norm": 0.22795781038461269, "learning_rate": 2.568268705625341e-05, "loss": 0.4747, "step": 10940 }, { "epoch": 2.6905025187369453, "grad_norm": 0.20554486993246518, "learning_rate": 2.565537957400328e-05, "loss": 0.4778, "step": 10950 }, { "epoch": 2.6929598230740877, "grad_norm": 0.25843744138894226, "learning_rate": 2.5628072091753143e-05, "loss": 0.4741, "step": 10960 }, { "epoch": 2.69541712741123, "grad_norm": 0.2246816562714157, "learning_rate": 2.5600764609503003e-05, "loss": 0.4556, "step": 10970 }, { "epoch": 2.697874431748372, "grad_norm": 0.2490862852528546, "learning_rate": 2.557345712725287e-05, "loss": 0.4743, "step": 10980 }, { "epoch": 2.700331736085514, "grad_norm": 0.21871624383009478, "learning_rate": 2.554614964500273e-05, "loss": 0.4775, "step": 10990 }, { "epoch": 2.7027890404226564, "grad_norm": 0.24321681439032125, "learning_rate": 2.5518842162752592e-05, "loss": 0.4648, "step": 11000 }, { "epoch": 2.7052463447597983, "grad_norm": 0.20765557450651592, "learning_rate": 2.549153468050246e-05, "loss": 0.4605, "step": 11010 }, { "epoch": 2.7077036490969406, "grad_norm": 0.22696724395238108, "learning_rate": 2.5464227198252323e-05, "loss": 0.4576, "step": 11020 }, { "epoch": 2.710160953434083, "grad_norm": 0.2422689254963273, "learning_rate": 2.5436919716002184e-05, "loss": 0.4657, "step": 11030 }, { "epoch": 2.712618257771225, "grad_norm": 0.19781212994657302, "learning_rate": 2.540961223375205e-05, "loss": 0.475, "step": 11040 }, { "epoch": 2.715075562108367, "grad_norm": 0.21050718478427133, "learning_rate": 2.5382304751501912e-05, "loss": 0.4471, "step": 11050 }, { "epoch": 2.7175328664455094, "grad_norm": 0.2512463510057314, "learning_rate": 2.5354997269251773e-05, "loss": 0.468, "step": 11060 }, { "epoch": 2.7199901707826513, "grad_norm": 0.2144661667613184, "learning_rate": 2.532768978700164e-05, "loss": 0.4635, "step": 11070 }, { "epoch": 2.7224474751197936, "grad_norm": 0.23945582403769308, "learning_rate": 2.53003823047515e-05, "loss": 0.4733, "step": 11080 }, { "epoch": 2.724904779456936, "grad_norm": 0.22814232599781328, "learning_rate": 2.5273074822501368e-05, "loss": 0.4653, "step": 11090 }, { "epoch": 2.7273620837940777, "grad_norm": 0.21902481649177044, "learning_rate": 2.5245767340251232e-05, "loss": 0.4577, "step": 11100 }, { "epoch": 2.72981938813122, "grad_norm": 0.23957466060802232, "learning_rate": 2.5218459858001093e-05, "loss": 0.4518, "step": 11110 }, { "epoch": 2.7322766924683624, "grad_norm": 0.23924971206057638, "learning_rate": 2.519115237575096e-05, "loss": 0.4543, "step": 11120 }, { "epoch": 2.734733996805504, "grad_norm": 0.23320729986897504, "learning_rate": 2.516384489350082e-05, "loss": 0.4557, "step": 11130 }, { "epoch": 2.7371913011426465, "grad_norm": 0.22017370462491218, "learning_rate": 2.513653741125068e-05, "loss": 0.4506, "step": 11140 }, { "epoch": 2.739648605479789, "grad_norm": 0.2671906809433662, "learning_rate": 2.510922992900055e-05, "loss": 0.4551, "step": 11150 }, { "epoch": 2.7421059098169307, "grad_norm": 0.2978727345939552, "learning_rate": 2.5081922446750413e-05, "loss": 0.4649, "step": 11160 }, { "epoch": 2.744563214154073, "grad_norm": 0.22725677104535802, "learning_rate": 2.5054614964500273e-05, "loss": 0.4695, "step": 11170 }, { "epoch": 2.7470205184912153, "grad_norm": 0.20846668596290582, "learning_rate": 2.502730748225014e-05, "loss": 0.4664, "step": 11180 }, { "epoch": 2.749477822828357, "grad_norm": 0.23110597728177495, "learning_rate": 2.5e-05, "loss": 0.4731, "step": 11190 }, { "epoch": 2.7519351271654995, "grad_norm": 0.22888504120782915, "learning_rate": 2.4972692517749865e-05, "loss": 0.4584, "step": 11200 }, { "epoch": 2.754392431502642, "grad_norm": 0.33508731228475624, "learning_rate": 2.4945385035499726e-05, "loss": 0.4541, "step": 11210 }, { "epoch": 2.7568497358397837, "grad_norm": 0.2284417942231712, "learning_rate": 2.491807755324959e-05, "loss": 0.4553, "step": 11220 }, { "epoch": 2.759307040176926, "grad_norm": 0.22688977991403916, "learning_rate": 2.4890770070999457e-05, "loss": 0.4754, "step": 11230 }, { "epoch": 2.7617643445140683, "grad_norm": 0.21707236390689819, "learning_rate": 2.4863462588749318e-05, "loss": 0.4716, "step": 11240 }, { "epoch": 2.76422164885121, "grad_norm": 0.22338931194832995, "learning_rate": 2.4836155106499182e-05, "loss": 0.4561, "step": 11250 }, { "epoch": 2.7666789531883524, "grad_norm": 0.22248712837062729, "learning_rate": 2.4808847624249046e-05, "loss": 0.4508, "step": 11260 }, { "epoch": 2.7691362575254947, "grad_norm": 0.23691805886152753, "learning_rate": 2.478154014199891e-05, "loss": 0.445, "step": 11270 }, { "epoch": 2.7715935618626366, "grad_norm": 0.2421359495758341, "learning_rate": 2.475423265974877e-05, "loss": 0.4563, "step": 11280 }, { "epoch": 2.774050866199779, "grad_norm": 0.23933861798778341, "learning_rate": 2.4726925177498634e-05, "loss": 0.4605, "step": 11290 }, { "epoch": 2.7765081705369212, "grad_norm": 0.24004477134817498, "learning_rate": 2.4699617695248502e-05, "loss": 0.4701, "step": 11300 }, { "epoch": 2.778965474874063, "grad_norm": 0.23928401177380876, "learning_rate": 2.4672310212998362e-05, "loss": 0.4713, "step": 11310 }, { "epoch": 2.7814227792112054, "grad_norm": 0.20952976088205574, "learning_rate": 2.4645002730748226e-05, "loss": 0.4441, "step": 11320 }, { "epoch": 2.7838800835483477, "grad_norm": 0.2355515118668955, "learning_rate": 2.461769524849809e-05, "loss": 0.4772, "step": 11330 }, { "epoch": 2.7863373878854896, "grad_norm": 0.2178396537387694, "learning_rate": 2.459038776624795e-05, "loss": 0.451, "step": 11340 }, { "epoch": 2.788794692222632, "grad_norm": 0.23067088551522177, "learning_rate": 2.4563080283997815e-05, "loss": 0.4627, "step": 11350 }, { "epoch": 2.791251996559774, "grad_norm": 0.2509261554564765, "learning_rate": 2.453577280174768e-05, "loss": 0.481, "step": 11360 }, { "epoch": 2.793709300896916, "grad_norm": 0.24269782919960514, "learning_rate": 2.4508465319497543e-05, "loss": 0.4789, "step": 11370 }, { "epoch": 2.7961666052340584, "grad_norm": 0.21925422695155256, "learning_rate": 2.4481157837247407e-05, "loss": 0.4727, "step": 11380 }, { "epoch": 2.7986239095712007, "grad_norm": 0.22637248073412622, "learning_rate": 2.445385035499727e-05, "loss": 0.4642, "step": 11390 }, { "epoch": 2.8010812139083425, "grad_norm": 0.2451570081530454, "learning_rate": 2.4426542872747135e-05, "loss": 0.4895, "step": 11400 }, { "epoch": 2.803538518245485, "grad_norm": 0.2338576204266014, "learning_rate": 2.4399235390496996e-05, "loss": 0.4832, "step": 11410 }, { "epoch": 2.8059958225826267, "grad_norm": 0.2064947716478111, "learning_rate": 2.437192790824686e-05, "loss": 0.4573, "step": 11420 }, { "epoch": 2.808453126919769, "grad_norm": 0.23523939754835027, "learning_rate": 2.4344620425996724e-05, "loss": 0.4716, "step": 11430 }, { "epoch": 2.8109104312569113, "grad_norm": 0.22853851361934993, "learning_rate": 2.4317312943746588e-05, "loss": 0.4562, "step": 11440 }, { "epoch": 2.813367735594053, "grad_norm": 0.2100166697329639, "learning_rate": 2.429000546149645e-05, "loss": 0.4606, "step": 11450 }, { "epoch": 2.8158250399311955, "grad_norm": 0.23948757484558067, "learning_rate": 2.4262697979246316e-05, "loss": 0.4841, "step": 11460 }, { "epoch": 2.818282344268338, "grad_norm": 0.22010864630970262, "learning_rate": 2.4235390496996176e-05, "loss": 0.4614, "step": 11470 }, { "epoch": 2.8207396486054797, "grad_norm": 0.22816940138888006, "learning_rate": 2.420808301474604e-05, "loss": 0.4649, "step": 11480 }, { "epoch": 2.823196952942622, "grad_norm": 0.24507040152914447, "learning_rate": 2.4180775532495904e-05, "loss": 0.4558, "step": 11490 }, { "epoch": 2.825654257279764, "grad_norm": 0.2842372069281166, "learning_rate": 2.415346805024577e-05, "loss": 0.4655, "step": 11500 }, { "epoch": 2.828111561616906, "grad_norm": 0.21684619353525153, "learning_rate": 2.4126160567995632e-05, "loss": 0.4466, "step": 11510 }, { "epoch": 2.8305688659540484, "grad_norm": 0.21195614114810418, "learning_rate": 2.4098853085745496e-05, "loss": 0.4557, "step": 11520 }, { "epoch": 2.8330261702911903, "grad_norm": 0.2250359742697813, "learning_rate": 2.407154560349536e-05, "loss": 0.4671, "step": 11530 }, { "epoch": 2.8354834746283326, "grad_norm": 0.2435537452719269, "learning_rate": 2.404423812124522e-05, "loss": 0.477, "step": 11540 }, { "epoch": 2.837940778965475, "grad_norm": 0.2354739626773478, "learning_rate": 2.4016930638995085e-05, "loss": 0.4729, "step": 11550 }, { "epoch": 2.840398083302617, "grad_norm": 0.23471570402166542, "learning_rate": 2.398962315674495e-05, "loss": 0.4646, "step": 11560 }, { "epoch": 2.842855387639759, "grad_norm": 0.21258489839169506, "learning_rate": 2.3962315674494813e-05, "loss": 0.4589, "step": 11570 }, { "epoch": 2.8453126919769014, "grad_norm": 0.213034517784611, "learning_rate": 2.3935008192244677e-05, "loss": 0.4604, "step": 11580 }, { "epoch": 2.8477699963140433, "grad_norm": 0.26516479615979816, "learning_rate": 2.390770070999454e-05, "loss": 0.4571, "step": 11590 }, { "epoch": 2.8502273006511856, "grad_norm": 0.20631162665893307, "learning_rate": 2.38803932277444e-05, "loss": 0.4588, "step": 11600 }, { "epoch": 2.852684604988328, "grad_norm": 0.25308203193111467, "learning_rate": 2.3853085745494266e-05, "loss": 0.4421, "step": 11610 }, { "epoch": 2.8551419093254697, "grad_norm": 0.2507822165130935, "learning_rate": 2.382577826324413e-05, "loss": 0.4533, "step": 11620 }, { "epoch": 2.857599213662612, "grad_norm": 0.2267999606070306, "learning_rate": 2.3798470780993994e-05, "loss": 0.4625, "step": 11630 }, { "epoch": 2.8600565179997544, "grad_norm": 0.25426034583366175, "learning_rate": 2.3771163298743858e-05, "loss": 0.4454, "step": 11640 }, { "epoch": 2.862513822336896, "grad_norm": 0.21918307999875095, "learning_rate": 2.374385581649372e-05, "loss": 0.456, "step": 11650 }, { "epoch": 2.8649711266740385, "grad_norm": 0.21971579193869895, "learning_rate": 2.3716548334243586e-05, "loss": 0.4664, "step": 11660 }, { "epoch": 2.867428431011181, "grad_norm": 0.2453941909231234, "learning_rate": 2.3689240851993446e-05, "loss": 0.4562, "step": 11670 }, { "epoch": 2.8698857353483227, "grad_norm": 0.20849625950744416, "learning_rate": 2.366193336974331e-05, "loss": 0.4628, "step": 11680 }, { "epoch": 2.872343039685465, "grad_norm": 0.20945846120353087, "learning_rate": 2.3634625887493174e-05, "loss": 0.46, "step": 11690 }, { "epoch": 2.8748003440226073, "grad_norm": 0.20374445014350315, "learning_rate": 2.3607318405243038e-05, "loss": 0.4572, "step": 11700 }, { "epoch": 2.877257648359749, "grad_norm": 0.24803340837557183, "learning_rate": 2.3580010922992902e-05, "loss": 0.4815, "step": 11710 }, { "epoch": 2.8797149526968915, "grad_norm": 0.21725993445276667, "learning_rate": 2.3552703440742766e-05, "loss": 0.4547, "step": 11720 }, { "epoch": 2.882172257034034, "grad_norm": 0.22305091649028605, "learning_rate": 2.3525395958492627e-05, "loss": 0.4556, "step": 11730 }, { "epoch": 2.8846295613711757, "grad_norm": 0.2784909859098821, "learning_rate": 2.349808847624249e-05, "loss": 0.4683, "step": 11740 }, { "epoch": 2.887086865708318, "grad_norm": 0.2286094150131365, "learning_rate": 2.3470780993992355e-05, "loss": 0.4479, "step": 11750 }, { "epoch": 2.8895441700454603, "grad_norm": 0.2510908291046141, "learning_rate": 2.344347351174222e-05, "loss": 0.4477, "step": 11760 }, { "epoch": 2.892001474382602, "grad_norm": 0.21649830652947094, "learning_rate": 2.3416166029492083e-05, "loss": 0.46, "step": 11770 }, { "epoch": 2.8944587787197444, "grad_norm": 0.23760370344348888, "learning_rate": 2.3388858547241947e-05, "loss": 0.4608, "step": 11780 }, { "epoch": 2.8969160830568867, "grad_norm": 0.2339260116456388, "learning_rate": 2.336155106499181e-05, "loss": 0.4601, "step": 11790 }, { "epoch": 2.8993733873940286, "grad_norm": 0.24291516603639388, "learning_rate": 2.333424358274167e-05, "loss": 0.4466, "step": 11800 }, { "epoch": 2.901830691731171, "grad_norm": 0.2344808303473327, "learning_rate": 2.3306936100491535e-05, "loss": 0.4595, "step": 11810 }, { "epoch": 2.9042879960683132, "grad_norm": 0.22316723292680377, "learning_rate": 2.32796286182414e-05, "loss": 0.453, "step": 11820 }, { "epoch": 2.906745300405455, "grad_norm": 0.2063620870885038, "learning_rate": 2.3252321135991263e-05, "loss": 0.4593, "step": 11830 }, { "epoch": 2.9092026047425974, "grad_norm": 0.22769350389631834, "learning_rate": 2.3225013653741127e-05, "loss": 0.4775, "step": 11840 }, { "epoch": 2.9116599090797397, "grad_norm": 0.18528211054080806, "learning_rate": 2.319770617149099e-05, "loss": 0.4536, "step": 11850 }, { "epoch": 2.9141172134168816, "grad_norm": 0.23285513173201985, "learning_rate": 2.3170398689240852e-05, "loss": 0.4726, "step": 11860 }, { "epoch": 2.916574517754024, "grad_norm": 0.2114740879538715, "learning_rate": 2.3143091206990716e-05, "loss": 0.4529, "step": 11870 }, { "epoch": 2.919031822091166, "grad_norm": 0.2091196434239753, "learning_rate": 2.311578372474058e-05, "loss": 0.4741, "step": 11880 }, { "epoch": 2.921489126428308, "grad_norm": 0.21420641196816947, "learning_rate": 2.308847624249044e-05, "loss": 0.4622, "step": 11890 }, { "epoch": 2.9239464307654504, "grad_norm": 0.23952376823875515, "learning_rate": 2.3061168760240308e-05, "loss": 0.4696, "step": 11900 }, { "epoch": 2.9264037351025927, "grad_norm": 0.2426519486657896, "learning_rate": 2.3033861277990172e-05, "loss": 0.4479, "step": 11910 }, { "epoch": 2.9288610394397345, "grad_norm": 0.21263334217629407, "learning_rate": 2.3006553795740036e-05, "loss": 0.4616, "step": 11920 }, { "epoch": 2.931318343776877, "grad_norm": 0.22499252367499067, "learning_rate": 2.2979246313489897e-05, "loss": 0.4701, "step": 11930 }, { "epoch": 2.933775648114019, "grad_norm": 0.24343596148806174, "learning_rate": 2.295193883123976e-05, "loss": 0.4512, "step": 11940 }, { "epoch": 2.936232952451161, "grad_norm": 0.24193185414319587, "learning_rate": 2.2924631348989625e-05, "loss": 0.4635, "step": 11950 }, { "epoch": 2.9386902567883033, "grad_norm": 0.25381704670422695, "learning_rate": 2.2897323866739485e-05, "loss": 0.4521, "step": 11960 }, { "epoch": 2.9411475611254456, "grad_norm": 0.26468311590411214, "learning_rate": 2.2870016384489353e-05, "loss": 0.4506, "step": 11970 }, { "epoch": 2.9436048654625875, "grad_norm": 0.26565816826158406, "learning_rate": 2.2842708902239217e-05, "loss": 0.4496, "step": 11980 }, { "epoch": 2.94606216979973, "grad_norm": 0.26263117421351556, "learning_rate": 2.2815401419989077e-05, "loss": 0.4741, "step": 11990 }, { "epoch": 2.948519474136872, "grad_norm": 0.21447849436153582, "learning_rate": 2.278809393773894e-05, "loss": 0.4724, "step": 12000 }, { "epoch": 2.950976778474014, "grad_norm": 0.2312465980251314, "learning_rate": 2.2760786455488805e-05, "loss": 0.4578, "step": 12010 }, { "epoch": 2.9534340828111563, "grad_norm": 0.2411534848000069, "learning_rate": 2.2733478973238666e-05, "loss": 0.4561, "step": 12020 }, { "epoch": 2.9558913871482986, "grad_norm": 0.2340271269347395, "learning_rate": 2.270617149098853e-05, "loss": 0.4454, "step": 12030 }, { "epoch": 2.9583486914854404, "grad_norm": 0.23841214894547633, "learning_rate": 2.2678864008738397e-05, "loss": 0.4767, "step": 12040 }, { "epoch": 2.9608059958225827, "grad_norm": 0.1858187576785494, "learning_rate": 2.265155652648826e-05, "loss": 0.4541, "step": 12050 }, { "epoch": 2.963263300159725, "grad_norm": 0.2505787692145619, "learning_rate": 2.2624249044238122e-05, "loss": 0.4588, "step": 12060 }, { "epoch": 2.965720604496867, "grad_norm": 0.22417559093502415, "learning_rate": 2.2596941561987986e-05, "loss": 0.473, "step": 12070 }, { "epoch": 2.9681779088340092, "grad_norm": 0.22889946323514754, "learning_rate": 2.256963407973785e-05, "loss": 0.4501, "step": 12080 }, { "epoch": 2.970635213171151, "grad_norm": 0.2282025033700709, "learning_rate": 2.254232659748771e-05, "loss": 0.46, "step": 12090 }, { "epoch": 2.9730925175082934, "grad_norm": 0.22833096591180627, "learning_rate": 2.2515019115237575e-05, "loss": 0.4582, "step": 12100 }, { "epoch": 2.9755498218454357, "grad_norm": 0.22739434205539533, "learning_rate": 2.2487711632987442e-05, "loss": 0.4663, "step": 12110 }, { "epoch": 2.9780071261825776, "grad_norm": 0.21799107074553542, "learning_rate": 2.2460404150737303e-05, "loss": 0.4491, "step": 12120 }, { "epoch": 2.98046443051972, "grad_norm": 0.21099099311468147, "learning_rate": 2.2433096668487167e-05, "loss": 0.4645, "step": 12130 }, { "epoch": 2.982921734856862, "grad_norm": 0.19479226409907394, "learning_rate": 2.240578918623703e-05, "loss": 0.488, "step": 12140 }, { "epoch": 2.985379039194004, "grad_norm": 0.22898504442133052, "learning_rate": 2.237848170398689e-05, "loss": 0.4714, "step": 12150 }, { "epoch": 2.9878363435311464, "grad_norm": 0.23355900934210763, "learning_rate": 2.2351174221736755e-05, "loss": 0.4629, "step": 12160 }, { "epoch": 2.990293647868288, "grad_norm": 0.22061999429867898, "learning_rate": 2.232386673948662e-05, "loss": 0.4441, "step": 12170 }, { "epoch": 2.9927509522054305, "grad_norm": 0.20729457426758488, "learning_rate": 2.2296559257236487e-05, "loss": 0.4608, "step": 12180 }, { "epoch": 2.995208256542573, "grad_norm": 0.2481212768297379, "learning_rate": 2.2269251774986347e-05, "loss": 0.4714, "step": 12190 }, { "epoch": 2.9976655608797147, "grad_norm": 0.2294713994090812, "learning_rate": 2.224194429273621e-05, "loss": 0.4691, "step": 12200 }, { "epoch": 3.0, "grad_norm": 0.24903926706663176, "learning_rate": 2.2214636810486075e-05, "loss": 0.4411, "step": 12210 }, { "epoch": 3.0024573043371423, "grad_norm": 0.21708015171816233, "learning_rate": 2.2187329328235936e-05, "loss": 0.4737, "step": 12220 }, { "epoch": 3.004914608674284, "grad_norm": 0.22011816633394107, "learning_rate": 2.21600218459858e-05, "loss": 0.4478, "step": 12230 }, { "epoch": 3.0073719130114265, "grad_norm": 0.20258363899247325, "learning_rate": 2.2132714363735664e-05, "loss": 0.4429, "step": 12240 }, { "epoch": 3.009829217348569, "grad_norm": 0.2296975115282099, "learning_rate": 2.2105406881485528e-05, "loss": 0.4467, "step": 12250 }, { "epoch": 3.0122865216857106, "grad_norm": 0.21867903306383163, "learning_rate": 2.2078099399235392e-05, "loss": 0.4404, "step": 12260 }, { "epoch": 3.014743826022853, "grad_norm": 0.22973268574397432, "learning_rate": 2.2050791916985256e-05, "loss": 0.4527, "step": 12270 }, { "epoch": 3.0172011303599953, "grad_norm": 0.21243708096455463, "learning_rate": 2.2023484434735116e-05, "loss": 0.4586, "step": 12280 }, { "epoch": 3.019658434697137, "grad_norm": 0.22979230292671524, "learning_rate": 2.199617695248498e-05, "loss": 0.4745, "step": 12290 }, { "epoch": 3.0221157390342794, "grad_norm": 0.22150424727046403, "learning_rate": 2.1968869470234844e-05, "loss": 0.4518, "step": 12300 }, { "epoch": 3.0245730433714217, "grad_norm": 0.20710127467764697, "learning_rate": 2.194156198798471e-05, "loss": 0.4435, "step": 12310 }, { "epoch": 3.0270303477085636, "grad_norm": 0.2173721700675865, "learning_rate": 2.1914254505734572e-05, "loss": 0.4706, "step": 12320 }, { "epoch": 3.029487652045706, "grad_norm": 0.21894259614812783, "learning_rate": 2.1886947023484436e-05, "loss": 0.4524, "step": 12330 }, { "epoch": 3.031944956382848, "grad_norm": 0.2228122258094699, "learning_rate": 2.18596395412343e-05, "loss": 0.4698, "step": 12340 }, { "epoch": 3.03440226071999, "grad_norm": 0.25805965103537476, "learning_rate": 2.183233205898416e-05, "loss": 0.4659, "step": 12350 }, { "epoch": 3.0368595650571324, "grad_norm": 0.2067781487726531, "learning_rate": 2.1805024576734025e-05, "loss": 0.4496, "step": 12360 }, { "epoch": 3.0393168693942743, "grad_norm": 0.2689518125307192, "learning_rate": 2.177771709448389e-05, "loss": 0.4533, "step": 12370 }, { "epoch": 3.0417741737314166, "grad_norm": 0.2201055045329148, "learning_rate": 2.1750409612233753e-05, "loss": 0.442, "step": 12380 }, { "epoch": 3.044231478068559, "grad_norm": 0.22365843971098368, "learning_rate": 2.1723102129983617e-05, "loss": 0.4669, "step": 12390 }, { "epoch": 3.0466887824057007, "grad_norm": 0.2366170949389356, "learning_rate": 2.169579464773348e-05, "loss": 0.4514, "step": 12400 }, { "epoch": 3.049146086742843, "grad_norm": 0.23460609357433065, "learning_rate": 2.166848716548334e-05, "loss": 0.4661, "step": 12410 }, { "epoch": 3.0516033910799854, "grad_norm": 0.23281877564803047, "learning_rate": 2.1641179683233206e-05, "loss": 0.4553, "step": 12420 }, { "epoch": 3.054060695417127, "grad_norm": 0.22947652107158387, "learning_rate": 2.161387220098307e-05, "loss": 0.4615, "step": 12430 }, { "epoch": 3.0565179997542695, "grad_norm": 0.25704365460617623, "learning_rate": 2.1586564718732934e-05, "loss": 0.4716, "step": 12440 }, { "epoch": 3.058975304091412, "grad_norm": 0.24039660246570713, "learning_rate": 2.1559257236482798e-05, "loss": 0.4634, "step": 12450 }, { "epoch": 3.0614326084285537, "grad_norm": 0.22758984955022662, "learning_rate": 2.153194975423266e-05, "loss": 0.4586, "step": 12460 }, { "epoch": 3.063889912765696, "grad_norm": 0.32348383980700135, "learning_rate": 2.1504642271982526e-05, "loss": 0.4416, "step": 12470 }, { "epoch": 3.0663472171028383, "grad_norm": 0.23822349623667632, "learning_rate": 2.1477334789732386e-05, "loss": 0.4694, "step": 12480 }, { "epoch": 3.06880452143998, "grad_norm": 0.2298911692261393, "learning_rate": 2.145002730748225e-05, "loss": 0.4614, "step": 12490 }, { "epoch": 3.0712618257771225, "grad_norm": 0.22651333542998098, "learning_rate": 2.1422719825232114e-05, "loss": 0.4847, "step": 12500 }, { "epoch": 3.073719130114265, "grad_norm": 0.2253735457816194, "learning_rate": 2.1395412342981978e-05, "loss": 0.4538, "step": 12510 }, { "epoch": 3.0761764344514066, "grad_norm": 0.22503835013882104, "learning_rate": 2.1368104860731842e-05, "loss": 0.457, "step": 12520 }, { "epoch": 3.078633738788549, "grad_norm": 0.25666613210687045, "learning_rate": 2.1340797378481706e-05, "loss": 0.4441, "step": 12530 }, { "epoch": 3.0810910431256913, "grad_norm": 0.24111184592408527, "learning_rate": 2.1313489896231567e-05, "loss": 0.4679, "step": 12540 }, { "epoch": 3.083548347462833, "grad_norm": 0.18819161003507445, "learning_rate": 2.128618241398143e-05, "loss": 0.4615, "step": 12550 }, { "epoch": 3.0860056517999754, "grad_norm": 0.21744280705457159, "learning_rate": 2.1258874931731295e-05, "loss": 0.4606, "step": 12560 }, { "epoch": 3.0884629561371177, "grad_norm": 0.2257440557095706, "learning_rate": 2.123156744948116e-05, "loss": 0.4644, "step": 12570 }, { "epoch": 3.0909202604742596, "grad_norm": 0.26284789089178284, "learning_rate": 2.1204259967231023e-05, "loss": 0.4521, "step": 12580 }, { "epoch": 3.093377564811402, "grad_norm": 0.2352480598002161, "learning_rate": 2.1176952484980887e-05, "loss": 0.4606, "step": 12590 }, { "epoch": 3.095834869148544, "grad_norm": 0.24732098674810968, "learning_rate": 2.114964500273075e-05, "loss": 0.4419, "step": 12600 }, { "epoch": 3.098292173485686, "grad_norm": 0.19968567046795999, "learning_rate": 2.112233752048061e-05, "loss": 0.4861, "step": 12610 }, { "epoch": 3.1007494778228284, "grad_norm": 0.2301142061006899, "learning_rate": 2.1095030038230476e-05, "loss": 0.457, "step": 12620 }, { "epoch": 3.1032067821599707, "grad_norm": 0.22878573445902786, "learning_rate": 2.106772255598034e-05, "loss": 0.468, "step": 12630 }, { "epoch": 3.1056640864971126, "grad_norm": 0.20147109643431238, "learning_rate": 2.1040415073730204e-05, "loss": 0.4745, "step": 12640 }, { "epoch": 3.108121390834255, "grad_norm": 0.2209661723630014, "learning_rate": 2.1013107591480068e-05, "loss": 0.4616, "step": 12650 }, { "epoch": 3.110578695171397, "grad_norm": 0.2158053521238273, "learning_rate": 2.098580010922993e-05, "loss": 0.4541, "step": 12660 }, { "epoch": 3.113035999508539, "grad_norm": 0.20651699741737886, "learning_rate": 2.0958492626979792e-05, "loss": 0.4527, "step": 12670 }, { "epoch": 3.1154933038456813, "grad_norm": 0.23127276817853465, "learning_rate": 2.0931185144729656e-05, "loss": 0.4635, "step": 12680 }, { "epoch": 3.1179506081828237, "grad_norm": 0.2292072560926833, "learning_rate": 2.090387766247952e-05, "loss": 0.4575, "step": 12690 }, { "epoch": 3.1204079125199655, "grad_norm": 0.21832014524806834, "learning_rate": 2.0876570180229384e-05, "loss": 0.436, "step": 12700 }, { "epoch": 3.122865216857108, "grad_norm": 0.2252325463176511, "learning_rate": 2.0849262697979248e-05, "loss": 0.4539, "step": 12710 }, { "epoch": 3.12532252119425, "grad_norm": 0.24137614469038135, "learning_rate": 2.0821955215729112e-05, "loss": 0.4499, "step": 12720 }, { "epoch": 3.127779825531392, "grad_norm": 0.22656777089738467, "learning_rate": 2.0794647733478976e-05, "loss": 0.4717, "step": 12730 }, { "epoch": 3.1302371298685343, "grad_norm": 0.20905862775947698, "learning_rate": 2.0767340251228837e-05, "loss": 0.4496, "step": 12740 }, { "epoch": 3.132694434205676, "grad_norm": 0.21813456747425986, "learning_rate": 2.07400327689787e-05, "loss": 0.4518, "step": 12750 }, { "epoch": 3.1351517385428185, "grad_norm": 0.2657903547446198, "learning_rate": 2.0712725286728565e-05, "loss": 0.4674, "step": 12760 }, { "epoch": 3.137609042879961, "grad_norm": 0.21663940873544174, "learning_rate": 2.0685417804478425e-05, "loss": 0.4641, "step": 12770 }, { "epoch": 3.1400663472171026, "grad_norm": 0.29269408220903137, "learning_rate": 2.0658110322228293e-05, "loss": 0.4507, "step": 12780 }, { "epoch": 3.142523651554245, "grad_norm": 0.21683889726119937, "learning_rate": 2.0630802839978157e-05, "loss": 0.4689, "step": 12790 }, { "epoch": 3.1449809558913873, "grad_norm": 0.24416057839893152, "learning_rate": 2.0603495357728017e-05, "loss": 0.4602, "step": 12800 }, { "epoch": 3.147438260228529, "grad_norm": 0.22369583978959923, "learning_rate": 2.057618787547788e-05, "loss": 0.4561, "step": 12810 }, { "epoch": 3.1498955645656714, "grad_norm": 0.2586735578256772, "learning_rate": 2.0548880393227745e-05, "loss": 0.4476, "step": 12820 }, { "epoch": 3.1523528689028137, "grad_norm": 0.23267909860617023, "learning_rate": 2.052157291097761e-05, "loss": 0.4666, "step": 12830 }, { "epoch": 3.1548101732399556, "grad_norm": 0.2826869909407162, "learning_rate": 2.049426542872747e-05, "loss": 0.4538, "step": 12840 }, { "epoch": 3.157267477577098, "grad_norm": 0.23838913333102163, "learning_rate": 2.0466957946477337e-05, "loss": 0.4695, "step": 12850 }, { "epoch": 3.15972478191424, "grad_norm": 0.23783418968163408, "learning_rate": 2.04396504642272e-05, "loss": 0.4593, "step": 12860 }, { "epoch": 3.162182086251382, "grad_norm": 0.21666586424531808, "learning_rate": 2.0412342981977062e-05, "loss": 0.4432, "step": 12870 }, { "epoch": 3.1646393905885244, "grad_norm": 0.258619774353376, "learning_rate": 2.0385035499726926e-05, "loss": 0.4761, "step": 12880 }, { "epoch": 3.1670966949256667, "grad_norm": 0.21921186725877786, "learning_rate": 2.035772801747679e-05, "loss": 0.4674, "step": 12890 }, { "epoch": 3.1695539992628086, "grad_norm": 0.21158362717401571, "learning_rate": 2.033042053522665e-05, "loss": 0.4624, "step": 12900 }, { "epoch": 3.172011303599951, "grad_norm": 0.2798412802942036, "learning_rate": 2.0303113052976515e-05, "loss": 0.4625, "step": 12910 }, { "epoch": 3.174468607937093, "grad_norm": 0.2421844454146704, "learning_rate": 2.0275805570726382e-05, "loss": 0.4621, "step": 12920 }, { "epoch": 3.176925912274235, "grad_norm": 0.23954323520529602, "learning_rate": 2.0248498088476243e-05, "loss": 0.4602, "step": 12930 }, { "epoch": 3.1793832166113773, "grad_norm": 0.28411969807874377, "learning_rate": 2.0221190606226107e-05, "loss": 0.45, "step": 12940 }, { "epoch": 3.1818405209485197, "grad_norm": 0.23858091233578238, "learning_rate": 2.019388312397597e-05, "loss": 0.442, "step": 12950 }, { "epoch": 3.1842978252856615, "grad_norm": 0.240550740630788, "learning_rate": 2.0166575641725835e-05, "loss": 0.4511, "step": 12960 }, { "epoch": 3.186755129622804, "grad_norm": 0.23038613198898647, "learning_rate": 2.0139268159475695e-05, "loss": 0.4615, "step": 12970 }, { "epoch": 3.189212433959946, "grad_norm": 0.2483399257676633, "learning_rate": 2.011196067722556e-05, "loss": 0.4619, "step": 12980 }, { "epoch": 3.191669738297088, "grad_norm": 0.22028221365723963, "learning_rate": 2.0084653194975427e-05, "loss": 0.4665, "step": 12990 }, { "epoch": 3.1941270426342303, "grad_norm": 0.21476391197398942, "learning_rate": 2.0057345712725287e-05, "loss": 0.4485, "step": 13000 }, { "epoch": 3.196584346971372, "grad_norm": 0.22366117754981768, "learning_rate": 2.003003823047515e-05, "loss": 0.4465, "step": 13010 }, { "epoch": 3.1990416513085145, "grad_norm": 0.27415701900610173, "learning_rate": 2.0002730748225015e-05, "loss": 0.426, "step": 13020 }, { "epoch": 3.201498955645657, "grad_norm": 0.23736077690110519, "learning_rate": 1.9975423265974876e-05, "loss": 0.4612, "step": 13030 }, { "epoch": 3.2039562599827986, "grad_norm": 0.22291041797791805, "learning_rate": 1.994811578372474e-05, "loss": 0.4658, "step": 13040 }, { "epoch": 3.206413564319941, "grad_norm": 0.2246528414106428, "learning_rate": 1.9920808301474604e-05, "loss": 0.4584, "step": 13050 }, { "epoch": 3.2088708686570833, "grad_norm": 0.24576650955152476, "learning_rate": 1.989350081922447e-05, "loss": 0.4509, "step": 13060 }, { "epoch": 3.211328172994225, "grad_norm": 0.181403746783428, "learning_rate": 1.9866193336974332e-05, "loss": 0.4499, "step": 13070 }, { "epoch": 3.2137854773313674, "grad_norm": 0.2126372222353172, "learning_rate": 1.9838885854724196e-05, "loss": 0.4636, "step": 13080 }, { "epoch": 3.2162427816685097, "grad_norm": 0.24583821433211553, "learning_rate": 1.981157837247406e-05, "loss": 0.4682, "step": 13090 }, { "epoch": 3.2187000860056516, "grad_norm": 0.22263033599848842, "learning_rate": 1.978427089022392e-05, "loss": 0.46, "step": 13100 }, { "epoch": 3.221157390342794, "grad_norm": 0.23035489663738348, "learning_rate": 1.9756963407973784e-05, "loss": 0.4413, "step": 13110 }, { "epoch": 3.223614694679936, "grad_norm": 0.23100878980373796, "learning_rate": 1.9729655925723652e-05, "loss": 0.463, "step": 13120 }, { "epoch": 3.226071999017078, "grad_norm": 0.2252331060096205, "learning_rate": 1.9702348443473512e-05, "loss": 0.4579, "step": 13130 }, { "epoch": 3.2285293033542204, "grad_norm": 0.22802783631777332, "learning_rate": 1.9675040961223376e-05, "loss": 0.431, "step": 13140 }, { "epoch": 3.2309866076913627, "grad_norm": 0.23217468022377075, "learning_rate": 1.964773347897324e-05, "loss": 0.4617, "step": 13150 }, { "epoch": 3.2334439120285046, "grad_norm": 0.2462992469146962, "learning_rate": 1.96204259967231e-05, "loss": 0.4455, "step": 13160 }, { "epoch": 3.235901216365647, "grad_norm": 0.2250673742538919, "learning_rate": 1.9593118514472965e-05, "loss": 0.4507, "step": 13170 }, { "epoch": 3.238358520702789, "grad_norm": 0.23729506416314866, "learning_rate": 1.956581103222283e-05, "loss": 0.4595, "step": 13180 }, { "epoch": 3.240815825039931, "grad_norm": 0.23967285476540093, "learning_rate": 1.9538503549972696e-05, "loss": 0.4517, "step": 13190 }, { "epoch": 3.2432731293770733, "grad_norm": 0.27105846179831916, "learning_rate": 1.9511196067722557e-05, "loss": 0.4633, "step": 13200 }, { "epoch": 3.2457304337142157, "grad_norm": 0.22570636284203516, "learning_rate": 1.948388858547242e-05, "loss": 0.4374, "step": 13210 }, { "epoch": 3.2481877380513575, "grad_norm": 0.27250538011268405, "learning_rate": 1.9456581103222285e-05, "loss": 0.4545, "step": 13220 }, { "epoch": 3.2506450423885, "grad_norm": 0.25306484808630575, "learning_rate": 1.9429273620972146e-05, "loss": 0.4641, "step": 13230 }, { "epoch": 3.253102346725642, "grad_norm": 0.2528221685999568, "learning_rate": 1.940196613872201e-05, "loss": 0.4546, "step": 13240 }, { "epoch": 3.255559651062784, "grad_norm": 0.20439154057353615, "learning_rate": 1.9374658656471874e-05, "loss": 0.4591, "step": 13250 }, { "epoch": 3.2580169553999263, "grad_norm": 0.2611834269913105, "learning_rate": 1.9347351174221738e-05, "loss": 0.4711, "step": 13260 }, { "epoch": 3.2604742597370686, "grad_norm": 0.23116236938517262, "learning_rate": 1.9320043691971602e-05, "loss": 0.4811, "step": 13270 }, { "epoch": 3.2629315640742105, "grad_norm": 0.24066408699361, "learning_rate": 1.9292736209721466e-05, "loss": 0.469, "step": 13280 }, { "epoch": 3.265388868411353, "grad_norm": 0.24002087870002642, "learning_rate": 1.9265428727471326e-05, "loss": 0.4475, "step": 13290 }, { "epoch": 3.267846172748495, "grad_norm": 0.22296910756020752, "learning_rate": 1.923812124522119e-05, "loss": 0.4496, "step": 13300 }, { "epoch": 3.270303477085637, "grad_norm": 0.20974931241546266, "learning_rate": 1.9210813762971054e-05, "loss": 0.4827, "step": 13310 }, { "epoch": 3.2727607814227793, "grad_norm": 0.20620733160422441, "learning_rate": 1.918350628072092e-05, "loss": 0.4589, "step": 13320 }, { "epoch": 3.2752180857599216, "grad_norm": 0.23928577995101002, "learning_rate": 1.9156198798470782e-05, "loss": 0.468, "step": 13330 }, { "epoch": 3.2776753900970634, "grad_norm": 0.2028037855523519, "learning_rate": 1.9128891316220646e-05, "loss": 0.4608, "step": 13340 }, { "epoch": 3.2801326944342057, "grad_norm": 0.23908781178535016, "learning_rate": 1.910158383397051e-05, "loss": 0.4655, "step": 13350 }, { "epoch": 3.282589998771348, "grad_norm": 0.22276276125193423, "learning_rate": 1.907427635172037e-05, "loss": 0.4633, "step": 13360 }, { "epoch": 3.28504730310849, "grad_norm": 0.2395618359861455, "learning_rate": 1.9046968869470235e-05, "loss": 0.4457, "step": 13370 }, { "epoch": 3.287504607445632, "grad_norm": 0.27660841342138787, "learning_rate": 1.90196613872201e-05, "loss": 0.4567, "step": 13380 }, { "epoch": 3.2899619117827745, "grad_norm": 0.2629368282086009, "learning_rate": 1.8992353904969963e-05, "loss": 0.4572, "step": 13390 }, { "epoch": 3.2924192161199164, "grad_norm": 0.24257932537431437, "learning_rate": 1.8965046422719827e-05, "loss": 0.4713, "step": 13400 }, { "epoch": 3.2948765204570587, "grad_norm": 0.25561666142134704, "learning_rate": 1.893773894046969e-05, "loss": 0.477, "step": 13410 }, { "epoch": 3.297333824794201, "grad_norm": 0.21583035025196956, "learning_rate": 1.891043145821955e-05, "loss": 0.4444, "step": 13420 }, { "epoch": 3.299791129131343, "grad_norm": 0.23262493585891, "learning_rate": 1.8883123975969416e-05, "loss": 0.4516, "step": 13430 }, { "epoch": 3.302248433468485, "grad_norm": 0.2332701247635043, "learning_rate": 1.885581649371928e-05, "loss": 0.4649, "step": 13440 }, { "epoch": 3.304705737805627, "grad_norm": 0.2280419264946994, "learning_rate": 1.8828509011469144e-05, "loss": 0.4583, "step": 13450 }, { "epoch": 3.3071630421427693, "grad_norm": 0.22389638186460062, "learning_rate": 1.8801201529219008e-05, "loss": 0.4434, "step": 13460 }, { "epoch": 3.3096203464799117, "grad_norm": 0.23204664289802654, "learning_rate": 1.877389404696887e-05, "loss": 0.4588, "step": 13470 }, { "epoch": 3.3120776508170535, "grad_norm": 0.20198021616583992, "learning_rate": 1.8746586564718736e-05, "loss": 0.4477, "step": 13480 }, { "epoch": 3.314534955154196, "grad_norm": 0.23651948225371044, "learning_rate": 1.8719279082468596e-05, "loss": 0.4517, "step": 13490 }, { "epoch": 3.316992259491338, "grad_norm": 0.23252069842559203, "learning_rate": 1.869197160021846e-05, "loss": 0.4505, "step": 13500 }, { "epoch": 3.31944956382848, "grad_norm": 0.22735940486784878, "learning_rate": 1.8664664117968324e-05, "loss": 0.4522, "step": 13510 }, { "epoch": 3.3219068681656223, "grad_norm": 0.26359866965815665, "learning_rate": 1.8637356635718188e-05, "loss": 0.4662, "step": 13520 }, { "epoch": 3.3243641725027646, "grad_norm": 0.22256571631932098, "learning_rate": 1.8610049153468052e-05, "loss": 0.4471, "step": 13530 }, { "epoch": 3.3268214768399065, "grad_norm": 0.27041296033429757, "learning_rate": 1.8582741671217916e-05, "loss": 0.4497, "step": 13540 }, { "epoch": 3.329278781177049, "grad_norm": 0.23568690164112682, "learning_rate": 1.8555434188967777e-05, "loss": 0.438, "step": 13550 }, { "epoch": 3.331736085514191, "grad_norm": 0.2596352471856333, "learning_rate": 1.852812670671764e-05, "loss": 0.4552, "step": 13560 }, { "epoch": 3.334193389851333, "grad_norm": 0.23136133074430096, "learning_rate": 1.8500819224467505e-05, "loss": 0.466, "step": 13570 }, { "epoch": 3.3366506941884753, "grad_norm": 0.2462030908697693, "learning_rate": 1.847351174221737e-05, "loss": 0.4632, "step": 13580 }, { "epoch": 3.3391079985256176, "grad_norm": 0.222950925218002, "learning_rate": 1.8446204259967233e-05, "loss": 0.4658, "step": 13590 }, { "epoch": 3.3415653028627594, "grad_norm": 0.2693942899215433, "learning_rate": 1.8418896777717097e-05, "loss": 0.4545, "step": 13600 }, { "epoch": 3.3440226071999017, "grad_norm": 0.23988702557724462, "learning_rate": 1.839158929546696e-05, "loss": 0.4639, "step": 13610 }, { "epoch": 3.346479911537044, "grad_norm": 0.27065087049903075, "learning_rate": 1.836428181321682e-05, "loss": 0.471, "step": 13620 }, { "epoch": 3.348937215874186, "grad_norm": 0.21911362946828605, "learning_rate": 1.8336974330966685e-05, "loss": 0.4696, "step": 13630 }, { "epoch": 3.351394520211328, "grad_norm": 0.21348338831666844, "learning_rate": 1.830966684871655e-05, "loss": 0.4552, "step": 13640 }, { "epoch": 3.35385182454847, "grad_norm": 0.236112386215486, "learning_rate": 1.828235936646641e-05, "loss": 0.4461, "step": 13650 }, { "epoch": 3.3563091288856124, "grad_norm": 0.19806244414034688, "learning_rate": 1.8255051884216277e-05, "loss": 0.4575, "step": 13660 }, { "epoch": 3.3587664332227547, "grad_norm": 0.23105418335109135, "learning_rate": 1.822774440196614e-05, "loss": 0.4781, "step": 13670 }, { "epoch": 3.3612237375598966, "grad_norm": 0.2126719127017622, "learning_rate": 1.8200436919716002e-05, "loss": 0.4495, "step": 13680 }, { "epoch": 3.363681041897039, "grad_norm": 0.2689671672676364, "learning_rate": 1.8173129437465866e-05, "loss": 0.4675, "step": 13690 }, { "epoch": 3.366138346234181, "grad_norm": 0.21183165734715922, "learning_rate": 1.814582195521573e-05, "loss": 0.4529, "step": 13700 }, { "epoch": 3.368595650571323, "grad_norm": 0.22564744100094353, "learning_rate": 1.8118514472965594e-05, "loss": 0.4358, "step": 13710 }, { "epoch": 3.3710529549084653, "grad_norm": 0.22038371423351472, "learning_rate": 1.8091206990715455e-05, "loss": 0.4378, "step": 13720 }, { "epoch": 3.3735102592456077, "grad_norm": 0.21849966371182317, "learning_rate": 1.8063899508465322e-05, "loss": 0.4518, "step": 13730 }, { "epoch": 3.3759675635827495, "grad_norm": 0.21302150720911217, "learning_rate": 1.8036592026215186e-05, "loss": 0.4683, "step": 13740 }, { "epoch": 3.378424867919892, "grad_norm": 0.24555292620767752, "learning_rate": 1.8009284543965047e-05, "loss": 0.4509, "step": 13750 }, { "epoch": 3.380882172257034, "grad_norm": 0.2566767385525081, "learning_rate": 1.798197706171491e-05, "loss": 0.4453, "step": 13760 }, { "epoch": 3.383339476594176, "grad_norm": 0.24454493283266962, "learning_rate": 1.7954669579464775e-05, "loss": 0.4695, "step": 13770 }, { "epoch": 3.3857967809313183, "grad_norm": 0.23881511093363533, "learning_rate": 1.7927362097214635e-05, "loss": 0.4678, "step": 13780 }, { "epoch": 3.3882540852684606, "grad_norm": 0.21930182982878635, "learning_rate": 1.79000546149645e-05, "loss": 0.4524, "step": 13790 }, { "epoch": 3.3907113896056025, "grad_norm": 0.25414181715625656, "learning_rate": 1.7872747132714367e-05, "loss": 0.4354, "step": 13800 }, { "epoch": 3.393168693942745, "grad_norm": 0.24107102363246452, "learning_rate": 1.7845439650464227e-05, "loss": 0.4428, "step": 13810 }, { "epoch": 3.395625998279887, "grad_norm": 0.25127300706614925, "learning_rate": 1.781813216821409e-05, "loss": 0.4445, "step": 13820 }, { "epoch": 3.398083302617029, "grad_norm": 0.2225196966924624, "learning_rate": 1.7790824685963955e-05, "loss": 0.4474, "step": 13830 }, { "epoch": 3.4005406069541713, "grad_norm": 0.24257792169699224, "learning_rate": 1.776351720371382e-05, "loss": 0.4748, "step": 13840 }, { "epoch": 3.4029979112913136, "grad_norm": 0.22458084551518392, "learning_rate": 1.773620972146368e-05, "loss": 0.4451, "step": 13850 }, { "epoch": 3.4054552156284554, "grad_norm": 0.234988148314365, "learning_rate": 1.7708902239213547e-05, "loss": 0.4562, "step": 13860 }, { "epoch": 3.4079125199655977, "grad_norm": 0.21890137171003474, "learning_rate": 1.768159475696341e-05, "loss": 0.4628, "step": 13870 }, { "epoch": 3.41036982430274, "grad_norm": 0.2344955336333079, "learning_rate": 1.7654287274713272e-05, "loss": 0.4526, "step": 13880 }, { "epoch": 3.412827128639882, "grad_norm": 0.2518813975734915, "learning_rate": 1.7626979792463136e-05, "loss": 0.4483, "step": 13890 }, { "epoch": 3.415284432977024, "grad_norm": 0.21922119722644792, "learning_rate": 1.7599672310213e-05, "loss": 0.4666, "step": 13900 }, { "epoch": 3.4177417373141665, "grad_norm": 0.2668921248977976, "learning_rate": 1.757236482796286e-05, "loss": 0.4585, "step": 13910 }, { "epoch": 3.4201990416513084, "grad_norm": 0.21166031043907066, "learning_rate": 1.7545057345712725e-05, "loss": 0.4645, "step": 13920 }, { "epoch": 3.4226563459884507, "grad_norm": 0.2644240538554049, "learning_rate": 1.7517749863462592e-05, "loss": 0.4599, "step": 13930 }, { "epoch": 3.425113650325593, "grad_norm": 0.20689920966659675, "learning_rate": 1.7490442381212453e-05, "loss": 0.456, "step": 13940 }, { "epoch": 3.427570954662735, "grad_norm": 0.2451343800983702, "learning_rate": 1.7463134898962317e-05, "loss": 0.4696, "step": 13950 }, { "epoch": 3.430028258999877, "grad_norm": 0.2187659885694797, "learning_rate": 1.743582741671218e-05, "loss": 0.4555, "step": 13960 }, { "epoch": 3.4324855633370195, "grad_norm": 0.2276250443035276, "learning_rate": 1.7408519934462045e-05, "loss": 0.4519, "step": 13970 }, { "epoch": 3.4349428676741613, "grad_norm": 0.23102217208027584, "learning_rate": 1.7381212452211905e-05, "loss": 0.4558, "step": 13980 }, { "epoch": 3.4374001720113037, "grad_norm": 0.36040811981577564, "learning_rate": 1.735390496996177e-05, "loss": 0.4576, "step": 13990 }, { "epoch": 3.439857476348446, "grad_norm": 0.20896623507001913, "learning_rate": 1.7326597487711637e-05, "loss": 0.4682, "step": 14000 }, { "epoch": 3.442314780685588, "grad_norm": 0.25502689746751656, "learning_rate": 1.7299290005461497e-05, "loss": 0.4715, "step": 14010 }, { "epoch": 3.44477208502273, "grad_norm": 0.23061836198023772, "learning_rate": 1.727198252321136e-05, "loss": 0.4733, "step": 14020 }, { "epoch": 3.4472293893598724, "grad_norm": 0.22537959481413214, "learning_rate": 1.7244675040961225e-05, "loss": 0.4677, "step": 14030 }, { "epoch": 3.4496866936970143, "grad_norm": 0.22062680011096708, "learning_rate": 1.7217367558711086e-05, "loss": 0.4598, "step": 14040 }, { "epoch": 3.4521439980341566, "grad_norm": 0.25560064995239945, "learning_rate": 1.719006007646095e-05, "loss": 0.4561, "step": 14050 }, { "epoch": 3.454601302371299, "grad_norm": 0.25057213209081836, "learning_rate": 1.7162752594210814e-05, "loss": 0.4735, "step": 14060 }, { "epoch": 3.457058606708441, "grad_norm": 0.2754058883282188, "learning_rate": 1.7135445111960678e-05, "loss": 0.4428, "step": 14070 }, { "epoch": 3.459515911045583, "grad_norm": 0.24333994357253555, "learning_rate": 1.7108137629710542e-05, "loss": 0.4527, "step": 14080 }, { "epoch": 3.4619732153827254, "grad_norm": 0.24357330375848435, "learning_rate": 1.7080830147460406e-05, "loss": 0.4731, "step": 14090 }, { "epoch": 3.4644305197198673, "grad_norm": 0.26560218421885323, "learning_rate": 1.705352266521027e-05, "loss": 0.4644, "step": 14100 }, { "epoch": 3.4668878240570096, "grad_norm": 0.22840874021484367, "learning_rate": 1.702621518296013e-05, "loss": 0.4512, "step": 14110 }, { "epoch": 3.4693451283941514, "grad_norm": 0.23706341219143065, "learning_rate": 1.6998907700709994e-05, "loss": 0.4617, "step": 14120 }, { "epoch": 3.4718024327312937, "grad_norm": 0.25837238208251356, "learning_rate": 1.697160021845986e-05, "loss": 0.486, "step": 14130 }, { "epoch": 3.474259737068436, "grad_norm": 0.24010076789611126, "learning_rate": 1.6944292736209722e-05, "loss": 0.4459, "step": 14140 }, { "epoch": 3.476717041405578, "grad_norm": 0.20691404513325073, "learning_rate": 1.6916985253959586e-05, "loss": 0.4525, "step": 14150 }, { "epoch": 3.47917434574272, "grad_norm": 0.24669468605992204, "learning_rate": 1.688967777170945e-05, "loss": 0.4783, "step": 14160 }, { "epoch": 3.4816316500798625, "grad_norm": 0.2544829714435944, "learning_rate": 1.686237028945931e-05, "loss": 0.4498, "step": 14170 }, { "epoch": 3.4840889544170044, "grad_norm": 0.2678382915871057, "learning_rate": 1.6835062807209175e-05, "loss": 0.4481, "step": 14180 }, { "epoch": 3.4865462587541467, "grad_norm": 0.24187424736274868, "learning_rate": 1.680775532495904e-05, "loss": 0.4606, "step": 14190 }, { "epoch": 3.489003563091289, "grad_norm": 0.24354538553084912, "learning_rate": 1.6780447842708903e-05, "loss": 0.461, "step": 14200 }, { "epoch": 3.491460867428431, "grad_norm": 0.23660743995859193, "learning_rate": 1.6753140360458767e-05, "loss": 0.4723, "step": 14210 }, { "epoch": 3.493918171765573, "grad_norm": 0.2386276744640651, "learning_rate": 1.672583287820863e-05, "loss": 0.4653, "step": 14220 }, { "epoch": 3.4963754761027155, "grad_norm": 0.18972262022870842, "learning_rate": 1.6698525395958495e-05, "loss": 0.4529, "step": 14230 }, { "epoch": 3.4988327804398573, "grad_norm": 0.24014091903328427, "learning_rate": 1.6671217913708356e-05, "loss": 0.4499, "step": 14240 }, { "epoch": 3.5012900847769997, "grad_norm": 0.23041304118288322, "learning_rate": 1.664391043145822e-05, "loss": 0.4609, "step": 14250 }, { "epoch": 3.5037473891141415, "grad_norm": 0.2528029467511012, "learning_rate": 1.6616602949208084e-05, "loss": 0.4607, "step": 14260 }, { "epoch": 3.506204693451284, "grad_norm": 0.22126731457166177, "learning_rate": 1.6589295466957948e-05, "loss": 0.4507, "step": 14270 }, { "epoch": 3.508661997788426, "grad_norm": 0.27380702846560373, "learning_rate": 1.656198798470781e-05, "loss": 0.4687, "step": 14280 }, { "epoch": 3.511119302125568, "grad_norm": 0.23459674145243797, "learning_rate": 1.6534680502457676e-05, "loss": 0.4556, "step": 14290 }, { "epoch": 3.5135766064627103, "grad_norm": 0.25693788251202365, "learning_rate": 1.6507373020207536e-05, "loss": 0.4452, "step": 14300 }, { "epoch": 3.5160339107998526, "grad_norm": 0.2018035834901406, "learning_rate": 1.64800655379574e-05, "loss": 0.4533, "step": 14310 }, { "epoch": 3.5184912151369945, "grad_norm": 0.2518020510577878, "learning_rate": 1.6452758055707264e-05, "loss": 0.4707, "step": 14320 }, { "epoch": 3.520948519474137, "grad_norm": 0.2381905040216694, "learning_rate": 1.6425450573457128e-05, "loss": 0.4523, "step": 14330 }, { "epoch": 3.523405823811279, "grad_norm": 0.21801706826370956, "learning_rate": 1.6398143091206992e-05, "loss": 0.4565, "step": 14340 }, { "epoch": 3.525863128148421, "grad_norm": 0.21913679962103172, "learning_rate": 1.6370835608956856e-05, "loss": 0.4575, "step": 14350 }, { "epoch": 3.5283204324855633, "grad_norm": 0.22450648967693346, "learning_rate": 1.634352812670672e-05, "loss": 0.4533, "step": 14360 }, { "epoch": 3.5307777368227056, "grad_norm": 0.22790413251381103, "learning_rate": 1.631622064445658e-05, "loss": 0.4665, "step": 14370 }, { "epoch": 3.5332350411598474, "grad_norm": 0.25483368334241624, "learning_rate": 1.6288913162206445e-05, "loss": 0.4686, "step": 14380 }, { "epoch": 3.5356923454969897, "grad_norm": 0.2547297778858158, "learning_rate": 1.626160567995631e-05, "loss": 0.4538, "step": 14390 }, { "epoch": 3.538149649834132, "grad_norm": 0.23599472115429282, "learning_rate": 1.6234298197706173e-05, "loss": 0.4552, "step": 14400 }, { "epoch": 3.540606954171274, "grad_norm": 0.24586896598037009, "learning_rate": 1.6206990715456037e-05, "loss": 0.4582, "step": 14410 }, { "epoch": 3.543064258508416, "grad_norm": 0.26855280130560255, "learning_rate": 1.61796832332059e-05, "loss": 0.4687, "step": 14420 }, { "epoch": 3.5455215628455585, "grad_norm": 0.2014599217461247, "learning_rate": 1.615237575095576e-05, "loss": 0.4546, "step": 14430 }, { "epoch": 3.5479788671827004, "grad_norm": 0.2703164709742999, "learning_rate": 1.6125068268705626e-05, "loss": 0.4453, "step": 14440 }, { "epoch": 3.5504361715198427, "grad_norm": 0.2477785677600987, "learning_rate": 1.609776078645549e-05, "loss": 0.4547, "step": 14450 }, { "epoch": 3.552893475856985, "grad_norm": 0.2719122864066834, "learning_rate": 1.607045330420535e-05, "loss": 0.457, "step": 14460 }, { "epoch": 3.555350780194127, "grad_norm": 0.2412712450613338, "learning_rate": 1.6043145821955218e-05, "loss": 0.4597, "step": 14470 }, { "epoch": 3.557808084531269, "grad_norm": 0.23188002062001184, "learning_rate": 1.601583833970508e-05, "loss": 0.4539, "step": 14480 }, { "epoch": 3.5602653888684115, "grad_norm": 0.21162683967523677, "learning_rate": 1.5988530857454946e-05, "loss": 0.465, "step": 14490 }, { "epoch": 3.5627226932055533, "grad_norm": 0.22862804904300726, "learning_rate": 1.5961223375204806e-05, "loss": 0.4625, "step": 14500 }, { "epoch": 3.5651799975426957, "grad_norm": 0.20400945624347722, "learning_rate": 1.593391589295467e-05, "loss": 0.461, "step": 14510 }, { "epoch": 3.567637301879838, "grad_norm": 0.2403392698346932, "learning_rate": 1.5906608410704534e-05, "loss": 0.4521, "step": 14520 }, { "epoch": 3.57009460621698, "grad_norm": 0.25540737337029473, "learning_rate": 1.5879300928454395e-05, "loss": 0.4476, "step": 14530 }, { "epoch": 3.572551910554122, "grad_norm": 0.24612018266105182, "learning_rate": 1.5851993446204262e-05, "loss": 0.4484, "step": 14540 }, { "epoch": 3.5750092148912644, "grad_norm": 0.24028092696459755, "learning_rate": 1.5824685963954126e-05, "loss": 0.4839, "step": 14550 }, { "epoch": 3.5774665192284063, "grad_norm": 0.2508923321860788, "learning_rate": 1.5797378481703987e-05, "loss": 0.4415, "step": 14560 }, { "epoch": 3.5799238235655486, "grad_norm": 0.25224829879467947, "learning_rate": 1.577007099945385e-05, "loss": 0.4675, "step": 14570 }, { "epoch": 3.582381127902691, "grad_norm": 0.21601850388430766, "learning_rate": 1.5742763517203715e-05, "loss": 0.446, "step": 14580 }, { "epoch": 3.584838432239833, "grad_norm": 0.2267056277247008, "learning_rate": 1.5715456034953575e-05, "loss": 0.456, "step": 14590 }, { "epoch": 3.587295736576975, "grad_norm": 0.25373487141352086, "learning_rate": 1.568814855270344e-05, "loss": 0.4371, "step": 14600 }, { "epoch": 3.5897530409141174, "grad_norm": 0.226113839445458, "learning_rate": 1.5660841070453307e-05, "loss": 0.4574, "step": 14610 }, { "epoch": 3.5922103452512593, "grad_norm": 0.27332443354351005, "learning_rate": 1.563353358820317e-05, "loss": 0.4572, "step": 14620 }, { "epoch": 3.5946676495884016, "grad_norm": 0.21991578054932612, "learning_rate": 1.560622610595303e-05, "loss": 0.4651, "step": 14630 }, { "epoch": 3.597124953925544, "grad_norm": 0.22690462199971173, "learning_rate": 1.5578918623702895e-05, "loss": 0.4698, "step": 14640 }, { "epoch": 3.5995822582626857, "grad_norm": 0.24348592401842892, "learning_rate": 1.555161114145276e-05, "loss": 0.4543, "step": 14650 }, { "epoch": 3.602039562599828, "grad_norm": 0.2482833458756066, "learning_rate": 1.552430365920262e-05, "loss": 0.4429, "step": 14660 }, { "epoch": 3.6044968669369704, "grad_norm": 0.24713888594514793, "learning_rate": 1.5496996176952487e-05, "loss": 0.4469, "step": 14670 }, { "epoch": 3.606954171274112, "grad_norm": 0.26026006857308664, "learning_rate": 1.546968869470235e-05, "loss": 0.4599, "step": 14680 }, { "epoch": 3.6094114756112545, "grad_norm": 0.23785758355746167, "learning_rate": 1.5442381212452212e-05, "loss": 0.4646, "step": 14690 }, { "epoch": 3.611868779948397, "grad_norm": 0.26859969309011006, "learning_rate": 1.5415073730202076e-05, "loss": 0.4318, "step": 14700 }, { "epoch": 3.6143260842855387, "grad_norm": 0.3118655693763755, "learning_rate": 1.538776624795194e-05, "loss": 0.4689, "step": 14710 }, { "epoch": 3.616783388622681, "grad_norm": 0.2288687986113886, "learning_rate": 1.53604587657018e-05, "loss": 0.4616, "step": 14720 }, { "epoch": 3.6192406929598233, "grad_norm": 0.3067305244002297, "learning_rate": 1.5333151283451665e-05, "loss": 0.4649, "step": 14730 }, { "epoch": 3.621697997296965, "grad_norm": 0.23073525966578426, "learning_rate": 1.5305843801201532e-05, "loss": 0.453, "step": 14740 }, { "epoch": 3.6241553016341075, "grad_norm": 0.24977834570454938, "learning_rate": 1.5278536318951396e-05, "loss": 0.446, "step": 14750 }, { "epoch": 3.62661260597125, "grad_norm": 0.2478874139773429, "learning_rate": 1.5251228836701257e-05, "loss": 0.4585, "step": 14760 }, { "epoch": 3.6290699103083917, "grad_norm": 0.23547015661254878, "learning_rate": 1.522392135445112e-05, "loss": 0.4567, "step": 14770 }, { "epoch": 3.631527214645534, "grad_norm": 0.22709774019197057, "learning_rate": 1.5196613872200985e-05, "loss": 0.4547, "step": 14780 }, { "epoch": 3.6339845189826763, "grad_norm": 0.2390648178483822, "learning_rate": 1.5169306389950847e-05, "loss": 0.454, "step": 14790 }, { "epoch": 3.636441823319818, "grad_norm": 0.2212015420430321, "learning_rate": 1.5141998907700711e-05, "loss": 0.4555, "step": 14800 }, { "epoch": 3.6388991276569604, "grad_norm": 0.23725158506517868, "learning_rate": 1.5114691425450575e-05, "loss": 0.4534, "step": 14810 }, { "epoch": 3.6413564319941023, "grad_norm": 0.23771995631119283, "learning_rate": 1.5087383943200437e-05, "loss": 0.4356, "step": 14820 }, { "epoch": 3.6438137363312446, "grad_norm": 0.2450664592666036, "learning_rate": 1.5060076460950301e-05, "loss": 0.4688, "step": 14830 }, { "epoch": 3.646271040668387, "grad_norm": 0.2670465208365733, "learning_rate": 1.5032768978700165e-05, "loss": 0.4334, "step": 14840 }, { "epoch": 3.648728345005529, "grad_norm": 0.2350633566572758, "learning_rate": 1.5005461496450028e-05, "loss": 0.4544, "step": 14850 }, { "epoch": 3.651185649342671, "grad_norm": 0.23537231012614837, "learning_rate": 1.4978154014199892e-05, "loss": 0.4553, "step": 14860 }, { "epoch": 3.6536429536798134, "grad_norm": 0.24451362378992747, "learning_rate": 1.4950846531949756e-05, "loss": 0.4535, "step": 14870 }, { "epoch": 3.6561002580169553, "grad_norm": 0.23997508888570385, "learning_rate": 1.492353904969962e-05, "loss": 0.4544, "step": 14880 }, { "epoch": 3.6585575623540976, "grad_norm": 0.23383807534338613, "learning_rate": 1.4896231567449482e-05, "loss": 0.4578, "step": 14890 }, { "epoch": 3.6610148666912394, "grad_norm": 0.23002884323202305, "learning_rate": 1.4868924085199346e-05, "loss": 0.4408, "step": 14900 }, { "epoch": 3.6634721710283817, "grad_norm": 0.26816707717204663, "learning_rate": 1.484161660294921e-05, "loss": 0.4312, "step": 14910 }, { "epoch": 3.665929475365524, "grad_norm": 0.24500847723456493, "learning_rate": 1.4814309120699072e-05, "loss": 0.4661, "step": 14920 }, { "epoch": 3.668386779702666, "grad_norm": 0.2102998595780861, "learning_rate": 1.4787001638448936e-05, "loss": 0.4492, "step": 14930 }, { "epoch": 3.670844084039808, "grad_norm": 0.28502138937102905, "learning_rate": 1.47596941561988e-05, "loss": 0.4681, "step": 14940 }, { "epoch": 3.6733013883769505, "grad_norm": 0.21731858706031554, "learning_rate": 1.473238667394866e-05, "loss": 0.4484, "step": 14950 }, { "epoch": 3.6757586927140924, "grad_norm": 0.2125155509674663, "learning_rate": 1.4705079191698527e-05, "loss": 0.4587, "step": 14960 }, { "epoch": 3.6782159970512347, "grad_norm": 0.2361250555876648, "learning_rate": 1.467777170944839e-05, "loss": 0.4669, "step": 14970 }, { "epoch": 3.680673301388377, "grad_norm": 0.2302298050498988, "learning_rate": 1.4650464227198251e-05, "loss": 0.4493, "step": 14980 }, { "epoch": 3.683130605725519, "grad_norm": 0.20487995333813935, "learning_rate": 1.4623156744948117e-05, "loss": 0.466, "step": 14990 }, { "epoch": 3.685587910062661, "grad_norm": 0.2949031556115403, "learning_rate": 1.459584926269798e-05, "loss": 0.4672, "step": 15000 }, { "epoch": 3.6880452143998035, "grad_norm": 0.23695952785419527, "learning_rate": 1.4568541780447845e-05, "loss": 0.4541, "step": 15010 }, { "epoch": 3.6905025187369453, "grad_norm": 0.23966811734146523, "learning_rate": 1.4541234298197705e-05, "loss": 0.4691, "step": 15020 }, { "epoch": 3.6929598230740877, "grad_norm": 0.23583725973098502, "learning_rate": 1.4513926815947571e-05, "loss": 0.4487, "step": 15030 }, { "epoch": 3.69541712741123, "grad_norm": 0.23565582676885768, "learning_rate": 1.4486619333697435e-05, "loss": 0.4492, "step": 15040 }, { "epoch": 3.697874431748372, "grad_norm": 0.2457122990828032, "learning_rate": 1.4459311851447296e-05, "loss": 0.4643, "step": 15050 }, { "epoch": 3.700331736085514, "grad_norm": 0.226529289385806, "learning_rate": 1.4432004369197161e-05, "loss": 0.4604, "step": 15060 }, { "epoch": 3.7027890404226564, "grad_norm": 0.21544307559941467, "learning_rate": 1.4404696886947025e-05, "loss": 0.4558, "step": 15070 }, { "epoch": 3.7052463447597983, "grad_norm": 0.2715767380469318, "learning_rate": 1.4377389404696886e-05, "loss": 0.438, "step": 15080 }, { "epoch": 3.7077036490969406, "grad_norm": 0.24960166829591868, "learning_rate": 1.435008192244675e-05, "loss": 0.4588, "step": 15090 }, { "epoch": 3.710160953434083, "grad_norm": 0.206245319003777, "learning_rate": 1.4322774440196616e-05, "loss": 0.4542, "step": 15100 }, { "epoch": 3.712618257771225, "grad_norm": 0.21872365631349308, "learning_rate": 1.4295466957946476e-05, "loss": 0.4505, "step": 15110 }, { "epoch": 3.715075562108367, "grad_norm": 0.24186817266330532, "learning_rate": 1.426815947569634e-05, "loss": 0.4631, "step": 15120 }, { "epoch": 3.7175328664455094, "grad_norm": 0.22269903283483153, "learning_rate": 1.4240851993446206e-05, "loss": 0.4715, "step": 15130 }, { "epoch": 3.7199901707826513, "grad_norm": 0.243517746368057, "learning_rate": 1.421354451119607e-05, "loss": 0.4521, "step": 15140 }, { "epoch": 3.7224474751197936, "grad_norm": 0.24790071687915075, "learning_rate": 1.418623702894593e-05, "loss": 0.4679, "step": 15150 }, { "epoch": 3.724904779456936, "grad_norm": 0.23684660116835593, "learning_rate": 1.4158929546695795e-05, "loss": 0.4555, "step": 15160 }, { "epoch": 3.7273620837940777, "grad_norm": 0.2734345643560449, "learning_rate": 1.413162206444566e-05, "loss": 0.4675, "step": 15170 }, { "epoch": 3.72981938813122, "grad_norm": 0.24245382554725067, "learning_rate": 1.4104314582195521e-05, "loss": 0.4678, "step": 15180 }, { "epoch": 3.7322766924683624, "grad_norm": 0.22373811044846842, "learning_rate": 1.4077007099945385e-05, "loss": 0.4473, "step": 15190 }, { "epoch": 3.734733996805504, "grad_norm": 0.2334601792479221, "learning_rate": 1.404969961769525e-05, "loss": 0.4623, "step": 15200 }, { "epoch": 3.7371913011426465, "grad_norm": 0.23500894095256977, "learning_rate": 1.4022392135445111e-05, "loss": 0.4704, "step": 15210 }, { "epoch": 3.739648605479789, "grad_norm": 0.27633092411930577, "learning_rate": 1.3995084653194975e-05, "loss": 0.4514, "step": 15220 }, { "epoch": 3.7421059098169307, "grad_norm": 0.2500313003663449, "learning_rate": 1.3967777170944841e-05, "loss": 0.4448, "step": 15230 }, { "epoch": 3.744563214154073, "grad_norm": 0.2902250282031905, "learning_rate": 1.3940469688694702e-05, "loss": 0.4592, "step": 15240 }, { "epoch": 3.7470205184912153, "grad_norm": 0.2589303963162972, "learning_rate": 1.3913162206444566e-05, "loss": 0.4621, "step": 15250 }, { "epoch": 3.749477822828357, "grad_norm": 0.24190854408938783, "learning_rate": 1.388585472419443e-05, "loss": 0.4509, "step": 15260 }, { "epoch": 3.7519351271654995, "grad_norm": 0.24639046019273952, "learning_rate": 1.3858547241944295e-05, "loss": 0.4608, "step": 15270 }, { "epoch": 3.754392431502642, "grad_norm": 0.21927644382112044, "learning_rate": 1.3831239759694156e-05, "loss": 0.4704, "step": 15280 }, { "epoch": 3.7568497358397837, "grad_norm": 0.25925277935582763, "learning_rate": 1.380393227744402e-05, "loss": 0.443, "step": 15290 }, { "epoch": 3.759307040176926, "grad_norm": 0.21464276627419757, "learning_rate": 1.3776624795193886e-05, "loss": 0.4479, "step": 15300 }, { "epoch": 3.7617643445140683, "grad_norm": 0.23792960398078913, "learning_rate": 1.3749317312943746e-05, "loss": 0.4633, "step": 15310 }, { "epoch": 3.76422164885121, "grad_norm": 0.23537034434750342, "learning_rate": 1.372200983069361e-05, "loss": 0.44, "step": 15320 }, { "epoch": 3.7666789531883524, "grad_norm": 0.25262925499986016, "learning_rate": 1.3694702348443474e-05, "loss": 0.4506, "step": 15330 }, { "epoch": 3.7691362575254947, "grad_norm": 0.2649249934390416, "learning_rate": 1.3667394866193337e-05, "loss": 0.4583, "step": 15340 }, { "epoch": 3.7715935618626366, "grad_norm": 0.2280689768614271, "learning_rate": 1.36400873839432e-05, "loss": 0.4747, "step": 15350 }, { "epoch": 3.774050866199779, "grad_norm": 0.2097288243074226, "learning_rate": 1.3612779901693065e-05, "loss": 0.472, "step": 15360 }, { "epoch": 3.7765081705369212, "grad_norm": 0.2699865946964672, "learning_rate": 1.3585472419442927e-05, "loss": 0.4708, "step": 15370 }, { "epoch": 3.778965474874063, "grad_norm": 0.21392797148230525, "learning_rate": 1.3558164937192791e-05, "loss": 0.4502, "step": 15380 }, { "epoch": 3.7814227792112054, "grad_norm": 0.25742464688530026, "learning_rate": 1.3530857454942655e-05, "loss": 0.4584, "step": 15390 }, { "epoch": 3.7838800835483477, "grad_norm": 0.23563867238362413, "learning_rate": 1.3503549972692519e-05, "loss": 0.4665, "step": 15400 }, { "epoch": 3.7863373878854896, "grad_norm": 0.21503930500602877, "learning_rate": 1.3476242490442381e-05, "loss": 0.4685, "step": 15410 }, { "epoch": 3.788794692222632, "grad_norm": 0.22144683140993815, "learning_rate": 1.3448935008192245e-05, "loss": 0.4418, "step": 15420 }, { "epoch": 3.791251996559774, "grad_norm": 0.2541013728259448, "learning_rate": 1.342162752594211e-05, "loss": 0.4553, "step": 15430 }, { "epoch": 3.793709300896916, "grad_norm": 0.22045579159420614, "learning_rate": 1.3394320043691971e-05, "loss": 0.4815, "step": 15440 }, { "epoch": 3.7961666052340584, "grad_norm": 0.21300455875294527, "learning_rate": 1.3367012561441835e-05, "loss": 0.4471, "step": 15450 }, { "epoch": 3.7986239095712007, "grad_norm": 0.23263910198105017, "learning_rate": 1.33397050791917e-05, "loss": 0.4696, "step": 15460 }, { "epoch": 3.8010812139083425, "grad_norm": 0.22989950423542166, "learning_rate": 1.3312397596941562e-05, "loss": 0.4639, "step": 15470 }, { "epoch": 3.803538518245485, "grad_norm": 0.2605819284922907, "learning_rate": 1.3285090114691426e-05, "loss": 0.4629, "step": 15480 }, { "epoch": 3.8059958225826267, "grad_norm": 0.27623048261541683, "learning_rate": 1.325778263244129e-05, "loss": 0.4635, "step": 15490 }, { "epoch": 3.808453126919769, "grad_norm": 0.22765097331197834, "learning_rate": 1.3230475150191152e-05, "loss": 0.4596, "step": 15500 }, { "epoch": 3.8109104312569113, "grad_norm": 0.21231314315393612, "learning_rate": 1.3203167667941016e-05, "loss": 0.4462, "step": 15510 }, { "epoch": 3.813367735594053, "grad_norm": 0.2547953119103437, "learning_rate": 1.317586018569088e-05, "loss": 0.4426, "step": 15520 }, { "epoch": 3.8158250399311955, "grad_norm": 0.2297028880156302, "learning_rate": 1.3148552703440744e-05, "loss": 0.431, "step": 15530 }, { "epoch": 3.818282344268338, "grad_norm": 0.23847474591218792, "learning_rate": 1.3121245221190606e-05, "loss": 0.4415, "step": 15540 }, { "epoch": 3.8207396486054797, "grad_norm": 0.260753716046491, "learning_rate": 1.309393773894047e-05, "loss": 0.4546, "step": 15550 }, { "epoch": 3.823196952942622, "grad_norm": 0.22458227895064078, "learning_rate": 1.3066630256690334e-05, "loss": 0.4359, "step": 15560 }, { "epoch": 3.825654257279764, "grad_norm": 0.31393438733778584, "learning_rate": 1.3039322774440197e-05, "loss": 0.4665, "step": 15570 }, { "epoch": 3.828111561616906, "grad_norm": 0.2363337458986093, "learning_rate": 1.301201529219006e-05, "loss": 0.4581, "step": 15580 }, { "epoch": 3.8305688659540484, "grad_norm": 0.30387063985896323, "learning_rate": 1.2984707809939925e-05, "loss": 0.4533, "step": 15590 }, { "epoch": 3.8330261702911903, "grad_norm": 0.2405717962072862, "learning_rate": 1.2957400327689787e-05, "loss": 0.4569, "step": 15600 }, { "epoch": 3.8354834746283326, "grad_norm": 0.2503501949984234, "learning_rate": 1.2930092845439651e-05, "loss": 0.4333, "step": 15610 }, { "epoch": 3.837940778965475, "grad_norm": 0.238873808248505, "learning_rate": 1.2902785363189515e-05, "loss": 0.4537, "step": 15620 }, { "epoch": 3.840398083302617, "grad_norm": 0.24776729168107717, "learning_rate": 1.2875477880939377e-05, "loss": 0.4519, "step": 15630 }, { "epoch": 3.842855387639759, "grad_norm": 0.22787163213787723, "learning_rate": 1.2848170398689241e-05, "loss": 0.45, "step": 15640 }, { "epoch": 3.8453126919769014, "grad_norm": 0.2319647844415062, "learning_rate": 1.2820862916439105e-05, "loss": 0.4537, "step": 15650 }, { "epoch": 3.8477699963140433, "grad_norm": 0.28470628943014414, "learning_rate": 1.279355543418897e-05, "loss": 0.447, "step": 15660 }, { "epoch": 3.8502273006511856, "grad_norm": 0.2537151278306558, "learning_rate": 1.2766247951938832e-05, "loss": 0.4457, "step": 15670 }, { "epoch": 3.852684604988328, "grad_norm": 0.2387121246579386, "learning_rate": 1.2738940469688696e-05, "loss": 0.4715, "step": 15680 }, { "epoch": 3.8551419093254697, "grad_norm": 0.22394224562422013, "learning_rate": 1.271163298743856e-05, "loss": 0.4653, "step": 15690 }, { "epoch": 3.857599213662612, "grad_norm": 0.25458103719156916, "learning_rate": 1.2684325505188422e-05, "loss": 0.4421, "step": 15700 }, { "epoch": 3.8600565179997544, "grad_norm": 0.2281261387334152, "learning_rate": 1.2657018022938286e-05, "loss": 0.4507, "step": 15710 }, { "epoch": 3.862513822336896, "grad_norm": 0.22792034223578347, "learning_rate": 1.262971054068815e-05, "loss": 0.4641, "step": 15720 }, { "epoch": 3.8649711266740385, "grad_norm": 0.20666725525408547, "learning_rate": 1.2602403058438012e-05, "loss": 0.4654, "step": 15730 }, { "epoch": 3.867428431011181, "grad_norm": 0.26016980923023986, "learning_rate": 1.2575095576187876e-05, "loss": 0.4663, "step": 15740 }, { "epoch": 3.8698857353483227, "grad_norm": 0.2652386765152672, "learning_rate": 1.254778809393774e-05, "loss": 0.4738, "step": 15750 }, { "epoch": 3.872343039685465, "grad_norm": 0.26681656902625195, "learning_rate": 1.2520480611687601e-05, "loss": 0.4518, "step": 15760 }, { "epoch": 3.8748003440226073, "grad_norm": 0.24236368314349188, "learning_rate": 1.2493173129437467e-05, "loss": 0.4367, "step": 15770 }, { "epoch": 3.877257648359749, "grad_norm": 0.21917904070425542, "learning_rate": 1.246586564718733e-05, "loss": 0.4392, "step": 15780 }, { "epoch": 3.8797149526968915, "grad_norm": 0.2193003298767845, "learning_rate": 1.2438558164937193e-05, "loss": 0.43, "step": 15790 }, { "epoch": 3.882172257034034, "grad_norm": 0.2796330961610747, "learning_rate": 1.2411250682687057e-05, "loss": 0.4499, "step": 15800 }, { "epoch": 3.8846295613711757, "grad_norm": 0.2445358245486064, "learning_rate": 1.2383943200436921e-05, "loss": 0.456, "step": 15810 }, { "epoch": 3.887086865708318, "grad_norm": 0.2580097107493922, "learning_rate": 1.2356635718186783e-05, "loss": 0.4593, "step": 15820 }, { "epoch": 3.8895441700454603, "grad_norm": 0.25886851051716886, "learning_rate": 1.2329328235936647e-05, "loss": 0.45, "step": 15830 }, { "epoch": 3.892001474382602, "grad_norm": 0.21187659593778993, "learning_rate": 1.2302020753686511e-05, "loss": 0.4616, "step": 15840 }, { "epoch": 3.8944587787197444, "grad_norm": 0.2423587565899232, "learning_rate": 1.2274713271436374e-05, "loss": 0.4481, "step": 15850 }, { "epoch": 3.8969160830568867, "grad_norm": 0.24052870113284075, "learning_rate": 1.2247405789186238e-05, "loss": 0.4547, "step": 15860 }, { "epoch": 3.8993733873940286, "grad_norm": 0.2246150743039094, "learning_rate": 1.2220098306936102e-05, "loss": 0.4672, "step": 15870 }, { "epoch": 3.901830691731171, "grad_norm": 0.2261359849648761, "learning_rate": 1.2192790824685966e-05, "loss": 0.4551, "step": 15880 }, { "epoch": 3.9042879960683132, "grad_norm": 0.24777895877677805, "learning_rate": 1.2165483342435828e-05, "loss": 0.4729, "step": 15890 }, { "epoch": 3.906745300405455, "grad_norm": 0.24854374186235825, "learning_rate": 1.213817586018569e-05, "loss": 0.4669, "step": 15900 }, { "epoch": 3.9092026047425974, "grad_norm": 0.29271668009054663, "learning_rate": 1.2110868377935556e-05, "loss": 0.4522, "step": 15910 }, { "epoch": 3.9116599090797397, "grad_norm": 0.22203867726801949, "learning_rate": 1.2083560895685418e-05, "loss": 0.4649, "step": 15920 }, { "epoch": 3.9141172134168816, "grad_norm": 0.2332692800413824, "learning_rate": 1.2056253413435282e-05, "loss": 0.4477, "step": 15930 }, { "epoch": 3.916574517754024, "grad_norm": 0.23896091857843754, "learning_rate": 1.2028945931185146e-05, "loss": 0.4462, "step": 15940 }, { "epoch": 3.919031822091166, "grad_norm": 0.23842843651143342, "learning_rate": 1.2001638448935008e-05, "loss": 0.4629, "step": 15950 }, { "epoch": 3.921489126428308, "grad_norm": 0.2468050405387071, "learning_rate": 1.1974330966684872e-05, "loss": 0.4399, "step": 15960 }, { "epoch": 3.9239464307654504, "grad_norm": 0.2508375115439494, "learning_rate": 1.1947023484434736e-05, "loss": 0.4681, "step": 15970 }, { "epoch": 3.9264037351025927, "grad_norm": 0.24634544341868955, "learning_rate": 1.1919716002184599e-05, "loss": 0.448, "step": 15980 }, { "epoch": 3.9288610394397345, "grad_norm": 0.2051591516298991, "learning_rate": 1.1892408519934463e-05, "loss": 0.456, "step": 15990 }, { "epoch": 3.931318343776877, "grad_norm": 0.22103494461986323, "learning_rate": 1.1865101037684325e-05, "loss": 0.4376, "step": 16000 }, { "epoch": 3.933775648114019, "grad_norm": 0.2814070229661432, "learning_rate": 1.183779355543419e-05, "loss": 0.4409, "step": 16010 }, { "epoch": 3.936232952451161, "grad_norm": 0.21152375447371732, "learning_rate": 1.1810486073184053e-05, "loss": 0.4448, "step": 16020 }, { "epoch": 3.9386902567883033, "grad_norm": 0.21348059911482625, "learning_rate": 1.1783178590933915e-05, "loss": 0.4607, "step": 16030 }, { "epoch": 3.9411475611254456, "grad_norm": 0.2146507080604754, "learning_rate": 1.1755871108683781e-05, "loss": 0.4492, "step": 16040 }, { "epoch": 3.9436048654625875, "grad_norm": 0.28418692397271467, "learning_rate": 1.1728563626433643e-05, "loss": 0.4538, "step": 16050 }, { "epoch": 3.94606216979973, "grad_norm": 0.23118677516420738, "learning_rate": 1.1701256144183507e-05, "loss": 0.4494, "step": 16060 }, { "epoch": 3.948519474136872, "grad_norm": 0.24280875605952726, "learning_rate": 1.167394866193337e-05, "loss": 0.4392, "step": 16070 }, { "epoch": 3.950976778474014, "grad_norm": 0.21393697583976115, "learning_rate": 1.1646641179683234e-05, "loss": 0.4811, "step": 16080 }, { "epoch": 3.9534340828111563, "grad_norm": 0.23679767208484345, "learning_rate": 1.1619333697433098e-05, "loss": 0.4593, "step": 16090 }, { "epoch": 3.9558913871482986, "grad_norm": 0.23719554154857894, "learning_rate": 1.159202621518296e-05, "loss": 0.454, "step": 16100 }, { "epoch": 3.9583486914854404, "grad_norm": 0.2250682479111969, "learning_rate": 1.1564718732932824e-05, "loss": 0.4589, "step": 16110 }, { "epoch": 3.9608059958225827, "grad_norm": 0.2432582817044515, "learning_rate": 1.1537411250682688e-05, "loss": 0.4526, "step": 16120 }, { "epoch": 3.963263300159725, "grad_norm": 0.23774597035843228, "learning_rate": 1.151010376843255e-05, "loss": 0.4442, "step": 16130 }, { "epoch": 3.965720604496867, "grad_norm": 0.2725146583862779, "learning_rate": 1.1482796286182414e-05, "loss": 0.458, "step": 16140 }, { "epoch": 3.9681779088340092, "grad_norm": 0.2232892350572878, "learning_rate": 1.1455488803932278e-05, "loss": 0.4591, "step": 16150 }, { "epoch": 3.970635213171151, "grad_norm": 0.22590687882909433, "learning_rate": 1.142818132168214e-05, "loss": 0.4539, "step": 16160 }, { "epoch": 3.9730925175082934, "grad_norm": 0.22509888831259067, "learning_rate": 1.1400873839432005e-05, "loss": 0.4541, "step": 16170 }, { "epoch": 3.9755498218454357, "grad_norm": 0.2506404546189212, "learning_rate": 1.1373566357181869e-05, "loss": 0.4598, "step": 16180 }, { "epoch": 3.9780071261825776, "grad_norm": 0.2696332120432258, "learning_rate": 1.1346258874931733e-05, "loss": 0.4481, "step": 16190 }, { "epoch": 3.98046443051972, "grad_norm": 0.25381715836962404, "learning_rate": 1.1318951392681595e-05, "loss": 0.4467, "step": 16200 }, { "epoch": 3.982921734856862, "grad_norm": 0.21866440613544838, "learning_rate": 1.1291643910431459e-05, "loss": 0.448, "step": 16210 }, { "epoch": 3.985379039194004, "grad_norm": 0.25113035340628487, "learning_rate": 1.1264336428181323e-05, "loss": 0.4456, "step": 16220 }, { "epoch": 3.9878363435311464, "grad_norm": 0.19871988307158275, "learning_rate": 1.1237028945931185e-05, "loss": 0.4592, "step": 16230 }, { "epoch": 3.990293647868288, "grad_norm": 0.25455314288832315, "learning_rate": 1.120972146368105e-05, "loss": 0.4698, "step": 16240 }, { "epoch": 3.9927509522054305, "grad_norm": 0.2771347904504137, "learning_rate": 1.1182413981430913e-05, "loss": 0.4413, "step": 16250 }, { "epoch": 3.995208256542573, "grad_norm": 0.23898949196321978, "learning_rate": 1.1155106499180776e-05, "loss": 0.463, "step": 16260 }, { "epoch": 3.9976655608797147, "grad_norm": 0.2603589015612899, "learning_rate": 1.112779901693064e-05, "loss": 0.4517, "step": 16270 }, { "epoch": 4.0, "grad_norm": 0.2553658411280956, "learning_rate": 1.1100491534680504e-05, "loss": 0.4457, "step": 16280 }, { "epoch": 4.002457304337142, "grad_norm": 0.22572774830755557, "learning_rate": 1.1073184052430366e-05, "loss": 0.4453, "step": 16290 }, { "epoch": 4.004914608674285, "grad_norm": 0.27154322585721313, "learning_rate": 1.104587657018023e-05, "loss": 0.4528, "step": 16300 }, { "epoch": 4.0073719130114265, "grad_norm": 0.2384451193234366, "learning_rate": 1.1018569087930094e-05, "loss": 0.4506, "step": 16310 }, { "epoch": 4.009829217348568, "grad_norm": 0.2805812358821854, "learning_rate": 1.0991261605679958e-05, "loss": 0.4434, "step": 16320 }, { "epoch": 4.012286521685711, "grad_norm": 0.24568688666516497, "learning_rate": 1.096395412342982e-05, "loss": 0.4605, "step": 16330 }, { "epoch": 4.014743826022853, "grad_norm": 0.24735307964102315, "learning_rate": 1.0936646641179683e-05, "loss": 0.4632, "step": 16340 }, { "epoch": 4.017201130359995, "grad_norm": 0.23665024450407332, "learning_rate": 1.0909339158929548e-05, "loss": 0.4504, "step": 16350 }, { "epoch": 4.019658434697138, "grad_norm": 0.21283359473044855, "learning_rate": 1.088203167667941e-05, "loss": 0.4529, "step": 16360 }, { "epoch": 4.022115739034279, "grad_norm": 0.26301344355291906, "learning_rate": 1.0854724194429273e-05, "loss": 0.4563, "step": 16370 }, { "epoch": 4.024573043371421, "grad_norm": 0.22726471358655417, "learning_rate": 1.0827416712179139e-05, "loss": 0.4542, "step": 16380 }, { "epoch": 4.027030347708564, "grad_norm": 0.2394495923025657, "learning_rate": 1.0800109229929e-05, "loss": 0.4481, "step": 16390 }, { "epoch": 4.029487652045706, "grad_norm": 0.23369109919422804, "learning_rate": 1.0772801747678865e-05, "loss": 0.4626, "step": 16400 }, { "epoch": 4.031944956382848, "grad_norm": 0.2340315629872485, "learning_rate": 1.0745494265428729e-05, "loss": 0.4429, "step": 16410 }, { "epoch": 4.0344022607199905, "grad_norm": 0.21963630888717905, "learning_rate": 1.0718186783178591e-05, "loss": 0.4643, "step": 16420 }, { "epoch": 4.036859565057132, "grad_norm": 0.22302984800798606, "learning_rate": 1.0690879300928455e-05, "loss": 0.4359, "step": 16430 }, { "epoch": 4.039316869394274, "grad_norm": 0.25440512146056654, "learning_rate": 1.0663571818678317e-05, "loss": 0.4495, "step": 16440 }, { "epoch": 4.041774173731417, "grad_norm": 0.24082825368882035, "learning_rate": 1.0636264336428183e-05, "loss": 0.455, "step": 16450 }, { "epoch": 4.044231478068559, "grad_norm": 0.27070566149456926, "learning_rate": 1.0608956854178045e-05, "loss": 0.4337, "step": 16460 }, { "epoch": 4.046688782405701, "grad_norm": 0.262277183001365, "learning_rate": 1.0581649371927908e-05, "loss": 0.4591, "step": 16470 }, { "epoch": 4.0491460867428435, "grad_norm": 0.22043094748055544, "learning_rate": 1.0554341889677773e-05, "loss": 0.4524, "step": 16480 }, { "epoch": 4.051603391079985, "grad_norm": 0.2866812465109704, "learning_rate": 1.0527034407427636e-05, "loss": 0.4426, "step": 16490 }, { "epoch": 4.054060695417127, "grad_norm": 0.26445549018486963, "learning_rate": 1.0499726925177498e-05, "loss": 0.4434, "step": 16500 }, { "epoch": 4.05651799975427, "grad_norm": 0.23343215697162384, "learning_rate": 1.0472419442927362e-05, "loss": 0.4488, "step": 16510 }, { "epoch": 4.058975304091412, "grad_norm": 0.22510665067559674, "learning_rate": 1.0445111960677226e-05, "loss": 0.4545, "step": 16520 }, { "epoch": 4.061432608428554, "grad_norm": 0.25543564640673294, "learning_rate": 1.041780447842709e-05, "loss": 0.4622, "step": 16530 }, { "epoch": 4.063889912765696, "grad_norm": 0.23645438963556337, "learning_rate": 1.0390496996176952e-05, "loss": 0.4511, "step": 16540 }, { "epoch": 4.066347217102838, "grad_norm": 0.25156431839684235, "learning_rate": 1.0363189513926816e-05, "loss": 0.4518, "step": 16550 }, { "epoch": 4.06880452143998, "grad_norm": 0.24639346190003597, "learning_rate": 1.033588203167668e-05, "loss": 0.4365, "step": 16560 }, { "epoch": 4.071261825777123, "grad_norm": 0.2414167483216541, "learning_rate": 1.0308574549426543e-05, "loss": 0.4398, "step": 16570 }, { "epoch": 4.073719130114265, "grad_norm": 0.2047195467697924, "learning_rate": 1.0281267067176407e-05, "loss": 0.4492, "step": 16580 }, { "epoch": 4.076176434451407, "grad_norm": 0.23123148337155322, "learning_rate": 1.025395958492627e-05, "loss": 0.4525, "step": 16590 }, { "epoch": 4.0786337387885485, "grad_norm": 0.26782450294021737, "learning_rate": 1.0226652102676133e-05, "loss": 0.4362, "step": 16600 }, { "epoch": 4.081091043125691, "grad_norm": 0.22983639335201364, "learning_rate": 1.0199344620425997e-05, "loss": 0.4549, "step": 16610 }, { "epoch": 4.083548347462833, "grad_norm": 0.23875563397936758, "learning_rate": 1.0172037138175861e-05, "loss": 0.4516, "step": 16620 }, { "epoch": 4.086005651799975, "grad_norm": 0.21296148258028416, "learning_rate": 1.0144729655925723e-05, "loss": 0.4522, "step": 16630 }, { "epoch": 4.088462956137118, "grad_norm": 0.20452982007612538, "learning_rate": 1.0117422173675587e-05, "loss": 0.4584, "step": 16640 }, { "epoch": 4.09092026047426, "grad_norm": 0.26710402229191377, "learning_rate": 1.0090114691425451e-05, "loss": 0.4761, "step": 16650 }, { "epoch": 4.0933775648114015, "grad_norm": 0.23569830431910085, "learning_rate": 1.0062807209175315e-05, "loss": 0.4412, "step": 16660 }, { "epoch": 4.095834869148544, "grad_norm": 0.25223690170781937, "learning_rate": 1.0035499726925178e-05, "loss": 0.4497, "step": 16670 }, { "epoch": 4.098292173485686, "grad_norm": 0.28525637992103636, "learning_rate": 1.0008192244675042e-05, "loss": 0.4476, "step": 16680 }, { "epoch": 4.100749477822828, "grad_norm": 0.23646756046315426, "learning_rate": 9.980884762424906e-06, "loss": 0.4637, "step": 16690 }, { "epoch": 4.103206782159971, "grad_norm": 0.2701762002956552, "learning_rate": 9.953577280174768e-06, "loss": 0.4539, "step": 16700 }, { "epoch": 4.105664086497113, "grad_norm": 0.2177465875544762, "learning_rate": 9.926269797924632e-06, "loss": 0.4719, "step": 16710 }, { "epoch": 4.108121390834254, "grad_norm": 0.21291607372149068, "learning_rate": 9.898962315674496e-06, "loss": 0.4491, "step": 16720 }, { "epoch": 4.110578695171397, "grad_norm": 0.26533989541175956, "learning_rate": 9.871654833424358e-06, "loss": 0.4488, "step": 16730 }, { "epoch": 4.113035999508539, "grad_norm": 0.2371213563883778, "learning_rate": 9.844347351174222e-06, "loss": 0.4466, "step": 16740 }, { "epoch": 4.115493303845681, "grad_norm": 0.24840158280142077, "learning_rate": 9.817039868924086e-06, "loss": 0.4522, "step": 16750 }, { "epoch": 4.117950608182824, "grad_norm": 0.2579005635723337, "learning_rate": 9.789732386673949e-06, "loss": 0.4574, "step": 16760 }, { "epoch": 4.1204079125199655, "grad_norm": 0.2389096967466241, "learning_rate": 9.762424904423813e-06, "loss": 0.4692, "step": 16770 }, { "epoch": 4.122865216857107, "grad_norm": 0.272868875943755, "learning_rate": 9.735117422173677e-06, "loss": 0.4447, "step": 16780 }, { "epoch": 4.12532252119425, "grad_norm": 0.24532950197971123, "learning_rate": 9.70780993992354e-06, "loss": 0.44, "step": 16790 }, { "epoch": 4.127779825531392, "grad_norm": 0.22255088029164297, "learning_rate": 9.680502457673403e-06, "loss": 0.4544, "step": 16800 }, { "epoch": 4.130237129868534, "grad_norm": 0.2621679359592567, "learning_rate": 9.653194975423265e-06, "loss": 0.4532, "step": 16810 }, { "epoch": 4.132694434205677, "grad_norm": 0.2246712031559465, "learning_rate": 9.625887493173131e-06, "loss": 0.4607, "step": 16820 }, { "epoch": 4.1351517385428185, "grad_norm": 0.22557924310411132, "learning_rate": 9.598580010922993e-06, "loss": 0.4633, "step": 16830 }, { "epoch": 4.13760904287996, "grad_norm": 0.2526735964274597, "learning_rate": 9.571272528672857e-06, "loss": 0.4603, "step": 16840 }, { "epoch": 4.140066347217103, "grad_norm": 0.2434703315308289, "learning_rate": 9.543965046422721e-06, "loss": 0.4475, "step": 16850 }, { "epoch": 4.142523651554245, "grad_norm": 0.29430220555113074, "learning_rate": 9.516657564172583e-06, "loss": 0.454, "step": 16860 }, { "epoch": 4.144980955891387, "grad_norm": 0.23631262980662573, "learning_rate": 9.489350081922447e-06, "loss": 0.4361, "step": 16870 }, { "epoch": 4.14743826022853, "grad_norm": 0.2503516455185589, "learning_rate": 9.46204259967231e-06, "loss": 0.4623, "step": 16880 }, { "epoch": 4.149895564565671, "grad_norm": 0.22757486448315656, "learning_rate": 9.434735117422174e-06, "loss": 0.459, "step": 16890 }, { "epoch": 4.152352868902813, "grad_norm": 0.25041638039271363, "learning_rate": 9.407427635172038e-06, "loss": 0.4462, "step": 16900 }, { "epoch": 4.154810173239956, "grad_norm": 0.2924399006214983, "learning_rate": 9.3801201529219e-06, "loss": 0.4557, "step": 16910 }, { "epoch": 4.157267477577098, "grad_norm": 0.315391859281466, "learning_rate": 9.352812670671766e-06, "loss": 0.4544, "step": 16920 }, { "epoch": 4.15972478191424, "grad_norm": 0.22616811339065143, "learning_rate": 9.325505188421628e-06, "loss": 0.4519, "step": 16930 }, { "epoch": 4.1621820862513825, "grad_norm": 0.23503370923213834, "learning_rate": 9.29819770617149e-06, "loss": 0.4709, "step": 16940 }, { "epoch": 4.164639390588524, "grad_norm": 0.259641520561869, "learning_rate": 9.270890223921354e-06, "loss": 0.446, "step": 16950 }, { "epoch": 4.167096694925666, "grad_norm": 0.2185408869532425, "learning_rate": 9.243582741671218e-06, "loss": 0.4727, "step": 16960 }, { "epoch": 4.169553999262809, "grad_norm": 0.2594632262054857, "learning_rate": 9.216275259421082e-06, "loss": 0.458, "step": 16970 }, { "epoch": 4.172011303599951, "grad_norm": 0.2556928320217053, "learning_rate": 9.188967777170945e-06, "loss": 0.4452, "step": 16980 }, { "epoch": 4.174468607937093, "grad_norm": 0.26237759326343124, "learning_rate": 9.161660294920809e-06, "loss": 0.4343, "step": 16990 }, { "epoch": 4.1769259122742355, "grad_norm": 0.24155283510264963, "learning_rate": 9.134352812670673e-06, "loss": 0.4429, "step": 17000 }, { "epoch": 4.179383216611377, "grad_norm": 0.2314347160543151, "learning_rate": 9.107045330420535e-06, "loss": 0.4484, "step": 17010 }, { "epoch": 4.181840520948519, "grad_norm": 0.25023800379955596, "learning_rate": 9.079737848170399e-06, "loss": 0.4498, "step": 17020 }, { "epoch": 4.184297825285662, "grad_norm": 0.2592538553462896, "learning_rate": 9.052430365920263e-06, "loss": 0.453, "step": 17030 }, { "epoch": 4.186755129622804, "grad_norm": 0.24082778599147092, "learning_rate": 9.025122883670125e-06, "loss": 0.4706, "step": 17040 }, { "epoch": 4.189212433959946, "grad_norm": 0.2350458918800031, "learning_rate": 8.99781540141999e-06, "loss": 0.4602, "step": 17050 }, { "epoch": 4.191669738297088, "grad_norm": 0.22355640518351555, "learning_rate": 8.970507919169853e-06, "loss": 0.4591, "step": 17060 }, { "epoch": 4.19412704263423, "grad_norm": 0.2644735993586744, "learning_rate": 8.943200436919716e-06, "loss": 0.4416, "step": 17070 }, { "epoch": 4.196584346971372, "grad_norm": 0.2648319364023754, "learning_rate": 8.91589295466958e-06, "loss": 0.4654, "step": 17080 }, { "epoch": 4.199041651308515, "grad_norm": 0.2513056280192471, "learning_rate": 8.888585472419444e-06, "loss": 0.4578, "step": 17090 }, { "epoch": 4.201498955645657, "grad_norm": 0.21001232510015086, "learning_rate": 8.861277990169308e-06, "loss": 0.4421, "step": 17100 }, { "epoch": 4.203956259982799, "grad_norm": 0.2772461876313325, "learning_rate": 8.83397050791917e-06, "loss": 0.4623, "step": 17110 }, { "epoch": 4.206413564319941, "grad_norm": 0.25079961259038247, "learning_rate": 8.806663025669034e-06, "loss": 0.4527, "step": 17120 }, { "epoch": 4.208870868657083, "grad_norm": 0.22865406475031222, "learning_rate": 8.779355543418898e-06, "loss": 0.4563, "step": 17130 }, { "epoch": 4.211328172994225, "grad_norm": 0.2410711463315314, "learning_rate": 8.75204806116876e-06, "loss": 0.4479, "step": 17140 }, { "epoch": 4.213785477331368, "grad_norm": 0.24832785152818102, "learning_rate": 8.724740578918624e-06, "loss": 0.4693, "step": 17150 }, { "epoch": 4.21624278166851, "grad_norm": 0.23387958386920898, "learning_rate": 8.697433096668488e-06, "loss": 0.4454, "step": 17160 }, { "epoch": 4.218700086005652, "grad_norm": 0.23804199582397795, "learning_rate": 8.67012561441835e-06, "loss": 0.4574, "step": 17170 }, { "epoch": 4.221157390342794, "grad_norm": 0.2851295543256884, "learning_rate": 8.642818132168215e-06, "loss": 0.4407, "step": 17180 }, { "epoch": 4.223614694679936, "grad_norm": 0.22738574809119458, "learning_rate": 8.615510649918079e-06, "loss": 0.4399, "step": 17190 }, { "epoch": 4.226071999017078, "grad_norm": 0.28972633107234314, "learning_rate": 8.588203167667941e-06, "loss": 0.4539, "step": 17200 }, { "epoch": 4.228529303354221, "grad_norm": 0.22789756524568994, "learning_rate": 8.560895685417805e-06, "loss": 0.4319, "step": 17210 }, { "epoch": 4.230986607691363, "grad_norm": 0.29410672452641845, "learning_rate": 8.533588203167669e-06, "loss": 0.4481, "step": 17220 }, { "epoch": 4.233443912028505, "grad_norm": 0.26298238508456406, "learning_rate": 8.506280720917533e-06, "loss": 0.4369, "step": 17230 }, { "epoch": 4.235901216365647, "grad_norm": 0.303389632108738, "learning_rate": 8.478973238667395e-06, "loss": 0.4552, "step": 17240 }, { "epoch": 4.238358520702789, "grad_norm": 0.22171502027227838, "learning_rate": 8.451665756417258e-06, "loss": 0.4483, "step": 17250 }, { "epoch": 4.240815825039931, "grad_norm": 0.28202576408747043, "learning_rate": 8.424358274167123e-06, "loss": 0.4516, "step": 17260 }, { "epoch": 4.243273129377073, "grad_norm": 0.2817596306305276, "learning_rate": 8.397050791916986e-06, "loss": 0.4737, "step": 17270 }, { "epoch": 4.245730433714216, "grad_norm": 0.25857958502843775, "learning_rate": 8.369743309666848e-06, "loss": 0.4453, "step": 17280 }, { "epoch": 4.2481877380513575, "grad_norm": 0.22868363142074052, "learning_rate": 8.342435827416714e-06, "loss": 0.4511, "step": 17290 }, { "epoch": 4.2506450423885, "grad_norm": 0.26256367722700785, "learning_rate": 8.315128345166576e-06, "loss": 0.4437, "step": 17300 }, { "epoch": 4.253102346725642, "grad_norm": 0.23172170537482445, "learning_rate": 8.28782086291644e-06, "loss": 0.4538, "step": 17310 }, { "epoch": 4.255559651062784, "grad_norm": 0.2617798120862009, "learning_rate": 8.260513380666302e-06, "loss": 0.4369, "step": 17320 }, { "epoch": 4.258016955399926, "grad_norm": 0.2820714014805312, "learning_rate": 8.233205898416166e-06, "loss": 0.4559, "step": 17330 }, { "epoch": 4.260474259737069, "grad_norm": 0.20811179720686507, "learning_rate": 8.20589841616603e-06, "loss": 0.4405, "step": 17340 }, { "epoch": 4.2629315640742105, "grad_norm": 0.21213051432653518, "learning_rate": 8.178590933915892e-06, "loss": 0.4589, "step": 17350 }, { "epoch": 4.265388868411352, "grad_norm": 0.26228753419281187, "learning_rate": 8.151283451665758e-06, "loss": 0.4668, "step": 17360 }, { "epoch": 4.267846172748495, "grad_norm": 0.21684894123165716, "learning_rate": 8.12397596941562e-06, "loss": 0.4512, "step": 17370 }, { "epoch": 4.270303477085637, "grad_norm": 0.2763258976661735, "learning_rate": 8.096668487165483e-06, "loss": 0.4353, "step": 17380 }, { "epoch": 4.272760781422779, "grad_norm": 0.2647333701191584, "learning_rate": 8.069361004915347e-06, "loss": 0.4537, "step": 17390 }, { "epoch": 4.275218085759922, "grad_norm": 0.2516468790320715, "learning_rate": 8.04205352266521e-06, "loss": 0.4375, "step": 17400 }, { "epoch": 4.277675390097063, "grad_norm": 0.24920899176798603, "learning_rate": 8.014746040415073e-06, "loss": 0.4532, "step": 17410 }, { "epoch": 4.280132694434205, "grad_norm": 0.2703272354481981, "learning_rate": 7.987438558164937e-06, "loss": 0.45, "step": 17420 }, { "epoch": 4.282589998771348, "grad_norm": 0.2311347131271007, "learning_rate": 7.960131075914801e-06, "loss": 0.4453, "step": 17430 }, { "epoch": 4.28504730310849, "grad_norm": 0.24228862693048772, "learning_rate": 7.932823593664665e-06, "loss": 0.4631, "step": 17440 }, { "epoch": 4.287504607445632, "grad_norm": 0.2338460818147281, "learning_rate": 7.905516111414527e-06, "loss": 0.4332, "step": 17450 }, { "epoch": 4.2899619117827745, "grad_norm": 0.2975235285550145, "learning_rate": 7.878208629164391e-06, "loss": 0.4577, "step": 17460 }, { "epoch": 4.292419216119916, "grad_norm": 0.23926514171270782, "learning_rate": 7.850901146914255e-06, "loss": 0.4506, "step": 17470 }, { "epoch": 4.294876520457058, "grad_norm": 0.2352272738817256, "learning_rate": 7.823593664664118e-06, "loss": 0.4456, "step": 17480 }, { "epoch": 4.297333824794201, "grad_norm": 0.25590808267976456, "learning_rate": 7.796286182413982e-06, "loss": 0.4405, "step": 17490 }, { "epoch": 4.299791129131343, "grad_norm": 0.22297210642580087, "learning_rate": 7.768978700163846e-06, "loss": 0.4406, "step": 17500 }, { "epoch": 4.302248433468485, "grad_norm": 0.2800720306051109, "learning_rate": 7.741671217913708e-06, "loss": 0.4598, "step": 17510 }, { "epoch": 4.3047057378056275, "grad_norm": 0.21066708720368524, "learning_rate": 7.714363735663572e-06, "loss": 0.4551, "step": 17520 }, { "epoch": 4.307163042142769, "grad_norm": 0.220516838302953, "learning_rate": 7.687056253413436e-06, "loss": 0.4437, "step": 17530 }, { "epoch": 4.309620346479911, "grad_norm": 0.22966823298255384, "learning_rate": 7.659748771163298e-06, "loss": 0.4368, "step": 17540 }, { "epoch": 4.312077650817054, "grad_norm": 0.24063189799354912, "learning_rate": 7.632441288913162e-06, "loss": 0.4469, "step": 17550 }, { "epoch": 4.314534955154196, "grad_norm": 0.25759032840899243, "learning_rate": 7.6051338066630255e-06, "loss": 0.46, "step": 17560 }, { "epoch": 4.316992259491338, "grad_norm": 0.28996652231905024, "learning_rate": 7.57782632441289e-06, "loss": 0.4652, "step": 17570 }, { "epoch": 4.31944956382848, "grad_norm": 0.2457204073171015, "learning_rate": 7.550518842162753e-06, "loss": 0.4831, "step": 17580 }, { "epoch": 4.321906868165622, "grad_norm": 0.2324877162515334, "learning_rate": 7.523211359912616e-06, "loss": 0.4542, "step": 17590 }, { "epoch": 4.324364172502764, "grad_norm": 0.23944395591367945, "learning_rate": 7.49590387766248e-06, "loss": 0.4548, "step": 17600 }, { "epoch": 4.326821476839907, "grad_norm": 0.2730181025579195, "learning_rate": 7.468596395412343e-06, "loss": 0.4564, "step": 17610 }, { "epoch": 4.329278781177049, "grad_norm": 0.21830656867114592, "learning_rate": 7.441288913162208e-06, "loss": 0.4657, "step": 17620 }, { "epoch": 4.331736085514191, "grad_norm": 0.23546935970603225, "learning_rate": 7.41398143091207e-06, "loss": 0.4509, "step": 17630 }, { "epoch": 4.334193389851333, "grad_norm": 0.22976050535910808, "learning_rate": 7.386673948661933e-06, "loss": 0.4425, "step": 17640 }, { "epoch": 4.336650694188475, "grad_norm": 0.27405233318049543, "learning_rate": 7.359366466411797e-06, "loss": 0.4402, "step": 17650 }, { "epoch": 4.339107998525617, "grad_norm": 0.2419985740478002, "learning_rate": 7.33205898416166e-06, "loss": 0.4156, "step": 17660 }, { "epoch": 4.34156530286276, "grad_norm": 0.27197971999364723, "learning_rate": 7.304751501911524e-06, "loss": 0.4492, "step": 17670 }, { "epoch": 4.344022607199902, "grad_norm": 0.2546144253566118, "learning_rate": 7.2774440196613876e-06, "loss": 0.447, "step": 17680 }, { "epoch": 4.346479911537044, "grad_norm": 0.2996228127135411, "learning_rate": 7.250136537411251e-06, "loss": 0.4468, "step": 17690 }, { "epoch": 4.348937215874186, "grad_norm": 0.2941829020267576, "learning_rate": 7.222829055161115e-06, "loss": 0.4493, "step": 17700 }, { "epoch": 4.351394520211328, "grad_norm": 0.2657782951875237, "learning_rate": 7.195521572910978e-06, "loss": 0.4617, "step": 17710 }, { "epoch": 4.35385182454847, "grad_norm": 0.25564323890172974, "learning_rate": 7.168214090660841e-06, "loss": 0.4532, "step": 17720 }, { "epoch": 4.356309128885613, "grad_norm": 0.2692335269089997, "learning_rate": 7.140906608410705e-06, "loss": 0.4391, "step": 17730 }, { "epoch": 4.358766433222755, "grad_norm": 0.2833835619156608, "learning_rate": 7.113599126160568e-06, "loss": 0.427, "step": 17740 }, { "epoch": 4.361223737559897, "grad_norm": 0.25267835468987165, "learning_rate": 7.086291643910432e-06, "loss": 0.4431, "step": 17750 }, { "epoch": 4.363681041897039, "grad_norm": 0.2630377030196379, "learning_rate": 7.058984161660295e-06, "loss": 0.4478, "step": 17760 }, { "epoch": 4.366138346234181, "grad_norm": 0.21670460926724966, "learning_rate": 7.0316766794101585e-06, "loss": 0.4629, "step": 17770 }, { "epoch": 4.368595650571323, "grad_norm": 0.222571607999932, "learning_rate": 7.0043691971600225e-06, "loss": 0.4533, "step": 17780 }, { "epoch": 4.371052954908466, "grad_norm": 0.24234173041761392, "learning_rate": 6.977061714909886e-06, "loss": 0.4372, "step": 17790 }, { "epoch": 4.373510259245608, "grad_norm": 0.22210606901330066, "learning_rate": 6.94975423265975e-06, "loss": 0.4623, "step": 17800 }, { "epoch": 4.3759675635827495, "grad_norm": 0.282005039572637, "learning_rate": 6.922446750409613e-06, "loss": 0.4416, "step": 17810 }, { "epoch": 4.378424867919892, "grad_norm": 0.2575219012184894, "learning_rate": 6.895139268159476e-06, "loss": 0.4423, "step": 17820 }, { "epoch": 4.380882172257034, "grad_norm": 0.2752618700773304, "learning_rate": 6.86783178590934e-06, "loss": 0.4374, "step": 17830 }, { "epoch": 4.383339476594176, "grad_norm": 0.2310113160182981, "learning_rate": 6.840524303659203e-06, "loss": 0.4441, "step": 17840 }, { "epoch": 4.385796780931319, "grad_norm": 0.2376150643633886, "learning_rate": 6.8132168214090654e-06, "loss": 0.4583, "step": 17850 }, { "epoch": 4.388254085268461, "grad_norm": 0.22939670594340586, "learning_rate": 6.78590933915893e-06, "loss": 0.4512, "step": 17860 }, { "epoch": 4.3907113896056025, "grad_norm": 0.24115738550724714, "learning_rate": 6.7586018569087934e-06, "loss": 0.4591, "step": 17870 }, { "epoch": 4.393168693942744, "grad_norm": 0.2374091762247028, "learning_rate": 6.7312943746586574e-06, "loss": 0.4261, "step": 17880 }, { "epoch": 4.395625998279887, "grad_norm": 0.2684868371977687, "learning_rate": 6.703986892408521e-06, "loss": 0.4473, "step": 17890 }, { "epoch": 4.398083302617029, "grad_norm": 0.24011924051301423, "learning_rate": 6.676679410158383e-06, "loss": 0.4509, "step": 17900 }, { "epoch": 4.400540606954172, "grad_norm": 0.269928652268437, "learning_rate": 6.649371927908248e-06, "loss": 0.4544, "step": 17910 }, { "epoch": 4.402997911291314, "grad_norm": 0.24280068126182722, "learning_rate": 6.62206444565811e-06, "loss": 0.4413, "step": 17920 }, { "epoch": 4.405455215628455, "grad_norm": 0.217054888158821, "learning_rate": 6.594756963407975e-06, "loss": 0.4605, "step": 17930 }, { "epoch": 4.407912519965597, "grad_norm": 0.2505128150732249, "learning_rate": 6.567449481157838e-06, "loss": 0.4463, "step": 17940 }, { "epoch": 4.41036982430274, "grad_norm": 0.27520155698241855, "learning_rate": 6.5401419989077e-06, "loss": 0.4528, "step": 17950 }, { "epoch": 4.412827128639882, "grad_norm": 0.20222686630751943, "learning_rate": 6.512834516657565e-06, "loss": 0.4506, "step": 17960 }, { "epoch": 4.415284432977025, "grad_norm": 0.2757530617114866, "learning_rate": 6.4855270344074275e-06, "loss": 0.4416, "step": 17970 }, { "epoch": 4.4177417373141665, "grad_norm": 0.27308927471888966, "learning_rate": 6.458219552157291e-06, "loss": 0.4543, "step": 17980 }, { "epoch": 4.420199041651308, "grad_norm": 0.23629449163595298, "learning_rate": 6.4309120699071555e-06, "loss": 0.4475, "step": 17990 }, { "epoch": 4.42265634598845, "grad_norm": 0.21875534457801946, "learning_rate": 6.403604587657018e-06, "loss": 0.4413, "step": 18000 }, { "epoch": 4.425113650325593, "grad_norm": 0.22610892700077578, "learning_rate": 6.376297105406883e-06, "loss": 0.4477, "step": 18010 }, { "epoch": 4.427570954662735, "grad_norm": 0.2485104209025531, "learning_rate": 6.348989623156745e-06, "loss": 0.4644, "step": 18020 }, { "epoch": 4.430028258999877, "grad_norm": 0.3039073021498438, "learning_rate": 6.321682140906608e-06, "loss": 0.448, "step": 18030 }, { "epoch": 4.4324855633370195, "grad_norm": 0.25324584550453433, "learning_rate": 6.294374658656472e-06, "loss": 0.4544, "step": 18040 }, { "epoch": 4.434942867674161, "grad_norm": 0.21379069160371947, "learning_rate": 6.267067176406335e-06, "loss": 0.4347, "step": 18050 }, { "epoch": 4.437400172011303, "grad_norm": 0.22115790632590332, "learning_rate": 6.239759694156199e-06, "loss": 0.4613, "step": 18060 }, { "epoch": 4.439857476348446, "grad_norm": 0.2619146947456015, "learning_rate": 6.2124522119060625e-06, "loss": 0.4566, "step": 18070 }, { "epoch": 4.442314780685588, "grad_norm": 0.2307752818678924, "learning_rate": 6.1851447296559265e-06, "loss": 0.463, "step": 18080 }, { "epoch": 4.44477208502273, "grad_norm": 0.21547400193284338, "learning_rate": 6.15783724740579e-06, "loss": 0.4564, "step": 18090 }, { "epoch": 4.447229389359872, "grad_norm": 0.2238813597100371, "learning_rate": 6.130529765155653e-06, "loss": 0.4708, "step": 18100 }, { "epoch": 4.449686693697014, "grad_norm": 0.24168056093318888, "learning_rate": 6.103222282905517e-06, "loss": 0.4718, "step": 18110 }, { "epoch": 4.452143998034156, "grad_norm": 0.24998378795699466, "learning_rate": 6.07591480065538e-06, "loss": 0.4469, "step": 18120 }, { "epoch": 4.454601302371299, "grad_norm": 0.24262483093833676, "learning_rate": 6.048607318405243e-06, "loss": 0.4516, "step": 18130 }, { "epoch": 4.457058606708441, "grad_norm": 0.23706376320004255, "learning_rate": 6.021299836155106e-06, "loss": 0.4566, "step": 18140 }, { "epoch": 4.459515911045583, "grad_norm": 0.24185789106237968, "learning_rate": 5.99399235390497e-06, "loss": 0.439, "step": 18150 }, { "epoch": 4.461973215382725, "grad_norm": 0.22157397077735258, "learning_rate": 5.966684871654834e-06, "loss": 0.4576, "step": 18160 }, { "epoch": 4.464430519719867, "grad_norm": 0.2326409495906749, "learning_rate": 5.939377389404697e-06, "loss": 0.4384, "step": 18170 }, { "epoch": 4.466887824057009, "grad_norm": 0.24319495187946383, "learning_rate": 5.9120699071545605e-06, "loss": 0.4493, "step": 18180 }, { "epoch": 4.469345128394152, "grad_norm": 0.29572295237131924, "learning_rate": 5.884762424904424e-06, "loss": 0.4555, "step": 18190 }, { "epoch": 4.471802432731294, "grad_norm": 0.2315717104130603, "learning_rate": 5.857454942654288e-06, "loss": 0.4384, "step": 18200 }, { "epoch": 4.474259737068436, "grad_norm": 0.26847427183827205, "learning_rate": 5.830147460404152e-06, "loss": 0.4355, "step": 18210 }, { "epoch": 4.476717041405578, "grad_norm": 0.26588806374132495, "learning_rate": 5.802839978154014e-06, "loss": 0.4479, "step": 18220 }, { "epoch": 4.47917434574272, "grad_norm": 0.22669653484395658, "learning_rate": 5.775532495903878e-06, "loss": 0.4688, "step": 18230 }, { "epoch": 4.481631650079862, "grad_norm": 0.2812178522065026, "learning_rate": 5.748225013653741e-06, "loss": 0.4588, "step": 18240 }, { "epoch": 4.484088954417005, "grad_norm": 0.2520025479179895, "learning_rate": 5.720917531403605e-06, "loss": 0.4538, "step": 18250 }, { "epoch": 4.486546258754147, "grad_norm": 0.23768512835221248, "learning_rate": 5.693610049153468e-06, "loss": 0.4456, "step": 18260 }, { "epoch": 4.489003563091289, "grad_norm": 0.26847007744574347, "learning_rate": 5.6663025669033315e-06, "loss": 0.4644, "step": 18270 }, { "epoch": 4.491460867428431, "grad_norm": 0.24931475962596739, "learning_rate": 5.6389950846531955e-06, "loss": 0.4604, "step": 18280 }, { "epoch": 4.493918171765573, "grad_norm": 0.22685923886061518, "learning_rate": 5.611687602403059e-06, "loss": 0.4586, "step": 18290 }, { "epoch": 4.496375476102715, "grad_norm": 0.21047600500211652, "learning_rate": 5.584380120152923e-06, "loss": 0.4418, "step": 18300 }, { "epoch": 4.498832780439858, "grad_norm": 0.2816462833900746, "learning_rate": 5.557072637902786e-06, "loss": 0.4545, "step": 18310 }, { "epoch": 4.501290084777, "grad_norm": 0.2764031292038267, "learning_rate": 5.529765155652649e-06, "loss": 0.4419, "step": 18320 }, { "epoch": 4.5037473891141415, "grad_norm": 0.22277912175557005, "learning_rate": 5.502457673402513e-06, "loss": 0.4345, "step": 18330 }, { "epoch": 4.506204693451284, "grad_norm": 0.3142920540674117, "learning_rate": 5.475150191152376e-06, "loss": 0.4493, "step": 18340 }, { "epoch": 4.508661997788426, "grad_norm": 0.24574406250696776, "learning_rate": 5.447842708902239e-06, "loss": 0.432, "step": 18350 }, { "epoch": 4.511119302125568, "grad_norm": 0.2942072283624335, "learning_rate": 5.420535226652103e-06, "loss": 0.4448, "step": 18360 }, { "epoch": 4.513576606462711, "grad_norm": 0.22348890580324948, "learning_rate": 5.393227744401966e-06, "loss": 0.4651, "step": 18370 }, { "epoch": 4.516033910799853, "grad_norm": 0.2767325499717418, "learning_rate": 5.36592026215183e-06, "loss": 0.471, "step": 18380 }, { "epoch": 4.5184912151369945, "grad_norm": 0.2234636026495501, "learning_rate": 5.338612779901693e-06, "loss": 0.4466, "step": 18390 }, { "epoch": 4.520948519474137, "grad_norm": 0.22673985784533895, "learning_rate": 5.311305297651557e-06, "loss": 0.4535, "step": 18400 }, { "epoch": 4.523405823811279, "grad_norm": 0.24066651331290892, "learning_rate": 5.28399781540142e-06, "loss": 0.4458, "step": 18410 }, { "epoch": 4.525863128148421, "grad_norm": 0.22165124374349943, "learning_rate": 5.256690333151284e-06, "loss": 0.4689, "step": 18420 }, { "epoch": 4.528320432485564, "grad_norm": 0.22415597168711676, "learning_rate": 5.229382850901148e-06, "loss": 0.4575, "step": 18430 }, { "epoch": 4.530777736822706, "grad_norm": 0.22419352638069617, "learning_rate": 5.20207536865101e-06, "loss": 0.4464, "step": 18440 }, { "epoch": 4.533235041159847, "grad_norm": 0.237172513047808, "learning_rate": 5.174767886400874e-06, "loss": 0.4482, "step": 18450 }, { "epoch": 4.53569234549699, "grad_norm": 0.2434521205330227, "learning_rate": 5.147460404150737e-06, "loss": 0.4401, "step": 18460 }, { "epoch": 4.538149649834132, "grad_norm": 0.2617903933483766, "learning_rate": 5.120152921900601e-06, "loss": 0.4655, "step": 18470 }, { "epoch": 4.540606954171274, "grad_norm": 0.24797971976003697, "learning_rate": 5.0928454396504645e-06, "loss": 0.4532, "step": 18480 }, { "epoch": 4.543064258508416, "grad_norm": 0.2492021225010885, "learning_rate": 5.065537957400328e-06, "loss": 0.4629, "step": 18490 }, { "epoch": 4.5455215628455585, "grad_norm": 0.2366914102446359, "learning_rate": 5.038230475150192e-06, "loss": 0.4544, "step": 18500 }, { "epoch": 4.5479788671827, "grad_norm": 0.2529023702356229, "learning_rate": 5.010922992900055e-06, "loss": 0.4576, "step": 18510 }, { "epoch": 4.550436171519843, "grad_norm": 0.238688132494116, "learning_rate": 4.983615510649918e-06, "loss": 0.4481, "step": 18520 }, { "epoch": 4.552893475856985, "grad_norm": 0.23579478221731465, "learning_rate": 4.956308028399782e-06, "loss": 0.4589, "step": 18530 }, { "epoch": 4.555350780194127, "grad_norm": 0.30245992919513076, "learning_rate": 4.929000546149645e-06, "loss": 0.4389, "step": 18540 }, { "epoch": 4.557808084531269, "grad_norm": 0.2698958339879691, "learning_rate": 4.901693063899509e-06, "loss": 0.4624, "step": 18550 }, { "epoch": 4.5602653888684115, "grad_norm": 0.289808380894861, "learning_rate": 4.874385581649372e-06, "loss": 0.4794, "step": 18560 }, { "epoch": 4.562722693205553, "grad_norm": 0.25637154399528916, "learning_rate": 4.8470780993992354e-06, "loss": 0.4519, "step": 18570 }, { "epoch": 4.565179997542696, "grad_norm": 0.25465357332979077, "learning_rate": 4.8197706171490994e-06, "loss": 0.4464, "step": 18580 }, { "epoch": 4.567637301879838, "grad_norm": 0.2397316153422739, "learning_rate": 4.792463134898963e-06, "loss": 0.448, "step": 18590 }, { "epoch": 4.57009460621698, "grad_norm": 0.25227254181008546, "learning_rate": 4.765155652648827e-06, "loss": 0.4505, "step": 18600 }, { "epoch": 4.572551910554122, "grad_norm": 0.22192517019829197, "learning_rate": 4.737848170398689e-06, "loss": 0.4599, "step": 18610 }, { "epoch": 4.575009214891264, "grad_norm": 0.21442126116885818, "learning_rate": 4.710540688148553e-06, "loss": 0.439, "step": 18620 }, { "epoch": 4.577466519228406, "grad_norm": 0.30482395228513554, "learning_rate": 4.683233205898416e-06, "loss": 0.4535, "step": 18630 }, { "epoch": 4.579923823565549, "grad_norm": 0.23629564949717738, "learning_rate": 4.65592572364828e-06, "loss": 0.4733, "step": 18640 }, { "epoch": 4.582381127902691, "grad_norm": 0.24492226343531434, "learning_rate": 4.628618241398143e-06, "loss": 0.4451, "step": 18650 }, { "epoch": 4.584838432239833, "grad_norm": 0.24977682225312148, "learning_rate": 4.601310759148006e-06, "loss": 0.4447, "step": 18660 }, { "epoch": 4.587295736576975, "grad_norm": 0.22725521638735774, "learning_rate": 4.57400327689787e-06, "loss": 0.4616, "step": 18670 }, { "epoch": 4.589753040914117, "grad_norm": 0.24426472847975217, "learning_rate": 4.5466957946477335e-06, "loss": 0.4588, "step": 18680 }, { "epoch": 4.592210345251259, "grad_norm": 0.2549154905101622, "learning_rate": 4.5193883123975975e-06, "loss": 0.4506, "step": 18690 }, { "epoch": 4.594667649588402, "grad_norm": 0.2547487465872458, "learning_rate": 4.492080830147461e-06, "loss": 0.4687, "step": 18700 }, { "epoch": 4.597124953925544, "grad_norm": 0.25551824567423326, "learning_rate": 4.464773347897324e-06, "loss": 0.4493, "step": 18710 }, { "epoch": 4.599582258262686, "grad_norm": 0.2825350569262872, "learning_rate": 4.437465865647188e-06, "loss": 0.4591, "step": 18720 }, { "epoch": 4.602039562599828, "grad_norm": 0.25978392314831417, "learning_rate": 4.410158383397051e-06, "loss": 0.4516, "step": 18730 }, { "epoch": 4.60449686693697, "grad_norm": 0.25952790860654595, "learning_rate": 4.382850901146914e-06, "loss": 0.4481, "step": 18740 }, { "epoch": 4.606954171274112, "grad_norm": 0.23705621889674747, "learning_rate": 4.355543418896778e-06, "loss": 0.4431, "step": 18750 }, { "epoch": 4.609411475611254, "grad_norm": 0.24764204507517165, "learning_rate": 4.328235936646641e-06, "loss": 0.4433, "step": 18760 }, { "epoch": 4.611868779948397, "grad_norm": 0.24734518814365983, "learning_rate": 4.300928454396505e-06, "loss": 0.4497, "step": 18770 }, { "epoch": 4.614326084285539, "grad_norm": 0.2485775562041274, "learning_rate": 4.273620972146368e-06, "loss": 0.4588, "step": 18780 }, { "epoch": 4.616783388622681, "grad_norm": 0.25469814703833293, "learning_rate": 4.246313489896232e-06, "loss": 0.4513, "step": 18790 }, { "epoch": 4.619240692959823, "grad_norm": 0.23551045285123773, "learning_rate": 4.219006007646096e-06, "loss": 0.4388, "step": 18800 }, { "epoch": 4.621697997296965, "grad_norm": 0.2560793868617513, "learning_rate": 4.191698525395959e-06, "loss": 0.4381, "step": 18810 }, { "epoch": 4.624155301634107, "grad_norm": 0.26326289589562096, "learning_rate": 4.164391043145823e-06, "loss": 0.4283, "step": 18820 }, { "epoch": 4.62661260597125, "grad_norm": 0.3287992869836343, "learning_rate": 4.137083560895685e-06, "loss": 0.4521, "step": 18830 }, { "epoch": 4.629069910308392, "grad_norm": 0.21890051339446748, "learning_rate": 4.109776078645549e-06, "loss": 0.4649, "step": 18840 }, { "epoch": 4.6315272146455335, "grad_norm": 0.23718303738031016, "learning_rate": 4.082468596395412e-06, "loss": 0.4205, "step": 18850 }, { "epoch": 4.633984518982676, "grad_norm": 0.21019495076290362, "learning_rate": 4.055161114145276e-06, "loss": 0.4537, "step": 18860 }, { "epoch": 4.636441823319818, "grad_norm": 0.24193010131187018, "learning_rate": 4.027853631895139e-06, "loss": 0.4404, "step": 18870 }, { "epoch": 4.63889912765696, "grad_norm": 0.22566538713652837, "learning_rate": 4.0005461496450025e-06, "loss": 0.4532, "step": 18880 }, { "epoch": 4.641356431994103, "grad_norm": 0.24741371173550428, "learning_rate": 3.9732386673948665e-06, "loss": 0.4454, "step": 18890 }, { "epoch": 4.643813736331245, "grad_norm": 0.23883804744468282, "learning_rate": 3.94593118514473e-06, "loss": 0.4689, "step": 18900 }, { "epoch": 4.6462710406683865, "grad_norm": 0.25707686748164893, "learning_rate": 3.918623702894593e-06, "loss": 0.4477, "step": 18910 }, { "epoch": 4.648728345005529, "grad_norm": 0.2573410143220795, "learning_rate": 3.891316220644457e-06, "loss": 0.4545, "step": 18920 }, { "epoch": 4.651185649342671, "grad_norm": 0.25230723940904065, "learning_rate": 3.86400873839432e-06, "loss": 0.4279, "step": 18930 }, { "epoch": 4.653642953679813, "grad_norm": 0.214807152898999, "learning_rate": 3.836701256144184e-06, "loss": 0.4381, "step": 18940 }, { "epoch": 4.656100258016956, "grad_norm": 0.2626711410050249, "learning_rate": 3.8093937738940476e-06, "loss": 0.4488, "step": 18950 }, { "epoch": 4.658557562354098, "grad_norm": 0.25507440487668365, "learning_rate": 3.7820862916439103e-06, "loss": 0.4816, "step": 18960 }, { "epoch": 4.661014866691239, "grad_norm": 0.26396031087427396, "learning_rate": 3.754778809393774e-06, "loss": 0.4421, "step": 18970 }, { "epoch": 4.663472171028382, "grad_norm": 0.2237719620129181, "learning_rate": 3.7274713271436375e-06, "loss": 0.464, "step": 18980 }, { "epoch": 4.665929475365524, "grad_norm": 0.257003198310816, "learning_rate": 3.700163844893501e-06, "loss": 0.4487, "step": 18990 }, { "epoch": 4.668386779702666, "grad_norm": 0.21625557249022057, "learning_rate": 3.672856362643364e-06, "loss": 0.4361, "step": 19000 }, { "epoch": 4.670844084039809, "grad_norm": 0.25208393410964197, "learning_rate": 3.6455488803932278e-06, "loss": 0.445, "step": 19010 }, { "epoch": 4.6733013883769505, "grad_norm": 0.3295551105587811, "learning_rate": 3.6182413981430914e-06, "loss": 0.4623, "step": 19020 }, { "epoch": 4.675758692714092, "grad_norm": 0.2521753066585643, "learning_rate": 3.590933915892955e-06, "loss": 0.4519, "step": 19030 }, { "epoch": 4.678215997051235, "grad_norm": 0.23834867901600432, "learning_rate": 3.5636264336428185e-06, "loss": 0.4562, "step": 19040 }, { "epoch": 4.680673301388377, "grad_norm": 0.23770244596559734, "learning_rate": 3.5363189513926817e-06, "loss": 0.4691, "step": 19050 }, { "epoch": 4.683130605725519, "grad_norm": 0.23840325000078333, "learning_rate": 3.5090114691425453e-06, "loss": 0.4414, "step": 19060 }, { "epoch": 4.685587910062662, "grad_norm": 0.21625913623536427, "learning_rate": 3.481703986892409e-06, "loss": 0.4563, "step": 19070 }, { "epoch": 4.6880452143998035, "grad_norm": 0.25777311187133933, "learning_rate": 3.4543965046422724e-06, "loss": 0.4564, "step": 19080 }, { "epoch": 4.690502518736945, "grad_norm": 0.2528779894151935, "learning_rate": 3.4270890223921356e-06, "loss": 0.4577, "step": 19090 }, { "epoch": 4.692959823074088, "grad_norm": 0.23397634132250053, "learning_rate": 3.399781540141999e-06, "loss": 0.4384, "step": 19100 }, { "epoch": 4.69541712741123, "grad_norm": 0.21562924346304171, "learning_rate": 3.3724740578918627e-06, "loss": 0.4558, "step": 19110 }, { "epoch": 4.697874431748372, "grad_norm": 0.28060538412087876, "learning_rate": 3.3451665756417263e-06, "loss": 0.4511, "step": 19120 }, { "epoch": 4.700331736085515, "grad_norm": 0.22066163383921125, "learning_rate": 3.317859093391589e-06, "loss": 0.4572, "step": 19130 }, { "epoch": 4.702789040422656, "grad_norm": 0.23371619297159302, "learning_rate": 3.2905516111414526e-06, "loss": 0.4582, "step": 19140 }, { "epoch": 4.705246344759798, "grad_norm": 0.2720830389376513, "learning_rate": 3.263244128891316e-06, "loss": 0.4496, "step": 19150 }, { "epoch": 4.70770364909694, "grad_norm": 0.23428662530368213, "learning_rate": 3.23593664664118e-06, "loss": 0.453, "step": 19160 }, { "epoch": 4.710160953434083, "grad_norm": 0.2190959820637881, "learning_rate": 3.2086291643910438e-06, "loss": 0.4613, "step": 19170 }, { "epoch": 4.712618257771225, "grad_norm": 0.2580419243728512, "learning_rate": 3.1813216821409065e-06, "loss": 0.4368, "step": 19180 }, { "epoch": 4.7150755621083675, "grad_norm": 0.23860541769635593, "learning_rate": 3.15401419989077e-06, "loss": 0.444, "step": 19190 }, { "epoch": 4.717532866445509, "grad_norm": 0.24743766200744194, "learning_rate": 3.1267067176406336e-06, "loss": 0.4574, "step": 19200 }, { "epoch": 4.719990170782651, "grad_norm": 0.26446246195784223, "learning_rate": 3.0993992353904972e-06, "loss": 0.4623, "step": 19210 }, { "epoch": 4.722447475119793, "grad_norm": 0.21706566424933532, "learning_rate": 3.072091753140361e-06, "loss": 0.4615, "step": 19220 }, { "epoch": 4.724904779456936, "grad_norm": 0.2482377008527956, "learning_rate": 3.044784270890224e-06, "loss": 0.4505, "step": 19230 }, { "epoch": 4.727362083794078, "grad_norm": 0.25580526586792773, "learning_rate": 3.0174767886400875e-06, "loss": 0.4661, "step": 19240 }, { "epoch": 4.7298193881312205, "grad_norm": 0.2663397842653643, "learning_rate": 2.9901693063899507e-06, "loss": 0.4543, "step": 19250 }, { "epoch": 4.732276692468362, "grad_norm": 0.24279793269627262, "learning_rate": 2.9628618241398147e-06, "loss": 0.4463, "step": 19260 }, { "epoch": 4.734733996805504, "grad_norm": 0.23600518288103742, "learning_rate": 2.935554341889678e-06, "loss": 0.4507, "step": 19270 }, { "epoch": 4.737191301142646, "grad_norm": 0.23682585632148617, "learning_rate": 2.9082468596395414e-06, "loss": 0.4513, "step": 19280 }, { "epoch": 4.739648605479789, "grad_norm": 0.22772883950306444, "learning_rate": 2.880939377389405e-06, "loss": 0.4324, "step": 19290 }, { "epoch": 4.742105909816931, "grad_norm": 0.24017600138449516, "learning_rate": 2.853631895139268e-06, "loss": 0.4527, "step": 19300 }, { "epoch": 4.7445632141540734, "grad_norm": 0.27040078635166886, "learning_rate": 2.8263244128891317e-06, "loss": 0.4367, "step": 19310 }, { "epoch": 4.747020518491215, "grad_norm": 0.2208315919737886, "learning_rate": 2.7990169306389953e-06, "loss": 0.4618, "step": 19320 }, { "epoch": 4.749477822828357, "grad_norm": 0.2723456066618346, "learning_rate": 2.771709448388859e-06, "loss": 0.4431, "step": 19330 }, { "epoch": 4.751935127165499, "grad_norm": 0.25905510222354133, "learning_rate": 2.744401966138722e-06, "loss": 0.461, "step": 19340 }, { "epoch": 4.754392431502642, "grad_norm": 0.2839697612946893, "learning_rate": 2.7170944838885856e-06, "loss": 0.46, "step": 19350 }, { "epoch": 4.756849735839784, "grad_norm": 0.22924869880381113, "learning_rate": 2.6897870016384488e-06, "loss": 0.4502, "step": 19360 }, { "epoch": 4.759307040176926, "grad_norm": 0.21554898058126848, "learning_rate": 2.6624795193883128e-06, "loss": 0.4427, "step": 19370 }, { "epoch": 4.761764344514068, "grad_norm": 0.29341845604213257, "learning_rate": 2.635172037138176e-06, "loss": 0.4793, "step": 19380 }, { "epoch": 4.76422164885121, "grad_norm": 0.26660510102699775, "learning_rate": 2.6078645548880395e-06, "loss": 0.4426, "step": 19390 }, { "epoch": 4.766678953188352, "grad_norm": 0.3362339922209631, "learning_rate": 2.5805570726379027e-06, "loss": 0.4661, "step": 19400 }, { "epoch": 4.769136257525495, "grad_norm": 0.20050772752000878, "learning_rate": 2.5532495903877662e-06, "loss": 0.4475, "step": 19410 }, { "epoch": 4.771593561862637, "grad_norm": 0.23511326473245636, "learning_rate": 2.52594210813763e-06, "loss": 0.4357, "step": 19420 }, { "epoch": 4.7740508661997785, "grad_norm": 0.24176207962025129, "learning_rate": 2.4986346258874934e-06, "loss": 0.4574, "step": 19430 }, { "epoch": 4.776508170536921, "grad_norm": 0.2631261196937751, "learning_rate": 2.471327143637357e-06, "loss": 0.4618, "step": 19440 }, { "epoch": 4.778965474874063, "grad_norm": 0.26620464990862047, "learning_rate": 2.44401966138722e-06, "loss": 0.4888, "step": 19450 }, { "epoch": 4.781422779211205, "grad_norm": 0.24210352293726203, "learning_rate": 2.4167121791370837e-06, "loss": 0.439, "step": 19460 }, { "epoch": 4.783880083548348, "grad_norm": 0.2497314559930393, "learning_rate": 2.389404696886947e-06, "loss": 0.4316, "step": 19470 }, { "epoch": 4.78633738788549, "grad_norm": 0.2431689802581428, "learning_rate": 2.362097214636811e-06, "loss": 0.4437, "step": 19480 }, { "epoch": 4.788794692222631, "grad_norm": 0.23952550469724238, "learning_rate": 2.334789732386674e-06, "loss": 0.4671, "step": 19490 }, { "epoch": 4.791251996559774, "grad_norm": 0.22001198240950562, "learning_rate": 2.3074822501365376e-06, "loss": 0.4541, "step": 19500 }, { "epoch": 4.793709300896916, "grad_norm": 0.23030117904252037, "learning_rate": 2.2801747678864008e-06, "loss": 0.4495, "step": 19510 }, { "epoch": 4.796166605234058, "grad_norm": 0.24252518137235146, "learning_rate": 2.2528672856362643e-06, "loss": 0.4723, "step": 19520 }, { "epoch": 4.798623909571201, "grad_norm": 0.2664411375770211, "learning_rate": 2.225559803386128e-06, "loss": 0.4442, "step": 19530 }, { "epoch": 4.8010812139083425, "grad_norm": 0.26021950891711904, "learning_rate": 2.1982523211359915e-06, "loss": 0.4402, "step": 19540 }, { "epoch": 4.803538518245484, "grad_norm": 0.20128172167710995, "learning_rate": 2.170944838885855e-06, "loss": 0.4413, "step": 19550 }, { "epoch": 4.805995822582627, "grad_norm": 0.24762898929629448, "learning_rate": 2.1436373566357182e-06, "loss": 0.4252, "step": 19560 }, { "epoch": 4.808453126919769, "grad_norm": 0.2454806293505124, "learning_rate": 2.116329874385582e-06, "loss": 0.4676, "step": 19570 }, { "epoch": 4.810910431256911, "grad_norm": 0.2748119951604152, "learning_rate": 2.089022392135445e-06, "loss": 0.4481, "step": 19580 }, { "epoch": 4.813367735594054, "grad_norm": 0.2389454281794093, "learning_rate": 2.061714909885309e-06, "loss": 0.4441, "step": 19590 }, { "epoch": 4.8158250399311955, "grad_norm": 0.2781066559991768, "learning_rate": 2.034407427635172e-06, "loss": 0.4738, "step": 19600 }, { "epoch": 4.818282344268337, "grad_norm": 0.2639516193077918, "learning_rate": 2.0070999453850357e-06, "loss": 0.4573, "step": 19610 }, { "epoch": 4.82073964860548, "grad_norm": 0.2239383808717699, "learning_rate": 1.979792463134899e-06, "loss": 0.4555, "step": 19620 }, { "epoch": 4.823196952942622, "grad_norm": 0.28240386875231166, "learning_rate": 1.9524849808847624e-06, "loss": 0.4447, "step": 19630 }, { "epoch": 4.825654257279764, "grad_norm": 0.22516422994153712, "learning_rate": 1.925177498634626e-06, "loss": 0.4527, "step": 19640 }, { "epoch": 4.828111561616907, "grad_norm": 0.25710441458493816, "learning_rate": 1.8978700163844894e-06, "loss": 0.4618, "step": 19650 }, { "epoch": 4.830568865954048, "grad_norm": 0.2388670963553709, "learning_rate": 1.8705625341343532e-06, "loss": 0.451, "step": 19660 }, { "epoch": 4.83302617029119, "grad_norm": 0.2509645058107287, "learning_rate": 1.8432550518842163e-06, "loss": 0.4605, "step": 19670 }, { "epoch": 4.835483474628333, "grad_norm": 0.30630918883188085, "learning_rate": 1.8159475696340799e-06, "loss": 0.4705, "step": 19680 }, { "epoch": 4.837940778965475, "grad_norm": 0.23557403153997705, "learning_rate": 1.7886400873839433e-06, "loss": 0.4453, "step": 19690 }, { "epoch": 4.840398083302617, "grad_norm": 0.23187546438904136, "learning_rate": 1.7613326051338068e-06, "loss": 0.4324, "step": 19700 }, { "epoch": 4.8428553876397595, "grad_norm": 0.24376498896584986, "learning_rate": 1.7340251228836702e-06, "loss": 0.4549, "step": 19710 }, { "epoch": 4.845312691976901, "grad_norm": 0.24329853565054785, "learning_rate": 1.7067176406335338e-06, "loss": 0.4465, "step": 19720 }, { "epoch": 4.847769996314043, "grad_norm": 0.237466378319194, "learning_rate": 1.679410158383397e-06, "loss": 0.4372, "step": 19730 }, { "epoch": 4.850227300651186, "grad_norm": 0.20797236361823504, "learning_rate": 1.6521026761332607e-06, "loss": 0.4687, "step": 19740 }, { "epoch": 4.852684604988328, "grad_norm": 0.25369712398855704, "learning_rate": 1.6247951938831239e-06, "loss": 0.4602, "step": 19750 }, { "epoch": 4.85514190932547, "grad_norm": 0.2189652342799039, "learning_rate": 1.5974877116329875e-06, "loss": 0.455, "step": 19760 }, { "epoch": 4.8575992136626125, "grad_norm": 0.23499945843873862, "learning_rate": 1.5701802293828508e-06, "loss": 0.4436, "step": 19770 }, { "epoch": 4.860056517999754, "grad_norm": 0.230843934764629, "learning_rate": 1.5428727471327144e-06, "loss": 0.4495, "step": 19780 }, { "epoch": 4.862513822336896, "grad_norm": 0.23520547711077686, "learning_rate": 1.515565264882578e-06, "loss": 0.4676, "step": 19790 }, { "epoch": 4.864971126674039, "grad_norm": 0.2576976421165044, "learning_rate": 1.4882577826324413e-06, "loss": 0.4545, "step": 19800 }, { "epoch": 4.867428431011181, "grad_norm": 0.26502325305518526, "learning_rate": 1.4609503003823047e-06, "loss": 0.4576, "step": 19810 }, { "epoch": 4.869885735348323, "grad_norm": 0.2391068683748481, "learning_rate": 1.4336428181321683e-06, "loss": 0.4699, "step": 19820 }, { "epoch": 4.872343039685465, "grad_norm": 0.22952283690879635, "learning_rate": 1.4063353358820317e-06, "loss": 0.4598, "step": 19830 }, { "epoch": 4.874800344022607, "grad_norm": 0.23170675909424945, "learning_rate": 1.379027853631895e-06, "loss": 0.4386, "step": 19840 }, { "epoch": 4.877257648359749, "grad_norm": 0.24141420900925256, "learning_rate": 1.3517203713817588e-06, "loss": 0.4618, "step": 19850 }, { "epoch": 4.879714952696892, "grad_norm": 0.2292287000532329, "learning_rate": 1.3244128891316222e-06, "loss": 0.4552, "step": 19860 }, { "epoch": 4.882172257034034, "grad_norm": 0.22417971785026963, "learning_rate": 1.2971054068814855e-06, "loss": 0.4426, "step": 19870 }, { "epoch": 4.884629561371176, "grad_norm": 0.2315556127578329, "learning_rate": 1.2697979246313491e-06, "loss": 0.4539, "step": 19880 }, { "epoch": 4.8870868657083175, "grad_norm": 0.24315591166383424, "learning_rate": 1.2424904423812125e-06, "loss": 0.448, "step": 19890 }, { "epoch": 4.88954417004546, "grad_norm": 0.29565482524333386, "learning_rate": 1.215182960131076e-06, "loss": 0.4333, "step": 19900 }, { "epoch": 4.892001474382602, "grad_norm": 0.2448819472468166, "learning_rate": 1.1878754778809394e-06, "loss": 0.4594, "step": 19910 }, { "epoch": 4.894458778719745, "grad_norm": 0.21165814912657804, "learning_rate": 1.1605679956308028e-06, "loss": 0.4523, "step": 19920 }, { "epoch": 4.896916083056887, "grad_norm": 0.23086312058867559, "learning_rate": 1.1332605133806664e-06, "loss": 0.4393, "step": 19930 }, { "epoch": 4.899373387394029, "grad_norm": 0.22826693179087457, "learning_rate": 1.1059530311305297e-06, "loss": 0.4465, "step": 19940 }, { "epoch": 4.9018306917311705, "grad_norm": 0.23763241948417752, "learning_rate": 1.0786455488803933e-06, "loss": 0.4446, "step": 19950 }, { "epoch": 4.904287996068313, "grad_norm": 0.24793932891792275, "learning_rate": 1.0513380666302567e-06, "loss": 0.4526, "step": 19960 }, { "epoch": 4.906745300405455, "grad_norm": 0.2534708983886002, "learning_rate": 1.02403058438012e-06, "loss": 0.4667, "step": 19970 }, { "epoch": 4.909202604742598, "grad_norm": 0.25624949942684416, "learning_rate": 9.967231021299836e-07, "loss": 0.4383, "step": 19980 }, { "epoch": 4.91165990907974, "grad_norm": 0.2209230528014843, "learning_rate": 9.694156198798472e-07, "loss": 0.4473, "step": 19990 }, { "epoch": 4.914117213416882, "grad_norm": 0.2817363021676811, "learning_rate": 9.421081376297107e-07, "loss": 0.4735, "step": 20000 }, { "epoch": 4.916574517754023, "grad_norm": 0.2337259199966452, "learning_rate": 9.14800655379574e-07, "loss": 0.4676, "step": 20010 }, { "epoch": 4.919031822091166, "grad_norm": 0.2918341976858164, "learning_rate": 8.874931731294375e-07, "loss": 0.4285, "step": 20020 }, { "epoch": 4.921489126428308, "grad_norm": 0.2433694980530171, "learning_rate": 8.60185690879301e-07, "loss": 0.4584, "step": 20030 }, { "epoch": 4.923946430765451, "grad_norm": 0.22406522358602268, "learning_rate": 8.328782086291645e-07, "loss": 0.4315, "step": 20040 }, { "epoch": 4.926403735102593, "grad_norm": 0.21350375881544495, "learning_rate": 8.055707263790278e-07, "loss": 0.4675, "step": 20050 }, { "epoch": 4.9288610394397345, "grad_norm": 0.26048047575805233, "learning_rate": 7.782632441288913e-07, "loss": 0.4696, "step": 20060 }, { "epoch": 4.931318343776876, "grad_norm": 0.259836271188662, "learning_rate": 7.509557618787549e-07, "loss": 0.4623, "step": 20070 }, { "epoch": 4.933775648114019, "grad_norm": 0.22444584059648676, "learning_rate": 7.236482796286183e-07, "loss": 0.4487, "step": 20080 }, { "epoch": 4.936232952451161, "grad_norm": 0.24526397070426145, "learning_rate": 6.963407973784817e-07, "loss": 0.4734, "step": 20090 }, { "epoch": 4.938690256788303, "grad_norm": 0.2584642741771824, "learning_rate": 6.690333151283452e-07, "loss": 0.4766, "step": 20100 }, { "epoch": 4.941147561125446, "grad_norm": 0.22864914216726084, "learning_rate": 6.417258328782087e-07, "loss": 0.4451, "step": 20110 }, { "epoch": 4.9436048654625875, "grad_norm": 0.26111885429569776, "learning_rate": 6.144183506280721e-07, "loss": 0.4713, "step": 20120 }, { "epoch": 4.946062169799729, "grad_norm": 0.2214573981988787, "learning_rate": 5.871108683779356e-07, "loss": 0.4595, "step": 20130 }, { "epoch": 4.948519474136872, "grad_norm": 0.24365562708196356, "learning_rate": 5.598033861277991e-07, "loss": 0.4397, "step": 20140 }, { "epoch": 4.950976778474014, "grad_norm": 0.27538909297525915, "learning_rate": 5.324959038776626e-07, "loss": 0.4584, "step": 20150 }, { "epoch": 4.953434082811156, "grad_norm": 0.2618430456358992, "learning_rate": 5.051884216275259e-07, "loss": 0.4472, "step": 20160 }, { "epoch": 4.955891387148299, "grad_norm": 0.23866840201203718, "learning_rate": 4.778809393773894e-07, "loss": 0.4427, "step": 20170 }, { "epoch": 4.95834869148544, "grad_norm": 0.27678997174863, "learning_rate": 4.5057345712725287e-07, "loss": 0.4481, "step": 20180 }, { "epoch": 4.960805995822582, "grad_norm": 0.25857290826979473, "learning_rate": 4.232659748771164e-07, "loss": 0.429, "step": 20190 }, { "epoch": 4.963263300159725, "grad_norm": 0.2598506940048192, "learning_rate": 3.9595849262697986e-07, "loss": 0.46, "step": 20200 }, { "epoch": 4.965720604496867, "grad_norm": 0.293556453627303, "learning_rate": 3.686510103768433e-07, "loss": 0.477, "step": 20210 }, { "epoch": 4.968177908834009, "grad_norm": 0.22884392625074826, "learning_rate": 3.4134352812670676e-07, "loss": 0.4467, "step": 20220 }, { "epoch": 4.9706352131711515, "grad_norm": 0.2941145180779771, "learning_rate": 3.140360458765702e-07, "loss": 0.4522, "step": 20230 }, { "epoch": 4.973092517508293, "grad_norm": 0.22399298687605282, "learning_rate": 2.8672856362643365e-07, "loss": 0.4504, "step": 20240 }, { "epoch": 4.975549821845435, "grad_norm": 0.30829502928101216, "learning_rate": 2.594210813762971e-07, "loss": 0.461, "step": 20250 }, { "epoch": 4.978007126182578, "grad_norm": 0.24819019016938745, "learning_rate": 2.3211359912616057e-07, "loss": 0.4615, "step": 20260 }, { "epoch": 4.98046443051972, "grad_norm": 0.25048482720259696, "learning_rate": 2.0480611687602406e-07, "loss": 0.4356, "step": 20270 }, { "epoch": 4.982921734856862, "grad_norm": 0.22814297171959336, "learning_rate": 1.774986346258875e-07, "loss": 0.4659, "step": 20280 }, { "epoch": 4.9853790391940045, "grad_norm": 0.2192838417652881, "learning_rate": 1.5019115237575096e-07, "loss": 0.4396, "step": 20290 }, { "epoch": 4.987836343531146, "grad_norm": 0.2400601090501823, "learning_rate": 1.2288367012561443e-07, "loss": 0.4574, "step": 20300 }, { "epoch": 4.990293647868288, "grad_norm": 0.2309851282481528, "learning_rate": 9.557618787547789e-08, "loss": 0.4645, "step": 20310 }, { "epoch": 4.992750952205431, "grad_norm": 0.22705545564337354, "learning_rate": 6.826870562534135e-08, "loss": 0.4444, "step": 20320 }, { "epoch": 4.995208256542573, "grad_norm": 0.23999288726808476, "learning_rate": 4.0961223375204805e-08, "loss": 0.4569, "step": 20330 }, { "epoch": 4.997665560879715, "grad_norm": 0.23936303974316056, "learning_rate": 1.3653741125068269e-08, "loss": 0.4597, "step": 20340 }, { "epoch": 4.9988942130482865, "step": 20345, "total_flos": 6.700673317784781e+16, "train_loss": 0.4786904600803903, "train_runtime": 98131.1995, "train_samples_per_second": 3.317, "train_steps_per_second": 0.207 } ], "logging_steps": 10, "max_steps": 20345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.700673317784781e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }