{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.85846194433452, "eval_steps": 2000, "global_step": 26000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005276348766653475, "grad_norm": 10.63061237335205, "learning_rate": 1.5826958586125035e-09, "loss": 3.118, "step": 2 }, { "epoch": 0.001055269753330695, "grad_norm": 3.89923095703125, "learning_rate": 4.7480875758375096e-09, "loss": 3.1226, "step": 4 }, { "epoch": 0.0015829046299960427, "grad_norm": 9.74691104888916, "learning_rate": 7.913479293062517e-09, "loss": 3.1203, "step": 6 }, { "epoch": 0.00211053950666139, "grad_norm": 4.172689914703369, "learning_rate": 1.1078871010287523e-08, "loss": 3.1213, "step": 8 }, { "epoch": 0.002638174383326738, "grad_norm": 4.440515518188477, "learning_rate": 1.4244262727512531e-08, "loss": 3.119, "step": 10 }, { "epoch": 0.0031658092599920855, "grad_norm": 3.8980653285980225, "learning_rate": 1.5826958586125035e-08, "loss": 3.1182, "step": 12 }, { "epoch": 0.003693444136657433, "grad_norm": 4.052310466766357, "learning_rate": 1.8992350303350038e-08, "loss": 3.121, "step": 14 }, { "epoch": 0.00422107901332278, "grad_norm": NaN, "learning_rate": 2.2157742020575045e-08, "loss": 3.117, "step": 16 }, { "epoch": 0.004748713889988128, "grad_norm": 3.6980514526367188, "learning_rate": 2.374043787918755e-08, "loss": 3.1205, "step": 18 }, { "epoch": 0.005276348766653476, "grad_norm": Infinity, "learning_rate": 2.6905829596412556e-08, "loss": 3.117, "step": 20 }, { "epoch": 0.005803983643318823, "grad_norm": 5.677456855773926, "learning_rate": 2.6905829596412556e-08, "loss": 3.1152, "step": 22 }, { "epoch": 0.006331618519984171, "grad_norm": 3.7962942123413086, "learning_rate": 3.0071221313637566e-08, "loss": 3.12, "step": 24 }, { "epoch": 0.006859253396649519, "grad_norm": 3.6216907501220703, "learning_rate": 3.323661303086257e-08, "loss": 3.1243, "step": 26 }, { "epoch": 0.007386888273314866, "grad_norm": 3.6370582580566406, "learning_rate": 3.640200474808758e-08, "loss": 3.1201, "step": 28 }, { "epoch": 0.007914523149980214, "grad_norm": 3.5170176029205322, "learning_rate": 3.9567396465312587e-08, "loss": 3.1209, "step": 30 }, { "epoch": 0.00844215802664556, "grad_norm": 3.674708127975464, "learning_rate": 4.2732788182537593e-08, "loss": 3.1224, "step": 32 }, { "epoch": 0.00896979290331091, "grad_norm": 8.07484245300293, "learning_rate": 4.58981798997626e-08, "loss": 3.1173, "step": 34 }, { "epoch": 0.009497427779976256, "grad_norm": 4.291584491729736, "learning_rate": 4.906357161698761e-08, "loss": 3.1188, "step": 36 }, { "epoch": 0.010025062656641603, "grad_norm": 13.340597152709961, "learning_rate": 5.222896333421261e-08, "loss": 3.1187, "step": 38 }, { "epoch": 0.010552697533306952, "grad_norm": 3.7641255855560303, "learning_rate": 5.539435505143762e-08, "loss": 3.118, "step": 40 }, { "epoch": 0.0110803324099723, "grad_norm": 3.779597759246826, "learning_rate": 5.855974676866262e-08, "loss": 3.1182, "step": 42 }, { "epoch": 0.011607967286637646, "grad_norm": 3.892887592315674, "learning_rate": 6.172513848588763e-08, "loss": 3.1222, "step": 44 }, { "epoch": 0.012135602163302995, "grad_norm": 7.81919002532959, "learning_rate": 6.489053020311264e-08, "loss": 3.1111, "step": 46 }, { "epoch": 0.012663237039968342, "grad_norm": 5.6544694900512695, "learning_rate": 6.805592192033764e-08, "loss": 3.1193, "step": 48 }, { "epoch": 0.013190871916633689, "grad_norm": 3.258709192276001, "learning_rate": 7.122131363756265e-08, "loss": 3.1227, "step": 50 }, { "epoch": 0.013718506793299038, "grad_norm": 10.753890991210938, "learning_rate": 7.438670535478766e-08, "loss": 3.1159, "step": 52 }, { "epoch": 0.014246141669964385, "grad_norm": 4.299476623535156, "learning_rate": 7.755209707201266e-08, "loss": 3.1139, "step": 54 }, { "epoch": 0.014773776546629732, "grad_norm": 4.407174110412598, "learning_rate": 8.071748878923767e-08, "loss": 3.1124, "step": 56 }, { "epoch": 0.01530141142329508, "grad_norm": 3.7634332180023193, "learning_rate": 8.388288050646268e-08, "loss": 3.1223, "step": 58 }, { "epoch": 0.015829046299960427, "grad_norm": 6.243371963500977, "learning_rate": 8.704827222368768e-08, "loss": 3.1215, "step": 60 }, { "epoch": 0.016356681176625774, "grad_norm": 3.9180145263671875, "learning_rate": 9.021366394091269e-08, "loss": 3.1161, "step": 62 }, { "epoch": 0.01688431605329112, "grad_norm": 3.5213489532470703, "learning_rate": 9.33790556581377e-08, "loss": 3.1097, "step": 64 }, { "epoch": 0.01741195092995647, "grad_norm": 22.96717643737793, "learning_rate": 9.654444737536269e-08, "loss": 3.1129, "step": 66 }, { "epoch": 0.01793958580662182, "grad_norm": 4.3595662117004395, "learning_rate": 9.970983909258771e-08, "loss": 3.1077, "step": 68 }, { "epoch": 0.018467220683287166, "grad_norm": 9.737805366516113, "learning_rate": 1.0287523080981272e-07, "loss": 3.1178, "step": 70 }, { "epoch": 0.018994855559952513, "grad_norm": 3.9947943687438965, "learning_rate": 1.0604062252703773e-07, "loss": 3.1142, "step": 72 }, { "epoch": 0.01952249043661786, "grad_norm": 3.415964365005493, "learning_rate": 1.0920601424426272e-07, "loss": 3.1119, "step": 74 }, { "epoch": 0.020050125313283207, "grad_norm": 4.052531719207764, "learning_rate": 1.1237140596148774e-07, "loss": 3.1163, "step": 76 }, { "epoch": 0.020577760189948554, "grad_norm": 4.042687892913818, "learning_rate": 1.1553679767871275e-07, "loss": 3.1079, "step": 78 }, { "epoch": 0.021105395066613904, "grad_norm": 6.892518997192383, "learning_rate": 1.1870218939593775e-07, "loss": 3.1099, "step": 80 }, { "epoch": 0.02163302994327925, "grad_norm": 3.957357168197632, "learning_rate": 1.2186758111316276e-07, "loss": 3.1064, "step": 82 }, { "epoch": 0.0221606648199446, "grad_norm": 3.5434350967407227, "learning_rate": 1.2503297283038775e-07, "loss": 3.1107, "step": 84 }, { "epoch": 0.022688299696609945, "grad_norm": 4.312195301055908, "learning_rate": 1.2819836454761277e-07, "loss": 3.1086, "step": 86 }, { "epoch": 0.023215934573275292, "grad_norm": 3.6304945945739746, "learning_rate": 1.313637562648378e-07, "loss": 3.1056, "step": 88 }, { "epoch": 0.02374356944994064, "grad_norm": 3.6523940563201904, "learning_rate": 1.345291479820628e-07, "loss": 3.1179, "step": 90 }, { "epoch": 0.02427120432660599, "grad_norm": 14.188759803771973, "learning_rate": 1.3769453969928778e-07, "loss": 3.1091, "step": 92 }, { "epoch": 0.024798839203271337, "grad_norm": 32.31694030761719, "learning_rate": 1.408599314165128e-07, "loss": 3.1013, "step": 94 }, { "epoch": 0.025326474079936684, "grad_norm": 3.7072343826293945, "learning_rate": 1.4402532313373782e-07, "loss": 3.1039, "step": 96 }, { "epoch": 0.02585410895660203, "grad_norm": 4.091297626495361, "learning_rate": 1.4719071485096282e-07, "loss": 3.1038, "step": 98 }, { "epoch": 0.026381743833267378, "grad_norm": 4.3570051193237305, "learning_rate": 1.503561065681878e-07, "loss": 3.1083, "step": 100 }, { "epoch": 0.026909378709932725, "grad_norm": 4.217991828918457, "learning_rate": 1.5352149828541283e-07, "loss": 3.1028, "step": 102 }, { "epoch": 0.027437013586598075, "grad_norm": 3.8245296478271484, "learning_rate": 1.5668689000263785e-07, "loss": 3.1048, "step": 104 }, { "epoch": 0.027964648463263422, "grad_norm": 3.1063156127929688, "learning_rate": 1.5985228171986284e-07, "loss": 3.1119, "step": 106 }, { "epoch": 0.02849228333992877, "grad_norm": 4.0869903564453125, "learning_rate": 1.6301767343708784e-07, "loss": 3.0913, "step": 108 }, { "epoch": 0.029019918216594116, "grad_norm": 8.032771110534668, "learning_rate": 1.6618306515431286e-07, "loss": 3.1044, "step": 110 }, { "epoch": 0.029547553093259463, "grad_norm": 3.3503494262695312, "learning_rate": 1.6934845687153785e-07, "loss": 3.1079, "step": 112 }, { "epoch": 0.03007518796992481, "grad_norm": 3.620342254638672, "learning_rate": 1.7251384858876287e-07, "loss": 3.098, "step": 114 }, { "epoch": 0.03060282284659016, "grad_norm": 4.638995170593262, "learning_rate": 1.7567924030598786e-07, "loss": 3.0957, "step": 116 }, { "epoch": 0.031130457723255508, "grad_norm": 3.3510072231292725, "learning_rate": 1.7884463202321288e-07, "loss": 3.1065, "step": 118 }, { "epoch": 0.031658092599920855, "grad_norm": 3.750783920288086, "learning_rate": 1.8201002374043788e-07, "loss": 3.0952, "step": 120 }, { "epoch": 0.032185727476586205, "grad_norm": 5.77292013168335, "learning_rate": 1.851754154576629e-07, "loss": 3.092, "step": 122 }, { "epoch": 0.03271336235325155, "grad_norm": 3.825963020324707, "learning_rate": 1.883408071748879e-07, "loss": 3.0943, "step": 124 }, { "epoch": 0.0332409972299169, "grad_norm": NaN, "learning_rate": 1.9150619889211289e-07, "loss": 3.0935, "step": 126 }, { "epoch": 0.03376863210658224, "grad_norm": 6.503266334533691, "learning_rate": 1.9308889475072538e-07, "loss": 3.1072, "step": 128 }, { "epoch": 0.03429626698324759, "grad_norm": 4.057147026062012, "learning_rate": 1.9625428646795043e-07, "loss": 3.0988, "step": 130 }, { "epoch": 0.03482390185991294, "grad_norm": 4.507822036743164, "learning_rate": 1.9941967818517542e-07, "loss": 3.0922, "step": 132 }, { "epoch": 0.03535153673657829, "grad_norm": 5.098320484161377, "learning_rate": 2.0258506990240044e-07, "loss": 3.0924, "step": 134 }, { "epoch": 0.03587917161324364, "grad_norm": 3.768684148788452, "learning_rate": 2.0575046161962544e-07, "loss": 3.0929, "step": 136 }, { "epoch": 0.03640680648990898, "grad_norm": 53.15306854248047, "learning_rate": 2.0891585333685043e-07, "loss": 3.0901, "step": 138 }, { "epoch": 0.03693444136657433, "grad_norm": 3.136974573135376, "learning_rate": 2.1208124505407545e-07, "loss": 3.0957, "step": 140 }, { "epoch": 0.037462076243239675, "grad_norm": 3.7080907821655273, "learning_rate": 2.1524663677130044e-07, "loss": 3.0879, "step": 142 }, { "epoch": 0.037989711119905026, "grad_norm": 3.4428350925445557, "learning_rate": 2.1841202848852544e-07, "loss": 3.0871, "step": 144 }, { "epoch": 0.038517345996570376, "grad_norm": 17.945940017700195, "learning_rate": 2.2157742020575048e-07, "loss": 3.0981, "step": 146 }, { "epoch": 0.03904498087323572, "grad_norm": 3.4155871868133545, "learning_rate": 2.2474281192297548e-07, "loss": 3.0819, "step": 148 }, { "epoch": 0.03957261574990107, "grad_norm": 6.937693119049072, "learning_rate": 2.279082036402005e-07, "loss": 3.0815, "step": 150 }, { "epoch": 0.040100250626566414, "grad_norm": 3.309572696685791, "learning_rate": 2.29490899498813e-07, "loss": 3.0961, "step": 152 }, { "epoch": 0.040627885503231764, "grad_norm": 6.810710906982422, "learning_rate": 2.32656291216038e-07, "loss": 3.073, "step": 154 }, { "epoch": 0.04115552037989711, "grad_norm": 4.969203472137451, "learning_rate": 2.3582168293326298e-07, "loss": 3.0787, "step": 156 }, { "epoch": 0.04168315525656246, "grad_norm": 21.858579635620117, "learning_rate": 2.3898707465048803e-07, "loss": 3.0913, "step": 158 }, { "epoch": 0.04221079013322781, "grad_norm": 13.633378028869629, "learning_rate": 2.42152466367713e-07, "loss": 3.0828, "step": 160 }, { "epoch": 0.04273842500989315, "grad_norm": 4.032525062561035, "learning_rate": 2.45317858084938e-07, "loss": 3.0763, "step": 162 }, { "epoch": 0.0432660598865585, "grad_norm": 3.84182071685791, "learning_rate": 2.48483249802163e-07, "loss": 3.0743, "step": 164 }, { "epoch": 0.043793694763223846, "grad_norm": 8.648025512695312, "learning_rate": 2.51648641519388e-07, "loss": 3.0816, "step": 166 }, { "epoch": 0.0443213296398892, "grad_norm": 26.10881233215332, "learning_rate": 2.5481403323661305e-07, "loss": 3.0842, "step": 168 }, { "epoch": 0.04484896451655455, "grad_norm": 3.917948007583618, "learning_rate": 2.5797942495383804e-07, "loss": 3.0782, "step": 170 }, { "epoch": 0.04537659939321989, "grad_norm": 3.3842382431030273, "learning_rate": 2.6114481667106304e-07, "loss": 3.0747, "step": 172 }, { "epoch": 0.04590423426988524, "grad_norm": 4.040017604827881, "learning_rate": 2.643102083882881e-07, "loss": 3.075, "step": 174 }, { "epoch": 0.046431869146550585, "grad_norm": 5.069931507110596, "learning_rate": 2.674756001055131e-07, "loss": 3.079, "step": 176 }, { "epoch": 0.046959504023215935, "grad_norm": 3.5548975467681885, "learning_rate": 2.7064099182273807e-07, "loss": 3.0688, "step": 178 }, { "epoch": 0.04748713889988128, "grad_norm": 3.510364055633545, "learning_rate": 2.7380638353996307e-07, "loss": 3.0733, "step": 180 }, { "epoch": 0.04801477377654663, "grad_norm": 3.9880564212799072, "learning_rate": 2.7697177525718806e-07, "loss": 3.0626, "step": 182 }, { "epoch": 0.04854240865321198, "grad_norm": 21.581445693969727, "learning_rate": 2.801371669744131e-07, "loss": 3.0767, "step": 184 }, { "epoch": 0.04907004352987732, "grad_norm": 3.4098148345947266, "learning_rate": 2.833025586916381e-07, "loss": 3.0726, "step": 186 }, { "epoch": 0.049597678406542674, "grad_norm": 3.431478977203369, "learning_rate": 2.864679504088631e-07, "loss": 3.0654, "step": 188 }, { "epoch": 0.05012531328320802, "grad_norm": 3.6033594608306885, "learning_rate": 2.8963334212608814e-07, "loss": 3.0694, "step": 190 }, { "epoch": 0.05065294815987337, "grad_norm": 3.661468505859375, "learning_rate": 2.9279873384331313e-07, "loss": 3.0541, "step": 192 }, { "epoch": 0.05118058303653872, "grad_norm": 3.6140053272247314, "learning_rate": 2.9596412556053813e-07, "loss": 3.0556, "step": 194 }, { "epoch": 0.05170821791320406, "grad_norm": 3.656641721725464, "learning_rate": 2.991295172777631e-07, "loss": 3.0651, "step": 196 }, { "epoch": 0.05223585278986941, "grad_norm": 3.7295479774475098, "learning_rate": 3.022949089949881e-07, "loss": 3.0556, "step": 198 }, { "epoch": 0.052763487666534756, "grad_norm": 3.60648250579834, "learning_rate": 3.0546030071221316e-07, "loss": 3.065, "step": 200 }, { "epoch": 0.053291122543200106, "grad_norm": 4.598979473114014, "learning_rate": 3.0862569242943816e-07, "loss": 3.0634, "step": 202 }, { "epoch": 0.05381875741986545, "grad_norm": 5.809854030609131, "learning_rate": 3.1179108414666315e-07, "loss": 3.0459, "step": 204 }, { "epoch": 0.0543463922965308, "grad_norm": 3.5358667373657227, "learning_rate": 3.149564758638882e-07, "loss": 3.0464, "step": 206 }, { "epoch": 0.05487402717319615, "grad_norm": 4.352460861206055, "learning_rate": 3.181218675811132e-07, "loss": 3.0385, "step": 208 }, { "epoch": 0.055401662049861494, "grad_norm": 5.881943702697754, "learning_rate": 3.212872592983382e-07, "loss": 3.0499, "step": 210 }, { "epoch": 0.055929296926526845, "grad_norm": 3.854926586151123, "learning_rate": 3.244526510155632e-07, "loss": 3.0455, "step": 212 }, { "epoch": 0.05645693180319219, "grad_norm": 5.436080455780029, "learning_rate": 3.2761804273278817e-07, "loss": 3.0491, "step": 214 }, { "epoch": 0.05698456667985754, "grad_norm": 3.948077917098999, "learning_rate": 3.307834344500132e-07, "loss": 3.0432, "step": 216 }, { "epoch": 0.05751220155652289, "grad_norm": 3.8775601387023926, "learning_rate": 3.339488261672382e-07, "loss": 3.0399, "step": 218 }, { "epoch": 0.05803983643318823, "grad_norm": 3.877772331237793, "learning_rate": 3.371142178844632e-07, "loss": 3.0459, "step": 220 }, { "epoch": 0.05856747130985358, "grad_norm": 3.9653570652008057, "learning_rate": 3.4027960960168825e-07, "loss": 3.0328, "step": 222 }, { "epoch": 0.05909510618651893, "grad_norm": 17.950790405273438, "learning_rate": 3.4344500131891324e-07, "loss": 3.0472, "step": 224 }, { "epoch": 0.05962274106318428, "grad_norm": 4.24899435043335, "learning_rate": 3.4661039303613824e-07, "loss": 3.0339, "step": 226 }, { "epoch": 0.06015037593984962, "grad_norm": 3.7685251235961914, "learning_rate": 3.4977578475336323e-07, "loss": 3.028, "step": 228 }, { "epoch": 0.06067801081651497, "grad_norm": 6.790403366088867, "learning_rate": 3.529411764705882e-07, "loss": 3.0255, "step": 230 }, { "epoch": 0.06120564569318032, "grad_norm": 14.249598503112793, "learning_rate": 3.561065681878132e-07, "loss": 3.0255, "step": 232 }, { "epoch": 0.061733280569845665, "grad_norm": 3.8878512382507324, "learning_rate": 3.5927195990503827e-07, "loss": 3.0194, "step": 234 }, { "epoch": 0.062260915446511016, "grad_norm": 181.97958374023438, "learning_rate": 3.6243735162226326e-07, "loss": 3.0384, "step": 236 }, { "epoch": 0.06278855032317636, "grad_norm": 26.713119506835938, "learning_rate": 3.656027433394883e-07, "loss": 3.0098, "step": 238 }, { "epoch": 0.06331618519984171, "grad_norm": 3.728546380996704, "learning_rate": 3.687681350567133e-07, "loss": 3.0123, "step": 240 }, { "epoch": 0.06384382007650706, "grad_norm": 3.748676061630249, "learning_rate": 3.719335267739383e-07, "loss": 3.0116, "step": 242 }, { "epoch": 0.06437145495317241, "grad_norm": 4.286544322967529, "learning_rate": 3.750989184911633e-07, "loss": 3.0159, "step": 244 }, { "epoch": 0.06489908982983775, "grad_norm": 4.723263263702393, "learning_rate": 3.7826431020838833e-07, "loss": 3.0175, "step": 246 }, { "epoch": 0.0654267247065031, "grad_norm": 51.690547943115234, "learning_rate": 3.8142970192561333e-07, "loss": 3.0028, "step": 248 }, { "epoch": 0.06595435958316845, "grad_norm": 6.1476216316223145, "learning_rate": 3.845950936428383e-07, "loss": 3.0194, "step": 250 }, { "epoch": 0.0664819944598338, "grad_norm": 3.8994789123535156, "learning_rate": 3.877604853600633e-07, "loss": 2.9916, "step": 252 }, { "epoch": 0.06700962933649915, "grad_norm": 3.4499690532684326, "learning_rate": 3.909258770772883e-07, "loss": 3.0055, "step": 254 }, { "epoch": 0.06753726421316449, "grad_norm": 3.774888515472412, "learning_rate": 3.940912687945133e-07, "loss": 2.9939, "step": 256 }, { "epoch": 0.06806489908982984, "grad_norm": 5.102309703826904, "learning_rate": 3.9725666051173835e-07, "loss": 2.9886, "step": 258 }, { "epoch": 0.06859253396649519, "grad_norm": 3.595855474472046, "learning_rate": 4.0042205222896334e-07, "loss": 2.999, "step": 260 }, { "epoch": 0.06912016884316054, "grad_norm": 3.9701950550079346, "learning_rate": 4.0358744394618834e-07, "loss": 2.9745, "step": 262 }, { "epoch": 0.06964780371982587, "grad_norm": 5.626059532165527, "learning_rate": 4.0675283566341333e-07, "loss": 2.9807, "step": 264 }, { "epoch": 0.07017543859649122, "grad_norm": 3.325183153152466, "learning_rate": 4.099182273806383e-07, "loss": 2.9869, "step": 266 }, { "epoch": 0.07070307347315657, "grad_norm": 4.714856147766113, "learning_rate": 4.130836190978634e-07, "loss": 2.9738, "step": 268 }, { "epoch": 0.07123070834982193, "grad_norm": 27.470544815063477, "learning_rate": 4.162490108150884e-07, "loss": 2.9722, "step": 270 }, { "epoch": 0.07175834322648728, "grad_norm": 4.5491557121276855, "learning_rate": 4.194144025323134e-07, "loss": 2.9604, "step": 272 }, { "epoch": 0.07228597810315261, "grad_norm": 3.8969497680664062, "learning_rate": 4.225797942495384e-07, "loss": 2.9593, "step": 274 }, { "epoch": 0.07281361297981796, "grad_norm": 26.62067413330078, "learning_rate": 4.257451859667634e-07, "loss": 2.9719, "step": 276 }, { "epoch": 0.07334124785648331, "grad_norm": 79.6410140991211, "learning_rate": 4.2891057768398834e-07, "loss": 2.9404, "step": 278 }, { "epoch": 0.07386888273314866, "grad_norm": 4.5006842613220215, "learning_rate": 4.3207596940121344e-07, "loss": 2.9676, "step": 280 }, { "epoch": 0.07439651760981401, "grad_norm": 4.560907363891602, "learning_rate": 4.3524136111843843e-07, "loss": 2.9407, "step": 282 }, { "epoch": 0.07492415248647935, "grad_norm": 4.374421119689941, "learning_rate": 4.384067528356634e-07, "loss": 2.946, "step": 284 }, { "epoch": 0.0754517873631447, "grad_norm": 4.291843414306641, "learning_rate": 4.415721445528884e-07, "loss": 2.9504, "step": 286 }, { "epoch": 0.07597942223981005, "grad_norm": 250.4010009765625, "learning_rate": 4.447375362701134e-07, "loss": 2.9324, "step": 288 }, { "epoch": 0.0765070571164754, "grad_norm": 5.000425338745117, "learning_rate": 4.4790292798733846e-07, "loss": 2.9429, "step": 290 }, { "epoch": 0.07703469199314075, "grad_norm": 3.4573416709899902, "learning_rate": 4.5106831970456345e-07, "loss": 2.9532, "step": 292 }, { "epoch": 0.07756232686980609, "grad_norm": 5.4387383460998535, "learning_rate": 4.5423371142178845e-07, "loss": 2.941, "step": 294 }, { "epoch": 0.07808996174647144, "grad_norm": 18.556203842163086, "learning_rate": 4.5739910313901344e-07, "loss": 2.9316, "step": 296 }, { "epoch": 0.07861759662313679, "grad_norm": 3.8978123664855957, "learning_rate": 4.6056449485623843e-07, "loss": 2.9137, "step": 298 }, { "epoch": 0.07914523149980214, "grad_norm": 27.42274284362793, "learning_rate": 4.6372988657346353e-07, "loss": 2.8984, "step": 300 }, { "epoch": 0.07967286637646749, "grad_norm": 4.952641487121582, "learning_rate": 4.6689527829068853e-07, "loss": 2.9019, "step": 302 }, { "epoch": 0.08020050125313283, "grad_norm": 3.9949371814727783, "learning_rate": 4.700606700079135e-07, "loss": 2.9122, "step": 304 }, { "epoch": 0.08072813612979818, "grad_norm": 4.310922145843506, "learning_rate": 4.732260617251385e-07, "loss": 2.9039, "step": 306 }, { "epoch": 0.08125577100646353, "grad_norm": 439.82720947265625, "learning_rate": 4.763914534423635e-07, "loss": 2.914, "step": 308 }, { "epoch": 0.08178340588312888, "grad_norm": 4.2072601318359375, "learning_rate": 4.795568451595885e-07, "loss": 2.896, "step": 310 }, { "epoch": 0.08231104075979422, "grad_norm": 6.234478950500488, "learning_rate": 4.827222368768136e-07, "loss": 2.8787, "step": 312 }, { "epoch": 0.08283867563645957, "grad_norm": 4.07114315032959, "learning_rate": 4.858876285940385e-07, "loss": 2.8823, "step": 314 }, { "epoch": 0.08336631051312492, "grad_norm": 12.13647747039795, "learning_rate": 4.890530203112635e-07, "loss": 2.8418, "step": 316 }, { "epoch": 0.08389394538979027, "grad_norm": 4.440990447998047, "learning_rate": 4.922184120284885e-07, "loss": 2.8621, "step": 318 }, { "epoch": 0.08442158026645562, "grad_norm": 4.676474571228027, "learning_rate": 4.953838037457135e-07, "loss": 2.8406, "step": 320 }, { "epoch": 0.08494921514312095, "grad_norm": 6.979465007781982, "learning_rate": 4.985491954629386e-07, "loss": 2.8595, "step": 322 }, { "epoch": 0.0854768500197863, "grad_norm": 23.532075881958008, "learning_rate": 5.017145871801636e-07, "loss": 2.8519, "step": 324 }, { "epoch": 0.08600448489645166, "grad_norm": 4.47237491607666, "learning_rate": 5.048799788973886e-07, "loss": 2.8248, "step": 326 }, { "epoch": 0.086532119773117, "grad_norm": 8.386637687683105, "learning_rate": 5.080453706146136e-07, "loss": 2.8274, "step": 328 }, { "epoch": 0.08705975464978236, "grad_norm": 6.28348970413208, "learning_rate": 5.112107623318385e-07, "loss": 2.8229, "step": 330 }, { "epoch": 0.08758738952644769, "grad_norm": 39.5052490234375, "learning_rate": 5.143761540490635e-07, "loss": 2.8051, "step": 332 }, { "epoch": 0.08811502440311304, "grad_norm": 8.301041603088379, "learning_rate": 5.175415457662886e-07, "loss": 2.7887, "step": 334 }, { "epoch": 0.0886426592797784, "grad_norm": 5.466032028198242, "learning_rate": 5.207069374835136e-07, "loss": 2.799, "step": 336 }, { "epoch": 0.08917029415644374, "grad_norm": 5.751029014587402, "learning_rate": 5.238723292007386e-07, "loss": 2.7671, "step": 338 }, { "epoch": 0.0896979290331091, "grad_norm": 18.97295379638672, "learning_rate": 5.270377209179636e-07, "loss": 2.7386, "step": 340 }, { "epoch": 0.09022556390977443, "grad_norm": 7.221994400024414, "learning_rate": 5.302031126351886e-07, "loss": 2.7361, "step": 342 }, { "epoch": 0.09075319878643978, "grad_norm": 6.9952569007873535, "learning_rate": 5.333685043524137e-07, "loss": 2.7109, "step": 344 }, { "epoch": 0.09128083366310513, "grad_norm": 7.349247932434082, "learning_rate": 5.365338960696386e-07, "loss": 2.6863, "step": 346 }, { "epoch": 0.09180846853977048, "grad_norm": 18.650484085083008, "learning_rate": 5.396992877868636e-07, "loss": 2.6557, "step": 348 }, { "epoch": 0.09233610341643583, "grad_norm": 18.881860733032227, "learning_rate": 5.428646795040886e-07, "loss": 2.6363, "step": 350 }, { "epoch": 0.09286373829310117, "grad_norm": 10.242255210876465, "learning_rate": 5.460300712213136e-07, "loss": 2.6116, "step": 352 }, { "epoch": 0.09339137316976652, "grad_norm": 93.39981842041016, "learning_rate": 5.491954629385386e-07, "loss": 2.5872, "step": 354 }, { "epoch": 0.09391900804643187, "grad_norm": 15.660743713378906, "learning_rate": 5.523608546557637e-07, "loss": 2.5354, "step": 356 }, { "epoch": 0.09444664292309722, "grad_norm": 12.351272583007812, "learning_rate": 5.555262463729887e-07, "loss": 2.4601, "step": 358 }, { "epoch": 0.09497427779976256, "grad_norm": 11.470568656921387, "learning_rate": 5.586916380902137e-07, "loss": 2.4743, "step": 360 }, { "epoch": 0.09550191267642791, "grad_norm": 13.759496688842773, "learning_rate": 5.618570298074387e-07, "loss": 2.3909, "step": 362 }, { "epoch": 0.09602954755309326, "grad_norm": 22.23471450805664, "learning_rate": 5.650224215246636e-07, "loss": 2.3572, "step": 364 }, { "epoch": 0.09655718242975861, "grad_norm": 18.849763870239258, "learning_rate": 5.681878132418887e-07, "loss": 2.3659, "step": 366 }, { "epoch": 0.09708481730642396, "grad_norm": 7.141429424285889, "learning_rate": 5.713532049591137e-07, "loss": 2.3717, "step": 368 }, { "epoch": 0.0976124521830893, "grad_norm": 7.242023468017578, "learning_rate": 5.745185966763387e-07, "loss": 2.3093, "step": 370 }, { "epoch": 0.09814008705975465, "grad_norm": 10.476563453674316, "learning_rate": 5.776839883935637e-07, "loss": 2.3104, "step": 372 }, { "epoch": 0.09866772193642, "grad_norm": 11.235527992248535, "learning_rate": 5.808493801107887e-07, "loss": 2.335, "step": 374 }, { "epoch": 0.09919535681308535, "grad_norm": 7.1529316902160645, "learning_rate": 5.840147718280138e-07, "loss": 2.2767, "step": 376 }, { "epoch": 0.0997229916897507, "grad_norm": 16.254568099975586, "learning_rate": 5.871801635452387e-07, "loss": 2.298, "step": 378 }, { "epoch": 0.10025062656641603, "grad_norm": 7.703162670135498, "learning_rate": 5.903455552624637e-07, "loss": 2.2543, "step": 380 }, { "epoch": 0.10077826144308139, "grad_norm": 6.903515815734863, "learning_rate": 5.935109469796887e-07, "loss": 2.2344, "step": 382 }, { "epoch": 0.10130589631974674, "grad_norm": 39.57255935668945, "learning_rate": 5.966763386969137e-07, "loss": 2.1989, "step": 384 }, { "epoch": 0.10183353119641209, "grad_norm": 312.2984924316406, "learning_rate": 5.998417304141387e-07, "loss": 2.2378, "step": 386 }, { "epoch": 0.10236116607307744, "grad_norm": 6.538335800170898, "learning_rate": 6.030071221313638e-07, "loss": 2.1629, "step": 388 }, { "epoch": 0.10288880094974277, "grad_norm": 12.139670372009277, "learning_rate": 6.061725138485888e-07, "loss": 2.1675, "step": 390 }, { "epoch": 0.10341643582640812, "grad_norm": 10.272560119628906, "learning_rate": 6.093379055658138e-07, "loss": 2.2078, "step": 392 }, { "epoch": 0.10394407070307347, "grad_norm": 26.437850952148438, "learning_rate": 6.125032972830388e-07, "loss": 2.1006, "step": 394 }, { "epoch": 0.10447170557973882, "grad_norm": 6.894162654876709, "learning_rate": 6.156686890002638e-07, "loss": 2.2138, "step": 396 }, { "epoch": 0.10499934045640417, "grad_norm": 6.283015251159668, "learning_rate": 6.188340807174889e-07, "loss": 2.1231, "step": 398 }, { "epoch": 0.10552697533306951, "grad_norm": 5.862778186798096, "learning_rate": 6.219994724347139e-07, "loss": 2.0995, "step": 400 }, { "epoch": 0.10605461020973486, "grad_norm": 20.769254684448242, "learning_rate": 6.251648641519388e-07, "loss": 2.0735, "step": 402 }, { "epoch": 0.10658224508640021, "grad_norm": 4.914285659790039, "learning_rate": 6.283302558691638e-07, "loss": 2.1858, "step": 404 }, { "epoch": 0.10710987996306556, "grad_norm": 6.644514083862305, "learning_rate": 6.314956475863888e-07, "loss": 2.091, "step": 406 }, { "epoch": 0.1076375148397309, "grad_norm": 5.144701957702637, "learning_rate": 6.346610393036138e-07, "loss": 2.085, "step": 408 }, { "epoch": 0.10816514971639625, "grad_norm": 67.21956634521484, "learning_rate": 6.378264310208388e-07, "loss": 2.0787, "step": 410 }, { "epoch": 0.1086927845930616, "grad_norm": 346.9925537109375, "learning_rate": 6.409918227380638e-07, "loss": 2.0834, "step": 412 }, { "epoch": 0.10922041946972695, "grad_norm": 35.35666275024414, "learning_rate": 6.441572144552888e-07, "loss": 2.0581, "step": 414 }, { "epoch": 0.1097480543463923, "grad_norm": 4.849177837371826, "learning_rate": 6.473226061725138e-07, "loss": 2.0783, "step": 416 }, { "epoch": 0.11027568922305764, "grad_norm": 12.528923988342285, "learning_rate": 6.504879978897388e-07, "loss": 2.0375, "step": 418 }, { "epoch": 0.11080332409972299, "grad_norm": 8.717477798461914, "learning_rate": 6.536533896069639e-07, "loss": 1.9441, "step": 420 }, { "epoch": 0.11133095897638834, "grad_norm": 5.233604431152344, "learning_rate": 6.568187813241889e-07, "loss": 2.0353, "step": 422 }, { "epoch": 0.11185859385305369, "grad_norm": 8.945021629333496, "learning_rate": 6.599841730414139e-07, "loss": 2.0427, "step": 424 }, { "epoch": 0.11238622872971904, "grad_norm": 13.68307876586914, "learning_rate": 6.631495647586389e-07, "loss": 1.9788, "step": 426 }, { "epoch": 0.11291386360638438, "grad_norm": 221.02398681640625, "learning_rate": 6.663149564758639e-07, "loss": 1.9574, "step": 428 }, { "epoch": 0.11344149848304973, "grad_norm": 10.046215057373047, "learning_rate": 6.694803481930889e-07, "loss": 1.9946, "step": 430 }, { "epoch": 0.11396913335971508, "grad_norm": 42.39809799194336, "learning_rate": 6.72645739910314e-07, "loss": 1.9223, "step": 432 }, { "epoch": 0.11449676823638043, "grad_norm": 31.401229858398438, "learning_rate": 6.75811131627539e-07, "loss": 1.958, "step": 434 }, { "epoch": 0.11502440311304578, "grad_norm": 4.510138988494873, "learning_rate": 6.78976523344764e-07, "loss": 1.9795, "step": 436 }, { "epoch": 0.11555203798971111, "grad_norm": 26.804170608520508, "learning_rate": 6.821419150619889e-07, "loss": 1.91, "step": 438 }, { "epoch": 0.11607967286637647, "grad_norm": 47.06333923339844, "learning_rate": 6.853073067792139e-07, "loss": 1.9556, "step": 440 }, { "epoch": 0.11660730774304182, "grad_norm": 8.737351417541504, "learning_rate": 6.884726984964389e-07, "loss": 1.9659, "step": 442 }, { "epoch": 0.11713494261970717, "grad_norm": 5.002654075622559, "learning_rate": 6.916380902136639e-07, "loss": 1.9172, "step": 444 }, { "epoch": 0.11766257749637252, "grad_norm": 5.708297252655029, "learning_rate": 6.948034819308889e-07, "loss": 1.899, "step": 446 }, { "epoch": 0.11819021237303785, "grad_norm": 4.335618019104004, "learning_rate": 6.979688736481139e-07, "loss": 1.9121, "step": 448 }, { "epoch": 0.1187178472497032, "grad_norm": 16.725826263427734, "learning_rate": 7.011342653653389e-07, "loss": 1.9543, "step": 450 }, { "epoch": 0.11924548212636855, "grad_norm": 3.3918020725250244, "learning_rate": 7.04299657082564e-07, "loss": 1.9807, "step": 452 }, { "epoch": 0.1197731170030339, "grad_norm": 3.315584182739258, "learning_rate": 7.07465048799789e-07, "loss": 1.924, "step": 454 }, { "epoch": 0.12030075187969924, "grad_norm": 4.775875568389893, "learning_rate": 7.10630440517014e-07, "loss": 1.9887, "step": 456 }, { "epoch": 0.12082838675636459, "grad_norm": 3.3061673641204834, "learning_rate": 7.13795832234239e-07, "loss": 1.9296, "step": 458 }, { "epoch": 0.12135602163302994, "grad_norm": 11.375423431396484, "learning_rate": 7.16961223951464e-07, "loss": 1.8258, "step": 460 }, { "epoch": 0.12188365650969529, "grad_norm": 3.138812303543091, "learning_rate": 7.20126615668689e-07, "loss": 1.9243, "step": 462 }, { "epoch": 0.12241129138636064, "grad_norm": 3.5061349868774414, "learning_rate": 7.232920073859141e-07, "loss": 1.809, "step": 464 }, { "epoch": 0.12293892626302598, "grad_norm": 7.768166542053223, "learning_rate": 7.264573991031391e-07, "loss": 1.8456, "step": 466 }, { "epoch": 0.12346656113969133, "grad_norm": 5.6815290451049805, "learning_rate": 7.296227908203641e-07, "loss": 1.8473, "step": 468 }, { "epoch": 0.12399419601635668, "grad_norm": 4.317296504974365, "learning_rate": 7.327881825375891e-07, "loss": 1.8428, "step": 470 }, { "epoch": 0.12452183089302203, "grad_norm": 10.897123336791992, "learning_rate": 7.35953574254814e-07, "loss": 1.832, "step": 472 }, { "epoch": 0.12504946576968737, "grad_norm": 4.75904655456543, "learning_rate": 7.39118965972039e-07, "loss": 1.7447, "step": 474 }, { "epoch": 0.12557710064635272, "grad_norm": 3.115638256072998, "learning_rate": 7.42284357689264e-07, "loss": 1.7623, "step": 476 }, { "epoch": 0.12610473552301807, "grad_norm": 7.685421466827393, "learning_rate": 7.45449749406489e-07, "loss": 1.8404, "step": 478 }, { "epoch": 0.12663237039968342, "grad_norm": 2.861933708190918, "learning_rate": 7.48615141123714e-07, "loss": 1.8295, "step": 480 }, { "epoch": 0.12716000527634877, "grad_norm": 14.216230392456055, "learning_rate": 7.51780532840939e-07, "loss": 1.8809, "step": 482 }, { "epoch": 0.12768764015301412, "grad_norm": 3.920020580291748, "learning_rate": 7.549459245581641e-07, "loss": 1.8081, "step": 484 }, { "epoch": 0.12821527502967947, "grad_norm": 14.852561950683594, "learning_rate": 7.58111316275389e-07, "loss": 1.821, "step": 486 }, { "epoch": 0.12874290990634482, "grad_norm": 3.8297274112701416, "learning_rate": 7.612767079926141e-07, "loss": 1.756, "step": 488 }, { "epoch": 0.12927054478301014, "grad_norm": 2.5728490352630615, "learning_rate": 7.644420997098392e-07, "loss": 1.7976, "step": 490 }, { "epoch": 0.1297981796596755, "grad_norm": 2.6024117469787598, "learning_rate": 7.676074914270641e-07, "loss": 1.8201, "step": 492 }, { "epoch": 0.13032581453634084, "grad_norm": 90.09078216552734, "learning_rate": 7.707728831442892e-07, "loss": 1.8757, "step": 494 }, { "epoch": 0.1308534494130062, "grad_norm": 3.4318013191223145, "learning_rate": 7.739382748615141e-07, "loss": 1.6999, "step": 496 }, { "epoch": 0.13138108428967155, "grad_norm": 4.377386569976807, "learning_rate": 7.771036665787392e-07, "loss": 1.7231, "step": 498 }, { "epoch": 0.1319087191663369, "grad_norm": 2.983527183532715, "learning_rate": 7.802690582959641e-07, "loss": 1.762, "step": 500 }, { "epoch": 0.13243635404300225, "grad_norm": 3.167055606842041, "learning_rate": 7.834344500131892e-07, "loss": 1.8063, "step": 502 }, { "epoch": 0.1329639889196676, "grad_norm": 2.8757708072662354, "learning_rate": 7.865998417304143e-07, "loss": 1.7856, "step": 504 }, { "epoch": 0.13349162379633295, "grad_norm": 5.490010738372803, "learning_rate": 7.897652334476392e-07, "loss": 1.8106, "step": 506 }, { "epoch": 0.1340192586729983, "grad_norm": 2.4995267391204834, "learning_rate": 7.929306251648641e-07, "loss": 1.7826, "step": 508 }, { "epoch": 0.13454689354966362, "grad_norm": 2.523712396621704, "learning_rate": 7.960960168820891e-07, "loss": 1.766, "step": 510 }, { "epoch": 0.13507452842632897, "grad_norm": 231.08944702148438, "learning_rate": 7.992614085993141e-07, "loss": 1.7917, "step": 512 }, { "epoch": 0.13560216330299432, "grad_norm": 4.6620588302612305, "learning_rate": 8.024268003165392e-07, "loss": 1.7039, "step": 514 }, { "epoch": 0.13612979817965967, "grad_norm": 2.528660774230957, "learning_rate": 8.055921920337641e-07, "loss": 1.8108, "step": 516 }, { "epoch": 0.13665743305632502, "grad_norm": 2.8519413471221924, "learning_rate": 8.087575837509892e-07, "loss": 1.734, "step": 518 }, { "epoch": 0.13718506793299037, "grad_norm": 2.6957249641418457, "learning_rate": 8.103402796096018e-07, "loss": 1.7094, "step": 520 }, { "epoch": 0.13771270280965572, "grad_norm": 2.7327024936676025, "learning_rate": 8.135056713268267e-07, "loss": 1.7397, "step": 522 }, { "epoch": 0.13824033768632107, "grad_norm": 2.275146245956421, "learning_rate": 8.166710630440518e-07, "loss": 1.7572, "step": 524 }, { "epoch": 0.13876797256298642, "grad_norm": 11.106001853942871, "learning_rate": 8.198364547612766e-07, "loss": 1.6726, "step": 526 }, { "epoch": 0.13929560743965175, "grad_norm": 13.439924240112305, "learning_rate": 8.230018464785017e-07, "loss": 1.6798, "step": 528 }, { "epoch": 0.1398232423163171, "grad_norm": 13.131622314453125, "learning_rate": 8.261672381957268e-07, "loss": 1.7929, "step": 530 }, { "epoch": 0.14035087719298245, "grad_norm": 1.8635261058807373, "learning_rate": 8.293326299129517e-07, "loss": 1.8191, "step": 532 }, { "epoch": 0.1408785120696478, "grad_norm": 2.3563292026519775, "learning_rate": 8.324980216301768e-07, "loss": 1.7729, "step": 534 }, { "epoch": 0.14140614694631315, "grad_norm": 4.519763469696045, "learning_rate": 8.356634133474017e-07, "loss": 1.6867, "step": 536 }, { "epoch": 0.1419337818229785, "grad_norm": 19.113256454467773, "learning_rate": 8.388288050646268e-07, "loss": 1.7382, "step": 538 }, { "epoch": 0.14246141669964385, "grad_norm": 14.166841506958008, "learning_rate": 8.419941967818518e-07, "loss": 1.6163, "step": 540 }, { "epoch": 0.1429890515763092, "grad_norm": 2.117267370223999, "learning_rate": 8.451595884990768e-07, "loss": 1.69, "step": 542 }, { "epoch": 0.14351668645297455, "grad_norm": 1.8111017942428589, "learning_rate": 8.483249802163018e-07, "loss": 1.6765, "step": 544 }, { "epoch": 0.1440443213296399, "grad_norm": 5.602942943572998, "learning_rate": 8.514903719335268e-07, "loss": 1.7221, "step": 546 }, { "epoch": 0.14457195620630522, "grad_norm": 30.669492721557617, "learning_rate": 8.546557636507518e-07, "loss": 1.692, "step": 548 }, { "epoch": 0.14509959108297057, "grad_norm": 13.780606269836426, "learning_rate": 8.578211553679767e-07, "loss": 1.6894, "step": 550 }, { "epoch": 0.14562722595963593, "grad_norm": 1.9086394309997559, "learning_rate": 8.609865470852018e-07, "loss": 1.7092, "step": 552 }, { "epoch": 0.14615486083630128, "grad_norm": 1.7397961616516113, "learning_rate": 8.641519388024269e-07, "loss": 1.7322, "step": 554 }, { "epoch": 0.14668249571296663, "grad_norm": 2.6899242401123047, "learning_rate": 8.673173305196518e-07, "loss": 1.7577, "step": 556 }, { "epoch": 0.14721013058963198, "grad_norm": 2.3949573040008545, "learning_rate": 8.704827222368769e-07, "loss": 1.6528, "step": 558 }, { "epoch": 0.14773776546629733, "grad_norm": 1.8351446390151978, "learning_rate": 8.736481139541018e-07, "loss": 1.6825, "step": 560 }, { "epoch": 0.14826540034296268, "grad_norm": 4.145223617553711, "learning_rate": 8.768135056713269e-07, "loss": 1.7218, "step": 562 }, { "epoch": 0.14879303521962803, "grad_norm": 1.7428418397903442, "learning_rate": 8.79978897388552e-07, "loss": 1.6197, "step": 564 }, { "epoch": 0.14932067009629338, "grad_norm": 1.8484017848968506, "learning_rate": 8.831442891057768e-07, "loss": 1.6237, "step": 566 }, { "epoch": 0.1498483049729587, "grad_norm": 2.2409474849700928, "learning_rate": 8.863096808230019e-07, "loss": 1.6375, "step": 568 }, { "epoch": 0.15037593984962405, "grad_norm": 3.7220683097839355, "learning_rate": 8.894750725402268e-07, "loss": 1.6541, "step": 570 }, { "epoch": 0.1509035747262894, "grad_norm": 2.456707715988159, "learning_rate": 8.926404642574519e-07, "loss": 1.5779, "step": 572 }, { "epoch": 0.15143120960295475, "grad_norm": 3.2933413982391357, "learning_rate": 8.958058559746769e-07, "loss": 1.6528, "step": 574 }, { "epoch": 0.1519588444796201, "grad_norm": 6.106878280639648, "learning_rate": 8.989712476919019e-07, "loss": 1.7001, "step": 576 }, { "epoch": 0.15248647935628545, "grad_norm": 1.9034886360168457, "learning_rate": 9.021366394091269e-07, "loss": 1.7205, "step": 578 }, { "epoch": 0.1530141142329508, "grad_norm": 1.5437761545181274, "learning_rate": 9.053020311263519e-07, "loss": 1.7294, "step": 580 }, { "epoch": 0.15354174910961615, "grad_norm": 1.6879631280899048, "learning_rate": 9.084674228435769e-07, "loss": 1.6741, "step": 582 }, { "epoch": 0.1540693839862815, "grad_norm": 3.208197832107544, "learning_rate": 9.11632814560802e-07, "loss": 1.6705, "step": 584 }, { "epoch": 0.15459701886294683, "grad_norm": 3.211737632751465, "learning_rate": 9.147982062780269e-07, "loss": 1.6378, "step": 586 }, { "epoch": 0.15512465373961218, "grad_norm": 4.817298412322998, "learning_rate": 9.17963597995252e-07, "loss": 1.6775, "step": 588 }, { "epoch": 0.15565228861627753, "grad_norm": 2.6161069869995117, "learning_rate": 9.211289897124769e-07, "loss": 1.6476, "step": 590 }, { "epoch": 0.15617992349294288, "grad_norm": 1.4974908828735352, "learning_rate": 9.24294381429702e-07, "loss": 1.611, "step": 592 }, { "epoch": 0.15670755836960823, "grad_norm": 12.373671531677246, "learning_rate": 9.274597731469271e-07, "loss": 1.7671, "step": 594 }, { "epoch": 0.15723519324627358, "grad_norm": 4.4535112380981445, "learning_rate": 9.30625164864152e-07, "loss": 1.648, "step": 596 }, { "epoch": 0.15776282812293893, "grad_norm": 2.431645631790161, "learning_rate": 9.337905565813771e-07, "loss": 1.6018, "step": 598 }, { "epoch": 0.15829046299960428, "grad_norm": 134.37440490722656, "learning_rate": 9.369559482986019e-07, "loss": 1.5923, "step": 600 }, { "epoch": 0.15881809787626963, "grad_norm": 1.7523298263549805, "learning_rate": 9.40121340015827e-07, "loss": 1.5391, "step": 602 }, { "epoch": 0.15934573275293498, "grad_norm": 1.6761407852172852, "learning_rate": 9.432867317330519e-07, "loss": 1.6202, "step": 604 }, { "epoch": 0.1598733676296003, "grad_norm": 2.7170040607452393, "learning_rate": 9.46452123450277e-07, "loss": 1.6633, "step": 606 }, { "epoch": 0.16040100250626566, "grad_norm": 2.480468988418579, "learning_rate": 9.49617515167502e-07, "loss": 1.655, "step": 608 }, { "epoch": 0.160928637382931, "grad_norm": 1.472699522972107, "learning_rate": 9.52782906884727e-07, "loss": 1.7089, "step": 610 }, { "epoch": 0.16145627225959636, "grad_norm": 1.5007071495056152, "learning_rate": 9.559482986019521e-07, "loss": 1.6408, "step": 612 }, { "epoch": 0.1619839071362617, "grad_norm": 2.0963947772979736, "learning_rate": 9.59113690319177e-07, "loss": 1.5103, "step": 614 }, { "epoch": 0.16251154201292706, "grad_norm": 2.5707194805145264, "learning_rate": 9.62279082036402e-07, "loss": 1.6511, "step": 616 }, { "epoch": 0.1630391768895924, "grad_norm": 1.318936824798584, "learning_rate": 9.654444737536272e-07, "loss": 1.6386, "step": 618 }, { "epoch": 0.16356681176625776, "grad_norm": 1.5995737314224243, "learning_rate": 9.68609865470852e-07, "loss": 1.6966, "step": 620 }, { "epoch": 0.1640944466429231, "grad_norm": 30.013790130615234, "learning_rate": 9.71775257188077e-07, "loss": 1.5908, "step": 622 }, { "epoch": 0.16462208151958843, "grad_norm": 1.646623134613037, "learning_rate": 9.74940648905302e-07, "loss": 1.569, "step": 624 }, { "epoch": 0.16514971639625378, "grad_norm": 1.8013776540756226, "learning_rate": 9.78106040622527e-07, "loss": 1.6192, "step": 626 }, { "epoch": 0.16567735127291913, "grad_norm": 1.7122480869293213, "learning_rate": 9.81271432339752e-07, "loss": 1.6392, "step": 628 }, { "epoch": 0.16620498614958448, "grad_norm": 1.2395094633102417, "learning_rate": 9.84436824056977e-07, "loss": 1.6289, "step": 630 }, { "epoch": 0.16673262102624983, "grad_norm": 1.3338885307312012, "learning_rate": 9.87602215774202e-07, "loss": 1.6474, "step": 632 }, { "epoch": 0.16726025590291518, "grad_norm": 2.5006799697875977, "learning_rate": 9.90767607491427e-07, "loss": 1.5943, "step": 634 }, { "epoch": 0.16778789077958053, "grad_norm": 1.3727054595947266, "learning_rate": 9.93932999208652e-07, "loss": 1.5938, "step": 636 }, { "epoch": 0.16831552565624588, "grad_norm": 16.668867111206055, "learning_rate": 9.970983909258771e-07, "loss": 1.5624, "step": 638 }, { "epoch": 0.16884316053291124, "grad_norm": 3.4030957221984863, "learning_rate": 1.000263782643102e-06, "loss": 1.5584, "step": 640 }, { "epoch": 0.16937079540957659, "grad_norm": 1.9105530977249146, "learning_rate": 1.0034291743603271e-06, "loss": 1.5402, "step": 642 }, { "epoch": 0.1698984302862419, "grad_norm": 1.0987966060638428, "learning_rate": 1.006594566077552e-06, "loss": 1.6023, "step": 644 }, { "epoch": 0.17042606516290726, "grad_norm": 2.12005615234375, "learning_rate": 1.0097599577947771e-06, "loss": 1.5839, "step": 646 }, { "epoch": 0.1709537000395726, "grad_norm": 32.119136810302734, "learning_rate": 1.0129253495120022e-06, "loss": 1.4616, "step": 648 }, { "epoch": 0.17148133491623796, "grad_norm": 1.3282711505889893, "learning_rate": 1.016090741229227e-06, "loss": 1.6007, "step": 650 }, { "epoch": 0.1720089697929033, "grad_norm": 2.2703359127044678, "learning_rate": 1.0192561329464522e-06, "loss": 1.6225, "step": 652 }, { "epoch": 0.17253660466956866, "grad_norm": 1.7477887868881226, "learning_rate": 1.022421524663677e-06, "loss": 1.4285, "step": 654 }, { "epoch": 0.173064239546234, "grad_norm": 1.7680574655532837, "learning_rate": 1.0255869163809022e-06, "loss": 1.585, "step": 656 }, { "epoch": 0.17359187442289936, "grad_norm": 1.6171153783798218, "learning_rate": 1.028752308098127e-06, "loss": 1.5478, "step": 658 }, { "epoch": 0.1741195092995647, "grad_norm": 1.2142833471298218, "learning_rate": 1.0319176998153522e-06, "loss": 1.6283, "step": 660 }, { "epoch": 0.17464714417623006, "grad_norm": 1.236351490020752, "learning_rate": 1.0350830915325773e-06, "loss": 1.4972, "step": 662 }, { "epoch": 0.17517477905289539, "grad_norm": 1.2554686069488525, "learning_rate": 1.0382484832498022e-06, "loss": 1.4629, "step": 664 }, { "epoch": 0.17570241392956074, "grad_norm": 9.059700012207031, "learning_rate": 1.0414138749670273e-06, "loss": 1.4942, "step": 666 }, { "epoch": 0.1762300488062261, "grad_norm": 1.2169561386108398, "learning_rate": 1.0445792666842522e-06, "loss": 1.4853, "step": 668 }, { "epoch": 0.17675768368289144, "grad_norm": 1.7622199058532715, "learning_rate": 1.0477446584014773e-06, "loss": 1.5755, "step": 670 }, { "epoch": 0.1772853185595568, "grad_norm": 1.282034993171692, "learning_rate": 1.0509100501187024e-06, "loss": 1.5359, "step": 672 }, { "epoch": 0.17781295343622214, "grad_norm": 3.5057008266448975, "learning_rate": 1.0540754418359272e-06, "loss": 1.4498, "step": 674 }, { "epoch": 0.1783405883128875, "grad_norm": 3.269981622695923, "learning_rate": 1.0572408335531523e-06, "loss": 1.5747, "step": 676 }, { "epoch": 0.17886822318955284, "grad_norm": 1.5184547901153564, "learning_rate": 1.0604062252703772e-06, "loss": 1.5383, "step": 678 }, { "epoch": 0.1793958580662182, "grad_norm": 1.9844199419021606, "learning_rate": 1.0635716169876023e-06, "loss": 1.514, "step": 680 }, { "epoch": 0.1799234929428835, "grad_norm": 4.098089694976807, "learning_rate": 1.0667370087048274e-06, "loss": 1.5194, "step": 682 }, { "epoch": 0.18045112781954886, "grad_norm": 11.229107856750488, "learning_rate": 1.0699024004220523e-06, "loss": 1.4639, "step": 684 }, { "epoch": 0.1809787626962142, "grad_norm": 1.3155637979507446, "learning_rate": 1.0730677921392772e-06, "loss": 1.5586, "step": 686 }, { "epoch": 0.18150639757287956, "grad_norm": 2.0016496181488037, "learning_rate": 1.0762331838565023e-06, "loss": 1.4946, "step": 688 }, { "epoch": 0.1820340324495449, "grad_norm": 10.557250022888184, "learning_rate": 1.0793985755737272e-06, "loss": 1.5463, "step": 690 }, { "epoch": 0.18256166732621026, "grad_norm": 1.5625876188278198, "learning_rate": 1.0825639672909523e-06, "loss": 1.5027, "step": 692 }, { "epoch": 0.18308930220287561, "grad_norm": 1.4027928113937378, "learning_rate": 1.0857293590081772e-06, "loss": 1.4948, "step": 694 }, { "epoch": 0.18361693707954096, "grad_norm": 1.6292178630828857, "learning_rate": 1.0888947507254023e-06, "loss": 1.4391, "step": 696 }, { "epoch": 0.18414457195620632, "grad_norm": 13.869874000549316, "learning_rate": 1.0920601424426272e-06, "loss": 1.502, "step": 698 }, { "epoch": 0.18467220683287167, "grad_norm": 2.36840558052063, "learning_rate": 1.0952255341598523e-06, "loss": 1.4188, "step": 700 }, { "epoch": 0.185199841709537, "grad_norm": 2.2423574924468994, "learning_rate": 1.0983909258770772e-06, "loss": 1.4677, "step": 702 }, { "epoch": 0.18572747658620234, "grad_norm": 4.243188858032227, "learning_rate": 1.1015563175943023e-06, "loss": 1.4032, "step": 704 }, { "epoch": 0.1862551114628677, "grad_norm": 4.7017822265625, "learning_rate": 1.1047217093115273e-06, "loss": 1.4371, "step": 706 }, { "epoch": 0.18678274633953304, "grad_norm": 6.137529373168945, "learning_rate": 1.1078871010287522e-06, "loss": 1.4116, "step": 708 }, { "epoch": 0.1873103812161984, "grad_norm": 1.8684600591659546, "learning_rate": 1.1110524927459773e-06, "loss": 1.4272, "step": 710 }, { "epoch": 0.18783801609286374, "grad_norm": 2.5977835655212402, "learning_rate": 1.1142178844632022e-06, "loss": 1.3857, "step": 712 }, { "epoch": 0.1883656509695291, "grad_norm": 2.460083484649658, "learning_rate": 1.1173832761804273e-06, "loss": 1.4007, "step": 714 }, { "epoch": 0.18889328584619444, "grad_norm": 2.51881742477417, "learning_rate": 1.1205486678976524e-06, "loss": 1.3193, "step": 716 }, { "epoch": 0.1894209207228598, "grad_norm": 1.9214409589767456, "learning_rate": 1.1237140596148773e-06, "loss": 1.3716, "step": 718 }, { "epoch": 0.18994855559952512, "grad_norm": 1.8559600114822388, "learning_rate": 1.1268794513321024e-06, "loss": 1.2991, "step": 720 }, { "epoch": 0.19047619047619047, "grad_norm": 8.61906909942627, "learning_rate": 1.1300448430493273e-06, "loss": 1.3363, "step": 722 }, { "epoch": 0.19100382535285582, "grad_norm": 7.767070293426514, "learning_rate": 1.1332102347665524e-06, "loss": 1.3368, "step": 724 }, { "epoch": 0.19153146022952117, "grad_norm": 3.286449909210205, "learning_rate": 1.1363756264837775e-06, "loss": 1.3834, "step": 726 }, { "epoch": 0.19205909510618652, "grad_norm": 2.04839825630188, "learning_rate": 1.1395410182010024e-06, "loss": 1.3843, "step": 728 }, { "epoch": 0.19258672998285187, "grad_norm": 16.70387077331543, "learning_rate": 1.1427064099182275e-06, "loss": 1.3074, "step": 730 }, { "epoch": 0.19311436485951722, "grad_norm": 2.1279189586639404, "learning_rate": 1.1458718016354524e-06, "loss": 1.2979, "step": 732 }, { "epoch": 0.19364199973618257, "grad_norm": 2.3043153285980225, "learning_rate": 1.1490371933526775e-06, "loss": 1.2439, "step": 734 }, { "epoch": 0.19416963461284792, "grad_norm": 5.022019863128662, "learning_rate": 1.1522025850699026e-06, "loss": 1.3042, "step": 736 }, { "epoch": 0.19469726948951327, "grad_norm": 3.1726889610290527, "learning_rate": 1.1553679767871275e-06, "loss": 1.3584, "step": 738 }, { "epoch": 0.1952249043661786, "grad_norm": 1.7981421947479248, "learning_rate": 1.1585333685043526e-06, "loss": 1.2869, "step": 740 }, { "epoch": 0.19575253924284394, "grad_norm": 9.865686416625977, "learning_rate": 1.1616987602215774e-06, "loss": 1.2508, "step": 742 }, { "epoch": 0.1962801741195093, "grad_norm": 2.0200655460357666, "learning_rate": 1.1648641519388025e-06, "loss": 1.2733, "step": 744 }, { "epoch": 0.19680780899617464, "grad_norm": 2.3451051712036133, "learning_rate": 1.1680295436560276e-06, "loss": 1.2759, "step": 746 }, { "epoch": 0.19733544387284, "grad_norm": 2.7840566635131836, "learning_rate": 1.1711949353732525e-06, "loss": 1.3548, "step": 748 }, { "epoch": 0.19786307874950534, "grad_norm": 1.8059780597686768, "learning_rate": 1.1743603270904774e-06, "loss": 1.289, "step": 750 }, { "epoch": 0.1983907136261707, "grad_norm": 3.119260549545288, "learning_rate": 1.1775257188077025e-06, "loss": 1.2401, "step": 752 }, { "epoch": 0.19891834850283605, "grad_norm": 16.02177619934082, "learning_rate": 1.1806911105249274e-06, "loss": 1.271, "step": 754 }, { "epoch": 0.1994459833795014, "grad_norm": 2.6665375232696533, "learning_rate": 1.1838565022421525e-06, "loss": 1.2175, "step": 756 }, { "epoch": 0.19997361825616675, "grad_norm": 3.429889678955078, "learning_rate": 1.1870218939593774e-06, "loss": 1.2138, "step": 758 }, { "epoch": 0.20050125313283207, "grad_norm": 26.781169891357422, "learning_rate": 1.1901872856766025e-06, "loss": 1.2507, "step": 760 }, { "epoch": 0.20102888800949742, "grad_norm": 4.237110614776611, "learning_rate": 1.1933526773938274e-06, "loss": 1.1563, "step": 762 }, { "epoch": 0.20155652288616277, "grad_norm": 1.87749445438385, "learning_rate": 1.1965180691110525e-06, "loss": 1.095, "step": 764 }, { "epoch": 0.20208415776282812, "grad_norm": 5.437673568725586, "learning_rate": 1.1996834608282774e-06, "loss": 1.2088, "step": 766 }, { "epoch": 0.20261179263949347, "grad_norm": 4.96746301651001, "learning_rate": 1.2028488525455025e-06, "loss": 1.2614, "step": 768 }, { "epoch": 0.20313942751615882, "grad_norm": 3.4945335388183594, "learning_rate": 1.2060142442627276e-06, "loss": 1.1909, "step": 770 }, { "epoch": 0.20366706239282417, "grad_norm": 7.298605442047119, "learning_rate": 1.2091796359799525e-06, "loss": 1.1887, "step": 772 }, { "epoch": 0.20419469726948952, "grad_norm": 3.4771487712860107, "learning_rate": 1.2123450276971776e-06, "loss": 1.2188, "step": 774 }, { "epoch": 0.20472233214615487, "grad_norm": 2.1659739017486572, "learning_rate": 1.2155104194144024e-06, "loss": 1.1953, "step": 776 }, { "epoch": 0.2052499670228202, "grad_norm": 2.1853280067443848, "learning_rate": 1.2186758111316275e-06, "loss": 1.1696, "step": 778 }, { "epoch": 0.20577760189948555, "grad_norm": 1.8405373096466064, "learning_rate": 1.2218412028488526e-06, "loss": 1.1831, "step": 780 }, { "epoch": 0.2063052367761509, "grad_norm": 2.2711904048919678, "learning_rate": 1.2250065945660775e-06, "loss": 1.1992, "step": 782 }, { "epoch": 0.20683287165281625, "grad_norm": 1.8648481369018555, "learning_rate": 1.2281719862833026e-06, "loss": 1.166, "step": 784 }, { "epoch": 0.2073605065294816, "grad_norm": 2.255871534347534, "learning_rate": 1.2313373780005275e-06, "loss": 1.1595, "step": 786 }, { "epoch": 0.20788814140614695, "grad_norm": 1.8258112668991089, "learning_rate": 1.2345027697177526e-06, "loss": 1.1983, "step": 788 }, { "epoch": 0.2084157762828123, "grad_norm": 2.1655097007751465, "learning_rate": 1.2376681614349777e-06, "loss": 1.1031, "step": 790 }, { "epoch": 0.20894341115947765, "grad_norm": 31.389863967895508, "learning_rate": 1.2408335531522026e-06, "loss": 1.1286, "step": 792 }, { "epoch": 0.209471046036143, "grad_norm": 1.9339861869812012, "learning_rate": 1.2439989448694277e-06, "loss": 1.0729, "step": 794 }, { "epoch": 0.20999868091280835, "grad_norm": 1.754899501800537, "learning_rate": 1.2471643365866526e-06, "loss": 1.11, "step": 796 }, { "epoch": 0.21052631578947367, "grad_norm": 2.565979242324829, "learning_rate": 1.2503297283038777e-06, "loss": 1.1198, "step": 798 }, { "epoch": 0.21105395066613902, "grad_norm": 2.8031320571899414, "learning_rate": 1.2534951200211028e-06, "loss": 1.0654, "step": 800 }, { "epoch": 0.21158158554280437, "grad_norm": 4.04087495803833, "learning_rate": 1.2566605117383277e-06, "loss": 1.0384, "step": 802 }, { "epoch": 0.21210922041946972, "grad_norm": 9.76944351196289, "learning_rate": 1.2598259034555528e-06, "loss": 1.0563, "step": 804 }, { "epoch": 0.21263685529613507, "grad_norm": 2.515469789505005, "learning_rate": 1.2629912951727777e-06, "loss": 1.1399, "step": 806 }, { "epoch": 0.21316449017280042, "grad_norm": 4.329207897186279, "learning_rate": 1.2661566868900028e-06, "loss": 1.0597, "step": 808 }, { "epoch": 0.21369212504946578, "grad_norm": 2.221740245819092, "learning_rate": 1.2693220786072277e-06, "loss": 1.0308, "step": 810 }, { "epoch": 0.21421975992613113, "grad_norm": 3.643765926361084, "learning_rate": 1.2724874703244528e-06, "loss": 1.0739, "step": 812 }, { "epoch": 0.21474739480279648, "grad_norm": 4.32287073135376, "learning_rate": 1.2756528620416776e-06, "loss": 1.0584, "step": 814 }, { "epoch": 0.2152750296794618, "grad_norm": 2.5370936393737793, "learning_rate": 1.2788182537589027e-06, "loss": 1.0223, "step": 816 }, { "epoch": 0.21580266455612715, "grad_norm": 9.411702156066895, "learning_rate": 1.2819836454761276e-06, "loss": 1.0003, "step": 818 }, { "epoch": 0.2163302994327925, "grad_norm": 6.156253814697266, "learning_rate": 1.2851490371933527e-06, "loss": 1.0156, "step": 820 }, { "epoch": 0.21685793430945785, "grad_norm": 6.366454601287842, "learning_rate": 1.2883144289105776e-06, "loss": 1.052, "step": 822 }, { "epoch": 0.2173855691861232, "grad_norm": 3.7483413219451904, "learning_rate": 1.2914798206278027e-06, "loss": 1.0665, "step": 824 }, { "epoch": 0.21791320406278855, "grad_norm": 9.663166046142578, "learning_rate": 1.2946452123450276e-06, "loss": 1.0127, "step": 826 }, { "epoch": 0.2184408389394539, "grad_norm": 1.8149288892745972, "learning_rate": 1.2978106040622527e-06, "loss": 0.9916, "step": 828 }, { "epoch": 0.21896847381611925, "grad_norm": 2.965620994567871, "learning_rate": 1.3009759957794776e-06, "loss": 1.0291, "step": 830 }, { "epoch": 0.2194961086927846, "grad_norm": 1.817092776298523, "learning_rate": 1.3041413874967027e-06, "loss": 0.9471, "step": 832 }, { "epoch": 0.22002374356944995, "grad_norm": 4.682610988616943, "learning_rate": 1.3073067792139278e-06, "loss": 0.996, "step": 834 }, { "epoch": 0.22055137844611528, "grad_norm": 3.931837320327759, "learning_rate": 1.3104721709311527e-06, "loss": 0.9615, "step": 836 }, { "epoch": 0.22107901332278063, "grad_norm": 1.737269639968872, "learning_rate": 1.3136375626483778e-06, "loss": 0.9867, "step": 838 }, { "epoch": 0.22160664819944598, "grad_norm": 1.7939696311950684, "learning_rate": 1.3168029543656027e-06, "loss": 0.9366, "step": 840 }, { "epoch": 0.22213428307611133, "grad_norm": 3.386397123336792, "learning_rate": 1.3199683460828278e-06, "loss": 0.943, "step": 842 }, { "epoch": 0.22266191795277668, "grad_norm": 2.1728124618530273, "learning_rate": 1.3231337378000529e-06, "loss": 0.9484, "step": 844 }, { "epoch": 0.22318955282944203, "grad_norm": 2.368584156036377, "learning_rate": 1.3262991295172778e-06, "loss": 0.9637, "step": 846 }, { "epoch": 0.22371718770610738, "grad_norm": 1.890722393989563, "learning_rate": 1.3294645212345029e-06, "loss": 0.9707, "step": 848 }, { "epoch": 0.22424482258277273, "grad_norm": 2.8964996337890625, "learning_rate": 1.3326299129517277e-06, "loss": 0.9066, "step": 850 }, { "epoch": 0.22477245745943808, "grad_norm": 3.7780983448028564, "learning_rate": 1.3357953046689528e-06, "loss": 0.877, "step": 852 }, { "epoch": 0.22530009233610343, "grad_norm": 2.04473614692688, "learning_rate": 1.3389606963861777e-06, "loss": 0.9666, "step": 854 }, { "epoch": 0.22582772721276875, "grad_norm": 2.874096155166626, "learning_rate": 1.3421260881034028e-06, "loss": 0.8969, "step": 856 }, { "epoch": 0.2263553620894341, "grad_norm": 4.898131847381592, "learning_rate": 1.345291479820628e-06, "loss": 0.8877, "step": 858 }, { "epoch": 0.22688299696609945, "grad_norm": 2.8480496406555176, "learning_rate": 1.3484568715378528e-06, "loss": 0.92, "step": 860 }, { "epoch": 0.2274106318427648, "grad_norm": 2.134669542312622, "learning_rate": 1.351622263255078e-06, "loss": 0.8899, "step": 862 }, { "epoch": 0.22793826671943015, "grad_norm": 7.07193660736084, "learning_rate": 1.3547876549723028e-06, "loss": 0.9251, "step": 864 }, { "epoch": 0.2284659015960955, "grad_norm": 7.293907165527344, "learning_rate": 1.357953046689528e-06, "loss": 0.8605, "step": 866 }, { "epoch": 0.22899353647276086, "grad_norm": 1.7836155891418457, "learning_rate": 1.361118438406753e-06, "loss": 0.8636, "step": 868 }, { "epoch": 0.2295211713494262, "grad_norm": 1.9364670515060425, "learning_rate": 1.3642838301239779e-06, "loss": 0.8231, "step": 870 }, { "epoch": 0.23004880622609156, "grad_norm": 3.483208656311035, "learning_rate": 1.367449221841203e-06, "loss": 0.8625, "step": 872 }, { "epoch": 0.23057644110275688, "grad_norm": 1.7155433893203735, "learning_rate": 1.3706146135584279e-06, "loss": 0.8445, "step": 874 }, { "epoch": 0.23110407597942223, "grad_norm": 1.9780004024505615, "learning_rate": 1.373780005275653e-06, "loss": 0.8367, "step": 876 }, { "epoch": 0.23163171085608758, "grad_norm": 2.7429893016815186, "learning_rate": 1.3769453969928779e-06, "loss": 0.8309, "step": 878 }, { "epoch": 0.23215934573275293, "grad_norm": 2.1795814037323, "learning_rate": 1.380110788710103e-06, "loss": 0.8911, "step": 880 }, { "epoch": 0.23268698060941828, "grad_norm": 2.2982327938079834, "learning_rate": 1.3832761804273279e-06, "loss": 0.8358, "step": 882 }, { "epoch": 0.23321461548608363, "grad_norm": 4.184241771697998, "learning_rate": 1.386441572144553e-06, "loss": 0.8319, "step": 884 }, { "epoch": 0.23374225036274898, "grad_norm": 3.891277551651001, "learning_rate": 1.3896069638617778e-06, "loss": 0.8708, "step": 886 }, { "epoch": 0.23426988523941433, "grad_norm": 2.120737314224243, "learning_rate": 1.392772355579003e-06, "loss": 0.7818, "step": 888 }, { "epoch": 0.23479752011607968, "grad_norm": 2.2806737422943115, "learning_rate": 1.3959377472962278e-06, "loss": 0.8178, "step": 890 }, { "epoch": 0.23532515499274503, "grad_norm": 2.040803909301758, "learning_rate": 1.399103139013453e-06, "loss": 0.8584, "step": 892 }, { "epoch": 0.23585278986941036, "grad_norm": 7.973177433013916, "learning_rate": 1.4022685307306778e-06, "loss": 0.7798, "step": 894 }, { "epoch": 0.2363804247460757, "grad_norm": 2.3779876232147217, "learning_rate": 1.405433922447903e-06, "loss": 0.7944, "step": 896 }, { "epoch": 0.23690805962274106, "grad_norm": 3.485119342803955, "learning_rate": 1.408599314165128e-06, "loss": 0.7836, "step": 898 }, { "epoch": 0.2374356944994064, "grad_norm": 1.9134609699249268, "learning_rate": 1.411764705882353e-06, "loss": 0.7953, "step": 900 }, { "epoch": 0.23796332937607176, "grad_norm": 2.0154483318328857, "learning_rate": 1.414930097599578e-06, "loss": 0.8046, "step": 902 }, { "epoch": 0.2384909642527371, "grad_norm": 2.699458360671997, "learning_rate": 1.4180954893168029e-06, "loss": 0.7357, "step": 904 }, { "epoch": 0.23901859912940246, "grad_norm": 3.280712366104126, "learning_rate": 1.421260881034028e-06, "loss": 0.7915, "step": 906 }, { "epoch": 0.2395462340060678, "grad_norm": 2.2956960201263428, "learning_rate": 1.4244262727512529e-06, "loss": 0.7475, "step": 908 }, { "epoch": 0.24007386888273316, "grad_norm": 1.9440631866455078, "learning_rate": 1.427591664468478e-06, "loss": 0.7564, "step": 910 }, { "epoch": 0.24060150375939848, "grad_norm": 2.0214784145355225, "learning_rate": 1.430757056185703e-06, "loss": 0.7144, "step": 912 }, { "epoch": 0.24112913863606383, "grad_norm": 1.743786096572876, "learning_rate": 1.433922447902928e-06, "loss": 0.725, "step": 914 }, { "epoch": 0.24165677351272918, "grad_norm": 67.3874740600586, "learning_rate": 1.437087839620153e-06, "loss": 0.7586, "step": 916 }, { "epoch": 0.24218440838939453, "grad_norm": 3.798499345779419, "learning_rate": 1.440253231337378e-06, "loss": 0.6939, "step": 918 }, { "epoch": 0.24271204326605988, "grad_norm": 1.4319835901260376, "learning_rate": 1.443418623054603e-06, "loss": 0.6793, "step": 920 }, { "epoch": 0.24323967814272524, "grad_norm": 1.6663343906402588, "learning_rate": 1.4465840147718282e-06, "loss": 0.72, "step": 922 }, { "epoch": 0.24376731301939059, "grad_norm": 2.124204397201538, "learning_rate": 1.449749406489053e-06, "loss": 0.6948, "step": 924 }, { "epoch": 0.24429494789605594, "grad_norm": 1.793984293937683, "learning_rate": 1.4529147982062781e-06, "loss": 0.694, "step": 926 }, { "epoch": 0.2448225827727213, "grad_norm": 4.249074935913086, "learning_rate": 1.456080189923503e-06, "loss": 0.6884, "step": 928 }, { "epoch": 0.24535021764938664, "grad_norm": 1.7482000589370728, "learning_rate": 1.4592455816407281e-06, "loss": 0.6562, "step": 930 }, { "epoch": 0.24587785252605196, "grad_norm": 1.7331106662750244, "learning_rate": 1.4624109733579532e-06, "loss": 0.6984, "step": 932 }, { "epoch": 0.2464054874027173, "grad_norm": 1.7432920932769775, "learning_rate": 1.4655763650751781e-06, "loss": 0.7045, "step": 934 }, { "epoch": 0.24693312227938266, "grad_norm": 9.180392265319824, "learning_rate": 1.4687417567924032e-06, "loss": 0.6827, "step": 936 }, { "epoch": 0.247460757156048, "grad_norm": 1.8119057416915894, "learning_rate": 1.471907148509628e-06, "loss": 0.6696, "step": 938 }, { "epoch": 0.24798839203271336, "grad_norm": 6.415572166442871, "learning_rate": 1.4750725402268532e-06, "loss": 0.6963, "step": 940 }, { "epoch": 0.2485160269093787, "grad_norm": 1.9907336235046387, "learning_rate": 1.478237931944078e-06, "loss": 0.6499, "step": 942 }, { "epoch": 0.24904366178604406, "grad_norm": 3.4387030601501465, "learning_rate": 1.4814033236613032e-06, "loss": 0.6947, "step": 944 }, { "epoch": 0.2495712966627094, "grad_norm": 1.7977615594863892, "learning_rate": 1.484568715378528e-06, "loss": 0.65, "step": 946 }, { "epoch": 0.25009893153937474, "grad_norm": 2.690739631652832, "learning_rate": 1.4877341070957532e-06, "loss": 0.6968, "step": 948 }, { "epoch": 0.2506265664160401, "grad_norm": 2.99204683303833, "learning_rate": 1.490899498812978e-06, "loss": 0.6236, "step": 950 }, { "epoch": 0.25115420129270544, "grad_norm": 2.8351523876190186, "learning_rate": 1.4940648905302032e-06, "loss": 0.6436, "step": 952 }, { "epoch": 0.2516818361693708, "grad_norm": 1.8864368200302124, "learning_rate": 1.497230282247428e-06, "loss": 0.5806, "step": 954 }, { "epoch": 0.25220947104603614, "grad_norm": 1.8832203149795532, "learning_rate": 1.5003956739646531e-06, "loss": 0.6339, "step": 956 }, { "epoch": 0.2527371059227015, "grad_norm": 6.1116743087768555, "learning_rate": 1.503561065681878e-06, "loss": 0.6073, "step": 958 }, { "epoch": 0.25326474079936684, "grad_norm": 2.7902886867523193, "learning_rate": 1.5067264573991031e-06, "loss": 0.6505, "step": 960 }, { "epoch": 0.2537923756760322, "grad_norm": 1.772148609161377, "learning_rate": 1.5098918491163282e-06, "loss": 0.5973, "step": 962 }, { "epoch": 0.25432001055269754, "grad_norm": 1.6532127857208252, "learning_rate": 1.5130572408335533e-06, "loss": 0.6277, "step": 964 }, { "epoch": 0.2548476454293629, "grad_norm": 2.595583438873291, "learning_rate": 1.516222632550778e-06, "loss": 0.5893, "step": 966 }, { "epoch": 0.25537528030602824, "grad_norm": 1.772367238998413, "learning_rate": 1.5193880242680031e-06, "loss": 0.581, "step": 968 }, { "epoch": 0.2559029151826936, "grad_norm": 4.653487205505371, "learning_rate": 1.5225534159852282e-06, "loss": 0.5732, "step": 970 }, { "epoch": 0.25643055005935894, "grad_norm": 3.0727460384368896, "learning_rate": 1.5257188077024533e-06, "loss": 0.5756, "step": 972 }, { "epoch": 0.2569581849360243, "grad_norm": 2.1570754051208496, "learning_rate": 1.5288841994196784e-06, "loss": 0.581, "step": 974 }, { "epoch": 0.25748581981268964, "grad_norm": 4.1256184577941895, "learning_rate": 1.532049591136903e-06, "loss": 0.5637, "step": 976 }, { "epoch": 0.258013454689355, "grad_norm": 3.5562965869903564, "learning_rate": 1.5352149828541282e-06, "loss": 0.5721, "step": 978 }, { "epoch": 0.2585410895660203, "grad_norm": 4.133514881134033, "learning_rate": 1.5383803745713533e-06, "loss": 0.5563, "step": 980 }, { "epoch": 0.25906872444268564, "grad_norm": 1.8864938020706177, "learning_rate": 1.5415457662885784e-06, "loss": 0.559, "step": 982 }, { "epoch": 0.259596359319351, "grad_norm": 2.7057766914367676, "learning_rate": 1.544711158005803e-06, "loss": 0.5417, "step": 984 }, { "epoch": 0.26012399419601634, "grad_norm": 2.073669910430908, "learning_rate": 1.5478765497230282e-06, "loss": 0.5314, "step": 986 }, { "epoch": 0.2606516290726817, "grad_norm": 1.5045002698898315, "learning_rate": 1.5510419414402533e-06, "loss": 0.4907, "step": 988 }, { "epoch": 0.26117926394934704, "grad_norm": 1.528200626373291, "learning_rate": 1.5542073331574784e-06, "loss": 0.5887, "step": 990 }, { "epoch": 0.2617068988260124, "grad_norm": 1.6019675731658936, "learning_rate": 1.5573727248747035e-06, "loss": 0.5453, "step": 992 }, { "epoch": 0.26223453370267774, "grad_norm": 3.8030145168304443, "learning_rate": 1.5605381165919281e-06, "loss": 0.5591, "step": 994 }, { "epoch": 0.2627621685793431, "grad_norm": 2.3230791091918945, "learning_rate": 1.5637035083091532e-06, "loss": 0.5357, "step": 996 }, { "epoch": 0.26328980345600844, "grad_norm": 2.6748831272125244, "learning_rate": 1.5668689000263783e-06, "loss": 0.4925, "step": 998 }, { "epoch": 0.2638174383326738, "grad_norm": 1.6943038702011108, "learning_rate": 1.5700342917436034e-06, "loss": 0.5159, "step": 1000 }, { "epoch": 0.26434507320933914, "grad_norm": 3.1638882160186768, "learning_rate": 1.5731996834608285e-06, "loss": 0.5331, "step": 1002 }, { "epoch": 0.2648727080860045, "grad_norm": 2.309004545211792, "learning_rate": 1.5763650751780532e-06, "loss": 0.5291, "step": 1004 }, { "epoch": 0.26540034296266984, "grad_norm": 1.7442991733551025, "learning_rate": 1.5795304668952783e-06, "loss": 0.4788, "step": 1006 }, { "epoch": 0.2659279778393352, "grad_norm": 2.4253149032592773, "learning_rate": 1.5826958586125034e-06, "loss": 0.4778, "step": 1008 }, { "epoch": 0.26645561271600054, "grad_norm": 2.182070732116699, "learning_rate": 1.5858612503297283e-06, "loss": 0.4716, "step": 1010 }, { "epoch": 0.2669832475926659, "grad_norm": 1.511818528175354, "learning_rate": 1.5890266420469534e-06, "loss": 0.4724, "step": 1012 }, { "epoch": 0.26751088246933125, "grad_norm": 1.7519385814666748, "learning_rate": 1.5921920337641783e-06, "loss": 0.4888, "step": 1014 }, { "epoch": 0.2680385173459966, "grad_norm": 7.77711296081543, "learning_rate": 1.5953574254814034e-06, "loss": 0.459, "step": 1016 }, { "epoch": 0.2685661522226619, "grad_norm": 1.6824848651885986, "learning_rate": 1.5985228171986283e-06, "loss": 0.4404, "step": 1018 }, { "epoch": 0.26909378709932724, "grad_norm": 8.392828941345215, "learning_rate": 1.6016882089158534e-06, "loss": 0.4798, "step": 1020 }, { "epoch": 0.2696214219759926, "grad_norm": 3.5258712768554688, "learning_rate": 1.6048536006330785e-06, "loss": 0.4492, "step": 1022 }, { "epoch": 0.27014905685265794, "grad_norm": 1.4664301872253418, "learning_rate": 1.6080189923503031e-06, "loss": 0.4337, "step": 1024 }, { "epoch": 0.2706766917293233, "grad_norm": 1.9668514728546143, "learning_rate": 1.6111843840675282e-06, "loss": 0.4286, "step": 1026 }, { "epoch": 0.27120432660598864, "grad_norm": 12.022194862365723, "learning_rate": 1.6143497757847533e-06, "loss": 0.4729, "step": 1028 }, { "epoch": 0.271731961482654, "grad_norm": 2.394244432449341, "learning_rate": 1.6175151675019784e-06, "loss": 0.4822, "step": 1030 }, { "epoch": 0.27225959635931934, "grad_norm": 1.812638521194458, "learning_rate": 1.6206805592192035e-06, "loss": 0.4549, "step": 1032 }, { "epoch": 0.2727872312359847, "grad_norm": 22.3372859954834, "learning_rate": 1.6238459509364282e-06, "loss": 0.4302, "step": 1034 }, { "epoch": 0.27331486611265005, "grad_norm": 3.775691032409668, "learning_rate": 1.6270113426536533e-06, "loss": 0.4473, "step": 1036 }, { "epoch": 0.2738425009893154, "grad_norm": 1.9030145406723022, "learning_rate": 1.6301767343708784e-06, "loss": 0.4587, "step": 1038 }, { "epoch": 0.27437013586598075, "grad_norm": 1.628822922706604, "learning_rate": 1.6333421260881035e-06, "loss": 0.4468, "step": 1040 }, { "epoch": 0.2748977707426461, "grad_norm": 2.1797549724578857, "learning_rate": 1.6365075178053286e-06, "loss": 0.442, "step": 1042 }, { "epoch": 0.27542540561931145, "grad_norm": 1.884577751159668, "learning_rate": 1.6396729095225533e-06, "loss": 0.4412, "step": 1044 }, { "epoch": 0.2759530404959768, "grad_norm": 1.4529446363449097, "learning_rate": 1.6428383012397784e-06, "loss": 0.4205, "step": 1046 }, { "epoch": 0.27648067537264215, "grad_norm": 1.9543184041976929, "learning_rate": 1.6460036929570035e-06, "loss": 0.4369, "step": 1048 }, { "epoch": 0.2770083102493075, "grad_norm": 1.3626457452774048, "learning_rate": 1.6491690846742286e-06, "loss": 0.4291, "step": 1050 }, { "epoch": 0.27753594512597285, "grad_norm": 171.4417266845703, "learning_rate": 1.6523344763914537e-06, "loss": 0.4051, "step": 1052 }, { "epoch": 0.2780635800026382, "grad_norm": 1.9871186017990112, "learning_rate": 1.6554998681086784e-06, "loss": 0.4009, "step": 1054 }, { "epoch": 0.2785912148793035, "grad_norm": 4.018630027770996, "learning_rate": 1.6586652598259035e-06, "loss": 0.4032, "step": 1056 }, { "epoch": 0.27911884975596885, "grad_norm": 1.754981279373169, "learning_rate": 1.6618306515431286e-06, "loss": 0.3965, "step": 1058 }, { "epoch": 0.2796464846326342, "grad_norm": 6.118928909301758, "learning_rate": 1.6649960432603537e-06, "loss": 0.399, "step": 1060 }, { "epoch": 0.28017411950929955, "grad_norm": 2.1844773292541504, "learning_rate": 1.6681614349775788e-06, "loss": 0.391, "step": 1062 }, { "epoch": 0.2807017543859649, "grad_norm": 1.3163981437683105, "learning_rate": 1.6713268266948034e-06, "loss": 0.3967, "step": 1064 }, { "epoch": 0.28122938926263025, "grad_norm": 1.8430933952331543, "learning_rate": 1.6744922184120285e-06, "loss": 0.4027, "step": 1066 }, { "epoch": 0.2817570241392956, "grad_norm": 2.090280771255493, "learning_rate": 1.6776576101292536e-06, "loss": 0.367, "step": 1068 }, { "epoch": 0.28228465901596095, "grad_norm": 1.9689620733261108, "learning_rate": 1.6808230018464785e-06, "loss": 0.3865, "step": 1070 }, { "epoch": 0.2828122938926263, "grad_norm": 5.218667507171631, "learning_rate": 1.6839883935637036e-06, "loss": 0.3692, "step": 1072 }, { "epoch": 0.28333992876929165, "grad_norm": 1.68572998046875, "learning_rate": 1.6871537852809285e-06, "loss": 0.3693, "step": 1074 }, { "epoch": 0.283867563645957, "grad_norm": 1.2989487648010254, "learning_rate": 1.6903191769981536e-06, "loss": 0.3518, "step": 1076 }, { "epoch": 0.28439519852262235, "grad_norm": 1.735660433769226, "learning_rate": 1.6934845687153785e-06, "loss": 0.3651, "step": 1078 }, { "epoch": 0.2849228333992877, "grad_norm": 2.9053869247436523, "learning_rate": 1.6966499604326036e-06, "loss": 0.3425, "step": 1080 }, { "epoch": 0.28545046827595305, "grad_norm": 3.0840747356414795, "learning_rate": 1.6998153521498287e-06, "loss": 0.3859, "step": 1082 }, { "epoch": 0.2859781031526184, "grad_norm": 8.169676780700684, "learning_rate": 1.7029807438670536e-06, "loss": 0.3631, "step": 1084 }, { "epoch": 0.28650573802928375, "grad_norm": 3.1091198921203613, "learning_rate": 1.7061461355842785e-06, "loss": 0.3535, "step": 1086 }, { "epoch": 0.2870333729059491, "grad_norm": 1.6814955472946167, "learning_rate": 1.7093115273015036e-06, "loss": 0.3394, "step": 1088 }, { "epoch": 0.28756100778261445, "grad_norm": 8.423828125, "learning_rate": 1.7124769190187287e-06, "loss": 0.3276, "step": 1090 }, { "epoch": 0.2880886426592798, "grad_norm": 1.6410207748413086, "learning_rate": 1.7156423107359534e-06, "loss": 0.3713, "step": 1092 }, { "epoch": 0.2886162775359451, "grad_norm": 10.183674812316895, "learning_rate": 1.7188077024531785e-06, "loss": 0.3496, "step": 1094 }, { "epoch": 0.28914391241261045, "grad_norm": 3.038703680038452, "learning_rate": 1.7219730941704036e-06, "loss": 0.3546, "step": 1096 }, { "epoch": 0.2896715472892758, "grad_norm": 1.2585532665252686, "learning_rate": 1.7251384858876287e-06, "loss": 0.3271, "step": 1098 }, { "epoch": 0.29019918216594115, "grad_norm": 14.248102188110352, "learning_rate": 1.7283038776048538e-06, "loss": 0.329, "step": 1100 }, { "epoch": 0.2907268170426065, "grad_norm": 1.5602171421051025, "learning_rate": 1.7314692693220784e-06, "loss": 0.3263, "step": 1102 }, { "epoch": 0.29125445191927185, "grad_norm": 3.048746109008789, "learning_rate": 1.7346346610393035e-06, "loss": 0.3138, "step": 1104 }, { "epoch": 0.2917820867959372, "grad_norm": 1.3877471685409546, "learning_rate": 1.7378000527565286e-06, "loss": 0.3321, "step": 1106 }, { "epoch": 0.29230972167260255, "grad_norm": 1.6889264583587646, "learning_rate": 1.7409654444737537e-06, "loss": 0.3239, "step": 1108 }, { "epoch": 0.2928373565492679, "grad_norm": 1.1939541101455688, "learning_rate": 1.7441308361909788e-06, "loss": 0.3356, "step": 1110 }, { "epoch": 0.29336499142593325, "grad_norm": 1.2623542547225952, "learning_rate": 1.7472962279082035e-06, "loss": 0.3008, "step": 1112 }, { "epoch": 0.2938926263025986, "grad_norm": 4.6671833992004395, "learning_rate": 1.7504616196254286e-06, "loss": 0.2973, "step": 1114 }, { "epoch": 0.29442026117926395, "grad_norm": 3.153873920440674, "learning_rate": 1.7536270113426537e-06, "loss": 0.3097, "step": 1116 }, { "epoch": 0.2949478960559293, "grad_norm": 1.4362809658050537, "learning_rate": 1.7567924030598788e-06, "loss": 0.3154, "step": 1118 }, { "epoch": 0.29547553093259465, "grad_norm": 1.0906723737716675, "learning_rate": 1.759957794777104e-06, "loss": 0.3055, "step": 1120 }, { "epoch": 0.29600316580926, "grad_norm": 1.5593767166137695, "learning_rate": 1.7631231864943286e-06, "loss": 0.3134, "step": 1122 }, { "epoch": 0.29653080068592536, "grad_norm": 54.36036682128906, "learning_rate": 1.7662885782115537e-06, "loss": 0.301, "step": 1124 }, { "epoch": 0.2970584355625907, "grad_norm": 1.520894169807434, "learning_rate": 1.7694539699287788e-06, "loss": 0.2932, "step": 1126 }, { "epoch": 0.29758607043925606, "grad_norm": 2.4869132041931152, "learning_rate": 1.7726193616460039e-06, "loss": 0.3066, "step": 1128 }, { "epoch": 0.2981137053159214, "grad_norm": 1.4451065063476562, "learning_rate": 1.775784753363229e-06, "loss": 0.2738, "step": 1130 }, { "epoch": 0.29864134019258676, "grad_norm": 1.7304786443710327, "learning_rate": 1.7789501450804537e-06, "loss": 0.2844, "step": 1132 }, { "epoch": 0.29916897506925205, "grad_norm": 1.5170713663101196, "learning_rate": 1.7821155367976788e-06, "loss": 0.2791, "step": 1134 }, { "epoch": 0.2996966099459174, "grad_norm": 1.1413973569869995, "learning_rate": 1.7852809285149039e-06, "loss": 0.2692, "step": 1136 }, { "epoch": 0.30022424482258275, "grad_norm": 1.226335048675537, "learning_rate": 1.7884463202321287e-06, "loss": 0.2854, "step": 1138 }, { "epoch": 0.3007518796992481, "grad_norm": 23.365983963012695, "learning_rate": 1.7916117119493538e-06, "loss": 0.3214, "step": 1140 }, { "epoch": 0.30127951457591345, "grad_norm": 1.778048038482666, "learning_rate": 1.7947771036665787e-06, "loss": 0.27, "step": 1142 }, { "epoch": 0.3018071494525788, "grad_norm": 1.301975965499878, "learning_rate": 1.7979424953838038e-06, "loss": 0.2812, "step": 1144 }, { "epoch": 0.30233478432924416, "grad_norm": 1.7135143280029297, "learning_rate": 1.8011078871010287e-06, "loss": 0.286, "step": 1146 }, { "epoch": 0.3028624192059095, "grad_norm": 1.116446614265442, "learning_rate": 1.8042732788182538e-06, "loss": 0.2767, "step": 1148 }, { "epoch": 0.30339005408257486, "grad_norm": 2.37831711769104, "learning_rate": 1.807438670535479e-06, "loss": 0.2812, "step": 1150 }, { "epoch": 0.3039176889592402, "grad_norm": 1.1418694257736206, "learning_rate": 1.8106040622527038e-06, "loss": 0.2568, "step": 1152 }, { "epoch": 0.30444532383590556, "grad_norm": 1.065244436264038, "learning_rate": 1.8137694539699287e-06, "loss": 0.2573, "step": 1154 }, { "epoch": 0.3049729587125709, "grad_norm": 1.0732486248016357, "learning_rate": 1.8169348456871538e-06, "loss": 0.2536, "step": 1156 }, { "epoch": 0.30550059358923626, "grad_norm": 1.3314629793167114, "learning_rate": 1.8201002374043789e-06, "loss": 0.2694, "step": 1158 }, { "epoch": 0.3060282284659016, "grad_norm": 1.183067798614502, "learning_rate": 1.823265629121604e-06, "loss": 0.2686, "step": 1160 }, { "epoch": 0.30655586334256696, "grad_norm": 1.435129165649414, "learning_rate": 1.8264310208388287e-06, "loss": 0.2599, "step": 1162 }, { "epoch": 0.3070834982192323, "grad_norm": 1.656307339668274, "learning_rate": 1.8295964125560538e-06, "loss": 0.2606, "step": 1164 }, { "epoch": 0.30761113309589766, "grad_norm": 1.608139991760254, "learning_rate": 1.8327618042732789e-06, "loss": 0.2392, "step": 1166 }, { "epoch": 0.308138767972563, "grad_norm": 1.3906010389328003, "learning_rate": 1.835927195990504e-06, "loss": 0.2368, "step": 1168 }, { "epoch": 0.30866640284922836, "grad_norm": 1.4927085638046265, "learning_rate": 1.839092587707729e-06, "loss": 0.2379, "step": 1170 }, { "epoch": 0.30919403772589366, "grad_norm": 2.0680274963378906, "learning_rate": 1.8422579794249537e-06, "loss": 0.2569, "step": 1172 }, { "epoch": 0.309721672602559, "grad_norm": 1.0759352445602417, "learning_rate": 1.8454233711421788e-06, "loss": 0.2734, "step": 1174 }, { "epoch": 0.31024930747922436, "grad_norm": 1.573889136314392, "learning_rate": 1.848588762859404e-06, "loss": 0.2328, "step": 1176 }, { "epoch": 0.3107769423558897, "grad_norm": 6.936001777648926, "learning_rate": 1.851754154576629e-06, "loss": 0.2303, "step": 1178 }, { "epoch": 0.31130457723255506, "grad_norm": 1.133170247077942, "learning_rate": 1.8549195462938541e-06, "loss": 0.241, "step": 1180 }, { "epoch": 0.3118322121092204, "grad_norm": 1.5342645645141602, "learning_rate": 1.8580849380110788e-06, "loss": 0.2383, "step": 1182 }, { "epoch": 0.31235984698588576, "grad_norm": 0.9906195998191833, "learning_rate": 1.861250329728304e-06, "loss": 0.2289, "step": 1184 }, { "epoch": 0.3128874818625511, "grad_norm": 4.355713844299316, "learning_rate": 1.864415721445529e-06, "loss": 0.2433, "step": 1186 }, { "epoch": 0.31341511673921646, "grad_norm": 2.7618322372436523, "learning_rate": 1.8675811131627541e-06, "loss": 0.2406, "step": 1188 }, { "epoch": 0.3139427516158818, "grad_norm": 17.73569107055664, "learning_rate": 1.8707465048799788e-06, "loss": 0.229, "step": 1190 }, { "epoch": 0.31447038649254716, "grad_norm": 1.825258493423462, "learning_rate": 1.8739118965972039e-06, "loss": 0.2539, "step": 1192 }, { "epoch": 0.3149980213692125, "grad_norm": 1.0131174325942993, "learning_rate": 1.877077288314429e-06, "loss": 0.2373, "step": 1194 }, { "epoch": 0.31552565624587786, "grad_norm": 1.0670689344406128, "learning_rate": 1.880242680031654e-06, "loss": 0.229, "step": 1196 }, { "epoch": 0.3160532911225432, "grad_norm": 1.0052562952041626, "learning_rate": 1.883408071748879e-06, "loss": 0.2112, "step": 1198 }, { "epoch": 0.31658092599920856, "grad_norm": 1.1214179992675781, "learning_rate": 1.8865734634661039e-06, "loss": 0.2169, "step": 1200 }, { "epoch": 0.3171085608758739, "grad_norm": 1.2606592178344727, "learning_rate": 1.889738855183329e-06, "loss": 0.2266, "step": 1202 }, { "epoch": 0.31763619575253926, "grad_norm": 1.2784298658370972, "learning_rate": 1.892904246900554e-06, "loss": 0.2135, "step": 1204 }, { "epoch": 0.3181638306292046, "grad_norm": 0.8711207509040833, "learning_rate": 1.896069638617779e-06, "loss": 0.23, "step": 1206 }, { "epoch": 0.31869146550586996, "grad_norm": 0.9394475221633911, "learning_rate": 1.899235030335004e-06, "loss": 0.2002, "step": 1208 }, { "epoch": 0.31921910038253526, "grad_norm": 1.0801122188568115, "learning_rate": 1.902400422052229e-06, "loss": 0.1968, "step": 1210 }, { "epoch": 0.3197467352592006, "grad_norm": 3.977280616760254, "learning_rate": 1.905565813769454e-06, "loss": 0.2021, "step": 1212 }, { "epoch": 0.32027437013586596, "grad_norm": 1.0590304136276245, "learning_rate": 1.908731205486679e-06, "loss": 0.2043, "step": 1214 }, { "epoch": 0.3208020050125313, "grad_norm": 1.4837419986724854, "learning_rate": 1.9118965972039042e-06, "loss": 0.2013, "step": 1216 }, { "epoch": 0.32132963988919666, "grad_norm": 1.2130537033081055, "learning_rate": 1.915061988921129e-06, "loss": 0.2047, "step": 1218 }, { "epoch": 0.321857274765862, "grad_norm": 1.2156982421875, "learning_rate": 1.918227380638354e-06, "loss": 0.202, "step": 1220 }, { "epoch": 0.32238490964252736, "grad_norm": 0.9331347942352295, "learning_rate": 1.921392772355579e-06, "loss": 0.1884, "step": 1222 }, { "epoch": 0.3229125445191927, "grad_norm": 0.8981188535690308, "learning_rate": 1.924558164072804e-06, "loss": 0.183, "step": 1224 }, { "epoch": 0.32344017939585806, "grad_norm": 1.0446797609329224, "learning_rate": 1.927723555790029e-06, "loss": 0.1914, "step": 1226 }, { "epoch": 0.3239678142725234, "grad_norm": 1.2048068046569824, "learning_rate": 1.9308889475072544e-06, "loss": 0.1923, "step": 1228 }, { "epoch": 0.32449544914918876, "grad_norm": 0.9378485083580017, "learning_rate": 1.934054339224479e-06, "loss": 0.1952, "step": 1230 }, { "epoch": 0.3250230840258541, "grad_norm": 0.8534780144691467, "learning_rate": 1.937219730941704e-06, "loss": 0.1723, "step": 1232 }, { "epoch": 0.32555071890251946, "grad_norm": 0.8965113759040833, "learning_rate": 1.940385122658929e-06, "loss": 0.1906, "step": 1234 }, { "epoch": 0.3260783537791848, "grad_norm": 68.4958724975586, "learning_rate": 1.943550514376154e-06, "loss": 0.1942, "step": 1236 }, { "epoch": 0.32660598865585017, "grad_norm": 1.4038256406784058, "learning_rate": 1.9467159060933793e-06, "loss": 0.1738, "step": 1238 }, { "epoch": 0.3271336235325155, "grad_norm": 0.9357498288154602, "learning_rate": 1.949881297810604e-06, "loss": 0.1669, "step": 1240 }, { "epoch": 0.32766125840918087, "grad_norm": 1.1912391185760498, "learning_rate": 1.953046689527829e-06, "loss": 0.2022, "step": 1242 }, { "epoch": 0.3281888932858462, "grad_norm": 1.1525591611862183, "learning_rate": 1.956212081245054e-06, "loss": 0.1828, "step": 1244 }, { "epoch": 0.32871652816251157, "grad_norm": 1.527302622795105, "learning_rate": 1.9593774729622792e-06, "loss": 0.1927, "step": 1246 }, { "epoch": 0.32924416303917686, "grad_norm": 1.6815617084503174, "learning_rate": 1.962542864679504e-06, "loss": 0.1867, "step": 1248 }, { "epoch": 0.3297717979158422, "grad_norm": 0.7902088761329651, "learning_rate": 1.965708256396729e-06, "loss": 0.1828, "step": 1250 }, { "epoch": 0.33029943279250756, "grad_norm": 0.8924186825752258, "learning_rate": 1.968873648113954e-06, "loss": 0.1779, "step": 1252 }, { "epoch": 0.3308270676691729, "grad_norm": 1.5974019765853882, "learning_rate": 1.9720390398311792e-06, "loss": 0.1789, "step": 1254 }, { "epoch": 0.33135470254583826, "grad_norm": 0.9107617139816284, "learning_rate": 1.975204431548404e-06, "loss": 0.178, "step": 1256 }, { "epoch": 0.3318823374225036, "grad_norm": 0.7424228191375732, "learning_rate": 1.9783698232656294e-06, "loss": 0.1844, "step": 1258 }, { "epoch": 0.33240997229916897, "grad_norm": 1.0243717432022095, "learning_rate": 1.981535214982854e-06, "loss": 0.1704, "step": 1260 }, { "epoch": 0.3329376071758343, "grad_norm": 1.2186057567596436, "learning_rate": 1.984700606700079e-06, "loss": 0.199, "step": 1262 }, { "epoch": 0.33346524205249967, "grad_norm": 0.787755012512207, "learning_rate": 1.987865998417304e-06, "loss": 0.1604, "step": 1264 }, { "epoch": 0.333992876929165, "grad_norm": 0.6824118494987488, "learning_rate": 1.9910313901345294e-06, "loss": 0.1528, "step": 1266 }, { "epoch": 0.33452051180583037, "grad_norm": 0.6912674903869629, "learning_rate": 1.9941967818517543e-06, "loss": 0.1674, "step": 1268 }, { "epoch": 0.3350481466824957, "grad_norm": 1.4686000347137451, "learning_rate": 1.997362173568979e-06, "loss": 0.1619, "step": 1270 }, { "epoch": 0.33557578155916107, "grad_norm": 0.9034278988838196, "learning_rate": 2.000527565286204e-06, "loss": 0.1581, "step": 1272 }, { "epoch": 0.3361034164358264, "grad_norm": 0.9573628306388855, "learning_rate": 2.0036929570034294e-06, "loss": 0.1563, "step": 1274 }, { "epoch": 0.33663105131249177, "grad_norm": 0.9620110392570496, "learning_rate": 2.0068583487206543e-06, "loss": 0.1651, "step": 1276 }, { "epoch": 0.3371586861891571, "grad_norm": 0.9714809656143188, "learning_rate": 2.0100237404378796e-06, "loss": 0.1754, "step": 1278 }, { "epoch": 0.33768632106582247, "grad_norm": 0.8553822636604309, "learning_rate": 2.013189132155104e-06, "loss": 0.1586, "step": 1280 }, { "epoch": 0.3382139559424878, "grad_norm": 1.4794291257858276, "learning_rate": 2.0163545238723293e-06, "loss": 0.1558, "step": 1282 }, { "epoch": 0.33874159081915317, "grad_norm": 1.1040078401565552, "learning_rate": 2.0195199155895542e-06, "loss": 0.1615, "step": 1284 }, { "epoch": 0.33926922569581847, "grad_norm": 0.7665732502937317, "learning_rate": 2.0226853073067795e-06, "loss": 0.1443, "step": 1286 }, { "epoch": 0.3397968605724838, "grad_norm": 1.529727578163147, "learning_rate": 2.0258506990240044e-06, "loss": 0.1632, "step": 1288 }, { "epoch": 0.34032449544914917, "grad_norm": 0.7866541147232056, "learning_rate": 2.0290160907412293e-06, "loss": 0.1523, "step": 1290 }, { "epoch": 0.3408521303258145, "grad_norm": 1.4099279642105103, "learning_rate": 2.032181482458454e-06, "loss": 0.1689, "step": 1292 }, { "epoch": 0.34137976520247987, "grad_norm": 1.181907296180725, "learning_rate": 2.0353468741756795e-06, "loss": 0.1706, "step": 1294 }, { "epoch": 0.3419074000791452, "grad_norm": 0.7322559356689453, "learning_rate": 2.0385122658929044e-06, "loss": 0.1447, "step": 1296 }, { "epoch": 0.34243503495581057, "grad_norm": 0.7078782320022583, "learning_rate": 2.0416776576101293e-06, "loss": 0.1505, "step": 1298 }, { "epoch": 0.3429626698324759, "grad_norm": 0.9894326329231262, "learning_rate": 2.044843049327354e-06, "loss": 0.1555, "step": 1300 }, { "epoch": 0.34349030470914127, "grad_norm": 0.9212680459022522, "learning_rate": 2.0480084410445795e-06, "loss": 0.1543, "step": 1302 }, { "epoch": 0.3440179395858066, "grad_norm": 0.7818958163261414, "learning_rate": 2.0511738327618044e-06, "loss": 0.1304, "step": 1304 }, { "epoch": 0.34454557446247197, "grad_norm": 0.977411150932312, "learning_rate": 2.0543392244790293e-06, "loss": 0.161, "step": 1306 }, { "epoch": 0.3450732093391373, "grad_norm": 0.7091043591499329, "learning_rate": 2.057504616196254e-06, "loss": 0.1603, "step": 1308 }, { "epoch": 0.34560084421580267, "grad_norm": 1.0982341766357422, "learning_rate": 2.0606700079134795e-06, "loss": 0.1441, "step": 1310 }, { "epoch": 0.346128479092468, "grad_norm": 0.8673431873321533, "learning_rate": 2.0638353996307044e-06, "loss": 0.1456, "step": 1312 }, { "epoch": 0.3466561139691334, "grad_norm": 0.6166326999664307, "learning_rate": 2.0670007913479292e-06, "loss": 0.1462, "step": 1314 }, { "epoch": 0.3471837488457987, "grad_norm": 0.8409046530723572, "learning_rate": 2.0701661830651546e-06, "loss": 0.152, "step": 1316 }, { "epoch": 0.3477113837224641, "grad_norm": 2.2291295528411865, "learning_rate": 2.073331574782379e-06, "loss": 0.1325, "step": 1318 }, { "epoch": 0.3482390185991294, "grad_norm": 0.5791150331497192, "learning_rate": 2.0764969664996043e-06, "loss": 0.1281, "step": 1320 }, { "epoch": 0.3487666534757948, "grad_norm": 0.7807860970497131, "learning_rate": 2.0796623582168292e-06, "loss": 0.1309, "step": 1322 }, { "epoch": 0.3492942883524601, "grad_norm": 1.2889002561569214, "learning_rate": 2.0828277499340545e-06, "loss": 0.1402, "step": 1324 }, { "epoch": 0.3498219232291254, "grad_norm": 43.41680145263672, "learning_rate": 2.0859931416512794e-06, "loss": 0.138, "step": 1326 }, { "epoch": 0.35034955810579077, "grad_norm": 1.0518972873687744, "learning_rate": 2.0891585333685043e-06, "loss": 0.1313, "step": 1328 }, { "epoch": 0.3508771929824561, "grad_norm": 0.7686504125595093, "learning_rate": 2.092323925085729e-06, "loss": 0.1313, "step": 1330 }, { "epoch": 0.35140482785912147, "grad_norm": 1.3749741315841675, "learning_rate": 2.0954893168029545e-06, "loss": 0.1381, "step": 1332 }, { "epoch": 0.3519324627357868, "grad_norm": 0.7722144722938538, "learning_rate": 2.0986547085201794e-06, "loss": 0.1232, "step": 1334 }, { "epoch": 0.3524600976124522, "grad_norm": 0.980959415435791, "learning_rate": 2.1018201002374047e-06, "loss": 0.1261, "step": 1336 }, { "epoch": 0.3529877324891175, "grad_norm": 0.8315899968147278, "learning_rate": 2.104985491954629e-06, "loss": 0.127, "step": 1338 }, { "epoch": 0.3535153673657829, "grad_norm": 0.9581456780433655, "learning_rate": 2.1081508836718545e-06, "loss": 0.1248, "step": 1340 }, { "epoch": 0.3540430022424482, "grad_norm": 0.6388617753982544, "learning_rate": 2.1113162753890794e-06, "loss": 0.1269, "step": 1342 }, { "epoch": 0.3545706371191136, "grad_norm": 1.4070053100585938, "learning_rate": 2.1144816671063047e-06, "loss": 0.1165, "step": 1344 }, { "epoch": 0.3550982719957789, "grad_norm": 0.8881339430809021, "learning_rate": 2.1176470588235296e-06, "loss": 0.1504, "step": 1346 }, { "epoch": 0.3556259068724443, "grad_norm": 1.0181913375854492, "learning_rate": 2.1208124505407545e-06, "loss": 0.1195, "step": 1348 }, { "epoch": 0.3561535417491096, "grad_norm": 1.3038676977157593, "learning_rate": 2.1239778422579793e-06, "loss": 0.1221, "step": 1350 }, { "epoch": 0.356681176625775, "grad_norm": 0.961363673210144, "learning_rate": 2.1271432339752047e-06, "loss": 0.1248, "step": 1352 }, { "epoch": 0.3572088115024403, "grad_norm": 6.0120320320129395, "learning_rate": 2.1303086256924295e-06, "loss": 0.1186, "step": 1354 }, { "epoch": 0.3577364463791057, "grad_norm": 0.805404007434845, "learning_rate": 2.133474017409655e-06, "loss": 0.1109, "step": 1356 }, { "epoch": 0.358264081255771, "grad_norm": 1.0887759923934937, "learning_rate": 2.1366394091268793e-06, "loss": 0.1297, "step": 1358 }, { "epoch": 0.3587917161324364, "grad_norm": 0.5715579986572266, "learning_rate": 2.1398048008441046e-06, "loss": 0.1123, "step": 1360 }, { "epoch": 0.35931935100910173, "grad_norm": 0.8239384293556213, "learning_rate": 2.1429701925613295e-06, "loss": 0.116, "step": 1362 }, { "epoch": 0.359846985885767, "grad_norm": 1.1482157707214355, "learning_rate": 2.1461355842785544e-06, "loss": 0.1065, "step": 1364 }, { "epoch": 0.3603746207624324, "grad_norm": 1.0502039194107056, "learning_rate": 2.1493009759957797e-06, "loss": 0.116, "step": 1366 }, { "epoch": 0.3609022556390977, "grad_norm": 1.0342086553573608, "learning_rate": 2.1524663677130046e-06, "loss": 0.1116, "step": 1368 }, { "epoch": 0.3614298905157631, "grad_norm": 3.483673334121704, "learning_rate": 2.1556317594302295e-06, "loss": 0.1117, "step": 1370 }, { "epoch": 0.3619575253924284, "grad_norm": 22.337953567504883, "learning_rate": 2.1587971511474544e-06, "loss": 0.1223, "step": 1372 }, { "epoch": 0.3624851602690938, "grad_norm": 0.9538460969924927, "learning_rate": 2.1619625428646797e-06, "loss": 0.1091, "step": 1374 }, { "epoch": 0.3630127951457591, "grad_norm": 0.8235485553741455, "learning_rate": 2.1651279345819046e-06, "loss": 0.1108, "step": 1376 }, { "epoch": 0.3635404300224245, "grad_norm": 0.9927016496658325, "learning_rate": 2.1682933262991295e-06, "loss": 0.1094, "step": 1378 }, { "epoch": 0.3640680648990898, "grad_norm": 0.9533777832984924, "learning_rate": 2.1714587180163544e-06, "loss": 0.12, "step": 1380 }, { "epoch": 0.3645956997757552, "grad_norm": 0.9228023290634155, "learning_rate": 2.1746241097335797e-06, "loss": 0.1091, "step": 1382 }, { "epoch": 0.36512333465242053, "grad_norm": 0.7718196511268616, "learning_rate": 2.1777895014508046e-06, "loss": 0.1073, "step": 1384 }, { "epoch": 0.3656509695290859, "grad_norm": 2.003631114959717, "learning_rate": 2.18095489316803e-06, "loss": 0.1217, "step": 1386 }, { "epoch": 0.36617860440575123, "grad_norm": 0.7755074501037598, "learning_rate": 2.1841202848852543e-06, "loss": 0.1022, "step": 1388 }, { "epoch": 0.3667062392824166, "grad_norm": 0.644530713558197, "learning_rate": 2.1872856766024796e-06, "loss": 0.115, "step": 1390 }, { "epoch": 0.36723387415908193, "grad_norm": 1.243959903717041, "learning_rate": 2.1904510683197045e-06, "loss": 0.1147, "step": 1392 }, { "epoch": 0.3677615090357473, "grad_norm": 0.6250147819519043, "learning_rate": 2.19361646003693e-06, "loss": 0.1011, "step": 1394 }, { "epoch": 0.36828914391241263, "grad_norm": 0.759848952293396, "learning_rate": 2.1967818517541543e-06, "loss": 0.113, "step": 1396 }, { "epoch": 0.368816778789078, "grad_norm": 0.9575510025024414, "learning_rate": 2.1999472434713796e-06, "loss": 0.1041, "step": 1398 }, { "epoch": 0.36934441366574333, "grad_norm": 0.7048844695091248, "learning_rate": 2.2031126351886045e-06, "loss": 0.1103, "step": 1400 }, { "epoch": 0.3698720485424086, "grad_norm": 8.265582084655762, "learning_rate": 2.20627802690583e-06, "loss": 0.102, "step": 1402 }, { "epoch": 0.370399683419074, "grad_norm": 0.49567532539367676, "learning_rate": 2.2094434186230547e-06, "loss": 0.1002, "step": 1404 }, { "epoch": 0.37092731829573933, "grad_norm": 0.4794163405895233, "learning_rate": 2.2126088103402796e-06, "loss": 0.1036, "step": 1406 }, { "epoch": 0.3714549531724047, "grad_norm": 0.7531571984291077, "learning_rate": 2.2157742020575045e-06, "loss": 0.1206, "step": 1408 }, { "epoch": 0.37198258804907003, "grad_norm": 0.7591971755027771, "learning_rate": 2.2189395937747298e-06, "loss": 0.092, "step": 1410 }, { "epoch": 0.3725102229257354, "grad_norm": 0.6259726881980896, "learning_rate": 2.2221049854919547e-06, "loss": 0.1096, "step": 1412 }, { "epoch": 0.37303785780240073, "grad_norm": 1.6606630086898804, "learning_rate": 2.22527037720918e-06, "loss": 0.1125, "step": 1414 }, { "epoch": 0.3735654926790661, "grad_norm": 3.913923501968384, "learning_rate": 2.2284357689264045e-06, "loss": 0.0966, "step": 1416 }, { "epoch": 0.37409312755573143, "grad_norm": 1.003739356994629, "learning_rate": 2.2316011606436298e-06, "loss": 0.0979, "step": 1418 }, { "epoch": 0.3746207624323968, "grad_norm": 0.8434851765632629, "learning_rate": 2.2347665523608546e-06, "loss": 0.1056, "step": 1420 }, { "epoch": 0.37514839730906213, "grad_norm": 1.0260391235351562, "learning_rate": 2.23793194407808e-06, "loss": 0.1119, "step": 1422 }, { "epoch": 0.3756760321857275, "grad_norm": 0.7455624341964722, "learning_rate": 2.241097335795305e-06, "loss": 0.0913, "step": 1424 }, { "epoch": 0.37620366706239283, "grad_norm": 0.9014041423797607, "learning_rate": 2.2442627275125297e-06, "loss": 0.0977, "step": 1426 }, { "epoch": 0.3767313019390582, "grad_norm": 0.9828375577926636, "learning_rate": 2.2474281192297546e-06, "loss": 0.0954, "step": 1428 }, { "epoch": 0.37725893681572353, "grad_norm": 0.9237997531890869, "learning_rate": 2.25059351094698e-06, "loss": 0.101, "step": 1430 }, { "epoch": 0.3777865716923889, "grad_norm": 0.5890629887580872, "learning_rate": 2.253758902664205e-06, "loss": 0.1197, "step": 1432 }, { "epoch": 0.37831420656905423, "grad_norm": 0.8084545731544495, "learning_rate": 2.2569242943814297e-06, "loss": 0.0912, "step": 1434 }, { "epoch": 0.3788418414457196, "grad_norm": 0.7956644296646118, "learning_rate": 2.2600896860986546e-06, "loss": 0.1041, "step": 1436 }, { "epoch": 0.37936947632238494, "grad_norm": 0.6723247170448303, "learning_rate": 2.26325507781588e-06, "loss": 0.0936, "step": 1438 }, { "epoch": 0.37989711119905023, "grad_norm": 0.6902758479118347, "learning_rate": 2.266420469533105e-06, "loss": 0.097, "step": 1440 }, { "epoch": 0.3804247460757156, "grad_norm": 1.4020812511444092, "learning_rate": 2.2695858612503297e-06, "loss": 0.0954, "step": 1442 }, { "epoch": 0.38095238095238093, "grad_norm": 0.5012320876121521, "learning_rate": 2.272751252967555e-06, "loss": 0.0918, "step": 1444 }, { "epoch": 0.3814800158290463, "grad_norm": 0.8172739148139954, "learning_rate": 2.2759166446847795e-06, "loss": 0.117, "step": 1446 }, { "epoch": 0.38200765070571163, "grad_norm": 1.5943480730056763, "learning_rate": 2.2790820364020048e-06, "loss": 0.0855, "step": 1448 }, { "epoch": 0.382535285582377, "grad_norm": 1.4084240198135376, "learning_rate": 2.2822474281192297e-06, "loss": 0.1105, "step": 1450 }, { "epoch": 0.38306292045904233, "grad_norm": 0.5549253821372986, "learning_rate": 2.285412819836455e-06, "loss": 0.0896, "step": 1452 }, { "epoch": 0.3835905553357077, "grad_norm": 0.9530546069145203, "learning_rate": 2.28857821155368e-06, "loss": 0.1003, "step": 1454 }, { "epoch": 0.38411819021237303, "grad_norm": 0.5439847707748413, "learning_rate": 2.2917436032709047e-06, "loss": 0.0883, "step": 1456 }, { "epoch": 0.3846458250890384, "grad_norm": 0.5771483182907104, "learning_rate": 2.2949089949881296e-06, "loss": 0.0813, "step": 1458 }, { "epoch": 0.38517345996570374, "grad_norm": 0.6019130945205688, "learning_rate": 2.298074386705355e-06, "loss": 0.1027, "step": 1460 }, { "epoch": 0.3857010948423691, "grad_norm": 0.468273788690567, "learning_rate": 2.30123977842258e-06, "loss": 0.0866, "step": 1462 }, { "epoch": 0.38622872971903444, "grad_norm": 0.6012829542160034, "learning_rate": 2.304405170139805e-06, "loss": 0.0919, "step": 1464 }, { "epoch": 0.3867563645956998, "grad_norm": 1.628769040107727, "learning_rate": 2.3075705618570296e-06, "loss": 0.0894, "step": 1466 }, { "epoch": 0.38728399947236514, "grad_norm": 0.512100338935852, "learning_rate": 2.310735953574255e-06, "loss": 0.0843, "step": 1468 }, { "epoch": 0.3878116343490305, "grad_norm": 1.8907301425933838, "learning_rate": 2.31390134529148e-06, "loss": 0.0966, "step": 1470 }, { "epoch": 0.38833926922569584, "grad_norm": 1.1195001602172852, "learning_rate": 2.317066737008705e-06, "loss": 0.0799, "step": 1472 }, { "epoch": 0.3888669041023612, "grad_norm": 0.5208202600479126, "learning_rate": 2.32023212872593e-06, "loss": 0.0856, "step": 1474 }, { "epoch": 0.38939453897902654, "grad_norm": 1.348487377166748, "learning_rate": 2.323397520443155e-06, "loss": 0.0874, "step": 1476 }, { "epoch": 0.38992217385569183, "grad_norm": 0.6057493090629578, "learning_rate": 2.3265629121603798e-06, "loss": 0.0884, "step": 1478 }, { "epoch": 0.3904498087323572, "grad_norm": 0.605069637298584, "learning_rate": 2.329728303877605e-06, "loss": 0.0929, "step": 1480 }, { "epoch": 0.39097744360902253, "grad_norm": 0.5677754282951355, "learning_rate": 2.33289369559483e-06, "loss": 0.0844, "step": 1482 }, { "epoch": 0.3915050784856879, "grad_norm": 0.9152353405952454, "learning_rate": 2.3360590873120553e-06, "loss": 0.0953, "step": 1484 }, { "epoch": 0.39203271336235324, "grad_norm": 0.6717398166656494, "learning_rate": 2.3392244790292798e-06, "loss": 0.0961, "step": 1486 }, { "epoch": 0.3925603482390186, "grad_norm": 2.239109992980957, "learning_rate": 2.342389870746505e-06, "loss": 0.0969, "step": 1488 }, { "epoch": 0.39308798311568394, "grad_norm": 0.6708511114120483, "learning_rate": 2.34555526246373e-06, "loss": 0.0799, "step": 1490 }, { "epoch": 0.3936156179923493, "grad_norm": 0.6662078499794006, "learning_rate": 2.348720654180955e-06, "loss": 0.082, "step": 1492 }, { "epoch": 0.39414325286901464, "grad_norm": 0.6810004711151123, "learning_rate": 2.3518860458981797e-06, "loss": 0.0919, "step": 1494 }, { "epoch": 0.39467088774568, "grad_norm": 1.252829670906067, "learning_rate": 2.355051437615405e-06, "loss": 0.09, "step": 1496 }, { "epoch": 0.39519852262234534, "grad_norm": 0.641607403755188, "learning_rate": 2.35821682933263e-06, "loss": 0.0869, "step": 1498 }, { "epoch": 0.3957261574990107, "grad_norm": 2.013505220413208, "learning_rate": 2.361382221049855e-06, "loss": 0.0711, "step": 1500 }, { "epoch": 0.39625379237567604, "grad_norm": 0.9156221151351929, "learning_rate": 2.36454761276708e-06, "loss": 0.0925, "step": 1502 }, { "epoch": 0.3967814272523414, "grad_norm": 0.9502848982810974, "learning_rate": 2.367713004484305e-06, "loss": 0.1003, "step": 1504 }, { "epoch": 0.39730906212900674, "grad_norm": 0.9710351228713989, "learning_rate": 2.37087839620153e-06, "loss": 0.093, "step": 1506 }, { "epoch": 0.3978366970056721, "grad_norm": 0.5948989987373352, "learning_rate": 2.374043787918755e-06, "loss": 0.0753, "step": 1508 }, { "epoch": 0.39836433188233744, "grad_norm": 0.6346333026885986, "learning_rate": 2.37720917963598e-06, "loss": 0.0917, "step": 1510 }, { "epoch": 0.3988919667590028, "grad_norm": 0.7364798784255981, "learning_rate": 2.380374571353205e-06, "loss": 0.088, "step": 1512 }, { "epoch": 0.39941960163566814, "grad_norm": 1.109704613685608, "learning_rate": 2.38353996307043e-06, "loss": 0.0776, "step": 1514 }, { "epoch": 0.3999472365123335, "grad_norm": 0.5268648862838745, "learning_rate": 2.3867053547876548e-06, "loss": 0.0885, "step": 1516 }, { "epoch": 0.4004748713889988, "grad_norm": 0.39282116293907166, "learning_rate": 2.38987074650488e-06, "loss": 0.0804, "step": 1518 }, { "epoch": 0.40100250626566414, "grad_norm": 0.6000965237617493, "learning_rate": 2.393036138222105e-06, "loss": 0.0845, "step": 1520 }, { "epoch": 0.4015301411423295, "grad_norm": 0.6243029832839966, "learning_rate": 2.3962015299393303e-06, "loss": 0.0806, "step": 1522 }, { "epoch": 0.40205777601899484, "grad_norm": 0.4706527590751648, "learning_rate": 2.3993669216565547e-06, "loss": 0.0818, "step": 1524 }, { "epoch": 0.4025854108956602, "grad_norm": 0.8156248331069946, "learning_rate": 2.40253231337378e-06, "loss": 0.0766, "step": 1526 }, { "epoch": 0.40311304577232554, "grad_norm": 0.47973358631134033, "learning_rate": 2.405697705091005e-06, "loss": 0.0797, "step": 1528 }, { "epoch": 0.4036406806489909, "grad_norm": 0.5107767581939697, "learning_rate": 2.4088630968082303e-06, "loss": 0.0835, "step": 1530 }, { "epoch": 0.40416831552565624, "grad_norm": 0.5917051434516907, "learning_rate": 2.412028488525455e-06, "loss": 0.0731, "step": 1532 }, { "epoch": 0.4046959504023216, "grad_norm": 1.214198112487793, "learning_rate": 2.41519388024268e-06, "loss": 0.0817, "step": 1534 }, { "epoch": 0.40522358527898694, "grad_norm": 0.5156332850456238, "learning_rate": 2.418359271959905e-06, "loss": 0.0767, "step": 1536 }, { "epoch": 0.4057512201556523, "grad_norm": 0.4734150767326355, "learning_rate": 2.4215246636771302e-06, "loss": 0.0817, "step": 1538 }, { "epoch": 0.40627885503231764, "grad_norm": 1.3453925848007202, "learning_rate": 2.424690055394355e-06, "loss": 0.0801, "step": 1540 }, { "epoch": 0.406806489908983, "grad_norm": 0.606713056564331, "learning_rate": 2.4278554471115804e-06, "loss": 0.0812, "step": 1542 }, { "epoch": 0.40733412478564834, "grad_norm": 0.9796473383903503, "learning_rate": 2.431020838828805e-06, "loss": 0.0763, "step": 1544 }, { "epoch": 0.4078617596623137, "grad_norm": 0.532355785369873, "learning_rate": 2.43418623054603e-06, "loss": 0.0761, "step": 1546 }, { "epoch": 0.40838939453897904, "grad_norm": 1.226231575012207, "learning_rate": 2.437351622263255e-06, "loss": 0.071, "step": 1548 }, { "epoch": 0.4089170294156444, "grad_norm": 1.4272723197937012, "learning_rate": 2.4405170139804804e-06, "loss": 0.0813, "step": 1550 }, { "epoch": 0.40944466429230975, "grad_norm": 0.3924344480037689, "learning_rate": 2.4436824056977053e-06, "loss": 0.0645, "step": 1552 }, { "epoch": 0.4099722991689751, "grad_norm": 0.5131269693374634, "learning_rate": 2.44684779741493e-06, "loss": 0.0682, "step": 1554 }, { "epoch": 0.4104999340456404, "grad_norm": 0.44350627064704895, "learning_rate": 2.450013189132155e-06, "loss": 0.0702, "step": 1556 }, { "epoch": 0.41102756892230574, "grad_norm": 0.7178897857666016, "learning_rate": 2.4531785808493804e-06, "loss": 0.0708, "step": 1558 }, { "epoch": 0.4115552037989711, "grad_norm": 0.9046401381492615, "learning_rate": 2.4563439725666053e-06, "loss": 0.0849, "step": 1560 }, { "epoch": 0.41208283867563644, "grad_norm": 0.4270871579647064, "learning_rate": 2.45950936428383e-06, "loss": 0.0711, "step": 1562 }, { "epoch": 0.4126104735523018, "grad_norm": 0.9986486434936523, "learning_rate": 2.462674756001055e-06, "loss": 0.0718, "step": 1564 }, { "epoch": 0.41313810842896714, "grad_norm": 0.7129299640655518, "learning_rate": 2.4658401477182804e-06, "loss": 0.0701, "step": 1566 }, { "epoch": 0.4136657433056325, "grad_norm": 1.9633287191390991, "learning_rate": 2.4690055394355052e-06, "loss": 0.0898, "step": 1568 }, { "epoch": 0.41419337818229784, "grad_norm": 0.8055351376533508, "learning_rate": 2.47217093115273e-06, "loss": 0.0703, "step": 1570 }, { "epoch": 0.4147210130589632, "grad_norm": 0.5783991813659668, "learning_rate": 2.4753363228699554e-06, "loss": 0.0614, "step": 1572 }, { "epoch": 0.41524864793562855, "grad_norm": 0.6681216359138489, "learning_rate": 2.47850171458718e-06, "loss": 0.0784, "step": 1574 }, { "epoch": 0.4157762828122939, "grad_norm": 0.5658100843429565, "learning_rate": 2.4816671063044052e-06, "loss": 0.0672, "step": 1576 }, { "epoch": 0.41630391768895925, "grad_norm": 0.3482312262058258, "learning_rate": 2.48483249802163e-06, "loss": 0.0716, "step": 1578 }, { "epoch": 0.4168315525656246, "grad_norm": 0.4377562701702118, "learning_rate": 2.4879978897388554e-06, "loss": 0.0728, "step": 1580 }, { "epoch": 0.41735918744228995, "grad_norm": 0.42335811257362366, "learning_rate": 2.4911632814560803e-06, "loss": 0.0697, "step": 1582 }, { "epoch": 0.4178868223189553, "grad_norm": 0.5614668130874634, "learning_rate": 2.494328673173305e-06, "loss": 0.0711, "step": 1584 }, { "epoch": 0.41841445719562065, "grad_norm": 0.862246572971344, "learning_rate": 2.49749406489053e-06, "loss": 0.0759, "step": 1586 }, { "epoch": 0.418942092072286, "grad_norm": 0.7870267033576965, "learning_rate": 2.5006594566077554e-06, "loss": 0.0733, "step": 1588 }, { "epoch": 0.41946972694895135, "grad_norm": 1.0963091850280762, "learning_rate": 2.5038248483249803e-06, "loss": 0.0746, "step": 1590 }, { "epoch": 0.4199973618256167, "grad_norm": 1.0494639873504639, "learning_rate": 2.5069902400422056e-06, "loss": 0.0657, "step": 1592 }, { "epoch": 0.420524996702282, "grad_norm": 0.3974427878856659, "learning_rate": 2.51015563175943e-06, "loss": 0.0647, "step": 1594 }, { "epoch": 0.42105263157894735, "grad_norm": 1.3751622438430786, "learning_rate": 2.5133210234766554e-06, "loss": 0.0658, "step": 1596 }, { "epoch": 0.4215802664556127, "grad_norm": 0.580010712146759, "learning_rate": 2.5164864151938803e-06, "loss": 0.062, "step": 1598 }, { "epoch": 0.42210790133227805, "grad_norm": 0.5694828033447266, "learning_rate": 2.5196518069111056e-06, "loss": 0.0628, "step": 1600 }, { "epoch": 0.4226355362089434, "grad_norm": 0.4018910229206085, "learning_rate": 2.52281719862833e-06, "loss": 0.0709, "step": 1602 }, { "epoch": 0.42316317108560875, "grad_norm": 0.995413064956665, "learning_rate": 2.5259825903455553e-06, "loss": 0.0703, "step": 1604 }, { "epoch": 0.4236908059622741, "grad_norm": 0.6190749406814575, "learning_rate": 2.5291479820627802e-06, "loss": 0.0557, "step": 1606 }, { "epoch": 0.42421844083893945, "grad_norm": 0.7050879001617432, "learning_rate": 2.5323133737800055e-06, "loss": 0.0703, "step": 1608 }, { "epoch": 0.4247460757156048, "grad_norm": 0.8867683410644531, "learning_rate": 2.5354787654972304e-06, "loss": 0.0719, "step": 1610 }, { "epoch": 0.42527371059227015, "grad_norm": 1.4647448062896729, "learning_rate": 2.5386441572144553e-06, "loss": 0.0642, "step": 1612 }, { "epoch": 0.4258013454689355, "grad_norm": 0.9159901142120361, "learning_rate": 2.54180954893168e-06, "loss": 0.0647, "step": 1614 }, { "epoch": 0.42632898034560085, "grad_norm": 0.43022096157073975, "learning_rate": 2.5449749406489055e-06, "loss": 0.062, "step": 1616 }, { "epoch": 0.4268566152222662, "grad_norm": 7.480991840362549, "learning_rate": 2.5481403323661304e-06, "loss": 0.0761, "step": 1618 }, { "epoch": 0.42738425009893155, "grad_norm": 0.5442585349082947, "learning_rate": 2.5513057240833553e-06, "loss": 0.0672, "step": 1620 }, { "epoch": 0.4279118849755969, "grad_norm": 0.5803089141845703, "learning_rate": 2.55447111580058e-06, "loss": 0.0628, "step": 1622 }, { "epoch": 0.42843951985226225, "grad_norm": 0.5275807976722717, "learning_rate": 2.5576365075178055e-06, "loss": 0.0627, "step": 1624 }, { "epoch": 0.4289671547289276, "grad_norm": 3.4779295921325684, "learning_rate": 2.5608018992350304e-06, "loss": 0.06, "step": 1626 }, { "epoch": 0.42949478960559295, "grad_norm": 0.36860036849975586, "learning_rate": 2.5639672909522553e-06, "loss": 0.0649, "step": 1628 }, { "epoch": 0.4300224244822583, "grad_norm": 0.6027747392654419, "learning_rate": 2.5671326826694806e-06, "loss": 0.0682, "step": 1630 }, { "epoch": 0.4305500593589236, "grad_norm": 0.5017476677894592, "learning_rate": 2.5702980743867055e-06, "loss": 0.0699, "step": 1632 }, { "epoch": 0.43107769423558895, "grad_norm": 0.5226329565048218, "learning_rate": 2.5734634661039304e-06, "loss": 0.0708, "step": 1634 }, { "epoch": 0.4316053291122543, "grad_norm": 0.46846529841423035, "learning_rate": 2.5766288578211552e-06, "loss": 0.0568, "step": 1636 }, { "epoch": 0.43213296398891965, "grad_norm": 0.620316207408905, "learning_rate": 2.5797942495383805e-06, "loss": 0.056, "step": 1638 }, { "epoch": 0.432660598865585, "grad_norm": 0.6260894536972046, "learning_rate": 2.5829596412556054e-06, "loss": 0.0667, "step": 1640 }, { "epoch": 0.43318823374225035, "grad_norm": 0.9165812134742737, "learning_rate": 2.5861250329728303e-06, "loss": 0.0609, "step": 1642 }, { "epoch": 0.4337158686189157, "grad_norm": 2.2983179092407227, "learning_rate": 2.5892904246900552e-06, "loss": 0.0741, "step": 1644 }, { "epoch": 0.43424350349558105, "grad_norm": 0.556117057800293, "learning_rate": 2.5924558164072805e-06, "loss": 0.0857, "step": 1646 }, { "epoch": 0.4347711383722464, "grad_norm": 1.232069492340088, "learning_rate": 2.5956212081245054e-06, "loss": 0.0596, "step": 1648 }, { "epoch": 0.43529877324891175, "grad_norm": 1.406594157218933, "learning_rate": 2.5987865998417307e-06, "loss": 0.0625, "step": 1650 }, { "epoch": 0.4358264081255771, "grad_norm": 0.736531674861908, "learning_rate": 2.601951991558955e-06, "loss": 0.0772, "step": 1652 }, { "epoch": 0.43635404300224245, "grad_norm": 0.3503958582878113, "learning_rate": 2.6051173832761805e-06, "loss": 0.064, "step": 1654 }, { "epoch": 0.4368816778789078, "grad_norm": 0.5790503621101379, "learning_rate": 2.6082827749934054e-06, "loss": 0.0538, "step": 1656 }, { "epoch": 0.43740931275557315, "grad_norm": 1.476129174232483, "learning_rate": 2.6114481667106307e-06, "loss": 0.0578, "step": 1658 }, { "epoch": 0.4379369476322385, "grad_norm": 0.632100522518158, "learning_rate": 2.6146135584278556e-06, "loss": 0.0576, "step": 1660 }, { "epoch": 0.43846458250890386, "grad_norm": 1.1160160303115845, "learning_rate": 2.6177789501450805e-06, "loss": 0.0663, "step": 1662 }, { "epoch": 0.4389922173855692, "grad_norm": 0.4158248007297516, "learning_rate": 2.6209443418623054e-06, "loss": 0.0545, "step": 1664 }, { "epoch": 0.43951985226223456, "grad_norm": 0.6622627377510071, "learning_rate": 2.6241097335795307e-06, "loss": 0.062, "step": 1666 }, { "epoch": 0.4400474871388999, "grad_norm": 0.9585856199264526, "learning_rate": 2.6272751252967556e-06, "loss": 0.0564, "step": 1668 }, { "epoch": 0.4405751220155652, "grad_norm": 0.4281589984893799, "learning_rate": 2.630440517013981e-06, "loss": 0.0601, "step": 1670 }, { "epoch": 0.44110275689223055, "grad_norm": 0.4978153109550476, "learning_rate": 2.6336059087312053e-06, "loss": 0.0592, "step": 1672 }, { "epoch": 0.4416303917688959, "grad_norm": 0.6236041188240051, "learning_rate": 2.6367713004484306e-06, "loss": 0.0602, "step": 1674 }, { "epoch": 0.44215802664556125, "grad_norm": 0.8851639628410339, "learning_rate": 2.6399366921656555e-06, "loss": 0.0518, "step": 1676 }, { "epoch": 0.4426856615222266, "grad_norm": 0.44254812598228455, "learning_rate": 2.643102083882881e-06, "loss": 0.0589, "step": 1678 }, { "epoch": 0.44321329639889195, "grad_norm": 0.9866037964820862, "learning_rate": 2.6462674756001057e-06, "loss": 0.0583, "step": 1680 }, { "epoch": 0.4437409312755573, "grad_norm": 2.508331537246704, "learning_rate": 2.6494328673173306e-06, "loss": 0.0671, "step": 1682 }, { "epoch": 0.44426856615222265, "grad_norm": 1.2595497369766235, "learning_rate": 2.6525982590345555e-06, "loss": 0.0655, "step": 1684 }, { "epoch": 0.444796201028888, "grad_norm": 0.4192308783531189, "learning_rate": 2.655763650751781e-06, "loss": 0.0627, "step": 1686 }, { "epoch": 0.44532383590555336, "grad_norm": 0.6188547015190125, "learning_rate": 2.6589290424690057e-06, "loss": 0.0586, "step": 1688 }, { "epoch": 0.4458514707822187, "grad_norm": 0.6082819104194641, "learning_rate": 2.6620944341862306e-06, "loss": 0.0594, "step": 1690 }, { "epoch": 0.44637910565888406, "grad_norm": 0.7339548468589783, "learning_rate": 2.6652598259034555e-06, "loss": 0.0552, "step": 1692 }, { "epoch": 0.4469067405355494, "grad_norm": 0.7761785387992859, "learning_rate": 2.668425217620681e-06, "loss": 0.0482, "step": 1694 }, { "epoch": 0.44743437541221476, "grad_norm": 1.0765841007232666, "learning_rate": 2.6715906093379057e-06, "loss": 0.0654, "step": 1696 }, { "epoch": 0.4479620102888801, "grad_norm": 0.4182967245578766, "learning_rate": 2.6747560010551306e-06, "loss": 0.0567, "step": 1698 }, { "epoch": 0.44848964516554546, "grad_norm": 1.106003999710083, "learning_rate": 2.6779213927723555e-06, "loss": 0.0729, "step": 1700 }, { "epoch": 0.4490172800422108, "grad_norm": 0.5718616843223572, "learning_rate": 2.6810867844895803e-06, "loss": 0.0608, "step": 1702 }, { "epoch": 0.44954491491887616, "grad_norm": 0.6691154837608337, "learning_rate": 2.6842521762068057e-06, "loss": 0.0561, "step": 1704 }, { "epoch": 0.4500725497955415, "grad_norm": 1.5674644708633423, "learning_rate": 2.6874175679240305e-06, "loss": 0.0672, "step": 1706 }, { "epoch": 0.45060018467220686, "grad_norm": 0.6891922354698181, "learning_rate": 2.690582959641256e-06, "loss": 0.0612, "step": 1708 }, { "epoch": 0.45112781954887216, "grad_norm": 0.644825279712677, "learning_rate": 2.6937483513584803e-06, "loss": 0.0519, "step": 1710 }, { "epoch": 0.4516554544255375, "grad_norm": 0.7329215407371521, "learning_rate": 2.6969137430757056e-06, "loss": 0.0638, "step": 1712 }, { "epoch": 0.45218308930220286, "grad_norm": 1.0073093175888062, "learning_rate": 2.7000791347929305e-06, "loss": 0.0579, "step": 1714 }, { "epoch": 0.4527107241788682, "grad_norm": 0.4969639480113983, "learning_rate": 2.703244526510156e-06, "loss": 0.0461, "step": 1716 }, { "epoch": 0.45323835905553356, "grad_norm": 0.5012308955192566, "learning_rate": 2.7064099182273807e-06, "loss": 0.0564, "step": 1718 }, { "epoch": 0.4537659939321989, "grad_norm": 0.4245254695415497, "learning_rate": 2.7095753099446056e-06, "loss": 0.0636, "step": 1720 }, { "epoch": 0.45429362880886426, "grad_norm": 1.990149974822998, "learning_rate": 2.7127407016618305e-06, "loss": 0.0658, "step": 1722 }, { "epoch": 0.4548212636855296, "grad_norm": 0.6609019041061401, "learning_rate": 2.715906093379056e-06, "loss": 0.0592, "step": 1724 }, { "epoch": 0.45534889856219496, "grad_norm": 0.2981458008289337, "learning_rate": 2.7190714850962807e-06, "loss": 0.0522, "step": 1726 }, { "epoch": 0.4558765334388603, "grad_norm": 0.45507240295410156, "learning_rate": 2.722236876813506e-06, "loss": 0.0574, "step": 1728 }, { "epoch": 0.45640416831552566, "grad_norm": 0.5596851706504822, "learning_rate": 2.7254022685307305e-06, "loss": 0.0604, "step": 1730 }, { "epoch": 0.456931803192191, "grad_norm": 5.388506889343262, "learning_rate": 2.7285676602479558e-06, "loss": 0.0562, "step": 1732 }, { "epoch": 0.45745943806885636, "grad_norm": 0.785186767578125, "learning_rate": 2.7317330519651807e-06, "loss": 0.0512, "step": 1734 }, { "epoch": 0.4579870729455217, "grad_norm": 2.1398565769195557, "learning_rate": 2.734898443682406e-06, "loss": 0.0637, "step": 1736 }, { "epoch": 0.45851470782218706, "grad_norm": 0.40568551421165466, "learning_rate": 2.738063835399631e-06, "loss": 0.0496, "step": 1738 }, { "epoch": 0.4590423426988524, "grad_norm": 0.6653757095336914, "learning_rate": 2.7412292271168558e-06, "loss": 0.0596, "step": 1740 }, { "epoch": 0.45956997757551776, "grad_norm": 3.7852466106414795, "learning_rate": 2.7443946188340806e-06, "loss": 0.0613, "step": 1742 }, { "epoch": 0.4600976124521831, "grad_norm": 3.4511024951934814, "learning_rate": 2.747560010551306e-06, "loss": 0.0605, "step": 1744 }, { "epoch": 0.46062524732884846, "grad_norm": 1.4782220125198364, "learning_rate": 2.750725402268531e-06, "loss": 0.0531, "step": 1746 }, { "epoch": 0.46115288220551376, "grad_norm": 2.799539089202881, "learning_rate": 2.7538907939857557e-06, "loss": 0.0546, "step": 1748 }, { "epoch": 0.4616805170821791, "grad_norm": 0.47666803002357483, "learning_rate": 2.7570561857029806e-06, "loss": 0.0439, "step": 1750 }, { "epoch": 0.46220815195884446, "grad_norm": 1.1121459007263184, "learning_rate": 2.760221577420206e-06, "loss": 0.0498, "step": 1752 }, { "epoch": 0.4627357868355098, "grad_norm": 0.42023077607154846, "learning_rate": 2.763386969137431e-06, "loss": 0.0478, "step": 1754 }, { "epoch": 0.46326342171217516, "grad_norm": 0.5808268785476685, "learning_rate": 2.7665523608546557e-06, "loss": 0.0471, "step": 1756 }, { "epoch": 0.4637910565888405, "grad_norm": 0.31835681200027466, "learning_rate": 2.769717752571881e-06, "loss": 0.0502, "step": 1758 }, { "epoch": 0.46431869146550586, "grad_norm": 1.3028249740600586, "learning_rate": 2.772883144289106e-06, "loss": 0.0549, "step": 1760 }, { "epoch": 0.4648463263421712, "grad_norm": 2.0150835514068604, "learning_rate": 2.776048536006331e-06, "loss": 0.0625, "step": 1762 }, { "epoch": 0.46537396121883656, "grad_norm": 0.430611252784729, "learning_rate": 2.7792139277235557e-06, "loss": 0.0549, "step": 1764 }, { "epoch": 0.4659015960955019, "grad_norm": 0.9392898678779602, "learning_rate": 2.782379319440781e-06, "loss": 0.0526, "step": 1766 }, { "epoch": 0.46642923097216726, "grad_norm": 0.4866000711917877, "learning_rate": 2.785544711158006e-06, "loss": 0.0458, "step": 1768 }, { "epoch": 0.4669568658488326, "grad_norm": 0.6748418211936951, "learning_rate": 2.7887101028752308e-06, "loss": 0.0515, "step": 1770 }, { "epoch": 0.46748450072549796, "grad_norm": 0.7944962382316589, "learning_rate": 2.7918754945924557e-06, "loss": 0.0505, "step": 1772 }, { "epoch": 0.4680121356021633, "grad_norm": 1.5323951244354248, "learning_rate": 2.795040886309681e-06, "loss": 0.0487, "step": 1774 }, { "epoch": 0.46853977047882867, "grad_norm": 0.4848267138004303, "learning_rate": 2.798206278026906e-06, "loss": 0.0545, "step": 1776 }, { "epoch": 0.469067405355494, "grad_norm": 1.3174611330032349, "learning_rate": 2.801371669744131e-06, "loss": 0.0587, "step": 1778 }, { "epoch": 0.46959504023215937, "grad_norm": 2.4376754760742188, "learning_rate": 2.8045370614613556e-06, "loss": 0.0582, "step": 1780 }, { "epoch": 0.4701226751088247, "grad_norm": 0.49510398507118225, "learning_rate": 2.807702453178581e-06, "loss": 0.0597, "step": 1782 }, { "epoch": 0.47065030998549007, "grad_norm": 1.3371195793151855, "learning_rate": 2.810867844895806e-06, "loss": 0.0617, "step": 1784 }, { "epoch": 0.47117794486215536, "grad_norm": 0.33198419213294983, "learning_rate": 2.814033236613031e-06, "loss": 0.0466, "step": 1786 }, { "epoch": 0.4717055797388207, "grad_norm": 0.487519770860672, "learning_rate": 2.817198628330256e-06, "loss": 0.0452, "step": 1788 }, { "epoch": 0.47223321461548606, "grad_norm": 0.48676764965057373, "learning_rate": 2.820364020047481e-06, "loss": 0.0461, "step": 1790 }, { "epoch": 0.4727608494921514, "grad_norm": 0.43267348408699036, "learning_rate": 2.823529411764706e-06, "loss": 0.0507, "step": 1792 }, { "epoch": 0.47328848436881676, "grad_norm": 0.9765422940254211, "learning_rate": 2.826694803481931e-06, "loss": 0.0502, "step": 1794 }, { "epoch": 0.4738161192454821, "grad_norm": 1.688683032989502, "learning_rate": 2.829860195199156e-06, "loss": 0.0536, "step": 1796 }, { "epoch": 0.47434375412214747, "grad_norm": 0.41948986053466797, "learning_rate": 2.8330255869163813e-06, "loss": 0.0487, "step": 1798 }, { "epoch": 0.4748713889988128, "grad_norm": 0.7259230613708496, "learning_rate": 2.8361909786336058e-06, "loss": 0.0474, "step": 1800 }, { "epoch": 0.47539902387547817, "grad_norm": 0.3069029152393341, "learning_rate": 2.839356370350831e-06, "loss": 0.0446, "step": 1802 }, { "epoch": 0.4759266587521435, "grad_norm": 0.428520530462265, "learning_rate": 2.842521762068056e-06, "loss": 0.0479, "step": 1804 }, { "epoch": 0.47645429362880887, "grad_norm": 0.4521285593509674, "learning_rate": 2.8456871537852813e-06, "loss": 0.0477, "step": 1806 }, { "epoch": 0.4769819285054742, "grad_norm": 0.597745954990387, "learning_rate": 2.8488525455025058e-06, "loss": 0.0513, "step": 1808 }, { "epoch": 0.47750956338213957, "grad_norm": 0.3145551085472107, "learning_rate": 2.852017937219731e-06, "loss": 0.0462, "step": 1810 }, { "epoch": 0.4780371982588049, "grad_norm": 1.354028344154358, "learning_rate": 2.855183328936956e-06, "loss": 0.0518, "step": 1812 }, { "epoch": 0.47856483313547027, "grad_norm": 0.8952604532241821, "learning_rate": 2.8583487206541813e-06, "loss": 0.0517, "step": 1814 }, { "epoch": 0.4790924680121356, "grad_norm": 0.7620837092399597, "learning_rate": 2.861514112371406e-06, "loss": 0.0439, "step": 1816 }, { "epoch": 0.47962010288880097, "grad_norm": 2.596632480621338, "learning_rate": 2.864679504088631e-06, "loss": 0.0592, "step": 1818 }, { "epoch": 0.4801477377654663, "grad_norm": 1.4789361953735352, "learning_rate": 2.867844895805856e-06, "loss": 0.045, "step": 1820 }, { "epoch": 0.48067537264213167, "grad_norm": 0.45299792289733887, "learning_rate": 2.8710102875230812e-06, "loss": 0.0452, "step": 1822 }, { "epoch": 0.48120300751879697, "grad_norm": 0.7549160122871399, "learning_rate": 2.874175679240306e-06, "loss": 0.0403, "step": 1824 }, { "epoch": 0.4817306423954623, "grad_norm": 0.9982080459594727, "learning_rate": 2.877341070957531e-06, "loss": 0.0397, "step": 1826 }, { "epoch": 0.48225827727212767, "grad_norm": 0.766582190990448, "learning_rate": 2.880506462674756e-06, "loss": 0.0438, "step": 1828 }, { "epoch": 0.482785912148793, "grad_norm": 0.27428311109542847, "learning_rate": 2.883671854391981e-06, "loss": 0.0504, "step": 1830 }, { "epoch": 0.48331354702545837, "grad_norm": 0.2130294144153595, "learning_rate": 2.886837246109206e-06, "loss": 0.0433, "step": 1832 }, { "epoch": 0.4838411819021237, "grad_norm": 0.3681633472442627, "learning_rate": 2.890002637826431e-06, "loss": 0.0469, "step": 1834 }, { "epoch": 0.48436881677878907, "grad_norm": 0.9752593040466309, "learning_rate": 2.8931680295436563e-06, "loss": 0.06, "step": 1836 }, { "epoch": 0.4848964516554544, "grad_norm": 1.2234699726104736, "learning_rate": 2.8963334212608808e-06, "loss": 0.053, "step": 1838 }, { "epoch": 0.48542408653211977, "grad_norm": 0.5522381663322449, "learning_rate": 2.899498812978106e-06, "loss": 0.0541, "step": 1840 }, { "epoch": 0.4859517214087851, "grad_norm": 2.3994314670562744, "learning_rate": 2.902664204695331e-06, "loss": 0.0444, "step": 1842 }, { "epoch": 0.48647935628545047, "grad_norm": 2.3681275844573975, "learning_rate": 2.9058295964125563e-06, "loss": 0.0486, "step": 1844 }, { "epoch": 0.4870069911621158, "grad_norm": 0.49472299218177795, "learning_rate": 2.908994988129781e-06, "loss": 0.0422, "step": 1846 }, { "epoch": 0.48753462603878117, "grad_norm": 1.1659129858016968, "learning_rate": 2.912160379847006e-06, "loss": 0.0514, "step": 1848 }, { "epoch": 0.4880622609154465, "grad_norm": 1.4318286180496216, "learning_rate": 2.915325771564231e-06, "loss": 0.0425, "step": 1850 }, { "epoch": 0.4885898957921119, "grad_norm": 0.6727498769760132, "learning_rate": 2.9184911632814563e-06, "loss": 0.0484, "step": 1852 }, { "epoch": 0.4891175306687772, "grad_norm": 0.8026925921440125, "learning_rate": 2.921656554998681e-06, "loss": 0.043, "step": 1854 }, { "epoch": 0.4896451655454426, "grad_norm": 1.1455261707305908, "learning_rate": 2.9248219467159064e-06, "loss": 0.0393, "step": 1856 }, { "epoch": 0.4901728004221079, "grad_norm": 0.4170989990234375, "learning_rate": 2.927987338433131e-06, "loss": 0.0397, "step": 1858 }, { "epoch": 0.4907004352987733, "grad_norm": 1.6348708868026733, "learning_rate": 2.9311527301503562e-06, "loss": 0.038, "step": 1860 }, { "epoch": 0.49122807017543857, "grad_norm": 0.54513019323349, "learning_rate": 2.934318121867581e-06, "loss": 0.0395, "step": 1862 }, { "epoch": 0.4917557050521039, "grad_norm": 0.5045168399810791, "learning_rate": 2.9374835135848064e-06, "loss": 0.0484, "step": 1864 }, { "epoch": 0.49228333992876927, "grad_norm": 1.0541917085647583, "learning_rate": 2.9406489053020313e-06, "loss": 0.0395, "step": 1866 }, { "epoch": 0.4928109748054346, "grad_norm": 1.2559421062469482, "learning_rate": 2.943814297019256e-06, "loss": 0.0421, "step": 1868 }, { "epoch": 0.49333860968209997, "grad_norm": 0.43147119879722595, "learning_rate": 2.946979688736481e-06, "loss": 0.0515, "step": 1870 }, { "epoch": 0.4938662445587653, "grad_norm": 0.8090803623199463, "learning_rate": 2.9501450804537064e-06, "loss": 0.0385, "step": 1872 }, { "epoch": 0.49439387943543067, "grad_norm": 0.40691378712654114, "learning_rate": 2.9533104721709313e-06, "loss": 0.0369, "step": 1874 }, { "epoch": 0.494921514312096, "grad_norm": 0.3722217082977295, "learning_rate": 2.956475863888156e-06, "loss": 0.0402, "step": 1876 }, { "epoch": 0.4954491491887614, "grad_norm": 0.24141599237918854, "learning_rate": 2.959641255605381e-06, "loss": 0.0556, "step": 1878 }, { "epoch": 0.4959767840654267, "grad_norm": 0.6322010159492493, "learning_rate": 2.9628066473226064e-06, "loss": 0.0424, "step": 1880 }, { "epoch": 0.4965044189420921, "grad_norm": 0.3974672853946686, "learning_rate": 2.9659720390398313e-06, "loss": 0.0467, "step": 1882 }, { "epoch": 0.4970320538187574, "grad_norm": 0.5056261420249939, "learning_rate": 2.969137430757056e-06, "loss": 0.0447, "step": 1884 }, { "epoch": 0.4975596886954228, "grad_norm": 0.47678083181381226, "learning_rate": 2.9723028224742815e-06, "loss": 0.0458, "step": 1886 }, { "epoch": 0.4980873235720881, "grad_norm": 2.1718225479125977, "learning_rate": 2.9754682141915063e-06, "loss": 0.0549, "step": 1888 }, { "epoch": 0.4986149584487535, "grad_norm": 0.4063360095024109, "learning_rate": 2.9786336059087312e-06, "loss": 0.0506, "step": 1890 }, { "epoch": 0.4991425933254188, "grad_norm": 1.4347976446151733, "learning_rate": 2.981798997625956e-06, "loss": 0.0426, "step": 1892 }, { "epoch": 0.4996702282020842, "grad_norm": 2.5460526943206787, "learning_rate": 2.9849643893431814e-06, "loss": 0.0483, "step": 1894 }, { "epoch": 0.5001978630787495, "grad_norm": 0.5475360155105591, "learning_rate": 2.9881297810604063e-06, "loss": 0.0503, "step": 1896 }, { "epoch": 0.5007254979554149, "grad_norm": 1.1321625709533691, "learning_rate": 2.991295172777631e-06, "loss": 0.0429, "step": 1898 }, { "epoch": 0.5012531328320802, "grad_norm": 0.7710878252983093, "learning_rate": 2.994460564494856e-06, "loss": 0.0488, "step": 1900 }, { "epoch": 0.5017807677087456, "grad_norm": 0.9541162252426147, "learning_rate": 2.9976259562120814e-06, "loss": 0.0426, "step": 1902 }, { "epoch": 0.5023084025854109, "grad_norm": 0.7505131959915161, "learning_rate": 3.0007913479293063e-06, "loss": 0.0448, "step": 1904 }, { "epoch": 0.5028360374620763, "grad_norm": 0.27710893750190735, "learning_rate": 3.0039567396465316e-06, "loss": 0.04, "step": 1906 }, { "epoch": 0.5033636723387416, "grad_norm": 0.5373765230178833, "learning_rate": 3.007122131363756e-06, "loss": 0.0458, "step": 1908 }, { "epoch": 0.503891307215407, "grad_norm": 0.6453938484191895, "learning_rate": 3.010287523080982e-06, "loss": 0.0531, "step": 1910 }, { "epoch": 0.5044189420920723, "grad_norm": 0.374208927154541, "learning_rate": 3.0134529147982063e-06, "loss": 0.0456, "step": 1912 }, { "epoch": 0.5049465769687377, "grad_norm": 0.4828614294528961, "learning_rate": 3.016618306515431e-06, "loss": 0.0456, "step": 1914 }, { "epoch": 0.505474211845403, "grad_norm": 0.6171508431434631, "learning_rate": 3.0197836982326565e-06, "loss": 0.0623, "step": 1916 }, { "epoch": 0.5060018467220684, "grad_norm": 0.6831582188606262, "learning_rate": 3.0229490899498814e-06, "loss": 0.0546, "step": 1918 }, { "epoch": 0.5065294815987337, "grad_norm": 0.4897107779979706, "learning_rate": 3.0261144816671067e-06, "loss": 0.0525, "step": 1920 }, { "epoch": 0.507057116475399, "grad_norm": 0.3276914954185486, "learning_rate": 3.0292798733843316e-06, "loss": 0.0411, "step": 1922 }, { "epoch": 0.5075847513520644, "grad_norm": 0.6034899353981018, "learning_rate": 3.032445265101556e-06, "loss": 0.0539, "step": 1924 }, { "epoch": 0.5081123862287297, "grad_norm": 0.4934389591217041, "learning_rate": 3.0356106568187818e-06, "loss": 0.0388, "step": 1926 }, { "epoch": 0.5086400211053951, "grad_norm": 0.7592576742172241, "learning_rate": 3.0387760485360062e-06, "loss": 0.0445, "step": 1928 }, { "epoch": 0.5091676559820604, "grad_norm": 0.5280578136444092, "learning_rate": 3.0419414402532315e-06, "loss": 0.0424, "step": 1930 }, { "epoch": 0.5096952908587258, "grad_norm": 0.8189111351966858, "learning_rate": 3.0451068319704564e-06, "loss": 0.0407, "step": 1932 }, { "epoch": 0.5102229257353911, "grad_norm": 0.6256850957870483, "learning_rate": 3.0482722236876813e-06, "loss": 0.046, "step": 1934 }, { "epoch": 0.5107505606120565, "grad_norm": 2.146711587905884, "learning_rate": 3.0514376154049066e-06, "loss": 0.044, "step": 1936 }, { "epoch": 0.5112781954887218, "grad_norm": 1.0252081155776978, "learning_rate": 3.0546030071221315e-06, "loss": 0.0589, "step": 1938 }, { "epoch": 0.5118058303653872, "grad_norm": 1.3845586776733398, "learning_rate": 3.057768398839357e-06, "loss": 0.0405, "step": 1940 }, { "epoch": 0.5123334652420525, "grad_norm": 0.425008624792099, "learning_rate": 3.0609337905565817e-06, "loss": 0.0334, "step": 1942 }, { "epoch": 0.5128611001187179, "grad_norm": 0.4669320285320282, "learning_rate": 3.064099182273806e-06, "loss": 0.0445, "step": 1944 }, { "epoch": 0.5133887349953832, "grad_norm": 0.21031861007213593, "learning_rate": 3.0672645739910315e-06, "loss": 0.0305, "step": 1946 }, { "epoch": 0.5139163698720486, "grad_norm": 0.20423036813735962, "learning_rate": 3.0704299657082564e-06, "loss": 0.04, "step": 1948 }, { "epoch": 0.5144440047487139, "grad_norm": 0.5859588384628296, "learning_rate": 3.0735953574254817e-06, "loss": 0.0481, "step": 1950 }, { "epoch": 0.5149716396253793, "grad_norm": 0.26222842931747437, "learning_rate": 3.0767607491427066e-06, "loss": 0.0353, "step": 1952 }, { "epoch": 0.5154992745020446, "grad_norm": 0.9789291620254517, "learning_rate": 3.0799261408599315e-06, "loss": 0.0399, "step": 1954 }, { "epoch": 0.51602690937871, "grad_norm": 0.39622819423675537, "learning_rate": 3.0830915325771568e-06, "loss": 0.0353, "step": 1956 }, { "epoch": 0.5165545442553753, "grad_norm": 2.0482771396636963, "learning_rate": 3.0862569242943812e-06, "loss": 0.0344, "step": 1958 }, { "epoch": 0.5170821791320406, "grad_norm": 0.5281033515930176, "learning_rate": 3.089422316011606e-06, "loss": 0.0442, "step": 1960 }, { "epoch": 0.517609814008706, "grad_norm": 0.33357542753219604, "learning_rate": 3.0925877077288314e-06, "loss": 0.0432, "step": 1962 }, { "epoch": 0.5181374488853713, "grad_norm": 0.42881911993026733, "learning_rate": 3.0957530994460563e-06, "loss": 0.0341, "step": 1964 }, { "epoch": 0.5186650837620367, "grad_norm": 0.24993351101875305, "learning_rate": 3.0989184911632816e-06, "loss": 0.0358, "step": 1966 }, { "epoch": 0.519192718638702, "grad_norm": 0.6105499863624573, "learning_rate": 3.1020838828805065e-06, "loss": 0.0347, "step": 1968 }, { "epoch": 0.5197203535153674, "grad_norm": 1.9145152568817139, "learning_rate": 3.1052492745977314e-06, "loss": 0.035, "step": 1970 }, { "epoch": 0.5202479883920327, "grad_norm": 0.32719966769218445, "learning_rate": 3.1084146663149567e-06, "loss": 0.0339, "step": 1972 }, { "epoch": 0.5207756232686981, "grad_norm": 0.44071894884109497, "learning_rate": 3.111580058032181e-06, "loss": 0.0441, "step": 1974 }, { "epoch": 0.5213032581453634, "grad_norm": 0.3369580805301666, "learning_rate": 3.114745449749407e-06, "loss": 0.0502, "step": 1976 }, { "epoch": 0.5218308930220288, "grad_norm": 3.6426143646240234, "learning_rate": 3.1179108414666314e-06, "loss": 0.0495, "step": 1978 }, { "epoch": 0.5223585278986941, "grad_norm": 0.36534810066223145, "learning_rate": 3.1210762331838563e-06, "loss": 0.0357, "step": 1980 }, { "epoch": 0.5228861627753595, "grad_norm": 2.3544201850891113, "learning_rate": 3.1242416249010816e-06, "loss": 0.0535, "step": 1982 }, { "epoch": 0.5234137976520248, "grad_norm": 0.3278028070926666, "learning_rate": 3.1274070166183065e-06, "loss": 0.0537, "step": 1984 }, { "epoch": 0.5239414325286902, "grad_norm": 0.32263126969337463, "learning_rate": 3.1305724083355318e-06, "loss": 0.043, "step": 1986 }, { "epoch": 0.5244690674053555, "grad_norm": 0.5078027248382568, "learning_rate": 3.1337378000527567e-06, "loss": 0.062, "step": 1988 }, { "epoch": 0.5249967022820209, "grad_norm": 0.6066910028457642, "learning_rate": 3.136903191769981e-06, "loss": 0.0364, "step": 1990 }, { "epoch": 0.5255243371586862, "grad_norm": 1.1821262836456299, "learning_rate": 3.140068583487207e-06, "loss": 0.0368, "step": 1992 }, { "epoch": 0.5260519720353516, "grad_norm": 1.2033902406692505, "learning_rate": 3.1432339752044313e-06, "loss": 0.0359, "step": 1994 }, { "epoch": 0.5265796069120169, "grad_norm": 0.44899484515190125, "learning_rate": 3.146399366921657e-06, "loss": 0.0344, "step": 1996 }, { "epoch": 0.5271072417886822, "grad_norm": 0.2310202419757843, "learning_rate": 3.1495647586388815e-06, "loss": 0.0351, "step": 1998 }, { "epoch": 0.5276348766653476, "grad_norm": 0.3545657694339752, "learning_rate": 3.1527301503561064e-06, "loss": 0.0357, "step": 2000 }, { "epoch": 0.5276348766653476, "eval_loss": 0.02902209199965, "eval_runtime": 310.8312, "eval_samples_per_second": 693.756, "eval_steps_per_second": 86.722, "step": 2000 }, { "epoch": 0.5281625115420129, "grad_norm": 1.6099448204040527, "learning_rate": 3.1558955420733317e-06, "loss": 0.0385, "step": 2002 }, { "epoch": 0.5286901464186783, "grad_norm": 0.7010210752487183, "learning_rate": 3.1590609337905566e-06, "loss": 0.038, "step": 2004 }, { "epoch": 0.5292177812953436, "grad_norm": 0.24778702855110168, "learning_rate": 3.162226325507782e-06, "loss": 0.0359, "step": 2006 }, { "epoch": 0.529745416172009, "grad_norm": 0.2769429087638855, "learning_rate": 3.165391717225007e-06, "loss": 0.0352, "step": 2008 }, { "epoch": 0.5302730510486743, "grad_norm": 0.18883945047855377, "learning_rate": 3.1685571089422313e-06, "loss": 0.0343, "step": 2010 }, { "epoch": 0.5308006859253397, "grad_norm": 1.1806058883666992, "learning_rate": 3.1717225006594566e-06, "loss": 0.0413, "step": 2012 }, { "epoch": 0.531328320802005, "grad_norm": 1.1877425909042358, "learning_rate": 3.1748878923766815e-06, "loss": 0.0371, "step": 2014 }, { "epoch": 0.5318559556786704, "grad_norm": 0.3501647710800171, "learning_rate": 3.178053284093907e-06, "loss": 0.0432, "step": 2016 }, { "epoch": 0.5323835905553357, "grad_norm": 0.7728976011276245, "learning_rate": 3.1812186758111317e-06, "loss": 0.0308, "step": 2018 }, { "epoch": 0.5329112254320011, "grad_norm": 0.5798470377922058, "learning_rate": 3.1843840675283566e-06, "loss": 0.0293, "step": 2020 }, { "epoch": 0.5334388603086664, "grad_norm": 0.6002846956253052, "learning_rate": 3.187549459245582e-06, "loss": 0.0415, "step": 2022 }, { "epoch": 0.5339664951853318, "grad_norm": 0.9215429425239563, "learning_rate": 3.1907148509628068e-06, "loss": 0.0426, "step": 2024 }, { "epoch": 0.5344941300619971, "grad_norm": 0.1574408859014511, "learning_rate": 3.193880242680032e-06, "loss": 0.029, "step": 2026 }, { "epoch": 0.5350217649386625, "grad_norm": 0.47910353541374207, "learning_rate": 3.1970456343972565e-06, "loss": 0.0305, "step": 2028 }, { "epoch": 0.5355493998153278, "grad_norm": 0.3268490135669708, "learning_rate": 3.2002110261144814e-06, "loss": 0.0354, "step": 2030 }, { "epoch": 0.5360770346919932, "grad_norm": 0.46540021896362305, "learning_rate": 3.2033764178317067e-06, "loss": 0.029, "step": 2032 }, { "epoch": 0.5366046695686585, "grad_norm": 0.2959149479866028, "learning_rate": 3.2065418095489316e-06, "loss": 0.0275, "step": 2034 }, { "epoch": 0.5371323044453238, "grad_norm": 0.42392697930336, "learning_rate": 3.209707201266157e-06, "loss": 0.0293, "step": 2036 }, { "epoch": 0.5376599393219892, "grad_norm": 1.1446301937103271, "learning_rate": 3.212872592983382e-06, "loss": 0.0388, "step": 2038 }, { "epoch": 0.5381875741986545, "grad_norm": 1.1108171939849854, "learning_rate": 3.2160379847006063e-06, "loss": 0.0462, "step": 2040 }, { "epoch": 0.5387152090753199, "grad_norm": 2.5641822814941406, "learning_rate": 3.219203376417832e-06, "loss": 0.0308, "step": 2042 }, { "epoch": 0.5392428439519852, "grad_norm": 3.191714286804199, "learning_rate": 3.2223687681350565e-06, "loss": 0.039, "step": 2044 }, { "epoch": 0.5397704788286506, "grad_norm": 0.33012959361076355, "learning_rate": 3.2255341598522822e-06, "loss": 0.0447, "step": 2046 }, { "epoch": 0.5402981137053159, "grad_norm": 0.6458933353424072, "learning_rate": 3.2286995515695067e-06, "loss": 0.049, "step": 2048 }, { "epoch": 0.5408257485819813, "grad_norm": 0.37859952449798584, "learning_rate": 3.2318649432867316e-06, "loss": 0.0295, "step": 2050 }, { "epoch": 0.5413533834586466, "grad_norm": 0.4152364432811737, "learning_rate": 3.235030335003957e-06, "loss": 0.0407, "step": 2052 }, { "epoch": 0.541881018335312, "grad_norm": 4.739086627960205, "learning_rate": 3.2381957267211818e-06, "loss": 0.0328, "step": 2054 }, { "epoch": 0.5424086532119773, "grad_norm": 1.117039442062378, "learning_rate": 3.241361118438407e-06, "loss": 0.0335, "step": 2056 }, { "epoch": 0.5429362880886427, "grad_norm": 83.01644134521484, "learning_rate": 3.244526510155632e-06, "loss": 0.0364, "step": 2058 }, { "epoch": 0.543463922965308, "grad_norm": 0.19705918431282043, "learning_rate": 3.2476919018728564e-06, "loss": 0.0415, "step": 2060 }, { "epoch": 0.5439915578419734, "grad_norm": 3.347323417663574, "learning_rate": 3.250857293590082e-06, "loss": 0.0471, "step": 2062 }, { "epoch": 0.5445191927186387, "grad_norm": 0.5076499581336975, "learning_rate": 3.2540226853073066e-06, "loss": 0.0311, "step": 2064 }, { "epoch": 0.5450468275953041, "grad_norm": 0.3279983103275299, "learning_rate": 3.257188077024532e-06, "loss": 0.0259, "step": 2066 }, { "epoch": 0.5455744624719694, "grad_norm": 0.7464612722396851, "learning_rate": 3.260353468741757e-06, "loss": 0.0361, "step": 2068 }, { "epoch": 0.5461020973486348, "grad_norm": 1.66709566116333, "learning_rate": 3.2635188604589817e-06, "loss": 0.0372, "step": 2070 }, { "epoch": 0.5466297322253001, "grad_norm": 0.4998496472835541, "learning_rate": 3.266684252176207e-06, "loss": 0.0321, "step": 2072 }, { "epoch": 0.5471573671019654, "grad_norm": 1.459741234779358, "learning_rate": 3.269849643893432e-06, "loss": 0.0365, "step": 2074 }, { "epoch": 0.5476850019786308, "grad_norm": 0.32634055614471436, "learning_rate": 3.2730150356106572e-06, "loss": 0.0335, "step": 2076 }, { "epoch": 0.5482126368552961, "grad_norm": 2.24405574798584, "learning_rate": 3.276180427327882e-06, "loss": 0.0393, "step": 2078 }, { "epoch": 0.5487402717319615, "grad_norm": 0.1869361400604248, "learning_rate": 3.2793458190451066e-06, "loss": 0.0257, "step": 2080 }, { "epoch": 0.5492679066086268, "grad_norm": 1.240576148033142, "learning_rate": 3.282511210762332e-06, "loss": 0.0455, "step": 2082 }, { "epoch": 0.5497955414852922, "grad_norm": 0.37359312176704407, "learning_rate": 3.285676602479557e-06, "loss": 0.0361, "step": 2084 }, { "epoch": 0.5503231763619575, "grad_norm": 1.0560349225997925, "learning_rate": 3.288841994196782e-06, "loss": 0.0266, "step": 2086 }, { "epoch": 0.5508508112386229, "grad_norm": 1.445717215538025, "learning_rate": 3.292007385914007e-06, "loss": 0.0344, "step": 2088 }, { "epoch": 0.5513784461152882, "grad_norm": 0.6747927665710449, "learning_rate": 3.295172777631232e-06, "loss": 0.0364, "step": 2090 }, { "epoch": 0.5519060809919536, "grad_norm": 2.1587069034576416, "learning_rate": 3.298338169348457e-06, "loss": 0.0504, "step": 2092 }, { "epoch": 0.5524337158686189, "grad_norm": 0.31244412064552307, "learning_rate": 3.3015035610656817e-06, "loss": 0.0361, "step": 2094 }, { "epoch": 0.5529613507452843, "grad_norm": 1.2769473791122437, "learning_rate": 3.3046689527829074e-06, "loss": 0.0412, "step": 2096 }, { "epoch": 0.5534889856219496, "grad_norm": 0.2109454870223999, "learning_rate": 3.307834344500132e-06, "loss": 0.0273, "step": 2098 }, { "epoch": 0.554016620498615, "grad_norm": 0.48778021335601807, "learning_rate": 3.3109997362173567e-06, "loss": 0.035, "step": 2100 }, { "epoch": 0.5545442553752803, "grad_norm": 0.40759578347206116, "learning_rate": 3.314165127934582e-06, "loss": 0.0398, "step": 2102 }, { "epoch": 0.5550718902519457, "grad_norm": 1.3402738571166992, "learning_rate": 3.317330519651807e-06, "loss": 0.0295, "step": 2104 }, { "epoch": 0.555599525128611, "grad_norm": 0.6495593190193176, "learning_rate": 3.3204959113690322e-06, "loss": 0.0273, "step": 2106 }, { "epoch": 0.5561271600052764, "grad_norm": 1.2184473276138306, "learning_rate": 3.323661303086257e-06, "loss": 0.0321, "step": 2108 }, { "epoch": 0.5566547948819417, "grad_norm": 1.0136405229568481, "learning_rate": 3.3268266948034816e-06, "loss": 0.0338, "step": 2110 }, { "epoch": 0.557182429758607, "grad_norm": 0.600299060344696, "learning_rate": 3.3299920865207073e-06, "loss": 0.0302, "step": 2112 }, { "epoch": 0.5577100646352724, "grad_norm": 0.5074283480644226, "learning_rate": 3.333157478237932e-06, "loss": 0.0276, "step": 2114 }, { "epoch": 0.5582376995119377, "grad_norm": 0.5293172597885132, "learning_rate": 3.3363228699551575e-06, "loss": 0.0343, "step": 2116 }, { "epoch": 0.5587653343886031, "grad_norm": 0.9168714284896851, "learning_rate": 3.339488261672382e-06, "loss": 0.0317, "step": 2118 }, { "epoch": 0.5592929692652684, "grad_norm": 0.2580234110355377, "learning_rate": 3.342653653389607e-06, "loss": 0.033, "step": 2120 }, { "epoch": 0.5598206041419338, "grad_norm": 0.39127251505851746, "learning_rate": 3.345819045106832e-06, "loss": 0.0323, "step": 2122 }, { "epoch": 0.5603482390185991, "grad_norm": 0.6818301677703857, "learning_rate": 3.348984436824057e-06, "loss": 0.0281, "step": 2124 }, { "epoch": 0.5608758738952645, "grad_norm": 0.16757026314735413, "learning_rate": 3.3521498285412824e-06, "loss": 0.0247, "step": 2126 }, { "epoch": 0.5614035087719298, "grad_norm": 0.2881892919540405, "learning_rate": 3.3553152202585073e-06, "loss": 0.0396, "step": 2128 }, { "epoch": 0.5619311436485952, "grad_norm": 0.8079879879951477, "learning_rate": 3.3584806119757317e-06, "loss": 0.0357, "step": 2130 }, { "epoch": 0.5624587785252605, "grad_norm": 0.18507671356201172, "learning_rate": 3.361646003692957e-06, "loss": 0.0241, "step": 2132 }, { "epoch": 0.5629864134019259, "grad_norm": 0.5554057955741882, "learning_rate": 3.364811395410182e-06, "loss": 0.069, "step": 2134 }, { "epoch": 0.5635140482785912, "grad_norm": 0.5762998461723328, "learning_rate": 3.3679767871274073e-06, "loss": 0.0263, "step": 2136 }, { "epoch": 0.5640416831552566, "grad_norm": 0.4014618396759033, "learning_rate": 3.371142178844632e-06, "loss": 0.025, "step": 2138 }, { "epoch": 0.5645693180319219, "grad_norm": 0.22042196989059448, "learning_rate": 3.374307570561857e-06, "loss": 0.0344, "step": 2140 }, { "epoch": 0.5650969529085873, "grad_norm": 0.31914570927619934, "learning_rate": 3.3774729622790823e-06, "loss": 0.0357, "step": 2142 }, { "epoch": 0.5656245877852526, "grad_norm": 0.3665647804737091, "learning_rate": 3.3806383539963072e-06, "loss": 0.0355, "step": 2144 }, { "epoch": 0.566152222661918, "grad_norm": 1.4395577907562256, "learning_rate": 3.3838037457135325e-06, "loss": 0.0344, "step": 2146 }, { "epoch": 0.5666798575385833, "grad_norm": 0.5821990966796875, "learning_rate": 3.386969137430757e-06, "loss": 0.0419, "step": 2148 }, { "epoch": 0.5672074924152486, "grad_norm": 0.21527105569839478, "learning_rate": 3.390134529147982e-06, "loss": 0.0253, "step": 2150 }, { "epoch": 0.567735127291914, "grad_norm": 0.5079959034919739, "learning_rate": 3.393299920865207e-06, "loss": 0.0412, "step": 2152 }, { "epoch": 0.5682627621685793, "grad_norm": 1.0145529508590698, "learning_rate": 3.396465312582432e-06, "loss": 0.0368, "step": 2154 }, { "epoch": 0.5687903970452447, "grad_norm": 0.6327733397483826, "learning_rate": 3.3996307042996574e-06, "loss": 0.033, "step": 2156 }, { "epoch": 0.56931803192191, "grad_norm": 1.3233561515808105, "learning_rate": 3.4027960960168823e-06, "loss": 0.0375, "step": 2158 }, { "epoch": 0.5698456667985754, "grad_norm": 1.204206109046936, "learning_rate": 3.405961487734107e-06, "loss": 0.0298, "step": 2160 }, { "epoch": 0.5703733016752407, "grad_norm": 0.7142801284790039, "learning_rate": 3.4091268794513325e-06, "loss": 0.0304, "step": 2162 }, { "epoch": 0.5709009365519061, "grad_norm": 0.20553304255008698, "learning_rate": 3.412292271168557e-06, "loss": 0.0316, "step": 2164 }, { "epoch": 0.5714285714285714, "grad_norm": 0.2700999081134796, "learning_rate": 3.415457662885782e-06, "loss": 0.0264, "step": 2166 }, { "epoch": 0.5719562063052368, "grad_norm": 0.17358584702014923, "learning_rate": 3.418623054603007e-06, "loss": 0.0345, "step": 2168 }, { "epoch": 0.5724838411819021, "grad_norm": 0.6454617381095886, "learning_rate": 3.421788446320232e-06, "loss": 0.0313, "step": 2170 }, { "epoch": 0.5730114760585675, "grad_norm": 2.0852339267730713, "learning_rate": 3.4249538380374574e-06, "loss": 0.0534, "step": 2172 }, { "epoch": 0.5735391109352328, "grad_norm": 2.0227832794189453, "learning_rate": 3.4281192297546822e-06, "loss": 0.0489, "step": 2174 }, { "epoch": 0.5740667458118982, "grad_norm": 0.6395230293273926, "learning_rate": 3.4312846214719067e-06, "loss": 0.0359, "step": 2176 }, { "epoch": 0.5745943806885635, "grad_norm": 2.1264166831970215, "learning_rate": 3.4344500131891324e-06, "loss": 0.0377, "step": 2178 }, { "epoch": 0.5751220155652289, "grad_norm": 1.6464383602142334, "learning_rate": 3.437615404906357e-06, "loss": 0.0317, "step": 2180 }, { "epoch": 0.5756496504418942, "grad_norm": 1.212165117263794, "learning_rate": 3.4407807966235826e-06, "loss": 0.0256, "step": 2182 }, { "epoch": 0.5761772853185596, "grad_norm": 0.6137009859085083, "learning_rate": 3.443946188340807e-06, "loss": 0.0424, "step": 2184 }, { "epoch": 0.5767049201952249, "grad_norm": 0.9786401391029358, "learning_rate": 3.447111580058032e-06, "loss": 0.0238, "step": 2186 }, { "epoch": 0.5772325550718902, "grad_norm": 0.4303385615348816, "learning_rate": 3.4502769717752573e-06, "loss": 0.0236, "step": 2188 }, { "epoch": 0.5777601899485556, "grad_norm": 0.4986165165901184, "learning_rate": 3.453442363492482e-06, "loss": 0.0256, "step": 2190 }, { "epoch": 0.5782878248252209, "grad_norm": 0.5192486047744751, "learning_rate": 3.4566077552097075e-06, "loss": 0.0334, "step": 2192 }, { "epoch": 0.5788154597018863, "grad_norm": 0.3099115490913391, "learning_rate": 3.4597731469269324e-06, "loss": 0.03, "step": 2194 }, { "epoch": 0.5793430945785516, "grad_norm": 0.5307159423828125, "learning_rate": 3.462938538644157e-06, "loss": 0.0463, "step": 2196 }, { "epoch": 0.579870729455217, "grad_norm": 0.4851226210594177, "learning_rate": 3.4661039303613826e-06, "loss": 0.0307, "step": 2198 }, { "epoch": 0.5803983643318823, "grad_norm": 0.3913153111934662, "learning_rate": 3.469269322078607e-06, "loss": 0.0236, "step": 2200 }, { "epoch": 0.5809259992085477, "grad_norm": 0.2602981626987457, "learning_rate": 3.4724347137958324e-06, "loss": 0.0217, "step": 2202 }, { "epoch": 0.581453634085213, "grad_norm": 0.28824582695961, "learning_rate": 3.4756001055130573e-06, "loss": 0.0316, "step": 2204 }, { "epoch": 0.5819812689618784, "grad_norm": 0.6976718902587891, "learning_rate": 3.478765497230282e-06, "loss": 0.0256, "step": 2206 }, { "epoch": 0.5825089038385437, "grad_norm": 0.6182402968406677, "learning_rate": 3.4819308889475075e-06, "loss": 0.0281, "step": 2208 }, { "epoch": 0.5830365387152091, "grad_norm": 0.42921748757362366, "learning_rate": 3.4850962806647323e-06, "loss": 0.026, "step": 2210 }, { "epoch": 0.5835641735918744, "grad_norm": 1.0264490842819214, "learning_rate": 3.4882616723819577e-06, "loss": 0.0338, "step": 2212 }, { "epoch": 0.5840918084685398, "grad_norm": 0.20654933154582977, "learning_rate": 3.491427064099182e-06, "loss": 0.0217, "step": 2214 }, { "epoch": 0.5846194433452051, "grad_norm": 0.5180326104164124, "learning_rate": 3.494592455816407e-06, "loss": 0.0374, "step": 2216 }, { "epoch": 0.5851470782218705, "grad_norm": 0.11143133044242859, "learning_rate": 3.4977578475336323e-06, "loss": 0.0253, "step": 2218 }, { "epoch": 0.5856747130985358, "grad_norm": 0.1824999302625656, "learning_rate": 3.500923239250857e-06, "loss": 0.0237, "step": 2220 }, { "epoch": 0.5862023479752012, "grad_norm": 0.8795636296272278, "learning_rate": 3.5040886309680825e-06, "loss": 0.0319, "step": 2222 }, { "epoch": 0.5867299828518665, "grad_norm": 0.4264724850654602, "learning_rate": 3.5072540226853074e-06, "loss": 0.0239, "step": 2224 }, { "epoch": 0.5872576177285319, "grad_norm": 1.1607873439788818, "learning_rate": 3.5104194144025323e-06, "loss": 0.0404, "step": 2226 }, { "epoch": 0.5877852526051972, "grad_norm": 0.9393691420555115, "learning_rate": 3.5135848061197576e-06, "loss": 0.0304, "step": 2228 }, { "epoch": 0.5883128874818625, "grad_norm": 0.30072906613349915, "learning_rate": 3.516750197836982e-06, "loss": 0.0217, "step": 2230 }, { "epoch": 0.5888405223585279, "grad_norm": 0.169446662068367, "learning_rate": 3.519915589554208e-06, "loss": 0.0215, "step": 2232 }, { "epoch": 0.5893681572351932, "grad_norm": 0.1642509400844574, "learning_rate": 3.5230809812714323e-06, "loss": 0.0211, "step": 2234 }, { "epoch": 0.5898957921118586, "grad_norm": 0.3764875531196594, "learning_rate": 3.526246372988657e-06, "loss": 0.0304, "step": 2236 }, { "epoch": 0.5904234269885239, "grad_norm": 0.3814910352230072, "learning_rate": 3.5294117647058825e-06, "loss": 0.0271, "step": 2238 }, { "epoch": 0.5909510618651893, "grad_norm": 1.891156792640686, "learning_rate": 3.5325771564231074e-06, "loss": 0.0357, "step": 2240 }, { "epoch": 0.5914786967418546, "grad_norm": 0.303205668926239, "learning_rate": 3.5357425481403327e-06, "loss": 0.0237, "step": 2242 }, { "epoch": 0.59200633161852, "grad_norm": 0.8689278960227966, "learning_rate": 3.5389079398575576e-06, "loss": 0.0408, "step": 2244 }, { "epoch": 0.5925339664951853, "grad_norm": 0.6937569975852966, "learning_rate": 3.542073331574782e-06, "loss": 0.0295, "step": 2246 }, { "epoch": 0.5930616013718507, "grad_norm": 1.1819913387298584, "learning_rate": 3.5452387232920078e-06, "loss": 0.0391, "step": 2248 }, { "epoch": 0.593589236248516, "grad_norm": 0.3776259422302246, "learning_rate": 3.5484041150092322e-06, "loss": 0.0219, "step": 2250 }, { "epoch": 0.5941168711251814, "grad_norm": 0.7029288411140442, "learning_rate": 3.551569506726458e-06, "loss": 0.0221, "step": 2252 }, { "epoch": 0.5946445060018467, "grad_norm": 0.3192138075828552, "learning_rate": 3.5547348984436824e-06, "loss": 0.0326, "step": 2254 }, { "epoch": 0.5951721408785121, "grad_norm": 0.3613951802253723, "learning_rate": 3.5579002901609073e-06, "loss": 0.0246, "step": 2256 }, { "epoch": 0.5956997757551774, "grad_norm": 0.3339724540710449, "learning_rate": 3.5610656818781326e-06, "loss": 0.0255, "step": 2258 }, { "epoch": 0.5962274106318428, "grad_norm": 0.12720289826393127, "learning_rate": 3.5642310735953575e-06, "loss": 0.0303, "step": 2260 }, { "epoch": 0.5967550455085081, "grad_norm": 2.1383471488952637, "learning_rate": 3.567396465312583e-06, "loss": 0.033, "step": 2262 }, { "epoch": 0.5972826803851735, "grad_norm": 0.23961623013019562, "learning_rate": 3.5705618570298077e-06, "loss": 0.0225, "step": 2264 }, { "epoch": 0.5978103152618388, "grad_norm": 1.2634083032608032, "learning_rate": 3.573727248747032e-06, "loss": 0.0307, "step": 2266 }, { "epoch": 0.5983379501385041, "grad_norm": 0.37798649072647095, "learning_rate": 3.5768926404642575e-06, "loss": 0.0276, "step": 2268 }, { "epoch": 0.5988655850151695, "grad_norm": 0.2823624610900879, "learning_rate": 3.5800580321814824e-06, "loss": 0.02, "step": 2270 }, { "epoch": 0.5993932198918348, "grad_norm": 0.6869069933891296, "learning_rate": 3.5832234238987077e-06, "loss": 0.0377, "step": 2272 }, { "epoch": 0.5999208547685002, "grad_norm": 1.5112158060073853, "learning_rate": 3.5863888156159326e-06, "loss": 0.0308, "step": 2274 }, { "epoch": 0.6004484896451655, "grad_norm": 0.6258841753005981, "learning_rate": 3.5895542073331575e-06, "loss": 0.0383, "step": 2276 }, { "epoch": 0.6009761245218309, "grad_norm": 0.5972566604614258, "learning_rate": 3.5927195990503828e-06, "loss": 0.0242, "step": 2278 }, { "epoch": 0.6015037593984962, "grad_norm": 0.1793154627084732, "learning_rate": 3.5958849907676077e-06, "loss": 0.0262, "step": 2280 }, { "epoch": 0.6020313942751616, "grad_norm": 0.29571613669395447, "learning_rate": 3.599050382484833e-06, "loss": 0.025, "step": 2282 }, { "epoch": 0.6025590291518269, "grad_norm": 0.17086449265480042, "learning_rate": 3.6022157742020574e-06, "loss": 0.0304, "step": 2284 }, { "epoch": 0.6030866640284923, "grad_norm": 0.7315145134925842, "learning_rate": 3.6053811659192823e-06, "loss": 0.0213, "step": 2286 }, { "epoch": 0.6036142989051576, "grad_norm": 0.15917065739631653, "learning_rate": 3.6085465576365076e-06, "loss": 0.0298, "step": 2288 }, { "epoch": 0.604141933781823, "grad_norm": 0.5712944865226746, "learning_rate": 3.6117119493537325e-06, "loss": 0.0255, "step": 2290 }, { "epoch": 0.6046695686584883, "grad_norm": 0.47829002141952515, "learning_rate": 3.614877341070958e-06, "loss": 0.0233, "step": 2292 }, { "epoch": 0.6051972035351537, "grad_norm": 0.7523993253707886, "learning_rate": 3.6180427327881827e-06, "loss": 0.0279, "step": 2294 }, { "epoch": 0.605724838411819, "grad_norm": 0.34661412239074707, "learning_rate": 3.6212081245054076e-06, "loss": 0.0226, "step": 2296 }, { "epoch": 0.6062524732884844, "grad_norm": 0.17086388170719147, "learning_rate": 3.624373516222633e-06, "loss": 0.0218, "step": 2298 }, { "epoch": 0.6067801081651497, "grad_norm": 0.4769533574581146, "learning_rate": 3.6275389079398574e-06, "loss": 0.0195, "step": 2300 }, { "epoch": 0.6073077430418151, "grad_norm": 0.2941848039627075, "learning_rate": 3.630704299657083e-06, "loss": 0.0234, "step": 2302 }, { "epoch": 0.6078353779184804, "grad_norm": 0.15296393632888794, "learning_rate": 3.6338696913743076e-06, "loss": 0.0237, "step": 2304 }, { "epoch": 0.6083630127951457, "grad_norm": 1.2328786849975586, "learning_rate": 3.6370350830915325e-06, "loss": 0.034, "step": 2306 }, { "epoch": 0.6088906476718111, "grad_norm": 0.32123616337776184, "learning_rate": 3.6402004748087578e-06, "loss": 0.0263, "step": 2308 }, { "epoch": 0.6094182825484764, "grad_norm": 0.7238378524780273, "learning_rate": 3.6433658665259827e-06, "loss": 0.0218, "step": 2310 }, { "epoch": 0.6099459174251418, "grad_norm": 0.4688388705253601, "learning_rate": 3.646531258243208e-06, "loss": 0.0262, "step": 2312 }, { "epoch": 0.6104735523018071, "grad_norm": 0.9583467245101929, "learning_rate": 3.649696649960433e-06, "loss": 0.0301, "step": 2314 }, { "epoch": 0.6110011871784725, "grad_norm": 0.941128134727478, "learning_rate": 3.6528620416776573e-06, "loss": 0.0273, "step": 2316 }, { "epoch": 0.6115288220551378, "grad_norm": 0.522165060043335, "learning_rate": 3.656027433394883e-06, "loss": 0.0202, "step": 2318 }, { "epoch": 0.6120564569318032, "grad_norm": 1.0316932201385498, "learning_rate": 3.6591928251121075e-06, "loss": 0.0268, "step": 2320 }, { "epoch": 0.6125840918084685, "grad_norm": 0.26395493745803833, "learning_rate": 3.662358216829333e-06, "loss": 0.023, "step": 2322 }, { "epoch": 0.6131117266851339, "grad_norm": 0.9306442737579346, "learning_rate": 3.6655236085465577e-06, "loss": 0.0251, "step": 2324 }, { "epoch": 0.6136393615617992, "grad_norm": 0.18457600474357605, "learning_rate": 3.6686890002637826e-06, "loss": 0.021, "step": 2326 }, { "epoch": 0.6141669964384646, "grad_norm": 1.1213719844818115, "learning_rate": 3.671854391981008e-06, "loss": 0.024, "step": 2328 }, { "epoch": 0.6146946313151299, "grad_norm": 1.4903128147125244, "learning_rate": 3.675019783698233e-06, "loss": 0.0292, "step": 2330 }, { "epoch": 0.6152222661917953, "grad_norm": 0.28464746475219727, "learning_rate": 3.678185175415458e-06, "loss": 0.024, "step": 2332 }, { "epoch": 0.6157499010684606, "grad_norm": 0.9961441159248352, "learning_rate": 3.681350567132683e-06, "loss": 0.0359, "step": 2334 }, { "epoch": 0.616277535945126, "grad_norm": 0.3163481652736664, "learning_rate": 3.6845159588499075e-06, "loss": 0.0194, "step": 2336 }, { "epoch": 0.6168051708217913, "grad_norm": 1.3960694074630737, "learning_rate": 3.6876813505671328e-06, "loss": 0.0263, "step": 2338 }, { "epoch": 0.6173328056984567, "grad_norm": 0.7215791940689087, "learning_rate": 3.6908467422843577e-06, "loss": 0.0189, "step": 2340 }, { "epoch": 0.617860440575122, "grad_norm": 0.5767099261283875, "learning_rate": 3.694012134001583e-06, "loss": 0.0371, "step": 2342 }, { "epoch": 0.6183880754517873, "grad_norm": 2.5413200855255127, "learning_rate": 3.697177525718808e-06, "loss": 0.0212, "step": 2344 }, { "epoch": 0.6189157103284527, "grad_norm": 0.0991898626089096, "learning_rate": 3.7003429174360328e-06, "loss": 0.0196, "step": 2346 }, { "epoch": 0.619443345205118, "grad_norm": 0.9494491219520569, "learning_rate": 3.703508309153258e-06, "loss": 0.0204, "step": 2348 }, { "epoch": 0.6199709800817834, "grad_norm": 0.238158717751503, "learning_rate": 3.7066737008704825e-06, "loss": 0.023, "step": 2350 }, { "epoch": 0.6204986149584487, "grad_norm": 0.5340887904167175, "learning_rate": 3.7098390925877083e-06, "loss": 0.0276, "step": 2352 }, { "epoch": 0.6210262498351141, "grad_norm": 0.5883889198303223, "learning_rate": 3.7130044843049327e-06, "loss": 0.0219, "step": 2354 }, { "epoch": 0.6215538847117794, "grad_norm": 1.3821057081222534, "learning_rate": 3.7161698760221576e-06, "loss": 0.0326, "step": 2356 }, { "epoch": 0.6220815195884448, "grad_norm": 0.6724366545677185, "learning_rate": 3.719335267739383e-06, "loss": 0.0228, "step": 2358 }, { "epoch": 0.6226091544651101, "grad_norm": 0.2332792431116104, "learning_rate": 3.722500659456608e-06, "loss": 0.0309, "step": 2360 }, { "epoch": 0.6231367893417755, "grad_norm": 0.8632251024246216, "learning_rate": 3.725666051173833e-06, "loss": 0.0278, "step": 2362 }, { "epoch": 0.6236644242184408, "grad_norm": 0.22078560292720795, "learning_rate": 3.728831442891058e-06, "loss": 0.0168, "step": 2364 }, { "epoch": 0.6241920590951062, "grad_norm": 0.8707985877990723, "learning_rate": 3.7319968346082825e-06, "loss": 0.0274, "step": 2366 }, { "epoch": 0.6247196939717715, "grad_norm": 0.951934278011322, "learning_rate": 3.7351622263255082e-06, "loss": 0.0348, "step": 2368 }, { "epoch": 0.6252473288484369, "grad_norm": 0.37927481532096863, "learning_rate": 3.7383276180427327e-06, "loss": 0.0305, "step": 2370 }, { "epoch": 0.6257749637251022, "grad_norm": 0.36250901222229004, "learning_rate": 3.7414930097599576e-06, "loss": 0.0182, "step": 2372 }, { "epoch": 0.6263025986017676, "grad_norm": 0.648486316204071, "learning_rate": 3.744658401477183e-06, "loss": 0.0304, "step": 2374 }, { "epoch": 0.6268302334784329, "grad_norm": 0.29201140999794006, "learning_rate": 3.7478237931944078e-06, "loss": 0.0301, "step": 2376 }, { "epoch": 0.6273578683550983, "grad_norm": 0.6512871384620667, "learning_rate": 3.750989184911633e-06, "loss": 0.0238, "step": 2378 }, { "epoch": 0.6278855032317636, "grad_norm": 0.1252242624759674, "learning_rate": 3.754154576628858e-06, "loss": 0.016, "step": 2380 }, { "epoch": 0.6284131381084289, "grad_norm": 0.24713017046451569, "learning_rate": 3.7573199683460824e-06, "loss": 0.0281, "step": 2382 }, { "epoch": 0.6289407729850943, "grad_norm": 1.844192624092102, "learning_rate": 3.760485360063308e-06, "loss": 0.0348, "step": 2384 }, { "epoch": 0.6294684078617596, "grad_norm": 0.2699136435985565, "learning_rate": 3.7636507517805326e-06, "loss": 0.0181, "step": 2386 }, { "epoch": 0.629996042738425, "grad_norm": 0.5636822581291199, "learning_rate": 3.766816143497758e-06, "loss": 0.0235, "step": 2388 }, { "epoch": 0.6305236776150903, "grad_norm": 0.41984957456588745, "learning_rate": 3.769981535214983e-06, "loss": 0.0179, "step": 2390 }, { "epoch": 0.6310513124917557, "grad_norm": 0.9081361889839172, "learning_rate": 3.7731469269322077e-06, "loss": 0.0198, "step": 2392 }, { "epoch": 0.631578947368421, "grad_norm": 1.3224124908447266, "learning_rate": 3.776312318649433e-06, "loss": 0.0549, "step": 2394 }, { "epoch": 0.6321065822450864, "grad_norm": 1.9111706018447876, "learning_rate": 3.779477710366658e-06, "loss": 0.0182, "step": 2396 }, { "epoch": 0.6326342171217517, "grad_norm": 0.17405417561531067, "learning_rate": 3.7826431020838832e-06, "loss": 0.0222, "step": 2398 }, { "epoch": 0.6331618519984171, "grad_norm": 1.265212893486023, "learning_rate": 3.785808493801108e-06, "loss": 0.0294, "step": 2400 }, { "epoch": 0.6336894868750824, "grad_norm": 0.3393470048904419, "learning_rate": 3.7889738855183326e-06, "loss": 0.0234, "step": 2402 }, { "epoch": 0.6342171217517478, "grad_norm": 1.8103083372116089, "learning_rate": 3.792139277235558e-06, "loss": 0.0282, "step": 2404 }, { "epoch": 0.6347447566284131, "grad_norm": 0.4822346270084381, "learning_rate": 3.7953046689527828e-06, "loss": 0.0205, "step": 2406 }, { "epoch": 0.6352723915050785, "grad_norm": 0.6791880130767822, "learning_rate": 3.798470060670008e-06, "loss": 0.0265, "step": 2408 }, { "epoch": 0.6358000263817438, "grad_norm": 0.6768174171447754, "learning_rate": 3.801635452387233e-06, "loss": 0.0315, "step": 2410 }, { "epoch": 0.6363276612584092, "grad_norm": 0.6634063720703125, "learning_rate": 3.804800844104458e-06, "loss": 0.0285, "step": 2412 }, { "epoch": 0.6368552961350745, "grad_norm": 1.3138829469680786, "learning_rate": 3.807966235821683e-06, "loss": 0.0193, "step": 2414 }, { "epoch": 0.6373829310117399, "grad_norm": 0.35126107931137085, "learning_rate": 3.811131627538908e-06, "loss": 0.0191, "step": 2416 }, { "epoch": 0.6379105658884052, "grad_norm": 0.17949660122394562, "learning_rate": 3.8142970192561334e-06, "loss": 0.0295, "step": 2418 }, { "epoch": 0.6384382007650705, "grad_norm": 0.3899555504322052, "learning_rate": 3.817462410973358e-06, "loss": 0.031, "step": 2420 }, { "epoch": 0.6389658356417359, "grad_norm": 0.335763543844223, "learning_rate": 3.820627802690583e-06, "loss": 0.0168, "step": 2422 }, { "epoch": 0.6394934705184012, "grad_norm": 0.09910460561513901, "learning_rate": 3.8237931944078085e-06, "loss": 0.0187, "step": 2424 }, { "epoch": 0.6400211053950666, "grad_norm": 0.3106381893157959, "learning_rate": 3.8269585861250325e-06, "loss": 0.0324, "step": 2426 }, { "epoch": 0.6405487402717319, "grad_norm": 0.5558140873908997, "learning_rate": 3.830123977842258e-06, "loss": 0.0291, "step": 2428 }, { "epoch": 0.6410763751483973, "grad_norm": 0.5240744352340698, "learning_rate": 3.833289369559483e-06, "loss": 0.0256, "step": 2430 }, { "epoch": 0.6416040100250626, "grad_norm": 0.5743783712387085, "learning_rate": 3.836454761276708e-06, "loss": 0.0187, "step": 2432 }, { "epoch": 0.642131644901728, "grad_norm": 0.4711568057537079, "learning_rate": 3.839620152993933e-06, "loss": 0.0242, "step": 2434 }, { "epoch": 0.6426592797783933, "grad_norm": 0.42669713497161865, "learning_rate": 3.842785544711158e-06, "loss": 0.0204, "step": 2436 }, { "epoch": 0.6431869146550587, "grad_norm": 1.9163533449172974, "learning_rate": 3.8459509364283835e-06, "loss": 0.0196, "step": 2438 }, { "epoch": 0.643714549531724, "grad_norm": 0.7213656306266785, "learning_rate": 3.849116328145608e-06, "loss": 0.0317, "step": 2440 }, { "epoch": 0.6442421844083894, "grad_norm": 0.8064097762107849, "learning_rate": 3.8522817198628325e-06, "loss": 0.0258, "step": 2442 }, { "epoch": 0.6447698192850547, "grad_norm": 1.3939565420150757, "learning_rate": 3.855447111580058e-06, "loss": 0.0254, "step": 2444 }, { "epoch": 0.6452974541617201, "grad_norm": 0.2356719970703125, "learning_rate": 3.858612503297283e-06, "loss": 0.0268, "step": 2446 }, { "epoch": 0.6458250890383854, "grad_norm": 0.19240041077136993, "learning_rate": 3.861777895014509e-06, "loss": 0.0185, "step": 2448 }, { "epoch": 0.6463527239150508, "grad_norm": 0.1421094536781311, "learning_rate": 3.864943286731733e-06, "loss": 0.0144, "step": 2450 }, { "epoch": 0.6468803587917161, "grad_norm": 0.9229819178581238, "learning_rate": 3.868108678448958e-06, "loss": 0.028, "step": 2452 }, { "epoch": 0.6474079936683815, "grad_norm": 0.5974205136299133, "learning_rate": 3.8712740701661835e-06, "loss": 0.0278, "step": 2454 }, { "epoch": 0.6479356285450468, "grad_norm": 0.8437492251396179, "learning_rate": 3.874439461883408e-06, "loss": 0.0212, "step": 2456 }, { "epoch": 0.6484632634217121, "grad_norm": 0.6167869567871094, "learning_rate": 3.877604853600633e-06, "loss": 0.0253, "step": 2458 }, { "epoch": 0.6489908982983775, "grad_norm": 0.7721688747406006, "learning_rate": 3.880770245317858e-06, "loss": 0.0205, "step": 2460 }, { "epoch": 0.6495185331750428, "grad_norm": 0.7127137184143066, "learning_rate": 3.883935637035083e-06, "loss": 0.0362, "step": 2462 }, { "epoch": 0.6500461680517082, "grad_norm": 1.6623831987380981, "learning_rate": 3.887101028752308e-06, "loss": 0.018, "step": 2464 }, { "epoch": 0.6505738029283735, "grad_norm": 0.38348087668418884, "learning_rate": 3.890266420469533e-06, "loss": 0.0214, "step": 2466 }, { "epoch": 0.6511014378050389, "grad_norm": 0.4459100067615509, "learning_rate": 3.8934318121867585e-06, "loss": 0.0193, "step": 2468 }, { "epoch": 0.6516290726817042, "grad_norm": 1.1078205108642578, "learning_rate": 3.8965972039039834e-06, "loss": 0.02, "step": 2470 }, { "epoch": 0.6521567075583696, "grad_norm": 0.5625520348548889, "learning_rate": 3.899762595621208e-06, "loss": 0.018, "step": 2472 }, { "epoch": 0.6526843424350349, "grad_norm": 1.1853196620941162, "learning_rate": 3.902927987338433e-06, "loss": 0.0247, "step": 2474 }, { "epoch": 0.6532119773117003, "grad_norm": 0.43143653869628906, "learning_rate": 3.906093379055658e-06, "loss": 0.0251, "step": 2476 }, { "epoch": 0.6537396121883656, "grad_norm": 1.174720048904419, "learning_rate": 3.909258770772884e-06, "loss": 0.0268, "step": 2478 }, { "epoch": 0.654267247065031, "grad_norm": 0.3697716295719147, "learning_rate": 3.912424162490108e-06, "loss": 0.0192, "step": 2480 }, { "epoch": 0.6547948819416963, "grad_norm": 1.8201485872268677, "learning_rate": 3.915589554207333e-06, "loss": 0.021, "step": 2482 }, { "epoch": 0.6553225168183617, "grad_norm": 0.1990124136209488, "learning_rate": 3.9187549459245585e-06, "loss": 0.0234, "step": 2484 }, { "epoch": 0.655850151695027, "grad_norm": 0.388771653175354, "learning_rate": 3.921920337641783e-06, "loss": 0.0184, "step": 2486 }, { "epoch": 0.6563777865716924, "grad_norm": 0.24014493823051453, "learning_rate": 3.925085729359008e-06, "loss": 0.0174, "step": 2488 }, { "epoch": 0.6569054214483577, "grad_norm": 0.30973732471466064, "learning_rate": 3.928251121076233e-06, "loss": 0.022, "step": 2490 }, { "epoch": 0.6574330563250231, "grad_norm": 0.7552892565727234, "learning_rate": 3.931416512793458e-06, "loss": 0.029, "step": 2492 }, { "epoch": 0.6579606912016884, "grad_norm": 0.12762950360774994, "learning_rate": 3.934581904510684e-06, "loss": 0.0142, "step": 2494 }, { "epoch": 0.6584883260783537, "grad_norm": 0.46605101227760315, "learning_rate": 3.937747296227908e-06, "loss": 0.0215, "step": 2496 }, { "epoch": 0.6590159609550191, "grad_norm": 0.26892226934432983, "learning_rate": 3.9409126879451336e-06, "loss": 0.0172, "step": 2498 }, { "epoch": 0.6595435958316844, "grad_norm": 0.8034265041351318, "learning_rate": 3.9440780796623584e-06, "loss": 0.0198, "step": 2500 }, { "epoch": 0.6600712307083498, "grad_norm": 2.334217071533203, "learning_rate": 3.947243471379583e-06, "loss": 0.018, "step": 2502 }, { "epoch": 0.6605988655850151, "grad_norm": 0.4314807951450348, "learning_rate": 3.950408863096808e-06, "loss": 0.0241, "step": 2504 }, { "epoch": 0.6611265004616805, "grad_norm": 0.4469304084777832, "learning_rate": 3.953574254814033e-06, "loss": 0.0202, "step": 2506 }, { "epoch": 0.6616541353383458, "grad_norm": 0.2107272446155548, "learning_rate": 3.956739646531259e-06, "loss": 0.025, "step": 2508 }, { "epoch": 0.6621817702150112, "grad_norm": 0.22971351444721222, "learning_rate": 3.959905038248484e-06, "loss": 0.0265, "step": 2510 }, { "epoch": 0.6627094050916765, "grad_norm": 1.090361475944519, "learning_rate": 3.963070429965708e-06, "loss": 0.0342, "step": 2512 }, { "epoch": 0.6632370399683419, "grad_norm": 0.643498957157135, "learning_rate": 3.9662358216829335e-06, "loss": 0.0233, "step": 2514 }, { "epoch": 0.6637646748450072, "grad_norm": 0.236141175031662, "learning_rate": 3.969401213400158e-06, "loss": 0.0231, "step": 2516 }, { "epoch": 0.6642923097216726, "grad_norm": 0.3221989870071411, "learning_rate": 3.972566605117383e-06, "loss": 0.0212, "step": 2518 }, { "epoch": 0.6648199445983379, "grad_norm": 0.8680428862571716, "learning_rate": 3.975731996834608e-06, "loss": 0.0268, "step": 2520 }, { "epoch": 0.6653475794750033, "grad_norm": 0.3660508394241333, "learning_rate": 3.978897388551833e-06, "loss": 0.0191, "step": 2522 }, { "epoch": 0.6658752143516686, "grad_norm": 0.6108194589614868, "learning_rate": 3.982062780269059e-06, "loss": 0.0222, "step": 2524 }, { "epoch": 0.666402849228334, "grad_norm": 0.7233983278274536, "learning_rate": 3.985228171986284e-06, "loss": 0.0222, "step": 2526 }, { "epoch": 0.6669304841049993, "grad_norm": 0.3497779667377472, "learning_rate": 3.9883935637035086e-06, "loss": 0.0281, "step": 2528 }, { "epoch": 0.6674581189816647, "grad_norm": 0.23925979435443878, "learning_rate": 3.9915589554207335e-06, "loss": 0.0166, "step": 2530 }, { "epoch": 0.66798575385833, "grad_norm": 3.270691156387329, "learning_rate": 3.994724347137958e-06, "loss": 0.025, "step": 2532 }, { "epoch": 0.6685133887349953, "grad_norm": 0.45185238122940063, "learning_rate": 3.997889738855183e-06, "loss": 0.0182, "step": 2534 }, { "epoch": 0.6690410236116607, "grad_norm": 0.12252955138683319, "learning_rate": 4.001055130572408e-06, "loss": 0.0195, "step": 2536 }, { "epoch": 0.669568658488326, "grad_norm": 0.5042688846588135, "learning_rate": 4.004220522289634e-06, "loss": 0.0231, "step": 2538 }, { "epoch": 0.6700962933649914, "grad_norm": 0.5034874677658081, "learning_rate": 4.007385914006859e-06, "loss": 0.0294, "step": 2540 }, { "epoch": 0.6706239282416567, "grad_norm": 1.6882213354110718, "learning_rate": 4.010551305724084e-06, "loss": 0.0239, "step": 2542 }, { "epoch": 0.6711515631183221, "grad_norm": 0.978559672832489, "learning_rate": 4.0137166974413085e-06, "loss": 0.0227, "step": 2544 }, { "epoch": 0.6716791979949874, "grad_norm": 0.19490653276443481, "learning_rate": 4.016882089158533e-06, "loss": 0.0145, "step": 2546 }, { "epoch": 0.6722068328716528, "grad_norm": 1.8783458471298218, "learning_rate": 4.020047480875759e-06, "loss": 0.0248, "step": 2548 }, { "epoch": 0.6727344677483181, "grad_norm": 0.17617398500442505, "learning_rate": 4.023212872592983e-06, "loss": 0.014, "step": 2550 }, { "epoch": 0.6732621026249835, "grad_norm": 0.9396033883094788, "learning_rate": 4.026378264310208e-06, "loss": 0.0325, "step": 2552 }, { "epoch": 0.6737897375016488, "grad_norm": 0.36595192551612854, "learning_rate": 4.029543656027434e-06, "loss": 0.0209, "step": 2554 }, { "epoch": 0.6743173723783142, "grad_norm": 0.5004674196243286, "learning_rate": 4.032709047744659e-06, "loss": 0.0219, "step": 2556 }, { "epoch": 0.6748450072549795, "grad_norm": 1.0235081911087036, "learning_rate": 4.0358744394618836e-06, "loss": 0.0202, "step": 2558 }, { "epoch": 0.6753726421316449, "grad_norm": 0.9894943833351135, "learning_rate": 4.0390398311791085e-06, "loss": 0.026, "step": 2560 }, { "epoch": 0.6759002770083102, "grad_norm": 0.7630473375320435, "learning_rate": 4.042205222896333e-06, "loss": 0.0205, "step": 2562 }, { "epoch": 0.6764279118849756, "grad_norm": 0.2936755120754242, "learning_rate": 4.045370614613559e-06, "loss": 0.0206, "step": 2564 }, { "epoch": 0.6769555467616409, "grad_norm": 0.36845967173576355, "learning_rate": 4.048536006330783e-06, "loss": 0.0198, "step": 2566 }, { "epoch": 0.6774831816383063, "grad_norm": 0.25058528780937195, "learning_rate": 4.051701398048009e-06, "loss": 0.0174, "step": 2568 }, { "epoch": 0.6780108165149716, "grad_norm": 0.33731940388679504, "learning_rate": 4.054866789765234e-06, "loss": 0.017, "step": 2570 }, { "epoch": 0.6785384513916369, "grad_norm": 1.8289517164230347, "learning_rate": 4.058032181482459e-06, "loss": 0.0183, "step": 2572 }, { "epoch": 0.6790660862683023, "grad_norm": 0.2757560610771179, "learning_rate": 4.0611975731996835e-06, "loss": 0.0192, "step": 2574 }, { "epoch": 0.6795937211449676, "grad_norm": 0.30896684527397156, "learning_rate": 4.064362964916908e-06, "loss": 0.015, "step": 2576 }, { "epoch": 0.680121356021633, "grad_norm": 0.2428664118051529, "learning_rate": 4.067528356634133e-06, "loss": 0.0162, "step": 2578 }, { "epoch": 0.6806489908982983, "grad_norm": 1.5549787282943726, "learning_rate": 4.070693748351359e-06, "loss": 0.0234, "step": 2580 }, { "epoch": 0.6811766257749637, "grad_norm": 0.14385928213596344, "learning_rate": 4.073859140068583e-06, "loss": 0.0202, "step": 2582 }, { "epoch": 0.681704260651629, "grad_norm": 2.188333511352539, "learning_rate": 4.077024531785809e-06, "loss": 0.0177, "step": 2584 }, { "epoch": 0.6822318955282944, "grad_norm": 0.3938981294631958, "learning_rate": 4.080189923503034e-06, "loss": 0.0317, "step": 2586 }, { "epoch": 0.6827595304049597, "grad_norm": 0.11234832555055618, "learning_rate": 4.083355315220259e-06, "loss": 0.0151, "step": 2588 }, { "epoch": 0.6832871652816251, "grad_norm": 0.11082098633050919, "learning_rate": 4.0865207069374835e-06, "loss": 0.0133, "step": 2590 }, { "epoch": 0.6838148001582904, "grad_norm": 0.4701370298862457, "learning_rate": 4.089686098654708e-06, "loss": 0.0175, "step": 2592 }, { "epoch": 0.6843424350349558, "grad_norm": 0.6293345093727112, "learning_rate": 4.092851490371934e-06, "loss": 0.0174, "step": 2594 }, { "epoch": 0.6848700699116211, "grad_norm": 0.19789668917655945, "learning_rate": 4.096016882089159e-06, "loss": 0.0151, "step": 2596 }, { "epoch": 0.6853977047882865, "grad_norm": 0.32720381021499634, "learning_rate": 4.099182273806383e-06, "loss": 0.0137, "step": 2598 }, { "epoch": 0.6859253396649518, "grad_norm": 0.4510430097579956, "learning_rate": 4.102347665523609e-06, "loss": 0.0178, "step": 2600 }, { "epoch": 0.6864529745416172, "grad_norm": 0.6687141060829163, "learning_rate": 4.105513057240834e-06, "loss": 0.0175, "step": 2602 }, { "epoch": 0.6869806094182825, "grad_norm": 0.15476059913635254, "learning_rate": 4.1086784489580585e-06, "loss": 0.0186, "step": 2604 }, { "epoch": 0.687508244294948, "grad_norm": 0.14893430471420288, "learning_rate": 4.111843840675283e-06, "loss": 0.0134, "step": 2606 }, { "epoch": 0.6880358791716132, "grad_norm": 0.4300229847431183, "learning_rate": 4.115009232392508e-06, "loss": 0.0211, "step": 2608 }, { "epoch": 0.6885635140482786, "grad_norm": 0.15237262845039368, "learning_rate": 4.118174624109734e-06, "loss": 0.0132, "step": 2610 }, { "epoch": 0.6890911489249439, "grad_norm": 0.537898600101471, "learning_rate": 4.121340015826959e-06, "loss": 0.0138, "step": 2612 }, { "epoch": 0.6896187838016092, "grad_norm": 0.2646832764148712, "learning_rate": 4.124505407544184e-06, "loss": 0.0266, "step": 2614 }, { "epoch": 0.6901464186782746, "grad_norm": 0.5507882833480835, "learning_rate": 4.127670799261409e-06, "loss": 0.0332, "step": 2616 }, { "epoch": 0.6906740535549399, "grad_norm": 0.455045223236084, "learning_rate": 4.130836190978634e-06, "loss": 0.0134, "step": 2618 }, { "epoch": 0.6912016884316053, "grad_norm": 0.3004356920719147, "learning_rate": 4.1340015826958585e-06, "loss": 0.0205, "step": 2620 }, { "epoch": 0.6917293233082706, "grad_norm": 0.2483050525188446, "learning_rate": 4.137166974413083e-06, "loss": 0.0199, "step": 2622 }, { "epoch": 0.692256958184936, "grad_norm": 0.16244125366210938, "learning_rate": 4.140332366130309e-06, "loss": 0.0145, "step": 2624 }, { "epoch": 0.6927845930616013, "grad_norm": 0.4492971897125244, "learning_rate": 4.143497757847534e-06, "loss": 0.0199, "step": 2626 }, { "epoch": 0.6933122279382667, "grad_norm": 1.373543620109558, "learning_rate": 4.146663149564758e-06, "loss": 0.0182, "step": 2628 }, { "epoch": 0.693839862814932, "grad_norm": 0.45583996176719666, "learning_rate": 4.149828541281984e-06, "loss": 0.0177, "step": 2630 }, { "epoch": 0.6943674976915974, "grad_norm": 0.7154488563537598, "learning_rate": 4.152993932999209e-06, "loss": 0.0203, "step": 2632 }, { "epoch": 0.6948951325682627, "grad_norm": 0.4879291355609894, "learning_rate": 4.156159324716434e-06, "loss": 0.0168, "step": 2634 }, { "epoch": 0.6954227674449281, "grad_norm": 0.12157629430294037, "learning_rate": 4.1593247164336584e-06, "loss": 0.0187, "step": 2636 }, { "epoch": 0.6959504023215934, "grad_norm": 0.14414167404174805, "learning_rate": 4.162490108150883e-06, "loss": 0.0165, "step": 2638 }, { "epoch": 0.6964780371982588, "grad_norm": 0.8872300386428833, "learning_rate": 4.165655499868109e-06, "loss": 0.0178, "step": 2640 }, { "epoch": 0.6970056720749241, "grad_norm": 0.14403760433197021, "learning_rate": 4.168820891585334e-06, "loss": 0.0187, "step": 2642 }, { "epoch": 0.6975333069515895, "grad_norm": 0.7547650933265686, "learning_rate": 4.171986283302559e-06, "loss": 0.0205, "step": 2644 }, { "epoch": 0.6980609418282548, "grad_norm": 1.8857234716415405, "learning_rate": 4.175151675019784e-06, "loss": 0.0258, "step": 2646 }, { "epoch": 0.6985885767049202, "grad_norm": 0.263934850692749, "learning_rate": 4.178317066737009e-06, "loss": 0.0237, "step": 2648 }, { "epoch": 0.6991162115815855, "grad_norm": 0.7084464430809021, "learning_rate": 4.181482458454234e-06, "loss": 0.0258, "step": 2650 }, { "epoch": 0.6996438464582508, "grad_norm": 0.31090474128723145, "learning_rate": 4.184647850171458e-06, "loss": 0.0178, "step": 2652 }, { "epoch": 0.7001714813349162, "grad_norm": 2.069028854370117, "learning_rate": 4.187813241888684e-06, "loss": 0.0187, "step": 2654 }, { "epoch": 0.7006991162115815, "grad_norm": 0.5228036046028137, "learning_rate": 4.190978633605909e-06, "loss": 0.0185, "step": 2656 }, { "epoch": 0.701226751088247, "grad_norm": 0.614904522895813, "learning_rate": 4.194144025323134e-06, "loss": 0.0182, "step": 2658 }, { "epoch": 0.7017543859649122, "grad_norm": 0.6052983403205872, "learning_rate": 4.197309417040359e-06, "loss": 0.0189, "step": 2660 }, { "epoch": 0.7022820208415776, "grad_norm": 0.6486141681671143, "learning_rate": 4.200474808757584e-06, "loss": 0.0131, "step": 2662 }, { "epoch": 0.7028096557182429, "grad_norm": 0.4250546395778656, "learning_rate": 4.203640200474809e-06, "loss": 0.0255, "step": 2664 }, { "epoch": 0.7033372905949083, "grad_norm": 0.9957100749015808, "learning_rate": 4.2068055921920334e-06, "loss": 0.0192, "step": 2666 }, { "epoch": 0.7038649254715736, "grad_norm": 0.47679847478866577, "learning_rate": 4.209970983909258e-06, "loss": 0.0214, "step": 2668 }, { "epoch": 0.704392560348239, "grad_norm": 1.5114938020706177, "learning_rate": 4.213136375626484e-06, "loss": 0.0187, "step": 2670 }, { "epoch": 0.7049201952249043, "grad_norm": 0.40410885214805603, "learning_rate": 4.216301767343709e-06, "loss": 0.0154, "step": 2672 }, { "epoch": 0.7054478301015698, "grad_norm": 0.30573251843452454, "learning_rate": 4.219467159060934e-06, "loss": 0.0135, "step": 2674 }, { "epoch": 0.705975464978235, "grad_norm": 0.8800173997879028, "learning_rate": 4.222632550778159e-06, "loss": 0.0229, "step": 2676 }, { "epoch": 0.7065030998549005, "grad_norm": 2.0134661197662354, "learning_rate": 4.225797942495384e-06, "loss": 0.0237, "step": 2678 }, { "epoch": 0.7070307347315657, "grad_norm": 0.17939506471157074, "learning_rate": 4.228963334212609e-06, "loss": 0.015, "step": 2680 }, { "epoch": 0.7075583696082312, "grad_norm": 1.5183531045913696, "learning_rate": 4.232128725929833e-06, "loss": 0.0259, "step": 2682 }, { "epoch": 0.7080860044848964, "grad_norm": 0.19224783778190613, "learning_rate": 4.235294117647059e-06, "loss": 0.0129, "step": 2684 }, { "epoch": 0.7086136393615619, "grad_norm": 1.259989619255066, "learning_rate": 4.238459509364284e-06, "loss": 0.0149, "step": 2686 }, { "epoch": 0.7091412742382271, "grad_norm": 0.5261680483818054, "learning_rate": 4.241624901081509e-06, "loss": 0.0177, "step": 2688 }, { "epoch": 0.7096689091148924, "grad_norm": 0.6510837078094482, "learning_rate": 4.244790292798734e-06, "loss": 0.0187, "step": 2690 }, { "epoch": 0.7101965439915578, "grad_norm": 0.618127167224884, "learning_rate": 4.247955684515959e-06, "loss": 0.0178, "step": 2692 }, { "epoch": 0.7107241788682231, "grad_norm": 0.4512351155281067, "learning_rate": 4.251121076233184e-06, "loss": 0.0175, "step": 2694 }, { "epoch": 0.7112518137448886, "grad_norm": 0.6487525701522827, "learning_rate": 4.254286467950409e-06, "loss": 0.0136, "step": 2696 }, { "epoch": 0.7117794486215538, "grad_norm": 0.49791041016578674, "learning_rate": 4.257451859667633e-06, "loss": 0.0126, "step": 2698 }, { "epoch": 0.7123070834982193, "grad_norm": 0.2168516218662262, "learning_rate": 4.260617251384859e-06, "loss": 0.0168, "step": 2700 }, { "epoch": 0.7128347183748845, "grad_norm": 0.145096555352211, "learning_rate": 4.263782643102084e-06, "loss": 0.0138, "step": 2702 }, { "epoch": 0.71336235325155, "grad_norm": 0.2621806859970093, "learning_rate": 4.26694803481931e-06, "loss": 0.0111, "step": 2704 }, { "epoch": 0.7138899881282152, "grad_norm": 1.1992698907852173, "learning_rate": 4.270113426536534e-06, "loss": 0.0135, "step": 2706 }, { "epoch": 0.7144176230048807, "grad_norm": 0.10347657650709152, "learning_rate": 4.273278818253759e-06, "loss": 0.0174, "step": 2708 }, { "epoch": 0.714945257881546, "grad_norm": 0.2245938926935196, "learning_rate": 4.276444209970984e-06, "loss": 0.0206, "step": 2710 }, { "epoch": 0.7154728927582114, "grad_norm": 0.38818803429603577, "learning_rate": 4.279609601688209e-06, "loss": 0.0135, "step": 2712 }, { "epoch": 0.7160005276348766, "grad_norm": 0.9899524450302124, "learning_rate": 4.282774993405434e-06, "loss": 0.0263, "step": 2714 }, { "epoch": 0.716528162511542, "grad_norm": 0.5929756164550781, "learning_rate": 4.285940385122659e-06, "loss": 0.0292, "step": 2716 }, { "epoch": 0.7170557973882073, "grad_norm": 0.2231491506099701, "learning_rate": 4.289105776839884e-06, "loss": 0.0153, "step": 2718 }, { "epoch": 0.7175834322648728, "grad_norm": 0.4044378697872162, "learning_rate": 4.292271168557109e-06, "loss": 0.0203, "step": 2720 }, { "epoch": 0.718111067141538, "grad_norm": 0.7408985495567322, "learning_rate": 4.295436560274334e-06, "loss": 0.0128, "step": 2722 }, { "epoch": 0.7186387020182035, "grad_norm": 1.0550377368927002, "learning_rate": 4.2986019519915594e-06, "loss": 0.0175, "step": 2724 }, { "epoch": 0.7191663368948688, "grad_norm": 0.19540096819400787, "learning_rate": 4.301767343708784e-06, "loss": 0.0125, "step": 2726 }, { "epoch": 0.719693971771534, "grad_norm": 1.093827486038208, "learning_rate": 4.304932735426009e-06, "loss": 0.0171, "step": 2728 }, { "epoch": 0.7202216066481995, "grad_norm": 0.39898014068603516, "learning_rate": 4.308098127143234e-06, "loss": 0.0214, "step": 2730 }, { "epoch": 0.7207492415248647, "grad_norm": 0.11559843271970749, "learning_rate": 4.311263518860459e-06, "loss": 0.0199, "step": 2732 }, { "epoch": 0.7212768764015302, "grad_norm": 1.3128612041473389, "learning_rate": 4.314428910577685e-06, "loss": 0.0362, "step": 2734 }, { "epoch": 0.7218045112781954, "grad_norm": 0.0742977038025856, "learning_rate": 4.317594302294909e-06, "loss": 0.0223, "step": 2736 }, { "epoch": 0.7223321461548609, "grad_norm": 0.4265320599079132, "learning_rate": 4.320759694012134e-06, "loss": 0.0175, "step": 2738 }, { "epoch": 0.7228597810315261, "grad_norm": 0.8606309294700623, "learning_rate": 4.323925085729359e-06, "loss": 0.0249, "step": 2740 }, { "epoch": 0.7233874159081916, "grad_norm": 0.4904651939868927, "learning_rate": 4.327090477446584e-06, "loss": 0.0177, "step": 2742 }, { "epoch": 0.7239150507848569, "grad_norm": 1.0046353340148926, "learning_rate": 4.330255869163809e-06, "loss": 0.0209, "step": 2744 }, { "epoch": 0.7244426856615223, "grad_norm": 0.6480415463447571, "learning_rate": 4.333421260881034e-06, "loss": 0.0154, "step": 2746 }, { "epoch": 0.7249703205381876, "grad_norm": 0.34565436840057373, "learning_rate": 4.336586652598259e-06, "loss": 0.017, "step": 2748 }, { "epoch": 0.725497955414853, "grad_norm": 0.5037415623664856, "learning_rate": 4.339752044315485e-06, "loss": 0.0162, "step": 2750 }, { "epoch": 0.7260255902915183, "grad_norm": 0.3616088330745697, "learning_rate": 4.342917436032709e-06, "loss": 0.0166, "step": 2752 }, { "epoch": 0.7265532251681837, "grad_norm": 0.07349856197834015, "learning_rate": 4.3460828277499344e-06, "loss": 0.0177, "step": 2754 }, { "epoch": 0.727080860044849, "grad_norm": 2.1587557792663574, "learning_rate": 4.349248219467159e-06, "loss": 0.0203, "step": 2756 }, { "epoch": 0.7276084949215144, "grad_norm": 1.448222279548645, "learning_rate": 4.352413611184384e-06, "loss": 0.0203, "step": 2758 }, { "epoch": 0.7281361297981797, "grad_norm": 0.5149561166763306, "learning_rate": 4.355579002901609e-06, "loss": 0.0201, "step": 2760 }, { "epoch": 0.7286637646748451, "grad_norm": 0.13492192327976227, "learning_rate": 4.358744394618834e-06, "loss": 0.0195, "step": 2762 }, { "epoch": 0.7291913995515104, "grad_norm": 0.07897324860095978, "learning_rate": 4.36190978633606e-06, "loss": 0.0185, "step": 2764 }, { "epoch": 0.7297190344281757, "grad_norm": 0.13632380962371826, "learning_rate": 4.365075178053285e-06, "loss": 0.0114, "step": 2766 }, { "epoch": 0.7302466693048411, "grad_norm": 2.0502781867980957, "learning_rate": 4.368240569770509e-06, "loss": 0.0236, "step": 2768 }, { "epoch": 0.7307743041815064, "grad_norm": 0.7041348218917847, "learning_rate": 4.371405961487734e-06, "loss": 0.0288, "step": 2770 }, { "epoch": 0.7313019390581718, "grad_norm": 0.596674382686615, "learning_rate": 4.374571353204959e-06, "loss": 0.0148, "step": 2772 }, { "epoch": 0.731829573934837, "grad_norm": 0.244255930185318, "learning_rate": 4.377736744922184e-06, "loss": 0.0115, "step": 2774 }, { "epoch": 0.7323572088115025, "grad_norm": 0.3496476709842682, "learning_rate": 4.380902136639409e-06, "loss": 0.0156, "step": 2776 }, { "epoch": 0.7328848436881678, "grad_norm": 0.7407938838005066, "learning_rate": 4.384067528356634e-06, "loss": 0.0142, "step": 2778 }, { "epoch": 0.7334124785648332, "grad_norm": 0.35498589277267456, "learning_rate": 4.38723292007386e-06, "loss": 0.0188, "step": 2780 }, { "epoch": 0.7339401134414985, "grad_norm": 0.0816589966416359, "learning_rate": 4.3903983117910846e-06, "loss": 0.0177, "step": 2782 }, { "epoch": 0.7344677483181639, "grad_norm": 0.387824684381485, "learning_rate": 4.393563703508309e-06, "loss": 0.0155, "step": 2784 }, { "epoch": 0.7349953831948292, "grad_norm": 0.49979138374328613, "learning_rate": 4.396729095225534e-06, "loss": 0.0226, "step": 2786 }, { "epoch": 0.7355230180714946, "grad_norm": 0.5388631820678711, "learning_rate": 4.399894486942759e-06, "loss": 0.0241, "step": 2788 }, { "epoch": 0.7360506529481599, "grad_norm": 0.46202704310417175, "learning_rate": 4.403059878659984e-06, "loss": 0.0291, "step": 2790 }, { "epoch": 0.7365782878248253, "grad_norm": 1.1060441732406616, "learning_rate": 4.406225270377209e-06, "loss": 0.0164, "step": 2792 }, { "epoch": 0.7371059227014906, "grad_norm": 0.22927577793598175, "learning_rate": 4.409390662094434e-06, "loss": 0.0194, "step": 2794 }, { "epoch": 0.737633557578156, "grad_norm": 2.063310384750366, "learning_rate": 4.41255605381166e-06, "loss": 0.0164, "step": 2796 }, { "epoch": 0.7381611924548213, "grad_norm": 2.094332218170166, "learning_rate": 4.4157214455288845e-06, "loss": 0.0207, "step": 2798 }, { "epoch": 0.7386888273314867, "grad_norm": 0.7991780638694763, "learning_rate": 4.418886837246109e-06, "loss": 0.0126, "step": 2800 }, { "epoch": 0.739216462208152, "grad_norm": 0.36001285910606384, "learning_rate": 4.422052228963334e-06, "loss": 0.0153, "step": 2802 }, { "epoch": 0.7397440970848173, "grad_norm": 0.3792913854122162, "learning_rate": 4.425217620680559e-06, "loss": 0.0112, "step": 2804 }, { "epoch": 0.7402717319614827, "grad_norm": 0.0934685617685318, "learning_rate": 4.428383012397784e-06, "loss": 0.0218, "step": 2806 }, { "epoch": 0.740799366838148, "grad_norm": 0.1747591197490692, "learning_rate": 4.431548404115009e-06, "loss": 0.0172, "step": 2808 }, { "epoch": 0.7413270017148134, "grad_norm": 0.4185813367366791, "learning_rate": 4.434713795832235e-06, "loss": 0.0294, "step": 2810 }, { "epoch": 0.7418546365914787, "grad_norm": 0.13962164521217346, "learning_rate": 4.4378791875494596e-06, "loss": 0.017, "step": 2812 }, { "epoch": 0.7423822714681441, "grad_norm": 0.27756285667419434, "learning_rate": 4.4410445792666845e-06, "loss": 0.0118, "step": 2814 }, { "epoch": 0.7429099063448094, "grad_norm": 0.06531640142202377, "learning_rate": 4.444209970983909e-06, "loss": 0.0102, "step": 2816 }, { "epoch": 0.7434375412214748, "grad_norm": 0.1855584979057312, "learning_rate": 4.447375362701134e-06, "loss": 0.016, "step": 2818 }, { "epoch": 0.7439651760981401, "grad_norm": 1.0512547492980957, "learning_rate": 4.45054075441836e-06, "loss": 0.025, "step": 2820 }, { "epoch": 0.7444928109748055, "grad_norm": 1.0276983976364136, "learning_rate": 4.453706146135584e-06, "loss": 0.0168, "step": 2822 }, { "epoch": 0.7450204458514708, "grad_norm": 0.2451350837945938, "learning_rate": 4.456871537852809e-06, "loss": 0.0132, "step": 2824 }, { "epoch": 0.7455480807281362, "grad_norm": 1.1694729328155518, "learning_rate": 4.460036929570035e-06, "loss": 0.0197, "step": 2826 }, { "epoch": 0.7460757156048015, "grad_norm": 0.1432192325592041, "learning_rate": 4.4632023212872595e-06, "loss": 0.0182, "step": 2828 }, { "epoch": 0.7466033504814669, "grad_norm": 0.3720073401927948, "learning_rate": 4.466367713004484e-06, "loss": 0.0232, "step": 2830 }, { "epoch": 0.7471309853581322, "grad_norm": 0.311870813369751, "learning_rate": 4.469533104721709e-06, "loss": 0.0115, "step": 2832 }, { "epoch": 0.7476586202347976, "grad_norm": 0.5441110134124756, "learning_rate": 4.472698496438934e-06, "loss": 0.0136, "step": 2834 }, { "epoch": 0.7481862551114629, "grad_norm": 0.13443031907081604, "learning_rate": 4.47586388815616e-06, "loss": 0.0135, "step": 2836 }, { "epoch": 0.7487138899881283, "grad_norm": 0.2624175250530243, "learning_rate": 4.479029279873384e-06, "loss": 0.0119, "step": 2838 }, { "epoch": 0.7492415248647936, "grad_norm": 0.22427283227443695, "learning_rate": 4.48219467159061e-06, "loss": 0.0106, "step": 2840 }, { "epoch": 0.7497691597414589, "grad_norm": 0.29559269547462463, "learning_rate": 4.485360063307835e-06, "loss": 0.0115, "step": 2842 }, { "epoch": 0.7502967946181243, "grad_norm": 0.26229172945022583, "learning_rate": 4.4885254550250595e-06, "loss": 0.0126, "step": 2844 }, { "epoch": 0.7508244294947896, "grad_norm": 0.6288419365882874, "learning_rate": 4.491690846742284e-06, "loss": 0.0307, "step": 2846 }, { "epoch": 0.751352064371455, "grad_norm": 0.6901496648788452, "learning_rate": 4.494856238459509e-06, "loss": 0.0183, "step": 2848 }, { "epoch": 0.7518796992481203, "grad_norm": 0.14027273654937744, "learning_rate": 4.498021630176735e-06, "loss": 0.0131, "step": 2850 }, { "epoch": 0.7524073341247857, "grad_norm": 0.2747593820095062, "learning_rate": 4.50118702189396e-06, "loss": 0.0136, "step": 2852 }, { "epoch": 0.752934969001451, "grad_norm": 1.7848752737045288, "learning_rate": 4.504352413611184e-06, "loss": 0.0165, "step": 2854 }, { "epoch": 0.7534626038781164, "grad_norm": 0.5652771592140198, "learning_rate": 4.50751780532841e-06, "loss": 0.0152, "step": 2856 }, { "epoch": 0.7539902387547817, "grad_norm": 0.6071173548698425, "learning_rate": 4.5106831970456345e-06, "loss": 0.0104, "step": 2858 }, { "epoch": 0.7545178736314471, "grad_norm": 0.062218740582466125, "learning_rate": 4.513848588762859e-06, "loss": 0.0147, "step": 2860 }, { "epoch": 0.7550455085081124, "grad_norm": 0.31957560777664185, "learning_rate": 4.517013980480084e-06, "loss": 0.0133, "step": 2862 }, { "epoch": 0.7555731433847778, "grad_norm": 0.31391867995262146, "learning_rate": 4.520179372197309e-06, "loss": 0.0305, "step": 2864 }, { "epoch": 0.7561007782614431, "grad_norm": 0.3589886724948883, "learning_rate": 4.523344763914535e-06, "loss": 0.0178, "step": 2866 }, { "epoch": 0.7566284131381085, "grad_norm": 0.1567973792552948, "learning_rate": 4.52651015563176e-06, "loss": 0.0212, "step": 2868 }, { "epoch": 0.7571560480147738, "grad_norm": 1.7605550289154053, "learning_rate": 4.529675547348985e-06, "loss": 0.0141, "step": 2870 }, { "epoch": 0.7576836828914392, "grad_norm": 0.23587724566459656, "learning_rate": 4.53284093906621e-06, "loss": 0.0178, "step": 2872 }, { "epoch": 0.7582113177681045, "grad_norm": 1.1569936275482178, "learning_rate": 4.5360063307834345e-06, "loss": 0.0127, "step": 2874 }, { "epoch": 0.7587389526447699, "grad_norm": 0.5396696329116821, "learning_rate": 4.539171722500659e-06, "loss": 0.0125, "step": 2876 }, { "epoch": 0.7592665875214352, "grad_norm": 0.7576901316642761, "learning_rate": 4.542337114217884e-06, "loss": 0.0249, "step": 2878 }, { "epoch": 0.7597942223981005, "grad_norm": 0.9280470013618469, "learning_rate": 4.54550250593511e-06, "loss": 0.0216, "step": 2880 }, { "epoch": 0.7603218572747659, "grad_norm": 2.826911211013794, "learning_rate": 4.548667897652335e-06, "loss": 0.0197, "step": 2882 }, { "epoch": 0.7608494921514312, "grad_norm": 0.7008497714996338, "learning_rate": 4.551833289369559e-06, "loss": 0.0146, "step": 2884 }, { "epoch": 0.7613771270280966, "grad_norm": 0.34695741534233093, "learning_rate": 4.554998681086785e-06, "loss": 0.0172, "step": 2886 }, { "epoch": 0.7619047619047619, "grad_norm": 0.15650534629821777, "learning_rate": 4.5581640728040095e-06, "loss": 0.0124, "step": 2888 }, { "epoch": 0.7624323967814273, "grad_norm": 0.6110717058181763, "learning_rate": 4.561329464521235e-06, "loss": 0.0237, "step": 2890 }, { "epoch": 0.7629600316580926, "grad_norm": 0.08989797532558441, "learning_rate": 4.564494856238459e-06, "loss": 0.0148, "step": 2892 }, { "epoch": 0.763487666534758, "grad_norm": 1.1831153631210327, "learning_rate": 4.567660247955684e-06, "loss": 0.0248, "step": 2894 }, { "epoch": 0.7640153014114233, "grad_norm": 0.3896614909172058, "learning_rate": 4.57082563967291e-06, "loss": 0.01, "step": 2896 }, { "epoch": 0.7645429362880887, "grad_norm": 1.0562958717346191, "learning_rate": 4.573991031390135e-06, "loss": 0.015, "step": 2898 }, { "epoch": 0.765070571164754, "grad_norm": 0.30765923857688904, "learning_rate": 4.57715642310736e-06, "loss": 0.0231, "step": 2900 }, { "epoch": 0.7655982060414194, "grad_norm": 0.4979744553565979, "learning_rate": 4.580321814824585e-06, "loss": 0.0145, "step": 2902 }, { "epoch": 0.7661258409180847, "grad_norm": 0.2948472797870636, "learning_rate": 4.5834872065418095e-06, "loss": 0.0167, "step": 2904 }, { "epoch": 0.7666534757947501, "grad_norm": 0.09408403933048248, "learning_rate": 4.586652598259035e-06, "loss": 0.0099, "step": 2906 }, { "epoch": 0.7671811106714154, "grad_norm": 0.0798734799027443, "learning_rate": 4.589817989976259e-06, "loss": 0.0228, "step": 2908 }, { "epoch": 0.7677087455480808, "grad_norm": 0.4251658320426941, "learning_rate": 4.592983381693485e-06, "loss": 0.0155, "step": 2910 }, { "epoch": 0.7682363804247461, "grad_norm": 0.1381504237651825, "learning_rate": 4.59614877341071e-06, "loss": 0.0134, "step": 2912 }, { "epoch": 0.7687640153014115, "grad_norm": 0.5560098886489868, "learning_rate": 4.599314165127935e-06, "loss": 0.0206, "step": 2914 }, { "epoch": 0.7692916501780768, "grad_norm": 1.263564109802246, "learning_rate": 4.60247955684516e-06, "loss": 0.0216, "step": 2916 }, { "epoch": 0.7698192850547421, "grad_norm": 0.35682570934295654, "learning_rate": 4.6056449485623846e-06, "loss": 0.0126, "step": 2918 }, { "epoch": 0.7703469199314075, "grad_norm": 0.13601651787757874, "learning_rate": 4.60881034027961e-06, "loss": 0.0106, "step": 2920 }, { "epoch": 0.7708745548080728, "grad_norm": 0.3853400945663452, "learning_rate": 4.611975731996834e-06, "loss": 0.0143, "step": 2922 }, { "epoch": 0.7714021896847382, "grad_norm": 0.0914212241768837, "learning_rate": 4.615141123714059e-06, "loss": 0.0094, "step": 2924 }, { "epoch": 0.7719298245614035, "grad_norm": 0.24442338943481445, "learning_rate": 4.618306515431285e-06, "loss": 0.009, "step": 2926 }, { "epoch": 0.7724574594380689, "grad_norm": 0.16390761733055115, "learning_rate": 4.62147190714851e-06, "loss": 0.0153, "step": 2928 }, { "epoch": 0.7729850943147342, "grad_norm": 0.0804232582449913, "learning_rate": 4.624637298865735e-06, "loss": 0.0156, "step": 2930 }, { "epoch": 0.7735127291913996, "grad_norm": 0.33201536536216736, "learning_rate": 4.62780269058296e-06, "loss": 0.0172, "step": 2932 }, { "epoch": 0.7740403640680649, "grad_norm": 1.7494584321975708, "learning_rate": 4.6309680823001845e-06, "loss": 0.0114, "step": 2934 }, { "epoch": 0.7745679989447303, "grad_norm": 0.43806010484695435, "learning_rate": 4.63413347401741e-06, "loss": 0.0109, "step": 2936 }, { "epoch": 0.7750956338213956, "grad_norm": 0.5263921618461609, "learning_rate": 4.637298865734634e-06, "loss": 0.0142, "step": 2938 }, { "epoch": 0.775623268698061, "grad_norm": 0.37104788422584534, "learning_rate": 4.64046425745186e-06, "loss": 0.0136, "step": 2940 }, { "epoch": 0.7761509035747263, "grad_norm": 0.30164262652397156, "learning_rate": 4.643629649169085e-06, "loss": 0.0219, "step": 2942 }, { "epoch": 0.7766785384513917, "grad_norm": 0.18235008418560028, "learning_rate": 4.64679504088631e-06, "loss": 0.0155, "step": 2944 }, { "epoch": 0.777206173328057, "grad_norm": 1.0940406322479248, "learning_rate": 4.649960432603535e-06, "loss": 0.0168, "step": 2946 }, { "epoch": 0.7777338082047224, "grad_norm": 0.6847497820854187, "learning_rate": 4.6531258243207596e-06, "loss": 0.0191, "step": 2948 }, { "epoch": 0.7782614430813877, "grad_norm": 1.2705919742584229, "learning_rate": 4.656291216037985e-06, "loss": 0.0175, "step": 2950 }, { "epoch": 0.7787890779580531, "grad_norm": 0.15378057956695557, "learning_rate": 4.65945660775521e-06, "loss": 0.0231, "step": 2952 }, { "epoch": 0.7793167128347184, "grad_norm": 0.36698415875434875, "learning_rate": 4.662621999472434e-06, "loss": 0.0119, "step": 2954 }, { "epoch": 0.7798443477113837, "grad_norm": 0.5501545071601868, "learning_rate": 4.66578739118966e-06, "loss": 0.0186, "step": 2956 }, { "epoch": 0.7803719825880491, "grad_norm": 0.23204955458641052, "learning_rate": 4.668952782906885e-06, "loss": 0.0132, "step": 2958 }, { "epoch": 0.7808996174647144, "grad_norm": 2.005495548248291, "learning_rate": 4.672118174624111e-06, "loss": 0.0138, "step": 2960 }, { "epoch": 0.7814272523413798, "grad_norm": 0.1447867453098297, "learning_rate": 4.675283566341335e-06, "loss": 0.0129, "step": 2962 }, { "epoch": 0.7819548872180451, "grad_norm": 0.1986972987651825, "learning_rate": 4.6784489580585595e-06, "loss": 0.0206, "step": 2964 }, { "epoch": 0.7824825220947105, "grad_norm": 0.7350953817367554, "learning_rate": 4.681614349775785e-06, "loss": 0.0099, "step": 2966 }, { "epoch": 0.7830101569713758, "grad_norm": 0.30616337060928345, "learning_rate": 4.68477974149301e-06, "loss": 0.0353, "step": 2968 }, { "epoch": 0.7835377918480412, "grad_norm": 0.6835904121398926, "learning_rate": 4.687945133210235e-06, "loss": 0.0227, "step": 2970 }, { "epoch": 0.7840654267247065, "grad_norm": 0.21878664195537567, "learning_rate": 4.69111052492746e-06, "loss": 0.0129, "step": 2972 }, { "epoch": 0.7845930616013719, "grad_norm": 0.23605652153491974, "learning_rate": 4.694275916644685e-06, "loss": 0.0104, "step": 2974 }, { "epoch": 0.7851206964780372, "grad_norm": 0.07127878069877625, "learning_rate": 4.69744130836191e-06, "loss": 0.0136, "step": 2976 }, { "epoch": 0.7856483313547026, "grad_norm": 1.0166345834732056, "learning_rate": 4.700606700079135e-06, "loss": 0.0269, "step": 2978 }, { "epoch": 0.7861759662313679, "grad_norm": 0.8122791051864624, "learning_rate": 4.7037720917963595e-06, "loss": 0.0114, "step": 2980 }, { "epoch": 0.7867036011080333, "grad_norm": 0.7651612758636475, "learning_rate": 4.706937483513585e-06, "loss": 0.0133, "step": 2982 }, { "epoch": 0.7872312359846986, "grad_norm": 0.5582394599914551, "learning_rate": 4.71010287523081e-06, "loss": 0.0231, "step": 2984 }, { "epoch": 0.787758870861364, "grad_norm": 0.48365145921707153, "learning_rate": 4.713268266948035e-06, "loss": 0.0257, "step": 2986 }, { "epoch": 0.7882865057380293, "grad_norm": 0.2562965452671051, "learning_rate": 4.71643365866526e-06, "loss": 0.0097, "step": 2988 }, { "epoch": 0.7888141406146947, "grad_norm": 0.3612721562385559, "learning_rate": 4.719599050382485e-06, "loss": 0.0192, "step": 2990 }, { "epoch": 0.78934177549136, "grad_norm": 0.13296641409397125, "learning_rate": 4.72276444209971e-06, "loss": 0.0095, "step": 2992 }, { "epoch": 0.7898694103680254, "grad_norm": 0.2206280529499054, "learning_rate": 4.7259298338169345e-06, "loss": 0.0155, "step": 2994 }, { "epoch": 0.7903970452446907, "grad_norm": 0.07786241173744202, "learning_rate": 4.72909522553416e-06, "loss": 0.0108, "step": 2996 }, { "epoch": 0.790924680121356, "grad_norm": 0.5418732166290283, "learning_rate": 4.732260617251385e-06, "loss": 0.0151, "step": 2998 }, { "epoch": 0.7914523149980214, "grad_norm": 0.2260584980249405, "learning_rate": 4.73542600896861e-06, "loss": 0.0269, "step": 3000 }, { "epoch": 0.7919799498746867, "grad_norm": 0.09928394109010696, "learning_rate": 4.738591400685835e-06, "loss": 0.0084, "step": 3002 }, { "epoch": 0.7925075847513521, "grad_norm": 0.07079390436410904, "learning_rate": 4.74175679240306e-06, "loss": 0.0097, "step": 3004 }, { "epoch": 0.7930352196280174, "grad_norm": 0.4321292042732239, "learning_rate": 4.7449221841202855e-06, "loss": 0.0094, "step": 3006 }, { "epoch": 0.7935628545046828, "grad_norm": 0.053451187908649445, "learning_rate": 4.74808757583751e-06, "loss": 0.0077, "step": 3008 }, { "epoch": 0.7940904893813481, "grad_norm": 0.9034824967384338, "learning_rate": 4.7512529675547345e-06, "loss": 0.0243, "step": 3010 }, { "epoch": 0.7946181242580135, "grad_norm": 0.4614518880844116, "learning_rate": 4.75441835927196e-06, "loss": 0.01, "step": 3012 }, { "epoch": 0.7951457591346788, "grad_norm": 0.1350790113210678, "learning_rate": 4.757583750989185e-06, "loss": 0.0177, "step": 3014 }, { "epoch": 0.7956733940113442, "grad_norm": 0.6811172962188721, "learning_rate": 4.76074914270641e-06, "loss": 0.0172, "step": 3016 }, { "epoch": 0.7962010288880095, "grad_norm": 0.37572765350341797, "learning_rate": 4.763914534423635e-06, "loss": 0.0207, "step": 3018 }, { "epoch": 0.7967286637646749, "grad_norm": 0.38759568333625793, "learning_rate": 4.76707992614086e-06, "loss": 0.0104, "step": 3020 }, { "epoch": 0.7972562986413402, "grad_norm": 0.17738020420074463, "learning_rate": 4.7702453178580855e-06, "loss": 0.0094, "step": 3022 }, { "epoch": 0.7977839335180056, "grad_norm": 0.45300742983818054, "learning_rate": 4.7734107095753095e-06, "loss": 0.011, "step": 3024 }, { "epoch": 0.7983115683946709, "grad_norm": 0.08157246559858322, "learning_rate": 4.776576101292535e-06, "loss": 0.0081, "step": 3026 }, { "epoch": 0.7988392032713363, "grad_norm": 0.10925646871328354, "learning_rate": 4.77974149300976e-06, "loss": 0.0099, "step": 3028 }, { "epoch": 0.7993668381480016, "grad_norm": 0.15543323755264282, "learning_rate": 4.782906884726985e-06, "loss": 0.0114, "step": 3030 }, { "epoch": 0.799894473024667, "grad_norm": 0.06609004735946655, "learning_rate": 4.78607227644421e-06, "loss": 0.0138, "step": 3032 }, { "epoch": 0.8004221079013323, "grad_norm": 0.12275318056344986, "learning_rate": 4.789237668161435e-06, "loss": 0.0135, "step": 3034 }, { "epoch": 0.8009497427779976, "grad_norm": 0.9975630640983582, "learning_rate": 4.7924030598786606e-06, "loss": 0.0121, "step": 3036 }, { "epoch": 0.801477377654663, "grad_norm": 0.11846053600311279, "learning_rate": 4.7955684515958854e-06, "loss": 0.0205, "step": 3038 }, { "epoch": 0.8020050125313283, "grad_norm": 0.09677263349294662, "learning_rate": 4.7987338433131095e-06, "loss": 0.0213, "step": 3040 }, { "epoch": 0.8025326474079937, "grad_norm": 0.15526480972766876, "learning_rate": 4.801899235030335e-06, "loss": 0.0122, "step": 3042 }, { "epoch": 0.803060282284659, "grad_norm": 0.38546475768089294, "learning_rate": 4.80506462674756e-06, "loss": 0.0146, "step": 3044 }, { "epoch": 0.8035879171613244, "grad_norm": 0.5240786075592041, "learning_rate": 4.808230018464785e-06, "loss": 0.0151, "step": 3046 }, { "epoch": 0.8041155520379897, "grad_norm": 2.5582263469696045, "learning_rate": 4.81139541018201e-06, "loss": 0.0187, "step": 3048 }, { "epoch": 0.8046431869146551, "grad_norm": 0.08718177676200867, "learning_rate": 4.814560801899235e-06, "loss": 0.0085, "step": 3050 }, { "epoch": 0.8051708217913204, "grad_norm": 0.09406238049268723, "learning_rate": 4.8177261936164605e-06, "loss": 0.0086, "step": 3052 }, { "epoch": 0.8056984566679858, "grad_norm": 0.8522637486457825, "learning_rate": 4.820891585333685e-06, "loss": 0.0245, "step": 3054 }, { "epoch": 0.8062260915446511, "grad_norm": 1.8692662715911865, "learning_rate": 4.82405697705091e-06, "loss": 0.0201, "step": 3056 }, { "epoch": 0.8067537264213165, "grad_norm": 0.20576222240924835, "learning_rate": 4.827222368768135e-06, "loss": 0.0144, "step": 3058 }, { "epoch": 0.8072813612979818, "grad_norm": 0.6096840500831604, "learning_rate": 4.83038776048536e-06, "loss": 0.0143, "step": 3060 }, { "epoch": 0.8078089961746472, "grad_norm": 1.5236879587173462, "learning_rate": 4.833553152202585e-06, "loss": 0.0138, "step": 3062 }, { "epoch": 0.8083366310513125, "grad_norm": 1.4029396772384644, "learning_rate": 4.83671854391981e-06, "loss": 0.0137, "step": 3064 }, { "epoch": 0.8088642659279779, "grad_norm": 0.06379983574151993, "learning_rate": 4.8398839356370356e-06, "loss": 0.0079, "step": 3066 }, { "epoch": 0.8093919008046432, "grad_norm": 0.4317783713340759, "learning_rate": 4.8430493273542605e-06, "loss": 0.0129, "step": 3068 }, { "epoch": 0.8099195356813086, "grad_norm": 0.3889419138431549, "learning_rate": 4.846214719071485e-06, "loss": 0.0179, "step": 3070 }, { "epoch": 0.8104471705579739, "grad_norm": 0.23881329596042633, "learning_rate": 4.84938011078871e-06, "loss": 0.0115, "step": 3072 }, { "epoch": 0.8109748054346392, "grad_norm": 0.49063101410865784, "learning_rate": 4.852545502505935e-06, "loss": 0.015, "step": 3074 }, { "epoch": 0.8115024403113046, "grad_norm": 0.08687926083803177, "learning_rate": 4.855710894223161e-06, "loss": 0.0151, "step": 3076 }, { "epoch": 0.8120300751879699, "grad_norm": 0.8989101052284241, "learning_rate": 4.858876285940385e-06, "loss": 0.0085, "step": 3078 }, { "epoch": 0.8125577100646353, "grad_norm": 0.6208742260932922, "learning_rate": 4.86204167765761e-06, "loss": 0.0213, "step": 3080 }, { "epoch": 0.8130853449413006, "grad_norm": 0.2771821916103363, "learning_rate": 4.8652070693748355e-06, "loss": 0.0239, "step": 3082 }, { "epoch": 0.813612979817966, "grad_norm": 0.19522957503795624, "learning_rate": 4.86837246109206e-06, "loss": 0.0088, "step": 3084 }, { "epoch": 0.8141406146946313, "grad_norm": 0.11845764517784119, "learning_rate": 4.871537852809285e-06, "loss": 0.016, "step": 3086 }, { "epoch": 0.8146682495712967, "grad_norm": 0.3913410007953644, "learning_rate": 4.87470324452651e-06, "loss": 0.0217, "step": 3088 }, { "epoch": 0.815195884447962, "grad_norm": 0.1835995465517044, "learning_rate": 4.877868636243735e-06, "loss": 0.0083, "step": 3090 }, { "epoch": 0.8157235193246274, "grad_norm": 0.5953846573829651, "learning_rate": 4.881034027960961e-06, "loss": 0.0175, "step": 3092 }, { "epoch": 0.8162511542012927, "grad_norm": 0.07035313546657562, "learning_rate": 4.884199419678185e-06, "loss": 0.0179, "step": 3094 }, { "epoch": 0.8167787890779581, "grad_norm": 0.14374004304409027, "learning_rate": 4.887364811395411e-06, "loss": 0.0083, "step": 3096 }, { "epoch": 0.8173064239546234, "grad_norm": 0.10660360753536224, "learning_rate": 4.8905302031126355e-06, "loss": 0.0187, "step": 3098 }, { "epoch": 0.8178340588312888, "grad_norm": 0.08958137780427933, "learning_rate": 4.89369559482986e-06, "loss": 0.008, "step": 3100 }, { "epoch": 0.8183616937079541, "grad_norm": 1.5418074131011963, "learning_rate": 4.896860986547085e-06, "loss": 0.0182, "step": 3102 }, { "epoch": 0.8188893285846195, "grad_norm": 0.9721730947494507, "learning_rate": 4.90002637826431e-06, "loss": 0.0187, "step": 3104 }, { "epoch": 0.8194169634612848, "grad_norm": 0.4065903127193451, "learning_rate": 4.903191769981536e-06, "loss": 0.0153, "step": 3106 }, { "epoch": 0.8199445983379502, "grad_norm": 0.2312549203634262, "learning_rate": 4.906357161698761e-06, "loss": 0.0109, "step": 3108 }, { "epoch": 0.8204722332146155, "grad_norm": 0.11536154896020889, "learning_rate": 4.909522553415985e-06, "loss": 0.0146, "step": 3110 }, { "epoch": 0.8209998680912808, "grad_norm": 0.24379375576972961, "learning_rate": 4.9126879451332105e-06, "loss": 0.0086, "step": 3112 }, { "epoch": 0.8215275029679462, "grad_norm": 0.8899049162864685, "learning_rate": 4.915853336850435e-06, "loss": 0.0164, "step": 3114 }, { "epoch": 0.8220551378446115, "grad_norm": 0.24516168236732483, "learning_rate": 4.91901872856766e-06, "loss": 0.0111, "step": 3116 }, { "epoch": 0.8225827727212769, "grad_norm": 0.6459700465202332, "learning_rate": 4.922184120284885e-06, "loss": 0.0178, "step": 3118 }, { "epoch": 0.8231104075979422, "grad_norm": 1.5070396661758423, "learning_rate": 4.92534951200211e-06, "loss": 0.0112, "step": 3120 }, { "epoch": 0.8236380424746076, "grad_norm": 0.19304858148097992, "learning_rate": 4.928514903719336e-06, "loss": 0.0139, "step": 3122 }, { "epoch": 0.8241656773512729, "grad_norm": 1.466913104057312, "learning_rate": 4.931680295436561e-06, "loss": 0.0137, "step": 3124 }, { "epoch": 0.8246933122279383, "grad_norm": 0.10960590094327927, "learning_rate": 4.934845687153786e-06, "loss": 0.0087, "step": 3126 }, { "epoch": 0.8252209471046036, "grad_norm": 0.29351574182510376, "learning_rate": 4.9380110788710105e-06, "loss": 0.0158, "step": 3128 }, { "epoch": 0.825748581981269, "grad_norm": 2.106218099594116, "learning_rate": 4.941176470588235e-06, "loss": 0.0219, "step": 3130 }, { "epoch": 0.8262762168579343, "grad_norm": 0.35812029242515564, "learning_rate": 4.94434186230546e-06, "loss": 0.0079, "step": 3132 }, { "epoch": 0.8268038517345997, "grad_norm": 0.5276069045066833, "learning_rate": 4.947507254022685e-06, "loss": 0.009, "step": 3134 }, { "epoch": 0.827331486611265, "grad_norm": 0.5007577538490295, "learning_rate": 4.950672645739911e-06, "loss": 0.0156, "step": 3136 }, { "epoch": 0.8278591214879304, "grad_norm": 0.24602162837982178, "learning_rate": 4.953838037457136e-06, "loss": 0.0095, "step": 3138 }, { "epoch": 0.8283867563645957, "grad_norm": 0.2727855145931244, "learning_rate": 4.95700342917436e-06, "loss": 0.0173, "step": 3140 }, { "epoch": 0.8289143912412611, "grad_norm": 0.3603224456310272, "learning_rate": 4.9601688208915855e-06, "loss": 0.0131, "step": 3142 }, { "epoch": 0.8294420261179264, "grad_norm": 0.19935069978237152, "learning_rate": 4.9633342126088104e-06, "loss": 0.0102, "step": 3144 }, { "epoch": 0.8299696609945918, "grad_norm": 0.9300878643989563, "learning_rate": 4.966499604326036e-06, "loss": 0.013, "step": 3146 }, { "epoch": 0.8304972958712571, "grad_norm": 0.27380213141441345, "learning_rate": 4.96966499604326e-06, "loss": 0.0182, "step": 3148 }, { "epoch": 0.8310249307479224, "grad_norm": 0.0879579409956932, "learning_rate": 4.972830387760485e-06, "loss": 0.0075, "step": 3150 }, { "epoch": 0.8315525656245878, "grad_norm": 0.43250006437301636, "learning_rate": 4.975995779477711e-06, "loss": 0.0146, "step": 3152 }, { "epoch": 0.8320802005012531, "grad_norm": 0.06928907334804535, "learning_rate": 4.979161171194936e-06, "loss": 0.0153, "step": 3154 }, { "epoch": 0.8326078353779185, "grad_norm": 0.3872459828853607, "learning_rate": 4.982326562912161e-06, "loss": 0.0087, "step": 3156 }, { "epoch": 0.8331354702545838, "grad_norm": 1.9165856838226318, "learning_rate": 4.9854919546293855e-06, "loss": 0.0118, "step": 3158 }, { "epoch": 0.8336631051312492, "grad_norm": 0.5250787734985352, "learning_rate": 4.98865734634661e-06, "loss": 0.0083, "step": 3160 }, { "epoch": 0.8341907400079145, "grad_norm": 1.1487146615982056, "learning_rate": 4.991822738063836e-06, "loss": 0.0089, "step": 3162 }, { "epoch": 0.8347183748845799, "grad_norm": 2.8361008167266846, "learning_rate": 4.99498812978106e-06, "loss": 0.0134, "step": 3164 }, { "epoch": 0.8352460097612452, "grad_norm": 0.26855507493019104, "learning_rate": 4.998153521498286e-06, "loss": 0.0183, "step": 3166 }, { "epoch": 0.8357736446379106, "grad_norm": 0.2940095365047455, "learning_rate": 5.001318913215511e-06, "loss": 0.0074, "step": 3168 }, { "epoch": 0.8363012795145759, "grad_norm": 0.8513874411582947, "learning_rate": 5.004484304932736e-06, "loss": 0.0161, "step": 3170 }, { "epoch": 0.8368289143912413, "grad_norm": 0.6617721319198608, "learning_rate": 5.0076496966499606e-06, "loss": 0.0207, "step": 3172 }, { "epoch": 0.8373565492679066, "grad_norm": 0.4519553482532501, "learning_rate": 5.0108150883671854e-06, "loss": 0.0119, "step": 3174 }, { "epoch": 0.837884184144572, "grad_norm": 0.5017978549003601, "learning_rate": 5.013980480084411e-06, "loss": 0.0239, "step": 3176 }, { "epoch": 0.8384118190212373, "grad_norm": 0.350477397441864, "learning_rate": 5.017145871801636e-06, "loss": 0.0118, "step": 3178 }, { "epoch": 0.8389394538979027, "grad_norm": 0.3586798310279846, "learning_rate": 5.02031126351886e-06, "loss": 0.0139, "step": 3180 }, { "epoch": 0.839467088774568, "grad_norm": 0.07015179097652435, "learning_rate": 5.023476655236086e-06, "loss": 0.0086, "step": 3182 }, { "epoch": 0.8399947236512334, "grad_norm": 0.9076225161552429, "learning_rate": 5.026642046953311e-06, "loss": 0.0136, "step": 3184 }, { "epoch": 0.8405223585278987, "grad_norm": 0.4590561091899872, "learning_rate": 5.029807438670536e-06, "loss": 0.0227, "step": 3186 }, { "epoch": 0.841049993404564, "grad_norm": 0.3494877815246582, "learning_rate": 5.0329728303877605e-06, "loss": 0.0187, "step": 3188 }, { "epoch": 0.8415776282812294, "grad_norm": 2.7495832443237305, "learning_rate": 5.036138222104985e-06, "loss": 0.024, "step": 3190 }, { "epoch": 0.8421052631578947, "grad_norm": 0.49433380365371704, "learning_rate": 5.039303613822211e-06, "loss": 0.0151, "step": 3192 }, { "epoch": 0.8426328980345601, "grad_norm": 2.4680190086364746, "learning_rate": 5.042469005539435e-06, "loss": 0.0197, "step": 3194 }, { "epoch": 0.8431605329112254, "grad_norm": 0.5602519512176514, "learning_rate": 5.04563439725666e-06, "loss": 0.0157, "step": 3196 }, { "epoch": 0.8436881677878908, "grad_norm": 0.5271774530410767, "learning_rate": 5.048799788973886e-06, "loss": 0.015, "step": 3198 }, { "epoch": 0.8442158026645561, "grad_norm": 0.45806291699409485, "learning_rate": 5.051965180691111e-06, "loss": 0.0086, "step": 3200 }, { "epoch": 0.8447434375412215, "grad_norm": 0.626529335975647, "learning_rate": 5.0551305724083356e-06, "loss": 0.0111, "step": 3202 }, { "epoch": 0.8452710724178868, "grad_norm": 0.0755823627114296, "learning_rate": 5.0582959641255605e-06, "loss": 0.0194, "step": 3204 }, { "epoch": 0.8457987072945522, "grad_norm": 1.7859153747558594, "learning_rate": 5.061461355842785e-06, "loss": 0.016, "step": 3206 }, { "epoch": 0.8463263421712175, "grad_norm": 0.8356786370277405, "learning_rate": 5.064626747560011e-06, "loss": 0.0199, "step": 3208 }, { "epoch": 0.8468539770478829, "grad_norm": 0.3843086063861847, "learning_rate": 5.067792139277235e-06, "loss": 0.016, "step": 3210 }, { "epoch": 0.8473816119245482, "grad_norm": 0.10241444408893585, "learning_rate": 5.070957530994461e-06, "loss": 0.0072, "step": 3212 }, { "epoch": 0.8479092468012136, "grad_norm": 1.3613190650939941, "learning_rate": 5.074122922711686e-06, "loss": 0.019, "step": 3214 }, { "epoch": 0.8484368816778789, "grad_norm": 0.7824454307556152, "learning_rate": 5.077288314428911e-06, "loss": 0.0147, "step": 3216 }, { "epoch": 0.8489645165545443, "grad_norm": 0.8478617072105408, "learning_rate": 5.0804537061461355e-06, "loss": 0.0126, "step": 3218 }, { "epoch": 0.8494921514312096, "grad_norm": 0.6140949726104736, "learning_rate": 5.08361909786336e-06, "loss": 0.0126, "step": 3220 }, { "epoch": 0.850019786307875, "grad_norm": 0.4059818983078003, "learning_rate": 5.086784489580586e-06, "loss": 0.0156, "step": 3222 }, { "epoch": 0.8505474211845403, "grad_norm": 0.2651032507419586, "learning_rate": 5.089949881297811e-06, "loss": 0.008, "step": 3224 }, { "epoch": 0.8510750560612056, "grad_norm": 0.5370323061943054, "learning_rate": 5.093115273015035e-06, "loss": 0.0093, "step": 3226 }, { "epoch": 0.851602690937871, "grad_norm": 0.4930459260940552, "learning_rate": 5.096280664732261e-06, "loss": 0.009, "step": 3228 }, { "epoch": 0.8521303258145363, "grad_norm": 0.07745850086212158, "learning_rate": 5.099446056449486e-06, "loss": 0.0106, "step": 3230 }, { "epoch": 0.8526579606912017, "grad_norm": 0.43778330087661743, "learning_rate": 5.102611448166711e-06, "loss": 0.0128, "step": 3232 }, { "epoch": 0.853185595567867, "grad_norm": 0.04973269999027252, "learning_rate": 5.1057768398839355e-06, "loss": 0.0066, "step": 3234 }, { "epoch": 0.8537132304445324, "grad_norm": 1.4805828332901, "learning_rate": 5.10894223160116e-06, "loss": 0.0179, "step": 3236 }, { "epoch": 0.8542408653211977, "grad_norm": 0.528084933757782, "learning_rate": 5.112107623318386e-06, "loss": 0.0088, "step": 3238 }, { "epoch": 0.8547685001978631, "grad_norm": 0.05899150297045708, "learning_rate": 5.115273015035611e-06, "loss": 0.0074, "step": 3240 }, { "epoch": 0.8552961350745284, "grad_norm": 0.07013611495494843, "learning_rate": 5.118438406752836e-06, "loss": 0.0062, "step": 3242 }, { "epoch": 0.8558237699511938, "grad_norm": 0.11871986836194992, "learning_rate": 5.121603798470061e-06, "loss": 0.0179, "step": 3244 }, { "epoch": 0.8563514048278591, "grad_norm": 0.8186540603637695, "learning_rate": 5.124769190187286e-06, "loss": 0.014, "step": 3246 }, { "epoch": 0.8568790397045245, "grad_norm": 1.7807480096817017, "learning_rate": 5.1279345819045105e-06, "loss": 0.0077, "step": 3248 }, { "epoch": 0.8574066745811898, "grad_norm": 3.1136832237243652, "learning_rate": 5.131099973621735e-06, "loss": 0.0227, "step": 3250 }, { "epoch": 0.8579343094578552, "grad_norm": 0.855719804763794, "learning_rate": 5.134265365338961e-06, "loss": 0.0178, "step": 3252 }, { "epoch": 0.8584619443345205, "grad_norm": 0.43665745854377747, "learning_rate": 5.137430757056186e-06, "loss": 0.0105, "step": 3254 }, { "epoch": 0.8589895792111859, "grad_norm": 0.2393963783979416, "learning_rate": 5.140596148773411e-06, "loss": 0.0102, "step": 3256 }, { "epoch": 0.8595172140878512, "grad_norm": 0.08678192645311356, "learning_rate": 5.143761540490636e-06, "loss": 0.0083, "step": 3258 }, { "epoch": 0.8600448489645166, "grad_norm": 0.726828932762146, "learning_rate": 5.146926932207861e-06, "loss": 0.0115, "step": 3260 }, { "epoch": 0.8605724838411819, "grad_norm": 3.0469772815704346, "learning_rate": 5.1500923239250864e-06, "loss": 0.0159, "step": 3262 }, { "epoch": 0.8611001187178472, "grad_norm": 1.371192216873169, "learning_rate": 5.1532577156423105e-06, "loss": 0.0113, "step": 3264 }, { "epoch": 0.8616277535945126, "grad_norm": 0.7984887957572937, "learning_rate": 5.156423107359535e-06, "loss": 0.0083, "step": 3266 }, { "epoch": 0.8621553884711779, "grad_norm": 0.4274028241634369, "learning_rate": 5.159588499076761e-06, "loss": 0.0114, "step": 3268 }, { "epoch": 0.8626830233478433, "grad_norm": 0.08448777347803116, "learning_rate": 5.162753890793986e-06, "loss": 0.0121, "step": 3270 }, { "epoch": 0.8632106582245086, "grad_norm": 0.05891956016421318, "learning_rate": 5.165919282511211e-06, "loss": 0.008, "step": 3272 }, { "epoch": 0.863738293101174, "grad_norm": 0.1704418659210205, "learning_rate": 5.169084674228436e-06, "loss": 0.0068, "step": 3274 }, { "epoch": 0.8642659279778393, "grad_norm": 0.9037020206451416, "learning_rate": 5.172250065945661e-06, "loss": 0.0092, "step": 3276 }, { "epoch": 0.8647935628545047, "grad_norm": 0.07021836191415787, "learning_rate": 5.175415457662886e-06, "loss": 0.0063, "step": 3278 }, { "epoch": 0.86532119773117, "grad_norm": 1.1310291290283203, "learning_rate": 5.1785808493801104e-06, "loss": 0.0279, "step": 3280 }, { "epoch": 0.8658488326078354, "grad_norm": 0.053802695125341415, "learning_rate": 5.181746241097336e-06, "loss": 0.0077, "step": 3282 }, { "epoch": 0.8663764674845007, "grad_norm": 1.570737600326538, "learning_rate": 5.184911632814561e-06, "loss": 0.0135, "step": 3284 }, { "epoch": 0.8669041023611661, "grad_norm": 0.32544147968292236, "learning_rate": 5.188077024531786e-06, "loss": 0.0172, "step": 3286 }, { "epoch": 0.8674317372378314, "grad_norm": 0.9720383882522583, "learning_rate": 5.191242416249011e-06, "loss": 0.019, "step": 3288 }, { "epoch": 0.8679593721144968, "grad_norm": 0.38866716623306274, "learning_rate": 5.194407807966236e-06, "loss": 0.0111, "step": 3290 }, { "epoch": 0.8684870069911621, "grad_norm": 0.8867990970611572, "learning_rate": 5.1975731996834614e-06, "loss": 0.0106, "step": 3292 }, { "epoch": 0.8690146418678275, "grad_norm": 0.12812672555446625, "learning_rate": 5.200738591400686e-06, "loss": 0.0113, "step": 3294 }, { "epoch": 0.8695422767444928, "grad_norm": 0.40541431307792664, "learning_rate": 5.20390398311791e-06, "loss": 0.0214, "step": 3296 }, { "epoch": 0.8700699116211582, "grad_norm": 0.22729043662548065, "learning_rate": 5.207069374835136e-06, "loss": 0.01, "step": 3298 }, { "epoch": 0.8705975464978235, "grad_norm": 0.17461398243904114, "learning_rate": 5.210234766552361e-06, "loss": 0.0067, "step": 3300 }, { "epoch": 0.8711251813744888, "grad_norm": 1.3105415105819702, "learning_rate": 5.213400158269586e-06, "loss": 0.0105, "step": 3302 }, { "epoch": 0.8716528162511542, "grad_norm": 0.455825537443161, "learning_rate": 5.216565549986811e-06, "loss": 0.0172, "step": 3304 }, { "epoch": 0.8721804511278195, "grad_norm": 1.7446379661560059, "learning_rate": 5.219730941704036e-06, "loss": 0.0258, "step": 3306 }, { "epoch": 0.8727080860044849, "grad_norm": 0.4511977732181549, "learning_rate": 5.222896333421261e-06, "loss": 0.012, "step": 3308 }, { "epoch": 0.8732357208811502, "grad_norm": 0.20725496113300323, "learning_rate": 5.226061725138486e-06, "loss": 0.0164, "step": 3310 }, { "epoch": 0.8737633557578156, "grad_norm": 0.7894477844238281, "learning_rate": 5.229227116855711e-06, "loss": 0.0172, "step": 3312 }, { "epoch": 0.8742909906344809, "grad_norm": 0.6720626950263977, "learning_rate": 5.232392508572936e-06, "loss": 0.0158, "step": 3314 }, { "epoch": 0.8748186255111463, "grad_norm": 0.5250063538551331, "learning_rate": 5.235557900290161e-06, "loss": 0.0233, "step": 3316 }, { "epoch": 0.8753462603878116, "grad_norm": 0.43819066882133484, "learning_rate": 5.238723292007386e-06, "loss": 0.0075, "step": 3318 }, { "epoch": 0.875873895264477, "grad_norm": 0.5129814147949219, "learning_rate": 5.241888683724611e-06, "loss": 0.0111, "step": 3320 }, { "epoch": 0.8764015301411423, "grad_norm": 0.20252399146556854, "learning_rate": 5.2450540754418365e-06, "loss": 0.0078, "step": 3322 }, { "epoch": 0.8769291650178077, "grad_norm": 0.3735501766204834, "learning_rate": 5.248219467159061e-06, "loss": 0.0101, "step": 3324 }, { "epoch": 0.877456799894473, "grad_norm": 0.06471690535545349, "learning_rate": 5.251384858876286e-06, "loss": 0.0064, "step": 3326 }, { "epoch": 0.8779844347711384, "grad_norm": 1.0792912244796753, "learning_rate": 5.254550250593511e-06, "loss": 0.014, "step": 3328 }, { "epoch": 0.8785120696478037, "grad_norm": 0.034124355763196945, "learning_rate": 5.257715642310736e-06, "loss": 0.0101, "step": 3330 }, { "epoch": 0.8790397045244691, "grad_norm": 0.03733518719673157, "learning_rate": 5.260881034027962e-06, "loss": 0.0069, "step": 3332 }, { "epoch": 0.8795673394011344, "grad_norm": 0.08243560791015625, "learning_rate": 5.264046425745186e-06, "loss": 0.014, "step": 3334 }, { "epoch": 0.8800949742777998, "grad_norm": 0.34308934211730957, "learning_rate": 5.267211817462411e-06, "loss": 0.0064, "step": 3336 }, { "epoch": 0.8806226091544651, "grad_norm": 0.3164360225200653, "learning_rate": 5.270377209179636e-06, "loss": 0.0252, "step": 3338 }, { "epoch": 0.8811502440311304, "grad_norm": 0.26733264327049255, "learning_rate": 5.273542600896861e-06, "loss": 0.0073, "step": 3340 }, { "epoch": 0.8816778789077958, "grad_norm": 1.3708502054214478, "learning_rate": 5.276707992614086e-06, "loss": 0.0306, "step": 3342 }, { "epoch": 0.8822055137844611, "grad_norm": 0.2344052642583847, "learning_rate": 5.279873384331311e-06, "loss": 0.0103, "step": 3344 }, { "epoch": 0.8827331486611265, "grad_norm": 0.35155367851257324, "learning_rate": 5.283038776048536e-06, "loss": 0.0144, "step": 3346 }, { "epoch": 0.8832607835377918, "grad_norm": 0.6879295110702515, "learning_rate": 5.286204167765762e-06, "loss": 0.0251, "step": 3348 }, { "epoch": 0.8837884184144572, "grad_norm": 0.5527849197387695, "learning_rate": 5.289369559482986e-06, "loss": 0.0182, "step": 3350 }, { "epoch": 0.8843160532911225, "grad_norm": 0.18057262897491455, "learning_rate": 5.2925349512002115e-06, "loss": 0.0106, "step": 3352 }, { "epoch": 0.8848436881677879, "grad_norm": 0.18241886794567108, "learning_rate": 5.295700342917436e-06, "loss": 0.0168, "step": 3354 }, { "epoch": 0.8853713230444532, "grad_norm": 0.38347503542900085, "learning_rate": 5.298865734634661e-06, "loss": 0.0125, "step": 3356 }, { "epoch": 0.8858989579211186, "grad_norm": 0.48882994055747986, "learning_rate": 5.302031126351886e-06, "loss": 0.0096, "step": 3358 }, { "epoch": 0.8864265927977839, "grad_norm": 0.27230629324913025, "learning_rate": 5.305196518069111e-06, "loss": 0.012, "step": 3360 }, { "epoch": 0.8869542276744493, "grad_norm": 0.419534832239151, "learning_rate": 5.308361909786337e-06, "loss": 0.0155, "step": 3362 }, { "epoch": 0.8874818625511146, "grad_norm": 0.512951135635376, "learning_rate": 5.311527301503562e-06, "loss": 0.0098, "step": 3364 }, { "epoch": 0.88800949742778, "grad_norm": 0.1986953169107437, "learning_rate": 5.314692693220786e-06, "loss": 0.0095, "step": 3366 }, { "epoch": 0.8885371323044453, "grad_norm": 0.5172854065895081, "learning_rate": 5.317858084938011e-06, "loss": 0.0184, "step": 3368 }, { "epoch": 0.8890647671811107, "grad_norm": 0.44931915402412415, "learning_rate": 5.321023476655236e-06, "loss": 0.0196, "step": 3370 }, { "epoch": 0.889592402057776, "grad_norm": 0.21886897087097168, "learning_rate": 5.324188868372461e-06, "loss": 0.0064, "step": 3372 }, { "epoch": 0.8901200369344414, "grad_norm": 0.19931648671627045, "learning_rate": 5.327354260089686e-06, "loss": 0.0094, "step": 3374 }, { "epoch": 0.8906476718111067, "grad_norm": 0.1784026175737381, "learning_rate": 5.330519651806911e-06, "loss": 0.0071, "step": 3376 }, { "epoch": 0.891175306687772, "grad_norm": 0.07177034765481949, "learning_rate": 5.333685043524137e-06, "loss": 0.0133, "step": 3378 }, { "epoch": 0.8917029415644374, "grad_norm": 0.0729404017329216, "learning_rate": 5.336850435241362e-06, "loss": 0.0072, "step": 3380 }, { "epoch": 0.8922305764411027, "grad_norm": 0.8893320560455322, "learning_rate": 5.3400158269585865e-06, "loss": 0.0193, "step": 3382 }, { "epoch": 0.8927582113177681, "grad_norm": 0.217534601688385, "learning_rate": 5.343181218675811e-06, "loss": 0.0124, "step": 3384 }, { "epoch": 0.8932858461944334, "grad_norm": 1.0584132671356201, "learning_rate": 5.346346610393036e-06, "loss": 0.0168, "step": 3386 }, { "epoch": 0.8938134810710988, "grad_norm": 0.48041313886642456, "learning_rate": 5.349512002110261e-06, "loss": 0.013, "step": 3388 }, { "epoch": 0.8943411159477641, "grad_norm": 0.3125177323818207, "learning_rate": 5.352677393827486e-06, "loss": 0.0168, "step": 3390 }, { "epoch": 0.8948687508244295, "grad_norm": 0.5408496856689453, "learning_rate": 5.355842785544711e-06, "loss": 0.0204, "step": 3392 }, { "epoch": 0.8953963857010948, "grad_norm": 0.07497286796569824, "learning_rate": 5.359008177261937e-06, "loss": 0.0073, "step": 3394 }, { "epoch": 0.8959240205777602, "grad_norm": 6.0620832443237305, "learning_rate": 5.362173568979161e-06, "loss": 0.0075, "step": 3396 }, { "epoch": 0.8964516554544255, "grad_norm": 0.5866127610206604, "learning_rate": 5.3653389606963864e-06, "loss": 0.0073, "step": 3398 }, { "epoch": 0.8969792903310909, "grad_norm": 0.43421706557273865, "learning_rate": 5.368504352413611e-06, "loss": 0.0081, "step": 3400 }, { "epoch": 0.8975069252077562, "grad_norm": 0.4320656657218933, "learning_rate": 5.371669744130836e-06, "loss": 0.0132, "step": 3402 }, { "epoch": 0.8980345600844216, "grad_norm": 0.12853601574897766, "learning_rate": 5.374835135848061e-06, "loss": 0.0073, "step": 3404 }, { "epoch": 0.8985621949610869, "grad_norm": 1.0736013650894165, "learning_rate": 5.378000527565286e-06, "loss": 0.0182, "step": 3406 }, { "epoch": 0.8990898298377523, "grad_norm": 0.2904844284057617, "learning_rate": 5.381165919282512e-06, "loss": 0.0125, "step": 3408 }, { "epoch": 0.8996174647144176, "grad_norm": 2.14424204826355, "learning_rate": 5.384331310999737e-06, "loss": 0.009, "step": 3410 }, { "epoch": 0.900145099591083, "grad_norm": 1.2870653867721558, "learning_rate": 5.387496702716961e-06, "loss": 0.0202, "step": 3412 }, { "epoch": 0.9006727344677483, "grad_norm": 0.35176122188568115, "learning_rate": 5.390662094434186e-06, "loss": 0.018, "step": 3414 }, { "epoch": 0.9012003693444137, "grad_norm": 1.198384165763855, "learning_rate": 5.393827486151411e-06, "loss": 0.011, "step": 3416 }, { "epoch": 0.901728004221079, "grad_norm": 0.5270561575889587, "learning_rate": 5.396992877868637e-06, "loss": 0.019, "step": 3418 }, { "epoch": 0.9022556390977443, "grad_norm": 0.1500644087791443, "learning_rate": 5.400158269585861e-06, "loss": 0.0084, "step": 3420 }, { "epoch": 0.9027832739744097, "grad_norm": 0.17214635014533997, "learning_rate": 5.403323661303086e-06, "loss": 0.007, "step": 3422 }, { "epoch": 0.903310908851075, "grad_norm": 1.706470251083374, "learning_rate": 5.406489053020312e-06, "loss": 0.011, "step": 3424 }, { "epoch": 0.9038385437277404, "grad_norm": 0.11470861732959747, "learning_rate": 5.4096544447375366e-06, "loss": 0.0063, "step": 3426 }, { "epoch": 0.9043661786044057, "grad_norm": 0.40045928955078125, "learning_rate": 5.4128198364547614e-06, "loss": 0.0069, "step": 3428 }, { "epoch": 0.9048938134810711, "grad_norm": 0.2554756700992584, "learning_rate": 5.415985228171986e-06, "loss": 0.0176, "step": 3430 }, { "epoch": 0.9054214483577364, "grad_norm": 0.7406858205795288, "learning_rate": 5.419150619889211e-06, "loss": 0.0071, "step": 3432 }, { "epoch": 0.9059490832344018, "grad_norm": 0.06270800530910492, "learning_rate": 5.422316011606437e-06, "loss": 0.0121, "step": 3434 }, { "epoch": 0.9064767181110671, "grad_norm": 0.3890087902545929, "learning_rate": 5.425481403323661e-06, "loss": 0.0093, "step": 3436 }, { "epoch": 0.9070043529877325, "grad_norm": 0.18908540904521942, "learning_rate": 5.428646795040887e-06, "loss": 0.0137, "step": 3438 }, { "epoch": 0.9075319878643978, "grad_norm": 0.5461505651473999, "learning_rate": 5.431812186758112e-06, "loss": 0.006, "step": 3440 }, { "epoch": 0.9080596227410632, "grad_norm": 0.13035523891448975, "learning_rate": 5.4349775784753365e-06, "loss": 0.0122, "step": 3442 }, { "epoch": 0.9085872576177285, "grad_norm": 1.40940260887146, "learning_rate": 5.438142970192561e-06, "loss": 0.0089, "step": 3444 }, { "epoch": 0.9091148924943939, "grad_norm": 0.5749211311340332, "learning_rate": 5.441308361909786e-06, "loss": 0.0194, "step": 3446 }, { "epoch": 0.9096425273710592, "grad_norm": 0.03780429810285568, "learning_rate": 5.444473753627012e-06, "loss": 0.0085, "step": 3448 }, { "epoch": 0.9101701622477246, "grad_norm": 1.6526379585266113, "learning_rate": 5.447639145344236e-06, "loss": 0.0084, "step": 3450 }, { "epoch": 0.9106977971243899, "grad_norm": 0.20532675087451935, "learning_rate": 5.450804537061461e-06, "loss": 0.0128, "step": 3452 }, { "epoch": 0.9112254320010553, "grad_norm": 0.35261133313179016, "learning_rate": 5.453969928778687e-06, "loss": 0.0193, "step": 3454 }, { "epoch": 0.9117530668777206, "grad_norm": 0.4029576778411865, "learning_rate": 5.4571353204959116e-06, "loss": 0.0388, "step": 3456 }, { "epoch": 0.9122807017543859, "grad_norm": 1.0403594970703125, "learning_rate": 5.4603007122131365e-06, "loss": 0.0082, "step": 3458 }, { "epoch": 0.9128083366310513, "grad_norm": 0.3944488763809204, "learning_rate": 5.463466103930361e-06, "loss": 0.0186, "step": 3460 }, { "epoch": 0.9133359715077166, "grad_norm": 0.3051987290382385, "learning_rate": 5.466631495647586e-06, "loss": 0.0067, "step": 3462 }, { "epoch": 0.913863606384382, "grad_norm": 0.5323511958122253, "learning_rate": 5.469796887364812e-06, "loss": 0.0069, "step": 3464 }, { "epoch": 0.9143912412610473, "grad_norm": 0.18510587513446808, "learning_rate": 5.472962279082036e-06, "loss": 0.0071, "step": 3466 }, { "epoch": 0.9149188761377127, "grad_norm": 0.8380105495452881, "learning_rate": 5.476127670799262e-06, "loss": 0.0147, "step": 3468 }, { "epoch": 0.915446511014378, "grad_norm": 1.6352897882461548, "learning_rate": 5.479293062516487e-06, "loss": 0.0177, "step": 3470 }, { "epoch": 0.9159741458910434, "grad_norm": 0.5755694508552551, "learning_rate": 5.4824584542337115e-06, "loss": 0.0115, "step": 3472 }, { "epoch": 0.9165017807677087, "grad_norm": 0.19004349410533905, "learning_rate": 5.485623845950936e-06, "loss": 0.0133, "step": 3474 }, { "epoch": 0.9170294156443741, "grad_norm": 0.36075177788734436, "learning_rate": 5.488789237668161e-06, "loss": 0.0145, "step": 3476 }, { "epoch": 0.9175570505210394, "grad_norm": 0.18607044219970703, "learning_rate": 5.491954629385387e-06, "loss": 0.0074, "step": 3478 }, { "epoch": 0.9180846853977048, "grad_norm": 0.228286013007164, "learning_rate": 5.495120021102612e-06, "loss": 0.0148, "step": 3480 }, { "epoch": 0.9186123202743701, "grad_norm": 0.32614973187446594, "learning_rate": 5.498285412819836e-06, "loss": 0.0061, "step": 3482 }, { "epoch": 0.9191399551510355, "grad_norm": 0.23622038960456848, "learning_rate": 5.501450804537062e-06, "loss": 0.0147, "step": 3484 }, { "epoch": 0.9196675900277008, "grad_norm": 0.48676034808158875, "learning_rate": 5.5046161962542866e-06, "loss": 0.0141, "step": 3486 }, { "epoch": 0.9201952249043662, "grad_norm": 0.09046272188425064, "learning_rate": 5.5077815879715115e-06, "loss": 0.0142, "step": 3488 }, { "epoch": 0.9207228597810315, "grad_norm": 0.05823260918259621, "learning_rate": 5.510946979688736e-06, "loss": 0.0055, "step": 3490 }, { "epoch": 0.9212504946576969, "grad_norm": 0.8801203966140747, "learning_rate": 5.514112371405961e-06, "loss": 0.0079, "step": 3492 }, { "epoch": 0.9217781295343622, "grad_norm": 0.33263495564460754, "learning_rate": 5.517277763123187e-06, "loss": 0.0237, "step": 3494 }, { "epoch": 0.9223057644110275, "grad_norm": 0.727774441242218, "learning_rate": 5.520443154840412e-06, "loss": 0.0129, "step": 3496 }, { "epoch": 0.9228333992876929, "grad_norm": 0.11577192693948746, "learning_rate": 5.523608546557637e-06, "loss": 0.006, "step": 3498 }, { "epoch": 0.9233610341643582, "grad_norm": 0.5392929315567017, "learning_rate": 5.526773938274862e-06, "loss": 0.0217, "step": 3500 }, { "epoch": 0.9238886690410236, "grad_norm": 0.11145241558551788, "learning_rate": 5.5299393299920865e-06, "loss": 0.0118, "step": 3502 }, { "epoch": 0.9244163039176889, "grad_norm": 0.46118199825286865, "learning_rate": 5.533104721709311e-06, "loss": 0.0089, "step": 3504 }, { "epoch": 0.9249439387943543, "grad_norm": 0.2645895481109619, "learning_rate": 5.536270113426536e-06, "loss": 0.0114, "step": 3506 }, { "epoch": 0.9254715736710196, "grad_norm": 1.0469872951507568, "learning_rate": 5.539435505143762e-06, "loss": 0.0078, "step": 3508 }, { "epoch": 0.925999208547685, "grad_norm": 1.2701587677001953, "learning_rate": 5.542600896860987e-06, "loss": 0.0105, "step": 3510 }, { "epoch": 0.9265268434243503, "grad_norm": 1.168466567993164, "learning_rate": 5.545766288578212e-06, "loss": 0.0158, "step": 3512 }, { "epoch": 0.9270544783010157, "grad_norm": 0.04408371075987816, "learning_rate": 5.548931680295437e-06, "loss": 0.0085, "step": 3514 }, { "epoch": 0.927582113177681, "grad_norm": 0.05861206725239754, "learning_rate": 5.552097072012662e-06, "loss": 0.0053, "step": 3516 }, { "epoch": 0.9281097480543464, "grad_norm": 0.3866897225379944, "learning_rate": 5.555262463729887e-06, "loss": 0.0214, "step": 3518 }, { "epoch": 0.9286373829310117, "grad_norm": 0.24339330196380615, "learning_rate": 5.558427855447111e-06, "loss": 0.0247, "step": 3520 }, { "epoch": 0.9291650178076771, "grad_norm": 0.04301082342863083, "learning_rate": 5.561593247164336e-06, "loss": 0.0153, "step": 3522 }, { "epoch": 0.9296926526843424, "grad_norm": 0.2198035717010498, "learning_rate": 5.564758638881562e-06, "loss": 0.0126, "step": 3524 }, { "epoch": 0.9302202875610078, "grad_norm": 2.1998019218444824, "learning_rate": 5.567924030598787e-06, "loss": 0.0221, "step": 3526 }, { "epoch": 0.9307479224376731, "grad_norm": 0.08548520505428314, "learning_rate": 5.571089422316012e-06, "loss": 0.0057, "step": 3528 }, { "epoch": 0.9312755573143385, "grad_norm": 0.27147623896598816, "learning_rate": 5.574254814033237e-06, "loss": 0.0142, "step": 3530 }, { "epoch": 0.9318031921910038, "grad_norm": 0.3382900059223175, "learning_rate": 5.5774202057504615e-06, "loss": 0.017, "step": 3532 }, { "epoch": 0.9323308270676691, "grad_norm": 0.34548404812812805, "learning_rate": 5.580585597467687e-06, "loss": 0.0096, "step": 3534 }, { "epoch": 0.9328584619443345, "grad_norm": 0.5008452534675598, "learning_rate": 5.583750989184911e-06, "loss": 0.0154, "step": 3536 }, { "epoch": 0.9333860968209998, "grad_norm": 0.4703221619129181, "learning_rate": 5.586916380902137e-06, "loss": 0.0164, "step": 3538 }, { "epoch": 0.9339137316976652, "grad_norm": 0.7277804017066956, "learning_rate": 5.590081772619362e-06, "loss": 0.0078, "step": 3540 }, { "epoch": 0.9344413665743305, "grad_norm": 0.13310998678207397, "learning_rate": 5.593247164336587e-06, "loss": 0.0064, "step": 3542 }, { "epoch": 0.9349690014509959, "grad_norm": 0.5473229289054871, "learning_rate": 5.596412556053812e-06, "loss": 0.0132, "step": 3544 }, { "epoch": 0.9354966363276612, "grad_norm": 0.07491260766983032, "learning_rate": 5.599577947771037e-06, "loss": 0.0093, "step": 3546 }, { "epoch": 0.9360242712043266, "grad_norm": 0.13536615669727325, "learning_rate": 5.602743339488262e-06, "loss": 0.0066, "step": 3548 }, { "epoch": 0.9365519060809919, "grad_norm": 0.05802198871970177, "learning_rate": 5.605908731205487e-06, "loss": 0.0067, "step": 3550 }, { "epoch": 0.9370795409576573, "grad_norm": 0.11051968485116959, "learning_rate": 5.609074122922711e-06, "loss": 0.0056, "step": 3552 }, { "epoch": 0.9376071758343226, "grad_norm": 0.0678415447473526, "learning_rate": 5.612239514639937e-06, "loss": 0.0054, "step": 3554 }, { "epoch": 0.938134810710988, "grad_norm": 0.1532164067029953, "learning_rate": 5.615404906357162e-06, "loss": 0.005, "step": 3556 }, { "epoch": 0.9386624455876533, "grad_norm": 0.07212632894515991, "learning_rate": 5.618570298074387e-06, "loss": 0.0105, "step": 3558 }, { "epoch": 0.9391900804643187, "grad_norm": 0.4714149534702301, "learning_rate": 5.621735689791612e-06, "loss": 0.0077, "step": 3560 }, { "epoch": 0.939717715340984, "grad_norm": 0.05451786890625954, "learning_rate": 5.6249010815088365e-06, "loss": 0.0059, "step": 3562 }, { "epoch": 0.9402453502176494, "grad_norm": 0.920157790184021, "learning_rate": 5.628066473226062e-06, "loss": 0.006, "step": 3564 }, { "epoch": 0.9407729850943147, "grad_norm": 0.8891220092773438, "learning_rate": 5.631231864943287e-06, "loss": 0.0258, "step": 3566 }, { "epoch": 0.9413006199709801, "grad_norm": 0.9060091972351074, "learning_rate": 5.634397256660512e-06, "loss": 0.0063, "step": 3568 }, { "epoch": 0.9418282548476454, "grad_norm": 0.5933660268783569, "learning_rate": 5.637562648377737e-06, "loss": 0.0104, "step": 3570 }, { "epoch": 0.9423558897243107, "grad_norm": 0.07567280530929565, "learning_rate": 5.640728040094962e-06, "loss": 0.0048, "step": 3572 }, { "epoch": 0.9428835246009761, "grad_norm": 0.2865144610404968, "learning_rate": 5.643893431812187e-06, "loss": 0.0104, "step": 3574 }, { "epoch": 0.9434111594776414, "grad_norm": 0.0631774440407753, "learning_rate": 5.647058823529412e-06, "loss": 0.006, "step": 3576 }, { "epoch": 0.9439387943543068, "grad_norm": 0.058809421956539154, "learning_rate": 5.650224215246637e-06, "loss": 0.0077, "step": 3578 }, { "epoch": 0.9444664292309721, "grad_norm": 0.6970632672309875, "learning_rate": 5.653389606963862e-06, "loss": 0.0075, "step": 3580 }, { "epoch": 0.9449940641076375, "grad_norm": 0.061956897377967834, "learning_rate": 5.656554998681087e-06, "loss": 0.005, "step": 3582 }, { "epoch": 0.9455216989843028, "grad_norm": 0.043684639036655426, "learning_rate": 5.659720390398312e-06, "loss": 0.0046, "step": 3584 }, { "epoch": 0.9460493338609682, "grad_norm": 0.23880116641521454, "learning_rate": 5.662885782115537e-06, "loss": 0.0093, "step": 3586 }, { "epoch": 0.9465769687376335, "grad_norm": 0.4024014472961426, "learning_rate": 5.666051173832763e-06, "loss": 0.0119, "step": 3588 }, { "epoch": 0.9471046036142989, "grad_norm": 0.04407423734664917, "learning_rate": 5.669216565549987e-06, "loss": 0.0063, "step": 3590 }, { "epoch": 0.9476322384909642, "grad_norm": 0.21699169278144836, "learning_rate": 5.6723819572672116e-06, "loss": 0.0087, "step": 3592 }, { "epoch": 0.9481598733676296, "grad_norm": 0.18363149464130402, "learning_rate": 5.675547348984437e-06, "loss": 0.0055, "step": 3594 }, { "epoch": 0.9486875082442949, "grad_norm": 0.12566880881786346, "learning_rate": 5.678712740701662e-06, "loss": 0.01, "step": 3596 }, { "epoch": 0.9492151431209603, "grad_norm": 0.5645434856414795, "learning_rate": 5.681878132418887e-06, "loss": 0.0267, "step": 3598 }, { "epoch": 0.9497427779976256, "grad_norm": 0.07877538353204727, "learning_rate": 5.685043524136112e-06, "loss": 0.0159, "step": 3600 }, { "epoch": 0.950270412874291, "grad_norm": 1.3619004487991333, "learning_rate": 5.688208915853337e-06, "loss": 0.0127, "step": 3602 }, { "epoch": 0.9507980477509563, "grad_norm": 0.642821729183197, "learning_rate": 5.691374307570563e-06, "loss": 0.0132, "step": 3604 }, { "epoch": 0.9513256826276217, "grad_norm": 0.8877767324447632, "learning_rate": 5.694539699287787e-06, "loss": 0.0098, "step": 3606 }, { "epoch": 0.951853317504287, "grad_norm": 0.07117123156785965, "learning_rate": 5.6977050910050115e-06, "loss": 0.0057, "step": 3608 }, { "epoch": 0.9523809523809523, "grad_norm": 0.07212719321250916, "learning_rate": 5.700870482722237e-06, "loss": 0.0078, "step": 3610 }, { "epoch": 0.9529085872576177, "grad_norm": 0.4886595904827118, "learning_rate": 5.704035874439462e-06, "loss": 0.0134, "step": 3612 }, { "epoch": 0.953436222134283, "grad_norm": 0.42447203397750854, "learning_rate": 5.707201266156687e-06, "loss": 0.0057, "step": 3614 }, { "epoch": 0.9539638570109484, "grad_norm": 0.17869265377521515, "learning_rate": 5.710366657873912e-06, "loss": 0.0105, "step": 3616 }, { "epoch": 0.9544914918876137, "grad_norm": 0.5479180812835693, "learning_rate": 5.713532049591137e-06, "loss": 0.0076, "step": 3618 }, { "epoch": 0.9550191267642791, "grad_norm": 0.1355251669883728, "learning_rate": 5.7166974413083625e-06, "loss": 0.0179, "step": 3620 }, { "epoch": 0.9555467616409444, "grad_norm": 0.5383084416389465, "learning_rate": 5.7198628330255866e-06, "loss": 0.0219, "step": 3622 }, { "epoch": 0.9560743965176098, "grad_norm": 0.5458008050918579, "learning_rate": 5.723028224742812e-06, "loss": 0.0085, "step": 3624 }, { "epoch": 0.9566020313942751, "grad_norm": 0.44931283593177795, "learning_rate": 5.726193616460037e-06, "loss": 0.0113, "step": 3626 }, { "epoch": 0.9571296662709405, "grad_norm": 0.08533202111721039, "learning_rate": 5.729359008177262e-06, "loss": 0.0052, "step": 3628 }, { "epoch": 0.9576573011476058, "grad_norm": 0.38972893357276917, "learning_rate": 5.732524399894487e-06, "loss": 0.0082, "step": 3630 }, { "epoch": 0.9581849360242712, "grad_norm": 0.41948068141937256, "learning_rate": 5.735689791611712e-06, "loss": 0.0122, "step": 3632 }, { "epoch": 0.9587125709009365, "grad_norm": 0.46797600388526917, "learning_rate": 5.738855183328938e-06, "loss": 0.0102, "step": 3634 }, { "epoch": 0.9592402057776019, "grad_norm": 0.6167570948600769, "learning_rate": 5.7420205750461625e-06, "loss": 0.0155, "step": 3636 }, { "epoch": 0.9597678406542672, "grad_norm": 0.3082354664802551, "learning_rate": 5.7451859667633865e-06, "loss": 0.0147, "step": 3638 }, { "epoch": 0.9602954755309326, "grad_norm": 2.8520655632019043, "learning_rate": 5.748351358480612e-06, "loss": 0.0212, "step": 3640 }, { "epoch": 0.9608231104075979, "grad_norm": 0.36803174018859863, "learning_rate": 5.751516750197837e-06, "loss": 0.0129, "step": 3642 }, { "epoch": 0.9613507452842633, "grad_norm": 0.05154629424214363, "learning_rate": 5.754682141915062e-06, "loss": 0.0052, "step": 3644 }, { "epoch": 0.9618783801609286, "grad_norm": 0.05170159041881561, "learning_rate": 5.757847533632287e-06, "loss": 0.0126, "step": 3646 }, { "epoch": 0.9624060150375939, "grad_norm": 0.053817860782146454, "learning_rate": 5.761012925349512e-06, "loss": 0.0117, "step": 3648 }, { "epoch": 0.9629336499142593, "grad_norm": 0.5205416679382324, "learning_rate": 5.7641783170667375e-06, "loss": 0.0195, "step": 3650 }, { "epoch": 0.9634612847909246, "grad_norm": 0.09597280621528625, "learning_rate": 5.767343708783962e-06, "loss": 0.0128, "step": 3652 }, { "epoch": 0.96398891966759, "grad_norm": 1.071079969406128, "learning_rate": 5.770509100501187e-06, "loss": 0.0166, "step": 3654 }, { "epoch": 0.9645165545442553, "grad_norm": 0.2609986662864685, "learning_rate": 5.773674492218412e-06, "loss": 0.0107, "step": 3656 }, { "epoch": 0.9650441894209207, "grad_norm": 0.9233928918838501, "learning_rate": 5.776839883935637e-06, "loss": 0.0194, "step": 3658 }, { "epoch": 0.965571824297586, "grad_norm": 1.5896427631378174, "learning_rate": 5.780005275652862e-06, "loss": 0.0098, "step": 3660 }, { "epoch": 0.9660994591742514, "grad_norm": 0.5989594459533691, "learning_rate": 5.783170667370087e-06, "loss": 0.0167, "step": 3662 }, { "epoch": 0.9666270940509167, "grad_norm": 1.6687531471252441, "learning_rate": 5.786336059087313e-06, "loss": 0.0156, "step": 3664 }, { "epoch": 0.9671547289275821, "grad_norm": 0.11727967858314514, "learning_rate": 5.7895014508045375e-06, "loss": 0.0081, "step": 3666 }, { "epoch": 0.9676823638042474, "grad_norm": 1.1543070077896118, "learning_rate": 5.7926668425217615e-06, "loss": 0.0171, "step": 3668 }, { "epoch": 0.9682099986809128, "grad_norm": 0.6125832796096802, "learning_rate": 5.795832234238987e-06, "loss": 0.0145, "step": 3670 }, { "epoch": 0.9687376335575781, "grad_norm": 0.30063778162002563, "learning_rate": 5.798997625956212e-06, "loss": 0.0069, "step": 3672 }, { "epoch": 0.9692652684342435, "grad_norm": 1.0450471639633179, "learning_rate": 5.802163017673438e-06, "loss": 0.0081, "step": 3674 }, { "epoch": 0.9697929033109088, "grad_norm": 0.7952768206596375, "learning_rate": 5.805328409390662e-06, "loss": 0.0129, "step": 3676 }, { "epoch": 0.9703205381875742, "grad_norm": 0.32513466477394104, "learning_rate": 5.808493801107887e-06, "loss": 0.006, "step": 3678 }, { "epoch": 0.9708481730642395, "grad_norm": 0.6978696584701538, "learning_rate": 5.8116591928251126e-06, "loss": 0.0134, "step": 3680 }, { "epoch": 0.971375807940905, "grad_norm": 0.1069924384355545, "learning_rate": 5.8148245845423374e-06, "loss": 0.0147, "step": 3682 }, { "epoch": 0.9719034428175702, "grad_norm": 0.1017770916223526, "learning_rate": 5.817989976259562e-06, "loss": 0.0067, "step": 3684 }, { "epoch": 0.9724310776942355, "grad_norm": 0.30253469944000244, "learning_rate": 5.821155367976787e-06, "loss": 0.0082, "step": 3686 }, { "epoch": 0.9729587125709009, "grad_norm": 0.08041433244943619, "learning_rate": 5.824320759694012e-06, "loss": 0.0046, "step": 3688 }, { "epoch": 0.9734863474475662, "grad_norm": 0.2796190679073334, "learning_rate": 5.827486151411238e-06, "loss": 0.0064, "step": 3690 }, { "epoch": 0.9740139823242316, "grad_norm": 0.17176930606365204, "learning_rate": 5.830651543128462e-06, "loss": 0.0052, "step": 3692 }, { "epoch": 0.9745416172008969, "grad_norm": 0.17984965443611145, "learning_rate": 5.833816934845688e-06, "loss": 0.0144, "step": 3694 }, { "epoch": 0.9750692520775623, "grad_norm": 0.07161843776702881, "learning_rate": 5.8369823265629125e-06, "loss": 0.005, "step": 3696 }, { "epoch": 0.9755968869542276, "grad_norm": 0.28650355339050293, "learning_rate": 5.840147718280137e-06, "loss": 0.0063, "step": 3698 }, { "epoch": 0.976124521830893, "grad_norm": 0.3195345103740692, "learning_rate": 5.843313109997362e-06, "loss": 0.0063, "step": 3700 }, { "epoch": 0.9766521567075583, "grad_norm": 0.7548908591270447, "learning_rate": 5.846478501714587e-06, "loss": 0.0198, "step": 3702 }, { "epoch": 0.9771797915842237, "grad_norm": 0.39963972568511963, "learning_rate": 5.849643893431813e-06, "loss": 0.0131, "step": 3704 }, { "epoch": 0.977707426460889, "grad_norm": 0.48770031332969666, "learning_rate": 5.852809285149037e-06, "loss": 0.0053, "step": 3706 }, { "epoch": 0.9782350613375544, "grad_norm": 0.029394231736660004, "learning_rate": 5.855974676866262e-06, "loss": 0.0043, "step": 3708 }, { "epoch": 0.9787626962142197, "grad_norm": 0.046684879809617996, "learning_rate": 5.8591400685834876e-06, "loss": 0.0045, "step": 3710 }, { "epoch": 0.9792903310908851, "grad_norm": 0.1703922599554062, "learning_rate": 5.8623054603007125e-06, "loss": 0.0126, "step": 3712 }, { "epoch": 0.9798179659675504, "grad_norm": 0.2953357994556427, "learning_rate": 5.865470852017937e-06, "loss": 0.0086, "step": 3714 }, { "epoch": 0.9803456008442158, "grad_norm": 0.3428744375705719, "learning_rate": 5.868636243735162e-06, "loss": 0.009, "step": 3716 }, { "epoch": 0.9808732357208811, "grad_norm": 0.06268803775310516, "learning_rate": 5.871801635452387e-06, "loss": 0.0054, "step": 3718 }, { "epoch": 0.9814008705975465, "grad_norm": 0.05753438547253609, "learning_rate": 5.874967027169613e-06, "loss": 0.005, "step": 3720 }, { "epoch": 0.9819285054742118, "grad_norm": 1.4635716676712036, "learning_rate": 5.878132418886837e-06, "loss": 0.0068, "step": 3722 }, { "epoch": 0.9824561403508771, "grad_norm": 0.13622619211673737, "learning_rate": 5.881297810604063e-06, "loss": 0.0076, "step": 3724 }, { "epoch": 0.9829837752275425, "grad_norm": 1.1788989305496216, "learning_rate": 5.8844632023212875e-06, "loss": 0.015, "step": 3726 }, { "epoch": 0.9835114101042078, "grad_norm": 1.0579243898391724, "learning_rate": 5.887628594038512e-06, "loss": 0.0096, "step": 3728 }, { "epoch": 0.9840390449808732, "grad_norm": 1.9088139533996582, "learning_rate": 5.890793985755737e-06, "loss": 0.0052, "step": 3730 }, { "epoch": 0.9845666798575385, "grad_norm": 0.7136862874031067, "learning_rate": 5.893959377472962e-06, "loss": 0.011, "step": 3732 }, { "epoch": 0.985094314734204, "grad_norm": 0.2431148886680603, "learning_rate": 5.897124769190188e-06, "loss": 0.0063, "step": 3734 }, { "epoch": 0.9856219496108692, "grad_norm": 0.9011943936347961, "learning_rate": 5.900290160907413e-06, "loss": 0.0058, "step": 3736 }, { "epoch": 0.9861495844875346, "grad_norm": 0.11174432188272476, "learning_rate": 5.903455552624637e-06, "loss": 0.008, "step": 3738 }, { "epoch": 0.9866772193641999, "grad_norm": 0.6009191274642944, "learning_rate": 5.9066209443418626e-06, "loss": 0.006, "step": 3740 }, { "epoch": 0.9872048542408653, "grad_norm": 2.1808621883392334, "learning_rate": 5.9097863360590875e-06, "loss": 0.0164, "step": 3742 }, { "epoch": 0.9877324891175306, "grad_norm": 0.04237968474626541, "learning_rate": 5.912951727776312e-06, "loss": 0.0045, "step": 3744 }, { "epoch": 0.988260123994196, "grad_norm": 0.1556313931941986, "learning_rate": 5.916117119493537e-06, "loss": 0.012, "step": 3746 }, { "epoch": 0.9887877588708613, "grad_norm": 0.9141594171524048, "learning_rate": 5.919282511210762e-06, "loss": 0.0116, "step": 3748 }, { "epoch": 0.9893153937475268, "grad_norm": 0.5840505957603455, "learning_rate": 5.922447902927988e-06, "loss": 0.0046, "step": 3750 }, { "epoch": 0.989843028624192, "grad_norm": 0.8259785175323486, "learning_rate": 5.925613294645213e-06, "loss": 0.0057, "step": 3752 }, { "epoch": 0.9903706635008575, "grad_norm": 0.08367958664894104, "learning_rate": 5.928778686362438e-06, "loss": 0.009, "step": 3754 }, { "epoch": 0.9908982983775227, "grad_norm": 0.20518195629119873, "learning_rate": 5.9319440780796625e-06, "loss": 0.0061, "step": 3756 }, { "epoch": 0.9914259332541882, "grad_norm": 0.43299946188926697, "learning_rate": 5.935109469796887e-06, "loss": 0.0154, "step": 3758 }, { "epoch": 0.9919535681308534, "grad_norm": 0.08547072857618332, "learning_rate": 5.938274861514112e-06, "loss": 0.0144, "step": 3760 }, { "epoch": 0.9924812030075187, "grad_norm": 0.09816929697990417, "learning_rate": 5.941440253231337e-06, "loss": 0.0055, "step": 3762 }, { "epoch": 0.9930088378841841, "grad_norm": 0.06217197701334953, "learning_rate": 5.944605644948563e-06, "loss": 0.009, "step": 3764 }, { "epoch": 0.9935364727608494, "grad_norm": 0.415962278842926, "learning_rate": 5.947771036665788e-06, "loss": 0.0185, "step": 3766 }, { "epoch": 0.9940641076375148, "grad_norm": 1.4293839931488037, "learning_rate": 5.950936428383013e-06, "loss": 0.0099, "step": 3768 }, { "epoch": 0.9945917425141801, "grad_norm": 0.2198646515607834, "learning_rate": 5.954101820100238e-06, "loss": 0.0079, "step": 3770 }, { "epoch": 0.9951193773908455, "grad_norm": 0.4999811351299286, "learning_rate": 5.9572672118174625e-06, "loss": 0.0049, "step": 3772 }, { "epoch": 0.9956470122675108, "grad_norm": 0.848260223865509, "learning_rate": 5.960432603534688e-06, "loss": 0.0125, "step": 3774 }, { "epoch": 0.9961746471441763, "grad_norm": 0.05321720987558365, "learning_rate": 5.963597995251912e-06, "loss": 0.0096, "step": 3776 }, { "epoch": 0.9967022820208415, "grad_norm": 0.10288362205028534, "learning_rate": 5.966763386969137e-06, "loss": 0.0045, "step": 3778 }, { "epoch": 0.997229916897507, "grad_norm": 1.6024041175842285, "learning_rate": 5.969928778686363e-06, "loss": 0.0216, "step": 3780 }, { "epoch": 0.9977575517741722, "grad_norm": 0.1929457187652588, "learning_rate": 5.973094170403588e-06, "loss": 0.0073, "step": 3782 }, { "epoch": 0.9982851866508377, "grad_norm": 0.9122471213340759, "learning_rate": 5.976259562120813e-06, "loss": 0.0093, "step": 3784 }, { "epoch": 0.998812821527503, "grad_norm": 0.19691334664821625, "learning_rate": 5.9794249538380375e-06, "loss": 0.0057, "step": 3786 }, { "epoch": 0.9993404564041684, "grad_norm": 0.11809835582971573, "learning_rate": 5.982590345555262e-06, "loss": 0.0114, "step": 3788 }, { "epoch": 0.9998680912808336, "grad_norm": 0.08168607950210571, "learning_rate": 5.985755737272488e-06, "loss": 0.0044, "step": 3790 }, { "epoch": 1.0002638174383327, "grad_norm": 0.06260409951210022, "learning_rate": 5.988921128989712e-06, "loss": 0.0042, "step": 3792 }, { "epoch": 1.000791452314998, "grad_norm": 0.10383816808462143, "learning_rate": 5.992086520706938e-06, "loss": 0.0041, "step": 3794 }, { "epoch": 1.0013190871916633, "grad_norm": 0.4753197133541107, "learning_rate": 5.995251912424163e-06, "loss": 0.0147, "step": 3796 }, { "epoch": 1.0018467220683287, "grad_norm": 0.27113935351371765, "learning_rate": 5.998417304141388e-06, "loss": 0.0058, "step": 3798 }, { "epoch": 1.002374356944994, "grad_norm": 0.054416775703430176, "learning_rate": 5.999824144904598e-06, "loss": 0.0039, "step": 3800 }, { "epoch": 1.0029019918216595, "grad_norm": 0.52354496717453, "learning_rate": 5.999472434713796e-06, "loss": 0.0064, "step": 3802 }, { "epoch": 1.0034296266983247, "grad_norm": 0.20974940061569214, "learning_rate": 5.999120724522993e-06, "loss": 0.0104, "step": 3804 }, { "epoch": 1.00395726157499, "grad_norm": 0.14292187988758087, "learning_rate": 5.998769014332191e-06, "loss": 0.0059, "step": 3806 }, { "epoch": 1.0044848964516555, "grad_norm": 0.8864532113075256, "learning_rate": 5.998417304141388e-06, "loss": 0.0184, "step": 3808 }, { "epoch": 1.0050125313283207, "grad_norm": 0.28171616792678833, "learning_rate": 5.998065593950585e-06, "loss": 0.0105, "step": 3810 }, { "epoch": 1.005540166204986, "grad_norm": 0.05003759264945984, "learning_rate": 5.9977138837597825e-06, "loss": 0.006, "step": 3812 }, { "epoch": 1.0060678010816515, "grad_norm": 0.382224977016449, "learning_rate": 5.997362173568979e-06, "loss": 0.012, "step": 3814 }, { "epoch": 1.006595435958317, "grad_norm": 1.132320761680603, "learning_rate": 5.997010463378176e-06, "loss": 0.0088, "step": 3816 }, { "epoch": 1.007123070834982, "grad_norm": 0.36651158332824707, "learning_rate": 5.996658753187374e-06, "loss": 0.0068, "step": 3818 }, { "epoch": 1.0076507057116475, "grad_norm": 0.4755989909172058, "learning_rate": 5.996307042996571e-06, "loss": 0.0199, "step": 3820 }, { "epoch": 1.008178340588313, "grad_norm": 0.052660223096609116, "learning_rate": 5.995955332805768e-06, "loss": 0.0094, "step": 3822 }, { "epoch": 1.0087059754649783, "grad_norm": 0.22248491644859314, "learning_rate": 5.995603622614965e-06, "loss": 0.0093, "step": 3824 }, { "epoch": 1.0092336103416435, "grad_norm": 0.6083714365959167, "learning_rate": 5.995251912424163e-06, "loss": 0.01, "step": 3826 }, { "epoch": 1.009761245218309, "grad_norm": 0.46941956877708435, "learning_rate": 5.99490020223336e-06, "loss": 0.0092, "step": 3828 }, { "epoch": 1.0102888800949743, "grad_norm": 0.14832627773284912, "learning_rate": 5.994548492042557e-06, "loss": 0.0071, "step": 3830 }, { "epoch": 1.0108165149716397, "grad_norm": 0.567726731300354, "learning_rate": 5.9941967818517545e-06, "loss": 0.009, "step": 3832 }, { "epoch": 1.011344149848305, "grad_norm": 0.21542797982692719, "learning_rate": 5.9938450716609515e-06, "loss": 0.0152, "step": 3834 }, { "epoch": 1.0118717847249703, "grad_norm": 0.5408229231834412, "learning_rate": 5.993493361470149e-06, "loss": 0.0139, "step": 3836 }, { "epoch": 1.0123994196016357, "grad_norm": 0.2815437912940979, "learning_rate": 5.993141651279345e-06, "loss": 0.0139, "step": 3838 }, { "epoch": 1.0129270544783011, "grad_norm": 0.4970417618751526, "learning_rate": 5.992789941088543e-06, "loss": 0.0084, "step": 3840 }, { "epoch": 1.0134546893549663, "grad_norm": 0.26554080843925476, "learning_rate": 5.992438230897741e-06, "loss": 0.0056, "step": 3842 }, { "epoch": 1.0139823242316317, "grad_norm": 0.5267416834831238, "learning_rate": 5.992086520706938e-06, "loss": 0.0114, "step": 3844 }, { "epoch": 1.014509959108297, "grad_norm": 0.06048185005784035, "learning_rate": 5.991734810516135e-06, "loss": 0.0111, "step": 3846 }, { "epoch": 1.0150375939849625, "grad_norm": 0.07876840978860855, "learning_rate": 5.991383100325332e-06, "loss": 0.0043, "step": 3848 }, { "epoch": 1.0155652288616277, "grad_norm": 0.32011768221855164, "learning_rate": 5.99103139013453e-06, "loss": 0.0092, "step": 3850 }, { "epoch": 1.016092863738293, "grad_norm": 0.3631177246570587, "learning_rate": 5.990679679943727e-06, "loss": 0.0106, "step": 3852 }, { "epoch": 1.0166204986149585, "grad_norm": 0.12909099459648132, "learning_rate": 5.9903279697529235e-06, "loss": 0.0043, "step": 3854 }, { "epoch": 1.0171481334916237, "grad_norm": 0.41997507214546204, "learning_rate": 5.989976259562121e-06, "loss": 0.0111, "step": 3856 }, { "epoch": 1.017675768368289, "grad_norm": 0.7984555959701538, "learning_rate": 5.989624549371318e-06, "loss": 0.008, "step": 3858 }, { "epoch": 1.0182034032449545, "grad_norm": 0.46521198749542236, "learning_rate": 5.989272839180515e-06, "loss": 0.0079, "step": 3860 }, { "epoch": 1.01873103812162, "grad_norm": 0.04624466598033905, "learning_rate": 5.988921128989712e-06, "loss": 0.0056, "step": 3862 }, { "epoch": 1.019258672998285, "grad_norm": 1.5686753988265991, "learning_rate": 5.98856941879891e-06, "loss": 0.006, "step": 3864 }, { "epoch": 1.0197863078749505, "grad_norm": 0.29982513189315796, "learning_rate": 5.988217708608107e-06, "loss": 0.0048, "step": 3866 }, { "epoch": 1.020313942751616, "grad_norm": 1.2359626293182373, "learning_rate": 5.987865998417304e-06, "loss": 0.0222, "step": 3868 }, { "epoch": 1.0208415776282813, "grad_norm": 0.03191059082746506, "learning_rate": 5.987514288226502e-06, "loss": 0.0084, "step": 3870 }, { "epoch": 1.0213692125049465, "grad_norm": 0.09218622744083405, "learning_rate": 5.987162578035699e-06, "loss": 0.0055, "step": 3872 }, { "epoch": 1.021896847381612, "grad_norm": 0.0943019762635231, "learning_rate": 5.9868108678448965e-06, "loss": 0.004, "step": 3874 }, { "epoch": 1.0224244822582773, "grad_norm": 0.835507333278656, "learning_rate": 5.986459157654093e-06, "loss": 0.0059, "step": 3876 }, { "epoch": 1.0229521171349427, "grad_norm": 0.19655011594295502, "learning_rate": 5.98610744746329e-06, "loss": 0.0044, "step": 3878 }, { "epoch": 1.023479752011608, "grad_norm": 0.7021778225898743, "learning_rate": 5.985755737272488e-06, "loss": 0.0067, "step": 3880 }, { "epoch": 1.0240073868882733, "grad_norm": 0.40728697180747986, "learning_rate": 5.985404027081684e-06, "loss": 0.0064, "step": 3882 }, { "epoch": 1.0245350217649387, "grad_norm": 0.13840322196483612, "learning_rate": 5.985052316890882e-06, "loss": 0.0073, "step": 3884 }, { "epoch": 1.025062656641604, "grad_norm": 0.03875766322016716, "learning_rate": 5.984700606700079e-06, "loss": 0.0036, "step": 3886 }, { "epoch": 1.0255902915182693, "grad_norm": 0.06649758666753769, "learning_rate": 5.984348896509277e-06, "loss": 0.0039, "step": 3888 }, { "epoch": 1.0261179263949347, "grad_norm": 0.35281917452812195, "learning_rate": 5.983997186318474e-06, "loss": 0.0135, "step": 3890 }, { "epoch": 1.0266455612716001, "grad_norm": 1.9073301553726196, "learning_rate": 5.983645476127671e-06, "loss": 0.0211, "step": 3892 }, { "epoch": 1.0271731961482653, "grad_norm": 2.2267749309539795, "learning_rate": 5.9832937659368685e-06, "loss": 0.0092, "step": 3894 }, { "epoch": 1.0277008310249307, "grad_norm": 0.08891505002975464, "learning_rate": 5.9829420557460655e-06, "loss": 0.0038, "step": 3896 }, { "epoch": 1.0282284659015961, "grad_norm": 0.3697172701358795, "learning_rate": 5.982590345555262e-06, "loss": 0.0059, "step": 3898 }, { "epoch": 1.0287561007782615, "grad_norm": 0.33955007791519165, "learning_rate": 5.98223863536446e-06, "loss": 0.0173, "step": 3900 }, { "epoch": 1.0292837356549267, "grad_norm": 0.6357468962669373, "learning_rate": 5.981886925173657e-06, "loss": 0.0088, "step": 3902 }, { "epoch": 1.029811370531592, "grad_norm": 0.4015860855579376, "learning_rate": 5.981535214982855e-06, "loss": 0.0136, "step": 3904 }, { "epoch": 1.0303390054082575, "grad_norm": 0.07561653852462769, "learning_rate": 5.981183504792051e-06, "loss": 0.0038, "step": 3906 }, { "epoch": 1.030866640284923, "grad_norm": 0.5430837273597717, "learning_rate": 5.980831794601249e-06, "loss": 0.0094, "step": 3908 }, { "epoch": 1.031394275161588, "grad_norm": 0.24814987182617188, "learning_rate": 5.980480084410446e-06, "loss": 0.0116, "step": 3910 }, { "epoch": 1.0319219100382535, "grad_norm": 0.6749204993247986, "learning_rate": 5.980128374219643e-06, "loss": 0.0121, "step": 3912 }, { "epoch": 1.032449544914919, "grad_norm": 0.4521927535533905, "learning_rate": 5.979776664028841e-06, "loss": 0.0078, "step": 3914 }, { "epoch": 1.0329771797915843, "grad_norm": 0.12956710159778595, "learning_rate": 5.9794249538380375e-06, "loss": 0.0153, "step": 3916 }, { "epoch": 1.0335048146682495, "grad_norm": 0.6806002259254456, "learning_rate": 5.979073243647235e-06, "loss": 0.0081, "step": 3918 }, { "epoch": 1.034032449544915, "grad_norm": 0.08512496948242188, "learning_rate": 5.9787215334564314e-06, "loss": 0.0045, "step": 3920 }, { "epoch": 1.0345600844215803, "grad_norm": 0.14165176451206207, "learning_rate": 5.978369823265629e-06, "loss": 0.005, "step": 3922 }, { "epoch": 1.0350877192982457, "grad_norm": 0.4588511288166046, "learning_rate": 5.978018113074826e-06, "loss": 0.017, "step": 3924 }, { "epoch": 1.035615354174911, "grad_norm": 0.07829814404249191, "learning_rate": 5.977666402884024e-06, "loss": 0.004, "step": 3926 }, { "epoch": 1.0361429890515763, "grad_norm": 0.6030765175819397, "learning_rate": 5.977314692693221e-06, "loss": 0.0093, "step": 3928 }, { "epoch": 1.0366706239282417, "grad_norm": 0.2706483006477356, "learning_rate": 5.976962982502418e-06, "loss": 0.0084, "step": 3930 }, { "epoch": 1.037198258804907, "grad_norm": 0.1788373589515686, "learning_rate": 5.976611272311616e-06, "loss": 0.0044, "step": 3932 }, { "epoch": 1.0377258936815723, "grad_norm": 0.06985223293304443, "learning_rate": 5.976259562120813e-06, "loss": 0.0041, "step": 3934 }, { "epoch": 1.0382535285582377, "grad_norm": 0.17545880377292633, "learning_rate": 5.97590785193001e-06, "loss": 0.0178, "step": 3936 }, { "epoch": 1.0387811634349031, "grad_norm": 0.2816494107246399, "learning_rate": 5.975556141739207e-06, "loss": 0.012, "step": 3938 }, { "epoch": 1.0393087983115683, "grad_norm": 1.0950720310211182, "learning_rate": 5.975204431548404e-06, "loss": 0.0096, "step": 3940 }, { "epoch": 1.0398364331882337, "grad_norm": 0.9591851234436035, "learning_rate": 5.974852721357601e-06, "loss": 0.01, "step": 3942 }, { "epoch": 1.0403640680648991, "grad_norm": 0.034522272646427155, "learning_rate": 5.974501011166798e-06, "loss": 0.004, "step": 3944 }, { "epoch": 1.0408917029415645, "grad_norm": 0.3708389103412628, "learning_rate": 5.974149300975996e-06, "loss": 0.0045, "step": 3946 }, { "epoch": 1.0414193378182297, "grad_norm": 0.7349219918251038, "learning_rate": 5.973797590785193e-06, "loss": 0.0196, "step": 3948 }, { "epoch": 1.0419469726948951, "grad_norm": 0.3598858714103699, "learning_rate": 5.97344588059439e-06, "loss": 0.0121, "step": 3950 }, { "epoch": 1.0424746075715605, "grad_norm": 0.27487486600875854, "learning_rate": 5.973094170403588e-06, "loss": 0.0047, "step": 3952 }, { "epoch": 1.043002242448226, "grad_norm": 1.0047138929367065, "learning_rate": 5.972742460212785e-06, "loss": 0.0091, "step": 3954 }, { "epoch": 1.043529877324891, "grad_norm": 0.24478255212306976, "learning_rate": 5.9723907500219825e-06, "loss": 0.0099, "step": 3956 }, { "epoch": 1.0440575122015565, "grad_norm": 2.2695376873016357, "learning_rate": 5.9720390398311795e-06, "loss": 0.0113, "step": 3958 }, { "epoch": 1.044585147078222, "grad_norm": 2.8563380241394043, "learning_rate": 5.971687329640376e-06, "loss": 0.0087, "step": 3960 }, { "epoch": 1.045112781954887, "grad_norm": 0.08181760460138321, "learning_rate": 5.971335619449574e-06, "loss": 0.0124, "step": 3962 }, { "epoch": 1.0456404168315525, "grad_norm": 0.24792265892028809, "learning_rate": 5.970983909258771e-06, "loss": 0.0075, "step": 3964 }, { "epoch": 1.046168051708218, "grad_norm": 0.22064250707626343, "learning_rate": 5.970632199067968e-06, "loss": 0.0059, "step": 3966 }, { "epoch": 1.0466956865848833, "grad_norm": 1.0509603023529053, "learning_rate": 5.970280488877165e-06, "loss": 0.0091, "step": 3968 }, { "epoch": 1.0472233214615485, "grad_norm": 0.39126676321029663, "learning_rate": 5.969928778686363e-06, "loss": 0.0083, "step": 3970 }, { "epoch": 1.047750956338214, "grad_norm": 0.5521587133407593, "learning_rate": 5.96957706849556e-06, "loss": 0.0059, "step": 3972 }, { "epoch": 1.0482785912148793, "grad_norm": 0.2689548432826996, "learning_rate": 5.969225358304757e-06, "loss": 0.0044, "step": 3974 }, { "epoch": 1.0488062260915447, "grad_norm": 0.07628428936004639, "learning_rate": 5.9688736481139546e-06, "loss": 0.0136, "step": 3976 }, { "epoch": 1.04933386096821, "grad_norm": 0.7169488072395325, "learning_rate": 5.9685219379231515e-06, "loss": 0.0092, "step": 3978 }, { "epoch": 1.0498614958448753, "grad_norm": 1.673007607460022, "learning_rate": 5.9681702277323485e-06, "loss": 0.0172, "step": 3980 }, { "epoch": 1.0503891307215407, "grad_norm": 0.5763864517211914, "learning_rate": 5.9678185175415454e-06, "loss": 0.0051, "step": 3982 }, { "epoch": 1.0509167655982061, "grad_norm": 0.31183356046676636, "learning_rate": 5.967466807350743e-06, "loss": 0.0038, "step": 3984 }, { "epoch": 1.0514444004748713, "grad_norm": 0.24390852451324463, "learning_rate": 5.967115097159941e-06, "loss": 0.0042, "step": 3986 }, { "epoch": 1.0519720353515367, "grad_norm": 0.5743184685707092, "learning_rate": 5.966763386969137e-06, "loss": 0.0067, "step": 3988 }, { "epoch": 1.0524996702282021, "grad_norm": 0.040596459060907364, "learning_rate": 5.966411676778335e-06, "loss": 0.0048, "step": 3990 }, { "epoch": 1.0530273051048675, "grad_norm": 0.030759939923882484, "learning_rate": 5.966059966587532e-06, "loss": 0.0038, "step": 3992 }, { "epoch": 1.0535549399815327, "grad_norm": 0.060099899768829346, "learning_rate": 5.96570825639673e-06, "loss": 0.0038, "step": 3994 }, { "epoch": 1.0540825748581981, "grad_norm": 0.1790747344493866, "learning_rate": 5.965356546205927e-06, "loss": 0.0038, "step": 3996 }, { "epoch": 1.0546102097348635, "grad_norm": 0.03776716813445091, "learning_rate": 5.965004836015124e-06, "loss": 0.0036, "step": 3998 }, { "epoch": 1.055137844611529, "grad_norm": 0.20031189918518066, "learning_rate": 5.964653125824321e-06, "loss": 0.0034, "step": 4000 }, { "epoch": 1.055137844611529, "eval_loss": 0.006768143270164728, "eval_runtime": 307.7917, "eval_samples_per_second": 700.607, "eval_steps_per_second": 87.579, "step": 4000 }, { "epoch": 1.0556654794881941, "grad_norm": 0.0761580541729927, "learning_rate": 5.9643014156335175e-06, "loss": 0.0055, "step": 4002 }, { "epoch": 1.0561931143648595, "grad_norm": 1.0917761325836182, "learning_rate": 5.963949705442715e-06, "loss": 0.0127, "step": 4004 }, { "epoch": 1.056720749241525, "grad_norm": 0.06665147840976715, "learning_rate": 5.963597995251912e-06, "loss": 0.0032, "step": 4006 }, { "epoch": 1.05724838411819, "grad_norm": 0.0489024892449379, "learning_rate": 5.96324628506111e-06, "loss": 0.0035, "step": 4008 }, { "epoch": 1.0577760189948555, "grad_norm": 0.03955915570259094, "learning_rate": 5.962894574870307e-06, "loss": 0.0073, "step": 4010 }, { "epoch": 1.058303653871521, "grad_norm": 0.1288267821073532, "learning_rate": 5.962542864679504e-06, "loss": 0.006, "step": 4012 }, { "epoch": 1.0588312887481863, "grad_norm": 0.10672944039106369, "learning_rate": 5.962191154488702e-06, "loss": 0.0042, "step": 4014 }, { "epoch": 1.0593589236248515, "grad_norm": 0.5358752608299255, "learning_rate": 5.961839444297899e-06, "loss": 0.0124, "step": 4016 }, { "epoch": 1.059886558501517, "grad_norm": 0.055051758885383606, "learning_rate": 5.961487734107096e-06, "loss": 0.0033, "step": 4018 }, { "epoch": 1.0604141933781823, "grad_norm": 0.9549580812454224, "learning_rate": 5.9611360239162935e-06, "loss": 0.0054, "step": 4020 }, { "epoch": 1.0609418282548477, "grad_norm": 1.44691002368927, "learning_rate": 5.96078431372549e-06, "loss": 0.0153, "step": 4022 }, { "epoch": 1.061469463131513, "grad_norm": 0.34928545355796814, "learning_rate": 5.960432603534688e-06, "loss": 0.0039, "step": 4024 }, { "epoch": 1.0619970980081783, "grad_norm": 3.341787338256836, "learning_rate": 5.960080893343884e-06, "loss": 0.0038, "step": 4026 }, { "epoch": 1.0625247328848437, "grad_norm": 0.33038803935050964, "learning_rate": 5.959729183153082e-06, "loss": 0.007, "step": 4028 }, { "epoch": 1.0630523677615091, "grad_norm": 0.5186553001403809, "learning_rate": 5.959377472962279e-06, "loss": 0.0205, "step": 4030 }, { "epoch": 1.0635800026381743, "grad_norm": 0.766558825969696, "learning_rate": 5.959025762771476e-06, "loss": 0.0082, "step": 4032 }, { "epoch": 1.0641076375148397, "grad_norm": 0.1227744072675705, "learning_rate": 5.958674052580674e-06, "loss": 0.0094, "step": 4034 }, { "epoch": 1.0646352723915051, "grad_norm": 0.05983549728989601, "learning_rate": 5.958322342389871e-06, "loss": 0.0036, "step": 4036 }, { "epoch": 1.0651629072681703, "grad_norm": 0.2651796340942383, "learning_rate": 5.9579706321990686e-06, "loss": 0.0096, "step": 4038 }, { "epoch": 1.0656905421448357, "grad_norm": 0.26678019762039185, "learning_rate": 5.957618922008265e-06, "loss": 0.0089, "step": 4040 }, { "epoch": 1.0662181770215011, "grad_norm": 0.16620728373527527, "learning_rate": 5.9572672118174625e-06, "loss": 0.0038, "step": 4042 }, { "epoch": 1.0667458118981665, "grad_norm": 0.05890706926584244, "learning_rate": 5.95691550162666e-06, "loss": 0.011, "step": 4044 }, { "epoch": 1.0672734467748317, "grad_norm": 0.3212912678718567, "learning_rate": 5.956563791435857e-06, "loss": 0.004, "step": 4046 }, { "epoch": 1.0678010816514971, "grad_norm": 0.36914828419685364, "learning_rate": 5.956212081245054e-06, "loss": 0.0139, "step": 4048 }, { "epoch": 1.0683287165281625, "grad_norm": 0.0422121025621891, "learning_rate": 5.955860371054251e-06, "loss": 0.0137, "step": 4050 }, { "epoch": 1.068856351404828, "grad_norm": 0.3930920958518982, "learning_rate": 5.955508660863449e-06, "loss": 0.0116, "step": 4052 }, { "epoch": 1.0693839862814931, "grad_norm": 0.054515037685632706, "learning_rate": 5.955156950672646e-06, "loss": 0.0075, "step": 4054 }, { "epoch": 1.0699116211581585, "grad_norm": 0.03185296803712845, "learning_rate": 5.954805240481843e-06, "loss": 0.0114, "step": 4056 }, { "epoch": 1.070439256034824, "grad_norm": 0.12916222214698792, "learning_rate": 5.954453530291041e-06, "loss": 0.0053, "step": 4058 }, { "epoch": 1.0709668909114893, "grad_norm": 0.04007614403963089, "learning_rate": 5.954101820100238e-06, "loss": 0.0175, "step": 4060 }, { "epoch": 1.0714945257881545, "grad_norm": 0.14285892248153687, "learning_rate": 5.9537501099094345e-06, "loss": 0.0138, "step": 4062 }, { "epoch": 1.07202216066482, "grad_norm": 0.02643457241356373, "learning_rate": 5.9533983997186315e-06, "loss": 0.0077, "step": 4064 }, { "epoch": 1.0725497955414853, "grad_norm": 0.037268463522195816, "learning_rate": 5.953046689527829e-06, "loss": 0.0181, "step": 4066 }, { "epoch": 1.0730774304181507, "grad_norm": 0.8003777861595154, "learning_rate": 5.952694979337027e-06, "loss": 0.0092, "step": 4068 }, { "epoch": 1.073605065294816, "grad_norm": 0.47269120812416077, "learning_rate": 5.952343269146223e-06, "loss": 0.0062, "step": 4070 }, { "epoch": 1.0741327001714813, "grad_norm": 0.20298118889331818, "learning_rate": 5.951991558955421e-06, "loss": 0.0047, "step": 4072 }, { "epoch": 1.0746603350481467, "grad_norm": 0.2964349091053009, "learning_rate": 5.951639848764618e-06, "loss": 0.0075, "step": 4074 }, { "epoch": 1.0751879699248121, "grad_norm": 0.9206012487411499, "learning_rate": 5.951288138573816e-06, "loss": 0.0106, "step": 4076 }, { "epoch": 1.0757156048014773, "grad_norm": 0.1577734649181366, "learning_rate": 5.950936428383013e-06, "loss": 0.0108, "step": 4078 }, { "epoch": 1.0762432396781427, "grad_norm": 0.3397778868675232, "learning_rate": 5.95058471819221e-06, "loss": 0.0065, "step": 4080 }, { "epoch": 1.0767708745548081, "grad_norm": 0.11701516807079315, "learning_rate": 5.9502330080014074e-06, "loss": 0.0148, "step": 4082 }, { "epoch": 1.0772985094314733, "grad_norm": 1.029966115951538, "learning_rate": 5.949881297810604e-06, "loss": 0.0097, "step": 4084 }, { "epoch": 1.0778261443081387, "grad_norm": 0.1938423067331314, "learning_rate": 5.949529587619801e-06, "loss": 0.0046, "step": 4086 }, { "epoch": 1.0783537791848041, "grad_norm": 0.7441293597221375, "learning_rate": 5.949177877428998e-06, "loss": 0.0111, "step": 4088 }, { "epoch": 1.0788814140614695, "grad_norm": 0.4050735533237457, "learning_rate": 5.948826167238196e-06, "loss": 0.0067, "step": 4090 }, { "epoch": 1.0794090489381347, "grad_norm": 0.6188096404075623, "learning_rate": 5.948474457047393e-06, "loss": 0.0118, "step": 4092 }, { "epoch": 1.0799366838148001, "grad_norm": 0.5200625061988831, "learning_rate": 5.94812274685659e-06, "loss": 0.0097, "step": 4094 }, { "epoch": 1.0804643186914655, "grad_norm": 0.25236326456069946, "learning_rate": 5.947771036665788e-06, "loss": 0.0075, "step": 4096 }, { "epoch": 1.080991953568131, "grad_norm": 0.1802530288696289, "learning_rate": 5.947419326474985e-06, "loss": 0.0035, "step": 4098 }, { "epoch": 1.0815195884447961, "grad_norm": 0.15959151089191437, "learning_rate": 5.947067616284182e-06, "loss": 0.0041, "step": 4100 }, { "epoch": 1.0820472233214615, "grad_norm": 0.7127006649971008, "learning_rate": 5.9467159060933795e-06, "loss": 0.0069, "step": 4102 }, { "epoch": 1.082574858198127, "grad_norm": 0.17579476535320282, "learning_rate": 5.9463641959025765e-06, "loss": 0.0112, "step": 4104 }, { "epoch": 1.0831024930747923, "grad_norm": 0.4253542721271515, "learning_rate": 5.946012485711774e-06, "loss": 0.0088, "step": 4106 }, { "epoch": 1.0836301279514575, "grad_norm": 0.03827614337205887, "learning_rate": 5.94566077552097e-06, "loss": 0.0072, "step": 4108 }, { "epoch": 1.084157762828123, "grad_norm": 1.2263107299804688, "learning_rate": 5.945309065330168e-06, "loss": 0.0149, "step": 4110 }, { "epoch": 1.0846853977047883, "grad_norm": 0.17847813665866852, "learning_rate": 5.944957355139365e-06, "loss": 0.0047, "step": 4112 }, { "epoch": 1.0852130325814535, "grad_norm": 0.05754408985376358, "learning_rate": 5.944605644948563e-06, "loss": 0.0063, "step": 4114 }, { "epoch": 1.085740667458119, "grad_norm": 0.48372650146484375, "learning_rate": 5.94425393475776e-06, "loss": 0.0044, "step": 4116 }, { "epoch": 1.0862683023347843, "grad_norm": 0.5806470513343811, "learning_rate": 5.943902224566957e-06, "loss": 0.0062, "step": 4118 }, { "epoch": 1.0867959372114497, "grad_norm": 0.47058114409446716, "learning_rate": 5.943550514376155e-06, "loss": 0.0056, "step": 4120 }, { "epoch": 1.0873235720881151, "grad_norm": 1.0434728860855103, "learning_rate": 5.943198804185351e-06, "loss": 0.0082, "step": 4122 }, { "epoch": 1.0878512069647803, "grad_norm": 0.23054403066635132, "learning_rate": 5.9428470939945485e-06, "loss": 0.0097, "step": 4124 }, { "epoch": 1.0883788418414457, "grad_norm": 0.0332040935754776, "learning_rate": 5.942495383803746e-06, "loss": 0.0048, "step": 4126 }, { "epoch": 1.0889064767181111, "grad_norm": 0.7285342812538147, "learning_rate": 5.942143673612943e-06, "loss": 0.0088, "step": 4128 }, { "epoch": 1.0894341115947763, "grad_norm": 0.03758569061756134, "learning_rate": 5.94179196342214e-06, "loss": 0.0048, "step": 4130 }, { "epoch": 1.0899617464714417, "grad_norm": 0.09739779680967331, "learning_rate": 5.941440253231337e-06, "loss": 0.0095, "step": 4132 }, { "epoch": 1.0904893813481071, "grad_norm": 0.05845189094543457, "learning_rate": 5.941088543040535e-06, "loss": 0.0099, "step": 4134 }, { "epoch": 1.0910170162247725, "grad_norm": 0.5323328971862793, "learning_rate": 5.940736832849732e-06, "loss": 0.0097, "step": 4136 }, { "epoch": 1.0915446511014377, "grad_norm": 0.1795583814382553, "learning_rate": 5.940385122658929e-06, "loss": 0.0091, "step": 4138 }, { "epoch": 1.0920722859781031, "grad_norm": 0.09979342669248581, "learning_rate": 5.940033412468127e-06, "loss": 0.0032, "step": 4140 }, { "epoch": 1.0925999208547685, "grad_norm": 0.10667362809181213, "learning_rate": 5.939681702277324e-06, "loss": 0.0038, "step": 4142 }, { "epoch": 1.093127555731434, "grad_norm": 0.04990679398179054, "learning_rate": 5.9393299920865214e-06, "loss": 0.0039, "step": 4144 }, { "epoch": 1.0936551906080991, "grad_norm": 0.712762176990509, "learning_rate": 5.9389782818957175e-06, "loss": 0.0149, "step": 4146 }, { "epoch": 1.0941828254847645, "grad_norm": 0.09826178103685379, "learning_rate": 5.938626571704915e-06, "loss": 0.0039, "step": 4148 }, { "epoch": 1.09471046036143, "grad_norm": 0.7769744396209717, "learning_rate": 5.938274861514112e-06, "loss": 0.0103, "step": 4150 }, { "epoch": 1.0952380952380953, "grad_norm": 0.17245380580425262, "learning_rate": 5.937923151323309e-06, "loss": 0.0083, "step": 4152 }, { "epoch": 1.0957657301147605, "grad_norm": 0.1423395425081253, "learning_rate": 5.937571441132507e-06, "loss": 0.0187, "step": 4154 }, { "epoch": 1.096293364991426, "grad_norm": 0.9081747531890869, "learning_rate": 5.937219730941704e-06, "loss": 0.0078, "step": 4156 }, { "epoch": 1.0968209998680913, "grad_norm": 0.5149850845336914, "learning_rate": 5.936868020750902e-06, "loss": 0.0148, "step": 4158 }, { "epoch": 1.0973486347447565, "grad_norm": 0.07755296677350998, "learning_rate": 5.936516310560099e-06, "loss": 0.0042, "step": 4160 }, { "epoch": 1.097876269621422, "grad_norm": 0.07628656178712845, "learning_rate": 5.936164600369296e-06, "loss": 0.0083, "step": 4162 }, { "epoch": 1.0984039044980873, "grad_norm": 0.23485715687274933, "learning_rate": 5.9358128901784935e-06, "loss": 0.0036, "step": 4164 }, { "epoch": 1.0989315393747527, "grad_norm": 0.46286705136299133, "learning_rate": 5.9354611799876905e-06, "loss": 0.0049, "step": 4166 }, { "epoch": 1.099459174251418, "grad_norm": 0.14087606966495514, "learning_rate": 5.935109469796887e-06, "loss": 0.0177, "step": 4168 }, { "epoch": 1.0999868091280833, "grad_norm": 0.331839382648468, "learning_rate": 5.934757759606084e-06, "loss": 0.0084, "step": 4170 }, { "epoch": 1.1005144440047487, "grad_norm": 0.030434500426054, "learning_rate": 5.934406049415282e-06, "loss": 0.0115, "step": 4172 }, { "epoch": 1.1010420788814141, "grad_norm": 0.05513356998562813, "learning_rate": 5.934054339224479e-06, "loss": 0.003, "step": 4174 }, { "epoch": 1.1015697137580793, "grad_norm": 0.23737043142318726, "learning_rate": 5.933702629033676e-06, "loss": 0.0132, "step": 4176 }, { "epoch": 1.1020973486347447, "grad_norm": 0.19290249049663544, "learning_rate": 5.933350918842874e-06, "loss": 0.007, "step": 4178 }, { "epoch": 1.1026249835114101, "grad_norm": 0.24390675127506256, "learning_rate": 5.932999208652071e-06, "loss": 0.0152, "step": 4180 }, { "epoch": 1.1031526183880755, "grad_norm": 0.20332658290863037, "learning_rate": 5.932647498461268e-06, "loss": 0.0044, "step": 4182 }, { "epoch": 1.1036802532647407, "grad_norm": 0.2555258572101593, "learning_rate": 5.9322957882704656e-06, "loss": 0.0187, "step": 4184 }, { "epoch": 1.1042078881414061, "grad_norm": 0.06230524554848671, "learning_rate": 5.9319440780796625e-06, "loss": 0.0071, "step": 4186 }, { "epoch": 1.1047355230180715, "grad_norm": 1.3664534091949463, "learning_rate": 5.93159236788886e-06, "loss": 0.0074, "step": 4188 }, { "epoch": 1.1052631578947367, "grad_norm": 0.0786578506231308, "learning_rate": 5.9312406576980564e-06, "loss": 0.0038, "step": 4190 }, { "epoch": 1.1057907927714021, "grad_norm": 0.08665662258863449, "learning_rate": 5.930888947507254e-06, "loss": 0.0068, "step": 4192 }, { "epoch": 1.1063184276480675, "grad_norm": 0.4641355872154236, "learning_rate": 5.930537237316451e-06, "loss": 0.0037, "step": 4194 }, { "epoch": 1.106846062524733, "grad_norm": 0.35535696148872375, "learning_rate": 5.930185527125649e-06, "loss": 0.0123, "step": 4196 }, { "epoch": 1.1073736974013983, "grad_norm": 0.06691774725914001, "learning_rate": 5.929833816934846e-06, "loss": 0.0071, "step": 4198 }, { "epoch": 1.1079013322780635, "grad_norm": 0.4141412079334259, "learning_rate": 5.929482106744043e-06, "loss": 0.0046, "step": 4200 }, { "epoch": 1.108428967154729, "grad_norm": 0.07766465097665787, "learning_rate": 5.929130396553241e-06, "loss": 0.0077, "step": 4202 }, { "epoch": 1.1089566020313943, "grad_norm": 0.7818406820297241, "learning_rate": 5.928778686362438e-06, "loss": 0.0037, "step": 4204 }, { "epoch": 1.1094842369080595, "grad_norm": 0.5902292132377625, "learning_rate": 5.928426976171635e-06, "loss": 0.005, "step": 4206 }, { "epoch": 1.110011871784725, "grad_norm": 0.043587442487478256, "learning_rate": 5.9280752659808315e-06, "loss": 0.0037, "step": 4208 }, { "epoch": 1.1105395066613903, "grad_norm": 0.13054034113883972, "learning_rate": 5.927723555790029e-06, "loss": 0.0112, "step": 4210 }, { "epoch": 1.1110671415380557, "grad_norm": 0.03394649177789688, "learning_rate": 5.927371845599226e-06, "loss": 0.0029, "step": 4212 }, { "epoch": 1.111594776414721, "grad_norm": 1.5376572608947754, "learning_rate": 5.927020135408423e-06, "loss": 0.0228, "step": 4214 }, { "epoch": 1.1121224112913863, "grad_norm": 0.041744161397218704, "learning_rate": 5.926668425217621e-06, "loss": 0.0083, "step": 4216 }, { "epoch": 1.1126500461680517, "grad_norm": 0.2708398401737213, "learning_rate": 5.926316715026818e-06, "loss": 0.0145, "step": 4218 }, { "epoch": 1.1131776810447171, "grad_norm": 0.11400587856769562, "learning_rate": 5.925965004836015e-06, "loss": 0.005, "step": 4220 }, { "epoch": 1.1137053159213823, "grad_norm": 0.659196674823761, "learning_rate": 5.925613294645213e-06, "loss": 0.0042, "step": 4222 }, { "epoch": 1.1142329507980477, "grad_norm": 0.38809943199157715, "learning_rate": 5.92526158445441e-06, "loss": 0.006, "step": 4224 }, { "epoch": 1.1147605856747131, "grad_norm": 0.0365917794406414, "learning_rate": 5.9249098742636075e-06, "loss": 0.0032, "step": 4226 }, { "epoch": 1.1152882205513786, "grad_norm": 1.3015064001083374, "learning_rate": 5.924558164072804e-06, "loss": 0.0047, "step": 4228 }, { "epoch": 1.1158158554280437, "grad_norm": 0.0365254171192646, "learning_rate": 5.924206453882001e-06, "loss": 0.0051, "step": 4230 }, { "epoch": 1.1163434903047091, "grad_norm": 0.38866597414016724, "learning_rate": 5.923854743691198e-06, "loss": 0.0044, "step": 4232 }, { "epoch": 1.1168711251813745, "grad_norm": 0.09240370988845825, "learning_rate": 5.923503033500396e-06, "loss": 0.0125, "step": 4234 }, { "epoch": 1.1173987600580397, "grad_norm": 0.451965868473053, "learning_rate": 5.923151323309593e-06, "loss": 0.0043, "step": 4236 }, { "epoch": 1.1179263949347051, "grad_norm": 0.027691366150975227, "learning_rate": 5.92279961311879e-06, "loss": 0.0067, "step": 4238 }, { "epoch": 1.1184540298113705, "grad_norm": 1.8160966634750366, "learning_rate": 5.922447902927988e-06, "loss": 0.0157, "step": 4240 }, { "epoch": 1.118981664688036, "grad_norm": 0.025669081136584282, "learning_rate": 5.922096192737185e-06, "loss": 0.0115, "step": 4242 }, { "epoch": 1.1195092995647011, "grad_norm": 0.19953106343746185, "learning_rate": 5.921744482546382e-06, "loss": 0.0099, "step": 4244 }, { "epoch": 1.1200369344413665, "grad_norm": 0.16344329714775085, "learning_rate": 5.9213927723555796e-06, "loss": 0.0033, "step": 4246 }, { "epoch": 1.120564569318032, "grad_norm": 0.2005971074104309, "learning_rate": 5.9210410621647765e-06, "loss": 0.0046, "step": 4248 }, { "epoch": 1.1210922041946974, "grad_norm": 0.5159090161323547, "learning_rate": 5.9206893519739735e-06, "loss": 0.0073, "step": 4250 }, { "epoch": 1.1216198390713625, "grad_norm": 0.04949517548084259, "learning_rate": 5.92033764178317e-06, "loss": 0.0059, "step": 4252 }, { "epoch": 1.122147473948028, "grad_norm": 0.16475045680999756, "learning_rate": 5.919985931592368e-06, "loss": 0.0031, "step": 4254 }, { "epoch": 1.1226751088246933, "grad_norm": 0.10154606401920319, "learning_rate": 5.919634221401565e-06, "loss": 0.0028, "step": 4256 }, { "epoch": 1.1232027437013588, "grad_norm": 1.320400357246399, "learning_rate": 5.919282511210762e-06, "loss": 0.007, "step": 4258 }, { "epoch": 1.123730378578024, "grad_norm": 0.11165601760149002, "learning_rate": 5.91893080101996e-06, "loss": 0.0032, "step": 4260 }, { "epoch": 1.1242580134546893, "grad_norm": 0.7321994304656982, "learning_rate": 5.918579090829157e-06, "loss": 0.0058, "step": 4262 }, { "epoch": 1.1247856483313547, "grad_norm": 0.12660516798496246, "learning_rate": 5.918227380638355e-06, "loss": 0.0123, "step": 4264 }, { "epoch": 1.12531328320802, "grad_norm": 0.029522718861699104, "learning_rate": 5.917875670447551e-06, "loss": 0.0027, "step": 4266 }, { "epoch": 1.1258409180846853, "grad_norm": 0.038140155375003815, "learning_rate": 5.917523960256749e-06, "loss": 0.0027, "step": 4268 }, { "epoch": 1.1263685529613507, "grad_norm": 0.49625295400619507, "learning_rate": 5.917172250065946e-06, "loss": 0.0122, "step": 4270 }, { "epoch": 1.1268961878380162, "grad_norm": 0.6637237071990967, "learning_rate": 5.9168205398751425e-06, "loss": 0.0042, "step": 4272 }, { "epoch": 1.1274238227146816, "grad_norm": 0.4365116357803345, "learning_rate": 5.91646882968434e-06, "loss": 0.022, "step": 4274 }, { "epoch": 1.1279514575913467, "grad_norm": 0.47940337657928467, "learning_rate": 5.916117119493537e-06, "loss": 0.0047, "step": 4276 }, { "epoch": 1.1284790924680121, "grad_norm": 0.21069253981113434, "learning_rate": 5.915765409302735e-06, "loss": 0.0032, "step": 4278 }, { "epoch": 1.1290067273446776, "grad_norm": 0.1916239708662033, "learning_rate": 5.915413699111932e-06, "loss": 0.0055, "step": 4280 }, { "epoch": 1.1295343622213427, "grad_norm": 0.0299335066229105, "learning_rate": 5.915061988921129e-06, "loss": 0.0029, "step": 4282 }, { "epoch": 1.1300619970980081, "grad_norm": 0.6847232580184937, "learning_rate": 5.914710278730327e-06, "loss": 0.0142, "step": 4284 }, { "epoch": 1.1305896319746735, "grad_norm": 0.04908822476863861, "learning_rate": 5.914358568539524e-06, "loss": 0.0036, "step": 4286 }, { "epoch": 1.131117266851339, "grad_norm": 0.04385972023010254, "learning_rate": 5.914006858348721e-06, "loss": 0.0043, "step": 4288 }, { "epoch": 1.1316449017280041, "grad_norm": 0.062346477061510086, "learning_rate": 5.913655148157918e-06, "loss": 0.0063, "step": 4290 }, { "epoch": 1.1321725366046695, "grad_norm": 0.08509960770606995, "learning_rate": 5.913303437967115e-06, "loss": 0.0096, "step": 4292 }, { "epoch": 1.132700171481335, "grad_norm": 0.05004263296723366, "learning_rate": 5.912951727776312e-06, "loss": 0.0034, "step": 4294 }, { "epoch": 1.1332278063580004, "grad_norm": 0.20187202095985413, "learning_rate": 5.912600017585509e-06, "loss": 0.0032, "step": 4296 }, { "epoch": 1.1337554412346655, "grad_norm": 0.10154867172241211, "learning_rate": 5.912248307394707e-06, "loss": 0.0115, "step": 4298 }, { "epoch": 1.134283076111331, "grad_norm": 0.5187499523162842, "learning_rate": 5.911896597203904e-06, "loss": 0.0057, "step": 4300 }, { "epoch": 1.1348107109879964, "grad_norm": 0.037969328463077545, "learning_rate": 5.911544887013101e-06, "loss": 0.0061, "step": 4302 }, { "epoch": 1.1353383458646618, "grad_norm": 0.189756840467453, "learning_rate": 5.911193176822299e-06, "loss": 0.0031, "step": 4304 }, { "epoch": 1.135865980741327, "grad_norm": 0.06906686723232269, "learning_rate": 5.910841466631496e-06, "loss": 0.0054, "step": 4306 }, { "epoch": 1.1363936156179923, "grad_norm": 3.2587168216705322, "learning_rate": 5.9104897564406936e-06, "loss": 0.0144, "step": 4308 }, { "epoch": 1.1369212504946578, "grad_norm": 0.22053393721580505, "learning_rate": 5.91013804624989e-06, "loss": 0.0158, "step": 4310 }, { "epoch": 1.137448885371323, "grad_norm": 0.1708652675151825, "learning_rate": 5.9097863360590875e-06, "loss": 0.0068, "step": 4312 }, { "epoch": 1.1379765202479883, "grad_norm": 0.02521701343357563, "learning_rate": 5.909434625868284e-06, "loss": 0.0033, "step": 4314 }, { "epoch": 1.1385041551246537, "grad_norm": 1.2252119779586792, "learning_rate": 5.909082915677482e-06, "loss": 0.0058, "step": 4316 }, { "epoch": 1.1390317900013192, "grad_norm": 0.052718061953783035, "learning_rate": 5.908731205486679e-06, "loss": 0.0027, "step": 4318 }, { "epoch": 1.1395594248779843, "grad_norm": 0.03924494981765747, "learning_rate": 5.908379495295876e-06, "loss": 0.006, "step": 4320 }, { "epoch": 1.1400870597546497, "grad_norm": 0.041565440595149994, "learning_rate": 5.908027785105074e-06, "loss": 0.0028, "step": 4322 }, { "epoch": 1.1406146946313152, "grad_norm": 2.694974899291992, "learning_rate": 5.907676074914271e-06, "loss": 0.0044, "step": 4324 }, { "epoch": 1.1411423295079806, "grad_norm": 0.2952689826488495, "learning_rate": 5.907324364723468e-06, "loss": 0.0087, "step": 4326 }, { "epoch": 1.1416699643846457, "grad_norm": 0.3178585171699524, "learning_rate": 5.906972654532666e-06, "loss": 0.0085, "step": 4328 }, { "epoch": 1.1421975992613111, "grad_norm": 0.18290607631206512, "learning_rate": 5.9066209443418626e-06, "loss": 0.0058, "step": 4330 }, { "epoch": 1.1427252341379766, "grad_norm": 0.20787282288074493, "learning_rate": 5.9062692341510595e-06, "loss": 0.0039, "step": 4332 }, { "epoch": 1.143252869014642, "grad_norm": 0.4268997013568878, "learning_rate": 5.9059175239602565e-06, "loss": 0.0039, "step": 4334 }, { "epoch": 1.1437805038913071, "grad_norm": 0.3980439007282257, "learning_rate": 5.905565813769454e-06, "loss": 0.006, "step": 4336 }, { "epoch": 1.1443081387679725, "grad_norm": 0.06625896692276001, "learning_rate": 5.905214103578651e-06, "loss": 0.0025, "step": 4338 }, { "epoch": 1.144835773644638, "grad_norm": 0.06241413950920105, "learning_rate": 5.904862393387848e-06, "loss": 0.0029, "step": 4340 }, { "epoch": 1.1453634085213031, "grad_norm": 0.18385522067546844, "learning_rate": 5.904510683197046e-06, "loss": 0.0028, "step": 4342 }, { "epoch": 1.1458910433979685, "grad_norm": 0.7228710055351257, "learning_rate": 5.904158973006243e-06, "loss": 0.009, "step": 4344 }, { "epoch": 1.146418678274634, "grad_norm": 0.4794173240661621, "learning_rate": 5.903807262815441e-06, "loss": 0.0079, "step": 4346 }, { "epoch": 1.1469463131512994, "grad_norm": 0.19380231201648712, "learning_rate": 5.903455552624637e-06, "loss": 0.003, "step": 4348 }, { "epoch": 1.1474739480279648, "grad_norm": 0.02956997975707054, "learning_rate": 5.903103842433835e-06, "loss": 0.0025, "step": 4350 }, { "epoch": 1.14800158290463, "grad_norm": 0.21197070181369781, "learning_rate": 5.902752132243032e-06, "loss": 0.0075, "step": 4352 }, { "epoch": 1.1485292177812954, "grad_norm": 0.037441764026880264, "learning_rate": 5.902400422052229e-06, "loss": 0.0025, "step": 4354 }, { "epoch": 1.1490568526579608, "grad_norm": 0.3654518723487854, "learning_rate": 5.902048711861426e-06, "loss": 0.0094, "step": 4356 }, { "epoch": 1.149584487534626, "grad_norm": 0.02732408232986927, "learning_rate": 5.901697001670623e-06, "loss": 0.0026, "step": 4358 }, { "epoch": 1.1501121224112913, "grad_norm": 0.9568282961845398, "learning_rate": 5.901345291479821e-06, "loss": 0.009, "step": 4360 }, { "epoch": 1.1506397572879568, "grad_norm": 2.3799822330474854, "learning_rate": 5.900993581289018e-06, "loss": 0.0088, "step": 4362 }, { "epoch": 1.1511673921646222, "grad_norm": 0.1301020085811615, "learning_rate": 5.900641871098215e-06, "loss": 0.0032, "step": 4364 }, { "epoch": 1.1516950270412873, "grad_norm": 0.027596039697527885, "learning_rate": 5.900290160907413e-06, "loss": 0.0028, "step": 4366 }, { "epoch": 1.1522226619179528, "grad_norm": 0.31992679834365845, "learning_rate": 5.89993845071661e-06, "loss": 0.0041, "step": 4368 }, { "epoch": 1.1527502967946182, "grad_norm": 0.10331673175096512, "learning_rate": 5.899586740525807e-06, "loss": 0.0102, "step": 4370 }, { "epoch": 1.1532779316712836, "grad_norm": 0.29702886939048767, "learning_rate": 5.899235030335004e-06, "loss": 0.0048, "step": 4372 }, { "epoch": 1.1538055665479487, "grad_norm": 0.14972200989723206, "learning_rate": 5.8988833201442015e-06, "loss": 0.006, "step": 4374 }, { "epoch": 1.1543332014246142, "grad_norm": 0.040110357105731964, "learning_rate": 5.898531609953398e-06, "loss": 0.0174, "step": 4376 }, { "epoch": 1.1548608363012796, "grad_norm": 0.1181645542383194, "learning_rate": 5.898179899762595e-06, "loss": 0.0039, "step": 4378 }, { "epoch": 1.155388471177945, "grad_norm": 0.2927685081958771, "learning_rate": 5.897828189571793e-06, "loss": 0.0039, "step": 4380 }, { "epoch": 1.1559161060546101, "grad_norm": 0.09561280906200409, "learning_rate": 5.89747647938099e-06, "loss": 0.0026, "step": 4382 }, { "epoch": 1.1564437409312756, "grad_norm": 0.07092442363500595, "learning_rate": 5.897124769190188e-06, "loss": 0.0153, "step": 4384 }, { "epoch": 1.156971375807941, "grad_norm": 0.060671642422676086, "learning_rate": 5.896773058999385e-06, "loss": 0.0109, "step": 4386 }, { "epoch": 1.1574990106846061, "grad_norm": 0.04316437989473343, "learning_rate": 5.896421348808582e-06, "loss": 0.0043, "step": 4388 }, { "epoch": 1.1580266455612716, "grad_norm": 0.5323244333267212, "learning_rate": 5.89606963861778e-06, "loss": 0.0122, "step": 4390 }, { "epoch": 1.158554280437937, "grad_norm": 0.06341452151536942, "learning_rate": 5.895717928426976e-06, "loss": 0.0152, "step": 4392 }, { "epoch": 1.1590819153146024, "grad_norm": 0.4186045825481415, "learning_rate": 5.8953662182361735e-06, "loss": 0.0075, "step": 4394 }, { "epoch": 1.1596095501912675, "grad_norm": 0.32741600275039673, "learning_rate": 5.8950145080453705e-06, "loss": 0.0047, "step": 4396 }, { "epoch": 1.160137185067933, "grad_norm": 0.1989438384771347, "learning_rate": 5.894662797854568e-06, "loss": 0.0059, "step": 4398 }, { "epoch": 1.1606648199445984, "grad_norm": 0.782043993473053, "learning_rate": 5.894311087663765e-06, "loss": 0.0046, "step": 4400 }, { "epoch": 1.1611924548212638, "grad_norm": 0.10984619706869125, "learning_rate": 5.893959377472962e-06, "loss": 0.003, "step": 4402 }, { "epoch": 1.161720089697929, "grad_norm": 1.0754060745239258, "learning_rate": 5.89360766728216e-06, "loss": 0.0177, "step": 4404 }, { "epoch": 1.1622477245745944, "grad_norm": 0.11019442230463028, "learning_rate": 5.893255957091357e-06, "loss": 0.0085, "step": 4406 }, { "epoch": 1.1627753594512598, "grad_norm": 0.019420532509684563, "learning_rate": 5.892904246900554e-06, "loss": 0.0023, "step": 4408 }, { "epoch": 1.1633029943279252, "grad_norm": 0.3407262861728668, "learning_rate": 5.892552536709751e-06, "loss": 0.0064, "step": 4410 }, { "epoch": 1.1638306292045904, "grad_norm": 0.0384417325258255, "learning_rate": 5.892200826518949e-06, "loss": 0.0025, "step": 4412 }, { "epoch": 1.1643582640812558, "grad_norm": 0.4530719816684723, "learning_rate": 5.8918491163281464e-06, "loss": 0.0112, "step": 4414 }, { "epoch": 1.1648858989579212, "grad_norm": 0.08130984753370285, "learning_rate": 5.8914974061373425e-06, "loss": 0.0027, "step": 4416 }, { "epoch": 1.1654135338345863, "grad_norm": 0.45922473073005676, "learning_rate": 5.89114569594654e-06, "loss": 0.0064, "step": 4418 }, { "epoch": 1.1659411687112518, "grad_norm": 0.024049291387200356, "learning_rate": 5.890793985755737e-06, "loss": 0.0027, "step": 4420 }, { "epoch": 1.1664688035879172, "grad_norm": 0.04930641129612923, "learning_rate": 5.890442275564935e-06, "loss": 0.0055, "step": 4422 }, { "epoch": 1.1669964384645826, "grad_norm": 0.03961892053484917, "learning_rate": 5.890090565374132e-06, "loss": 0.0024, "step": 4424 }, { "epoch": 1.167524073341248, "grad_norm": 0.2330285608768463, "learning_rate": 5.889738855183329e-06, "loss": 0.018, "step": 4426 }, { "epoch": 1.1680517082179132, "grad_norm": 0.12506891787052155, "learning_rate": 5.889387144992527e-06, "loss": 0.0026, "step": 4428 }, { "epoch": 1.1685793430945786, "grad_norm": 1.6182596683502197, "learning_rate": 5.889035434801723e-06, "loss": 0.004, "step": 4430 }, { "epoch": 1.169106977971244, "grad_norm": 0.05843423679471016, "learning_rate": 5.888683724610921e-06, "loss": 0.0025, "step": 4432 }, { "epoch": 1.1696346128479091, "grad_norm": 0.07749781012535095, "learning_rate": 5.888332014420118e-06, "loss": 0.0059, "step": 4434 }, { "epoch": 1.1701622477245746, "grad_norm": 0.20270350575447083, "learning_rate": 5.8879803042293154e-06, "loss": 0.0098, "step": 4436 }, { "epoch": 1.17068988260124, "grad_norm": 1.0933868885040283, "learning_rate": 5.887628594038512e-06, "loss": 0.0124, "step": 4438 }, { "epoch": 1.1712175174779054, "grad_norm": 0.8132510781288147, "learning_rate": 5.887276883847709e-06, "loss": 0.0123, "step": 4440 }, { "epoch": 1.1717451523545706, "grad_norm": 0.03335731104016304, "learning_rate": 5.886925173656907e-06, "loss": 0.0026, "step": 4442 }, { "epoch": 1.172272787231236, "grad_norm": 0.25778353214263916, "learning_rate": 5.886573463466104e-06, "loss": 0.0195, "step": 4444 }, { "epoch": 1.1728004221079014, "grad_norm": 0.8188944458961487, "learning_rate": 5.886221753275301e-06, "loss": 0.0031, "step": 4446 }, { "epoch": 1.1733280569845668, "grad_norm": 0.4364459216594696, "learning_rate": 5.885870043084499e-06, "loss": 0.0084, "step": 4448 }, { "epoch": 1.173855691861232, "grad_norm": 0.3131161034107208, "learning_rate": 5.885518332893696e-06, "loss": 0.0034, "step": 4450 }, { "epoch": 1.1743833267378974, "grad_norm": 1.1283894777297974, "learning_rate": 5.885166622702894e-06, "loss": 0.0231, "step": 4452 }, { "epoch": 1.1749109616145628, "grad_norm": 0.5507065653800964, "learning_rate": 5.88481491251209e-06, "loss": 0.0037, "step": 4454 }, { "epoch": 1.1754385964912282, "grad_norm": 0.6374000906944275, "learning_rate": 5.8844632023212875e-06, "loss": 0.0069, "step": 4456 }, { "epoch": 1.1759662313678934, "grad_norm": 0.04716873914003372, "learning_rate": 5.8841114921304845e-06, "loss": 0.0044, "step": 4458 }, { "epoch": 1.1764938662445588, "grad_norm": 0.507703423500061, "learning_rate": 5.883759781939681e-06, "loss": 0.0049, "step": 4460 }, { "epoch": 1.1770215011212242, "grad_norm": 0.03835958614945412, "learning_rate": 5.883408071748879e-06, "loss": 0.0033, "step": 4462 }, { "epoch": 1.1775491359978894, "grad_norm": 0.35002899169921875, "learning_rate": 5.883056361558076e-06, "loss": 0.0383, "step": 4464 }, { "epoch": 1.1780767708745548, "grad_norm": 0.12768769264221191, "learning_rate": 5.882704651367274e-06, "loss": 0.0051, "step": 4466 }, { "epoch": 1.1786044057512202, "grad_norm": 0.5208406448364258, "learning_rate": 5.88235294117647e-06, "loss": 0.0042, "step": 4468 }, { "epoch": 1.1791320406278856, "grad_norm": 0.26094502210617065, "learning_rate": 5.882001230985668e-06, "loss": 0.0064, "step": 4470 }, { "epoch": 1.179659675504551, "grad_norm": 0.2717078924179077, "learning_rate": 5.881649520794866e-06, "loss": 0.006, "step": 4472 }, { "epoch": 1.1801873103812162, "grad_norm": 0.34043845534324646, "learning_rate": 5.881297810604063e-06, "loss": 0.0217, "step": 4474 }, { "epoch": 1.1807149452578816, "grad_norm": 0.5623799562454224, "learning_rate": 5.8809461004132596e-06, "loss": 0.0227, "step": 4476 }, { "epoch": 1.181242580134547, "grad_norm": 0.05072758346796036, "learning_rate": 5.8805943902224565e-06, "loss": 0.0027, "step": 4478 }, { "epoch": 1.1817702150112122, "grad_norm": 0.32253241539001465, "learning_rate": 5.880242680031654e-06, "loss": 0.007, "step": 4480 }, { "epoch": 1.1822978498878776, "grad_norm": 0.31502681970596313, "learning_rate": 5.879890969840851e-06, "loss": 0.0041, "step": 4482 }, { "epoch": 1.182825484764543, "grad_norm": 0.03599608317017555, "learning_rate": 5.879539259650048e-06, "loss": 0.0048, "step": 4484 }, { "epoch": 1.1833531196412084, "grad_norm": 0.026085637509822845, "learning_rate": 5.879187549459246e-06, "loss": 0.0025, "step": 4486 }, { "epoch": 1.1838807545178736, "grad_norm": 0.02119782753288746, "learning_rate": 5.878835839268443e-06, "loss": 0.0027, "step": 4488 }, { "epoch": 1.184408389394539, "grad_norm": 0.1676606833934784, "learning_rate": 5.87848412907764e-06, "loss": 0.0034, "step": 4490 }, { "epoch": 1.1849360242712044, "grad_norm": 0.25987574458122253, "learning_rate": 5.878132418886837e-06, "loss": 0.0031, "step": 4492 }, { "epoch": 1.1854636591478696, "grad_norm": 0.025197669863700867, "learning_rate": 5.877780708696035e-06, "loss": 0.0026, "step": 4494 }, { "epoch": 1.185991294024535, "grad_norm": 0.023235486820340157, "learning_rate": 5.8774289985052325e-06, "loss": 0.0025, "step": 4496 }, { "epoch": 1.1865189289012004, "grad_norm": 0.868391752243042, "learning_rate": 5.877077288314429e-06, "loss": 0.0134, "step": 4498 }, { "epoch": 1.1870465637778658, "grad_norm": 0.35505393147468567, "learning_rate": 5.876725578123626e-06, "loss": 0.0121, "step": 4500 }, { "epoch": 1.1875741986545312, "grad_norm": 0.04475186765193939, "learning_rate": 5.876373867932823e-06, "loss": 0.0024, "step": 4502 }, { "epoch": 1.1881018335311964, "grad_norm": 0.1258855015039444, "learning_rate": 5.876022157742021e-06, "loss": 0.0061, "step": 4504 }, { "epoch": 1.1886294684078618, "grad_norm": 0.1312287300825119, "learning_rate": 5.875670447551218e-06, "loss": 0.0027, "step": 4506 }, { "epoch": 1.1891571032845272, "grad_norm": 0.144680917263031, "learning_rate": 5.875318737360415e-06, "loss": 0.0111, "step": 4508 }, { "epoch": 1.1896847381611924, "grad_norm": 0.12998482584953308, "learning_rate": 5.874967027169613e-06, "loss": 0.0036, "step": 4510 }, { "epoch": 1.1902123730378578, "grad_norm": 0.1682034283876419, "learning_rate": 5.87461531697881e-06, "loss": 0.0091, "step": 4512 }, { "epoch": 1.1907400079145232, "grad_norm": 0.13025681674480438, "learning_rate": 5.874263606788007e-06, "loss": 0.009, "step": 4514 }, { "epoch": 1.1912676427911886, "grad_norm": 0.05289244279265404, "learning_rate": 5.873911896597204e-06, "loss": 0.0057, "step": 4516 }, { "epoch": 1.1917952776678538, "grad_norm": 0.179793581366539, "learning_rate": 5.8735601864064015e-06, "loss": 0.0093, "step": 4518 }, { "epoch": 1.1923229125445192, "grad_norm": 0.03451903164386749, "learning_rate": 5.8732084762155985e-06, "loss": 0.0121, "step": 4520 }, { "epoch": 1.1928505474211846, "grad_norm": 1.0027045011520386, "learning_rate": 5.872856766024795e-06, "loss": 0.0249, "step": 4522 }, { "epoch": 1.19337818229785, "grad_norm": 0.06888268887996674, "learning_rate": 5.872505055833993e-06, "loss": 0.0058, "step": 4524 }, { "epoch": 1.1939058171745152, "grad_norm": 0.8438951373100281, "learning_rate": 5.87215334564319e-06, "loss": 0.0186, "step": 4526 }, { "epoch": 1.1944334520511806, "grad_norm": 0.4082862138748169, "learning_rate": 5.871801635452387e-06, "loss": 0.0032, "step": 4528 }, { "epoch": 1.194961086927846, "grad_norm": 0.5507084727287292, "learning_rate": 5.871449925261585e-06, "loss": 0.0036, "step": 4530 }, { "epoch": 1.1954887218045114, "grad_norm": 0.3009345233440399, "learning_rate": 5.871098215070782e-06, "loss": 0.0103, "step": 4532 }, { "epoch": 1.1960163566811766, "grad_norm": 0.06400346010923386, "learning_rate": 5.87074650487998e-06, "loss": 0.0038, "step": 4534 }, { "epoch": 1.196543991557842, "grad_norm": 0.21381063759326935, "learning_rate": 5.870394794689176e-06, "loss": 0.0064, "step": 4536 }, { "epoch": 1.1970716264345074, "grad_norm": 0.0794493779540062, "learning_rate": 5.8700430844983736e-06, "loss": 0.0027, "step": 4538 }, { "epoch": 1.1975992613111726, "grad_norm": 0.14855997264385223, "learning_rate": 5.8696913743075705e-06, "loss": 0.0029, "step": 4540 }, { "epoch": 1.198126896187838, "grad_norm": 1.0123133659362793, "learning_rate": 5.869339664116768e-06, "loss": 0.0062, "step": 4542 }, { "epoch": 1.1986545310645034, "grad_norm": 0.13674351572990417, "learning_rate": 5.868987953925965e-06, "loss": 0.0028, "step": 4544 }, { "epoch": 1.1991821659411688, "grad_norm": 0.18914544582366943, "learning_rate": 5.868636243735162e-06, "loss": 0.0031, "step": 4546 }, { "epoch": 1.1997098008178342, "grad_norm": 0.4595835208892822, "learning_rate": 5.86828453354436e-06, "loss": 0.0139, "step": 4548 }, { "epoch": 1.2002374356944994, "grad_norm": 0.7520080804824829, "learning_rate": 5.867932823353556e-06, "loss": 0.0068, "step": 4550 }, { "epoch": 1.2007650705711648, "grad_norm": 0.023256629705429077, "learning_rate": 5.867581113162754e-06, "loss": 0.0074, "step": 4552 }, { "epoch": 1.2012927054478302, "grad_norm": 0.3615191876888275, "learning_rate": 5.867229402971952e-06, "loss": 0.0044, "step": 4554 }, { "epoch": 1.2018203403244954, "grad_norm": 0.15092436969280243, "learning_rate": 5.866877692781149e-06, "loss": 0.0028, "step": 4556 }, { "epoch": 1.2023479752011608, "grad_norm": 0.3358282744884491, "learning_rate": 5.866525982590346e-06, "loss": 0.016, "step": 4558 }, { "epoch": 1.2028756100778262, "grad_norm": 0.4475660026073456, "learning_rate": 5.866174272399543e-06, "loss": 0.0127, "step": 4560 }, { "epoch": 1.2034032449544916, "grad_norm": 0.17321088910102844, "learning_rate": 5.86582256220874e-06, "loss": 0.0028, "step": 4562 }, { "epoch": 1.2039308798311568, "grad_norm": 0.02138424478471279, "learning_rate": 5.865470852017937e-06, "loss": 0.0026, "step": 4564 }, { "epoch": 1.2044585147078222, "grad_norm": 0.4083772599697113, "learning_rate": 5.865119141827134e-06, "loss": 0.0129, "step": 4566 }, { "epoch": 1.2049861495844876, "grad_norm": 0.466214656829834, "learning_rate": 5.864767431636332e-06, "loss": 0.0034, "step": 4568 }, { "epoch": 1.2055137844611528, "grad_norm": 0.35625791549682617, "learning_rate": 5.864415721445529e-06, "loss": 0.0028, "step": 4570 }, { "epoch": 1.2060414193378182, "grad_norm": 0.10473976284265518, "learning_rate": 5.864064011254727e-06, "loss": 0.0027, "step": 4572 }, { "epoch": 1.2065690542144836, "grad_norm": 0.029834313318133354, "learning_rate": 5.863712301063923e-06, "loss": 0.0027, "step": 4574 }, { "epoch": 1.207096689091149, "grad_norm": 0.10131552815437317, "learning_rate": 5.863360590873121e-06, "loss": 0.0031, "step": 4576 }, { "epoch": 1.2076243239678144, "grad_norm": 0.1650562882423401, "learning_rate": 5.863008880682318e-06, "loss": 0.0131, "step": 4578 }, { "epoch": 1.2081519588444796, "grad_norm": 0.21972431242465973, "learning_rate": 5.862657170491515e-06, "loss": 0.0059, "step": 4580 }, { "epoch": 1.208679593721145, "grad_norm": 0.08296562731266022, "learning_rate": 5.8623054603007125e-06, "loss": 0.0087, "step": 4582 }, { "epoch": 1.2092072285978104, "grad_norm": 0.13198339939117432, "learning_rate": 5.861953750109909e-06, "loss": 0.0025, "step": 4584 }, { "epoch": 1.2097348634744756, "grad_norm": 0.07933294028043747, "learning_rate": 5.861602039919107e-06, "loss": 0.0025, "step": 4586 }, { "epoch": 1.210262498351141, "grad_norm": 0.019185107201337814, "learning_rate": 5.861250329728304e-06, "loss": 0.0023, "step": 4588 }, { "epoch": 1.2107901332278064, "grad_norm": 0.23289300501346588, "learning_rate": 5.860898619537501e-06, "loss": 0.0134, "step": 4590 }, { "epoch": 1.2113177681044718, "grad_norm": 0.326416939496994, "learning_rate": 5.860546909346699e-06, "loss": 0.0084, "step": 4592 }, { "epoch": 1.211845402981137, "grad_norm": 0.22126950323581696, "learning_rate": 5.860195199155896e-06, "loss": 0.0045, "step": 4594 }, { "epoch": 1.2123730378578024, "grad_norm": 0.7691903114318848, "learning_rate": 5.859843488965093e-06, "loss": 0.0101, "step": 4596 }, { "epoch": 1.2129006727344678, "grad_norm": 0.975631058216095, "learning_rate": 5.85949177877429e-06, "loss": 0.0045, "step": 4598 }, { "epoch": 1.2134283076111332, "grad_norm": 0.4381425082683563, "learning_rate": 5.8591400685834876e-06, "loss": 0.0287, "step": 4600 }, { "epoch": 1.2139559424877984, "grad_norm": 0.3083375096321106, "learning_rate": 5.8587883583926845e-06, "loss": 0.0125, "step": 4602 }, { "epoch": 1.2144835773644638, "grad_norm": 0.21918804943561554, "learning_rate": 5.8584366482018815e-06, "loss": 0.0028, "step": 4604 }, { "epoch": 1.2150112122411292, "grad_norm": 0.21342870593070984, "learning_rate": 5.858084938011079e-06, "loss": 0.0079, "step": 4606 }, { "epoch": 1.2155388471177946, "grad_norm": 0.3692604899406433, "learning_rate": 5.857733227820276e-06, "loss": 0.0068, "step": 4608 }, { "epoch": 1.2160664819944598, "grad_norm": 0.4964478015899658, "learning_rate": 5.857381517629473e-06, "loss": 0.0174, "step": 4610 }, { "epoch": 1.2165941168711252, "grad_norm": 0.17735335230827332, "learning_rate": 5.857029807438671e-06, "loss": 0.0056, "step": 4612 }, { "epoch": 1.2171217517477906, "grad_norm": 0.09393025934696198, "learning_rate": 5.856678097247868e-06, "loss": 0.0026, "step": 4614 }, { "epoch": 1.2176493866244558, "grad_norm": 0.17577487230300903, "learning_rate": 5.856326387057066e-06, "loss": 0.0128, "step": 4616 }, { "epoch": 1.2181770215011212, "grad_norm": 0.23644688725471497, "learning_rate": 5.855974676866262e-06, "loss": 0.005, "step": 4618 }, { "epoch": 1.2187046563777866, "grad_norm": 0.14734528958797455, "learning_rate": 5.85562296667546e-06, "loss": 0.0029, "step": 4620 }, { "epoch": 1.219232291254452, "grad_norm": 0.22386854887008667, "learning_rate": 5.855271256484657e-06, "loss": 0.0029, "step": 4622 }, { "epoch": 1.2197599261311174, "grad_norm": 0.6522747874259949, "learning_rate": 5.854919546293854e-06, "loss": 0.0252, "step": 4624 }, { "epoch": 1.2202875610077826, "grad_norm": 0.048744648694992065, "learning_rate": 5.854567836103051e-06, "loss": 0.0031, "step": 4626 }, { "epoch": 1.220815195884448, "grad_norm": 0.05996677279472351, "learning_rate": 5.854216125912248e-06, "loss": 0.0026, "step": 4628 }, { "epoch": 1.2213428307611134, "grad_norm": 0.645183265209198, "learning_rate": 5.853864415721446e-06, "loss": 0.0141, "step": 4630 }, { "epoch": 1.2218704656377786, "grad_norm": 0.39548006653785706, "learning_rate": 5.853512705530643e-06, "loss": 0.007, "step": 4632 }, { "epoch": 1.222398100514444, "grad_norm": 0.19050702452659607, "learning_rate": 5.85316099533984e-06, "loss": 0.0106, "step": 4634 }, { "epoch": 1.2229257353911094, "grad_norm": 0.4727887511253357, "learning_rate": 5.852809285149037e-06, "loss": 0.0088, "step": 4636 }, { "epoch": 1.2234533702677748, "grad_norm": 0.15285733342170715, "learning_rate": 5.852457574958235e-06, "loss": 0.0027, "step": 4638 }, { "epoch": 1.22398100514444, "grad_norm": 0.025991715490818024, "learning_rate": 5.852105864767432e-06, "loss": 0.0056, "step": 4640 }, { "epoch": 1.2245086400211054, "grad_norm": 0.08074310421943665, "learning_rate": 5.851754154576629e-06, "loss": 0.0043, "step": 4642 }, { "epoch": 1.2250362748977708, "grad_norm": 0.29991257190704346, "learning_rate": 5.8514024443858264e-06, "loss": 0.0122, "step": 4644 }, { "epoch": 1.225563909774436, "grad_norm": 0.04378051683306694, "learning_rate": 5.851050734195023e-06, "loss": 0.0025, "step": 4646 }, { "epoch": 1.2260915446511014, "grad_norm": 0.26875412464141846, "learning_rate": 5.85069902400422e-06, "loss": 0.0045, "step": 4648 }, { "epoch": 1.2266191795277668, "grad_norm": 0.029535502195358276, "learning_rate": 5.850347313813418e-06, "loss": 0.003, "step": 4650 }, { "epoch": 1.2271468144044322, "grad_norm": 0.17299498617649078, "learning_rate": 5.849995603622615e-06, "loss": 0.0035, "step": 4652 }, { "epoch": 1.2276744492810976, "grad_norm": 0.1073143407702446, "learning_rate": 5.849643893431813e-06, "loss": 0.0026, "step": 4654 }, { "epoch": 1.2282020841577628, "grad_norm": 0.04484666511416435, "learning_rate": 5.849292183241009e-06, "loss": 0.0023, "step": 4656 }, { "epoch": 1.2287297190344282, "grad_norm": 0.41119375824928284, "learning_rate": 5.848940473050207e-06, "loss": 0.0067, "step": 4658 }, { "epoch": 1.2292573539110936, "grad_norm": 1.1833858489990234, "learning_rate": 5.848588762859404e-06, "loss": 0.0116, "step": 4660 }, { "epoch": 1.2297849887877588, "grad_norm": 0.17494656145572662, "learning_rate": 5.8482370526686016e-06, "loss": 0.0027, "step": 4662 }, { "epoch": 1.2303126236644242, "grad_norm": 0.9596167802810669, "learning_rate": 5.8478853424777985e-06, "loss": 0.0025, "step": 4664 }, { "epoch": 1.2308402585410896, "grad_norm": 0.3326127529144287, "learning_rate": 5.8475336322869955e-06, "loss": 0.0034, "step": 4666 }, { "epoch": 1.231367893417755, "grad_norm": 0.1928786337375641, "learning_rate": 5.847181922096193e-06, "loss": 0.0113, "step": 4668 }, { "epoch": 1.2318955282944202, "grad_norm": 0.1246623545885086, "learning_rate": 5.84683021190539e-06, "loss": 0.0028, "step": 4670 }, { "epoch": 1.2324231631710856, "grad_norm": 0.35938259959220886, "learning_rate": 5.846478501714587e-06, "loss": 0.0031, "step": 4672 }, { "epoch": 1.232950798047751, "grad_norm": 0.2981209456920624, "learning_rate": 5.846126791523785e-06, "loss": 0.0044, "step": 4674 }, { "epoch": 1.2334784329244164, "grad_norm": 0.09425009042024612, "learning_rate": 5.845775081332982e-06, "loss": 0.0111, "step": 4676 }, { "epoch": 1.2340060678010816, "grad_norm": 0.3038043677806854, "learning_rate": 5.845423371142179e-06, "loss": 0.0089, "step": 4678 }, { "epoch": 1.234533702677747, "grad_norm": 0.0227659922093153, "learning_rate": 5.845071660951376e-06, "loss": 0.0021, "step": 4680 }, { "epoch": 1.2350613375544124, "grad_norm": 0.1076723039150238, "learning_rate": 5.844719950760574e-06, "loss": 0.0078, "step": 4682 }, { "epoch": 1.2355889724310778, "grad_norm": 0.022531261667609215, "learning_rate": 5.8443682405697706e-06, "loss": 0.0024, "step": 4684 }, { "epoch": 1.236116607307743, "grad_norm": 0.6716088652610779, "learning_rate": 5.8440165303789675e-06, "loss": 0.0035, "step": 4686 }, { "epoch": 1.2366442421844084, "grad_norm": 0.02179471030831337, "learning_rate": 5.843664820188165e-06, "loss": 0.0021, "step": 4688 }, { "epoch": 1.2371718770610738, "grad_norm": 0.03525307774543762, "learning_rate": 5.843313109997362e-06, "loss": 0.0045, "step": 4690 }, { "epoch": 1.237699511937739, "grad_norm": 0.019599368795752525, "learning_rate": 5.84296139980656e-06, "loss": 0.0105, "step": 4692 }, { "epoch": 1.2382271468144044, "grad_norm": 0.37321704626083374, "learning_rate": 5.842609689615756e-06, "loss": 0.0039, "step": 4694 }, { "epoch": 1.2387547816910698, "grad_norm": 0.04308081418275833, "learning_rate": 5.842257979424954e-06, "loss": 0.0022, "step": 4696 }, { "epoch": 1.2392824165677352, "grad_norm": 1.5804978609085083, "learning_rate": 5.841906269234152e-06, "loss": 0.0082, "step": 4698 }, { "epoch": 1.2398100514444006, "grad_norm": 0.22819505631923676, "learning_rate": 5.841554559043348e-06, "loss": 0.0137, "step": 4700 }, { "epoch": 1.2403376863210658, "grad_norm": 0.6110243797302246, "learning_rate": 5.841202848852546e-06, "loss": 0.0381, "step": 4702 }, { "epoch": 1.2408653211977312, "grad_norm": 0.10608334094285965, "learning_rate": 5.840851138661743e-06, "loss": 0.0125, "step": 4704 }, { "epoch": 1.2413929560743966, "grad_norm": 0.13754400610923767, "learning_rate": 5.8404994284709404e-06, "loss": 0.0034, "step": 4706 }, { "epoch": 1.2419205909510618, "grad_norm": 0.14170925319194794, "learning_rate": 5.840147718280137e-06, "loss": 0.0185, "step": 4708 }, { "epoch": 1.2424482258277272, "grad_norm": 0.46673205494880676, "learning_rate": 5.839796008089334e-06, "loss": 0.0043, "step": 4710 }, { "epoch": 1.2429758607043926, "grad_norm": 0.08095873147249222, "learning_rate": 5.839444297898532e-06, "loss": 0.0024, "step": 4712 }, { "epoch": 1.243503495581058, "grad_norm": 0.03892667964100838, "learning_rate": 5.839092587707729e-06, "loss": 0.0033, "step": 4714 }, { "epoch": 1.2440311304577232, "grad_norm": 0.03116220235824585, "learning_rate": 5.838740877516926e-06, "loss": 0.0021, "step": 4716 }, { "epoch": 1.2445587653343886, "grad_norm": 0.22685210406780243, "learning_rate": 5.838389167326123e-06, "loss": 0.0026, "step": 4718 }, { "epoch": 1.245086400211054, "grad_norm": 0.5218978524208069, "learning_rate": 5.838037457135321e-06, "loss": 0.0082, "step": 4720 }, { "epoch": 1.2456140350877192, "grad_norm": 0.03547034040093422, "learning_rate": 5.837685746944519e-06, "loss": 0.0031, "step": 4722 }, { "epoch": 1.2461416699643846, "grad_norm": 0.28805407881736755, "learning_rate": 5.837334036753715e-06, "loss": 0.0028, "step": 4724 }, { "epoch": 1.24666930484105, "grad_norm": 0.029723823070526123, "learning_rate": 5.8369823265629125e-06, "loss": 0.0021, "step": 4726 }, { "epoch": 1.2471969397177154, "grad_norm": 0.03645463287830353, "learning_rate": 5.8366306163721095e-06, "loss": 0.0095, "step": 4728 }, { "epoch": 1.2477245745943808, "grad_norm": 0.08188829571008682, "learning_rate": 5.836278906181306e-06, "loss": 0.0041, "step": 4730 }, { "epoch": 1.248252209471046, "grad_norm": 0.38773974776268005, "learning_rate": 5.835927195990504e-06, "loss": 0.0052, "step": 4732 }, { "epoch": 1.2487798443477114, "grad_norm": 0.10514359921216965, "learning_rate": 5.835575485799701e-06, "loss": 0.018, "step": 4734 }, { "epoch": 1.2493074792243768, "grad_norm": 0.8601570129394531, "learning_rate": 5.835223775608899e-06, "loss": 0.0119, "step": 4736 }, { "epoch": 1.249835114101042, "grad_norm": 0.08297839760780334, "learning_rate": 5.834872065418095e-06, "loss": 0.0066, "step": 4738 }, { "epoch": 1.2503627489777074, "grad_norm": 0.13315899670124054, "learning_rate": 5.834520355227293e-06, "loss": 0.0038, "step": 4740 }, { "epoch": 1.2508903838543728, "grad_norm": 0.283048152923584, "learning_rate": 5.83416864503649e-06, "loss": 0.0029, "step": 4742 }, { "epoch": 1.2514180187310382, "grad_norm": 0.2513100206851959, "learning_rate": 5.833816934845688e-06, "loss": 0.0066, "step": 4744 }, { "epoch": 1.2519456536077036, "grad_norm": 0.2642924189567566, "learning_rate": 5.8334652246548846e-06, "loss": 0.0074, "step": 4746 }, { "epoch": 1.2524732884843688, "grad_norm": 0.4497087001800537, "learning_rate": 5.8331135144640815e-06, "loss": 0.0065, "step": 4748 }, { "epoch": 1.2530009233610342, "grad_norm": 0.9173867106437683, "learning_rate": 5.832761804273279e-06, "loss": 0.0147, "step": 4750 }, { "epoch": 1.2535285582376994, "grad_norm": 0.038533248007297516, "learning_rate": 5.832410094082476e-06, "loss": 0.0061, "step": 4752 }, { "epoch": 1.2540561931143648, "grad_norm": 0.41098907589912415, "learning_rate": 5.832058383891673e-06, "loss": 0.006, "step": 4754 }, { "epoch": 1.2545838279910302, "grad_norm": 0.04807174578309059, "learning_rate": 5.831706673700871e-06, "loss": 0.0062, "step": 4756 }, { "epoch": 1.2551114628676956, "grad_norm": 0.04619212821125984, "learning_rate": 5.831354963510068e-06, "loss": 0.0032, "step": 4758 }, { "epoch": 1.255639097744361, "grad_norm": 0.07520747929811478, "learning_rate": 5.831003253319265e-06, "loss": 0.0073, "step": 4760 }, { "epoch": 1.2561667326210262, "grad_norm": 0.20885047316551208, "learning_rate": 5.830651543128462e-06, "loss": 0.0082, "step": 4762 }, { "epoch": 1.2566943674976916, "grad_norm": 0.05065050348639488, "learning_rate": 5.83029983293766e-06, "loss": 0.0041, "step": 4764 }, { "epoch": 1.257222002374357, "grad_norm": 0.19616864621639252, "learning_rate": 5.829948122746857e-06, "loss": 0.0099, "step": 4766 }, { "epoch": 1.2577496372510222, "grad_norm": 0.04913657158613205, "learning_rate": 5.829596412556054e-06, "loss": 0.0026, "step": 4768 }, { "epoch": 1.2582772721276876, "grad_norm": 0.41298502683639526, "learning_rate": 5.829244702365251e-06, "loss": 0.0184, "step": 4770 }, { "epoch": 1.258804907004353, "grad_norm": 0.518196165561676, "learning_rate": 5.828892992174448e-06, "loss": 0.0039, "step": 4772 }, { "epoch": 1.2593325418810184, "grad_norm": 0.07850131392478943, "learning_rate": 5.828541281983646e-06, "loss": 0.0027, "step": 4774 }, { "epoch": 1.2598601767576838, "grad_norm": 0.3568188548088074, "learning_rate": 5.828189571792842e-06, "loss": 0.0034, "step": 4776 }, { "epoch": 1.260387811634349, "grad_norm": 0.20063358545303345, "learning_rate": 5.82783786160204e-06, "loss": 0.0037, "step": 4778 }, { "epoch": 1.2609154465110144, "grad_norm": 0.101934053003788, "learning_rate": 5.827486151411238e-06, "loss": 0.0024, "step": 4780 }, { "epoch": 1.2614430813876798, "grad_norm": 0.46200019121170044, "learning_rate": 5.827134441220435e-06, "loss": 0.0063, "step": 4782 }, { "epoch": 1.261970716264345, "grad_norm": 0.30934569239616394, "learning_rate": 5.826782731029632e-06, "loss": 0.0029, "step": 4784 }, { "epoch": 1.2624983511410104, "grad_norm": 0.07100816816091537, "learning_rate": 5.826431020838829e-06, "loss": 0.0029, "step": 4786 }, { "epoch": 1.2630259860176758, "grad_norm": 0.1335499882698059, "learning_rate": 5.8260793106480265e-06, "loss": 0.0209, "step": 4788 }, { "epoch": 1.2635536208943412, "grad_norm": 0.0427490659058094, "learning_rate": 5.8257276004572234e-06, "loss": 0.0036, "step": 4790 }, { "epoch": 1.2640812557710064, "grad_norm": 0.01439090259373188, "learning_rate": 5.82537589026642e-06, "loss": 0.0029, "step": 4792 }, { "epoch": 1.2646088906476718, "grad_norm": 0.024132510647177696, "learning_rate": 5.825024180075618e-06, "loss": 0.0021, "step": 4794 }, { "epoch": 1.2651365255243372, "grad_norm": 0.3390601873397827, "learning_rate": 5.824672469884815e-06, "loss": 0.0047, "step": 4796 }, { "epoch": 1.2656641604010024, "grad_norm": 0.0321304090321064, "learning_rate": 5.824320759694012e-06, "loss": 0.0025, "step": 4798 }, { "epoch": 1.2661917952776678, "grad_norm": 0.11553817987442017, "learning_rate": 5.823969049503209e-06, "loss": 0.0022, "step": 4800 }, { "epoch": 1.2667194301543332, "grad_norm": 0.18847928941249847, "learning_rate": 5.823617339312407e-06, "loss": 0.0022, "step": 4802 }, { "epoch": 1.2672470650309986, "grad_norm": 0.02472066506743431, "learning_rate": 5.823265629121604e-06, "loss": 0.012, "step": 4804 }, { "epoch": 1.267774699907664, "grad_norm": 0.3541640043258667, "learning_rate": 5.822913918930801e-06, "loss": 0.0057, "step": 4806 }, { "epoch": 1.2683023347843292, "grad_norm": 1.5961017608642578, "learning_rate": 5.8225622087399986e-06, "loss": 0.0052, "step": 4808 }, { "epoch": 1.2688299696609946, "grad_norm": 2.5185201168060303, "learning_rate": 5.8222104985491955e-06, "loss": 0.0085, "step": 4810 }, { "epoch": 1.26935760453766, "grad_norm": 0.03969825059175491, "learning_rate": 5.821858788358393e-06, "loss": 0.0026, "step": 4812 }, { "epoch": 1.2698852394143252, "grad_norm": 0.7368956208229065, "learning_rate": 5.82150707816759e-06, "loss": 0.0037, "step": 4814 }, { "epoch": 1.2704128742909906, "grad_norm": 0.06321894377470016, "learning_rate": 5.821155367976787e-06, "loss": 0.0024, "step": 4816 }, { "epoch": 1.270940509167656, "grad_norm": 0.09370073676109314, "learning_rate": 5.820803657785985e-06, "loss": 0.0047, "step": 4818 }, { "epoch": 1.2714681440443214, "grad_norm": 0.02734762243926525, "learning_rate": 5.820451947595181e-06, "loss": 0.0034, "step": 4820 }, { "epoch": 1.2719957789209868, "grad_norm": 0.048427265137434006, "learning_rate": 5.820100237404379e-06, "loss": 0.0021, "step": 4822 }, { "epoch": 1.272523413797652, "grad_norm": 0.792088508605957, "learning_rate": 5.819748527213576e-06, "loss": 0.0029, "step": 4824 }, { "epoch": 1.2730510486743174, "grad_norm": 0.09780382364988327, "learning_rate": 5.819396817022774e-06, "loss": 0.0037, "step": 4826 }, { "epoch": 1.2735786835509826, "grad_norm": 0.5044623017311096, "learning_rate": 5.819045106831971e-06, "loss": 0.0039, "step": 4828 }, { "epoch": 1.274106318427648, "grad_norm": 0.03465502709150314, "learning_rate": 5.8186933966411676e-06, "loss": 0.0021, "step": 4830 }, { "epoch": 1.2746339533043134, "grad_norm": 0.533805251121521, "learning_rate": 5.818341686450365e-06, "loss": 0.0071, "step": 4832 }, { "epoch": 1.2751615881809788, "grad_norm": 1.0718488693237305, "learning_rate": 5.817989976259562e-06, "loss": 0.008, "step": 4834 }, { "epoch": 1.2756892230576442, "grad_norm": 0.45289307832717896, "learning_rate": 5.817638266068759e-06, "loss": 0.0056, "step": 4836 }, { "epoch": 1.2762168579343094, "grad_norm": 0.7972418069839478, "learning_rate": 5.817286555877957e-06, "loss": 0.0103, "step": 4838 }, { "epoch": 1.2767444928109748, "grad_norm": 0.01946425810456276, "learning_rate": 5.816934845687154e-06, "loss": 0.0019, "step": 4840 }, { "epoch": 1.2772721276876402, "grad_norm": 0.3322647213935852, "learning_rate": 5.816583135496352e-06, "loss": 0.0067, "step": 4842 }, { "epoch": 1.2777997625643054, "grad_norm": 0.09995825588703156, "learning_rate": 5.816231425305548e-06, "loss": 0.0123, "step": 4844 }, { "epoch": 1.2783273974409708, "grad_norm": 0.6934739351272583, "learning_rate": 5.815879715114746e-06, "loss": 0.0054, "step": 4846 }, { "epoch": 1.2788550323176362, "grad_norm": 0.7918305397033691, "learning_rate": 5.815528004923943e-06, "loss": 0.0088, "step": 4848 }, { "epoch": 1.2793826671943016, "grad_norm": 0.2597324550151825, "learning_rate": 5.81517629473314e-06, "loss": 0.0031, "step": 4850 }, { "epoch": 1.279910302070967, "grad_norm": 0.20971199870109558, "learning_rate": 5.8148245845423374e-06, "loss": 0.0135, "step": 4852 }, { "epoch": 1.2804379369476322, "grad_norm": 0.7565637230873108, "learning_rate": 5.814472874351534e-06, "loss": 0.028, "step": 4854 }, { "epoch": 1.2809655718242976, "grad_norm": 0.33064955472946167, "learning_rate": 5.814121164160732e-06, "loss": 0.0033, "step": 4856 }, { "epoch": 1.281493206700963, "grad_norm": 0.21693184971809387, "learning_rate": 5.813769453969928e-06, "loss": 0.0045, "step": 4858 }, { "epoch": 1.2820208415776282, "grad_norm": 0.06303434818983078, "learning_rate": 5.813417743779126e-06, "loss": 0.0022, "step": 4860 }, { "epoch": 1.2825484764542936, "grad_norm": 0.46875959634780884, "learning_rate": 5.813066033588323e-06, "loss": 0.0027, "step": 4862 }, { "epoch": 1.283076111330959, "grad_norm": 0.45186689496040344, "learning_rate": 5.812714323397521e-06, "loss": 0.0129, "step": 4864 }, { "epoch": 1.2836037462076244, "grad_norm": 0.5819169282913208, "learning_rate": 5.812362613206718e-06, "loss": 0.0052, "step": 4866 }, { "epoch": 1.2841313810842896, "grad_norm": 0.04232395067811012, "learning_rate": 5.812010903015915e-06, "loss": 0.0026, "step": 4868 }, { "epoch": 1.284659015960955, "grad_norm": 0.6116772890090942, "learning_rate": 5.8116591928251126e-06, "loss": 0.0234, "step": 4870 }, { "epoch": 1.2851866508376204, "grad_norm": 0.09529289603233337, "learning_rate": 5.8113074826343095e-06, "loss": 0.0021, "step": 4872 }, { "epoch": 1.2857142857142856, "grad_norm": 0.570353627204895, "learning_rate": 5.8109557724435065e-06, "loss": 0.0148, "step": 4874 }, { "epoch": 1.286241920590951, "grad_norm": 0.12543249130249023, "learning_rate": 5.810604062252704e-06, "loss": 0.0028, "step": 4876 }, { "epoch": 1.2867695554676164, "grad_norm": 1.5356978178024292, "learning_rate": 5.810252352061901e-06, "loss": 0.0154, "step": 4878 }, { "epoch": 1.2872971903442818, "grad_norm": 0.035998664796352386, "learning_rate": 5.809900641871098e-06, "loss": 0.0021, "step": 4880 }, { "epoch": 1.2878248252209472, "grad_norm": 0.02292576991021633, "learning_rate": 5.809548931680295e-06, "loss": 0.0116, "step": 4882 }, { "epoch": 1.2883524600976124, "grad_norm": 0.04247569665312767, "learning_rate": 5.809197221489493e-06, "loss": 0.0045, "step": 4884 }, { "epoch": 1.2888800949742778, "grad_norm": 0.2961863577365875, "learning_rate": 5.80884551129869e-06, "loss": 0.0065, "step": 4886 }, { "epoch": 1.2894077298509432, "grad_norm": 0.09939096122980118, "learning_rate": 5.808493801107887e-06, "loss": 0.0165, "step": 4888 }, { "epoch": 1.2899353647276084, "grad_norm": 0.5871089100837708, "learning_rate": 5.808142090917085e-06, "loss": 0.0073, "step": 4890 }, { "epoch": 1.2904629996042738, "grad_norm": 0.49815645813941956, "learning_rate": 5.8077903807262816e-06, "loss": 0.0035, "step": 4892 }, { "epoch": 1.2909906344809392, "grad_norm": 0.11924421042203903, "learning_rate": 5.807438670535479e-06, "loss": 0.003, "step": 4894 }, { "epoch": 1.2915182693576046, "grad_norm": 0.07491407543420792, "learning_rate": 5.807086960344676e-06, "loss": 0.005, "step": 4896 }, { "epoch": 1.29204590423427, "grad_norm": 0.5629310607910156, "learning_rate": 5.806735250153873e-06, "loss": 0.0119, "step": 4898 }, { "epoch": 1.2925735391109352, "grad_norm": 0.3574075400829315, "learning_rate": 5.806383539963071e-06, "loss": 0.0059, "step": 4900 }, { "epoch": 1.2931011739876006, "grad_norm": 0.5383639335632324, "learning_rate": 5.806031829772268e-06, "loss": 0.0117, "step": 4902 }, { "epoch": 1.293628808864266, "grad_norm": 0.04151812568306923, "learning_rate": 5.805680119581465e-06, "loss": 0.0033, "step": 4904 }, { "epoch": 1.2941564437409312, "grad_norm": 0.481562077999115, "learning_rate": 5.805328409390662e-06, "loss": 0.0074, "step": 4906 }, { "epoch": 1.2946840786175966, "grad_norm": 1.0734018087387085, "learning_rate": 5.80497669919986e-06, "loss": 0.013, "step": 4908 }, { "epoch": 1.295211713494262, "grad_norm": 0.1315944790840149, "learning_rate": 5.804624989009057e-06, "loss": 0.0046, "step": 4910 }, { "epoch": 1.2957393483709274, "grad_norm": 0.44969773292541504, "learning_rate": 5.804273278818254e-06, "loss": 0.0094, "step": 4912 }, { "epoch": 1.2962669832475926, "grad_norm": 0.39412665367126465, "learning_rate": 5.8039215686274514e-06, "loss": 0.009, "step": 4914 }, { "epoch": 1.296794618124258, "grad_norm": 0.42531612515449524, "learning_rate": 5.803569858436648e-06, "loss": 0.0051, "step": 4916 }, { "epoch": 1.2973222530009234, "grad_norm": 0.11389409750699997, "learning_rate": 5.803218148245845e-06, "loss": 0.0023, "step": 4918 }, { "epoch": 1.2978498878775886, "grad_norm": 0.7342713475227356, "learning_rate": 5.802866438055042e-06, "loss": 0.0044, "step": 4920 }, { "epoch": 1.298377522754254, "grad_norm": 0.6563075184822083, "learning_rate": 5.80251472786424e-06, "loss": 0.0084, "step": 4922 }, { "epoch": 1.2989051576309194, "grad_norm": 0.03760267421603203, "learning_rate": 5.802163017673438e-06, "loss": 0.0023, "step": 4924 }, { "epoch": 1.2994327925075848, "grad_norm": 0.019458195194602013, "learning_rate": 5.801811307482634e-06, "loss": 0.0019, "step": 4926 }, { "epoch": 1.2999604273842502, "grad_norm": 0.5913904905319214, "learning_rate": 5.801459597291832e-06, "loss": 0.0084, "step": 4928 }, { "epoch": 1.3004880622609154, "grad_norm": 0.10739616304636002, "learning_rate": 5.801107887101029e-06, "loss": 0.0049, "step": 4930 }, { "epoch": 1.3010156971375808, "grad_norm": 0.6122298240661621, "learning_rate": 5.8007561769102265e-06, "loss": 0.0135, "step": 4932 }, { "epoch": 1.3015433320142462, "grad_norm": 0.05946006253361702, "learning_rate": 5.8004044667194235e-06, "loss": 0.002, "step": 4934 }, { "epoch": 1.3020709668909114, "grad_norm": 0.42673254013061523, "learning_rate": 5.8000527565286204e-06, "loss": 0.0099, "step": 4936 }, { "epoch": 1.3025986017675768, "grad_norm": 0.6158038973808289, "learning_rate": 5.799701046337818e-06, "loss": 0.0145, "step": 4938 }, { "epoch": 1.3031262366442422, "grad_norm": 0.08669020235538483, "learning_rate": 5.799349336147014e-06, "loss": 0.0023, "step": 4940 }, { "epoch": 1.3036538715209076, "grad_norm": 0.24144873023033142, "learning_rate": 5.798997625956212e-06, "loss": 0.0048, "step": 4942 }, { "epoch": 1.3041815063975728, "grad_norm": 0.2175649255514145, "learning_rate": 5.798645915765409e-06, "loss": 0.003, "step": 4944 }, { "epoch": 1.3047091412742382, "grad_norm": 0.040025271475315094, "learning_rate": 5.798294205574607e-06, "loss": 0.0071, "step": 4946 }, { "epoch": 1.3052367761509036, "grad_norm": 0.05797363817691803, "learning_rate": 5.797942495383804e-06, "loss": 0.0025, "step": 4948 }, { "epoch": 1.3057644110275688, "grad_norm": 0.03437514975667, "learning_rate": 5.797590785193001e-06, "loss": 0.0024, "step": 4950 }, { "epoch": 1.3062920459042342, "grad_norm": 0.14400623738765717, "learning_rate": 5.797239075002199e-06, "loss": 0.0122, "step": 4952 }, { "epoch": 1.3068196807808996, "grad_norm": 0.20428717136383057, "learning_rate": 5.7968873648113956e-06, "loss": 0.016, "step": 4954 }, { "epoch": 1.307347315657565, "grad_norm": 0.3017105758190155, "learning_rate": 5.7965356546205925e-06, "loss": 0.0099, "step": 4956 }, { "epoch": 1.3078749505342304, "grad_norm": 0.12142409384250641, "learning_rate": 5.79618394442979e-06, "loss": 0.0025, "step": 4958 }, { "epoch": 1.3084025854108956, "grad_norm": 0.15162429213523865, "learning_rate": 5.795832234238987e-06, "loss": 0.0128, "step": 4960 }, { "epoch": 1.308930220287561, "grad_norm": 0.47429025173187256, "learning_rate": 5.795480524048185e-06, "loss": 0.0086, "step": 4962 }, { "epoch": 1.3094578551642264, "grad_norm": 0.09660805761814117, "learning_rate": 5.795128813857381e-06, "loss": 0.0052, "step": 4964 }, { "epoch": 1.3099854900408916, "grad_norm": 0.7929002046585083, "learning_rate": 5.794777103666579e-06, "loss": 0.0082, "step": 4966 }, { "epoch": 1.310513124917557, "grad_norm": 0.1930655539035797, "learning_rate": 5.794425393475776e-06, "loss": 0.0095, "step": 4968 }, { "epoch": 1.3110407597942224, "grad_norm": 0.10905519127845764, "learning_rate": 5.794073683284973e-06, "loss": 0.0048, "step": 4970 }, { "epoch": 1.3115683946708878, "grad_norm": 0.13808968663215637, "learning_rate": 5.793721973094171e-06, "loss": 0.0105, "step": 4972 }, { "epoch": 1.3120960295475532, "grad_norm": 0.16489392518997192, "learning_rate": 5.793370262903368e-06, "loss": 0.0026, "step": 4974 }, { "epoch": 1.3126236644242184, "grad_norm": 0.02430584467947483, "learning_rate": 5.7930185527125654e-06, "loss": 0.0085, "step": 4976 }, { "epoch": 1.3131512993008838, "grad_norm": 0.2259346842765808, "learning_rate": 5.7926668425217615e-06, "loss": 0.0037, "step": 4978 }, { "epoch": 1.3136789341775492, "grad_norm": 1.0382171869277954, "learning_rate": 5.792315132330959e-06, "loss": 0.0072, "step": 4980 }, { "epoch": 1.3142065690542144, "grad_norm": 0.4598039388656616, "learning_rate": 5.791963422140157e-06, "loss": 0.0046, "step": 4982 }, { "epoch": 1.3147342039308798, "grad_norm": 0.20837587118148804, "learning_rate": 5.791611711949354e-06, "loss": 0.0029, "step": 4984 }, { "epoch": 1.3152618388075452, "grad_norm": 0.5512325763702393, "learning_rate": 5.791260001758551e-06, "loss": 0.0033, "step": 4986 }, { "epoch": 1.3157894736842106, "grad_norm": 0.1507902592420578, "learning_rate": 5.790908291567748e-06, "loss": 0.0053, "step": 4988 }, { "epoch": 1.3163171085608758, "grad_norm": 0.022411784157156944, "learning_rate": 5.790556581376946e-06, "loss": 0.0066, "step": 4990 }, { "epoch": 1.3168447434375412, "grad_norm": 0.019965244457125664, "learning_rate": 5.790204871186143e-06, "loss": 0.004, "step": 4992 }, { "epoch": 1.3173723783142066, "grad_norm": 0.1286943256855011, "learning_rate": 5.78985316099534e-06, "loss": 0.0067, "step": 4994 }, { "epoch": 1.3179000131908718, "grad_norm": 0.14519309997558594, "learning_rate": 5.7895014508045375e-06, "loss": 0.0054, "step": 4996 }, { "epoch": 1.3184276480675372, "grad_norm": 0.04468713328242302, "learning_rate": 5.7891497406137344e-06, "loss": 0.0019, "step": 4998 }, { "epoch": 1.3189552829442026, "grad_norm": 0.04844222217798233, "learning_rate": 5.788798030422931e-06, "loss": 0.0022, "step": 5000 }, { "epoch": 1.319482917820868, "grad_norm": 0.03687555715441704, "learning_rate": 5.788446320232128e-06, "loss": 0.0027, "step": 5002 }, { "epoch": 1.3200105526975334, "grad_norm": 0.057283539324998856, "learning_rate": 5.788094610041326e-06, "loss": 0.002, "step": 5004 }, { "epoch": 1.3205381875741986, "grad_norm": 0.039636075496673584, "learning_rate": 5.787742899850523e-06, "loss": 0.0018, "step": 5006 }, { "epoch": 1.321065822450864, "grad_norm": 0.14982499182224274, "learning_rate": 5.78739118965972e-06, "loss": 0.0019, "step": 5008 }, { "epoch": 1.3215934573275294, "grad_norm": 0.0519658662378788, "learning_rate": 5.787039479468918e-06, "loss": 0.0072, "step": 5010 }, { "epoch": 1.3221210922041946, "grad_norm": 0.06256701797246933, "learning_rate": 5.786687769278115e-06, "loss": 0.0049, "step": 5012 }, { "epoch": 1.32264872708086, "grad_norm": 0.4356502592563629, "learning_rate": 5.786336059087313e-06, "loss": 0.0159, "step": 5014 }, { "epoch": 1.3231763619575254, "grad_norm": 0.0763649195432663, "learning_rate": 5.7859843488965096e-06, "loss": 0.0058, "step": 5016 }, { "epoch": 1.3237039968341908, "grad_norm": 0.19453312456607819, "learning_rate": 5.7856326387057065e-06, "loss": 0.0144, "step": 5018 }, { "epoch": 1.324231631710856, "grad_norm": 0.5425065159797668, "learning_rate": 5.785280928514904e-06, "loss": 0.0165, "step": 5020 }, { "epoch": 1.3247592665875214, "grad_norm": 0.1314072608947754, "learning_rate": 5.784929218324101e-06, "loss": 0.0026, "step": 5022 }, { "epoch": 1.3252869014641868, "grad_norm": 0.3191956877708435, "learning_rate": 5.784577508133298e-06, "loss": 0.013, "step": 5024 }, { "epoch": 1.325814536340852, "grad_norm": 0.16299186646938324, "learning_rate": 5.784225797942495e-06, "loss": 0.0026, "step": 5026 }, { "epoch": 1.3263421712175174, "grad_norm": 0.09773801267147064, "learning_rate": 5.783874087751693e-06, "loss": 0.0073, "step": 5028 }, { "epoch": 1.3268698060941828, "grad_norm": 0.03709521144628525, "learning_rate": 5.78352237756089e-06, "loss": 0.0021, "step": 5030 }, { "epoch": 1.3273974409708482, "grad_norm": 0.4289134442806244, "learning_rate": 5.783170667370087e-06, "loss": 0.0064, "step": 5032 }, { "epoch": 1.3279250758475136, "grad_norm": 0.05580095201730728, "learning_rate": 5.782818957179285e-06, "loss": 0.0023, "step": 5034 }, { "epoch": 1.3284527107241788, "grad_norm": 0.05328407511115074, "learning_rate": 5.782467246988482e-06, "loss": 0.0028, "step": 5036 }, { "epoch": 1.3289803456008442, "grad_norm": 0.41811603307724, "learning_rate": 5.7821155367976786e-06, "loss": 0.0104, "step": 5038 }, { "epoch": 1.3295079804775096, "grad_norm": 0.014769010245800018, "learning_rate": 5.781763826606876e-06, "loss": 0.0018, "step": 5040 }, { "epoch": 1.3300356153541748, "grad_norm": 0.1649676263332367, "learning_rate": 5.781412116416073e-06, "loss": 0.0029, "step": 5042 }, { "epoch": 1.3305632502308402, "grad_norm": 0.03361603245139122, "learning_rate": 5.781060406225271e-06, "loss": 0.0018, "step": 5044 }, { "epoch": 1.3310908851075056, "grad_norm": 0.37362319231033325, "learning_rate": 5.780708696034467e-06, "loss": 0.0052, "step": 5046 }, { "epoch": 1.331618519984171, "grad_norm": 0.021357692778110504, "learning_rate": 5.780356985843665e-06, "loss": 0.0026, "step": 5048 }, { "epoch": 1.3321461548608364, "grad_norm": 0.0277920700609684, "learning_rate": 5.780005275652862e-06, "loss": 0.0075, "step": 5050 }, { "epoch": 1.3326737897375016, "grad_norm": 0.4350624084472656, "learning_rate": 5.77965356546206e-06, "loss": 0.0024, "step": 5052 }, { "epoch": 1.333201424614167, "grad_norm": 0.05112631246447563, "learning_rate": 5.779301855271257e-06, "loss": 0.0019, "step": 5054 }, { "epoch": 1.3337290594908324, "grad_norm": 0.07014521956443787, "learning_rate": 5.778950145080454e-06, "loss": 0.0021, "step": 5056 }, { "epoch": 1.3342566943674976, "grad_norm": 0.031694866716861725, "learning_rate": 5.7785984348896515e-06, "loss": 0.0046, "step": 5058 }, { "epoch": 1.334784329244163, "grad_norm": 0.24107488989830017, "learning_rate": 5.7782467246988484e-06, "loss": 0.0027, "step": 5060 }, { "epoch": 1.3353119641208284, "grad_norm": 0.20779390633106232, "learning_rate": 5.777895014508045e-06, "loss": 0.0063, "step": 5062 }, { "epoch": 1.3358395989974938, "grad_norm": 0.40337371826171875, "learning_rate": 5.777543304317242e-06, "loss": 0.0049, "step": 5064 }, { "epoch": 1.336367233874159, "grad_norm": 0.021382367238402367, "learning_rate": 5.77719159412644e-06, "loss": 0.0027, "step": 5066 }, { "epoch": 1.3368948687508244, "grad_norm": 0.14853394031524658, "learning_rate": 5.776839883935637e-06, "loss": 0.002, "step": 5068 }, { "epoch": 1.3374225036274898, "grad_norm": 0.05209249258041382, "learning_rate": 5.776488173744834e-06, "loss": 0.0039, "step": 5070 }, { "epoch": 1.337950138504155, "grad_norm": 0.13605239987373352, "learning_rate": 5.776136463554032e-06, "loss": 0.0052, "step": 5072 }, { "epoch": 1.3384777733808204, "grad_norm": 0.01880500465631485, "learning_rate": 5.775784753363229e-06, "loss": 0.0017, "step": 5074 }, { "epoch": 1.3390054082574858, "grad_norm": 0.018555855378508568, "learning_rate": 5.775433043172426e-06, "loss": 0.0029, "step": 5076 }, { "epoch": 1.3395330431341512, "grad_norm": 0.6816891431808472, "learning_rate": 5.7750813329816235e-06, "loss": 0.002, "step": 5078 }, { "epoch": 1.3400606780108166, "grad_norm": 0.6865852475166321, "learning_rate": 5.7747296227908205e-06, "loss": 0.0068, "step": 5080 }, { "epoch": 1.3405883128874818, "grad_norm": 0.029136469587683678, "learning_rate": 5.774377912600018e-06, "loss": 0.0071, "step": 5082 }, { "epoch": 1.3411159477641472, "grad_norm": 0.3080501854419708, "learning_rate": 5.774026202409214e-06, "loss": 0.0159, "step": 5084 }, { "epoch": 1.3416435826408126, "grad_norm": 0.06385640054941177, "learning_rate": 5.773674492218412e-06, "loss": 0.0019, "step": 5086 }, { "epoch": 1.3421712175174778, "grad_norm": 0.10041742026805878, "learning_rate": 5.773322782027609e-06, "loss": 0.0048, "step": 5088 }, { "epoch": 1.3426988523941432, "grad_norm": 0.6464371085166931, "learning_rate": 5.772971071836807e-06, "loss": 0.0079, "step": 5090 }, { "epoch": 1.3432264872708086, "grad_norm": 0.026856794953346252, "learning_rate": 5.772619361646004e-06, "loss": 0.0028, "step": 5092 }, { "epoch": 1.343754122147474, "grad_norm": 0.10425892472267151, "learning_rate": 5.772267651455201e-06, "loss": 0.002, "step": 5094 }, { "epoch": 1.3442817570241394, "grad_norm": 0.01718476600944996, "learning_rate": 5.771915941264399e-06, "loss": 0.0018, "step": 5096 }, { "epoch": 1.3448093919008046, "grad_norm": 0.7050358057022095, "learning_rate": 5.771564231073596e-06, "loss": 0.0081, "step": 5098 }, { "epoch": 1.34533702677747, "grad_norm": 0.2467825710773468, "learning_rate": 5.7712125208827926e-06, "loss": 0.0025, "step": 5100 }, { "epoch": 1.3458646616541352, "grad_norm": 0.02826487459242344, "learning_rate": 5.77086081069199e-06, "loss": 0.0017, "step": 5102 }, { "epoch": 1.3463922965308006, "grad_norm": 0.3153332471847534, "learning_rate": 5.770509100501187e-06, "loss": 0.0045, "step": 5104 }, { "epoch": 1.346919931407466, "grad_norm": 0.25587400794029236, "learning_rate": 5.770157390310384e-06, "loss": 0.0026, "step": 5106 }, { "epoch": 1.3474475662841314, "grad_norm": 0.03407534956932068, "learning_rate": 5.769805680119581e-06, "loss": 0.0018, "step": 5108 }, { "epoch": 1.3479752011607968, "grad_norm": 1.2836732864379883, "learning_rate": 5.769453969928779e-06, "loss": 0.01, "step": 5110 }, { "epoch": 1.348502836037462, "grad_norm": 0.04959702864289284, "learning_rate": 5.769102259737976e-06, "loss": 0.0016, "step": 5112 }, { "epoch": 1.3490304709141274, "grad_norm": 0.19386276602745056, "learning_rate": 5.768750549547173e-06, "loss": 0.0129, "step": 5114 }, { "epoch": 1.3495581057907928, "grad_norm": 0.11194858700037003, "learning_rate": 5.768398839356371e-06, "loss": 0.002, "step": 5116 }, { "epoch": 1.350085740667458, "grad_norm": 0.5145659446716309, "learning_rate": 5.768047129165568e-06, "loss": 0.0105, "step": 5118 }, { "epoch": 1.3506133755441234, "grad_norm": 0.02877403050661087, "learning_rate": 5.7676954189747655e-06, "loss": 0.0036, "step": 5120 }, { "epoch": 1.3511410104207888, "grad_norm": 0.4487072229385376, "learning_rate": 5.767343708783962e-06, "loss": 0.0162, "step": 5122 }, { "epoch": 1.3516686452974542, "grad_norm": 0.12808401882648468, "learning_rate": 5.766991998593159e-06, "loss": 0.0037, "step": 5124 }, { "epoch": 1.3521962801741196, "grad_norm": 0.05611057206988335, "learning_rate": 5.766640288402357e-06, "loss": 0.0028, "step": 5126 }, { "epoch": 1.3527239150507848, "grad_norm": 0.08847817778587341, "learning_rate": 5.766288578211553e-06, "loss": 0.0058, "step": 5128 }, { "epoch": 1.3532515499274502, "grad_norm": 1.368660807609558, "learning_rate": 5.765936868020751e-06, "loss": 0.0035, "step": 5130 }, { "epoch": 1.3537791848041156, "grad_norm": 0.17927421629428864, "learning_rate": 5.765585157829948e-06, "loss": 0.0046, "step": 5132 }, { "epoch": 1.3543068196807808, "grad_norm": 0.37323078513145447, "learning_rate": 5.765233447639146e-06, "loss": 0.0075, "step": 5134 }, { "epoch": 1.3548344545574462, "grad_norm": 1.2990546226501465, "learning_rate": 5.764881737448343e-06, "loss": 0.0099, "step": 5136 }, { "epoch": 1.3553620894341116, "grad_norm": 0.14748527109622955, "learning_rate": 5.76453002725754e-06, "loss": 0.0019, "step": 5138 }, { "epoch": 1.355889724310777, "grad_norm": 0.7577086091041565, "learning_rate": 5.7641783170667375e-06, "loss": 0.0028, "step": 5140 }, { "epoch": 1.3564173591874422, "grad_norm": 0.5080617070198059, "learning_rate": 5.7638266068759345e-06, "loss": 0.0051, "step": 5142 }, { "epoch": 1.3569449940641076, "grad_norm": 0.01539627369493246, "learning_rate": 5.7634748966851314e-06, "loss": 0.0025, "step": 5144 }, { "epoch": 1.357472628940773, "grad_norm": 0.03705693036317825, "learning_rate": 5.763123186494328e-06, "loss": 0.0016, "step": 5146 }, { "epoch": 1.3580002638174382, "grad_norm": 0.01628102920949459, "learning_rate": 5.762771476303526e-06, "loss": 0.0024, "step": 5148 }, { "epoch": 1.3585278986941036, "grad_norm": 0.48793619871139526, "learning_rate": 5.762419766112724e-06, "loss": 0.0063, "step": 5150 }, { "epoch": 1.359055533570769, "grad_norm": 0.014329182915389538, "learning_rate": 5.76206805592192e-06, "loss": 0.0028, "step": 5152 }, { "epoch": 1.3595831684474344, "grad_norm": 0.6360824108123779, "learning_rate": 5.761716345731118e-06, "loss": 0.0024, "step": 5154 }, { "epoch": 1.3601108033240998, "grad_norm": 0.012531841173768044, "learning_rate": 5.761364635540315e-06, "loss": 0.0021, "step": 5156 }, { "epoch": 1.360638438200765, "grad_norm": 0.025477053597569466, "learning_rate": 5.761012925349512e-06, "loss": 0.0066, "step": 5158 }, { "epoch": 1.3611660730774304, "grad_norm": 0.11152269691228867, "learning_rate": 5.76066121515871e-06, "loss": 0.0071, "step": 5160 }, { "epoch": 1.3616937079540958, "grad_norm": 0.610949695110321, "learning_rate": 5.7603095049679066e-06, "loss": 0.0053, "step": 5162 }, { "epoch": 1.362221342830761, "grad_norm": 0.02430444397032261, "learning_rate": 5.759957794777104e-06, "loss": 0.0045, "step": 5164 }, { "epoch": 1.3627489777074264, "grad_norm": 0.05660809949040413, "learning_rate": 5.7596060845863005e-06, "loss": 0.0016, "step": 5166 }, { "epoch": 1.3632766125840918, "grad_norm": 1.3336641788482666, "learning_rate": 5.759254374395498e-06, "loss": 0.0113, "step": 5168 }, { "epoch": 1.3638042474607572, "grad_norm": 0.14707531034946442, "learning_rate": 5.758902664204695e-06, "loss": 0.0208, "step": 5170 }, { "epoch": 1.3643318823374226, "grad_norm": 0.66103196144104, "learning_rate": 5.758550954013893e-06, "loss": 0.0095, "step": 5172 }, { "epoch": 1.3648595172140878, "grad_norm": 1.7631783485412598, "learning_rate": 5.75819924382309e-06, "loss": 0.0034, "step": 5174 }, { "epoch": 1.3653871520907532, "grad_norm": 0.03352837264537811, "learning_rate": 5.757847533632287e-06, "loss": 0.0017, "step": 5176 }, { "epoch": 1.3659147869674184, "grad_norm": 2.628824234008789, "learning_rate": 5.757495823441485e-06, "loss": 0.0077, "step": 5178 }, { "epoch": 1.3664424218440838, "grad_norm": 1.427245020866394, "learning_rate": 5.757144113250682e-06, "loss": 0.0061, "step": 5180 }, { "epoch": 1.3669700567207492, "grad_norm": 0.4390449523925781, "learning_rate": 5.756792403059879e-06, "loss": 0.01, "step": 5182 }, { "epoch": 1.3674976915974146, "grad_norm": 0.09908091276884079, "learning_rate": 5.756440692869076e-06, "loss": 0.002, "step": 5184 }, { "epoch": 1.36802532647408, "grad_norm": 0.039166487753391266, "learning_rate": 5.756088982678273e-06, "loss": 0.0029, "step": 5186 }, { "epoch": 1.3685529613507452, "grad_norm": 0.14096038043498993, "learning_rate": 5.75573727248747e-06, "loss": 0.0033, "step": 5188 }, { "epoch": 1.3690805962274106, "grad_norm": 0.0449051558971405, "learning_rate": 5.755385562296667e-06, "loss": 0.0018, "step": 5190 }, { "epoch": 1.369608231104076, "grad_norm": 1.1427464485168457, "learning_rate": 5.755033852105865e-06, "loss": 0.0275, "step": 5192 }, { "epoch": 1.3701358659807412, "grad_norm": 1.0979666709899902, "learning_rate": 5.754682141915062e-06, "loss": 0.0041, "step": 5194 }, { "epoch": 1.3706635008574066, "grad_norm": 0.4393324553966522, "learning_rate": 5.754330431724259e-06, "loss": 0.0125, "step": 5196 }, { "epoch": 1.371191135734072, "grad_norm": 0.024876633659005165, "learning_rate": 5.753978721533457e-06, "loss": 0.0016, "step": 5198 }, { "epoch": 1.3717187706107374, "grad_norm": 0.12751102447509766, "learning_rate": 5.753627011342654e-06, "loss": 0.0042, "step": 5200 }, { "epoch": 1.3722464054874028, "grad_norm": 0.22680151462554932, "learning_rate": 5.7532753011518515e-06, "loss": 0.0101, "step": 5202 }, { "epoch": 1.372774040364068, "grad_norm": 0.03958825394511223, "learning_rate": 5.752923590961048e-06, "loss": 0.0018, "step": 5204 }, { "epoch": 1.3733016752407334, "grad_norm": 0.036740876734256744, "learning_rate": 5.7525718807702454e-06, "loss": 0.0023, "step": 5206 }, { "epoch": 1.3738293101173988, "grad_norm": 0.41305407881736755, "learning_rate": 5.752220170579443e-06, "loss": 0.0071, "step": 5208 }, { "epoch": 1.374356944994064, "grad_norm": 0.58838951587677, "learning_rate": 5.75186846038864e-06, "loss": 0.005, "step": 5210 }, { "epoch": 1.3748845798707294, "grad_norm": 0.2247505933046341, "learning_rate": 5.751516750197837e-06, "loss": 0.0057, "step": 5212 }, { "epoch": 1.3754122147473948, "grad_norm": 1.1926766633987427, "learning_rate": 5.751165040007034e-06, "loss": 0.0064, "step": 5214 }, { "epoch": 1.3759398496240602, "grad_norm": 0.23591652512550354, "learning_rate": 5.750813329816232e-06, "loss": 0.0083, "step": 5216 }, { "epoch": 1.3764674845007254, "grad_norm": 0.32920387387275696, "learning_rate": 5.750461619625429e-06, "loss": 0.0119, "step": 5218 }, { "epoch": 1.3769951193773908, "grad_norm": 0.07371518015861511, "learning_rate": 5.750109909434626e-06, "loss": 0.0024, "step": 5220 }, { "epoch": 1.3775227542540562, "grad_norm": 0.08161453902721405, "learning_rate": 5.749758199243824e-06, "loss": 0.0101, "step": 5222 }, { "epoch": 1.3780503891307214, "grad_norm": 0.13574247062206268, "learning_rate": 5.7494064890530205e-06, "loss": 0.0022, "step": 5224 }, { "epoch": 1.3785780240073868, "grad_norm": 0.027460366487503052, "learning_rate": 5.7490547788622175e-06, "loss": 0.0033, "step": 5226 }, { "epoch": 1.3791056588840522, "grad_norm": 0.019700586795806885, "learning_rate": 5.7487030686714145e-06, "loss": 0.0019, "step": 5228 }, { "epoch": 1.3796332937607176, "grad_norm": 0.017152801156044006, "learning_rate": 5.748351358480612e-06, "loss": 0.0022, "step": 5230 }, { "epoch": 1.380160928637383, "grad_norm": 0.40014025568962097, "learning_rate": 5.747999648289809e-06, "loss": 0.006, "step": 5232 }, { "epoch": 1.3806885635140482, "grad_norm": 0.3044649064540863, "learning_rate": 5.747647938099006e-06, "loss": 0.0049, "step": 5234 }, { "epoch": 1.3812161983907136, "grad_norm": 0.8121777176856995, "learning_rate": 5.747296227908204e-06, "loss": 0.0042, "step": 5236 }, { "epoch": 1.381743833267379, "grad_norm": 0.016458241268992424, "learning_rate": 5.746944517717401e-06, "loss": 0.0018, "step": 5238 }, { "epoch": 1.3822714681440442, "grad_norm": 0.054723914712667465, "learning_rate": 5.746592807526599e-06, "loss": 0.0018, "step": 5240 }, { "epoch": 1.3827991030207096, "grad_norm": 0.3298775553703308, "learning_rate": 5.746241097335796e-06, "loss": 0.0063, "step": 5242 }, { "epoch": 1.383326737897375, "grad_norm": 0.2931506633758545, "learning_rate": 5.745889387144993e-06, "loss": 0.0042, "step": 5244 }, { "epoch": 1.3838543727740404, "grad_norm": 0.26780250668525696, "learning_rate": 5.74553767695419e-06, "loss": 0.0091, "step": 5246 }, { "epoch": 1.3843820076507058, "grad_norm": 0.024777762591838837, "learning_rate": 5.7451859667633865e-06, "loss": 0.0123, "step": 5248 }, { "epoch": 1.384909642527371, "grad_norm": 0.045407816767692566, "learning_rate": 5.744834256572584e-06, "loss": 0.0053, "step": 5250 }, { "epoch": 1.3854372774040364, "grad_norm": 0.10699927061796188, "learning_rate": 5.744482546381781e-06, "loss": 0.0026, "step": 5252 }, { "epoch": 1.3859649122807016, "grad_norm": 0.060449760407209396, "learning_rate": 5.744130836190979e-06, "loss": 0.0018, "step": 5254 }, { "epoch": 1.386492547157367, "grad_norm": 0.040356509387493134, "learning_rate": 5.743779126000176e-06, "loss": 0.0083, "step": 5256 }, { "epoch": 1.3870201820340324, "grad_norm": 0.8158619999885559, "learning_rate": 5.743427415809373e-06, "loss": 0.0124, "step": 5258 }, { "epoch": 1.3875478169106978, "grad_norm": 0.0648449957370758, "learning_rate": 5.743075705618571e-06, "loss": 0.002, "step": 5260 }, { "epoch": 1.3880754517873632, "grad_norm": 0.06098516285419464, "learning_rate": 5.742723995427768e-06, "loss": 0.0032, "step": 5262 }, { "epoch": 1.3886030866640284, "grad_norm": 0.1437073051929474, "learning_rate": 5.742372285236965e-06, "loss": 0.0025, "step": 5264 }, { "epoch": 1.3891307215406938, "grad_norm": 0.5242015719413757, "learning_rate": 5.7420205750461625e-06, "loss": 0.0076, "step": 5266 }, { "epoch": 1.3896583564173592, "grad_norm": 0.05312640592455864, "learning_rate": 5.7416688648553594e-06, "loss": 0.002, "step": 5268 }, { "epoch": 1.3901859912940244, "grad_norm": 0.424696683883667, "learning_rate": 5.741317154664557e-06, "loss": 0.004, "step": 5270 }, { "epoch": 1.3907136261706898, "grad_norm": 0.6945917010307312, "learning_rate": 5.740965444473753e-06, "loss": 0.0163, "step": 5272 }, { "epoch": 1.3912412610473552, "grad_norm": 0.02310420200228691, "learning_rate": 5.740613734282951e-06, "loss": 0.0042, "step": 5274 }, { "epoch": 1.3917688959240206, "grad_norm": 0.2066752165555954, "learning_rate": 5.740262024092148e-06, "loss": 0.0249, "step": 5276 }, { "epoch": 1.392296530800686, "grad_norm": 0.0778648629784584, "learning_rate": 5.739910313901345e-06, "loss": 0.0058, "step": 5278 }, { "epoch": 1.3928241656773512, "grad_norm": 0.5454005002975464, "learning_rate": 5.739558603710543e-06, "loss": 0.0062, "step": 5280 }, { "epoch": 1.3933518005540166, "grad_norm": 0.16241303086280823, "learning_rate": 5.73920689351974e-06, "loss": 0.0036, "step": 5282 }, { "epoch": 1.393879435430682, "grad_norm": 0.1558098942041397, "learning_rate": 5.738855183328938e-06, "loss": 0.0023, "step": 5284 }, { "epoch": 1.3944070703073472, "grad_norm": 0.020031942054629326, "learning_rate": 5.738503473138134e-06, "loss": 0.007, "step": 5286 }, { "epoch": 1.3949347051840126, "grad_norm": 0.19223175942897797, "learning_rate": 5.7381517629473315e-06, "loss": 0.0062, "step": 5288 }, { "epoch": 1.395462340060678, "grad_norm": 0.6634377837181091, "learning_rate": 5.7378000527565284e-06, "loss": 0.0126, "step": 5290 }, { "epoch": 1.3959899749373434, "grad_norm": 0.38593801856040955, "learning_rate": 5.737448342565726e-06, "loss": 0.0037, "step": 5292 }, { "epoch": 1.3965176098140086, "grad_norm": 0.42742276191711426, "learning_rate": 5.737096632374923e-06, "loss": 0.0037, "step": 5294 }, { "epoch": 1.397045244690674, "grad_norm": 0.5214412212371826, "learning_rate": 5.73674492218412e-06, "loss": 0.0085, "step": 5296 }, { "epoch": 1.3975728795673394, "grad_norm": 0.6212027668952942, "learning_rate": 5.736393211993318e-06, "loss": 0.0042, "step": 5298 }, { "epoch": 1.3981005144440046, "grad_norm": 0.041980940848588943, "learning_rate": 5.736041501802515e-06, "loss": 0.0113, "step": 5300 }, { "epoch": 1.39862814932067, "grad_norm": 0.13668060302734375, "learning_rate": 5.735689791611712e-06, "loss": 0.0078, "step": 5302 }, { "epoch": 1.3991557841973354, "grad_norm": 0.6440321803092957, "learning_rate": 5.73533808142091e-06, "loss": 0.006, "step": 5304 }, { "epoch": 1.3996834190740008, "grad_norm": 0.04293186962604523, "learning_rate": 5.734986371230107e-06, "loss": 0.0016, "step": 5306 }, { "epoch": 1.4002110539506663, "grad_norm": 0.6642261743545532, "learning_rate": 5.7346346610393036e-06, "loss": 0.008, "step": 5308 }, { "epoch": 1.4007386888273314, "grad_norm": 0.09547359496355057, "learning_rate": 5.7342829508485005e-06, "loss": 0.0118, "step": 5310 }, { "epoch": 1.4012663237039968, "grad_norm": 0.26397576928138733, "learning_rate": 5.733931240657698e-06, "loss": 0.0021, "step": 5312 }, { "epoch": 1.4017939585806622, "grad_norm": 0.36207520961761475, "learning_rate": 5.733579530466895e-06, "loss": 0.003, "step": 5314 }, { "epoch": 1.4023215934573274, "grad_norm": 0.0989401638507843, "learning_rate": 5.733227820276092e-06, "loss": 0.002, "step": 5316 }, { "epoch": 1.4028492283339928, "grad_norm": 0.039399467408657074, "learning_rate": 5.73287611008529e-06, "loss": 0.0017, "step": 5318 }, { "epoch": 1.4033768632106582, "grad_norm": 0.1666027456521988, "learning_rate": 5.732524399894487e-06, "loss": 0.0021, "step": 5320 }, { "epoch": 1.4039044980873236, "grad_norm": 0.363765150308609, "learning_rate": 5.732172689703685e-06, "loss": 0.0076, "step": 5322 }, { "epoch": 1.404432132963989, "grad_norm": 0.5636208653450012, "learning_rate": 5.731820979512882e-06, "loss": 0.0069, "step": 5324 }, { "epoch": 1.4049597678406542, "grad_norm": 0.03219389170408249, "learning_rate": 5.731469269322079e-06, "loss": 0.0027, "step": 5326 }, { "epoch": 1.4054874027173196, "grad_norm": 0.4228559732437134, "learning_rate": 5.7311175591312765e-06, "loss": 0.011, "step": 5328 }, { "epoch": 1.4060150375939848, "grad_norm": 0.04977480322122574, "learning_rate": 5.7307658489404734e-06, "loss": 0.0016, "step": 5330 }, { "epoch": 1.4065426724706502, "grad_norm": 0.31534677743911743, "learning_rate": 5.73041413874967e-06, "loss": 0.0095, "step": 5332 }, { "epoch": 1.4070703073473156, "grad_norm": 0.1754073202610016, "learning_rate": 5.730062428558867e-06, "loss": 0.0023, "step": 5334 }, { "epoch": 1.407597942223981, "grad_norm": 0.1497064232826233, "learning_rate": 5.729710718368065e-06, "loss": 0.0019, "step": 5336 }, { "epoch": 1.4081255771006465, "grad_norm": 0.17783601582050323, "learning_rate": 5.729359008177262e-06, "loss": 0.002, "step": 5338 }, { "epoch": 1.4086532119773116, "grad_norm": 0.15410968661308289, "learning_rate": 5.729007297986459e-06, "loss": 0.0064, "step": 5340 }, { "epoch": 1.409180846853977, "grad_norm": 0.30438053607940674, "learning_rate": 5.728655587795657e-06, "loss": 0.0043, "step": 5342 }, { "epoch": 1.4097084817306424, "grad_norm": 0.3834567070007324, "learning_rate": 5.728303877604854e-06, "loss": 0.0057, "step": 5344 }, { "epoch": 1.4102361166073076, "grad_norm": 0.30271410942077637, "learning_rate": 5.727952167414051e-06, "loss": 0.0056, "step": 5346 }, { "epoch": 1.410763751483973, "grad_norm": 0.8286554217338562, "learning_rate": 5.727600457223248e-06, "loss": 0.0133, "step": 5348 }, { "epoch": 1.4112913863606384, "grad_norm": 0.06120412051677704, "learning_rate": 5.7272487470324455e-06, "loss": 0.0112, "step": 5350 }, { "epoch": 1.4118190212373039, "grad_norm": 0.06874700635671616, "learning_rate": 5.726897036841643e-06, "loss": 0.0039, "step": 5352 }, { "epoch": 1.4123466561139693, "grad_norm": 0.23249267041683197, "learning_rate": 5.726545326650839e-06, "loss": 0.0108, "step": 5354 }, { "epoch": 1.4128742909906344, "grad_norm": 0.02090161107480526, "learning_rate": 5.726193616460037e-06, "loss": 0.0015, "step": 5356 }, { "epoch": 1.4134019258672998, "grad_norm": 0.07275869697332382, "learning_rate": 5.725841906269234e-06, "loss": 0.0016, "step": 5358 }, { "epoch": 1.4139295607439653, "grad_norm": 0.06603968888521194, "learning_rate": 5.725490196078432e-06, "loss": 0.0016, "step": 5360 }, { "epoch": 1.4144571956206304, "grad_norm": 0.04687988758087158, "learning_rate": 5.725138485887629e-06, "loss": 0.0132, "step": 5362 }, { "epoch": 1.4149848304972958, "grad_norm": 0.07295101881027222, "learning_rate": 5.724786775696826e-06, "loss": 0.006, "step": 5364 }, { "epoch": 1.4155124653739612, "grad_norm": 0.021280542016029358, "learning_rate": 5.724435065506024e-06, "loss": 0.0019, "step": 5366 }, { "epoch": 1.4160401002506267, "grad_norm": 0.021789606660604477, "learning_rate": 5.72408335531522e-06, "loss": 0.0024, "step": 5368 }, { "epoch": 1.4165677351272918, "grad_norm": 0.04232395812869072, "learning_rate": 5.7237316451244176e-06, "loss": 0.0016, "step": 5370 }, { "epoch": 1.4170953700039572, "grad_norm": 0.18134178221225739, "learning_rate": 5.7233799349336145e-06, "loss": 0.0101, "step": 5372 }, { "epoch": 1.4176230048806227, "grad_norm": 0.0772232785820961, "learning_rate": 5.723028224742812e-06, "loss": 0.002, "step": 5374 }, { "epoch": 1.4181506397572878, "grad_norm": 0.127940833568573, "learning_rate": 5.722676514552009e-06, "loss": 0.0043, "step": 5376 }, { "epoch": 1.4186782746339532, "grad_norm": 0.02218434400856495, "learning_rate": 5.722324804361206e-06, "loss": 0.0016, "step": 5378 }, { "epoch": 1.4192059095106186, "grad_norm": 0.06627558171749115, "learning_rate": 5.721973094170404e-06, "loss": 0.0016, "step": 5380 }, { "epoch": 1.419733544387284, "grad_norm": 0.04249270260334015, "learning_rate": 5.721621383979601e-06, "loss": 0.0015, "step": 5382 }, { "epoch": 1.4202611792639495, "grad_norm": 0.03560956194996834, "learning_rate": 5.721269673788798e-06, "loss": 0.0016, "step": 5384 }, { "epoch": 1.4207888141406146, "grad_norm": 0.8204081654548645, "learning_rate": 5.720917963597996e-06, "loss": 0.0065, "step": 5386 }, { "epoch": 1.42131644901728, "grad_norm": 0.0695529580116272, "learning_rate": 5.720566253407193e-06, "loss": 0.0126, "step": 5388 }, { "epoch": 1.4218440838939455, "grad_norm": 0.167499378323555, "learning_rate": 5.7202145432163905e-06, "loss": 0.0045, "step": 5390 }, { "epoch": 1.4223717187706106, "grad_norm": 0.49359390139579773, "learning_rate": 5.7198628330255866e-06, "loss": 0.0166, "step": 5392 }, { "epoch": 1.422899353647276, "grad_norm": 0.06944341957569122, "learning_rate": 5.719511122834784e-06, "loss": 0.0029, "step": 5394 }, { "epoch": 1.4234269885239414, "grad_norm": 0.13818706572055817, "learning_rate": 5.719159412643981e-06, "loss": 0.0028, "step": 5396 }, { "epoch": 1.4239546234006069, "grad_norm": 0.10619524866342545, "learning_rate": 5.718807702453178e-06, "loss": 0.0109, "step": 5398 }, { "epoch": 1.4244822582772723, "grad_norm": 0.1662093698978424, "learning_rate": 5.718455992262376e-06, "loss": 0.0117, "step": 5400 }, { "epoch": 1.4250098931539374, "grad_norm": 0.41655731201171875, "learning_rate": 5.718104282071573e-06, "loss": 0.005, "step": 5402 }, { "epoch": 1.4255375280306029, "grad_norm": 0.7725035548210144, "learning_rate": 5.717752571880771e-06, "loss": 0.0117, "step": 5404 }, { "epoch": 1.426065162907268, "grad_norm": 0.045101597905159, "learning_rate": 5.717400861689967e-06, "loss": 0.0046, "step": 5406 }, { "epoch": 1.4265927977839334, "grad_norm": 0.03869100287556648, "learning_rate": 5.717049151499165e-06, "loss": 0.0017, "step": 5408 }, { "epoch": 1.4271204326605988, "grad_norm": 0.035899970680475235, "learning_rate": 5.7166974413083625e-06, "loss": 0.0051, "step": 5410 }, { "epoch": 1.4276480675372643, "grad_norm": 0.10107250511646271, "learning_rate": 5.7163457311175595e-06, "loss": 0.0117, "step": 5412 }, { "epoch": 1.4281757024139297, "grad_norm": 0.3394050896167755, "learning_rate": 5.7159940209267564e-06, "loss": 0.0104, "step": 5414 }, { "epoch": 1.4287033372905948, "grad_norm": 0.5025769472122192, "learning_rate": 5.715642310735953e-06, "loss": 0.0032, "step": 5416 }, { "epoch": 1.4292309721672602, "grad_norm": 0.05414950102567673, "learning_rate": 5.715290600545151e-06, "loss": 0.0018, "step": 5418 }, { "epoch": 1.4297586070439257, "grad_norm": 0.0664336085319519, "learning_rate": 5.714938890354348e-06, "loss": 0.0016, "step": 5420 }, { "epoch": 1.4302862419205908, "grad_norm": 0.34164300560951233, "learning_rate": 5.714587180163545e-06, "loss": 0.0147, "step": 5422 }, { "epoch": 1.4308138767972562, "grad_norm": 0.08511331677436829, "learning_rate": 5.714235469972743e-06, "loss": 0.0021, "step": 5424 }, { "epoch": 1.4313415116739217, "grad_norm": 0.08195549994707108, "learning_rate": 5.71388375978194e-06, "loss": 0.0052, "step": 5426 }, { "epoch": 1.431869146550587, "grad_norm": 0.05104782432317734, "learning_rate": 5.713532049591137e-06, "loss": 0.0017, "step": 5428 }, { "epoch": 1.4323967814272525, "grad_norm": 0.11245305091142654, "learning_rate": 5.713180339400334e-06, "loss": 0.0064, "step": 5430 }, { "epoch": 1.4329244163039176, "grad_norm": 0.059793341904878616, "learning_rate": 5.7128286292095315e-06, "loss": 0.0018, "step": 5432 }, { "epoch": 1.433452051180583, "grad_norm": 0.18177182972431183, "learning_rate": 5.712476919018729e-06, "loss": 0.0023, "step": 5434 }, { "epoch": 1.4339796860572485, "grad_norm": 0.36225613951683044, "learning_rate": 5.7121252088279255e-06, "loss": 0.0022, "step": 5436 }, { "epoch": 1.4345073209339136, "grad_norm": 0.02633105032145977, "learning_rate": 5.711773498637123e-06, "loss": 0.0014, "step": 5438 }, { "epoch": 1.435034955810579, "grad_norm": 0.04358436167240143, "learning_rate": 5.71142178844632e-06, "loss": 0.0016, "step": 5440 }, { "epoch": 1.4355625906872445, "grad_norm": 0.43003296852111816, "learning_rate": 5.711070078255518e-06, "loss": 0.0029, "step": 5442 }, { "epoch": 1.4360902255639099, "grad_norm": 0.387493371963501, "learning_rate": 5.710718368064715e-06, "loss": 0.0167, "step": 5444 }, { "epoch": 1.436617860440575, "grad_norm": 0.2564697265625, "learning_rate": 5.710366657873912e-06, "loss": 0.0112, "step": 5446 }, { "epoch": 1.4371454953172405, "grad_norm": 0.012769684195518494, "learning_rate": 5.71001494768311e-06, "loss": 0.0021, "step": 5448 }, { "epoch": 1.4376731301939059, "grad_norm": 0.11273187398910522, "learning_rate": 5.709663237492307e-06, "loss": 0.0018, "step": 5450 }, { "epoch": 1.438200765070571, "grad_norm": 0.9547806978225708, "learning_rate": 5.709311527301504e-06, "loss": 0.0067, "step": 5452 }, { "epoch": 1.4387283999472364, "grad_norm": 0.6634114384651184, "learning_rate": 5.7089598171107006e-06, "loss": 0.0067, "step": 5454 }, { "epoch": 1.4392560348239019, "grad_norm": 0.08659157156944275, "learning_rate": 5.708608106919898e-06, "loss": 0.0067, "step": 5456 }, { "epoch": 1.4397836697005673, "grad_norm": 0.20037874579429626, "learning_rate": 5.708256396729095e-06, "loss": 0.0041, "step": 5458 }, { "epoch": 1.4403113045772327, "grad_norm": 0.04920978471636772, "learning_rate": 5.707904686538292e-06, "loss": 0.0014, "step": 5460 }, { "epoch": 1.4408389394538978, "grad_norm": 0.6427474617958069, "learning_rate": 5.70755297634749e-06, "loss": 0.0021, "step": 5462 }, { "epoch": 1.4413665743305633, "grad_norm": 0.014374230057001114, "learning_rate": 5.707201266156687e-06, "loss": 0.0096, "step": 5464 }, { "epoch": 1.4418942092072287, "grad_norm": 0.627732515335083, "learning_rate": 5.706849555965884e-06, "loss": 0.0109, "step": 5466 }, { "epoch": 1.4424218440838938, "grad_norm": 0.07690135389566422, "learning_rate": 5.706497845775082e-06, "loss": 0.0015, "step": 5468 }, { "epoch": 1.4429494789605593, "grad_norm": 0.11077740788459778, "learning_rate": 5.706146135584279e-06, "loss": 0.002, "step": 5470 }, { "epoch": 1.4434771138372247, "grad_norm": 0.9852369427680969, "learning_rate": 5.7057944253934765e-06, "loss": 0.0106, "step": 5472 }, { "epoch": 1.44400474871389, "grad_norm": 0.019002728164196014, "learning_rate": 5.705442715202673e-06, "loss": 0.0017, "step": 5474 }, { "epoch": 1.4445323835905555, "grad_norm": 0.3788120150566101, "learning_rate": 5.7050910050118704e-06, "loss": 0.0035, "step": 5476 }, { "epoch": 1.4450600184672207, "grad_norm": 0.11948461830615997, "learning_rate": 5.704739294821067e-06, "loss": 0.0122, "step": 5478 }, { "epoch": 1.445587653343886, "grad_norm": 0.04171111434698105, "learning_rate": 5.704387584630265e-06, "loss": 0.0018, "step": 5480 }, { "epoch": 1.4461152882205512, "grad_norm": 0.025479357689619064, "learning_rate": 5.704035874439462e-06, "loss": 0.0015, "step": 5482 }, { "epoch": 1.4466429230972166, "grad_norm": 0.1606965810060501, "learning_rate": 5.703684164248659e-06, "loss": 0.0026, "step": 5484 }, { "epoch": 1.447170557973882, "grad_norm": 0.02025483176112175, "learning_rate": 5.703332454057857e-06, "loss": 0.0015, "step": 5486 }, { "epoch": 1.4476981928505475, "grad_norm": 0.014199369587004185, "learning_rate": 5.702980743867053e-06, "loss": 0.0064, "step": 5488 }, { "epoch": 1.4482258277272129, "grad_norm": 0.302564412355423, "learning_rate": 5.702629033676251e-06, "loss": 0.025, "step": 5490 }, { "epoch": 1.448753462603878, "grad_norm": 0.018051767721772194, "learning_rate": 5.702277323485449e-06, "loss": 0.0015, "step": 5492 }, { "epoch": 1.4492810974805435, "grad_norm": 0.03222641721367836, "learning_rate": 5.7019256132946455e-06, "loss": 0.0015, "step": 5494 }, { "epoch": 1.4498087323572089, "grad_norm": 0.07142238318920135, "learning_rate": 5.7015739031038425e-06, "loss": 0.0013, "step": 5496 }, { "epoch": 1.450336367233874, "grad_norm": 0.3739297091960907, "learning_rate": 5.7012221929130394e-06, "loss": 0.0074, "step": 5498 }, { "epoch": 1.4508640021105395, "grad_norm": 0.08682761341333389, "learning_rate": 5.700870482722237e-06, "loss": 0.0019, "step": 5500 }, { "epoch": 1.4513916369872049, "grad_norm": 0.8655498623847961, "learning_rate": 5.700518772531434e-06, "loss": 0.0055, "step": 5502 }, { "epoch": 1.4519192718638703, "grad_norm": 0.061148736625909805, "learning_rate": 5.700167062340631e-06, "loss": 0.0016, "step": 5504 }, { "epoch": 1.4524469067405357, "grad_norm": 0.39420661330223083, "learning_rate": 5.699815352149829e-06, "loss": 0.0157, "step": 5506 }, { "epoch": 1.4529745416172009, "grad_norm": 0.07849511504173279, "learning_rate": 5.699463641959026e-06, "loss": 0.0022, "step": 5508 }, { "epoch": 1.4535021764938663, "grad_norm": 0.45096996426582336, "learning_rate": 5.699111931768224e-06, "loss": 0.0086, "step": 5510 }, { "epoch": 1.4540298113705317, "grad_norm": 0.3132898807525635, "learning_rate": 5.69876022157742e-06, "loss": 0.0062, "step": 5512 }, { "epoch": 1.4545574462471969, "grad_norm": 0.3126857876777649, "learning_rate": 5.698408511386618e-06, "loss": 0.0035, "step": 5514 }, { "epoch": 1.4550850811238623, "grad_norm": 0.14854203164577484, "learning_rate": 5.6980568011958146e-06, "loss": 0.0125, "step": 5516 }, { "epoch": 1.4556127160005277, "grad_norm": 0.07519487291574478, "learning_rate": 5.6977050910050115e-06, "loss": 0.0054, "step": 5518 }, { "epoch": 1.456140350877193, "grad_norm": 0.16796861588954926, "learning_rate": 5.697353380814209e-06, "loss": 0.0081, "step": 5520 }, { "epoch": 1.4566679857538583, "grad_norm": 0.06778734922409058, "learning_rate": 5.697001670623406e-06, "loss": 0.0019, "step": 5522 }, { "epoch": 1.4571956206305237, "grad_norm": 0.360325425863266, "learning_rate": 5.696649960432604e-06, "loss": 0.0134, "step": 5524 }, { "epoch": 1.457723255507189, "grad_norm": 0.10740821063518524, "learning_rate": 5.696298250241801e-06, "loss": 0.002, "step": 5526 }, { "epoch": 1.4582508903838542, "grad_norm": 0.28758704662323, "learning_rate": 5.695946540050998e-06, "loss": 0.0094, "step": 5528 }, { "epoch": 1.4587785252605197, "grad_norm": 0.18839746713638306, "learning_rate": 5.695594829860196e-06, "loss": 0.0019, "step": 5530 }, { "epoch": 1.459306160137185, "grad_norm": 0.047740012407302856, "learning_rate": 5.695243119669393e-06, "loss": 0.0021, "step": 5532 }, { "epoch": 1.4598337950138505, "grad_norm": 0.29916879534721375, "learning_rate": 5.69489140947859e-06, "loss": 0.0019, "step": 5534 }, { "epoch": 1.4603614298905159, "grad_norm": 0.20427103340625763, "learning_rate": 5.694539699287787e-06, "loss": 0.0023, "step": 5536 }, { "epoch": 1.460889064767181, "grad_norm": 0.05093039199709892, "learning_rate": 5.694187989096984e-06, "loss": 0.0036, "step": 5538 }, { "epoch": 1.4614166996438465, "grad_norm": 0.038292285054922104, "learning_rate": 5.693836278906181e-06, "loss": 0.0016, "step": 5540 }, { "epoch": 1.4619443345205119, "grad_norm": 0.701895534992218, "learning_rate": 5.693484568715378e-06, "loss": 0.0038, "step": 5542 }, { "epoch": 1.462471969397177, "grad_norm": 1.4343245029449463, "learning_rate": 5.693132858524576e-06, "loss": 0.0179, "step": 5544 }, { "epoch": 1.4629996042738425, "grad_norm": 0.3361428380012512, "learning_rate": 5.692781148333773e-06, "loss": 0.0052, "step": 5546 }, { "epoch": 1.4635272391505079, "grad_norm": 0.050879981368780136, "learning_rate": 5.69242943814297e-06, "loss": 0.0016, "step": 5548 }, { "epoch": 1.4640548740271733, "grad_norm": 0.015889788046479225, "learning_rate": 5.692077727952168e-06, "loss": 0.0014, "step": 5550 }, { "epoch": 1.4645825089038387, "grad_norm": 0.5307744741439819, "learning_rate": 5.691726017761365e-06, "loss": 0.0141, "step": 5552 }, { "epoch": 1.4651101437805039, "grad_norm": 0.06763352453708649, "learning_rate": 5.691374307570563e-06, "loss": 0.0016, "step": 5554 }, { "epoch": 1.4656377786571693, "grad_norm": 0.01768641173839569, "learning_rate": 5.691022597379759e-06, "loss": 0.0145, "step": 5556 }, { "epoch": 1.4661654135338344, "grad_norm": 0.2903168201446533, "learning_rate": 5.6906708871889565e-06, "loss": 0.0041, "step": 5558 }, { "epoch": 1.4666930484104999, "grad_norm": 0.03891117870807648, "learning_rate": 5.6903191769981534e-06, "loss": 0.002, "step": 5560 }, { "epoch": 1.4672206832871653, "grad_norm": 0.4479115307331085, "learning_rate": 5.689967466807351e-06, "loss": 0.0131, "step": 5562 }, { "epoch": 1.4677483181638307, "grad_norm": 0.7131748795509338, "learning_rate": 5.689615756616548e-06, "loss": 0.0113, "step": 5564 }, { "epoch": 1.468275953040496, "grad_norm": 0.08830536901950836, "learning_rate": 5.689264046425745e-06, "loss": 0.0017, "step": 5566 }, { "epoch": 1.4688035879171613, "grad_norm": 0.06963130086660385, "learning_rate": 5.688912336234943e-06, "loss": 0.0015, "step": 5568 }, { "epoch": 1.4693312227938267, "grad_norm": 0.06609972566366196, "learning_rate": 5.68856062604414e-06, "loss": 0.0041, "step": 5570 }, { "epoch": 1.469858857670492, "grad_norm": 0.09693916141986847, "learning_rate": 5.688208915853337e-06, "loss": 0.0014, "step": 5572 }, { "epoch": 1.4703864925471573, "grad_norm": 0.22860930860042572, "learning_rate": 5.687857205662534e-06, "loss": 0.002, "step": 5574 }, { "epoch": 1.4709141274238227, "grad_norm": 0.03869302198290825, "learning_rate": 5.687505495471732e-06, "loss": 0.0058, "step": 5576 }, { "epoch": 1.471441762300488, "grad_norm": 0.11313772201538086, "learning_rate": 5.6871537852809285e-06, "loss": 0.0041, "step": 5578 }, { "epoch": 1.4719693971771535, "grad_norm": 0.2026917040348053, "learning_rate": 5.6868020750901255e-06, "loss": 0.0017, "step": 5580 }, { "epoch": 1.4724970320538189, "grad_norm": 0.4914742410182953, "learning_rate": 5.686450364899323e-06, "loss": 0.0062, "step": 5582 }, { "epoch": 1.473024666930484, "grad_norm": 0.27456068992614746, "learning_rate": 5.68609865470852e-06, "loss": 0.002, "step": 5584 }, { "epoch": 1.4735523018071495, "grad_norm": 0.31188225746154785, "learning_rate": 5.685746944517717e-06, "loss": 0.0158, "step": 5586 }, { "epoch": 1.4740799366838149, "grad_norm": 0.056727658957242966, "learning_rate": 5.685395234326915e-06, "loss": 0.002, "step": 5588 }, { "epoch": 1.47460757156048, "grad_norm": 0.08275242149829865, "learning_rate": 5.685043524136112e-06, "loss": 0.0018, "step": 5590 }, { "epoch": 1.4751352064371455, "grad_norm": 0.5202338099479675, "learning_rate": 5.68469181394531e-06, "loss": 0.0055, "step": 5592 }, { "epoch": 1.4756628413138109, "grad_norm": 0.0518956333398819, "learning_rate": 5.684340103754506e-06, "loss": 0.0013, "step": 5594 }, { "epoch": 1.4761904761904763, "grad_norm": 0.5144376754760742, "learning_rate": 5.683988393563704e-06, "loss": 0.0122, "step": 5596 }, { "epoch": 1.4767181110671415, "grad_norm": 0.13455183804035187, "learning_rate": 5.683636683372901e-06, "loss": 0.0018, "step": 5598 }, { "epoch": 1.4772457459438069, "grad_norm": 0.2994706332683563, "learning_rate": 5.683284973182098e-06, "loss": 0.0049, "step": 5600 }, { "epoch": 1.4777733808204723, "grad_norm": 0.7156603932380676, "learning_rate": 5.682933262991295e-06, "loss": 0.0095, "step": 5602 }, { "epoch": 1.4783010156971375, "grad_norm": 0.4229554831981659, "learning_rate": 5.682581552800492e-06, "loss": 0.0025, "step": 5604 }, { "epoch": 1.4788286505738029, "grad_norm": 0.02084295079112053, "learning_rate": 5.68222984260969e-06, "loss": 0.0073, "step": 5606 }, { "epoch": 1.4793562854504683, "grad_norm": 0.09924768656492233, "learning_rate": 5.681878132418887e-06, "loss": 0.002, "step": 5608 }, { "epoch": 1.4798839203271337, "grad_norm": 2.365713119506836, "learning_rate": 5.681526422228084e-06, "loss": 0.0064, "step": 5610 }, { "epoch": 1.480411555203799, "grad_norm": 0.1672387421131134, "learning_rate": 5.681174712037282e-06, "loss": 0.0084, "step": 5612 }, { "epoch": 1.4809391900804643, "grad_norm": 0.22176823019981384, "learning_rate": 5.680823001846479e-06, "loss": 0.0105, "step": 5614 }, { "epoch": 1.4814668249571297, "grad_norm": 0.09632369875907898, "learning_rate": 5.680471291655676e-06, "loss": 0.0024, "step": 5616 }, { "epoch": 1.481994459833795, "grad_norm": 0.8041802644729614, "learning_rate": 5.680119581464873e-06, "loss": 0.0062, "step": 5618 }, { "epoch": 1.4825220947104603, "grad_norm": 0.018497290089726448, "learning_rate": 5.6797678712740705e-06, "loss": 0.0026, "step": 5620 }, { "epoch": 1.4830497295871257, "grad_norm": 0.03175603598356247, "learning_rate": 5.6794161610832674e-06, "loss": 0.0013, "step": 5622 }, { "epoch": 1.483577364463791, "grad_norm": 0.09330154210329056, "learning_rate": 5.679064450892464e-06, "loss": 0.0044, "step": 5624 }, { "epoch": 1.4841049993404565, "grad_norm": 0.7956753373146057, "learning_rate": 5.678712740701662e-06, "loss": 0.0083, "step": 5626 }, { "epoch": 1.4846326342171219, "grad_norm": 0.6010192632675171, "learning_rate": 5.678361030510859e-06, "loss": 0.0092, "step": 5628 }, { "epoch": 1.485160269093787, "grad_norm": 0.7038750648498535, "learning_rate": 5.678009320320057e-06, "loss": 0.0073, "step": 5630 }, { "epoch": 1.4856879039704525, "grad_norm": 0.4693587124347687, "learning_rate": 5.677657610129253e-06, "loss": 0.0026, "step": 5632 }, { "epoch": 1.4862155388471177, "grad_norm": 0.017818201333284378, "learning_rate": 5.677305899938451e-06, "loss": 0.003, "step": 5634 }, { "epoch": 1.486743173723783, "grad_norm": 0.2698207199573517, "learning_rate": 5.676954189747649e-06, "loss": 0.0075, "step": 5636 }, { "epoch": 1.4872708086004485, "grad_norm": 0.15943235158920288, "learning_rate": 5.676602479556845e-06, "loss": 0.008, "step": 5638 }, { "epoch": 1.4877984434771139, "grad_norm": 0.3346497416496277, "learning_rate": 5.6762507693660425e-06, "loss": 0.01, "step": 5640 }, { "epoch": 1.4883260783537793, "grad_norm": 0.1334133744239807, "learning_rate": 5.6758990591752395e-06, "loss": 0.0027, "step": 5642 }, { "epoch": 1.4888537132304445, "grad_norm": 0.7080581188201904, "learning_rate": 5.675547348984437e-06, "loss": 0.0102, "step": 5644 }, { "epoch": 1.4893813481071099, "grad_norm": 0.17164982855319977, "learning_rate": 5.675195638793634e-06, "loss": 0.0088, "step": 5646 }, { "epoch": 1.4899089829837753, "grad_norm": 0.22690525650978088, "learning_rate": 5.674843928602831e-06, "loss": 0.0096, "step": 5648 }, { "epoch": 1.4904366178604405, "grad_norm": 0.5425533652305603, "learning_rate": 5.674492218412029e-06, "loss": 0.0155, "step": 5650 }, { "epoch": 1.4909642527371059, "grad_norm": 0.10327813774347305, "learning_rate": 5.674140508221226e-06, "loss": 0.0017, "step": 5652 }, { "epoch": 1.4914918876137713, "grad_norm": 0.06415417790412903, "learning_rate": 5.673788798030423e-06, "loss": 0.0016, "step": 5654 }, { "epoch": 1.4920195224904367, "grad_norm": 0.1712850034236908, "learning_rate": 5.67343708783962e-06, "loss": 0.0023, "step": 5656 }, { "epoch": 1.492547157367102, "grad_norm": 0.042237311601638794, "learning_rate": 5.673085377648818e-06, "loss": 0.0041, "step": 5658 }, { "epoch": 1.4930747922437673, "grad_norm": 0.0306895412504673, "learning_rate": 5.672733667458015e-06, "loss": 0.0033, "step": 5660 }, { "epoch": 1.4936024271204327, "grad_norm": 0.057527489960193634, "learning_rate": 5.6723819572672116e-06, "loss": 0.002, "step": 5662 }, { "epoch": 1.494130061997098, "grad_norm": 0.5420340895652771, "learning_rate": 5.672030247076409e-06, "loss": 0.0044, "step": 5664 }, { "epoch": 1.4946576968737633, "grad_norm": 0.025614244863390923, "learning_rate": 5.671678536885606e-06, "loss": 0.0014, "step": 5666 }, { "epoch": 1.4951853317504287, "grad_norm": 0.6163070201873779, "learning_rate": 5.671326826694804e-06, "loss": 0.0027, "step": 5668 }, { "epoch": 1.495712966627094, "grad_norm": 0.5592097640037537, "learning_rate": 5.670975116504001e-06, "loss": 0.0024, "step": 5670 }, { "epoch": 1.4962406015037595, "grad_norm": 0.044019829481840134, "learning_rate": 5.670623406313198e-06, "loss": 0.0029, "step": 5672 }, { "epoch": 1.4967682363804247, "grad_norm": 0.04277949035167694, "learning_rate": 5.670271696122396e-06, "loss": 0.0014, "step": 5674 }, { "epoch": 1.49729587125709, "grad_norm": 0.4555404484272003, "learning_rate": 5.669919985931592e-06, "loss": 0.0044, "step": 5676 }, { "epoch": 1.4978235061337555, "grad_norm": 0.07889489084482193, "learning_rate": 5.66956827574079e-06, "loss": 0.0017, "step": 5678 }, { "epoch": 1.4983511410104207, "grad_norm": 0.8283011317253113, "learning_rate": 5.669216565549987e-06, "loss": 0.0109, "step": 5680 }, { "epoch": 1.498878775887086, "grad_norm": 0.31054237484931946, "learning_rate": 5.6688648553591845e-06, "loss": 0.0057, "step": 5682 }, { "epoch": 1.4994064107637515, "grad_norm": 0.020863138139247894, "learning_rate": 5.668513145168381e-06, "loss": 0.002, "step": 5684 }, { "epoch": 1.4999340456404169, "grad_norm": 0.636784017086029, "learning_rate": 5.668161434977578e-06, "loss": 0.0067, "step": 5686 }, { "epoch": 1.5004616805170823, "grad_norm": 0.06299258023500443, "learning_rate": 5.667809724786776e-06, "loss": 0.0102, "step": 5688 }, { "epoch": 1.5009893153937477, "grad_norm": 0.03932757303118706, "learning_rate": 5.667458014595973e-06, "loss": 0.0017, "step": 5690 }, { "epoch": 1.5015169502704129, "grad_norm": 0.17950239777565002, "learning_rate": 5.66710630440517e-06, "loss": 0.0074, "step": 5692 }, { "epoch": 1.502044585147078, "grad_norm": 0.35902640223503113, "learning_rate": 5.666754594214368e-06, "loss": 0.0081, "step": 5694 }, { "epoch": 1.5025722200237435, "grad_norm": 0.260647714138031, "learning_rate": 5.666402884023565e-06, "loss": 0.0091, "step": 5696 }, { "epoch": 1.5030998549004089, "grad_norm": 0.031244054436683655, "learning_rate": 5.666051173832763e-06, "loss": 0.0028, "step": 5698 }, { "epoch": 1.5036274897770743, "grad_norm": 0.2275756299495697, "learning_rate": 5.665699463641959e-06, "loss": 0.0104, "step": 5700 }, { "epoch": 1.5041551246537397, "grad_norm": 0.4216736853122711, "learning_rate": 5.6653477534511565e-06, "loss": 0.0061, "step": 5702 }, { "epoch": 1.504682759530405, "grad_norm": 0.031554143875837326, "learning_rate": 5.6649960432603535e-06, "loss": 0.0091, "step": 5704 }, { "epoch": 1.5052103944070703, "grad_norm": 0.03512243926525116, "learning_rate": 5.6646443330695504e-06, "loss": 0.0015, "step": 5706 }, { "epoch": 1.5057380292837357, "grad_norm": 0.4278688132762909, "learning_rate": 5.664292622878748e-06, "loss": 0.0189, "step": 5708 }, { "epoch": 1.5062656641604009, "grad_norm": 0.10424812138080597, "learning_rate": 5.663940912687945e-06, "loss": 0.0059, "step": 5710 }, { "epoch": 1.5067932990370663, "grad_norm": 0.19694975018501282, "learning_rate": 5.663589202497143e-06, "loss": 0.0029, "step": 5712 }, { "epoch": 1.5073209339137317, "grad_norm": 0.26754170656204224, "learning_rate": 5.663237492306339e-06, "loss": 0.0047, "step": 5714 }, { "epoch": 1.507848568790397, "grad_norm": 0.23394429683685303, "learning_rate": 5.662885782115537e-06, "loss": 0.0056, "step": 5716 }, { "epoch": 1.5083762036670625, "grad_norm": 0.06737557798624039, "learning_rate": 5.662534071924734e-06, "loss": 0.0018, "step": 5718 }, { "epoch": 1.508903838543728, "grad_norm": 0.2543068528175354, "learning_rate": 5.662182361733932e-06, "loss": 0.0032, "step": 5720 }, { "epoch": 1.509431473420393, "grad_norm": 0.8314550518989563, "learning_rate": 5.661830651543129e-06, "loss": 0.0048, "step": 5722 }, { "epoch": 1.5099591082970585, "grad_norm": 0.2549379765987396, "learning_rate": 5.6614789413523256e-06, "loss": 0.0126, "step": 5724 }, { "epoch": 1.5104867431737237, "grad_norm": 0.3028561472892761, "learning_rate": 5.661127231161523e-06, "loss": 0.0093, "step": 5726 }, { "epoch": 1.511014378050389, "grad_norm": 0.42751017212867737, "learning_rate": 5.66077552097072e-06, "loss": 0.0048, "step": 5728 }, { "epoch": 1.5115420129270545, "grad_norm": 0.6227596402168274, "learning_rate": 5.660423810779917e-06, "loss": 0.0193, "step": 5730 }, { "epoch": 1.5120696478037199, "grad_norm": 0.07301518321037292, "learning_rate": 5.660072100589115e-06, "loss": 0.0019, "step": 5732 }, { "epoch": 1.5125972826803853, "grad_norm": 0.014774835668504238, "learning_rate": 5.659720390398312e-06, "loss": 0.0114, "step": 5734 }, { "epoch": 1.5131249175570505, "grad_norm": 0.6288931965827942, "learning_rate": 5.659368680207509e-06, "loss": 0.0063, "step": 5736 }, { "epoch": 1.5136525524337159, "grad_norm": 0.015981897711753845, "learning_rate": 5.659016970016706e-06, "loss": 0.0013, "step": 5738 }, { "epoch": 1.514180187310381, "grad_norm": 0.014368824660778046, "learning_rate": 5.658665259825904e-06, "loss": 0.0065, "step": 5740 }, { "epoch": 1.5147078221870465, "grad_norm": 0.031424663960933685, "learning_rate": 5.658313549635101e-06, "loss": 0.0013, "step": 5742 }, { "epoch": 1.5152354570637119, "grad_norm": 0.5219122767448425, "learning_rate": 5.657961839444298e-06, "loss": 0.0041, "step": 5744 }, { "epoch": 1.5157630919403773, "grad_norm": 0.26772013306617737, "learning_rate": 5.657610129253495e-06, "loss": 0.008, "step": 5746 }, { "epoch": 1.5162907268170427, "grad_norm": 0.07188833504915237, "learning_rate": 5.657258419062692e-06, "loss": 0.0015, "step": 5748 }, { "epoch": 1.516818361693708, "grad_norm": 0.0520935021340847, "learning_rate": 5.65690670887189e-06, "loss": 0.0115, "step": 5750 }, { "epoch": 1.5173459965703733, "grad_norm": 0.10801482200622559, "learning_rate": 5.656554998681087e-06, "loss": 0.0121, "step": 5752 }, { "epoch": 1.5178736314470387, "grad_norm": 0.03730657324194908, "learning_rate": 5.656203288490284e-06, "loss": 0.0138, "step": 5754 }, { "epoch": 1.5184012663237039, "grad_norm": 0.06366570293903351, "learning_rate": 5.655851578299482e-06, "loss": 0.0014, "step": 5756 }, { "epoch": 1.5189289012003693, "grad_norm": 0.02283896505832672, "learning_rate": 5.655499868108679e-06, "loss": 0.0013, "step": 5758 }, { "epoch": 1.5194565360770347, "grad_norm": 0.10655398666858673, "learning_rate": 5.655148157917876e-06, "loss": 0.0021, "step": 5760 }, { "epoch": 1.5199841709537, "grad_norm": 0.07358899712562561, "learning_rate": 5.654796447727073e-06, "loss": 0.0038, "step": 5762 }, { "epoch": 1.5205118058303655, "grad_norm": 0.051509249955415726, "learning_rate": 5.6544447375362705e-06, "loss": 0.0013, "step": 5764 }, { "epoch": 1.521039440707031, "grad_norm": 0.26036131381988525, "learning_rate": 5.6540930273454675e-06, "loss": 0.0018, "step": 5766 }, { "epoch": 1.521567075583696, "grad_norm": 0.4602700471878052, "learning_rate": 5.6537413171546644e-06, "loss": 0.0056, "step": 5768 }, { "epoch": 1.5220947104603613, "grad_norm": 0.2980864942073822, "learning_rate": 5.653389606963862e-06, "loss": 0.0127, "step": 5770 }, { "epoch": 1.5226223453370267, "grad_norm": 0.1304105669260025, "learning_rate": 5.653037896773059e-06, "loss": 0.0048, "step": 5772 }, { "epoch": 1.523149980213692, "grad_norm": 0.01578648015856743, "learning_rate": 5.652686186582256e-06, "loss": 0.0023, "step": 5774 }, { "epoch": 1.5236776150903575, "grad_norm": 0.12302812188863754, "learning_rate": 5.652334476391453e-06, "loss": 0.0014, "step": 5776 }, { "epoch": 1.524205249967023, "grad_norm": 0.21309033036231995, "learning_rate": 5.651982766200651e-06, "loss": 0.0036, "step": 5778 }, { "epoch": 1.5247328848436883, "grad_norm": 0.07717626541852951, "learning_rate": 5.651631056009849e-06, "loss": 0.0118, "step": 5780 }, { "epoch": 1.5252605197203535, "grad_norm": 0.02100449427962303, "learning_rate": 5.651279345819045e-06, "loss": 0.0078, "step": 5782 }, { "epoch": 1.5257881545970189, "grad_norm": 0.04416874796152115, "learning_rate": 5.650927635628243e-06, "loss": 0.0014, "step": 5784 }, { "epoch": 1.526315789473684, "grad_norm": 0.9470024108886719, "learning_rate": 5.6505759254374395e-06, "loss": 0.0056, "step": 5786 }, { "epoch": 1.5268434243503495, "grad_norm": 0.2194100320339203, "learning_rate": 5.650224215246637e-06, "loss": 0.0018, "step": 5788 }, { "epoch": 1.5273710592270149, "grad_norm": 0.19426491856575012, "learning_rate": 5.649872505055834e-06, "loss": 0.0017, "step": 5790 }, { "epoch": 1.5278986941036803, "grad_norm": 0.023034440353512764, "learning_rate": 5.649520794865031e-06, "loss": 0.0012, "step": 5792 }, { "epoch": 1.5284263289803457, "grad_norm": 0.21322371065616608, "learning_rate": 5.649169084674229e-06, "loss": 0.0095, "step": 5794 }, { "epoch": 1.528953963857011, "grad_norm": 0.22444453835487366, "learning_rate": 5.648817374483425e-06, "loss": 0.0026, "step": 5796 }, { "epoch": 1.5294815987336763, "grad_norm": 0.3238725960254669, "learning_rate": 5.648465664292623e-06, "loss": 0.0063, "step": 5798 }, { "epoch": 1.5300092336103417, "grad_norm": 0.41728243231773376, "learning_rate": 5.64811395410182e-06, "loss": 0.0129, "step": 5800 }, { "epoch": 1.5305368684870069, "grad_norm": 0.2975119650363922, "learning_rate": 5.647762243911018e-06, "loss": 0.0152, "step": 5802 }, { "epoch": 1.5310645033636723, "grad_norm": 0.5163164734840393, "learning_rate": 5.647410533720215e-06, "loss": 0.0054, "step": 5804 }, { "epoch": 1.5315921382403377, "grad_norm": 0.14186061918735504, "learning_rate": 5.647058823529412e-06, "loss": 0.0038, "step": 5806 }, { "epoch": 1.532119773117003, "grad_norm": 0.3640466332435608, "learning_rate": 5.646707113338609e-06, "loss": 0.0132, "step": 5808 }, { "epoch": 1.5326474079936685, "grad_norm": 0.3814058005809784, "learning_rate": 5.646355403147806e-06, "loss": 0.0105, "step": 5810 }, { "epoch": 1.5331750428703337, "grad_norm": 0.15478183329105377, "learning_rate": 5.646003692957003e-06, "loss": 0.0024, "step": 5812 }, { "epoch": 1.533702677746999, "grad_norm": 0.41413894295692444, "learning_rate": 5.645651982766201e-06, "loss": 0.0054, "step": 5814 }, { "epoch": 1.5342303126236643, "grad_norm": 0.9886385202407837, "learning_rate": 5.645300272575398e-06, "loss": 0.0048, "step": 5816 }, { "epoch": 1.5347579475003297, "grad_norm": 0.029590291902422905, "learning_rate": 5.644948562384596e-06, "loss": 0.0012, "step": 5818 }, { "epoch": 1.535285582376995, "grad_norm": 0.09152291715145111, "learning_rate": 5.644596852193792e-06, "loss": 0.0015, "step": 5820 }, { "epoch": 1.5358132172536605, "grad_norm": 0.09842834621667862, "learning_rate": 5.64424514200299e-06, "loss": 0.0066, "step": 5822 }, { "epoch": 1.536340852130326, "grad_norm": 0.02526884898543358, "learning_rate": 5.643893431812187e-06, "loss": 0.0014, "step": 5824 }, { "epoch": 1.5368684870069913, "grad_norm": 0.02238163910806179, "learning_rate": 5.643541721621384e-06, "loss": 0.0013, "step": 5826 }, { "epoch": 1.5373961218836565, "grad_norm": 0.8151556849479675, "learning_rate": 5.6431900114305815e-06, "loss": 0.0056, "step": 5828 }, { "epoch": 1.537923756760322, "grad_norm": 0.03342890739440918, "learning_rate": 5.6428383012397784e-06, "loss": 0.0015, "step": 5830 }, { "epoch": 1.538451391636987, "grad_norm": 0.8071246147155762, "learning_rate": 5.642486591048976e-06, "loss": 0.0103, "step": 5832 }, { "epoch": 1.5389790265136525, "grad_norm": 0.14452072978019714, "learning_rate": 5.642134880858172e-06, "loss": 0.01, "step": 5834 }, { "epoch": 1.5395066613903179, "grad_norm": 0.01665368862450123, "learning_rate": 5.64178317066737e-06, "loss": 0.0058, "step": 5836 }, { "epoch": 1.5400342962669833, "grad_norm": 0.3408218324184418, "learning_rate": 5.641431460476568e-06, "loss": 0.005, "step": 5838 }, { "epoch": 1.5405619311436487, "grad_norm": 0.28835737705230713, "learning_rate": 5.641079750285765e-06, "loss": 0.0025, "step": 5840 }, { "epoch": 1.541089566020314, "grad_norm": 0.6755359768867493, "learning_rate": 5.640728040094962e-06, "loss": 0.0189, "step": 5842 }, { "epoch": 1.5416172008969793, "grad_norm": 0.36054402589797974, "learning_rate": 5.640376329904159e-06, "loss": 0.0158, "step": 5844 }, { "epoch": 1.5421448357736445, "grad_norm": 0.11835277825593948, "learning_rate": 5.640024619713357e-06, "loss": 0.0104, "step": 5846 }, { "epoch": 1.5426724706503099, "grad_norm": 0.0264911986887455, "learning_rate": 5.6396729095225535e-06, "loss": 0.0013, "step": 5848 }, { "epoch": 1.5432001055269753, "grad_norm": 0.05042378604412079, "learning_rate": 5.6393211993317505e-06, "loss": 0.0023, "step": 5850 }, { "epoch": 1.5437277404036407, "grad_norm": 0.07682254165410995, "learning_rate": 5.638969489140948e-06, "loss": 0.0045, "step": 5852 }, { "epoch": 1.544255375280306, "grad_norm": 0.43877342343330383, "learning_rate": 5.638617778950145e-06, "loss": 0.0062, "step": 5854 }, { "epoch": 1.5447830101569715, "grad_norm": 0.052720241248607635, "learning_rate": 5.638266068759342e-06, "loss": 0.0043, "step": 5856 }, { "epoch": 1.5453106450336367, "grad_norm": 0.9021618366241455, "learning_rate": 5.637914358568539e-06, "loss": 0.0054, "step": 5858 }, { "epoch": 1.545838279910302, "grad_norm": 0.17161479592323303, "learning_rate": 5.637562648377737e-06, "loss": 0.0019, "step": 5860 }, { "epoch": 1.5463659147869673, "grad_norm": 0.026572581380605698, "learning_rate": 5.637210938186935e-06, "loss": 0.0016, "step": 5862 }, { "epoch": 1.5468935496636327, "grad_norm": 0.024972442537546158, "learning_rate": 5.636859227996131e-06, "loss": 0.0017, "step": 5864 }, { "epoch": 1.547421184540298, "grad_norm": 0.03283753618597984, "learning_rate": 5.636507517805329e-06, "loss": 0.0075, "step": 5866 }, { "epoch": 1.5479488194169635, "grad_norm": 0.672739565372467, "learning_rate": 5.636155807614526e-06, "loss": 0.0046, "step": 5868 }, { "epoch": 1.548476454293629, "grad_norm": 0.014904257841408253, "learning_rate": 5.635804097423723e-06, "loss": 0.0011, "step": 5870 }, { "epoch": 1.5490040891702943, "grad_norm": 0.5137752890586853, "learning_rate": 5.63545238723292e-06, "loss": 0.0088, "step": 5872 }, { "epoch": 1.5495317240469595, "grad_norm": 0.021909330040216446, "learning_rate": 5.635100677042117e-06, "loss": 0.0034, "step": 5874 }, { "epoch": 1.550059358923625, "grad_norm": 0.02136465534567833, "learning_rate": 5.634748966851315e-06, "loss": 0.0103, "step": 5876 }, { "epoch": 1.55058699380029, "grad_norm": 0.06078660488128662, "learning_rate": 5.634397256660512e-06, "loss": 0.0015, "step": 5878 }, { "epoch": 1.5511146286769555, "grad_norm": 0.01902274787425995, "learning_rate": 5.634045546469709e-06, "loss": 0.0064, "step": 5880 }, { "epoch": 1.551642263553621, "grad_norm": 0.021427201107144356, "learning_rate": 5.633693836278906e-06, "loss": 0.0068, "step": 5882 }, { "epoch": 1.5521698984302863, "grad_norm": 0.030339127406477928, "learning_rate": 5.633342126088104e-06, "loss": 0.0013, "step": 5884 }, { "epoch": 1.5526975333069517, "grad_norm": 0.1451546549797058, "learning_rate": 5.632990415897301e-06, "loss": 0.0056, "step": 5886 }, { "epoch": 1.5532251681836169, "grad_norm": 0.02622574009001255, "learning_rate": 5.632638705706498e-06, "loss": 0.0093, "step": 5888 }, { "epoch": 1.5537528030602823, "grad_norm": 0.24532511830329895, "learning_rate": 5.6322869955156955e-06, "loss": 0.0105, "step": 5890 }, { "epoch": 1.5542804379369475, "grad_norm": 0.06471958756446838, "learning_rate": 5.631935285324892e-06, "loss": 0.0015, "step": 5892 }, { "epoch": 1.5548080728136129, "grad_norm": 0.08683309704065323, "learning_rate": 5.631583575134089e-06, "loss": 0.0016, "step": 5894 }, { "epoch": 1.5553357076902783, "grad_norm": 0.3371615707874298, "learning_rate": 5.631231864943287e-06, "loss": 0.0085, "step": 5896 }, { "epoch": 1.5558633425669437, "grad_norm": 0.13057777285575867, "learning_rate": 5.630880154752484e-06, "loss": 0.0017, "step": 5898 }, { "epoch": 1.556390977443609, "grad_norm": 0.2129533588886261, "learning_rate": 5.630528444561682e-06, "loss": 0.0015, "step": 5900 }, { "epoch": 1.5569186123202745, "grad_norm": 0.06740522384643555, "learning_rate": 5.630176734370878e-06, "loss": 0.0132, "step": 5902 }, { "epoch": 1.5574462471969397, "grad_norm": 0.03012491762638092, "learning_rate": 5.629825024180076e-06, "loss": 0.0012, "step": 5904 }, { "epoch": 1.557973882073605, "grad_norm": 0.08227842301130295, "learning_rate": 5.629473313989273e-06, "loss": 0.0131, "step": 5906 }, { "epoch": 1.5585015169502703, "grad_norm": 0.017297277227044106, "learning_rate": 5.629121603798471e-06, "loss": 0.0015, "step": 5908 }, { "epoch": 1.5590291518269357, "grad_norm": 0.0594463013112545, "learning_rate": 5.6287698936076675e-06, "loss": 0.004, "step": 5910 }, { "epoch": 1.559556786703601, "grad_norm": 0.1231759712100029, "learning_rate": 5.6284181834168645e-06, "loss": 0.006, "step": 5912 }, { "epoch": 1.5600844215802665, "grad_norm": 0.5910042524337769, "learning_rate": 5.628066473226062e-06, "loss": 0.0023, "step": 5914 }, { "epoch": 1.560612056456932, "grad_norm": 0.011245762929320335, "learning_rate": 5.627714763035258e-06, "loss": 0.0018, "step": 5916 }, { "epoch": 1.5611396913335973, "grad_norm": 0.012563249096274376, "learning_rate": 5.627363052844456e-06, "loss": 0.0051, "step": 5918 }, { "epoch": 1.5616673262102625, "grad_norm": 0.03535247966647148, "learning_rate": 5.627011342653654e-06, "loss": 0.0012, "step": 5920 }, { "epoch": 1.5621949610869277, "grad_norm": 0.01929664984345436, "learning_rate": 5.626659632462851e-06, "loss": 0.0102, "step": 5922 }, { "epoch": 1.562722595963593, "grad_norm": 0.338609516620636, "learning_rate": 5.626307922272048e-06, "loss": 0.002, "step": 5924 }, { "epoch": 1.5632502308402585, "grad_norm": 0.026165790855884552, "learning_rate": 5.625956212081245e-06, "loss": 0.0139, "step": 5926 }, { "epoch": 1.563777865716924, "grad_norm": 0.2205565869808197, "learning_rate": 5.625604501890443e-06, "loss": 0.0047, "step": 5928 }, { "epoch": 1.5643055005935893, "grad_norm": 0.15195706486701965, "learning_rate": 5.62525279169964e-06, "loss": 0.0091, "step": 5930 }, { "epoch": 1.5648331354702547, "grad_norm": 0.3382548987865448, "learning_rate": 5.6249010815088365e-06, "loss": 0.0013, "step": 5932 }, { "epoch": 1.56536077034692, "grad_norm": 0.030311496928334236, "learning_rate": 5.624549371318034e-06, "loss": 0.0026, "step": 5934 }, { "epoch": 1.5658884052235853, "grad_norm": 0.193287655711174, "learning_rate": 5.624197661127231e-06, "loss": 0.0041, "step": 5936 }, { "epoch": 1.5664160401002505, "grad_norm": 0.6739153861999512, "learning_rate": 5.623845950936429e-06, "loss": 0.0053, "step": 5938 }, { "epoch": 1.566943674976916, "grad_norm": 0.44683367013931274, "learning_rate": 5.623494240745625e-06, "loss": 0.0058, "step": 5940 }, { "epoch": 1.5674713098535813, "grad_norm": 0.2365870624780655, "learning_rate": 5.623142530554823e-06, "loss": 0.0109, "step": 5942 }, { "epoch": 1.5679989447302467, "grad_norm": 0.04290133714675903, "learning_rate": 5.62279082036402e-06, "loss": 0.0083, "step": 5944 }, { "epoch": 1.568526579606912, "grad_norm": 0.1409742385149002, "learning_rate": 5.622439110173217e-06, "loss": 0.0092, "step": 5946 }, { "epoch": 1.5690542144835775, "grad_norm": 0.6673303246498108, "learning_rate": 5.622087399982415e-06, "loss": 0.0071, "step": 5948 }, { "epoch": 1.5695818493602427, "grad_norm": 0.6179690361022949, "learning_rate": 5.621735689791612e-06, "loss": 0.0124, "step": 5950 }, { "epoch": 1.570109484236908, "grad_norm": 0.6144758462905884, "learning_rate": 5.6213839796008095e-06, "loss": 0.0063, "step": 5952 }, { "epoch": 1.5706371191135733, "grad_norm": 0.032697293907403946, "learning_rate": 5.621032269410006e-06, "loss": 0.0019, "step": 5954 }, { "epoch": 1.5711647539902387, "grad_norm": 0.26440325379371643, "learning_rate": 5.620680559219203e-06, "loss": 0.0105, "step": 5956 }, { "epoch": 1.571692388866904, "grad_norm": 0.015228623524308205, "learning_rate": 5.620328849028401e-06, "loss": 0.0075, "step": 5958 }, { "epoch": 1.5722200237435695, "grad_norm": 0.4992796778678894, "learning_rate": 5.619977138837598e-06, "loss": 0.0141, "step": 5960 }, { "epoch": 1.572747658620235, "grad_norm": 0.2156374752521515, "learning_rate": 5.619625428646795e-06, "loss": 0.0144, "step": 5962 }, { "epoch": 1.5732752934969, "grad_norm": 1.2874001264572144, "learning_rate": 5.619273718455992e-06, "loss": 0.0074, "step": 5964 }, { "epoch": 1.5738029283735655, "grad_norm": 0.8684878349304199, "learning_rate": 5.61892200826519e-06, "loss": 0.0099, "step": 5966 }, { "epoch": 1.5743305632502307, "grad_norm": 0.2159098982810974, "learning_rate": 5.618570298074387e-06, "loss": 0.0027, "step": 5968 }, { "epoch": 1.574858198126896, "grad_norm": 0.49903321266174316, "learning_rate": 5.618218587883584e-06, "loss": 0.0219, "step": 5970 }, { "epoch": 1.5753858330035615, "grad_norm": 0.09320463240146637, "learning_rate": 5.6178668776927815e-06, "loss": 0.0047, "step": 5972 }, { "epoch": 1.575913467880227, "grad_norm": 0.7888954281806946, "learning_rate": 5.6175151675019785e-06, "loss": 0.003, "step": 5974 }, { "epoch": 1.5764411027568923, "grad_norm": 0.07666453719139099, "learning_rate": 5.6171634573111754e-06, "loss": 0.0039, "step": 5976 }, { "epoch": 1.5769687376335577, "grad_norm": 0.5759866833686829, "learning_rate": 5.616811747120373e-06, "loss": 0.0063, "step": 5978 }, { "epoch": 1.577496372510223, "grad_norm": 0.6646282076835632, "learning_rate": 5.61646003692957e-06, "loss": 0.0133, "step": 5980 }, { "epoch": 1.5780240073868883, "grad_norm": 0.023144451901316643, "learning_rate": 5.616108326738768e-06, "loss": 0.0014, "step": 5982 }, { "epoch": 1.5785516422635535, "grad_norm": 0.12300408631563187, "learning_rate": 5.615756616547964e-06, "loss": 0.0015, "step": 5984 }, { "epoch": 1.579079277140219, "grad_norm": 0.9845938086509705, "learning_rate": 5.615404906357162e-06, "loss": 0.0043, "step": 5986 }, { "epoch": 1.5796069120168843, "grad_norm": 0.267657071352005, "learning_rate": 5.615053196166359e-06, "loss": 0.0023, "step": 5988 }, { "epoch": 1.5801345468935497, "grad_norm": 0.023009195923805237, "learning_rate": 5.614701485975557e-06, "loss": 0.0068, "step": 5990 }, { "epoch": 1.5806621817702151, "grad_norm": 0.04372045770287514, "learning_rate": 5.614349775784754e-06, "loss": 0.0014, "step": 5992 }, { "epoch": 1.5811898166468805, "grad_norm": 0.30238640308380127, "learning_rate": 5.6139980655939505e-06, "loss": 0.0021, "step": 5994 }, { "epoch": 1.5817174515235457, "grad_norm": 0.04424754157662392, "learning_rate": 5.613646355403148e-06, "loss": 0.0014, "step": 5996 }, { "epoch": 1.582245086400211, "grad_norm": 0.6934711337089539, "learning_rate": 5.613294645212345e-06, "loss": 0.0041, "step": 5998 }, { "epoch": 1.5827727212768763, "grad_norm": 0.024954821914434433, "learning_rate": 5.612942935021542e-06, "loss": 0.0043, "step": 6000 }, { "epoch": 1.5827727212768763, "eval_loss": 0.0040659294463694096, "eval_runtime": 307.2447, "eval_samples_per_second": 701.854, "eval_steps_per_second": 87.735, "step": 6000 }, { "epoch": 1.5833003561535417, "grad_norm": 0.683167040348053, "learning_rate": 5.612591224830739e-06, "loss": 0.0152, "step": 6002 }, { "epoch": 1.583827991030207, "grad_norm": 0.10679124295711517, "learning_rate": 5.612239514639937e-06, "loss": 0.0111, "step": 6004 }, { "epoch": 1.5843556259068725, "grad_norm": 0.04637598618865013, "learning_rate": 5.611887804449134e-06, "loss": 0.0013, "step": 6006 }, { "epoch": 1.584883260783538, "grad_norm": 0.4146021902561188, "learning_rate": 5.611536094258331e-06, "loss": 0.0031, "step": 6008 }, { "epoch": 1.585410895660203, "grad_norm": 0.36938345432281494, "learning_rate": 5.611184384067529e-06, "loss": 0.0057, "step": 6010 }, { "epoch": 1.5859385305368685, "grad_norm": 0.4056265652179718, "learning_rate": 5.610832673876726e-06, "loss": 0.009, "step": 6012 }, { "epoch": 1.5864661654135337, "grad_norm": 0.08081087470054626, "learning_rate": 5.610480963685923e-06, "loss": 0.0017, "step": 6014 }, { "epoch": 1.586993800290199, "grad_norm": 0.035161975771188736, "learning_rate": 5.61012925349512e-06, "loss": 0.0015, "step": 6016 }, { "epoch": 1.5875214351668645, "grad_norm": 0.01879427768290043, "learning_rate": 5.609777543304317e-06, "loss": 0.0056, "step": 6018 }, { "epoch": 1.58804907004353, "grad_norm": 0.3979986906051636, "learning_rate": 5.609425833113515e-06, "loss": 0.0119, "step": 6020 }, { "epoch": 1.5885767049201953, "grad_norm": 0.400405615568161, "learning_rate": 5.609074122922711e-06, "loss": 0.0073, "step": 6022 }, { "epoch": 1.5891043397968607, "grad_norm": 0.05340639501810074, "learning_rate": 5.608722412731909e-06, "loss": 0.0016, "step": 6024 }, { "epoch": 1.589631974673526, "grad_norm": 0.6934317350387573, "learning_rate": 5.608370702541106e-06, "loss": 0.0027, "step": 6026 }, { "epoch": 1.5901596095501913, "grad_norm": 0.5001693367958069, "learning_rate": 5.608018992350304e-06, "loss": 0.0035, "step": 6028 }, { "epoch": 1.5906872444268565, "grad_norm": 0.017052682116627693, "learning_rate": 5.607667282159501e-06, "loss": 0.0012, "step": 6030 }, { "epoch": 1.591214879303522, "grad_norm": 0.20623713731765747, "learning_rate": 5.607315571968698e-06, "loss": 0.0023, "step": 6032 }, { "epoch": 1.5917425141801873, "grad_norm": 0.6897438168525696, "learning_rate": 5.6069638617778955e-06, "loss": 0.0044, "step": 6034 }, { "epoch": 1.5922701490568527, "grad_norm": 0.04417767748236656, "learning_rate": 5.6066121515870925e-06, "loss": 0.0015, "step": 6036 }, { "epoch": 1.5927977839335181, "grad_norm": 0.39092734456062317, "learning_rate": 5.606260441396289e-06, "loss": 0.0149, "step": 6038 }, { "epoch": 1.5933254188101833, "grad_norm": 0.09169255942106247, "learning_rate": 5.605908731205487e-06, "loss": 0.0015, "step": 6040 }, { "epoch": 1.5938530536868487, "grad_norm": 0.3997108042240143, "learning_rate": 5.605557021014684e-06, "loss": 0.004, "step": 6042 }, { "epoch": 1.594380688563514, "grad_norm": 0.21614471077919006, "learning_rate": 5.605205310823881e-06, "loss": 0.0065, "step": 6044 }, { "epoch": 1.5949083234401793, "grad_norm": 0.6814819574356079, "learning_rate": 5.604853600633078e-06, "loss": 0.0061, "step": 6046 }, { "epoch": 1.5954359583168447, "grad_norm": 0.17829982936382294, "learning_rate": 5.604501890442276e-06, "loss": 0.0102, "step": 6048 }, { "epoch": 1.59596359319351, "grad_norm": 0.25281989574432373, "learning_rate": 5.604150180251473e-06, "loss": 0.0025, "step": 6050 }, { "epoch": 1.5964912280701755, "grad_norm": 0.4806809425354004, "learning_rate": 5.60379847006067e-06, "loss": 0.0057, "step": 6052 }, { "epoch": 1.597018862946841, "grad_norm": 0.03691401332616806, "learning_rate": 5.603446759869868e-06, "loss": 0.0044, "step": 6054 }, { "epoch": 1.597546497823506, "grad_norm": 0.23109938204288483, "learning_rate": 5.6030950496790645e-06, "loss": 0.0042, "step": 6056 }, { "epoch": 1.5980741327001715, "grad_norm": 0.05954277515411377, "learning_rate": 5.602743339488262e-06, "loss": 0.0015, "step": 6058 }, { "epoch": 1.5986017675768367, "grad_norm": 0.06592543423175812, "learning_rate": 5.6023916292974584e-06, "loss": 0.0168, "step": 6060 }, { "epoch": 1.599129402453502, "grad_norm": 0.010315890423953533, "learning_rate": 5.602039919106656e-06, "loss": 0.0043, "step": 6062 }, { "epoch": 1.5996570373301675, "grad_norm": 0.06240810453891754, "learning_rate": 5.601688208915854e-06, "loss": 0.0044, "step": 6064 }, { "epoch": 1.600184672206833, "grad_norm": 0.015061081387102604, "learning_rate": 5.60133649872505e-06, "loss": 0.003, "step": 6066 }, { "epoch": 1.6007123070834983, "grad_norm": 0.016121190041303635, "learning_rate": 5.600984788534248e-06, "loss": 0.0061, "step": 6068 }, { "epoch": 1.6012399419601637, "grad_norm": 0.009327448904514313, "learning_rate": 5.600633078343445e-06, "loss": 0.0015, "step": 6070 }, { "epoch": 1.601767576836829, "grad_norm": 0.04170222580432892, "learning_rate": 5.600281368152643e-06, "loss": 0.0014, "step": 6072 }, { "epoch": 1.6022952117134943, "grad_norm": 0.03340314328670502, "learning_rate": 5.59992965796184e-06, "loss": 0.0025, "step": 6074 }, { "epoch": 1.6028228465901595, "grad_norm": 0.5314703583717346, "learning_rate": 5.599577947771037e-06, "loss": 0.0099, "step": 6076 }, { "epoch": 1.603350481466825, "grad_norm": 0.23405936360359192, "learning_rate": 5.599226237580234e-06, "loss": 0.0098, "step": 6078 }, { "epoch": 1.6038781163434903, "grad_norm": 0.4645594656467438, "learning_rate": 5.598874527389431e-06, "loss": 0.003, "step": 6080 }, { "epoch": 1.6044057512201557, "grad_norm": 0.012774769216775894, "learning_rate": 5.598522817198628e-06, "loss": 0.0024, "step": 6082 }, { "epoch": 1.6049333860968211, "grad_norm": 0.011375200934708118, "learning_rate": 5.598171107007825e-06, "loss": 0.001, "step": 6084 }, { "epoch": 1.6054610209734863, "grad_norm": 0.08609773218631744, "learning_rate": 5.597819396817023e-06, "loss": 0.0013, "step": 6086 }, { "epoch": 1.6059886558501517, "grad_norm": 0.04342079907655716, "learning_rate": 5.597467686626221e-06, "loss": 0.0012, "step": 6088 }, { "epoch": 1.606516290726817, "grad_norm": 0.049233920872211456, "learning_rate": 5.597115976435417e-06, "loss": 0.0084, "step": 6090 }, { "epoch": 1.6070439256034823, "grad_norm": 0.040253035724163055, "learning_rate": 5.596764266244615e-06, "loss": 0.0013, "step": 6092 }, { "epoch": 1.6075715604801477, "grad_norm": 0.10872682183980942, "learning_rate": 5.596412556053812e-06, "loss": 0.0057, "step": 6094 }, { "epoch": 1.6080991953568131, "grad_norm": 0.02487105503678322, "learning_rate": 5.596060845863009e-06, "loss": 0.0012, "step": 6096 }, { "epoch": 1.6086268302334785, "grad_norm": 0.9914295673370361, "learning_rate": 5.5957091356722065e-06, "loss": 0.0194, "step": 6098 }, { "epoch": 1.609154465110144, "grad_norm": 0.026640290394425392, "learning_rate": 5.595357425481403e-06, "loss": 0.0011, "step": 6100 }, { "epoch": 1.6096820999868091, "grad_norm": 0.029896708205342293, "learning_rate": 5.595005715290601e-06, "loss": 0.0121, "step": 6102 }, { "epoch": 1.6102097348634745, "grad_norm": 0.10118181258440018, "learning_rate": 5.594654005099797e-06, "loss": 0.011, "step": 6104 }, { "epoch": 1.6107373697401397, "grad_norm": 2.184718132019043, "learning_rate": 5.594302294908995e-06, "loss": 0.0122, "step": 6106 }, { "epoch": 1.611265004616805, "grad_norm": 0.11866839975118637, "learning_rate": 5.593950584718192e-06, "loss": 0.0063, "step": 6108 }, { "epoch": 1.6117926394934705, "grad_norm": 0.0652252584695816, "learning_rate": 5.59359887452739e-06, "loss": 0.007, "step": 6110 }, { "epoch": 1.612320274370136, "grad_norm": 0.028215574100613594, "learning_rate": 5.593247164336587e-06, "loss": 0.0015, "step": 6112 }, { "epoch": 1.6128479092468013, "grad_norm": 0.4336148202419281, "learning_rate": 5.592895454145784e-06, "loss": 0.0132, "step": 6114 }, { "epoch": 1.6133755441234665, "grad_norm": 0.0435764454305172, "learning_rate": 5.5925437439549816e-06, "loss": 0.0014, "step": 6116 }, { "epoch": 1.613903179000132, "grad_norm": 0.03602185472846031, "learning_rate": 5.5921920337641785e-06, "loss": 0.0012, "step": 6118 }, { "epoch": 1.614430813876797, "grad_norm": 0.010843116790056229, "learning_rate": 5.5918403235733755e-06, "loss": 0.0013, "step": 6120 }, { "epoch": 1.6149584487534625, "grad_norm": 0.21004542708396912, "learning_rate": 5.591488613382573e-06, "loss": 0.0066, "step": 6122 }, { "epoch": 1.615486083630128, "grad_norm": 0.24136362969875336, "learning_rate": 5.59113690319177e-06, "loss": 0.0077, "step": 6124 }, { "epoch": 1.6160137185067933, "grad_norm": 0.19544069468975067, "learning_rate": 5.590785193000967e-06, "loss": 0.0015, "step": 6126 }, { "epoch": 1.6165413533834587, "grad_norm": 0.05432740971446037, "learning_rate": 5.590433482810164e-06, "loss": 0.0017, "step": 6128 }, { "epoch": 1.6170689882601241, "grad_norm": 0.03239517658948898, "learning_rate": 5.590081772619362e-06, "loss": 0.0028, "step": 6130 }, { "epoch": 1.6175966231367893, "grad_norm": 0.0586702898144722, "learning_rate": 5.589730062428559e-06, "loss": 0.0013, "step": 6132 }, { "epoch": 1.6181242580134547, "grad_norm": 1.7554187774658203, "learning_rate": 5.589378352237756e-06, "loss": 0.0083, "step": 6134 }, { "epoch": 1.61865189289012, "grad_norm": 0.38865724205970764, "learning_rate": 5.589026642046954e-06, "loss": 0.002, "step": 6136 }, { "epoch": 1.6191795277667853, "grad_norm": 2.357633113861084, "learning_rate": 5.588674931856151e-06, "loss": 0.0099, "step": 6138 }, { "epoch": 1.6197071626434507, "grad_norm": 0.01906571350991726, "learning_rate": 5.588323221665348e-06, "loss": 0.0151, "step": 6140 }, { "epoch": 1.6202347975201161, "grad_norm": 0.0530710332095623, "learning_rate": 5.5879715114745445e-06, "loss": 0.0012, "step": 6142 }, { "epoch": 1.6207624323967815, "grad_norm": 0.039040159434080124, "learning_rate": 5.587619801283742e-06, "loss": 0.0012, "step": 6144 }, { "epoch": 1.621290067273447, "grad_norm": 0.03548073023557663, "learning_rate": 5.58726809109294e-06, "loss": 0.0036, "step": 6146 }, { "epoch": 1.6218177021501121, "grad_norm": 0.3812026083469391, "learning_rate": 5.586916380902137e-06, "loss": 0.0039, "step": 6148 }, { "epoch": 1.6223453370267775, "grad_norm": 0.17885509133338928, "learning_rate": 5.586564670711334e-06, "loss": 0.0136, "step": 6150 }, { "epoch": 1.6228729719034427, "grad_norm": 0.43733885884284973, "learning_rate": 5.586212960520531e-06, "loss": 0.0045, "step": 6152 }, { "epoch": 1.6234006067801081, "grad_norm": 0.27510249614715576, "learning_rate": 5.585861250329729e-06, "loss": 0.0063, "step": 6154 }, { "epoch": 1.6239282416567735, "grad_norm": 0.0401649996638298, "learning_rate": 5.585509540138926e-06, "loss": 0.0013, "step": 6156 }, { "epoch": 1.624455876533439, "grad_norm": 0.022634359076619148, "learning_rate": 5.585157829948123e-06, "loss": 0.0086, "step": 6158 }, { "epoch": 1.6249835114101043, "grad_norm": 0.36849498748779297, "learning_rate": 5.5848061197573205e-06, "loss": 0.0201, "step": 6160 }, { "epoch": 1.6255111462867695, "grad_norm": 1.087669014930725, "learning_rate": 5.584454409566517e-06, "loss": 0.0072, "step": 6162 }, { "epoch": 1.626038781163435, "grad_norm": 0.2503262460231781, "learning_rate": 5.584102699375714e-06, "loss": 0.0108, "step": 6164 }, { "epoch": 1.6265664160401, "grad_norm": 0.02975691296160221, "learning_rate": 5.583750989184911e-06, "loss": 0.0065, "step": 6166 }, { "epoch": 1.6270940509167655, "grad_norm": 0.1503678560256958, "learning_rate": 5.583399278994109e-06, "loss": 0.0016, "step": 6168 }, { "epoch": 1.627621685793431, "grad_norm": 0.4372195601463318, "learning_rate": 5.583047568803306e-06, "loss": 0.0047, "step": 6170 }, { "epoch": 1.6281493206700963, "grad_norm": 0.39683425426483154, "learning_rate": 5.582695858612503e-06, "loss": 0.002, "step": 6172 }, { "epoch": 1.6286769555467617, "grad_norm": 0.3209620714187622, "learning_rate": 5.582344148421701e-06, "loss": 0.0049, "step": 6174 }, { "epoch": 1.6292045904234271, "grad_norm": 0.06526808440685272, "learning_rate": 5.581992438230898e-06, "loss": 0.0017, "step": 6176 }, { "epoch": 1.6297322253000923, "grad_norm": 0.43388596177101135, "learning_rate": 5.5816407280400956e-06, "loss": 0.0106, "step": 6178 }, { "epoch": 1.6302598601767577, "grad_norm": 0.13679993152618408, "learning_rate": 5.5812890178492925e-06, "loss": 0.0019, "step": 6180 }, { "epoch": 1.630787495053423, "grad_norm": 0.033899735659360886, "learning_rate": 5.5809373076584895e-06, "loss": 0.0053, "step": 6182 }, { "epoch": 1.6313151299300883, "grad_norm": 0.11138220131397247, "learning_rate": 5.580585597467687e-06, "loss": 0.0016, "step": 6184 }, { "epoch": 1.6318427648067537, "grad_norm": 0.06881757080554962, "learning_rate": 5.580233887276883e-06, "loss": 0.0023, "step": 6186 }, { "epoch": 1.6323703996834191, "grad_norm": 0.15435583889484406, "learning_rate": 5.579882177086081e-06, "loss": 0.0022, "step": 6188 }, { "epoch": 1.6328980345600845, "grad_norm": 0.13278628885746002, "learning_rate": 5.579530466895278e-06, "loss": 0.0063, "step": 6190 }, { "epoch": 1.6334256694367497, "grad_norm": 0.017994467169046402, "learning_rate": 5.579178756704476e-06, "loss": 0.0011, "step": 6192 }, { "epoch": 1.6339533043134151, "grad_norm": 0.3334842622280121, "learning_rate": 5.578827046513673e-06, "loss": 0.0171, "step": 6194 }, { "epoch": 1.6344809391900803, "grad_norm": 0.3690873682498932, "learning_rate": 5.57847533632287e-06, "loss": 0.0032, "step": 6196 }, { "epoch": 1.6350085740667457, "grad_norm": 0.016958871856331825, "learning_rate": 5.578123626132068e-06, "loss": 0.0012, "step": 6198 }, { "epoch": 1.6355362089434111, "grad_norm": 0.3055766522884369, "learning_rate": 5.577771915941265e-06, "loss": 0.0023, "step": 6200 }, { "epoch": 1.6360638438200765, "grad_norm": 0.021885791793465614, "learning_rate": 5.5774202057504615e-06, "loss": 0.0011, "step": 6202 }, { "epoch": 1.636591478696742, "grad_norm": 0.01269014272838831, "learning_rate": 5.577068495559659e-06, "loss": 0.001, "step": 6204 }, { "epoch": 1.6371191135734073, "grad_norm": 0.022319694980978966, "learning_rate": 5.576716785368856e-06, "loss": 0.0011, "step": 6206 }, { "epoch": 1.6376467484500725, "grad_norm": 0.35511377453804016, "learning_rate": 5.576365075178054e-06, "loss": 0.0055, "step": 6208 }, { "epoch": 1.638174383326738, "grad_norm": 0.09283630549907684, "learning_rate": 5.57601336498725e-06, "loss": 0.0014, "step": 6210 }, { "epoch": 1.638702018203403, "grad_norm": 0.016969159245491028, "learning_rate": 5.575661654796448e-06, "loss": 0.001, "step": 6212 }, { "epoch": 1.6392296530800685, "grad_norm": 0.01325088832527399, "learning_rate": 5.575309944605645e-06, "loss": 0.001, "step": 6214 }, { "epoch": 1.639757287956734, "grad_norm": 0.26524144411087036, "learning_rate": 5.574958234414842e-06, "loss": 0.0043, "step": 6216 }, { "epoch": 1.6402849228333993, "grad_norm": 0.11319736391305923, "learning_rate": 5.57460652422404e-06, "loss": 0.0086, "step": 6218 }, { "epoch": 1.6408125577100647, "grad_norm": 0.0836726501584053, "learning_rate": 5.574254814033237e-06, "loss": 0.0013, "step": 6220 }, { "epoch": 1.6413401925867301, "grad_norm": 0.0409582257270813, "learning_rate": 5.5739031038424344e-06, "loss": 0.0011, "step": 6222 }, { "epoch": 1.6418678274633953, "grad_norm": 0.08067189157009125, "learning_rate": 5.5735513936516306e-06, "loss": 0.0013, "step": 6224 }, { "epoch": 1.6423954623400607, "grad_norm": 0.018967777490615845, "learning_rate": 5.573199683460828e-06, "loss": 0.001, "step": 6226 }, { "epoch": 1.642923097216726, "grad_norm": 0.018793251365423203, "learning_rate": 5.572847973270025e-06, "loss": 0.0011, "step": 6228 }, { "epoch": 1.6434507320933913, "grad_norm": 0.04107720032334328, "learning_rate": 5.572496263079223e-06, "loss": 0.0042, "step": 6230 }, { "epoch": 1.6439783669700567, "grad_norm": 0.1640511453151703, "learning_rate": 5.57214455288842e-06, "loss": 0.001, "step": 6232 }, { "epoch": 1.6445060018467221, "grad_norm": 0.2531209886074066, "learning_rate": 5.571792842697617e-06, "loss": 0.002, "step": 6234 }, { "epoch": 1.6450336367233875, "grad_norm": 0.01573321968317032, "learning_rate": 5.571441132506815e-06, "loss": 0.0022, "step": 6236 }, { "epoch": 1.6455612716000527, "grad_norm": 0.2742309272289276, "learning_rate": 5.571089422316012e-06, "loss": 0.0076, "step": 6238 }, { "epoch": 1.6460889064767181, "grad_norm": 0.6073260307312012, "learning_rate": 5.570737712125209e-06, "loss": 0.0014, "step": 6240 }, { "epoch": 1.6466165413533833, "grad_norm": 0.010071164928376675, "learning_rate": 5.5703860019344065e-06, "loss": 0.001, "step": 6242 }, { "epoch": 1.6471441762300487, "grad_norm": 0.02033088728785515, "learning_rate": 5.5700342917436035e-06, "loss": 0.001, "step": 6244 }, { "epoch": 1.6476718111067141, "grad_norm": 0.013133624568581581, "learning_rate": 5.5696825815528e-06, "loss": 0.001, "step": 6246 }, { "epoch": 1.6481994459833795, "grad_norm": 0.11628585308790207, "learning_rate": 5.569330871361997e-06, "loss": 0.0105, "step": 6248 }, { "epoch": 1.648727080860045, "grad_norm": 0.12025776505470276, "learning_rate": 5.568979161171195e-06, "loss": 0.0059, "step": 6250 }, { "epoch": 1.6492547157367103, "grad_norm": 0.01595199666917324, "learning_rate": 5.568627450980392e-06, "loss": 0.003, "step": 6252 }, { "epoch": 1.6497823506133755, "grad_norm": 0.6729468107223511, "learning_rate": 5.568275740789589e-06, "loss": 0.0047, "step": 6254 }, { "epoch": 1.650309985490041, "grad_norm": 0.01427797507494688, "learning_rate": 5.567924030598787e-06, "loss": 0.0011, "step": 6256 }, { "epoch": 1.6508376203667061, "grad_norm": 0.20824278891086578, "learning_rate": 5.567572320407984e-06, "loss": 0.0014, "step": 6258 }, { "epoch": 1.6513652552433715, "grad_norm": 0.18766021728515625, "learning_rate": 5.567220610217182e-06, "loss": 0.0013, "step": 6260 }, { "epoch": 1.651892890120037, "grad_norm": 0.08325053751468658, "learning_rate": 5.566868900026379e-06, "loss": 0.0125, "step": 6262 }, { "epoch": 1.6524205249967023, "grad_norm": 0.11482401192188263, "learning_rate": 5.5665171898355755e-06, "loss": 0.0209, "step": 6264 }, { "epoch": 1.6529481598733677, "grad_norm": 0.011677229776978493, "learning_rate": 5.566165479644773e-06, "loss": 0.0011, "step": 6266 }, { "epoch": 1.653475794750033, "grad_norm": 0.013308722525835037, "learning_rate": 5.56581376945397e-06, "loss": 0.0011, "step": 6268 }, { "epoch": 1.6540034296266983, "grad_norm": 0.03132570534944534, "learning_rate": 5.565462059263167e-06, "loss": 0.0012, "step": 6270 }, { "epoch": 1.6545310645033635, "grad_norm": 0.06385908275842667, "learning_rate": 5.565110349072364e-06, "loss": 0.0023, "step": 6272 }, { "epoch": 1.655058699380029, "grad_norm": 0.036379143595695496, "learning_rate": 5.564758638881562e-06, "loss": 0.0011, "step": 6274 }, { "epoch": 1.6555863342566943, "grad_norm": 0.2242639809846878, "learning_rate": 5.564406928690759e-06, "loss": 0.0024, "step": 6276 }, { "epoch": 1.6561139691333597, "grad_norm": 0.1648469865322113, "learning_rate": 5.564055218499956e-06, "loss": 0.0013, "step": 6278 }, { "epoch": 1.6566416040100251, "grad_norm": 0.09749012440443039, "learning_rate": 5.563703508309154e-06, "loss": 0.016, "step": 6280 }, { "epoch": 1.6571692388866905, "grad_norm": 0.06313823908567429, "learning_rate": 5.563351798118351e-06, "loss": 0.0045, "step": 6282 }, { "epoch": 1.6576968737633557, "grad_norm": 0.992618203163147, "learning_rate": 5.563000087927548e-06, "loss": 0.003, "step": 6284 }, { "epoch": 1.6582245086400211, "grad_norm": 0.06228332221508026, "learning_rate": 5.5626483777367445e-06, "loss": 0.0012, "step": 6286 }, { "epoch": 1.6587521435166863, "grad_norm": 0.029599396511912346, "learning_rate": 5.562296667545942e-06, "loss": 0.0011, "step": 6288 }, { "epoch": 1.6592797783933517, "grad_norm": 0.06257214397192001, "learning_rate": 5.56194495735514e-06, "loss": 0.0077, "step": 6290 }, { "epoch": 1.6598074132700171, "grad_norm": 0.35311922430992126, "learning_rate": 5.561593247164336e-06, "loss": 0.003, "step": 6292 }, { "epoch": 1.6603350481466825, "grad_norm": 0.24035432934761047, "learning_rate": 5.561241536973534e-06, "loss": 0.01, "step": 6294 }, { "epoch": 1.660862683023348, "grad_norm": 0.01974809542298317, "learning_rate": 5.560889826782731e-06, "loss": 0.0062, "step": 6296 }, { "epoch": 1.6613903179000133, "grad_norm": 0.15304240584373474, "learning_rate": 5.560538116591929e-06, "loss": 0.0014, "step": 6298 }, { "epoch": 1.6619179527766785, "grad_norm": 0.034163933247327805, "learning_rate": 5.560186406401126e-06, "loss": 0.0012, "step": 6300 }, { "epoch": 1.662445587653344, "grad_norm": 0.16799978911876678, "learning_rate": 5.559834696210323e-06, "loss": 0.0022, "step": 6302 }, { "epoch": 1.6629732225300091, "grad_norm": 0.03482669219374657, "learning_rate": 5.5594829860195205e-06, "loss": 0.0023, "step": 6304 }, { "epoch": 1.6635008574066745, "grad_norm": 0.7627435922622681, "learning_rate": 5.5591312758287175e-06, "loss": 0.0095, "step": 6306 }, { "epoch": 1.66402849228334, "grad_norm": 0.0731622576713562, "learning_rate": 5.558779565637914e-06, "loss": 0.0038, "step": 6308 }, { "epoch": 1.6645561271600053, "grad_norm": 0.16579946875572205, "learning_rate": 5.558427855447111e-06, "loss": 0.0033, "step": 6310 }, { "epoch": 1.6650837620366707, "grad_norm": 1.060270071029663, "learning_rate": 5.558076145256309e-06, "loss": 0.0128, "step": 6312 }, { "epoch": 1.665611396913336, "grad_norm": 0.39316311478614807, "learning_rate": 5.557724435065506e-06, "loss": 0.0025, "step": 6314 }, { "epoch": 1.6661390317900013, "grad_norm": 0.12508831918239594, "learning_rate": 5.557372724874703e-06, "loss": 0.0032, "step": 6316 }, { "epoch": 1.6666666666666665, "grad_norm": 0.027907220646739006, "learning_rate": 5.557021014683901e-06, "loss": 0.0054, "step": 6318 }, { "epoch": 1.667194301543332, "grad_norm": 0.5469873547554016, "learning_rate": 5.556669304493098e-06, "loss": 0.0035, "step": 6320 }, { "epoch": 1.6677219364199973, "grad_norm": 0.22418464720249176, "learning_rate": 5.556317594302295e-06, "loss": 0.0018, "step": 6322 }, { "epoch": 1.6682495712966627, "grad_norm": 0.4780941605567932, "learning_rate": 5.5559658841114926e-06, "loss": 0.0022, "step": 6324 }, { "epoch": 1.6687772061733281, "grad_norm": 0.3465365469455719, "learning_rate": 5.5556141739206895e-06, "loss": 0.002, "step": 6326 }, { "epoch": 1.6693048410499935, "grad_norm": 0.20127196609973907, "learning_rate": 5.555262463729887e-06, "loss": 0.0119, "step": 6328 }, { "epoch": 1.6698324759266587, "grad_norm": 0.04245069622993469, "learning_rate": 5.5549107535390834e-06, "loss": 0.0018, "step": 6330 }, { "epoch": 1.6703601108033241, "grad_norm": 0.05276680737733841, "learning_rate": 5.554559043348281e-06, "loss": 0.0078, "step": 6332 }, { "epoch": 1.6708877456799893, "grad_norm": 0.40984249114990234, "learning_rate": 5.554207333157478e-06, "loss": 0.0139, "step": 6334 }, { "epoch": 1.6714153805566547, "grad_norm": 0.024850282818078995, "learning_rate": 5.553855622966676e-06, "loss": 0.0011, "step": 6336 }, { "epoch": 1.6719430154333201, "grad_norm": 0.019042575731873512, "learning_rate": 5.553503912775873e-06, "loss": 0.001, "step": 6338 }, { "epoch": 1.6724706503099855, "grad_norm": 0.23486392199993134, "learning_rate": 5.55315220258507e-06, "loss": 0.0015, "step": 6340 }, { "epoch": 1.672998285186651, "grad_norm": 0.028405526652932167, "learning_rate": 5.552800492394268e-06, "loss": 0.001, "step": 6342 }, { "epoch": 1.6735259200633161, "grad_norm": 0.36339429020881653, "learning_rate": 5.552448782203464e-06, "loss": 0.0121, "step": 6344 }, { "epoch": 1.6740535549399815, "grad_norm": 0.05787444859743118, "learning_rate": 5.552097072012662e-06, "loss": 0.0153, "step": 6346 }, { "epoch": 1.6745811898166467, "grad_norm": 0.027688754722476006, "learning_rate": 5.551745361821859e-06, "loss": 0.0012, "step": 6348 }, { "epoch": 1.6751088246933121, "grad_norm": 0.05258029326796532, "learning_rate": 5.551393651631056e-06, "loss": 0.0013, "step": 6350 }, { "epoch": 1.6756364595699775, "grad_norm": 0.7286681532859802, "learning_rate": 5.551041941440253e-06, "loss": 0.0063, "step": 6352 }, { "epoch": 1.676164094446643, "grad_norm": 0.019825518131256104, "learning_rate": 5.55069023124945e-06, "loss": 0.001, "step": 6354 }, { "epoch": 1.6766917293233083, "grad_norm": 0.026216603815555573, "learning_rate": 5.550338521058648e-06, "loss": 0.0014, "step": 6356 }, { "epoch": 1.6772193641999738, "grad_norm": 0.9137712121009827, "learning_rate": 5.549986810867845e-06, "loss": 0.0151, "step": 6358 }, { "epoch": 1.677746999076639, "grad_norm": 0.04881402105093002, "learning_rate": 5.549635100677042e-06, "loss": 0.0012, "step": 6360 }, { "epoch": 1.6782746339533043, "grad_norm": 0.16634151339530945, "learning_rate": 5.54928339048624e-06, "loss": 0.012, "step": 6362 }, { "epoch": 1.6788022688299695, "grad_norm": 0.33897775411605835, "learning_rate": 5.548931680295437e-06, "loss": 0.0038, "step": 6364 }, { "epoch": 1.679329903706635, "grad_norm": 0.0338318757712841, "learning_rate": 5.5485799701046345e-06, "loss": 0.0022, "step": 6366 }, { "epoch": 1.6798575385833003, "grad_norm": 0.033097684383392334, "learning_rate": 5.548228259913831e-06, "loss": 0.0038, "step": 6368 }, { "epoch": 1.6803851734599657, "grad_norm": 0.12019284814596176, "learning_rate": 5.547876549723028e-06, "loss": 0.0019, "step": 6370 }, { "epoch": 1.6809128083366311, "grad_norm": 0.028788302093744278, "learning_rate": 5.547524839532225e-06, "loss": 0.0042, "step": 6372 }, { "epoch": 1.6814404432132966, "grad_norm": 0.3517928421497345, "learning_rate": 5.547173129341422e-06, "loss": 0.0037, "step": 6374 }, { "epoch": 1.6819680780899617, "grad_norm": 0.7371252775192261, "learning_rate": 5.54682141915062e-06, "loss": 0.002, "step": 6376 }, { "epoch": 1.6824957129666271, "grad_norm": 0.044404368847608566, "learning_rate": 5.546469708959817e-06, "loss": 0.0013, "step": 6378 }, { "epoch": 1.6830233478432923, "grad_norm": 0.029225105419754982, "learning_rate": 5.546117998769015e-06, "loss": 0.0013, "step": 6380 }, { "epoch": 1.6835509827199577, "grad_norm": 0.010552137158811092, "learning_rate": 5.545766288578212e-06, "loss": 0.0011, "step": 6382 }, { "epoch": 1.6840786175966231, "grad_norm": 0.022134574130177498, "learning_rate": 5.545414578387409e-06, "loss": 0.0013, "step": 6384 }, { "epoch": 1.6846062524732885, "grad_norm": 0.025616994127631187, "learning_rate": 5.5450628681966066e-06, "loss": 0.0034, "step": 6386 }, { "epoch": 1.685133887349954, "grad_norm": 0.12945124506950378, "learning_rate": 5.5447111580058035e-06, "loss": 0.0195, "step": 6388 }, { "epoch": 1.6856615222266191, "grad_norm": 0.13717690110206604, "learning_rate": 5.5443594478150005e-06, "loss": 0.0114, "step": 6390 }, { "epoch": 1.6861891571032845, "grad_norm": 1.153249979019165, "learning_rate": 5.544007737624197e-06, "loss": 0.0023, "step": 6392 }, { "epoch": 1.6867167919799497, "grad_norm": 0.17204831540584564, "learning_rate": 5.543656027433395e-06, "loss": 0.0029, "step": 6394 }, { "epoch": 1.6872444268566151, "grad_norm": 0.24158649146556854, "learning_rate": 5.543304317242592e-06, "loss": 0.0021, "step": 6396 }, { "epoch": 1.6877720617332805, "grad_norm": 0.5682036876678467, "learning_rate": 5.542952607051789e-06, "loss": 0.011, "step": 6398 }, { "epoch": 1.688299696609946, "grad_norm": 0.017822500318288803, "learning_rate": 5.542600896860987e-06, "loss": 0.001, "step": 6400 }, { "epoch": 1.6888273314866113, "grad_norm": 0.02328723855316639, "learning_rate": 5.542249186670184e-06, "loss": 0.0013, "step": 6402 }, { "epoch": 1.6893549663632768, "grad_norm": 0.015449532307684422, "learning_rate": 5.541897476479381e-06, "loss": 0.0094, "step": 6404 }, { "epoch": 1.689882601239942, "grad_norm": 0.26490703225135803, "learning_rate": 5.541545766288579e-06, "loss": 0.0087, "step": 6406 }, { "epoch": 1.6904102361166073, "grad_norm": 0.09832624346017838, "learning_rate": 5.541194056097776e-06, "loss": 0.0044, "step": 6408 }, { "epoch": 1.6909378709932725, "grad_norm": 0.09502290934324265, "learning_rate": 5.540842345906973e-06, "loss": 0.0012, "step": 6410 }, { "epoch": 1.691465505869938, "grad_norm": 0.18162031471729279, "learning_rate": 5.5404906357161695e-06, "loss": 0.0096, "step": 6412 }, { "epoch": 1.6919931407466033, "grad_norm": 0.10286914557218552, "learning_rate": 5.540138925525367e-06, "loss": 0.0053, "step": 6414 }, { "epoch": 1.6925207756232687, "grad_norm": 0.3106815218925476, "learning_rate": 5.539787215334564e-06, "loss": 0.0038, "step": 6416 }, { "epoch": 1.6930484104999342, "grad_norm": 0.041248418390750885, "learning_rate": 5.539435505143762e-06, "loss": 0.0013, "step": 6418 }, { "epoch": 1.6935760453765996, "grad_norm": 0.9328728318214417, "learning_rate": 5.539083794952959e-06, "loss": 0.0076, "step": 6420 }, { "epoch": 1.6941036802532647, "grad_norm": 0.15065202116966248, "learning_rate": 5.538732084762156e-06, "loss": 0.0032, "step": 6422 }, { "epoch": 1.69463131512993, "grad_norm": 0.2607967257499695, "learning_rate": 5.538380374571354e-06, "loss": 0.0124, "step": 6424 }, { "epoch": 1.6951589500065953, "grad_norm": 0.2541629374027252, "learning_rate": 5.538028664380551e-06, "loss": 0.002, "step": 6426 }, { "epoch": 1.6956865848832607, "grad_norm": 0.041131991893053055, "learning_rate": 5.537676954189748e-06, "loss": 0.0013, "step": 6428 }, { "epoch": 1.6962142197599261, "grad_norm": 0.3700435161590576, "learning_rate": 5.537325243998945e-06, "loss": 0.0026, "step": 6430 }, { "epoch": 1.6967418546365916, "grad_norm": 0.9755399227142334, "learning_rate": 5.536973533808142e-06, "loss": 0.0063, "step": 6432 }, { "epoch": 1.697269489513257, "grad_norm": 0.016258424147963524, "learning_rate": 5.536621823617339e-06, "loss": 0.001, "step": 6434 }, { "epoch": 1.6977971243899221, "grad_norm": 0.012892625294625759, "learning_rate": 5.536270113426536e-06, "loss": 0.0011, "step": 6436 }, { "epoch": 1.6983247592665875, "grad_norm": 0.017937738448381424, "learning_rate": 5.535918403235734e-06, "loss": 0.0011, "step": 6438 }, { "epoch": 1.6988523941432527, "grad_norm": 0.015347287058830261, "learning_rate": 5.535566693044931e-06, "loss": 0.001, "step": 6440 }, { "epoch": 1.6993800290199181, "grad_norm": 0.024849576875567436, "learning_rate": 5.535214982854128e-06, "loss": 0.0042, "step": 6442 }, { "epoch": 1.6999076638965835, "grad_norm": 0.7326050996780396, "learning_rate": 5.534863272663326e-06, "loss": 0.0044, "step": 6444 }, { "epoch": 1.700435298773249, "grad_norm": 0.19685456156730652, "learning_rate": 5.534511562472523e-06, "loss": 0.0087, "step": 6446 }, { "epoch": 1.7009629336499144, "grad_norm": 0.7069540619850159, "learning_rate": 5.5341598522817206e-06, "loss": 0.0054, "step": 6448 }, { "epoch": 1.7014905685265798, "grad_norm": 0.3687279224395752, "learning_rate": 5.533808142090917e-06, "loss": 0.0046, "step": 6450 }, { "epoch": 1.702018203403245, "grad_norm": 0.2868693470954895, "learning_rate": 5.5334564319001145e-06, "loss": 0.0017, "step": 6452 }, { "epoch": 1.7025458382799104, "grad_norm": 0.32621416449546814, "learning_rate": 5.533104721709311e-06, "loss": 0.0079, "step": 6454 }, { "epoch": 1.7030734731565755, "grad_norm": 0.02732204645872116, "learning_rate": 5.532753011518509e-06, "loss": 0.0013, "step": 6456 }, { "epoch": 1.703601108033241, "grad_norm": 0.05964028835296631, "learning_rate": 5.532401301327706e-06, "loss": 0.0013, "step": 6458 }, { "epoch": 1.7041287429099063, "grad_norm": 1.0784775018692017, "learning_rate": 5.532049591136903e-06, "loss": 0.0149, "step": 6460 }, { "epoch": 1.7046563777865718, "grad_norm": 0.1489546298980713, "learning_rate": 5.531697880946101e-06, "loss": 0.0019, "step": 6462 }, { "epoch": 1.7051840126632372, "grad_norm": 0.19637338817119598, "learning_rate": 5.531346170755298e-06, "loss": 0.0019, "step": 6464 }, { "epoch": 1.7057116475399023, "grad_norm": 0.27559876441955566, "learning_rate": 5.530994460564495e-06, "loss": 0.0167, "step": 6466 }, { "epoch": 1.7062392824165677, "grad_norm": 0.01565246656537056, "learning_rate": 5.530642750373693e-06, "loss": 0.0009, "step": 6468 }, { "epoch": 1.706766917293233, "grad_norm": 0.010674954392015934, "learning_rate": 5.5302910401828896e-06, "loss": 0.0009, "step": 6470 }, { "epoch": 1.7072945521698983, "grad_norm": 0.5171468257904053, "learning_rate": 5.5299393299920865e-06, "loss": 0.0051, "step": 6472 }, { "epoch": 1.7078221870465637, "grad_norm": 0.385797917842865, "learning_rate": 5.5295876198012835e-06, "loss": 0.0024, "step": 6474 }, { "epoch": 1.7083498219232292, "grad_norm": 0.03542343154549599, "learning_rate": 5.529235909610481e-06, "loss": 0.0018, "step": 6476 }, { "epoch": 1.7088774567998946, "grad_norm": 1.065805196762085, "learning_rate": 5.528884199419678e-06, "loss": 0.0072, "step": 6478 }, { "epoch": 1.70940509167656, "grad_norm": 0.699910044670105, "learning_rate": 5.528532489228875e-06, "loss": 0.0029, "step": 6480 }, { "epoch": 1.7099327265532251, "grad_norm": 0.016396569088101387, "learning_rate": 5.528180779038073e-06, "loss": 0.0012, "step": 6482 }, { "epoch": 1.7104603614298906, "grad_norm": 0.015254832804203033, "learning_rate": 5.52782906884727e-06, "loss": 0.0037, "step": 6484 }, { "epoch": 1.7109879963065557, "grad_norm": 0.04441356286406517, "learning_rate": 5.527477358656468e-06, "loss": 0.001, "step": 6486 }, { "epoch": 1.7115156311832211, "grad_norm": 0.41300642490386963, "learning_rate": 5.527125648465664e-06, "loss": 0.0073, "step": 6488 }, { "epoch": 1.7120432660598865, "grad_norm": 0.01685871183872223, "learning_rate": 5.526773938274862e-06, "loss": 0.0014, "step": 6490 }, { "epoch": 1.712570900936552, "grad_norm": 0.5930164456367493, "learning_rate": 5.5264222280840594e-06, "loss": 0.0095, "step": 6492 }, { "epoch": 1.7130985358132174, "grad_norm": 0.043497636914253235, "learning_rate": 5.5260705178932555e-06, "loss": 0.0011, "step": 6494 }, { "epoch": 1.7136261706898828, "grad_norm": 0.15379908680915833, "learning_rate": 5.525718807702453e-06, "loss": 0.0016, "step": 6496 }, { "epoch": 1.714153805566548, "grad_norm": 0.07031967490911484, "learning_rate": 5.52536709751165e-06, "loss": 0.0015, "step": 6498 }, { "epoch": 1.7146814404432131, "grad_norm": 0.03262971341609955, "learning_rate": 5.525015387320848e-06, "loss": 0.001, "step": 6500 }, { "epoch": 1.7152090753198785, "grad_norm": 0.056463394314050674, "learning_rate": 5.524663677130045e-06, "loss": 0.0011, "step": 6502 }, { "epoch": 1.715736710196544, "grad_norm": 0.012075823731720448, "learning_rate": 5.524311966939242e-06, "loss": 0.001, "step": 6504 }, { "epoch": 1.7162643450732094, "grad_norm": 0.03636883199214935, "learning_rate": 5.52396025674844e-06, "loss": 0.0034, "step": 6506 }, { "epoch": 1.7167919799498748, "grad_norm": 0.01873171329498291, "learning_rate": 5.523608546557637e-06, "loss": 0.001, "step": 6508 }, { "epoch": 1.7173196148265402, "grad_norm": 0.2943829298019409, "learning_rate": 5.523256836366834e-06, "loss": 0.0179, "step": 6510 }, { "epoch": 1.7178472497032053, "grad_norm": 0.025546913966536522, "learning_rate": 5.522905126176031e-06, "loss": 0.001, "step": 6512 }, { "epoch": 1.7183748845798708, "grad_norm": 0.039023492485284805, "learning_rate": 5.5225534159852285e-06, "loss": 0.007, "step": 6514 }, { "epoch": 1.718902519456536, "grad_norm": 0.31852275133132935, "learning_rate": 5.522201705794426e-06, "loss": 0.004, "step": 6516 }, { "epoch": 1.7194301543332013, "grad_norm": 0.03409795090556145, "learning_rate": 5.521849995603622e-06, "loss": 0.001, "step": 6518 }, { "epoch": 1.7199577892098667, "grad_norm": 0.022034769877791405, "learning_rate": 5.52149828541282e-06, "loss": 0.0012, "step": 6520 }, { "epoch": 1.7204854240865322, "grad_norm": 0.18117748200893402, "learning_rate": 5.521146575222017e-06, "loss": 0.0127, "step": 6522 }, { "epoch": 1.7210130589631976, "grad_norm": 0.05004452168941498, "learning_rate": 5.520794865031214e-06, "loss": 0.0014, "step": 6524 }, { "epoch": 1.721540693839863, "grad_norm": 0.9387491941452026, "learning_rate": 5.520443154840412e-06, "loss": 0.0166, "step": 6526 }, { "epoch": 1.7220683287165282, "grad_norm": 0.2080076038837433, "learning_rate": 5.520091444649609e-06, "loss": 0.012, "step": 6528 }, { "epoch": 1.7225959635931936, "grad_norm": 0.9080438017845154, "learning_rate": 5.519739734458807e-06, "loss": 0.0049, "step": 6530 }, { "epoch": 1.7231235984698587, "grad_norm": 0.0349423848092556, "learning_rate": 5.519388024268003e-06, "loss": 0.0158, "step": 6532 }, { "epoch": 1.7236512333465241, "grad_norm": 0.17681851983070374, "learning_rate": 5.5190363140772005e-06, "loss": 0.0055, "step": 6534 }, { "epoch": 1.7241788682231896, "grad_norm": 0.021260837092995644, "learning_rate": 5.5186846038863975e-06, "loss": 0.0011, "step": 6536 }, { "epoch": 1.724706503099855, "grad_norm": 0.6948876976966858, "learning_rate": 5.518332893695595e-06, "loss": 0.0112, "step": 6538 }, { "epoch": 1.7252341379765204, "grad_norm": 0.5975644588470459, "learning_rate": 5.517981183504792e-06, "loss": 0.0045, "step": 6540 }, { "epoch": 1.7257617728531855, "grad_norm": 0.04890720546245575, "learning_rate": 5.517629473313989e-06, "loss": 0.0014, "step": 6542 }, { "epoch": 1.726289407729851, "grad_norm": 0.16988006234169006, "learning_rate": 5.517277763123187e-06, "loss": 0.0014, "step": 6544 }, { "epoch": 1.7268170426065161, "grad_norm": 0.37989139556884766, "learning_rate": 5.516926052932384e-06, "loss": 0.007, "step": 6546 }, { "epoch": 1.7273446774831815, "grad_norm": 0.03064267709851265, "learning_rate": 5.516574342741581e-06, "loss": 0.0035, "step": 6548 }, { "epoch": 1.727872312359847, "grad_norm": 0.02612055279314518, "learning_rate": 5.516222632550779e-06, "loss": 0.0012, "step": 6550 }, { "epoch": 1.7283999472365124, "grad_norm": 0.4663720726966858, "learning_rate": 5.515870922359976e-06, "loss": 0.0099, "step": 6552 }, { "epoch": 1.7289275821131778, "grad_norm": 0.5571471452713013, "learning_rate": 5.515519212169173e-06, "loss": 0.0075, "step": 6554 }, { "epoch": 1.7294552169898432, "grad_norm": 0.0503295361995697, "learning_rate": 5.5151675019783695e-06, "loss": 0.0026, "step": 6556 }, { "epoch": 1.7299828518665084, "grad_norm": 0.12764957547187805, "learning_rate": 5.514815791787567e-06, "loss": 0.0018, "step": 6558 }, { "epoch": 1.7305104867431738, "grad_norm": 0.1741480827331543, "learning_rate": 5.514464081596764e-06, "loss": 0.0017, "step": 6560 }, { "epoch": 1.731038121619839, "grad_norm": 0.04671191796660423, "learning_rate": 5.514112371405961e-06, "loss": 0.0019, "step": 6562 }, { "epoch": 1.7315657564965043, "grad_norm": 0.26599347591400146, "learning_rate": 5.513760661215159e-06, "loss": 0.0034, "step": 6564 }, { "epoch": 1.7320933913731698, "grad_norm": 0.03243933618068695, "learning_rate": 5.513408951024356e-06, "loss": 0.0011, "step": 6566 }, { "epoch": 1.7326210262498352, "grad_norm": 0.03213516250252724, "learning_rate": 5.513057240833554e-06, "loss": 0.001, "step": 6568 }, { "epoch": 1.7331486611265006, "grad_norm": 0.030172476544976234, "learning_rate": 5.51270553064275e-06, "loss": 0.0043, "step": 6570 }, { "epoch": 1.733676296003166, "grad_norm": 0.1836121529340744, "learning_rate": 5.512353820451948e-06, "loss": 0.0021, "step": 6572 }, { "epoch": 1.7342039308798312, "grad_norm": 0.3137440085411072, "learning_rate": 5.5120021102611455e-06, "loss": 0.0084, "step": 6574 }, { "epoch": 1.7347315657564963, "grad_norm": 0.11797860264778137, "learning_rate": 5.5116504000703424e-06, "loss": 0.0126, "step": 6576 }, { "epoch": 1.7352592006331617, "grad_norm": 0.3759970963001251, "learning_rate": 5.511298689879539e-06, "loss": 0.0036, "step": 6578 }, { "epoch": 1.7357868355098272, "grad_norm": 0.6272067427635193, "learning_rate": 5.510946979688736e-06, "loss": 0.0035, "step": 6580 }, { "epoch": 1.7363144703864926, "grad_norm": 0.2642800211906433, "learning_rate": 5.510595269497934e-06, "loss": 0.0054, "step": 6582 }, { "epoch": 1.736842105263158, "grad_norm": 0.013481405563652515, "learning_rate": 5.510243559307131e-06, "loss": 0.0143, "step": 6584 }, { "epoch": 1.7373697401398234, "grad_norm": 0.07309658080339432, "learning_rate": 5.509891849116328e-06, "loss": 0.0016, "step": 6586 }, { "epoch": 1.7378973750164886, "grad_norm": 0.19844429194927216, "learning_rate": 5.509540138925526e-06, "loss": 0.0039, "step": 6588 }, { "epoch": 1.738425009893154, "grad_norm": 0.42635878920555115, "learning_rate": 5.509188428734723e-06, "loss": 0.0047, "step": 6590 }, { "epoch": 1.7389526447698191, "grad_norm": 0.19556763768196106, "learning_rate": 5.50883671854392e-06, "loss": 0.0045, "step": 6592 }, { "epoch": 1.7394802796464846, "grad_norm": 0.1175779402256012, "learning_rate": 5.508485008353117e-06, "loss": 0.0014, "step": 6594 }, { "epoch": 1.74000791452315, "grad_norm": 0.17724399268627167, "learning_rate": 5.5081332981623145e-06, "loss": 0.0071, "step": 6596 }, { "epoch": 1.7405355493998154, "grad_norm": 0.07922027260065079, "learning_rate": 5.5077815879715115e-06, "loss": 0.0058, "step": 6598 }, { "epoch": 1.7410631842764808, "grad_norm": 0.037377554923295975, "learning_rate": 5.507429877780708e-06, "loss": 0.0014, "step": 6600 }, { "epoch": 1.7415908191531462, "grad_norm": 0.10207214951515198, "learning_rate": 5.507078167589906e-06, "loss": 0.0015, "step": 6602 }, { "epoch": 1.7421184540298114, "grad_norm": 0.048819754272699356, "learning_rate": 5.506726457399103e-06, "loss": 0.0013, "step": 6604 }, { "epoch": 1.7426460889064768, "grad_norm": 0.24074161052703857, "learning_rate": 5.506374747208301e-06, "loss": 0.0064, "step": 6606 }, { "epoch": 1.743173723783142, "grad_norm": 0.49936485290527344, "learning_rate": 5.506023037017498e-06, "loss": 0.0048, "step": 6608 }, { "epoch": 1.7437013586598074, "grad_norm": 0.05785336345434189, "learning_rate": 5.505671326826695e-06, "loss": 0.0024, "step": 6610 }, { "epoch": 1.7442289935364728, "grad_norm": 0.025243734940886497, "learning_rate": 5.505319616635893e-06, "loss": 0.0014, "step": 6612 }, { "epoch": 1.7447566284131382, "grad_norm": 0.020508766174316406, "learning_rate": 5.504967906445089e-06, "loss": 0.0062, "step": 6614 }, { "epoch": 1.7452842632898036, "grad_norm": 0.10289491713047028, "learning_rate": 5.5046161962542866e-06, "loss": 0.0157, "step": 6616 }, { "epoch": 1.7458118981664688, "grad_norm": 0.25516563653945923, "learning_rate": 5.5042644860634835e-06, "loss": 0.0073, "step": 6618 }, { "epoch": 1.7463395330431342, "grad_norm": 0.121053047478199, "learning_rate": 5.503912775872681e-06, "loss": 0.0048, "step": 6620 }, { "epoch": 1.7468671679197993, "grad_norm": 0.18820635974407196, "learning_rate": 5.503561065681878e-06, "loss": 0.0086, "step": 6622 }, { "epoch": 1.7473948027964648, "grad_norm": 0.16756798326969147, "learning_rate": 5.503209355491075e-06, "loss": 0.0118, "step": 6624 }, { "epoch": 1.7479224376731302, "grad_norm": 0.06558208167552948, "learning_rate": 5.502857645300273e-06, "loss": 0.0012, "step": 6626 }, { "epoch": 1.7484500725497956, "grad_norm": 0.020701345056295395, "learning_rate": 5.50250593510947e-06, "loss": 0.001, "step": 6628 }, { "epoch": 1.748977707426461, "grad_norm": 0.025811053812503815, "learning_rate": 5.502154224918667e-06, "loss": 0.0012, "step": 6630 }, { "epoch": 1.7495053423031264, "grad_norm": 0.3641943037509918, "learning_rate": 5.501802514727865e-06, "loss": 0.0038, "step": 6632 }, { "epoch": 1.7500329771797916, "grad_norm": 0.13442644476890564, "learning_rate": 5.501450804537062e-06, "loss": 0.0113, "step": 6634 }, { "epoch": 1.750560612056457, "grad_norm": 0.3167036473751068, "learning_rate": 5.5010990943462595e-06, "loss": 0.0058, "step": 6636 }, { "epoch": 1.7510882469331222, "grad_norm": 0.40247097611427307, "learning_rate": 5.500747384155456e-06, "loss": 0.0028, "step": 6638 }, { "epoch": 1.7516158818097876, "grad_norm": 0.05518099293112755, "learning_rate": 5.500395673964653e-06, "loss": 0.0011, "step": 6640 }, { "epoch": 1.752143516686453, "grad_norm": 0.07245636731386185, "learning_rate": 5.50004396377385e-06, "loss": 0.0015, "step": 6642 }, { "epoch": 1.7526711515631184, "grad_norm": 0.03133753314614296, "learning_rate": 5.499692253583047e-06, "loss": 0.001, "step": 6644 }, { "epoch": 1.7531987864397838, "grad_norm": 0.42354345321655273, "learning_rate": 5.499340543392245e-06, "loss": 0.0065, "step": 6646 }, { "epoch": 1.7537264213164492, "grad_norm": 0.1094929650425911, "learning_rate": 5.498988833201442e-06, "loss": 0.0012, "step": 6648 }, { "epoch": 1.7542540561931144, "grad_norm": 0.42690858244895935, "learning_rate": 5.49863712301064e-06, "loss": 0.0043, "step": 6650 }, { "epoch": 1.7547816910697795, "grad_norm": 0.10596078634262085, "learning_rate": 5.498285412819836e-06, "loss": 0.0137, "step": 6652 }, { "epoch": 1.755309325946445, "grad_norm": 0.12294569611549377, "learning_rate": 5.497933702629034e-06, "loss": 0.0013, "step": 6654 }, { "epoch": 1.7558369608231104, "grad_norm": 0.47425442934036255, "learning_rate": 5.497581992438231e-06, "loss": 0.0052, "step": 6656 }, { "epoch": 1.7563645956997758, "grad_norm": 0.04285281151533127, "learning_rate": 5.4972302822474285e-06, "loss": 0.0011, "step": 6658 }, { "epoch": 1.7568922305764412, "grad_norm": 0.4353286325931549, "learning_rate": 5.4968785720566255e-06, "loss": 0.0202, "step": 6660 }, { "epoch": 1.7574198654531066, "grad_norm": 0.07682003825902939, "learning_rate": 5.496526861865822e-06, "loss": 0.0012, "step": 6662 }, { "epoch": 1.7579475003297718, "grad_norm": 0.11895839869976044, "learning_rate": 5.49617515167502e-06, "loss": 0.0056, "step": 6664 }, { "epoch": 1.7584751352064372, "grad_norm": 0.025418709963560104, "learning_rate": 5.495823441484217e-06, "loss": 0.0018, "step": 6666 }, { "epoch": 1.7590027700831024, "grad_norm": 0.12081797420978546, "learning_rate": 5.495471731293414e-06, "loss": 0.009, "step": 6668 }, { "epoch": 1.7595304049597678, "grad_norm": 0.03900193050503731, "learning_rate": 5.495120021102612e-06, "loss": 0.0011, "step": 6670 }, { "epoch": 1.7600580398364332, "grad_norm": 0.18400731682777405, "learning_rate": 5.494768310911809e-06, "loss": 0.0015, "step": 6672 }, { "epoch": 1.7605856747130986, "grad_norm": 0.10105472803115845, "learning_rate": 5.494416600721006e-06, "loss": 0.0127, "step": 6674 }, { "epoch": 1.761113309589764, "grad_norm": 0.04335251823067665, "learning_rate": 5.494064890530203e-06, "loss": 0.0012, "step": 6676 }, { "epoch": 1.7616409444664294, "grad_norm": 0.014517437666654587, "learning_rate": 5.4937131803394006e-06, "loss": 0.001, "step": 6678 }, { "epoch": 1.7621685793430946, "grad_norm": 0.024545468389987946, "learning_rate": 5.4933614701485975e-06, "loss": 0.0014, "step": 6680 }, { "epoch": 1.76269621421976, "grad_norm": 0.015248970128595829, "learning_rate": 5.4930097599577945e-06, "loss": 0.0087, "step": 6682 }, { "epoch": 1.7632238490964252, "grad_norm": 0.03626991808414459, "learning_rate": 5.492658049766992e-06, "loss": 0.0009, "step": 6684 }, { "epoch": 1.7637514839730906, "grad_norm": 0.019493836909532547, "learning_rate": 5.492306339576189e-06, "loss": 0.0015, "step": 6686 }, { "epoch": 1.764279118849756, "grad_norm": 0.5653656125068665, "learning_rate": 5.491954629385387e-06, "loss": 0.0022, "step": 6688 }, { "epoch": 1.7648067537264214, "grad_norm": 0.016465459018945694, "learning_rate": 5.491602919194584e-06, "loss": 0.0009, "step": 6690 }, { "epoch": 1.7653343886030868, "grad_norm": 0.2560301721096039, "learning_rate": 5.491251209003781e-06, "loss": 0.0052, "step": 6692 }, { "epoch": 1.765862023479752, "grad_norm": 0.4240022897720337, "learning_rate": 5.490899498812979e-06, "loss": 0.0058, "step": 6694 }, { "epoch": 1.7663896583564174, "grad_norm": 0.019664060324430466, "learning_rate": 5.490547788622176e-06, "loss": 0.0009, "step": 6696 }, { "epoch": 1.7669172932330826, "grad_norm": 0.3034157454967499, "learning_rate": 5.490196078431373e-06, "loss": 0.018, "step": 6698 }, { "epoch": 1.767444928109748, "grad_norm": 0.058589186519384384, "learning_rate": 5.48984436824057e-06, "loss": 0.0012, "step": 6700 }, { "epoch": 1.7679725629864134, "grad_norm": 0.08717080950737, "learning_rate": 5.489492658049767e-06, "loss": 0.0044, "step": 6702 }, { "epoch": 1.7685001978630788, "grad_norm": 0.019425852224230766, "learning_rate": 5.489140947858964e-06, "loss": 0.001, "step": 6704 }, { "epoch": 1.7690278327397442, "grad_norm": 0.1233447790145874, "learning_rate": 5.488789237668161e-06, "loss": 0.0038, "step": 6706 }, { "epoch": 1.7695554676164096, "grad_norm": 0.23134049773216248, "learning_rate": 5.488437527477359e-06, "loss": 0.005, "step": 6708 }, { "epoch": 1.7700831024930748, "grad_norm": 0.2531786262989044, "learning_rate": 5.488085817286556e-06, "loss": 0.0028, "step": 6710 }, { "epoch": 1.7706107373697402, "grad_norm": 0.025570929050445557, "learning_rate": 5.487734107095753e-06, "loss": 0.0125, "step": 6712 }, { "epoch": 1.7711383722464054, "grad_norm": 0.04255019873380661, "learning_rate": 5.48738239690495e-06, "loss": 0.0014, "step": 6714 }, { "epoch": 1.7716660071230708, "grad_norm": 0.09740000218153, "learning_rate": 5.487030686714148e-06, "loss": 0.005, "step": 6716 }, { "epoch": 1.7721936419997362, "grad_norm": 0.10994407534599304, "learning_rate": 5.4866789765233455e-06, "loss": 0.0017, "step": 6718 }, { "epoch": 1.7727212768764016, "grad_norm": 0.5278284549713135, "learning_rate": 5.486327266332542e-06, "loss": 0.0164, "step": 6720 }, { "epoch": 1.773248911753067, "grad_norm": 0.04596379026770592, "learning_rate": 5.4859755561417394e-06, "loss": 0.0016, "step": 6722 }, { "epoch": 1.7737765466297324, "grad_norm": 0.018136171624064445, "learning_rate": 5.485623845950936e-06, "loss": 0.0009, "step": 6724 }, { "epoch": 1.7743041815063976, "grad_norm": 0.014730670489370823, "learning_rate": 5.485272135760134e-06, "loss": 0.0089, "step": 6726 }, { "epoch": 1.7748318163830628, "grad_norm": 0.027048341929912567, "learning_rate": 5.484920425569331e-06, "loss": 0.0009, "step": 6728 }, { "epoch": 1.7753594512597282, "grad_norm": 0.27028530836105347, "learning_rate": 5.484568715378528e-06, "loss": 0.0045, "step": 6730 }, { "epoch": 1.7758870861363936, "grad_norm": 0.6562306880950928, "learning_rate": 5.484217005187726e-06, "loss": 0.0049, "step": 6732 }, { "epoch": 1.776414721013059, "grad_norm": 1.0853184461593628, "learning_rate": 5.483865294996922e-06, "loss": 0.0089, "step": 6734 }, { "epoch": 1.7769423558897244, "grad_norm": 0.35923367738723755, "learning_rate": 5.48351358480612e-06, "loss": 0.0091, "step": 6736 }, { "epoch": 1.7774699907663898, "grad_norm": 0.3142394423484802, "learning_rate": 5.483161874615317e-06, "loss": 0.0099, "step": 6738 }, { "epoch": 1.777997625643055, "grad_norm": 0.05703070014715195, "learning_rate": 5.4828101644245146e-06, "loss": 0.0014, "step": 6740 }, { "epoch": 1.7785252605197204, "grad_norm": 0.43811866641044617, "learning_rate": 5.4824584542337115e-06, "loss": 0.0022, "step": 6742 }, { "epoch": 1.7790528953963856, "grad_norm": 0.385358601808548, "learning_rate": 5.4821067440429085e-06, "loss": 0.0016, "step": 6744 }, { "epoch": 1.779580530273051, "grad_norm": 0.2788046896457672, "learning_rate": 5.481755033852106e-06, "loss": 0.0035, "step": 6746 }, { "epoch": 1.7801081651497164, "grad_norm": 0.7273073196411133, "learning_rate": 5.481403323661303e-06, "loss": 0.0116, "step": 6748 }, { "epoch": 1.7806358000263818, "grad_norm": 0.10102139413356781, "learning_rate": 5.4810516134705e-06, "loss": 0.0089, "step": 6750 }, { "epoch": 1.7811634349030472, "grad_norm": 0.07866609841585159, "learning_rate": 5.480699903279698e-06, "loss": 0.0018, "step": 6752 }, { "epoch": 1.7816910697797126, "grad_norm": 0.6800727844238281, "learning_rate": 5.480348193088895e-06, "loss": 0.0037, "step": 6754 }, { "epoch": 1.7822187046563778, "grad_norm": 0.12186422199010849, "learning_rate": 5.479996482898093e-06, "loss": 0.0117, "step": 6756 }, { "epoch": 1.7827463395330432, "grad_norm": 0.43133100867271423, "learning_rate": 5.479644772707289e-06, "loss": 0.0138, "step": 6758 }, { "epoch": 1.7832739744097084, "grad_norm": 0.07576553523540497, "learning_rate": 5.479293062516487e-06, "loss": 0.0012, "step": 6760 }, { "epoch": 1.7838016092863738, "grad_norm": 0.2543865740299225, "learning_rate": 5.478941352325684e-06, "loss": 0.0017, "step": 6762 }, { "epoch": 1.7843292441630392, "grad_norm": 1.0460495948791504, "learning_rate": 5.4785896421348805e-06, "loss": 0.0182, "step": 6764 }, { "epoch": 1.7848568790397046, "grad_norm": 0.44218122959136963, "learning_rate": 5.478237931944078e-06, "loss": 0.0026, "step": 6766 }, { "epoch": 1.78538451391637, "grad_norm": 0.4099707007408142, "learning_rate": 5.477886221753275e-06, "loss": 0.0062, "step": 6768 }, { "epoch": 1.7859121487930352, "grad_norm": 0.11210674792528152, "learning_rate": 5.477534511562473e-06, "loss": 0.0094, "step": 6770 }, { "epoch": 1.7864397836697006, "grad_norm": 0.029759449884295464, "learning_rate": 5.477182801371669e-06, "loss": 0.001, "step": 6772 }, { "epoch": 1.7869674185463658, "grad_norm": 0.03305407986044884, "learning_rate": 5.476831091180867e-06, "loss": 0.0042, "step": 6774 }, { "epoch": 1.7874950534230312, "grad_norm": 0.03904668986797333, "learning_rate": 5.476479380990065e-06, "loss": 0.0012, "step": 6776 }, { "epoch": 1.7880226882996966, "grad_norm": 0.5189715027809143, "learning_rate": 5.476127670799262e-06, "loss": 0.0064, "step": 6778 }, { "epoch": 1.788550323176362, "grad_norm": 0.4270525574684143, "learning_rate": 5.475775960608459e-06, "loss": 0.0016, "step": 6780 }, { "epoch": 1.7890779580530274, "grad_norm": 0.16399623453617096, "learning_rate": 5.475424250417656e-06, "loss": 0.0017, "step": 6782 }, { "epoch": 1.7896055929296928, "grad_norm": 0.026977835223078728, "learning_rate": 5.4750725402268534e-06, "loss": 0.0114, "step": 6784 }, { "epoch": 1.790133227806358, "grad_norm": 0.058129459619522095, "learning_rate": 5.47472083003605e-06, "loss": 0.001, "step": 6786 }, { "epoch": 1.7906608626830234, "grad_norm": 0.01748109981417656, "learning_rate": 5.474369119845247e-06, "loss": 0.0009, "step": 6788 }, { "epoch": 1.7911884975596886, "grad_norm": 0.6166647672653198, "learning_rate": 5.474017409654445e-06, "loss": 0.0058, "step": 6790 }, { "epoch": 1.791716132436354, "grad_norm": 0.7350605726242065, "learning_rate": 5.473665699463642e-06, "loss": 0.0034, "step": 6792 }, { "epoch": 1.7922437673130194, "grad_norm": 0.361468106508255, "learning_rate": 5.473313989272839e-06, "loss": 0.0088, "step": 6794 }, { "epoch": 1.7927714021896848, "grad_norm": 0.10716474801301956, "learning_rate": 5.472962279082036e-06, "loss": 0.0014, "step": 6796 }, { "epoch": 1.7932990370663502, "grad_norm": 0.30920836329460144, "learning_rate": 5.472610568891234e-06, "loss": 0.0044, "step": 6798 }, { "epoch": 1.7938266719430156, "grad_norm": 0.025332927703857422, "learning_rate": 5.472258858700432e-06, "loss": 0.0013, "step": 6800 }, { "epoch": 1.7943543068196808, "grad_norm": 0.05889245495200157, "learning_rate": 5.471907148509628e-06, "loss": 0.0012, "step": 6802 }, { "epoch": 1.7948819416963462, "grad_norm": 0.38225647807121277, "learning_rate": 5.4715554383188255e-06, "loss": 0.0024, "step": 6804 }, { "epoch": 1.7954095765730114, "grad_norm": 0.38149338960647583, "learning_rate": 5.4712037281280225e-06, "loss": 0.0024, "step": 6806 }, { "epoch": 1.7959372114496768, "grad_norm": 0.3135508894920349, "learning_rate": 5.47085201793722e-06, "loss": 0.0039, "step": 6808 }, { "epoch": 1.7964648463263422, "grad_norm": 0.05366232618689537, "learning_rate": 5.470500307746417e-06, "loss": 0.001, "step": 6810 }, { "epoch": 1.7969924812030076, "grad_norm": 0.03385375812649727, "learning_rate": 5.470148597555614e-06, "loss": 0.0024, "step": 6812 }, { "epoch": 1.797520116079673, "grad_norm": 0.22335673868656158, "learning_rate": 5.469796887364812e-06, "loss": 0.0156, "step": 6814 }, { "epoch": 1.7980477509563382, "grad_norm": 0.8721953630447388, "learning_rate": 5.469445177174009e-06, "loss": 0.0051, "step": 6816 }, { "epoch": 1.7985753858330036, "grad_norm": 0.029424753040075302, "learning_rate": 5.469093466983206e-06, "loss": 0.0068, "step": 6818 }, { "epoch": 1.7991030207096688, "grad_norm": 1.079617977142334, "learning_rate": 5.468741756792403e-06, "loss": 0.0087, "step": 6820 }, { "epoch": 1.7996306555863342, "grad_norm": 0.6432510018348694, "learning_rate": 5.468390046601601e-06, "loss": 0.0057, "step": 6822 }, { "epoch": 1.8001582904629996, "grad_norm": 0.5555378794670105, "learning_rate": 5.4680383364107976e-06, "loss": 0.0058, "step": 6824 }, { "epoch": 1.800685925339665, "grad_norm": 0.22054629027843475, "learning_rate": 5.4676866262199945e-06, "loss": 0.0016, "step": 6826 }, { "epoch": 1.8012135602163304, "grad_norm": 0.25315460562705994, "learning_rate": 5.467334916029192e-06, "loss": 0.0067, "step": 6828 }, { "epoch": 1.8017411950929958, "grad_norm": 0.0568460188806057, "learning_rate": 5.466983205838389e-06, "loss": 0.0062, "step": 6830 }, { "epoch": 1.802268829969661, "grad_norm": 0.12421313673257828, "learning_rate": 5.466631495647586e-06, "loss": 0.0177, "step": 6832 }, { "epoch": 1.8027964648463264, "grad_norm": 0.03509705141186714, "learning_rate": 5.466279785456784e-06, "loss": 0.0063, "step": 6834 }, { "epoch": 1.8033240997229916, "grad_norm": 0.2502407431602478, "learning_rate": 5.465928075265981e-06, "loss": 0.0113, "step": 6836 }, { "epoch": 1.803851734599657, "grad_norm": 0.0533878430724144, "learning_rate": 5.465576365075179e-06, "loss": 0.0019, "step": 6838 }, { "epoch": 1.8043793694763224, "grad_norm": 0.584128201007843, "learning_rate": 5.465224654884375e-06, "loss": 0.011, "step": 6840 }, { "epoch": 1.8049070043529878, "grad_norm": 0.04119263216853142, "learning_rate": 5.464872944693573e-06, "loss": 0.0047, "step": 6842 }, { "epoch": 1.8054346392296532, "grad_norm": 0.06635899841785431, "learning_rate": 5.46452123450277e-06, "loss": 0.0013, "step": 6844 }, { "epoch": 1.8059622741063184, "grad_norm": 0.21541014313697815, "learning_rate": 5.4641695243119674e-06, "loss": 0.002, "step": 6846 }, { "epoch": 1.8064899089829838, "grad_norm": 0.05561905354261398, "learning_rate": 5.463817814121164e-06, "loss": 0.0013, "step": 6848 }, { "epoch": 1.807017543859649, "grad_norm": 0.8551082015037537, "learning_rate": 5.463466103930361e-06, "loss": 0.0044, "step": 6850 }, { "epoch": 1.8075451787363144, "grad_norm": 0.1318473070859909, "learning_rate": 5.463114393739559e-06, "loss": 0.0098, "step": 6852 }, { "epoch": 1.8080728136129798, "grad_norm": 0.21279588341712952, "learning_rate": 5.462762683548755e-06, "loss": 0.002, "step": 6854 }, { "epoch": 1.8086004484896452, "grad_norm": 0.07348164916038513, "learning_rate": 5.462410973357953e-06, "loss": 0.0109, "step": 6856 }, { "epoch": 1.8091280833663106, "grad_norm": 0.08736370503902435, "learning_rate": 5.462059263167151e-06, "loss": 0.0017, "step": 6858 }, { "epoch": 1.809655718242976, "grad_norm": 0.47707003355026245, "learning_rate": 5.461707552976348e-06, "loss": 0.002, "step": 6860 }, { "epoch": 1.8101833531196412, "grad_norm": 0.294566810131073, "learning_rate": 5.461355842785545e-06, "loss": 0.0012, "step": 6862 }, { "epoch": 1.8107109879963066, "grad_norm": 0.03325522691011429, "learning_rate": 5.461004132594742e-06, "loss": 0.0013, "step": 6864 }, { "epoch": 1.8112386228729718, "grad_norm": 0.015842914581298828, "learning_rate": 5.4606524224039395e-06, "loss": 0.0009, "step": 6866 }, { "epoch": 1.8117662577496372, "grad_norm": 0.03206586837768555, "learning_rate": 5.4603007122131365e-06, "loss": 0.001, "step": 6868 }, { "epoch": 1.8122938926263026, "grad_norm": 0.05738183110952377, "learning_rate": 5.459949002022333e-06, "loss": 0.0021, "step": 6870 }, { "epoch": 1.812821527502968, "grad_norm": 0.42467957735061646, "learning_rate": 5.459597291831531e-06, "loss": 0.0061, "step": 6872 }, { "epoch": 1.8133491623796334, "grad_norm": 0.05789192393422127, "learning_rate": 5.459245581640728e-06, "loss": 0.001, "step": 6874 }, { "epoch": 1.8138767972562988, "grad_norm": 0.05743095278739929, "learning_rate": 5.458893871449926e-06, "loss": 0.0044, "step": 6876 }, { "epoch": 1.814404432132964, "grad_norm": 0.06390988081693649, "learning_rate": 5.458542161259122e-06, "loss": 0.0029, "step": 6878 }, { "epoch": 1.8149320670096294, "grad_norm": 0.2164028286933899, "learning_rate": 5.45819045106832e-06, "loss": 0.0124, "step": 6880 }, { "epoch": 1.8154597018862946, "grad_norm": 0.41449564695358276, "learning_rate": 5.457838740877517e-06, "loss": 0.0022, "step": 6882 }, { "epoch": 1.81598733676296, "grad_norm": 0.0993645191192627, "learning_rate": 5.457487030686715e-06, "loss": 0.0012, "step": 6884 }, { "epoch": 1.8165149716396254, "grad_norm": 0.34083595871925354, "learning_rate": 5.4571353204959116e-06, "loss": 0.0054, "step": 6886 }, { "epoch": 1.8170426065162908, "grad_norm": 0.06353422999382019, "learning_rate": 5.4567836103051085e-06, "loss": 0.0009, "step": 6888 }, { "epoch": 1.8175702413929562, "grad_norm": 0.010618380270898342, "learning_rate": 5.456431900114306e-06, "loss": 0.0009, "step": 6890 }, { "epoch": 1.8180978762696214, "grad_norm": 0.01712103746831417, "learning_rate": 5.456080189923503e-06, "loss": 0.0099, "step": 6892 }, { "epoch": 1.8186255111462868, "grad_norm": 0.03754789009690285, "learning_rate": 5.4557284797327e-06, "loss": 0.0011, "step": 6894 }, { "epoch": 1.819153146022952, "grad_norm": 0.012960772030055523, "learning_rate": 5.455376769541898e-06, "loss": 0.0042, "step": 6896 }, { "epoch": 1.8196807808996174, "grad_norm": 0.07494911551475525, "learning_rate": 5.455025059351095e-06, "loss": 0.0093, "step": 6898 }, { "epoch": 1.8202084157762828, "grad_norm": 0.03325270116329193, "learning_rate": 5.454673349160292e-06, "loss": 0.001, "step": 6900 }, { "epoch": 1.8207360506529482, "grad_norm": 0.046167388558387756, "learning_rate": 5.454321638969489e-06, "loss": 0.0011, "step": 6902 }, { "epoch": 1.8212636855296136, "grad_norm": 0.2385750263929367, "learning_rate": 5.453969928778687e-06, "loss": 0.0067, "step": 6904 }, { "epoch": 1.821791320406279, "grad_norm": 0.036204591393470764, "learning_rate": 5.453618218587884e-06, "loss": 0.0072, "step": 6906 }, { "epoch": 1.8223189552829442, "grad_norm": 1.1998815536499023, "learning_rate": 5.453266508397081e-06, "loss": 0.0067, "step": 6908 }, { "epoch": 1.8228465901596096, "grad_norm": 0.3604499399662018, "learning_rate": 5.452914798206278e-06, "loss": 0.0128, "step": 6910 }, { "epoch": 1.8233742250362748, "grad_norm": 0.03981272503733635, "learning_rate": 5.452563088015475e-06, "loss": 0.001, "step": 6912 }, { "epoch": 1.8239018599129402, "grad_norm": 0.5178662538528442, "learning_rate": 5.452211377824673e-06, "loss": 0.0131, "step": 6914 }, { "epoch": 1.8244294947896056, "grad_norm": 0.5091825127601624, "learning_rate": 5.45185966763387e-06, "loss": 0.0165, "step": 6916 }, { "epoch": 1.824957129666271, "grad_norm": 0.04229525104165077, "learning_rate": 5.451507957443067e-06, "loss": 0.0045, "step": 6918 }, { "epoch": 1.8254847645429364, "grad_norm": 0.323720246553421, "learning_rate": 5.451156247252265e-06, "loss": 0.0046, "step": 6920 }, { "epoch": 1.8260123994196016, "grad_norm": 0.23151294887065887, "learning_rate": 5.450804537061461e-06, "loss": 0.0028, "step": 6922 }, { "epoch": 1.826540034296267, "grad_norm": 0.09055811911821365, "learning_rate": 5.450452826870659e-06, "loss": 0.0013, "step": 6924 }, { "epoch": 1.8270676691729322, "grad_norm": 0.182155042886734, "learning_rate": 5.450101116679856e-06, "loss": 0.005, "step": 6926 }, { "epoch": 1.8275953040495976, "grad_norm": 0.06916556507349014, "learning_rate": 5.4497494064890535e-06, "loss": 0.0013, "step": 6928 }, { "epoch": 1.828122938926263, "grad_norm": 0.011232764460146427, "learning_rate": 5.4493976962982504e-06, "loss": 0.0019, "step": 6930 }, { "epoch": 1.8286505738029284, "grad_norm": 0.03160587325692177, "learning_rate": 5.449045986107447e-06, "loss": 0.0011, "step": 6932 }, { "epoch": 1.8291782086795938, "grad_norm": 0.23042608797550201, "learning_rate": 5.448694275916645e-06, "loss": 0.0195, "step": 6934 }, { "epoch": 1.8297058435562592, "grad_norm": 0.05180882290005684, "learning_rate": 5.448342565725842e-06, "loss": 0.0011, "step": 6936 }, { "epoch": 1.8302334784329244, "grad_norm": 0.3600846529006958, "learning_rate": 5.447990855535039e-06, "loss": 0.0018, "step": 6938 }, { "epoch": 1.8307611133095898, "grad_norm": 0.26908165216445923, "learning_rate": 5.447639145344236e-06, "loss": 0.0032, "step": 6940 }, { "epoch": 1.831288748186255, "grad_norm": 0.03575409576296806, "learning_rate": 5.447287435153434e-06, "loss": 0.0011, "step": 6942 }, { "epoch": 1.8318163830629204, "grad_norm": 0.034452956169843674, "learning_rate": 5.446935724962632e-06, "loss": 0.0009, "step": 6944 }, { "epoch": 1.8323440179395858, "grad_norm": 0.044841594994068146, "learning_rate": 5.446584014771828e-06, "loss": 0.0008, "step": 6946 }, { "epoch": 1.8328716528162512, "grad_norm": 0.042064011096954346, "learning_rate": 5.4462323045810256e-06, "loss": 0.0022, "step": 6948 }, { "epoch": 1.8333992876929166, "grad_norm": 0.0483175627887249, "learning_rate": 5.4458805943902225e-06, "loss": 0.0013, "step": 6950 }, { "epoch": 1.833926922569582, "grad_norm": 0.03283000364899635, "learning_rate": 5.4455288841994195e-06, "loss": 0.0009, "step": 6952 }, { "epoch": 1.8344545574462472, "grad_norm": 2.313035488128662, "learning_rate": 5.445177174008617e-06, "loss": 0.0037, "step": 6954 }, { "epoch": 1.8349821923229126, "grad_norm": 0.7565831542015076, "learning_rate": 5.444825463817814e-06, "loss": 0.0114, "step": 6956 }, { "epoch": 1.8355098271995778, "grad_norm": 0.0898764505982399, "learning_rate": 5.444473753627012e-06, "loss": 0.0009, "step": 6958 }, { "epoch": 1.8360374620762432, "grad_norm": 0.6617520451545715, "learning_rate": 5.444122043436208e-06, "loss": 0.0073, "step": 6960 }, { "epoch": 1.8365650969529086, "grad_norm": 0.012200764380395412, "learning_rate": 5.443770333245406e-06, "loss": 0.0101, "step": 6962 }, { "epoch": 1.837092731829574, "grad_norm": 0.9430074691772461, "learning_rate": 5.443418623054603e-06, "loss": 0.0083, "step": 6964 }, { "epoch": 1.8376203667062394, "grad_norm": 0.03691469132900238, "learning_rate": 5.443066912863801e-06, "loss": 0.001, "step": 6966 }, { "epoch": 1.8381480015829046, "grad_norm": 0.15412960946559906, "learning_rate": 5.442715202672998e-06, "loss": 0.0012, "step": 6968 }, { "epoch": 1.83867563645957, "grad_norm": 0.09386707097291946, "learning_rate": 5.4423634924821946e-06, "loss": 0.0016, "step": 6970 }, { "epoch": 1.8392032713362352, "grad_norm": 0.1476527601480484, "learning_rate": 5.442011782291392e-06, "loss": 0.0012, "step": 6972 }, { "epoch": 1.8397309062129006, "grad_norm": 0.14683744311332703, "learning_rate": 5.441660072100589e-06, "loss": 0.0017, "step": 6974 }, { "epoch": 1.840258541089566, "grad_norm": 0.029256733134388924, "learning_rate": 5.441308361909786e-06, "loss": 0.0036, "step": 6976 }, { "epoch": 1.8407861759662314, "grad_norm": 0.2664845287799835, "learning_rate": 5.440956651718984e-06, "loss": 0.0096, "step": 6978 }, { "epoch": 1.8413138108428968, "grad_norm": 0.039994485676288605, "learning_rate": 5.440604941528181e-06, "loss": 0.0012, "step": 6980 }, { "epoch": 1.8418414457195622, "grad_norm": 0.10808856040239334, "learning_rate": 5.440253231337378e-06, "loss": 0.0009, "step": 6982 }, { "epoch": 1.8423690805962274, "grad_norm": 0.04348642751574516, "learning_rate": 5.439901521146575e-06, "loss": 0.0035, "step": 6984 }, { "epoch": 1.8428967154728928, "grad_norm": 0.02652277797460556, "learning_rate": 5.439549810955773e-06, "loss": 0.0041, "step": 6986 }, { "epoch": 1.843424350349558, "grad_norm": 0.12671343982219696, "learning_rate": 5.43919810076497e-06, "loss": 0.0119, "step": 6988 }, { "epoch": 1.8439519852262234, "grad_norm": 0.07397110015153885, "learning_rate": 5.438846390574167e-06, "loss": 0.0014, "step": 6990 }, { "epoch": 1.8444796201028888, "grad_norm": 0.015618138015270233, "learning_rate": 5.4384946803833644e-06, "loss": 0.0021, "step": 6992 }, { "epoch": 1.8450072549795542, "grad_norm": 0.03935684263706207, "learning_rate": 5.438142970192561e-06, "loss": 0.0011, "step": 6994 }, { "epoch": 1.8455348898562196, "grad_norm": 0.055037107318639755, "learning_rate": 5.437791260001759e-06, "loss": 0.0024, "step": 6996 }, { "epoch": 1.8460625247328848, "grad_norm": 0.016672803089022636, "learning_rate": 5.437439549810955e-06, "loss": 0.0051, "step": 6998 }, { "epoch": 1.8465901596095502, "grad_norm": 0.6457607746124268, "learning_rate": 5.437087839620153e-06, "loss": 0.0042, "step": 7000 }, { "epoch": 1.8471177944862154, "grad_norm": 0.607643187046051, "learning_rate": 5.436736129429351e-06, "loss": 0.0102, "step": 7002 }, { "epoch": 1.8476454293628808, "grad_norm": 0.23699310421943665, "learning_rate": 5.436384419238548e-06, "loss": 0.0049, "step": 7004 }, { "epoch": 1.8481730642395462, "grad_norm": 0.21801446378231049, "learning_rate": 5.436032709047745e-06, "loss": 0.0016, "step": 7006 }, { "epoch": 1.8487006991162116, "grad_norm": 0.18455111980438232, "learning_rate": 5.435680998856942e-06, "loss": 0.0108, "step": 7008 }, { "epoch": 1.849228333992877, "grad_norm": 0.0771934911608696, "learning_rate": 5.4353292886661395e-06, "loss": 0.0018, "step": 7010 }, { "epoch": 1.8497559688695424, "grad_norm": 0.08430987596511841, "learning_rate": 5.4349775784753365e-06, "loss": 0.0056, "step": 7012 }, { "epoch": 1.8502836037462076, "grad_norm": 0.020613249391317368, "learning_rate": 5.4346258682845335e-06, "loss": 0.001, "step": 7014 }, { "epoch": 1.850811238622873, "grad_norm": 0.05092984810471535, "learning_rate": 5.434274158093731e-06, "loss": 0.001, "step": 7016 }, { "epoch": 1.8513388734995382, "grad_norm": 0.035666659474372864, "learning_rate": 5.433922447902928e-06, "loss": 0.0013, "step": 7018 }, { "epoch": 1.8518665083762036, "grad_norm": 0.3488101065158844, "learning_rate": 5.433570737712125e-06, "loss": 0.002, "step": 7020 }, { "epoch": 1.852394143252869, "grad_norm": 0.034271445125341415, "learning_rate": 5.433219027521322e-06, "loss": 0.0008, "step": 7022 }, { "epoch": 1.8529217781295344, "grad_norm": 0.06270210444927216, "learning_rate": 5.43286731733052e-06, "loss": 0.0097, "step": 7024 }, { "epoch": 1.8534494130061998, "grad_norm": 0.39614492654800415, "learning_rate": 5.432515607139718e-06, "loss": 0.0036, "step": 7026 }, { "epoch": 1.8539770478828652, "grad_norm": 0.06137092038989067, "learning_rate": 5.432163896948914e-06, "loss": 0.001, "step": 7028 }, { "epoch": 1.8545046827595304, "grad_norm": 0.11593027412891388, "learning_rate": 5.431812186758112e-06, "loss": 0.0013, "step": 7030 }, { "epoch": 1.8550323176361958, "grad_norm": 0.013575529679656029, "learning_rate": 5.4314604765673086e-06, "loss": 0.0042, "step": 7032 }, { "epoch": 1.855559952512861, "grad_norm": 0.007519472856074572, "learning_rate": 5.431108766376506e-06, "loss": 0.0007, "step": 7034 }, { "epoch": 1.8560875873895264, "grad_norm": 0.07550283521413803, "learning_rate": 5.430757056185703e-06, "loss": 0.0119, "step": 7036 }, { "epoch": 1.8566152222661918, "grad_norm": 0.008943779394030571, "learning_rate": 5.4304053459949e-06, "loss": 0.0008, "step": 7038 }, { "epoch": 1.8571428571428572, "grad_norm": 0.49316346645355225, "learning_rate": 5.430053635804098e-06, "loss": 0.0088, "step": 7040 }, { "epoch": 1.8576704920195226, "grad_norm": 0.009263130836188793, "learning_rate": 5.429701925613294e-06, "loss": 0.0049, "step": 7042 }, { "epoch": 1.8581981268961878, "grad_norm": 0.012602381408214569, "learning_rate": 5.429350215422492e-06, "loss": 0.0008, "step": 7044 }, { "epoch": 1.8587257617728532, "grad_norm": 0.022376710548996925, "learning_rate": 5.428998505231689e-06, "loss": 0.004, "step": 7046 }, { "epoch": 1.8592533966495184, "grad_norm": 0.0816216915845871, "learning_rate": 5.428646795040887e-06, "loss": 0.0049, "step": 7048 }, { "epoch": 1.8597810315261838, "grad_norm": 0.026144234463572502, "learning_rate": 5.428295084850084e-06, "loss": 0.0037, "step": 7050 }, { "epoch": 1.8603086664028492, "grad_norm": 0.4777994155883789, "learning_rate": 5.427943374659281e-06, "loss": 0.0078, "step": 7052 }, { "epoch": 1.8608363012795146, "grad_norm": 0.3426479995250702, "learning_rate": 5.4275916644684784e-06, "loss": 0.0035, "step": 7054 }, { "epoch": 1.86136393615618, "grad_norm": 0.365434855222702, "learning_rate": 5.427239954277675e-06, "loss": 0.0014, "step": 7056 }, { "epoch": 1.8618915710328454, "grad_norm": 0.009177206084132195, "learning_rate": 5.426888244086872e-06, "loss": 0.0007, "step": 7058 }, { "epoch": 1.8624192059095106, "grad_norm": 0.01570485346019268, "learning_rate": 5.42653653389607e-06, "loss": 0.0007, "step": 7060 }, { "epoch": 1.862946840786176, "grad_norm": 0.006361030973494053, "learning_rate": 5.426184823705267e-06, "loss": 0.0049, "step": 7062 }, { "epoch": 1.8634744756628412, "grad_norm": 0.13977067172527313, "learning_rate": 5.425833113514465e-06, "loss": 0.0012, "step": 7064 }, { "epoch": 1.8640021105395066, "grad_norm": 0.07871274650096893, "learning_rate": 5.425481403323661e-06, "loss": 0.0016, "step": 7066 }, { "epoch": 1.864529745416172, "grad_norm": 0.1639704555273056, "learning_rate": 5.425129693132859e-06, "loss": 0.0013, "step": 7068 }, { "epoch": 1.8650573802928374, "grad_norm": 0.08618514239788055, "learning_rate": 5.424777982942056e-06, "loss": 0.0013, "step": 7070 }, { "epoch": 1.8655850151695028, "grad_norm": 0.022069424390792847, "learning_rate": 5.424426272751253e-06, "loss": 0.0008, "step": 7072 }, { "epoch": 1.866112650046168, "grad_norm": 0.20646311342716217, "learning_rate": 5.4240745625604505e-06, "loss": 0.0077, "step": 7074 }, { "epoch": 1.8666402849228334, "grad_norm": 0.5445399284362793, "learning_rate": 5.4237228523696474e-06, "loss": 0.0066, "step": 7076 }, { "epoch": 1.8671679197994986, "grad_norm": 0.19794446229934692, "learning_rate": 5.423371142178845e-06, "loss": 0.0084, "step": 7078 }, { "epoch": 1.867695554676164, "grad_norm": 0.5089738368988037, "learning_rate": 5.423019431988041e-06, "loss": 0.0151, "step": 7080 }, { "epoch": 1.8682231895528294, "grad_norm": 0.05521763861179352, "learning_rate": 5.422667721797239e-06, "loss": 0.0014, "step": 7082 }, { "epoch": 1.8687508244294948, "grad_norm": 0.6457949876785278, "learning_rate": 5.422316011606437e-06, "loss": 0.0029, "step": 7084 }, { "epoch": 1.8692784593061602, "grad_norm": 1.1037269830703735, "learning_rate": 5.421964301415634e-06, "loss": 0.0038, "step": 7086 }, { "epoch": 1.8698060941828256, "grad_norm": 0.03708566352725029, "learning_rate": 5.421612591224831e-06, "loss": 0.0012, "step": 7088 }, { "epoch": 1.8703337290594908, "grad_norm": 0.44713693857192993, "learning_rate": 5.421260881034028e-06, "loss": 0.0032, "step": 7090 }, { "epoch": 1.8708613639361562, "grad_norm": 0.4601665437221527, "learning_rate": 5.420909170843226e-06, "loss": 0.0074, "step": 7092 }, { "epoch": 1.8713889988128214, "grad_norm": 0.0402689091861248, "learning_rate": 5.4205574606524226e-06, "loss": 0.0012, "step": 7094 }, { "epoch": 1.8719166336894868, "grad_norm": 0.008978253230452538, "learning_rate": 5.4202057504616195e-06, "loss": 0.001, "step": 7096 }, { "epoch": 1.8724442685661522, "grad_norm": 0.3040068447589874, "learning_rate": 5.419854040270817e-06, "loss": 0.0054, "step": 7098 }, { "epoch": 1.8729719034428176, "grad_norm": 0.115607850253582, "learning_rate": 5.419502330080014e-06, "loss": 0.0013, "step": 7100 }, { "epoch": 1.873499538319483, "grad_norm": 0.2770068943500519, "learning_rate": 5.419150619889211e-06, "loss": 0.0195, "step": 7102 }, { "epoch": 1.8740271731961484, "grad_norm": 0.114885114133358, "learning_rate": 5.418798909698408e-06, "loss": 0.0111, "step": 7104 }, { "epoch": 1.8745548080728136, "grad_norm": 0.060434967279434204, "learning_rate": 5.418447199507606e-06, "loss": 0.0077, "step": 7106 }, { "epoch": 1.875082442949479, "grad_norm": 0.3682064712047577, "learning_rate": 5.418095489316803e-06, "loss": 0.0064, "step": 7108 }, { "epoch": 1.8756100778261442, "grad_norm": 1.1539156436920166, "learning_rate": 5.417743779126e-06, "loss": 0.0041, "step": 7110 }, { "epoch": 1.8761377127028096, "grad_norm": 0.13845694065093994, "learning_rate": 5.417392068935198e-06, "loss": 0.0013, "step": 7112 }, { "epoch": 1.876665347579475, "grad_norm": 0.017714910209178925, "learning_rate": 5.417040358744395e-06, "loss": 0.0019, "step": 7114 }, { "epoch": 1.8771929824561404, "grad_norm": 0.04682474210858345, "learning_rate": 5.4166886485535924e-06, "loss": 0.001, "step": 7116 }, { "epoch": 1.8777206173328058, "grad_norm": 0.04993218928575516, "learning_rate": 5.416336938362789e-06, "loss": 0.0009, "step": 7118 }, { "epoch": 1.878248252209471, "grad_norm": 0.03580799698829651, "learning_rate": 5.415985228171986e-06, "loss": 0.0013, "step": 7120 }, { "epoch": 1.8787758870861364, "grad_norm": 0.08456449210643768, "learning_rate": 5.415633517981184e-06, "loss": 0.0016, "step": 7122 }, { "epoch": 1.8793035219628016, "grad_norm": 0.07811684906482697, "learning_rate": 5.415281807790381e-06, "loss": 0.001, "step": 7124 }, { "epoch": 1.879831156839467, "grad_norm": 0.008456489071249962, "learning_rate": 5.414930097599578e-06, "loss": 0.0012, "step": 7126 }, { "epoch": 1.8803587917161324, "grad_norm": 0.09307073801755905, "learning_rate": 5.414578387408775e-06, "loss": 0.001, "step": 7128 }, { "epoch": 1.8808864265927978, "grad_norm": 0.014283445663750172, "learning_rate": 5.414226677217973e-06, "loss": 0.0114, "step": 7130 }, { "epoch": 1.8814140614694632, "grad_norm": 0.01658465526998043, "learning_rate": 5.41387496702717e-06, "loss": 0.0044, "step": 7132 }, { "epoch": 1.8819416963461286, "grad_norm": 0.0329047292470932, "learning_rate": 5.413523256836367e-06, "loss": 0.0012, "step": 7134 }, { "epoch": 1.8824693312227938, "grad_norm": 0.17309948801994324, "learning_rate": 5.4131715466455645e-06, "loss": 0.001, "step": 7136 }, { "epoch": 1.8829969660994592, "grad_norm": 0.08823759853839874, "learning_rate": 5.4128198364547614e-06, "loss": 0.0009, "step": 7138 }, { "epoch": 1.8835246009761244, "grad_norm": 0.1733444482088089, "learning_rate": 5.412468126263958e-06, "loss": 0.0014, "step": 7140 }, { "epoch": 1.8840522358527898, "grad_norm": 0.01715058460831642, "learning_rate": 5.412116416073156e-06, "loss": 0.0059, "step": 7142 }, { "epoch": 1.8845798707294552, "grad_norm": 0.5998902320861816, "learning_rate": 5.411764705882353e-06, "loss": 0.0026, "step": 7144 }, { "epoch": 1.8851075056061206, "grad_norm": 0.750779926776886, "learning_rate": 5.411412995691551e-06, "loss": 0.0048, "step": 7146 }, { "epoch": 1.885635140482786, "grad_norm": 0.0842941626906395, "learning_rate": 5.411061285500747e-06, "loss": 0.0012, "step": 7148 }, { "epoch": 1.8861627753594512, "grad_norm": 0.02049078606069088, "learning_rate": 5.410709575309945e-06, "loss": 0.0042, "step": 7150 }, { "epoch": 1.8866904102361166, "grad_norm": 0.0184036735445261, "learning_rate": 5.410357865119142e-06, "loss": 0.0009, "step": 7152 }, { "epoch": 1.8872180451127818, "grad_norm": 0.026819072663784027, "learning_rate": 5.41000615492834e-06, "loss": 0.0008, "step": 7154 }, { "epoch": 1.8877456799894472, "grad_norm": 0.13611343502998352, "learning_rate": 5.4096544447375366e-06, "loss": 0.0012, "step": 7156 }, { "epoch": 1.8882733148661126, "grad_norm": 0.013728918507695198, "learning_rate": 5.4093027345467335e-06, "loss": 0.0016, "step": 7158 }, { "epoch": 1.888800949742778, "grad_norm": 0.7948787808418274, "learning_rate": 5.408951024355931e-06, "loss": 0.0223, "step": 7160 }, { "epoch": 1.8893285846194434, "grad_norm": 0.14991815388202667, "learning_rate": 5.408599314165127e-06, "loss": 0.0086, "step": 7162 }, { "epoch": 1.8898562194961088, "grad_norm": 0.0926903784275055, "learning_rate": 5.408247603974325e-06, "loss": 0.0227, "step": 7164 }, { "epoch": 1.890383854372774, "grad_norm": 0.024483755230903625, "learning_rate": 5.407895893783522e-06, "loss": 0.0075, "step": 7166 }, { "epoch": 1.8909114892494394, "grad_norm": 0.42552533745765686, "learning_rate": 5.40754418359272e-06, "loss": 0.0177, "step": 7168 }, { "epoch": 1.8914391241261046, "grad_norm": 0.3927389681339264, "learning_rate": 5.407192473401917e-06, "loss": 0.0121, "step": 7170 }, { "epoch": 1.89196675900277, "grad_norm": 0.37163445353507996, "learning_rate": 5.406840763211114e-06, "loss": 0.0073, "step": 7172 }, { "epoch": 1.8924943938794354, "grad_norm": 0.07718350738286972, "learning_rate": 5.406489053020312e-06, "loss": 0.0028, "step": 7174 }, { "epoch": 1.8930220287561008, "grad_norm": 0.06994698941707611, "learning_rate": 5.406137342829509e-06, "loss": 0.0011, "step": 7176 }, { "epoch": 1.8935496636327662, "grad_norm": 0.03695470839738846, "learning_rate": 5.4057856326387056e-06, "loss": 0.003, "step": 7178 }, { "epoch": 1.8940772985094316, "grad_norm": 0.05414924770593643, "learning_rate": 5.405433922447903e-06, "loss": 0.0094, "step": 7180 }, { "epoch": 1.8946049333860968, "grad_norm": 0.021210145205259323, "learning_rate": 5.4050822122571e-06, "loss": 0.0052, "step": 7182 }, { "epoch": 1.8951325682627622, "grad_norm": 0.0867697224020958, "learning_rate": 5.404730502066298e-06, "loss": 0.0018, "step": 7184 }, { "epoch": 1.8956602031394274, "grad_norm": 0.08620347082614899, "learning_rate": 5.404378791875494e-06, "loss": 0.0145, "step": 7186 }, { "epoch": 1.8961878380160928, "grad_norm": 0.11968614161014557, "learning_rate": 5.404027081684692e-06, "loss": 0.0037, "step": 7188 }, { "epoch": 1.8967154728927582, "grad_norm": 0.7566429376602173, "learning_rate": 5.403675371493889e-06, "loss": 0.0078, "step": 7190 }, { "epoch": 1.8972431077694236, "grad_norm": 0.035688355565071106, "learning_rate": 5.403323661303086e-06, "loss": 0.0038, "step": 7192 }, { "epoch": 1.897770742646089, "grad_norm": 0.5312109589576721, "learning_rate": 5.402971951112284e-06, "loss": 0.0074, "step": 7194 }, { "epoch": 1.8982983775227542, "grad_norm": 0.23579232394695282, "learning_rate": 5.402620240921481e-06, "loss": 0.0019, "step": 7196 }, { "epoch": 1.8988260123994196, "grad_norm": 0.16733546555042267, "learning_rate": 5.4022685307306785e-06, "loss": 0.0017, "step": 7198 }, { "epoch": 1.8993536472760848, "grad_norm": 0.033814799040555954, "learning_rate": 5.4019168205398754e-06, "loss": 0.0058, "step": 7200 }, { "epoch": 1.8998812821527502, "grad_norm": 0.1100504919886589, "learning_rate": 5.401565110349072e-06, "loss": 0.0013, "step": 7202 }, { "epoch": 1.9004089170294156, "grad_norm": 0.8806285858154297, "learning_rate": 5.40121340015827e-06, "loss": 0.0038, "step": 7204 }, { "epoch": 1.900936551906081, "grad_norm": 0.020297706127166748, "learning_rate": 5.400861689967467e-06, "loss": 0.001, "step": 7206 }, { "epoch": 1.9014641867827464, "grad_norm": 0.020839771255850792, "learning_rate": 5.400509979776664e-06, "loss": 0.001, "step": 7208 }, { "epoch": 1.9019918216594118, "grad_norm": 0.15609219670295715, "learning_rate": 5.400158269585861e-06, "loss": 0.0031, "step": 7210 }, { "epoch": 1.902519456536077, "grad_norm": 0.01985071785748005, "learning_rate": 5.399806559395059e-06, "loss": 0.0023, "step": 7212 }, { "epoch": 1.9030470914127424, "grad_norm": 0.8587923049926758, "learning_rate": 5.399454849204256e-06, "loss": 0.0078, "step": 7214 }, { "epoch": 1.9035747262894076, "grad_norm": 0.011171107180416584, "learning_rate": 5.399103139013453e-06, "loss": 0.0024, "step": 7216 }, { "epoch": 1.904102361166073, "grad_norm": 0.03907221555709839, "learning_rate": 5.3987514288226505e-06, "loss": 0.0016, "step": 7218 }, { "epoch": 1.9046299960427384, "grad_norm": 0.04995122179389, "learning_rate": 5.3983997186318475e-06, "loss": 0.0083, "step": 7220 }, { "epoch": 1.9051576309194038, "grad_norm": 0.5042977333068848, "learning_rate": 5.3980480084410445e-06, "loss": 0.0068, "step": 7222 }, { "epoch": 1.9056852657960692, "grad_norm": 0.2846747934818268, "learning_rate": 5.397696298250241e-06, "loss": 0.002, "step": 7224 }, { "epoch": 1.9062129006727346, "grad_norm": 0.4226668179035187, "learning_rate": 5.397344588059439e-06, "loss": 0.0035, "step": 7226 }, { "epoch": 1.9067405355493998, "grad_norm": 0.15168039500713348, "learning_rate": 5.396992877868637e-06, "loss": 0.0029, "step": 7228 }, { "epoch": 1.907268170426065, "grad_norm": 0.02696707658469677, "learning_rate": 5.396641167677833e-06, "loss": 0.0008, "step": 7230 }, { "epoch": 1.9077958053027304, "grad_norm": 0.5827708840370178, "learning_rate": 5.396289457487031e-06, "loss": 0.0071, "step": 7232 }, { "epoch": 1.9083234401793958, "grad_norm": 0.015522358007729053, "learning_rate": 5.395937747296228e-06, "loss": 0.0007, "step": 7234 }, { "epoch": 1.9088510750560612, "grad_norm": 0.7329305410385132, "learning_rate": 5.395586037105426e-06, "loss": 0.0022, "step": 7236 }, { "epoch": 1.9093787099327266, "grad_norm": 0.2834704518318176, "learning_rate": 5.395234326914623e-06, "loss": 0.0094, "step": 7238 }, { "epoch": 1.909906344809392, "grad_norm": 0.2042616307735443, "learning_rate": 5.3948826167238196e-06, "loss": 0.002, "step": 7240 }, { "epoch": 1.9104339796860572, "grad_norm": 0.5244129300117493, "learning_rate": 5.394530906533017e-06, "loss": 0.002, "step": 7242 }, { "epoch": 1.9109616145627226, "grad_norm": 0.03019142895936966, "learning_rate": 5.394179196342214e-06, "loss": 0.0008, "step": 7244 }, { "epoch": 1.9114892494393878, "grad_norm": 0.05029725655913353, "learning_rate": 5.393827486151411e-06, "loss": 0.0008, "step": 7246 }, { "epoch": 1.9120168843160532, "grad_norm": 0.05018198862671852, "learning_rate": 5.393475775960608e-06, "loss": 0.0102, "step": 7248 }, { "epoch": 1.9125445191927186, "grad_norm": 0.32381075620651245, "learning_rate": 5.393124065769806e-06, "loss": 0.0026, "step": 7250 }, { "epoch": 1.913072154069384, "grad_norm": 0.2632635533809662, "learning_rate": 5.392772355579003e-06, "loss": 0.0014, "step": 7252 }, { "epoch": 1.9135997889460494, "grad_norm": 0.03771102800965309, "learning_rate": 5.3924206453882e-06, "loss": 0.0031, "step": 7254 }, { "epoch": 1.9141274238227148, "grad_norm": 0.05571499839425087, "learning_rate": 5.392068935197398e-06, "loss": 0.0014, "step": 7256 }, { "epoch": 1.91465505869938, "grad_norm": 0.01583034172654152, "learning_rate": 5.391717225006595e-06, "loss": 0.0007, "step": 7258 }, { "epoch": 1.9151826935760454, "grad_norm": 0.20817455649375916, "learning_rate": 5.391365514815792e-06, "loss": 0.0084, "step": 7260 }, { "epoch": 1.9157103284527106, "grad_norm": 0.127214252948761, "learning_rate": 5.3910138046249894e-06, "loss": 0.005, "step": 7262 }, { "epoch": 1.916237963329376, "grad_norm": 0.013029668480157852, "learning_rate": 5.390662094434186e-06, "loss": 0.001, "step": 7264 }, { "epoch": 1.9167655982060414, "grad_norm": 0.24396835267543793, "learning_rate": 5.390310384243384e-06, "loss": 0.0097, "step": 7266 }, { "epoch": 1.9172932330827068, "grad_norm": 0.008828602731227875, "learning_rate": 5.38995867405258e-06, "loss": 0.005, "step": 7268 }, { "epoch": 1.9178208679593722, "grad_norm": 0.009636364877223969, "learning_rate": 5.389606963861778e-06, "loss": 0.0006, "step": 7270 }, { "epoch": 1.9183485028360374, "grad_norm": 0.017257360741496086, "learning_rate": 5.389255253670975e-06, "loss": 0.0007, "step": 7272 }, { "epoch": 1.9188761377127028, "grad_norm": 0.01181272603571415, "learning_rate": 5.388903543480173e-06, "loss": 0.0054, "step": 7274 }, { "epoch": 1.919403772589368, "grad_norm": 0.656717836856842, "learning_rate": 5.38855183328937e-06, "loss": 0.0015, "step": 7276 }, { "epoch": 1.9199314074660334, "grad_norm": 0.09667148441076279, "learning_rate": 5.388200123098567e-06, "loss": 0.001, "step": 7278 }, { "epoch": 1.9204590423426988, "grad_norm": 0.012938588857650757, "learning_rate": 5.3878484129077645e-06, "loss": 0.0011, "step": 7280 }, { "epoch": 1.9209866772193642, "grad_norm": 0.055664412677288055, "learning_rate": 5.387496702716961e-06, "loss": 0.001, "step": 7282 }, { "epoch": 1.9215143120960296, "grad_norm": 0.6420822739601135, "learning_rate": 5.3871449925261584e-06, "loss": 0.0011, "step": 7284 }, { "epoch": 1.922041946972695, "grad_norm": 0.02715984545648098, "learning_rate": 5.386793282335356e-06, "loss": 0.0016, "step": 7286 }, { "epoch": 1.9225695818493602, "grad_norm": 0.11869627237319946, "learning_rate": 5.386441572144553e-06, "loss": 0.0139, "step": 7288 }, { "epoch": 1.9230972167260256, "grad_norm": 0.2598155736923218, "learning_rate": 5.38608986195375e-06, "loss": 0.0038, "step": 7290 }, { "epoch": 1.9236248516026908, "grad_norm": 0.02414396032691002, "learning_rate": 5.385738151762947e-06, "loss": 0.0007, "step": 7292 }, { "epoch": 1.9241524864793562, "grad_norm": 0.1762804090976715, "learning_rate": 5.385386441572145e-06, "loss": 0.001, "step": 7294 }, { "epoch": 1.9246801213560216, "grad_norm": 0.047197334468364716, "learning_rate": 5.385034731381342e-06, "loss": 0.0018, "step": 7296 }, { "epoch": 1.925207756232687, "grad_norm": 0.019051752984523773, "learning_rate": 5.384683021190539e-06, "loss": 0.0017, "step": 7298 }, { "epoch": 1.9257353911093524, "grad_norm": 0.11510805040597916, "learning_rate": 5.384331310999737e-06, "loss": 0.0091, "step": 7300 }, { "epoch": 1.9262630259860178, "grad_norm": 0.23146817088127136, "learning_rate": 5.3839796008089336e-06, "loss": 0.0045, "step": 7302 }, { "epoch": 1.926790660862683, "grad_norm": 0.028088899329304695, "learning_rate": 5.383627890618131e-06, "loss": 0.0007, "step": 7304 }, { "epoch": 1.9273182957393482, "grad_norm": 0.04130669683218002, "learning_rate": 5.3832761804273275e-06, "loss": 0.0113, "step": 7306 }, { "epoch": 1.9278459306160136, "grad_norm": 0.1666133552789688, "learning_rate": 5.382924470236525e-06, "loss": 0.0141, "step": 7308 }, { "epoch": 1.928373565492679, "grad_norm": 0.03883674368262291, "learning_rate": 5.382572760045722e-06, "loss": 0.0012, "step": 7310 }, { "epoch": 1.9289012003693444, "grad_norm": 0.24224936962127686, "learning_rate": 5.382221049854919e-06, "loss": 0.009, "step": 7312 }, { "epoch": 1.9294288352460098, "grad_norm": 0.0591258779168129, "learning_rate": 5.381869339664117e-06, "loss": 0.0009, "step": 7314 }, { "epoch": 1.9299564701226752, "grad_norm": 0.035433076322078705, "learning_rate": 5.381517629473314e-06, "loss": 0.0034, "step": 7316 }, { "epoch": 1.9304841049993404, "grad_norm": 1.1213085651397705, "learning_rate": 5.381165919282512e-06, "loss": 0.0045, "step": 7318 }, { "epoch": 1.9310117398760058, "grad_norm": 0.007351245731115341, "learning_rate": 5.380814209091709e-06, "loss": 0.0006, "step": 7320 }, { "epoch": 1.931539374752671, "grad_norm": 0.05248188599944115, "learning_rate": 5.380462498900906e-06, "loss": 0.0049, "step": 7322 }, { "epoch": 1.9320670096293364, "grad_norm": 0.0781407579779625, "learning_rate": 5.380110788710103e-06, "loss": 0.0076, "step": 7324 }, { "epoch": 1.9325946445060018, "grad_norm": 0.09937616437673569, "learning_rate": 5.3797590785193e-06, "loss": 0.0009, "step": 7326 }, { "epoch": 1.9331222793826672, "grad_norm": 0.14931611716747284, "learning_rate": 5.379407368328497e-06, "loss": 0.0016, "step": 7328 }, { "epoch": 1.9336499142593326, "grad_norm": 0.04634704440832138, "learning_rate": 5.379055658137694e-06, "loss": 0.0026, "step": 7330 }, { "epoch": 1.934177549135998, "grad_norm": 0.06704021990299225, "learning_rate": 5.378703947946892e-06, "loss": 0.0054, "step": 7332 }, { "epoch": 1.9347051840126632, "grad_norm": 0.43789419531822205, "learning_rate": 5.378352237756089e-06, "loss": 0.0126, "step": 7334 }, { "epoch": 1.9352328188893286, "grad_norm": 0.2272530198097229, "learning_rate": 5.378000527565286e-06, "loss": 0.0018, "step": 7336 }, { "epoch": 1.9357604537659938, "grad_norm": 0.015400842763483524, "learning_rate": 5.377648817374484e-06, "loss": 0.0019, "step": 7338 }, { "epoch": 1.9362880886426592, "grad_norm": 0.05090561881661415, "learning_rate": 5.377297107183681e-06, "loss": 0.0046, "step": 7340 }, { "epoch": 1.9368157235193246, "grad_norm": 0.35832467675209045, "learning_rate": 5.376945396992878e-06, "loss": 0.0041, "step": 7342 }, { "epoch": 1.93734335839599, "grad_norm": 0.04571964591741562, "learning_rate": 5.3765936868020755e-06, "loss": 0.0014, "step": 7344 }, { "epoch": 1.9378709932726554, "grad_norm": 0.15087997913360596, "learning_rate": 5.3762419766112724e-06, "loss": 0.0013, "step": 7346 }, { "epoch": 1.9383986281493206, "grad_norm": 0.6170521974563599, "learning_rate": 5.37589026642047e-06, "loss": 0.0047, "step": 7348 }, { "epoch": 1.938926263025986, "grad_norm": 0.044024091213941574, "learning_rate": 5.375538556229666e-06, "loss": 0.004, "step": 7350 }, { "epoch": 1.9394538979026512, "grad_norm": 0.026516376063227654, "learning_rate": 5.375186846038864e-06, "loss": 0.0017, "step": 7352 }, { "epoch": 1.9399815327793166, "grad_norm": 0.7442540526390076, "learning_rate": 5.374835135848061e-06, "loss": 0.0099, "step": 7354 }, { "epoch": 1.940509167655982, "grad_norm": 0.2852506935596466, "learning_rate": 5.374483425657259e-06, "loss": 0.0075, "step": 7356 }, { "epoch": 1.9410368025326474, "grad_norm": 0.14819639921188354, "learning_rate": 5.374131715466456e-06, "loss": 0.0015, "step": 7358 }, { "epoch": 1.9415644374093128, "grad_norm": 0.03375093638896942, "learning_rate": 5.373780005275653e-06, "loss": 0.005, "step": 7360 }, { "epoch": 1.9420920722859782, "grad_norm": 0.054518986493349075, "learning_rate": 5.373428295084851e-06, "loss": 0.0011, "step": 7362 }, { "epoch": 1.9426197071626434, "grad_norm": 0.8771181702613831, "learning_rate": 5.3730765848940475e-06, "loss": 0.0035, "step": 7364 }, { "epoch": 1.9431473420393088, "grad_norm": 0.6068755984306335, "learning_rate": 5.3727248747032445e-06, "loss": 0.0111, "step": 7366 }, { "epoch": 1.943674976915974, "grad_norm": 0.07248653471469879, "learning_rate": 5.3723731645124415e-06, "loss": 0.005, "step": 7368 }, { "epoch": 1.9442026117926394, "grad_norm": 0.3765837550163269, "learning_rate": 5.372021454321639e-06, "loss": 0.0027, "step": 7370 }, { "epoch": 1.9447302466693048, "grad_norm": 0.01461095828562975, "learning_rate": 5.371669744130836e-06, "loss": 0.0124, "step": 7372 }, { "epoch": 1.9452578815459702, "grad_norm": 0.40280070900917053, "learning_rate": 5.371318033940033e-06, "loss": 0.0027, "step": 7374 }, { "epoch": 1.9457855164226356, "grad_norm": 0.1255592554807663, "learning_rate": 5.370966323749231e-06, "loss": 0.001, "step": 7376 }, { "epoch": 1.946313151299301, "grad_norm": 0.05761240795254707, "learning_rate": 5.370614613558428e-06, "loss": 0.001, "step": 7378 }, { "epoch": 1.9468407861759662, "grad_norm": 0.1588171273469925, "learning_rate": 5.370262903367625e-06, "loss": 0.0008, "step": 7380 }, { "epoch": 1.9473684210526314, "grad_norm": 0.019463714212179184, "learning_rate": 5.369911193176823e-06, "loss": 0.0012, "step": 7382 }, { "epoch": 1.9478960559292968, "grad_norm": 0.02582273632287979, "learning_rate": 5.36955948298602e-06, "loss": 0.0079, "step": 7384 }, { "epoch": 1.9484236908059622, "grad_norm": 0.1021297425031662, "learning_rate": 5.369207772795217e-06, "loss": 0.0028, "step": 7386 }, { "epoch": 1.9489513256826276, "grad_norm": 0.3484865725040436, "learning_rate": 5.3688560626044135e-06, "loss": 0.0056, "step": 7388 }, { "epoch": 1.949478960559293, "grad_norm": 0.019234444946050644, "learning_rate": 5.368504352413611e-06, "loss": 0.0056, "step": 7390 }, { "epoch": 1.9500065954359584, "grad_norm": 0.3278183043003082, "learning_rate": 5.368152642222808e-06, "loss": 0.0035, "step": 7392 }, { "epoch": 1.9505342303126236, "grad_norm": 0.04009656980633736, "learning_rate": 5.367800932032006e-06, "loss": 0.0009, "step": 7394 }, { "epoch": 1.951061865189289, "grad_norm": 0.10407160222530365, "learning_rate": 5.367449221841203e-06, "loss": 0.0011, "step": 7396 }, { "epoch": 1.9515895000659542, "grad_norm": 0.3307563066482544, "learning_rate": 5.3670975116504e-06, "loss": 0.0083, "step": 7398 }, { "epoch": 1.9521171349426196, "grad_norm": 0.011592800728976727, "learning_rate": 5.366745801459598e-06, "loss": 0.0007, "step": 7400 }, { "epoch": 1.952644769819285, "grad_norm": 0.024905811995267868, "learning_rate": 5.366394091268795e-06, "loss": 0.0008, "step": 7402 }, { "epoch": 1.9531724046959504, "grad_norm": 0.0694381520152092, "learning_rate": 5.366042381077992e-06, "loss": 0.0012, "step": 7404 }, { "epoch": 1.9537000395726158, "grad_norm": 0.051488202065229416, "learning_rate": 5.3656906708871895e-06, "loss": 0.0009, "step": 7406 }, { "epoch": 1.9542276744492812, "grad_norm": 0.2870785593986511, "learning_rate": 5.3653389606963864e-06, "loss": 0.0054, "step": 7408 }, { "epoch": 1.9547553093259464, "grad_norm": 0.046444956213235855, "learning_rate": 5.364987250505583e-06, "loss": 0.0009, "step": 7410 }, { "epoch": 1.9552829442026118, "grad_norm": 0.18180039525032043, "learning_rate": 5.36463554031478e-06, "loss": 0.0062, "step": 7412 }, { "epoch": 1.955810579079277, "grad_norm": 0.28289058804512024, "learning_rate": 5.364283830123978e-06, "loss": 0.0019, "step": 7414 }, { "epoch": 1.9563382139559424, "grad_norm": 0.014692672528326511, "learning_rate": 5.363932119933175e-06, "loss": 0.0019, "step": 7416 }, { "epoch": 1.9568658488326078, "grad_norm": 0.014851038344204426, "learning_rate": 5.363580409742372e-06, "loss": 0.0008, "step": 7418 }, { "epoch": 1.9573934837092732, "grad_norm": 0.04948257654905319, "learning_rate": 5.36322869955157e-06, "loss": 0.0057, "step": 7420 }, { "epoch": 1.9579211185859386, "grad_norm": 0.4035344421863556, "learning_rate": 5.362876989360767e-06, "loss": 0.0122, "step": 7422 }, { "epoch": 1.9584487534626038, "grad_norm": 0.2042231261730194, "learning_rate": 5.362525279169965e-06, "loss": 0.0041, "step": 7424 }, { "epoch": 1.9589763883392692, "grad_norm": 0.1583019196987152, "learning_rate": 5.362173568979161e-06, "loss": 0.0021, "step": 7426 }, { "epoch": 1.9595040232159344, "grad_norm": 0.04885249584913254, "learning_rate": 5.3618218587883585e-06, "loss": 0.001, "step": 7428 }, { "epoch": 1.9600316580925998, "grad_norm": 0.4468289911746979, "learning_rate": 5.361470148597556e-06, "loss": 0.0102, "step": 7430 }, { "epoch": 1.9605592929692652, "grad_norm": 0.27161145210266113, "learning_rate": 5.361118438406752e-06, "loss": 0.0062, "step": 7432 }, { "epoch": 1.9610869278459306, "grad_norm": 0.32148507237434387, "learning_rate": 5.36076672821595e-06, "loss": 0.0044, "step": 7434 }, { "epoch": 1.961614562722596, "grad_norm": 0.2321450412273407, "learning_rate": 5.360415018025147e-06, "loss": 0.0017, "step": 7436 }, { "epoch": 1.9621421975992615, "grad_norm": 0.008310789242386818, "learning_rate": 5.360063307834345e-06, "loss": 0.0006, "step": 7438 }, { "epoch": 1.9626698324759266, "grad_norm": 0.3294728398323059, "learning_rate": 5.359711597643542e-06, "loss": 0.0027, "step": 7440 }, { "epoch": 1.963197467352592, "grad_norm": 0.023042352870106697, "learning_rate": 5.359359887452739e-06, "loss": 0.0007, "step": 7442 }, { "epoch": 1.9637251022292572, "grad_norm": 1.3926198482513428, "learning_rate": 5.359008177261937e-06, "loss": 0.0067, "step": 7444 }, { "epoch": 1.9642527371059226, "grad_norm": 0.16309243440628052, "learning_rate": 5.358656467071134e-06, "loss": 0.0018, "step": 7446 }, { "epoch": 1.964780371982588, "grad_norm": 0.10816987603902817, "learning_rate": 5.3583047568803306e-06, "loss": 0.002, "step": 7448 }, { "epoch": 1.9653080068592534, "grad_norm": 0.023180222138762474, "learning_rate": 5.3579530466895275e-06, "loss": 0.0015, "step": 7450 }, { "epoch": 1.9658356417359188, "grad_norm": 0.6898549795150757, "learning_rate": 5.357601336498725e-06, "loss": 0.0068, "step": 7452 }, { "epoch": 1.9663632766125843, "grad_norm": 0.49899062514305115, "learning_rate": 5.357249626307923e-06, "loss": 0.0047, "step": 7454 }, { "epoch": 1.9668909114892494, "grad_norm": 0.12925978004932404, "learning_rate": 5.356897916117119e-06, "loss": 0.0096, "step": 7456 }, { "epoch": 1.9674185463659146, "grad_norm": 0.8914574980735779, "learning_rate": 5.356546205926317e-06, "loss": 0.0121, "step": 7458 }, { "epoch": 1.96794618124258, "grad_norm": 0.5562547445297241, "learning_rate": 5.356194495735514e-06, "loss": 0.0038, "step": 7460 }, { "epoch": 1.9684738161192454, "grad_norm": 0.9635394811630249, "learning_rate": 5.355842785544711e-06, "loss": 0.0083, "step": 7462 }, { "epoch": 1.9690014509959108, "grad_norm": 0.23106391727924347, "learning_rate": 5.355491075353909e-06, "loss": 0.0099, "step": 7464 }, { "epoch": 1.9695290858725762, "grad_norm": 0.19416101276874542, "learning_rate": 5.355139365163106e-06, "loss": 0.0221, "step": 7466 }, { "epoch": 1.9700567207492417, "grad_norm": 0.3167969882488251, "learning_rate": 5.3547876549723035e-06, "loss": 0.0017, "step": 7468 }, { "epoch": 1.9705843556259068, "grad_norm": 0.22039294242858887, "learning_rate": 5.3544359447814996e-06, "loss": 0.0119, "step": 7470 }, { "epoch": 1.9711119905025722, "grad_norm": 0.05957667902112007, "learning_rate": 5.354084234590697e-06, "loss": 0.001, "step": 7472 }, { "epoch": 1.9716396253792374, "grad_norm": 0.047108497470617294, "learning_rate": 5.353732524399894e-06, "loss": 0.0011, "step": 7474 }, { "epoch": 1.9721672602559028, "grad_norm": 0.2809363305568695, "learning_rate": 5.353380814209092e-06, "loss": 0.0066, "step": 7476 }, { "epoch": 1.9726948951325682, "grad_norm": 0.9469135403633118, "learning_rate": 5.353029104018289e-06, "loss": 0.0052, "step": 7478 }, { "epoch": 1.9732225300092336, "grad_norm": 0.19523940980434418, "learning_rate": 5.352677393827486e-06, "loss": 0.0028, "step": 7480 }, { "epoch": 1.973750164885899, "grad_norm": 0.027505191043019295, "learning_rate": 5.352325683636684e-06, "loss": 0.0045, "step": 7482 }, { "epoch": 1.9742777997625645, "grad_norm": 0.02946813590824604, "learning_rate": 5.351973973445881e-06, "loss": 0.0051, "step": 7484 }, { "epoch": 1.9748054346392296, "grad_norm": 0.6056808829307556, "learning_rate": 5.351622263255078e-06, "loss": 0.0159, "step": 7486 }, { "epoch": 1.975333069515895, "grad_norm": 0.16613125801086426, "learning_rate": 5.3512705530642755e-06, "loss": 0.0016, "step": 7488 }, { "epoch": 1.9758607043925602, "grad_norm": 0.2754090428352356, "learning_rate": 5.3509188428734725e-06, "loss": 0.0025, "step": 7490 }, { "epoch": 1.9763883392692256, "grad_norm": 0.14702218770980835, "learning_rate": 5.35056713268267e-06, "loss": 0.0146, "step": 7492 }, { "epoch": 1.976915974145891, "grad_norm": 0.06694843620061874, "learning_rate": 5.350215422491866e-06, "loss": 0.0056, "step": 7494 }, { "epoch": 1.9774436090225564, "grad_norm": 0.36391428112983704, "learning_rate": 5.349863712301064e-06, "loss": 0.0097, "step": 7496 }, { "epoch": 1.9779712438992219, "grad_norm": 0.12019544839859009, "learning_rate": 5.349512002110261e-06, "loss": 0.0042, "step": 7498 }, { "epoch": 1.978498878775887, "grad_norm": 0.6007609367370605, "learning_rate": 5.349160291919458e-06, "loss": 0.0048, "step": 7500 }, { "epoch": 1.9790265136525524, "grad_norm": 0.17026235163211823, "learning_rate": 5.348808581728656e-06, "loss": 0.003, "step": 7502 }, { "epoch": 1.9795541485292176, "grad_norm": 0.4854772686958313, "learning_rate": 5.348456871537853e-06, "loss": 0.0036, "step": 7504 }, { "epoch": 1.980081783405883, "grad_norm": 0.10669800639152527, "learning_rate": 5.348105161347051e-06, "loss": 0.0016, "step": 7506 }, { "epoch": 1.9806094182825484, "grad_norm": 0.07477548718452454, "learning_rate": 5.347753451156247e-06, "loss": 0.0018, "step": 7508 }, { "epoch": 1.9811370531592138, "grad_norm": 0.04367079213261604, "learning_rate": 5.3474017409654446e-06, "loss": 0.0013, "step": 7510 }, { "epoch": 1.9816646880358793, "grad_norm": 0.02988153137266636, "learning_rate": 5.347050030774642e-06, "loss": 0.0018, "step": 7512 }, { "epoch": 1.9821923229125447, "grad_norm": 0.04884495586156845, "learning_rate": 5.346698320583839e-06, "loss": 0.0009, "step": 7514 }, { "epoch": 1.9827199577892098, "grad_norm": 0.24405978620052338, "learning_rate": 5.346346610393036e-06, "loss": 0.0026, "step": 7516 }, { "epoch": 1.9832475926658752, "grad_norm": 0.26876533031463623, "learning_rate": 5.345994900202233e-06, "loss": 0.0065, "step": 7518 }, { "epoch": 1.9837752275425404, "grad_norm": 0.13706852495670319, "learning_rate": 5.345643190011431e-06, "loss": 0.0168, "step": 7520 }, { "epoch": 1.9843028624192058, "grad_norm": 0.6464529037475586, "learning_rate": 5.345291479820628e-06, "loss": 0.0105, "step": 7522 }, { "epoch": 1.9848304972958712, "grad_norm": 0.01626642607152462, "learning_rate": 5.344939769629825e-06, "loss": 0.0007, "step": 7524 }, { "epoch": 1.9853581321725366, "grad_norm": 0.16217544674873352, "learning_rate": 5.344588059439023e-06, "loss": 0.0017, "step": 7526 }, { "epoch": 1.985885767049202, "grad_norm": 0.10218197852373123, "learning_rate": 5.34423634924822e-06, "loss": 0.009, "step": 7528 }, { "epoch": 1.9864134019258675, "grad_norm": 0.06368608772754669, "learning_rate": 5.343884639057417e-06, "loss": 0.004, "step": 7530 }, { "epoch": 1.9869410368025326, "grad_norm": 0.04898801073431969, "learning_rate": 5.3435329288666136e-06, "loss": 0.0082, "step": 7532 }, { "epoch": 1.9874686716791978, "grad_norm": 0.11517906934022903, "learning_rate": 5.343181218675811e-06, "loss": 0.005, "step": 7534 }, { "epoch": 1.9879963065558632, "grad_norm": 0.25409063696861267, "learning_rate": 5.342829508485008e-06, "loss": 0.0015, "step": 7536 }, { "epoch": 1.9885239414325286, "grad_norm": 0.31451940536499023, "learning_rate": 5.342477798294205e-06, "loss": 0.0021, "step": 7538 }, { "epoch": 1.989051576309194, "grad_norm": 0.0234073456376791, "learning_rate": 5.342126088103403e-06, "loss": 0.0008, "step": 7540 }, { "epoch": 1.9895792111858595, "grad_norm": 0.06589966267347336, "learning_rate": 5.3417743779126e-06, "loss": 0.0039, "step": 7542 }, { "epoch": 1.9901068460625249, "grad_norm": 0.08927473425865173, "learning_rate": 5.341422667721798e-06, "loss": 0.0142, "step": 7544 }, { "epoch": 1.99063448093919, "grad_norm": 0.07531983405351639, "learning_rate": 5.341070957530995e-06, "loss": 0.0122, "step": 7546 }, { "epoch": 1.9911621158158554, "grad_norm": 0.1268133968114853, "learning_rate": 5.340719247340192e-06, "loss": 0.0012, "step": 7548 }, { "epoch": 1.9916897506925206, "grad_norm": 0.041313912719488144, "learning_rate": 5.3403675371493895e-06, "loss": 0.0018, "step": 7550 }, { "epoch": 1.992217385569186, "grad_norm": 0.028343867510557175, "learning_rate": 5.3400158269585865e-06, "loss": 0.0007, "step": 7552 }, { "epoch": 1.9927450204458514, "grad_norm": 0.06997419893741608, "learning_rate": 5.3396641167677834e-06, "loss": 0.0011, "step": 7554 }, { "epoch": 1.9932726553225169, "grad_norm": 0.009557254612445831, "learning_rate": 5.33931240657698e-06, "loss": 0.0008, "step": 7556 }, { "epoch": 1.9938002901991823, "grad_norm": 0.09696868807077408, "learning_rate": 5.338960696386178e-06, "loss": 0.0044, "step": 7558 }, { "epoch": 1.9943279250758477, "grad_norm": 0.014795721508562565, "learning_rate": 5.338608986195375e-06, "loss": 0.0015, "step": 7560 }, { "epoch": 1.9948555599525128, "grad_norm": 0.018224384635686874, "learning_rate": 5.338257276004572e-06, "loss": 0.003, "step": 7562 }, { "epoch": 1.9953831948291783, "grad_norm": 0.005175348836928606, "learning_rate": 5.33790556581377e-06, "loss": 0.001, "step": 7564 }, { "epoch": 1.9959108297058434, "grad_norm": 0.04045415669679642, "learning_rate": 5.337553855622967e-06, "loss": 0.0096, "step": 7566 }, { "epoch": 1.9964384645825088, "grad_norm": 0.17164084315299988, "learning_rate": 5.337202145432164e-06, "loss": 0.0013, "step": 7568 }, { "epoch": 1.9969660994591742, "grad_norm": 0.029378315433859825, "learning_rate": 5.336850435241362e-06, "loss": 0.0217, "step": 7570 }, { "epoch": 1.9974937343358397, "grad_norm": 0.11905122548341751, "learning_rate": 5.3364987250505585e-06, "loss": 0.0017, "step": 7572 }, { "epoch": 1.998021369212505, "grad_norm": 0.03603150695562363, "learning_rate": 5.336147014859756e-06, "loss": 0.0021, "step": 7574 }, { "epoch": 1.9985490040891702, "grad_norm": 0.07525298744440079, "learning_rate": 5.3357953046689524e-06, "loss": 0.0019, "step": 7576 }, { "epoch": 1.9990766389658357, "grad_norm": 0.2558468282222748, "learning_rate": 5.33544359447815e-06, "loss": 0.0051, "step": 7578 }, { "epoch": 1.9996042738425008, "grad_norm": 0.06117354705929756, "learning_rate": 5.335091884287347e-06, "loss": 0.003, "step": 7580 }, { "epoch": 2.0, "grad_norm": 0.1735069304704666, "learning_rate": 5.334740174096545e-06, "loss": 0.0068, "step": 7582 }, { "epoch": 2.0005276348766654, "grad_norm": 0.1078600361943245, "learning_rate": 5.334388463905742e-06, "loss": 0.008, "step": 7584 }, { "epoch": 2.001055269753331, "grad_norm": 0.013776451349258423, "learning_rate": 5.334036753714939e-06, "loss": 0.0049, "step": 7586 }, { "epoch": 2.001582904629996, "grad_norm": 0.022813057526946068, "learning_rate": 5.333685043524137e-06, "loss": 0.0008, "step": 7588 }, { "epoch": 2.002110539506661, "grad_norm": 0.8424115180969238, "learning_rate": 5.333333333333333e-06, "loss": 0.0021, "step": 7590 }, { "epoch": 2.0026381743833266, "grad_norm": 0.09484437853097916, "learning_rate": 5.332981623142531e-06, "loss": 0.0012, "step": 7592 }, { "epoch": 2.003165809259992, "grad_norm": 0.039806924760341644, "learning_rate": 5.3326299129517276e-06, "loss": 0.0068, "step": 7594 }, { "epoch": 2.0036934441366574, "grad_norm": 0.010632372461259365, "learning_rate": 5.332278202760925e-06, "loss": 0.006, "step": 7596 }, { "epoch": 2.004221079013323, "grad_norm": 0.13177156448364258, "learning_rate": 5.331926492570122e-06, "loss": 0.0021, "step": 7598 }, { "epoch": 2.004748713889988, "grad_norm": 0.137518972158432, "learning_rate": 5.331574782379319e-06, "loss": 0.0109, "step": 7600 }, { "epoch": 2.0052763487666536, "grad_norm": 0.05473814159631729, "learning_rate": 5.331223072188517e-06, "loss": 0.002, "step": 7602 }, { "epoch": 2.005803983643319, "grad_norm": 0.5328726172447205, "learning_rate": 5.330871361997714e-06, "loss": 0.0034, "step": 7604 }, { "epoch": 2.006331618519984, "grad_norm": 0.09307392686605453, "learning_rate": 5.330519651806911e-06, "loss": 0.001, "step": 7606 }, { "epoch": 2.0068592533966494, "grad_norm": 0.04515058174729347, "learning_rate": 5.330167941616109e-06, "loss": 0.0031, "step": 7608 }, { "epoch": 2.007386888273315, "grad_norm": 0.5790168046951294, "learning_rate": 5.329816231425306e-06, "loss": 0.0056, "step": 7610 }, { "epoch": 2.00791452314998, "grad_norm": 0.04649742692708969, "learning_rate": 5.3294645212345035e-06, "loss": 0.0023, "step": 7612 }, { "epoch": 2.0084421580266456, "grad_norm": 0.09179841727018356, "learning_rate": 5.3291128110437e-06, "loss": 0.001, "step": 7614 }, { "epoch": 2.008969792903311, "grad_norm": 0.050783030688762665, "learning_rate": 5.3287611008528974e-06, "loss": 0.0037, "step": 7616 }, { "epoch": 2.0094974277799764, "grad_norm": 0.021731005981564522, "learning_rate": 5.328409390662094e-06, "loss": 0.001, "step": 7618 }, { "epoch": 2.0100250626566414, "grad_norm": 0.038512516766786575, "learning_rate": 5.328057680471291e-06, "loss": 0.001, "step": 7620 }, { "epoch": 2.010552697533307, "grad_norm": 0.015809105709195137, "learning_rate": 5.327705970280489e-06, "loss": 0.0006, "step": 7622 }, { "epoch": 2.011080332409972, "grad_norm": 0.3707762658596039, "learning_rate": 5.327354260089686e-06, "loss": 0.0076, "step": 7624 }, { "epoch": 2.0116079672866376, "grad_norm": 0.34519460797309875, "learning_rate": 5.327002549898884e-06, "loss": 0.0036, "step": 7626 }, { "epoch": 2.012135602163303, "grad_norm": 0.1766204982995987, "learning_rate": 5.326650839708081e-06, "loss": 0.0013, "step": 7628 }, { "epoch": 2.0126632370399684, "grad_norm": 0.1947007179260254, "learning_rate": 5.326299129517278e-06, "loss": 0.0016, "step": 7630 }, { "epoch": 2.013190871916634, "grad_norm": 0.08802849054336548, "learning_rate": 5.325947419326476e-06, "loss": 0.0009, "step": 7632 }, { "epoch": 2.0137185067932992, "grad_norm": 0.010629210621118546, "learning_rate": 5.3255957091356725e-06, "loss": 0.0007, "step": 7634 }, { "epoch": 2.014246141669964, "grad_norm": 0.200439453125, "learning_rate": 5.3252439989448695e-06, "loss": 0.001, "step": 7636 }, { "epoch": 2.0147737765466296, "grad_norm": 0.7521776556968689, "learning_rate": 5.3248922887540664e-06, "loss": 0.0113, "step": 7638 }, { "epoch": 2.015301411423295, "grad_norm": 0.2581040859222412, "learning_rate": 5.324540578563264e-06, "loss": 0.0021, "step": 7640 }, { "epoch": 2.0158290462999604, "grad_norm": 0.015294553712010384, "learning_rate": 5.324188868372461e-06, "loss": 0.0014, "step": 7642 }, { "epoch": 2.016356681176626, "grad_norm": 0.038068633526563644, "learning_rate": 5.323837158181658e-06, "loss": 0.0068, "step": 7644 }, { "epoch": 2.016884316053291, "grad_norm": 0.8750513792037964, "learning_rate": 5.323485447990856e-06, "loss": 0.0073, "step": 7646 }, { "epoch": 2.0174119509299566, "grad_norm": 0.3973555266857147, "learning_rate": 5.323133737800053e-06, "loss": 0.0059, "step": 7648 }, { "epoch": 2.017939585806622, "grad_norm": 0.2247975617647171, "learning_rate": 5.32278202760925e-06, "loss": 0.0096, "step": 7650 }, { "epoch": 2.018467220683287, "grad_norm": 0.6935496926307678, "learning_rate": 5.322430317418447e-06, "loss": 0.0108, "step": 7652 }, { "epoch": 2.0189948555599524, "grad_norm": 0.2511788010597229, "learning_rate": 5.322078607227645e-06, "loss": 0.0091, "step": 7654 }, { "epoch": 2.019522490436618, "grad_norm": 0.48659029603004456, "learning_rate": 5.321726897036842e-06, "loss": 0.0057, "step": 7656 }, { "epoch": 2.020050125313283, "grad_norm": 0.015965957194566727, "learning_rate": 5.3213751868460385e-06, "loss": 0.0008, "step": 7658 }, { "epoch": 2.0205777601899486, "grad_norm": 0.12582623958587646, "learning_rate": 5.321023476655236e-06, "loss": 0.0023, "step": 7660 }, { "epoch": 2.021105395066614, "grad_norm": 0.026307031512260437, "learning_rate": 5.320671766464433e-06, "loss": 0.0048, "step": 7662 }, { "epoch": 2.0216330299432794, "grad_norm": 0.019831879064440727, "learning_rate": 5.320320056273631e-06, "loss": 0.0008, "step": 7664 }, { "epoch": 2.0221606648199444, "grad_norm": 0.1572244018316269, "learning_rate": 5.319968346082828e-06, "loss": 0.0107, "step": 7666 }, { "epoch": 2.02268829969661, "grad_norm": 0.20759578049182892, "learning_rate": 5.319616635892025e-06, "loss": 0.0021, "step": 7668 }, { "epoch": 2.023215934573275, "grad_norm": 0.11051427572965622, "learning_rate": 5.319264925701223e-06, "loss": 0.0007, "step": 7670 }, { "epoch": 2.0237435694499406, "grad_norm": 0.014105690643191338, "learning_rate": 5.31891321551042e-06, "loss": 0.0007, "step": 7672 }, { "epoch": 2.024271204326606, "grad_norm": 0.036109842360019684, "learning_rate": 5.318561505319617e-06, "loss": 0.0008, "step": 7674 }, { "epoch": 2.0247988392032714, "grad_norm": 0.04995192587375641, "learning_rate": 5.318209795128814e-06, "loss": 0.0008, "step": 7676 }, { "epoch": 2.025326474079937, "grad_norm": 0.10122929513454437, "learning_rate": 5.317858084938011e-06, "loss": 0.0021, "step": 7678 }, { "epoch": 2.0258541089566022, "grad_norm": 0.11584475636482239, "learning_rate": 5.317506374747208e-06, "loss": 0.0012, "step": 7680 }, { "epoch": 2.026381743833267, "grad_norm": 0.16981260478496552, "learning_rate": 5.317154664556405e-06, "loss": 0.0071, "step": 7682 }, { "epoch": 2.0269093787099326, "grad_norm": 0.21485431492328644, "learning_rate": 5.316802954365603e-06, "loss": 0.0109, "step": 7684 }, { "epoch": 2.027437013586598, "grad_norm": 0.08724717050790787, "learning_rate": 5.3164512441748e-06, "loss": 0.0009, "step": 7686 }, { "epoch": 2.0279646484632634, "grad_norm": 0.20143641531467438, "learning_rate": 5.316099533983997e-06, "loss": 0.0086, "step": 7688 }, { "epoch": 2.028492283339929, "grad_norm": 0.49361199140548706, "learning_rate": 5.315747823793195e-06, "loss": 0.014, "step": 7690 }, { "epoch": 2.029019918216594, "grad_norm": 0.015696216374635696, "learning_rate": 5.315396113602392e-06, "loss": 0.0109, "step": 7692 }, { "epoch": 2.0295475530932596, "grad_norm": 0.018450727686285973, "learning_rate": 5.31504440341159e-06, "loss": 0.0012, "step": 7694 }, { "epoch": 2.030075187969925, "grad_norm": 0.29936903715133667, "learning_rate": 5.314692693220786e-06, "loss": 0.0056, "step": 7696 }, { "epoch": 2.03060282284659, "grad_norm": 0.00855297688394785, "learning_rate": 5.3143409830299835e-06, "loss": 0.0006, "step": 7698 }, { "epoch": 2.0311304577232554, "grad_norm": 0.06842666864395142, "learning_rate": 5.3139892728391804e-06, "loss": 0.0013, "step": 7700 }, { "epoch": 2.031658092599921, "grad_norm": 0.2262687236070633, "learning_rate": 5.313637562648378e-06, "loss": 0.0011, "step": 7702 }, { "epoch": 2.032185727476586, "grad_norm": 0.8808127045631409, "learning_rate": 5.313285852457575e-06, "loss": 0.0056, "step": 7704 }, { "epoch": 2.0327133623532516, "grad_norm": 0.2273581176996231, "learning_rate": 5.312934142266772e-06, "loss": 0.0098, "step": 7706 }, { "epoch": 2.033240997229917, "grad_norm": 0.015061980113387108, "learning_rate": 5.31258243207597e-06, "loss": 0.0006, "step": 7708 }, { "epoch": 2.0337686321065824, "grad_norm": 0.022064892575144768, "learning_rate": 5.312230721885166e-06, "loss": 0.0066, "step": 7710 }, { "epoch": 2.0342962669832474, "grad_norm": 0.059012800455093384, "learning_rate": 5.311879011694364e-06, "loss": 0.0007, "step": 7712 }, { "epoch": 2.034823901859913, "grad_norm": 0.006628058385103941, "learning_rate": 5.311527301503562e-06, "loss": 0.0099, "step": 7714 }, { "epoch": 2.035351536736578, "grad_norm": 0.01126441266387701, "learning_rate": 5.311175591312759e-06, "loss": 0.0006, "step": 7716 }, { "epoch": 2.0358791716132436, "grad_norm": 0.7572453618049622, "learning_rate": 5.3108238811219555e-06, "loss": 0.0022, "step": 7718 }, { "epoch": 2.036406806489909, "grad_norm": 0.10054872184991837, "learning_rate": 5.3104721709311525e-06, "loss": 0.001, "step": 7720 }, { "epoch": 2.0369344413665744, "grad_norm": 0.2883835434913635, "learning_rate": 5.31012046074035e-06, "loss": 0.0032, "step": 7722 }, { "epoch": 2.03746207624324, "grad_norm": 0.22393925487995148, "learning_rate": 5.309768750549547e-06, "loss": 0.0017, "step": 7724 }, { "epoch": 2.0379897111199052, "grad_norm": 0.0964992344379425, "learning_rate": 5.309417040358744e-06, "loss": 0.0012, "step": 7726 }, { "epoch": 2.03851734599657, "grad_norm": 0.00983272586017847, "learning_rate": 5.309065330167942e-06, "loss": 0.0014, "step": 7728 }, { "epoch": 2.0390449808732356, "grad_norm": 0.01744045875966549, "learning_rate": 5.308713619977139e-06, "loss": 0.0038, "step": 7730 }, { "epoch": 2.039572615749901, "grad_norm": 0.008725975640118122, "learning_rate": 5.308361909786337e-06, "loss": 0.0006, "step": 7732 }, { "epoch": 2.0401002506265664, "grad_norm": 0.006355857010930777, "learning_rate": 5.308010199595533e-06, "loss": 0.0011, "step": 7734 }, { "epoch": 2.040627885503232, "grad_norm": 0.6256353259086609, "learning_rate": 5.307658489404731e-06, "loss": 0.0064, "step": 7736 }, { "epoch": 2.0411555203798972, "grad_norm": 0.30852729082107544, "learning_rate": 5.3073067792139285e-06, "loss": 0.0017, "step": 7738 }, { "epoch": 2.0416831552565626, "grad_norm": 0.19609858095645905, "learning_rate": 5.3069550690231246e-06, "loss": 0.0103, "step": 7740 }, { "epoch": 2.0422107901332276, "grad_norm": 0.009266616776585579, "learning_rate": 5.306603358832322e-06, "loss": 0.005, "step": 7742 }, { "epoch": 2.042738425009893, "grad_norm": 0.2687920928001404, "learning_rate": 5.306251648641519e-06, "loss": 0.0033, "step": 7744 }, { "epoch": 2.0432660598865584, "grad_norm": 0.038870133459568024, "learning_rate": 5.305899938450717e-06, "loss": 0.0011, "step": 7746 }, { "epoch": 2.043793694763224, "grad_norm": 0.5125489830970764, "learning_rate": 5.305548228259914e-06, "loss": 0.003, "step": 7748 }, { "epoch": 2.044321329639889, "grad_norm": 0.18117594718933105, "learning_rate": 5.305196518069111e-06, "loss": 0.0025, "step": 7750 }, { "epoch": 2.0448489645165546, "grad_norm": 0.03535382077097893, "learning_rate": 5.304844807878309e-06, "loss": 0.0014, "step": 7752 }, { "epoch": 2.04537659939322, "grad_norm": 0.018970629200339317, "learning_rate": 5.304493097687506e-06, "loss": 0.0007, "step": 7754 }, { "epoch": 2.0459042342698854, "grad_norm": 0.07929360866546631, "learning_rate": 5.304141387496703e-06, "loss": 0.001, "step": 7756 }, { "epoch": 2.0464318691465504, "grad_norm": 0.017472319304943085, "learning_rate": 5.3037896773059e-06, "loss": 0.0013, "step": 7758 }, { "epoch": 2.046959504023216, "grad_norm": 0.17181703448295593, "learning_rate": 5.3034379671150975e-06, "loss": 0.0011, "step": 7760 }, { "epoch": 2.047487138899881, "grad_norm": 0.16731767356395721, "learning_rate": 5.3030862569242944e-06, "loss": 0.0036, "step": 7762 }, { "epoch": 2.0480147737765466, "grad_norm": 0.5974064469337463, "learning_rate": 5.302734546733491e-06, "loss": 0.0032, "step": 7764 }, { "epoch": 2.048542408653212, "grad_norm": 0.018310630694031715, "learning_rate": 5.302382836542689e-06, "loss": 0.0007, "step": 7766 }, { "epoch": 2.0490700435298774, "grad_norm": 0.011407332494854927, "learning_rate": 5.302031126351886e-06, "loss": 0.0014, "step": 7768 }, { "epoch": 2.049597678406543, "grad_norm": 0.0573938749730587, "learning_rate": 5.301679416161083e-06, "loss": 0.0039, "step": 7770 }, { "epoch": 2.050125313283208, "grad_norm": 0.06370183825492859, "learning_rate": 5.301327705970281e-06, "loss": 0.0079, "step": 7772 }, { "epoch": 2.050652948159873, "grad_norm": 0.0869840532541275, "learning_rate": 5.300975995779478e-06, "loss": 0.0012, "step": 7774 }, { "epoch": 2.0511805830365386, "grad_norm": 0.02095695026218891, "learning_rate": 5.300624285588676e-06, "loss": 0.0069, "step": 7776 }, { "epoch": 2.051708217913204, "grad_norm": 0.009206429123878479, "learning_rate": 5.300272575397872e-06, "loss": 0.0013, "step": 7778 }, { "epoch": 2.0522358527898694, "grad_norm": 0.04907023161649704, "learning_rate": 5.2999208652070695e-06, "loss": 0.0006, "step": 7780 }, { "epoch": 2.052763487666535, "grad_norm": 0.02712984010577202, "learning_rate": 5.2995691550162665e-06, "loss": 0.0008, "step": 7782 }, { "epoch": 2.0532911225432002, "grad_norm": 0.014930342324078083, "learning_rate": 5.299217444825464e-06, "loss": 0.0083, "step": 7784 }, { "epoch": 2.0538187574198656, "grad_norm": 0.22814896702766418, "learning_rate": 5.298865734634661e-06, "loss": 0.0015, "step": 7786 }, { "epoch": 2.0543463922965306, "grad_norm": 0.02890465222299099, "learning_rate": 5.298514024443858e-06, "loss": 0.0024, "step": 7788 }, { "epoch": 2.054874027173196, "grad_norm": 0.036391545087099075, "learning_rate": 5.298162314253056e-06, "loss": 0.0041, "step": 7790 }, { "epoch": 2.0554016620498614, "grad_norm": 0.009564150124788284, "learning_rate": 5.297810604062253e-06, "loss": 0.0007, "step": 7792 }, { "epoch": 2.055929296926527, "grad_norm": 0.15559206902980804, "learning_rate": 5.29745889387145e-06, "loss": 0.0066, "step": 7794 }, { "epoch": 2.0564569318031922, "grad_norm": 0.7723419666290283, "learning_rate": 5.297107183680648e-06, "loss": 0.0041, "step": 7796 }, { "epoch": 2.0569845666798576, "grad_norm": 0.12363036721944809, "learning_rate": 5.296755473489845e-06, "loss": 0.0024, "step": 7798 }, { "epoch": 2.057512201556523, "grad_norm": 0.14211761951446533, "learning_rate": 5.296403763299042e-06, "loss": 0.0012, "step": 7800 }, { "epoch": 2.0580398364331884, "grad_norm": 0.1249900609254837, "learning_rate": 5.2960520531082386e-06, "loss": 0.0011, "step": 7802 }, { "epoch": 2.0585674713098534, "grad_norm": 0.013907692395150661, "learning_rate": 5.295700342917436e-06, "loss": 0.0028, "step": 7804 }, { "epoch": 2.059095106186519, "grad_norm": 0.010091076605021954, "learning_rate": 5.295348632726633e-06, "loss": 0.0007, "step": 7806 }, { "epoch": 2.059622741063184, "grad_norm": 0.4143964350223541, "learning_rate": 5.29499692253583e-06, "loss": 0.0028, "step": 7808 }, { "epoch": 2.0601503759398496, "grad_norm": 0.11180243641138077, "learning_rate": 5.294645212345028e-06, "loss": 0.001, "step": 7810 }, { "epoch": 2.060678010816515, "grad_norm": 0.01494042482227087, "learning_rate": 5.294293502154225e-06, "loss": 0.0006, "step": 7812 }, { "epoch": 2.0612056456931804, "grad_norm": 0.48547104001045227, "learning_rate": 5.293941791963423e-06, "loss": 0.0142, "step": 7814 }, { "epoch": 2.061733280569846, "grad_norm": 0.45004868507385254, "learning_rate": 5.293590081772619e-06, "loss": 0.0044, "step": 7816 }, { "epoch": 2.062260915446511, "grad_norm": 0.3945074677467346, "learning_rate": 5.293238371581817e-06, "loss": 0.0036, "step": 7818 }, { "epoch": 2.062788550323176, "grad_norm": 0.012873604893684387, "learning_rate": 5.292886661391014e-06, "loss": 0.0036, "step": 7820 }, { "epoch": 2.0633161851998416, "grad_norm": 0.007101363502442837, "learning_rate": 5.2925349512002115e-06, "loss": 0.0006, "step": 7822 }, { "epoch": 2.063843820076507, "grad_norm": 0.2451222687959671, "learning_rate": 5.292183241009408e-06, "loss": 0.0049, "step": 7824 }, { "epoch": 2.0643714549531724, "grad_norm": 0.344256192445755, "learning_rate": 5.291831530818605e-06, "loss": 0.0019, "step": 7826 }, { "epoch": 2.064899089829838, "grad_norm": 0.031183777377009392, "learning_rate": 5.291479820627803e-06, "loss": 0.001, "step": 7828 }, { "epoch": 2.0654267247065032, "grad_norm": 0.3272712826728821, "learning_rate": 5.291128110437e-06, "loss": 0.0081, "step": 7830 }, { "epoch": 2.0659543595831686, "grad_norm": 0.06424051523208618, "learning_rate": 5.290776400246197e-06, "loss": 0.0006, "step": 7832 }, { "epoch": 2.0664819944598336, "grad_norm": 0.011881220154464245, "learning_rate": 5.290424690055395e-06, "loss": 0.0035, "step": 7834 }, { "epoch": 2.067009629336499, "grad_norm": 0.01268807053565979, "learning_rate": 5.290072979864592e-06, "loss": 0.0069, "step": 7836 }, { "epoch": 2.0675372642131644, "grad_norm": 0.13879072666168213, "learning_rate": 5.289721269673789e-06, "loss": 0.002, "step": 7838 }, { "epoch": 2.06806489908983, "grad_norm": 0.013797208666801453, "learning_rate": 5.289369559482986e-06, "loss": 0.0006, "step": 7840 }, { "epoch": 2.0685925339664952, "grad_norm": 0.020719358697533607, "learning_rate": 5.2890178492921835e-06, "loss": 0.0007, "step": 7842 }, { "epoch": 2.0691201688431606, "grad_norm": 0.024540867656469345, "learning_rate": 5.2886661391013805e-06, "loss": 0.0013, "step": 7844 }, { "epoch": 2.069647803719826, "grad_norm": 0.42454567551612854, "learning_rate": 5.2883144289105774e-06, "loss": 0.0018, "step": 7846 }, { "epoch": 2.0701754385964914, "grad_norm": 0.2090650051832199, "learning_rate": 5.287962718719775e-06, "loss": 0.0074, "step": 7848 }, { "epoch": 2.0707030734731564, "grad_norm": 0.31742802262306213, "learning_rate": 5.287611008528972e-06, "loss": 0.0014, "step": 7850 }, { "epoch": 2.071230708349822, "grad_norm": 0.027379481121897697, "learning_rate": 5.28725929833817e-06, "loss": 0.0013, "step": 7852 }, { "epoch": 2.071758343226487, "grad_norm": 0.02081502415239811, "learning_rate": 5.286907588147367e-06, "loss": 0.0008, "step": 7854 }, { "epoch": 2.0722859781031526, "grad_norm": 0.21183112263679504, "learning_rate": 5.286555877956564e-06, "loss": 0.0033, "step": 7856 }, { "epoch": 2.072813612979818, "grad_norm": 0.16321299970149994, "learning_rate": 5.286204167765762e-06, "loss": 0.001, "step": 7858 }, { "epoch": 2.0733412478564834, "grad_norm": 0.053337641060352325, "learning_rate": 5.285852457574958e-06, "loss": 0.0006, "step": 7860 }, { "epoch": 2.073868882733149, "grad_norm": 0.005547933746129274, "learning_rate": 5.285500747384156e-06, "loss": 0.0006, "step": 7862 }, { "epoch": 2.074396517609814, "grad_norm": 0.007445616647601128, "learning_rate": 5.2851490371933526e-06, "loss": 0.0005, "step": 7864 }, { "epoch": 2.074924152486479, "grad_norm": 0.4741054177284241, "learning_rate": 5.28479732700255e-06, "loss": 0.0008, "step": 7866 }, { "epoch": 2.0754517873631446, "grad_norm": 0.04410503804683685, "learning_rate": 5.284445616811747e-06, "loss": 0.0025, "step": 7868 }, { "epoch": 2.07597942223981, "grad_norm": 0.8509225249290466, "learning_rate": 5.284093906620944e-06, "loss": 0.0017, "step": 7870 }, { "epoch": 2.0765070571164754, "grad_norm": 0.1342010796070099, "learning_rate": 5.283742196430142e-06, "loss": 0.001, "step": 7872 }, { "epoch": 2.077034691993141, "grad_norm": 0.005059106275439262, "learning_rate": 5.283390486239339e-06, "loss": 0.0038, "step": 7874 }, { "epoch": 2.0775623268698062, "grad_norm": 0.7006996870040894, "learning_rate": 5.283038776048536e-06, "loss": 0.0062, "step": 7876 }, { "epoch": 2.0780899617464716, "grad_norm": 0.0150176752358675, "learning_rate": 5.282687065857733e-06, "loss": 0.0021, "step": 7878 }, { "epoch": 2.0786175966231366, "grad_norm": 0.014547976665198803, "learning_rate": 5.282335355666931e-06, "loss": 0.0005, "step": 7880 }, { "epoch": 2.079145231499802, "grad_norm": 0.42640265822410583, "learning_rate": 5.2819836454761285e-06, "loss": 0.0054, "step": 7882 }, { "epoch": 2.0796728663764674, "grad_norm": 0.09215012937784195, "learning_rate": 5.281631935285325e-06, "loss": 0.0011, "step": 7884 }, { "epoch": 2.080200501253133, "grad_norm": 0.11487562954425812, "learning_rate": 5.281280225094522e-06, "loss": 0.0011, "step": 7886 }, { "epoch": 2.0807281361297982, "grad_norm": 0.3184773921966553, "learning_rate": 5.280928514903719e-06, "loss": 0.0028, "step": 7888 }, { "epoch": 2.0812557710064636, "grad_norm": 0.015918051823973656, "learning_rate": 5.280576804712916e-06, "loss": 0.0006, "step": 7890 }, { "epoch": 2.081783405883129, "grad_norm": 0.007775797508656979, "learning_rate": 5.280225094522114e-06, "loss": 0.0005, "step": 7892 }, { "epoch": 2.082311040759794, "grad_norm": 0.25052687525749207, "learning_rate": 5.279873384331311e-06, "loss": 0.003, "step": 7894 }, { "epoch": 2.0828386756364594, "grad_norm": 0.007670550607144833, "learning_rate": 5.279521674140509e-06, "loss": 0.0005, "step": 7896 }, { "epoch": 2.083366310513125, "grad_norm": 0.006529587786644697, "learning_rate": 5.279169963949705e-06, "loss": 0.0109, "step": 7898 }, { "epoch": 2.0838939453897902, "grad_norm": 0.13190369307994843, "learning_rate": 5.278818253758903e-06, "loss": 0.001, "step": 7900 }, { "epoch": 2.0844215802664556, "grad_norm": 0.039310287684202194, "learning_rate": 5.2784665435681e-06, "loss": 0.0077, "step": 7902 }, { "epoch": 2.084949215143121, "grad_norm": 0.027230674400925636, "learning_rate": 5.2781148333772975e-06, "loss": 0.0014, "step": 7904 }, { "epoch": 2.0854768500197864, "grad_norm": 0.022903306409716606, "learning_rate": 5.2777631231864945e-06, "loss": 0.0006, "step": 7906 }, { "epoch": 2.086004484896452, "grad_norm": 0.3950009047985077, "learning_rate": 5.2774114129956914e-06, "loss": 0.0111, "step": 7908 }, { "epoch": 2.086532119773117, "grad_norm": 0.3060206174850464, "learning_rate": 5.277059702804889e-06, "loss": 0.0024, "step": 7910 }, { "epoch": 2.087059754649782, "grad_norm": 0.46766942739486694, "learning_rate": 5.276707992614086e-06, "loss": 0.0019, "step": 7912 }, { "epoch": 2.0875873895264476, "grad_norm": 0.017563695088028908, "learning_rate": 5.276356282423283e-06, "loss": 0.0006, "step": 7914 }, { "epoch": 2.088115024403113, "grad_norm": 0.04125635698437691, "learning_rate": 5.276004572232481e-06, "loss": 0.0119, "step": 7916 }, { "epoch": 2.0886426592797784, "grad_norm": 0.009421559982001781, "learning_rate": 5.275652862041678e-06, "loss": 0.0009, "step": 7918 }, { "epoch": 2.089170294156444, "grad_norm": 0.012307866476476192, "learning_rate": 5.275301151850875e-06, "loss": 0.0008, "step": 7920 }, { "epoch": 2.0896979290331092, "grad_norm": 0.022205952554941177, "learning_rate": 5.274949441660072e-06, "loss": 0.0006, "step": 7922 }, { "epoch": 2.090225563909774, "grad_norm": 0.07963812351226807, "learning_rate": 5.27459773146927e-06, "loss": 0.0072, "step": 7924 }, { "epoch": 2.0907531987864396, "grad_norm": 0.07307034730911255, "learning_rate": 5.2742460212784665e-06, "loss": 0.0031, "step": 7926 }, { "epoch": 2.091280833663105, "grad_norm": 0.462934285402298, "learning_rate": 5.2738943110876635e-06, "loss": 0.0051, "step": 7928 }, { "epoch": 2.0918084685397704, "grad_norm": 0.02175169810652733, "learning_rate": 5.273542600896861e-06, "loss": 0.0007, "step": 7930 }, { "epoch": 2.092336103416436, "grad_norm": 0.07324308902025223, "learning_rate": 5.273190890706058e-06, "loss": 0.0009, "step": 7932 }, { "epoch": 2.0928637382931012, "grad_norm": 0.01587897352874279, "learning_rate": 5.272839180515256e-06, "loss": 0.0006, "step": 7934 }, { "epoch": 2.0933913731697666, "grad_norm": 0.2785729169845581, "learning_rate": 5.272487470324452e-06, "loss": 0.0019, "step": 7936 }, { "epoch": 2.093919008046432, "grad_norm": 0.537908673286438, "learning_rate": 5.27213576013365e-06, "loss": 0.014, "step": 7938 }, { "epoch": 2.094446642923097, "grad_norm": 0.023139849305152893, "learning_rate": 5.271784049942848e-06, "loss": 0.0007, "step": 7940 }, { "epoch": 2.0949742777997624, "grad_norm": 0.014539945870637894, "learning_rate": 5.271432339752045e-06, "loss": 0.0037, "step": 7942 }, { "epoch": 2.095501912676428, "grad_norm": 0.05507129058241844, "learning_rate": 5.271080629561242e-06, "loss": 0.001, "step": 7944 }, { "epoch": 2.0960295475530932, "grad_norm": 0.2232331484556198, "learning_rate": 5.270728919370439e-06, "loss": 0.0012, "step": 7946 }, { "epoch": 2.0965571824297586, "grad_norm": 0.008687932975590229, "learning_rate": 5.270377209179636e-06, "loss": 0.0006, "step": 7948 }, { "epoch": 2.097084817306424, "grad_norm": 0.014811304397881031, "learning_rate": 5.270025498988833e-06, "loss": 0.0005, "step": 7950 }, { "epoch": 2.0976124521830894, "grad_norm": 0.36422058939933777, "learning_rate": 5.26967378879803e-06, "loss": 0.0098, "step": 7952 }, { "epoch": 2.098140087059755, "grad_norm": 0.20131582021713257, "learning_rate": 5.269322078607228e-06, "loss": 0.004, "step": 7954 }, { "epoch": 2.09866772193642, "grad_norm": 0.5052561163902283, "learning_rate": 5.268970368416425e-06, "loss": 0.0027, "step": 7956 }, { "epoch": 2.0991953568130852, "grad_norm": 0.010597918182611465, "learning_rate": 5.268618658225622e-06, "loss": 0.0039, "step": 7958 }, { "epoch": 2.0997229916897506, "grad_norm": 0.02423410303890705, "learning_rate": 5.268266948034819e-06, "loss": 0.0006, "step": 7960 }, { "epoch": 2.100250626566416, "grad_norm": 0.19530481100082397, "learning_rate": 5.267915237844017e-06, "loss": 0.0048, "step": 7962 }, { "epoch": 2.1007782614430814, "grad_norm": 0.01843714341521263, "learning_rate": 5.267563527653214e-06, "loss": 0.0006, "step": 7964 }, { "epoch": 2.101305896319747, "grad_norm": 0.010408427566289902, "learning_rate": 5.267211817462411e-06, "loss": 0.0009, "step": 7966 }, { "epoch": 2.1018335311964123, "grad_norm": 0.011824379675090313, "learning_rate": 5.2668601072716085e-06, "loss": 0.0007, "step": 7968 }, { "epoch": 2.102361166073077, "grad_norm": 0.023007379844784737, "learning_rate": 5.2665083970808054e-06, "loss": 0.0005, "step": 7970 }, { "epoch": 2.1028888009497426, "grad_norm": 0.4181024432182312, "learning_rate": 5.266156686890003e-06, "loss": 0.006, "step": 7972 }, { "epoch": 2.103416435826408, "grad_norm": 0.015200519934296608, "learning_rate": 5.2658049766992e-06, "loss": 0.0006, "step": 7974 }, { "epoch": 2.1039440707030734, "grad_norm": 0.11691402643918991, "learning_rate": 5.265453266508397e-06, "loss": 0.0007, "step": 7976 }, { "epoch": 2.104471705579739, "grad_norm": 0.011201371438801289, "learning_rate": 5.265101556317595e-06, "loss": 0.0036, "step": 7978 }, { "epoch": 2.1049993404564042, "grad_norm": 0.012955306097865105, "learning_rate": 5.264749846126791e-06, "loss": 0.0008, "step": 7980 }, { "epoch": 2.1055269753330697, "grad_norm": 0.02557319961488247, "learning_rate": 5.264398135935989e-06, "loss": 0.0006, "step": 7982 }, { "epoch": 2.106054610209735, "grad_norm": 0.6567156910896301, "learning_rate": 5.264046425745186e-06, "loss": 0.003, "step": 7984 }, { "epoch": 2.1065822450864, "grad_norm": 0.01014089584350586, "learning_rate": 5.263694715554384e-06, "loss": 0.0037, "step": 7986 }, { "epoch": 2.1071098799630654, "grad_norm": 0.18278785049915314, "learning_rate": 5.2633430053635805e-06, "loss": 0.008, "step": 7988 }, { "epoch": 2.107637514839731, "grad_norm": 0.0121145723387599, "learning_rate": 5.2629912951727775e-06, "loss": 0.0036, "step": 7990 }, { "epoch": 2.1081651497163962, "grad_norm": 0.011868692003190517, "learning_rate": 5.262639584981975e-06, "loss": 0.0005, "step": 7992 }, { "epoch": 2.1086927845930616, "grad_norm": 0.10066025704145432, "learning_rate": 5.262287874791172e-06, "loss": 0.0105, "step": 7994 }, { "epoch": 2.109220419469727, "grad_norm": 0.017870990559458733, "learning_rate": 5.261936164600369e-06, "loss": 0.0021, "step": 7996 }, { "epoch": 2.1097480543463925, "grad_norm": 0.08367563784122467, "learning_rate": 5.261584454409567e-06, "loss": 0.0007, "step": 7998 }, { "epoch": 2.110275689223058, "grad_norm": 0.18500715494155884, "learning_rate": 5.261232744218764e-06, "loss": 0.0048, "step": 8000 }, { "epoch": 2.110275689223058, "eval_loss": 0.003261606441810727, "eval_runtime": 307.5839, "eval_samples_per_second": 701.08, "eval_steps_per_second": 87.638, "step": 8000 }, { "epoch": 2.110803324099723, "grad_norm": 0.031526271253824234, "learning_rate": 5.260881034027962e-06, "loss": 0.0006, "step": 8002 }, { "epoch": 2.1113309589763882, "grad_norm": 0.40777587890625, "learning_rate": 5.260529323837158e-06, "loss": 0.0043, "step": 8004 }, { "epoch": 2.1118585938530536, "grad_norm": 0.14468078315258026, "learning_rate": 5.260177613646356e-06, "loss": 0.0011, "step": 8006 }, { "epoch": 2.112386228729719, "grad_norm": 0.02242281287908554, "learning_rate": 5.259825903455553e-06, "loss": 0.0006, "step": 8008 }, { "epoch": 2.1129138636063844, "grad_norm": 0.23483434319496155, "learning_rate": 5.2594741932647496e-06, "loss": 0.001, "step": 8010 }, { "epoch": 2.11344149848305, "grad_norm": 0.32525405287742615, "learning_rate": 5.259122483073947e-06, "loss": 0.002, "step": 8012 }, { "epoch": 2.1139691333597153, "grad_norm": 1.0312803983688354, "learning_rate": 5.258770772883144e-06, "loss": 0.0113, "step": 8014 }, { "epoch": 2.11449676823638, "grad_norm": 0.008573601022362709, "learning_rate": 5.258419062692342e-06, "loss": 0.0076, "step": 8016 }, { "epoch": 2.1150244031130456, "grad_norm": 0.1491900384426117, "learning_rate": 5.258067352501538e-06, "loss": 0.0011, "step": 8018 }, { "epoch": 2.115552037989711, "grad_norm": 0.14012110233306885, "learning_rate": 5.257715642310736e-06, "loss": 0.0009, "step": 8020 }, { "epoch": 2.1160796728663764, "grad_norm": 0.8409256935119629, "learning_rate": 5.257363932119933e-06, "loss": 0.0083, "step": 8022 }, { "epoch": 2.116607307743042, "grad_norm": 0.5867767930030823, "learning_rate": 5.257012221929131e-06, "loss": 0.0025, "step": 8024 }, { "epoch": 2.1171349426197072, "grad_norm": 0.062034089118242264, "learning_rate": 5.256660511738328e-06, "loss": 0.0087, "step": 8026 }, { "epoch": 2.1176625774963727, "grad_norm": 0.06404148787260056, "learning_rate": 5.256308801547525e-06, "loss": 0.001, "step": 8028 }, { "epoch": 2.118190212373038, "grad_norm": 0.22107547521591187, "learning_rate": 5.2559570913567225e-06, "loss": 0.0015, "step": 8030 }, { "epoch": 2.118717847249703, "grad_norm": 0.07160840928554535, "learning_rate": 5.255605381165919e-06, "loss": 0.0014, "step": 8032 }, { "epoch": 2.1192454821263684, "grad_norm": 0.5572961568832397, "learning_rate": 5.255253670975116e-06, "loss": 0.0159, "step": 8034 }, { "epoch": 2.119773117003034, "grad_norm": 0.04701418802142143, "learning_rate": 5.254901960784314e-06, "loss": 0.0008, "step": 8036 }, { "epoch": 2.1203007518796992, "grad_norm": 0.27837011218070984, "learning_rate": 5.254550250593511e-06, "loss": 0.0048, "step": 8038 }, { "epoch": 2.1208283867563646, "grad_norm": 0.87445467710495, "learning_rate": 5.254198540402708e-06, "loss": 0.0145, "step": 8040 }, { "epoch": 2.12135602163303, "grad_norm": 0.14913974702358246, "learning_rate": 5.253846830211905e-06, "loss": 0.0018, "step": 8042 }, { "epoch": 2.1218836565096955, "grad_norm": 0.2419958859682083, "learning_rate": 5.253495120021103e-06, "loss": 0.0015, "step": 8044 }, { "epoch": 2.122411291386361, "grad_norm": 0.5778574347496033, "learning_rate": 5.2531434098303e-06, "loss": 0.0057, "step": 8046 }, { "epoch": 2.122938926263026, "grad_norm": 0.05948590859770775, "learning_rate": 5.252791699639497e-06, "loss": 0.001, "step": 8048 }, { "epoch": 2.1234665611396912, "grad_norm": 0.10746077448129654, "learning_rate": 5.2524399894486945e-06, "loss": 0.0017, "step": 8050 }, { "epoch": 2.1239941960163566, "grad_norm": 0.01923997327685356, "learning_rate": 5.2520882792578915e-06, "loss": 0.0053, "step": 8052 }, { "epoch": 2.124521830893022, "grad_norm": 0.06866549700498581, "learning_rate": 5.251736569067089e-06, "loss": 0.0037, "step": 8054 }, { "epoch": 2.1250494657696875, "grad_norm": 0.01563616842031479, "learning_rate": 5.251384858876286e-06, "loss": 0.0015, "step": 8056 }, { "epoch": 2.125577100646353, "grad_norm": 0.01976533606648445, "learning_rate": 5.251033148685483e-06, "loss": 0.0026, "step": 8058 }, { "epoch": 2.1261047355230183, "grad_norm": 0.10342898219823837, "learning_rate": 5.250681438494681e-06, "loss": 0.002, "step": 8060 }, { "epoch": 2.1266323703996832, "grad_norm": 0.07235909253358841, "learning_rate": 5.250329728303878e-06, "loss": 0.0031, "step": 8062 }, { "epoch": 2.1271600052763486, "grad_norm": 0.166206955909729, "learning_rate": 5.249978018113075e-06, "loss": 0.004, "step": 8064 }, { "epoch": 2.127687640153014, "grad_norm": 0.24991852045059204, "learning_rate": 5.249626307922272e-06, "loss": 0.0086, "step": 8066 }, { "epoch": 2.1282152750296794, "grad_norm": 0.03825367987155914, "learning_rate": 5.24927459773147e-06, "loss": 0.003, "step": 8068 }, { "epoch": 2.128742909906345, "grad_norm": 0.4680514931678772, "learning_rate": 5.248922887540667e-06, "loss": 0.0035, "step": 8070 }, { "epoch": 2.1292705447830103, "grad_norm": 0.020053144544363022, "learning_rate": 5.2485711773498635e-06, "loss": 0.0007, "step": 8072 }, { "epoch": 2.1297981796596757, "grad_norm": 0.027071978896856308, "learning_rate": 5.248219467159061e-06, "loss": 0.0048, "step": 8074 }, { "epoch": 2.1303258145363406, "grad_norm": 0.01718468964099884, "learning_rate": 5.247867756968258e-06, "loss": 0.0008, "step": 8076 }, { "epoch": 2.130853449413006, "grad_norm": 0.012928595766425133, "learning_rate": 5.247516046777455e-06, "loss": 0.0072, "step": 8078 }, { "epoch": 2.1313810842896714, "grad_norm": 0.04479973390698433, "learning_rate": 5.247164336586652e-06, "loss": 0.0064, "step": 8080 }, { "epoch": 2.131908719166337, "grad_norm": 0.5291675329208374, "learning_rate": 5.24681262639585e-06, "loss": 0.0032, "step": 8082 }, { "epoch": 2.1324363540430022, "grad_norm": 0.0178206916898489, "learning_rate": 5.246460916205048e-06, "loss": 0.0008, "step": 8084 }, { "epoch": 2.1329639889196677, "grad_norm": 0.03771307319402695, "learning_rate": 5.246109206014244e-06, "loss": 0.0008, "step": 8086 }, { "epoch": 2.133491623796333, "grad_norm": 0.03395597264170647, "learning_rate": 5.245757495823442e-06, "loss": 0.0009, "step": 8088 }, { "epoch": 2.1340192586729985, "grad_norm": 0.023953063413500786, "learning_rate": 5.245405785632639e-06, "loss": 0.0007, "step": 8090 }, { "epoch": 2.1345468935496634, "grad_norm": 0.5049073100090027, "learning_rate": 5.2450540754418365e-06, "loss": 0.008, "step": 8092 }, { "epoch": 2.135074528426329, "grad_norm": 0.033742062747478485, "learning_rate": 5.244702365251033e-06, "loss": 0.0007, "step": 8094 }, { "epoch": 2.1356021633029942, "grad_norm": 0.5351361036300659, "learning_rate": 5.24435065506023e-06, "loss": 0.0052, "step": 8096 }, { "epoch": 2.1361297981796596, "grad_norm": 0.045818183571100235, "learning_rate": 5.243998944869428e-06, "loss": 0.0069, "step": 8098 }, { "epoch": 2.136657433056325, "grad_norm": 0.009475643746554852, "learning_rate": 5.243647234678624e-06, "loss": 0.0005, "step": 8100 }, { "epoch": 2.1371850679329905, "grad_norm": 0.025892039760947227, "learning_rate": 5.243295524487822e-06, "loss": 0.0006, "step": 8102 }, { "epoch": 2.137712702809656, "grad_norm": 0.009903918951749802, "learning_rate": 5.242943814297019e-06, "loss": 0.0078, "step": 8104 }, { "epoch": 2.1382403376863213, "grad_norm": 0.01679196208715439, "learning_rate": 5.242592104106217e-06, "loss": 0.0047, "step": 8106 }, { "epoch": 2.1387679725629862, "grad_norm": 0.01388491876423359, "learning_rate": 5.242240393915414e-06, "loss": 0.0008, "step": 8108 }, { "epoch": 2.1392956074396516, "grad_norm": 0.39994585514068604, "learning_rate": 5.241888683724611e-06, "loss": 0.0009, "step": 8110 }, { "epoch": 2.139823242316317, "grad_norm": 0.16516171395778656, "learning_rate": 5.2415369735338085e-06, "loss": 0.0009, "step": 8112 }, { "epoch": 2.1403508771929824, "grad_norm": 0.03574954345822334, "learning_rate": 5.2411852633430055e-06, "loss": 0.0007, "step": 8114 }, { "epoch": 2.140878512069648, "grad_norm": 0.3100293278694153, "learning_rate": 5.2408335531522024e-06, "loss": 0.0125, "step": 8116 }, { "epoch": 2.1414061469463133, "grad_norm": 0.02755993977189064, "learning_rate": 5.2404818429614e-06, "loss": 0.0072, "step": 8118 }, { "epoch": 2.1419337818229787, "grad_norm": 0.2511216402053833, "learning_rate": 5.240130132770597e-06, "loss": 0.0169, "step": 8120 }, { "epoch": 2.1424614166996436, "grad_norm": 0.029711822047829628, "learning_rate": 5.239778422579795e-06, "loss": 0.0006, "step": 8122 }, { "epoch": 2.142989051576309, "grad_norm": 0.2628360688686371, "learning_rate": 5.239426712388991e-06, "loss": 0.0018, "step": 8124 }, { "epoch": 2.1435166864529744, "grad_norm": 0.18077830970287323, "learning_rate": 5.239075002198189e-06, "loss": 0.0019, "step": 8126 }, { "epoch": 2.14404432132964, "grad_norm": 0.02113475650548935, "learning_rate": 5.238723292007386e-06, "loss": 0.0007, "step": 8128 }, { "epoch": 2.1445719562063053, "grad_norm": 0.021538671106100082, "learning_rate": 5.238371581816584e-06, "loss": 0.0024, "step": 8130 }, { "epoch": 2.1450995910829707, "grad_norm": 1.4847288131713867, "learning_rate": 5.238019871625781e-06, "loss": 0.0071, "step": 8132 }, { "epoch": 2.145627225959636, "grad_norm": 0.5825616717338562, "learning_rate": 5.2376681614349775e-06, "loss": 0.0036, "step": 8134 }, { "epoch": 2.1461548608363015, "grad_norm": 0.05734674632549286, "learning_rate": 5.237316451244175e-06, "loss": 0.0056, "step": 8136 }, { "epoch": 2.1466824957129664, "grad_norm": 0.3277267813682556, "learning_rate": 5.2369647410533714e-06, "loss": 0.0031, "step": 8138 }, { "epoch": 2.147210130589632, "grad_norm": 0.05448610335588455, "learning_rate": 5.236613030862569e-06, "loss": 0.0011, "step": 8140 }, { "epoch": 2.1477377654662972, "grad_norm": 0.23996351659297943, "learning_rate": 5.236261320671767e-06, "loss": 0.0025, "step": 8142 }, { "epoch": 2.1482654003429626, "grad_norm": 0.1101209819316864, "learning_rate": 5.235909610480964e-06, "loss": 0.0007, "step": 8144 }, { "epoch": 2.148793035219628, "grad_norm": 0.023076333105564117, "learning_rate": 5.235557900290161e-06, "loss": 0.004, "step": 8146 }, { "epoch": 2.1493206700962935, "grad_norm": 0.02137463167309761, "learning_rate": 5.235206190099358e-06, "loss": 0.0007, "step": 8148 }, { "epoch": 2.149848304972959, "grad_norm": 0.009158634580671787, "learning_rate": 5.234854479908556e-06, "loss": 0.0051, "step": 8150 }, { "epoch": 2.1503759398496243, "grad_norm": 1.2065140008926392, "learning_rate": 5.234502769717753e-06, "loss": 0.0056, "step": 8152 }, { "epoch": 2.1509035747262892, "grad_norm": 0.029104439541697502, "learning_rate": 5.23415105952695e-06, "loss": 0.0113, "step": 8154 }, { "epoch": 2.1514312096029546, "grad_norm": 0.04823879897594452, "learning_rate": 5.233799349336147e-06, "loss": 0.0032, "step": 8156 }, { "epoch": 2.15195884447962, "grad_norm": 0.012720179744064808, "learning_rate": 5.233447639145344e-06, "loss": 0.0006, "step": 8158 }, { "epoch": 2.1524864793562855, "grad_norm": 0.12657243013381958, "learning_rate": 5.233095928954542e-06, "loss": 0.0021, "step": 8160 }, { "epoch": 2.153014114232951, "grad_norm": 0.012874308973550797, "learning_rate": 5.232744218763738e-06, "loss": 0.0006, "step": 8162 }, { "epoch": 2.1535417491096163, "grad_norm": 0.28766870498657227, "learning_rate": 5.232392508572936e-06, "loss": 0.0067, "step": 8164 }, { "epoch": 2.1540693839862817, "grad_norm": 0.027785684913396835, "learning_rate": 5.232040798382134e-06, "loss": 0.0053, "step": 8166 }, { "epoch": 2.1545970188629466, "grad_norm": 0.032734815031290054, "learning_rate": 5.23168908819133e-06, "loss": 0.0007, "step": 8168 }, { "epoch": 2.155124653739612, "grad_norm": 0.07369598001241684, "learning_rate": 5.231337378000528e-06, "loss": 0.01, "step": 8170 }, { "epoch": 2.1556522886162774, "grad_norm": 0.015732541680336, "learning_rate": 5.230985667809725e-06, "loss": 0.0009, "step": 8172 }, { "epoch": 2.156179923492943, "grad_norm": 0.029830370098352432, "learning_rate": 5.2306339576189225e-06, "loss": 0.0009, "step": 8174 }, { "epoch": 2.1567075583696083, "grad_norm": 0.02364618144929409, "learning_rate": 5.2302822474281195e-06, "loss": 0.0006, "step": 8176 }, { "epoch": 2.1572351932462737, "grad_norm": 0.25593361258506775, "learning_rate": 5.229930537237316e-06, "loss": 0.0014, "step": 8178 }, { "epoch": 2.157762828122939, "grad_norm": 0.6062700748443604, "learning_rate": 5.229578827046514e-06, "loss": 0.0124, "step": 8180 }, { "epoch": 2.1582904629996045, "grad_norm": 0.1260451078414917, "learning_rate": 5.229227116855711e-06, "loss": 0.0047, "step": 8182 }, { "epoch": 2.1588180978762694, "grad_norm": 0.2745042145252228, "learning_rate": 5.228875406664908e-06, "loss": 0.0062, "step": 8184 }, { "epoch": 2.159345732752935, "grad_norm": 0.0779501274228096, "learning_rate": 5.228523696474105e-06, "loss": 0.0008, "step": 8186 }, { "epoch": 2.1598733676296002, "grad_norm": 0.13175936043262482, "learning_rate": 5.228171986283303e-06, "loss": 0.0006, "step": 8188 }, { "epoch": 2.1604010025062657, "grad_norm": 0.11141281574964523, "learning_rate": 5.2278202760925e-06, "loss": 0.001, "step": 8190 }, { "epoch": 2.160928637382931, "grad_norm": 0.30227723717689514, "learning_rate": 5.227468565901697e-06, "loss": 0.0024, "step": 8192 }, { "epoch": 2.1614562722595965, "grad_norm": 0.5004816055297852, "learning_rate": 5.227116855710895e-06, "loss": 0.005, "step": 8194 }, { "epoch": 2.161983907136262, "grad_norm": 0.06411049515008926, "learning_rate": 5.2267651455200915e-06, "loss": 0.0011, "step": 8196 }, { "epoch": 2.1625115420129273, "grad_norm": 0.3051586151123047, "learning_rate": 5.2264134353292885e-06, "loss": 0.0013, "step": 8198 }, { "epoch": 2.1630391768895922, "grad_norm": 0.06989262253046036, "learning_rate": 5.226061725138486e-06, "loss": 0.0008, "step": 8200 }, { "epoch": 2.1635668117662576, "grad_norm": 0.17110207676887512, "learning_rate": 5.225710014947683e-06, "loss": 0.0024, "step": 8202 }, { "epoch": 2.164094446642923, "grad_norm": 0.6932342648506165, "learning_rate": 5.225358304756881e-06, "loss": 0.0047, "step": 8204 }, { "epoch": 2.1646220815195885, "grad_norm": 0.02825365774333477, "learning_rate": 5.225006594566077e-06, "loss": 0.0009, "step": 8206 }, { "epoch": 2.165149716396254, "grad_norm": 0.2875973582267761, "learning_rate": 5.224654884375275e-06, "loss": 0.0022, "step": 8208 }, { "epoch": 2.1656773512729193, "grad_norm": 0.18900924921035767, "learning_rate": 5.224303174184472e-06, "loss": 0.002, "step": 8210 }, { "epoch": 2.1662049861495847, "grad_norm": 0.020079821348190308, "learning_rate": 5.22395146399367e-06, "loss": 0.0019, "step": 8212 }, { "epoch": 2.1667326210262496, "grad_norm": 0.1478809416294098, "learning_rate": 5.223599753802867e-06, "loss": 0.0009, "step": 8214 }, { "epoch": 2.167260255902915, "grad_norm": 0.4865858554840088, "learning_rate": 5.223248043612064e-06, "loss": 0.001, "step": 8216 }, { "epoch": 2.1677878907795805, "grad_norm": 0.5406649112701416, "learning_rate": 5.222896333421261e-06, "loss": 0.0077, "step": 8218 }, { "epoch": 2.168315525656246, "grad_norm": 0.12332602590322495, "learning_rate": 5.222544623230458e-06, "loss": 0.0091, "step": 8220 }, { "epoch": 2.1688431605329113, "grad_norm": 0.37136444449424744, "learning_rate": 5.222192913039655e-06, "loss": 0.0013, "step": 8222 }, { "epoch": 2.1693707954095767, "grad_norm": 0.0877939760684967, "learning_rate": 5.221841202848853e-06, "loss": 0.0076, "step": 8224 }, { "epoch": 2.169898430286242, "grad_norm": 0.10684044659137726, "learning_rate": 5.22148949265805e-06, "loss": 0.0009, "step": 8226 }, { "epoch": 2.170426065162907, "grad_norm": 0.011715349741280079, "learning_rate": 5.221137782467247e-06, "loss": 0.0028, "step": 8228 }, { "epoch": 2.1709537000395724, "grad_norm": 0.014563548378646374, "learning_rate": 5.220786072276444e-06, "loss": 0.0031, "step": 8230 }, { "epoch": 2.171481334916238, "grad_norm": 0.048287827521562576, "learning_rate": 5.220434362085642e-06, "loss": 0.0007, "step": 8232 }, { "epoch": 2.1720089697929033, "grad_norm": 0.4099760949611664, "learning_rate": 5.220082651894839e-06, "loss": 0.0008, "step": 8234 }, { "epoch": 2.1725366046695687, "grad_norm": 0.25414273142814636, "learning_rate": 5.219730941704036e-06, "loss": 0.0013, "step": 8236 }, { "epoch": 2.173064239546234, "grad_norm": 0.037120211869478226, "learning_rate": 5.2193792315132335e-06, "loss": 0.0006, "step": 8238 }, { "epoch": 2.1735918744228995, "grad_norm": 0.06833765655755997, "learning_rate": 5.21902752132243e-06, "loss": 0.0011, "step": 8240 }, { "epoch": 2.174119509299565, "grad_norm": 0.06471889466047287, "learning_rate": 5.218675811131628e-06, "loss": 0.0006, "step": 8242 }, { "epoch": 2.1746471441762303, "grad_norm": 0.034870047122240067, "learning_rate": 5.218324100940824e-06, "loss": 0.0018, "step": 8244 }, { "epoch": 2.1751747790528952, "grad_norm": 0.05961764603853226, "learning_rate": 5.217972390750022e-06, "loss": 0.0039, "step": 8246 }, { "epoch": 2.1757024139295607, "grad_norm": 0.06455007940530777, "learning_rate": 5.217620680559219e-06, "loss": 0.0028, "step": 8248 }, { "epoch": 2.176230048806226, "grad_norm": 0.0813375785946846, "learning_rate": 5.217268970368417e-06, "loss": 0.0112, "step": 8250 }, { "epoch": 2.1767576836828915, "grad_norm": 0.21825531125068665, "learning_rate": 5.216917260177614e-06, "loss": 0.0015, "step": 8252 }, { "epoch": 2.177285318559557, "grad_norm": 0.16222047805786133, "learning_rate": 5.216565549986811e-06, "loss": 0.0099, "step": 8254 }, { "epoch": 2.1778129534362223, "grad_norm": 0.022274402901530266, "learning_rate": 5.2162138397960086e-06, "loss": 0.0006, "step": 8256 }, { "epoch": 2.1783405883128877, "grad_norm": 0.12858006358146667, "learning_rate": 5.2158621296052055e-06, "loss": 0.0013, "step": 8258 }, { "epoch": 2.1788682231895526, "grad_norm": 0.04307202994823456, "learning_rate": 5.2155104194144025e-06, "loss": 0.0008, "step": 8260 }, { "epoch": 2.179395858066218, "grad_norm": 0.1189083606004715, "learning_rate": 5.2151587092236e-06, "loss": 0.0039, "step": 8262 }, { "epoch": 2.1799234929428835, "grad_norm": 0.13494032621383667, "learning_rate": 5.214806999032797e-06, "loss": 0.0027, "step": 8264 }, { "epoch": 2.180451127819549, "grad_norm": 0.2409951537847519, "learning_rate": 5.214455288841994e-06, "loss": 0.0038, "step": 8266 }, { "epoch": 2.1809787626962143, "grad_norm": 0.19464711844921112, "learning_rate": 5.214103578651191e-06, "loss": 0.0013, "step": 8268 }, { "epoch": 2.1815063975728797, "grad_norm": 0.2656583786010742, "learning_rate": 5.213751868460389e-06, "loss": 0.0009, "step": 8270 }, { "epoch": 2.182034032449545, "grad_norm": 0.0553288608789444, "learning_rate": 5.213400158269586e-06, "loss": 0.0007, "step": 8272 }, { "epoch": 2.18256166732621, "grad_norm": 0.12119314819574356, "learning_rate": 5.213048448078783e-06, "loss": 0.0008, "step": 8274 }, { "epoch": 2.1830893022028754, "grad_norm": 0.8838743567466736, "learning_rate": 5.212696737887981e-06, "loss": 0.0029, "step": 8276 }, { "epoch": 2.183616937079541, "grad_norm": 0.015560129657387733, "learning_rate": 5.212345027697178e-06, "loss": 0.0005, "step": 8278 }, { "epoch": 2.1841445719562063, "grad_norm": 0.6299037933349609, "learning_rate": 5.211993317506375e-06, "loss": 0.0112, "step": 8280 }, { "epoch": 2.1846722068328717, "grad_norm": 0.7490876913070679, "learning_rate": 5.211641607315572e-06, "loss": 0.0113, "step": 8282 }, { "epoch": 2.185199841709537, "grad_norm": 0.5204761624336243, "learning_rate": 5.211289897124769e-06, "loss": 0.0014, "step": 8284 }, { "epoch": 2.1857274765862025, "grad_norm": 0.028605038300156593, "learning_rate": 5.210938186933967e-06, "loss": 0.0011, "step": 8286 }, { "epoch": 2.186255111462868, "grad_norm": 0.6663943529129028, "learning_rate": 5.210586476743163e-06, "loss": 0.0063, "step": 8288 }, { "epoch": 2.186782746339533, "grad_norm": 0.008040052838623524, "learning_rate": 5.210234766552361e-06, "loss": 0.0012, "step": 8290 }, { "epoch": 2.1873103812161983, "grad_norm": 0.033691875636577606, "learning_rate": 5.209883056361558e-06, "loss": 0.0006, "step": 8292 }, { "epoch": 2.1878380160928637, "grad_norm": 0.02576187252998352, "learning_rate": 5.209531346170756e-06, "loss": 0.0006, "step": 8294 }, { "epoch": 2.188365650969529, "grad_norm": 0.018150730058550835, "learning_rate": 5.209179635979953e-06, "loss": 0.0006, "step": 8296 }, { "epoch": 2.1888932858461945, "grad_norm": 0.013428118079900742, "learning_rate": 5.20882792578915e-06, "loss": 0.0018, "step": 8298 }, { "epoch": 2.18942092072286, "grad_norm": 0.030637046322226524, "learning_rate": 5.2084762155983475e-06, "loss": 0.0012, "step": 8300 }, { "epoch": 2.1899485555995253, "grad_norm": 0.20881487429141998, "learning_rate": 5.208124505407544e-06, "loss": 0.001, "step": 8302 }, { "epoch": 2.1904761904761907, "grad_norm": 0.7774569392204285, "learning_rate": 5.207772795216741e-06, "loss": 0.0085, "step": 8304 }, { "epoch": 2.1910038253528556, "grad_norm": 0.05998125299811363, "learning_rate": 5.207421085025938e-06, "loss": 0.0007, "step": 8306 }, { "epoch": 2.191531460229521, "grad_norm": 0.2520652115345001, "learning_rate": 5.207069374835136e-06, "loss": 0.0138, "step": 8308 }, { "epoch": 2.1920590951061865, "grad_norm": 0.8501624464988708, "learning_rate": 5.206717664644334e-06, "loss": 0.003, "step": 8310 }, { "epoch": 2.192586729982852, "grad_norm": 0.9199255108833313, "learning_rate": 5.20636595445353e-06, "loss": 0.0056, "step": 8312 }, { "epoch": 2.1931143648595173, "grad_norm": 0.7133383750915527, "learning_rate": 5.206014244262728e-06, "loss": 0.0181, "step": 8314 }, { "epoch": 2.1936419997361827, "grad_norm": 0.039326705038547516, "learning_rate": 5.205662534071925e-06, "loss": 0.0042, "step": 8316 }, { "epoch": 2.194169634612848, "grad_norm": 0.20574644207954407, "learning_rate": 5.205310823881122e-06, "loss": 0.0017, "step": 8318 }, { "epoch": 2.194697269489513, "grad_norm": 1.6151822805404663, "learning_rate": 5.2049591136903195e-06, "loss": 0.0042, "step": 8320 }, { "epoch": 2.1952249043661785, "grad_norm": 0.09610091149806976, "learning_rate": 5.2046074034995165e-06, "loss": 0.001, "step": 8322 }, { "epoch": 2.195752539242844, "grad_norm": 0.30434831976890564, "learning_rate": 5.204255693308714e-06, "loss": 0.0026, "step": 8324 }, { "epoch": 2.1962801741195093, "grad_norm": 0.2592184841632843, "learning_rate": 5.20390398311791e-06, "loss": 0.0088, "step": 8326 }, { "epoch": 2.1968078089961747, "grad_norm": 0.013773331418633461, "learning_rate": 5.203552272927108e-06, "loss": 0.0006, "step": 8328 }, { "epoch": 2.19733544387284, "grad_norm": 0.021001826971769333, "learning_rate": 5.203200562736305e-06, "loss": 0.0008, "step": 8330 }, { "epoch": 2.1978630787495055, "grad_norm": 0.007082699798047543, "learning_rate": 5.202848852545503e-06, "loss": 0.0006, "step": 8332 }, { "epoch": 2.198390713626171, "grad_norm": 0.46036577224731445, "learning_rate": 5.2024971423547e-06, "loss": 0.0237, "step": 8334 }, { "epoch": 2.198918348502836, "grad_norm": 0.0166173093020916, "learning_rate": 5.202145432163897e-06, "loss": 0.0054, "step": 8336 }, { "epoch": 2.1994459833795013, "grad_norm": 0.015906374901533127, "learning_rate": 5.201793721973095e-06, "loss": 0.0006, "step": 8338 }, { "epoch": 2.1999736182561667, "grad_norm": 0.01722097583115101, "learning_rate": 5.201442011782292e-06, "loss": 0.0147, "step": 8340 }, { "epoch": 2.200501253132832, "grad_norm": 0.1219630315899849, "learning_rate": 5.2010903015914885e-06, "loss": 0.001, "step": 8342 }, { "epoch": 2.2010288880094975, "grad_norm": 0.042229749262332916, "learning_rate": 5.200738591400686e-06, "loss": 0.0008, "step": 8344 }, { "epoch": 2.201556522886163, "grad_norm": 0.4944656193256378, "learning_rate": 5.200386881209883e-06, "loss": 0.009, "step": 8346 }, { "epoch": 2.2020841577628283, "grad_norm": 0.14610520005226135, "learning_rate": 5.20003517101908e-06, "loss": 0.0038, "step": 8348 }, { "epoch": 2.2026117926394937, "grad_norm": 0.1491357684135437, "learning_rate": 5.199683460828277e-06, "loss": 0.0107, "step": 8350 }, { "epoch": 2.2031394275161587, "grad_norm": 0.224398672580719, "learning_rate": 5.199331750637475e-06, "loss": 0.0128, "step": 8352 }, { "epoch": 2.203667062392824, "grad_norm": 0.17782016098499298, "learning_rate": 5.198980040446672e-06, "loss": 0.0045, "step": 8354 }, { "epoch": 2.2041946972694895, "grad_norm": 1.1511393785476685, "learning_rate": 5.198628330255869e-06, "loss": 0.0023, "step": 8356 }, { "epoch": 2.204722332146155, "grad_norm": 0.007381723262369633, "learning_rate": 5.198276620065067e-06, "loss": 0.0006, "step": 8358 }, { "epoch": 2.2052499670228203, "grad_norm": 1.7055968046188354, "learning_rate": 5.197924909874264e-06, "loss": 0.0054, "step": 8360 }, { "epoch": 2.2057776018994857, "grad_norm": 0.17065848410129547, "learning_rate": 5.1975731996834614e-06, "loss": 0.0085, "step": 8362 }, { "epoch": 2.206305236776151, "grad_norm": 0.3202643096446991, "learning_rate": 5.1972214894926576e-06, "loss": 0.0014, "step": 8364 }, { "epoch": 2.206832871652816, "grad_norm": 0.0530879944562912, "learning_rate": 5.196869779301855e-06, "loss": 0.0008, "step": 8366 }, { "epoch": 2.2073605065294815, "grad_norm": 0.010188199579715729, "learning_rate": 5.196518069111053e-06, "loss": 0.0028, "step": 8368 }, { "epoch": 2.207888141406147, "grad_norm": 0.028334740549325943, "learning_rate": 5.19616635892025e-06, "loss": 0.0019, "step": 8370 }, { "epoch": 2.2084157762828123, "grad_norm": 0.045624010264873505, "learning_rate": 5.195814648729447e-06, "loss": 0.0007, "step": 8372 }, { "epoch": 2.2089434111594777, "grad_norm": 0.701371967792511, "learning_rate": 5.195462938538644e-06, "loss": 0.008, "step": 8374 }, { "epoch": 2.209471046036143, "grad_norm": 0.10966942459344864, "learning_rate": 5.195111228347842e-06, "loss": 0.0023, "step": 8376 }, { "epoch": 2.2099986809128085, "grad_norm": 0.14710061252117157, "learning_rate": 5.194759518157039e-06, "loss": 0.0032, "step": 8378 }, { "epoch": 2.2105263157894735, "grad_norm": 0.016293156892061234, "learning_rate": 5.194407807966236e-06, "loss": 0.0023, "step": 8380 }, { "epoch": 2.211053950666139, "grad_norm": 0.053309570997953415, "learning_rate": 5.1940560977754335e-06, "loss": 0.0041, "step": 8382 }, { "epoch": 2.2115815855428043, "grad_norm": 0.10593042522668839, "learning_rate": 5.1937043875846305e-06, "loss": 0.0009, "step": 8384 }, { "epoch": 2.2121092204194697, "grad_norm": 0.29845088720321655, "learning_rate": 5.193352677393827e-06, "loss": 0.0015, "step": 8386 }, { "epoch": 2.212636855296135, "grad_norm": 0.021697569638490677, "learning_rate": 5.193000967203024e-06, "loss": 0.0007, "step": 8388 }, { "epoch": 2.2131644901728005, "grad_norm": 0.013752615079283714, "learning_rate": 5.192649257012222e-06, "loss": 0.0006, "step": 8390 }, { "epoch": 2.213692125049466, "grad_norm": 0.025205474346876144, "learning_rate": 5.19229754682142e-06, "loss": 0.0062, "step": 8392 }, { "epoch": 2.2142197599261313, "grad_norm": 0.3388609290122986, "learning_rate": 5.191945836630616e-06, "loss": 0.0092, "step": 8394 }, { "epoch": 2.2147473948027967, "grad_norm": 0.03268222510814667, "learning_rate": 5.191594126439814e-06, "loss": 0.0008, "step": 8396 }, { "epoch": 2.2152750296794617, "grad_norm": 0.020363695919513702, "learning_rate": 5.191242416249011e-06, "loss": 0.0014, "step": 8398 }, { "epoch": 2.215802664556127, "grad_norm": 0.6901193857192993, "learning_rate": 5.190890706058209e-06, "loss": 0.0089, "step": 8400 }, { "epoch": 2.2163302994327925, "grad_norm": 0.009605759754776955, "learning_rate": 5.1905389958674056e-06, "loss": 0.0006, "step": 8402 }, { "epoch": 2.216857934309458, "grad_norm": 0.16936422884464264, "learning_rate": 5.1901872856766025e-06, "loss": 0.0038, "step": 8404 }, { "epoch": 2.2173855691861233, "grad_norm": 0.008900580927729607, "learning_rate": 5.1898355754858e-06, "loss": 0.0007, "step": 8406 }, { "epoch": 2.2179132040627887, "grad_norm": 0.36889225244522095, "learning_rate": 5.1894838652949964e-06, "loss": 0.0024, "step": 8408 }, { "epoch": 2.218440838939454, "grad_norm": 0.08767276257276535, "learning_rate": 5.189132155104194e-06, "loss": 0.0036, "step": 8410 }, { "epoch": 2.218968473816119, "grad_norm": 0.2453479766845703, "learning_rate": 5.188780444913391e-06, "loss": 0.0029, "step": 8412 }, { "epoch": 2.2194961086927845, "grad_norm": 0.6631215810775757, "learning_rate": 5.188428734722589e-06, "loss": 0.0018, "step": 8414 }, { "epoch": 2.22002374356945, "grad_norm": 0.3123356103897095, "learning_rate": 5.188077024531786e-06, "loss": 0.0056, "step": 8416 }, { "epoch": 2.2205513784461153, "grad_norm": 0.34643590450286865, "learning_rate": 5.187725314340983e-06, "loss": 0.0027, "step": 8418 }, { "epoch": 2.2210790133227807, "grad_norm": 0.04703021049499512, "learning_rate": 5.187373604150181e-06, "loss": 0.0007, "step": 8420 }, { "epoch": 2.221606648199446, "grad_norm": 0.5513396859169006, "learning_rate": 5.187021893959378e-06, "loss": 0.0029, "step": 8422 }, { "epoch": 2.2221342830761115, "grad_norm": 0.4423471987247467, "learning_rate": 5.186670183768575e-06, "loss": 0.0018, "step": 8424 }, { "epoch": 2.2226619179527765, "grad_norm": 0.01745409518480301, "learning_rate": 5.186318473577772e-06, "loss": 0.0063, "step": 8426 }, { "epoch": 2.223189552829442, "grad_norm": 0.032604917883872986, "learning_rate": 5.185966763386969e-06, "loss": 0.0007, "step": 8428 }, { "epoch": 2.2237171877061073, "grad_norm": 0.01312493160367012, "learning_rate": 5.185615053196167e-06, "loss": 0.0018, "step": 8430 }, { "epoch": 2.2242448225827727, "grad_norm": 0.02310592122375965, "learning_rate": 5.185263343005363e-06, "loss": 0.0006, "step": 8432 }, { "epoch": 2.224772457459438, "grad_norm": 0.32505032420158386, "learning_rate": 5.184911632814561e-06, "loss": 0.0023, "step": 8434 }, { "epoch": 2.2253000923361035, "grad_norm": 0.030942870303988457, "learning_rate": 5.184559922623758e-06, "loss": 0.0006, "step": 8436 }, { "epoch": 2.225827727212769, "grad_norm": 0.2764758765697479, "learning_rate": 5.184208212432955e-06, "loss": 0.0067, "step": 8438 }, { "epoch": 2.2263553620894343, "grad_norm": 0.11091317236423492, "learning_rate": 5.183856502242153e-06, "loss": 0.0022, "step": 8440 }, { "epoch": 2.2268829969660993, "grad_norm": 0.05249127745628357, "learning_rate": 5.18350479205135e-06, "loss": 0.0025, "step": 8442 }, { "epoch": 2.2274106318427647, "grad_norm": 0.02289772778749466, "learning_rate": 5.1831530818605475e-06, "loss": 0.0008, "step": 8444 }, { "epoch": 2.22793826671943, "grad_norm": 0.102070152759552, "learning_rate": 5.182801371669744e-06, "loss": 0.0098, "step": 8446 }, { "epoch": 2.2284659015960955, "grad_norm": 0.03105788677930832, "learning_rate": 5.182449661478941e-06, "loss": 0.0014, "step": 8448 }, { "epoch": 2.228993536472761, "grad_norm": 0.007011971902102232, "learning_rate": 5.182097951288139e-06, "loss": 0.0048, "step": 8450 }, { "epoch": 2.2295211713494263, "grad_norm": 0.4882867634296417, "learning_rate": 5.181746241097336e-06, "loss": 0.0012, "step": 8452 }, { "epoch": 2.2300488062260917, "grad_norm": 0.01788165047764778, "learning_rate": 5.181394530906533e-06, "loss": 0.0006, "step": 8454 }, { "epoch": 2.230576441102757, "grad_norm": 0.01517705712467432, "learning_rate": 5.18104282071573e-06, "loss": 0.0094, "step": 8456 }, { "epoch": 2.231104075979422, "grad_norm": 0.061473093926906586, "learning_rate": 5.180691110524928e-06, "loss": 0.0008, "step": 8458 }, { "epoch": 2.2316317108560875, "grad_norm": 0.247174471616745, "learning_rate": 5.180339400334125e-06, "loss": 0.0024, "step": 8460 }, { "epoch": 2.232159345732753, "grad_norm": 1.192300796508789, "learning_rate": 5.179987690143322e-06, "loss": 0.0061, "step": 8462 }, { "epoch": 2.2326869806094183, "grad_norm": 0.2468528300523758, "learning_rate": 5.1796359799525196e-06, "loss": 0.0065, "step": 8464 }, { "epoch": 2.2332146154860837, "grad_norm": 0.016143452376127243, "learning_rate": 5.1792842697617165e-06, "loss": 0.0007, "step": 8466 }, { "epoch": 2.233742250362749, "grad_norm": 0.040562015026807785, "learning_rate": 5.1789325595709135e-06, "loss": 0.0006, "step": 8468 }, { "epoch": 2.2342698852394145, "grad_norm": 0.02649901621043682, "learning_rate": 5.1785808493801104e-06, "loss": 0.0007, "step": 8470 }, { "epoch": 2.2347975201160795, "grad_norm": 0.014409623108804226, "learning_rate": 5.178229139189308e-06, "loss": 0.0006, "step": 8472 }, { "epoch": 2.235325154992745, "grad_norm": 0.04406273737549782, "learning_rate": 5.177877428998505e-06, "loss": 0.0014, "step": 8474 }, { "epoch": 2.2358527898694103, "grad_norm": 0.011089621111750603, "learning_rate": 5.177525718807702e-06, "loss": 0.0006, "step": 8476 }, { "epoch": 2.2363804247460757, "grad_norm": 0.6745510101318359, "learning_rate": 5.1771740086169e-06, "loss": 0.0049, "step": 8478 }, { "epoch": 2.236908059622741, "grad_norm": 0.12824808061122894, "learning_rate": 5.176822298426097e-06, "loss": 0.0053, "step": 8480 }, { "epoch": 2.2374356944994065, "grad_norm": 0.24829761683940887, "learning_rate": 5.176470588235295e-06, "loss": 0.007, "step": 8482 }, { "epoch": 2.237963329376072, "grad_norm": 0.5159401297569275, "learning_rate": 5.176118878044492e-06, "loss": 0.004, "step": 8484 }, { "epoch": 2.2384909642527373, "grad_norm": 0.15908609330654144, "learning_rate": 5.175767167853689e-06, "loss": 0.0014, "step": 8486 }, { "epoch": 2.2390185991294023, "grad_norm": 0.49183207750320435, "learning_rate": 5.175415457662886e-06, "loss": 0.0032, "step": 8488 }, { "epoch": 2.2395462340060677, "grad_norm": 0.04072689265012741, "learning_rate": 5.175063747472083e-06, "loss": 0.0008, "step": 8490 }, { "epoch": 2.240073868882733, "grad_norm": 0.024728255346417427, "learning_rate": 5.17471203728128e-06, "loss": 0.0005, "step": 8492 }, { "epoch": 2.2406015037593985, "grad_norm": 0.4495720863342285, "learning_rate": 5.174360327090477e-06, "loss": 0.0102, "step": 8494 }, { "epoch": 2.241129138636064, "grad_norm": 0.3948935568332672, "learning_rate": 5.174008616899675e-06, "loss": 0.0047, "step": 8496 }, { "epoch": 2.2416567735127293, "grad_norm": 0.06738210469484329, "learning_rate": 5.173656906708872e-06, "loss": 0.0025, "step": 8498 }, { "epoch": 2.2421844083893947, "grad_norm": 0.2294745147228241, "learning_rate": 5.173305196518069e-06, "loss": 0.0049, "step": 8500 }, { "epoch": 2.24271204326606, "grad_norm": 0.0118575319647789, "learning_rate": 5.172953486327267e-06, "loss": 0.0007, "step": 8502 }, { "epoch": 2.243239678142725, "grad_norm": 0.1303650140762329, "learning_rate": 5.172601776136464e-06, "loss": 0.001, "step": 8504 }, { "epoch": 2.2437673130193905, "grad_norm": 0.10836582630872726, "learning_rate": 5.172250065945661e-06, "loss": 0.0017, "step": 8506 }, { "epoch": 2.244294947896056, "grad_norm": 0.07588489353656769, "learning_rate": 5.1718983557548584e-06, "loss": 0.0039, "step": 8508 }, { "epoch": 2.2448225827727213, "grad_norm": 0.01070018857717514, "learning_rate": 5.171546645564055e-06, "loss": 0.0008, "step": 8510 }, { "epoch": 2.2453502176493867, "grad_norm": 0.8332910537719727, "learning_rate": 5.171194935373253e-06, "loss": 0.0017, "step": 8512 }, { "epoch": 2.245877852526052, "grad_norm": 0.05095122382044792, "learning_rate": 5.170843225182449e-06, "loss": 0.0007, "step": 8514 }, { "epoch": 2.2464054874027175, "grad_norm": 0.05373130738735199, "learning_rate": 5.170491514991647e-06, "loss": 0.0017, "step": 8516 }, { "epoch": 2.2469331222793825, "grad_norm": 0.45318803191185, "learning_rate": 5.170139804800844e-06, "loss": 0.0046, "step": 8518 }, { "epoch": 2.247460757156048, "grad_norm": 0.01325407437980175, "learning_rate": 5.169788094610042e-06, "loss": 0.0009, "step": 8520 }, { "epoch": 2.2479883920327133, "grad_norm": 0.32221823930740356, "learning_rate": 5.169436384419239e-06, "loss": 0.0026, "step": 8522 }, { "epoch": 2.2485160269093787, "grad_norm": 0.8471259474754333, "learning_rate": 5.169084674228436e-06, "loss": 0.0028, "step": 8524 }, { "epoch": 2.249043661786044, "grad_norm": 0.02188778668642044, "learning_rate": 5.1687329640376336e-06, "loss": 0.0005, "step": 8526 }, { "epoch": 2.2495712966627095, "grad_norm": 0.29867231845855713, "learning_rate": 5.16838125384683e-06, "loss": 0.0108, "step": 8528 }, { "epoch": 2.250098931539375, "grad_norm": 0.0180283784866333, "learning_rate": 5.1680295436560275e-06, "loss": 0.0005, "step": 8530 }, { "epoch": 2.25062656641604, "grad_norm": 0.029346568509936333, "learning_rate": 5.167677833465224e-06, "loss": 0.0005, "step": 8532 }, { "epoch": 2.2511542012927053, "grad_norm": 0.006010385695844889, "learning_rate": 5.167326123274422e-06, "loss": 0.0008, "step": 8534 }, { "epoch": 2.2516818361693707, "grad_norm": 0.6014181971549988, "learning_rate": 5.166974413083619e-06, "loss": 0.0179, "step": 8536 }, { "epoch": 2.252209471046036, "grad_norm": 0.10053841024637222, "learning_rate": 5.166622702892816e-06, "loss": 0.0008, "step": 8538 }, { "epoch": 2.2527371059227015, "grad_norm": 0.025777731090784073, "learning_rate": 5.166270992702014e-06, "loss": 0.0034, "step": 8540 }, { "epoch": 2.253264740799367, "grad_norm": 0.07179642468690872, "learning_rate": 5.165919282511211e-06, "loss": 0.0007, "step": 8542 }, { "epoch": 2.2537923756760323, "grad_norm": 0.2542804181575775, "learning_rate": 5.165567572320408e-06, "loss": 0.0022, "step": 8544 }, { "epoch": 2.2543200105526977, "grad_norm": 0.0116888377815485, "learning_rate": 5.165215862129606e-06, "loss": 0.0041, "step": 8546 }, { "epoch": 2.254847645429363, "grad_norm": 0.06753759831190109, "learning_rate": 5.164864151938803e-06, "loss": 0.0007, "step": 8548 }, { "epoch": 2.255375280306028, "grad_norm": 0.10822845250368118, "learning_rate": 5.164512441748e-06, "loss": 0.0025, "step": 8550 }, { "epoch": 2.2559029151826935, "grad_norm": 0.4923752248287201, "learning_rate": 5.1641607315571965e-06, "loss": 0.0033, "step": 8552 }, { "epoch": 2.256430550059359, "grad_norm": 1.3701562881469727, "learning_rate": 5.163809021366394e-06, "loss": 0.0086, "step": 8554 }, { "epoch": 2.2569581849360243, "grad_norm": 0.03809276223182678, "learning_rate": 5.163457311175591e-06, "loss": 0.0006, "step": 8556 }, { "epoch": 2.2574858198126897, "grad_norm": 0.27917909622192383, "learning_rate": 5.163105600984788e-06, "loss": 0.0072, "step": 8558 }, { "epoch": 2.258013454689355, "grad_norm": 0.02389122173190117, "learning_rate": 5.162753890793986e-06, "loss": 0.0089, "step": 8560 }, { "epoch": 2.2585410895660205, "grad_norm": 0.6033251881599426, "learning_rate": 5.162402180603183e-06, "loss": 0.0126, "step": 8562 }, { "epoch": 2.2590687244426855, "grad_norm": 0.03476421535015106, "learning_rate": 5.162050470412381e-06, "loss": 0.0005, "step": 8564 }, { "epoch": 2.259596359319351, "grad_norm": 0.1061694324016571, "learning_rate": 5.161698760221578e-06, "loss": 0.0131, "step": 8566 }, { "epoch": 2.2601239941960163, "grad_norm": 0.27705830335617065, "learning_rate": 5.161347050030775e-06, "loss": 0.0017, "step": 8568 }, { "epoch": 2.2606516290726817, "grad_norm": 0.017449581995606422, "learning_rate": 5.1609953398399724e-06, "loss": 0.0009, "step": 8570 }, { "epoch": 2.261179263949347, "grad_norm": 0.3471437990665436, "learning_rate": 5.160643629649169e-06, "loss": 0.0023, "step": 8572 }, { "epoch": 2.2617068988260125, "grad_norm": 0.3089016079902649, "learning_rate": 5.160291919458366e-06, "loss": 0.0041, "step": 8574 }, { "epoch": 2.262234533702678, "grad_norm": 0.041484732180833817, "learning_rate": 5.159940209267563e-06, "loss": 0.0027, "step": 8576 }, { "epoch": 2.262762168579343, "grad_norm": 0.45590150356292725, "learning_rate": 5.159588499076761e-06, "loss": 0.0037, "step": 8578 }, { "epoch": 2.2632898034560083, "grad_norm": 0.03175773471593857, "learning_rate": 5.159236788885958e-06, "loss": 0.0012, "step": 8580 }, { "epoch": 2.2638174383326737, "grad_norm": 0.15567003190517426, "learning_rate": 5.158885078695155e-06, "loss": 0.0024, "step": 8582 }, { "epoch": 2.264345073209339, "grad_norm": 0.07419619709253311, "learning_rate": 5.158533368504353e-06, "loss": 0.0007, "step": 8584 }, { "epoch": 2.2648727080860045, "grad_norm": 0.20203687250614166, "learning_rate": 5.15818165831355e-06, "loss": 0.0124, "step": 8586 }, { "epoch": 2.26540034296267, "grad_norm": 0.11452993005514145, "learning_rate": 5.157829948122747e-06, "loss": 0.0021, "step": 8588 }, { "epoch": 2.2659279778393353, "grad_norm": 0.01711185835301876, "learning_rate": 5.157478237931944e-06, "loss": 0.0005, "step": 8590 }, { "epoch": 2.2664556127160007, "grad_norm": 0.008078766986727715, "learning_rate": 5.1571265277411415e-06, "loss": 0.0005, "step": 8592 }, { "epoch": 2.266983247592666, "grad_norm": 0.018508240580558777, "learning_rate": 5.156774817550339e-06, "loss": 0.0039, "step": 8594 }, { "epoch": 2.267510882469331, "grad_norm": 0.018532708287239075, "learning_rate": 5.156423107359535e-06, "loss": 0.0006, "step": 8596 }, { "epoch": 2.2680385173459965, "grad_norm": 0.11065277457237244, "learning_rate": 5.156071397168733e-06, "loss": 0.0009, "step": 8598 }, { "epoch": 2.268566152222662, "grad_norm": 0.009334288537502289, "learning_rate": 5.15571968697793e-06, "loss": 0.0008, "step": 8600 }, { "epoch": 2.2690937870993273, "grad_norm": 0.134831964969635, "learning_rate": 5.155367976787128e-06, "loss": 0.001, "step": 8602 }, { "epoch": 2.2696214219759927, "grad_norm": 0.6478462219238281, "learning_rate": 5.155016266596325e-06, "loss": 0.0018, "step": 8604 }, { "epoch": 2.270149056852658, "grad_norm": 0.2663716673851013, "learning_rate": 5.154664556405522e-06, "loss": 0.0017, "step": 8606 }, { "epoch": 2.2706766917293235, "grad_norm": 0.008147064596414566, "learning_rate": 5.15431284621472e-06, "loss": 0.0021, "step": 8608 }, { "epoch": 2.2712043266059885, "grad_norm": 0.3196045160293579, "learning_rate": 5.1539611360239166e-06, "loss": 0.0012, "step": 8610 }, { "epoch": 2.271731961482654, "grad_norm": 0.11331094801425934, "learning_rate": 5.1536094258331135e-06, "loss": 0.0007, "step": 8612 }, { "epoch": 2.2722595963593193, "grad_norm": 0.7411702275276184, "learning_rate": 5.1532577156423105e-06, "loss": 0.0054, "step": 8614 }, { "epoch": 2.2727872312359847, "grad_norm": 0.06878668069839478, "learning_rate": 5.152906005451508e-06, "loss": 0.0008, "step": 8616 }, { "epoch": 2.27331486611265, "grad_norm": 0.0171650443226099, "learning_rate": 5.152554295260705e-06, "loss": 0.0007, "step": 8618 }, { "epoch": 2.2738425009893155, "grad_norm": 0.2911936342716217, "learning_rate": 5.152202585069902e-06, "loss": 0.001, "step": 8620 }, { "epoch": 2.274370135865981, "grad_norm": 0.5301318764686584, "learning_rate": 5.1518508748791e-06, "loss": 0.0024, "step": 8622 }, { "epoch": 2.274897770742646, "grad_norm": 0.023420993238687515, "learning_rate": 5.151499164688297e-06, "loss": 0.0106, "step": 8624 }, { "epoch": 2.2754254056193113, "grad_norm": 0.2385084331035614, "learning_rate": 5.151147454497494e-06, "loss": 0.0011, "step": 8626 }, { "epoch": 2.2759530404959767, "grad_norm": 0.2875806391239166, "learning_rate": 5.150795744306692e-06, "loss": 0.0012, "step": 8628 }, { "epoch": 2.276480675372642, "grad_norm": 0.03260553628206253, "learning_rate": 5.150444034115889e-06, "loss": 0.0008, "step": 8630 }, { "epoch": 2.2770083102493075, "grad_norm": 0.005412244703620672, "learning_rate": 5.1500923239250864e-06, "loss": 0.0061, "step": 8632 }, { "epoch": 2.277535945125973, "grad_norm": 0.01696709915995598, "learning_rate": 5.1497406137342825e-06, "loss": 0.0004, "step": 8634 }, { "epoch": 2.2780635800026383, "grad_norm": 0.030781321227550507, "learning_rate": 5.14938890354348e-06, "loss": 0.0005, "step": 8636 }, { "epoch": 2.2785912148793033, "grad_norm": 0.15350085496902466, "learning_rate": 5.149037193352677e-06, "loss": 0.0007, "step": 8638 }, { "epoch": 2.2791188497559687, "grad_norm": 0.004008461255580187, "learning_rate": 5.148685483161875e-06, "loss": 0.0004, "step": 8640 }, { "epoch": 2.279646484632634, "grad_norm": 0.13722190260887146, "learning_rate": 5.148333772971072e-06, "loss": 0.0089, "step": 8642 }, { "epoch": 2.2801741195092995, "grad_norm": 0.006346656940877438, "learning_rate": 5.147982062780269e-06, "loss": 0.0004, "step": 8644 }, { "epoch": 2.280701754385965, "grad_norm": 0.702739417552948, "learning_rate": 5.147630352589467e-06, "loss": 0.014, "step": 8646 }, { "epoch": 2.2812293892626303, "grad_norm": 0.006731912959367037, "learning_rate": 5.147278642398663e-06, "loss": 0.0058, "step": 8648 }, { "epoch": 2.2817570241392957, "grad_norm": 0.3761717975139618, "learning_rate": 5.146926932207861e-06, "loss": 0.0084, "step": 8650 }, { "epoch": 2.282284659015961, "grad_norm": 0.012377557344734669, "learning_rate": 5.1465752220170585e-06, "loss": 0.0005, "step": 8652 }, { "epoch": 2.2828122938926265, "grad_norm": 0.05973290652036667, "learning_rate": 5.1462235118262555e-06, "loss": 0.0007, "step": 8654 }, { "epoch": 2.2833399287692915, "grad_norm": 0.012925583869218826, "learning_rate": 5.145871801635452e-06, "loss": 0.0005, "step": 8656 }, { "epoch": 2.283867563645957, "grad_norm": 0.04888451099395752, "learning_rate": 5.145520091444649e-06, "loss": 0.0007, "step": 8658 }, { "epoch": 2.2843951985226223, "grad_norm": 0.026566902175545692, "learning_rate": 5.145168381253847e-06, "loss": 0.0006, "step": 8660 }, { "epoch": 2.2849228333992877, "grad_norm": 0.4260731339454651, "learning_rate": 5.144816671063044e-06, "loss": 0.0141, "step": 8662 }, { "epoch": 2.285450468275953, "grad_norm": 0.022562475875020027, "learning_rate": 5.144464960872241e-06, "loss": 0.014, "step": 8664 }, { "epoch": 2.2859781031526185, "grad_norm": 0.030182020738720894, "learning_rate": 5.144113250681439e-06, "loss": 0.0106, "step": 8666 }, { "epoch": 2.286505738029284, "grad_norm": 0.3170395791530609, "learning_rate": 5.143761540490636e-06, "loss": 0.0111, "step": 8668 }, { "epoch": 2.287033372905949, "grad_norm": 0.37638384103775024, "learning_rate": 5.143409830299834e-06, "loss": 0.0034, "step": 8670 }, { "epoch": 2.2875610077826143, "grad_norm": 0.17311608791351318, "learning_rate": 5.14305812010903e-06, "loss": 0.0012, "step": 8672 }, { "epoch": 2.2880886426592797, "grad_norm": 0.06888104230165482, "learning_rate": 5.1427064099182275e-06, "loss": 0.0007, "step": 8674 }, { "epoch": 2.288616277535945, "grad_norm": 0.024147657677531242, "learning_rate": 5.1423546997274245e-06, "loss": 0.0006, "step": 8676 }, { "epoch": 2.2891439124126105, "grad_norm": 0.4744117259979248, "learning_rate": 5.142002989536621e-06, "loss": 0.0061, "step": 8678 }, { "epoch": 2.289671547289276, "grad_norm": 0.14604726433753967, "learning_rate": 5.141651279345819e-06, "loss": 0.0011, "step": 8680 }, { "epoch": 2.2901991821659413, "grad_norm": 0.02368093840777874, "learning_rate": 5.141299569155016e-06, "loss": 0.0007, "step": 8682 }, { "epoch": 2.2907268170426063, "grad_norm": 0.0388215146958828, "learning_rate": 5.140947858964214e-06, "loss": 0.0008, "step": 8684 }, { "epoch": 2.2912544519192717, "grad_norm": 0.011549115180969238, "learning_rate": 5.140596148773411e-06, "loss": 0.0005, "step": 8686 }, { "epoch": 2.291782086795937, "grad_norm": 0.20275206863880157, "learning_rate": 5.140244438582608e-06, "loss": 0.009, "step": 8688 }, { "epoch": 2.2923097216726025, "grad_norm": 0.01777053438127041, "learning_rate": 5.139892728391806e-06, "loss": 0.0005, "step": 8690 }, { "epoch": 2.292837356549268, "grad_norm": 0.016843562945723534, "learning_rate": 5.139541018201003e-06, "loss": 0.0005, "step": 8692 }, { "epoch": 2.2933649914259333, "grad_norm": 0.28675463795661926, "learning_rate": 5.1391893080102e-06, "loss": 0.0082, "step": 8694 }, { "epoch": 2.2938926263025987, "grad_norm": 0.1693117320537567, "learning_rate": 5.1388375978193965e-06, "loss": 0.004, "step": 8696 }, { "epoch": 2.294420261179264, "grad_norm": 0.10012185573577881, "learning_rate": 5.138485887628594e-06, "loss": 0.0013, "step": 8698 }, { "epoch": 2.2949478960559295, "grad_norm": 0.15169794857501984, "learning_rate": 5.138134177437791e-06, "loss": 0.004, "step": 8700 }, { "epoch": 2.2954755309325945, "grad_norm": 0.027262229472398758, "learning_rate": 5.137782467246988e-06, "loss": 0.0011, "step": 8702 }, { "epoch": 2.29600316580926, "grad_norm": 0.029447900131344795, "learning_rate": 5.137430757056186e-06, "loss": 0.0006, "step": 8704 }, { "epoch": 2.2965308006859253, "grad_norm": 0.27390944957733154, "learning_rate": 5.137079046865383e-06, "loss": 0.0053, "step": 8706 }, { "epoch": 2.2970584355625907, "grad_norm": 0.02351076528429985, "learning_rate": 5.13672733667458e-06, "loss": 0.0011, "step": 8708 }, { "epoch": 2.297586070439256, "grad_norm": 0.7420263290405273, "learning_rate": 5.136375626483778e-06, "loss": 0.002, "step": 8710 }, { "epoch": 2.2981137053159215, "grad_norm": 0.0317399837076664, "learning_rate": 5.136023916292975e-06, "loss": 0.0025, "step": 8712 }, { "epoch": 2.298641340192587, "grad_norm": 0.024421941488981247, "learning_rate": 5.1356722061021725e-06, "loss": 0.0006, "step": 8714 }, { "epoch": 2.299168975069252, "grad_norm": 0.1949790120124817, "learning_rate": 5.135320495911369e-06, "loss": 0.0024, "step": 8716 }, { "epoch": 2.2996966099459173, "grad_norm": 0.834918200969696, "learning_rate": 5.134968785720566e-06, "loss": 0.009, "step": 8718 }, { "epoch": 2.3002242448225827, "grad_norm": 0.035864800214767456, "learning_rate": 5.134617075529763e-06, "loss": 0.0096, "step": 8720 }, { "epoch": 2.300751879699248, "grad_norm": 0.6262558698654175, "learning_rate": 5.134265365338961e-06, "loss": 0.0068, "step": 8722 }, { "epoch": 2.3012795145759135, "grad_norm": 0.0358528308570385, "learning_rate": 5.133913655148158e-06, "loss": 0.0009, "step": 8724 }, { "epoch": 2.301807149452579, "grad_norm": 0.09280488640069962, "learning_rate": 5.133561944957355e-06, "loss": 0.0109, "step": 8726 }, { "epoch": 2.3023347843292443, "grad_norm": 0.25905805826187134, "learning_rate": 5.133210234766553e-06, "loss": 0.0034, "step": 8728 }, { "epoch": 2.3028624192059093, "grad_norm": 0.25159844756126404, "learning_rate": 5.13285852457575e-06, "loss": 0.0027, "step": 8730 }, { "epoch": 2.3033900540825747, "grad_norm": 0.6593693494796753, "learning_rate": 5.132506814384947e-06, "loss": 0.0197, "step": 8732 }, { "epoch": 2.30391768895924, "grad_norm": 0.3689116835594177, "learning_rate": 5.132155104194144e-06, "loss": 0.0059, "step": 8734 }, { "epoch": 2.3044453238359055, "grad_norm": 0.15008294582366943, "learning_rate": 5.1318033940033415e-06, "loss": 0.0006, "step": 8736 }, { "epoch": 2.304972958712571, "grad_norm": 0.21096722781658173, "learning_rate": 5.131451683812539e-06, "loss": 0.0114, "step": 8738 }, { "epoch": 2.3055005935892363, "grad_norm": 0.14676959812641144, "learning_rate": 5.131099973621735e-06, "loss": 0.0089, "step": 8740 }, { "epoch": 2.3060282284659017, "grad_norm": 0.010393660515546799, "learning_rate": 5.130748263430933e-06, "loss": 0.005, "step": 8742 }, { "epoch": 2.306555863342567, "grad_norm": 0.039148684591054916, "learning_rate": 5.13039655324013e-06, "loss": 0.0006, "step": 8744 }, { "epoch": 2.3070834982192325, "grad_norm": 0.28383898735046387, "learning_rate": 5.130044843049327e-06, "loss": 0.0082, "step": 8746 }, { "epoch": 2.3076111330958975, "grad_norm": 0.06391601264476776, "learning_rate": 5.129693132858525e-06, "loss": 0.0007, "step": 8748 }, { "epoch": 2.308138767972563, "grad_norm": 0.170936718583107, "learning_rate": 5.129341422667722e-06, "loss": 0.0014, "step": 8750 }, { "epoch": 2.3086664028492283, "grad_norm": 0.08396033197641373, "learning_rate": 5.12898971247692e-06, "loss": 0.001, "step": 8752 }, { "epoch": 2.3091940377258937, "grad_norm": 0.16997505724430084, "learning_rate": 5.128638002286116e-06, "loss": 0.0057, "step": 8754 }, { "epoch": 2.309721672602559, "grad_norm": 0.14351503551006317, "learning_rate": 5.1282862920953136e-06, "loss": 0.0012, "step": 8756 }, { "epoch": 2.3102493074792245, "grad_norm": 0.18109551072120667, "learning_rate": 5.1279345819045105e-06, "loss": 0.0022, "step": 8758 }, { "epoch": 2.31077694235589, "grad_norm": 0.24242135882377625, "learning_rate": 5.127582871713708e-06, "loss": 0.0014, "step": 8760 }, { "epoch": 2.311304577232555, "grad_norm": 0.017100706696510315, "learning_rate": 5.127231161522905e-06, "loss": 0.0006, "step": 8762 }, { "epoch": 2.3118322121092203, "grad_norm": 0.2277912199497223, "learning_rate": 5.126879451332102e-06, "loss": 0.0013, "step": 8764 }, { "epoch": 2.3123598469858857, "grad_norm": 0.027185959741473198, "learning_rate": 5.1265277411413e-06, "loss": 0.001, "step": 8766 }, { "epoch": 2.312887481862551, "grad_norm": 0.01064714789390564, "learning_rate": 5.126176030950497e-06, "loss": 0.0009, "step": 8768 }, { "epoch": 2.3134151167392165, "grad_norm": 0.11809872090816498, "learning_rate": 5.125824320759694e-06, "loss": 0.0008, "step": 8770 }, { "epoch": 2.313942751615882, "grad_norm": 0.044424090534448624, "learning_rate": 5.125472610568892e-06, "loss": 0.0105, "step": 8772 }, { "epoch": 2.3144703864925473, "grad_norm": 0.10655595362186432, "learning_rate": 5.125120900378089e-06, "loss": 0.0013, "step": 8774 }, { "epoch": 2.3149980213692123, "grad_norm": 0.2701515853404999, "learning_rate": 5.124769190187286e-06, "loss": 0.0028, "step": 8776 }, { "epoch": 2.3155256562458777, "grad_norm": 0.35955506563186646, "learning_rate": 5.124417479996483e-06, "loss": 0.0118, "step": 8778 }, { "epoch": 2.316053291122543, "grad_norm": 0.25276172161102295, "learning_rate": 5.12406576980568e-06, "loss": 0.0044, "step": 8780 }, { "epoch": 2.3165809259992085, "grad_norm": 0.1830400675535202, "learning_rate": 5.123714059614877e-06, "loss": 0.0014, "step": 8782 }, { "epoch": 2.317108560875874, "grad_norm": 0.3228776454925537, "learning_rate": 5.123362349424074e-06, "loss": 0.0138, "step": 8784 }, { "epoch": 2.3176361957525393, "grad_norm": 0.15907812118530273, "learning_rate": 5.123010639233272e-06, "loss": 0.0043, "step": 8786 }, { "epoch": 2.3181638306292047, "grad_norm": 0.5436684489250183, "learning_rate": 5.122658929042469e-06, "loss": 0.0061, "step": 8788 }, { "epoch": 2.31869146550587, "grad_norm": 0.1626545637845993, "learning_rate": 5.122307218851667e-06, "loss": 0.0008, "step": 8790 }, { "epoch": 2.319219100382535, "grad_norm": 0.007765068206936121, "learning_rate": 5.121955508660863e-06, "loss": 0.0045, "step": 8792 }, { "epoch": 2.3197467352592005, "grad_norm": 0.7039000988006592, "learning_rate": 5.121603798470061e-06, "loss": 0.0047, "step": 8794 }, { "epoch": 2.320274370135866, "grad_norm": 0.5987533330917358, "learning_rate": 5.1212520882792586e-06, "loss": 0.0066, "step": 8796 }, { "epoch": 2.3208020050125313, "grad_norm": 0.07176539301872253, "learning_rate": 5.1209003780884555e-06, "loss": 0.0006, "step": 8798 }, { "epoch": 2.3213296398891967, "grad_norm": 0.03266686201095581, "learning_rate": 5.1205486678976525e-06, "loss": 0.0119, "step": 8800 }, { "epoch": 2.321857274765862, "grad_norm": 0.13890567421913147, "learning_rate": 5.120196957706849e-06, "loss": 0.0009, "step": 8802 }, { "epoch": 2.3223849096425275, "grad_norm": 0.7547696828842163, "learning_rate": 5.119845247516047e-06, "loss": 0.0016, "step": 8804 }, { "epoch": 2.322912544519193, "grad_norm": 1.1106449365615845, "learning_rate": 5.119493537325244e-06, "loss": 0.0015, "step": 8806 }, { "epoch": 2.323440179395858, "grad_norm": 0.09638337045907974, "learning_rate": 5.119141827134441e-06, "loss": 0.0011, "step": 8808 }, { "epoch": 2.3239678142725233, "grad_norm": 0.04358277842402458, "learning_rate": 5.118790116943639e-06, "loss": 0.0014, "step": 8810 }, { "epoch": 2.3244954491491887, "grad_norm": 0.5184600353240967, "learning_rate": 5.118438406752836e-06, "loss": 0.0038, "step": 8812 }, { "epoch": 2.325023084025854, "grad_norm": 1.0141353607177734, "learning_rate": 5.118086696562033e-06, "loss": 0.0082, "step": 8814 }, { "epoch": 2.3255507189025195, "grad_norm": 0.16869184374809265, "learning_rate": 5.11773498637123e-06, "loss": 0.0032, "step": 8816 }, { "epoch": 2.326078353779185, "grad_norm": 0.05129823461174965, "learning_rate": 5.1173832761804276e-06, "loss": 0.0082, "step": 8818 }, { "epoch": 2.3266059886558503, "grad_norm": 1.475931167602539, "learning_rate": 5.117031565989625e-06, "loss": 0.0057, "step": 8820 }, { "epoch": 2.3271336235325153, "grad_norm": 0.22163712978363037, "learning_rate": 5.1166798557988215e-06, "loss": 0.0018, "step": 8822 }, { "epoch": 2.3276612584091807, "grad_norm": 0.019284624606370926, "learning_rate": 5.116328145608019e-06, "loss": 0.0005, "step": 8824 }, { "epoch": 2.328188893285846, "grad_norm": 0.05144120380282402, "learning_rate": 5.115976435417216e-06, "loss": 0.0007, "step": 8826 }, { "epoch": 2.3287165281625115, "grad_norm": 0.012570284307003021, "learning_rate": 5.115624725226414e-06, "loss": 0.0009, "step": 8828 }, { "epoch": 2.329244163039177, "grad_norm": 0.38463205099105835, "learning_rate": 5.115273015035611e-06, "loss": 0.0037, "step": 8830 }, { "epoch": 2.3297717979158423, "grad_norm": 0.05936446785926819, "learning_rate": 5.114921304844808e-06, "loss": 0.0014, "step": 8832 }, { "epoch": 2.3302994327925077, "grad_norm": 0.01634257659316063, "learning_rate": 5.114569594654006e-06, "loss": 0.0006, "step": 8834 }, { "epoch": 2.3308270676691727, "grad_norm": 0.008758479729294777, "learning_rate": 5.114217884463202e-06, "loss": 0.0005, "step": 8836 }, { "epoch": 2.331354702545838, "grad_norm": 0.008068740367889404, "learning_rate": 5.1138661742724e-06, "loss": 0.0007, "step": 8838 }, { "epoch": 2.3318823374225035, "grad_norm": 0.26729074120521545, "learning_rate": 5.113514464081597e-06, "loss": 0.0009, "step": 8840 }, { "epoch": 2.332409972299169, "grad_norm": 0.1545226275920868, "learning_rate": 5.113162753890794e-06, "loss": 0.0009, "step": 8842 }, { "epoch": 2.3329376071758343, "grad_norm": 0.3195580542087555, "learning_rate": 5.112811043699991e-06, "loss": 0.0065, "step": 8844 }, { "epoch": 2.3334652420524997, "grad_norm": 0.33188891410827637, "learning_rate": 5.112459333509188e-06, "loss": 0.0047, "step": 8846 }, { "epoch": 2.333992876929165, "grad_norm": 0.036957379430532455, "learning_rate": 5.112107623318386e-06, "loss": 0.0006, "step": 8848 }, { "epoch": 2.3345205118058305, "grad_norm": 0.031698331236839294, "learning_rate": 5.111755913127583e-06, "loss": 0.0005, "step": 8850 }, { "epoch": 2.335048146682496, "grad_norm": 0.010313643142580986, "learning_rate": 5.11140420293678e-06, "loss": 0.0061, "step": 8852 }, { "epoch": 2.335575781559161, "grad_norm": 0.04612208902835846, "learning_rate": 5.111052492745978e-06, "loss": 0.0028, "step": 8854 }, { "epoch": 2.3361034164358263, "grad_norm": 0.050719499588012695, "learning_rate": 5.110700782555175e-06, "loss": 0.0006, "step": 8856 }, { "epoch": 2.3366310513124917, "grad_norm": 0.16073736548423767, "learning_rate": 5.1103490723643725e-06, "loss": 0.0011, "step": 8858 }, { "epoch": 2.337158686189157, "grad_norm": 0.025367507711052895, "learning_rate": 5.109997362173569e-06, "loss": 0.0009, "step": 8860 }, { "epoch": 2.3376863210658225, "grad_norm": 0.2529789209365845, "learning_rate": 5.1096456519827664e-06, "loss": 0.0083, "step": 8862 }, { "epoch": 2.338213955942488, "grad_norm": 0.06382140517234802, "learning_rate": 5.109293941791963e-06, "loss": 0.0011, "step": 8864 }, { "epoch": 2.3387415908191533, "grad_norm": 0.07821550220251083, "learning_rate": 5.10894223160116e-06, "loss": 0.0007, "step": 8866 }, { "epoch": 2.3392692256958183, "grad_norm": 0.005258647259324789, "learning_rate": 5.108590521410358e-06, "loss": 0.0044, "step": 8868 }, { "epoch": 2.3397968605724837, "grad_norm": 0.13085591793060303, "learning_rate": 5.108238811219555e-06, "loss": 0.0134, "step": 8870 }, { "epoch": 2.340324495449149, "grad_norm": 0.15812958776950836, "learning_rate": 5.107887101028753e-06, "loss": 0.0007, "step": 8872 }, { "epoch": 2.3408521303258145, "grad_norm": 0.01149151474237442, "learning_rate": 5.107535390837949e-06, "loss": 0.0008, "step": 8874 }, { "epoch": 2.34137976520248, "grad_norm": 0.5674832463264465, "learning_rate": 5.107183680647147e-06, "loss": 0.0027, "step": 8876 }, { "epoch": 2.3419074000791453, "grad_norm": 0.03625975176692009, "learning_rate": 5.106831970456345e-06, "loss": 0.0005, "step": 8878 }, { "epoch": 2.3424350349558107, "grad_norm": 0.007446324452757835, "learning_rate": 5.1064802602655416e-06, "loss": 0.0006, "step": 8880 }, { "epoch": 2.3429626698324757, "grad_norm": 0.005859872791916132, "learning_rate": 5.1061285500747385e-06, "loss": 0.0049, "step": 8882 }, { "epoch": 2.343490304709141, "grad_norm": 0.06586842983961105, "learning_rate": 5.1057768398839355e-06, "loss": 0.0007, "step": 8884 }, { "epoch": 2.3440179395858065, "grad_norm": 0.005783012602478266, "learning_rate": 5.105425129693133e-06, "loss": 0.0004, "step": 8886 }, { "epoch": 2.344545574462472, "grad_norm": 0.00950938742607832, "learning_rate": 5.10507341950233e-06, "loss": 0.0095, "step": 8888 }, { "epoch": 2.3450732093391373, "grad_norm": 0.011141401715576649, "learning_rate": 5.104721709311527e-06, "loss": 0.0004, "step": 8890 }, { "epoch": 2.3456008442158027, "grad_norm": 0.019699590280652046, "learning_rate": 5.104369999120725e-06, "loss": 0.008, "step": 8892 }, { "epoch": 2.346128479092468, "grad_norm": 0.13571062684059143, "learning_rate": 5.104018288929922e-06, "loss": 0.0005, "step": 8894 }, { "epoch": 2.3466561139691335, "grad_norm": 0.1481795310974121, "learning_rate": 5.103666578739119e-06, "loss": 0.0033, "step": 8896 }, { "epoch": 2.347183748845799, "grad_norm": 0.2600635290145874, "learning_rate": 5.103314868548316e-06, "loss": 0.0035, "step": 8898 }, { "epoch": 2.347711383722464, "grad_norm": 0.006122657563537359, "learning_rate": 5.102963158357514e-06, "loss": 0.0005, "step": 8900 }, { "epoch": 2.3482390185991293, "grad_norm": 0.01306266151368618, "learning_rate": 5.102611448166711e-06, "loss": 0.0032, "step": 8902 }, { "epoch": 2.3487666534757947, "grad_norm": 1.2032451629638672, "learning_rate": 5.1022597379759075e-06, "loss": 0.0043, "step": 8904 }, { "epoch": 2.34929428835246, "grad_norm": 0.007524375803768635, "learning_rate": 5.101908027785105e-06, "loss": 0.0022, "step": 8906 }, { "epoch": 2.3498219232291255, "grad_norm": 0.017531977966427803, "learning_rate": 5.101556317594302e-06, "loss": 0.0004, "step": 8908 }, { "epoch": 2.350349558105791, "grad_norm": 0.6503574848175049, "learning_rate": 5.1012046074035e-06, "loss": 0.0097, "step": 8910 }, { "epoch": 2.3508771929824563, "grad_norm": 0.15481580793857574, "learning_rate": 5.100852897212697e-06, "loss": 0.0023, "step": 8912 }, { "epoch": 2.3514048278591213, "grad_norm": 0.024739675223827362, "learning_rate": 5.100501187021894e-06, "loss": 0.0007, "step": 8914 }, { "epoch": 2.3519324627357867, "grad_norm": 0.5033866167068481, "learning_rate": 5.100149476831092e-06, "loss": 0.003, "step": 8916 }, { "epoch": 2.352460097612452, "grad_norm": 0.019715890288352966, "learning_rate": 5.099797766640289e-06, "loss": 0.0005, "step": 8918 }, { "epoch": 2.3529877324891175, "grad_norm": 0.05355163663625717, "learning_rate": 5.099446056449486e-06, "loss": 0.0006, "step": 8920 }, { "epoch": 2.353515367365783, "grad_norm": 0.0167173333466053, "learning_rate": 5.099094346258683e-06, "loss": 0.0012, "step": 8922 }, { "epoch": 2.3540430022424483, "grad_norm": 0.3047032356262207, "learning_rate": 5.0987426360678804e-06, "loss": 0.0034, "step": 8924 }, { "epoch": 2.3545706371191137, "grad_norm": 0.355065256357193, "learning_rate": 5.098390925877077e-06, "loss": 0.0025, "step": 8926 }, { "epoch": 2.3550982719957787, "grad_norm": 0.8369762897491455, "learning_rate": 5.098039215686274e-06, "loss": 0.0021, "step": 8928 }, { "epoch": 2.355625906872444, "grad_norm": 0.01645299233496189, "learning_rate": 5.097687505495472e-06, "loss": 0.0009, "step": 8930 }, { "epoch": 2.3561535417491095, "grad_norm": 0.3266793489456177, "learning_rate": 5.097335795304669e-06, "loss": 0.0071, "step": 8932 }, { "epoch": 2.356681176625775, "grad_norm": 1.1187236309051514, "learning_rate": 5.096984085113866e-06, "loss": 0.0013, "step": 8934 }, { "epoch": 2.3572088115024403, "grad_norm": 0.08045440912246704, "learning_rate": 5.096632374923064e-06, "loss": 0.0007, "step": 8936 }, { "epoch": 2.3577364463791057, "grad_norm": 0.01369867380708456, "learning_rate": 5.096280664732261e-06, "loss": 0.0006, "step": 8938 }, { "epoch": 2.358264081255771, "grad_norm": 0.07574933022260666, "learning_rate": 5.095928954541459e-06, "loss": 0.0011, "step": 8940 }, { "epoch": 2.3587917161324365, "grad_norm": 0.4122644066810608, "learning_rate": 5.095577244350655e-06, "loss": 0.0014, "step": 8942 }, { "epoch": 2.359319351009102, "grad_norm": 0.010540925897657871, "learning_rate": 5.0952255341598525e-06, "loss": 0.0005, "step": 8944 }, { "epoch": 2.359846985885767, "grad_norm": 0.6392640471458435, "learning_rate": 5.0948738239690495e-06, "loss": 0.0051, "step": 8946 }, { "epoch": 2.3603746207624323, "grad_norm": 0.0544872023165226, "learning_rate": 5.094522113778247e-06, "loss": 0.0006, "step": 8948 }, { "epoch": 2.3609022556390977, "grad_norm": 0.18169835209846497, "learning_rate": 5.094170403587444e-06, "loss": 0.0091, "step": 8950 }, { "epoch": 2.361429890515763, "grad_norm": 0.22564752399921417, "learning_rate": 5.093818693396641e-06, "loss": 0.001, "step": 8952 }, { "epoch": 2.3619575253924285, "grad_norm": 0.4471474587917328, "learning_rate": 5.093466983205839e-06, "loss": 0.0054, "step": 8954 }, { "epoch": 2.362485160269094, "grad_norm": 0.038521602749824524, "learning_rate": 5.093115273015035e-06, "loss": 0.0007, "step": 8956 }, { "epoch": 2.3630127951457593, "grad_norm": 0.5380951762199402, "learning_rate": 5.092763562824233e-06, "loss": 0.0123, "step": 8958 }, { "epoch": 2.3635404300224243, "grad_norm": 0.024946263059973717, "learning_rate": 5.09241185263343e-06, "loss": 0.0005, "step": 8960 }, { "epoch": 2.3640680648990897, "grad_norm": 0.00551377609372139, "learning_rate": 5.092060142442628e-06, "loss": 0.0051, "step": 8962 }, { "epoch": 2.364595699775755, "grad_norm": 0.011542350053787231, "learning_rate": 5.0917084322518246e-06, "loss": 0.0009, "step": 8964 }, { "epoch": 2.3651233346524205, "grad_norm": 0.011926393024623394, "learning_rate": 5.0913567220610215e-06, "loss": 0.0007, "step": 8966 }, { "epoch": 2.365650969529086, "grad_norm": 0.20887166261672974, "learning_rate": 5.091005011870219e-06, "loss": 0.0008, "step": 8968 }, { "epoch": 2.3661786044057513, "grad_norm": 0.6646697521209717, "learning_rate": 5.090653301679416e-06, "loss": 0.0024, "step": 8970 }, { "epoch": 2.3667062392824167, "grad_norm": 0.24653127789497375, "learning_rate": 5.090301591488613e-06, "loss": 0.001, "step": 8972 }, { "epoch": 2.3672338741590817, "grad_norm": 0.021323658525943756, "learning_rate": 5.089949881297811e-06, "loss": 0.0038, "step": 8974 }, { "epoch": 2.367761509035747, "grad_norm": 0.00694310525432229, "learning_rate": 5.089598171107008e-06, "loss": 0.0004, "step": 8976 }, { "epoch": 2.3682891439124125, "grad_norm": 0.0075080376118421555, "learning_rate": 5.089246460916206e-06, "loss": 0.0045, "step": 8978 }, { "epoch": 2.368816778789078, "grad_norm": 0.194957435131073, "learning_rate": 5.088894750725402e-06, "loss": 0.0012, "step": 8980 }, { "epoch": 2.3693444136657433, "grad_norm": 0.32442301511764526, "learning_rate": 5.0885430405346e-06, "loss": 0.0015, "step": 8982 }, { "epoch": 2.3698720485424087, "grad_norm": 0.017926478758454323, "learning_rate": 5.088191330343797e-06, "loss": 0.0006, "step": 8984 }, { "epoch": 2.370399683419074, "grad_norm": 0.14302803575992584, "learning_rate": 5.087839620152994e-06, "loss": 0.0059, "step": 8986 }, { "epoch": 2.370927318295739, "grad_norm": 0.11431556195020676, "learning_rate": 5.087487909962191e-06, "loss": 0.0121, "step": 8988 }, { "epoch": 2.3714549531724045, "grad_norm": 0.0065758624114096165, "learning_rate": 5.087136199771388e-06, "loss": 0.0006, "step": 8990 }, { "epoch": 2.37198258804907, "grad_norm": 0.8758652210235596, "learning_rate": 5.086784489580586e-06, "loss": 0.0039, "step": 8992 }, { "epoch": 2.3725102229257353, "grad_norm": 0.028572631999850273, "learning_rate": 5.086432779389783e-06, "loss": 0.0042, "step": 8994 }, { "epoch": 2.3730378578024007, "grad_norm": 0.3406752943992615, "learning_rate": 5.08608106919898e-06, "loss": 0.0092, "step": 8996 }, { "epoch": 2.373565492679066, "grad_norm": 0.019930623471736908, "learning_rate": 5.085729359008178e-06, "loss": 0.0006, "step": 8998 }, { "epoch": 2.3740931275557315, "grad_norm": 0.044651541858911514, "learning_rate": 5.085377648817375e-06, "loss": 0.0026, "step": 9000 }, { "epoch": 2.374620762432397, "grad_norm": 0.21405446529388428, "learning_rate": 5.085025938626572e-06, "loss": 0.0044, "step": 9002 }, { "epoch": 2.3751483973090624, "grad_norm": 0.08460244536399841, "learning_rate": 5.084674228435769e-06, "loss": 0.0045, "step": 9004 }, { "epoch": 2.3756760321857273, "grad_norm": 0.5104633569717407, "learning_rate": 5.0843225182449665e-06, "loss": 0.0175, "step": 9006 }, { "epoch": 2.3762036670623927, "grad_norm": 0.20645974576473236, "learning_rate": 5.0839708080541635e-06, "loss": 0.0088, "step": 9008 }, { "epoch": 2.376731301939058, "grad_norm": 0.3080645501613617, "learning_rate": 5.08361909786336e-06, "loss": 0.0014, "step": 9010 }, { "epoch": 2.3772589368157235, "grad_norm": 0.2967817485332489, "learning_rate": 5.083267387672558e-06, "loss": 0.0012, "step": 9012 }, { "epoch": 2.377786571692389, "grad_norm": 0.07057041674852371, "learning_rate": 5.082915677481755e-06, "loss": 0.0009, "step": 9014 }, { "epoch": 2.3783142065690543, "grad_norm": 0.12618614733219147, "learning_rate": 5.082563967290952e-06, "loss": 0.0011, "step": 9016 }, { "epoch": 2.3788418414457198, "grad_norm": 0.04997202754020691, "learning_rate": 5.082212257100149e-06, "loss": 0.0027, "step": 9018 }, { "epoch": 2.3793694763223847, "grad_norm": 0.6258527636528015, "learning_rate": 5.081860546909347e-06, "loss": 0.0015, "step": 9020 }, { "epoch": 2.37989711119905, "grad_norm": 0.024956168606877327, "learning_rate": 5.081508836718545e-06, "loss": 0.0114, "step": 9022 }, { "epoch": 2.3804247460757155, "grad_norm": 0.3825171887874603, "learning_rate": 5.081157126527741e-06, "loss": 0.0009, "step": 9024 }, { "epoch": 2.380952380952381, "grad_norm": 0.3863365948200226, "learning_rate": 5.0808054163369386e-06, "loss": 0.0015, "step": 9026 }, { "epoch": 2.3814800158290463, "grad_norm": 0.010148570872843266, "learning_rate": 5.0804537061461355e-06, "loss": 0.0006, "step": 9028 }, { "epoch": 2.3820076507057117, "grad_norm": 0.006957790348678827, "learning_rate": 5.080101995955333e-06, "loss": 0.0079, "step": 9030 }, { "epoch": 2.382535285582377, "grad_norm": 0.024573950096964836, "learning_rate": 5.07975028576453e-06, "loss": 0.0005, "step": 9032 }, { "epoch": 2.383062920459042, "grad_norm": 0.011069098487496376, "learning_rate": 5.079398575573727e-06, "loss": 0.0007, "step": 9034 }, { "epoch": 2.3835905553357075, "grad_norm": 0.011401024647057056, "learning_rate": 5.079046865382925e-06, "loss": 0.0004, "step": 9036 }, { "epoch": 2.384118190212373, "grad_norm": 0.0077980621717870235, "learning_rate": 5.078695155192122e-06, "loss": 0.0013, "step": 9038 }, { "epoch": 2.3846458250890383, "grad_norm": 0.02493930049240589, "learning_rate": 5.078343445001319e-06, "loss": 0.0005, "step": 9040 }, { "epoch": 2.3851734599657037, "grad_norm": 0.24669557809829712, "learning_rate": 5.077991734810516e-06, "loss": 0.0024, "step": 9042 }, { "epoch": 2.385701094842369, "grad_norm": 0.013749698176980019, "learning_rate": 5.077640024619714e-06, "loss": 0.0004, "step": 9044 }, { "epoch": 2.3862287297190345, "grad_norm": 0.04204066842794418, "learning_rate": 5.077288314428911e-06, "loss": 0.0007, "step": 9046 }, { "epoch": 2.3867563645957, "grad_norm": 0.05087048560380936, "learning_rate": 5.076936604238108e-06, "loss": 0.0024, "step": 9048 }, { "epoch": 2.3872839994723654, "grad_norm": 0.2955441474914551, "learning_rate": 5.076584894047305e-06, "loss": 0.0028, "step": 9050 }, { "epoch": 2.3878116343490303, "grad_norm": 0.594487726688385, "learning_rate": 5.076233183856502e-06, "loss": 0.0052, "step": 9052 }, { "epoch": 2.3883392692256957, "grad_norm": 0.01960833929479122, "learning_rate": 5.075881473665699e-06, "loss": 0.0005, "step": 9054 }, { "epoch": 2.388866904102361, "grad_norm": 0.3004758358001709, "learning_rate": 5.075529763474897e-06, "loss": 0.001, "step": 9056 }, { "epoch": 2.3893945389790265, "grad_norm": 0.05498797073960304, "learning_rate": 5.075178053284094e-06, "loss": 0.0005, "step": 9058 }, { "epoch": 2.389922173855692, "grad_norm": 0.1462889015674591, "learning_rate": 5.074826343093292e-06, "loss": 0.0015, "step": 9060 }, { "epoch": 2.3904498087323574, "grad_norm": 0.02494039386510849, "learning_rate": 5.074474632902488e-06, "loss": 0.0031, "step": 9062 }, { "epoch": 2.3909774436090228, "grad_norm": 0.3222584128379822, "learning_rate": 5.074122922711686e-06, "loss": 0.0035, "step": 9064 }, { "epoch": 2.3915050784856877, "grad_norm": 0.038091763854026794, "learning_rate": 5.073771212520883e-06, "loss": 0.0025, "step": 9066 }, { "epoch": 2.392032713362353, "grad_norm": 0.4902643859386444, "learning_rate": 5.0734195023300805e-06, "loss": 0.0054, "step": 9068 }, { "epoch": 2.3925603482390185, "grad_norm": 0.09264049679040909, "learning_rate": 5.0730677921392774e-06, "loss": 0.0014, "step": 9070 }, { "epoch": 2.393087983115684, "grad_norm": 0.08747412264347076, "learning_rate": 5.072716081948474e-06, "loss": 0.0043, "step": 9072 }, { "epoch": 2.3936156179923493, "grad_norm": 0.005794546566903591, "learning_rate": 5.072364371757672e-06, "loss": 0.0004, "step": 9074 }, { "epoch": 2.3941432528690147, "grad_norm": 0.01107536070048809, "learning_rate": 5.072012661566868e-06, "loss": 0.0004, "step": 9076 }, { "epoch": 2.39467088774568, "grad_norm": 0.051068492233753204, "learning_rate": 5.071660951376066e-06, "loss": 0.0006, "step": 9078 }, { "epoch": 2.395198522622345, "grad_norm": 0.01965392380952835, "learning_rate": 5.071309241185264e-06, "loss": 0.0008, "step": 9080 }, { "epoch": 2.3957261574990105, "grad_norm": 0.006660726387053728, "learning_rate": 5.070957530994461e-06, "loss": 0.0006, "step": 9082 }, { "epoch": 2.396253792375676, "grad_norm": 0.5224467515945435, "learning_rate": 5.070605820803658e-06, "loss": 0.0072, "step": 9084 }, { "epoch": 2.3967814272523413, "grad_norm": 0.016788965091109276, "learning_rate": 5.070254110612855e-06, "loss": 0.0004, "step": 9086 }, { "epoch": 2.3973090621290067, "grad_norm": 0.02441471442580223, "learning_rate": 5.0699024004220526e-06, "loss": 0.003, "step": 9088 }, { "epoch": 2.397836697005672, "grad_norm": 0.005210898816585541, "learning_rate": 5.0695506902312495e-06, "loss": 0.0041, "step": 9090 }, { "epoch": 2.3983643318823376, "grad_norm": 0.48644766211509705, "learning_rate": 5.0691989800404465e-06, "loss": 0.0054, "step": 9092 }, { "epoch": 2.398891966759003, "grad_norm": 0.2743516266345978, "learning_rate": 5.068847269849644e-06, "loss": 0.0015, "step": 9094 }, { "epoch": 2.3994196016356684, "grad_norm": 0.4757901728153229, "learning_rate": 5.068495559658841e-06, "loss": 0.0037, "step": 9096 }, { "epoch": 2.3999472365123333, "grad_norm": 0.02721528522670269, "learning_rate": 5.068143849468039e-06, "loss": 0.001, "step": 9098 }, { "epoch": 2.4004748713889987, "grad_norm": 0.09995196014642715, "learning_rate": 5.067792139277235e-06, "loss": 0.0008, "step": 9100 }, { "epoch": 2.401002506265664, "grad_norm": 0.009228317998349667, "learning_rate": 5.067440429086433e-06, "loss": 0.0006, "step": 9102 }, { "epoch": 2.4015301411423295, "grad_norm": 0.1726137101650238, "learning_rate": 5.067088718895631e-06, "loss": 0.0015, "step": 9104 }, { "epoch": 2.402057776018995, "grad_norm": 3.075355291366577, "learning_rate": 5.066737008704827e-06, "loss": 0.0036, "step": 9106 }, { "epoch": 2.4025854108956604, "grad_norm": 0.21417903900146484, "learning_rate": 5.066385298514025e-06, "loss": 0.0013, "step": 9108 }, { "epoch": 2.4031130457723258, "grad_norm": 0.009987307712435722, "learning_rate": 5.0660335883232216e-06, "loss": 0.0005, "step": 9110 }, { "epoch": 2.4036406806489907, "grad_norm": 0.1984531730413437, "learning_rate": 5.065681878132419e-06, "loss": 0.0015, "step": 9112 }, { "epoch": 2.404168315525656, "grad_norm": 0.01920950412750244, "learning_rate": 5.065330167941616e-06, "loss": 0.0016, "step": 9114 }, { "epoch": 2.4046959504023215, "grad_norm": 0.17210838198661804, "learning_rate": 5.064978457750813e-06, "loss": 0.0009, "step": 9116 }, { "epoch": 2.405223585278987, "grad_norm": 0.006824542302638292, "learning_rate": 5.064626747560011e-06, "loss": 0.0007, "step": 9118 }, { "epoch": 2.4057512201556523, "grad_norm": 0.029269497841596603, "learning_rate": 5.064275037369208e-06, "loss": 0.0009, "step": 9120 }, { "epoch": 2.4062788550323178, "grad_norm": 0.0872957780957222, "learning_rate": 5.063923327178405e-06, "loss": 0.0041, "step": 9122 }, { "epoch": 2.406806489908983, "grad_norm": 0.03274156525731087, "learning_rate": 5.063571616987602e-06, "loss": 0.0005, "step": 9124 }, { "epoch": 2.407334124785648, "grad_norm": 0.004742836579680443, "learning_rate": 5.0632199067968e-06, "loss": 0.0012, "step": 9126 }, { "epoch": 2.4078617596623135, "grad_norm": 0.10195928812026978, "learning_rate": 5.062868196605997e-06, "loss": 0.0006, "step": 9128 }, { "epoch": 2.408389394538979, "grad_norm": 0.014130252413451672, "learning_rate": 5.062516486415194e-06, "loss": 0.0005, "step": 9130 }, { "epoch": 2.4089170294156443, "grad_norm": 0.015950698405504227, "learning_rate": 5.0621647762243914e-06, "loss": 0.005, "step": 9132 }, { "epoch": 2.4094446642923097, "grad_norm": 0.010713761672377586, "learning_rate": 5.061813066033588e-06, "loss": 0.0027, "step": 9134 }, { "epoch": 2.409972299168975, "grad_norm": 0.048823755234479904, "learning_rate": 5.061461355842785e-06, "loss": 0.0005, "step": 9136 }, { "epoch": 2.4104999340456406, "grad_norm": 0.004295311402529478, "learning_rate": 5.061109645651983e-06, "loss": 0.0086, "step": 9138 }, { "epoch": 2.4110275689223055, "grad_norm": 0.1793050467967987, "learning_rate": 5.06075793546118e-06, "loss": 0.0009, "step": 9140 }, { "epoch": 2.411555203798971, "grad_norm": 0.024454129859805107, "learning_rate": 5.060406225270378e-06, "loss": 0.0004, "step": 9142 }, { "epoch": 2.4120828386756363, "grad_norm": 0.0313662625849247, "learning_rate": 5.060054515079574e-06, "loss": 0.0004, "step": 9144 }, { "epoch": 2.4126104735523017, "grad_norm": 0.36907461285591125, "learning_rate": 5.059702804888772e-06, "loss": 0.0027, "step": 9146 }, { "epoch": 2.413138108428967, "grad_norm": 0.4558168649673462, "learning_rate": 5.059351094697969e-06, "loss": 0.0032, "step": 9148 }, { "epoch": 2.4136657433056325, "grad_norm": 0.8539291024208069, "learning_rate": 5.0589993845071665e-06, "loss": 0.0034, "step": 9150 }, { "epoch": 2.414193378182298, "grad_norm": 0.004527589306235313, "learning_rate": 5.0586476743163635e-06, "loss": 0.0042, "step": 9152 }, { "epoch": 2.4147210130589634, "grad_norm": 0.45741769671440125, "learning_rate": 5.0582959641255605e-06, "loss": 0.005, "step": 9154 }, { "epoch": 2.4152486479356288, "grad_norm": 1.0714726448059082, "learning_rate": 5.057944253934758e-06, "loss": 0.0035, "step": 9156 }, { "epoch": 2.4157762828122937, "grad_norm": 0.4205358624458313, "learning_rate": 5.057592543743955e-06, "loss": 0.0085, "step": 9158 }, { "epoch": 2.416303917688959, "grad_norm": 0.27318552136421204, "learning_rate": 5.057240833553152e-06, "loss": 0.0026, "step": 9160 }, { "epoch": 2.4168315525656245, "grad_norm": 0.25917425751686096, "learning_rate": 5.05688912336235e-06, "loss": 0.0015, "step": 9162 }, { "epoch": 2.41735918744229, "grad_norm": 0.37146565318107605, "learning_rate": 5.056537413171547e-06, "loss": 0.0118, "step": 9164 }, { "epoch": 2.4178868223189554, "grad_norm": 0.022370457649230957, "learning_rate": 5.056185702980744e-06, "loss": 0.0057, "step": 9166 }, { "epoch": 2.4184144571956208, "grad_norm": 0.4601283371448517, "learning_rate": 5.055833992789941e-06, "loss": 0.0015, "step": 9168 }, { "epoch": 2.418942092072286, "grad_norm": 0.1302143782377243, "learning_rate": 5.055482282599139e-06, "loss": 0.0092, "step": 9170 }, { "epoch": 2.419469726948951, "grad_norm": 0.11327711492776871, "learning_rate": 5.0551305724083356e-06, "loss": 0.0029, "step": 9172 }, { "epoch": 2.4199973618256165, "grad_norm": 0.15659844875335693, "learning_rate": 5.0547788622175325e-06, "loss": 0.0074, "step": 9174 }, { "epoch": 2.420524996702282, "grad_norm": 0.10701067745685577, "learning_rate": 5.05442715202673e-06, "loss": 0.0048, "step": 9176 }, { "epoch": 2.4210526315789473, "grad_norm": 0.2365303486585617, "learning_rate": 5.054075441835927e-06, "loss": 0.0051, "step": 9178 }, { "epoch": 2.4215802664556128, "grad_norm": 0.028207307681441307, "learning_rate": 5.053723731645125e-06, "loss": 0.0038, "step": 9180 }, { "epoch": 2.422107901332278, "grad_norm": 0.10967427492141724, "learning_rate": 5.053372021454321e-06, "loss": 0.0085, "step": 9182 }, { "epoch": 2.4226355362089436, "grad_norm": 0.12945324182510376, "learning_rate": 5.053020311263519e-06, "loss": 0.0012, "step": 9184 }, { "epoch": 2.4231631710856085, "grad_norm": 0.0051793064922094345, "learning_rate": 5.052668601072716e-06, "loss": 0.0004, "step": 9186 }, { "epoch": 2.423690805962274, "grad_norm": 0.00729187298566103, "learning_rate": 5.052316890881914e-06, "loss": 0.0004, "step": 9188 }, { "epoch": 2.4242184408389393, "grad_norm": 0.2403481900691986, "learning_rate": 5.051965180691111e-06, "loss": 0.0068, "step": 9190 }, { "epoch": 2.4247460757156047, "grad_norm": 0.9769510626792908, "learning_rate": 5.051613470500308e-06, "loss": 0.006, "step": 9192 }, { "epoch": 2.42527371059227, "grad_norm": 0.00845382921397686, "learning_rate": 5.0512617603095054e-06, "loss": 0.0129, "step": 9194 }, { "epoch": 2.4258013454689356, "grad_norm": 0.3042184114456177, "learning_rate": 5.050910050118702e-06, "loss": 0.0023, "step": 9196 }, { "epoch": 2.426328980345601, "grad_norm": 0.011142095550894737, "learning_rate": 5.050558339927899e-06, "loss": 0.0004, "step": 9198 }, { "epoch": 2.4268566152222664, "grad_norm": 1.1452364921569824, "learning_rate": 5.050206629737097e-06, "loss": 0.001, "step": 9200 }, { "epoch": 2.4273842500989318, "grad_norm": 0.6035282611846924, "learning_rate": 5.049854919546294e-06, "loss": 0.0037, "step": 9202 }, { "epoch": 2.4279118849755967, "grad_norm": 0.120913565158844, "learning_rate": 5.049503209355491e-06, "loss": 0.001, "step": 9204 }, { "epoch": 2.428439519852262, "grad_norm": 0.04563799872994423, "learning_rate": 5.049151499164688e-06, "loss": 0.0069, "step": 9206 }, { "epoch": 2.4289671547289275, "grad_norm": 0.1985950767993927, "learning_rate": 5.048799788973886e-06, "loss": 0.0012, "step": 9208 }, { "epoch": 2.429494789605593, "grad_norm": 0.05908178165555, "learning_rate": 5.048448078783083e-06, "loss": 0.004, "step": 9210 }, { "epoch": 2.4300224244822584, "grad_norm": 0.889704167842865, "learning_rate": 5.04809636859228e-06, "loss": 0.0016, "step": 9212 }, { "epoch": 2.4305500593589238, "grad_norm": 0.049748484045267105, "learning_rate": 5.0477446584014775e-06, "loss": 0.0007, "step": 9214 }, { "epoch": 2.431077694235589, "grad_norm": 0.07155516743659973, "learning_rate": 5.0473929482106744e-06, "loss": 0.0033, "step": 9216 }, { "epoch": 2.431605329112254, "grad_norm": 0.014014062471687794, "learning_rate": 5.047041238019872e-06, "loss": 0.0005, "step": 9218 }, { "epoch": 2.4321329639889195, "grad_norm": 0.006012560334056616, "learning_rate": 5.046689527829069e-06, "loss": 0.0004, "step": 9220 }, { "epoch": 2.432660598865585, "grad_norm": 0.5513724684715271, "learning_rate": 5.046337817638266e-06, "loss": 0.016, "step": 9222 }, { "epoch": 2.4331882337422504, "grad_norm": 0.12003165483474731, "learning_rate": 5.045986107447464e-06, "loss": 0.0015, "step": 9224 }, { "epoch": 2.4337158686189158, "grad_norm": 0.5168538093566895, "learning_rate": 5.04563439725666e-06, "loss": 0.0036, "step": 9226 }, { "epoch": 2.434243503495581, "grad_norm": 0.2976245582103729, "learning_rate": 5.045282687065858e-06, "loss": 0.0023, "step": 9228 }, { "epoch": 2.4347711383722466, "grad_norm": 0.47360002994537354, "learning_rate": 5.044930976875055e-06, "loss": 0.0017, "step": 9230 }, { "epoch": 2.4352987732489115, "grad_norm": 0.2911134660243988, "learning_rate": 5.044579266684253e-06, "loss": 0.0015, "step": 9232 }, { "epoch": 2.435826408125577, "grad_norm": 0.09476339817047119, "learning_rate": 5.0442275564934496e-06, "loss": 0.0014, "step": 9234 }, { "epoch": 2.4363540430022423, "grad_norm": 0.2658994793891907, "learning_rate": 5.0438758463026465e-06, "loss": 0.0015, "step": 9236 }, { "epoch": 2.4368816778789077, "grad_norm": 0.6088262796401978, "learning_rate": 5.043524136111844e-06, "loss": 0.0055, "step": 9238 }, { "epoch": 2.437409312755573, "grad_norm": 0.18907678127288818, "learning_rate": 5.043172425921041e-06, "loss": 0.0058, "step": 9240 }, { "epoch": 2.4379369476322386, "grad_norm": 0.010753939859569073, "learning_rate": 5.042820715730238e-06, "loss": 0.0006, "step": 9242 }, { "epoch": 2.438464582508904, "grad_norm": 0.09201769530773163, "learning_rate": 5.042469005539435e-06, "loss": 0.0008, "step": 9244 }, { "epoch": 2.4389922173855694, "grad_norm": 0.008941961452364922, "learning_rate": 5.042117295348633e-06, "loss": 0.0023, "step": 9246 }, { "epoch": 2.439519852262235, "grad_norm": 0.04134039953351021, "learning_rate": 5.041765585157831e-06, "loss": 0.0004, "step": 9248 }, { "epoch": 2.4400474871388997, "grad_norm": 0.03742164373397827, "learning_rate": 5.041413874967027e-06, "loss": 0.0005, "step": 9250 }, { "epoch": 2.440575122015565, "grad_norm": 0.03516751527786255, "learning_rate": 5.041062164776225e-06, "loss": 0.0006, "step": 9252 }, { "epoch": 2.4411027568922306, "grad_norm": 0.08542059361934662, "learning_rate": 5.040710454585422e-06, "loss": 0.0007, "step": 9254 }, { "epoch": 2.441630391768896, "grad_norm": 0.005560290068387985, "learning_rate": 5.0403587443946186e-06, "loss": 0.0004, "step": 9256 }, { "epoch": 2.4421580266455614, "grad_norm": 0.10598307847976685, "learning_rate": 5.040007034203816e-06, "loss": 0.0006, "step": 9258 }, { "epoch": 2.4426856615222268, "grad_norm": 0.3155033588409424, "learning_rate": 5.039655324013013e-06, "loss": 0.0021, "step": 9260 }, { "epoch": 2.443213296398892, "grad_norm": 0.013607608154416084, "learning_rate": 5.039303613822211e-06, "loss": 0.0004, "step": 9262 }, { "epoch": 2.443740931275557, "grad_norm": 0.017084842547774315, "learning_rate": 5.038951903631407e-06, "loss": 0.0062, "step": 9264 }, { "epoch": 2.4442685661522225, "grad_norm": 0.009586401283740997, "learning_rate": 5.038600193440605e-06, "loss": 0.0016, "step": 9266 }, { "epoch": 2.444796201028888, "grad_norm": 0.038929250091314316, "learning_rate": 5.038248483249802e-06, "loss": 0.0005, "step": 9268 }, { "epoch": 2.4453238359055534, "grad_norm": 0.05407463386654854, "learning_rate": 5.037896773059e-06, "loss": 0.0039, "step": 9270 }, { "epoch": 2.4458514707822188, "grad_norm": 0.31542205810546875, "learning_rate": 5.037545062868197e-06, "loss": 0.006, "step": 9272 }, { "epoch": 2.446379105658884, "grad_norm": 0.020885515958070755, "learning_rate": 5.037193352677394e-06, "loss": 0.0049, "step": 9274 }, { "epoch": 2.4469067405355496, "grad_norm": 0.04839906468987465, "learning_rate": 5.0368416424865915e-06, "loss": 0.0006, "step": 9276 }, { "epoch": 2.4474343754122145, "grad_norm": 0.026399411261081696, "learning_rate": 5.0364899322957884e-06, "loss": 0.0058, "step": 9278 }, { "epoch": 2.44796201028888, "grad_norm": 0.13588270545005798, "learning_rate": 5.036138222104985e-06, "loss": 0.0078, "step": 9280 }, { "epoch": 2.4484896451655453, "grad_norm": 0.1579245924949646, "learning_rate": 5.035786511914183e-06, "loss": 0.0008, "step": 9282 }, { "epoch": 2.4490172800422108, "grad_norm": 0.011935251764953136, "learning_rate": 5.03543480172338e-06, "loss": 0.0021, "step": 9284 }, { "epoch": 2.449544914918876, "grad_norm": 0.07093897461891174, "learning_rate": 5.035083091532577e-06, "loss": 0.0008, "step": 9286 }, { "epoch": 2.4500725497955416, "grad_norm": 0.12948282063007355, "learning_rate": 5.034731381341774e-06, "loss": 0.0081, "step": 9288 }, { "epoch": 2.450600184672207, "grad_norm": 0.2554248571395874, "learning_rate": 5.034379671150972e-06, "loss": 0.0014, "step": 9290 }, { "epoch": 2.451127819548872, "grad_norm": 0.016398053616285324, "learning_rate": 5.034027960960169e-06, "loss": 0.0005, "step": 9292 }, { "epoch": 2.4516554544255373, "grad_norm": 0.550445556640625, "learning_rate": 5.033676250769366e-06, "loss": 0.0051, "step": 9294 }, { "epoch": 2.4521830893022027, "grad_norm": 0.0693625956773758, "learning_rate": 5.0333245405785636e-06, "loss": 0.0088, "step": 9296 }, { "epoch": 2.452710724178868, "grad_norm": 0.01262770313769579, "learning_rate": 5.0329728303877605e-06, "loss": 0.01, "step": 9298 }, { "epoch": 2.4532383590555336, "grad_norm": 0.023517092689871788, "learning_rate": 5.032621120196958e-06, "loss": 0.0005, "step": 9300 }, { "epoch": 2.453765993932199, "grad_norm": 0.03227754682302475, "learning_rate": 5.032269410006154e-06, "loss": 0.0018, "step": 9302 }, { "epoch": 2.4542936288088644, "grad_norm": 0.01266568060964346, "learning_rate": 5.031917699815352e-06, "loss": 0.0007, "step": 9304 }, { "epoch": 2.4548212636855298, "grad_norm": 0.05267477035522461, "learning_rate": 5.03156598962455e-06, "loss": 0.0042, "step": 9306 }, { "epoch": 2.455348898562195, "grad_norm": 0.024752989411354065, "learning_rate": 5.031214279433747e-06, "loss": 0.0008, "step": 9308 }, { "epoch": 2.45587653343886, "grad_norm": 0.34048476815223694, "learning_rate": 5.030862569242944e-06, "loss": 0.0048, "step": 9310 }, { "epoch": 2.4564041683155255, "grad_norm": 0.0752267837524414, "learning_rate": 5.030510859052141e-06, "loss": 0.0022, "step": 9312 }, { "epoch": 2.456931803192191, "grad_norm": 0.3875701129436493, "learning_rate": 5.030159148861339e-06, "loss": 0.007, "step": 9314 }, { "epoch": 2.4574594380688564, "grad_norm": 0.07683702558279037, "learning_rate": 5.029807438670536e-06, "loss": 0.0009, "step": 9316 }, { "epoch": 2.4579870729455218, "grad_norm": 0.04267311841249466, "learning_rate": 5.0294557284797326e-06, "loss": 0.0036, "step": 9318 }, { "epoch": 2.458514707822187, "grad_norm": 0.17431963980197906, "learning_rate": 5.02910401828893e-06, "loss": 0.0033, "step": 9320 }, { "epoch": 2.4590423426988526, "grad_norm": 0.43259525299072266, "learning_rate": 5.028752308098127e-06, "loss": 0.0054, "step": 9322 }, { "epoch": 2.4595699775755175, "grad_norm": 0.3332180678844452, "learning_rate": 5.028400597907324e-06, "loss": 0.0089, "step": 9324 }, { "epoch": 2.460097612452183, "grad_norm": 0.07146802544593811, "learning_rate": 5.028048887716521e-06, "loss": 0.0011, "step": 9326 }, { "epoch": 2.4606252473288484, "grad_norm": 1.0500283241271973, "learning_rate": 5.027697177525719e-06, "loss": 0.0035, "step": 9328 }, { "epoch": 2.4611528822055138, "grad_norm": 0.46826428174972534, "learning_rate": 5.027345467334917e-06, "loss": 0.0147, "step": 9330 }, { "epoch": 2.461680517082179, "grad_norm": 0.3494536578655243, "learning_rate": 5.026993757144113e-06, "loss": 0.0089, "step": 9332 }, { "epoch": 2.4622081519588446, "grad_norm": 0.11165603995323181, "learning_rate": 5.026642046953311e-06, "loss": 0.0128, "step": 9334 }, { "epoch": 2.46273578683551, "grad_norm": 0.022785993292927742, "learning_rate": 5.026290336762508e-06, "loss": 0.0033, "step": 9336 }, { "epoch": 2.463263421712175, "grad_norm": 0.015733346343040466, "learning_rate": 5.0259386265717055e-06, "loss": 0.0007, "step": 9338 }, { "epoch": 2.4637910565888403, "grad_norm": 0.8065096735954285, "learning_rate": 5.0255869163809024e-06, "loss": 0.0093, "step": 9340 }, { "epoch": 2.4643186914655058, "grad_norm": 0.4818291962146759, "learning_rate": 5.025235206190099e-06, "loss": 0.005, "step": 9342 }, { "epoch": 2.464846326342171, "grad_norm": 0.18347284197807312, "learning_rate": 5.024883495999297e-06, "loss": 0.0028, "step": 9344 }, { "epoch": 2.4653739612188366, "grad_norm": 0.034023087471723557, "learning_rate": 5.024531785808494e-06, "loss": 0.0017, "step": 9346 }, { "epoch": 2.465901596095502, "grad_norm": 0.024486234411597252, "learning_rate": 5.024180075617691e-06, "loss": 0.0006, "step": 9348 }, { "epoch": 2.4664292309721674, "grad_norm": 0.2114800214767456, "learning_rate": 5.023828365426888e-06, "loss": 0.0037, "step": 9350 }, { "epoch": 2.466956865848833, "grad_norm": 0.09771810472011566, "learning_rate": 5.023476655236086e-06, "loss": 0.0034, "step": 9352 }, { "epoch": 2.467484500725498, "grad_norm": 0.008520372211933136, "learning_rate": 5.023124945045283e-06, "loss": 0.0006, "step": 9354 }, { "epoch": 2.468012135602163, "grad_norm": 0.006691054906696081, "learning_rate": 5.02277323485448e-06, "loss": 0.0039, "step": 9356 }, { "epoch": 2.4685397704788286, "grad_norm": 0.040648285299539566, "learning_rate": 5.0224215246636775e-06, "loss": 0.0012, "step": 9358 }, { "epoch": 2.469067405355494, "grad_norm": 0.05693833902478218, "learning_rate": 5.0220698144728745e-06, "loss": 0.0006, "step": 9360 }, { "epoch": 2.4695950402321594, "grad_norm": 0.3587532341480255, "learning_rate": 5.0217181042820714e-06, "loss": 0.002, "step": 9362 }, { "epoch": 2.4701226751088248, "grad_norm": 0.006128103006631136, "learning_rate": 5.021366394091269e-06, "loss": 0.0007, "step": 9364 }, { "epoch": 2.47065030998549, "grad_norm": 0.15963546931743622, "learning_rate": 5.021014683900466e-06, "loss": 0.004, "step": 9366 }, { "epoch": 2.4711779448621556, "grad_norm": 0.23844130337238312, "learning_rate": 5.020662973709664e-06, "loss": 0.0017, "step": 9368 }, { "epoch": 2.4717055797388205, "grad_norm": 0.2501453459262848, "learning_rate": 5.02031126351886e-06, "loss": 0.0086, "step": 9370 }, { "epoch": 2.472233214615486, "grad_norm": 0.1404571533203125, "learning_rate": 5.019959553328058e-06, "loss": 0.0031, "step": 9372 }, { "epoch": 2.4727608494921514, "grad_norm": 0.47598493099212646, "learning_rate": 5.019607843137255e-06, "loss": 0.0026, "step": 9374 }, { "epoch": 2.4732884843688168, "grad_norm": 0.010988842695951462, "learning_rate": 5.019256132946453e-06, "loss": 0.0004, "step": 9376 }, { "epoch": 2.473816119245482, "grad_norm": 0.340300053358078, "learning_rate": 5.01890442275565e-06, "loss": 0.0058, "step": 9378 }, { "epoch": 2.4743437541221476, "grad_norm": 0.09366658329963684, "learning_rate": 5.0185527125648466e-06, "loss": 0.0006, "step": 9380 }, { "epoch": 2.474871388998813, "grad_norm": 0.015345078893005848, "learning_rate": 5.018201002374044e-06, "loss": 0.0004, "step": 9382 }, { "epoch": 2.475399023875478, "grad_norm": 0.006755770184099674, "learning_rate": 5.0178492921832405e-06, "loss": 0.0004, "step": 9384 }, { "epoch": 2.4759266587521434, "grad_norm": 0.26806437969207764, "learning_rate": 5.017497581992438e-06, "loss": 0.0048, "step": 9386 }, { "epoch": 2.4764542936288088, "grad_norm": 0.022879905998706818, "learning_rate": 5.017145871801636e-06, "loss": 0.0006, "step": 9388 }, { "epoch": 2.476981928505474, "grad_norm": 0.03447757661342621, "learning_rate": 5.016794161610833e-06, "loss": 0.0005, "step": 9390 }, { "epoch": 2.4775095633821396, "grad_norm": 0.015729254111647606, "learning_rate": 5.01644245142003e-06, "loss": 0.0006, "step": 9392 }, { "epoch": 2.478037198258805, "grad_norm": 0.02712169848382473, "learning_rate": 5.016090741229227e-06, "loss": 0.0005, "step": 9394 }, { "epoch": 2.4785648331354704, "grad_norm": 0.00811672955751419, "learning_rate": 5.015739031038425e-06, "loss": 0.0004, "step": 9396 }, { "epoch": 2.479092468012136, "grad_norm": 0.05039653182029724, "learning_rate": 5.015387320847622e-06, "loss": 0.0083, "step": 9398 }, { "epoch": 2.479620102888801, "grad_norm": 0.20722557604312897, "learning_rate": 5.015035610656819e-06, "loss": 0.0039, "step": 9400 }, { "epoch": 2.480147737765466, "grad_norm": 0.008238294161856174, "learning_rate": 5.0146839004660164e-06, "loss": 0.0011, "step": 9402 }, { "epoch": 2.4806753726421316, "grad_norm": 0.014020215719938278, "learning_rate": 5.014332190275213e-06, "loss": 0.0085, "step": 9404 }, { "epoch": 2.481203007518797, "grad_norm": 0.00770036643370986, "learning_rate": 5.013980480084411e-06, "loss": 0.0064, "step": 9406 }, { "epoch": 2.4817306423954624, "grad_norm": 0.007849953137338161, "learning_rate": 5.013628769893607e-06, "loss": 0.0004, "step": 9408 }, { "epoch": 2.4822582772721278, "grad_norm": 0.045924656093120575, "learning_rate": 5.013277059702805e-06, "loss": 0.0007, "step": 9410 }, { "epoch": 2.482785912148793, "grad_norm": 0.06495070457458496, "learning_rate": 5.012925349512002e-06, "loss": 0.0006, "step": 9412 }, { "epoch": 2.4833135470254586, "grad_norm": 0.5577480792999268, "learning_rate": 5.012573639321199e-06, "loss": 0.0018, "step": 9414 }, { "epoch": 2.4838411819021236, "grad_norm": 0.011818980798125267, "learning_rate": 5.012221929130397e-06, "loss": 0.0007, "step": 9416 }, { "epoch": 2.484368816778789, "grad_norm": 0.009986664168536663, "learning_rate": 5.011870218939594e-06, "loss": 0.0005, "step": 9418 }, { "epoch": 2.4848964516554544, "grad_norm": 0.05551067367196083, "learning_rate": 5.0115185087487915e-06, "loss": 0.0005, "step": 9420 }, { "epoch": 2.4854240865321198, "grad_norm": 0.01576113887131214, "learning_rate": 5.0111667985579885e-06, "loss": 0.0004, "step": 9422 }, { "epoch": 2.485951721408785, "grad_norm": 0.18901918828487396, "learning_rate": 5.0108150883671854e-06, "loss": 0.0009, "step": 9424 }, { "epoch": 2.4864793562854506, "grad_norm": 0.004834503401070833, "learning_rate": 5.010463378176383e-06, "loss": 0.0004, "step": 9426 }, { "epoch": 2.487006991162116, "grad_norm": 0.50606369972229, "learning_rate": 5.01011166798558e-06, "loss": 0.0075, "step": 9428 }, { "epoch": 2.487534626038781, "grad_norm": 0.005233790725469589, "learning_rate": 5.009759957794777e-06, "loss": 0.0065, "step": 9430 }, { "epoch": 2.4880622609154464, "grad_norm": 1.2749501466751099, "learning_rate": 5.009408247603974e-06, "loss": 0.0038, "step": 9432 }, { "epoch": 2.4885898957921118, "grad_norm": 0.6209535002708435, "learning_rate": 5.009056537413172e-06, "loss": 0.0033, "step": 9434 }, { "epoch": 2.489117530668777, "grad_norm": 0.2488076090812683, "learning_rate": 5.008704827222369e-06, "loss": 0.0011, "step": 9436 }, { "epoch": 2.4896451655454426, "grad_norm": 0.007218409329652786, "learning_rate": 5.008353117031566e-06, "loss": 0.0015, "step": 9438 }, { "epoch": 2.490172800422108, "grad_norm": 0.07317352294921875, "learning_rate": 5.008001406840764e-06, "loss": 0.0024, "step": 9440 }, { "epoch": 2.4907004352987734, "grad_norm": 0.15595057606697083, "learning_rate": 5.0076496966499606e-06, "loss": 0.0011, "step": 9442 }, { "epoch": 2.4912280701754383, "grad_norm": 0.14969946444034576, "learning_rate": 5.0072979864591575e-06, "loss": 0.0008, "step": 9444 }, { "epoch": 2.4917557050521038, "grad_norm": 0.0690724104642868, "learning_rate": 5.006946276268355e-06, "loss": 0.0017, "step": 9446 }, { "epoch": 2.492283339928769, "grad_norm": 0.003830198897048831, "learning_rate": 5.006594566077552e-06, "loss": 0.0042, "step": 9448 }, { "epoch": 2.4928109748054346, "grad_norm": 0.012806128710508347, "learning_rate": 5.00624285588675e-06, "loss": 0.0056, "step": 9450 }, { "epoch": 2.4933386096821, "grad_norm": 0.09155166149139404, "learning_rate": 5.005891145695946e-06, "loss": 0.0015, "step": 9452 }, { "epoch": 2.4938662445587654, "grad_norm": 0.08231235295534134, "learning_rate": 5.005539435505144e-06, "loss": 0.0033, "step": 9454 }, { "epoch": 2.494393879435431, "grad_norm": 0.13435940444469452, "learning_rate": 5.005187725314341e-06, "loss": 0.0012, "step": 9456 }, { "epoch": 2.494921514312096, "grad_norm": 0.015170460566878319, "learning_rate": 5.004836015123539e-06, "loss": 0.0004, "step": 9458 }, { "epoch": 2.4954491491887616, "grad_norm": 0.11891340464353561, "learning_rate": 5.004484304932736e-06, "loss": 0.0035, "step": 9460 }, { "epoch": 2.4959767840654266, "grad_norm": 0.06476238369941711, "learning_rate": 5.004132594741933e-06, "loss": 0.0007, "step": 9462 }, { "epoch": 2.496504418942092, "grad_norm": 0.5642426609992981, "learning_rate": 5.00378088455113e-06, "loss": 0.0047, "step": 9464 }, { "epoch": 2.4970320538187574, "grad_norm": 0.30621325969696045, "learning_rate": 5.003429174360327e-06, "loss": 0.0047, "step": 9466 }, { "epoch": 2.4975596886954228, "grad_norm": 0.2409539818763733, "learning_rate": 5.003077464169524e-06, "loss": 0.0026, "step": 9468 }, { "epoch": 2.498087323572088, "grad_norm": 0.3882625997066498, "learning_rate": 5.002725753978721e-06, "loss": 0.0019, "step": 9470 }, { "epoch": 2.4986149584487536, "grad_norm": 0.5688692331314087, "learning_rate": 5.002374043787919e-06, "loss": 0.0061, "step": 9472 }, { "epoch": 2.499142593325419, "grad_norm": 0.4294161796569824, "learning_rate": 5.002022333597116e-06, "loss": 0.0016, "step": 9474 }, { "epoch": 2.499670228202084, "grad_norm": 0.04234849661588669, "learning_rate": 5.001670623406313e-06, "loss": 0.0006, "step": 9476 }, { "epoch": 2.5001978630787494, "grad_norm": 0.8609362244606018, "learning_rate": 5.001318913215511e-06, "loss": 0.0073, "step": 9478 }, { "epoch": 2.5007254979554148, "grad_norm": 0.0850837305188179, "learning_rate": 5.000967203024708e-06, "loss": 0.0005, "step": 9480 }, { "epoch": 2.50125313283208, "grad_norm": 0.007308035623282194, "learning_rate": 5.000615492833905e-06, "loss": 0.0004, "step": 9482 }, { "epoch": 2.5017807677087456, "grad_norm": 0.030523929744958878, "learning_rate": 5.0002637826431025e-06, "loss": 0.0005, "step": 9484 }, { "epoch": 2.502308402585411, "grad_norm": 0.49063408374786377, "learning_rate": 4.9999120724522994e-06, "loss": 0.0052, "step": 9486 }, { "epoch": 2.5028360374620764, "grad_norm": 0.3125595152378082, "learning_rate": 4.999560362261497e-06, "loss": 0.0089, "step": 9488 }, { "epoch": 2.5033636723387414, "grad_norm": 0.012847254984080791, "learning_rate": 4.999208652070693e-06, "loss": 0.0014, "step": 9490 }, { "epoch": 2.503891307215407, "grad_norm": 0.38539862632751465, "learning_rate": 4.998856941879891e-06, "loss": 0.0115, "step": 9492 }, { "epoch": 2.504418942092072, "grad_norm": 0.017758620902895927, "learning_rate": 4.998505231689088e-06, "loss": 0.0007, "step": 9494 }, { "epoch": 2.5049465769687376, "grad_norm": 0.3053937554359436, "learning_rate": 4.998153521498286e-06, "loss": 0.0066, "step": 9496 }, { "epoch": 2.505474211845403, "grad_norm": 0.11935442686080933, "learning_rate": 4.997801811307483e-06, "loss": 0.0006, "step": 9498 }, { "epoch": 2.5060018467220684, "grad_norm": 0.7446406483650208, "learning_rate": 4.99745010111668e-06, "loss": 0.0031, "step": 9500 }, { "epoch": 2.506529481598734, "grad_norm": 0.24746470153331757, "learning_rate": 4.997098390925878e-06, "loss": 0.0012, "step": 9502 }, { "epoch": 2.5070571164753988, "grad_norm": 0.0797906219959259, "learning_rate": 4.9967466807350745e-06, "loss": 0.0007, "step": 9504 }, { "epoch": 2.5075847513520646, "grad_norm": 0.027430156245827675, "learning_rate": 4.9963949705442715e-06, "loss": 0.0009, "step": 9506 }, { "epoch": 2.5081123862287296, "grad_norm": 0.02518368326127529, "learning_rate": 4.996043260353469e-06, "loss": 0.0005, "step": 9508 }, { "epoch": 2.508640021105395, "grad_norm": 0.4484390318393707, "learning_rate": 4.995691550162666e-06, "loss": 0.0161, "step": 9510 }, { "epoch": 2.5091676559820604, "grad_norm": 0.2559569776058197, "learning_rate": 4.995339839971863e-06, "loss": 0.0038, "step": 9512 }, { "epoch": 2.509695290858726, "grad_norm": 0.20534469187259674, "learning_rate": 4.99498812978106e-06, "loss": 0.0047, "step": 9514 }, { "epoch": 2.510222925735391, "grad_norm": 0.35010722279548645, "learning_rate": 4.994636419590258e-06, "loss": 0.0043, "step": 9516 }, { "epoch": 2.5107505606120566, "grad_norm": 0.014221861027181149, "learning_rate": 4.994284709399455e-06, "loss": 0.0006, "step": 9518 }, { "epoch": 2.511278195488722, "grad_norm": 0.012919279746711254, "learning_rate": 4.993932999208652e-06, "loss": 0.0004, "step": 9520 }, { "epoch": 2.511805830365387, "grad_norm": 0.09718959033489227, "learning_rate": 4.99358128901785e-06, "loss": 0.001, "step": 9522 }, { "epoch": 2.5123334652420524, "grad_norm": 0.033427923917770386, "learning_rate": 4.993229578827047e-06, "loss": 0.0007, "step": 9524 }, { "epoch": 2.5128611001187178, "grad_norm": 0.009946790523827076, "learning_rate": 4.992877868636244e-06, "loss": 0.0004, "step": 9526 }, { "epoch": 2.513388734995383, "grad_norm": 0.025130972266197205, "learning_rate": 4.9925261584454405e-06, "loss": 0.0006, "step": 9528 }, { "epoch": 2.5139163698720486, "grad_norm": 0.022962089627981186, "learning_rate": 4.992174448254638e-06, "loss": 0.0004, "step": 9530 }, { "epoch": 2.514444004748714, "grad_norm": 0.14496514201164246, "learning_rate": 4.991822738063836e-06, "loss": 0.0159, "step": 9532 }, { "epoch": 2.5149716396253794, "grad_norm": 0.02664364129304886, "learning_rate": 4.991471027873032e-06, "loss": 0.0054, "step": 9534 }, { "epoch": 2.5154992745020444, "grad_norm": 0.7038823962211609, "learning_rate": 4.99111931768223e-06, "loss": 0.0048, "step": 9536 }, { "epoch": 2.51602690937871, "grad_norm": 0.32392990589141846, "learning_rate": 4.990767607491427e-06, "loss": 0.001, "step": 9538 }, { "epoch": 2.516554544255375, "grad_norm": 0.013740136288106441, "learning_rate": 4.990415897300625e-06, "loss": 0.0006, "step": 9540 }, { "epoch": 2.5170821791320406, "grad_norm": 0.08164145797491074, "learning_rate": 4.990064187109822e-06, "loss": 0.0036, "step": 9542 }, { "epoch": 2.517609814008706, "grad_norm": 0.018312621861696243, "learning_rate": 4.989712476919019e-06, "loss": 0.0005, "step": 9544 }, { "epoch": 2.5181374488853714, "grad_norm": 0.010355786420404911, "learning_rate": 4.9893607667282165e-06, "loss": 0.0021, "step": 9546 }, { "epoch": 2.518665083762037, "grad_norm": 0.07996019721031189, "learning_rate": 4.9890090565374134e-06, "loss": 0.0066, "step": 9548 }, { "epoch": 2.5191927186387018, "grad_norm": 0.15985427796840668, "learning_rate": 4.98865734634661e-06, "loss": 0.0117, "step": 9550 }, { "epoch": 2.5197203535153676, "grad_norm": 0.026780687272548676, "learning_rate": 4.988305636155807e-06, "loss": 0.0006, "step": 9552 }, { "epoch": 2.5202479883920326, "grad_norm": 0.022636065259575844, "learning_rate": 4.987953925965005e-06, "loss": 0.0005, "step": 9554 }, { "epoch": 2.520775623268698, "grad_norm": 0.030655717477202415, "learning_rate": 4.987602215774202e-06, "loss": 0.0006, "step": 9556 }, { "epoch": 2.5213032581453634, "grad_norm": 0.04909776896238327, "learning_rate": 4.987250505583399e-06, "loss": 0.0005, "step": 9558 }, { "epoch": 2.521830893022029, "grad_norm": 0.292372465133667, "learning_rate": 4.986898795392597e-06, "loss": 0.0028, "step": 9560 }, { "epoch": 2.522358527898694, "grad_norm": 0.7772182822227478, "learning_rate": 4.986547085201794e-06, "loss": 0.0055, "step": 9562 }, { "epoch": 2.5228861627753596, "grad_norm": 0.02466483786702156, "learning_rate": 4.986195375010991e-06, "loss": 0.0012, "step": 9564 }, { "epoch": 2.523413797652025, "grad_norm": 0.01728149503469467, "learning_rate": 4.9858436648201885e-06, "loss": 0.0004, "step": 9566 }, { "epoch": 2.52394143252869, "grad_norm": 0.2155528962612152, "learning_rate": 4.9854919546293855e-06, "loss": 0.0014, "step": 9568 }, { "epoch": 2.5244690674053554, "grad_norm": 0.04295467585325241, "learning_rate": 4.985140244438583e-06, "loss": 0.0087, "step": 9570 }, { "epoch": 2.5249967022820208, "grad_norm": 0.33334991335868835, "learning_rate": 4.984788534247779e-06, "loss": 0.0014, "step": 9572 }, { "epoch": 2.525524337158686, "grad_norm": 0.7920770645141602, "learning_rate": 4.984436824056977e-06, "loss": 0.0042, "step": 9574 }, { "epoch": 2.5260519720353516, "grad_norm": 0.06416124105453491, "learning_rate": 4.984085113866174e-06, "loss": 0.0007, "step": 9576 }, { "epoch": 2.526579606912017, "grad_norm": 0.1145738884806633, "learning_rate": 4.983733403675372e-06, "loss": 0.0009, "step": 9578 }, { "epoch": 2.5271072417886824, "grad_norm": 0.11955641955137253, "learning_rate": 4.983381693484569e-06, "loss": 0.0013, "step": 9580 }, { "epoch": 2.5276348766653474, "grad_norm": 1.0204180479049683, "learning_rate": 4.983029983293766e-06, "loss": 0.0046, "step": 9582 }, { "epoch": 2.5281625115420128, "grad_norm": 0.6312944293022156, "learning_rate": 4.982678273102964e-06, "loss": 0.0053, "step": 9584 }, { "epoch": 2.528690146418678, "grad_norm": 0.022811545059084892, "learning_rate": 4.982326562912161e-06, "loss": 0.0073, "step": 9586 }, { "epoch": 2.5292177812953436, "grad_norm": 0.09305346757173538, "learning_rate": 4.9819748527213576e-06, "loss": 0.001, "step": 9588 }, { "epoch": 2.529745416172009, "grad_norm": 0.03113548830151558, "learning_rate": 4.981623142530555e-06, "loss": 0.0022, "step": 9590 }, { "epoch": 2.5302730510486744, "grad_norm": 0.8364856243133545, "learning_rate": 4.981271432339752e-06, "loss": 0.0021, "step": 9592 }, { "epoch": 2.53080068592534, "grad_norm": 0.023826735094189644, "learning_rate": 4.980919722148949e-06, "loss": 0.0045, "step": 9594 }, { "epoch": 2.5313283208020048, "grad_norm": 0.22398722171783447, "learning_rate": 4.980568011958146e-06, "loss": 0.0028, "step": 9596 }, { "epoch": 2.5318559556786706, "grad_norm": 0.02834392711520195, "learning_rate": 4.980216301767344e-06, "loss": 0.0009, "step": 9598 }, { "epoch": 2.5323835905553356, "grad_norm": 0.014507594518363476, "learning_rate": 4.979864591576541e-06, "loss": 0.0019, "step": 9600 }, { "epoch": 2.532911225432001, "grad_norm": 0.7170217633247375, "learning_rate": 4.979512881385738e-06, "loss": 0.0087, "step": 9602 }, { "epoch": 2.5334388603086664, "grad_norm": 0.47595682740211487, "learning_rate": 4.979161171194936e-06, "loss": 0.0107, "step": 9604 }, { "epoch": 2.533966495185332, "grad_norm": 0.12102866917848587, "learning_rate": 4.978809461004133e-06, "loss": 0.001, "step": 9606 }, { "epoch": 2.534494130061997, "grad_norm": 0.05135657638311386, "learning_rate": 4.9784577508133305e-06, "loss": 0.0005, "step": 9608 }, { "epoch": 2.5350217649386626, "grad_norm": 0.022682057693600655, "learning_rate": 4.9781060406225266e-06, "loss": 0.0005, "step": 9610 }, { "epoch": 2.535549399815328, "grad_norm": 0.11252029240131378, "learning_rate": 4.977754330431724e-06, "loss": 0.0008, "step": 9612 }, { "epoch": 2.536077034691993, "grad_norm": 0.39076122641563416, "learning_rate": 4.977402620240921e-06, "loss": 0.0051, "step": 9614 }, { "epoch": 2.5366046695686584, "grad_norm": 0.024266831576824188, "learning_rate": 4.977050910050119e-06, "loss": 0.0022, "step": 9616 }, { "epoch": 2.537132304445324, "grad_norm": 0.07748313993215561, "learning_rate": 4.976699199859316e-06, "loss": 0.0006, "step": 9618 }, { "epoch": 2.537659939321989, "grad_norm": 0.28397297859191895, "learning_rate": 4.976347489668513e-06, "loss": 0.0075, "step": 9620 }, { "epoch": 2.5381875741986546, "grad_norm": 0.38996830582618713, "learning_rate": 4.975995779477711e-06, "loss": 0.0029, "step": 9622 }, { "epoch": 2.53871520907532, "grad_norm": 0.37813565135002136, "learning_rate": 4.975644069286908e-06, "loss": 0.0042, "step": 9624 }, { "epoch": 2.5392428439519854, "grad_norm": 0.0382746197283268, "learning_rate": 4.975292359096105e-06, "loss": 0.0005, "step": 9626 }, { "epoch": 2.5397704788286504, "grad_norm": 0.00598550122231245, "learning_rate": 4.9749406489053025e-06, "loss": 0.0035, "step": 9628 }, { "epoch": 2.5402981137053158, "grad_norm": 0.018195493146777153, "learning_rate": 4.9745889387144995e-06, "loss": 0.0005, "step": 9630 }, { "epoch": 2.540825748581981, "grad_norm": 0.3156777322292328, "learning_rate": 4.9742372285236964e-06, "loss": 0.0164, "step": 9632 }, { "epoch": 2.5413533834586466, "grad_norm": 0.015635855495929718, "learning_rate": 4.973885518332893e-06, "loss": 0.0103, "step": 9634 }, { "epoch": 2.541881018335312, "grad_norm": 0.023508282378315926, "learning_rate": 4.973533808142091e-06, "loss": 0.0012, "step": 9636 }, { "epoch": 2.5424086532119774, "grad_norm": 0.01700526289641857, "learning_rate": 4.973182097951288e-06, "loss": 0.0005, "step": 9638 }, { "epoch": 2.542936288088643, "grad_norm": 0.2542452812194824, "learning_rate": 4.972830387760485e-06, "loss": 0.0007, "step": 9640 }, { "epoch": 2.5434639229653078, "grad_norm": 0.04060439020395279, "learning_rate": 4.972478677569683e-06, "loss": 0.0006, "step": 9642 }, { "epoch": 2.5439915578419736, "grad_norm": 0.012368185445666313, "learning_rate": 4.97212696737888e-06, "loss": 0.0006, "step": 9644 }, { "epoch": 2.5445191927186386, "grad_norm": 0.34367433190345764, "learning_rate": 4.971775257188078e-06, "loss": 0.002, "step": 9646 }, { "epoch": 2.545046827595304, "grad_norm": 0.3340713381767273, "learning_rate": 4.971423546997275e-06, "loss": 0.0162, "step": 9648 }, { "epoch": 2.5455744624719694, "grad_norm": 0.025394899770617485, "learning_rate": 4.9710718368064716e-06, "loss": 0.0008, "step": 9650 }, { "epoch": 2.546102097348635, "grad_norm": 0.07678566128015518, "learning_rate": 4.970720126615669e-06, "loss": 0.0009, "step": 9652 }, { "epoch": 2.5466297322253, "grad_norm": 0.2958522140979767, "learning_rate": 4.9703684164248655e-06, "loss": 0.003, "step": 9654 }, { "epoch": 2.547157367101965, "grad_norm": 0.16635721921920776, "learning_rate": 4.970016706234063e-06, "loss": 0.0016, "step": 9656 }, { "epoch": 2.547685001978631, "grad_norm": 0.11554907262325287, "learning_rate": 4.96966499604326e-06, "loss": 0.0006, "step": 9658 }, { "epoch": 2.548212636855296, "grad_norm": 0.2931523323059082, "learning_rate": 4.969313285852458e-06, "loss": 0.0073, "step": 9660 }, { "epoch": 2.5487402717319614, "grad_norm": 0.012232580222189426, "learning_rate": 4.968961575661655e-06, "loss": 0.0006, "step": 9662 }, { "epoch": 2.549267906608627, "grad_norm": 0.36207109689712524, "learning_rate": 4.968609865470852e-06, "loss": 0.0022, "step": 9664 }, { "epoch": 2.549795541485292, "grad_norm": 0.42417874932289124, "learning_rate": 4.96825815528005e-06, "loss": 0.0018, "step": 9666 }, { "epoch": 2.5503231763619576, "grad_norm": 0.03316975384950638, "learning_rate": 4.967906445089247e-06, "loss": 0.0006, "step": 9668 }, { "epoch": 2.550850811238623, "grad_norm": 0.03035087324678898, "learning_rate": 4.967554734898444e-06, "loss": 0.0005, "step": 9670 }, { "epoch": 2.5513784461152884, "grad_norm": 0.44015219807624817, "learning_rate": 4.9672030247076406e-06, "loss": 0.0047, "step": 9672 }, { "epoch": 2.5519060809919534, "grad_norm": 0.01944035291671753, "learning_rate": 4.966851314516838e-06, "loss": 0.0004, "step": 9674 }, { "epoch": 2.552433715868619, "grad_norm": 0.06823410093784332, "learning_rate": 4.966499604326036e-06, "loss": 0.0059, "step": 9676 }, { "epoch": 2.552961350745284, "grad_norm": 0.08049990981817245, "learning_rate": 4.966147894135232e-06, "loss": 0.0015, "step": 9678 }, { "epoch": 2.5534889856219496, "grad_norm": 0.01712123677134514, "learning_rate": 4.96579618394443e-06, "loss": 0.0006, "step": 9680 }, { "epoch": 2.554016620498615, "grad_norm": 0.16927094757556915, "learning_rate": 4.965444473753627e-06, "loss": 0.0007, "step": 9682 }, { "epoch": 2.5545442553752804, "grad_norm": 0.020816143602132797, "learning_rate": 4.965092763562824e-06, "loss": 0.0005, "step": 9684 }, { "epoch": 2.555071890251946, "grad_norm": 0.08185690641403198, "learning_rate": 4.964741053372022e-06, "loss": 0.0008, "step": 9686 }, { "epoch": 2.5555995251286108, "grad_norm": 0.8604509234428406, "learning_rate": 4.964389343181219e-06, "loss": 0.0013, "step": 9688 }, { "epoch": 2.5561271600052766, "grad_norm": 0.03961079195141792, "learning_rate": 4.9640376329904165e-06, "loss": 0.0006, "step": 9690 }, { "epoch": 2.5566547948819416, "grad_norm": 0.006781138014048338, "learning_rate": 4.963685922799613e-06, "loss": 0.0005, "step": 9692 }, { "epoch": 2.557182429758607, "grad_norm": 0.2977781295776367, "learning_rate": 4.9633342126088104e-06, "loss": 0.0147, "step": 9694 }, { "epoch": 2.5577100646352724, "grad_norm": 0.24271787703037262, "learning_rate": 4.962982502418007e-06, "loss": 0.0085, "step": 9696 }, { "epoch": 2.558237699511938, "grad_norm": 0.02255120873451233, "learning_rate": 4.962630792227205e-06, "loss": 0.0005, "step": 9698 }, { "epoch": 2.558765334388603, "grad_norm": 0.02744998224079609, "learning_rate": 4.962279082036402e-06, "loss": 0.0009, "step": 9700 }, { "epoch": 2.559292969265268, "grad_norm": 0.01745961420238018, "learning_rate": 4.961927371845599e-06, "loss": 0.0126, "step": 9702 }, { "epoch": 2.559820604141934, "grad_norm": 0.03038637898862362, "learning_rate": 4.961575661654797e-06, "loss": 0.0007, "step": 9704 }, { "epoch": 2.560348239018599, "grad_norm": 0.05771254748106003, "learning_rate": 4.961223951463994e-06, "loss": 0.0006, "step": 9706 }, { "epoch": 2.5608758738952644, "grad_norm": 0.11398033797740936, "learning_rate": 4.960872241273191e-06, "loss": 0.0009, "step": 9708 }, { "epoch": 2.56140350877193, "grad_norm": 0.0726146250963211, "learning_rate": 4.960520531082389e-06, "loss": 0.0007, "step": 9710 }, { "epoch": 2.561931143648595, "grad_norm": 0.17030669748783112, "learning_rate": 4.9601688208915855e-06, "loss": 0.0063, "step": 9712 }, { "epoch": 2.5624587785252606, "grad_norm": 0.009599892422556877, "learning_rate": 4.9598171107007825e-06, "loss": 0.0006, "step": 9714 }, { "epoch": 2.562986413401926, "grad_norm": 0.1511242836713791, "learning_rate": 4.9594654005099794e-06, "loss": 0.0101, "step": 9716 }, { "epoch": 2.5635140482785914, "grad_norm": 0.2715572118759155, "learning_rate": 4.959113690319177e-06, "loss": 0.0039, "step": 9718 }, { "epoch": 2.5640416831552564, "grad_norm": 0.25047197937965393, "learning_rate": 4.958761980128374e-06, "loss": 0.0052, "step": 9720 }, { "epoch": 2.564569318031922, "grad_norm": 0.9096133708953857, "learning_rate": 4.958410269937571e-06, "loss": 0.006, "step": 9722 }, { "epoch": 2.565096952908587, "grad_norm": 0.06593640893697739, "learning_rate": 4.958058559746769e-06, "loss": 0.0007, "step": 9724 }, { "epoch": 2.5656245877852526, "grad_norm": 0.34726348519325256, "learning_rate": 4.957706849555966e-06, "loss": 0.0039, "step": 9726 }, { "epoch": 2.566152222661918, "grad_norm": 0.8702974915504456, "learning_rate": 4.957355139365164e-06, "loss": 0.0082, "step": 9728 }, { "epoch": 2.5666798575385834, "grad_norm": 0.033704645931720734, "learning_rate": 4.95700342917436e-06, "loss": 0.0101, "step": 9730 }, { "epoch": 2.567207492415249, "grad_norm": 0.005201447755098343, "learning_rate": 4.956651718983558e-06, "loss": 0.0045, "step": 9732 }, { "epoch": 2.5677351272919138, "grad_norm": 0.36212730407714844, "learning_rate": 4.956300008792755e-06, "loss": 0.0031, "step": 9734 }, { "epoch": 2.568262762168579, "grad_norm": 0.019518503919243813, "learning_rate": 4.955948298601952e-06, "loss": 0.0004, "step": 9736 }, { "epoch": 2.5687903970452446, "grad_norm": 0.0817452222108841, "learning_rate": 4.955596588411149e-06, "loss": 0.009, "step": 9738 }, { "epoch": 2.56931803192191, "grad_norm": 0.015455208718776703, "learning_rate": 4.955244878220346e-06, "loss": 0.0006, "step": 9740 }, { "epoch": 2.5698456667985754, "grad_norm": 0.02705664560198784, "learning_rate": 4.954893168029544e-06, "loss": 0.0049, "step": 9742 }, { "epoch": 2.570373301675241, "grad_norm": 0.045644909143447876, "learning_rate": 4.954541457838741e-06, "loss": 0.0017, "step": 9744 }, { "epoch": 2.570900936551906, "grad_norm": 0.02661614678800106, "learning_rate": 4.954189747647938e-06, "loss": 0.0007, "step": 9746 }, { "epoch": 2.571428571428571, "grad_norm": 0.6285318732261658, "learning_rate": 4.953838037457136e-06, "loss": 0.0066, "step": 9748 }, { "epoch": 2.571956206305237, "grad_norm": 0.2724461853504181, "learning_rate": 4.953486327266333e-06, "loss": 0.0074, "step": 9750 }, { "epoch": 2.572483841181902, "grad_norm": 0.30357685685157776, "learning_rate": 4.95313461707553e-06, "loss": 0.0087, "step": 9752 }, { "epoch": 2.5730114760585674, "grad_norm": 0.02533024549484253, "learning_rate": 4.952782906884727e-06, "loss": 0.0005, "step": 9754 }, { "epoch": 2.573539110935233, "grad_norm": 0.10732197016477585, "learning_rate": 4.9524311966939244e-06, "loss": 0.0105, "step": 9756 }, { "epoch": 2.574066745811898, "grad_norm": 0.20283599197864532, "learning_rate": 4.952079486503122e-06, "loss": 0.0069, "step": 9758 }, { "epoch": 2.5745943806885636, "grad_norm": 0.0451178103685379, "learning_rate": 4.951727776312318e-06, "loss": 0.001, "step": 9760 }, { "epoch": 2.575122015565229, "grad_norm": 0.23814526200294495, "learning_rate": 4.951376066121516e-06, "loss": 0.0021, "step": 9762 }, { "epoch": 2.5756496504418944, "grad_norm": 0.025061987340450287, "learning_rate": 4.951024355930713e-06, "loss": 0.0025, "step": 9764 }, { "epoch": 2.5761772853185594, "grad_norm": 0.04481504485011101, "learning_rate": 4.950672645739911e-06, "loss": 0.0018, "step": 9766 }, { "epoch": 2.576704920195225, "grad_norm": 0.33845099806785583, "learning_rate": 4.950320935549108e-06, "loss": 0.009, "step": 9768 }, { "epoch": 2.57723255507189, "grad_norm": 0.011208325624465942, "learning_rate": 4.949969225358305e-06, "loss": 0.0044, "step": 9770 }, { "epoch": 2.5777601899485556, "grad_norm": 0.03453940898180008, "learning_rate": 4.949617515167503e-06, "loss": 0.0005, "step": 9772 }, { "epoch": 2.578287824825221, "grad_norm": 0.43934476375579834, "learning_rate": 4.949265804976699e-06, "loss": 0.0032, "step": 9774 }, { "epoch": 2.5788154597018864, "grad_norm": 0.01097249984741211, "learning_rate": 4.9489140947858965e-06, "loss": 0.0005, "step": 9776 }, { "epoch": 2.579343094578552, "grad_norm": 0.07002470642328262, "learning_rate": 4.9485623845950934e-06, "loss": 0.0038, "step": 9778 }, { "epoch": 2.579870729455217, "grad_norm": 0.009359456598758698, "learning_rate": 4.948210674404291e-06, "loss": 0.0052, "step": 9780 }, { "epoch": 2.580398364331882, "grad_norm": 0.016232343390583992, "learning_rate": 4.947858964213488e-06, "loss": 0.0007, "step": 9782 }, { "epoch": 2.5809259992085476, "grad_norm": 0.017273113131523132, "learning_rate": 4.947507254022685e-06, "loss": 0.0007, "step": 9784 }, { "epoch": 2.581453634085213, "grad_norm": 0.021525226533412933, "learning_rate": 4.947155543831883e-06, "loss": 0.0005, "step": 9786 }, { "epoch": 2.5819812689618784, "grad_norm": 0.011244937777519226, "learning_rate": 4.94680383364108e-06, "loss": 0.0005, "step": 9788 }, { "epoch": 2.582508903838544, "grad_norm": 0.03197341412305832, "learning_rate": 4.946452123450277e-06, "loss": 0.0006, "step": 9790 }, { "epoch": 2.583036538715209, "grad_norm": 0.23136350512504578, "learning_rate": 4.946100413259475e-06, "loss": 0.001, "step": 9792 }, { "epoch": 2.583564173591874, "grad_norm": 0.007075142115354538, "learning_rate": 4.945748703068672e-06, "loss": 0.0004, "step": 9794 }, { "epoch": 2.58409180846854, "grad_norm": 0.04330715164542198, "learning_rate": 4.945396992877869e-06, "loss": 0.0054, "step": 9796 }, { "epoch": 2.584619443345205, "grad_norm": 0.015156463719904423, "learning_rate": 4.9450452826870655e-06, "loss": 0.0012, "step": 9798 }, { "epoch": 2.5851470782218704, "grad_norm": 0.26747581362724304, "learning_rate": 4.944693572496263e-06, "loss": 0.0039, "step": 9800 }, { "epoch": 2.585674713098536, "grad_norm": 0.015345540829002857, "learning_rate": 4.94434186230546e-06, "loss": 0.0006, "step": 9802 }, { "epoch": 2.586202347975201, "grad_norm": 0.004974526818841696, "learning_rate": 4.943990152114657e-06, "loss": 0.0009, "step": 9804 }, { "epoch": 2.5867299828518666, "grad_norm": 0.004833850543946028, "learning_rate": 4.943638441923855e-06, "loss": 0.0004, "step": 9806 }, { "epoch": 2.587257617728532, "grad_norm": 0.07993225008249283, "learning_rate": 4.943286731733052e-06, "loss": 0.0011, "step": 9808 }, { "epoch": 2.5877852526051974, "grad_norm": 0.02022692933678627, "learning_rate": 4.94293502154225e-06, "loss": 0.0004, "step": 9810 }, { "epoch": 2.5883128874818624, "grad_norm": 0.14770916104316711, "learning_rate": 4.942583311351446e-06, "loss": 0.0037, "step": 9812 }, { "epoch": 2.588840522358528, "grad_norm": 0.4674730598926544, "learning_rate": 4.942231601160644e-06, "loss": 0.009, "step": 9814 }, { "epoch": 2.589368157235193, "grad_norm": 0.007693803869187832, "learning_rate": 4.9418798909698415e-06, "loss": 0.0003, "step": 9816 }, { "epoch": 2.5898957921118586, "grad_norm": 0.06293345242738724, "learning_rate": 4.941528180779038e-06, "loss": 0.0006, "step": 9818 }, { "epoch": 2.590423426988524, "grad_norm": 0.013499993830919266, "learning_rate": 4.941176470588235e-06, "loss": 0.0008, "step": 9820 }, { "epoch": 2.5909510618651894, "grad_norm": 0.057810377329587936, "learning_rate": 4.940824760397432e-06, "loss": 0.0006, "step": 9822 }, { "epoch": 2.591478696741855, "grad_norm": 0.22713692486286163, "learning_rate": 4.94047305020663e-06, "loss": 0.0077, "step": 9824 }, { "epoch": 2.59200633161852, "grad_norm": 0.007331409491598606, "learning_rate": 4.940121340015827e-06, "loss": 0.0004, "step": 9826 }, { "epoch": 2.592533966495185, "grad_norm": 0.7455741167068481, "learning_rate": 4.939769629825024e-06, "loss": 0.0025, "step": 9828 }, { "epoch": 2.5930616013718506, "grad_norm": 0.21345897018909454, "learning_rate": 4.939417919634222e-06, "loss": 0.0035, "step": 9830 }, { "epoch": 2.593589236248516, "grad_norm": 0.007444928400218487, "learning_rate": 4.939066209443419e-06, "loss": 0.0004, "step": 9832 }, { "epoch": 2.5941168711251814, "grad_norm": 0.026112083345651627, "learning_rate": 4.938714499252616e-06, "loss": 0.0005, "step": 9834 }, { "epoch": 2.594644506001847, "grad_norm": 0.44037315249443054, "learning_rate": 4.938362789061813e-06, "loss": 0.0012, "step": 9836 }, { "epoch": 2.5951721408785122, "grad_norm": 0.1063774898648262, "learning_rate": 4.9380110788710105e-06, "loss": 0.0009, "step": 9838 }, { "epoch": 2.595699775755177, "grad_norm": 0.28619542717933655, "learning_rate": 4.9376593686802074e-06, "loss": 0.0008, "step": 9840 }, { "epoch": 2.596227410631843, "grad_norm": 0.1994050294160843, "learning_rate": 4.937307658489404e-06, "loss": 0.0176, "step": 9842 }, { "epoch": 2.596755045508508, "grad_norm": 0.2843658924102783, "learning_rate": 4.936955948298602e-06, "loss": 0.0039, "step": 9844 }, { "epoch": 2.5972826803851734, "grad_norm": 0.07429630309343338, "learning_rate": 4.936604238107799e-06, "loss": 0.0006, "step": 9846 }, { "epoch": 2.597810315261839, "grad_norm": 0.2047143280506134, "learning_rate": 4.936252527916997e-06, "loss": 0.0032, "step": 9848 }, { "epoch": 2.598337950138504, "grad_norm": 0.009640196338295937, "learning_rate": 4.935900817726194e-06, "loss": 0.0006, "step": 9850 }, { "epoch": 2.5988655850151696, "grad_norm": 0.02797550894320011, "learning_rate": 4.935549107535391e-06, "loss": 0.0063, "step": 9852 }, { "epoch": 2.5993932198918346, "grad_norm": 0.011505485512316227, "learning_rate": 4.935197397344589e-06, "loss": 0.0004, "step": 9854 }, { "epoch": 2.5999208547685004, "grad_norm": 0.039792872965335846, "learning_rate": 4.934845687153786e-06, "loss": 0.0005, "step": 9856 }, { "epoch": 2.6004484896451654, "grad_norm": 0.02123592235147953, "learning_rate": 4.9344939769629825e-06, "loss": 0.007, "step": 9858 }, { "epoch": 2.600976124521831, "grad_norm": 0.25012144446372986, "learning_rate": 4.9341422667721795e-06, "loss": 0.0095, "step": 9860 }, { "epoch": 2.601503759398496, "grad_norm": 0.10893549025058746, "learning_rate": 4.933790556581377e-06, "loss": 0.0012, "step": 9862 }, { "epoch": 2.6020313942751616, "grad_norm": 0.3069438338279724, "learning_rate": 4.933438846390574e-06, "loss": 0.0021, "step": 9864 }, { "epoch": 2.602559029151827, "grad_norm": 0.0564892441034317, "learning_rate": 4.933087136199771e-06, "loss": 0.0031, "step": 9866 }, { "epoch": 2.6030866640284924, "grad_norm": 0.09902726858854294, "learning_rate": 4.932735426008969e-06, "loss": 0.0009, "step": 9868 }, { "epoch": 2.603614298905158, "grad_norm": 0.07801219075918198, "learning_rate": 4.932383715818166e-06, "loss": 0.0054, "step": 9870 }, { "epoch": 2.604141933781823, "grad_norm": 0.028648359701037407, "learning_rate": 4.932032005627363e-06, "loss": 0.0005, "step": 9872 }, { "epoch": 2.604669568658488, "grad_norm": 0.017069507390260696, "learning_rate": 4.931680295436561e-06, "loss": 0.0004, "step": 9874 }, { "epoch": 2.6051972035351536, "grad_norm": 0.31175628304481506, "learning_rate": 4.931328585245758e-06, "loss": 0.0152, "step": 9876 }, { "epoch": 2.605724838411819, "grad_norm": 0.013240961357951164, "learning_rate": 4.9309768750549555e-06, "loss": 0.0048, "step": 9878 }, { "epoch": 2.6062524732884844, "grad_norm": 0.05692068114876747, "learning_rate": 4.9306251648641516e-06, "loss": 0.0032, "step": 9880 }, { "epoch": 2.60678010816515, "grad_norm": 0.014333183877170086, "learning_rate": 4.930273454673349e-06, "loss": 0.0018, "step": 9882 }, { "epoch": 2.6073077430418152, "grad_norm": 0.11731652915477753, "learning_rate": 4.929921744482546e-06, "loss": 0.0068, "step": 9884 }, { "epoch": 2.60783537791848, "grad_norm": 0.01691746711730957, "learning_rate": 4.929570034291744e-06, "loss": 0.0014, "step": 9886 }, { "epoch": 2.6083630127951456, "grad_norm": 0.0127705167979002, "learning_rate": 4.929218324100941e-06, "loss": 0.0005, "step": 9888 }, { "epoch": 2.608890647671811, "grad_norm": 0.2059788852930069, "learning_rate": 4.928866613910138e-06, "loss": 0.001, "step": 9890 }, { "epoch": 2.6094182825484764, "grad_norm": 0.031659260392189026, "learning_rate": 4.928514903719336e-06, "loss": 0.0005, "step": 9892 }, { "epoch": 2.609945917425142, "grad_norm": 0.2781223952770233, "learning_rate": 4.928163193528532e-06, "loss": 0.0019, "step": 9894 }, { "epoch": 2.610473552301807, "grad_norm": 0.015193932689726353, "learning_rate": 4.92781148333773e-06, "loss": 0.0015, "step": 9896 }, { "epoch": 2.6110011871784726, "grad_norm": 0.017512036487460136, "learning_rate": 4.927459773146927e-06, "loss": 0.0005, "step": 9898 }, { "epoch": 2.6115288220551376, "grad_norm": 0.6094923615455627, "learning_rate": 4.9271080629561245e-06, "loss": 0.0041, "step": 9900 }, { "epoch": 2.6120564569318034, "grad_norm": 0.006647764705121517, "learning_rate": 4.9267563527653214e-06, "loss": 0.0048, "step": 9902 }, { "epoch": 2.6125840918084684, "grad_norm": 0.09929710626602173, "learning_rate": 4.926404642574518e-06, "loss": 0.0035, "step": 9904 }, { "epoch": 2.613111726685134, "grad_norm": 0.19741766154766083, "learning_rate": 4.926052932383716e-06, "loss": 0.0019, "step": 9906 }, { "epoch": 2.613639361561799, "grad_norm": 0.010488664731383324, "learning_rate": 4.925701222192913e-06, "loss": 0.0066, "step": 9908 }, { "epoch": 2.6141669964384646, "grad_norm": 0.15106706321239471, "learning_rate": 4.92534951200211e-06, "loss": 0.0035, "step": 9910 }, { "epoch": 2.61469463131513, "grad_norm": 0.09873715788125992, "learning_rate": 4.924997801811308e-06, "loss": 0.0025, "step": 9912 }, { "epoch": 2.6152222661917954, "grad_norm": 0.547773540019989, "learning_rate": 4.924646091620505e-06, "loss": 0.0067, "step": 9914 }, { "epoch": 2.615749901068461, "grad_norm": 0.0769025906920433, "learning_rate": 4.924294381429703e-06, "loss": 0.0009, "step": 9916 }, { "epoch": 2.616277535945126, "grad_norm": 0.008484264835715294, "learning_rate": 4.923942671238899e-06, "loss": 0.0009, "step": 9918 }, { "epoch": 2.616805170821791, "grad_norm": 0.2122122049331665, "learning_rate": 4.9235909610480965e-06, "loss": 0.0016, "step": 9920 }, { "epoch": 2.6173328056984566, "grad_norm": 0.005973452236503363, "learning_rate": 4.9232392508572935e-06, "loss": 0.0004, "step": 9922 }, { "epoch": 2.617860440575122, "grad_norm": 0.030672507360577583, "learning_rate": 4.9228875406664904e-06, "loss": 0.0008, "step": 9924 }, { "epoch": 2.6183880754517874, "grad_norm": 0.017173411324620247, "learning_rate": 4.922535830475688e-06, "loss": 0.0004, "step": 9926 }, { "epoch": 2.618915710328453, "grad_norm": 0.4220389425754547, "learning_rate": 4.922184120284885e-06, "loss": 0.0146, "step": 9928 }, { "epoch": 2.6194433452051182, "grad_norm": 0.45849138498306274, "learning_rate": 4.921832410094083e-06, "loss": 0.0076, "step": 9930 }, { "epoch": 2.619970980081783, "grad_norm": 0.15910938382148743, "learning_rate": 4.92148069990328e-06, "loss": 0.0028, "step": 9932 }, { "epoch": 2.6204986149584486, "grad_norm": 0.09653618931770325, "learning_rate": 4.921128989712477e-06, "loss": 0.0141, "step": 9934 }, { "epoch": 2.621026249835114, "grad_norm": 0.0178357120603323, "learning_rate": 4.920777279521675e-06, "loss": 0.0005, "step": 9936 }, { "epoch": 2.6215538847117794, "grad_norm": 0.012115628458559513, "learning_rate": 4.920425569330872e-06, "loss": 0.0007, "step": 9938 }, { "epoch": 2.622081519588445, "grad_norm": 0.16294120252132416, "learning_rate": 4.920073859140069e-06, "loss": 0.0012, "step": 9940 }, { "epoch": 2.6226091544651102, "grad_norm": 0.49113729596138, "learning_rate": 4.9197221489492656e-06, "loss": 0.0075, "step": 9942 }, { "epoch": 2.6231367893417756, "grad_norm": 0.057428449392318726, "learning_rate": 4.919370438758463e-06, "loss": 0.0007, "step": 9944 }, { "epoch": 2.6236644242184406, "grad_norm": 0.07623180001974106, "learning_rate": 4.91901872856766e-06, "loss": 0.0009, "step": 9946 }, { "epoch": 2.6241920590951064, "grad_norm": 0.05244537070393562, "learning_rate": 4.918667018376857e-06, "loss": 0.0048, "step": 9948 }, { "epoch": 2.6247196939717714, "grad_norm": 0.06570587307214737, "learning_rate": 4.918315308186055e-06, "loss": 0.0006, "step": 9950 }, { "epoch": 2.625247328848437, "grad_norm": 0.7374190092086792, "learning_rate": 4.917963597995252e-06, "loss": 0.0037, "step": 9952 }, { "epoch": 2.625774963725102, "grad_norm": 0.03664751350879669, "learning_rate": 4.91761188780445e-06, "loss": 0.0072, "step": 9954 }, { "epoch": 2.6263025986017676, "grad_norm": 1.012587308883667, "learning_rate": 4.917260177613646e-06, "loss": 0.003, "step": 9956 }, { "epoch": 2.626830233478433, "grad_norm": 0.020769573748111725, "learning_rate": 4.916908467422844e-06, "loss": 0.0009, "step": 9958 }, { "epoch": 2.6273578683550984, "grad_norm": 0.013554254546761513, "learning_rate": 4.9165567572320415e-06, "loss": 0.0004, "step": 9960 }, { "epoch": 2.627885503231764, "grad_norm": 0.039477478712797165, "learning_rate": 4.916205047041238e-06, "loss": 0.0032, "step": 9962 }, { "epoch": 2.628413138108429, "grad_norm": 0.07941809296607971, "learning_rate": 4.915853336850435e-06, "loss": 0.0033, "step": 9964 }, { "epoch": 2.628940772985094, "grad_norm": 0.5032215118408203, "learning_rate": 4.915501626659632e-06, "loss": 0.0014, "step": 9966 }, { "epoch": 2.6294684078617596, "grad_norm": 0.017360080033540726, "learning_rate": 4.91514991646883e-06, "loss": 0.0057, "step": 9968 }, { "epoch": 2.629996042738425, "grad_norm": 0.004008777439594269, "learning_rate": 4.914798206278027e-06, "loss": 0.0006, "step": 9970 }, { "epoch": 2.6305236776150904, "grad_norm": 0.2023034542798996, "learning_rate": 4.914446496087224e-06, "loss": 0.0041, "step": 9972 }, { "epoch": 2.631051312491756, "grad_norm": 0.005043225362896919, "learning_rate": 4.914094785896422e-06, "loss": 0.0003, "step": 9974 }, { "epoch": 2.6315789473684212, "grad_norm": 0.03163626417517662, "learning_rate": 4.913743075705619e-06, "loss": 0.0005, "step": 9976 }, { "epoch": 2.632106582245086, "grad_norm": 0.9041308164596558, "learning_rate": 4.913391365514816e-06, "loss": 0.0068, "step": 9978 }, { "epoch": 2.6326342171217516, "grad_norm": 0.015023363754153252, "learning_rate": 4.913039655324013e-06, "loss": 0.0004, "step": 9980 }, { "epoch": 2.633161851998417, "grad_norm": 0.13436682522296906, "learning_rate": 4.9126879451332105e-06, "loss": 0.0014, "step": 9982 }, { "epoch": 2.6336894868750824, "grad_norm": 0.049752503633499146, "learning_rate": 4.912336234942408e-06, "loss": 0.0006, "step": 9984 }, { "epoch": 2.634217121751748, "grad_norm": 0.021008020266890526, "learning_rate": 4.9119845247516044e-06, "loss": 0.0007, "step": 9986 }, { "epoch": 2.6347447566284132, "grad_norm": 0.024554574862122536, "learning_rate": 4.911632814560802e-06, "loss": 0.0004, "step": 9988 }, { "epoch": 2.6352723915050786, "grad_norm": 0.008210291154682636, "learning_rate": 4.911281104369999e-06, "loss": 0.0003, "step": 9990 }, { "epoch": 2.6358000263817436, "grad_norm": 0.20938223600387573, "learning_rate": 4.910929394179196e-06, "loss": 0.0041, "step": 9992 }, { "epoch": 2.6363276612584094, "grad_norm": 0.05016615614295006, "learning_rate": 4.910577683988394e-06, "loss": 0.0015, "step": 9994 }, { "epoch": 2.6368552961350744, "grad_norm": 0.002989637665450573, "learning_rate": 4.910225973797591e-06, "loss": 0.0007, "step": 9996 }, { "epoch": 2.63738293101174, "grad_norm": 0.05841299518942833, "learning_rate": 4.909874263606789e-06, "loss": 0.0007, "step": 9998 }, { "epoch": 2.6379105658884052, "grad_norm": 0.007364336866885424, "learning_rate": 4.909522553415985e-06, "loss": 0.001, "step": 10000 }, { "epoch": 2.6379105658884052, "eval_loss": 0.003040681593120098, "eval_runtime": 305.4416, "eval_samples_per_second": 705.997, "eval_steps_per_second": 88.253, "step": 10000 }, { "epoch": 2.6384382007650706, "grad_norm": 0.1638088822364807, "learning_rate": 4.909170843225183e-06, "loss": 0.0014, "step": 10002 }, { "epoch": 2.638965835641736, "grad_norm": 0.009529449976980686, "learning_rate": 4.9088191330343795e-06, "loss": 0.0005, "step": 10004 }, { "epoch": 2.639493470518401, "grad_norm": 0.6917498111724854, "learning_rate": 4.908467422843577e-06, "loss": 0.0056, "step": 10006 }, { "epoch": 2.640021105395067, "grad_norm": 0.005256994627416134, "learning_rate": 4.908115712652774e-06, "loss": 0.001, "step": 10008 }, { "epoch": 2.640548740271732, "grad_norm": 0.35710135102272034, "learning_rate": 4.907764002461971e-06, "loss": 0.0101, "step": 10010 }, { "epoch": 2.641076375148397, "grad_norm": 0.01792905293405056, "learning_rate": 4.907412292271169e-06, "loss": 0.0004, "step": 10012 }, { "epoch": 2.6416040100250626, "grad_norm": 0.027957025915384293, "learning_rate": 4.907060582080366e-06, "loss": 0.0008, "step": 10014 }, { "epoch": 2.642131644901728, "grad_norm": 0.5192543864250183, "learning_rate": 4.906708871889563e-06, "loss": 0.0167, "step": 10016 }, { "epoch": 2.6426592797783934, "grad_norm": 0.4543107748031616, "learning_rate": 4.906357161698761e-06, "loss": 0.0112, "step": 10018 }, { "epoch": 2.643186914655059, "grad_norm": 0.01960085518658161, "learning_rate": 4.906005451507958e-06, "loss": 0.0051, "step": 10020 }, { "epoch": 2.6437145495317242, "grad_norm": 0.010365528985857964, "learning_rate": 4.905653741317155e-06, "loss": 0.0031, "step": 10022 }, { "epoch": 2.644242184408389, "grad_norm": 0.14326448738574982, "learning_rate": 4.905302031126352e-06, "loss": 0.0006, "step": 10024 }, { "epoch": 2.6447698192850546, "grad_norm": 0.504036545753479, "learning_rate": 4.904950320935549e-06, "loss": 0.0036, "step": 10026 }, { "epoch": 2.64529745416172, "grad_norm": 0.6032055020332336, "learning_rate": 4.904598610744746e-06, "loss": 0.0036, "step": 10028 }, { "epoch": 2.6458250890383854, "grad_norm": 0.15757279098033905, "learning_rate": 4.904246900553943e-06, "loss": 0.0018, "step": 10030 }, { "epoch": 2.646352723915051, "grad_norm": 0.41118180751800537, "learning_rate": 4.903895190363141e-06, "loss": 0.0077, "step": 10032 }, { "epoch": 2.6468803587917162, "grad_norm": 0.02871677279472351, "learning_rate": 4.903543480172338e-06, "loss": 0.0011, "step": 10034 }, { "epoch": 2.6474079936683816, "grad_norm": 0.1100771352648735, "learning_rate": 4.903191769981536e-06, "loss": 0.0009, "step": 10036 }, { "epoch": 2.6479356285450466, "grad_norm": 0.1135307177901268, "learning_rate": 4.902840059790732e-06, "loss": 0.0026, "step": 10038 }, { "epoch": 2.648463263421712, "grad_norm": 0.10444889962673187, "learning_rate": 4.90248834959993e-06, "loss": 0.0023, "step": 10040 }, { "epoch": 2.6489908982983774, "grad_norm": 0.021869085729122162, "learning_rate": 4.9021366394091276e-06, "loss": 0.0008, "step": 10042 }, { "epoch": 2.649518533175043, "grad_norm": 0.07090561836957932, "learning_rate": 4.9017849292183245e-06, "loss": 0.0011, "step": 10044 }, { "epoch": 2.6500461680517082, "grad_norm": 0.11490292847156525, "learning_rate": 4.9014332190275215e-06, "loss": 0.0008, "step": 10046 }, { "epoch": 2.6505738029283736, "grad_norm": 0.5612719655036926, "learning_rate": 4.9010815088367184e-06, "loss": 0.0067, "step": 10048 }, { "epoch": 2.651101437805039, "grad_norm": 0.03566773980855942, "learning_rate": 4.900729798645916e-06, "loss": 0.0045, "step": 10050 }, { "epoch": 2.651629072681704, "grad_norm": 0.05001593753695488, "learning_rate": 4.900378088455113e-06, "loss": 0.0037, "step": 10052 }, { "epoch": 2.65215670755837, "grad_norm": 0.03008742816746235, "learning_rate": 4.90002637826431e-06, "loss": 0.0014, "step": 10054 }, { "epoch": 2.652684342435035, "grad_norm": 0.026149284094572067, "learning_rate": 4.899674668073508e-06, "loss": 0.0086, "step": 10056 }, { "epoch": 2.6532119773117, "grad_norm": 0.25955918431282043, "learning_rate": 4.899322957882705e-06, "loss": 0.0014, "step": 10058 }, { "epoch": 2.6537396121883656, "grad_norm": 0.030602065846323967, "learning_rate": 4.898971247691902e-06, "loss": 0.0006, "step": 10060 }, { "epoch": 2.654267247065031, "grad_norm": 0.06483785063028336, "learning_rate": 4.898619537501099e-06, "loss": 0.0006, "step": 10062 }, { "epoch": 2.6547948819416964, "grad_norm": 0.23096852004528046, "learning_rate": 4.898267827310297e-06, "loss": 0.001, "step": 10064 }, { "epoch": 2.655322516818362, "grad_norm": 0.07847417145967484, "learning_rate": 4.8979161171194935e-06, "loss": 0.0006, "step": 10066 }, { "epoch": 2.6558501516950273, "grad_norm": 0.09289401769638062, "learning_rate": 4.8975644069286905e-06, "loss": 0.0044, "step": 10068 }, { "epoch": 2.656377786571692, "grad_norm": 0.0065283156000077724, "learning_rate": 4.897212696737888e-06, "loss": 0.0003, "step": 10070 }, { "epoch": 2.6569054214483576, "grad_norm": 0.03705993667244911, "learning_rate": 4.896860986547085e-06, "loss": 0.0006, "step": 10072 }, { "epoch": 2.657433056325023, "grad_norm": 0.763235867023468, "learning_rate": 4.896509276356283e-06, "loss": 0.0106, "step": 10074 }, { "epoch": 2.6579606912016884, "grad_norm": 0.09963233768939972, "learning_rate": 4.89615756616548e-06, "loss": 0.0105, "step": 10076 }, { "epoch": 2.658488326078354, "grad_norm": 0.1379678100347519, "learning_rate": 4.895805855974677e-06, "loss": 0.0035, "step": 10078 }, { "epoch": 2.6590159609550192, "grad_norm": 0.012368221767246723, "learning_rate": 4.895454145783875e-06, "loss": 0.0003, "step": 10080 }, { "epoch": 2.6595435958316846, "grad_norm": 0.020039286464452744, "learning_rate": 4.895102435593071e-06, "loss": 0.0005, "step": 10082 }, { "epoch": 2.6600712307083496, "grad_norm": 0.07973247766494751, "learning_rate": 4.894750725402269e-06, "loss": 0.0009, "step": 10084 }, { "epoch": 2.660598865585015, "grad_norm": 0.0617186464369297, "learning_rate": 4.894399015211466e-06, "loss": 0.0006, "step": 10086 }, { "epoch": 2.6611265004616804, "grad_norm": 0.023486996069550514, "learning_rate": 4.894047305020663e-06, "loss": 0.0004, "step": 10088 }, { "epoch": 2.661654135338346, "grad_norm": 0.07670652866363525, "learning_rate": 4.89369559482986e-06, "loss": 0.0031, "step": 10090 }, { "epoch": 2.6621817702150112, "grad_norm": 0.05943745747208595, "learning_rate": 4.893343884639057e-06, "loss": 0.0033, "step": 10092 }, { "epoch": 2.6627094050916766, "grad_norm": 0.6848414540290833, "learning_rate": 4.892992174448255e-06, "loss": 0.0068, "step": 10094 }, { "epoch": 2.663237039968342, "grad_norm": 0.057913076132535934, "learning_rate": 4.892640464257452e-06, "loss": 0.0041, "step": 10096 }, { "epoch": 2.663764674845007, "grad_norm": 0.014051779173314571, "learning_rate": 4.892288754066649e-06, "loss": 0.0006, "step": 10098 }, { "epoch": 2.664292309721673, "grad_norm": 0.02052035555243492, "learning_rate": 4.891937043875847e-06, "loss": 0.0004, "step": 10100 }, { "epoch": 2.664819944598338, "grad_norm": 0.566943347454071, "learning_rate": 4.891585333685044e-06, "loss": 0.0018, "step": 10102 }, { "epoch": 2.6653475794750032, "grad_norm": 0.019973741844296455, "learning_rate": 4.8912336234942416e-06, "loss": 0.0066, "step": 10104 }, { "epoch": 2.6658752143516686, "grad_norm": 0.03614860773086548, "learning_rate": 4.890881913303438e-06, "loss": 0.0019, "step": 10106 }, { "epoch": 2.666402849228334, "grad_norm": 0.40839439630508423, "learning_rate": 4.8905302031126355e-06, "loss": 0.006, "step": 10108 }, { "epoch": 2.6669304841049994, "grad_norm": 0.3382878601551056, "learning_rate": 4.890178492921832e-06, "loss": 0.013, "step": 10110 }, { "epoch": 2.667458118981665, "grad_norm": 0.15186968445777893, "learning_rate": 4.889826782731029e-06, "loss": 0.0018, "step": 10112 }, { "epoch": 2.6679857538583303, "grad_norm": 0.19736847281455994, "learning_rate": 4.889475072540227e-06, "loss": 0.0007, "step": 10114 }, { "epoch": 2.668513388734995, "grad_norm": 0.22666281461715698, "learning_rate": 4.889123362349424e-06, "loss": 0.0063, "step": 10116 }, { "epoch": 2.6690410236116606, "grad_norm": 0.012691947631537914, "learning_rate": 4.888771652158622e-06, "loss": 0.0007, "step": 10118 }, { "epoch": 2.669568658488326, "grad_norm": 0.41208407282829285, "learning_rate": 4.888419941967818e-06, "loss": 0.0026, "step": 10120 }, { "epoch": 2.6700962933649914, "grad_norm": 0.07850033789873123, "learning_rate": 4.888068231777016e-06, "loss": 0.0006, "step": 10122 }, { "epoch": 2.670623928241657, "grad_norm": 0.020112017169594765, "learning_rate": 4.887716521586213e-06, "loss": 0.0007, "step": 10124 }, { "epoch": 2.6711515631183222, "grad_norm": 0.24020005762577057, "learning_rate": 4.887364811395411e-06, "loss": 0.0011, "step": 10126 }, { "epoch": 2.6716791979949877, "grad_norm": 0.20279309153556824, "learning_rate": 4.8870131012046075e-06, "loss": 0.0038, "step": 10128 }, { "epoch": 2.6722068328716526, "grad_norm": 0.22403864562511444, "learning_rate": 4.8866613910138045e-06, "loss": 0.0035, "step": 10130 }, { "epoch": 2.672734467748318, "grad_norm": 0.7049311995506287, "learning_rate": 4.886309680823002e-06, "loss": 0.0082, "step": 10132 }, { "epoch": 2.6732621026249834, "grad_norm": 0.22138287127017975, "learning_rate": 4.885957970632199e-06, "loss": 0.0035, "step": 10134 }, { "epoch": 2.673789737501649, "grad_norm": 2.5631678104400635, "learning_rate": 4.885606260441396e-06, "loss": 0.01, "step": 10136 }, { "epoch": 2.6743173723783142, "grad_norm": 0.07217426598072052, "learning_rate": 4.885254550250594e-06, "loss": 0.0007, "step": 10138 }, { "epoch": 2.6748450072549796, "grad_norm": 0.18553683161735535, "learning_rate": 4.884902840059791e-06, "loss": 0.0017, "step": 10140 }, { "epoch": 2.675372642131645, "grad_norm": 0.19463655352592468, "learning_rate": 4.884551129868988e-06, "loss": 0.0095, "step": 10142 }, { "epoch": 2.67590027700831, "grad_norm": 0.06316284090280533, "learning_rate": 4.884199419678185e-06, "loss": 0.0007, "step": 10144 }, { "epoch": 2.676427911884976, "grad_norm": 0.22067295014858246, "learning_rate": 4.883847709487383e-06, "loss": 0.0188, "step": 10146 }, { "epoch": 2.676955546761641, "grad_norm": 0.9649526476860046, "learning_rate": 4.88349599929658e-06, "loss": 0.0041, "step": 10148 }, { "epoch": 2.6774831816383062, "grad_norm": 0.09002581983804703, "learning_rate": 4.8831442891057766e-06, "loss": 0.006, "step": 10150 }, { "epoch": 2.6780108165149716, "grad_norm": 0.6324059367179871, "learning_rate": 4.882792578914974e-06, "loss": 0.0079, "step": 10152 }, { "epoch": 2.678538451391637, "grad_norm": 0.2388220876455307, "learning_rate": 4.882440868724171e-06, "loss": 0.0017, "step": 10154 }, { "epoch": 2.6790660862683024, "grad_norm": 0.11611224710941315, "learning_rate": 4.882089158533369e-06, "loss": 0.001, "step": 10156 }, { "epoch": 2.6795937211449674, "grad_norm": 0.026970673352479935, "learning_rate": 4.881737448342566e-06, "loss": 0.0043, "step": 10158 }, { "epoch": 2.6801213560216333, "grad_norm": 0.6008735299110413, "learning_rate": 4.881385738151763e-06, "loss": 0.0028, "step": 10160 }, { "epoch": 2.6806489908982982, "grad_norm": 0.29030221700668335, "learning_rate": 4.881034027960961e-06, "loss": 0.0032, "step": 10162 }, { "epoch": 2.6811766257749636, "grad_norm": 0.1571294218301773, "learning_rate": 4.880682317770158e-06, "loss": 0.0012, "step": 10164 }, { "epoch": 2.681704260651629, "grad_norm": 0.02661198377609253, "learning_rate": 4.880330607579355e-06, "loss": 0.0005, "step": 10166 }, { "epoch": 2.6822318955282944, "grad_norm": 0.020172372460365295, "learning_rate": 4.879978897388552e-06, "loss": 0.0004, "step": 10168 }, { "epoch": 2.68275953040496, "grad_norm": 0.29547277092933655, "learning_rate": 4.8796271871977495e-06, "loss": 0.0083, "step": 10170 }, { "epoch": 2.6832871652816253, "grad_norm": 0.0122764203697443, "learning_rate": 4.879275477006946e-06, "loss": 0.0004, "step": 10172 }, { "epoch": 2.6838148001582907, "grad_norm": 0.6575045585632324, "learning_rate": 4.878923766816143e-06, "loss": 0.0047, "step": 10174 }, { "epoch": 2.6843424350349556, "grad_norm": 0.3791520297527313, "learning_rate": 4.878572056625341e-06, "loss": 0.001, "step": 10176 }, { "epoch": 2.684870069911621, "grad_norm": 0.04505553096532822, "learning_rate": 4.878220346434538e-06, "loss": 0.0005, "step": 10178 }, { "epoch": 2.6853977047882864, "grad_norm": 0.3244813084602356, "learning_rate": 4.877868636243735e-06, "loss": 0.0052, "step": 10180 }, { "epoch": 2.685925339664952, "grad_norm": 0.04633112996816635, "learning_rate": 4.877516926052932e-06, "loss": 0.0018, "step": 10182 }, { "epoch": 2.6864529745416172, "grad_norm": 0.23033525049686432, "learning_rate": 4.87716521586213e-06, "loss": 0.0076, "step": 10184 }, { "epoch": 2.6869806094182827, "grad_norm": 0.3848191797733307, "learning_rate": 4.876813505671328e-06, "loss": 0.01, "step": 10186 }, { "epoch": 2.687508244294948, "grad_norm": 0.05901183933019638, "learning_rate": 4.876461795480524e-06, "loss": 0.0014, "step": 10188 }, { "epoch": 2.688035879171613, "grad_norm": 0.16025756299495697, "learning_rate": 4.8761100852897215e-06, "loss": 0.0013, "step": 10190 }, { "epoch": 2.688563514048279, "grad_norm": 0.10116593539714813, "learning_rate": 4.8757583750989185e-06, "loss": 0.0125, "step": 10192 }, { "epoch": 2.689091148924944, "grad_norm": 0.3241443336009979, "learning_rate": 4.875406664908116e-06, "loss": 0.0046, "step": 10194 }, { "epoch": 2.6896187838016092, "grad_norm": 0.09015371650457382, "learning_rate": 4.875054954717313e-06, "loss": 0.003, "step": 10196 }, { "epoch": 2.6901464186782746, "grad_norm": 0.10360846668481827, "learning_rate": 4.87470324452651e-06, "loss": 0.003, "step": 10198 }, { "epoch": 2.69067405355494, "grad_norm": 0.11986161768436432, "learning_rate": 4.874351534335708e-06, "loss": 0.0007, "step": 10200 }, { "epoch": 2.6912016884316055, "grad_norm": 0.21223805844783783, "learning_rate": 4.873999824144904e-06, "loss": 0.0013, "step": 10202 }, { "epoch": 2.6917293233082704, "grad_norm": 0.11478252708911896, "learning_rate": 4.873648113954102e-06, "loss": 0.0021, "step": 10204 }, { "epoch": 2.6922569581849363, "grad_norm": 0.4366920292377472, "learning_rate": 4.873296403763299e-06, "loss": 0.0065, "step": 10206 }, { "epoch": 2.6927845930616012, "grad_norm": 0.05605805292725563, "learning_rate": 4.872944693572497e-06, "loss": 0.0005, "step": 10208 }, { "epoch": 2.6933122279382666, "grad_norm": 0.19040991365909576, "learning_rate": 4.872592983381694e-06, "loss": 0.0013, "step": 10210 }, { "epoch": 2.693839862814932, "grad_norm": 0.03345591202378273, "learning_rate": 4.8722412731908905e-06, "loss": 0.0037, "step": 10212 }, { "epoch": 2.6943674976915974, "grad_norm": 0.020715264603495598, "learning_rate": 4.871889563000088e-06, "loss": 0.0004, "step": 10214 }, { "epoch": 2.694895132568263, "grad_norm": 0.06666180491447449, "learning_rate": 4.871537852809285e-06, "loss": 0.0007, "step": 10216 }, { "epoch": 2.6954227674449283, "grad_norm": 0.1913573443889618, "learning_rate": 4.871186142618482e-06, "loss": 0.0038, "step": 10218 }, { "epoch": 2.6959504023215937, "grad_norm": 0.03835398703813553, "learning_rate": 4.87083443242768e-06, "loss": 0.0011, "step": 10220 }, { "epoch": 2.6964780371982586, "grad_norm": 0.00795105192810297, "learning_rate": 4.870482722236877e-06, "loss": 0.0004, "step": 10222 }, { "epoch": 2.697005672074924, "grad_norm": 0.11909277737140656, "learning_rate": 4.870131012046075e-06, "loss": 0.0114, "step": 10224 }, { "epoch": 2.6975333069515894, "grad_norm": 0.7554083466529846, "learning_rate": 4.869779301855271e-06, "loss": 0.0027, "step": 10226 }, { "epoch": 2.698060941828255, "grad_norm": 0.06110072508454323, "learning_rate": 4.869427591664469e-06, "loss": 0.0007, "step": 10228 }, { "epoch": 2.6985885767049202, "grad_norm": 0.37390798330307007, "learning_rate": 4.869075881473666e-06, "loss": 0.0117, "step": 10230 }, { "epoch": 2.6991162115815857, "grad_norm": 0.11343719065189362, "learning_rate": 4.868724171282863e-06, "loss": 0.0007, "step": 10232 }, { "epoch": 2.699643846458251, "grad_norm": 0.4225482642650604, "learning_rate": 4.86837246109206e-06, "loss": 0.0049, "step": 10234 }, { "epoch": 2.700171481334916, "grad_norm": 0.7115074396133423, "learning_rate": 4.868020750901257e-06, "loss": 0.0015, "step": 10236 }, { "epoch": 2.7006991162115814, "grad_norm": 0.1360260397195816, "learning_rate": 4.867669040710455e-06, "loss": 0.0007, "step": 10238 }, { "epoch": 2.701226751088247, "grad_norm": 0.05240325629711151, "learning_rate": 4.867317330519651e-06, "loss": 0.0045, "step": 10240 }, { "epoch": 2.7017543859649122, "grad_norm": 0.9093762040138245, "learning_rate": 4.866965620328849e-06, "loss": 0.0124, "step": 10242 }, { "epoch": 2.7022820208415776, "grad_norm": 0.20002205669879913, "learning_rate": 4.866613910138047e-06, "loss": 0.0015, "step": 10244 }, { "epoch": 2.702809655718243, "grad_norm": 0.1969161331653595, "learning_rate": 4.866262199947244e-06, "loss": 0.0069, "step": 10246 }, { "epoch": 2.7033372905949085, "grad_norm": 0.6218304634094238, "learning_rate": 4.865910489756441e-06, "loss": 0.006, "step": 10248 }, { "epoch": 2.7038649254715734, "grad_norm": 0.3879711329936981, "learning_rate": 4.865558779565638e-06, "loss": 0.0013, "step": 10250 }, { "epoch": 2.7043925603482393, "grad_norm": 0.3767373263835907, "learning_rate": 4.8652070693748355e-06, "loss": 0.0014, "step": 10252 }, { "epoch": 2.7049201952249042, "grad_norm": 0.0656413733959198, "learning_rate": 4.8648553591840325e-06, "loss": 0.0007, "step": 10254 }, { "epoch": 2.7054478301015696, "grad_norm": 0.19714100658893585, "learning_rate": 4.8645036489932294e-06, "loss": 0.0019, "step": 10256 }, { "epoch": 2.705975464978235, "grad_norm": 0.0757841244339943, "learning_rate": 4.864151938802427e-06, "loss": 0.0006, "step": 10258 }, { "epoch": 2.7065030998549005, "grad_norm": 0.030212296172976494, "learning_rate": 4.863800228611624e-06, "loss": 0.0008, "step": 10260 }, { "epoch": 2.707030734731566, "grad_norm": 0.011853760108351707, "learning_rate": 4.863448518420821e-06, "loss": 0.0004, "step": 10262 }, { "epoch": 2.7075583696082313, "grad_norm": 0.1724441647529602, "learning_rate": 4.863096808230018e-06, "loss": 0.0011, "step": 10264 }, { "epoch": 2.7080860044848967, "grad_norm": 0.6389613747596741, "learning_rate": 4.862745098039216e-06, "loss": 0.0067, "step": 10266 }, { "epoch": 2.7086136393615616, "grad_norm": 0.01674388349056244, "learning_rate": 4.862393387848413e-06, "loss": 0.0004, "step": 10268 }, { "epoch": 2.709141274238227, "grad_norm": 0.050362586975097656, "learning_rate": 4.86204167765761e-06, "loss": 0.0006, "step": 10270 }, { "epoch": 2.7096689091148924, "grad_norm": 0.32506465911865234, "learning_rate": 4.861689967466808e-06, "loss": 0.008, "step": 10272 }, { "epoch": 2.710196543991558, "grad_norm": 0.011596443131566048, "learning_rate": 4.8613382572760045e-06, "loss": 0.0004, "step": 10274 }, { "epoch": 2.7107241788682233, "grad_norm": 0.4819231331348419, "learning_rate": 4.860986547085202e-06, "loss": 0.0066, "step": 10276 }, { "epoch": 2.7112518137448887, "grad_norm": 0.41303539276123047, "learning_rate": 4.860634836894399e-06, "loss": 0.0062, "step": 10278 }, { "epoch": 2.711779448621554, "grad_norm": 0.16729678213596344, "learning_rate": 4.860283126703596e-06, "loss": 0.001, "step": 10280 }, { "epoch": 2.712307083498219, "grad_norm": 0.43211230635643005, "learning_rate": 4.859931416512794e-06, "loss": 0.0043, "step": 10282 }, { "epoch": 2.7128347183748844, "grad_norm": 0.04577764868736267, "learning_rate": 4.859579706321991e-06, "loss": 0.0011, "step": 10284 }, { "epoch": 2.71336235325155, "grad_norm": 0.058543212711811066, "learning_rate": 4.859227996131188e-06, "loss": 0.0009, "step": 10286 }, { "epoch": 2.7138899881282152, "grad_norm": 0.06239699944853783, "learning_rate": 4.858876285940385e-06, "loss": 0.0063, "step": 10288 }, { "epoch": 2.7144176230048807, "grad_norm": 0.10044394433498383, "learning_rate": 4.858524575749583e-06, "loss": 0.0026, "step": 10290 }, { "epoch": 2.714945257881546, "grad_norm": 0.1531144231557846, "learning_rate": 4.85817286555878e-06, "loss": 0.0009, "step": 10292 }, { "epoch": 2.7154728927582115, "grad_norm": 0.04258231073617935, "learning_rate": 4.857821155367977e-06, "loss": 0.0008, "step": 10294 }, { "epoch": 2.7160005276348764, "grad_norm": 0.029274897649884224, "learning_rate": 4.857469445177174e-06, "loss": 0.0006, "step": 10296 }, { "epoch": 2.7165281625115423, "grad_norm": 0.5427647233009338, "learning_rate": 4.857117734986371e-06, "loss": 0.0026, "step": 10298 }, { "epoch": 2.7170557973882072, "grad_norm": 0.11700332164764404, "learning_rate": 4.856766024795568e-06, "loss": 0.0008, "step": 10300 }, { "epoch": 2.7175834322648726, "grad_norm": 0.2297973334789276, "learning_rate": 4.856414314604766e-06, "loss": 0.0109, "step": 10302 }, { "epoch": 2.718111067141538, "grad_norm": 0.041337769478559494, "learning_rate": 4.856062604413963e-06, "loss": 0.0007, "step": 10304 }, { "epoch": 2.7186387020182035, "grad_norm": 0.03361361846327782, "learning_rate": 4.855710894223161e-06, "loss": 0.0061, "step": 10306 }, { "epoch": 2.719166336894869, "grad_norm": 0.11685937643051147, "learning_rate": 4.855359184032357e-06, "loss": 0.0103, "step": 10308 }, { "epoch": 2.719693971771534, "grad_norm": 0.025240659713745117, "learning_rate": 4.855007473841555e-06, "loss": 0.0044, "step": 10310 }, { "epoch": 2.7202216066481997, "grad_norm": 0.025921594351530075, "learning_rate": 4.854655763650752e-06, "loss": 0.0005, "step": 10312 }, { "epoch": 2.7207492415248646, "grad_norm": 0.04736468940973282, "learning_rate": 4.8543040534599495e-06, "loss": 0.0027, "step": 10314 }, { "epoch": 2.72127687640153, "grad_norm": 0.03261805325746536, "learning_rate": 4.8539523432691465e-06, "loss": 0.0048, "step": 10316 }, { "epoch": 2.7218045112781954, "grad_norm": 0.015615081414580345, "learning_rate": 4.853600633078343e-06, "loss": 0.0005, "step": 10318 }, { "epoch": 2.722332146154861, "grad_norm": 0.02141091600060463, "learning_rate": 4.853248922887541e-06, "loss": 0.0006, "step": 10320 }, { "epoch": 2.7228597810315263, "grad_norm": 0.05008675530552864, "learning_rate": 4.852897212696737e-06, "loss": 0.005, "step": 10322 }, { "epoch": 2.7233874159081917, "grad_norm": 0.013426889665424824, "learning_rate": 4.852545502505935e-06, "loss": 0.0006, "step": 10324 }, { "epoch": 2.723915050784857, "grad_norm": 0.05142280459403992, "learning_rate": 4.852193792315132e-06, "loss": 0.0007, "step": 10326 }, { "epoch": 2.724442685661522, "grad_norm": 0.2835778295993805, "learning_rate": 4.85184208212433e-06, "loss": 0.0009, "step": 10328 }, { "epoch": 2.7249703205381874, "grad_norm": 0.31152480840682983, "learning_rate": 4.851490371933527e-06, "loss": 0.0014, "step": 10330 }, { "epoch": 2.725497955414853, "grad_norm": 0.019793406128883362, "learning_rate": 4.851138661742724e-06, "loss": 0.001, "step": 10332 }, { "epoch": 2.7260255902915183, "grad_norm": 0.4360874593257904, "learning_rate": 4.850786951551922e-06, "loss": 0.0023, "step": 10334 }, { "epoch": 2.7265532251681837, "grad_norm": 0.20413795113563538, "learning_rate": 4.8504352413611185e-06, "loss": 0.0044, "step": 10336 }, { "epoch": 2.727080860044849, "grad_norm": 0.0983135774731636, "learning_rate": 4.8500835311703155e-06, "loss": 0.0009, "step": 10338 }, { "epoch": 2.7276084949215145, "grad_norm": 0.13183896243572235, "learning_rate": 4.849731820979513e-06, "loss": 0.0009, "step": 10340 }, { "epoch": 2.7281361297981794, "grad_norm": 0.4210481345653534, "learning_rate": 4.84938011078871e-06, "loss": 0.0062, "step": 10342 }, { "epoch": 2.7286637646748453, "grad_norm": 0.3697989583015442, "learning_rate": 4.849028400597908e-06, "loss": 0.0131, "step": 10344 }, { "epoch": 2.7291913995515102, "grad_norm": 0.12896735966205597, "learning_rate": 4.848676690407104e-06, "loss": 0.0008, "step": 10346 }, { "epoch": 2.7297190344281757, "grad_norm": 0.02762562967836857, "learning_rate": 4.848324980216302e-06, "loss": 0.0005, "step": 10348 }, { "epoch": 2.730246669304841, "grad_norm": 0.02130773290991783, "learning_rate": 4.847973270025499e-06, "loss": 0.0005, "step": 10350 }, { "epoch": 2.7307743041815065, "grad_norm": 0.3638101816177368, "learning_rate": 4.847621559834696e-06, "loss": 0.0012, "step": 10352 }, { "epoch": 2.731301939058172, "grad_norm": 0.15202131867408752, "learning_rate": 4.847269849643894e-06, "loss": 0.0008, "step": 10354 }, { "epoch": 2.731829573934837, "grad_norm": 0.11033894121646881, "learning_rate": 4.846918139453091e-06, "loss": 0.0077, "step": 10356 }, { "epoch": 2.7323572088115027, "grad_norm": 0.01559466402977705, "learning_rate": 4.846566429262288e-06, "loss": 0.0004, "step": 10358 }, { "epoch": 2.7328848436881676, "grad_norm": 0.02433779463171959, "learning_rate": 4.846214719071485e-06, "loss": 0.0004, "step": 10360 }, { "epoch": 2.733412478564833, "grad_norm": 0.006304068956524134, "learning_rate": 4.845863008880682e-06, "loss": 0.0004, "step": 10362 }, { "epoch": 2.7339401134414985, "grad_norm": 0.06949096918106079, "learning_rate": 4.84551129868988e-06, "loss": 0.0005, "step": 10364 }, { "epoch": 2.734467748318164, "grad_norm": 0.18216489255428314, "learning_rate": 4.845159588499077e-06, "loss": 0.0074, "step": 10366 }, { "epoch": 2.7349953831948293, "grad_norm": 0.19607144594192505, "learning_rate": 4.844807878308274e-06, "loss": 0.0091, "step": 10368 }, { "epoch": 2.7355230180714947, "grad_norm": 0.22122518718242645, "learning_rate": 4.844456168117471e-06, "loss": 0.0009, "step": 10370 }, { "epoch": 2.73605065294816, "grad_norm": 0.8910457491874695, "learning_rate": 4.844104457926669e-06, "loss": 0.0065, "step": 10372 }, { "epoch": 2.736578287824825, "grad_norm": 0.2883238196372986, "learning_rate": 4.843752747735866e-06, "loss": 0.0018, "step": 10374 }, { "epoch": 2.7371059227014904, "grad_norm": 0.011335792951285839, "learning_rate": 4.843401037545063e-06, "loss": 0.0004, "step": 10376 }, { "epoch": 2.737633557578156, "grad_norm": 0.0253742802888155, "learning_rate": 4.8430493273542605e-06, "loss": 0.0004, "step": 10378 }, { "epoch": 2.7381611924548213, "grad_norm": 0.034985434263944626, "learning_rate": 4.842697617163457e-06, "loss": 0.0008, "step": 10380 }, { "epoch": 2.7386888273314867, "grad_norm": 0.08541343361139297, "learning_rate": 4.842345906972654e-06, "loss": 0.0011, "step": 10382 }, { "epoch": 2.739216462208152, "grad_norm": 0.09939417988061905, "learning_rate": 4.841994196781851e-06, "loss": 0.0101, "step": 10384 }, { "epoch": 2.7397440970848175, "grad_norm": 0.12786734104156494, "learning_rate": 4.841642486591049e-06, "loss": 0.001, "step": 10386 }, { "epoch": 2.7402717319614824, "grad_norm": 0.3092793822288513, "learning_rate": 4.841290776400247e-06, "loss": 0.0041, "step": 10388 }, { "epoch": 2.740799366838148, "grad_norm": 0.37229955196380615, "learning_rate": 4.840939066209443e-06, "loss": 0.0088, "step": 10390 }, { "epoch": 2.7413270017148132, "grad_norm": 0.0667211040854454, "learning_rate": 4.840587356018641e-06, "loss": 0.0048, "step": 10392 }, { "epoch": 2.7418546365914787, "grad_norm": 0.0200495682656765, "learning_rate": 4.840235645827838e-06, "loss": 0.0006, "step": 10394 }, { "epoch": 2.742382271468144, "grad_norm": 0.012367097660899162, "learning_rate": 4.8398839356370356e-06, "loss": 0.0004, "step": 10396 }, { "epoch": 2.7429099063448095, "grad_norm": 0.11800480633974075, "learning_rate": 4.8395322254462325e-06, "loss": 0.0006, "step": 10398 }, { "epoch": 2.743437541221475, "grad_norm": 0.22373801469802856, "learning_rate": 4.8391805152554295e-06, "loss": 0.001, "step": 10400 }, { "epoch": 2.74396517609814, "grad_norm": 0.4586185812950134, "learning_rate": 4.838828805064627e-06, "loss": 0.0019, "step": 10402 }, { "epoch": 2.7444928109748057, "grad_norm": 0.1901768445968628, "learning_rate": 4.838477094873824e-06, "loss": 0.0042, "step": 10404 }, { "epoch": 2.7450204458514706, "grad_norm": 0.41555255651474, "learning_rate": 4.838125384683021e-06, "loss": 0.0027, "step": 10406 }, { "epoch": 2.745548080728136, "grad_norm": 0.20539918541908264, "learning_rate": 4.837773674492218e-06, "loss": 0.0054, "step": 10408 }, { "epoch": 2.7460757156048015, "grad_norm": 0.012164349667727947, "learning_rate": 4.837421964301416e-06, "loss": 0.0004, "step": 10410 }, { "epoch": 2.746603350481467, "grad_norm": 0.3632766902446747, "learning_rate": 4.837070254110613e-06, "loss": 0.0022, "step": 10412 }, { "epoch": 2.7471309853581323, "grad_norm": 0.02188701555132866, "learning_rate": 4.83671854391981e-06, "loss": 0.0067, "step": 10414 }, { "epoch": 2.7476586202347977, "grad_norm": 0.08683837950229645, "learning_rate": 4.836366833729008e-06, "loss": 0.0012, "step": 10416 }, { "epoch": 2.748186255111463, "grad_norm": 0.05272133648395538, "learning_rate": 4.836015123538205e-06, "loss": 0.0005, "step": 10418 }, { "epoch": 2.748713889988128, "grad_norm": 0.011484098620712757, "learning_rate": 4.8356634133474015e-06, "loss": 0.001, "step": 10420 }, { "epoch": 2.7492415248647935, "grad_norm": 0.027518223971128464, "learning_rate": 4.835311703156599e-06, "loss": 0.0004, "step": 10422 }, { "epoch": 2.749769159741459, "grad_norm": 0.772943377494812, "learning_rate": 4.834959992965796e-06, "loss": 0.004, "step": 10424 }, { "epoch": 2.7502967946181243, "grad_norm": 0.5087910890579224, "learning_rate": 4.834608282774994e-06, "loss": 0.0008, "step": 10426 }, { "epoch": 2.7508244294947897, "grad_norm": 0.2763347625732422, "learning_rate": 4.83425657258419e-06, "loss": 0.0076, "step": 10428 }, { "epoch": 2.751352064371455, "grad_norm": 0.6075981855392456, "learning_rate": 4.833904862393388e-06, "loss": 0.0036, "step": 10430 }, { "epoch": 2.7518796992481205, "grad_norm": 0.01771550439298153, "learning_rate": 4.833553152202585e-06, "loss": 0.0005, "step": 10432 }, { "epoch": 2.7524073341247854, "grad_norm": 0.39363938570022583, "learning_rate": 4.833201442011783e-06, "loss": 0.0079, "step": 10434 }, { "epoch": 2.752934969001451, "grad_norm": 0.05366184934973717, "learning_rate": 4.83284973182098e-06, "loss": 0.002, "step": 10436 }, { "epoch": 2.7534626038781163, "grad_norm": 0.18612205982208252, "learning_rate": 4.832498021630177e-06, "loss": 0.0033, "step": 10438 }, { "epoch": 2.7539902387547817, "grad_norm": 0.09095490723848343, "learning_rate": 4.8321463114393745e-06, "loss": 0.0012, "step": 10440 }, { "epoch": 2.754517873631447, "grad_norm": 0.11008474230766296, "learning_rate": 4.8317946012485706e-06, "loss": 0.0042, "step": 10442 }, { "epoch": 2.7550455085081125, "grad_norm": 0.4884432852268219, "learning_rate": 4.831442891057768e-06, "loss": 0.0101, "step": 10444 }, { "epoch": 2.755573143384778, "grad_norm": 0.4944087564945221, "learning_rate": 4.831091180866966e-06, "loss": 0.0025, "step": 10446 }, { "epoch": 2.756100778261443, "grad_norm": 0.3249039947986603, "learning_rate": 4.830739470676163e-06, "loss": 0.0084, "step": 10448 }, { "epoch": 2.7566284131381087, "grad_norm": 0.02587510645389557, "learning_rate": 4.83038776048536e-06, "loss": 0.0005, "step": 10450 }, { "epoch": 2.7571560480147737, "grad_norm": 0.1141115054488182, "learning_rate": 4.830036050294557e-06, "loss": 0.0008, "step": 10452 }, { "epoch": 2.757683682891439, "grad_norm": 0.023916637524962425, "learning_rate": 4.829684340103755e-06, "loss": 0.0129, "step": 10454 }, { "epoch": 2.7582113177681045, "grad_norm": 0.01521134190261364, "learning_rate": 4.829332629912952e-06, "loss": 0.0052, "step": 10456 }, { "epoch": 2.75873895264477, "grad_norm": 0.4546821117401123, "learning_rate": 4.828980919722149e-06, "loss": 0.0032, "step": 10458 }, { "epoch": 2.7592665875214353, "grad_norm": 0.14699193835258484, "learning_rate": 4.8286292095313465e-06, "loss": 0.003, "step": 10460 }, { "epoch": 2.7597942223981002, "grad_norm": 0.01914575695991516, "learning_rate": 4.8282774993405435e-06, "loss": 0.0006, "step": 10462 }, { "epoch": 2.760321857274766, "grad_norm": 0.46031609177589417, "learning_rate": 4.827925789149741e-06, "loss": 0.0102, "step": 10464 }, { "epoch": 2.760849492151431, "grad_norm": 0.08180905878543854, "learning_rate": 4.827574078958937e-06, "loss": 0.0109, "step": 10466 }, { "epoch": 2.7613771270280965, "grad_norm": 0.16583096981048584, "learning_rate": 4.827222368768135e-06, "loss": 0.001, "step": 10468 }, { "epoch": 2.761904761904762, "grad_norm": 0.011692012660205364, "learning_rate": 4.826870658577333e-06, "loss": 0.0005, "step": 10470 }, { "epoch": 2.7624323967814273, "grad_norm": 0.0340028814971447, "learning_rate": 4.826518948386529e-06, "loss": 0.0005, "step": 10472 }, { "epoch": 2.7629600316580927, "grad_norm": 0.3022186756134033, "learning_rate": 4.826167238195727e-06, "loss": 0.002, "step": 10474 }, { "epoch": 2.763487666534758, "grad_norm": 0.059784214943647385, "learning_rate": 4.825815528004924e-06, "loss": 0.0077, "step": 10476 }, { "epoch": 2.7640153014114235, "grad_norm": 0.03717672824859619, "learning_rate": 4.825463817814122e-06, "loss": 0.0004, "step": 10478 }, { "epoch": 2.7645429362880884, "grad_norm": 0.03948628157377243, "learning_rate": 4.825112107623319e-06, "loss": 0.0019, "step": 10480 }, { "epoch": 2.765070571164754, "grad_norm": 0.15712159872055054, "learning_rate": 4.8247603974325155e-06, "loss": 0.0013, "step": 10482 }, { "epoch": 2.7655982060414193, "grad_norm": 0.023069482296705246, "learning_rate": 4.824408687241713e-06, "loss": 0.0009, "step": 10484 }, { "epoch": 2.7661258409180847, "grad_norm": 0.9651660323143005, "learning_rate": 4.82405697705091e-06, "loss": 0.0058, "step": 10486 }, { "epoch": 2.76665347579475, "grad_norm": 0.18852367997169495, "learning_rate": 4.823705266860107e-06, "loss": 0.001, "step": 10488 }, { "epoch": 2.7671811106714155, "grad_norm": 0.046770982444286346, "learning_rate": 4.823353556669304e-06, "loss": 0.0007, "step": 10490 }, { "epoch": 2.767708745548081, "grad_norm": 0.1982787847518921, "learning_rate": 4.823001846478502e-06, "loss": 0.0013, "step": 10492 }, { "epoch": 2.768236380424746, "grad_norm": 0.023306405171751976, "learning_rate": 4.822650136287699e-06, "loss": 0.0007, "step": 10494 }, { "epoch": 2.7687640153014117, "grad_norm": 0.019213387742638588, "learning_rate": 4.822298426096896e-06, "loss": 0.0005, "step": 10496 }, { "epoch": 2.7692916501780767, "grad_norm": 0.03762277215719223, "learning_rate": 4.821946715906094e-06, "loss": 0.0004, "step": 10498 }, { "epoch": 2.769819285054742, "grad_norm": 0.05818525329232216, "learning_rate": 4.821595005715291e-06, "loss": 0.0102, "step": 10500 }, { "epoch": 2.7703469199314075, "grad_norm": 0.01948934979736805, "learning_rate": 4.821243295524488e-06, "loss": 0.0003, "step": 10502 }, { "epoch": 2.770874554808073, "grad_norm": 0.14897245168685913, "learning_rate": 4.820891585333685e-06, "loss": 0.0006, "step": 10504 }, { "epoch": 2.7714021896847383, "grad_norm": 0.452019602060318, "learning_rate": 4.820539875142882e-06, "loss": 0.004, "step": 10506 }, { "epoch": 2.7719298245614032, "grad_norm": 0.10049320012331009, "learning_rate": 4.82018816495208e-06, "loss": 0.0006, "step": 10508 }, { "epoch": 2.772457459438069, "grad_norm": 0.4175524115562439, "learning_rate": 4.819836454761276e-06, "loss": 0.0015, "step": 10510 }, { "epoch": 2.772985094314734, "grad_norm": 0.019371848553419113, "learning_rate": 4.819484744570474e-06, "loss": 0.007, "step": 10512 }, { "epoch": 2.7735127291913995, "grad_norm": 0.12818270921707153, "learning_rate": 4.819133034379671e-06, "loss": 0.0007, "step": 10514 }, { "epoch": 2.774040364068065, "grad_norm": 0.12369377166032791, "learning_rate": 4.818781324188869e-06, "loss": 0.0007, "step": 10516 }, { "epoch": 2.7745679989447303, "grad_norm": 0.42907610535621643, "learning_rate": 4.818429613998066e-06, "loss": 0.0104, "step": 10518 }, { "epoch": 2.7750956338213957, "grad_norm": 0.011454278603196144, "learning_rate": 4.818077903807263e-06, "loss": 0.0004, "step": 10520 }, { "epoch": 2.775623268698061, "grad_norm": 0.08180630952119827, "learning_rate": 4.8177261936164605e-06, "loss": 0.0007, "step": 10522 }, { "epoch": 2.7761509035747265, "grad_norm": 0.47178834676742554, "learning_rate": 4.8173744834256575e-06, "loss": 0.0051, "step": 10524 }, { "epoch": 2.7766785384513915, "grad_norm": 0.19103558361530304, "learning_rate": 4.817022773234854e-06, "loss": 0.0189, "step": 10526 }, { "epoch": 2.777206173328057, "grad_norm": 0.3562657833099365, "learning_rate": 4.816671063044052e-06, "loss": 0.0022, "step": 10528 }, { "epoch": 2.7777338082047223, "grad_norm": 0.2691448926925659, "learning_rate": 4.816319352853249e-06, "loss": 0.0017, "step": 10530 }, { "epoch": 2.7782614430813877, "grad_norm": 0.06094041094183922, "learning_rate": 4.815967642662446e-06, "loss": 0.0009, "step": 10532 }, { "epoch": 2.778789077958053, "grad_norm": 0.4529893696308136, "learning_rate": 4.815615932471643e-06, "loss": 0.0014, "step": 10534 }, { "epoch": 2.7793167128347185, "grad_norm": 0.46141567826271057, "learning_rate": 4.815264222280841e-06, "loss": 0.0048, "step": 10536 }, { "epoch": 2.779844347711384, "grad_norm": 0.02320549637079239, "learning_rate": 4.814912512090038e-06, "loss": 0.0004, "step": 10538 }, { "epoch": 2.780371982588049, "grad_norm": 0.018376590684056282, "learning_rate": 4.814560801899235e-06, "loss": 0.0005, "step": 10540 }, { "epoch": 2.7808996174647143, "grad_norm": 0.1438686102628708, "learning_rate": 4.8142090917084326e-06, "loss": 0.0077, "step": 10542 }, { "epoch": 2.7814272523413797, "grad_norm": 0.01834951899945736, "learning_rate": 4.8138573815176295e-06, "loss": 0.0004, "step": 10544 }, { "epoch": 2.781954887218045, "grad_norm": 0.48092037439346313, "learning_rate": 4.813505671326827e-06, "loss": 0.012, "step": 10546 }, { "epoch": 2.7824825220947105, "grad_norm": 0.033301468938589096, "learning_rate": 4.8131539611360234e-06, "loss": 0.0004, "step": 10548 }, { "epoch": 2.783010156971376, "grad_norm": 0.36238446831703186, "learning_rate": 4.812802250945221e-06, "loss": 0.0014, "step": 10550 }, { "epoch": 2.7835377918480413, "grad_norm": 0.7314324378967285, "learning_rate": 4.812450540754418e-06, "loss": 0.011, "step": 10552 }, { "epoch": 2.7840654267247062, "grad_norm": 0.015490034595131874, "learning_rate": 4.812098830563616e-06, "loss": 0.0034, "step": 10554 }, { "epoch": 2.784593061601372, "grad_norm": 0.3283213675022125, "learning_rate": 4.811747120372813e-06, "loss": 0.0058, "step": 10556 }, { "epoch": 2.785120696478037, "grad_norm": 0.10966182500123978, "learning_rate": 4.81139541018201e-06, "loss": 0.004, "step": 10558 }, { "epoch": 2.7856483313547025, "grad_norm": 0.0916728526353836, "learning_rate": 4.811043699991208e-06, "loss": 0.002, "step": 10560 }, { "epoch": 2.786175966231368, "grad_norm": 0.026879366487264633, "learning_rate": 4.810691989800405e-06, "loss": 0.0004, "step": 10562 }, { "epoch": 2.7867036011080333, "grad_norm": 0.04057028144598007, "learning_rate": 4.810340279609602e-06, "loss": 0.0013, "step": 10564 }, { "epoch": 2.7872312359846987, "grad_norm": 0.031735699623823166, "learning_rate": 4.809988569418799e-06, "loss": 0.0011, "step": 10566 }, { "epoch": 2.787758870861364, "grad_norm": 0.42466503381729126, "learning_rate": 4.809636859227996e-06, "loss": 0.012, "step": 10568 }, { "epoch": 2.7882865057380295, "grad_norm": 0.03858199343085289, "learning_rate": 4.809285149037193e-06, "loss": 0.0062, "step": 10570 }, { "epoch": 2.7888141406146945, "grad_norm": 0.022293945774435997, "learning_rate": 4.80893343884639e-06, "loss": 0.0005, "step": 10572 }, { "epoch": 2.78934177549136, "grad_norm": 0.41989800333976746, "learning_rate": 4.808581728655588e-06, "loss": 0.005, "step": 10574 }, { "epoch": 2.7898694103680253, "grad_norm": 0.02784927375614643, "learning_rate": 4.808230018464785e-06, "loss": 0.0013, "step": 10576 }, { "epoch": 2.7903970452446907, "grad_norm": 0.8880260586738586, "learning_rate": 4.807878308273982e-06, "loss": 0.0082, "step": 10578 }, { "epoch": 2.790924680121356, "grad_norm": 0.006652330979704857, "learning_rate": 4.80752659808318e-06, "loss": 0.0016, "step": 10580 }, { "epoch": 2.7914523149980215, "grad_norm": 0.6108080148696899, "learning_rate": 4.807174887892377e-06, "loss": 0.0191, "step": 10582 }, { "epoch": 2.791979949874687, "grad_norm": 0.30102622509002686, "learning_rate": 4.8068231777015745e-06, "loss": 0.0017, "step": 10584 }, { "epoch": 2.792507584751352, "grad_norm": 0.011573678813874722, "learning_rate": 4.8064714675107715e-06, "loss": 0.005, "step": 10586 }, { "epoch": 2.7930352196280173, "grad_norm": 0.6722677946090698, "learning_rate": 4.806119757319968e-06, "loss": 0.0095, "step": 10588 }, { "epoch": 2.7935628545046827, "grad_norm": 0.3082790672779083, "learning_rate": 4.805768047129166e-06, "loss": 0.0042, "step": 10590 }, { "epoch": 2.794090489381348, "grad_norm": 0.0676460936665535, "learning_rate": 4.805416336938363e-06, "loss": 0.0007, "step": 10592 }, { "epoch": 2.7946181242580135, "grad_norm": 0.3037915527820587, "learning_rate": 4.80506462674756e-06, "loss": 0.0024, "step": 10594 }, { "epoch": 2.795145759134679, "grad_norm": 0.2548559904098511, "learning_rate": 4.804712916556757e-06, "loss": 0.0024, "step": 10596 }, { "epoch": 2.7956733940113443, "grad_norm": 0.048977941274642944, "learning_rate": 4.804361206365955e-06, "loss": 0.0106, "step": 10598 }, { "epoch": 2.7962010288880093, "grad_norm": 0.6038398742675781, "learning_rate": 4.804009496175152e-06, "loss": 0.0061, "step": 10600 }, { "epoch": 2.796728663764675, "grad_norm": 0.048743292689323425, "learning_rate": 4.803657785984349e-06, "loss": 0.0008, "step": 10602 }, { "epoch": 2.79725629864134, "grad_norm": 1.2162539958953857, "learning_rate": 4.8033060757935466e-06, "loss": 0.0153, "step": 10604 }, { "epoch": 2.7977839335180055, "grad_norm": 0.05037900060415268, "learning_rate": 4.8029543656027435e-06, "loss": 0.0006, "step": 10606 }, { "epoch": 2.798311568394671, "grad_norm": 0.04323781281709671, "learning_rate": 4.8026026554119405e-06, "loss": 0.0005, "step": 10608 }, { "epoch": 2.7988392032713363, "grad_norm": 0.019182905554771423, "learning_rate": 4.8022509452211374e-06, "loss": 0.0005, "step": 10610 }, { "epoch": 2.7993668381480017, "grad_norm": 0.10099935531616211, "learning_rate": 4.801899235030335e-06, "loss": 0.0007, "step": 10612 }, { "epoch": 2.799894473024667, "grad_norm": 0.17642615735530853, "learning_rate": 4.801547524839533e-06, "loss": 0.0067, "step": 10614 }, { "epoch": 2.8004221079013325, "grad_norm": 0.3777901530265808, "learning_rate": 4.801195814648729e-06, "loss": 0.0057, "step": 10616 }, { "epoch": 2.8009497427779975, "grad_norm": 0.0077697704546153545, "learning_rate": 4.800844104457927e-06, "loss": 0.0006, "step": 10618 }, { "epoch": 2.801477377654663, "grad_norm": 0.011165780946612358, "learning_rate": 4.800492394267124e-06, "loss": 0.0005, "step": 10620 }, { "epoch": 2.8020050125313283, "grad_norm": 0.17711885273456573, "learning_rate": 4.800140684076322e-06, "loss": 0.0058, "step": 10622 }, { "epoch": 2.8025326474079937, "grad_norm": 0.14898285269737244, "learning_rate": 4.799788973885519e-06, "loss": 0.0094, "step": 10624 }, { "epoch": 2.803060282284659, "grad_norm": 0.1109338328242302, "learning_rate": 4.799437263694716e-06, "loss": 0.001, "step": 10626 }, { "epoch": 2.8035879171613245, "grad_norm": 0.06816641241312027, "learning_rate": 4.799085553503913e-06, "loss": 0.0096, "step": 10628 }, { "epoch": 2.80411555203799, "grad_norm": 0.571688711643219, "learning_rate": 4.7987338433131095e-06, "loss": 0.0021, "step": 10630 }, { "epoch": 2.804643186914655, "grad_norm": 0.046021558344364166, "learning_rate": 4.798382133122307e-06, "loss": 0.0005, "step": 10632 }, { "epoch": 2.8051708217913203, "grad_norm": 0.18670737743377686, "learning_rate": 4.798030422931504e-06, "loss": 0.0117, "step": 10634 }, { "epoch": 2.8056984566679857, "grad_norm": 0.10728882998228073, "learning_rate": 4.797678712740702e-06, "loss": 0.0009, "step": 10636 }, { "epoch": 2.806226091544651, "grad_norm": 0.04425453394651413, "learning_rate": 4.797327002549899e-06, "loss": 0.0008, "step": 10638 }, { "epoch": 2.8067537264213165, "grad_norm": 0.012011186219751835, "learning_rate": 4.796975292359096e-06, "loss": 0.0005, "step": 10640 }, { "epoch": 2.807281361297982, "grad_norm": 0.008379620499908924, "learning_rate": 4.796623582168294e-06, "loss": 0.0099, "step": 10642 }, { "epoch": 2.8078089961746473, "grad_norm": 0.04673776030540466, "learning_rate": 4.796271871977491e-06, "loss": 0.0015, "step": 10644 }, { "epoch": 2.8083366310513123, "grad_norm": 0.0141600975766778, "learning_rate": 4.795920161786688e-06, "loss": 0.0008, "step": 10646 }, { "epoch": 2.808864265927978, "grad_norm": 0.02133071795105934, "learning_rate": 4.7955684515958854e-06, "loss": 0.0005, "step": 10648 }, { "epoch": 2.809391900804643, "grad_norm": 0.3553774952888489, "learning_rate": 4.795216741405082e-06, "loss": 0.0055, "step": 10650 }, { "epoch": 2.8099195356813085, "grad_norm": 0.04229690134525299, "learning_rate": 4.79486503121428e-06, "loss": 0.0004, "step": 10652 }, { "epoch": 2.810447170557974, "grad_norm": 0.0806170254945755, "learning_rate": 4.794513321023476e-06, "loss": 0.0007, "step": 10654 }, { "epoch": 2.8109748054346393, "grad_norm": 0.45482245087623596, "learning_rate": 4.794161610832674e-06, "loss": 0.0015, "step": 10656 }, { "epoch": 2.8115024403113047, "grad_norm": 0.3464703857898712, "learning_rate": 4.793809900641871e-06, "loss": 0.0087, "step": 10658 }, { "epoch": 2.8120300751879697, "grad_norm": 0.2325928509235382, "learning_rate": 4.793458190451068e-06, "loss": 0.0015, "step": 10660 }, { "epoch": 2.8125577100646355, "grad_norm": 0.005227569956332445, "learning_rate": 4.793106480260266e-06, "loss": 0.0003, "step": 10662 }, { "epoch": 2.8130853449413005, "grad_norm": 0.2564666271209717, "learning_rate": 4.792754770069463e-06, "loss": 0.0043, "step": 10664 }, { "epoch": 2.813612979817966, "grad_norm": 0.012380906380712986, "learning_rate": 4.7924030598786606e-06, "loss": 0.0004, "step": 10666 }, { "epoch": 2.8141406146946313, "grad_norm": 0.632016122341156, "learning_rate": 4.792051349687857e-06, "loss": 0.0044, "step": 10668 }, { "epoch": 2.8146682495712967, "grad_norm": 0.23499496281147003, "learning_rate": 4.7916996394970545e-06, "loss": 0.0059, "step": 10670 }, { "epoch": 2.815195884447962, "grad_norm": 0.007811129558831453, "learning_rate": 4.791347929306252e-06, "loss": 0.003, "step": 10672 }, { "epoch": 2.8157235193246275, "grad_norm": 0.11752764880657196, "learning_rate": 4.790996219115449e-06, "loss": 0.0005, "step": 10674 }, { "epoch": 2.816251154201293, "grad_norm": 0.06535729765892029, "learning_rate": 4.790644508924646e-06, "loss": 0.0005, "step": 10676 }, { "epoch": 2.816778789077958, "grad_norm": 0.024555686861276627, "learning_rate": 4.790292798733843e-06, "loss": 0.0006, "step": 10678 }, { "epoch": 2.8173064239546233, "grad_norm": 0.02155607007443905, "learning_rate": 4.789941088543041e-06, "loss": 0.0004, "step": 10680 }, { "epoch": 2.8178340588312887, "grad_norm": 0.14196856319904327, "learning_rate": 4.789589378352238e-06, "loss": 0.0105, "step": 10682 }, { "epoch": 2.818361693707954, "grad_norm": 0.09242088347673416, "learning_rate": 4.789237668161435e-06, "loss": 0.0097, "step": 10684 }, { "epoch": 2.8188893285846195, "grad_norm": 0.038561660796403885, "learning_rate": 4.788885957970633e-06, "loss": 0.0005, "step": 10686 }, { "epoch": 2.819416963461285, "grad_norm": 0.15157058835029602, "learning_rate": 4.78853424777983e-06, "loss": 0.0007, "step": 10688 }, { "epoch": 2.8199445983379503, "grad_norm": 0.013876945711672306, "learning_rate": 4.7881825375890265e-06, "loss": 0.012, "step": 10690 }, { "epoch": 2.8204722332146153, "grad_norm": 0.056117359548807144, "learning_rate": 4.7878308273982235e-06, "loss": 0.0008, "step": 10692 }, { "epoch": 2.8209998680912807, "grad_norm": 0.03293704614043236, "learning_rate": 4.787479117207421e-06, "loss": 0.0087, "step": 10694 }, { "epoch": 2.821527502967946, "grad_norm": 0.2104954868555069, "learning_rate": 4.787127407016619e-06, "loss": 0.0009, "step": 10696 }, { "epoch": 2.8220551378446115, "grad_norm": 0.01564665324985981, "learning_rate": 4.786775696825815e-06, "loss": 0.0008, "step": 10698 }, { "epoch": 2.822582772721277, "grad_norm": 0.24708552658557892, "learning_rate": 4.786423986635013e-06, "loss": 0.0012, "step": 10700 }, { "epoch": 2.8231104075979423, "grad_norm": 0.3835621476173401, "learning_rate": 4.78607227644421e-06, "loss": 0.0122, "step": 10702 }, { "epoch": 2.8236380424746077, "grad_norm": 0.31823551654815674, "learning_rate": 4.785720566253408e-06, "loss": 0.0028, "step": 10704 }, { "epoch": 2.8241656773512727, "grad_norm": 0.34683823585510254, "learning_rate": 4.785368856062605e-06, "loss": 0.0021, "step": 10706 }, { "epoch": 2.8246933122279385, "grad_norm": 0.5379084944725037, "learning_rate": 4.785017145871802e-06, "loss": 0.0107, "step": 10708 }, { "epoch": 2.8252209471046035, "grad_norm": 0.18338236212730408, "learning_rate": 4.7846654356809994e-06, "loss": 0.001, "step": 10710 }, { "epoch": 2.825748581981269, "grad_norm": 0.7183147668838501, "learning_rate": 4.784313725490196e-06, "loss": 0.0049, "step": 10712 }, { "epoch": 2.8262762168579343, "grad_norm": 0.2857557535171509, "learning_rate": 4.783962015299393e-06, "loss": 0.0029, "step": 10714 }, { "epoch": 2.8268038517345997, "grad_norm": 0.011999053880572319, "learning_rate": 4.78361030510859e-06, "loss": 0.0004, "step": 10716 }, { "epoch": 2.827331486611265, "grad_norm": 0.05902715027332306, "learning_rate": 4.783258594917788e-06, "loss": 0.0054, "step": 10718 }, { "epoch": 2.8278591214879305, "grad_norm": 0.16034896671772003, "learning_rate": 4.782906884726985e-06, "loss": 0.0011, "step": 10720 }, { "epoch": 2.828386756364596, "grad_norm": 0.021052634343504906, "learning_rate": 4.782555174536182e-06, "loss": 0.0006, "step": 10722 }, { "epoch": 2.828914391241261, "grad_norm": 0.034812185913324356, "learning_rate": 4.78220346434538e-06, "loss": 0.0007, "step": 10724 }, { "epoch": 2.8294420261179263, "grad_norm": 0.03916877508163452, "learning_rate": 4.781851754154577e-06, "loss": 0.0007, "step": 10726 }, { "epoch": 2.8299696609945917, "grad_norm": 0.046678051352500916, "learning_rate": 4.781500043963774e-06, "loss": 0.0006, "step": 10728 }, { "epoch": 2.830497295871257, "grad_norm": 0.6033236384391785, "learning_rate": 4.7811483337729715e-06, "loss": 0.01, "step": 10730 }, { "epoch": 2.8310249307479225, "grad_norm": 0.3210005462169647, "learning_rate": 4.7807966235821685e-06, "loss": 0.0018, "step": 10732 }, { "epoch": 2.831552565624588, "grad_norm": 0.31373128294944763, "learning_rate": 4.780444913391366e-06, "loss": 0.0037, "step": 10734 }, { "epoch": 2.8320802005012533, "grad_norm": 0.26401183009147644, "learning_rate": 4.780093203200562e-06, "loss": 0.0073, "step": 10736 }, { "epoch": 2.8326078353779183, "grad_norm": 0.07133092731237411, "learning_rate": 4.77974149300976e-06, "loss": 0.0005, "step": 10738 }, { "epoch": 2.8331354702545837, "grad_norm": 0.02483743242919445, "learning_rate": 4.779389782818957e-06, "loss": 0.0004, "step": 10740 }, { "epoch": 2.833663105131249, "grad_norm": 0.030602850019931793, "learning_rate": 4.779038072628155e-06, "loss": 0.0004, "step": 10742 }, { "epoch": 2.8341907400079145, "grad_norm": 0.010919704101979733, "learning_rate": 4.778686362437352e-06, "loss": 0.0006, "step": 10744 }, { "epoch": 2.83471837488458, "grad_norm": 0.013328281231224537, "learning_rate": 4.778334652246549e-06, "loss": 0.0004, "step": 10746 }, { "epoch": 2.8352460097612453, "grad_norm": 0.05466952919960022, "learning_rate": 4.777982942055747e-06, "loss": 0.0006, "step": 10748 }, { "epoch": 2.8357736446379107, "grad_norm": 0.045389324426651, "learning_rate": 4.777631231864943e-06, "loss": 0.0007, "step": 10750 }, { "epoch": 2.8363012795145757, "grad_norm": 0.8335604667663574, "learning_rate": 4.7772795216741405e-06, "loss": 0.0079, "step": 10752 }, { "epoch": 2.8368289143912415, "grad_norm": 0.030262574553489685, "learning_rate": 4.776927811483338e-06, "loss": 0.0005, "step": 10754 }, { "epoch": 2.8373565492679065, "grad_norm": 0.07914654165506363, "learning_rate": 4.776576101292535e-06, "loss": 0.0006, "step": 10756 }, { "epoch": 2.837884184144572, "grad_norm": 0.020057573914527893, "learning_rate": 4.776224391101732e-06, "loss": 0.0004, "step": 10758 }, { "epoch": 2.8384118190212373, "grad_norm": 0.2255978137254715, "learning_rate": 4.775872680910929e-06, "loss": 0.0037, "step": 10760 }, { "epoch": 2.8389394538979027, "grad_norm": 0.09341704100370407, "learning_rate": 4.775520970720127e-06, "loss": 0.0036, "step": 10762 }, { "epoch": 2.839467088774568, "grad_norm": 0.6998338103294373, "learning_rate": 4.775169260529324e-06, "loss": 0.0033, "step": 10764 }, { "epoch": 2.8399947236512335, "grad_norm": 0.19599370658397675, "learning_rate": 4.774817550338521e-06, "loss": 0.019, "step": 10766 }, { "epoch": 2.840522358527899, "grad_norm": 0.21660743653774261, "learning_rate": 4.774465840147719e-06, "loss": 0.001, "step": 10768 }, { "epoch": 2.841049993404564, "grad_norm": 0.32059311866760254, "learning_rate": 4.774114129956916e-06, "loss": 0.0093, "step": 10770 }, { "epoch": 2.8415776282812293, "grad_norm": 0.023105960339307785, "learning_rate": 4.7737624197661134e-06, "loss": 0.0031, "step": 10772 }, { "epoch": 2.8421052631578947, "grad_norm": 0.13214616477489471, "learning_rate": 4.7734107095753095e-06, "loss": 0.0013, "step": 10774 }, { "epoch": 2.84263289803456, "grad_norm": 0.010286534205079079, "learning_rate": 4.773058999384507e-06, "loss": 0.0018, "step": 10776 }, { "epoch": 2.8431605329112255, "grad_norm": 0.3653154969215393, "learning_rate": 4.772707289193704e-06, "loss": 0.0061, "step": 10778 }, { "epoch": 2.843688167787891, "grad_norm": 0.051031820476055145, "learning_rate": 4.772355579002901e-06, "loss": 0.0035, "step": 10780 }, { "epoch": 2.8442158026645563, "grad_norm": 0.01053344551473856, "learning_rate": 4.772003868812099e-06, "loss": 0.0023, "step": 10782 }, { "epoch": 2.8447434375412213, "grad_norm": 0.051522113382816315, "learning_rate": 4.771652158621296e-06, "loss": 0.0019, "step": 10784 }, { "epoch": 2.8452710724178867, "grad_norm": 0.07844672352075577, "learning_rate": 4.771300448430494e-06, "loss": 0.0005, "step": 10786 }, { "epoch": 2.845798707294552, "grad_norm": 0.006690465845167637, "learning_rate": 4.770948738239691e-06, "loss": 0.0065, "step": 10788 }, { "epoch": 2.8463263421712175, "grad_norm": 0.006794288754463196, "learning_rate": 4.770597028048888e-06, "loss": 0.0004, "step": 10790 }, { "epoch": 2.846853977047883, "grad_norm": 0.03191155567765236, "learning_rate": 4.7702453178580855e-06, "loss": 0.0004, "step": 10792 }, { "epoch": 2.8473816119245483, "grad_norm": 0.12082222104072571, "learning_rate": 4.7698936076672825e-06, "loss": 0.005, "step": 10794 }, { "epoch": 2.8479092468012137, "grad_norm": 0.007245776243507862, "learning_rate": 4.769541897476479e-06, "loss": 0.0009, "step": 10796 }, { "epoch": 2.8484368816778787, "grad_norm": 0.010937990620732307, "learning_rate": 4.769190187285676e-06, "loss": 0.0004, "step": 10798 }, { "epoch": 2.8489645165545445, "grad_norm": 0.16129963099956512, "learning_rate": 4.768838477094874e-06, "loss": 0.0077, "step": 10800 }, { "epoch": 2.8494921514312095, "grad_norm": 0.058266863226890564, "learning_rate": 4.768486766904071e-06, "loss": 0.0005, "step": 10802 }, { "epoch": 2.850019786307875, "grad_norm": 0.03553552180528641, "learning_rate": 4.768135056713268e-06, "loss": 0.0013, "step": 10804 }, { "epoch": 2.8505474211845403, "grad_norm": 0.3816991448402405, "learning_rate": 4.767783346522466e-06, "loss": 0.0008, "step": 10806 }, { "epoch": 2.8510750560612057, "grad_norm": 0.01120362151414156, "learning_rate": 4.767431636331663e-06, "loss": 0.0036, "step": 10808 }, { "epoch": 2.851602690937871, "grad_norm": 0.4600928723812103, "learning_rate": 4.76707992614086e-06, "loss": 0.0178, "step": 10810 }, { "epoch": 2.852130325814536, "grad_norm": 0.4013015329837799, "learning_rate": 4.7667282159500576e-06, "loss": 0.0086, "step": 10812 }, { "epoch": 2.852657960691202, "grad_norm": 0.013363050296902657, "learning_rate": 4.7663765057592545e-06, "loss": 0.0006, "step": 10814 }, { "epoch": 2.853185595567867, "grad_norm": 0.051180001348257065, "learning_rate": 4.766024795568452e-06, "loss": 0.0082, "step": 10816 }, { "epoch": 2.8537132304445323, "grad_norm": 0.006491066422313452, "learning_rate": 4.765673085377648e-06, "loss": 0.0003, "step": 10818 }, { "epoch": 2.8542408653211977, "grad_norm": 0.14047090709209442, "learning_rate": 4.765321375186846e-06, "loss": 0.0043, "step": 10820 }, { "epoch": 2.854768500197863, "grad_norm": 0.014142457395792007, "learning_rate": 4.764969664996043e-06, "loss": 0.0004, "step": 10822 }, { "epoch": 2.8552961350745285, "grad_norm": 0.38438963890075684, "learning_rate": 4.764617954805241e-06, "loss": 0.0096, "step": 10824 }, { "epoch": 2.855823769951194, "grad_norm": 0.014159275218844414, "learning_rate": 4.764266244614438e-06, "loss": 0.0005, "step": 10826 }, { "epoch": 2.8563514048278593, "grad_norm": 0.8925164937973022, "learning_rate": 4.763914534423635e-06, "loss": 0.0034, "step": 10828 }, { "epoch": 2.8568790397045243, "grad_norm": 0.015104108490049839, "learning_rate": 4.763562824232833e-06, "loss": 0.0039, "step": 10830 }, { "epoch": 2.8574066745811897, "grad_norm": 0.03749639913439751, "learning_rate": 4.76321111404203e-06, "loss": 0.0006, "step": 10832 }, { "epoch": 2.857934309457855, "grad_norm": 0.34081125259399414, "learning_rate": 4.762859403851227e-06, "loss": 0.0039, "step": 10834 }, { "epoch": 2.8584619443345205, "grad_norm": 0.015464435331523418, "learning_rate": 4.7625076936604235e-06, "loss": 0.0005, "step": 10836 }, { "epoch": 2.858989579211186, "grad_norm": 0.01804499141871929, "learning_rate": 4.762155983469621e-06, "loss": 0.0013, "step": 10838 }, { "epoch": 2.8595172140878513, "grad_norm": 0.011751752346754074, "learning_rate": 4.761804273278818e-06, "loss": 0.0004, "step": 10840 }, { "epoch": 2.8600448489645167, "grad_norm": 0.2483452558517456, "learning_rate": 4.761452563088015e-06, "loss": 0.0069, "step": 10842 }, { "epoch": 2.8605724838411817, "grad_norm": 0.013573190197348595, "learning_rate": 4.761100852897213e-06, "loss": 0.0005, "step": 10844 }, { "epoch": 2.861100118717847, "grad_norm": 1.6389539241790771, "learning_rate": 4.76074914270641e-06, "loss": 0.0023, "step": 10846 }, { "epoch": 2.8616277535945125, "grad_norm": 0.016754545271396637, "learning_rate": 4.760397432515607e-06, "loss": 0.0004, "step": 10848 }, { "epoch": 2.862155388471178, "grad_norm": 0.02377566322684288, "learning_rate": 4.760045722324805e-06, "loss": 0.0006, "step": 10850 }, { "epoch": 2.8626830233478433, "grad_norm": 0.02088453620672226, "learning_rate": 4.759694012134002e-06, "loss": 0.0005, "step": 10852 }, { "epoch": 2.8632106582245087, "grad_norm": 0.32039594650268555, "learning_rate": 4.7593423019431995e-06, "loss": 0.005, "step": 10854 }, { "epoch": 2.863738293101174, "grad_norm": 0.16887354850769043, "learning_rate": 4.758990591752396e-06, "loss": 0.0013, "step": 10856 }, { "epoch": 2.864265927977839, "grad_norm": 0.04720665514469147, "learning_rate": 4.758638881561593e-06, "loss": 0.0004, "step": 10858 }, { "epoch": 2.864793562854505, "grad_norm": 0.17322807013988495, "learning_rate": 4.75828717137079e-06, "loss": 0.0011, "step": 10860 }, { "epoch": 2.86532119773117, "grad_norm": 0.1707858443260193, "learning_rate": 4.757935461179988e-06, "loss": 0.0011, "step": 10862 }, { "epoch": 2.8658488326078353, "grad_norm": 0.5664688348770142, "learning_rate": 4.757583750989185e-06, "loss": 0.0055, "step": 10864 }, { "epoch": 2.8663764674845007, "grad_norm": 0.26248684525489807, "learning_rate": 4.757232040798382e-06, "loss": 0.0012, "step": 10866 }, { "epoch": 2.866904102361166, "grad_norm": 0.038132764399051666, "learning_rate": 4.75688033060758e-06, "loss": 0.0007, "step": 10868 }, { "epoch": 2.8674317372378315, "grad_norm": 0.009340216405689716, "learning_rate": 4.756528620416777e-06, "loss": 0.0045, "step": 10870 }, { "epoch": 2.867959372114497, "grad_norm": 0.01774022914469242, "learning_rate": 4.756176910225974e-06, "loss": 0.0004, "step": 10872 }, { "epoch": 2.8684870069911623, "grad_norm": 0.016666743904352188, "learning_rate": 4.7558252000351716e-06, "loss": 0.0004, "step": 10874 }, { "epoch": 2.8690146418678273, "grad_norm": 0.005842301528900862, "learning_rate": 4.7554734898443685e-06, "loss": 0.0024, "step": 10876 }, { "epoch": 2.8695422767444927, "grad_norm": 0.017026396468281746, "learning_rate": 4.7551217796535655e-06, "loss": 0.0004, "step": 10878 }, { "epoch": 2.870069911621158, "grad_norm": 0.021731752902269363, "learning_rate": 4.754770069462762e-06, "loss": 0.0004, "step": 10880 }, { "epoch": 2.8705975464978235, "grad_norm": 0.012388339266180992, "learning_rate": 4.75441835927196e-06, "loss": 0.0043, "step": 10882 }, { "epoch": 2.871125181374489, "grad_norm": 0.019603274762630463, "learning_rate": 4.754066649081157e-06, "loss": 0.0012, "step": 10884 }, { "epoch": 2.8716528162511543, "grad_norm": 0.03024328500032425, "learning_rate": 4.753714938890354e-06, "loss": 0.0004, "step": 10886 }, { "epoch": 2.8721804511278197, "grad_norm": 0.014731322415173054, "learning_rate": 4.753363228699552e-06, "loss": 0.0004, "step": 10888 }, { "epoch": 2.8727080860044847, "grad_norm": 0.01083812490105629, "learning_rate": 4.753011518508749e-06, "loss": 0.0004, "step": 10890 }, { "epoch": 2.87323572088115, "grad_norm": 0.018990399315953255, "learning_rate": 4.752659808317947e-06, "loss": 0.0141, "step": 10892 }, { "epoch": 2.8737633557578155, "grad_norm": 0.009439426474273205, "learning_rate": 4.752308098127143e-06, "loss": 0.0004, "step": 10894 }, { "epoch": 2.874290990634481, "grad_norm": 0.019383085891604424, "learning_rate": 4.7519563879363406e-06, "loss": 0.0004, "step": 10896 }, { "epoch": 2.8748186255111463, "grad_norm": 0.21265174448490143, "learning_rate": 4.751604677745538e-06, "loss": 0.001, "step": 10898 }, { "epoch": 2.8753462603878117, "grad_norm": 0.04677991196513176, "learning_rate": 4.7512529675547345e-06, "loss": 0.004, "step": 10900 }, { "epoch": 2.875873895264477, "grad_norm": 0.003995517268776894, "learning_rate": 4.750901257363932e-06, "loss": 0.0009, "step": 10902 }, { "epoch": 2.876401530141142, "grad_norm": 0.03867164999246597, "learning_rate": 4.750549547173129e-06, "loss": 0.0092, "step": 10904 }, { "epoch": 2.876929165017808, "grad_norm": 0.007451116573065519, "learning_rate": 4.750197836982327e-06, "loss": 0.0003, "step": 10906 }, { "epoch": 2.877456799894473, "grad_norm": 0.35382893681526184, "learning_rate": 4.749846126791524e-06, "loss": 0.0012, "step": 10908 }, { "epoch": 2.8779844347711383, "grad_norm": 0.0546235591173172, "learning_rate": 4.749494416600721e-06, "loss": 0.0012, "step": 10910 }, { "epoch": 2.8785120696478037, "grad_norm": 0.024803390726447105, "learning_rate": 4.749142706409919e-06, "loss": 0.0004, "step": 10912 }, { "epoch": 2.879039704524469, "grad_norm": 0.03088868036866188, "learning_rate": 4.748790996219116e-06, "loss": 0.0011, "step": 10914 }, { "epoch": 2.8795673394011345, "grad_norm": 0.05155016854405403, "learning_rate": 4.748439286028313e-06, "loss": 0.0005, "step": 10916 }, { "epoch": 2.8800949742778, "grad_norm": 0.010161933489143848, "learning_rate": 4.74808757583751e-06, "loss": 0.0008, "step": 10918 }, { "epoch": 2.8806226091544653, "grad_norm": 0.499163955450058, "learning_rate": 4.747735865646707e-06, "loss": 0.0119, "step": 10920 }, { "epoch": 2.8811502440311303, "grad_norm": 0.32854050397872925, "learning_rate": 4.747384155455904e-06, "loss": 0.0206, "step": 10922 }, { "epoch": 2.8816778789077957, "grad_norm": 0.09370589256286621, "learning_rate": 4.747032445265101e-06, "loss": 0.0005, "step": 10924 }, { "epoch": 2.882205513784461, "grad_norm": 0.016460591927170753, "learning_rate": 4.746680735074299e-06, "loss": 0.0003, "step": 10926 }, { "epoch": 2.8827331486611265, "grad_norm": 0.012944209389388561, "learning_rate": 4.746329024883496e-06, "loss": 0.0006, "step": 10928 }, { "epoch": 2.883260783537792, "grad_norm": 0.06943453848361969, "learning_rate": 4.745977314692693e-06, "loss": 0.0006, "step": 10930 }, { "epoch": 2.8837884184144573, "grad_norm": 0.008838536217808723, "learning_rate": 4.745625604501891e-06, "loss": 0.0048, "step": 10932 }, { "epoch": 2.8843160532911227, "grad_norm": 0.148731991648674, "learning_rate": 4.745273894311088e-06, "loss": 0.0092, "step": 10934 }, { "epoch": 2.8848436881677877, "grad_norm": 0.03035176359117031, "learning_rate": 4.7449221841202855e-06, "loss": 0.0082, "step": 10936 }, { "epoch": 2.885371323044453, "grad_norm": 0.021204672753810883, "learning_rate": 4.744570473929482e-06, "loss": 0.0004, "step": 10938 }, { "epoch": 2.8858989579211185, "grad_norm": 0.00585251534357667, "learning_rate": 4.7442187637386795e-06, "loss": 0.0006, "step": 10940 }, { "epoch": 2.886426592797784, "grad_norm": 0.11383392661809921, "learning_rate": 4.743867053547876e-06, "loss": 0.0031, "step": 10942 }, { "epoch": 2.8869542276744493, "grad_norm": 0.07353951781988144, "learning_rate": 4.743515343357074e-06, "loss": 0.0006, "step": 10944 }, { "epoch": 2.8874818625511147, "grad_norm": 0.007572774309664965, "learning_rate": 4.743163633166271e-06, "loss": 0.0004, "step": 10946 }, { "epoch": 2.88800949742778, "grad_norm": 0.01356425415724516, "learning_rate": 4.742811922975468e-06, "loss": 0.0007, "step": 10948 }, { "epoch": 2.888537132304445, "grad_norm": 0.030475981533527374, "learning_rate": 4.742460212784666e-06, "loss": 0.0004, "step": 10950 }, { "epoch": 2.889064767181111, "grad_norm": 0.01688847690820694, "learning_rate": 4.742108502593863e-06, "loss": 0.0004, "step": 10952 }, { "epoch": 2.889592402057776, "grad_norm": 0.03608974441885948, "learning_rate": 4.74175679240306e-06, "loss": 0.0006, "step": 10954 }, { "epoch": 2.8901200369344413, "grad_norm": 0.00440341979265213, "learning_rate": 4.741405082212258e-06, "loss": 0.0003, "step": 10956 }, { "epoch": 2.8906476718111067, "grad_norm": 0.023670492693781853, "learning_rate": 4.7410533720214546e-06, "loss": 0.0012, "step": 10958 }, { "epoch": 2.891175306687772, "grad_norm": 0.19244089722633362, "learning_rate": 4.7407016618306515e-06, "loss": 0.0023, "step": 10960 }, { "epoch": 2.8917029415644375, "grad_norm": 0.2821299433708191, "learning_rate": 4.7403499516398485e-06, "loss": 0.0024, "step": 10962 }, { "epoch": 2.8922305764411025, "grad_norm": 0.21792258322238922, "learning_rate": 4.739998241449046e-06, "loss": 0.0051, "step": 10964 }, { "epoch": 2.8927582113177683, "grad_norm": 0.011569777503609657, "learning_rate": 4.739646531258243e-06, "loss": 0.0088, "step": 10966 }, { "epoch": 2.8932858461944333, "grad_norm": 0.011609049513936043, "learning_rate": 4.73929482106744e-06, "loss": 0.0005, "step": 10968 }, { "epoch": 2.8938134810710987, "grad_norm": 0.07031752914190292, "learning_rate": 4.738943110876638e-06, "loss": 0.0044, "step": 10970 }, { "epoch": 2.894341115947764, "grad_norm": 0.031040549278259277, "learning_rate": 4.738591400685835e-06, "loss": 0.0005, "step": 10972 }, { "epoch": 2.8948687508244295, "grad_norm": 0.05020697042346001, "learning_rate": 4.738239690495033e-06, "loss": 0.0008, "step": 10974 }, { "epoch": 2.895396385701095, "grad_norm": 0.05532156303524971, "learning_rate": 4.737887980304229e-06, "loss": 0.0036, "step": 10976 }, { "epoch": 2.8959240205777603, "grad_norm": 0.6674305200576782, "learning_rate": 4.737536270113427e-06, "loss": 0.0068, "step": 10978 }, { "epoch": 2.8964516554544257, "grad_norm": 0.036332689225673676, "learning_rate": 4.737184559922624e-06, "loss": 0.0005, "step": 10980 }, { "epoch": 2.8969792903310907, "grad_norm": 0.16986320912837982, "learning_rate": 4.736832849731821e-06, "loss": 0.0011, "step": 10982 }, { "epoch": 2.897506925207756, "grad_norm": 0.1426360011100769, "learning_rate": 4.736481139541018e-06, "loss": 0.0092, "step": 10984 }, { "epoch": 2.8980345600844215, "grad_norm": 0.019572941586375237, "learning_rate": 4.736129429350215e-06, "loss": 0.0005, "step": 10986 }, { "epoch": 2.898562194961087, "grad_norm": 0.013704000972211361, "learning_rate": 4.735777719159413e-06, "loss": 0.0004, "step": 10988 }, { "epoch": 2.8990898298377523, "grad_norm": 0.39870601892471313, "learning_rate": 4.73542600896861e-06, "loss": 0.005, "step": 10990 }, { "epoch": 2.8996174647144177, "grad_norm": 0.42233213782310486, "learning_rate": 4.735074298777807e-06, "loss": 0.0051, "step": 10992 }, { "epoch": 2.900145099591083, "grad_norm": 0.06027986481785774, "learning_rate": 4.734722588587005e-06, "loss": 0.0007, "step": 10994 }, { "epoch": 2.900672734467748, "grad_norm": 0.1431030035018921, "learning_rate": 4.734370878396202e-06, "loss": 0.0009, "step": 10996 }, { "epoch": 2.901200369344414, "grad_norm": 0.018420005217194557, "learning_rate": 4.734019168205399e-06, "loss": 0.0004, "step": 10998 }, { "epoch": 2.901728004221079, "grad_norm": 0.14212188124656677, "learning_rate": 4.733667458014596e-06, "loss": 0.0053, "step": 11000 }, { "epoch": 2.9022556390977443, "grad_norm": 0.05308295041322708, "learning_rate": 4.7333157478237934e-06, "loss": 0.002, "step": 11002 }, { "epoch": 2.9027832739744097, "grad_norm": 0.6219021081924438, "learning_rate": 4.73296403763299e-06, "loss": 0.0025, "step": 11004 }, { "epoch": 2.903310908851075, "grad_norm": 0.014537220820784569, "learning_rate": 4.732612327442187e-06, "loss": 0.0004, "step": 11006 }, { "epoch": 2.9038385437277405, "grad_norm": 0.10956580191850662, "learning_rate": 4.732260617251385e-06, "loss": 0.0005, "step": 11008 }, { "epoch": 2.9043661786044055, "grad_norm": 0.023251067847013474, "learning_rate": 4.731908907060582e-06, "loss": 0.0072, "step": 11010 }, { "epoch": 2.9048938134810713, "grad_norm": 0.14681735634803772, "learning_rate": 4.73155719686978e-06, "loss": 0.0009, "step": 11012 }, { "epoch": 2.9054214483577363, "grad_norm": 0.25463244318962097, "learning_rate": 4.731205486678977e-06, "loss": 0.0127, "step": 11014 }, { "epoch": 2.9059490832344017, "grad_norm": 0.047546904534101486, "learning_rate": 4.730853776488174e-06, "loss": 0.0008, "step": 11016 }, { "epoch": 2.906476718111067, "grad_norm": 0.014280930161476135, "learning_rate": 4.730502066297372e-06, "loss": 0.0003, "step": 11018 }, { "epoch": 2.9070043529877325, "grad_norm": 0.029984276741743088, "learning_rate": 4.730150356106568e-06, "loss": 0.0005, "step": 11020 }, { "epoch": 2.907531987864398, "grad_norm": 0.48549407720565796, "learning_rate": 4.7297986459157655e-06, "loss": 0.0069, "step": 11022 }, { "epoch": 2.9080596227410633, "grad_norm": 0.07720290124416351, "learning_rate": 4.7294469357249625e-06, "loss": 0.0051, "step": 11024 }, { "epoch": 2.9085872576177287, "grad_norm": 0.013830486685037613, "learning_rate": 4.72909522553416e-06, "loss": 0.0003, "step": 11026 }, { "epoch": 2.9091148924943937, "grad_norm": 0.013876983895897865, "learning_rate": 4.728743515343357e-06, "loss": 0.0006, "step": 11028 }, { "epoch": 2.909642527371059, "grad_norm": 0.019249403849244118, "learning_rate": 4.728391805152554e-06, "loss": 0.0006, "step": 11030 }, { "epoch": 2.9101701622477245, "grad_norm": 0.008836043998599052, "learning_rate": 4.728040094961752e-06, "loss": 0.0004, "step": 11032 }, { "epoch": 2.91069779712439, "grad_norm": 0.013231541961431503, "learning_rate": 4.727688384770949e-06, "loss": 0.0074, "step": 11034 }, { "epoch": 2.9112254320010553, "grad_norm": 0.04015597701072693, "learning_rate": 4.727336674580146e-06, "loss": 0.0147, "step": 11036 }, { "epoch": 2.9117530668777207, "grad_norm": 0.02555634267628193, "learning_rate": 4.726984964389343e-06, "loss": 0.0004, "step": 11038 }, { "epoch": 2.912280701754386, "grad_norm": 0.0060056522488594055, "learning_rate": 4.726633254198541e-06, "loss": 0.0029, "step": 11040 }, { "epoch": 2.912808336631051, "grad_norm": 0.6158862709999084, "learning_rate": 4.726281544007738e-06, "loss": 0.0069, "step": 11042 }, { "epoch": 2.9133359715077165, "grad_norm": 0.29696810245513916, "learning_rate": 4.7259298338169345e-06, "loss": 0.0014, "step": 11044 }, { "epoch": 2.913863606384382, "grad_norm": 0.08728386461734772, "learning_rate": 4.725578123626132e-06, "loss": 0.0006, "step": 11046 }, { "epoch": 2.9143912412610473, "grad_norm": 0.03288859874010086, "learning_rate": 4.725226413435329e-06, "loss": 0.0006, "step": 11048 }, { "epoch": 2.9149188761377127, "grad_norm": 0.04071732982993126, "learning_rate": 4.724874703244526e-06, "loss": 0.0005, "step": 11050 }, { "epoch": 2.915446511014378, "grad_norm": 0.18530656397342682, "learning_rate": 4.724522993053724e-06, "loss": 0.0039, "step": 11052 }, { "epoch": 2.9159741458910435, "grad_norm": 0.032430678606033325, "learning_rate": 4.724171282862921e-06, "loss": 0.0007, "step": 11054 }, { "epoch": 2.9165017807677085, "grad_norm": 0.2939896583557129, "learning_rate": 4.723819572672119e-06, "loss": 0.0033, "step": 11056 }, { "epoch": 2.9170294156443743, "grad_norm": 0.07750145345926285, "learning_rate": 4.723467862481315e-06, "loss": 0.0006, "step": 11058 }, { "epoch": 2.9175570505210393, "grad_norm": 0.008000735193490982, "learning_rate": 4.723116152290513e-06, "loss": 0.0005, "step": 11060 }, { "epoch": 2.9180846853977047, "grad_norm": 0.07052920758724213, "learning_rate": 4.72276444209971e-06, "loss": 0.0006, "step": 11062 }, { "epoch": 2.91861232027437, "grad_norm": 0.017164243385195732, "learning_rate": 4.7224127319089074e-06, "loss": 0.0004, "step": 11064 }, { "epoch": 2.9191399551510355, "grad_norm": 0.21413834393024445, "learning_rate": 4.722061021718104e-06, "loss": 0.001, "step": 11066 }, { "epoch": 2.919667590027701, "grad_norm": 0.04545889049768448, "learning_rate": 4.721709311527301e-06, "loss": 0.0037, "step": 11068 }, { "epoch": 2.9201952249043663, "grad_norm": 0.012690757401287556, "learning_rate": 4.721357601336499e-06, "loss": 0.0012, "step": 11070 }, { "epoch": 2.9207228597810317, "grad_norm": 0.5221894383430481, "learning_rate": 4.721005891145696e-06, "loss": 0.0055, "step": 11072 }, { "epoch": 2.9212504946576967, "grad_norm": 0.04831194132566452, "learning_rate": 4.720654180954893e-06, "loss": 0.0006, "step": 11074 }, { "epoch": 2.921778129534362, "grad_norm": 0.1809987723827362, "learning_rate": 4.720302470764091e-06, "loss": 0.0007, "step": 11076 }, { "epoch": 2.9223057644110275, "grad_norm": 0.03556665778160095, "learning_rate": 4.719950760573288e-06, "loss": 0.0033, "step": 11078 }, { "epoch": 2.922833399287693, "grad_norm": 0.02042791247367859, "learning_rate": 4.719599050382485e-06, "loss": 0.0011, "step": 11080 }, { "epoch": 2.9233610341643583, "grad_norm": 0.005240023601800203, "learning_rate": 4.719247340191682e-06, "loss": 0.0004, "step": 11082 }, { "epoch": 2.9238886690410237, "grad_norm": 0.0058350637555122375, "learning_rate": 4.7188956300008795e-06, "loss": 0.0003, "step": 11084 }, { "epoch": 2.924416303917689, "grad_norm": 0.15739727020263672, "learning_rate": 4.7185439198100765e-06, "loss": 0.0047, "step": 11086 }, { "epoch": 2.924943938794354, "grad_norm": 0.00619323318824172, "learning_rate": 4.718192209619273e-06, "loss": 0.0095, "step": 11088 }, { "epoch": 2.9254715736710195, "grad_norm": 0.05341134965419769, "learning_rate": 4.717840499428471e-06, "loss": 0.0004, "step": 11090 }, { "epoch": 2.925999208547685, "grad_norm": 0.015068329870700836, "learning_rate": 4.717488789237668e-06, "loss": 0.0008, "step": 11092 }, { "epoch": 2.9265268434243503, "grad_norm": 0.021215328946709633, "learning_rate": 4.717137079046866e-06, "loss": 0.002, "step": 11094 }, { "epoch": 2.9270544783010157, "grad_norm": 0.008090213872492313, "learning_rate": 4.716785368856062e-06, "loss": 0.0005, "step": 11096 }, { "epoch": 2.927582113177681, "grad_norm": 0.01079169288277626, "learning_rate": 4.71643365866526e-06, "loss": 0.0004, "step": 11098 }, { "epoch": 2.9281097480543465, "grad_norm": 0.01351839117705822, "learning_rate": 4.716081948474458e-06, "loss": 0.0004, "step": 11100 }, { "epoch": 2.9286373829310115, "grad_norm": 0.005186409689486027, "learning_rate": 4.715730238283655e-06, "loss": 0.0084, "step": 11102 }, { "epoch": 2.9291650178076774, "grad_norm": 0.4598219096660614, "learning_rate": 4.7153785280928516e-06, "loss": 0.0008, "step": 11104 }, { "epoch": 2.9296926526843423, "grad_norm": 0.00535980612039566, "learning_rate": 4.7150268179020485e-06, "loss": 0.0003, "step": 11106 }, { "epoch": 2.9302202875610077, "grad_norm": 0.831157386302948, "learning_rate": 4.714675107711246e-06, "loss": 0.0041, "step": 11108 }, { "epoch": 2.930747922437673, "grad_norm": 0.006000942084938288, "learning_rate": 4.714323397520443e-06, "loss": 0.0003, "step": 11110 }, { "epoch": 2.9312755573143385, "grad_norm": 0.027163563296198845, "learning_rate": 4.71397168732964e-06, "loss": 0.0082, "step": 11112 }, { "epoch": 2.931803192191004, "grad_norm": 0.013814624398946762, "learning_rate": 4.713619977138838e-06, "loss": 0.0028, "step": 11114 }, { "epoch": 2.932330827067669, "grad_norm": 0.8584200739860535, "learning_rate": 4.713268266948035e-06, "loss": 0.0051, "step": 11116 }, { "epoch": 2.9328584619443347, "grad_norm": 0.007386576384305954, "learning_rate": 4.712916556757232e-06, "loss": 0.0003, "step": 11118 }, { "epoch": 2.9333860968209997, "grad_norm": 0.19805653393268585, "learning_rate": 4.712564846566429e-06, "loss": 0.0041, "step": 11120 }, { "epoch": 2.933913731697665, "grad_norm": 0.441812664270401, "learning_rate": 4.712213136375627e-06, "loss": 0.0077, "step": 11122 }, { "epoch": 2.9344413665743305, "grad_norm": 0.43045562505722046, "learning_rate": 4.7118614261848245e-06, "loss": 0.0064, "step": 11124 }, { "epoch": 2.934969001450996, "grad_norm": 0.014483037404716015, "learning_rate": 4.711509715994021e-06, "loss": 0.0004, "step": 11126 }, { "epoch": 2.9354966363276613, "grad_norm": 0.4368401765823364, "learning_rate": 4.711158005803218e-06, "loss": 0.0014, "step": 11128 }, { "epoch": 2.9360242712043267, "grad_norm": 0.029287561774253845, "learning_rate": 4.710806295612415e-06, "loss": 0.0005, "step": 11130 }, { "epoch": 2.936551906080992, "grad_norm": 0.2249092161655426, "learning_rate": 4.710454585421613e-06, "loss": 0.0011, "step": 11132 }, { "epoch": 2.937079540957657, "grad_norm": 0.5888122916221619, "learning_rate": 4.71010287523081e-06, "loss": 0.0013, "step": 11134 }, { "epoch": 2.9376071758343225, "grad_norm": 7.463385105133057, "learning_rate": 4.709751165040007e-06, "loss": 0.0007, "step": 11136 }, { "epoch": 2.938134810710988, "grad_norm": 0.2922961413860321, "learning_rate": 4.709399454849205e-06, "loss": 0.0035, "step": 11138 }, { "epoch": 2.9386624455876533, "grad_norm": 0.017836930230259895, "learning_rate": 4.709047744658401e-06, "loss": 0.0004, "step": 11140 }, { "epoch": 2.9391900804643187, "grad_norm": 0.00746992789208889, "learning_rate": 4.708696034467599e-06, "loss": 0.0003, "step": 11142 }, { "epoch": 2.939717715340984, "grad_norm": 0.36747345328330994, "learning_rate": 4.708344324276796e-06, "loss": 0.0078, "step": 11144 }, { "epoch": 2.9402453502176495, "grad_norm": 0.14681118726730347, "learning_rate": 4.7079926140859935e-06, "loss": 0.0005, "step": 11146 }, { "epoch": 2.9407729850943145, "grad_norm": 0.012103460729122162, "learning_rate": 4.7076409038951904e-06, "loss": 0.0004, "step": 11148 }, { "epoch": 2.9413006199709804, "grad_norm": 0.028149062767624855, "learning_rate": 4.707289193704387e-06, "loss": 0.0024, "step": 11150 }, { "epoch": 2.9418282548476453, "grad_norm": 0.006572373677045107, "learning_rate": 4.706937483513585e-06, "loss": 0.0003, "step": 11152 }, { "epoch": 2.9423558897243107, "grad_norm": 0.04824373126029968, "learning_rate": 4.706585773322782e-06, "loss": 0.0035, "step": 11154 }, { "epoch": 2.942883524600976, "grad_norm": 0.29103320837020874, "learning_rate": 4.706234063131979e-06, "loss": 0.0036, "step": 11156 }, { "epoch": 2.9434111594776415, "grad_norm": 0.13710543513298035, "learning_rate": 4.705882352941177e-06, "loss": 0.0029, "step": 11158 }, { "epoch": 2.943938794354307, "grad_norm": 0.008060920052230358, "learning_rate": 4.705530642750374e-06, "loss": 0.013, "step": 11160 }, { "epoch": 2.944466429230972, "grad_norm": 0.057678062468767166, "learning_rate": 4.705178932559572e-06, "loss": 0.0038, "step": 11162 }, { "epoch": 2.9449940641076378, "grad_norm": 0.1402261108160019, "learning_rate": 4.704827222368768e-06, "loss": 0.0009, "step": 11164 }, { "epoch": 2.9455216989843027, "grad_norm": 0.029619911685585976, "learning_rate": 4.7044755121779656e-06, "loss": 0.0008, "step": 11166 }, { "epoch": 2.946049333860968, "grad_norm": 0.3562910556793213, "learning_rate": 4.7041238019871625e-06, "loss": 0.0081, "step": 11168 }, { "epoch": 2.9465769687376335, "grad_norm": 0.03111245669424534, "learning_rate": 4.7037720917963595e-06, "loss": 0.0004, "step": 11170 }, { "epoch": 2.947104603614299, "grad_norm": 0.4844951331615448, "learning_rate": 4.703420381605557e-06, "loss": 0.0022, "step": 11172 }, { "epoch": 2.9476322384909643, "grad_norm": 0.10383634269237518, "learning_rate": 4.703068671414754e-06, "loss": 0.0005, "step": 11174 }, { "epoch": 2.9481598733676297, "grad_norm": 0.02043628878891468, "learning_rate": 4.702716961223952e-06, "loss": 0.0091, "step": 11176 }, { "epoch": 2.948687508244295, "grad_norm": 0.01811409369111061, "learning_rate": 4.702365251033148e-06, "loss": 0.0004, "step": 11178 }, { "epoch": 2.94921514312096, "grad_norm": 0.19890014827251434, "learning_rate": 4.702013540842346e-06, "loss": 0.0017, "step": 11180 }, { "epoch": 2.9497427779976255, "grad_norm": 0.10725753009319305, "learning_rate": 4.701661830651544e-06, "loss": 0.0009, "step": 11182 }, { "epoch": 2.950270412874291, "grad_norm": 0.4846418499946594, "learning_rate": 4.701310120460741e-06, "loss": 0.0084, "step": 11184 }, { "epoch": 2.9507980477509563, "grad_norm": 0.03536667674779892, "learning_rate": 4.700958410269938e-06, "loss": 0.0005, "step": 11186 }, { "epoch": 2.9513256826276217, "grad_norm": 0.24450978636741638, "learning_rate": 4.700606700079135e-06, "loss": 0.0221, "step": 11188 }, { "epoch": 2.951853317504287, "grad_norm": 0.11332277953624725, "learning_rate": 4.700254989888332e-06, "loss": 0.0081, "step": 11190 }, { "epoch": 2.9523809523809526, "grad_norm": 0.014374731108546257, "learning_rate": 4.699903279697529e-06, "loss": 0.0004, "step": 11192 }, { "epoch": 2.9529085872576175, "grad_norm": 0.27303552627563477, "learning_rate": 4.699551569506726e-06, "loss": 0.0055, "step": 11194 }, { "epoch": 2.953436222134283, "grad_norm": 0.01382108312100172, "learning_rate": 4.699199859315924e-06, "loss": 0.0005, "step": 11196 }, { "epoch": 2.9539638570109483, "grad_norm": 0.18337956070899963, "learning_rate": 4.698848149125121e-06, "loss": 0.001, "step": 11198 }, { "epoch": 2.9544914918876137, "grad_norm": 0.011988073587417603, "learning_rate": 4.698496438934319e-06, "loss": 0.0004, "step": 11200 }, { "epoch": 2.955019126764279, "grad_norm": 0.1808093786239624, "learning_rate": 4.698144728743515e-06, "loss": 0.0019, "step": 11202 }, { "epoch": 2.9555467616409445, "grad_norm": 0.01901899464428425, "learning_rate": 4.697793018552713e-06, "loss": 0.0041, "step": 11204 }, { "epoch": 2.95607439651761, "grad_norm": 0.030466007068753242, "learning_rate": 4.69744130836191e-06, "loss": 0.0005, "step": 11206 }, { "epoch": 2.956602031394275, "grad_norm": 0.3377505838871002, "learning_rate": 4.697089598171107e-06, "loss": 0.0041, "step": 11208 }, { "epoch": 2.9571296662709408, "grad_norm": 0.008013623766601086, "learning_rate": 4.6967378879803044e-06, "loss": 0.0032, "step": 11210 }, { "epoch": 2.9576573011476057, "grad_norm": 0.007235486526042223, "learning_rate": 4.696386177789501e-06, "loss": 0.0004, "step": 11212 }, { "epoch": 2.958184936024271, "grad_norm": 0.552720844745636, "learning_rate": 4.696034467598699e-06, "loss": 0.0062, "step": 11214 }, { "epoch": 2.9587125709009365, "grad_norm": 0.019226128235459328, "learning_rate": 4.695682757407896e-06, "loss": 0.0146, "step": 11216 }, { "epoch": 2.959240205777602, "grad_norm": 0.0158623568713665, "learning_rate": 4.695331047217093e-06, "loss": 0.0077, "step": 11218 }, { "epoch": 2.9597678406542673, "grad_norm": 0.08806401491165161, "learning_rate": 4.694979337026291e-06, "loss": 0.0005, "step": 11220 }, { "epoch": 2.9602954755309328, "grad_norm": 0.034904029220342636, "learning_rate": 4.694627626835488e-06, "loss": 0.0014, "step": 11222 }, { "epoch": 2.960823110407598, "grad_norm": 0.014499736949801445, "learning_rate": 4.694275916644685e-06, "loss": 0.0005, "step": 11224 }, { "epoch": 2.961350745284263, "grad_norm": 1.9305193424224854, "learning_rate": 4.693924206453882e-06, "loss": 0.0021, "step": 11226 }, { "epoch": 2.9618783801609285, "grad_norm": 0.010082925669848919, "learning_rate": 4.6935724962630796e-06, "loss": 0.0004, "step": 11228 }, { "epoch": 2.962406015037594, "grad_norm": 0.047162171453237534, "learning_rate": 4.6932207860722765e-06, "loss": 0.0007, "step": 11230 }, { "epoch": 2.9629336499142593, "grad_norm": 0.06450549513101578, "learning_rate": 4.6928690758814735e-06, "loss": 0.0009, "step": 11232 }, { "epoch": 2.9634612847909247, "grad_norm": 0.4880484640598297, "learning_rate": 4.692517365690671e-06, "loss": 0.004, "step": 11234 }, { "epoch": 2.96398891966759, "grad_norm": 0.1658100187778473, "learning_rate": 4.692165655499868e-06, "loss": 0.0009, "step": 11236 }, { "epoch": 2.9645165545442556, "grad_norm": 0.3268703818321228, "learning_rate": 4.691813945309065e-06, "loss": 0.0034, "step": 11238 }, { "epoch": 2.9650441894209205, "grad_norm": 0.1489606350660324, "learning_rate": 4.691462235118263e-06, "loss": 0.0009, "step": 11240 }, { "epoch": 2.965571824297586, "grad_norm": 0.11770408600568771, "learning_rate": 4.69111052492746e-06, "loss": 0.0009, "step": 11242 }, { "epoch": 2.9660994591742513, "grad_norm": 0.16253630816936493, "learning_rate": 4.690758814736658e-06, "loss": 0.0048, "step": 11244 }, { "epoch": 2.9666270940509167, "grad_norm": 0.1102171242237091, "learning_rate": 4.690407104545854e-06, "loss": 0.0012, "step": 11246 }, { "epoch": 2.967154728927582, "grad_norm": 0.30154067277908325, "learning_rate": 4.690055394355052e-06, "loss": 0.0129, "step": 11248 }, { "epoch": 2.9676823638042475, "grad_norm": 0.048650410026311874, "learning_rate": 4.6897036841642486e-06, "loss": 0.0071, "step": 11250 }, { "epoch": 2.968209998680913, "grad_norm": 0.010605964809656143, "learning_rate": 4.689351973973446e-06, "loss": 0.0007, "step": 11252 }, { "epoch": 2.968737633557578, "grad_norm": 0.05728492513298988, "learning_rate": 4.689000263782643e-06, "loss": 0.0029, "step": 11254 }, { "epoch": 2.9692652684342438, "grad_norm": 0.025010772049427032, "learning_rate": 4.68864855359184e-06, "loss": 0.0009, "step": 11256 }, { "epoch": 2.9697929033109087, "grad_norm": 0.11778401583433151, "learning_rate": 4.688296843401038e-06, "loss": 0.0007, "step": 11258 }, { "epoch": 2.970320538187574, "grad_norm": 0.022339953109622, "learning_rate": 4.687945133210235e-06, "loss": 0.0005, "step": 11260 }, { "epoch": 2.9708481730642395, "grad_norm": 0.02708333358168602, "learning_rate": 4.687593423019432e-06, "loss": 0.0004, "step": 11262 }, { "epoch": 2.971375807940905, "grad_norm": 0.03351029008626938, "learning_rate": 4.687241712828629e-06, "loss": 0.0004, "step": 11264 }, { "epoch": 2.9719034428175704, "grad_norm": 0.03810356929898262, "learning_rate": 4.686890002637827e-06, "loss": 0.0078, "step": 11266 }, { "epoch": 2.9724310776942353, "grad_norm": 0.00490921176970005, "learning_rate": 4.686538292447024e-06, "loss": 0.0005, "step": 11268 }, { "epoch": 2.972958712570901, "grad_norm": 0.003711104393005371, "learning_rate": 4.686186582256221e-06, "loss": 0.0003, "step": 11270 }, { "epoch": 2.973486347447566, "grad_norm": 0.4674842059612274, "learning_rate": 4.6858348720654184e-06, "loss": 0.0006, "step": 11272 }, { "epoch": 2.9740139823242315, "grad_norm": 0.07190853357315063, "learning_rate": 4.685483161874615e-06, "loss": 0.0005, "step": 11274 }, { "epoch": 2.974541617200897, "grad_norm": 0.015225824899971485, "learning_rate": 4.685131451683812e-06, "loss": 0.0003, "step": 11276 }, { "epoch": 2.9750692520775623, "grad_norm": 0.12797683477401733, "learning_rate": 4.68477974149301e-06, "loss": 0.0039, "step": 11278 }, { "epoch": 2.9755968869542277, "grad_norm": 0.010583154857158661, "learning_rate": 4.684428031302207e-06, "loss": 0.0003, "step": 11280 }, { "epoch": 2.976124521830893, "grad_norm": 0.007331007160246372, "learning_rate": 4.684076321111405e-06, "loss": 0.0004, "step": 11282 }, { "epoch": 2.9766521567075586, "grad_norm": 0.504949688911438, "learning_rate": 4.683724610920601e-06, "loss": 0.0016, "step": 11284 }, { "epoch": 2.9771797915842235, "grad_norm": 0.005915761459618807, "learning_rate": 4.683372900729799e-06, "loss": 0.0009, "step": 11286 }, { "epoch": 2.977707426460889, "grad_norm": 0.44959375262260437, "learning_rate": 4.683021190538996e-06, "loss": 0.0111, "step": 11288 }, { "epoch": 2.9782350613375543, "grad_norm": 0.26171812415122986, "learning_rate": 4.6826694803481935e-06, "loss": 0.0095, "step": 11290 }, { "epoch": 2.9787626962142197, "grad_norm": 0.7301676869392395, "learning_rate": 4.6823177701573905e-06, "loss": 0.0103, "step": 11292 }, { "epoch": 2.979290331090885, "grad_norm": 0.020430611446499825, "learning_rate": 4.6819660599665875e-06, "loss": 0.0003, "step": 11294 }, { "epoch": 2.9798179659675506, "grad_norm": 0.32139912247657776, "learning_rate": 4.681614349775785e-06, "loss": 0.0014, "step": 11296 }, { "epoch": 2.980345600844216, "grad_norm": 0.04227328300476074, "learning_rate": 4.681262639584982e-06, "loss": 0.0047, "step": 11298 }, { "epoch": 2.980873235720881, "grad_norm": 0.050859201699495316, "learning_rate": 4.680910929394179e-06, "loss": 0.0005, "step": 11300 }, { "epoch": 2.9814008705975468, "grad_norm": 0.4167010188102722, "learning_rate": 4.680559219203377e-06, "loss": 0.0037, "step": 11302 }, { "epoch": 2.9819285054742117, "grad_norm": 0.019067099317908287, "learning_rate": 4.680207509012574e-06, "loss": 0.0003, "step": 11304 }, { "epoch": 2.982456140350877, "grad_norm": 0.13068801164627075, "learning_rate": 4.679855798821771e-06, "loss": 0.0007, "step": 11306 }, { "epoch": 2.9829837752275425, "grad_norm": 0.012351945973932743, "learning_rate": 4.679504088630968e-06, "loss": 0.0003, "step": 11308 }, { "epoch": 2.983511410104208, "grad_norm": 0.0164735559374094, "learning_rate": 4.679152378440166e-06, "loss": 0.0003, "step": 11310 }, { "epoch": 2.9840390449808734, "grad_norm": 0.006435947958379984, "learning_rate": 4.6788006682493626e-06, "loss": 0.0003, "step": 11312 }, { "epoch": 2.9845666798575383, "grad_norm": 0.3567153513431549, "learning_rate": 4.6784489580585595e-06, "loss": 0.0023, "step": 11314 }, { "epoch": 2.985094314734204, "grad_norm": 0.5349407196044922, "learning_rate": 4.678097247867757e-06, "loss": 0.0137, "step": 11316 }, { "epoch": 2.985621949610869, "grad_norm": 0.020756574347615242, "learning_rate": 4.677745537676954e-06, "loss": 0.0034, "step": 11318 }, { "epoch": 2.9861495844875345, "grad_norm": 0.007671619299799204, "learning_rate": 4.677393827486152e-06, "loss": 0.0018, "step": 11320 }, { "epoch": 2.9866772193642, "grad_norm": 0.8727436661720276, "learning_rate": 4.677042117295348e-06, "loss": 0.0133, "step": 11322 }, { "epoch": 2.9872048542408653, "grad_norm": 0.04398973286151886, "learning_rate": 4.676690407104546e-06, "loss": 0.0013, "step": 11324 }, { "epoch": 2.9877324891175308, "grad_norm": 0.012296409346163273, "learning_rate": 4.676338696913744e-06, "loss": 0.0003, "step": 11326 }, { "epoch": 2.988260123994196, "grad_norm": 0.008228388614952564, "learning_rate": 4.67598698672294e-06, "loss": 0.0003, "step": 11328 }, { "epoch": 2.9887877588708616, "grad_norm": 0.007743959780782461, "learning_rate": 4.675635276532138e-06, "loss": 0.0007, "step": 11330 }, { "epoch": 2.9893153937475265, "grad_norm": 0.22717973589897156, "learning_rate": 4.675283566341335e-06, "loss": 0.0009, "step": 11332 }, { "epoch": 2.989843028624192, "grad_norm": 0.06955210119485855, "learning_rate": 4.6749318561505324e-06, "loss": 0.0006, "step": 11334 }, { "epoch": 2.9903706635008573, "grad_norm": 0.004242014605551958, "learning_rate": 4.674580145959729e-06, "loss": 0.0003, "step": 11336 }, { "epoch": 2.9908982983775227, "grad_norm": 0.1682559698820114, "learning_rate": 4.674228435768926e-06, "loss": 0.003, "step": 11338 }, { "epoch": 2.991425933254188, "grad_norm": 0.027546344324946404, "learning_rate": 4.673876725578124e-06, "loss": 0.0032, "step": 11340 }, { "epoch": 2.9919535681308536, "grad_norm": 0.009572425857186317, "learning_rate": 4.673525015387321e-06, "loss": 0.0015, "step": 11342 }, { "epoch": 2.992481203007519, "grad_norm": 0.10459242016077042, "learning_rate": 4.673173305196518e-06, "loss": 0.0075, "step": 11344 }, { "epoch": 2.993008837884184, "grad_norm": 0.1816171556711197, "learning_rate": 4.672821595005715e-06, "loss": 0.0082, "step": 11346 }, { "epoch": 2.9935364727608493, "grad_norm": 0.02246873453259468, "learning_rate": 4.672469884814913e-06, "loss": 0.003, "step": 11348 }, { "epoch": 2.9940641076375147, "grad_norm": 0.02398240938782692, "learning_rate": 4.672118174624111e-06, "loss": 0.0004, "step": 11350 }, { "epoch": 2.99459174251418, "grad_norm": 0.011072000488638878, "learning_rate": 4.671766464433307e-06, "loss": 0.0005, "step": 11352 }, { "epoch": 2.9951193773908455, "grad_norm": 0.10083326697349548, "learning_rate": 4.6714147542425045e-06, "loss": 0.0006, "step": 11354 }, { "epoch": 2.995647012267511, "grad_norm": 0.0427430160343647, "learning_rate": 4.6710630440517014e-06, "loss": 0.001, "step": 11356 }, { "epoch": 2.9961746471441764, "grad_norm": 0.36906343698501587, "learning_rate": 4.670711333860898e-06, "loss": 0.0047, "step": 11358 }, { "epoch": 2.9967022820208413, "grad_norm": 0.01619787886738777, "learning_rate": 4.670359623670096e-06, "loss": 0.0012, "step": 11360 }, { "epoch": 2.997229916897507, "grad_norm": 0.0420609712600708, "learning_rate": 4.670007913479293e-06, "loss": 0.0027, "step": 11362 }, { "epoch": 2.997757551774172, "grad_norm": 0.054314110428094864, "learning_rate": 4.669656203288491e-06, "loss": 0.0141, "step": 11364 }, { "epoch": 2.9982851866508375, "grad_norm": 0.005784992594271898, "learning_rate": 4.669304493097687e-06, "loss": 0.0003, "step": 11366 }, { "epoch": 2.998812821527503, "grad_norm": 0.012640278786420822, "learning_rate": 4.668952782906885e-06, "loss": 0.0004, "step": 11368 }, { "epoch": 2.9993404564041684, "grad_norm": 0.5155712962150574, "learning_rate": 4.668601072716082e-06, "loss": 0.0026, "step": 11370 }, { "epoch": 2.9998680912808338, "grad_norm": 0.07226664572954178, "learning_rate": 4.66824936252528e-06, "loss": 0.0048, "step": 11372 }, { "epoch": 3.0002638174383325, "grad_norm": 0.007322821766138077, "learning_rate": 4.6678976523344766e-06, "loss": 0.0034, "step": 11374 }, { "epoch": 3.000791452314998, "grad_norm": 0.32951781153678894, "learning_rate": 4.6675459421436735e-06, "loss": 0.0055, "step": 11376 }, { "epoch": 3.0013190871916633, "grad_norm": 0.1966780722141266, "learning_rate": 4.667194231952871e-06, "loss": 0.0007, "step": 11378 }, { "epoch": 3.0018467220683287, "grad_norm": 0.04107747972011566, "learning_rate": 4.666842521762068e-06, "loss": 0.0011, "step": 11380 }, { "epoch": 3.002374356944994, "grad_norm": 0.018413493409752846, "learning_rate": 4.666490811571265e-06, "loss": 0.0004, "step": 11382 }, { "epoch": 3.0029019918216595, "grad_norm": 0.0724848061800003, "learning_rate": 4.666139101380463e-06, "loss": 0.0014, "step": 11384 }, { "epoch": 3.003429626698325, "grad_norm": 0.010411910712718964, "learning_rate": 4.66578739118966e-06, "loss": 0.0003, "step": 11386 }, { "epoch": 3.0039572615749903, "grad_norm": 0.040377531200647354, "learning_rate": 4.665435680998857e-06, "loss": 0.0005, "step": 11388 }, { "epoch": 3.0044848964516553, "grad_norm": 0.7982158064842224, "learning_rate": 4.665083970808054e-06, "loss": 0.0024, "step": 11390 }, { "epoch": 3.0050125313283207, "grad_norm": 0.036323998123407364, "learning_rate": 4.664732260617252e-06, "loss": 0.0004, "step": 11392 }, { "epoch": 3.005540166204986, "grad_norm": 0.00989345833659172, "learning_rate": 4.664380550426449e-06, "loss": 0.0004, "step": 11394 }, { "epoch": 3.0060678010816515, "grad_norm": 0.026037173345685005, "learning_rate": 4.6640288402356456e-06, "loss": 0.0013, "step": 11396 }, { "epoch": 3.006595435958317, "grad_norm": 0.02358204312622547, "learning_rate": 4.663677130044843e-06, "loss": 0.0004, "step": 11398 }, { "epoch": 3.0071230708349823, "grad_norm": 0.0102159408852458, "learning_rate": 4.66332541985404e-06, "loss": 0.0004, "step": 11400 }, { "epoch": 3.0076507057116477, "grad_norm": 0.049349382519721985, "learning_rate": 4.662973709663238e-06, "loss": 0.0007, "step": 11402 }, { "epoch": 3.0081783405883127, "grad_norm": 0.03446517139673233, "learning_rate": 4.662621999472434e-06, "loss": 0.0007, "step": 11404 }, { "epoch": 3.008705975464978, "grad_norm": 0.10129591822624207, "learning_rate": 4.662270289281632e-06, "loss": 0.001, "step": 11406 }, { "epoch": 3.0092336103416435, "grad_norm": 0.03477000817656517, "learning_rate": 4.66191857909083e-06, "loss": 0.0008, "step": 11408 }, { "epoch": 3.009761245218309, "grad_norm": 0.3672007620334625, "learning_rate": 4.661566868900027e-06, "loss": 0.0018, "step": 11410 }, { "epoch": 3.0102888800949743, "grad_norm": 0.04240288957953453, "learning_rate": 4.661215158709224e-06, "loss": 0.0004, "step": 11412 }, { "epoch": 3.0108165149716397, "grad_norm": 0.013033690862357616, "learning_rate": 4.660863448518421e-06, "loss": 0.0014, "step": 11414 }, { "epoch": 3.011344149848305, "grad_norm": 0.22210954129695892, "learning_rate": 4.6605117383276185e-06, "loss": 0.0197, "step": 11416 }, { "epoch": 3.0118717847249705, "grad_norm": 0.5197845101356506, "learning_rate": 4.6601600281368154e-06, "loss": 0.0029, "step": 11418 }, { "epoch": 3.0123994196016355, "grad_norm": 0.2727263867855072, "learning_rate": 4.659808317946012e-06, "loss": 0.0012, "step": 11420 }, { "epoch": 3.012927054478301, "grad_norm": 1.8757487535476685, "learning_rate": 4.65945660775521e-06, "loss": 0.0048, "step": 11422 }, { "epoch": 3.0134546893549663, "grad_norm": 0.6546945571899414, "learning_rate": 4.659104897564407e-06, "loss": 0.0014, "step": 11424 }, { "epoch": 3.0139823242316317, "grad_norm": 0.09445974975824356, "learning_rate": 4.658753187373604e-06, "loss": 0.0004, "step": 11426 }, { "epoch": 3.014509959108297, "grad_norm": 0.027958322316408157, "learning_rate": 4.658401477182801e-06, "loss": 0.0027, "step": 11428 }, { "epoch": 3.0150375939849625, "grad_norm": 0.03722897171974182, "learning_rate": 4.658049766991999e-06, "loss": 0.0058, "step": 11430 }, { "epoch": 3.015565228861628, "grad_norm": 0.1979139894247055, "learning_rate": 4.657698056801196e-06, "loss": 0.0068, "step": 11432 }, { "epoch": 3.016092863738293, "grad_norm": 0.005477530416101217, "learning_rate": 4.657346346610393e-06, "loss": 0.0035, "step": 11434 }, { "epoch": 3.0166204986149583, "grad_norm": 0.6626095175743103, "learning_rate": 4.6569946364195906e-06, "loss": 0.0023, "step": 11436 }, { "epoch": 3.0171481334916237, "grad_norm": 0.01425842847675085, "learning_rate": 4.6566429262287875e-06, "loss": 0.0068, "step": 11438 }, { "epoch": 3.017675768368289, "grad_norm": 0.004089179448783398, "learning_rate": 4.656291216037985e-06, "loss": 0.0004, "step": 11440 }, { "epoch": 3.0182034032449545, "grad_norm": 0.29316434264183044, "learning_rate": 4.655939505847182e-06, "loss": 0.0084, "step": 11442 }, { "epoch": 3.01873103812162, "grad_norm": 0.6621690988540649, "learning_rate": 4.655587795656379e-06, "loss": 0.0086, "step": 11444 }, { "epoch": 3.0192586729982853, "grad_norm": 0.0364183746278286, "learning_rate": 4.655236085465577e-06, "loss": 0.0008, "step": 11446 }, { "epoch": 3.0197863078749507, "grad_norm": 0.013682844117283821, "learning_rate": 4.654884375274773e-06, "loss": 0.0003, "step": 11448 }, { "epoch": 3.0203139427516157, "grad_norm": 0.0058128503151237965, "learning_rate": 4.654532665083971e-06, "loss": 0.0003, "step": 11450 }, { "epoch": 3.020841577628281, "grad_norm": 0.009486800990998745, "learning_rate": 4.654180954893168e-06, "loss": 0.0007, "step": 11452 }, { "epoch": 3.0213692125049465, "grad_norm": 0.4314770996570587, "learning_rate": 4.653829244702366e-06, "loss": 0.0022, "step": 11454 }, { "epoch": 3.021896847381612, "grad_norm": 0.2380668967962265, "learning_rate": 4.653477534511563e-06, "loss": 0.0058, "step": 11456 }, { "epoch": 3.0224244822582773, "grad_norm": 0.35102948546409607, "learning_rate": 4.6531258243207596e-06, "loss": 0.0066, "step": 11458 }, { "epoch": 3.0229521171349427, "grad_norm": 0.12558481097221375, "learning_rate": 4.652774114129957e-06, "loss": 0.0062, "step": 11460 }, { "epoch": 3.023479752011608, "grad_norm": 0.037981316447257996, "learning_rate": 4.652422403939154e-06, "loss": 0.0037, "step": 11462 }, { "epoch": 3.0240073868882735, "grad_norm": 0.02585817128419876, "learning_rate": 4.652070693748351e-06, "loss": 0.0005, "step": 11464 }, { "epoch": 3.0245350217649385, "grad_norm": 0.06599922478199005, "learning_rate": 4.651718983557549e-06, "loss": 0.0008, "step": 11466 }, { "epoch": 3.025062656641604, "grad_norm": 0.01894579827785492, "learning_rate": 4.651367273366746e-06, "loss": 0.0006, "step": 11468 }, { "epoch": 3.0255902915182693, "grad_norm": 0.03133266046643257, "learning_rate": 4.651015563175944e-06, "loss": 0.0007, "step": 11470 }, { "epoch": 3.0261179263949347, "grad_norm": 0.09426359832286835, "learning_rate": 4.65066385298514e-06, "loss": 0.0014, "step": 11472 }, { "epoch": 3.0266455612716, "grad_norm": 0.8890027403831482, "learning_rate": 4.650312142794338e-06, "loss": 0.0052, "step": 11474 }, { "epoch": 3.0271731961482655, "grad_norm": 0.06482156366109848, "learning_rate": 4.649960432603535e-06, "loss": 0.0008, "step": 11476 }, { "epoch": 3.027700831024931, "grad_norm": 0.10572422295808792, "learning_rate": 4.649608722412732e-06, "loss": 0.0044, "step": 11478 }, { "epoch": 3.028228465901596, "grad_norm": 0.017846599221229553, "learning_rate": 4.6492570122219294e-06, "loss": 0.0005, "step": 11480 }, { "epoch": 3.0287561007782613, "grad_norm": 0.16905251145362854, "learning_rate": 4.648905302031126e-06, "loss": 0.001, "step": 11482 }, { "epoch": 3.0292837356549267, "grad_norm": 0.1221517026424408, "learning_rate": 4.648553591840324e-06, "loss": 0.0008, "step": 11484 }, { "epoch": 3.029811370531592, "grad_norm": 0.053349778056144714, "learning_rate": 4.64820188164952e-06, "loss": 0.0006, "step": 11486 }, { "epoch": 3.0303390054082575, "grad_norm": 0.2656862139701843, "learning_rate": 4.647850171458718e-06, "loss": 0.0015, "step": 11488 }, { "epoch": 3.030866640284923, "grad_norm": 0.008109047077596188, "learning_rate": 4.647498461267915e-06, "loss": 0.0007, "step": 11490 }, { "epoch": 3.0313942751615883, "grad_norm": 1.1047550439834595, "learning_rate": 4.647146751077113e-06, "loss": 0.0011, "step": 11492 }, { "epoch": 3.0319219100382537, "grad_norm": 0.03626193851232529, "learning_rate": 4.64679504088631e-06, "loss": 0.0005, "step": 11494 }, { "epoch": 3.0324495449149187, "grad_norm": 0.043263550847768784, "learning_rate": 4.646443330695507e-06, "loss": 0.0022, "step": 11496 }, { "epoch": 3.032977179791584, "grad_norm": 0.024595865979790688, "learning_rate": 4.6460916205047045e-06, "loss": 0.0005, "step": 11498 }, { "epoch": 3.0335048146682495, "grad_norm": 0.34379082918167114, "learning_rate": 4.6457399103139015e-06, "loss": 0.0016, "step": 11500 }, { "epoch": 3.034032449544915, "grad_norm": 0.6634469628334045, "learning_rate": 4.6453882001230984e-06, "loss": 0.0014, "step": 11502 }, { "epoch": 3.0345600844215803, "grad_norm": 0.24698197841644287, "learning_rate": 4.645036489932296e-06, "loss": 0.0076, "step": 11504 }, { "epoch": 3.0350877192982457, "grad_norm": 0.2071264386177063, "learning_rate": 4.644684779741493e-06, "loss": 0.0113, "step": 11506 }, { "epoch": 3.035615354174911, "grad_norm": 0.4316268861293793, "learning_rate": 4.64433306955069e-06, "loss": 0.0072, "step": 11508 }, { "epoch": 3.036142989051576, "grad_norm": 0.011765213683247566, "learning_rate": 4.643981359359887e-06, "loss": 0.0006, "step": 11510 }, { "epoch": 3.0366706239282415, "grad_norm": 0.15563081204891205, "learning_rate": 4.643629649169085e-06, "loss": 0.0104, "step": 11512 }, { "epoch": 3.037198258804907, "grad_norm": 0.012263747863471508, "learning_rate": 4.643277938978282e-06, "loss": 0.0043, "step": 11514 }, { "epoch": 3.0377258936815723, "grad_norm": 0.10763101279735565, "learning_rate": 4.642926228787479e-06, "loss": 0.0049, "step": 11516 }, { "epoch": 3.0382535285582377, "grad_norm": 0.006989666260778904, "learning_rate": 4.642574518596677e-06, "loss": 0.0004, "step": 11518 }, { "epoch": 3.038781163434903, "grad_norm": 0.051698338240385056, "learning_rate": 4.6422228084058736e-06, "loss": 0.0006, "step": 11520 }, { "epoch": 3.0393087983115685, "grad_norm": 0.0788029208779335, "learning_rate": 4.641871098215071e-06, "loss": 0.0121, "step": 11522 }, { "epoch": 3.039836433188234, "grad_norm": 0.12769928574562073, "learning_rate": 4.641519388024268e-06, "loss": 0.0007, "step": 11524 }, { "epoch": 3.040364068064899, "grad_norm": 0.009169849567115307, "learning_rate": 4.641167677833465e-06, "loss": 0.0005, "step": 11526 }, { "epoch": 3.0408917029415643, "grad_norm": 0.1407863050699234, "learning_rate": 4.640815967642663e-06, "loss": 0.0011, "step": 11528 }, { "epoch": 3.0414193378182297, "grad_norm": 0.027287380769848824, "learning_rate": 4.64046425745186e-06, "loss": 0.0008, "step": 11530 }, { "epoch": 3.041946972694895, "grad_norm": 0.008296984247863293, "learning_rate": 4.640112547261057e-06, "loss": 0.0025, "step": 11532 }, { "epoch": 3.0424746075715605, "grad_norm": 0.007188430521637201, "learning_rate": 4.639760837070254e-06, "loss": 0.0003, "step": 11534 }, { "epoch": 3.043002242448226, "grad_norm": 0.20528040826320648, "learning_rate": 4.639409126879452e-06, "loss": 0.005, "step": 11536 }, { "epoch": 3.0435298773248913, "grad_norm": 0.007405134383589029, "learning_rate": 4.639057416688649e-06, "loss": 0.0003, "step": 11538 }, { "epoch": 3.0440575122015567, "grad_norm": 0.12598378956317902, "learning_rate": 4.638705706497846e-06, "loss": 0.0013, "step": 11540 }, { "epoch": 3.0445851470782217, "grad_norm": 0.02011653408408165, "learning_rate": 4.6383539963070434e-06, "loss": 0.0003, "step": 11542 }, { "epoch": 3.045112781954887, "grad_norm": 0.005344881676137447, "learning_rate": 4.63800228611624e-06, "loss": 0.0005, "step": 11544 }, { "epoch": 3.0456404168315525, "grad_norm": 0.0470038577914238, "learning_rate": 4.637650575925437e-06, "loss": 0.002, "step": 11546 }, { "epoch": 3.046168051708218, "grad_norm": 0.07272157818078995, "learning_rate": 4.637298865734634e-06, "loss": 0.0071, "step": 11548 }, { "epoch": 3.0466956865848833, "grad_norm": 0.004142336547374725, "learning_rate": 4.636947155543832e-06, "loss": 0.0003, "step": 11550 }, { "epoch": 3.0472233214615487, "grad_norm": 0.013032020069658756, "learning_rate": 4.63659544535303e-06, "loss": 0.0037, "step": 11552 }, { "epoch": 3.047750956338214, "grad_norm": 0.1600855141878128, "learning_rate": 4.636243735162226e-06, "loss": 0.0096, "step": 11554 }, { "epoch": 3.048278591214879, "grad_norm": 0.026518721133470535, "learning_rate": 4.635892024971424e-06, "loss": 0.0022, "step": 11556 }, { "epoch": 3.0488062260915445, "grad_norm": 0.551547646522522, "learning_rate": 4.635540314780621e-06, "loss": 0.0141, "step": 11558 }, { "epoch": 3.04933386096821, "grad_norm": 0.1992323100566864, "learning_rate": 4.6351886045898185e-06, "loss": 0.006, "step": 11560 }, { "epoch": 3.0498614958448753, "grad_norm": 0.6910139322280884, "learning_rate": 4.6348368943990155e-06, "loss": 0.0061, "step": 11562 }, { "epoch": 3.0503891307215407, "grad_norm": 0.356758713722229, "learning_rate": 4.6344851842082124e-06, "loss": 0.0089, "step": 11564 }, { "epoch": 3.050916765598206, "grad_norm": 0.28077325224876404, "learning_rate": 4.63413347401741e-06, "loss": 0.0049, "step": 11566 }, { "epoch": 3.0514444004748715, "grad_norm": 0.032114092260599136, "learning_rate": 4.633781763826606e-06, "loss": 0.0005, "step": 11568 }, { "epoch": 3.051972035351537, "grad_norm": 0.04388660937547684, "learning_rate": 4.633430053635804e-06, "loss": 0.0008, "step": 11570 }, { "epoch": 3.052499670228202, "grad_norm": 0.20236030220985413, "learning_rate": 4.633078343445001e-06, "loss": 0.0009, "step": 11572 }, { "epoch": 3.0530273051048673, "grad_norm": 0.6068684458732605, "learning_rate": 4.632726633254199e-06, "loss": 0.0065, "step": 11574 }, { "epoch": 3.0535549399815327, "grad_norm": 0.02144561894237995, "learning_rate": 4.632374923063396e-06, "loss": 0.0073, "step": 11576 }, { "epoch": 3.054082574858198, "grad_norm": 0.008157654665410519, "learning_rate": 4.632023212872593e-06, "loss": 0.0018, "step": 11578 }, { "epoch": 3.0546102097348635, "grad_norm": 0.013499977998435497, "learning_rate": 4.631671502681791e-06, "loss": 0.0012, "step": 11580 }, { "epoch": 3.055137844611529, "grad_norm": 0.06652120500802994, "learning_rate": 4.6313197924909876e-06, "loss": 0.0025, "step": 11582 }, { "epoch": 3.0556654794881943, "grad_norm": 0.038072001188993454, "learning_rate": 4.6309680823001845e-06, "loss": 0.0029, "step": 11584 }, { "epoch": 3.0561931143648593, "grad_norm": 0.01590789295732975, "learning_rate": 4.630616372109382e-06, "loss": 0.0005, "step": 11586 }, { "epoch": 3.0567207492415247, "grad_norm": 0.32886573672294617, "learning_rate": 4.630264661918579e-06, "loss": 0.0112, "step": 11588 }, { "epoch": 3.05724838411819, "grad_norm": 0.04729089513421059, "learning_rate": 4.629912951727777e-06, "loss": 0.0015, "step": 11590 }, { "epoch": 3.0577760189948555, "grad_norm": 0.005915776826441288, "learning_rate": 4.629561241536973e-06, "loss": 0.0022, "step": 11592 }, { "epoch": 3.058303653871521, "grad_norm": 0.012382147833704948, "learning_rate": 4.629209531346171e-06, "loss": 0.002, "step": 11594 }, { "epoch": 3.0588312887481863, "grad_norm": 0.2207762748003006, "learning_rate": 4.628857821155368e-06, "loss": 0.0026, "step": 11596 }, { "epoch": 3.0593589236248517, "grad_norm": 0.009133103303611279, "learning_rate": 4.628506110964565e-06, "loss": 0.001, "step": 11598 }, { "epoch": 3.059886558501517, "grad_norm": 0.009609916247427464, "learning_rate": 4.628154400773763e-06, "loss": 0.0003, "step": 11600 }, { "epoch": 3.060414193378182, "grad_norm": 0.17216455936431885, "learning_rate": 4.62780269058296e-06, "loss": 0.0091, "step": 11602 }, { "epoch": 3.0609418282548475, "grad_norm": 0.0143663315102458, "learning_rate": 4.627450980392157e-06, "loss": 0.0024, "step": 11604 }, { "epoch": 3.061469463131513, "grad_norm": 0.017407141625881195, "learning_rate": 4.6270992702013535e-06, "loss": 0.0014, "step": 11606 }, { "epoch": 3.0619970980081783, "grad_norm": 0.006420489400625229, "learning_rate": 4.626747560010551e-06, "loss": 0.0004, "step": 11608 }, { "epoch": 3.0625247328848437, "grad_norm": 0.018572689965367317, "learning_rate": 4.626395849819749e-06, "loss": 0.0004, "step": 11610 }, { "epoch": 3.063052367761509, "grad_norm": 0.051957856863737106, "learning_rate": 4.626044139628946e-06, "loss": 0.0008, "step": 11612 }, { "epoch": 3.0635800026381745, "grad_norm": 0.03745929151773453, "learning_rate": 4.625692429438143e-06, "loss": 0.0073, "step": 11614 }, { "epoch": 3.06410763751484, "grad_norm": 0.025933662429451942, "learning_rate": 4.62534071924734e-06, "loss": 0.0004, "step": 11616 }, { "epoch": 3.064635272391505, "grad_norm": 0.0073587060905992985, "learning_rate": 4.624989009056538e-06, "loss": 0.0106, "step": 11618 }, { "epoch": 3.0651629072681703, "grad_norm": 0.1176123395562172, "learning_rate": 4.624637298865735e-06, "loss": 0.0019, "step": 11620 }, { "epoch": 3.0656905421448357, "grad_norm": 0.07648950070142746, "learning_rate": 4.624285588674932e-06, "loss": 0.0007, "step": 11622 }, { "epoch": 3.066218177021501, "grad_norm": 0.0058733560144901276, "learning_rate": 4.6239338784841295e-06, "loss": 0.0023, "step": 11624 }, { "epoch": 3.0667458118981665, "grad_norm": 0.3716316819190979, "learning_rate": 4.6235821682933264e-06, "loss": 0.0022, "step": 11626 }, { "epoch": 3.067273446774832, "grad_norm": 0.3583928346633911, "learning_rate": 4.623230458102523e-06, "loss": 0.0048, "step": 11628 }, { "epoch": 3.0678010816514973, "grad_norm": 0.006589722353965044, "learning_rate": 4.62287874791172e-06, "loss": 0.0003, "step": 11630 }, { "epoch": 3.0683287165281623, "grad_norm": 0.02094305492937565, "learning_rate": 4.622527037720918e-06, "loss": 0.0026, "step": 11632 }, { "epoch": 3.0688563514048277, "grad_norm": 0.010727066546678543, "learning_rate": 4.622175327530115e-06, "loss": 0.0005, "step": 11634 }, { "epoch": 3.069383986281493, "grad_norm": 0.010948914103209972, "learning_rate": 4.621823617339312e-06, "loss": 0.0075, "step": 11636 }, { "epoch": 3.0699116211581585, "grad_norm": 0.025148386135697365, "learning_rate": 4.62147190714851e-06, "loss": 0.0006, "step": 11638 }, { "epoch": 3.070439256034824, "grad_norm": 0.5441969633102417, "learning_rate": 4.621120196957707e-06, "loss": 0.0056, "step": 11640 }, { "epoch": 3.0709668909114893, "grad_norm": 0.005386766046285629, "learning_rate": 4.620768486766905e-06, "loss": 0.0005, "step": 11642 }, { "epoch": 3.0714945257881547, "grad_norm": 0.1654285490512848, "learning_rate": 4.6204167765761015e-06, "loss": 0.0009, "step": 11644 }, { "epoch": 3.07202216066482, "grad_norm": 0.22278869152069092, "learning_rate": 4.6200650663852985e-06, "loss": 0.0029, "step": 11646 }, { "epoch": 3.072549795541485, "grad_norm": 0.00506659597158432, "learning_rate": 4.619713356194496e-06, "loss": 0.0005, "step": 11648 }, { "epoch": 3.0730774304181505, "grad_norm": 0.024451235309243202, "learning_rate": 4.619361646003693e-06, "loss": 0.0004, "step": 11650 }, { "epoch": 3.073605065294816, "grad_norm": 0.161421000957489, "learning_rate": 4.61900993581289e-06, "loss": 0.0051, "step": 11652 }, { "epoch": 3.0741327001714813, "grad_norm": 0.00641046604141593, "learning_rate": 4.618658225622087e-06, "loss": 0.0004, "step": 11654 }, { "epoch": 3.0746603350481467, "grad_norm": 0.02740168385207653, "learning_rate": 4.618306515431285e-06, "loss": 0.0006, "step": 11656 }, { "epoch": 3.075187969924812, "grad_norm": 0.050095271319150925, "learning_rate": 4.617954805240482e-06, "loss": 0.0006, "step": 11658 }, { "epoch": 3.0757156048014775, "grad_norm": 0.23537784814834595, "learning_rate": 4.617603095049679e-06, "loss": 0.0049, "step": 11660 }, { "epoch": 3.076243239678143, "grad_norm": 0.059421390295028687, "learning_rate": 4.617251384858877e-06, "loss": 0.0023, "step": 11662 }, { "epoch": 3.076770874554808, "grad_norm": 0.6141906380653381, "learning_rate": 4.616899674668074e-06, "loss": 0.0024, "step": 11664 }, { "epoch": 3.0772985094314733, "grad_norm": 0.025518296286463737, "learning_rate": 4.6165479644772706e-06, "loss": 0.0005, "step": 11666 }, { "epoch": 3.0778261443081387, "grad_norm": 0.022307027131319046, "learning_rate": 4.616196254286468e-06, "loss": 0.0074, "step": 11668 }, { "epoch": 3.078353779184804, "grad_norm": 0.015416831709444523, "learning_rate": 4.615844544095665e-06, "loss": 0.0003, "step": 11670 }, { "epoch": 3.0788814140614695, "grad_norm": 0.04398946464061737, "learning_rate": 4.615492833904863e-06, "loss": 0.007, "step": 11672 }, { "epoch": 3.079409048938135, "grad_norm": 0.0495106540620327, "learning_rate": 4.615141123714059e-06, "loss": 0.0005, "step": 11674 }, { "epoch": 3.0799366838148003, "grad_norm": 0.010137257166206837, "learning_rate": 4.614789413523257e-06, "loss": 0.0016, "step": 11676 }, { "epoch": 3.0804643186914653, "grad_norm": 0.0441717803478241, "learning_rate": 4.614437703332454e-06, "loss": 0.0012, "step": 11678 }, { "epoch": 3.0809919535681307, "grad_norm": 0.08884109556674957, "learning_rate": 4.614085993141652e-06, "loss": 0.0013, "step": 11680 }, { "epoch": 3.081519588444796, "grad_norm": 0.7158622741699219, "learning_rate": 4.613734282950849e-06, "loss": 0.0137, "step": 11682 }, { "epoch": 3.0820472233214615, "grad_norm": 0.007936520501971245, "learning_rate": 4.613382572760046e-06, "loss": 0.0012, "step": 11684 }, { "epoch": 3.082574858198127, "grad_norm": 0.04042980819940567, "learning_rate": 4.6130308625692435e-06, "loss": 0.0007, "step": 11686 }, { "epoch": 3.0831024930747923, "grad_norm": 0.0407627634704113, "learning_rate": 4.61267915237844e-06, "loss": 0.0006, "step": 11688 }, { "epoch": 3.0836301279514577, "grad_norm": 0.12137477099895477, "learning_rate": 4.612327442187637e-06, "loss": 0.0058, "step": 11690 }, { "epoch": 3.084157762828123, "grad_norm": 0.10916648805141449, "learning_rate": 4.611975731996834e-06, "loss": 0.0075, "step": 11692 }, { "epoch": 3.084685397704788, "grad_norm": 0.03137579932808876, "learning_rate": 4.611624021806032e-06, "loss": 0.0005, "step": 11694 }, { "epoch": 3.0852130325814535, "grad_norm": 0.06353414803743362, "learning_rate": 4.611272311615229e-06, "loss": 0.0006, "step": 11696 }, { "epoch": 3.085740667458119, "grad_norm": 0.04965219646692276, "learning_rate": 4.610920601424426e-06, "loss": 0.0004, "step": 11698 }, { "epoch": 3.0862683023347843, "grad_norm": 0.11159814149141312, "learning_rate": 4.610568891233624e-06, "loss": 0.001, "step": 11700 }, { "epoch": 3.0867959372114497, "grad_norm": 0.04073844105005264, "learning_rate": 4.610217181042821e-06, "loss": 0.0005, "step": 11702 }, { "epoch": 3.087323572088115, "grad_norm": 0.028440212830901146, "learning_rate": 4.609865470852018e-06, "loss": 0.0004, "step": 11704 }, { "epoch": 3.0878512069647805, "grad_norm": 0.018965736031532288, "learning_rate": 4.6095137606612155e-06, "loss": 0.0005, "step": 11706 }, { "epoch": 3.0883788418414455, "grad_norm": 0.009519126266241074, "learning_rate": 4.6091620504704125e-06, "loss": 0.0006, "step": 11708 }, { "epoch": 3.088906476718111, "grad_norm": 0.006958833429962397, "learning_rate": 4.60881034027961e-06, "loss": 0.0023, "step": 11710 }, { "epoch": 3.0894341115947763, "grad_norm": 0.5899072289466858, "learning_rate": 4.608458630088806e-06, "loss": 0.0054, "step": 11712 }, { "epoch": 3.0899617464714417, "grad_norm": 0.04092952236533165, "learning_rate": 4.608106919898004e-06, "loss": 0.0011, "step": 11714 }, { "epoch": 3.090489381348107, "grad_norm": 0.15442653000354767, "learning_rate": 4.607755209707201e-06, "loss": 0.0005, "step": 11716 }, { "epoch": 3.0910170162247725, "grad_norm": 0.008585606701672077, "learning_rate": 4.607403499516398e-06, "loss": 0.0005, "step": 11718 }, { "epoch": 3.091544651101438, "grad_norm": 0.012038502842187881, "learning_rate": 4.607051789325596e-06, "loss": 0.0003, "step": 11720 }, { "epoch": 3.0920722859781034, "grad_norm": 0.1347896009683609, "learning_rate": 4.606700079134793e-06, "loss": 0.0036, "step": 11722 }, { "epoch": 3.0925999208547683, "grad_norm": 0.012322265654802322, "learning_rate": 4.606348368943991e-06, "loss": 0.0004, "step": 11724 }, { "epoch": 3.0931275557314337, "grad_norm": 0.012996979989111423, "learning_rate": 4.605996658753188e-06, "loss": 0.0004, "step": 11726 }, { "epoch": 3.093655190608099, "grad_norm": 0.05858486518263817, "learning_rate": 4.6056449485623846e-06, "loss": 0.0005, "step": 11728 }, { "epoch": 3.0941828254847645, "grad_norm": 0.04901418462395668, "learning_rate": 4.605293238371582e-06, "loss": 0.0006, "step": 11730 }, { "epoch": 3.09471046036143, "grad_norm": 0.34444600343704224, "learning_rate": 4.604941528180779e-06, "loss": 0.0071, "step": 11732 }, { "epoch": 3.0952380952380953, "grad_norm": 0.0069158137775957584, "learning_rate": 4.604589817989976e-06, "loss": 0.0004, "step": 11734 }, { "epoch": 3.0957657301147607, "grad_norm": 0.1416487842798233, "learning_rate": 4.604238107799173e-06, "loss": 0.0011, "step": 11736 }, { "epoch": 3.0962933649914257, "grad_norm": 0.12887749075889587, "learning_rate": 4.603886397608371e-06, "loss": 0.0096, "step": 11738 }, { "epoch": 3.096820999868091, "grad_norm": 0.029948120936751366, "learning_rate": 4.603534687417568e-06, "loss": 0.0007, "step": 11740 }, { "epoch": 3.0973486347447565, "grad_norm": 0.33643466234207153, "learning_rate": 4.603182977226765e-06, "loss": 0.0013, "step": 11742 }, { "epoch": 3.097876269621422, "grad_norm": 0.12422885000705719, "learning_rate": 4.602831267035963e-06, "loss": 0.0009, "step": 11744 }, { "epoch": 3.0984039044980873, "grad_norm": 0.05317883938550949, "learning_rate": 4.60247955684516e-06, "loss": 0.0004, "step": 11746 }, { "epoch": 3.0989315393747527, "grad_norm": 0.03006170317530632, "learning_rate": 4.602127846654357e-06, "loss": 0.0004, "step": 11748 }, { "epoch": 3.099459174251418, "grad_norm": 0.008504381403326988, "learning_rate": 4.6017761364635536e-06, "loss": 0.0004, "step": 11750 }, { "epoch": 3.0999868091280836, "grad_norm": 0.019116511568427086, "learning_rate": 4.601424426272751e-06, "loss": 0.0004, "step": 11752 }, { "epoch": 3.1005144440047485, "grad_norm": 0.007586052641272545, "learning_rate": 4.601072716081949e-06, "loss": 0.0003, "step": 11754 }, { "epoch": 3.101042078881414, "grad_norm": 0.013807597570121288, "learning_rate": 4.600721005891145e-06, "loss": 0.0005, "step": 11756 }, { "epoch": 3.1015697137580793, "grad_norm": 0.019847651943564415, "learning_rate": 4.600369295700343e-06, "loss": 0.0004, "step": 11758 }, { "epoch": 3.1020973486347447, "grad_norm": 0.028790589421987534, "learning_rate": 4.60001758550954e-06, "loss": 0.0014, "step": 11760 }, { "epoch": 3.10262498351141, "grad_norm": 0.008676853030920029, "learning_rate": 4.599665875318738e-06, "loss": 0.0003, "step": 11762 }, { "epoch": 3.1031526183880755, "grad_norm": 0.30788350105285645, "learning_rate": 4.599314165127935e-06, "loss": 0.001, "step": 11764 }, { "epoch": 3.103680253264741, "grad_norm": 0.004081679508090019, "learning_rate": 4.598962454937132e-06, "loss": 0.0002, "step": 11766 }, { "epoch": 3.1042078881414064, "grad_norm": 0.027786845341324806, "learning_rate": 4.5986107447463295e-06, "loss": 0.0005, "step": 11768 }, { "epoch": 3.1047355230180713, "grad_norm": 0.14486044645309448, "learning_rate": 4.5982590345555265e-06, "loss": 0.0014, "step": 11770 }, { "epoch": 3.1052631578947367, "grad_norm": 0.15301825106143951, "learning_rate": 4.5979073243647234e-06, "loss": 0.0007, "step": 11772 }, { "epoch": 3.105790792771402, "grad_norm": 0.18090127408504486, "learning_rate": 4.59755561417392e-06, "loss": 0.0025, "step": 11774 }, { "epoch": 3.1063184276480675, "grad_norm": 0.07873265445232391, "learning_rate": 4.597203903983118e-06, "loss": 0.0076, "step": 11776 }, { "epoch": 3.106846062524733, "grad_norm": 0.04947248101234436, "learning_rate": 4.596852193792315e-06, "loss": 0.0004, "step": 11778 }, { "epoch": 3.1073736974013983, "grad_norm": 0.009536787867546082, "learning_rate": 4.596500483601512e-06, "loss": 0.0003, "step": 11780 }, { "epoch": 3.1079013322780638, "grad_norm": 0.25575605034828186, "learning_rate": 4.59614877341071e-06, "loss": 0.0012, "step": 11782 }, { "epoch": 3.1084289671547287, "grad_norm": 0.007032847497612238, "learning_rate": 4.595797063219907e-06, "loss": 0.0018, "step": 11784 }, { "epoch": 3.108956602031394, "grad_norm": 0.054989323019981384, "learning_rate": 4.595445353029104e-06, "loss": 0.0074, "step": 11786 }, { "epoch": 3.1094842369080595, "grad_norm": 0.03228635713458061, "learning_rate": 4.595093642838302e-06, "loss": 0.0003, "step": 11788 }, { "epoch": 3.110011871784725, "grad_norm": 0.14619873464107513, "learning_rate": 4.5947419326474985e-06, "loss": 0.0008, "step": 11790 }, { "epoch": 3.1105395066613903, "grad_norm": 0.09534019976854324, "learning_rate": 4.594390222456696e-06, "loss": 0.001, "step": 11792 }, { "epoch": 3.1110671415380557, "grad_norm": 0.009323357604444027, "learning_rate": 4.5940385122658925e-06, "loss": 0.0003, "step": 11794 }, { "epoch": 3.111594776414721, "grad_norm": 0.4832366406917572, "learning_rate": 4.59368680207509e-06, "loss": 0.0028, "step": 11796 }, { "epoch": 3.1121224112913866, "grad_norm": 0.1345665603876114, "learning_rate": 4.593335091884287e-06, "loss": 0.0005, "step": 11798 }, { "epoch": 3.1126500461680515, "grad_norm": 0.008876796811819077, "learning_rate": 4.592983381693485e-06, "loss": 0.0004, "step": 11800 }, { "epoch": 3.113177681044717, "grad_norm": 0.010324464179575443, "learning_rate": 4.592631671502682e-06, "loss": 0.0027, "step": 11802 }, { "epoch": 3.1137053159213823, "grad_norm": 0.020563732832670212, "learning_rate": 4.592279961311879e-06, "loss": 0.0005, "step": 11804 }, { "epoch": 3.1142329507980477, "grad_norm": 0.2774774730205536, "learning_rate": 4.591928251121077e-06, "loss": 0.0015, "step": 11806 }, { "epoch": 3.114760585674713, "grad_norm": 0.19910980761051178, "learning_rate": 4.591576540930274e-06, "loss": 0.0133, "step": 11808 }, { "epoch": 3.1152882205513786, "grad_norm": 0.018448451533913612, "learning_rate": 4.591224830739471e-06, "loss": 0.0003, "step": 11810 }, { "epoch": 3.115815855428044, "grad_norm": 0.01382034458220005, "learning_rate": 4.590873120548668e-06, "loss": 0.0005, "step": 11812 }, { "epoch": 3.1163434903047094, "grad_norm": 0.10627186298370361, "learning_rate": 4.590521410357865e-06, "loss": 0.0025, "step": 11814 }, { "epoch": 3.1168711251813743, "grad_norm": 0.4447920322418213, "learning_rate": 4.590169700167062e-06, "loss": 0.0042, "step": 11816 }, { "epoch": 3.1173987600580397, "grad_norm": 0.008457960560917854, "learning_rate": 4.589817989976259e-06, "loss": 0.0003, "step": 11818 }, { "epoch": 3.117926394934705, "grad_norm": 0.016605306416749954, "learning_rate": 4.589466279785457e-06, "loss": 0.0003, "step": 11820 }, { "epoch": 3.1184540298113705, "grad_norm": 0.007464071270078421, "learning_rate": 4.589114569594654e-06, "loss": 0.0003, "step": 11822 }, { "epoch": 3.118981664688036, "grad_norm": 0.3791021704673767, "learning_rate": 4.588762859403851e-06, "loss": 0.0044, "step": 11824 }, { "epoch": 3.1195092995647014, "grad_norm": 0.04336438700556755, "learning_rate": 4.588411149213049e-06, "loss": 0.0021, "step": 11826 }, { "epoch": 3.1200369344413668, "grad_norm": 0.2239460051059723, "learning_rate": 4.588059439022246e-06, "loss": 0.0075, "step": 11828 }, { "epoch": 3.1205645693180317, "grad_norm": 0.027373608201742172, "learning_rate": 4.5877077288314435e-06, "loss": 0.0008, "step": 11830 }, { "epoch": 3.121092204194697, "grad_norm": 0.012824363075196743, "learning_rate": 4.58735601864064e-06, "loss": 0.0003, "step": 11832 }, { "epoch": 3.1216198390713625, "grad_norm": 0.4618995487689972, "learning_rate": 4.5870043084498374e-06, "loss": 0.0057, "step": 11834 }, { "epoch": 3.122147473948028, "grad_norm": 0.020972376689314842, "learning_rate": 4.586652598259035e-06, "loss": 0.0006, "step": 11836 }, { "epoch": 3.1226751088246933, "grad_norm": 0.029987981542944908, "learning_rate": 4.586300888068232e-06, "loss": 0.003, "step": 11838 }, { "epoch": 3.1232027437013588, "grad_norm": 0.015909623354673386, "learning_rate": 4.585949177877429e-06, "loss": 0.0003, "step": 11840 }, { "epoch": 3.123730378578024, "grad_norm": 0.014948150143027306, "learning_rate": 4.585597467686626e-06, "loss": 0.0003, "step": 11842 }, { "epoch": 3.1242580134546896, "grad_norm": 0.13789843022823334, "learning_rate": 4.585245757495824e-06, "loss": 0.0074, "step": 11844 }, { "epoch": 3.1247856483313545, "grad_norm": 0.08473223447799683, "learning_rate": 4.584894047305021e-06, "loss": 0.0005, "step": 11846 }, { "epoch": 3.12531328320802, "grad_norm": 0.022016815841197968, "learning_rate": 4.584542337114218e-06, "loss": 0.0003, "step": 11848 }, { "epoch": 3.1258409180846853, "grad_norm": 0.023044241592288017, "learning_rate": 4.584190626923416e-06, "loss": 0.0006, "step": 11850 }, { "epoch": 3.1263685529613507, "grad_norm": 0.10786644369363785, "learning_rate": 4.5838389167326125e-06, "loss": 0.0018, "step": 11852 }, { "epoch": 3.126896187838016, "grad_norm": 0.028930118307471275, "learning_rate": 4.5834872065418095e-06, "loss": 0.0017, "step": 11854 }, { "epoch": 3.1274238227146816, "grad_norm": 0.02357054501771927, "learning_rate": 4.5831354963510064e-06, "loss": 0.0004, "step": 11856 }, { "epoch": 3.127951457591347, "grad_norm": 0.11490663141012192, "learning_rate": 4.582783786160204e-06, "loss": 0.0007, "step": 11858 }, { "epoch": 3.1284790924680124, "grad_norm": 0.004523042589426041, "learning_rate": 4.582432075969401e-06, "loss": 0.0002, "step": 11860 }, { "epoch": 3.1290067273446773, "grad_norm": 0.019899776205420494, "learning_rate": 4.582080365778598e-06, "loss": 0.0003, "step": 11862 }, { "epoch": 3.1295343622213427, "grad_norm": 0.006462560500949621, "learning_rate": 4.581728655587796e-06, "loss": 0.0004, "step": 11864 }, { "epoch": 3.130061997098008, "grad_norm": 0.01050974614918232, "learning_rate": 4.581376945396993e-06, "loss": 0.0003, "step": 11866 }, { "epoch": 3.1305896319746735, "grad_norm": 0.043225497007369995, "learning_rate": 4.581025235206191e-06, "loss": 0.0003, "step": 11868 }, { "epoch": 3.131117266851339, "grad_norm": 0.507511556148529, "learning_rate": 4.580673525015388e-06, "loss": 0.0073, "step": 11870 }, { "epoch": 3.1316449017280044, "grad_norm": 0.05213571712374687, "learning_rate": 4.580321814824585e-06, "loss": 0.0003, "step": 11872 }, { "epoch": 3.1321725366046698, "grad_norm": 0.005900316406041384, "learning_rate": 4.579970104633782e-06, "loss": 0.0002, "step": 11874 }, { "epoch": 3.1327001714813347, "grad_norm": 0.007352818269282579, "learning_rate": 4.5796183944429785e-06, "loss": 0.0003, "step": 11876 }, { "epoch": 3.133227806358, "grad_norm": 0.31842857599258423, "learning_rate": 4.579266684252176e-06, "loss": 0.0057, "step": 11878 }, { "epoch": 3.1337554412346655, "grad_norm": 0.12250304967164993, "learning_rate": 4.578914974061373e-06, "loss": 0.0119, "step": 11880 }, { "epoch": 3.134283076111331, "grad_norm": 0.017354894429445267, "learning_rate": 4.578563263870571e-06, "loss": 0.0003, "step": 11882 }, { "epoch": 3.1348107109879964, "grad_norm": 0.02783309854567051, "learning_rate": 4.578211553679768e-06, "loss": 0.006, "step": 11884 }, { "epoch": 3.1353383458646618, "grad_norm": 0.009068341925740242, "learning_rate": 4.577859843488965e-06, "loss": 0.0003, "step": 11886 }, { "epoch": 3.135865980741327, "grad_norm": 0.03819664567708969, "learning_rate": 4.577508133298163e-06, "loss": 0.0009, "step": 11888 }, { "epoch": 3.136393615617992, "grad_norm": 0.011617030017077923, "learning_rate": 4.57715642310736e-06, "loss": 0.0003, "step": 11890 }, { "epoch": 3.1369212504946575, "grad_norm": 0.10273388028144836, "learning_rate": 4.576804712916557e-06, "loss": 0.0038, "step": 11892 }, { "epoch": 3.137448885371323, "grad_norm": 0.12025600671768188, "learning_rate": 4.5764530027257545e-06, "loss": 0.0105, "step": 11894 }, { "epoch": 3.1379765202479883, "grad_norm": 0.075055330991745, "learning_rate": 4.576101292534951e-06, "loss": 0.0066, "step": 11896 }, { "epoch": 3.1385041551246537, "grad_norm": 0.4199475944042206, "learning_rate": 4.575749582344149e-06, "loss": 0.0017, "step": 11898 }, { "epoch": 3.139031790001319, "grad_norm": 0.034494880586862564, "learning_rate": 4.575397872153345e-06, "loss": 0.0003, "step": 11900 }, { "epoch": 3.1395594248779846, "grad_norm": 0.24847334623336792, "learning_rate": 4.575046161962543e-06, "loss": 0.0046, "step": 11902 }, { "epoch": 3.14008705975465, "grad_norm": 0.05067957565188408, "learning_rate": 4.57469445177174e-06, "loss": 0.0007, "step": 11904 }, { "epoch": 3.140614694631315, "grad_norm": 0.03534067049622536, "learning_rate": 4.574342741580937e-06, "loss": 0.0004, "step": 11906 }, { "epoch": 3.1411423295079803, "grad_norm": 0.015448996797204018, "learning_rate": 4.573991031390135e-06, "loss": 0.0008, "step": 11908 }, { "epoch": 3.1416699643846457, "grad_norm": 0.32520607113838196, "learning_rate": 4.573639321199332e-06, "loss": 0.0173, "step": 11910 }, { "epoch": 3.142197599261311, "grad_norm": 0.3425092399120331, "learning_rate": 4.57328761100853e-06, "loss": 0.002, "step": 11912 }, { "epoch": 3.1427252341379766, "grad_norm": 0.2107786387205124, "learning_rate": 4.572935900817726e-06, "loss": 0.0021, "step": 11914 }, { "epoch": 3.143252869014642, "grad_norm": 0.007083851844072342, "learning_rate": 4.5725841906269235e-06, "loss": 0.0003, "step": 11916 }, { "epoch": 3.1437805038913074, "grad_norm": 0.04024655371904373, "learning_rate": 4.5722324804361204e-06, "loss": 0.0004, "step": 11918 }, { "epoch": 3.1443081387679728, "grad_norm": 0.011426701210439205, "learning_rate": 4.571880770245318e-06, "loss": 0.0015, "step": 11920 }, { "epoch": 3.1448357736446377, "grad_norm": 0.497798889875412, "learning_rate": 4.571529060054515e-06, "loss": 0.0015, "step": 11922 }, { "epoch": 3.145363408521303, "grad_norm": 0.5606484413146973, "learning_rate": 4.571177349863712e-06, "loss": 0.0012, "step": 11924 }, { "epoch": 3.1458910433979685, "grad_norm": 0.09222064167261124, "learning_rate": 4.57082563967291e-06, "loss": 0.0015, "step": 11926 }, { "epoch": 3.146418678274634, "grad_norm": 0.009872084483504295, "learning_rate": 4.570473929482107e-06, "loss": 0.0003, "step": 11928 }, { "epoch": 3.1469463131512994, "grad_norm": 0.005426726303994656, "learning_rate": 4.570122219291304e-06, "loss": 0.0003, "step": 11930 }, { "epoch": 3.1474739480279648, "grad_norm": 0.009878110140562057, "learning_rate": 4.569770509100502e-06, "loss": 0.0003, "step": 11932 }, { "epoch": 3.14800158290463, "grad_norm": 0.06539720296859741, "learning_rate": 4.569418798909699e-06, "loss": 0.0005, "step": 11934 }, { "epoch": 3.148529217781295, "grad_norm": 0.026434876024723053, "learning_rate": 4.5690670887188956e-06, "loss": 0.0004, "step": 11936 }, { "epoch": 3.1490568526579605, "grad_norm": 0.3377101719379425, "learning_rate": 4.5687153785280925e-06, "loss": 0.0007, "step": 11938 }, { "epoch": 3.149584487534626, "grad_norm": 0.0033795335330069065, "learning_rate": 4.56836366833729e-06, "loss": 0.0003, "step": 11940 }, { "epoch": 3.1501121224112913, "grad_norm": 0.004340365994721651, "learning_rate": 4.568011958146487e-06, "loss": 0.0007, "step": 11942 }, { "epoch": 3.1506397572879568, "grad_norm": 0.2667657434940338, "learning_rate": 4.567660247955684e-06, "loss": 0.0018, "step": 11944 }, { "epoch": 3.151167392164622, "grad_norm": 0.00480148009955883, "learning_rate": 4.567308537764882e-06, "loss": 0.0002, "step": 11946 }, { "epoch": 3.1516950270412876, "grad_norm": 0.10532885044813156, "learning_rate": 4.566956827574079e-06, "loss": 0.0018, "step": 11948 }, { "epoch": 3.152222661917953, "grad_norm": 0.013638848438858986, "learning_rate": 4.566605117383277e-06, "loss": 0.0024, "step": 11950 }, { "epoch": 3.152750296794618, "grad_norm": 0.006095441523939371, "learning_rate": 4.566253407192474e-06, "loss": 0.0003, "step": 11952 }, { "epoch": 3.1532779316712833, "grad_norm": 0.6325020790100098, "learning_rate": 4.565901697001671e-06, "loss": 0.0055, "step": 11954 }, { "epoch": 3.1538055665479487, "grad_norm": 0.43443334102630615, "learning_rate": 4.5655499868108685e-06, "loss": 0.009, "step": 11956 }, { "epoch": 3.154333201424614, "grad_norm": 0.022037187591195107, "learning_rate": 4.565198276620065e-06, "loss": 0.0003, "step": 11958 }, { "epoch": 3.1548608363012796, "grad_norm": 0.007669782266020775, "learning_rate": 4.564846566429262e-06, "loss": 0.0003, "step": 11960 }, { "epoch": 3.155388471177945, "grad_norm": 0.0164104625582695, "learning_rate": 4.564494856238459e-06, "loss": 0.0003, "step": 11962 }, { "epoch": 3.1559161060546104, "grad_norm": 0.012530534528195858, "learning_rate": 4.564143146047657e-06, "loss": 0.0007, "step": 11964 }, { "epoch": 3.1564437409312758, "grad_norm": 0.00988401286303997, "learning_rate": 4.563791435856854e-06, "loss": 0.0003, "step": 11966 }, { "epoch": 3.1569713758079407, "grad_norm": 0.017257215455174446, "learning_rate": 4.563439725666051e-06, "loss": 0.0046, "step": 11968 }, { "epoch": 3.157499010684606, "grad_norm": 0.039436254650354385, "learning_rate": 4.563088015475249e-06, "loss": 0.0008, "step": 11970 }, { "epoch": 3.1580266455612716, "grad_norm": 0.06967024505138397, "learning_rate": 4.562736305284446e-06, "loss": 0.0023, "step": 11972 }, { "epoch": 3.158554280437937, "grad_norm": 0.12148451060056686, "learning_rate": 4.562384595093643e-06, "loss": 0.0015, "step": 11974 }, { "epoch": 3.1590819153146024, "grad_norm": 0.04822390154004097, "learning_rate": 4.56203288490284e-06, "loss": 0.0005, "step": 11976 }, { "epoch": 3.1596095501912678, "grad_norm": 0.010638812556862831, "learning_rate": 4.5616811747120375e-06, "loss": 0.0003, "step": 11978 }, { "epoch": 3.160137185067933, "grad_norm": 0.008199099451303482, "learning_rate": 4.561329464521235e-06, "loss": 0.0003, "step": 11980 }, { "epoch": 3.160664819944598, "grad_norm": 0.043073032051324844, "learning_rate": 4.560977754330431e-06, "loss": 0.0004, "step": 11982 }, { "epoch": 3.1611924548212635, "grad_norm": 0.15437312424182892, "learning_rate": 4.560626044139629e-06, "loss": 0.0006, "step": 11984 }, { "epoch": 3.161720089697929, "grad_norm": 0.01862935535609722, "learning_rate": 4.560274333948826e-06, "loss": 0.003, "step": 11986 }, { "epoch": 3.1622477245745944, "grad_norm": 0.17579610645771027, "learning_rate": 4.559922623758024e-06, "loss": 0.0113, "step": 11988 }, { "epoch": 3.1627753594512598, "grad_norm": 0.16167664527893066, "learning_rate": 4.559570913567221e-06, "loss": 0.0006, "step": 11990 }, { "epoch": 3.163302994327925, "grad_norm": 0.3542146384716034, "learning_rate": 4.559219203376418e-06, "loss": 0.0052, "step": 11992 }, { "epoch": 3.1638306292045906, "grad_norm": 0.003440537955611944, "learning_rate": 4.558867493185616e-06, "loss": 0.0002, "step": 11994 }, { "epoch": 3.164358264081256, "grad_norm": 0.004985513631254435, "learning_rate": 4.558515782994812e-06, "loss": 0.0002, "step": 11996 }, { "epoch": 3.164885898957921, "grad_norm": 0.006369994021952152, "learning_rate": 4.5581640728040095e-06, "loss": 0.0008, "step": 11998 }, { "epoch": 3.1654135338345863, "grad_norm": 0.055853471159935, "learning_rate": 4.5578123626132065e-06, "loss": 0.0073, "step": 12000 }, { "epoch": 3.1654135338345863, "eval_loss": 0.0024942560121417046, "eval_runtime": 304.9182, "eval_samples_per_second": 707.209, "eval_steps_per_second": 88.404, "step": 12000 }, { "epoch": 3.1659411687112518, "grad_norm": 0.003972657956182957, "learning_rate": 4.557460652422404e-06, "loss": 0.0072, "step": 12002 }, { "epoch": 3.166468803587917, "grad_norm": 0.2047801911830902, "learning_rate": 4.557108942231601e-06, "loss": 0.0006, "step": 12004 }, { "epoch": 3.1669964384645826, "grad_norm": 0.15088419616222382, "learning_rate": 4.556757232040798e-06, "loss": 0.0035, "step": 12006 }, { "epoch": 3.167524073341248, "grad_norm": 0.1528642773628235, "learning_rate": 4.556405521849996e-06, "loss": 0.0008, "step": 12008 }, { "epoch": 3.1680517082179134, "grad_norm": 0.07217489928007126, "learning_rate": 4.556053811659193e-06, "loss": 0.0004, "step": 12010 }, { "epoch": 3.168579343094579, "grad_norm": 0.02486291527748108, "learning_rate": 4.55570210146839e-06, "loss": 0.0004, "step": 12012 }, { "epoch": 3.1691069779712437, "grad_norm": 0.013499182648956776, "learning_rate": 4.555350391277588e-06, "loss": 0.0003, "step": 12014 }, { "epoch": 3.169634612847909, "grad_norm": 0.05056057125329971, "learning_rate": 4.554998681086785e-06, "loss": 0.0003, "step": 12016 }, { "epoch": 3.1701622477245746, "grad_norm": 0.03110519051551819, "learning_rate": 4.5546469708959825e-06, "loss": 0.0065, "step": 12018 }, { "epoch": 3.17068988260124, "grad_norm": 0.3263017237186432, "learning_rate": 4.5542952607051786e-06, "loss": 0.0011, "step": 12020 }, { "epoch": 3.1712175174779054, "grad_norm": 0.02797752432525158, "learning_rate": 4.553943550514376e-06, "loss": 0.0003, "step": 12022 }, { "epoch": 3.1717451523545708, "grad_norm": 0.0051276348531246185, "learning_rate": 4.553591840323573e-06, "loss": 0.0015, "step": 12024 }, { "epoch": 3.172272787231236, "grad_norm": 0.005805745720863342, "learning_rate": 4.55324013013277e-06, "loss": 0.0045, "step": 12026 }, { "epoch": 3.172800422107901, "grad_norm": 0.00394471175968647, "learning_rate": 4.552888419941968e-06, "loss": 0.0003, "step": 12028 }, { "epoch": 3.1733280569845665, "grad_norm": 0.04226144030690193, "learning_rate": 4.552536709751165e-06, "loss": 0.0003, "step": 12030 }, { "epoch": 3.173855691861232, "grad_norm": 0.09816519916057587, "learning_rate": 4.552184999560363e-06, "loss": 0.0011, "step": 12032 }, { "epoch": 3.1743833267378974, "grad_norm": 0.045871905982494354, "learning_rate": 4.551833289369559e-06, "loss": 0.0047, "step": 12034 }, { "epoch": 3.1749109616145628, "grad_norm": 0.06560999900102615, "learning_rate": 4.551481579178757e-06, "loss": 0.0009, "step": 12036 }, { "epoch": 3.175438596491228, "grad_norm": 0.032321151345968246, "learning_rate": 4.5511298689879545e-06, "loss": 0.0005, "step": 12038 }, { "epoch": 3.1759662313678936, "grad_norm": 0.013524599373340607, "learning_rate": 4.5507781587971515e-06, "loss": 0.0066, "step": 12040 }, { "epoch": 3.1764938662445585, "grad_norm": 0.12694218754768372, "learning_rate": 4.5504264486063484e-06, "loss": 0.0084, "step": 12042 }, { "epoch": 3.177021501121224, "grad_norm": 0.010668179951608181, "learning_rate": 4.550074738415545e-06, "loss": 0.0003, "step": 12044 }, { "epoch": 3.1775491359978894, "grad_norm": 0.08444081246852875, "learning_rate": 4.549723028224743e-06, "loss": 0.0004, "step": 12046 }, { "epoch": 3.1780767708745548, "grad_norm": 0.33587661385536194, "learning_rate": 4.54937131803394e-06, "loss": 0.0084, "step": 12048 }, { "epoch": 3.17860440575122, "grad_norm": 0.024005411192774773, "learning_rate": 4.549019607843137e-06, "loss": 0.0003, "step": 12050 }, { "epoch": 3.1791320406278856, "grad_norm": 0.7539935111999512, "learning_rate": 4.548667897652335e-06, "loss": 0.0073, "step": 12052 }, { "epoch": 3.179659675504551, "grad_norm": 0.030483419075608253, "learning_rate": 4.548316187461532e-06, "loss": 0.0032, "step": 12054 }, { "epoch": 3.1801873103812164, "grad_norm": 0.41267892718315125, "learning_rate": 4.547964477270729e-06, "loss": 0.0037, "step": 12056 }, { "epoch": 3.1807149452578813, "grad_norm": 0.048512909561395645, "learning_rate": 4.547612767079926e-06, "loss": 0.0005, "step": 12058 }, { "epoch": 3.1812425801345467, "grad_norm": 0.01884591393172741, "learning_rate": 4.5472610568891235e-06, "loss": 0.0036, "step": 12060 }, { "epoch": 3.181770215011212, "grad_norm": 0.009793806821107864, "learning_rate": 4.546909346698321e-06, "loss": 0.0003, "step": 12062 }, { "epoch": 3.1822978498878776, "grad_norm": 0.2652548849582672, "learning_rate": 4.5465576365075174e-06, "loss": 0.0013, "step": 12064 }, { "epoch": 3.182825484764543, "grad_norm": 0.05406182259321213, "learning_rate": 4.546205926316715e-06, "loss": 0.0044, "step": 12066 }, { "epoch": 3.1833531196412084, "grad_norm": 0.655081570148468, "learning_rate": 4.545854216125912e-06, "loss": 0.0029, "step": 12068 }, { "epoch": 3.183880754517874, "grad_norm": 0.13876929879188538, "learning_rate": 4.54550250593511e-06, "loss": 0.008, "step": 12070 }, { "epoch": 3.184408389394539, "grad_norm": 0.03326795995235443, "learning_rate": 4.545150795744307e-06, "loss": 0.0006, "step": 12072 }, { "epoch": 3.184936024271204, "grad_norm": 0.05011816695332527, "learning_rate": 4.544799085553504e-06, "loss": 0.0005, "step": 12074 }, { "epoch": 3.1854636591478696, "grad_norm": 0.020993497222661972, "learning_rate": 4.544447375362702e-06, "loss": 0.0003, "step": 12076 }, { "epoch": 3.185991294024535, "grad_norm": 0.38640084862709045, "learning_rate": 4.544095665171899e-06, "loss": 0.0024, "step": 12078 }, { "epoch": 3.1865189289012004, "grad_norm": 0.13190805912017822, "learning_rate": 4.543743954981096e-06, "loss": 0.0046, "step": 12080 }, { "epoch": 3.1870465637778658, "grad_norm": 0.4203978478908539, "learning_rate": 4.5433922447902926e-06, "loss": 0.0052, "step": 12082 }, { "epoch": 3.187574198654531, "grad_norm": 0.05952928587794304, "learning_rate": 4.54304053459949e-06, "loss": 0.0008, "step": 12084 }, { "epoch": 3.1881018335311966, "grad_norm": 0.03477388620376587, "learning_rate": 4.542688824408687e-06, "loss": 0.0004, "step": 12086 }, { "epoch": 3.1886294684078615, "grad_norm": 0.01690315641462803, "learning_rate": 4.542337114217884e-06, "loss": 0.0004, "step": 12088 }, { "epoch": 3.189157103284527, "grad_norm": 0.0334550216794014, "learning_rate": 4.541985404027082e-06, "loss": 0.0053, "step": 12090 }, { "epoch": 3.1896847381611924, "grad_norm": 0.04744865000247955, "learning_rate": 4.541633693836279e-06, "loss": 0.0004, "step": 12092 }, { "epoch": 3.1902123730378578, "grad_norm": 0.11696913093328476, "learning_rate": 4.541281983645476e-06, "loss": 0.0008, "step": 12094 }, { "epoch": 3.190740007914523, "grad_norm": 0.1003052368760109, "learning_rate": 4.540930273454674e-06, "loss": 0.0039, "step": 12096 }, { "epoch": 3.1912676427911886, "grad_norm": 0.012044078670442104, "learning_rate": 4.540578563263871e-06, "loss": 0.0003, "step": 12098 }, { "epoch": 3.191795277667854, "grad_norm": 0.0998350977897644, "learning_rate": 4.5402268530730685e-06, "loss": 0.0006, "step": 12100 }, { "epoch": 3.1923229125445194, "grad_norm": 0.1442168653011322, "learning_rate": 4.539875142882265e-06, "loss": 0.001, "step": 12102 }, { "epoch": 3.1928505474211843, "grad_norm": 0.00846646074205637, "learning_rate": 4.539523432691462e-06, "loss": 0.0025, "step": 12104 }, { "epoch": 3.1933781822978498, "grad_norm": 0.029308151453733444, "learning_rate": 4.539171722500659e-06, "loss": 0.0003, "step": 12106 }, { "epoch": 3.193905817174515, "grad_norm": 0.2563890516757965, "learning_rate": 4.538820012309857e-06, "loss": 0.0006, "step": 12108 }, { "epoch": 3.1944334520511806, "grad_norm": 0.31608501076698303, "learning_rate": 4.538468302119054e-06, "loss": 0.0041, "step": 12110 }, { "epoch": 3.194961086927846, "grad_norm": 0.004902812652289867, "learning_rate": 4.538116591928251e-06, "loss": 0.0003, "step": 12112 }, { "epoch": 3.1954887218045114, "grad_norm": 0.3779292106628418, "learning_rate": 4.537764881737449e-06, "loss": 0.0147, "step": 12114 }, { "epoch": 3.196016356681177, "grad_norm": 0.1300140917301178, "learning_rate": 4.537413171546645e-06, "loss": 0.0008, "step": 12116 }, { "epoch": 3.196543991557842, "grad_norm": 0.04601786658167839, "learning_rate": 4.537061461355843e-06, "loss": 0.0009, "step": 12118 }, { "epoch": 3.197071626434507, "grad_norm": 0.051678337156772614, "learning_rate": 4.536709751165041e-06, "loss": 0.0007, "step": 12120 }, { "epoch": 3.1975992613111726, "grad_norm": 0.03772282227873802, "learning_rate": 4.5363580409742375e-06, "loss": 0.0014, "step": 12122 }, { "epoch": 3.198126896187838, "grad_norm": 0.09329594671726227, "learning_rate": 4.5360063307834345e-06, "loss": 0.0004, "step": 12124 }, { "epoch": 3.1986545310645034, "grad_norm": 0.00826345756649971, "learning_rate": 4.5356546205926314e-06, "loss": 0.0002, "step": 12126 }, { "epoch": 3.1991821659411688, "grad_norm": 0.009421267546713352, "learning_rate": 4.535302910401829e-06, "loss": 0.0003, "step": 12128 }, { "epoch": 3.199709800817834, "grad_norm": 0.005443992093205452, "learning_rate": 4.534951200211026e-06, "loss": 0.0008, "step": 12130 }, { "epoch": 3.2002374356944996, "grad_norm": 0.03435441106557846, "learning_rate": 4.534599490020223e-06, "loss": 0.0004, "step": 12132 }, { "epoch": 3.2007650705711646, "grad_norm": 0.40295398235321045, "learning_rate": 4.534247779829421e-06, "loss": 0.0011, "step": 12134 }, { "epoch": 3.20129270544783, "grad_norm": 0.16586782038211823, "learning_rate": 4.533896069638618e-06, "loss": 0.0021, "step": 12136 }, { "epoch": 3.2018203403244954, "grad_norm": 0.04256581515073776, "learning_rate": 4.533544359447816e-06, "loss": 0.0006, "step": 12138 }, { "epoch": 3.2023479752011608, "grad_norm": 0.19579027593135834, "learning_rate": 4.533192649257012e-06, "loss": 0.001, "step": 12140 }, { "epoch": 3.202875610077826, "grad_norm": 0.05582655593752861, "learning_rate": 4.53284093906621e-06, "loss": 0.0005, "step": 12142 }, { "epoch": 3.2034032449544916, "grad_norm": 0.24015448987483978, "learning_rate": 4.5324892288754065e-06, "loss": 0.0109, "step": 12144 }, { "epoch": 3.203930879831157, "grad_norm": 0.07875104993581772, "learning_rate": 4.5321375186846035e-06, "loss": 0.0004, "step": 12146 }, { "epoch": 3.2044585147078224, "grad_norm": 0.04837461933493614, "learning_rate": 4.531785808493801e-06, "loss": 0.0005, "step": 12148 }, { "epoch": 3.2049861495844874, "grad_norm": 0.04726254567503929, "learning_rate": 4.531434098302998e-06, "loss": 0.0004, "step": 12150 }, { "epoch": 3.2055137844611528, "grad_norm": 0.02860550582408905, "learning_rate": 4.531082388112196e-06, "loss": 0.0005, "step": 12152 }, { "epoch": 3.206041419337818, "grad_norm": 0.3308354616165161, "learning_rate": 4.530730677921393e-06, "loss": 0.0025, "step": 12154 }, { "epoch": 3.2065690542144836, "grad_norm": 0.46924299001693726, "learning_rate": 4.53037896773059e-06, "loss": 0.0157, "step": 12156 }, { "epoch": 3.207096689091149, "grad_norm": 0.004118442069739103, "learning_rate": 4.530027257539788e-06, "loss": 0.0005, "step": 12158 }, { "epoch": 3.2076243239678144, "grad_norm": 0.005075207911431789, "learning_rate": 4.529675547348985e-06, "loss": 0.0024, "step": 12160 }, { "epoch": 3.20815195884448, "grad_norm": 0.1167229637503624, "learning_rate": 4.529323837158182e-06, "loss": 0.0006, "step": 12162 }, { "epoch": 3.208679593721145, "grad_norm": 0.057026319205760956, "learning_rate": 4.528972126967379e-06, "loss": 0.0023, "step": 12164 }, { "epoch": 3.20920722859781, "grad_norm": 0.036732908338308334, "learning_rate": 4.528620416776576e-06, "loss": 0.0004, "step": 12166 }, { "epoch": 3.2097348634744756, "grad_norm": 0.043929573148489, "learning_rate": 4.528268706585773e-06, "loss": 0.0004, "step": 12168 }, { "epoch": 3.210262498351141, "grad_norm": 0.028081052005290985, "learning_rate": 4.52791699639497e-06, "loss": 0.0003, "step": 12170 }, { "epoch": 3.2107901332278064, "grad_norm": 0.4678610861301422, "learning_rate": 4.527565286204168e-06, "loss": 0.0027, "step": 12172 }, { "epoch": 3.211317768104472, "grad_norm": 0.29073017835617065, "learning_rate": 4.527213576013365e-06, "loss": 0.001, "step": 12174 }, { "epoch": 3.211845402981137, "grad_norm": 0.024130407720804214, "learning_rate": 4.526861865822562e-06, "loss": 0.0006, "step": 12176 }, { "epoch": 3.2123730378578026, "grad_norm": 0.5166576504707336, "learning_rate": 4.52651015563176e-06, "loss": 0.01, "step": 12178 }, { "epoch": 3.2129006727344676, "grad_norm": 0.02251741662621498, "learning_rate": 4.526158445440957e-06, "loss": 0.0003, "step": 12180 }, { "epoch": 3.213428307611133, "grad_norm": 0.017221063375473022, "learning_rate": 4.5258067352501546e-06, "loss": 0.0004, "step": 12182 }, { "epoch": 3.2139559424877984, "grad_norm": 0.005131820682436228, "learning_rate": 4.525455025059351e-06, "loss": 0.0003, "step": 12184 }, { "epoch": 3.2144835773644638, "grad_norm": 0.005381292197853327, "learning_rate": 4.5251033148685485e-06, "loss": 0.0003, "step": 12186 }, { "epoch": 3.215011212241129, "grad_norm": 0.05618206411600113, "learning_rate": 4.5247516046777454e-06, "loss": 0.0004, "step": 12188 }, { "epoch": 3.2155388471177946, "grad_norm": 0.045138902962207794, "learning_rate": 4.524399894486943e-06, "loss": 0.0003, "step": 12190 }, { "epoch": 3.21606648199446, "grad_norm": 0.3067617118358612, "learning_rate": 4.52404818429614e-06, "loss": 0.0062, "step": 12192 }, { "epoch": 3.216594116871125, "grad_norm": 0.043869681656360626, "learning_rate": 4.523696474105337e-06, "loss": 0.0006, "step": 12194 }, { "epoch": 3.2171217517477904, "grad_norm": 0.4789096415042877, "learning_rate": 4.523344763914535e-06, "loss": 0.0093, "step": 12196 }, { "epoch": 3.2176493866244558, "grad_norm": 0.3820975720882416, "learning_rate": 4.522993053723732e-06, "loss": 0.0023, "step": 12198 }, { "epoch": 3.218177021501121, "grad_norm": 0.004364043474197388, "learning_rate": 4.522641343532929e-06, "loss": 0.0003, "step": 12200 }, { "epoch": 3.2187046563777866, "grad_norm": 0.36941948533058167, "learning_rate": 4.522289633342126e-06, "loss": 0.0013, "step": 12202 }, { "epoch": 3.219232291254452, "grad_norm": 0.16413448750972748, "learning_rate": 4.521937923151324e-06, "loss": 0.0108, "step": 12204 }, { "epoch": 3.2197599261311174, "grad_norm": 0.009961903095245361, "learning_rate": 4.5215862129605205e-06, "loss": 0.0005, "step": 12206 }, { "epoch": 3.220287561007783, "grad_norm": 0.008177239447832108, "learning_rate": 4.5212345027697175e-06, "loss": 0.0003, "step": 12208 }, { "epoch": 3.2208151958844478, "grad_norm": 0.03617989644408226, "learning_rate": 4.520882792578915e-06, "loss": 0.0004, "step": 12210 }, { "epoch": 3.221342830761113, "grad_norm": 0.08805456757545471, "learning_rate": 4.520531082388112e-06, "loss": 0.0005, "step": 12212 }, { "epoch": 3.2218704656377786, "grad_norm": 0.011055554263293743, "learning_rate": 4.520179372197309e-06, "loss": 0.0041, "step": 12214 }, { "epoch": 3.222398100514444, "grad_norm": 0.03317518159747124, "learning_rate": 4.519827662006507e-06, "loss": 0.0007, "step": 12216 }, { "epoch": 3.2229257353911094, "grad_norm": 0.008274760097265244, "learning_rate": 4.519475951815704e-06, "loss": 0.0016, "step": 12218 }, { "epoch": 3.223453370267775, "grad_norm": 0.2660207152366638, "learning_rate": 4.519124241624902e-06, "loss": 0.0077, "step": 12220 }, { "epoch": 3.22398100514444, "grad_norm": 0.159395232796669, "learning_rate": 4.518772531434098e-06, "loss": 0.0013, "step": 12222 }, { "epoch": 3.2245086400211056, "grad_norm": 0.12194986641407013, "learning_rate": 4.518420821243296e-06, "loss": 0.0014, "step": 12224 }, { "epoch": 3.2250362748977706, "grad_norm": 0.018128003925085068, "learning_rate": 4.518069111052493e-06, "loss": 0.0003, "step": 12226 }, { "epoch": 3.225563909774436, "grad_norm": 0.2806764543056488, "learning_rate": 4.51771740086169e-06, "loss": 0.0007, "step": 12228 }, { "epoch": 3.2260915446511014, "grad_norm": 0.006340950261801481, "learning_rate": 4.517365690670887e-06, "loss": 0.0003, "step": 12230 }, { "epoch": 3.226619179527767, "grad_norm": 0.004941755905747414, "learning_rate": 4.517013980480084e-06, "loss": 0.0017, "step": 12232 }, { "epoch": 3.227146814404432, "grad_norm": 0.04561445116996765, "learning_rate": 4.516662270289282e-06, "loss": 0.0005, "step": 12234 }, { "epoch": 3.2276744492810976, "grad_norm": 0.03435968980193138, "learning_rate": 4.516310560098479e-06, "loss": 0.0029, "step": 12236 }, { "epoch": 3.228202084157763, "grad_norm": 0.09518557041883469, "learning_rate": 4.515958849907676e-06, "loss": 0.0006, "step": 12238 }, { "epoch": 3.228729719034428, "grad_norm": 0.028886394575238228, "learning_rate": 4.515607139716874e-06, "loss": 0.0004, "step": 12240 }, { "epoch": 3.2292573539110934, "grad_norm": 0.06986699253320694, "learning_rate": 4.515255429526071e-06, "loss": 0.0019, "step": 12242 }, { "epoch": 3.2297849887877588, "grad_norm": 0.02283160760998726, "learning_rate": 4.514903719335268e-06, "loss": 0.0005, "step": 12244 }, { "epoch": 3.230312623664424, "grad_norm": 0.42512184381484985, "learning_rate": 4.514552009144465e-06, "loss": 0.0017, "step": 12246 }, { "epoch": 3.2308402585410896, "grad_norm": 0.9032061100006104, "learning_rate": 4.5142002989536625e-06, "loss": 0.0086, "step": 12248 }, { "epoch": 3.231367893417755, "grad_norm": 0.13230179250240326, "learning_rate": 4.513848588762859e-06, "loss": 0.0007, "step": 12250 }, { "epoch": 3.2318955282944204, "grad_norm": 0.007920272648334503, "learning_rate": 4.513496878572056e-06, "loss": 0.0003, "step": 12252 }, { "epoch": 3.232423163171086, "grad_norm": 0.02715369500219822, "learning_rate": 4.513145168381254e-06, "loss": 0.0008, "step": 12254 }, { "epoch": 3.2329507980477508, "grad_norm": 0.0065386793576180935, "learning_rate": 4.512793458190451e-06, "loss": 0.0003, "step": 12256 }, { "epoch": 3.233478432924416, "grad_norm": 0.08557728677988052, "learning_rate": 4.512441747999649e-06, "loss": 0.001, "step": 12258 }, { "epoch": 3.2340060678010816, "grad_norm": 0.009046108461916447, "learning_rate": 4.512090037808845e-06, "loss": 0.0017, "step": 12260 }, { "epoch": 3.234533702677747, "grad_norm": 0.40728524327278137, "learning_rate": 4.511738327618043e-06, "loss": 0.002, "step": 12262 }, { "epoch": 3.2350613375544124, "grad_norm": 0.23514993488788605, "learning_rate": 4.511386617427241e-06, "loss": 0.0011, "step": 12264 }, { "epoch": 3.235588972431078, "grad_norm": 0.005489823874086142, "learning_rate": 4.511034907236437e-06, "loss": 0.0002, "step": 12266 }, { "epoch": 3.236116607307743, "grad_norm": 0.03948730602860451, "learning_rate": 4.5106831970456345e-06, "loss": 0.0003, "step": 12268 }, { "epoch": 3.2366442421844086, "grad_norm": 0.0053426423110067844, "learning_rate": 4.5103314868548315e-06, "loss": 0.0005, "step": 12270 }, { "epoch": 3.2371718770610736, "grad_norm": 0.00628934521228075, "learning_rate": 4.509979776664029e-06, "loss": 0.0003, "step": 12272 }, { "epoch": 3.237699511937739, "grad_norm": 0.5524286031723022, "learning_rate": 4.509628066473226e-06, "loss": 0.0065, "step": 12274 }, { "epoch": 3.2382271468144044, "grad_norm": 0.011556659825146198, "learning_rate": 4.509276356282423e-06, "loss": 0.0009, "step": 12276 }, { "epoch": 3.23875478169107, "grad_norm": 0.10376019775867462, "learning_rate": 4.508924646091621e-06, "loss": 0.0018, "step": 12278 }, { "epoch": 3.239282416567735, "grad_norm": 2.1094822883605957, "learning_rate": 4.508572935900818e-06, "loss": 0.0071, "step": 12280 }, { "epoch": 3.2398100514444006, "grad_norm": 0.03431997448205948, "learning_rate": 4.508221225710015e-06, "loss": 0.0004, "step": 12282 }, { "epoch": 3.240337686321066, "grad_norm": 0.4473435580730438, "learning_rate": 4.507869515519212e-06, "loss": 0.0012, "step": 12284 }, { "epoch": 3.240865321197731, "grad_norm": 0.12101785093545914, "learning_rate": 4.50751780532841e-06, "loss": 0.0007, "step": 12286 }, { "epoch": 3.2413929560743964, "grad_norm": 0.0298038087785244, "learning_rate": 4.5071660951376074e-06, "loss": 0.0003, "step": 12288 }, { "epoch": 3.2419205909510618, "grad_norm": 0.014196700416505337, "learning_rate": 4.5068143849468036e-06, "loss": 0.0003, "step": 12290 }, { "epoch": 3.242448225827727, "grad_norm": 0.08806198090314865, "learning_rate": 4.506462674756001e-06, "loss": 0.0023, "step": 12292 }, { "epoch": 3.2429758607043926, "grad_norm": 0.14391832053661346, "learning_rate": 4.506110964565198e-06, "loss": 0.0023, "step": 12294 }, { "epoch": 3.243503495581058, "grad_norm": 0.036810245364904404, "learning_rate": 4.505759254374395e-06, "loss": 0.0005, "step": 12296 }, { "epoch": 3.2440311304577234, "grad_norm": 0.21496978402137756, "learning_rate": 4.505407544183593e-06, "loss": 0.0079, "step": 12298 }, { "epoch": 3.244558765334389, "grad_norm": 0.1389593780040741, "learning_rate": 4.50505583399279e-06, "loss": 0.0081, "step": 12300 }, { "epoch": 3.2450864002110538, "grad_norm": 0.16342692077159882, "learning_rate": 4.504704123801988e-06, "loss": 0.0016, "step": 12302 }, { "epoch": 3.245614035087719, "grad_norm": 0.004394391551613808, "learning_rate": 4.504352413611184e-06, "loss": 0.0003, "step": 12304 }, { "epoch": 3.2461416699643846, "grad_norm": 0.4682377278804779, "learning_rate": 4.504000703420382e-06, "loss": 0.0121, "step": 12306 }, { "epoch": 3.24666930484105, "grad_norm": 0.008823269978165627, "learning_rate": 4.503648993229579e-06, "loss": 0.0003, "step": 12308 }, { "epoch": 3.2471969397177154, "grad_norm": 0.0038006131071597338, "learning_rate": 4.5032972830387765e-06, "loss": 0.0003, "step": 12310 }, { "epoch": 3.247724574594381, "grad_norm": 0.6431499123573303, "learning_rate": 4.502945572847973e-06, "loss": 0.0091, "step": 12312 }, { "epoch": 3.248252209471046, "grad_norm": 0.08740153908729553, "learning_rate": 4.50259386265717e-06, "loss": 0.0009, "step": 12314 }, { "epoch": 3.2487798443477116, "grad_norm": 0.008329818025231361, "learning_rate": 4.502242152466368e-06, "loss": 0.0005, "step": 12316 }, { "epoch": 3.2493074792243766, "grad_norm": 0.006171060726046562, "learning_rate": 4.501890442275565e-06, "loss": 0.0129, "step": 12318 }, { "epoch": 3.249835114101042, "grad_norm": 0.6428855061531067, "learning_rate": 4.501538732084762e-06, "loss": 0.0055, "step": 12320 }, { "epoch": 3.2503627489777074, "grad_norm": 0.26387524604797363, "learning_rate": 4.50118702189396e-06, "loss": 0.0135, "step": 12322 }, { "epoch": 3.250890383854373, "grad_norm": 0.020973920822143555, "learning_rate": 4.500835311703157e-06, "loss": 0.0003, "step": 12324 }, { "epoch": 3.251418018731038, "grad_norm": 0.021229758858680725, "learning_rate": 4.500483601512354e-06, "loss": 0.0003, "step": 12326 }, { "epoch": 3.2519456536077036, "grad_norm": 0.04558781906962395, "learning_rate": 4.500131891321551e-06, "loss": 0.0004, "step": 12328 }, { "epoch": 3.252473288484369, "grad_norm": 0.016603637486696243, "learning_rate": 4.4997801811307485e-06, "loss": 0.0003, "step": 12330 }, { "epoch": 3.253000923361034, "grad_norm": 0.09993703663349152, "learning_rate": 4.4994284709399455e-06, "loss": 0.0022, "step": 12332 }, { "epoch": 3.2535285582376994, "grad_norm": 0.11460095643997192, "learning_rate": 4.4990767607491424e-06, "loss": 0.0006, "step": 12334 }, { "epoch": 3.254056193114365, "grad_norm": 0.22470995783805847, "learning_rate": 4.49872505055834e-06, "loss": 0.0043, "step": 12336 }, { "epoch": 3.25458382799103, "grad_norm": 0.10491317510604858, "learning_rate": 4.498373340367537e-06, "loss": 0.0006, "step": 12338 }, { "epoch": 3.2551114628676956, "grad_norm": 0.02253032475709915, "learning_rate": 4.498021630176735e-06, "loss": 0.0017, "step": 12340 }, { "epoch": 3.255639097744361, "grad_norm": 0.011095419526100159, "learning_rate": 4.497669919985931e-06, "loss": 0.0092, "step": 12342 }, { "epoch": 3.2561667326210264, "grad_norm": 0.047018345445394516, "learning_rate": 4.497318209795129e-06, "loss": 0.0005, "step": 12344 }, { "epoch": 3.2566943674976914, "grad_norm": 0.010189000517129898, "learning_rate": 4.496966499604327e-06, "loss": 0.0004, "step": 12346 }, { "epoch": 3.2572220023743568, "grad_norm": 0.2662130296230316, "learning_rate": 4.496614789413524e-06, "loss": 0.0007, "step": 12348 }, { "epoch": 3.257749637251022, "grad_norm": 0.027968699112534523, "learning_rate": 4.496263079222721e-06, "loss": 0.0011, "step": 12350 }, { "epoch": 3.2582772721276876, "grad_norm": 0.12705563008785248, "learning_rate": 4.4959113690319175e-06, "loss": 0.0007, "step": 12352 }, { "epoch": 3.258804907004353, "grad_norm": 0.04816604033112526, "learning_rate": 4.495559658841115e-06, "loss": 0.0008, "step": 12354 }, { "epoch": 3.2593325418810184, "grad_norm": 0.010438727214932442, "learning_rate": 4.495207948650312e-06, "loss": 0.0003, "step": 12356 }, { "epoch": 3.259860176757684, "grad_norm": 0.008083304390311241, "learning_rate": 4.494856238459509e-06, "loss": 0.0003, "step": 12358 }, { "epoch": 3.260387811634349, "grad_norm": 0.05035337805747986, "learning_rate": 4.494504528268707e-06, "loss": 0.0031, "step": 12360 }, { "epoch": 3.2609154465110146, "grad_norm": 0.017723072320222855, "learning_rate": 4.494152818077904e-06, "loss": 0.0003, "step": 12362 }, { "epoch": 3.2614430813876796, "grad_norm": 0.14186035096645355, "learning_rate": 4.493801107887101e-06, "loss": 0.0109, "step": 12364 }, { "epoch": 3.261970716264345, "grad_norm": 0.059380386024713516, "learning_rate": 4.493449397696298e-06, "loss": 0.0023, "step": 12366 }, { "epoch": 3.2624983511410104, "grad_norm": 0.009302753023803234, "learning_rate": 4.493097687505496e-06, "loss": 0.0034, "step": 12368 }, { "epoch": 3.263025986017676, "grad_norm": 0.0065604764968156815, "learning_rate": 4.492745977314693e-06, "loss": 0.0003, "step": 12370 }, { "epoch": 3.263553620894341, "grad_norm": 1.079787254333496, "learning_rate": 4.49239426712389e-06, "loss": 0.0102, "step": 12372 }, { "epoch": 3.2640812557710066, "grad_norm": 0.08367766439914703, "learning_rate": 4.492042556933087e-06, "loss": 0.0033, "step": 12374 }, { "epoch": 3.264608890647672, "grad_norm": 0.5659371614456177, "learning_rate": 4.491690846742284e-06, "loss": 0.004, "step": 12376 }, { "epoch": 3.265136525524337, "grad_norm": 0.009494059719145298, "learning_rate": 4.491339136551482e-06, "loss": 0.0004, "step": 12378 }, { "epoch": 3.2656641604010024, "grad_norm": 0.3979741632938385, "learning_rate": 4.490987426360679e-06, "loss": 0.0014, "step": 12380 }, { "epoch": 3.266191795277668, "grad_norm": 0.7095465660095215, "learning_rate": 4.490635716169876e-06, "loss": 0.0065, "step": 12382 }, { "epoch": 3.266719430154333, "grad_norm": 0.42600658535957336, "learning_rate": 4.490284005979074e-06, "loss": 0.0024, "step": 12384 }, { "epoch": 3.2672470650309986, "grad_norm": 0.021810827776789665, "learning_rate": 4.48993229578827e-06, "loss": 0.0004, "step": 12386 }, { "epoch": 3.267774699907664, "grad_norm": 0.052419163286685944, "learning_rate": 4.489580585597468e-06, "loss": 0.0017, "step": 12388 }, { "epoch": 3.2683023347843294, "grad_norm": 0.8730238080024719, "learning_rate": 4.489228875406665e-06, "loss": 0.0066, "step": 12390 }, { "epoch": 3.2688299696609944, "grad_norm": 0.012643745169043541, "learning_rate": 4.4888771652158625e-06, "loss": 0.0014, "step": 12392 }, { "epoch": 3.26935760453766, "grad_norm": 0.010691413655877113, "learning_rate": 4.4885254550250595e-06, "loss": 0.0003, "step": 12394 }, { "epoch": 3.269885239414325, "grad_norm": 0.06568057090044022, "learning_rate": 4.4881737448342564e-06, "loss": 0.0005, "step": 12396 }, { "epoch": 3.2704128742909906, "grad_norm": 0.10847111791372299, "learning_rate": 4.487822034643454e-06, "loss": 0.0144, "step": 12398 }, { "epoch": 3.270940509167656, "grad_norm": 0.018176157027482986, "learning_rate": 4.487470324452651e-06, "loss": 0.0011, "step": 12400 }, { "epoch": 3.2714681440443214, "grad_norm": 0.05958118662238121, "learning_rate": 4.487118614261848e-06, "loss": 0.0027, "step": 12402 }, { "epoch": 3.271995778920987, "grad_norm": 0.011335051618516445, "learning_rate": 4.486766904071046e-06, "loss": 0.0024, "step": 12404 }, { "epoch": 3.272523413797652, "grad_norm": 0.25309664011001587, "learning_rate": 4.486415193880243e-06, "loss": 0.0024, "step": 12406 }, { "epoch": 3.2730510486743176, "grad_norm": 0.028080113232135773, "learning_rate": 4.486063483689441e-06, "loss": 0.0004, "step": 12408 }, { "epoch": 3.2735786835509826, "grad_norm": 0.009362752549350262, "learning_rate": 4.485711773498637e-06, "loss": 0.001, "step": 12410 }, { "epoch": 3.274106318427648, "grad_norm": 0.29785245656967163, "learning_rate": 4.485360063307835e-06, "loss": 0.0017, "step": 12412 }, { "epoch": 3.2746339533043134, "grad_norm": 0.020673124119639397, "learning_rate": 4.4850083531170315e-06, "loss": 0.0003, "step": 12414 }, { "epoch": 3.275161588180979, "grad_norm": 0.010729745961725712, "learning_rate": 4.484656642926229e-06, "loss": 0.0038, "step": 12416 }, { "epoch": 3.275689223057644, "grad_norm": 0.010316058062016964, "learning_rate": 4.484304932735426e-06, "loss": 0.0003, "step": 12418 }, { "epoch": 3.2762168579343096, "grad_norm": 0.010546907782554626, "learning_rate": 4.483953222544623e-06, "loss": 0.0008, "step": 12420 }, { "epoch": 3.276744492810975, "grad_norm": 0.011472017504274845, "learning_rate": 4.483601512353821e-06, "loss": 0.0004, "step": 12422 }, { "epoch": 3.27727212768764, "grad_norm": 0.4338257908821106, "learning_rate": 4.483249802163017e-06, "loss": 0.0019, "step": 12424 }, { "epoch": 3.2777997625643054, "grad_norm": 0.012770811095833778, "learning_rate": 4.482898091972215e-06, "loss": 0.0003, "step": 12426 }, { "epoch": 3.278327397440971, "grad_norm": 0.31488311290740967, "learning_rate": 4.482546381781412e-06, "loss": 0.0042, "step": 12428 }, { "epoch": 3.278855032317636, "grad_norm": 0.3497833013534546, "learning_rate": 4.48219467159061e-06, "loss": 0.0057, "step": 12430 }, { "epoch": 3.2793826671943016, "grad_norm": 0.34893789887428284, "learning_rate": 4.481842961399807e-06, "loss": 0.0091, "step": 12432 }, { "epoch": 3.279910302070967, "grad_norm": 0.009459028951823711, "learning_rate": 4.481491251209004e-06, "loss": 0.0011, "step": 12434 }, { "epoch": 3.2804379369476324, "grad_norm": 0.019066596403717995, "learning_rate": 4.481139541018201e-06, "loss": 0.001, "step": 12436 }, { "epoch": 3.2809655718242974, "grad_norm": 0.15863363444805145, "learning_rate": 4.480787830827398e-06, "loss": 0.0004, "step": 12438 }, { "epoch": 3.281493206700963, "grad_norm": 0.042766787111759186, "learning_rate": 4.480436120636595e-06, "loss": 0.0009, "step": 12440 }, { "epoch": 3.282020841577628, "grad_norm": 0.007789549883455038, "learning_rate": 4.480084410445793e-06, "loss": 0.0003, "step": 12442 }, { "epoch": 3.2825484764542936, "grad_norm": 0.6582185626029968, "learning_rate": 4.47973270025499e-06, "loss": 0.0033, "step": 12444 }, { "epoch": 3.283076111330959, "grad_norm": 0.0752236470580101, "learning_rate": 4.479380990064188e-06, "loss": 0.0079, "step": 12446 }, { "epoch": 3.2836037462076244, "grad_norm": 0.10665686428546906, "learning_rate": 4.479029279873384e-06, "loss": 0.0005, "step": 12448 }, { "epoch": 3.28413138108429, "grad_norm": 0.022326264530420303, "learning_rate": 4.478677569682582e-06, "loss": 0.0003, "step": 12450 }, { "epoch": 3.2846590159609548, "grad_norm": 0.05219002440571785, "learning_rate": 4.478325859491779e-06, "loss": 0.0006, "step": 12452 }, { "epoch": 3.28518665083762, "grad_norm": 0.009182664565742016, "learning_rate": 4.477974149300976e-06, "loss": 0.0004, "step": 12454 }, { "epoch": 3.2857142857142856, "grad_norm": 0.20679904520511627, "learning_rate": 4.4776224391101735e-06, "loss": 0.0071, "step": 12456 }, { "epoch": 3.286241920590951, "grad_norm": 0.01108650304377079, "learning_rate": 4.47727072891937e-06, "loss": 0.0002, "step": 12458 }, { "epoch": 3.2867695554676164, "grad_norm": 0.5512169003486633, "learning_rate": 4.476919018728568e-06, "loss": 0.0115, "step": 12460 }, { "epoch": 3.287297190344282, "grad_norm": 0.034002505242824554, "learning_rate": 4.476567308537765e-06, "loss": 0.0004, "step": 12462 }, { "epoch": 3.287824825220947, "grad_norm": 0.6967348456382751, "learning_rate": 4.476215598346962e-06, "loss": 0.0064, "step": 12464 }, { "epoch": 3.2883524600976126, "grad_norm": 0.018491564318537712, "learning_rate": 4.47586388815616e-06, "loss": 0.0004, "step": 12466 }, { "epoch": 3.288880094974278, "grad_norm": 0.4795421361923218, "learning_rate": 4.475512177965357e-06, "loss": 0.0032, "step": 12468 }, { "epoch": 3.289407729850943, "grad_norm": 0.06862998008728027, "learning_rate": 4.475160467774554e-06, "loss": 0.0011, "step": 12470 }, { "epoch": 3.2899353647276084, "grad_norm": 0.06928906589746475, "learning_rate": 4.474808757583751e-06, "loss": 0.0008, "step": 12472 }, { "epoch": 3.290462999604274, "grad_norm": 0.049461252987384796, "learning_rate": 4.474457047392949e-06, "loss": 0.0064, "step": 12474 }, { "epoch": 3.290990634480939, "grad_norm": 0.8731264472007751, "learning_rate": 4.4741053372021455e-06, "loss": 0.002, "step": 12476 }, { "epoch": 3.2915182693576046, "grad_norm": 0.37334340810775757, "learning_rate": 4.4737536270113425e-06, "loss": 0.0103, "step": 12478 }, { "epoch": 3.29204590423427, "grad_norm": 0.016909901052713394, "learning_rate": 4.47340191682054e-06, "loss": 0.0004, "step": 12480 }, { "epoch": 3.2925735391109354, "grad_norm": 0.019552545621991158, "learning_rate": 4.473050206629737e-06, "loss": 0.0008, "step": 12482 }, { "epoch": 3.2931011739876004, "grad_norm": 0.10364069789648056, "learning_rate": 4.472698496438934e-06, "loss": 0.0008, "step": 12484 }, { "epoch": 3.293628808864266, "grad_norm": 0.040658000856637955, "learning_rate": 4.472346786248131e-06, "loss": 0.0005, "step": 12486 }, { "epoch": 3.294156443740931, "grad_norm": 0.024479219689965248, "learning_rate": 4.471995076057329e-06, "loss": 0.0004, "step": 12488 }, { "epoch": 3.2946840786175966, "grad_norm": 0.02452715113759041, "learning_rate": 4.471643365866527e-06, "loss": 0.0005, "step": 12490 }, { "epoch": 3.295211713494262, "grad_norm": 0.3867204487323761, "learning_rate": 4.471291655675723e-06, "loss": 0.0008, "step": 12492 }, { "epoch": 3.2957393483709274, "grad_norm": 0.010958710685372353, "learning_rate": 4.470939945484921e-06, "loss": 0.0005, "step": 12494 }, { "epoch": 3.296266983247593, "grad_norm": 0.01113909762352705, "learning_rate": 4.470588235294118e-06, "loss": 0.0004, "step": 12496 }, { "epoch": 3.296794618124258, "grad_norm": 0.1845838874578476, "learning_rate": 4.470236525103315e-06, "loss": 0.0006, "step": 12498 }, { "epoch": 3.297322253000923, "grad_norm": 0.05881233885884285, "learning_rate": 4.469884814912512e-06, "loss": 0.0005, "step": 12500 }, { "epoch": 3.2978498878775886, "grad_norm": 0.008784722536802292, "learning_rate": 4.469533104721709e-06, "loss": 0.0003, "step": 12502 }, { "epoch": 3.298377522754254, "grad_norm": 0.016592832282185555, "learning_rate": 4.469181394530907e-06, "loss": 0.0004, "step": 12504 }, { "epoch": 3.2989051576309194, "grad_norm": 0.008820001035928726, "learning_rate": 4.468829684340104e-06, "loss": 0.0004, "step": 12506 }, { "epoch": 3.299432792507585, "grad_norm": 0.011737555265426636, "learning_rate": 4.468477974149301e-06, "loss": 0.0002, "step": 12508 }, { "epoch": 3.29996042738425, "grad_norm": 0.2416549175977707, "learning_rate": 4.468126263958498e-06, "loss": 0.0024, "step": 12510 }, { "epoch": 3.3004880622609156, "grad_norm": 0.00855992827564478, "learning_rate": 4.467774553767696e-06, "loss": 0.0003, "step": 12512 }, { "epoch": 3.301015697137581, "grad_norm": 0.004324509762227535, "learning_rate": 4.467422843576893e-06, "loss": 0.0046, "step": 12514 }, { "epoch": 3.301543332014246, "grad_norm": 0.38834550976753235, "learning_rate": 4.46707113338609e-06, "loss": 0.0027, "step": 12516 }, { "epoch": 3.3020709668909114, "grad_norm": 0.005474432371556759, "learning_rate": 4.4667194231952875e-06, "loss": 0.002, "step": 12518 }, { "epoch": 3.302598601767577, "grad_norm": 0.00582768814638257, "learning_rate": 4.466367713004484e-06, "loss": 0.0124, "step": 12520 }, { "epoch": 3.303126236644242, "grad_norm": 0.012688642367720604, "learning_rate": 4.466016002813681e-06, "loss": 0.0002, "step": 12522 }, { "epoch": 3.3036538715209076, "grad_norm": 0.00789401400834322, "learning_rate": 4.465664292622879e-06, "loss": 0.0003, "step": 12524 }, { "epoch": 3.304181506397573, "grad_norm": 0.02698424831032753, "learning_rate": 4.465312582432076e-06, "loss": 0.0003, "step": 12526 }, { "epoch": 3.3047091412742384, "grad_norm": 0.005262667313218117, "learning_rate": 4.464960872241274e-06, "loss": 0.0009, "step": 12528 }, { "epoch": 3.3052367761509034, "grad_norm": 0.4137345850467682, "learning_rate": 4.46460916205047e-06, "loss": 0.0019, "step": 12530 }, { "epoch": 3.305764411027569, "grad_norm": 0.009661057032644749, "learning_rate": 4.464257451859668e-06, "loss": 0.0004, "step": 12532 }, { "epoch": 3.306292045904234, "grad_norm": 0.32440242171287537, "learning_rate": 4.463905741668865e-06, "loss": 0.0009, "step": 12534 }, { "epoch": 3.3068196807808996, "grad_norm": 0.8136037588119507, "learning_rate": 4.4635540314780626e-06, "loss": 0.0053, "step": 12536 }, { "epoch": 3.307347315657565, "grad_norm": 0.024691250175237656, "learning_rate": 4.4632023212872595e-06, "loss": 0.0003, "step": 12538 }, { "epoch": 3.3078749505342304, "grad_norm": 0.005706010386347771, "learning_rate": 4.4628506110964565e-06, "loss": 0.0002, "step": 12540 }, { "epoch": 3.308402585410896, "grad_norm": 0.00580272963270545, "learning_rate": 4.462498900905654e-06, "loss": 0.0002, "step": 12542 }, { "epoch": 3.308930220287561, "grad_norm": 0.16471794247627258, "learning_rate": 4.46214719071485e-06, "loss": 0.0008, "step": 12544 }, { "epoch": 3.309457855164226, "grad_norm": 0.011143232695758343, "learning_rate": 4.461795480524048e-06, "loss": 0.0003, "step": 12546 }, { "epoch": 3.3099854900408916, "grad_norm": 0.014647996053099632, "learning_rate": 4.461443770333246e-06, "loss": 0.0027, "step": 12548 }, { "epoch": 3.310513124917557, "grad_norm": 0.0098374392837286, "learning_rate": 4.461092060142443e-06, "loss": 0.0077, "step": 12550 }, { "epoch": 3.3110407597942224, "grad_norm": 0.26370641589164734, "learning_rate": 4.46074034995164e-06, "loss": 0.0047, "step": 12552 }, { "epoch": 3.311568394670888, "grad_norm": 0.7446253299713135, "learning_rate": 4.460388639760837e-06, "loss": 0.0127, "step": 12554 }, { "epoch": 3.312096029547553, "grad_norm": 0.5807605385780334, "learning_rate": 4.460036929570035e-06, "loss": 0.0017, "step": 12556 }, { "epoch": 3.3126236644242186, "grad_norm": 0.22744420170783997, "learning_rate": 4.459685219379232e-06, "loss": 0.0042, "step": 12558 }, { "epoch": 3.313151299300884, "grad_norm": 0.032139167189598083, "learning_rate": 4.4593335091884285e-06, "loss": 0.0005, "step": 12560 }, { "epoch": 3.313678934177549, "grad_norm": 0.1738707721233368, "learning_rate": 4.458981798997626e-06, "loss": 0.0006, "step": 12562 }, { "epoch": 3.3142065690542144, "grad_norm": 0.015379741787910461, "learning_rate": 4.458630088806823e-06, "loss": 0.0006, "step": 12564 }, { "epoch": 3.31473420393088, "grad_norm": 0.017114337533712387, "learning_rate": 4.458278378616021e-06, "loss": 0.0005, "step": 12566 }, { "epoch": 3.315261838807545, "grad_norm": 0.302751749753952, "learning_rate": 4.457926668425217e-06, "loss": 0.0044, "step": 12568 }, { "epoch": 3.3157894736842106, "grad_norm": 0.3695281147956848, "learning_rate": 4.457574958234415e-06, "loss": 0.0065, "step": 12570 }, { "epoch": 3.316317108560876, "grad_norm": 0.6487417221069336, "learning_rate": 4.457223248043612e-06, "loss": 0.0083, "step": 12572 }, { "epoch": 3.3168447434375414, "grad_norm": 0.040270350873470306, "learning_rate": 4.456871537852809e-06, "loss": 0.0006, "step": 12574 }, { "epoch": 3.3173723783142064, "grad_norm": 0.31577885150909424, "learning_rate": 4.456519827662007e-06, "loss": 0.0094, "step": 12576 }, { "epoch": 3.317900013190872, "grad_norm": 0.02321772091090679, "learning_rate": 4.456168117471204e-06, "loss": 0.0012, "step": 12578 }, { "epoch": 3.318427648067537, "grad_norm": 0.06943913549184799, "learning_rate": 4.4558164072804015e-06, "loss": 0.0021, "step": 12580 }, { "epoch": 3.3189552829442026, "grad_norm": 0.040218472480773926, "learning_rate": 4.455464697089598e-06, "loss": 0.0022, "step": 12582 }, { "epoch": 3.319482917820868, "grad_norm": 0.11618883907794952, "learning_rate": 4.455112986898795e-06, "loss": 0.0008, "step": 12584 }, { "epoch": 3.3200105526975334, "grad_norm": 0.03551200032234192, "learning_rate": 4.454761276707993e-06, "loss": 0.0013, "step": 12586 }, { "epoch": 3.320538187574199, "grad_norm": 0.45767155289649963, "learning_rate": 4.45440956651719e-06, "loss": 0.0046, "step": 12588 }, { "epoch": 3.321065822450864, "grad_norm": 0.12044183164834976, "learning_rate": 4.454057856326387e-06, "loss": 0.0035, "step": 12590 }, { "epoch": 3.321593457327529, "grad_norm": 0.04149613156914711, "learning_rate": 4.453706146135584e-06, "loss": 0.0072, "step": 12592 }, { "epoch": 3.3221210922041946, "grad_norm": 0.42394712567329407, "learning_rate": 4.453354435944782e-06, "loss": 0.0028, "step": 12594 }, { "epoch": 3.32264872708086, "grad_norm": 0.1432332992553711, "learning_rate": 4.453002725753979e-06, "loss": 0.0007, "step": 12596 }, { "epoch": 3.3231763619575254, "grad_norm": 0.23593740165233612, "learning_rate": 4.452651015563176e-06, "loss": 0.0008, "step": 12598 }, { "epoch": 3.323703996834191, "grad_norm": 0.06910870969295502, "learning_rate": 4.4522993053723735e-06, "loss": 0.0016, "step": 12600 }, { "epoch": 3.3242316317108562, "grad_norm": 0.009731365367770195, "learning_rate": 4.4519475951815705e-06, "loss": 0.0004, "step": 12602 }, { "epoch": 3.324759266587521, "grad_norm": 0.5830352902412415, "learning_rate": 4.451595884990767e-06, "loss": 0.0022, "step": 12604 }, { "epoch": 3.3252869014641866, "grad_norm": 0.010645650327205658, "learning_rate": 4.451244174799965e-06, "loss": 0.0003, "step": 12606 }, { "epoch": 3.325814536340852, "grad_norm": 0.3208838999271393, "learning_rate": 4.450892464609162e-06, "loss": 0.0049, "step": 12608 }, { "epoch": 3.3263421712175174, "grad_norm": 0.8245735764503479, "learning_rate": 4.45054075441836e-06, "loss": 0.0016, "step": 12610 }, { "epoch": 3.326869806094183, "grad_norm": 0.009729156270623207, "learning_rate": 4.450189044227556e-06, "loss": 0.0049, "step": 12612 }, { "epoch": 3.327397440970848, "grad_norm": 0.00743135018274188, "learning_rate": 4.449837334036754e-06, "loss": 0.0034, "step": 12614 }, { "epoch": 3.3279250758475136, "grad_norm": 0.008616017177700996, "learning_rate": 4.449485623845951e-06, "loss": 0.0003, "step": 12616 }, { "epoch": 3.328452710724179, "grad_norm": 0.015352008864283562, "learning_rate": 4.449133913655149e-06, "loss": 0.0004, "step": 12618 }, { "epoch": 3.3289803456008444, "grad_norm": 0.30605074763298035, "learning_rate": 4.448782203464346e-06, "loss": 0.0008, "step": 12620 }, { "epoch": 3.3295079804775094, "grad_norm": 0.32605063915252686, "learning_rate": 4.4484304932735425e-06, "loss": 0.0063, "step": 12622 }, { "epoch": 3.330035615354175, "grad_norm": 0.2799457013607025, "learning_rate": 4.44807878308274e-06, "loss": 0.005, "step": 12624 }, { "epoch": 3.33056325023084, "grad_norm": 0.046324990689754486, "learning_rate": 4.447727072891937e-06, "loss": 0.0004, "step": 12626 }, { "epoch": 3.3310908851075056, "grad_norm": 0.025541236624121666, "learning_rate": 4.447375362701134e-06, "loss": 0.0008, "step": 12628 }, { "epoch": 3.331618519984171, "grad_norm": 0.08576630055904388, "learning_rate": 4.447023652510331e-06, "loss": 0.0005, "step": 12630 }, { "epoch": 3.3321461548608364, "grad_norm": 0.06222493201494217, "learning_rate": 4.446671942319529e-06, "loss": 0.0004, "step": 12632 }, { "epoch": 3.332673789737502, "grad_norm": 0.156654492020607, "learning_rate": 4.446320232128726e-06, "loss": 0.003, "step": 12634 }, { "epoch": 3.333201424614167, "grad_norm": 0.24611707031726837, "learning_rate": 4.445968521937923e-06, "loss": 0.0027, "step": 12636 }, { "epoch": 3.333729059490832, "grad_norm": 0.007776469923555851, "learning_rate": 4.445616811747121e-06, "loss": 0.0024, "step": 12638 }, { "epoch": 3.3342566943674976, "grad_norm": 0.012301256880164146, "learning_rate": 4.445265101556318e-06, "loss": 0.0003, "step": 12640 }, { "epoch": 3.334784329244163, "grad_norm": 0.010428059846162796, "learning_rate": 4.444913391365515e-06, "loss": 0.0003, "step": 12642 }, { "epoch": 3.3353119641208284, "grad_norm": 0.23530447483062744, "learning_rate": 4.444561681174712e-06, "loss": 0.0112, "step": 12644 }, { "epoch": 3.335839598997494, "grad_norm": 0.03452146053314209, "learning_rate": 4.444209970983909e-06, "loss": 0.0004, "step": 12646 }, { "epoch": 3.3363672338741592, "grad_norm": 0.10298130661249161, "learning_rate": 4.443858260793107e-06, "loss": 0.0004, "step": 12648 }, { "epoch": 3.336894868750824, "grad_norm": 0.01584090292453766, "learning_rate": 4.443506550602303e-06, "loss": 0.0003, "step": 12650 }, { "epoch": 3.3374225036274896, "grad_norm": 0.0037944877985864878, "learning_rate": 4.443154840411501e-06, "loss": 0.0002, "step": 12652 }, { "epoch": 3.337950138504155, "grad_norm": 0.0049001178704202175, "learning_rate": 4.442803130220698e-06, "loss": 0.0004, "step": 12654 }, { "epoch": 3.3384777733808204, "grad_norm": 0.5872870683670044, "learning_rate": 4.442451420029896e-06, "loss": 0.0158, "step": 12656 }, { "epoch": 3.339005408257486, "grad_norm": 0.051068708300590515, "learning_rate": 4.442099709839093e-06, "loss": 0.0005, "step": 12658 }, { "epoch": 3.3395330431341512, "grad_norm": 0.1895696371793747, "learning_rate": 4.44174799964829e-06, "loss": 0.0006, "step": 12660 }, { "epoch": 3.3400606780108166, "grad_norm": 0.014349049888551235, "learning_rate": 4.4413962894574875e-06, "loss": 0.0003, "step": 12662 }, { "epoch": 3.340588312887482, "grad_norm": 0.006434123497456312, "learning_rate": 4.4410445792666845e-06, "loss": 0.0004, "step": 12664 }, { "epoch": 3.3411159477641474, "grad_norm": 0.5044889450073242, "learning_rate": 4.440692869075881e-06, "loss": 0.0058, "step": 12666 }, { "epoch": 3.3416435826408124, "grad_norm": 0.142719104886055, "learning_rate": 4.440341158885079e-06, "loss": 0.0007, "step": 12668 }, { "epoch": 3.342171217517478, "grad_norm": 0.014992307871580124, "learning_rate": 4.439989448694276e-06, "loss": 0.0004, "step": 12670 }, { "epoch": 3.342698852394143, "grad_norm": 0.5776882171630859, "learning_rate": 4.439637738503473e-06, "loss": 0.0012, "step": 12672 }, { "epoch": 3.3432264872708086, "grad_norm": 0.25185427069664, "learning_rate": 4.43928602831267e-06, "loss": 0.0044, "step": 12674 }, { "epoch": 3.343754122147474, "grad_norm": 0.16533680260181427, "learning_rate": 4.438934318121868e-06, "loss": 0.0003, "step": 12676 }, { "epoch": 3.3442817570241394, "grad_norm": 0.005076586734503508, "learning_rate": 4.438582607931065e-06, "loss": 0.0011, "step": 12678 }, { "epoch": 3.344809391900805, "grad_norm": 0.052445415407419205, "learning_rate": 4.438230897740262e-06, "loss": 0.001, "step": 12680 }, { "epoch": 3.34533702677747, "grad_norm": 0.4169932007789612, "learning_rate": 4.4378791875494596e-06, "loss": 0.0185, "step": 12682 }, { "epoch": 3.345864661654135, "grad_norm": 0.032247841358184814, "learning_rate": 4.4375274773586565e-06, "loss": 0.0003, "step": 12684 }, { "epoch": 3.3463922965308006, "grad_norm": 0.015417615883052349, "learning_rate": 4.437175767167854e-06, "loss": 0.0074, "step": 12686 }, { "epoch": 3.346919931407466, "grad_norm": 0.05716388672590256, "learning_rate": 4.4368240569770504e-06, "loss": 0.0007, "step": 12688 }, { "epoch": 3.3474475662841314, "grad_norm": 0.22580693662166595, "learning_rate": 4.436472346786248e-06, "loss": 0.0004, "step": 12690 }, { "epoch": 3.347975201160797, "grad_norm": 0.009287173859775066, "learning_rate": 4.436120636595446e-06, "loss": 0.0003, "step": 12692 }, { "epoch": 3.3485028360374622, "grad_norm": 0.024441368877887726, "learning_rate": 4.435768926404642e-06, "loss": 0.0004, "step": 12694 }, { "epoch": 3.349030470914127, "grad_norm": 0.0055010379292070866, "learning_rate": 4.43541721621384e-06, "loss": 0.0002, "step": 12696 }, { "epoch": 3.3495581057907926, "grad_norm": 0.012133719399571419, "learning_rate": 4.435065506023037e-06, "loss": 0.0004, "step": 12698 }, { "epoch": 3.350085740667458, "grad_norm": 0.11027136445045471, "learning_rate": 4.434713795832235e-06, "loss": 0.0125, "step": 12700 }, { "epoch": 3.3506133755441234, "grad_norm": 0.01449747197329998, "learning_rate": 4.434362085641432e-06, "loss": 0.0003, "step": 12702 }, { "epoch": 3.351141010420789, "grad_norm": 0.36953815817832947, "learning_rate": 4.434010375450629e-06, "loss": 0.007, "step": 12704 }, { "epoch": 3.3516686452974542, "grad_norm": 0.04203972592949867, "learning_rate": 4.433658665259826e-06, "loss": 0.0034, "step": 12706 }, { "epoch": 3.3521962801741196, "grad_norm": 0.02943863347172737, "learning_rate": 4.433306955069023e-06, "loss": 0.0003, "step": 12708 }, { "epoch": 3.352723915050785, "grad_norm": 0.005106951575726271, "learning_rate": 4.43295524487822e-06, "loss": 0.0002, "step": 12710 }, { "epoch": 3.3532515499274504, "grad_norm": 0.021352890878915787, "learning_rate": 4.432603534687417e-06, "loss": 0.0004, "step": 12712 }, { "epoch": 3.3537791848041154, "grad_norm": 0.011154089123010635, "learning_rate": 4.432251824496615e-06, "loss": 0.0002, "step": 12714 }, { "epoch": 3.354306819680781, "grad_norm": 0.01511329598724842, "learning_rate": 4.431900114305813e-06, "loss": 0.0003, "step": 12716 }, { "epoch": 3.354834454557446, "grad_norm": 0.31030353903770447, "learning_rate": 4.431548404115009e-06, "loss": 0.0028, "step": 12718 }, { "epoch": 3.3553620894341116, "grad_norm": 0.0041895280592143536, "learning_rate": 4.431196693924207e-06, "loss": 0.0002, "step": 12720 }, { "epoch": 3.355889724310777, "grad_norm": 0.3420506417751312, "learning_rate": 4.430844983733404e-06, "loss": 0.005, "step": 12722 }, { "epoch": 3.3564173591874424, "grad_norm": 0.02613253705203533, "learning_rate": 4.430493273542601e-06, "loss": 0.0003, "step": 12724 }, { "epoch": 3.356944994064108, "grad_norm": 0.01295258104801178, "learning_rate": 4.4301415633517985e-06, "loss": 0.0002, "step": 12726 }, { "epoch": 3.357472628940773, "grad_norm": 0.005327967461198568, "learning_rate": 4.429789853160995e-06, "loss": 0.0002, "step": 12728 }, { "epoch": 3.358000263817438, "grad_norm": 0.0033278362825512886, "learning_rate": 4.429438142970193e-06, "loss": 0.0003, "step": 12730 }, { "epoch": 3.3585278986941036, "grad_norm": 0.005167438182979822, "learning_rate": 4.429086432779389e-06, "loss": 0.0003, "step": 12732 }, { "epoch": 3.359055533570769, "grad_norm": 0.00262447283603251, "learning_rate": 4.428734722588587e-06, "loss": 0.0006, "step": 12734 }, { "epoch": 3.3595831684474344, "grad_norm": 0.010436486452817917, "learning_rate": 4.428383012397784e-06, "loss": 0.0011, "step": 12736 }, { "epoch": 3.3601108033241, "grad_norm": 0.44075310230255127, "learning_rate": 4.428031302206982e-06, "loss": 0.0016, "step": 12738 }, { "epoch": 3.3606384382007652, "grad_norm": 0.1478743702173233, "learning_rate": 4.427679592016179e-06, "loss": 0.001, "step": 12740 }, { "epoch": 3.36116607307743, "grad_norm": 0.5401932001113892, "learning_rate": 4.427327881825376e-06, "loss": 0.0081, "step": 12742 }, { "epoch": 3.3616937079540956, "grad_norm": 0.03274184465408325, "learning_rate": 4.4269761716345736e-06, "loss": 0.0064, "step": 12744 }, { "epoch": 3.362221342830761, "grad_norm": 0.09984484314918518, "learning_rate": 4.4266244614437705e-06, "loss": 0.0029, "step": 12746 }, { "epoch": 3.3627489777074264, "grad_norm": 0.25904545187950134, "learning_rate": 4.4262727512529675e-06, "loss": 0.0115, "step": 12748 }, { "epoch": 3.363276612584092, "grad_norm": 0.017681585624814034, "learning_rate": 4.425921041062165e-06, "loss": 0.0016, "step": 12750 }, { "epoch": 3.3638042474607572, "grad_norm": 0.29467642307281494, "learning_rate": 4.425569330871362e-06, "loss": 0.0115, "step": 12752 }, { "epoch": 3.3643318823374226, "grad_norm": 0.11500735580921173, "learning_rate": 4.425217620680559e-06, "loss": 0.0008, "step": 12754 }, { "epoch": 3.3648595172140876, "grad_norm": 0.0913921445608139, "learning_rate": 4.424865910489756e-06, "loss": 0.0008, "step": 12756 }, { "epoch": 3.365387152090753, "grad_norm": 0.5997600555419922, "learning_rate": 4.424514200298954e-06, "loss": 0.0124, "step": 12758 }, { "epoch": 3.3659147869674184, "grad_norm": 0.011212792247533798, "learning_rate": 4.424162490108151e-06, "loss": 0.0003, "step": 12760 }, { "epoch": 3.366442421844084, "grad_norm": 0.3650917410850525, "learning_rate": 4.423810779917348e-06, "loss": 0.0028, "step": 12762 }, { "epoch": 3.3669700567207492, "grad_norm": 0.027089567855000496, "learning_rate": 4.423459069726546e-06, "loss": 0.0045, "step": 12764 }, { "epoch": 3.3674976915974146, "grad_norm": 0.02590051479637623, "learning_rate": 4.423107359535743e-06, "loss": 0.0009, "step": 12766 }, { "epoch": 3.36802532647408, "grad_norm": 0.021724363788962364, "learning_rate": 4.42275564934494e-06, "loss": 0.0006, "step": 12768 }, { "epoch": 3.3685529613507454, "grad_norm": 0.055885519832372665, "learning_rate": 4.4224039391541365e-06, "loss": 0.0004, "step": 12770 }, { "epoch": 3.369080596227411, "grad_norm": 0.027788491919636726, "learning_rate": 4.422052228963334e-06, "loss": 0.0011, "step": 12772 }, { "epoch": 3.369608231104076, "grad_norm": 0.016080332919955254, "learning_rate": 4.421700518772532e-06, "loss": 0.0004, "step": 12774 }, { "epoch": 3.370135865980741, "grad_norm": 1.5455926656723022, "learning_rate": 4.421348808581729e-06, "loss": 0.0044, "step": 12776 }, { "epoch": 3.3706635008574066, "grad_norm": 0.1178472638130188, "learning_rate": 4.420997098390926e-06, "loss": 0.0027, "step": 12778 }, { "epoch": 3.371191135734072, "grad_norm": 0.05522898584604263, "learning_rate": 4.420645388200123e-06, "loss": 0.0007, "step": 12780 }, { "epoch": 3.3717187706107374, "grad_norm": 0.022048477083444595, "learning_rate": 4.420293678009321e-06, "loss": 0.0006, "step": 12782 }, { "epoch": 3.372246405487403, "grad_norm": 0.7619496583938599, "learning_rate": 4.419941967818518e-06, "loss": 0.004, "step": 12784 }, { "epoch": 3.3727740403640682, "grad_norm": 0.06936250627040863, "learning_rate": 4.419590257627715e-06, "loss": 0.0008, "step": 12786 }, { "epoch": 3.373301675240733, "grad_norm": 0.01203820202499628, "learning_rate": 4.4192385474369124e-06, "loss": 0.0003, "step": 12788 }, { "epoch": 3.3738293101173986, "grad_norm": 0.5078845620155334, "learning_rate": 4.418886837246109e-06, "loss": 0.0019, "step": 12790 }, { "epoch": 3.374356944994064, "grad_norm": 0.10542596131563187, "learning_rate": 4.418535127055306e-06, "loss": 0.0006, "step": 12792 }, { "epoch": 3.3748845798707294, "grad_norm": 0.8793995976448059, "learning_rate": 4.418183416864503e-06, "loss": 0.0027, "step": 12794 }, { "epoch": 3.375412214747395, "grad_norm": 0.5018479228019714, "learning_rate": 4.417831706673701e-06, "loss": 0.0031, "step": 12796 }, { "epoch": 3.3759398496240602, "grad_norm": 0.5371220111846924, "learning_rate": 4.417479996482898e-06, "loss": 0.0047, "step": 12798 }, { "epoch": 3.3764674845007256, "grad_norm": 0.2563577890396118, "learning_rate": 4.417128286292095e-06, "loss": 0.0019, "step": 12800 }, { "epoch": 3.3769951193773906, "grad_norm": 0.9546219706535339, "learning_rate": 4.416776576101293e-06, "loss": 0.0031, "step": 12802 }, { "epoch": 3.377522754254056, "grad_norm": 0.17379380762577057, "learning_rate": 4.41642486591049e-06, "loss": 0.0015, "step": 12804 }, { "epoch": 3.3780503891307214, "grad_norm": 0.008962096646428108, "learning_rate": 4.4160731557196876e-06, "loss": 0.0003, "step": 12806 }, { "epoch": 3.378578024007387, "grad_norm": 0.0050842249765992165, "learning_rate": 4.4157214455288845e-06, "loss": 0.0002, "step": 12808 }, { "epoch": 3.3791056588840522, "grad_norm": 0.010001907125115395, "learning_rate": 4.4153697353380815e-06, "loss": 0.0003, "step": 12810 }, { "epoch": 3.3796332937607176, "grad_norm": 0.35391339659690857, "learning_rate": 4.415018025147279e-06, "loss": 0.0043, "step": 12812 }, { "epoch": 3.380160928637383, "grad_norm": 0.25186169147491455, "learning_rate": 4.414666314956475e-06, "loss": 0.0029, "step": 12814 }, { "epoch": 3.3806885635140485, "grad_norm": 0.072389155626297, "learning_rate": 4.414314604765673e-06, "loss": 0.0005, "step": 12816 }, { "epoch": 3.381216198390714, "grad_norm": 0.30335360765457153, "learning_rate": 4.41396289457487e-06, "loss": 0.0077, "step": 12818 }, { "epoch": 3.381743833267379, "grad_norm": 0.15081357955932617, "learning_rate": 4.413611184384068e-06, "loss": 0.0037, "step": 12820 }, { "epoch": 3.3822714681440442, "grad_norm": 0.040620408952236176, "learning_rate": 4.413259474193265e-06, "loss": 0.0029, "step": 12822 }, { "epoch": 3.3827991030207096, "grad_norm": 0.21372221410274506, "learning_rate": 4.412907764002462e-06, "loss": 0.0042, "step": 12824 }, { "epoch": 3.383326737897375, "grad_norm": 0.014082163572311401, "learning_rate": 4.41255605381166e-06, "loss": 0.0003, "step": 12826 }, { "epoch": 3.3838543727740404, "grad_norm": 0.05585583671927452, "learning_rate": 4.4122043436208566e-06, "loss": 0.0075, "step": 12828 }, { "epoch": 3.384382007650706, "grad_norm": 1.0482324361801147, "learning_rate": 4.4118526334300535e-06, "loss": 0.004, "step": 12830 }, { "epoch": 3.3849096425273713, "grad_norm": 0.03316523879766464, "learning_rate": 4.411500923239251e-06, "loss": 0.0003, "step": 12832 }, { "epoch": 3.385437277404036, "grad_norm": 0.36667945981025696, "learning_rate": 4.411149213048448e-06, "loss": 0.0013, "step": 12834 }, { "epoch": 3.3859649122807016, "grad_norm": 0.05836760997772217, "learning_rate": 4.410797502857646e-06, "loss": 0.0004, "step": 12836 }, { "epoch": 3.386492547157367, "grad_norm": 0.01971396803855896, "learning_rate": 4.410445792666842e-06, "loss": 0.0003, "step": 12838 }, { "epoch": 3.3870201820340324, "grad_norm": 0.017277706414461136, "learning_rate": 4.41009408247604e-06, "loss": 0.002, "step": 12840 }, { "epoch": 3.387547816910698, "grad_norm": 0.45881497859954834, "learning_rate": 4.409742372285237e-06, "loss": 0.0014, "step": 12842 }, { "epoch": 3.3880754517873632, "grad_norm": 0.30303019285202026, "learning_rate": 4.409390662094434e-06, "loss": 0.0101, "step": 12844 }, { "epoch": 3.3886030866640287, "grad_norm": 0.535009503364563, "learning_rate": 4.409038951903632e-06, "loss": 0.0023, "step": 12846 }, { "epoch": 3.3891307215406936, "grad_norm": 0.5067193508148193, "learning_rate": 4.408687241712829e-06, "loss": 0.0129, "step": 12848 }, { "epoch": 3.389658356417359, "grad_norm": 0.002295175101608038, "learning_rate": 4.4083355315220264e-06, "loss": 0.0002, "step": 12850 }, { "epoch": 3.3901859912940244, "grad_norm": 0.1885605901479721, "learning_rate": 4.4079838213312225e-06, "loss": 0.006, "step": 12852 }, { "epoch": 3.39071362617069, "grad_norm": 0.03845470771193504, "learning_rate": 4.40763211114042e-06, "loss": 0.003, "step": 12854 }, { "epoch": 3.3912412610473552, "grad_norm": 0.01534021645784378, "learning_rate": 4.407280400949617e-06, "loss": 0.0002, "step": 12856 }, { "epoch": 3.3917688959240206, "grad_norm": 0.10416512936353683, "learning_rate": 4.406928690758815e-06, "loss": 0.0004, "step": 12858 }, { "epoch": 3.392296530800686, "grad_norm": 0.019975945353507996, "learning_rate": 4.406576980568012e-06, "loss": 0.0006, "step": 12860 }, { "epoch": 3.3928241656773515, "grad_norm": 0.011558793485164642, "learning_rate": 4.406225270377209e-06, "loss": 0.0007, "step": 12862 }, { "epoch": 3.393351800554017, "grad_norm": 0.07887562364339828, "learning_rate": 4.405873560186407e-06, "loss": 0.0005, "step": 12864 }, { "epoch": 3.393879435430682, "grad_norm": 0.004744641948491335, "learning_rate": 4.405521849995604e-06, "loss": 0.0003, "step": 12866 }, { "epoch": 3.3944070703073472, "grad_norm": 0.030714616179466248, "learning_rate": 4.405170139804801e-06, "loss": 0.0003, "step": 12868 }, { "epoch": 3.3949347051840126, "grad_norm": 0.005386311560869217, "learning_rate": 4.4048184296139985e-06, "loss": 0.0003, "step": 12870 }, { "epoch": 3.395462340060678, "grad_norm": 0.009077444672584534, "learning_rate": 4.4044667194231955e-06, "loss": 0.0002, "step": 12872 }, { "epoch": 3.3959899749373434, "grad_norm": 0.6942857503890991, "learning_rate": 4.404115009232392e-06, "loss": 0.0021, "step": 12874 }, { "epoch": 3.396517609814009, "grad_norm": 0.06458692252635956, "learning_rate": 4.403763299041589e-06, "loss": 0.0043, "step": 12876 }, { "epoch": 3.3970452446906743, "grad_norm": 0.6242871880531311, "learning_rate": 4.403411588850787e-06, "loss": 0.0066, "step": 12878 }, { "epoch": 3.397572879567339, "grad_norm": 0.19448451697826385, "learning_rate": 4.403059878659984e-06, "loss": 0.0007, "step": 12880 }, { "epoch": 3.3981005144440046, "grad_norm": 0.024255990982055664, "learning_rate": 4.402708168469181e-06, "loss": 0.0003, "step": 12882 }, { "epoch": 3.39862814932067, "grad_norm": 0.0095730721950531, "learning_rate": 4.402356458278379e-06, "loss": 0.0007, "step": 12884 }, { "epoch": 3.3991557841973354, "grad_norm": 0.03831090033054352, "learning_rate": 4.402004748087576e-06, "loss": 0.0004, "step": 12886 }, { "epoch": 3.399683419074001, "grad_norm": 0.03855305537581444, "learning_rate": 4.401653037896774e-06, "loss": 0.0031, "step": 12888 }, { "epoch": 3.4002110539506663, "grad_norm": 0.003801249898970127, "learning_rate": 4.4013013277059706e-06, "loss": 0.0002, "step": 12890 }, { "epoch": 3.4007386888273317, "grad_norm": 0.005605947691947222, "learning_rate": 4.4009496175151675e-06, "loss": 0.0002, "step": 12892 }, { "epoch": 3.4012663237039966, "grad_norm": 0.009915296919643879, "learning_rate": 4.400597907324365e-06, "loss": 0.0055, "step": 12894 }, { "epoch": 3.401793958580662, "grad_norm": 0.13092297315597534, "learning_rate": 4.400246197133562e-06, "loss": 0.0014, "step": 12896 }, { "epoch": 3.4023215934573274, "grad_norm": 0.019046423956751823, "learning_rate": 4.399894486942759e-06, "loss": 0.0005, "step": 12898 }, { "epoch": 3.402849228333993, "grad_norm": 0.16916754841804504, "learning_rate": 4.399542776751956e-06, "loss": 0.0009, "step": 12900 }, { "epoch": 3.4033768632106582, "grad_norm": 0.011431372724473476, "learning_rate": 4.399191066561154e-06, "loss": 0.0003, "step": 12902 }, { "epoch": 3.4039044980873236, "grad_norm": 0.07042050361633301, "learning_rate": 4.398839356370351e-06, "loss": 0.0038, "step": 12904 }, { "epoch": 3.404432132963989, "grad_norm": 0.005814836826175451, "learning_rate": 4.398487646179548e-06, "loss": 0.0003, "step": 12906 }, { "epoch": 3.4049597678406545, "grad_norm": 0.08359570801258087, "learning_rate": 4.398135935988746e-06, "loss": 0.0005, "step": 12908 }, { "epoch": 3.4054874027173194, "grad_norm": 0.012396606616675854, "learning_rate": 4.397784225797943e-06, "loss": 0.0003, "step": 12910 }, { "epoch": 3.406015037593985, "grad_norm": 0.002535760635510087, "learning_rate": 4.39743251560714e-06, "loss": 0.0002, "step": 12912 }, { "epoch": 3.4065426724706502, "grad_norm": 0.03191005066037178, "learning_rate": 4.3970808054163365e-06, "loss": 0.0003, "step": 12914 }, { "epoch": 3.4070703073473156, "grad_norm": 0.4337272644042969, "learning_rate": 4.396729095225534e-06, "loss": 0.001, "step": 12916 }, { "epoch": 3.407597942223981, "grad_norm": 0.2157210111618042, "learning_rate": 4.396377385034732e-06, "loss": 0.0011, "step": 12918 }, { "epoch": 3.4081255771006465, "grad_norm": 0.00199515069834888, "learning_rate": 4.396025674843928e-06, "loss": 0.0002, "step": 12920 }, { "epoch": 3.408653211977312, "grad_norm": 0.003015405498445034, "learning_rate": 4.395673964653126e-06, "loss": 0.0002, "step": 12922 }, { "epoch": 3.4091808468539773, "grad_norm": 0.009117339737713337, "learning_rate": 4.395322254462323e-06, "loss": 0.0009, "step": 12924 }, { "epoch": 3.4097084817306422, "grad_norm": 0.008776727132499218, "learning_rate": 4.394970544271521e-06, "loss": 0.0002, "step": 12926 }, { "epoch": 3.4102361166073076, "grad_norm": 0.1698843389749527, "learning_rate": 4.394618834080718e-06, "loss": 0.0072, "step": 12928 }, { "epoch": 3.410763751483973, "grad_norm": 0.21245770156383514, "learning_rate": 4.394267123889915e-06, "loss": 0.0061, "step": 12930 }, { "epoch": 3.4112913863606384, "grad_norm": 0.18126685917377472, "learning_rate": 4.3939154136991125e-06, "loss": 0.0005, "step": 12932 }, { "epoch": 3.411819021237304, "grad_norm": 0.21330712735652924, "learning_rate": 4.393563703508309e-06, "loss": 0.0095, "step": 12934 }, { "epoch": 3.4123466561139693, "grad_norm": 0.03854645416140556, "learning_rate": 4.393211993317506e-06, "loss": 0.0005, "step": 12936 }, { "epoch": 3.4128742909906347, "grad_norm": 0.6679896116256714, "learning_rate": 4.392860283126703e-06, "loss": 0.0009, "step": 12938 }, { "epoch": 3.4134019258672996, "grad_norm": 0.005072934553027153, "learning_rate": 4.392508572935901e-06, "loss": 0.0006, "step": 12940 }, { "epoch": 3.413929560743965, "grad_norm": 0.06728855520486832, "learning_rate": 4.392156862745098e-06, "loss": 0.0004, "step": 12942 }, { "epoch": 3.4144571956206304, "grad_norm": 0.02861570008099079, "learning_rate": 4.391805152554295e-06, "loss": 0.0003, "step": 12944 }, { "epoch": 3.414984830497296, "grad_norm": 0.2957688570022583, "learning_rate": 4.391453442363493e-06, "loss": 0.0111, "step": 12946 }, { "epoch": 3.4155124653739612, "grad_norm": 0.00830431655049324, "learning_rate": 4.39110173217269e-06, "loss": 0.0004, "step": 12948 }, { "epoch": 3.4160401002506267, "grad_norm": 0.1460394263267517, "learning_rate": 4.390750021981887e-06, "loss": 0.0055, "step": 12950 }, { "epoch": 3.416567735127292, "grad_norm": 0.055422719568014145, "learning_rate": 4.3903983117910846e-06, "loss": 0.0005, "step": 12952 }, { "epoch": 3.417095370003957, "grad_norm": 0.08314132690429688, "learning_rate": 4.3900466016002815e-06, "loss": 0.0018, "step": 12954 }, { "epoch": 3.4176230048806224, "grad_norm": 0.009539632126688957, "learning_rate": 4.389694891409479e-06, "loss": 0.0019, "step": 12956 }, { "epoch": 3.418150639757288, "grad_norm": 0.022512521594762802, "learning_rate": 4.389343181218675e-06, "loss": 0.0003, "step": 12958 }, { "epoch": 3.4186782746339532, "grad_norm": 0.13708770275115967, "learning_rate": 4.388991471027873e-06, "loss": 0.0008, "step": 12960 }, { "epoch": 3.4192059095106186, "grad_norm": 0.20408393442630768, "learning_rate": 4.38863976083707e-06, "loss": 0.0017, "step": 12962 }, { "epoch": 3.419733544387284, "grad_norm": 0.015045518055558205, "learning_rate": 4.388288050646267e-06, "loss": 0.0087, "step": 12964 }, { "epoch": 3.4202611792639495, "grad_norm": 0.0076208459213376045, "learning_rate": 4.387936340455465e-06, "loss": 0.0003, "step": 12966 }, { "epoch": 3.420788814140615, "grad_norm": 0.22453254461288452, "learning_rate": 4.387584630264662e-06, "loss": 0.005, "step": 12968 }, { "epoch": 3.4213164490172803, "grad_norm": 0.007002733182162046, "learning_rate": 4.38723292007386e-06, "loss": 0.0005, "step": 12970 }, { "epoch": 3.4218440838939452, "grad_norm": 0.259238064289093, "learning_rate": 4.386881209883056e-06, "loss": 0.0013, "step": 12972 }, { "epoch": 3.4223717187706106, "grad_norm": 0.012806749902665615, "learning_rate": 4.386529499692254e-06, "loss": 0.0003, "step": 12974 }, { "epoch": 3.422899353647276, "grad_norm": 0.026382673531770706, "learning_rate": 4.386177789501451e-06, "loss": 0.005, "step": 12976 }, { "epoch": 3.4234269885239414, "grad_norm": 0.09838806837797165, "learning_rate": 4.385826079310648e-06, "loss": 0.0007, "step": 12978 }, { "epoch": 3.423954623400607, "grad_norm": 0.3180284798145294, "learning_rate": 4.385474369119845e-06, "loss": 0.0011, "step": 12980 }, { "epoch": 3.4244822582772723, "grad_norm": 0.10841096937656403, "learning_rate": 4.385122658929042e-06, "loss": 0.0014, "step": 12982 }, { "epoch": 3.4250098931539377, "grad_norm": 0.44459518790245056, "learning_rate": 4.38477094873824e-06, "loss": 0.0035, "step": 12984 }, { "epoch": 3.4255375280306026, "grad_norm": 0.05566037818789482, "learning_rate": 4.384419238547437e-06, "loss": 0.0008, "step": 12986 }, { "epoch": 3.426065162907268, "grad_norm": 0.06980595737695694, "learning_rate": 4.384067528356634e-06, "loss": 0.0004, "step": 12988 }, { "epoch": 3.4265927977839334, "grad_norm": 0.27994459867477417, "learning_rate": 4.383715818165832e-06, "loss": 0.0093, "step": 12990 }, { "epoch": 3.427120432660599, "grad_norm": 0.06057678908109665, "learning_rate": 4.383364107975029e-06, "loss": 0.0011, "step": 12992 }, { "epoch": 3.4276480675372643, "grad_norm": 0.577612042427063, "learning_rate": 4.383012397784226e-06, "loss": 0.0094, "step": 12994 }, { "epoch": 3.4281757024139297, "grad_norm": 0.014978631399571896, "learning_rate": 4.382660687593423e-06, "loss": 0.0004, "step": 12996 }, { "epoch": 3.428703337290595, "grad_norm": 0.030950209125876427, "learning_rate": 4.38230897740262e-06, "loss": 0.0009, "step": 12998 }, { "epoch": 3.42923097216726, "grad_norm": 0.039899904280900955, "learning_rate": 4.381957267211818e-06, "loss": 0.0004, "step": 13000 }, { "epoch": 3.4297586070439254, "grad_norm": 0.25217360258102417, "learning_rate": 4.381605557021014e-06, "loss": 0.0061, "step": 13002 }, { "epoch": 3.430286241920591, "grad_norm": 0.028798682615160942, "learning_rate": 4.381253846830212e-06, "loss": 0.0006, "step": 13004 }, { "epoch": 3.4308138767972562, "grad_norm": 0.029497837647795677, "learning_rate": 4.380902136639409e-06, "loss": 0.0006, "step": 13006 }, { "epoch": 3.4313415116739217, "grad_norm": 0.34164053201675415, "learning_rate": 4.380550426448607e-06, "loss": 0.0009, "step": 13008 }, { "epoch": 3.431869146550587, "grad_norm": 0.01096024364233017, "learning_rate": 4.380198716257804e-06, "loss": 0.0003, "step": 13010 }, { "epoch": 3.4323967814272525, "grad_norm": 0.012626592069864273, "learning_rate": 4.379847006067001e-06, "loss": 0.0008, "step": 13012 }, { "epoch": 3.432924416303918, "grad_norm": 0.49533241987228394, "learning_rate": 4.3794952958761986e-06, "loss": 0.0014, "step": 13014 }, { "epoch": 3.4334520511805833, "grad_norm": 0.14670780301094055, "learning_rate": 4.3791435856853955e-06, "loss": 0.0007, "step": 13016 }, { "epoch": 3.4339796860572482, "grad_norm": 0.01313002873212099, "learning_rate": 4.3787918754945925e-06, "loss": 0.004, "step": 13018 }, { "epoch": 3.4345073209339136, "grad_norm": 0.07587116956710815, "learning_rate": 4.378440165303789e-06, "loss": 0.001, "step": 13020 }, { "epoch": 3.435034955810579, "grad_norm": 0.005071816500276327, "learning_rate": 4.378088455112987e-06, "loss": 0.0002, "step": 13022 }, { "epoch": 3.4355625906872445, "grad_norm": 0.2918905019760132, "learning_rate": 4.377736744922184e-06, "loss": 0.0085, "step": 13024 }, { "epoch": 3.43609022556391, "grad_norm": 0.008690735325217247, "learning_rate": 4.377385034731381e-06, "loss": 0.0007, "step": 13026 }, { "epoch": 3.4366178604405753, "grad_norm": 0.005056079942733049, "learning_rate": 4.377033324540579e-06, "loss": 0.0002, "step": 13028 }, { "epoch": 3.4371454953172407, "grad_norm": 0.024851324036717415, "learning_rate": 4.376681614349776e-06, "loss": 0.0004, "step": 13030 }, { "epoch": 3.4376731301939056, "grad_norm": 0.0058347187004983425, "learning_rate": 4.376329904158973e-06, "loss": 0.0007, "step": 13032 }, { "epoch": 3.438200765070571, "grad_norm": 0.005545295774936676, "learning_rate": 4.375978193968171e-06, "loss": 0.0002, "step": 13034 }, { "epoch": 3.4387283999472364, "grad_norm": 0.04580283910036087, "learning_rate": 4.3756264837773676e-06, "loss": 0.0032, "step": 13036 }, { "epoch": 3.439256034823902, "grad_norm": 0.004706734325736761, "learning_rate": 4.375274773586565e-06, "loss": 0.0002, "step": 13038 }, { "epoch": 3.4397836697005673, "grad_norm": 0.2643555700778961, "learning_rate": 4.3749230633957615e-06, "loss": 0.0133, "step": 13040 }, { "epoch": 3.4403113045772327, "grad_norm": 0.1345919370651245, "learning_rate": 4.374571353204959e-06, "loss": 0.0033, "step": 13042 }, { "epoch": 3.440838939453898, "grad_norm": 0.04056921601295471, "learning_rate": 4.374219643014156e-06, "loss": 0.004, "step": 13044 }, { "epoch": 3.441366574330563, "grad_norm": 0.00903777964413166, "learning_rate": 4.373867932823354e-06, "loss": 0.0003, "step": 13046 }, { "epoch": 3.4418942092072284, "grad_norm": 0.0035441280342638493, "learning_rate": 4.373516222632551e-06, "loss": 0.0003, "step": 13048 }, { "epoch": 3.442421844083894, "grad_norm": 0.3007441759109497, "learning_rate": 4.373164512441748e-06, "loss": 0.0014, "step": 13050 }, { "epoch": 3.4429494789605593, "grad_norm": 0.5816220641136169, "learning_rate": 4.372812802250946e-06, "loss": 0.0085, "step": 13052 }, { "epoch": 3.4434771138372247, "grad_norm": 0.014657935127615929, "learning_rate": 4.372461092060143e-06, "loss": 0.0004, "step": 13054 }, { "epoch": 3.44400474871389, "grad_norm": 0.014287433587014675, "learning_rate": 4.37210938186934e-06, "loss": 0.0004, "step": 13056 }, { "epoch": 3.4445323835905555, "grad_norm": 0.037068773061037064, "learning_rate": 4.3717576716785374e-06, "loss": 0.0004, "step": 13058 }, { "epoch": 3.445060018467221, "grad_norm": 0.48851144313812256, "learning_rate": 4.371405961487734e-06, "loss": 0.0039, "step": 13060 }, { "epoch": 3.445587653343886, "grad_norm": 0.011253196746110916, "learning_rate": 4.371054251296931e-06, "loss": 0.0003, "step": 13062 }, { "epoch": 3.4461152882205512, "grad_norm": 0.0355893075466156, "learning_rate": 4.370702541106128e-06, "loss": 0.0004, "step": 13064 }, { "epoch": 3.4466429230972166, "grad_norm": 0.0057755508460104465, "learning_rate": 4.370350830915326e-06, "loss": 0.0002, "step": 13066 }, { "epoch": 3.447170557973882, "grad_norm": 0.25666695833206177, "learning_rate": 4.369999120724523e-06, "loss": 0.0095, "step": 13068 }, { "epoch": 3.4476981928505475, "grad_norm": 0.3135892152786255, "learning_rate": 4.36964741053372e-06, "loss": 0.0042, "step": 13070 }, { "epoch": 3.448225827727213, "grad_norm": 0.05849575996398926, "learning_rate": 4.369295700342918e-06, "loss": 0.0006, "step": 13072 }, { "epoch": 3.4487534626038783, "grad_norm": 0.018440738320350647, "learning_rate": 4.368943990152115e-06, "loss": 0.0005, "step": 13074 }, { "epoch": 3.4492810974805437, "grad_norm": 0.03921608254313469, "learning_rate": 4.3685922799613125e-06, "loss": 0.0006, "step": 13076 }, { "epoch": 3.4498087323572086, "grad_norm": 0.01355696376413107, "learning_rate": 4.368240569770509e-06, "loss": 0.0003, "step": 13078 }, { "epoch": 3.450336367233874, "grad_norm": 0.29990339279174805, "learning_rate": 4.3678888595797065e-06, "loss": 0.0118, "step": 13080 }, { "epoch": 3.4508640021105395, "grad_norm": 0.020506462082266808, "learning_rate": 4.367537149388903e-06, "loss": 0.0004, "step": 13082 }, { "epoch": 3.451391636987205, "grad_norm": 0.03232302516698837, "learning_rate": 4.367185439198101e-06, "loss": 0.0005, "step": 13084 }, { "epoch": 3.4519192718638703, "grad_norm": 0.12458925694227219, "learning_rate": 4.366833729007298e-06, "loss": 0.001, "step": 13086 }, { "epoch": 3.4524469067405357, "grad_norm": 0.3955652713775635, "learning_rate": 4.366482018816495e-06, "loss": 0.0078, "step": 13088 }, { "epoch": 3.452974541617201, "grad_norm": 0.13617104291915894, "learning_rate": 4.366130308625693e-06, "loss": 0.001, "step": 13090 }, { "epoch": 3.453502176493866, "grad_norm": 0.5061076283454895, "learning_rate": 4.36577859843489e-06, "loss": 0.0022, "step": 13092 }, { "epoch": 3.4540298113705314, "grad_norm": 0.13075537979602814, "learning_rate": 4.365426888244087e-06, "loss": 0.0008, "step": 13094 }, { "epoch": 3.454557446247197, "grad_norm": 0.05834737420082092, "learning_rate": 4.365075178053285e-06, "loss": 0.0008, "step": 13096 }, { "epoch": 3.4550850811238623, "grad_norm": 0.8869106769561768, "learning_rate": 4.3647234678624816e-06, "loss": 0.0022, "step": 13098 }, { "epoch": 3.4556127160005277, "grad_norm": 0.0748029425740242, "learning_rate": 4.3643717576716785e-06, "loss": 0.0005, "step": 13100 }, { "epoch": 3.456140350877193, "grad_norm": 0.8886088728904724, "learning_rate": 4.3640200474808755e-06, "loss": 0.0031, "step": 13102 }, { "epoch": 3.4566679857538585, "grad_norm": 0.4362086355686188, "learning_rate": 4.363668337290073e-06, "loss": 0.0037, "step": 13104 }, { "epoch": 3.4571956206305234, "grad_norm": 0.052179355174303055, "learning_rate": 4.36331662709927e-06, "loss": 0.0046, "step": 13106 }, { "epoch": 3.457723255507189, "grad_norm": 0.06384149938821793, "learning_rate": 4.362964916908467e-06, "loss": 0.0035, "step": 13108 }, { "epoch": 3.4582508903838542, "grad_norm": 0.40077880024909973, "learning_rate": 4.362613206717665e-06, "loss": 0.0012, "step": 13110 }, { "epoch": 3.4587785252605197, "grad_norm": 0.5288136005401611, "learning_rate": 4.362261496526862e-06, "loss": 0.0034, "step": 13112 }, { "epoch": 3.459306160137185, "grad_norm": 0.28058260679244995, "learning_rate": 4.36190978633606e-06, "loss": 0.0008, "step": 13114 }, { "epoch": 3.4598337950138505, "grad_norm": 0.07991799712181091, "learning_rate": 4.361558076145257e-06, "loss": 0.0006, "step": 13116 }, { "epoch": 3.460361429890516, "grad_norm": 0.07273358851671219, "learning_rate": 4.361206365954454e-06, "loss": 0.0007, "step": 13118 }, { "epoch": 3.4608890647671813, "grad_norm": 0.12322558462619781, "learning_rate": 4.3608546557636514e-06, "loss": 0.0005, "step": 13120 }, { "epoch": 3.4614166996438467, "grad_norm": 0.017828013747930527, "learning_rate": 4.3605029455728475e-06, "loss": 0.0003, "step": 13122 }, { "epoch": 3.4619443345205116, "grad_norm": 0.029375603422522545, "learning_rate": 4.360151235382045e-06, "loss": 0.0032, "step": 13124 }, { "epoch": 3.462471969397177, "grad_norm": 0.027040105313062668, "learning_rate": 4.359799525191242e-06, "loss": 0.0005, "step": 13126 }, { "epoch": 3.4629996042738425, "grad_norm": 0.21456265449523926, "learning_rate": 4.35944781500044e-06, "loss": 0.0055, "step": 13128 }, { "epoch": 3.463527239150508, "grad_norm": 0.15995769202709198, "learning_rate": 4.359096104809637e-06, "loss": 0.0056, "step": 13130 }, { "epoch": 3.4640548740271733, "grad_norm": 0.004556563217192888, "learning_rate": 4.358744394618834e-06, "loss": 0.0002, "step": 13132 }, { "epoch": 3.4645825089038387, "grad_norm": 0.01866115815937519, "learning_rate": 4.358392684428032e-06, "loss": 0.0003, "step": 13134 }, { "epoch": 3.465110143780504, "grad_norm": 0.05169152468442917, "learning_rate": 4.358040974237229e-06, "loss": 0.0095, "step": 13136 }, { "epoch": 3.465637778657169, "grad_norm": 0.0031776067335158587, "learning_rate": 4.357689264046426e-06, "loss": 0.0002, "step": 13138 }, { "epoch": 3.4661654135338344, "grad_norm": 0.017414700239896774, "learning_rate": 4.357337553855623e-06, "loss": 0.0007, "step": 13140 }, { "epoch": 3.4666930484105, "grad_norm": 0.015865182504057884, "learning_rate": 4.3569858436648204e-06, "loss": 0.0014, "step": 13142 }, { "epoch": 3.4672206832871653, "grad_norm": 0.0730496197938919, "learning_rate": 4.356634133474018e-06, "loss": 0.0011, "step": 13144 }, { "epoch": 3.4677483181638307, "grad_norm": 0.0064279162324965, "learning_rate": 4.356282423283214e-06, "loss": 0.0002, "step": 13146 }, { "epoch": 3.468275953040496, "grad_norm": 0.0029395411256700754, "learning_rate": 4.355930713092412e-06, "loss": 0.0028, "step": 13148 }, { "epoch": 3.4688035879171615, "grad_norm": 0.010623577050864697, "learning_rate": 4.355579002901609e-06, "loss": 0.0062, "step": 13150 }, { "epoch": 3.4693312227938264, "grad_norm": 0.0297622699290514, "learning_rate": 4.355227292710806e-06, "loss": 0.0003, "step": 13152 }, { "epoch": 3.469858857670492, "grad_norm": 0.037312477827072144, "learning_rate": 4.354875582520004e-06, "loss": 0.0003, "step": 13154 }, { "epoch": 3.4703864925471573, "grad_norm": 0.025308195501565933, "learning_rate": 4.354523872329201e-06, "loss": 0.0003, "step": 13156 }, { "epoch": 3.4709141274238227, "grad_norm": 0.14201058447360992, "learning_rate": 4.354172162138399e-06, "loss": 0.0008, "step": 13158 }, { "epoch": 3.471441762300488, "grad_norm": 0.35391268134117126, "learning_rate": 4.353820451947595e-06, "loss": 0.0023, "step": 13160 }, { "epoch": 3.4719693971771535, "grad_norm": 0.04690298065543175, "learning_rate": 4.3534687417567925e-06, "loss": 0.0003, "step": 13162 }, { "epoch": 3.472497032053819, "grad_norm": 0.3290429413318634, "learning_rate": 4.3531170315659895e-06, "loss": 0.0017, "step": 13164 }, { "epoch": 3.4730246669304843, "grad_norm": 0.013532433658838272, "learning_rate": 4.352765321375187e-06, "loss": 0.0002, "step": 13166 }, { "epoch": 3.4735523018071497, "grad_norm": 0.3319922387599945, "learning_rate": 4.352413611184384e-06, "loss": 0.0024, "step": 13168 }, { "epoch": 3.4740799366838147, "grad_norm": 0.005596622359007597, "learning_rate": 4.352061900993581e-06, "loss": 0.0013, "step": 13170 }, { "epoch": 3.47460757156048, "grad_norm": 0.012886370532214642, "learning_rate": 4.351710190802779e-06, "loss": 0.0003, "step": 13172 }, { "epoch": 3.4751352064371455, "grad_norm": 0.5337037444114685, "learning_rate": 4.351358480611976e-06, "loss": 0.0009, "step": 13174 }, { "epoch": 3.475662841313811, "grad_norm": 0.010564209893345833, "learning_rate": 4.351006770421173e-06, "loss": 0.0002, "step": 13176 }, { "epoch": 3.4761904761904763, "grad_norm": 0.005613725166767836, "learning_rate": 4.350655060230371e-06, "loss": 0.001, "step": 13178 }, { "epoch": 3.4767181110671417, "grad_norm": 0.11393532156944275, "learning_rate": 4.350303350039568e-06, "loss": 0.0007, "step": 13180 }, { "epoch": 3.477245745943807, "grad_norm": 0.008468561805784702, "learning_rate": 4.3499516398487646e-06, "loss": 0.0003, "step": 13182 }, { "epoch": 3.477773380820472, "grad_norm": 0.008105440996587276, "learning_rate": 4.3495999296579615e-06, "loss": 0.0003, "step": 13184 }, { "epoch": 3.4783010156971375, "grad_norm": 0.019953155890107155, "learning_rate": 4.349248219467159e-06, "loss": 0.0003, "step": 13186 }, { "epoch": 3.478828650573803, "grad_norm": 0.0025226478464901447, "learning_rate": 4.348896509276356e-06, "loss": 0.0002, "step": 13188 }, { "epoch": 3.4793562854504683, "grad_norm": 0.006612213794142008, "learning_rate": 4.348544799085553e-06, "loss": 0.0003, "step": 13190 }, { "epoch": 3.4798839203271337, "grad_norm": 0.4899614155292511, "learning_rate": 4.348193088894751e-06, "loss": 0.0086, "step": 13192 }, { "epoch": 3.480411555203799, "grad_norm": 0.0036378465592861176, "learning_rate": 4.347841378703948e-06, "loss": 0.0002, "step": 13194 }, { "epoch": 3.4809391900804645, "grad_norm": 0.026411496102809906, "learning_rate": 4.347489668513146e-06, "loss": 0.0003, "step": 13196 }, { "epoch": 3.4814668249571294, "grad_norm": 0.005129536148160696, "learning_rate": 4.347137958322342e-06, "loss": 0.0002, "step": 13198 }, { "epoch": 3.481994459833795, "grad_norm": 0.20345628261566162, "learning_rate": 4.34678624813154e-06, "loss": 0.0009, "step": 13200 }, { "epoch": 3.4825220947104603, "grad_norm": 0.21554423868656158, "learning_rate": 4.3464345379407375e-06, "loss": 0.0005, "step": 13202 }, { "epoch": 3.4830497295871257, "grad_norm": 0.0071482970379292965, "learning_rate": 4.3460828277499344e-06, "loss": 0.0003, "step": 13204 }, { "epoch": 3.483577364463791, "grad_norm": 0.014004405587911606, "learning_rate": 4.345731117559131e-06, "loss": 0.0074, "step": 13206 }, { "epoch": 3.4841049993404565, "grad_norm": 0.004129232373088598, "learning_rate": 4.345379407368328e-06, "loss": 0.0034, "step": 13208 }, { "epoch": 3.484632634217122, "grad_norm": 0.011180891655385494, "learning_rate": 4.345027697177526e-06, "loss": 0.0009, "step": 13210 }, { "epoch": 3.4851602690937873, "grad_norm": 0.06254064291715622, "learning_rate": 4.344675986986723e-06, "loss": 0.003, "step": 13212 }, { "epoch": 3.4856879039704527, "grad_norm": 0.006023056339472532, "learning_rate": 4.34432427679592e-06, "loss": 0.005, "step": 13214 }, { "epoch": 3.4862155388471177, "grad_norm": 0.011704923585057259, "learning_rate": 4.343972566605118e-06, "loss": 0.0002, "step": 13216 }, { "epoch": 3.486743173723783, "grad_norm": 0.011373363435268402, "learning_rate": 4.343620856414315e-06, "loss": 0.0025, "step": 13218 }, { "epoch": 3.4872708086004485, "grad_norm": 0.009751264937222004, "learning_rate": 4.343269146223512e-06, "loss": 0.0007, "step": 13220 }, { "epoch": 3.487798443477114, "grad_norm": 0.14088769257068634, "learning_rate": 4.342917436032709e-06, "loss": 0.0038, "step": 13222 }, { "epoch": 3.4883260783537793, "grad_norm": 0.015768088400363922, "learning_rate": 4.3425657258419065e-06, "loss": 0.0002, "step": 13224 }, { "epoch": 3.4888537132304447, "grad_norm": 0.008931217715144157, "learning_rate": 4.3422140156511035e-06, "loss": 0.0002, "step": 13226 }, { "epoch": 3.48938134810711, "grad_norm": 0.016812140122056007, "learning_rate": 4.3418623054603e-06, "loss": 0.0027, "step": 13228 }, { "epoch": 3.489908982983775, "grad_norm": 0.012283899821341038, "learning_rate": 4.341510595269498e-06, "loss": 0.0003, "step": 13230 }, { "epoch": 3.4904366178604405, "grad_norm": 0.015336102806031704, "learning_rate": 4.341158885078695e-06, "loss": 0.0003, "step": 13232 }, { "epoch": 3.490964252737106, "grad_norm": 0.003428102470934391, "learning_rate": 4.340807174887893e-06, "loss": 0.0002, "step": 13234 }, { "epoch": 3.4914918876137713, "grad_norm": 0.026093021035194397, "learning_rate": 4.34045546469709e-06, "loss": 0.0003, "step": 13236 }, { "epoch": 3.4920195224904367, "grad_norm": 0.11161240935325623, "learning_rate": 4.340103754506287e-06, "loss": 0.0005, "step": 13238 }, { "epoch": 3.492547157367102, "grad_norm": 0.2340737134218216, "learning_rate": 4.339752044315485e-06, "loss": 0.0075, "step": 13240 }, { "epoch": 3.4930747922437675, "grad_norm": 0.0055922153405845165, "learning_rate": 4.339400334124681e-06, "loss": 0.007, "step": 13242 }, { "epoch": 3.4936024271204325, "grad_norm": 0.1649760901927948, "learning_rate": 4.3390486239338786e-06, "loss": 0.0076, "step": 13244 }, { "epoch": 3.494130061997098, "grad_norm": 0.31338179111480713, "learning_rate": 4.3386969137430755e-06, "loss": 0.0029, "step": 13246 }, { "epoch": 3.4946576968737633, "grad_norm": 0.012039115652441978, "learning_rate": 4.338345203552273e-06, "loss": 0.0002, "step": 13248 }, { "epoch": 3.4951853317504287, "grad_norm": 0.04089204594492912, "learning_rate": 4.33799349336147e-06, "loss": 0.0003, "step": 13250 }, { "epoch": 3.495712966627094, "grad_norm": 0.007717893458902836, "learning_rate": 4.337641783170667e-06, "loss": 0.0061, "step": 13252 }, { "epoch": 3.4962406015037595, "grad_norm": 0.02583939954638481, "learning_rate": 4.337290072979865e-06, "loss": 0.0031, "step": 13254 }, { "epoch": 3.496768236380425, "grad_norm": 0.2526804208755493, "learning_rate": 4.336938362789062e-06, "loss": 0.0105, "step": 13256 }, { "epoch": 3.49729587125709, "grad_norm": 0.016870586201548576, "learning_rate": 4.336586652598259e-06, "loss": 0.001, "step": 13258 }, { "epoch": 3.4978235061337553, "grad_norm": 0.014898614957928658, "learning_rate": 4.336234942407457e-06, "loss": 0.0002, "step": 13260 }, { "epoch": 3.4983511410104207, "grad_norm": 0.018349995836615562, "learning_rate": 4.335883232216654e-06, "loss": 0.0005, "step": 13262 }, { "epoch": 3.498878775887086, "grad_norm": 0.04864424467086792, "learning_rate": 4.3355315220258515e-06, "loss": 0.0013, "step": 13264 }, { "epoch": 3.4994064107637515, "grad_norm": 0.011088481172919273, "learning_rate": 4.335179811835048e-06, "loss": 0.0003, "step": 13266 }, { "epoch": 3.499934045640417, "grad_norm": 0.02133924327790737, "learning_rate": 4.334828101644245e-06, "loss": 0.004, "step": 13268 }, { "epoch": 3.5004616805170823, "grad_norm": 0.01515919715166092, "learning_rate": 4.334476391453442e-06, "loss": 0.0006, "step": 13270 }, { "epoch": 3.5009893153937477, "grad_norm": 0.05614650994539261, "learning_rate": 4.334124681262639e-06, "loss": 0.0005, "step": 13272 }, { "epoch": 3.501516950270413, "grad_norm": 0.4540683329105377, "learning_rate": 4.333772971071837e-06, "loss": 0.0085, "step": 13274 }, { "epoch": 3.502044585147078, "grad_norm": 0.008722619153559208, "learning_rate": 4.333421260881034e-06, "loss": 0.0049, "step": 13276 }, { "epoch": 3.5025722200237435, "grad_norm": 0.8764664530754089, "learning_rate": 4.333069550690232e-06, "loss": 0.0017, "step": 13278 }, { "epoch": 3.503099854900409, "grad_norm": 0.5214884281158447, "learning_rate": 4.332717840499428e-06, "loss": 0.0055, "step": 13280 }, { "epoch": 3.5036274897770743, "grad_norm": 0.07047612965106964, "learning_rate": 4.332366130308626e-06, "loss": 0.0004, "step": 13282 }, { "epoch": 3.5041551246537397, "grad_norm": 0.009460634551942348, "learning_rate": 4.332014420117823e-06, "loss": 0.0004, "step": 13284 }, { "epoch": 3.504682759530405, "grad_norm": 0.05780990421772003, "learning_rate": 4.3316627099270205e-06, "loss": 0.0006, "step": 13286 }, { "epoch": 3.5052103944070705, "grad_norm": 0.22898143529891968, "learning_rate": 4.3313109997362174e-06, "loss": 0.0011, "step": 13288 }, { "epoch": 3.5057380292837355, "grad_norm": 0.014289306476712227, "learning_rate": 4.330959289545414e-06, "loss": 0.0003, "step": 13290 }, { "epoch": 3.506265664160401, "grad_norm": 0.18695764243602753, "learning_rate": 4.330607579354612e-06, "loss": 0.001, "step": 13292 }, { "epoch": 3.5067932990370663, "grad_norm": 0.33496999740600586, "learning_rate": 4.330255869163809e-06, "loss": 0.0021, "step": 13294 }, { "epoch": 3.5073209339137317, "grad_norm": 0.02822953090071678, "learning_rate": 4.329904158973006e-06, "loss": 0.0007, "step": 13296 }, { "epoch": 3.507848568790397, "grad_norm": 0.020585551857948303, "learning_rate": 4.329552448782204e-06, "loss": 0.0005, "step": 13298 }, { "epoch": 3.5083762036670625, "grad_norm": 0.06561346352100372, "learning_rate": 4.329200738591401e-06, "loss": 0.0005, "step": 13300 }, { "epoch": 3.508903838543728, "grad_norm": 0.0047559537924826145, "learning_rate": 4.328849028400598e-06, "loss": 0.0002, "step": 13302 }, { "epoch": 3.509431473420393, "grad_norm": 0.006506762001663446, "learning_rate": 4.328497318209795e-06, "loss": 0.0019, "step": 13304 }, { "epoch": 3.5099591082970587, "grad_norm": 0.0054887873120605946, "learning_rate": 4.3281456080189926e-06, "loss": 0.0007, "step": 13306 }, { "epoch": 3.5104867431737237, "grad_norm": 0.01929962821304798, "learning_rate": 4.3277938978281895e-06, "loss": 0.0003, "step": 13308 }, { "epoch": 3.511014378050389, "grad_norm": 0.011935449205338955, "learning_rate": 4.3274421876373865e-06, "loss": 0.0016, "step": 13310 }, { "epoch": 3.5115420129270545, "grad_norm": 0.0028760943096131086, "learning_rate": 4.327090477446584e-06, "loss": 0.0024, "step": 13312 }, { "epoch": 3.51206964780372, "grad_norm": 0.031216997653245926, "learning_rate": 4.326738767255781e-06, "loss": 0.0004, "step": 13314 }, { "epoch": 3.5125972826803853, "grad_norm": 0.049394089728593826, "learning_rate": 4.326387057064979e-06, "loss": 0.0032, "step": 13316 }, { "epoch": 3.5131249175570503, "grad_norm": 0.0072382972575724125, "learning_rate": 4.326035346874176e-06, "loss": 0.0004, "step": 13318 }, { "epoch": 3.513652552433716, "grad_norm": 0.009272227995097637, "learning_rate": 4.325683636683373e-06, "loss": 0.0002, "step": 13320 }, { "epoch": 3.514180187310381, "grad_norm": 0.564498245716095, "learning_rate": 4.325331926492571e-06, "loss": 0.0028, "step": 13322 }, { "epoch": 3.5147078221870465, "grad_norm": 0.11753394454717636, "learning_rate": 4.324980216301768e-06, "loss": 0.0006, "step": 13324 }, { "epoch": 3.515235457063712, "grad_norm": 0.034262821078300476, "learning_rate": 4.324628506110965e-06, "loss": 0.0003, "step": 13326 }, { "epoch": 3.5157630919403773, "grad_norm": 0.010153948329389095, "learning_rate": 4.324276795920162e-06, "loss": 0.0015, "step": 13328 }, { "epoch": 3.5162907268170427, "grad_norm": 0.001910247839987278, "learning_rate": 4.323925085729359e-06, "loss": 0.0002, "step": 13330 }, { "epoch": 3.516818361693708, "grad_norm": 0.028594626113772392, "learning_rate": 4.323573375538556e-06, "loss": 0.0011, "step": 13332 }, { "epoch": 3.5173459965703735, "grad_norm": 0.027047058567404747, "learning_rate": 4.323221665347753e-06, "loss": 0.0005, "step": 13334 }, { "epoch": 3.5178736314470385, "grad_norm": 0.0920131728053093, "learning_rate": 4.322869955156951e-06, "loss": 0.0024, "step": 13336 }, { "epoch": 3.518401266323704, "grad_norm": 0.08743248879909515, "learning_rate": 4.322518244966148e-06, "loss": 0.0004, "step": 13338 }, { "epoch": 3.5189289012003693, "grad_norm": 0.08167106658220291, "learning_rate": 4.322166534775345e-06, "loss": 0.0006, "step": 13340 }, { "epoch": 3.5194565360770347, "grad_norm": 0.008926757611334324, "learning_rate": 4.321814824584542e-06, "loss": 0.0011, "step": 13342 }, { "epoch": 3.5199841709537, "grad_norm": 0.028700901195406914, "learning_rate": 4.32146311439374e-06, "loss": 0.0026, "step": 13344 }, { "epoch": 3.5205118058303655, "grad_norm": 0.8650322556495667, "learning_rate": 4.3211114042029375e-06, "loss": 0.0053, "step": 13346 }, { "epoch": 3.521039440707031, "grad_norm": 0.004549696110188961, "learning_rate": 4.320759694012134e-06, "loss": 0.0002, "step": 13348 }, { "epoch": 3.521567075583696, "grad_norm": 0.7402355074882507, "learning_rate": 4.3204079838213314e-06, "loss": 0.0035, "step": 13350 }, { "epoch": 3.5220947104603613, "grad_norm": 0.006012794096022844, "learning_rate": 4.320056273630528e-06, "loss": 0.0002, "step": 13352 }, { "epoch": 3.5226223453370267, "grad_norm": 0.007897492498159409, "learning_rate": 4.319704563439726e-06, "loss": 0.0002, "step": 13354 }, { "epoch": 3.523149980213692, "grad_norm": 0.004594693426042795, "learning_rate": 4.319352853248923e-06, "loss": 0.0007, "step": 13356 }, { "epoch": 3.5236776150903575, "grad_norm": 0.18988509476184845, "learning_rate": 4.31900114305812e-06, "loss": 0.0051, "step": 13358 }, { "epoch": 3.524205249967023, "grad_norm": 0.008360135369002819, "learning_rate": 4.318649432867318e-06, "loss": 0.0002, "step": 13360 }, { "epoch": 3.5247328848436883, "grad_norm": 0.008144166320562363, "learning_rate": 4.318297722676514e-06, "loss": 0.0002, "step": 13362 }, { "epoch": 3.5252605197203533, "grad_norm": 0.6613401174545288, "learning_rate": 4.317946012485712e-06, "loss": 0.0075, "step": 13364 }, { "epoch": 3.525788154597019, "grad_norm": 0.09105957299470901, "learning_rate": 4.317594302294909e-06, "loss": 0.0003, "step": 13366 }, { "epoch": 3.526315789473684, "grad_norm": 0.003258650191128254, "learning_rate": 4.3172425921041066e-06, "loss": 0.0087, "step": 13368 }, { "epoch": 3.5268434243503495, "grad_norm": 0.3835253119468689, "learning_rate": 4.3168908819133035e-06, "loss": 0.0056, "step": 13370 }, { "epoch": 3.527371059227015, "grad_norm": 0.015070777386426926, "learning_rate": 4.3165391717225005e-06, "loss": 0.0011, "step": 13372 }, { "epoch": 3.5278986941036803, "grad_norm": 0.02322353795170784, "learning_rate": 4.316187461531698e-06, "loss": 0.0003, "step": 13374 }, { "epoch": 3.5284263289803457, "grad_norm": 0.025746583938598633, "learning_rate": 4.315835751340895e-06, "loss": 0.0036, "step": 13376 }, { "epoch": 3.528953963857011, "grad_norm": 0.05166883021593094, "learning_rate": 4.315484041150092e-06, "loss": 0.0006, "step": 13378 }, { "epoch": 3.5294815987336765, "grad_norm": 0.07755424827337265, "learning_rate": 4.31513233095929e-06, "loss": 0.0062, "step": 13380 }, { "epoch": 3.5300092336103415, "grad_norm": 0.29034361243247986, "learning_rate": 4.314780620768487e-06, "loss": 0.0011, "step": 13382 }, { "epoch": 3.530536868487007, "grad_norm": 0.04520199075341225, "learning_rate": 4.314428910577685e-06, "loss": 0.0016, "step": 13384 }, { "epoch": 3.5310645033636723, "grad_norm": 0.028783384710550308, "learning_rate": 4.314077200386881e-06, "loss": 0.0003, "step": 13386 }, { "epoch": 3.5315921382403377, "grad_norm": 0.030868535861372948, "learning_rate": 4.313725490196079e-06, "loss": 0.0004, "step": 13388 }, { "epoch": 3.532119773117003, "grad_norm": 0.019039282575249672, "learning_rate": 4.3133737800052756e-06, "loss": 0.0003, "step": 13390 }, { "epoch": 3.5326474079936685, "grad_norm": 0.012805342674255371, "learning_rate": 4.3130220698144725e-06, "loss": 0.0002, "step": 13392 }, { "epoch": 3.533175042870334, "grad_norm": 0.00668678805232048, "learning_rate": 4.31267035962367e-06, "loss": 0.0051, "step": 13394 }, { "epoch": 3.533702677746999, "grad_norm": 0.014877881854772568, "learning_rate": 4.312318649432867e-06, "loss": 0.0027, "step": 13396 }, { "epoch": 3.5342303126236643, "grad_norm": 0.37906238436698914, "learning_rate": 4.311966939242065e-06, "loss": 0.0104, "step": 13398 }, { "epoch": 3.5347579475003297, "grad_norm": 0.007091152481734753, "learning_rate": 4.311615229051261e-06, "loss": 0.0002, "step": 13400 }, { "epoch": 3.535285582376995, "grad_norm": 0.1502850353717804, "learning_rate": 4.311263518860459e-06, "loss": 0.0007, "step": 13402 }, { "epoch": 3.5358132172536605, "grad_norm": 0.05703231319785118, "learning_rate": 4.310911808669657e-06, "loss": 0.0005, "step": 13404 }, { "epoch": 3.536340852130326, "grad_norm": 0.478443443775177, "learning_rate": 4.310560098478854e-06, "loss": 0.0069, "step": 13406 }, { "epoch": 3.5368684870069913, "grad_norm": 0.0059545100666582584, "learning_rate": 4.310208388288051e-06, "loss": 0.0005, "step": 13408 }, { "epoch": 3.5373961218836563, "grad_norm": 0.14704541862010956, "learning_rate": 4.309856678097248e-06, "loss": 0.0018, "step": 13410 }, { "epoch": 3.537923756760322, "grad_norm": 0.02576647326350212, "learning_rate": 4.3095049679064454e-06, "loss": 0.0003, "step": 13412 }, { "epoch": 3.538451391636987, "grad_norm": 0.01410730741918087, "learning_rate": 4.309153257715642e-06, "loss": 0.0003, "step": 13414 }, { "epoch": 3.5389790265136525, "grad_norm": 1.1303704977035522, "learning_rate": 4.308801547524839e-06, "loss": 0.0032, "step": 13416 }, { "epoch": 3.539506661390318, "grad_norm": 0.006491889711469412, "learning_rate": 4.308449837334037e-06, "loss": 0.0003, "step": 13418 }, { "epoch": 3.5400342962669833, "grad_norm": 0.13826636970043182, "learning_rate": 4.308098127143234e-06, "loss": 0.0021, "step": 13420 }, { "epoch": 3.5405619311436487, "grad_norm": 0.06480085849761963, "learning_rate": 4.307746416952431e-06, "loss": 0.0004, "step": 13422 }, { "epoch": 3.541089566020314, "grad_norm": 0.006297117564827204, "learning_rate": 4.307394706761628e-06, "loss": 0.0003, "step": 13424 }, { "epoch": 3.5416172008969795, "grad_norm": 0.007032477296888828, "learning_rate": 4.307042996570826e-06, "loss": 0.0012, "step": 13426 }, { "epoch": 3.5421448357736445, "grad_norm": 0.012903342954814434, "learning_rate": 4.306691286380024e-06, "loss": 0.0003, "step": 13428 }, { "epoch": 3.54267247065031, "grad_norm": 0.04058244451880455, "learning_rate": 4.30633957618922e-06, "loss": 0.0004, "step": 13430 }, { "epoch": 3.5432001055269753, "grad_norm": 0.46897703409194946, "learning_rate": 4.3059878659984175e-06, "loss": 0.0046, "step": 13432 }, { "epoch": 3.5437277404036407, "grad_norm": 0.009358106181025505, "learning_rate": 4.3056361558076145e-06, "loss": 0.0002, "step": 13434 }, { "epoch": 3.544255375280306, "grad_norm": 0.12053932994604111, "learning_rate": 4.305284445616812e-06, "loss": 0.0026, "step": 13436 }, { "epoch": 3.5447830101569715, "grad_norm": 0.05883258953690529, "learning_rate": 4.304932735426009e-06, "loss": 0.0004, "step": 13438 }, { "epoch": 3.545310645033637, "grad_norm": 0.005546258762478828, "learning_rate": 4.304581025235206e-06, "loss": 0.001, "step": 13440 }, { "epoch": 3.545838279910302, "grad_norm": 0.10014908760786057, "learning_rate": 4.304229315044404e-06, "loss": 0.001, "step": 13442 }, { "epoch": 3.5463659147869673, "grad_norm": 0.2943641245365143, "learning_rate": 4.303877604853601e-06, "loss": 0.0043, "step": 13444 }, { "epoch": 3.5468935496636327, "grad_norm": 0.01833757571876049, "learning_rate": 4.303525894662798e-06, "loss": 0.0003, "step": 13446 }, { "epoch": 3.547421184540298, "grad_norm": 0.02262692339718342, "learning_rate": 4.303174184471995e-06, "loss": 0.0011, "step": 13448 }, { "epoch": 3.5479488194169635, "grad_norm": 0.02639024518430233, "learning_rate": 4.302822474281193e-06, "loss": 0.0003, "step": 13450 }, { "epoch": 3.548476454293629, "grad_norm": 0.2987699508666992, "learning_rate": 4.3024707640903896e-06, "loss": 0.0209, "step": 13452 }, { "epoch": 3.5490040891702943, "grad_norm": 0.03407684341073036, "learning_rate": 4.3021190538995865e-06, "loss": 0.0003, "step": 13454 }, { "epoch": 3.5495317240469593, "grad_norm": 0.12434245645999908, "learning_rate": 4.301767343708784e-06, "loss": 0.0015, "step": 13456 }, { "epoch": 3.550059358923625, "grad_norm": 0.6529891490936279, "learning_rate": 4.301415633517981e-06, "loss": 0.0033, "step": 13458 }, { "epoch": 3.55058699380029, "grad_norm": 0.12056410312652588, "learning_rate": 4.301063923327178e-06, "loss": 0.0065, "step": 13460 }, { "epoch": 3.5511146286769555, "grad_norm": 0.25760510563850403, "learning_rate": 4.300712213136376e-06, "loss": 0.0036, "step": 13462 }, { "epoch": 3.551642263553621, "grad_norm": 0.031822819262742996, "learning_rate": 4.300360502945573e-06, "loss": 0.0011, "step": 13464 }, { "epoch": 3.5521698984302863, "grad_norm": 0.12052265554666519, "learning_rate": 4.300008792754771e-06, "loss": 0.0005, "step": 13466 }, { "epoch": 3.5526975333069517, "grad_norm": 0.9439338445663452, "learning_rate": 4.299657082563967e-06, "loss": 0.0018, "step": 13468 }, { "epoch": 3.5532251681836167, "grad_norm": 0.09739114344120026, "learning_rate": 4.299305372373165e-06, "loss": 0.0005, "step": 13470 }, { "epoch": 3.5537528030602825, "grad_norm": 0.0846158117055893, "learning_rate": 4.298953662182362e-06, "loss": 0.0009, "step": 13472 }, { "epoch": 3.5542804379369475, "grad_norm": 0.22919055819511414, "learning_rate": 4.2986019519915594e-06, "loss": 0.001, "step": 13474 }, { "epoch": 3.554808072813613, "grad_norm": 0.01696471869945526, "learning_rate": 4.298250241800756e-06, "loss": 0.0053, "step": 13476 }, { "epoch": 3.5553357076902783, "grad_norm": 0.11864235252141953, "learning_rate": 4.297898531609953e-06, "loss": 0.0007, "step": 13478 }, { "epoch": 3.5558633425669437, "grad_norm": 0.021761205047369003, "learning_rate": 4.297546821419151e-06, "loss": 0.0003, "step": 13480 }, { "epoch": 3.556390977443609, "grad_norm": 0.026681581512093544, "learning_rate": 4.297195111228347e-06, "loss": 0.0003, "step": 13482 }, { "epoch": 3.5569186123202745, "grad_norm": 0.00532354274764657, "learning_rate": 4.296843401037545e-06, "loss": 0.0002, "step": 13484 }, { "epoch": 3.55744624719694, "grad_norm": 0.006739332806318998, "learning_rate": 4.296491690846743e-06, "loss": 0.0003, "step": 13486 }, { "epoch": 3.557973882073605, "grad_norm": 0.054661884903907776, "learning_rate": 4.29613998065594e-06, "loss": 0.0003, "step": 13488 }, { "epoch": 3.5585015169502703, "grad_norm": 0.010520685464143753, "learning_rate": 4.295788270465137e-06, "loss": 0.0015, "step": 13490 }, { "epoch": 3.5590291518269357, "grad_norm": 0.016515502706170082, "learning_rate": 4.295436560274334e-06, "loss": 0.0004, "step": 13492 }, { "epoch": 3.559556786703601, "grad_norm": 0.014449091628193855, "learning_rate": 4.2950848500835315e-06, "loss": 0.0002, "step": 13494 }, { "epoch": 3.5600844215802665, "grad_norm": 0.0034496886655688286, "learning_rate": 4.2947331398927284e-06, "loss": 0.0002, "step": 13496 }, { "epoch": 3.560612056456932, "grad_norm": 0.017161397263407707, "learning_rate": 4.294381429701925e-06, "loss": 0.004, "step": 13498 }, { "epoch": 3.5611396913335973, "grad_norm": 0.003303114091977477, "learning_rate": 4.294029719511123e-06, "loss": 0.0002, "step": 13500 }, { "epoch": 3.5616673262102623, "grad_norm": 0.012962696142494678, "learning_rate": 4.29367800932032e-06, "loss": 0.0011, "step": 13502 }, { "epoch": 3.5621949610869277, "grad_norm": 0.004464369732886553, "learning_rate": 4.293326299129518e-06, "loss": 0.0035, "step": 13504 }, { "epoch": 3.562722595963593, "grad_norm": 0.01454856339842081, "learning_rate": 4.292974588938714e-06, "loss": 0.0003, "step": 13506 }, { "epoch": 3.5632502308402585, "grad_norm": 0.12031516432762146, "learning_rate": 4.292622878747912e-06, "loss": 0.0004, "step": 13508 }, { "epoch": 3.563777865716924, "grad_norm": 0.5175594687461853, "learning_rate": 4.292271168557109e-06, "loss": 0.0033, "step": 13510 }, { "epoch": 3.5643055005935893, "grad_norm": 1.1573107242584229, "learning_rate": 4.291919458366306e-06, "loss": 0.0104, "step": 13512 }, { "epoch": 3.5648331354702547, "grad_norm": 0.2583560049533844, "learning_rate": 4.2915677481755036e-06, "loss": 0.01, "step": 13514 }, { "epoch": 3.5653607703469197, "grad_norm": 0.12043680250644684, "learning_rate": 4.2912160379847005e-06, "loss": 0.0087, "step": 13516 }, { "epoch": 3.5658884052235855, "grad_norm": 0.10114762932062149, "learning_rate": 4.290864327793898e-06, "loss": 0.0003, "step": 13518 }, { "epoch": 3.5664160401002505, "grad_norm": 0.2000860720872879, "learning_rate": 4.290512617603095e-06, "loss": 0.0012, "step": 13520 }, { "epoch": 3.566943674976916, "grad_norm": 0.11029908806085587, "learning_rate": 4.290160907412292e-06, "loss": 0.0006, "step": 13522 }, { "epoch": 3.5674713098535813, "grad_norm": 0.28572240471839905, "learning_rate": 4.28980919722149e-06, "loss": 0.0038, "step": 13524 }, { "epoch": 3.5679989447302467, "grad_norm": 0.10195367783308029, "learning_rate": 4.289457487030687e-06, "loss": 0.0013, "step": 13526 }, { "epoch": 3.568526579606912, "grad_norm": 0.06287452578544617, "learning_rate": 4.289105776839884e-06, "loss": 0.0004, "step": 13528 }, { "epoch": 3.5690542144835775, "grad_norm": 0.6882668137550354, "learning_rate": 4.288754066649081e-06, "loss": 0.0035, "step": 13530 }, { "epoch": 3.569581849360243, "grad_norm": 0.9215105175971985, "learning_rate": 4.288402356458279e-06, "loss": 0.003, "step": 13532 }, { "epoch": 3.570109484236908, "grad_norm": 0.04526599869132042, "learning_rate": 4.288050646267476e-06, "loss": 0.0004, "step": 13534 }, { "epoch": 3.5706371191135733, "grad_norm": 0.017147246748209, "learning_rate": 4.2876989360766726e-06, "loss": 0.0003, "step": 13536 }, { "epoch": 3.5711647539902387, "grad_norm": 0.024976462125778198, "learning_rate": 4.28734722588587e-06, "loss": 0.0003, "step": 13538 }, { "epoch": 3.571692388866904, "grad_norm": 0.011230533942580223, "learning_rate": 4.286995515695067e-06, "loss": 0.0036, "step": 13540 }, { "epoch": 3.5722200237435695, "grad_norm": 0.007072494365274906, "learning_rate": 4.286643805504264e-06, "loss": 0.0002, "step": 13542 }, { "epoch": 3.572747658620235, "grad_norm": 0.026986276730895042, "learning_rate": 4.286292095313462e-06, "loss": 0.0003, "step": 13544 }, { "epoch": 3.5732752934969003, "grad_norm": 0.009494072757661343, "learning_rate": 4.285940385122659e-06, "loss": 0.0094, "step": 13546 }, { "epoch": 3.5738029283735653, "grad_norm": 0.18825967609882355, "learning_rate": 4.285588674931857e-06, "loss": 0.0008, "step": 13548 }, { "epoch": 3.5743305632502307, "grad_norm": 0.37543511390686035, "learning_rate": 4.285236964741053e-06, "loss": 0.0033, "step": 13550 }, { "epoch": 3.574858198126896, "grad_norm": 0.009251265786588192, "learning_rate": 4.284885254550251e-06, "loss": 0.0002, "step": 13552 }, { "epoch": 3.5753858330035615, "grad_norm": 0.05366215109825134, "learning_rate": 4.284533544359448e-06, "loss": 0.0033, "step": 13554 }, { "epoch": 3.575913467880227, "grad_norm": 0.004594447556883097, "learning_rate": 4.2841818341686455e-06, "loss": 0.0002, "step": 13556 }, { "epoch": 3.5764411027568923, "grad_norm": 0.010026147589087486, "learning_rate": 4.2838301239778424e-06, "loss": 0.0002, "step": 13558 }, { "epoch": 3.5769687376335577, "grad_norm": 0.012704823166131973, "learning_rate": 4.283478413787039e-06, "loss": 0.0007, "step": 13560 }, { "epoch": 3.5774963725102227, "grad_norm": 0.16742335259914398, "learning_rate": 4.283126703596237e-06, "loss": 0.0005, "step": 13562 }, { "epoch": 3.5780240073868885, "grad_norm": 0.612978994846344, "learning_rate": 4.282774993405434e-06, "loss": 0.0037, "step": 13564 }, { "epoch": 3.5785516422635535, "grad_norm": 0.153986856341362, "learning_rate": 4.282423283214631e-06, "loss": 0.0124, "step": 13566 }, { "epoch": 3.579079277140219, "grad_norm": 0.013496661558747292, "learning_rate": 4.282071573023828e-06, "loss": 0.0003, "step": 13568 }, { "epoch": 3.5796069120168843, "grad_norm": 0.09030085802078247, "learning_rate": 4.281719862833026e-06, "loss": 0.0006, "step": 13570 }, { "epoch": 3.5801345468935497, "grad_norm": 0.06663908809423447, "learning_rate": 4.281368152642223e-06, "loss": 0.0005, "step": 13572 }, { "epoch": 3.580662181770215, "grad_norm": 0.011103694327175617, "learning_rate": 4.28101644245142e-06, "loss": 0.0005, "step": 13574 }, { "epoch": 3.5811898166468805, "grad_norm": 0.013418439775705338, "learning_rate": 4.2806647322606175e-06, "loss": 0.0047, "step": 13576 }, { "epoch": 3.581717451523546, "grad_norm": 0.0062727611511945724, "learning_rate": 4.2803130220698145e-06, "loss": 0.0003, "step": 13578 }, { "epoch": 3.582245086400211, "grad_norm": 1.183437466621399, "learning_rate": 4.2799613118790115e-06, "loss": 0.0044, "step": 13580 }, { "epoch": 3.5827727212768763, "grad_norm": 0.198531836271286, "learning_rate": 4.279609601688209e-06, "loss": 0.0009, "step": 13582 }, { "epoch": 3.5833003561535417, "grad_norm": 0.020632289350032806, "learning_rate": 4.279257891497406e-06, "loss": 0.0003, "step": 13584 }, { "epoch": 3.583827991030207, "grad_norm": 0.00845661573112011, "learning_rate": 4.278906181306604e-06, "loss": 0.0003, "step": 13586 }, { "epoch": 3.5843556259068725, "grad_norm": 0.033799778670072556, "learning_rate": 4.2785544711158e-06, "loss": 0.0007, "step": 13588 }, { "epoch": 3.584883260783538, "grad_norm": 0.4134957790374756, "learning_rate": 4.278202760924998e-06, "loss": 0.0007, "step": 13590 }, { "epoch": 3.5854108956602033, "grad_norm": 0.43642425537109375, "learning_rate": 4.277851050734195e-06, "loss": 0.0064, "step": 13592 }, { "epoch": 3.5859385305368683, "grad_norm": 0.06264007836580276, "learning_rate": 4.277499340543393e-06, "loss": 0.0057, "step": 13594 }, { "epoch": 3.5864661654135337, "grad_norm": 0.19882048666477203, "learning_rate": 4.27714763035259e-06, "loss": 0.001, "step": 13596 }, { "epoch": 3.586993800290199, "grad_norm": 0.05304780974984169, "learning_rate": 4.2767959201617866e-06, "loss": 0.0004, "step": 13598 }, { "epoch": 3.5875214351668645, "grad_norm": 0.04975859820842743, "learning_rate": 4.276444209970984e-06, "loss": 0.0033, "step": 13600 }, { "epoch": 3.58804907004353, "grad_norm": 0.19692398607730865, "learning_rate": 4.276092499780181e-06, "loss": 0.0031, "step": 13602 }, { "epoch": 3.5885767049201953, "grad_norm": 0.48882389068603516, "learning_rate": 4.275740789589378e-06, "loss": 0.0061, "step": 13604 }, { "epoch": 3.5891043397968607, "grad_norm": 0.05293988808989525, "learning_rate": 4.275389079398576e-06, "loss": 0.0006, "step": 13606 }, { "epoch": 3.5896319746735257, "grad_norm": 0.09753848612308502, "learning_rate": 4.275037369207773e-06, "loss": 0.0029, "step": 13608 }, { "epoch": 3.5901596095501915, "grad_norm": 0.019157666712999344, "learning_rate": 4.27468565901697e-06, "loss": 0.0003, "step": 13610 }, { "epoch": 3.5906872444268565, "grad_norm": 0.023764539510011673, "learning_rate": 4.274333948826167e-06, "loss": 0.0002, "step": 13612 }, { "epoch": 3.591214879303522, "grad_norm": 0.10071200877428055, "learning_rate": 4.273982238635365e-06, "loss": 0.0017, "step": 13614 }, { "epoch": 3.5917425141801873, "grad_norm": 0.11145015060901642, "learning_rate": 4.273630528444562e-06, "loss": 0.0006, "step": 13616 }, { "epoch": 3.5922701490568527, "grad_norm": 0.06977158039808273, "learning_rate": 4.273278818253759e-06, "loss": 0.0026, "step": 13618 }, { "epoch": 3.592797783933518, "grad_norm": 0.42273280024528503, "learning_rate": 4.2729271080629564e-06, "loss": 0.0012, "step": 13620 }, { "epoch": 3.593325418810183, "grad_norm": 0.021538861095905304, "learning_rate": 4.272575397872153e-06, "loss": 0.0002, "step": 13622 }, { "epoch": 3.593853053686849, "grad_norm": 0.03730108588933945, "learning_rate": 4.272223687681351e-06, "loss": 0.0005, "step": 13624 }, { "epoch": 3.594380688563514, "grad_norm": 0.0882207378745079, "learning_rate": 4.271871977490547e-06, "loss": 0.0084, "step": 13626 }, { "epoch": 3.5949083234401793, "grad_norm": 0.38679686188697815, "learning_rate": 4.271520267299745e-06, "loss": 0.0022, "step": 13628 }, { "epoch": 3.5954359583168447, "grad_norm": 0.009160446934401989, "learning_rate": 4.271168557108943e-06, "loss": 0.0002, "step": 13630 }, { "epoch": 3.59596359319351, "grad_norm": 0.01827150210738182, "learning_rate": 4.270816846918139e-06, "loss": 0.0003, "step": 13632 }, { "epoch": 3.5964912280701755, "grad_norm": 0.021424222737550735, "learning_rate": 4.270465136727337e-06, "loss": 0.0003, "step": 13634 }, { "epoch": 3.597018862946841, "grad_norm": 0.17912611365318298, "learning_rate": 4.270113426536534e-06, "loss": 0.0014, "step": 13636 }, { "epoch": 3.5975464978235063, "grad_norm": 0.8461296558380127, "learning_rate": 4.2697617163457315e-06, "loss": 0.0029, "step": 13638 }, { "epoch": 3.5980741327001713, "grad_norm": 0.03892454877495766, "learning_rate": 4.2694100061549285e-06, "loss": 0.0003, "step": 13640 }, { "epoch": 3.5986017675768367, "grad_norm": 0.14161287248134613, "learning_rate": 4.2690582959641254e-06, "loss": 0.0108, "step": 13642 }, { "epoch": 3.599129402453502, "grad_norm": 0.2714644968509674, "learning_rate": 4.268706585773323e-06, "loss": 0.0008, "step": 13644 }, { "epoch": 3.5996570373301675, "grad_norm": 2.8056182861328125, "learning_rate": 4.26835487558252e-06, "loss": 0.0012, "step": 13646 }, { "epoch": 3.600184672206833, "grad_norm": 0.009053030051290989, "learning_rate": 4.268003165391717e-06, "loss": 0.0002, "step": 13648 }, { "epoch": 3.6007123070834983, "grad_norm": 0.007158959284424782, "learning_rate": 4.267651455200914e-06, "loss": 0.0002, "step": 13650 }, { "epoch": 3.6012399419601637, "grad_norm": 0.003271167865023017, "learning_rate": 4.267299745010112e-06, "loss": 0.0075, "step": 13652 }, { "epoch": 3.6017675768368287, "grad_norm": 0.19425509870052338, "learning_rate": 4.26694803481931e-06, "loss": 0.0069, "step": 13654 }, { "epoch": 3.6022952117134945, "grad_norm": 0.01146281324326992, "learning_rate": 4.266596324628506e-06, "loss": 0.0004, "step": 13656 }, { "epoch": 3.6028228465901595, "grad_norm": 0.20100164413452148, "learning_rate": 4.266244614437704e-06, "loss": 0.0042, "step": 13658 }, { "epoch": 3.603350481466825, "grad_norm": 0.003408384509384632, "learning_rate": 4.2658929042469006e-06, "loss": 0.0003, "step": 13660 }, { "epoch": 3.6038781163434903, "grad_norm": 0.011607341468334198, "learning_rate": 4.265541194056098e-06, "loss": 0.0002, "step": 13662 }, { "epoch": 3.6044057512201557, "grad_norm": 0.012346887961030006, "learning_rate": 4.265189483865295e-06, "loss": 0.0003, "step": 13664 }, { "epoch": 3.604933386096821, "grad_norm": 0.07979067414999008, "learning_rate": 4.264837773674492e-06, "loss": 0.0021, "step": 13666 }, { "epoch": 3.605461020973486, "grad_norm": 0.025958556681871414, "learning_rate": 4.26448606348369e-06, "loss": 0.0004, "step": 13668 }, { "epoch": 3.605988655850152, "grad_norm": 0.017258506268262863, "learning_rate": 4.264134353292886e-06, "loss": 0.0002, "step": 13670 }, { "epoch": 3.606516290726817, "grad_norm": 0.28472164273262024, "learning_rate": 4.263782643102084e-06, "loss": 0.0017, "step": 13672 }, { "epoch": 3.6070439256034823, "grad_norm": 0.16730409860610962, "learning_rate": 4.263430932911281e-06, "loss": 0.0016, "step": 13674 }, { "epoch": 3.6075715604801477, "grad_norm": 0.2380766123533249, "learning_rate": 4.263079222720479e-06, "loss": 0.0013, "step": 13676 }, { "epoch": 3.608099195356813, "grad_norm": 0.03864753991365433, "learning_rate": 4.262727512529676e-06, "loss": 0.0003, "step": 13678 }, { "epoch": 3.6086268302334785, "grad_norm": 0.2352236807346344, "learning_rate": 4.262375802338873e-06, "loss": 0.0058, "step": 13680 }, { "epoch": 3.609154465110144, "grad_norm": 0.5572352409362793, "learning_rate": 4.26202409214807e-06, "loss": 0.0014, "step": 13682 }, { "epoch": 3.6096820999868093, "grad_norm": 0.8344260454177856, "learning_rate": 4.261672381957267e-06, "loss": 0.0067, "step": 13684 }, { "epoch": 3.6102097348634743, "grad_norm": 0.5883732438087463, "learning_rate": 4.261320671766464e-06, "loss": 0.0073, "step": 13686 }, { "epoch": 3.6107373697401397, "grad_norm": 0.26798000931739807, "learning_rate": 4.260968961575662e-06, "loss": 0.0009, "step": 13688 }, { "epoch": 3.611265004616805, "grad_norm": 0.303352028131485, "learning_rate": 4.260617251384859e-06, "loss": 0.0036, "step": 13690 }, { "epoch": 3.6117926394934705, "grad_norm": 0.237314373254776, "learning_rate": 4.260265541194057e-06, "loss": 0.0122, "step": 13692 }, { "epoch": 3.612320274370136, "grad_norm": 0.028813263401389122, "learning_rate": 4.259913831003253e-06, "loss": 0.0025, "step": 13694 }, { "epoch": 3.6128479092468013, "grad_norm": 0.23818165063858032, "learning_rate": 4.259562120812451e-06, "loss": 0.0008, "step": 13696 }, { "epoch": 3.6133755441234667, "grad_norm": 0.013571393676102161, "learning_rate": 4.259210410621648e-06, "loss": 0.0005, "step": 13698 }, { "epoch": 3.6139031790001317, "grad_norm": 0.03500622138381004, "learning_rate": 4.258858700430845e-06, "loss": 0.0005, "step": 13700 }, { "epoch": 3.614430813876797, "grad_norm": 0.04310690611600876, "learning_rate": 4.2585069902400425e-06, "loss": 0.0006, "step": 13702 }, { "epoch": 3.6149584487534625, "grad_norm": 0.2869546115398407, "learning_rate": 4.2581552800492394e-06, "loss": 0.0009, "step": 13704 }, { "epoch": 3.615486083630128, "grad_norm": 0.4649309813976288, "learning_rate": 4.257803569858437e-06, "loss": 0.0064, "step": 13706 }, { "epoch": 3.6160137185067933, "grad_norm": 0.7650091052055359, "learning_rate": 4.257451859667633e-06, "loss": 0.0127, "step": 13708 }, { "epoch": 3.6165413533834587, "grad_norm": 0.007611880544573069, "learning_rate": 4.257100149476831e-06, "loss": 0.0003, "step": 13710 }, { "epoch": 3.617068988260124, "grad_norm": 0.11618970334529877, "learning_rate": 4.256748439286029e-06, "loss": 0.0059, "step": 13712 }, { "epoch": 3.617596623136789, "grad_norm": 0.2350221872329712, "learning_rate": 4.256396729095226e-06, "loss": 0.0113, "step": 13714 }, { "epoch": 3.618124258013455, "grad_norm": 0.05399876460433006, "learning_rate": 4.256045018904423e-06, "loss": 0.0005, "step": 13716 }, { "epoch": 3.61865189289012, "grad_norm": 0.029737040400505066, "learning_rate": 4.25569330871362e-06, "loss": 0.0009, "step": 13718 }, { "epoch": 3.6191795277667853, "grad_norm": 0.09047190845012665, "learning_rate": 4.255341598522818e-06, "loss": 0.0009, "step": 13720 }, { "epoch": 3.6197071626434507, "grad_norm": 0.013465723022818565, "learning_rate": 4.2549898883320146e-06, "loss": 0.0003, "step": 13722 }, { "epoch": 3.620234797520116, "grad_norm": 0.019856883212924004, "learning_rate": 4.2546381781412115e-06, "loss": 0.0006, "step": 13724 }, { "epoch": 3.6207624323967815, "grad_norm": 0.025402117520570755, "learning_rate": 4.254286467950409e-06, "loss": 0.0005, "step": 13726 }, { "epoch": 3.621290067273447, "grad_norm": 0.016476482152938843, "learning_rate": 4.253934757759606e-06, "loss": 0.0003, "step": 13728 }, { "epoch": 3.6218177021501123, "grad_norm": 0.143832266330719, "learning_rate": 4.253583047568803e-06, "loss": 0.0008, "step": 13730 }, { "epoch": 3.6223453370267773, "grad_norm": 0.006751845125108957, "learning_rate": 4.253231337378e-06, "loss": 0.0002, "step": 13732 }, { "epoch": 3.6228729719034427, "grad_norm": 0.010107796639204025, "learning_rate": 4.252879627187198e-06, "loss": 0.0007, "step": 13734 }, { "epoch": 3.623400606780108, "grad_norm": 0.03577128052711487, "learning_rate": 4.252527916996395e-06, "loss": 0.0004, "step": 13736 }, { "epoch": 3.6239282416567735, "grad_norm": 0.012300203554332256, "learning_rate": 4.252176206805592e-06, "loss": 0.0002, "step": 13738 }, { "epoch": 3.624455876533439, "grad_norm": 0.030083708465099335, "learning_rate": 4.25182449661479e-06, "loss": 0.0035, "step": 13740 }, { "epoch": 3.6249835114101043, "grad_norm": 0.47179773449897766, "learning_rate": 4.251472786423987e-06, "loss": 0.0047, "step": 13742 }, { "epoch": 3.6255111462867697, "grad_norm": 0.028425397351384163, "learning_rate": 4.251121076233184e-06, "loss": 0.0003, "step": 13744 }, { "epoch": 3.6260387811634347, "grad_norm": 0.030277328565716743, "learning_rate": 4.250769366042381e-06, "loss": 0.0004, "step": 13746 }, { "epoch": 3.6265664160401, "grad_norm": 0.007916267029941082, "learning_rate": 4.250417655851578e-06, "loss": 0.0002, "step": 13748 }, { "epoch": 3.6270940509167655, "grad_norm": 0.2852158844470978, "learning_rate": 4.250065945660776e-06, "loss": 0.0009, "step": 13750 }, { "epoch": 3.627621685793431, "grad_norm": 0.021672893315553665, "learning_rate": 4.249714235469973e-06, "loss": 0.002, "step": 13752 }, { "epoch": 3.6281493206700963, "grad_norm": 0.006667915731668472, "learning_rate": 4.24936252527917e-06, "loss": 0.0005, "step": 13754 }, { "epoch": 3.6286769555467617, "grad_norm": 0.006405039224773645, "learning_rate": 4.249010815088367e-06, "loss": 0.0022, "step": 13756 }, { "epoch": 3.629204590423427, "grad_norm": 0.03661436215043068, "learning_rate": 4.248659104897565e-06, "loss": 0.0003, "step": 13758 }, { "epoch": 3.629732225300092, "grad_norm": 0.3316091001033783, "learning_rate": 4.248307394706762e-06, "loss": 0.007, "step": 13760 }, { "epoch": 3.630259860176758, "grad_norm": 0.0031223103869706392, "learning_rate": 4.247955684515959e-06, "loss": 0.0006, "step": 13762 }, { "epoch": 3.630787495053423, "grad_norm": 0.018644461408257484, "learning_rate": 4.2476039743251565e-06, "loss": 0.0005, "step": 13764 }, { "epoch": 3.6313151299300883, "grad_norm": 0.19649066030979156, "learning_rate": 4.2472522641343534e-06, "loss": 0.0035, "step": 13766 }, { "epoch": 3.6318427648067537, "grad_norm": 0.06979604065418243, "learning_rate": 4.24690055394355e-06, "loss": 0.0032, "step": 13768 }, { "epoch": 3.632370399683419, "grad_norm": 0.027529258280992508, "learning_rate": 4.246548843752748e-06, "loss": 0.0003, "step": 13770 }, { "epoch": 3.6328980345600845, "grad_norm": 0.678734540939331, "learning_rate": 4.246197133561945e-06, "loss": 0.0011, "step": 13772 }, { "epoch": 3.6334256694367495, "grad_norm": 0.01840796135365963, "learning_rate": 4.245845423371143e-06, "loss": 0.0004, "step": 13774 }, { "epoch": 3.6339533043134153, "grad_norm": 0.023459380492568016, "learning_rate": 4.245493713180339e-06, "loss": 0.0023, "step": 13776 }, { "epoch": 3.6344809391900803, "grad_norm": 0.003820749232545495, "learning_rate": 4.245142002989537e-06, "loss": 0.0002, "step": 13778 }, { "epoch": 3.6350085740667457, "grad_norm": 0.25491586327552795, "learning_rate": 4.244790292798734e-06, "loss": 0.0008, "step": 13780 }, { "epoch": 3.635536208943411, "grad_norm": 0.02217891439795494, "learning_rate": 4.244438582607932e-06, "loss": 0.0003, "step": 13782 }, { "epoch": 3.6360638438200765, "grad_norm": 0.0856880471110344, "learning_rate": 4.2440868724171285e-06, "loss": 0.0004, "step": 13784 }, { "epoch": 3.636591478696742, "grad_norm": 0.024918030947446823, "learning_rate": 4.2437351622263255e-06, "loss": 0.0021, "step": 13786 }, { "epoch": 3.6371191135734073, "grad_norm": 0.2739277482032776, "learning_rate": 4.243383452035523e-06, "loss": 0.0014, "step": 13788 }, { "epoch": 3.6376467484500727, "grad_norm": 0.006082275882363319, "learning_rate": 4.243031741844719e-06, "loss": 0.0002, "step": 13790 }, { "epoch": 3.6381743833267377, "grad_norm": 0.8816215395927429, "learning_rate": 4.242680031653917e-06, "loss": 0.0054, "step": 13792 }, { "epoch": 3.638702018203403, "grad_norm": 0.04284320026636124, "learning_rate": 4.242328321463114e-06, "loss": 0.0045, "step": 13794 }, { "epoch": 3.6392296530800685, "grad_norm": 0.4606029987335205, "learning_rate": 4.241976611272312e-06, "loss": 0.0011, "step": 13796 }, { "epoch": 3.639757287956734, "grad_norm": 0.42406409978866577, "learning_rate": 4.241624901081509e-06, "loss": 0.005, "step": 13798 }, { "epoch": 3.6402849228333993, "grad_norm": 0.031321171671152115, "learning_rate": 4.241273190890706e-06, "loss": 0.0003, "step": 13800 }, { "epoch": 3.6408125577100647, "grad_norm": 0.013543152250349522, "learning_rate": 4.240921480699904e-06, "loss": 0.0003, "step": 13802 }, { "epoch": 3.64134019258673, "grad_norm": 0.006208439823240042, "learning_rate": 4.240569770509101e-06, "loss": 0.0002, "step": 13804 }, { "epoch": 3.641867827463395, "grad_norm": 0.046311844140291214, "learning_rate": 4.2402180603182976e-06, "loss": 0.0003, "step": 13806 }, { "epoch": 3.642395462340061, "grad_norm": 0.012265040539205074, "learning_rate": 4.239866350127495e-06, "loss": 0.0005, "step": 13808 }, { "epoch": 3.642923097216726, "grad_norm": 0.1998724639415741, "learning_rate": 4.239514639936692e-06, "loss": 0.001, "step": 13810 }, { "epoch": 3.6434507320933913, "grad_norm": 0.055185381323099136, "learning_rate": 4.23916292974589e-06, "loss": 0.0003, "step": 13812 }, { "epoch": 3.6439783669700567, "grad_norm": 0.004042140208184719, "learning_rate": 4.238811219555086e-06, "loss": 0.0003, "step": 13814 }, { "epoch": 3.644506001846722, "grad_norm": 0.0071443915367126465, "learning_rate": 4.238459509364284e-06, "loss": 0.0004, "step": 13816 }, { "epoch": 3.6450336367233875, "grad_norm": 0.00945385079830885, "learning_rate": 4.238107799173481e-06, "loss": 0.0002, "step": 13818 }, { "epoch": 3.6455612716000525, "grad_norm": 0.050708670169115067, "learning_rate": 4.237756088982678e-06, "loss": 0.0003, "step": 13820 }, { "epoch": 3.6460889064767183, "grad_norm": 0.10063470155000687, "learning_rate": 4.237404378791876e-06, "loss": 0.0023, "step": 13822 }, { "epoch": 3.6466165413533833, "grad_norm": 0.02613634057343006, "learning_rate": 4.237052668601073e-06, "loss": 0.0004, "step": 13824 }, { "epoch": 3.6471441762300487, "grad_norm": 0.04179786890745163, "learning_rate": 4.2367009584102705e-06, "loss": 0.0003, "step": 13826 }, { "epoch": 3.647671811106714, "grad_norm": 0.01777629554271698, "learning_rate": 4.2363492482194674e-06, "loss": 0.0002, "step": 13828 }, { "epoch": 3.6481994459833795, "grad_norm": 0.2440222203731537, "learning_rate": 4.235997538028664e-06, "loss": 0.0078, "step": 13830 }, { "epoch": 3.648727080860045, "grad_norm": 0.005069911479949951, "learning_rate": 4.235645827837862e-06, "loss": 0.0002, "step": 13832 }, { "epoch": 3.6492547157367103, "grad_norm": 0.03850996866822243, "learning_rate": 4.235294117647059e-06, "loss": 0.0005, "step": 13834 }, { "epoch": 3.6497823506133757, "grad_norm": 0.02543960139155388, "learning_rate": 4.234942407456256e-06, "loss": 0.0065, "step": 13836 }, { "epoch": 3.6503099854900407, "grad_norm": 0.6104916930198669, "learning_rate": 4.234590697265453e-06, "loss": 0.0036, "step": 13838 }, { "epoch": 3.650837620366706, "grad_norm": 0.03220175951719284, "learning_rate": 4.234238987074651e-06, "loss": 0.0004, "step": 13840 }, { "epoch": 3.6513652552433715, "grad_norm": 0.0035596652887761593, "learning_rate": 4.233887276883848e-06, "loss": 0.0003, "step": 13842 }, { "epoch": 3.651892890120037, "grad_norm": 0.2415873408317566, "learning_rate": 4.233535566693045e-06, "loss": 0.0004, "step": 13844 }, { "epoch": 3.6524205249967023, "grad_norm": 0.27433502674102783, "learning_rate": 4.2331838565022425e-06, "loss": 0.0011, "step": 13846 }, { "epoch": 3.6529481598733677, "grad_norm": 0.045728135854005814, "learning_rate": 4.2328321463114395e-06, "loss": 0.0003, "step": 13848 }, { "epoch": 3.653475794750033, "grad_norm": 0.026222161948680878, "learning_rate": 4.2324804361206364e-06, "loss": 0.0003, "step": 13850 }, { "epoch": 3.654003429626698, "grad_norm": 0.024468472227454185, "learning_rate": 4.232128725929833e-06, "loss": 0.0026, "step": 13852 }, { "epoch": 3.6545310645033635, "grad_norm": 0.0467621311545372, "learning_rate": 4.231777015739031e-06, "loss": 0.0003, "step": 13854 }, { "epoch": 3.655058699380029, "grad_norm": 0.003580582793802023, "learning_rate": 4.231425305548229e-06, "loss": 0.0002, "step": 13856 }, { "epoch": 3.6555863342566943, "grad_norm": 0.003333684056997299, "learning_rate": 4.231073595357425e-06, "loss": 0.0002, "step": 13858 }, { "epoch": 3.6561139691333597, "grad_norm": 0.16749466955661774, "learning_rate": 4.230721885166623e-06, "loss": 0.0007, "step": 13860 }, { "epoch": 3.656641604010025, "grad_norm": 0.0034096480812877417, "learning_rate": 4.23037017497582e-06, "loss": 0.0002, "step": 13862 }, { "epoch": 3.6571692388866905, "grad_norm": 0.04985048994421959, "learning_rate": 4.230018464785018e-06, "loss": 0.0003, "step": 13864 }, { "epoch": 3.6576968737633555, "grad_norm": 0.00277592777274549, "learning_rate": 4.229666754594215e-06, "loss": 0.0002, "step": 13866 }, { "epoch": 3.6582245086400214, "grad_norm": 0.02211763709783554, "learning_rate": 4.2293150444034116e-06, "loss": 0.0003, "step": 13868 }, { "epoch": 3.6587521435166863, "grad_norm": 0.013276124373078346, "learning_rate": 4.228963334212609e-06, "loss": 0.0105, "step": 13870 }, { "epoch": 3.6592797783933517, "grad_norm": 0.17371200025081635, "learning_rate": 4.228611624021806e-06, "loss": 0.01, "step": 13872 }, { "epoch": 3.659807413270017, "grad_norm": 0.003657842753455043, "learning_rate": 4.228259913831003e-06, "loss": 0.0164, "step": 13874 }, { "epoch": 3.6603350481466825, "grad_norm": 0.49972450733184814, "learning_rate": 4.2279082036402e-06, "loss": 0.001, "step": 13876 }, { "epoch": 3.660862683023348, "grad_norm": 0.007707297336310148, "learning_rate": 4.227556493449398e-06, "loss": 0.0003, "step": 13878 }, { "epoch": 3.6613903179000133, "grad_norm": 0.024043336510658264, "learning_rate": 4.227204783258595e-06, "loss": 0.0005, "step": 13880 }, { "epoch": 3.6619179527766788, "grad_norm": 0.026309624314308167, "learning_rate": 4.226853073067792e-06, "loss": 0.0003, "step": 13882 }, { "epoch": 3.6624455876533437, "grad_norm": 0.03322658687829971, "learning_rate": 4.22650136287699e-06, "loss": 0.0004, "step": 13884 }, { "epoch": 3.662973222530009, "grad_norm": 0.0747385248541832, "learning_rate": 4.226149652686187e-06, "loss": 0.0085, "step": 13886 }, { "epoch": 3.6635008574066745, "grad_norm": 0.2118082195520401, "learning_rate": 4.225797942495384e-06, "loss": 0.0014, "step": 13888 }, { "epoch": 3.66402849228334, "grad_norm": 0.41789546608924866, "learning_rate": 4.225446232304581e-06, "loss": 0.003, "step": 13890 }, { "epoch": 3.6645561271600053, "grad_norm": 0.08608187735080719, "learning_rate": 4.225094522113778e-06, "loss": 0.0005, "step": 13892 }, { "epoch": 3.6650837620366707, "grad_norm": 0.03304670751094818, "learning_rate": 4.224742811922976e-06, "loss": 0.0003, "step": 13894 }, { "epoch": 3.665611396913336, "grad_norm": 0.07239789515733719, "learning_rate": 4.224391101732172e-06, "loss": 0.0004, "step": 13896 }, { "epoch": 3.666139031790001, "grad_norm": 0.0437045581638813, "learning_rate": 4.22403939154137e-06, "loss": 0.0036, "step": 13898 }, { "epoch": 3.6666666666666665, "grad_norm": 0.2837335169315338, "learning_rate": 4.223687681350567e-06, "loss": 0.015, "step": 13900 }, { "epoch": 3.667194301543332, "grad_norm": 0.3007628321647644, "learning_rate": 4.223335971159765e-06, "loss": 0.0086, "step": 13902 }, { "epoch": 3.6677219364199973, "grad_norm": 0.00926361232995987, "learning_rate": 4.222984260968962e-06, "loss": 0.0002, "step": 13904 }, { "epoch": 3.6682495712966627, "grad_norm": 0.010369262658059597, "learning_rate": 4.222632550778159e-06, "loss": 0.0029, "step": 13906 }, { "epoch": 3.668777206173328, "grad_norm": 0.28833892941474915, "learning_rate": 4.2222808405873565e-06, "loss": 0.0015, "step": 13908 }, { "epoch": 3.6693048410499935, "grad_norm": 0.010821734555065632, "learning_rate": 4.221929130396553e-06, "loss": 0.0003, "step": 13910 }, { "epoch": 3.6698324759266585, "grad_norm": 0.01622244529426098, "learning_rate": 4.2215774202057504e-06, "loss": 0.0025, "step": 13912 }, { "epoch": 3.6703601108033244, "grad_norm": 0.006203089840710163, "learning_rate": 4.221225710014948e-06, "loss": 0.0002, "step": 13914 }, { "epoch": 3.6708877456799893, "grad_norm": 0.017250170931220055, "learning_rate": 4.220873999824145e-06, "loss": 0.0002, "step": 13916 }, { "epoch": 3.6714153805566547, "grad_norm": 0.01899174228310585, "learning_rate": 4.220522289633342e-06, "loss": 0.0077, "step": 13918 }, { "epoch": 3.67194301543332, "grad_norm": 0.0041310712695121765, "learning_rate": 4.220170579442539e-06, "loss": 0.0002, "step": 13920 }, { "epoch": 3.6724706503099855, "grad_norm": 0.006308891344815493, "learning_rate": 4.219818869251737e-06, "loss": 0.0003, "step": 13922 }, { "epoch": 3.672998285186651, "grad_norm": 0.6120668649673462, "learning_rate": 4.219467159060934e-06, "loss": 0.005, "step": 13924 }, { "epoch": 3.673525920063316, "grad_norm": 0.907032310962677, "learning_rate": 4.219115448870131e-06, "loss": 0.0101, "step": 13926 }, { "epoch": 3.6740535549399818, "grad_norm": 0.013380941934883595, "learning_rate": 4.218763738679329e-06, "loss": 0.0038, "step": 13928 }, { "epoch": 3.6745811898166467, "grad_norm": 0.0228324793279171, "learning_rate": 4.2184120284885255e-06, "loss": 0.0003, "step": 13930 }, { "epoch": 3.675108824693312, "grad_norm": 0.015273915603756905, "learning_rate": 4.218060318297723e-06, "loss": 0.0004, "step": 13932 }, { "epoch": 3.6756364595699775, "grad_norm": 0.3153962790966034, "learning_rate": 4.2177086081069195e-06, "loss": 0.0006, "step": 13934 }, { "epoch": 3.676164094446643, "grad_norm": 0.12722766399383545, "learning_rate": 4.217356897916117e-06, "loss": 0.0007, "step": 13936 }, { "epoch": 3.6766917293233083, "grad_norm": 0.021321525797247887, "learning_rate": 4.217005187725314e-06, "loss": 0.002, "step": 13938 }, { "epoch": 3.6772193641999738, "grad_norm": 0.00948471948504448, "learning_rate": 4.216653477534511e-06, "loss": 0.0003, "step": 13940 }, { "epoch": 3.677746999076639, "grad_norm": 0.04027727618813515, "learning_rate": 4.216301767343709e-06, "loss": 0.0004, "step": 13942 }, { "epoch": 3.678274633953304, "grad_norm": 0.048212360590696335, "learning_rate": 4.215950057152906e-06, "loss": 0.0031, "step": 13944 }, { "epoch": 3.6788022688299695, "grad_norm": 0.022214291617274284, "learning_rate": 4.215598346962104e-06, "loss": 0.0006, "step": 13946 }, { "epoch": 3.679329903706635, "grad_norm": 0.039588313549757004, "learning_rate": 4.215246636771301e-06, "loss": 0.0008, "step": 13948 }, { "epoch": 3.6798575385833003, "grad_norm": 0.019855642691254616, "learning_rate": 4.214894926580498e-06, "loss": 0.0004, "step": 13950 }, { "epoch": 3.6803851734599657, "grad_norm": 0.013984683901071548, "learning_rate": 4.214543216389695e-06, "loss": 0.0003, "step": 13952 }, { "epoch": 3.680912808336631, "grad_norm": 0.019179942086338997, "learning_rate": 4.214191506198892e-06, "loss": 0.0085, "step": 13954 }, { "epoch": 3.6814404432132966, "grad_norm": 0.7901597023010254, "learning_rate": 4.213839796008089e-06, "loss": 0.0035, "step": 13956 }, { "epoch": 3.6819680780899615, "grad_norm": 0.12661322951316833, "learning_rate": 4.213488085817286e-06, "loss": 0.0005, "step": 13958 }, { "epoch": 3.6824957129666274, "grad_norm": 0.03655165806412697, "learning_rate": 4.213136375626484e-06, "loss": 0.0005, "step": 13960 }, { "epoch": 3.6830233478432923, "grad_norm": 0.35606634616851807, "learning_rate": 4.212784665435681e-06, "loss": 0.0015, "step": 13962 }, { "epoch": 3.6835509827199577, "grad_norm": 0.024329116567969322, "learning_rate": 4.212432955244878e-06, "loss": 0.005, "step": 13964 }, { "epoch": 3.684078617596623, "grad_norm": 0.03500381112098694, "learning_rate": 4.212081245054076e-06, "loss": 0.0004, "step": 13966 }, { "epoch": 3.6846062524732885, "grad_norm": 0.03537396714091301, "learning_rate": 4.211729534863273e-06, "loss": 0.0034, "step": 13968 }, { "epoch": 3.685133887349954, "grad_norm": 0.18529607355594635, "learning_rate": 4.21137782467247e-06, "loss": 0.0008, "step": 13970 }, { "epoch": 3.685661522226619, "grad_norm": 0.34443598985671997, "learning_rate": 4.2110261144816675e-06, "loss": 0.0026, "step": 13972 }, { "epoch": 3.6861891571032848, "grad_norm": 0.07160317152738571, "learning_rate": 4.2106744042908644e-06, "loss": 0.0073, "step": 13974 }, { "epoch": 3.6867167919799497, "grad_norm": 0.005835663992911577, "learning_rate": 4.210322694100062e-06, "loss": 0.0003, "step": 13976 }, { "epoch": 3.687244426856615, "grad_norm": 0.24380816519260406, "learning_rate": 4.209970983909258e-06, "loss": 0.0044, "step": 13978 }, { "epoch": 3.6877720617332805, "grad_norm": 0.005932850297540426, "learning_rate": 4.209619273718456e-06, "loss": 0.0003, "step": 13980 }, { "epoch": 3.688299696609946, "grad_norm": 0.004712051246315241, "learning_rate": 4.209267563527653e-06, "loss": 0.0002, "step": 13982 }, { "epoch": 3.6888273314866113, "grad_norm": 0.004558384418487549, "learning_rate": 4.208915853336851e-06, "loss": 0.0003, "step": 13984 }, { "epoch": 3.6893549663632768, "grad_norm": 0.3786954879760742, "learning_rate": 4.208564143146048e-06, "loss": 0.0069, "step": 13986 }, { "epoch": 3.689882601239942, "grad_norm": 0.5482985377311707, "learning_rate": 4.208212432955245e-06, "loss": 0.0103, "step": 13988 }, { "epoch": 3.690410236116607, "grad_norm": 0.23283492028713226, "learning_rate": 4.207860722764443e-06, "loss": 0.0009, "step": 13990 }, { "epoch": 3.6909378709932725, "grad_norm": 0.015524224378168583, "learning_rate": 4.2075090125736395e-06, "loss": 0.0022, "step": 13992 }, { "epoch": 3.691465505869938, "grad_norm": 0.36210525035858154, "learning_rate": 4.2071573023828365e-06, "loss": 0.002, "step": 13994 }, { "epoch": 3.6919931407466033, "grad_norm": 0.008626147173345089, "learning_rate": 4.2068055921920334e-06, "loss": 0.0002, "step": 13996 }, { "epoch": 3.6925207756232687, "grad_norm": 0.018640194088220596, "learning_rate": 4.206453882001231e-06, "loss": 0.0074, "step": 13998 }, { "epoch": 3.693048410499934, "grad_norm": 0.015416853129863739, "learning_rate": 4.206102171810428e-06, "loss": 0.0049, "step": 14000 }, { "epoch": 3.693048410499934, "eval_loss": 0.0025336863473057747, "eval_runtime": 304.9972, "eval_samples_per_second": 707.026, "eval_steps_per_second": 88.381, "step": 14000 }, { "epoch": 3.6935760453765996, "grad_norm": 0.29510584473609924, "learning_rate": 4.205750461619625e-06, "loss": 0.0024, "step": 14002 }, { "epoch": 3.6941036802532645, "grad_norm": 0.02387135662138462, "learning_rate": 4.205398751428823e-06, "loss": 0.0003, "step": 14004 }, { "epoch": 3.69463131512993, "grad_norm": 0.5333539247512817, "learning_rate": 4.20504704123802e-06, "loss": 0.0139, "step": 14006 }, { "epoch": 3.6951589500065953, "grad_norm": 0.5475810170173645, "learning_rate": 4.204695331047217e-06, "loss": 0.005, "step": 14008 }, { "epoch": 3.6956865848832607, "grad_norm": 0.06384889036417007, "learning_rate": 4.204343620856415e-06, "loss": 0.0005, "step": 14010 }, { "epoch": 3.696214219759926, "grad_norm": 0.2212458997964859, "learning_rate": 4.203991910665612e-06, "loss": 0.0017, "step": 14012 }, { "epoch": 3.6967418546365916, "grad_norm": 0.37286046147346497, "learning_rate": 4.203640200474809e-06, "loss": 0.0078, "step": 14014 }, { "epoch": 3.697269489513257, "grad_norm": 0.031788419932127, "learning_rate": 4.2032884902840055e-06, "loss": 0.0004, "step": 14016 }, { "epoch": 3.697797124389922, "grad_norm": 0.026538215577602386, "learning_rate": 4.202936780093203e-06, "loss": 0.0025, "step": 14018 }, { "epoch": 3.6983247592665878, "grad_norm": 0.15749189257621765, "learning_rate": 4.2025850699024e-06, "loss": 0.0012, "step": 14020 }, { "epoch": 3.6988523941432527, "grad_norm": 0.08353991061449051, "learning_rate": 4.202233359711598e-06, "loss": 0.0007, "step": 14022 }, { "epoch": 3.699380029019918, "grad_norm": 0.015240097418427467, "learning_rate": 4.201881649520795e-06, "loss": 0.0003, "step": 14024 }, { "epoch": 3.6999076638965835, "grad_norm": 0.13586916029453278, "learning_rate": 4.201529939329992e-06, "loss": 0.0081, "step": 14026 }, { "epoch": 3.700435298773249, "grad_norm": 0.02194901555776596, "learning_rate": 4.20117822913919e-06, "loss": 0.0007, "step": 14028 }, { "epoch": 3.7009629336499144, "grad_norm": 0.4984613358974457, "learning_rate": 4.200826518948387e-06, "loss": 0.0024, "step": 14030 }, { "epoch": 3.7014905685265798, "grad_norm": 0.09108718484640121, "learning_rate": 4.200474808757584e-06, "loss": 0.0029, "step": 14032 }, { "epoch": 3.702018203403245, "grad_norm": 0.06270132213830948, "learning_rate": 4.2001230985667815e-06, "loss": 0.0005, "step": 14034 }, { "epoch": 3.70254583827991, "grad_norm": 0.17131811380386353, "learning_rate": 4.199771388375978e-06, "loss": 0.0004, "step": 14036 }, { "epoch": 3.7030734731565755, "grad_norm": 0.07795827090740204, "learning_rate": 4.199419678185175e-06, "loss": 0.0005, "step": 14038 }, { "epoch": 3.703601108033241, "grad_norm": 0.022503813728690147, "learning_rate": 4.199067967994372e-06, "loss": 0.0031, "step": 14040 }, { "epoch": 3.7041287429099063, "grad_norm": 0.004456117283552885, "learning_rate": 4.19871625780357e-06, "loss": 0.0004, "step": 14042 }, { "epoch": 3.7046563777865718, "grad_norm": 0.003344167722389102, "learning_rate": 4.198364547612767e-06, "loss": 0.0002, "step": 14044 }, { "epoch": 3.705184012663237, "grad_norm": 0.4880523383617401, "learning_rate": 4.198012837421964e-06, "loss": 0.0072, "step": 14046 }, { "epoch": 3.7057116475399026, "grad_norm": 0.009631287306547165, "learning_rate": 4.197661127231162e-06, "loss": 0.0004, "step": 14048 }, { "epoch": 3.7062392824165675, "grad_norm": 0.6190251111984253, "learning_rate": 4.197309417040359e-06, "loss": 0.0035, "step": 14050 }, { "epoch": 3.706766917293233, "grad_norm": 0.13639196753501892, "learning_rate": 4.196957706849557e-06, "loss": 0.0006, "step": 14052 }, { "epoch": 3.7072945521698983, "grad_norm": 0.45920804142951965, "learning_rate": 4.196605996658753e-06, "loss": 0.0264, "step": 14054 }, { "epoch": 3.7078221870465637, "grad_norm": 0.04126467928290367, "learning_rate": 4.1962542864679505e-06, "loss": 0.0004, "step": 14056 }, { "epoch": 3.708349821923229, "grad_norm": 0.0178770013153553, "learning_rate": 4.195902576277148e-06, "loss": 0.002, "step": 14058 }, { "epoch": 3.7088774567998946, "grad_norm": 0.044428322464227676, "learning_rate": 4.195550866086344e-06, "loss": 0.0004, "step": 14060 }, { "epoch": 3.70940509167656, "grad_norm": 0.12695221602916718, "learning_rate": 4.195199155895542e-06, "loss": 0.0042, "step": 14062 }, { "epoch": 3.709932726553225, "grad_norm": 0.012639675289392471, "learning_rate": 4.194847445704739e-06, "loss": 0.0004, "step": 14064 }, { "epoch": 3.7104603614298908, "grad_norm": 0.012645655311644077, "learning_rate": 4.194495735513937e-06, "loss": 0.004, "step": 14066 }, { "epoch": 3.7109879963065557, "grad_norm": 0.6889724731445312, "learning_rate": 4.194144025323134e-06, "loss": 0.0057, "step": 14068 }, { "epoch": 3.711515631183221, "grad_norm": 0.26613152027130127, "learning_rate": 4.193792315132331e-06, "loss": 0.0018, "step": 14070 }, { "epoch": 3.7120432660598865, "grad_norm": 0.06001294031739235, "learning_rate": 4.193440604941529e-06, "loss": 0.0005, "step": 14072 }, { "epoch": 3.712570900936552, "grad_norm": 0.07747697830200195, "learning_rate": 4.193088894750726e-06, "loss": 0.0006, "step": 14074 }, { "epoch": 3.7130985358132174, "grad_norm": 0.312536358833313, "learning_rate": 4.1927371845599226e-06, "loss": 0.0009, "step": 14076 }, { "epoch": 3.7136261706898828, "grad_norm": 0.5521472096443176, "learning_rate": 4.1923854743691195e-06, "loss": 0.0045, "step": 14078 }, { "epoch": 3.714153805566548, "grad_norm": 0.22670584917068481, "learning_rate": 4.192033764178317e-06, "loss": 0.0097, "step": 14080 }, { "epoch": 3.714681440443213, "grad_norm": 0.02903805673122406, "learning_rate": 4.191682053987515e-06, "loss": 0.0005, "step": 14082 }, { "epoch": 3.7152090753198785, "grad_norm": 0.24728572368621826, "learning_rate": 4.191330343796711e-06, "loss": 0.0014, "step": 14084 }, { "epoch": 3.715736710196544, "grad_norm": 0.34267884492874146, "learning_rate": 4.190978633605909e-06, "loss": 0.0052, "step": 14086 }, { "epoch": 3.7162643450732094, "grad_norm": 0.6309549808502197, "learning_rate": 4.190626923415106e-06, "loss": 0.0086, "step": 14088 }, { "epoch": 3.7167919799498748, "grad_norm": 0.0634596198797226, "learning_rate": 4.190275213224303e-06, "loss": 0.0006, "step": 14090 }, { "epoch": 3.71731961482654, "grad_norm": 0.10847686976194382, "learning_rate": 4.189923503033501e-06, "loss": 0.0025, "step": 14092 }, { "epoch": 3.7178472497032056, "grad_norm": 0.12388767302036285, "learning_rate": 4.189571792842698e-06, "loss": 0.0024, "step": 14094 }, { "epoch": 3.7183748845798705, "grad_norm": 0.13598424196243286, "learning_rate": 4.1892200826518955e-06, "loss": 0.0024, "step": 14096 }, { "epoch": 3.718902519456536, "grad_norm": 0.006675321143120527, "learning_rate": 4.1888683724610916e-06, "loss": 0.0003, "step": 14098 }, { "epoch": 3.7194301543332013, "grad_norm": 0.15190541744232178, "learning_rate": 4.188516662270289e-06, "loss": 0.0008, "step": 14100 }, { "epoch": 3.7199577892098667, "grad_norm": 0.033030714839696884, "learning_rate": 4.188164952079486e-06, "loss": 0.0004, "step": 14102 }, { "epoch": 3.720485424086532, "grad_norm": 0.006997359450906515, "learning_rate": 4.187813241888684e-06, "loss": 0.0002, "step": 14104 }, { "epoch": 3.7210130589631976, "grad_norm": 0.343244343996048, "learning_rate": 4.187461531697881e-06, "loss": 0.0076, "step": 14106 }, { "epoch": 3.721540693839863, "grad_norm": 0.013113988563418388, "learning_rate": 4.187109821507078e-06, "loss": 0.0003, "step": 14108 }, { "epoch": 3.722068328716528, "grad_norm": 0.07484962791204453, "learning_rate": 4.186758111316276e-06, "loss": 0.0005, "step": 14110 }, { "epoch": 3.722595963593194, "grad_norm": 0.019888246431946754, "learning_rate": 4.186406401125473e-06, "loss": 0.0003, "step": 14112 }, { "epoch": 3.7231235984698587, "grad_norm": 0.09640951454639435, "learning_rate": 4.18605469093467e-06, "loss": 0.0009, "step": 14114 }, { "epoch": 3.723651233346524, "grad_norm": 0.026061804965138435, "learning_rate": 4.1857029807438675e-06, "loss": 0.0071, "step": 14116 }, { "epoch": 3.7241788682231896, "grad_norm": 0.015071297995746136, "learning_rate": 4.1853512705530645e-06, "loss": 0.0003, "step": 14118 }, { "epoch": 3.724706503099855, "grad_norm": 0.03231723979115486, "learning_rate": 4.1849995603622614e-06, "loss": 0.0003, "step": 14120 }, { "epoch": 3.7252341379765204, "grad_norm": 0.015404823236167431, "learning_rate": 4.184647850171458e-06, "loss": 0.0005, "step": 14122 }, { "epoch": 3.7257617728531853, "grad_norm": 0.2533649802207947, "learning_rate": 4.184296139980656e-06, "loss": 0.0082, "step": 14124 }, { "epoch": 3.726289407729851, "grad_norm": 0.015308261848986149, "learning_rate": 4.183944429789853e-06, "loss": 0.0003, "step": 14126 }, { "epoch": 3.726817042606516, "grad_norm": 0.005422370973974466, "learning_rate": 4.18359271959905e-06, "loss": 0.0002, "step": 14128 }, { "epoch": 3.7273446774831815, "grad_norm": 0.007059026975184679, "learning_rate": 4.183241009408248e-06, "loss": 0.0002, "step": 14130 }, { "epoch": 3.727872312359847, "grad_norm": 0.008657361380755901, "learning_rate": 4.182889299217445e-06, "loss": 0.0002, "step": 14132 }, { "epoch": 3.7283999472365124, "grad_norm": 0.6348419189453125, "learning_rate": 4.182537589026643e-06, "loss": 0.0033, "step": 14134 }, { "epoch": 3.7289275821131778, "grad_norm": 0.10044833272695541, "learning_rate": 4.182185878835839e-06, "loss": 0.0004, "step": 14136 }, { "epoch": 3.729455216989843, "grad_norm": 0.004605351481586695, "learning_rate": 4.1818341686450365e-06, "loss": 0.001, "step": 14138 }, { "epoch": 3.7299828518665086, "grad_norm": 0.10917968302965164, "learning_rate": 4.181482458454234e-06, "loss": 0.0039, "step": 14140 }, { "epoch": 3.7305104867431735, "grad_norm": 0.1706705540418625, "learning_rate": 4.181130748263431e-06, "loss": 0.0018, "step": 14142 }, { "epoch": 3.731038121619839, "grad_norm": 0.013067485764622688, "learning_rate": 4.180779038072628e-06, "loss": 0.0002, "step": 14144 }, { "epoch": 3.7315657564965043, "grad_norm": 0.005538357887417078, "learning_rate": 4.180427327881825e-06, "loss": 0.0014, "step": 14146 }, { "epoch": 3.7320933913731698, "grad_norm": 0.005432522390037775, "learning_rate": 4.180075617691023e-06, "loss": 0.0002, "step": 14148 }, { "epoch": 3.732621026249835, "grad_norm": 0.00511956587433815, "learning_rate": 4.17972390750022e-06, "loss": 0.0004, "step": 14150 }, { "epoch": 3.7331486611265006, "grad_norm": 0.006115831900388002, "learning_rate": 4.179372197309417e-06, "loss": 0.0002, "step": 14152 }, { "epoch": 3.733676296003166, "grad_norm": 0.008126936852931976, "learning_rate": 4.179020487118615e-06, "loss": 0.0003, "step": 14154 }, { "epoch": 3.734203930879831, "grad_norm": 0.005471617914736271, "learning_rate": 4.178668776927812e-06, "loss": 0.0002, "step": 14156 }, { "epoch": 3.7347315657564963, "grad_norm": 0.5297627449035645, "learning_rate": 4.178317066737009e-06, "loss": 0.0124, "step": 14158 }, { "epoch": 3.7352592006331617, "grad_norm": 0.42691290378570557, "learning_rate": 4.1779653565462056e-06, "loss": 0.0057, "step": 14160 }, { "epoch": 3.735786835509827, "grad_norm": 1.0440462827682495, "learning_rate": 4.177613646355403e-06, "loss": 0.003, "step": 14162 }, { "epoch": 3.7363144703864926, "grad_norm": 0.036926236003637314, "learning_rate": 4.1772619361646e-06, "loss": 0.0003, "step": 14164 }, { "epoch": 3.736842105263158, "grad_norm": 0.4601336419582367, "learning_rate": 4.176910225973797e-06, "loss": 0.0044, "step": 14166 }, { "epoch": 3.7373697401398234, "grad_norm": 0.0826423317193985, "learning_rate": 4.176558515782995e-06, "loss": 0.001, "step": 14168 }, { "epoch": 3.7378973750164883, "grad_norm": 0.5861263275146484, "learning_rate": 4.176206805592192e-06, "loss": 0.0161, "step": 14170 }, { "epoch": 3.738425009893154, "grad_norm": 0.029003234580159187, "learning_rate": 4.17585509540139e-06, "loss": 0.0006, "step": 14172 }, { "epoch": 3.738952644769819, "grad_norm": 0.37468603253364563, "learning_rate": 4.175503385210587e-06, "loss": 0.0022, "step": 14174 }, { "epoch": 3.7394802796464846, "grad_norm": 0.011812230572104454, "learning_rate": 4.175151675019784e-06, "loss": 0.0066, "step": 14176 }, { "epoch": 3.74000791452315, "grad_norm": 0.4236411154270172, "learning_rate": 4.1747999648289815e-06, "loss": 0.0025, "step": 14178 }, { "epoch": 3.7405355493998154, "grad_norm": 0.7666969299316406, "learning_rate": 4.174448254638178e-06, "loss": 0.0032, "step": 14180 }, { "epoch": 3.7410631842764808, "grad_norm": 0.014086086302995682, "learning_rate": 4.1740965444473754e-06, "loss": 0.002, "step": 14182 }, { "epoch": 3.741590819153146, "grad_norm": 0.003541248617693782, "learning_rate": 4.173744834256572e-06, "loss": 0.0002, "step": 14184 }, { "epoch": 3.7421184540298116, "grad_norm": 0.062385302037000656, "learning_rate": 4.17339312406577e-06, "loss": 0.0003, "step": 14186 }, { "epoch": 3.7426460889064765, "grad_norm": 0.00762715470045805, "learning_rate": 4.173041413874967e-06, "loss": 0.0004, "step": 14188 }, { "epoch": 3.743173723783142, "grad_norm": 0.3708832859992981, "learning_rate": 4.172689703684164e-06, "loss": 0.0047, "step": 14190 }, { "epoch": 3.7437013586598074, "grad_norm": 0.013208967633545399, "learning_rate": 4.172337993493362e-06, "loss": 0.0011, "step": 14192 }, { "epoch": 3.7442289935364728, "grad_norm": 0.02922731451690197, "learning_rate": 4.171986283302559e-06, "loss": 0.0002, "step": 14194 }, { "epoch": 3.744756628413138, "grad_norm": 0.01655382290482521, "learning_rate": 4.171634573111756e-06, "loss": 0.0005, "step": 14196 }, { "epoch": 3.7452842632898036, "grad_norm": 0.23552703857421875, "learning_rate": 4.171282862920954e-06, "loss": 0.0022, "step": 14198 }, { "epoch": 3.745811898166469, "grad_norm": 0.006598257459700108, "learning_rate": 4.1709311527301505e-06, "loss": 0.0006, "step": 14200 }, { "epoch": 3.746339533043134, "grad_norm": 0.14544738829135895, "learning_rate": 4.170579442539348e-06, "loss": 0.0015, "step": 14202 }, { "epoch": 3.7468671679197993, "grad_norm": 0.24837049841880798, "learning_rate": 4.1702277323485444e-06, "loss": 0.0046, "step": 14204 }, { "epoch": 3.7473948027964648, "grad_norm": 0.05767340213060379, "learning_rate": 4.169876022157742e-06, "loss": 0.0008, "step": 14206 }, { "epoch": 3.74792243767313, "grad_norm": 0.008558718487620354, "learning_rate": 4.169524311966939e-06, "loss": 0.0002, "step": 14208 }, { "epoch": 3.7484500725497956, "grad_norm": 0.12125246226787567, "learning_rate": 4.169172601776136e-06, "loss": 0.0007, "step": 14210 }, { "epoch": 3.748977707426461, "grad_norm": 0.03733954578638077, "learning_rate": 4.168820891585334e-06, "loss": 0.0004, "step": 14212 }, { "epoch": 3.7495053423031264, "grad_norm": 0.02235257998108864, "learning_rate": 4.168469181394531e-06, "loss": 0.0003, "step": 14214 }, { "epoch": 3.7500329771797913, "grad_norm": 0.03262903168797493, "learning_rate": 4.168117471203729e-06, "loss": 0.0042, "step": 14216 }, { "epoch": 3.750560612056457, "grad_norm": 0.00582500034943223, "learning_rate": 4.167765761012925e-06, "loss": 0.0002, "step": 14218 }, { "epoch": 3.751088246933122, "grad_norm": 0.010745410807430744, "learning_rate": 4.167414050822123e-06, "loss": 0.0002, "step": 14220 }, { "epoch": 3.7516158818097876, "grad_norm": 0.02198764681816101, "learning_rate": 4.1670623406313196e-06, "loss": 0.0004, "step": 14222 }, { "epoch": 3.752143516686453, "grad_norm": 0.01758197695016861, "learning_rate": 4.166710630440517e-06, "loss": 0.0002, "step": 14224 }, { "epoch": 3.7526711515631184, "grad_norm": 0.03282827511429787, "learning_rate": 4.166358920249714e-06, "loss": 0.0004, "step": 14226 }, { "epoch": 3.7531987864397838, "grad_norm": 2.508021116256714, "learning_rate": 4.166007210058911e-06, "loss": 0.0025, "step": 14228 }, { "epoch": 3.753726421316449, "grad_norm": 0.009374753572046757, "learning_rate": 4.165655499868109e-06, "loss": 0.0002, "step": 14230 }, { "epoch": 3.7542540561931146, "grad_norm": 0.012300011701881886, "learning_rate": 4.165303789677306e-06, "loss": 0.0002, "step": 14232 }, { "epoch": 3.7547816910697795, "grad_norm": 0.0032273298129439354, "learning_rate": 4.164952079486503e-06, "loss": 0.0027, "step": 14234 }, { "epoch": 3.755309325946445, "grad_norm": 0.00926150567829609, "learning_rate": 4.164600369295701e-06, "loss": 0.0008, "step": 14236 }, { "epoch": 3.7558369608231104, "grad_norm": 0.16848593950271606, "learning_rate": 4.164248659104898e-06, "loss": 0.0036, "step": 14238 }, { "epoch": 3.7563645956997758, "grad_norm": 0.5538434386253357, "learning_rate": 4.163896948914095e-06, "loss": 0.0022, "step": 14240 }, { "epoch": 3.756892230576441, "grad_norm": 0.033729661256074905, "learning_rate": 4.163545238723292e-06, "loss": 0.0003, "step": 14242 }, { "epoch": 3.7574198654531066, "grad_norm": 0.052094098180532455, "learning_rate": 4.163193528532489e-06, "loss": 0.0014, "step": 14244 }, { "epoch": 3.757947500329772, "grad_norm": 0.026438236236572266, "learning_rate": 4.162841818341686e-06, "loss": 0.0003, "step": 14246 }, { "epoch": 3.758475135206437, "grad_norm": 0.1713005155324936, "learning_rate": 4.162490108150883e-06, "loss": 0.0024, "step": 14248 }, { "epoch": 3.7590027700831024, "grad_norm": 0.028836315497756004, "learning_rate": 4.162138397960081e-06, "loss": 0.0013, "step": 14250 }, { "epoch": 3.7595304049597678, "grad_norm": 0.003053286811336875, "learning_rate": 4.161786687769278e-06, "loss": 0.0004, "step": 14252 }, { "epoch": 3.760058039836433, "grad_norm": 0.007575612515211105, "learning_rate": 4.161434977578476e-06, "loss": 0.0002, "step": 14254 }, { "epoch": 3.7605856747130986, "grad_norm": 0.0697547197341919, "learning_rate": 4.161083267387673e-06, "loss": 0.0051, "step": 14256 }, { "epoch": 3.761113309589764, "grad_norm": 0.006653785705566406, "learning_rate": 4.16073155719687e-06, "loss": 0.0003, "step": 14258 }, { "epoch": 3.7616409444664294, "grad_norm": 0.8505602478981018, "learning_rate": 4.160379847006068e-06, "loss": 0.0147, "step": 14260 }, { "epoch": 3.7621685793430943, "grad_norm": 0.19475208222866058, "learning_rate": 4.1600281368152645e-06, "loss": 0.0024, "step": 14262 }, { "epoch": 3.76269621421976, "grad_norm": 0.04381697624921799, "learning_rate": 4.1596764266244615e-06, "loss": 0.0004, "step": 14264 }, { "epoch": 3.763223849096425, "grad_norm": 0.030407950282096863, "learning_rate": 4.1593247164336584e-06, "loss": 0.0006, "step": 14266 }, { "epoch": 3.7637514839730906, "grad_norm": 0.11284992843866348, "learning_rate": 4.158973006242856e-06, "loss": 0.0008, "step": 14268 }, { "epoch": 3.764279118849756, "grad_norm": 0.4485659897327423, "learning_rate": 4.158621296052053e-06, "loss": 0.0016, "step": 14270 }, { "epoch": 3.7648067537264214, "grad_norm": 0.059266477823257446, "learning_rate": 4.15826958586125e-06, "loss": 0.0013, "step": 14272 }, { "epoch": 3.765334388603087, "grad_norm": 0.6730778217315674, "learning_rate": 4.157917875670448e-06, "loss": 0.0033, "step": 14274 }, { "epoch": 3.7658620234797517, "grad_norm": 0.032300740480422974, "learning_rate": 4.157566165479645e-06, "loss": 0.0009, "step": 14276 }, { "epoch": 3.7663896583564176, "grad_norm": 0.056591425091028214, "learning_rate": 4.157214455288842e-06, "loss": 0.0005, "step": 14278 }, { "epoch": 3.7669172932330826, "grad_norm": 0.2991614043712616, "learning_rate": 4.156862745098039e-06, "loss": 0.0134, "step": 14280 }, { "epoch": 3.767444928109748, "grad_norm": 0.2906125783920288, "learning_rate": 4.156511034907237e-06, "loss": 0.0083, "step": 14282 }, { "epoch": 3.7679725629864134, "grad_norm": 0.02915995940566063, "learning_rate": 4.156159324716434e-06, "loss": 0.0018, "step": 14284 }, { "epoch": 3.7685001978630788, "grad_norm": 0.022388223558664322, "learning_rate": 4.1558076145256305e-06, "loss": 0.0047, "step": 14286 }, { "epoch": 3.769027832739744, "grad_norm": 0.16188257932662964, "learning_rate": 4.155455904334828e-06, "loss": 0.0009, "step": 14288 }, { "epoch": 3.7695554676164096, "grad_norm": 0.03237359598278999, "learning_rate": 4.155104194144025e-06, "loss": 0.0035, "step": 14290 }, { "epoch": 3.770083102493075, "grad_norm": 0.2717292606830597, "learning_rate": 4.154752483953223e-06, "loss": 0.0012, "step": 14292 }, { "epoch": 3.77061073736974, "grad_norm": 0.024575240910053253, "learning_rate": 4.15440077376242e-06, "loss": 0.0004, "step": 14294 }, { "epoch": 3.7711383722464054, "grad_norm": 0.019377466291189194, "learning_rate": 4.154049063571617e-06, "loss": 0.0003, "step": 14296 }, { "epoch": 3.7716660071230708, "grad_norm": 0.05424511432647705, "learning_rate": 4.153697353380815e-06, "loss": 0.0005, "step": 14298 }, { "epoch": 3.772193641999736, "grad_norm": 0.010983457788825035, "learning_rate": 4.153345643190012e-06, "loss": 0.0003, "step": 14300 }, { "epoch": 3.7727212768764016, "grad_norm": 0.12780290842056274, "learning_rate": 4.152993932999209e-06, "loss": 0.0011, "step": 14302 }, { "epoch": 3.773248911753067, "grad_norm": 0.008435975760221481, "learning_rate": 4.152642222808406e-06, "loss": 0.0003, "step": 14304 }, { "epoch": 3.7737765466297324, "grad_norm": 0.011261259205639362, "learning_rate": 4.152290512617603e-06, "loss": 0.0022, "step": 14306 }, { "epoch": 3.7743041815063973, "grad_norm": 0.036030448973178864, "learning_rate": 4.1519388024268e-06, "loss": 0.0114, "step": 14308 }, { "epoch": 3.7748318163830628, "grad_norm": 0.014596234075725079, "learning_rate": 4.151587092235997e-06, "loss": 0.0003, "step": 14310 }, { "epoch": 3.775359451259728, "grad_norm": 0.053000353276729584, "learning_rate": 4.151235382045195e-06, "loss": 0.0021, "step": 14312 }, { "epoch": 3.7758870861363936, "grad_norm": 0.24853798747062683, "learning_rate": 4.150883671854392e-06, "loss": 0.0011, "step": 14314 }, { "epoch": 3.776414721013059, "grad_norm": 0.011642751283943653, "learning_rate": 4.150531961663589e-06, "loss": 0.0006, "step": 14316 }, { "epoch": 3.7769423558897244, "grad_norm": 0.022944217547774315, "learning_rate": 4.150180251472787e-06, "loss": 0.0003, "step": 14318 }, { "epoch": 3.77746999076639, "grad_norm": 0.01754934713244438, "learning_rate": 4.149828541281984e-06, "loss": 0.0003, "step": 14320 }, { "epoch": 3.7779976256430547, "grad_norm": 0.018683934584259987, "learning_rate": 4.1494768310911816e-06, "loss": 0.0006, "step": 14322 }, { "epoch": 3.7785252605197206, "grad_norm": 0.1012260690331459, "learning_rate": 4.149125120900378e-06, "loss": 0.0045, "step": 14324 }, { "epoch": 3.7790528953963856, "grad_norm": 0.0032113862689584494, "learning_rate": 4.1487734107095755e-06, "loss": 0.0016, "step": 14326 }, { "epoch": 3.779580530273051, "grad_norm": 0.04677390679717064, "learning_rate": 4.1484217005187724e-06, "loss": 0.0003, "step": 14328 }, { "epoch": 3.7801081651497164, "grad_norm": 0.031000688672065735, "learning_rate": 4.14806999032797e-06, "loss": 0.0009, "step": 14330 }, { "epoch": 3.7806358000263818, "grad_norm": 0.005556581076234579, "learning_rate": 4.147718280137167e-06, "loss": 0.0031, "step": 14332 }, { "epoch": 3.781163434903047, "grad_norm": 0.06700368970632553, "learning_rate": 4.147366569946364e-06, "loss": 0.0004, "step": 14334 }, { "epoch": 3.7816910697797126, "grad_norm": 0.17665080726146698, "learning_rate": 4.147014859755562e-06, "loss": 0.0005, "step": 14336 }, { "epoch": 3.782218704656378, "grad_norm": 0.013788411393761635, "learning_rate": 4.146663149564758e-06, "loss": 0.0014, "step": 14338 }, { "epoch": 3.782746339533043, "grad_norm": 0.5074535608291626, "learning_rate": 4.146311439373956e-06, "loss": 0.0025, "step": 14340 }, { "epoch": 3.7832739744097084, "grad_norm": 0.14501601457595825, "learning_rate": 4.145959729183154e-06, "loss": 0.005, "step": 14342 }, { "epoch": 3.7838016092863738, "grad_norm": 0.293626606464386, "learning_rate": 4.145608018992351e-06, "loss": 0.0067, "step": 14344 }, { "epoch": 3.784329244163039, "grad_norm": 0.05881636589765549, "learning_rate": 4.1452563088015475e-06, "loss": 0.0003, "step": 14346 }, { "epoch": 3.7848568790397046, "grad_norm": 0.006307687610387802, "learning_rate": 4.1449045986107445e-06, "loss": 0.0065, "step": 14348 }, { "epoch": 3.78538451391637, "grad_norm": 0.008294209837913513, "learning_rate": 4.144552888419942e-06, "loss": 0.0002, "step": 14350 }, { "epoch": 3.7859121487930354, "grad_norm": 0.402660608291626, "learning_rate": 4.144201178229139e-06, "loss": 0.002, "step": 14352 }, { "epoch": 3.7864397836697004, "grad_norm": 0.029319608584046364, "learning_rate": 4.143849468038336e-06, "loss": 0.0003, "step": 14354 }, { "epoch": 3.7869674185463658, "grad_norm": 0.15794222056865692, "learning_rate": 4.143497757847534e-06, "loss": 0.0007, "step": 14356 }, { "epoch": 3.787495053423031, "grad_norm": 0.02096569538116455, "learning_rate": 4.143146047656731e-06, "loss": 0.0003, "step": 14358 }, { "epoch": 3.7880226882996966, "grad_norm": 0.05025862529873848, "learning_rate": 4.142794337465929e-06, "loss": 0.0004, "step": 14360 }, { "epoch": 3.788550323176362, "grad_norm": 0.03351043909788132, "learning_rate": 4.142442627275125e-06, "loss": 0.0004, "step": 14362 }, { "epoch": 3.7890779580530274, "grad_norm": 0.024182775989174843, "learning_rate": 4.142090917084323e-06, "loss": 0.0004, "step": 14364 }, { "epoch": 3.789605592929693, "grad_norm": 0.2360522747039795, "learning_rate": 4.1417392068935205e-06, "loss": 0.0012, "step": 14366 }, { "epoch": 3.7901332278063578, "grad_norm": 0.007765684276819229, "learning_rate": 4.1413874967027166e-06, "loss": 0.0081, "step": 14368 }, { "epoch": 3.7906608626830236, "grad_norm": 0.04920600727200508, "learning_rate": 4.141035786511914e-06, "loss": 0.0003, "step": 14370 }, { "epoch": 3.7911884975596886, "grad_norm": 0.022487344220280647, "learning_rate": 4.140684076321111e-06, "loss": 0.0004, "step": 14372 }, { "epoch": 3.791716132436354, "grad_norm": 0.30554571747779846, "learning_rate": 4.140332366130309e-06, "loss": 0.0034, "step": 14374 }, { "epoch": 3.7922437673130194, "grad_norm": 0.05761859938502312, "learning_rate": 4.139980655939506e-06, "loss": 0.0004, "step": 14376 }, { "epoch": 3.792771402189685, "grad_norm": 0.005539350211620331, "learning_rate": 4.139628945748703e-06, "loss": 0.001, "step": 14378 }, { "epoch": 3.79329903706635, "grad_norm": 0.007244075182825327, "learning_rate": 4.139277235557901e-06, "loss": 0.0004, "step": 14380 }, { "epoch": 3.7938266719430156, "grad_norm": 0.2646026015281677, "learning_rate": 4.138925525367098e-06, "loss": 0.0013, "step": 14382 }, { "epoch": 3.794354306819681, "grad_norm": 0.4304194748401642, "learning_rate": 4.138573815176295e-06, "loss": 0.0014, "step": 14384 }, { "epoch": 3.794881941696346, "grad_norm": 0.017085427418351173, "learning_rate": 4.138222104985492e-06, "loss": 0.0005, "step": 14386 }, { "epoch": 3.7954095765730114, "grad_norm": 0.05163382738828659, "learning_rate": 4.1378703947946895e-06, "loss": 0.0016, "step": 14388 }, { "epoch": 3.7959372114496768, "grad_norm": 0.2034258246421814, "learning_rate": 4.137518684603886e-06, "loss": 0.0062, "step": 14390 }, { "epoch": 3.796464846326342, "grad_norm": 0.021320875734090805, "learning_rate": 4.137166974413083e-06, "loss": 0.0011, "step": 14392 }, { "epoch": 3.7969924812030076, "grad_norm": 0.010139611549675465, "learning_rate": 4.136815264222281e-06, "loss": 0.0003, "step": 14394 }, { "epoch": 3.797520116079673, "grad_norm": 0.08170759677886963, "learning_rate": 4.136463554031478e-06, "loss": 0.0005, "step": 14396 }, { "epoch": 3.7980477509563384, "grad_norm": 0.010080872103571892, "learning_rate": 4.136111843840675e-06, "loss": 0.0003, "step": 14398 }, { "epoch": 3.7985753858330034, "grad_norm": 0.021655403077602386, "learning_rate": 4.135760133649873e-06, "loss": 0.0003, "step": 14400 }, { "epoch": 3.7991030207096688, "grad_norm": 0.13285177946090698, "learning_rate": 4.13540842345907e-06, "loss": 0.0004, "step": 14402 }, { "epoch": 3.799630655586334, "grad_norm": 0.005530608352273703, "learning_rate": 4.135056713268268e-06, "loss": 0.0002, "step": 14404 }, { "epoch": 3.8001582904629996, "grad_norm": 0.038089435547590256, "learning_rate": 4.134705003077464e-06, "loss": 0.001, "step": 14406 }, { "epoch": 3.800685925339665, "grad_norm": 0.45671799778938293, "learning_rate": 4.1343532928866615e-06, "loss": 0.0049, "step": 14408 }, { "epoch": 3.8012135602163304, "grad_norm": 0.6446999311447144, "learning_rate": 4.1340015826958585e-06, "loss": 0.003, "step": 14410 }, { "epoch": 3.801741195092996, "grad_norm": 0.47503742575645447, "learning_rate": 4.133649872505056e-06, "loss": 0.0113, "step": 14412 }, { "epoch": 3.8022688299696608, "grad_norm": 0.14367367327213287, "learning_rate": 4.133298162314253e-06, "loss": 0.0015, "step": 14414 }, { "epoch": 3.8027964648463266, "grad_norm": 0.004618590697646141, "learning_rate": 4.13294645212345e-06, "loss": 0.0005, "step": 14416 }, { "epoch": 3.8033240997229916, "grad_norm": 0.004057791084051132, "learning_rate": 4.132594741932648e-06, "loss": 0.0002, "step": 14418 }, { "epoch": 3.803851734599657, "grad_norm": 0.013564566150307655, "learning_rate": 4.132243031741845e-06, "loss": 0.0003, "step": 14420 }, { "epoch": 3.8043793694763224, "grad_norm": 0.25528261065483093, "learning_rate": 4.131891321551042e-06, "loss": 0.0011, "step": 14422 }, { "epoch": 3.804907004352988, "grad_norm": 0.0038810106925666332, "learning_rate": 4.13153961136024e-06, "loss": 0.0002, "step": 14424 }, { "epoch": 3.805434639229653, "grad_norm": 0.3163926899433136, "learning_rate": 4.131187901169437e-06, "loss": 0.0014, "step": 14426 }, { "epoch": 3.805962274106318, "grad_norm": 0.2725782096385956, "learning_rate": 4.130836190978634e-06, "loss": 0.0011, "step": 14428 }, { "epoch": 3.806489908982984, "grad_norm": 0.3499221205711365, "learning_rate": 4.1304844807878305e-06, "loss": 0.0017, "step": 14430 }, { "epoch": 3.807017543859649, "grad_norm": 0.2836083173751831, "learning_rate": 4.130132770597028e-06, "loss": 0.0102, "step": 14432 }, { "epoch": 3.8075451787363144, "grad_norm": 1.106080174446106, "learning_rate": 4.129781060406225e-06, "loss": 0.0041, "step": 14434 }, { "epoch": 3.80807281361298, "grad_norm": 0.05710912123322487, "learning_rate": 4.129429350215422e-06, "loss": 0.0034, "step": 14436 }, { "epoch": 3.808600448489645, "grad_norm": 0.018558790907263756, "learning_rate": 4.12907764002462e-06, "loss": 0.001, "step": 14438 }, { "epoch": 3.8091280833663106, "grad_norm": 0.06541863083839417, "learning_rate": 4.128725929833817e-06, "loss": 0.0004, "step": 14440 }, { "epoch": 3.809655718242976, "grad_norm": 0.4844400882720947, "learning_rate": 4.128374219643015e-06, "loss": 0.0019, "step": 14442 }, { "epoch": 3.8101833531196414, "grad_norm": 0.0907730758190155, "learning_rate": 4.128022509452211e-06, "loss": 0.0003, "step": 14444 }, { "epoch": 3.8107109879963064, "grad_norm": 0.029578231275081635, "learning_rate": 4.127670799261409e-06, "loss": 0.0003, "step": 14446 }, { "epoch": 3.8112386228729718, "grad_norm": 0.7651486396789551, "learning_rate": 4.127319089070606e-06, "loss": 0.0053, "step": 14448 }, { "epoch": 3.811766257749637, "grad_norm": 0.026786355301737785, "learning_rate": 4.1269673788798035e-06, "loss": 0.0003, "step": 14450 }, { "epoch": 3.8122938926263026, "grad_norm": 0.10381011664867401, "learning_rate": 4.126615668689e-06, "loss": 0.0006, "step": 14452 }, { "epoch": 3.812821527502968, "grad_norm": 0.09753299504518509, "learning_rate": 4.126263958498197e-06, "loss": 0.0009, "step": 14454 }, { "epoch": 3.8133491623796334, "grad_norm": 0.5776033401489258, "learning_rate": 4.125912248307395e-06, "loss": 0.0048, "step": 14456 }, { "epoch": 3.813876797256299, "grad_norm": 0.03138380125164986, "learning_rate": 4.125560538116592e-06, "loss": 0.0003, "step": 14458 }, { "epoch": 3.8144044321329638, "grad_norm": 0.01337805762887001, "learning_rate": 4.125208827925789e-06, "loss": 0.0002, "step": 14460 }, { "epoch": 3.8149320670096296, "grad_norm": 0.017507312819361687, "learning_rate": 4.124857117734987e-06, "loss": 0.0037, "step": 14462 }, { "epoch": 3.8154597018862946, "grad_norm": 0.0067015052773058414, "learning_rate": 4.124505407544184e-06, "loss": 0.0006, "step": 14464 }, { "epoch": 3.81598733676296, "grad_norm": 0.0038473051972687244, "learning_rate": 4.124153697353381e-06, "loss": 0.0004, "step": 14466 }, { "epoch": 3.8165149716396254, "grad_norm": 0.19075337052345276, "learning_rate": 4.123801987162578e-06, "loss": 0.0008, "step": 14468 }, { "epoch": 3.817042606516291, "grad_norm": 0.44688165187835693, "learning_rate": 4.1234502769717755e-06, "loss": 0.0027, "step": 14470 }, { "epoch": 3.817570241392956, "grad_norm": 0.00812184065580368, "learning_rate": 4.1230985667809725e-06, "loss": 0.0002, "step": 14472 }, { "epoch": 3.818097876269621, "grad_norm": 0.011228200048208237, "learning_rate": 4.1227468565901694e-06, "loss": 0.0002, "step": 14474 }, { "epoch": 3.818625511146287, "grad_norm": 0.07425124943256378, "learning_rate": 4.122395146399367e-06, "loss": 0.0007, "step": 14476 }, { "epoch": 3.819153146022952, "grad_norm": 0.0025105506647378206, "learning_rate": 4.122043436208564e-06, "loss": 0.0002, "step": 14478 }, { "epoch": 3.8196807808996174, "grad_norm": 0.0060182963497936726, "learning_rate": 4.121691726017762e-06, "loss": 0.0002, "step": 14480 }, { "epoch": 3.820208415776283, "grad_norm": 0.28141161799430847, "learning_rate": 4.121340015826959e-06, "loss": 0.0148, "step": 14482 }, { "epoch": 3.820736050652948, "grad_norm": 0.38355526328086853, "learning_rate": 4.120988305636156e-06, "loss": 0.0016, "step": 14484 }, { "epoch": 3.8212636855296136, "grad_norm": 0.002739488147199154, "learning_rate": 4.120636595445354e-06, "loss": 0.0002, "step": 14486 }, { "epoch": 3.821791320406279, "grad_norm": 0.8560352921485901, "learning_rate": 4.12028488525455e-06, "loss": 0.0063, "step": 14488 }, { "epoch": 3.8223189552829444, "grad_norm": 0.13934756815433502, "learning_rate": 4.119933175063748e-06, "loss": 0.0021, "step": 14490 }, { "epoch": 3.8228465901596094, "grad_norm": 0.018086405470967293, "learning_rate": 4.1195814648729445e-06, "loss": 0.0003, "step": 14492 }, { "epoch": 3.8233742250362748, "grad_norm": 0.14353671669960022, "learning_rate": 4.119229754682142e-06, "loss": 0.0052, "step": 14494 }, { "epoch": 3.82390185991294, "grad_norm": 0.06930671632289886, "learning_rate": 4.118878044491339e-06, "loss": 0.0003, "step": 14496 }, { "epoch": 3.8244294947896056, "grad_norm": 0.03299202024936676, "learning_rate": 4.118526334300536e-06, "loss": 0.0009, "step": 14498 }, { "epoch": 3.824957129666271, "grad_norm": 0.02520909160375595, "learning_rate": 4.118174624109734e-06, "loss": 0.0004, "step": 14500 }, { "epoch": 3.8254847645429364, "grad_norm": 0.0067531331442296505, "learning_rate": 4.117822913918931e-06, "loss": 0.0012, "step": 14502 }, { "epoch": 3.826012399419602, "grad_norm": 0.026081087067723274, "learning_rate": 4.117471203728128e-06, "loss": 0.0002, "step": 14504 }, { "epoch": 3.8265400342962668, "grad_norm": 0.3127727806568146, "learning_rate": 4.117119493537325e-06, "loss": 0.002, "step": 14506 }, { "epoch": 3.827067669172932, "grad_norm": 0.1664174199104309, "learning_rate": 4.116767783346523e-06, "loss": 0.0042, "step": 14508 }, { "epoch": 3.8275953040495976, "grad_norm": 0.39372628927230835, "learning_rate": 4.1164160731557205e-06, "loss": 0.0055, "step": 14510 }, { "epoch": 3.828122938926263, "grad_norm": 0.25298601388931274, "learning_rate": 4.116064362964917e-06, "loss": 0.0013, "step": 14512 }, { "epoch": 3.8286505738029284, "grad_norm": 0.02086702734231949, "learning_rate": 4.115712652774114e-06, "loss": 0.0004, "step": 14514 }, { "epoch": 3.829178208679594, "grad_norm": 0.1631067842245102, "learning_rate": 4.115360942583311e-06, "loss": 0.0008, "step": 14516 }, { "epoch": 3.829705843556259, "grad_norm": 0.16967104375362396, "learning_rate": 4.115009232392508e-06, "loss": 0.0065, "step": 14518 }, { "epoch": 3.830233478432924, "grad_norm": 0.2840724587440491, "learning_rate": 4.114657522201706e-06, "loss": 0.0013, "step": 14520 }, { "epoch": 3.83076111330959, "grad_norm": 0.27501365542411804, "learning_rate": 4.114305812010903e-06, "loss": 0.0065, "step": 14522 }, { "epoch": 3.831288748186255, "grad_norm": 0.07094915956258774, "learning_rate": 4.113954101820101e-06, "loss": 0.0005, "step": 14524 }, { "epoch": 3.8318163830629204, "grad_norm": 0.014322449453175068, "learning_rate": 4.113602391629297e-06, "loss": 0.0002, "step": 14526 }, { "epoch": 3.832344017939586, "grad_norm": 0.1105818822979927, "learning_rate": 4.113250681438495e-06, "loss": 0.0008, "step": 14528 }, { "epoch": 3.832871652816251, "grad_norm": 0.04501742124557495, "learning_rate": 4.112898971247692e-06, "loss": 0.0004, "step": 14530 }, { "epoch": 3.8333992876929166, "grad_norm": 0.02934354729950428, "learning_rate": 4.1125472610568895e-06, "loss": 0.0019, "step": 14532 }, { "epoch": 3.833926922569582, "grad_norm": 0.0904729887843132, "learning_rate": 4.1121955508660865e-06, "loss": 0.0004, "step": 14534 }, { "epoch": 3.8344545574462474, "grad_norm": 0.0037885408382862806, "learning_rate": 4.111843840675283e-06, "loss": 0.0002, "step": 14536 }, { "epoch": 3.8349821923229124, "grad_norm": 0.00461132125928998, "learning_rate": 4.111492130484481e-06, "loss": 0.0006, "step": 14538 }, { "epoch": 3.835509827199578, "grad_norm": 0.007876121439039707, "learning_rate": 4.111140420293678e-06, "loss": 0.0002, "step": 14540 }, { "epoch": 3.836037462076243, "grad_norm": 0.12022115290164948, "learning_rate": 4.110788710102875e-06, "loss": 0.0004, "step": 14542 }, { "epoch": 3.8365650969529086, "grad_norm": 0.02606086991727352, "learning_rate": 4.110436999912073e-06, "loss": 0.0003, "step": 14544 }, { "epoch": 3.837092731829574, "grad_norm": 0.07679060101509094, "learning_rate": 4.11008528972127e-06, "loss": 0.0003, "step": 14546 }, { "epoch": 3.8376203667062394, "grad_norm": 0.535470724105835, "learning_rate": 4.109733579530467e-06, "loss": 0.005, "step": 14548 }, { "epoch": 3.838148001582905, "grad_norm": 0.035220228135585785, "learning_rate": 4.109381869339664e-06, "loss": 0.003, "step": 14550 }, { "epoch": 3.8386756364595698, "grad_norm": 0.04168573394417763, "learning_rate": 4.109030159148862e-06, "loss": 0.0003, "step": 14552 }, { "epoch": 3.839203271336235, "grad_norm": 0.4039943516254425, "learning_rate": 4.1086784489580585e-06, "loss": 0.0028, "step": 14554 }, { "epoch": 3.8397309062129006, "grad_norm": 0.0068716746754944324, "learning_rate": 4.1083267387672555e-06, "loss": 0.0002, "step": 14556 }, { "epoch": 3.840258541089566, "grad_norm": 0.004652075003832579, "learning_rate": 4.107975028576453e-06, "loss": 0.0002, "step": 14558 }, { "epoch": 3.8407861759662314, "grad_norm": 0.18624025583267212, "learning_rate": 4.10762331838565e-06, "loss": 0.0053, "step": 14560 }, { "epoch": 3.841313810842897, "grad_norm": 0.00649368017911911, "learning_rate": 4.107271608194848e-06, "loss": 0.0003, "step": 14562 }, { "epoch": 3.841841445719562, "grad_norm": 0.018328042700886726, "learning_rate": 4.106919898004044e-06, "loss": 0.0003, "step": 14564 }, { "epoch": 3.842369080596227, "grad_norm": 0.9610537886619568, "learning_rate": 4.106568187813242e-06, "loss": 0.007, "step": 14566 }, { "epoch": 3.842896715472893, "grad_norm": 0.006520124617964029, "learning_rate": 4.10621647762244e-06, "loss": 0.0004, "step": 14568 }, { "epoch": 3.843424350349558, "grad_norm": 0.040504440665245056, "learning_rate": 4.105864767431637e-06, "loss": 0.0118, "step": 14570 }, { "epoch": 3.8439519852262234, "grad_norm": 0.6646335124969482, "learning_rate": 4.105513057240834e-06, "loss": 0.0095, "step": 14572 }, { "epoch": 3.844479620102889, "grad_norm": 0.720024824142456, "learning_rate": 4.105161347050031e-06, "loss": 0.0098, "step": 14574 }, { "epoch": 3.845007254979554, "grad_norm": 0.01516349520534277, "learning_rate": 4.104809636859228e-06, "loss": 0.0042, "step": 14576 }, { "epoch": 3.8455348898562196, "grad_norm": 0.1750975400209427, "learning_rate": 4.104457926668425e-06, "loss": 0.0075, "step": 14578 }, { "epoch": 3.8460625247328846, "grad_norm": 0.030053136870265007, "learning_rate": 4.104106216477622e-06, "loss": 0.0019, "step": 14580 }, { "epoch": 3.8465901596095504, "grad_norm": 0.0925130546092987, "learning_rate": 4.10375450628682e-06, "loss": 0.0006, "step": 14582 }, { "epoch": 3.8471177944862154, "grad_norm": 0.030541114509105682, "learning_rate": 4.103402796096017e-06, "loss": 0.0006, "step": 14584 }, { "epoch": 3.847645429362881, "grad_norm": 0.1529829353094101, "learning_rate": 4.103051085905214e-06, "loss": 0.0063, "step": 14586 }, { "epoch": 3.848173064239546, "grad_norm": 0.1358359009027481, "learning_rate": 4.102699375714411e-06, "loss": 0.0075, "step": 14588 }, { "epoch": 3.8487006991162116, "grad_norm": 0.020801987498998642, "learning_rate": 4.102347665523609e-06, "loss": 0.0003, "step": 14590 }, { "epoch": 3.849228333992877, "grad_norm": 0.02505071833729744, "learning_rate": 4.1019959553328066e-06, "loss": 0.0004, "step": 14592 }, { "epoch": 3.8497559688695424, "grad_norm": 0.14103439450263977, "learning_rate": 4.101644245142003e-06, "loss": 0.0022, "step": 14594 }, { "epoch": 3.850283603746208, "grad_norm": 0.13450242578983307, "learning_rate": 4.1012925349512005e-06, "loss": 0.001, "step": 14596 }, { "epoch": 3.850811238622873, "grad_norm": 0.033124618232250214, "learning_rate": 4.100940824760397e-06, "loss": 0.0004, "step": 14598 }, { "epoch": 3.851338873499538, "grad_norm": 0.11675163358449936, "learning_rate": 4.100589114569595e-06, "loss": 0.0032, "step": 14600 }, { "epoch": 3.8518665083762036, "grad_norm": 0.018560810014605522, "learning_rate": 4.100237404378792e-06, "loss": 0.0003, "step": 14602 }, { "epoch": 3.852394143252869, "grad_norm": 0.07546187192201614, "learning_rate": 4.099885694187989e-06, "loss": 0.0005, "step": 14604 }, { "epoch": 3.8529217781295344, "grad_norm": 0.0540839321911335, "learning_rate": 4.099533983997187e-06, "loss": 0.0009, "step": 14606 }, { "epoch": 3.8534494130062, "grad_norm": 1.4992380142211914, "learning_rate": 4.099182273806383e-06, "loss": 0.0068, "step": 14608 }, { "epoch": 3.853977047882865, "grad_norm": 0.026884665712714195, "learning_rate": 4.098830563615581e-06, "loss": 0.0033, "step": 14610 }, { "epoch": 3.85450468275953, "grad_norm": 0.4060094654560089, "learning_rate": 4.098478853424778e-06, "loss": 0.006, "step": 14612 }, { "epoch": 3.855032317636196, "grad_norm": 0.027654603123664856, "learning_rate": 4.0981271432339756e-06, "loss": 0.0003, "step": 14614 }, { "epoch": 3.855559952512861, "grad_norm": 0.02802661620080471, "learning_rate": 4.0977754330431725e-06, "loss": 0.0003, "step": 14616 }, { "epoch": 3.8560875873895264, "grad_norm": 0.019920581951737404, "learning_rate": 4.0974237228523695e-06, "loss": 0.0003, "step": 14618 }, { "epoch": 3.856615222266192, "grad_norm": 0.06357387453317642, "learning_rate": 4.097072012661567e-06, "loss": 0.0005, "step": 14620 }, { "epoch": 3.857142857142857, "grad_norm": 0.2669280171394348, "learning_rate": 4.096720302470764e-06, "loss": 0.006, "step": 14622 }, { "epoch": 3.8576704920195226, "grad_norm": 0.01891172304749489, "learning_rate": 4.096368592279961e-06, "loss": 0.0003, "step": 14624 }, { "epoch": 3.8581981268961876, "grad_norm": 0.018169023096561432, "learning_rate": 4.096016882089159e-06, "loss": 0.0002, "step": 14626 }, { "epoch": 3.8587257617728534, "grad_norm": 0.008008058182895184, "learning_rate": 4.095665171898356e-06, "loss": 0.0004, "step": 14628 }, { "epoch": 3.8592533966495184, "grad_norm": 0.008648162707686424, "learning_rate": 4.095313461707554e-06, "loss": 0.0005, "step": 14630 }, { "epoch": 3.859781031526184, "grad_norm": 0.03917274251580238, "learning_rate": 4.09496175151675e-06, "loss": 0.0003, "step": 14632 }, { "epoch": 3.860308666402849, "grad_norm": 0.1267738789319992, "learning_rate": 4.094610041325948e-06, "loss": 0.0026, "step": 14634 }, { "epoch": 3.8608363012795146, "grad_norm": 0.011580312624573708, "learning_rate": 4.094258331135145e-06, "loss": 0.0002, "step": 14636 }, { "epoch": 3.86136393615618, "grad_norm": 0.2424786388874054, "learning_rate": 4.0939066209443415e-06, "loss": 0.0007, "step": 14638 }, { "epoch": 3.8618915710328454, "grad_norm": 0.04037575423717499, "learning_rate": 4.093554910753539e-06, "loss": 0.0004, "step": 14640 }, { "epoch": 3.862419205909511, "grad_norm": 0.1470070630311966, "learning_rate": 4.093203200562736e-06, "loss": 0.002, "step": 14642 }, { "epoch": 3.862946840786176, "grad_norm": 0.6265846490859985, "learning_rate": 4.092851490371934e-06, "loss": 0.0039, "step": 14644 }, { "epoch": 3.863474475662841, "grad_norm": 0.01212995033711195, "learning_rate": 4.09249978018113e-06, "loss": 0.0006, "step": 14646 }, { "epoch": 3.8640021105395066, "grad_norm": 0.015566329471766949, "learning_rate": 4.092148069990328e-06, "loss": 0.0047, "step": 14648 }, { "epoch": 3.864529745416172, "grad_norm": 0.6419702172279358, "learning_rate": 4.091796359799525e-06, "loss": 0.0038, "step": 14650 }, { "epoch": 3.8650573802928374, "grad_norm": 0.16026757657527924, "learning_rate": 4.091444649608723e-06, "loss": 0.0004, "step": 14652 }, { "epoch": 3.865585015169503, "grad_norm": 0.004307456314563751, "learning_rate": 4.09109293941792e-06, "loss": 0.0002, "step": 14654 }, { "epoch": 3.866112650046168, "grad_norm": 0.006732279900461435, "learning_rate": 4.090741229227117e-06, "loss": 0.0003, "step": 14656 }, { "epoch": 3.866640284922833, "grad_norm": 0.23384350538253784, "learning_rate": 4.0903895190363145e-06, "loss": 0.0009, "step": 14658 }, { "epoch": 3.8671679197994986, "grad_norm": 0.11968506872653961, "learning_rate": 4.090037808845511e-06, "loss": 0.0004, "step": 14660 }, { "epoch": 3.867695554676164, "grad_norm": 0.3050784468650818, "learning_rate": 4.089686098654708e-06, "loss": 0.0078, "step": 14662 }, { "epoch": 3.8682231895528294, "grad_norm": 0.5728505253791809, "learning_rate": 4.089334388463906e-06, "loss": 0.0054, "step": 14664 }, { "epoch": 3.868750824429495, "grad_norm": 0.22027330100536346, "learning_rate": 4.088982678273103e-06, "loss": 0.0011, "step": 14666 }, { "epoch": 3.86927845930616, "grad_norm": 0.441588819026947, "learning_rate": 4.0886309680823e-06, "loss": 0.0018, "step": 14668 }, { "epoch": 3.8698060941828256, "grad_norm": 0.4835335910320282, "learning_rate": 4.088279257891497e-06, "loss": 0.0071, "step": 14670 }, { "epoch": 3.8703337290594906, "grad_norm": 0.4133051931858063, "learning_rate": 4.087927547700695e-06, "loss": 0.0017, "step": 14672 }, { "epoch": 3.8708613639361564, "grad_norm": 0.04161013290286064, "learning_rate": 4.087575837509892e-06, "loss": 0.0003, "step": 14674 }, { "epoch": 3.8713889988128214, "grad_norm": 0.010896824300289154, "learning_rate": 4.087224127319089e-06, "loss": 0.0002, "step": 14676 }, { "epoch": 3.871916633689487, "grad_norm": 0.1578928828239441, "learning_rate": 4.0868724171282865e-06, "loss": 0.0026, "step": 14678 }, { "epoch": 3.872444268566152, "grad_norm": 0.006152451038360596, "learning_rate": 4.0865207069374835e-06, "loss": 0.0002, "step": 14680 }, { "epoch": 3.8729719034428176, "grad_norm": 0.24667362868785858, "learning_rate": 4.086168996746681e-06, "loss": 0.0006, "step": 14682 }, { "epoch": 3.873499538319483, "grad_norm": 0.009311462752521038, "learning_rate": 4.085817286555878e-06, "loss": 0.0016, "step": 14684 }, { "epoch": 3.8740271731961484, "grad_norm": 0.012175310403108597, "learning_rate": 4.085465576365075e-06, "loss": 0.0004, "step": 14686 }, { "epoch": 3.874554808072814, "grad_norm": 0.47195956110954285, "learning_rate": 4.085113866174273e-06, "loss": 0.0067, "step": 14688 }, { "epoch": 3.875082442949479, "grad_norm": 0.21606208384037018, "learning_rate": 4.08476215598347e-06, "loss": 0.0005, "step": 14690 }, { "epoch": 3.875610077826144, "grad_norm": 0.003362694289535284, "learning_rate": 4.084410445792667e-06, "loss": 0.0002, "step": 14692 }, { "epoch": 3.8761377127028096, "grad_norm": 0.4078986346721649, "learning_rate": 4.084058735601864e-06, "loss": 0.0057, "step": 14694 }, { "epoch": 3.876665347579475, "grad_norm": 0.06016141548752785, "learning_rate": 4.083707025411062e-06, "loss": 0.0005, "step": 14696 }, { "epoch": 3.8771929824561404, "grad_norm": 0.006921426393091679, "learning_rate": 4.083355315220259e-06, "loss": 0.0005, "step": 14698 }, { "epoch": 3.877720617332806, "grad_norm": 0.8186867833137512, "learning_rate": 4.0830036050294555e-06, "loss": 0.0084, "step": 14700 }, { "epoch": 3.8782482522094712, "grad_norm": 0.10792148858308792, "learning_rate": 4.082651894838653e-06, "loss": 0.0007, "step": 14702 }, { "epoch": 3.878775887086136, "grad_norm": 0.05566566064953804, "learning_rate": 4.08230018464785e-06, "loss": 0.0005, "step": 14704 }, { "epoch": 3.8793035219628016, "grad_norm": 0.19688931107521057, "learning_rate": 4.081948474457047e-06, "loss": 0.0007, "step": 14706 }, { "epoch": 3.879831156839467, "grad_norm": 0.006565159186720848, "learning_rate": 4.081596764266244e-06, "loss": 0.0002, "step": 14708 }, { "epoch": 3.8803587917161324, "grad_norm": 0.009878263808786869, "learning_rate": 4.081245054075442e-06, "loss": 0.0003, "step": 14710 }, { "epoch": 3.880886426592798, "grad_norm": 0.004066093359142542, "learning_rate": 4.08089334388464e-06, "loss": 0.0013, "step": 14712 }, { "epoch": 3.881414061469463, "grad_norm": 0.006540436763316393, "learning_rate": 4.080541633693836e-06, "loss": 0.0002, "step": 14714 }, { "epoch": 3.8819416963461286, "grad_norm": 0.005305809434503317, "learning_rate": 4.080189923503034e-06, "loss": 0.0002, "step": 14716 }, { "epoch": 3.8824693312227936, "grad_norm": 0.00362003524787724, "learning_rate": 4.079838213312231e-06, "loss": 0.0007, "step": 14718 }, { "epoch": 3.8829969660994594, "grad_norm": 0.02482185885310173, "learning_rate": 4.0794865031214284e-06, "loss": 0.0003, "step": 14720 }, { "epoch": 3.8835246009761244, "grad_norm": 0.03148236498236656, "learning_rate": 4.079134792930625e-06, "loss": 0.0002, "step": 14722 }, { "epoch": 3.88405223585279, "grad_norm": 0.010619860142469406, "learning_rate": 4.078783082739822e-06, "loss": 0.0008, "step": 14724 }, { "epoch": 3.884579870729455, "grad_norm": 0.37540051341056824, "learning_rate": 4.07843137254902e-06, "loss": 0.0009, "step": 14726 }, { "epoch": 3.8851075056061206, "grad_norm": 0.0028457639273256063, "learning_rate": 4.078079662358216e-06, "loss": 0.0031, "step": 14728 }, { "epoch": 3.885635140482786, "grad_norm": 0.0036332574672997, "learning_rate": 4.077727952167414e-06, "loss": 0.0002, "step": 14730 }, { "epoch": 3.886162775359451, "grad_norm": 0.12213514745235443, "learning_rate": 4.077376241976611e-06, "loss": 0.0089, "step": 14732 }, { "epoch": 3.886690410236117, "grad_norm": 0.0025157174095511436, "learning_rate": 4.077024531785809e-06, "loss": 0.0002, "step": 14734 }, { "epoch": 3.887218045112782, "grad_norm": 0.004041142296046019, "learning_rate": 4.076672821595006e-06, "loss": 0.0002, "step": 14736 }, { "epoch": 3.887745679989447, "grad_norm": 0.07279825210571289, "learning_rate": 4.076321111404203e-06, "loss": 0.0003, "step": 14738 }, { "epoch": 3.8882733148661126, "grad_norm": 0.00421518087387085, "learning_rate": 4.0759694012134005e-06, "loss": 0.0002, "step": 14740 }, { "epoch": 3.888800949742778, "grad_norm": 0.13331691920757294, "learning_rate": 4.0756176910225975e-06, "loss": 0.0006, "step": 14742 }, { "epoch": 3.8893285846194434, "grad_norm": 0.4405086934566498, "learning_rate": 4.075265980831794e-06, "loss": 0.008, "step": 14744 }, { "epoch": 3.889856219496109, "grad_norm": 0.0019891380798071623, "learning_rate": 4.074914270640992e-06, "loss": 0.0002, "step": 14746 }, { "epoch": 3.8903838543727742, "grad_norm": 0.0023483692202717066, "learning_rate": 4.074562560450189e-06, "loss": 0.0012, "step": 14748 }, { "epoch": 3.890911489249439, "grad_norm": 0.2933798134326935, "learning_rate": 4.074210850259387e-06, "loss": 0.0005, "step": 14750 }, { "epoch": 3.8914391241261046, "grad_norm": 1.1058056354522705, "learning_rate": 4.073859140068583e-06, "loss": 0.0025, "step": 14752 }, { "epoch": 3.89196675900277, "grad_norm": 0.5757985711097717, "learning_rate": 4.073507429877781e-06, "loss": 0.0061, "step": 14754 }, { "epoch": 3.8924943938794354, "grad_norm": 0.8272992372512817, "learning_rate": 4.073155719686978e-06, "loss": 0.0052, "step": 14756 }, { "epoch": 3.893022028756101, "grad_norm": 0.02474563755095005, "learning_rate": 4.072804009496175e-06, "loss": 0.0004, "step": 14758 }, { "epoch": 3.8935496636327662, "grad_norm": 0.03999394550919533, "learning_rate": 4.072452299305373e-06, "loss": 0.0006, "step": 14760 }, { "epoch": 3.8940772985094316, "grad_norm": 0.023128531873226166, "learning_rate": 4.0721005891145695e-06, "loss": 0.0002, "step": 14762 }, { "epoch": 3.8946049333860966, "grad_norm": 0.5286317467689514, "learning_rate": 4.071748878923767e-06, "loss": 0.0035, "step": 14764 }, { "epoch": 3.8951325682627624, "grad_norm": 0.46955153346061707, "learning_rate": 4.0713971687329634e-06, "loss": 0.0016, "step": 14766 }, { "epoch": 3.8956602031394274, "grad_norm": 0.054210662841796875, "learning_rate": 4.071045458542161e-06, "loss": 0.0009, "step": 14768 }, { "epoch": 3.896187838016093, "grad_norm": 0.15166062116622925, "learning_rate": 4.070693748351359e-06, "loss": 0.0006, "step": 14770 }, { "epoch": 3.896715472892758, "grad_norm": 0.00991075299680233, "learning_rate": 4.070342038160556e-06, "loss": 0.0003, "step": 14772 }, { "epoch": 3.8972431077694236, "grad_norm": 0.07775512337684631, "learning_rate": 4.069990327969753e-06, "loss": 0.0006, "step": 14774 }, { "epoch": 3.897770742646089, "grad_norm": 0.03036108799278736, "learning_rate": 4.06963861777895e-06, "loss": 0.0003, "step": 14776 }, { "epoch": 3.898298377522754, "grad_norm": 0.01330969762057066, "learning_rate": 4.069286907588148e-06, "loss": 0.0018, "step": 14778 }, { "epoch": 3.89882601239942, "grad_norm": 0.004910938907414675, "learning_rate": 4.068935197397345e-06, "loss": 0.0027, "step": 14780 }, { "epoch": 3.899353647276085, "grad_norm": 0.006318842060863972, "learning_rate": 4.068583487206542e-06, "loss": 0.0002, "step": 14782 }, { "epoch": 3.89988128215275, "grad_norm": 0.005099440459161997, "learning_rate": 4.068231777015739e-06, "loss": 0.0002, "step": 14784 }, { "epoch": 3.9004089170294156, "grad_norm": 0.0027301181107759476, "learning_rate": 4.067880066824936e-06, "loss": 0.0002, "step": 14786 }, { "epoch": 3.900936551906081, "grad_norm": 0.01038419920951128, "learning_rate": 4.067528356634133e-06, "loss": 0.0004, "step": 14788 }, { "epoch": 3.9014641867827464, "grad_norm": 0.042016495019197464, "learning_rate": 4.06717664644333e-06, "loss": 0.0003, "step": 14790 }, { "epoch": 3.901991821659412, "grad_norm": 0.028251947835087776, "learning_rate": 4.066824936252528e-06, "loss": 0.0021, "step": 14792 }, { "epoch": 3.9025194565360772, "grad_norm": 5.433204650878906, "learning_rate": 4.066473226061726e-06, "loss": 0.0042, "step": 14794 }, { "epoch": 3.903047091412742, "grad_norm": 0.042323656380176544, "learning_rate": 4.066121515870922e-06, "loss": 0.0002, "step": 14796 }, { "epoch": 3.9035747262894076, "grad_norm": 0.362016499042511, "learning_rate": 4.06576980568012e-06, "loss": 0.0038, "step": 14798 }, { "epoch": 3.904102361166073, "grad_norm": 0.046764202415943146, "learning_rate": 4.065418095489317e-06, "loss": 0.0008, "step": 14800 }, { "epoch": 3.9046299960427384, "grad_norm": 0.009529453702270985, "learning_rate": 4.0650663852985145e-06, "loss": 0.0047, "step": 14802 }, { "epoch": 3.905157630919404, "grad_norm": 0.27305057644844055, "learning_rate": 4.0647146751077115e-06, "loss": 0.0061, "step": 14804 }, { "epoch": 3.9056852657960692, "grad_norm": 0.06841269135475159, "learning_rate": 4.064362964916908e-06, "loss": 0.0049, "step": 14806 }, { "epoch": 3.9062129006727346, "grad_norm": 0.5846716165542603, "learning_rate": 4.064011254726106e-06, "loss": 0.0068, "step": 14808 }, { "epoch": 3.9067405355493996, "grad_norm": 0.01386062242090702, "learning_rate": 4.063659544535303e-06, "loss": 0.0003, "step": 14810 }, { "epoch": 3.907268170426065, "grad_norm": 0.010245285928249359, "learning_rate": 4.0633078343445e-06, "loss": 0.0006, "step": 14812 }, { "epoch": 3.9077958053027304, "grad_norm": 0.44019052386283875, "learning_rate": 4.062956124153697e-06, "loss": 0.0042, "step": 14814 }, { "epoch": 3.908323440179396, "grad_norm": 0.4638724625110626, "learning_rate": 4.062604413962895e-06, "loss": 0.002, "step": 14816 }, { "epoch": 3.908851075056061, "grad_norm": 0.005528330337256193, "learning_rate": 4.062252703772092e-06, "loss": 0.0043, "step": 14818 }, { "epoch": 3.9093787099327266, "grad_norm": 0.15359702706336975, "learning_rate": 4.061900993581289e-06, "loss": 0.0008, "step": 14820 }, { "epoch": 3.909906344809392, "grad_norm": 0.349372923374176, "learning_rate": 4.0615492833904866e-06, "loss": 0.0066, "step": 14822 }, { "epoch": 3.910433979686057, "grad_norm": 0.01832953281700611, "learning_rate": 4.0611975731996835e-06, "loss": 0.0002, "step": 14824 }, { "epoch": 3.910961614562723, "grad_norm": 0.12187645584344864, "learning_rate": 4.0608458630088805e-06, "loss": 0.0024, "step": 14826 }, { "epoch": 3.911489249439388, "grad_norm": 0.005105744116008282, "learning_rate": 4.060494152818078e-06, "loss": 0.0002, "step": 14828 }, { "epoch": 3.912016884316053, "grad_norm": 0.017426036298274994, "learning_rate": 4.060142442627275e-06, "loss": 0.0022, "step": 14830 }, { "epoch": 3.9125445191927186, "grad_norm": 0.0037465281784534454, "learning_rate": 4.059790732436473e-06, "loss": 0.0023, "step": 14832 }, { "epoch": 3.913072154069384, "grad_norm": 0.14051122963428497, "learning_rate": 4.059439022245669e-06, "loss": 0.0068, "step": 14834 }, { "epoch": 3.9135997889460494, "grad_norm": 0.024610811844468117, "learning_rate": 4.059087312054867e-06, "loss": 0.0039, "step": 14836 }, { "epoch": 3.914127423822715, "grad_norm": 0.013491551391780376, "learning_rate": 4.058735601864064e-06, "loss": 0.004, "step": 14838 }, { "epoch": 3.9146550586993802, "grad_norm": 0.009190332144498825, "learning_rate": 4.058383891673262e-06, "loss": 0.0002, "step": 14840 }, { "epoch": 3.915182693576045, "grad_norm": 0.1824052333831787, "learning_rate": 4.058032181482459e-06, "loss": 0.0006, "step": 14842 }, { "epoch": 3.9157103284527106, "grad_norm": 0.004784863907843828, "learning_rate": 4.057680471291656e-06, "loss": 0.0002, "step": 14844 }, { "epoch": 3.916237963329376, "grad_norm": 0.07098512351512909, "learning_rate": 4.057328761100853e-06, "loss": 0.0004, "step": 14846 }, { "epoch": 3.9167655982060414, "grad_norm": 0.031608015298843384, "learning_rate": 4.0569770509100495e-06, "loss": 0.0003, "step": 14848 }, { "epoch": 3.917293233082707, "grad_norm": 0.3316761255264282, "learning_rate": 4.056625340719247e-06, "loss": 0.0036, "step": 14850 }, { "epoch": 3.9178208679593722, "grad_norm": 0.014005984179675579, "learning_rate": 4.056273630528445e-06, "loss": 0.0025, "step": 14852 }, { "epoch": 3.9183485028360376, "grad_norm": 0.10619007796049118, "learning_rate": 4.055921920337642e-06, "loss": 0.0066, "step": 14854 }, { "epoch": 3.9188761377127026, "grad_norm": 0.34932127594947815, "learning_rate": 4.055570210146839e-06, "loss": 0.0084, "step": 14856 }, { "epoch": 3.919403772589368, "grad_norm": 0.0291442908346653, "learning_rate": 4.055218499956036e-06, "loss": 0.0003, "step": 14858 }, { "epoch": 3.9199314074660334, "grad_norm": 0.7382806539535522, "learning_rate": 4.054866789765234e-06, "loss": 0.0126, "step": 14860 }, { "epoch": 3.920459042342699, "grad_norm": 0.01105603389441967, "learning_rate": 4.054515079574431e-06, "loss": 0.0011, "step": 14862 }, { "epoch": 3.9209866772193642, "grad_norm": 0.5600464940071106, "learning_rate": 4.054163369383628e-06, "loss": 0.0089, "step": 14864 }, { "epoch": 3.9215143120960296, "grad_norm": 0.12344864755868912, "learning_rate": 4.0538116591928255e-06, "loss": 0.0054, "step": 14866 }, { "epoch": 3.922041946972695, "grad_norm": 0.108052559196949, "learning_rate": 4.053459949002022e-06, "loss": 0.0009, "step": 14868 }, { "epoch": 3.92256958184936, "grad_norm": 0.5208352208137512, "learning_rate": 4.05310823881122e-06, "loss": 0.002, "step": 14870 }, { "epoch": 3.923097216726026, "grad_norm": 0.07351592183113098, "learning_rate": 4.052756528620416e-06, "loss": 0.0008, "step": 14872 }, { "epoch": 3.923624851602691, "grad_norm": 0.21575744450092316, "learning_rate": 4.052404818429614e-06, "loss": 0.0014, "step": 14874 }, { "epoch": 3.924152486479356, "grad_norm": 0.01924113929271698, "learning_rate": 4.052053108238811e-06, "loss": 0.0007, "step": 14876 }, { "epoch": 3.9246801213560216, "grad_norm": 0.04512329772114754, "learning_rate": 4.051701398048009e-06, "loss": 0.0005, "step": 14878 }, { "epoch": 3.925207756232687, "grad_norm": 0.02353145182132721, "learning_rate": 4.051349687857206e-06, "loss": 0.0019, "step": 14880 }, { "epoch": 3.9257353911093524, "grad_norm": 0.023422108963131905, "learning_rate": 4.050997977666403e-06, "loss": 0.0004, "step": 14882 }, { "epoch": 3.926263025986018, "grad_norm": 0.38717716932296753, "learning_rate": 4.0506462674756006e-06, "loss": 0.0011, "step": 14884 }, { "epoch": 3.9267906608626832, "grad_norm": 0.6520988345146179, "learning_rate": 4.0502945572847975e-06, "loss": 0.0061, "step": 14886 }, { "epoch": 3.927318295739348, "grad_norm": 0.04016818851232529, "learning_rate": 4.0499428470939945e-06, "loss": 0.0007, "step": 14888 }, { "epoch": 3.9278459306160136, "grad_norm": 0.017927661538124084, "learning_rate": 4.049591136903192e-06, "loss": 0.0003, "step": 14890 }, { "epoch": 3.928373565492679, "grad_norm": 0.06775154918432236, "learning_rate": 4.049239426712389e-06, "loss": 0.0006, "step": 14892 }, { "epoch": 3.9289012003693444, "grad_norm": 1.5313762426376343, "learning_rate": 4.048887716521586e-06, "loss": 0.0104, "step": 14894 }, { "epoch": 3.92942883524601, "grad_norm": 0.06240496784448624, "learning_rate": 4.048536006330783e-06, "loss": 0.0005, "step": 14896 }, { "epoch": 3.9299564701226752, "grad_norm": 0.114476777613163, "learning_rate": 4.048184296139981e-06, "loss": 0.0007, "step": 14898 }, { "epoch": 3.9304841049993406, "grad_norm": 0.16810691356658936, "learning_rate": 4.047832585949178e-06, "loss": 0.0008, "step": 14900 }, { "epoch": 3.9310117398760056, "grad_norm": 0.04375164210796356, "learning_rate": 4.047480875758375e-06, "loss": 0.0005, "step": 14902 }, { "epoch": 3.931539374752671, "grad_norm": 0.05346313491463661, "learning_rate": 4.047129165567573e-06, "loss": 0.0005, "step": 14904 }, { "epoch": 3.9320670096293364, "grad_norm": 0.015466609969735146, "learning_rate": 4.04677745537677e-06, "loss": 0.0004, "step": 14906 }, { "epoch": 3.932594644506002, "grad_norm": 0.02235412411391735, "learning_rate": 4.046425745185967e-06, "loss": 0.0003, "step": 14908 }, { "epoch": 3.9331222793826672, "grad_norm": 0.010131650604307652, "learning_rate": 4.046074034995164e-06, "loss": 0.0002, "step": 14910 }, { "epoch": 3.9336499142593326, "grad_norm": 0.03474024683237076, "learning_rate": 4.045722324804361e-06, "loss": 0.0002, "step": 14912 }, { "epoch": 3.934177549135998, "grad_norm": 0.005937001667916775, "learning_rate": 4.045370614613559e-06, "loss": 0.0032, "step": 14914 }, { "epoch": 3.934705184012663, "grad_norm": 0.604207456111908, "learning_rate": 4.045018904422755e-06, "loss": 0.0026, "step": 14916 }, { "epoch": 3.935232818889329, "grad_norm": 0.054536812007427216, "learning_rate": 4.044667194231953e-06, "loss": 0.0003, "step": 14918 }, { "epoch": 3.935760453765994, "grad_norm": 0.46857038140296936, "learning_rate": 4.04431548404115e-06, "loss": 0.0033, "step": 14920 }, { "epoch": 3.936288088642659, "grad_norm": 0.08701054006814957, "learning_rate": 4.043963773850348e-06, "loss": 0.0065, "step": 14922 }, { "epoch": 3.9368157235193246, "grad_norm": 0.14453287422657013, "learning_rate": 4.043612063659545e-06, "loss": 0.0005, "step": 14924 }, { "epoch": 3.93734335839599, "grad_norm": 0.01854010671377182, "learning_rate": 4.043260353468742e-06, "loss": 0.0003, "step": 14926 }, { "epoch": 3.9378709932726554, "grad_norm": 0.05096431076526642, "learning_rate": 4.0429086432779394e-06, "loss": 0.0033, "step": 14928 }, { "epoch": 3.9383986281493204, "grad_norm": 0.4061216413974762, "learning_rate": 4.042556933087136e-06, "loss": 0.0044, "step": 14930 }, { "epoch": 3.9389262630259863, "grad_norm": 0.012012061662971973, "learning_rate": 4.042205222896333e-06, "loss": 0.0127, "step": 14932 }, { "epoch": 3.939453897902651, "grad_norm": 0.055233635008335114, "learning_rate": 4.04185351270553e-06, "loss": 0.0014, "step": 14934 }, { "epoch": 3.9399815327793166, "grad_norm": 0.010928763076663017, "learning_rate": 4.041501802514728e-06, "loss": 0.0049, "step": 14936 }, { "epoch": 3.940509167655982, "grad_norm": 0.004522653762251139, "learning_rate": 4.041150092323926e-06, "loss": 0.0031, "step": 14938 }, { "epoch": 3.9410368025326474, "grad_norm": 0.054450854659080505, "learning_rate": 4.040798382133122e-06, "loss": 0.0008, "step": 14940 }, { "epoch": 3.941564437409313, "grad_norm": 0.1188664510846138, "learning_rate": 4.04044667194232e-06, "loss": 0.0013, "step": 14942 }, { "epoch": 3.9420920722859782, "grad_norm": 0.5507012605667114, "learning_rate": 4.040094961751517e-06, "loss": 0.0021, "step": 14944 }, { "epoch": 3.9426197071626436, "grad_norm": 0.009327755309641361, "learning_rate": 4.039743251560714e-06, "loss": 0.0066, "step": 14946 }, { "epoch": 3.9431473420393086, "grad_norm": 0.021489473059773445, "learning_rate": 4.0393915413699115e-06, "loss": 0.0036, "step": 14948 }, { "epoch": 3.943674976915974, "grad_norm": 0.14818313717842102, "learning_rate": 4.0390398311791085e-06, "loss": 0.0051, "step": 14950 }, { "epoch": 3.9442026117926394, "grad_norm": 0.08945269882678986, "learning_rate": 4.038688120988306e-06, "loss": 0.0032, "step": 14952 }, { "epoch": 3.944730246669305, "grad_norm": 0.022596051916480064, "learning_rate": 4.038336410797502e-06, "loss": 0.0004, "step": 14954 }, { "epoch": 3.9452578815459702, "grad_norm": 0.25775715708732605, "learning_rate": 4.0379847006067e-06, "loss": 0.0017, "step": 14956 }, { "epoch": 3.9457855164226356, "grad_norm": 0.030944494530558586, "learning_rate": 4.037632990415897e-06, "loss": 0.0004, "step": 14958 }, { "epoch": 3.946313151299301, "grad_norm": 0.5893661975860596, "learning_rate": 4.037281280225095e-06, "loss": 0.0125, "step": 14960 }, { "epoch": 3.946840786175966, "grad_norm": 0.06279243528842926, "learning_rate": 4.036929570034292e-06, "loss": 0.0007, "step": 14962 }, { "epoch": 3.9473684210526314, "grad_norm": 0.03111395798623562, "learning_rate": 4.036577859843489e-06, "loss": 0.0005, "step": 14964 }, { "epoch": 3.947896055929297, "grad_norm": 0.07334370911121368, "learning_rate": 4.036226149652687e-06, "loss": 0.0009, "step": 14966 }, { "epoch": 3.9484236908059622, "grad_norm": 0.10312946140766144, "learning_rate": 4.0358744394618836e-06, "loss": 0.0009, "step": 14968 }, { "epoch": 3.9489513256826276, "grad_norm": 0.008523762226104736, "learning_rate": 4.0355227292710805e-06, "loss": 0.0003, "step": 14970 }, { "epoch": 3.949478960559293, "grad_norm": 0.03233693912625313, "learning_rate": 4.035171019080278e-06, "loss": 0.0003, "step": 14972 }, { "epoch": 3.9500065954359584, "grad_norm": 0.022240208461880684, "learning_rate": 4.034819308889475e-06, "loss": 0.0004, "step": 14974 }, { "epoch": 3.9505342303126234, "grad_norm": 0.012321263551712036, "learning_rate": 4.034467598698672e-06, "loss": 0.0002, "step": 14976 }, { "epoch": 3.9510618651892893, "grad_norm": 0.009777034632861614, "learning_rate": 4.034115888507869e-06, "loss": 0.0002, "step": 14978 }, { "epoch": 3.951589500065954, "grad_norm": 0.292646586894989, "learning_rate": 4.033764178317067e-06, "loss": 0.0071, "step": 14980 }, { "epoch": 3.9521171349426196, "grad_norm": 0.4716857969760895, "learning_rate": 4.033412468126264e-06, "loss": 0.0047, "step": 14982 }, { "epoch": 3.952644769819285, "grad_norm": 0.021792396903038025, "learning_rate": 4.033060757935461e-06, "loss": 0.0003, "step": 14984 }, { "epoch": 3.9531724046959504, "grad_norm": 0.5643140077590942, "learning_rate": 4.032709047744659e-06, "loss": 0.0064, "step": 14986 }, { "epoch": 3.953700039572616, "grad_norm": 0.022170880809426308, "learning_rate": 4.032357337553856e-06, "loss": 0.0002, "step": 14988 }, { "epoch": 3.9542276744492812, "grad_norm": 0.03429948166012764, "learning_rate": 4.0320056273630534e-06, "loss": 0.0058, "step": 14990 }, { "epoch": 3.9547553093259467, "grad_norm": 0.007257308345288038, "learning_rate": 4.0316539171722495e-06, "loss": 0.0003, "step": 14992 }, { "epoch": 3.9552829442026116, "grad_norm": 0.04357863962650299, "learning_rate": 4.031302206981447e-06, "loss": 0.0014, "step": 14994 }, { "epoch": 3.955810579079277, "grad_norm": 0.31662464141845703, "learning_rate": 4.030950496790645e-06, "loss": 0.0013, "step": 14996 }, { "epoch": 3.9563382139559424, "grad_norm": 0.5022873282432556, "learning_rate": 4.030598786599842e-06, "loss": 0.002, "step": 14998 }, { "epoch": 3.956865848832608, "grad_norm": 0.0796300545334816, "learning_rate": 4.030247076409039e-06, "loss": 0.0004, "step": 15000 }, { "epoch": 3.9573934837092732, "grad_norm": 0.04167192429304123, "learning_rate": 4.029895366218236e-06, "loss": 0.0004, "step": 15002 }, { "epoch": 3.9579211185859386, "grad_norm": 0.19687794148921967, "learning_rate": 4.029543656027434e-06, "loss": 0.0156, "step": 15004 }, { "epoch": 3.958448753462604, "grad_norm": 0.2364587038755417, "learning_rate": 4.029191945836631e-06, "loss": 0.0015, "step": 15006 }, { "epoch": 3.958976388339269, "grad_norm": 0.018140243366360664, "learning_rate": 4.028840235645828e-06, "loss": 0.0003, "step": 15008 }, { "epoch": 3.9595040232159344, "grad_norm": 0.010329612530767918, "learning_rate": 4.0284885254550255e-06, "loss": 0.0003, "step": 15010 }, { "epoch": 3.9600316580926, "grad_norm": 0.3455526828765869, "learning_rate": 4.0281368152642225e-06, "loss": 0.0011, "step": 15012 }, { "epoch": 3.9605592929692652, "grad_norm": 0.016616057604551315, "learning_rate": 4.027785105073419e-06, "loss": 0.0002, "step": 15014 }, { "epoch": 3.9610869278459306, "grad_norm": 0.3297424018383026, "learning_rate": 4.027433394882616e-06, "loss": 0.0074, "step": 15016 }, { "epoch": 3.961614562722596, "grad_norm": 0.17648692429065704, "learning_rate": 4.027081684691814e-06, "loss": 0.0008, "step": 15018 }, { "epoch": 3.9621421975992615, "grad_norm": 0.02207639254629612, "learning_rate": 4.026729974501012e-06, "loss": 0.0003, "step": 15020 }, { "epoch": 3.9626698324759264, "grad_norm": 0.46870899200439453, "learning_rate": 4.026378264310208e-06, "loss": 0.0011, "step": 15022 }, { "epoch": 3.9631974673525923, "grad_norm": 0.006390565074980259, "learning_rate": 4.026026554119406e-06, "loss": 0.0009, "step": 15024 }, { "epoch": 3.9637251022292572, "grad_norm": 0.00720831798389554, "learning_rate": 4.025674843928603e-06, "loss": 0.0002, "step": 15026 }, { "epoch": 3.9642527371059226, "grad_norm": 0.006822033319622278, "learning_rate": 4.025323133737801e-06, "loss": 0.0004, "step": 15028 }, { "epoch": 3.964780371982588, "grad_norm": 0.13058848679065704, "learning_rate": 4.0249714235469976e-06, "loss": 0.0006, "step": 15030 }, { "epoch": 3.9653080068592534, "grad_norm": 0.03320057690143585, "learning_rate": 4.0246197133561945e-06, "loss": 0.0003, "step": 15032 }, { "epoch": 3.965835641735919, "grad_norm": 0.035012755542993546, "learning_rate": 4.024268003165392e-06, "loss": 0.0102, "step": 15034 }, { "epoch": 3.9663632766125843, "grad_norm": 0.09849902987480164, "learning_rate": 4.0239162929745884e-06, "loss": 0.0023, "step": 15036 }, { "epoch": 3.9668909114892497, "grad_norm": 0.10506121814250946, "learning_rate": 4.023564582783786e-06, "loss": 0.0078, "step": 15038 }, { "epoch": 3.9674185463659146, "grad_norm": 0.02055138349533081, "learning_rate": 4.023212872592983e-06, "loss": 0.0002, "step": 15040 }, { "epoch": 3.96794618124258, "grad_norm": 0.010355482809245586, "learning_rate": 4.022861162402181e-06, "loss": 0.0002, "step": 15042 }, { "epoch": 3.9684738161192454, "grad_norm": 0.04272575303912163, "learning_rate": 4.022509452211378e-06, "loss": 0.0004, "step": 15044 }, { "epoch": 3.969001450995911, "grad_norm": 0.081512451171875, "learning_rate": 4.022157742020575e-06, "loss": 0.0064, "step": 15046 }, { "epoch": 3.9695290858725762, "grad_norm": 0.006558577995747328, "learning_rate": 4.021806031829773e-06, "loss": 0.0002, "step": 15048 }, { "epoch": 3.9700567207492417, "grad_norm": 0.020434420555830002, "learning_rate": 4.02145432163897e-06, "loss": 0.0002, "step": 15050 }, { "epoch": 3.970584355625907, "grad_norm": 0.012618296779692173, "learning_rate": 4.021102611448167e-06, "loss": 0.0031, "step": 15052 }, { "epoch": 3.971111990502572, "grad_norm": 0.01442679576575756, "learning_rate": 4.020750901257364e-06, "loss": 0.0042, "step": 15054 }, { "epoch": 3.9716396253792374, "grad_norm": 0.010881087742745876, "learning_rate": 4.020399191066561e-06, "loss": 0.0003, "step": 15056 }, { "epoch": 3.972167260255903, "grad_norm": 0.10030483454465866, "learning_rate": 4.020047480875759e-06, "loss": 0.0004, "step": 15058 }, { "epoch": 3.9726948951325682, "grad_norm": 0.6058385968208313, "learning_rate": 4.019695770684955e-06, "loss": 0.0111, "step": 15060 }, { "epoch": 3.9732225300092336, "grad_norm": 0.09755932539701462, "learning_rate": 4.019344060494153e-06, "loss": 0.0006, "step": 15062 }, { "epoch": 3.973750164885899, "grad_norm": 0.04058809578418732, "learning_rate": 4.01899235030335e-06, "loss": 0.0003, "step": 15064 }, { "epoch": 3.9742777997625645, "grad_norm": 0.4337780177593231, "learning_rate": 4.018640640112547e-06, "loss": 0.0018, "step": 15066 }, { "epoch": 3.9748054346392294, "grad_norm": 0.2145991176366806, "learning_rate": 4.018288929921745e-06, "loss": 0.0062, "step": 15068 }, { "epoch": 3.9753330695158953, "grad_norm": 0.0867135226726532, "learning_rate": 4.017937219730942e-06, "loss": 0.0004, "step": 15070 }, { "epoch": 3.9758607043925602, "grad_norm": 0.01915469393134117, "learning_rate": 4.0175855095401395e-06, "loss": 0.0004, "step": 15072 }, { "epoch": 3.9763883392692256, "grad_norm": 0.024315953254699707, "learning_rate": 4.017233799349336e-06, "loss": 0.0003, "step": 15074 }, { "epoch": 3.976915974145891, "grad_norm": 0.025162115693092346, "learning_rate": 4.016882089158533e-06, "loss": 0.0004, "step": 15076 }, { "epoch": 3.9774436090225564, "grad_norm": 0.1902977079153061, "learning_rate": 4.016530378967731e-06, "loss": 0.0054, "step": 15078 }, { "epoch": 3.977971243899222, "grad_norm": 0.08229722827672958, "learning_rate": 4.016178668776928e-06, "loss": 0.0067, "step": 15080 }, { "epoch": 3.978498878775887, "grad_norm": 0.017391277477145195, "learning_rate": 4.015826958586125e-06, "loss": 0.0002, "step": 15082 }, { "epoch": 3.9790265136525527, "grad_norm": 0.3411961495876312, "learning_rate": 4.015475248395322e-06, "loss": 0.0018, "step": 15084 }, { "epoch": 3.9795541485292176, "grad_norm": 0.01058447826653719, "learning_rate": 4.01512353820452e-06, "loss": 0.0003, "step": 15086 }, { "epoch": 3.980081783405883, "grad_norm": 0.07068940252065659, "learning_rate": 4.014771828013717e-06, "loss": 0.0006, "step": 15088 }, { "epoch": 3.9806094182825484, "grad_norm": 0.0618891641497612, "learning_rate": 4.014420117822914e-06, "loss": 0.0031, "step": 15090 }, { "epoch": 3.981137053159214, "grad_norm": 0.011281929910182953, "learning_rate": 4.0140684076321116e-06, "loss": 0.0035, "step": 15092 }, { "epoch": 3.9816646880358793, "grad_norm": 0.05389130860567093, "learning_rate": 4.0137166974413085e-06, "loss": 0.0004, "step": 15094 }, { "epoch": 3.9821923229125447, "grad_norm": 0.5221900939941406, "learning_rate": 4.0133649872505055e-06, "loss": 0.0078, "step": 15096 }, { "epoch": 3.98271995778921, "grad_norm": 0.17290909588336945, "learning_rate": 4.013013277059702e-06, "loss": 0.0007, "step": 15098 }, { "epoch": 3.983247592665875, "grad_norm": 0.18827851116657257, "learning_rate": 4.0126615668689e-06, "loss": 0.0031, "step": 15100 }, { "epoch": 3.9837752275425404, "grad_norm": 0.00813408475369215, "learning_rate": 4.012309856678097e-06, "loss": 0.0002, "step": 15102 }, { "epoch": 3.984302862419206, "grad_norm": 0.10672970861196518, "learning_rate": 4.011958146487294e-06, "loss": 0.0023, "step": 15104 }, { "epoch": 3.9848304972958712, "grad_norm": 0.03593005985021591, "learning_rate": 4.011606436296492e-06, "loss": 0.0003, "step": 15106 }, { "epoch": 3.9853581321725366, "grad_norm": 0.16911828517913818, "learning_rate": 4.011254726105689e-06, "loss": 0.0005, "step": 15108 }, { "epoch": 3.985885767049202, "grad_norm": 0.00823164451867342, "learning_rate": 4.010903015914887e-06, "loss": 0.0004, "step": 15110 }, { "epoch": 3.9864134019258675, "grad_norm": 0.017974428832530975, "learning_rate": 4.010551305724084e-06, "loss": 0.0003, "step": 15112 }, { "epoch": 3.9869410368025324, "grad_norm": 0.6214865446090698, "learning_rate": 4.010199595533281e-06, "loss": 0.0043, "step": 15114 }, { "epoch": 3.987468671679198, "grad_norm": 0.006206200458109379, "learning_rate": 4.009847885342478e-06, "loss": 0.0002, "step": 15116 }, { "epoch": 3.9879963065558632, "grad_norm": 0.44689756631851196, "learning_rate": 4.009496175151675e-06, "loss": 0.0067, "step": 15118 }, { "epoch": 3.9885239414325286, "grad_norm": 0.06088964268565178, "learning_rate": 4.009144464960872e-06, "loss": 0.0004, "step": 15120 }, { "epoch": 3.989051576309194, "grad_norm": 0.14027826488018036, "learning_rate": 4.008792754770069e-06, "loss": 0.0006, "step": 15122 }, { "epoch": 3.9895792111858595, "grad_norm": 0.07607714831829071, "learning_rate": 4.008441044579267e-06, "loss": 0.0003, "step": 15124 }, { "epoch": 3.990106846062525, "grad_norm": 0.2289760708808899, "learning_rate": 4.008089334388464e-06, "loss": 0.0056, "step": 15126 }, { "epoch": 3.99063448093919, "grad_norm": 0.558377742767334, "learning_rate": 4.007737624197661e-06, "loss": 0.0027, "step": 15128 }, { "epoch": 3.9911621158158557, "grad_norm": 0.4184040129184723, "learning_rate": 4.007385914006859e-06, "loss": 0.0114, "step": 15130 }, { "epoch": 3.9916897506925206, "grad_norm": 0.010896530002355576, "learning_rate": 4.007034203816056e-06, "loss": 0.0002, "step": 15132 }, { "epoch": 3.992217385569186, "grad_norm": 0.06307945400476456, "learning_rate": 4.006682493625253e-06, "loss": 0.0004, "step": 15134 }, { "epoch": 3.9927450204458514, "grad_norm": 0.007470361888408661, "learning_rate": 4.0063307834344504e-06, "loss": 0.0002, "step": 15136 }, { "epoch": 3.993272655322517, "grad_norm": 0.050618935376405716, "learning_rate": 4.005979073243647e-06, "loss": 0.0003, "step": 15138 }, { "epoch": 3.9938002901991823, "grad_norm": 0.005871023517102003, "learning_rate": 4.005627363052845e-06, "loss": 0.0002, "step": 15140 }, { "epoch": 3.9943279250758477, "grad_norm": 0.005134359933435917, "learning_rate": 4.005275652862041e-06, "loss": 0.0009, "step": 15142 }, { "epoch": 3.994855559952513, "grad_norm": 1.0017369985580444, "learning_rate": 4.004923942671239e-06, "loss": 0.008, "step": 15144 }, { "epoch": 3.995383194829178, "grad_norm": 0.014216039329767227, "learning_rate": 4.004572232480436e-06, "loss": 0.0008, "step": 15146 }, { "epoch": 3.9959108297058434, "grad_norm": 0.05305333435535431, "learning_rate": 4.004220522289634e-06, "loss": 0.0004, "step": 15148 }, { "epoch": 3.996438464582509, "grad_norm": 0.03582227975130081, "learning_rate": 4.003868812098831e-06, "loss": 0.0003, "step": 15150 }, { "epoch": 3.9969660994591742, "grad_norm": 0.4993690848350525, "learning_rate": 4.003517101908028e-06, "loss": 0.004, "step": 15152 }, { "epoch": 3.9974937343358397, "grad_norm": 0.0065236142836511135, "learning_rate": 4.0031653917172256e-06, "loss": 0.0003, "step": 15154 }, { "epoch": 3.998021369212505, "grad_norm": 0.008838809095323086, "learning_rate": 4.002813681526422e-06, "loss": 0.0003, "step": 15156 }, { "epoch": 3.9985490040891705, "grad_norm": 0.021674927324056625, "learning_rate": 4.0024619713356195e-06, "loss": 0.004, "step": 15158 }, { "epoch": 3.9990766389658354, "grad_norm": 0.006776808761060238, "learning_rate": 4.002110261144816e-06, "loss": 0.0002, "step": 15160 }, { "epoch": 3.999604273842501, "grad_norm": 0.034288667142391205, "learning_rate": 4.001758550954014e-06, "loss": 0.0003, "step": 15162 }, { "epoch": 4.0, "grad_norm": 0.004190049134194851, "learning_rate": 4.001406840763211e-06, "loss": 0.0066, "step": 15164 }, { "epoch": 4.000527634876665, "grad_norm": 0.011730051599442959, "learning_rate": 4.001055130572408e-06, "loss": 0.0003, "step": 15166 }, { "epoch": 4.001055269753331, "grad_norm": 0.018776698037981987, "learning_rate": 4.000703420381606e-06, "loss": 0.0011, "step": 15168 }, { "epoch": 4.001582904629996, "grad_norm": 0.02201603353023529, "learning_rate": 4.000351710190803e-06, "loss": 0.0042, "step": 15170 }, { "epoch": 4.002110539506662, "grad_norm": 0.03365008533000946, "learning_rate": 4e-06, "loss": 0.002, "step": 15172 }, { "epoch": 4.002638174383327, "grad_norm": 0.039022766053676605, "learning_rate": 3.999648289809198e-06, "loss": 0.0004, "step": 15174 }, { "epoch": 4.003165809259992, "grad_norm": 0.05630526691675186, "learning_rate": 3.9992965796183946e-06, "loss": 0.0003, "step": 15176 }, { "epoch": 4.003693444136657, "grad_norm": 0.17140808701515198, "learning_rate": 3.998944869427592e-06, "loss": 0.0007, "step": 15178 }, { "epoch": 4.004221079013322, "grad_norm": 0.49771368503570557, "learning_rate": 3.9985931592367885e-06, "loss": 0.0048, "step": 15180 }, { "epoch": 4.004748713889988, "grad_norm": 0.053307343274354935, "learning_rate": 3.998241449045986e-06, "loss": 0.0004, "step": 15182 }, { "epoch": 4.005276348766653, "grad_norm": 0.2910599112510681, "learning_rate": 3.997889738855183e-06, "loss": 0.0018, "step": 15184 }, { "epoch": 4.005803983643319, "grad_norm": 0.025449415668845177, "learning_rate": 3.99753802866438e-06, "loss": 0.0055, "step": 15186 }, { "epoch": 4.006331618519984, "grad_norm": 0.07291445136070251, "learning_rate": 3.997186318473578e-06, "loss": 0.0005, "step": 15188 }, { "epoch": 4.00685925339665, "grad_norm": 0.05261271446943283, "learning_rate": 3.996834608282775e-06, "loss": 0.0005, "step": 15190 }, { "epoch": 4.007386888273315, "grad_norm": 0.011698339134454727, "learning_rate": 3.996482898091973e-06, "loss": 0.0065, "step": 15192 }, { "epoch": 4.007914523149981, "grad_norm": 0.021015532314777374, "learning_rate": 3.99613118790117e-06, "loss": 0.0003, "step": 15194 }, { "epoch": 4.008442158026646, "grad_norm": 0.026702381670475006, "learning_rate": 3.995779477710367e-06, "loss": 0.0004, "step": 15196 }, { "epoch": 4.008969792903311, "grad_norm": 0.022406818345189095, "learning_rate": 3.9954277675195644e-06, "loss": 0.0003, "step": 15198 }, { "epoch": 4.009497427779976, "grad_norm": 0.0097163375467062, "learning_rate": 3.995076057328761e-06, "loss": 0.0002, "step": 15200 }, { "epoch": 4.010025062656641, "grad_norm": 0.01594052091240883, "learning_rate": 3.994724347137958e-06, "loss": 0.0002, "step": 15202 }, { "epoch": 4.010552697533307, "grad_norm": 0.0058207125402987, "learning_rate": 3.994372636947155e-06, "loss": 0.0002, "step": 15204 }, { "epoch": 4.011080332409972, "grad_norm": 0.037431467324495316, "learning_rate": 3.994020926756353e-06, "loss": 0.0003, "step": 15206 }, { "epoch": 4.011607967286638, "grad_norm": 0.014962752349674702, "learning_rate": 3.99366921656555e-06, "loss": 0.0005, "step": 15208 }, { "epoch": 4.012135602163303, "grad_norm": 0.03196221962571144, "learning_rate": 3.993317506374747e-06, "loss": 0.0013, "step": 15210 }, { "epoch": 4.012663237039968, "grad_norm": 0.3245704770088196, "learning_rate": 3.992965796183945e-06, "loss": 0.0015, "step": 15212 }, { "epoch": 4.013190871916634, "grad_norm": 0.12213491648435593, "learning_rate": 3.992614085993142e-06, "loss": 0.0007, "step": 15214 }, { "epoch": 4.013718506793299, "grad_norm": 0.7295018434524536, "learning_rate": 3.992262375802339e-06, "loss": 0.0044, "step": 15216 }, { "epoch": 4.014246141669965, "grad_norm": 0.23036131262779236, "learning_rate": 3.991910665611536e-06, "loss": 0.0011, "step": 15218 }, { "epoch": 4.01477377654663, "grad_norm": 0.17681117355823517, "learning_rate": 3.9915589554207335e-06, "loss": 0.0004, "step": 15220 }, { "epoch": 4.015301411423295, "grad_norm": 0.0037591976579278708, "learning_rate": 3.991207245229931e-06, "loss": 0.0002, "step": 15222 }, { "epoch": 4.01582904629996, "grad_norm": 0.020391173660755157, "learning_rate": 3.990855535039127e-06, "loss": 0.0002, "step": 15224 }, { "epoch": 4.016356681176625, "grad_norm": 0.030625691637396812, "learning_rate": 3.990503824848325e-06, "loss": 0.0004, "step": 15226 }, { "epoch": 4.016884316053291, "grad_norm": 0.004226693883538246, "learning_rate": 3.990152114657522e-06, "loss": 0.0002, "step": 15228 }, { "epoch": 4.017411950929956, "grad_norm": 0.0035120518878102303, "learning_rate": 3.98980040446672e-06, "loss": 0.0002, "step": 15230 }, { "epoch": 4.017939585806622, "grad_norm": 0.2400311380624771, "learning_rate": 3.989448694275917e-06, "loss": 0.0054, "step": 15232 }, { "epoch": 4.018467220683287, "grad_norm": 0.05416949838399887, "learning_rate": 3.989096984085114e-06, "loss": 0.0039, "step": 15234 }, { "epoch": 4.018994855559953, "grad_norm": 1.091807246208191, "learning_rate": 3.988745273894312e-06, "loss": 0.0094, "step": 15236 }, { "epoch": 4.019522490436618, "grad_norm": 0.5715978741645813, "learning_rate": 3.9883935637035086e-06, "loss": 0.0025, "step": 15238 }, { "epoch": 4.020050125313283, "grad_norm": 0.3040086627006531, "learning_rate": 3.9880418535127055e-06, "loss": 0.0009, "step": 15240 }, { "epoch": 4.020577760189949, "grad_norm": 0.07004305720329285, "learning_rate": 3.9876901433219025e-06, "loss": 0.0029, "step": 15242 }, { "epoch": 4.021105395066614, "grad_norm": 0.008693939074873924, "learning_rate": 3.9873384331311e-06, "loss": 0.0004, "step": 15244 }, { "epoch": 4.021633029943279, "grad_norm": 0.29466360807418823, "learning_rate": 3.986986722940297e-06, "loss": 0.0066, "step": 15246 }, { "epoch": 4.022160664819944, "grad_norm": 0.020105265080928802, "learning_rate": 3.986635012749494e-06, "loss": 0.0056, "step": 15248 }, { "epoch": 4.02268829969661, "grad_norm": 0.035498734563589096, "learning_rate": 3.986283302558692e-06, "loss": 0.0055, "step": 15250 }, { "epoch": 4.023215934573275, "grad_norm": 0.40290963649749756, "learning_rate": 3.985931592367889e-06, "loss": 0.0013, "step": 15252 }, { "epoch": 4.023743569449941, "grad_norm": 0.012578452937304974, "learning_rate": 3.985579882177086e-06, "loss": 0.0003, "step": 15254 }, { "epoch": 4.024271204326606, "grad_norm": 0.18680115044116974, "learning_rate": 3.985228171986284e-06, "loss": 0.0029, "step": 15256 }, { "epoch": 4.024798839203271, "grad_norm": 0.014475596137344837, "learning_rate": 3.984876461795481e-06, "loss": 0.0031, "step": 15258 }, { "epoch": 4.025326474079937, "grad_norm": 0.233926922082901, "learning_rate": 3.9845247516046784e-06, "loss": 0.0058, "step": 15260 }, { "epoch": 4.025854108956602, "grad_norm": 0.06408880650997162, "learning_rate": 3.9841730414138745e-06, "loss": 0.0007, "step": 15262 }, { "epoch": 4.026381743833268, "grad_norm": 0.46802741289138794, "learning_rate": 3.983821331223072e-06, "loss": 0.0015, "step": 15264 }, { "epoch": 4.026909378709933, "grad_norm": 0.04645736888051033, "learning_rate": 3.983469621032269e-06, "loss": 0.001, "step": 15266 }, { "epoch": 4.0274370135865984, "grad_norm": 0.1641600877046585, "learning_rate": 3.983117910841467e-06, "loss": 0.0006, "step": 15268 }, { "epoch": 4.027964648463263, "grad_norm": 0.011821418069303036, "learning_rate": 3.982766200650664e-06, "loss": 0.0014, "step": 15270 }, { "epoch": 4.028492283339928, "grad_norm": 0.03577462211251259, "learning_rate": 3.982414490459861e-06, "loss": 0.0006, "step": 15272 }, { "epoch": 4.029019918216594, "grad_norm": 0.018702570348978043, "learning_rate": 3.982062780269059e-06, "loss": 0.0013, "step": 15274 }, { "epoch": 4.029547553093259, "grad_norm": 0.02270079404115677, "learning_rate": 3.981711070078255e-06, "loss": 0.0003, "step": 15276 }, { "epoch": 4.030075187969925, "grad_norm": 0.02222856506705284, "learning_rate": 3.981359359887453e-06, "loss": 0.0003, "step": 15278 }, { "epoch": 4.03060282284659, "grad_norm": 0.16802506148815155, "learning_rate": 3.9810076496966505e-06, "loss": 0.0025, "step": 15280 }, { "epoch": 4.031130457723256, "grad_norm": 0.12808141112327576, "learning_rate": 3.9806559395058474e-06, "loss": 0.0004, "step": 15282 }, { "epoch": 4.031658092599921, "grad_norm": 0.02004289999604225, "learning_rate": 3.980304229315044e-06, "loss": 0.0004, "step": 15284 }, { "epoch": 4.032185727476586, "grad_norm": 0.032264284789562225, "learning_rate": 3.979952519124241e-06, "loss": 0.0003, "step": 15286 }, { "epoch": 4.032713362353252, "grad_norm": 0.02208717353641987, "learning_rate": 3.979600808933439e-06, "loss": 0.0003, "step": 15288 }, { "epoch": 4.033240997229917, "grad_norm": 0.01812305673956871, "learning_rate": 3.979249098742636e-06, "loss": 0.0002, "step": 15290 }, { "epoch": 4.033768632106582, "grad_norm": 0.3551846146583557, "learning_rate": 3.978897388551833e-06, "loss": 0.0035, "step": 15292 }, { "epoch": 4.034296266983247, "grad_norm": 0.005468605551868677, "learning_rate": 3.978545678361031e-06, "loss": 0.0002, "step": 15294 }, { "epoch": 4.034823901859913, "grad_norm": 0.2688567340373993, "learning_rate": 3.978193968170228e-06, "loss": 0.0038, "step": 15296 }, { "epoch": 4.035351536736578, "grad_norm": 0.015467955730855465, "learning_rate": 3.977842257979426e-06, "loss": 0.0004, "step": 15298 }, { "epoch": 4.035879171613244, "grad_norm": 0.01789657212793827, "learning_rate": 3.977490547788622e-06, "loss": 0.0048, "step": 15300 }, { "epoch": 4.036406806489909, "grad_norm": 0.36116155982017517, "learning_rate": 3.9771388375978195e-06, "loss": 0.0019, "step": 15302 }, { "epoch": 4.036934441366574, "grad_norm": 0.039408937096595764, "learning_rate": 3.976787127407017e-06, "loss": 0.0008, "step": 15304 }, { "epoch": 4.03746207624324, "grad_norm": 0.007895364426076412, "learning_rate": 3.976435417216213e-06, "loss": 0.0002, "step": 15306 }, { "epoch": 4.037989711119905, "grad_norm": 0.011117496527731419, "learning_rate": 3.976083707025411e-06, "loss": 0.0002, "step": 15308 }, { "epoch": 4.038517345996571, "grad_norm": 0.008099062368273735, "learning_rate": 3.975731996834608e-06, "loss": 0.0002, "step": 15310 }, { "epoch": 4.039044980873236, "grad_norm": 0.008447905071079731, "learning_rate": 3.975380286643806e-06, "loss": 0.0002, "step": 15312 }, { "epoch": 4.0395726157499015, "grad_norm": 0.4610337018966675, "learning_rate": 3.975028576453003e-06, "loss": 0.0067, "step": 15314 }, { "epoch": 4.040100250626566, "grad_norm": 0.04470338672399521, "learning_rate": 3.9746768662622e-06, "loss": 0.0004, "step": 15316 }, { "epoch": 4.040627885503231, "grad_norm": 0.006361295469105244, "learning_rate": 3.974325156071398e-06, "loss": 0.0002, "step": 15318 }, { "epoch": 4.041155520379897, "grad_norm": 0.008632328361272812, "learning_rate": 3.973973445880595e-06, "loss": 0.001, "step": 15320 }, { "epoch": 4.041683155256562, "grad_norm": 0.011873436160385609, "learning_rate": 3.9736217356897916e-06, "loss": 0.0004, "step": 15322 }, { "epoch": 4.042210790133228, "grad_norm": 0.1477673053741455, "learning_rate": 3.9732700254989885e-06, "loss": 0.0008, "step": 15324 }, { "epoch": 4.042738425009893, "grad_norm": 0.04003104567527771, "learning_rate": 3.972918315308186e-06, "loss": 0.0003, "step": 15326 }, { "epoch": 4.043266059886559, "grad_norm": 0.003993747755885124, "learning_rate": 3.972566605117383e-06, "loss": 0.0002, "step": 15328 }, { "epoch": 4.043793694763224, "grad_norm": 0.3846365511417389, "learning_rate": 3.97221489492658e-06, "loss": 0.0052, "step": 15330 }, { "epoch": 4.044321329639889, "grad_norm": 0.006820272654294968, "learning_rate": 3.971863184735778e-06, "loss": 0.0002, "step": 15332 }, { "epoch": 4.044848964516555, "grad_norm": 0.5377770066261292, "learning_rate": 3.971511474544975e-06, "loss": 0.0036, "step": 15334 }, { "epoch": 4.04537659939322, "grad_norm": 0.026741595938801765, "learning_rate": 3.971159764354172e-06, "loss": 0.0003, "step": 15336 }, { "epoch": 4.045904234269885, "grad_norm": 0.0073424153961241245, "learning_rate": 3.97080805416337e-06, "loss": 0.007, "step": 15338 }, { "epoch": 4.04643186914655, "grad_norm": 0.011309729889035225, "learning_rate": 3.970456343972567e-06, "loss": 0.0003, "step": 15340 }, { "epoch": 4.046959504023216, "grad_norm": 0.018407827243208885, "learning_rate": 3.9701046337817645e-06, "loss": 0.0004, "step": 15342 }, { "epoch": 4.047487138899881, "grad_norm": 0.0059195514768362045, "learning_rate": 3.969752923590961e-06, "loss": 0.0006, "step": 15344 }, { "epoch": 4.048014773776547, "grad_norm": 0.18853244185447693, "learning_rate": 3.969401213400158e-06, "loss": 0.0008, "step": 15346 }, { "epoch": 4.048542408653212, "grad_norm": 0.00864496547728777, "learning_rate": 3.969049503209355e-06, "loss": 0.0002, "step": 15348 }, { "epoch": 4.049070043529877, "grad_norm": 0.024633536115288734, "learning_rate": 3.968697793018553e-06, "loss": 0.0001, "step": 15350 }, { "epoch": 4.049597678406543, "grad_norm": 0.3052123785018921, "learning_rate": 3.96834608282775e-06, "loss": 0.0011, "step": 15352 }, { "epoch": 4.050125313283208, "grad_norm": 0.04525567963719368, "learning_rate": 3.967994372636947e-06, "loss": 0.0004, "step": 15354 }, { "epoch": 4.050652948159874, "grad_norm": 0.33581268787384033, "learning_rate": 3.967642662446145e-06, "loss": 0.0095, "step": 15356 }, { "epoch": 4.051180583036539, "grad_norm": 0.0026027564890682697, "learning_rate": 3.967290952255342e-06, "loss": 0.0005, "step": 15358 }, { "epoch": 4.0517082179132045, "grad_norm": 0.020427564159035683, "learning_rate": 3.966939242064539e-06, "loss": 0.0003, "step": 15360 }, { "epoch": 4.052235852789869, "grad_norm": 0.013785741291940212, "learning_rate": 3.9665875318737365e-06, "loss": 0.0019, "step": 15362 }, { "epoch": 4.052763487666534, "grad_norm": 0.41192626953125, "learning_rate": 3.9662358216829335e-06, "loss": 0.0028, "step": 15364 }, { "epoch": 4.0532911225432, "grad_norm": 0.015820948407053947, "learning_rate": 3.9658841114921305e-06, "loss": 0.0003, "step": 15366 }, { "epoch": 4.053818757419865, "grad_norm": 0.008294695988297462, "learning_rate": 3.965532401301327e-06, "loss": 0.0003, "step": 15368 }, { "epoch": 4.054346392296531, "grad_norm": 0.00224445853382349, "learning_rate": 3.965180691110525e-06, "loss": 0.0002, "step": 15370 }, { "epoch": 4.054874027173196, "grad_norm": 0.14870820939540863, "learning_rate": 3.964828980919722e-06, "loss": 0.0005, "step": 15372 }, { "epoch": 4.055401662049862, "grad_norm": 0.27567291259765625, "learning_rate": 3.964477270728919e-06, "loss": 0.002, "step": 15374 }, { "epoch": 4.055929296926527, "grad_norm": 0.00837215781211853, "learning_rate": 3.964125560538117e-06, "loss": 0.0031, "step": 15376 }, { "epoch": 4.056456931803192, "grad_norm": 0.0030925918836146593, "learning_rate": 3.963773850347314e-06, "loss": 0.0029, "step": 15378 }, { "epoch": 4.056984566679858, "grad_norm": 0.018355300650000572, "learning_rate": 3.963422140156512e-06, "loss": 0.0002, "step": 15380 }, { "epoch": 4.057512201556523, "grad_norm": 0.15034040808677673, "learning_rate": 3.963070429965708e-06, "loss": 0.0004, "step": 15382 }, { "epoch": 4.058039836433188, "grad_norm": 0.03295695409178734, "learning_rate": 3.9627187197749056e-06, "loss": 0.0005, "step": 15384 }, { "epoch": 4.058567471309853, "grad_norm": 0.38620835542678833, "learning_rate": 3.9623670095841025e-06, "loss": 0.0041, "step": 15386 }, { "epoch": 4.059095106186519, "grad_norm": 0.006464049220085144, "learning_rate": 3.9620152993933e-06, "loss": 0.0002, "step": 15388 }, { "epoch": 4.059622741063184, "grad_norm": 0.008128787390887737, "learning_rate": 3.961663589202497e-06, "loss": 0.0002, "step": 15390 }, { "epoch": 4.06015037593985, "grad_norm": 0.004861445631831884, "learning_rate": 3.961311879011694e-06, "loss": 0.0003, "step": 15392 }, { "epoch": 4.060678010816515, "grad_norm": 0.006231409497559071, "learning_rate": 3.960960168820892e-06, "loss": 0.0002, "step": 15394 }, { "epoch": 4.06120564569318, "grad_norm": 0.010307754389941692, "learning_rate": 3.960608458630089e-06, "loss": 0.0002, "step": 15396 }, { "epoch": 4.061733280569846, "grad_norm": 0.024421514943242073, "learning_rate": 3.960256748439286e-06, "loss": 0.0005, "step": 15398 }, { "epoch": 4.062260915446511, "grad_norm": 0.012169434688985348, "learning_rate": 3.959905038248484e-06, "loss": 0.0003, "step": 15400 }, { "epoch": 4.062788550323177, "grad_norm": 0.0693393424153328, "learning_rate": 3.959553328057681e-06, "loss": 0.0037, "step": 15402 }, { "epoch": 4.063316185199842, "grad_norm": 0.057035379111766815, "learning_rate": 3.959201617866878e-06, "loss": 0.0006, "step": 15404 }, { "epoch": 4.0638438200765075, "grad_norm": 0.45989975333213806, "learning_rate": 3.958849907676075e-06, "loss": 0.0026, "step": 15406 }, { "epoch": 4.064371454953172, "grad_norm": 0.18429851531982422, "learning_rate": 3.958498197485272e-06, "loss": 0.0009, "step": 15408 }, { "epoch": 4.064899089829837, "grad_norm": 0.020819595083594322, "learning_rate": 3.958146487294469e-06, "loss": 0.0002, "step": 15410 }, { "epoch": 4.065426724706503, "grad_norm": 0.02938694879412651, "learning_rate": 3.957794777103666e-06, "loss": 0.0004, "step": 15412 }, { "epoch": 4.065954359583168, "grad_norm": 0.02934827283024788, "learning_rate": 3.957443066912864e-06, "loss": 0.0003, "step": 15414 }, { "epoch": 4.066481994459834, "grad_norm": 0.022258944809436798, "learning_rate": 3.957091356722061e-06, "loss": 0.0002, "step": 15416 }, { "epoch": 4.067009629336499, "grad_norm": 0.018700553104281425, "learning_rate": 3.956739646531259e-06, "loss": 0.0004, "step": 15418 }, { "epoch": 4.067537264213165, "grad_norm": 0.020135361701250076, "learning_rate": 3.956387936340456e-06, "loss": 0.0002, "step": 15420 }, { "epoch": 4.06806489908983, "grad_norm": 0.07464530318975449, "learning_rate": 3.956036226149653e-06, "loss": 0.0038, "step": 15422 }, { "epoch": 4.068592533966495, "grad_norm": 0.005188970826566219, "learning_rate": 3.9556845159588505e-06, "loss": 0.0002, "step": 15424 }, { "epoch": 4.069120168843161, "grad_norm": 0.09630998224020004, "learning_rate": 3.955332805768047e-06, "loss": 0.0005, "step": 15426 }, { "epoch": 4.069647803719826, "grad_norm": 0.01302378997206688, "learning_rate": 3.9549810955772444e-06, "loss": 0.0002, "step": 15428 }, { "epoch": 4.0701754385964914, "grad_norm": 0.0037148825358599424, "learning_rate": 3.954629385386441e-06, "loss": 0.0002, "step": 15430 }, { "epoch": 4.070703073473156, "grad_norm": 0.3078361749649048, "learning_rate": 3.954277675195639e-06, "loss": 0.0053, "step": 15432 }, { "epoch": 4.071230708349822, "grad_norm": 0.0958610400557518, "learning_rate": 3.953925965004836e-06, "loss": 0.0077, "step": 15434 }, { "epoch": 4.071758343226487, "grad_norm": 0.004370071925222874, "learning_rate": 3.953574254814033e-06, "loss": 0.0002, "step": 15436 }, { "epoch": 4.072285978103152, "grad_norm": 0.024161310866475105, "learning_rate": 3.953222544623231e-06, "loss": 0.0009, "step": 15438 }, { "epoch": 4.072813612979818, "grad_norm": 0.00771679449826479, "learning_rate": 3.952870834432428e-06, "loss": 0.0002, "step": 15440 }, { "epoch": 4.073341247856483, "grad_norm": 0.38160836696624756, "learning_rate": 3.952519124241625e-06, "loss": 0.0025, "step": 15442 }, { "epoch": 4.073868882733149, "grad_norm": 0.012460034340620041, "learning_rate": 3.952167414050822e-06, "loss": 0.0002, "step": 15444 }, { "epoch": 4.074396517609814, "grad_norm": 0.026335321366786957, "learning_rate": 3.9518157038600196e-06, "loss": 0.0065, "step": 15446 }, { "epoch": 4.07492415248648, "grad_norm": 0.026486946269869804, "learning_rate": 3.951463993669217e-06, "loss": 0.0005, "step": 15448 }, { "epoch": 4.075451787363145, "grad_norm": 0.004141529090702534, "learning_rate": 3.9511122834784135e-06, "loss": 0.0002, "step": 15450 }, { "epoch": 4.0759794222398105, "grad_norm": 0.004393373150378466, "learning_rate": 3.950760573287611e-06, "loss": 0.0003, "step": 15452 }, { "epoch": 4.076507057116475, "grad_norm": 0.012238495983183384, "learning_rate": 3.950408863096808e-06, "loss": 0.0004, "step": 15454 }, { "epoch": 4.07703469199314, "grad_norm": 0.013161375187337399, "learning_rate": 3.950057152906005e-06, "loss": 0.0023, "step": 15456 }, { "epoch": 4.077562326869806, "grad_norm": 0.008802817203104496, "learning_rate": 3.949705442715203e-06, "loss": 0.0002, "step": 15458 }, { "epoch": 4.078089961746471, "grad_norm": 0.010682600550353527, "learning_rate": 3.9493537325244e-06, "loss": 0.0003, "step": 15460 }, { "epoch": 4.078617596623137, "grad_norm": 0.02826700545847416, "learning_rate": 3.949002022333598e-06, "loss": 0.0006, "step": 15462 }, { "epoch": 4.079145231499802, "grad_norm": 0.011657754890620708, "learning_rate": 3.948650312142794e-06, "loss": 0.0003, "step": 15464 }, { "epoch": 4.079672866376468, "grad_norm": 0.007533933036029339, "learning_rate": 3.948298601951992e-06, "loss": 0.0002, "step": 15466 }, { "epoch": 4.080200501253133, "grad_norm": 0.040239397436380386, "learning_rate": 3.9479468917611886e-06, "loss": 0.0003, "step": 15468 }, { "epoch": 4.080728136129798, "grad_norm": 0.023081734776496887, "learning_rate": 3.947595181570386e-06, "loss": 0.0002, "step": 15470 }, { "epoch": 4.081255771006464, "grad_norm": 0.004903650376945734, "learning_rate": 3.947243471379583e-06, "loss": 0.0002, "step": 15472 }, { "epoch": 4.081783405883129, "grad_norm": 0.00617413641884923, "learning_rate": 3.94689176118878e-06, "loss": 0.0002, "step": 15474 }, { "epoch": 4.0823110407597945, "grad_norm": 0.024207528680562973, "learning_rate": 3.946540050997978e-06, "loss": 0.0002, "step": 15476 }, { "epoch": 4.082838675636459, "grad_norm": 0.00778456823900342, "learning_rate": 3.946188340807175e-06, "loss": 0.0002, "step": 15478 }, { "epoch": 4.083366310513125, "grad_norm": 0.1029585599899292, "learning_rate": 3.945836630616372e-06, "loss": 0.0004, "step": 15480 }, { "epoch": 4.08389394538979, "grad_norm": 0.11750862747430801, "learning_rate": 3.94548492042557e-06, "loss": 0.0054, "step": 15482 }, { "epoch": 4.084421580266455, "grad_norm": 0.1906864494085312, "learning_rate": 3.945133210234767e-06, "loss": 0.0022, "step": 15484 }, { "epoch": 4.084949215143121, "grad_norm": 0.01719299517571926, "learning_rate": 3.9447815000439645e-06, "loss": 0.0016, "step": 15486 }, { "epoch": 4.085476850019786, "grad_norm": 0.014322016388177872, "learning_rate": 3.944429789853161e-06, "loss": 0.0002, "step": 15488 }, { "epoch": 4.086004484896452, "grad_norm": 0.010248595848679543, "learning_rate": 3.9440780796623584e-06, "loss": 0.0002, "step": 15490 }, { "epoch": 4.086532119773117, "grad_norm": 0.02735552005469799, "learning_rate": 3.943726369471555e-06, "loss": 0.0003, "step": 15492 }, { "epoch": 4.087059754649783, "grad_norm": 0.03886459022760391, "learning_rate": 3.943374659280752e-06, "loss": 0.0003, "step": 15494 }, { "epoch": 4.087587389526448, "grad_norm": 0.7633746862411499, "learning_rate": 3.94302294908995e-06, "loss": 0.0048, "step": 15496 }, { "epoch": 4.0881150244031135, "grad_norm": 0.03226756677031517, "learning_rate": 3.942671238899147e-06, "loss": 0.0019, "step": 15498 }, { "epoch": 4.088642659279778, "grad_norm": 0.037747301161289215, "learning_rate": 3.942319528708345e-06, "loss": 0.0002, "step": 15500 }, { "epoch": 4.089170294156443, "grad_norm": 0.00825591292232275, "learning_rate": 3.941967818517541e-06, "loss": 0.0002, "step": 15502 }, { "epoch": 4.089697929033109, "grad_norm": 0.060895152390003204, "learning_rate": 3.941616108326739e-06, "loss": 0.0006, "step": 15504 }, { "epoch": 4.090225563909774, "grad_norm": 0.0033624533098191023, "learning_rate": 3.941264398135937e-06, "loss": 0.0002, "step": 15506 }, { "epoch": 4.09075319878644, "grad_norm": 0.02146024815738201, "learning_rate": 3.9409126879451336e-06, "loss": 0.0002, "step": 15508 }, { "epoch": 4.091280833663105, "grad_norm": 0.06298971176147461, "learning_rate": 3.9405609777543305e-06, "loss": 0.0003, "step": 15510 }, { "epoch": 4.091808468539771, "grad_norm": 0.0035763175692409277, "learning_rate": 3.9402092675635275e-06, "loss": 0.0007, "step": 15512 }, { "epoch": 4.092336103416436, "grad_norm": 0.0077286288142204285, "learning_rate": 3.939857557372725e-06, "loss": 0.0027, "step": 15514 }, { "epoch": 4.092863738293101, "grad_norm": 0.028719265013933182, "learning_rate": 3.939505847181922e-06, "loss": 0.0002, "step": 15516 }, { "epoch": 4.093391373169767, "grad_norm": 0.005895020440220833, "learning_rate": 3.939154136991119e-06, "loss": 0.0016, "step": 15518 }, { "epoch": 4.093919008046432, "grad_norm": 0.12678171694278717, "learning_rate": 3.938802426800317e-06, "loss": 0.0133, "step": 15520 }, { "epoch": 4.0944466429230975, "grad_norm": 0.018718551844358444, "learning_rate": 3.938450716609514e-06, "loss": 0.0002, "step": 15522 }, { "epoch": 4.094974277799762, "grad_norm": 0.030055711045861244, "learning_rate": 3.938099006418711e-06, "loss": 0.0002, "step": 15524 }, { "epoch": 4.095501912676428, "grad_norm": 0.348297655582428, "learning_rate": 3.937747296227908e-06, "loss": 0.0012, "step": 15526 }, { "epoch": 4.096029547553093, "grad_norm": 0.0025499670300632715, "learning_rate": 3.937395586037106e-06, "loss": 0.0002, "step": 15528 }, { "epoch": 4.096557182429758, "grad_norm": 0.03209163248538971, "learning_rate": 3.9370438758463026e-06, "loss": 0.0002, "step": 15530 }, { "epoch": 4.097084817306424, "grad_norm": 0.2395229935646057, "learning_rate": 3.9366921656554995e-06, "loss": 0.0034, "step": 15532 }, { "epoch": 4.097612452183089, "grad_norm": 0.01093254890292883, "learning_rate": 3.936340455464697e-06, "loss": 0.0002, "step": 15534 }, { "epoch": 4.098140087059755, "grad_norm": 0.08113788068294525, "learning_rate": 3.935988745273894e-06, "loss": 0.0003, "step": 15536 }, { "epoch": 4.09866772193642, "grad_norm": 0.010915415361523628, "learning_rate": 3.935637035083092e-06, "loss": 0.0002, "step": 15538 }, { "epoch": 4.099195356813086, "grad_norm": 0.006467905361205339, "learning_rate": 3.935285324892289e-06, "loss": 0.0002, "step": 15540 }, { "epoch": 4.099722991689751, "grad_norm": 0.027480844408273697, "learning_rate": 3.934933614701486e-06, "loss": 0.0003, "step": 15542 }, { "epoch": 4.100250626566416, "grad_norm": 0.004307540133595467, "learning_rate": 3.934581904510684e-06, "loss": 0.0019, "step": 15544 }, { "epoch": 4.100778261443081, "grad_norm": 0.0104050412774086, "learning_rate": 3.934230194319881e-06, "loss": 0.0002, "step": 15546 }, { "epoch": 4.101305896319746, "grad_norm": 0.017733369022607803, "learning_rate": 3.933878484129078e-06, "loss": 0.0003, "step": 15548 }, { "epoch": 4.101833531196412, "grad_norm": 0.004308492410928011, "learning_rate": 3.933526773938275e-06, "loss": 0.0002, "step": 15550 }, { "epoch": 4.102361166073077, "grad_norm": 0.1813887655735016, "learning_rate": 3.9331750637474724e-06, "loss": 0.0053, "step": 15552 }, { "epoch": 4.102888800949743, "grad_norm": 0.005531326401978731, "learning_rate": 3.932823353556669e-06, "loss": 0.0002, "step": 15554 }, { "epoch": 4.103416435826408, "grad_norm": 0.14717784523963928, "learning_rate": 3.932471643365866e-06, "loss": 0.0009, "step": 15556 }, { "epoch": 4.103944070703074, "grad_norm": 0.3519531488418579, "learning_rate": 3.932119933175064e-06, "loss": 0.0058, "step": 15558 }, { "epoch": 4.104471705579739, "grad_norm": 0.058251459151506424, "learning_rate": 3.931768222984261e-06, "loss": 0.0004, "step": 15560 }, { "epoch": 4.104999340456404, "grad_norm": 0.019590139389038086, "learning_rate": 3.931416512793458e-06, "loss": 0.0002, "step": 15562 }, { "epoch": 4.10552697533307, "grad_norm": 0.01758052594959736, "learning_rate": 3.931064802602656e-06, "loss": 0.0004, "step": 15564 }, { "epoch": 4.106054610209735, "grad_norm": 0.15009233355522156, "learning_rate": 3.930713092411853e-06, "loss": 0.0008, "step": 15566 }, { "epoch": 4.1065822450864005, "grad_norm": 0.019903527572751045, "learning_rate": 3.930361382221051e-06, "loss": 0.0006, "step": 15568 }, { "epoch": 4.107109879963065, "grad_norm": 0.025516560301184654, "learning_rate": 3.930009672030247e-06, "loss": 0.0004, "step": 15570 }, { "epoch": 4.107637514839731, "grad_norm": 0.06583783775568008, "learning_rate": 3.9296579618394445e-06, "loss": 0.0067, "step": 15572 }, { "epoch": 4.108165149716396, "grad_norm": 0.1482706516981125, "learning_rate": 3.9293062516486415e-06, "loss": 0.0004, "step": 15574 }, { "epoch": 4.108692784593061, "grad_norm": 0.2263784259557724, "learning_rate": 3.928954541457839e-06, "loss": 0.0078, "step": 15576 }, { "epoch": 4.109220419469727, "grad_norm": 0.09982461482286453, "learning_rate": 3.928602831267036e-06, "loss": 0.0028, "step": 15578 }, { "epoch": 4.109748054346392, "grad_norm": 0.021082134917378426, "learning_rate": 3.928251121076233e-06, "loss": 0.0004, "step": 15580 }, { "epoch": 4.110275689223058, "grad_norm": 0.012086574919521809, "learning_rate": 3.927899410885431e-06, "loss": 0.0006, "step": 15582 }, { "epoch": 4.110803324099723, "grad_norm": 0.7349265813827515, "learning_rate": 3.927547700694627e-06, "loss": 0.0038, "step": 15584 }, { "epoch": 4.111330958976389, "grad_norm": 0.010905547067523003, "learning_rate": 3.927195990503825e-06, "loss": 0.0002, "step": 15586 }, { "epoch": 4.111858593853054, "grad_norm": 0.012657777406275272, "learning_rate": 3.926844280313022e-06, "loss": 0.0002, "step": 15588 }, { "epoch": 4.112386228729719, "grad_norm": 0.002363832201808691, "learning_rate": 3.92649257012222e-06, "loss": 0.0001, "step": 15590 }, { "epoch": 4.1129138636063844, "grad_norm": 0.2490846812725067, "learning_rate": 3.9261408599314166e-06, "loss": 0.0006, "step": 15592 }, { "epoch": 4.113441498483049, "grad_norm": 0.1569577008485794, "learning_rate": 3.9257891497406135e-06, "loss": 0.0006, "step": 15594 }, { "epoch": 4.113969133359715, "grad_norm": 0.005574304144829512, "learning_rate": 3.925437439549811e-06, "loss": 0.003, "step": 15596 }, { "epoch": 4.11449676823638, "grad_norm": 0.012389657087624073, "learning_rate": 3.925085729359008e-06, "loss": 0.0003, "step": 15598 }, { "epoch": 4.115024403113046, "grad_norm": 0.008528182283043861, "learning_rate": 3.924734019168205e-06, "loss": 0.0002, "step": 15600 }, { "epoch": 4.115552037989711, "grad_norm": 0.515644371509552, "learning_rate": 3.924382308977403e-06, "loss": 0.0016, "step": 15602 }, { "epoch": 4.116079672866377, "grad_norm": 0.01247413456439972, "learning_rate": 3.9240305987866e-06, "loss": 0.0002, "step": 15604 }, { "epoch": 4.116607307743042, "grad_norm": 0.029566597193479538, "learning_rate": 3.923678888595798e-06, "loss": 0.0002, "step": 15606 }, { "epoch": 4.117134942619707, "grad_norm": 0.26074689626693726, "learning_rate": 3.923327178404994e-06, "loss": 0.0031, "step": 15608 }, { "epoch": 4.117662577496373, "grad_norm": 0.23792573809623718, "learning_rate": 3.922975468214192e-06, "loss": 0.003, "step": 15610 }, { "epoch": 4.118190212373038, "grad_norm": 0.0013802830362692475, "learning_rate": 3.922623758023389e-06, "loss": 0.0001, "step": 15612 }, { "epoch": 4.1187178472497035, "grad_norm": 0.11025530844926834, "learning_rate": 3.922272047832586e-06, "loss": 0.0004, "step": 15614 }, { "epoch": 4.119245482126368, "grad_norm": 0.0028570923022925854, "learning_rate": 3.921920337641783e-06, "loss": 0.0012, "step": 15616 }, { "epoch": 4.119773117003034, "grad_norm": 0.004406794905662537, "learning_rate": 3.92156862745098e-06, "loss": 0.0004, "step": 15618 }, { "epoch": 4.120300751879699, "grad_norm": 0.09446415305137634, "learning_rate": 3.921216917260178e-06, "loss": 0.0005, "step": 15620 }, { "epoch": 4.120828386756364, "grad_norm": 0.08245299756526947, "learning_rate": 3.920865207069375e-06, "loss": 0.0008, "step": 15622 }, { "epoch": 4.12135602163303, "grad_norm": 0.007208555471152067, "learning_rate": 3.920513496878572e-06, "loss": 0.0002, "step": 15624 }, { "epoch": 4.121883656509695, "grad_norm": 0.008477705530822277, "learning_rate": 3.92016178668777e-06, "loss": 0.0002, "step": 15626 }, { "epoch": 4.122411291386361, "grad_norm": 0.2826586961746216, "learning_rate": 3.919810076496967e-06, "loss": 0.0011, "step": 15628 }, { "epoch": 4.122938926263026, "grad_norm": 0.028301650658249855, "learning_rate": 3.919458366306164e-06, "loss": 0.0002, "step": 15630 }, { "epoch": 4.123466561139692, "grad_norm": 0.048094164580106735, "learning_rate": 3.919106656115361e-06, "loss": 0.0025, "step": 15632 }, { "epoch": 4.123994196016357, "grad_norm": 0.0029078717343509197, "learning_rate": 3.9187549459245585e-06, "loss": 0.0009, "step": 15634 }, { "epoch": 4.124521830893022, "grad_norm": 0.21768319606781006, "learning_rate": 3.9184032357337554e-06, "loss": 0.0007, "step": 15636 }, { "epoch": 4.1250494657696875, "grad_norm": 0.027309982106089592, "learning_rate": 3.918051525542952e-06, "loss": 0.0003, "step": 15638 }, { "epoch": 4.125577100646352, "grad_norm": 0.30875417590141296, "learning_rate": 3.91769981535215e-06, "loss": 0.0034, "step": 15640 }, { "epoch": 4.126104735523018, "grad_norm": 0.016124391928315163, "learning_rate": 3.917348105161347e-06, "loss": 0.0001, "step": 15642 }, { "epoch": 4.126632370399683, "grad_norm": 0.011941306293010712, "learning_rate": 3.916996394970544e-06, "loss": 0.0003, "step": 15644 }, { "epoch": 4.127160005276349, "grad_norm": 0.00517427921295166, "learning_rate": 3.916644684779741e-06, "loss": 0.0002, "step": 15646 }, { "epoch": 4.127687640153014, "grad_norm": 2.2898313999176025, "learning_rate": 3.916292974588939e-06, "loss": 0.0019, "step": 15648 }, { "epoch": 4.12821527502968, "grad_norm": 0.026593470945954323, "learning_rate": 3.915941264398137e-06, "loss": 0.0008, "step": 15650 }, { "epoch": 4.128742909906345, "grad_norm": 0.041098836809396744, "learning_rate": 3.915589554207333e-06, "loss": 0.0004, "step": 15652 }, { "epoch": 4.12927054478301, "grad_norm": 0.19714227318763733, "learning_rate": 3.9152378440165306e-06, "loss": 0.0029, "step": 15654 }, { "epoch": 4.129798179659676, "grad_norm": 0.0030788045842200518, "learning_rate": 3.9148861338257275e-06, "loss": 0.0052, "step": 15656 }, { "epoch": 4.130325814536341, "grad_norm": 0.02514926716685295, "learning_rate": 3.914534423634925e-06, "loss": 0.0002, "step": 15658 }, { "epoch": 4.1308534494130065, "grad_norm": 0.017156153917312622, "learning_rate": 3.914182713444122e-06, "loss": 0.0002, "step": 15660 }, { "epoch": 4.131381084289671, "grad_norm": 0.007464898284524679, "learning_rate": 3.913831003253319e-06, "loss": 0.0002, "step": 15662 }, { "epoch": 4.131908719166337, "grad_norm": 0.06517993658781052, "learning_rate": 3.913479293062517e-06, "loss": 0.0002, "step": 15664 }, { "epoch": 4.132436354043002, "grad_norm": 0.01385527290403843, "learning_rate": 3.913127582871714e-06, "loss": 0.0002, "step": 15666 }, { "epoch": 4.132963988919667, "grad_norm": 0.02004314959049225, "learning_rate": 3.912775872680911e-06, "loss": 0.0002, "step": 15668 }, { "epoch": 4.133491623796333, "grad_norm": 0.017873143777251244, "learning_rate": 3.912424162490108e-06, "loss": 0.0125, "step": 15670 }, { "epoch": 4.134019258672998, "grad_norm": 0.08690665662288666, "learning_rate": 3.912072452299306e-06, "loss": 0.001, "step": 15672 }, { "epoch": 4.134546893549664, "grad_norm": 0.01221415400505066, "learning_rate": 3.911720742108503e-06, "loss": 0.0002, "step": 15674 }, { "epoch": 4.135074528426329, "grad_norm": 0.0015226347604766488, "learning_rate": 3.9113690319176996e-06, "loss": 0.0001, "step": 15676 }, { "epoch": 4.135602163302995, "grad_norm": 0.009303956292569637, "learning_rate": 3.911017321726897e-06, "loss": 0.0002, "step": 15678 }, { "epoch": 4.13612979817966, "grad_norm": 0.4299522936344147, "learning_rate": 3.910665611536094e-06, "loss": 0.0022, "step": 15680 }, { "epoch": 4.136657433056325, "grad_norm": 0.00508892722427845, "learning_rate": 3.910313901345291e-06, "loss": 0.0002, "step": 15682 }, { "epoch": 4.1371850679329905, "grad_norm": 0.03066755272448063, "learning_rate": 3.909962191154489e-06, "loss": 0.0012, "step": 15684 }, { "epoch": 4.137712702809655, "grad_norm": 0.22705979645252228, "learning_rate": 3.909610480963686e-06, "loss": 0.003, "step": 15686 }, { "epoch": 4.138240337686321, "grad_norm": 0.0065106614492833614, "learning_rate": 3.909258770772884e-06, "loss": 0.0002, "step": 15688 }, { "epoch": 4.138767972562986, "grad_norm": 0.38242948055267334, "learning_rate": 3.90890706058208e-06, "loss": 0.0037, "step": 15690 }, { "epoch": 4.139295607439652, "grad_norm": 0.0034251294564455748, "learning_rate": 3.908555350391278e-06, "loss": 0.002, "step": 15692 }, { "epoch": 4.139823242316317, "grad_norm": 0.0313730388879776, "learning_rate": 3.908203640200475e-06, "loss": 0.0008, "step": 15694 }, { "epoch": 4.140350877192983, "grad_norm": 0.056440286338329315, "learning_rate": 3.9078519300096725e-06, "loss": 0.0081, "step": 15696 }, { "epoch": 4.140878512069648, "grad_norm": 0.10263822227716446, "learning_rate": 3.9075002198188694e-06, "loss": 0.0015, "step": 15698 }, { "epoch": 4.141406146946313, "grad_norm": 0.0500640869140625, "learning_rate": 3.907148509628066e-06, "loss": 0.0005, "step": 15700 }, { "epoch": 4.141933781822979, "grad_norm": 0.03181935474276543, "learning_rate": 3.906796799437264e-06, "loss": 0.0002, "step": 15702 }, { "epoch": 4.142461416699644, "grad_norm": 0.003948109690099955, "learning_rate": 3.90644508924646e-06, "loss": 0.0002, "step": 15704 }, { "epoch": 4.1429890515763095, "grad_norm": 0.27411794662475586, "learning_rate": 3.906093379055658e-06, "loss": 0.0016, "step": 15706 }, { "epoch": 4.143516686452974, "grad_norm": 0.08508067578077316, "learning_rate": 3.905741668864856e-06, "loss": 0.001, "step": 15708 }, { "epoch": 4.14404432132964, "grad_norm": 0.05807200446724892, "learning_rate": 3.905389958674053e-06, "loss": 0.0011, "step": 15710 }, { "epoch": 4.144571956206305, "grad_norm": 0.127189040184021, "learning_rate": 3.90503824848325e-06, "loss": 0.0025, "step": 15712 }, { "epoch": 4.14509959108297, "grad_norm": 0.34397879242897034, "learning_rate": 3.904686538292447e-06, "loss": 0.0013, "step": 15714 }, { "epoch": 4.145627225959636, "grad_norm": 0.12917637825012207, "learning_rate": 3.9043348281016445e-06, "loss": 0.0003, "step": 15716 }, { "epoch": 4.146154860836301, "grad_norm": 0.17272691428661346, "learning_rate": 3.9039831179108415e-06, "loss": 0.0004, "step": 15718 }, { "epoch": 4.146682495712967, "grad_norm": 0.005317548289895058, "learning_rate": 3.9036314077200385e-06, "loss": 0.0002, "step": 15720 }, { "epoch": 4.147210130589632, "grad_norm": 0.5281513929367065, "learning_rate": 3.903279697529236e-06, "loss": 0.0018, "step": 15722 }, { "epoch": 4.147737765466298, "grad_norm": 0.24910904467105865, "learning_rate": 3.902927987338433e-06, "loss": 0.0061, "step": 15724 }, { "epoch": 4.148265400342963, "grad_norm": 0.004259215667843819, "learning_rate": 3.902576277147631e-06, "loss": 0.0002, "step": 15726 }, { "epoch": 4.148793035219628, "grad_norm": 0.09509892761707306, "learning_rate": 3.902224566956827e-06, "loss": 0.0024, "step": 15728 }, { "epoch": 4.1493206700962935, "grad_norm": 0.00369401928037405, "learning_rate": 3.901872856766025e-06, "loss": 0.0001, "step": 15730 }, { "epoch": 4.149848304972958, "grad_norm": 0.008859425783157349, "learning_rate": 3.901521146575223e-06, "loss": 0.0002, "step": 15732 }, { "epoch": 4.150375939849624, "grad_norm": 0.6659661531448364, "learning_rate": 3.901169436384419e-06, "loss": 0.0042, "step": 15734 }, { "epoch": 4.150903574726289, "grad_norm": 0.014711985364556313, "learning_rate": 3.900817726193617e-06, "loss": 0.0002, "step": 15736 }, { "epoch": 4.151431209602955, "grad_norm": 0.21790711581707, "learning_rate": 3.9004660160028136e-06, "loss": 0.0017, "step": 15738 }, { "epoch": 4.15195884447962, "grad_norm": 0.052420876920223236, "learning_rate": 3.900114305812011e-06, "loss": 0.0003, "step": 15740 }, { "epoch": 4.152486479356286, "grad_norm": 0.6586199402809143, "learning_rate": 3.899762595621208e-06, "loss": 0.0025, "step": 15742 }, { "epoch": 4.153014114232951, "grad_norm": 0.0044820052571594715, "learning_rate": 3.899410885430405e-06, "loss": 0.0001, "step": 15744 }, { "epoch": 4.153541749109616, "grad_norm": 0.004512218292802572, "learning_rate": 3.899059175239603e-06, "loss": 0.0003, "step": 15746 }, { "epoch": 4.154069383986282, "grad_norm": 0.005490967538207769, "learning_rate": 3.8987074650488e-06, "loss": 0.0002, "step": 15748 }, { "epoch": 4.154597018862947, "grad_norm": 0.6512535810470581, "learning_rate": 3.898355754857997e-06, "loss": 0.0012, "step": 15750 }, { "epoch": 4.1551246537396125, "grad_norm": 0.11506465077400208, "learning_rate": 3.898004044667194e-06, "loss": 0.0053, "step": 15752 }, { "epoch": 4.1556522886162774, "grad_norm": 0.014362015761435032, "learning_rate": 3.897652334476392e-06, "loss": 0.0062, "step": 15754 }, { "epoch": 4.156179923492943, "grad_norm": 0.013079086318612099, "learning_rate": 3.897300624285589e-06, "loss": 0.0009, "step": 15756 }, { "epoch": 4.156707558369608, "grad_norm": 0.0025338453706353903, "learning_rate": 3.896948914094786e-06, "loss": 0.0001, "step": 15758 }, { "epoch": 4.157235193246273, "grad_norm": 0.020285990089178085, "learning_rate": 3.8965972039039834e-06, "loss": 0.0002, "step": 15760 }, { "epoch": 4.157762828122939, "grad_norm": 0.006269791163504124, "learning_rate": 3.89624549371318e-06, "loss": 0.0012, "step": 15762 }, { "epoch": 4.158290462999604, "grad_norm": 0.011798005551099777, "learning_rate": 3.895893783522377e-06, "loss": 0.0034, "step": 15764 }, { "epoch": 4.15881809787627, "grad_norm": 0.019370798021554947, "learning_rate": 3.895542073331575e-06, "loss": 0.0002, "step": 15766 }, { "epoch": 4.159345732752935, "grad_norm": 0.09754941612482071, "learning_rate": 3.895190363140772e-06, "loss": 0.0006, "step": 15768 }, { "epoch": 4.159873367629601, "grad_norm": 0.044030070304870605, "learning_rate": 3.89483865294997e-06, "loss": 0.0004, "step": 15770 }, { "epoch": 4.160401002506266, "grad_norm": 0.0875082015991211, "learning_rate": 3.894486942759166e-06, "loss": 0.0004, "step": 15772 }, { "epoch": 4.160928637382931, "grad_norm": 0.007185713853687048, "learning_rate": 3.894135232568364e-06, "loss": 0.0002, "step": 15774 }, { "epoch": 4.1614562722595965, "grad_norm": 0.04388301447033882, "learning_rate": 3.893783522377561e-06, "loss": 0.008, "step": 15776 }, { "epoch": 4.161983907136261, "grad_norm": 0.14609791338443756, "learning_rate": 3.8934318121867585e-06, "loss": 0.0014, "step": 15778 }, { "epoch": 4.162511542012927, "grad_norm": 0.3262775242328644, "learning_rate": 3.8930801019959555e-06, "loss": 0.0003, "step": 15780 }, { "epoch": 4.163039176889592, "grad_norm": 0.008757144212722778, "learning_rate": 3.8927283918051524e-06, "loss": 0.0002, "step": 15782 }, { "epoch": 4.163566811766258, "grad_norm": 0.005513541866093874, "learning_rate": 3.89237668161435e-06, "loss": 0.0006, "step": 15784 }, { "epoch": 4.164094446642923, "grad_norm": 0.00509052537381649, "learning_rate": 3.892024971423547e-06, "loss": 0.0013, "step": 15786 }, { "epoch": 4.164622081519588, "grad_norm": 0.0023344536311924458, "learning_rate": 3.891673261232744e-06, "loss": 0.0001, "step": 15788 }, { "epoch": 4.165149716396254, "grad_norm": 0.015231593511998653, "learning_rate": 3.891321551041942e-06, "loss": 0.0002, "step": 15790 }, { "epoch": 4.165677351272919, "grad_norm": 0.0036803018301725388, "learning_rate": 3.890969840851139e-06, "loss": 0.0002, "step": 15792 }, { "epoch": 4.166204986149585, "grad_norm": 0.009769652038812637, "learning_rate": 3.890618130660336e-06, "loss": 0.0002, "step": 15794 }, { "epoch": 4.16673262102625, "grad_norm": 0.015689097344875336, "learning_rate": 3.890266420469533e-06, "loss": 0.0003, "step": 15796 }, { "epoch": 4.1672602559029155, "grad_norm": 0.39159834384918213, "learning_rate": 3.889914710278731e-06, "loss": 0.0008, "step": 15798 }, { "epoch": 4.1677878907795805, "grad_norm": 0.05765332654118538, "learning_rate": 3.8895630000879276e-06, "loss": 0.0022, "step": 15800 }, { "epoch": 4.168315525656246, "grad_norm": 0.015714308246970177, "learning_rate": 3.8892112898971245e-06, "loss": 0.0002, "step": 15802 }, { "epoch": 4.168843160532911, "grad_norm": 0.013537594117224216, "learning_rate": 3.888859579706322e-06, "loss": 0.0002, "step": 15804 }, { "epoch": 4.169370795409576, "grad_norm": 0.41801705956459045, "learning_rate": 3.888507869515519e-06, "loss": 0.001, "step": 15806 }, { "epoch": 4.169898430286242, "grad_norm": 0.10209906846284866, "learning_rate": 3.888156159324717e-06, "loss": 0.0003, "step": 15808 }, { "epoch": 4.170426065162907, "grad_norm": 0.016053229570388794, "learning_rate": 3.887804449133913e-06, "loss": 0.0002, "step": 15810 }, { "epoch": 4.170953700039573, "grad_norm": 0.010225975885987282, "learning_rate": 3.887452738943111e-06, "loss": 0.0002, "step": 15812 }, { "epoch": 4.171481334916238, "grad_norm": 0.01069028489291668, "learning_rate": 3.887101028752308e-06, "loss": 0.0009, "step": 15814 }, { "epoch": 4.172008969792904, "grad_norm": 0.049934837967157364, "learning_rate": 3.886749318561506e-06, "loss": 0.0036, "step": 15816 }, { "epoch": 4.172536604669569, "grad_norm": 0.10493876785039902, "learning_rate": 3.886397608370703e-06, "loss": 0.0006, "step": 15818 }, { "epoch": 4.173064239546234, "grad_norm": 0.2486145943403244, "learning_rate": 3.8860458981799e-06, "loss": 0.0011, "step": 15820 }, { "epoch": 4.1735918744228995, "grad_norm": 0.004418277647346258, "learning_rate": 3.885694187989097e-06, "loss": 0.0001, "step": 15822 }, { "epoch": 4.174119509299564, "grad_norm": 0.1518738716840744, "learning_rate": 3.885342477798294e-06, "loss": 0.0005, "step": 15824 }, { "epoch": 4.17464714417623, "grad_norm": 0.04593885317444801, "learning_rate": 3.884990767607491e-06, "loss": 0.0024, "step": 15826 }, { "epoch": 4.175174779052895, "grad_norm": 0.002135672140866518, "learning_rate": 3.884639057416689e-06, "loss": 0.0002, "step": 15828 }, { "epoch": 4.175702413929561, "grad_norm": 0.3112824261188507, "learning_rate": 3.884287347225886e-06, "loss": 0.0061, "step": 15830 }, { "epoch": 4.176230048806226, "grad_norm": 0.0054410784505307674, "learning_rate": 3.883935637035083e-06, "loss": 0.0001, "step": 15832 }, { "epoch": 4.176757683682891, "grad_norm": 0.004919025115668774, "learning_rate": 3.88358392684428e-06, "loss": 0.0002, "step": 15834 }, { "epoch": 4.177285318559557, "grad_norm": 0.34439384937286377, "learning_rate": 3.883232216653478e-06, "loss": 0.0023, "step": 15836 }, { "epoch": 4.177812953436222, "grad_norm": 0.4880187213420868, "learning_rate": 3.882880506462675e-06, "loss": 0.0057, "step": 15838 }, { "epoch": 4.178340588312888, "grad_norm": 0.01600859872996807, "learning_rate": 3.882528796271872e-06, "loss": 0.0084, "step": 15840 }, { "epoch": 4.178868223189553, "grad_norm": 0.18368762731552124, "learning_rate": 3.8821770860810695e-06, "loss": 0.0007, "step": 15842 }, { "epoch": 4.1793958580662185, "grad_norm": 0.0034662969410419464, "learning_rate": 3.8818253758902664e-06, "loss": 0.0042, "step": 15844 }, { "epoch": 4.1799234929428835, "grad_norm": 0.3211585581302643, "learning_rate": 3.881473665699464e-06, "loss": 0.0084, "step": 15846 }, { "epoch": 4.180451127819548, "grad_norm": 0.004263460170477629, "learning_rate": 3.881121955508661e-06, "loss": 0.0003, "step": 15848 }, { "epoch": 4.180978762696214, "grad_norm": 0.022980209439992905, "learning_rate": 3.880770245317858e-06, "loss": 0.0002, "step": 15850 }, { "epoch": 4.181506397572879, "grad_norm": 0.0822005346417427, "learning_rate": 3.880418535127056e-06, "loss": 0.0033, "step": 15852 }, { "epoch": 4.182034032449545, "grad_norm": 0.03266168758273125, "learning_rate": 3.880066824936252e-06, "loss": 0.0003, "step": 15854 }, { "epoch": 4.18256166732621, "grad_norm": 0.05686155706644058, "learning_rate": 3.87971511474545e-06, "loss": 0.0008, "step": 15856 }, { "epoch": 4.183089302202876, "grad_norm": 0.06003911793231964, "learning_rate": 3.879363404554647e-06, "loss": 0.0004, "step": 15858 }, { "epoch": 4.183616937079541, "grad_norm": 0.042855799198150635, "learning_rate": 3.879011694363845e-06, "loss": 0.0003, "step": 15860 }, { "epoch": 4.184144571956207, "grad_norm": 0.010283555835485458, "learning_rate": 3.8786599841730416e-06, "loss": 0.0002, "step": 15862 }, { "epoch": 4.184672206832872, "grad_norm": 0.05737089365720749, "learning_rate": 3.8783082739822385e-06, "loss": 0.0004, "step": 15864 }, { "epoch": 4.185199841709537, "grad_norm": 0.013649161905050278, "learning_rate": 3.877956563791436e-06, "loss": 0.0011, "step": 15866 }, { "epoch": 4.1857274765862025, "grad_norm": 0.2946839928627014, "learning_rate": 3.877604853600633e-06, "loss": 0.0013, "step": 15868 }, { "epoch": 4.186255111462867, "grad_norm": 0.5481642484664917, "learning_rate": 3.87725314340983e-06, "loss": 0.001, "step": 15870 }, { "epoch": 4.186782746339533, "grad_norm": 0.046951621770858765, "learning_rate": 3.876901433219027e-06, "loss": 0.0005, "step": 15872 }, { "epoch": 4.187310381216198, "grad_norm": 0.03308988735079765, "learning_rate": 3.876549723028225e-06, "loss": 0.0008, "step": 15874 }, { "epoch": 4.187838016092864, "grad_norm": 0.14098520576953888, "learning_rate": 3.876198012837423e-06, "loss": 0.0029, "step": 15876 }, { "epoch": 4.188365650969529, "grad_norm": 0.02918998897075653, "learning_rate": 3.875846302646619e-06, "loss": 0.0004, "step": 15878 }, { "epoch": 4.188893285846194, "grad_norm": 0.67003333568573, "learning_rate": 3.875494592455817e-06, "loss": 0.0011, "step": 15880 }, { "epoch": 4.18942092072286, "grad_norm": 0.07751699537038803, "learning_rate": 3.875142882265014e-06, "loss": 0.0014, "step": 15882 }, { "epoch": 4.189948555599525, "grad_norm": 0.10931876301765442, "learning_rate": 3.8747911720742106e-06, "loss": 0.0008, "step": 15884 }, { "epoch": 4.190476190476191, "grad_norm": 0.02733793668448925, "learning_rate": 3.874439461883408e-06, "loss": 0.0003, "step": 15886 }, { "epoch": 4.191003825352856, "grad_norm": 0.2671443223953247, "learning_rate": 3.874087751692605e-06, "loss": 0.0092, "step": 15888 }, { "epoch": 4.1915314602295215, "grad_norm": 0.06479509174823761, "learning_rate": 3.873736041501803e-06, "loss": 0.0008, "step": 15890 }, { "epoch": 4.1920590951061865, "grad_norm": 0.07948734611272812, "learning_rate": 3.873384331310999e-06, "loss": 0.0002, "step": 15892 }, { "epoch": 4.192586729982851, "grad_norm": 0.019041815772652626, "learning_rate": 3.873032621120197e-06, "loss": 0.0003, "step": 15894 }, { "epoch": 4.193114364859517, "grad_norm": 0.02187296375632286, "learning_rate": 3.872680910929394e-06, "loss": 0.0002, "step": 15896 }, { "epoch": 4.193641999736182, "grad_norm": 0.00718544889241457, "learning_rate": 3.872329200738592e-06, "loss": 0.0002, "step": 15898 }, { "epoch": 4.194169634612848, "grad_norm": 0.00313397916033864, "learning_rate": 3.871977490547789e-06, "loss": 0.0001, "step": 15900 }, { "epoch": 4.194697269489513, "grad_norm": 0.19335603713989258, "learning_rate": 3.871625780356986e-06, "loss": 0.0014, "step": 15902 }, { "epoch": 4.195224904366179, "grad_norm": 0.21039293706417084, "learning_rate": 3.8712740701661835e-06, "loss": 0.0015, "step": 15904 }, { "epoch": 4.195752539242844, "grad_norm": 0.08490274846553802, "learning_rate": 3.8709223599753804e-06, "loss": 0.0003, "step": 15906 }, { "epoch": 4.19628017411951, "grad_norm": 0.05077645555138588, "learning_rate": 3.870570649784577e-06, "loss": 0.0003, "step": 15908 }, { "epoch": 4.196807808996175, "grad_norm": 0.015164357610046864, "learning_rate": 3.870218939593775e-06, "loss": 0.0002, "step": 15910 }, { "epoch": 4.19733544387284, "grad_norm": 0.0024525087792426348, "learning_rate": 3.869867229402972e-06, "loss": 0.0002, "step": 15912 }, { "epoch": 4.1978630787495055, "grad_norm": 0.004017177503556013, "learning_rate": 3.869515519212169e-06, "loss": 0.0011, "step": 15914 }, { "epoch": 4.1983907136261704, "grad_norm": 0.004830264486372471, "learning_rate": 3.869163809021366e-06, "loss": 0.0001, "step": 15916 }, { "epoch": 4.198918348502836, "grad_norm": 0.005708022508770227, "learning_rate": 3.868812098830564e-06, "loss": 0.0026, "step": 15918 }, { "epoch": 4.199445983379501, "grad_norm": 0.38725966215133667, "learning_rate": 3.868460388639761e-06, "loss": 0.0043, "step": 15920 }, { "epoch": 4.199973618256167, "grad_norm": 0.006689759902656078, "learning_rate": 3.868108678448958e-06, "loss": 0.0002, "step": 15922 }, { "epoch": 4.200501253132832, "grad_norm": 0.014648985117673874, "learning_rate": 3.8677569682581555e-06, "loss": 0.0002, "step": 15924 }, { "epoch": 4.201028888009497, "grad_norm": 0.12903480231761932, "learning_rate": 3.8674052580673525e-06, "loss": 0.0031, "step": 15926 }, { "epoch": 4.201556522886163, "grad_norm": 5.460505485534668, "learning_rate": 3.86705354787655e-06, "loss": 0.003, "step": 15928 }, { "epoch": 4.202084157762828, "grad_norm": 0.15305867791175842, "learning_rate": 3.866701837685746e-06, "loss": 0.0004, "step": 15930 }, { "epoch": 4.202611792639494, "grad_norm": 0.07627927511930466, "learning_rate": 3.866350127494944e-06, "loss": 0.0003, "step": 15932 }, { "epoch": 4.203139427516159, "grad_norm": 0.0034469033125787973, "learning_rate": 3.865998417304142e-06, "loss": 0.0002, "step": 15934 }, { "epoch": 4.2036670623928245, "grad_norm": 0.012732292525470257, "learning_rate": 3.865646707113339e-06, "loss": 0.0036, "step": 15936 }, { "epoch": 4.2041946972694895, "grad_norm": 0.06256266683340073, "learning_rate": 3.865294996922536e-06, "loss": 0.0034, "step": 15938 }, { "epoch": 4.204722332146154, "grad_norm": 0.18940740823745728, "learning_rate": 3.864943286731733e-06, "loss": 0.0004, "step": 15940 }, { "epoch": 4.20524996702282, "grad_norm": 0.03776968643069267, "learning_rate": 3.864591576540931e-06, "loss": 0.0002, "step": 15942 }, { "epoch": 4.205777601899485, "grad_norm": 0.06019659340381622, "learning_rate": 3.864239866350128e-06, "loss": 0.0002, "step": 15944 }, { "epoch": 4.206305236776151, "grad_norm": 0.07706326246261597, "learning_rate": 3.8638881561593246e-06, "loss": 0.0004, "step": 15946 }, { "epoch": 4.206832871652816, "grad_norm": 0.17776821553707123, "learning_rate": 3.863536445968522e-06, "loss": 0.0005, "step": 15948 }, { "epoch": 4.207360506529482, "grad_norm": 0.02402534708380699, "learning_rate": 3.863184735777719e-06, "loss": 0.0002, "step": 15950 }, { "epoch": 4.207888141406147, "grad_norm": 0.008464613929390907, "learning_rate": 3.862833025586916e-06, "loss": 0.0007, "step": 15952 }, { "epoch": 4.208415776282813, "grad_norm": 0.047858826816082, "learning_rate": 3.862481315396113e-06, "loss": 0.0003, "step": 15954 }, { "epoch": 4.208943411159478, "grad_norm": 0.026851411908864975, "learning_rate": 3.862129605205311e-06, "loss": 0.0002, "step": 15956 }, { "epoch": 4.209471046036143, "grad_norm": 0.23317958414554596, "learning_rate": 3.861777895014509e-06, "loss": 0.0006, "step": 15958 }, { "epoch": 4.2099986809128085, "grad_norm": 0.009225871413946152, "learning_rate": 3.861426184823705e-06, "loss": 0.0146, "step": 15960 }, { "epoch": 4.2105263157894735, "grad_norm": 0.08352003246545792, "learning_rate": 3.861074474632903e-06, "loss": 0.0085, "step": 15962 }, { "epoch": 4.211053950666139, "grad_norm": 0.035288549959659576, "learning_rate": 3.8607227644421e-06, "loss": 0.0002, "step": 15964 }, { "epoch": 4.211581585542804, "grad_norm": 0.31160375475883484, "learning_rate": 3.8603710542512975e-06, "loss": 0.0014, "step": 15966 }, { "epoch": 4.21210922041947, "grad_norm": 0.0028770221397280693, "learning_rate": 3.8600193440604944e-06, "loss": 0.0003, "step": 15968 }, { "epoch": 4.212636855296135, "grad_norm": 1.8274519443511963, "learning_rate": 3.859667633869691e-06, "loss": 0.0088, "step": 15970 }, { "epoch": 4.2131644901728, "grad_norm": 0.03578510880470276, "learning_rate": 3.859315923678889e-06, "loss": 0.0004, "step": 15972 }, { "epoch": 4.213692125049466, "grad_norm": 0.00900083314627409, "learning_rate": 3.858964213488085e-06, "loss": 0.0008, "step": 15974 }, { "epoch": 4.214219759926131, "grad_norm": 0.008240546099841595, "learning_rate": 3.858612503297283e-06, "loss": 0.001, "step": 15976 }, { "epoch": 4.214747394802797, "grad_norm": 0.06344378739595413, "learning_rate": 3.85826079310648e-06, "loss": 0.0003, "step": 15978 }, { "epoch": 4.215275029679462, "grad_norm": 0.014889645390212536, "learning_rate": 3.857909082915678e-06, "loss": 0.0002, "step": 15980 }, { "epoch": 4.2158026645561275, "grad_norm": 0.0023905225098133087, "learning_rate": 3.857557372724875e-06, "loss": 0.0024, "step": 15982 }, { "epoch": 4.2163302994327925, "grad_norm": 0.028386129066348076, "learning_rate": 3.857205662534072e-06, "loss": 0.0005, "step": 15984 }, { "epoch": 4.216857934309457, "grad_norm": 0.06982515007257462, "learning_rate": 3.8568539523432695e-06, "loss": 0.0004, "step": 15986 }, { "epoch": 4.217385569186123, "grad_norm": 0.0015109265223145485, "learning_rate": 3.8565022421524665e-06, "loss": 0.0005, "step": 15988 }, { "epoch": 4.217913204062788, "grad_norm": 0.002080824924632907, "learning_rate": 3.8561505319616634e-06, "loss": 0.0003, "step": 15990 }, { "epoch": 4.218440838939454, "grad_norm": 0.008132094517350197, "learning_rate": 3.855798821770861e-06, "loss": 0.0002, "step": 15992 }, { "epoch": 4.218968473816119, "grad_norm": 0.014404838904738426, "learning_rate": 3.855447111580058e-06, "loss": 0.0001, "step": 15994 }, { "epoch": 4.219496108692785, "grad_norm": 0.006729718763381243, "learning_rate": 3.855095401389256e-06, "loss": 0.0002, "step": 15996 }, { "epoch": 4.22002374356945, "grad_norm": 0.002656563650816679, "learning_rate": 3.854743691198452e-06, "loss": 0.0003, "step": 15998 }, { "epoch": 4.220551378446116, "grad_norm": 0.0028501416090875864, "learning_rate": 3.85439198100765e-06, "loss": 0.0107, "step": 16000 }, { "epoch": 4.220551378446116, "eval_loss": 0.002078720834106207, "eval_runtime": 303.8254, "eval_samples_per_second": 709.753, "eval_steps_per_second": 88.722, "step": 16000 }, { "epoch": 4.221079013322781, "grad_norm": 0.03029349446296692, "learning_rate": 3.854040270816847e-06, "loss": 0.0002, "step": 16002 }, { "epoch": 4.221606648199446, "grad_norm": 1.2232894897460938, "learning_rate": 3.853688560626044e-06, "loss": 0.0018, "step": 16004 }, { "epoch": 4.2221342830761115, "grad_norm": 0.0067656515166163445, "learning_rate": 3.853336850435242e-06, "loss": 0.0002, "step": 16006 }, { "epoch": 4.2226619179527765, "grad_norm": 0.007036470342427492, "learning_rate": 3.8529851402444386e-06, "loss": 0.0001, "step": 16008 }, { "epoch": 4.223189552829442, "grad_norm": 0.16149915754795074, "learning_rate": 3.852633430053636e-06, "loss": 0.0007, "step": 16010 }, { "epoch": 4.223717187706107, "grad_norm": 0.25220754742622375, "learning_rate": 3.8522817198628325e-06, "loss": 0.0042, "step": 16012 }, { "epoch": 4.224244822582773, "grad_norm": 0.003881464246660471, "learning_rate": 3.85193000967203e-06, "loss": 0.0001, "step": 16014 }, { "epoch": 4.224772457459438, "grad_norm": 0.005557780619710684, "learning_rate": 3.851578299481228e-06, "loss": 0.001, "step": 16016 }, { "epoch": 4.225300092336103, "grad_norm": 0.004805515054613352, "learning_rate": 3.851226589290425e-06, "loss": 0.0008, "step": 16018 }, { "epoch": 4.225827727212769, "grad_norm": 0.007718637119978666, "learning_rate": 3.850874879099622e-06, "loss": 0.0002, "step": 16020 }, { "epoch": 4.226355362089434, "grad_norm": 0.0026186720933765173, "learning_rate": 3.850523168908819e-06, "loss": 0.0001, "step": 16022 }, { "epoch": 4.2268829969661, "grad_norm": 0.030439192429184914, "learning_rate": 3.850171458718017e-06, "loss": 0.0002, "step": 16024 }, { "epoch": 4.227410631842765, "grad_norm": 0.2613825798034668, "learning_rate": 3.849819748527214e-06, "loss": 0.0053, "step": 16026 }, { "epoch": 4.2279382667194305, "grad_norm": 0.034445326775312424, "learning_rate": 3.849468038336411e-06, "loss": 0.0003, "step": 16028 }, { "epoch": 4.2284659015960955, "grad_norm": 0.008289842866361141, "learning_rate": 3.849116328145608e-06, "loss": 0.0002, "step": 16030 }, { "epoch": 4.22899353647276, "grad_norm": 0.009242099709808826, "learning_rate": 3.848764617954805e-06, "loss": 0.0014, "step": 16032 }, { "epoch": 4.229521171349426, "grad_norm": 0.0023460816591978073, "learning_rate": 3.848412907764002e-06, "loss": 0.0004, "step": 16034 }, { "epoch": 4.230048806226091, "grad_norm": 0.014389436691999435, "learning_rate": 3.848061197573199e-06, "loss": 0.0002, "step": 16036 }, { "epoch": 4.230576441102757, "grad_norm": 0.49986696243286133, "learning_rate": 3.847709487382397e-06, "loss": 0.0014, "step": 16038 }, { "epoch": 4.231104075979422, "grad_norm": 0.03039884753525257, "learning_rate": 3.847357777191594e-06, "loss": 0.0003, "step": 16040 }, { "epoch": 4.231631710856088, "grad_norm": 0.011973078362643719, "learning_rate": 3.847006067000791e-06, "loss": 0.0024, "step": 16042 }, { "epoch": 4.232159345732753, "grad_norm": 0.006478939205408096, "learning_rate": 3.846654356809989e-06, "loss": 0.0001, "step": 16044 }, { "epoch": 4.232686980609419, "grad_norm": 0.12960335612297058, "learning_rate": 3.846302646619186e-06, "loss": 0.0003, "step": 16046 }, { "epoch": 4.233214615486084, "grad_norm": 0.11933895945549011, "learning_rate": 3.8459509364283835e-06, "loss": 0.0003, "step": 16048 }, { "epoch": 4.233742250362749, "grad_norm": 0.00327331037260592, "learning_rate": 3.8455992262375805e-06, "loss": 0.0002, "step": 16050 }, { "epoch": 4.2342698852394145, "grad_norm": 0.03728092089295387, "learning_rate": 3.8452475160467774e-06, "loss": 0.0002, "step": 16052 }, { "epoch": 4.2347975201160795, "grad_norm": 0.5237528681755066, "learning_rate": 3.844895805855975e-06, "loss": 0.0042, "step": 16054 }, { "epoch": 4.235325154992745, "grad_norm": 0.007797026075422764, "learning_rate": 3.844544095665172e-06, "loss": 0.0003, "step": 16056 }, { "epoch": 4.23585278986941, "grad_norm": 0.03617237135767937, "learning_rate": 3.844192385474369e-06, "loss": 0.0003, "step": 16058 }, { "epoch": 4.236380424746076, "grad_norm": 0.00689007667824626, "learning_rate": 3.843840675283566e-06, "loss": 0.0002, "step": 16060 }, { "epoch": 4.236908059622741, "grad_norm": 0.010073968209326267, "learning_rate": 3.843488965092764e-06, "loss": 0.0001, "step": 16062 }, { "epoch": 4.237435694499406, "grad_norm": 0.0018738358048722148, "learning_rate": 3.843137254901961e-06, "loss": 0.0001, "step": 16064 }, { "epoch": 4.237963329376072, "grad_norm": 0.2038230299949646, "learning_rate": 3.842785544711158e-06, "loss": 0.001, "step": 16066 }, { "epoch": 4.238490964252737, "grad_norm": 0.004678978584706783, "learning_rate": 3.842433834520356e-06, "loss": 0.0003, "step": 16068 }, { "epoch": 4.239018599129403, "grad_norm": 0.2078235000371933, "learning_rate": 3.8420821243295525e-06, "loss": 0.0079, "step": 16070 }, { "epoch": 4.239546234006068, "grad_norm": 0.02098812721669674, "learning_rate": 3.8417304141387495e-06, "loss": 0.0002, "step": 16072 }, { "epoch": 4.2400738688827335, "grad_norm": 0.019369550049304962, "learning_rate": 3.841378703947947e-06, "loss": 0.0016, "step": 16074 }, { "epoch": 4.2406015037593985, "grad_norm": 0.023197725415229797, "learning_rate": 3.841026993757144e-06, "loss": 0.0003, "step": 16076 }, { "epoch": 4.2411291386360634, "grad_norm": 0.0016192292096093297, "learning_rate": 3.840675283566342e-06, "loss": 0.0001, "step": 16078 }, { "epoch": 4.241656773512729, "grad_norm": 0.004344094544649124, "learning_rate": 3.840323573375538e-06, "loss": 0.0002, "step": 16080 }, { "epoch": 4.242184408389394, "grad_norm": 0.647406816482544, "learning_rate": 3.839971863184736e-06, "loss": 0.0034, "step": 16082 }, { "epoch": 4.24271204326606, "grad_norm": 0.2869473099708557, "learning_rate": 3.839620152993933e-06, "loss": 0.0009, "step": 16084 }, { "epoch": 4.243239678142725, "grad_norm": 0.002437385031953454, "learning_rate": 3.839268442803131e-06, "loss": 0.0004, "step": 16086 }, { "epoch": 4.243767313019391, "grad_norm": 0.0025938486214727163, "learning_rate": 3.838916732612328e-06, "loss": 0.0002, "step": 16088 }, { "epoch": 4.244294947896056, "grad_norm": 0.06020171567797661, "learning_rate": 3.838565022421525e-06, "loss": 0.0002, "step": 16090 }, { "epoch": 4.244822582772722, "grad_norm": 0.07481207698583603, "learning_rate": 3.838213312230722e-06, "loss": 0.0028, "step": 16092 }, { "epoch": 4.245350217649387, "grad_norm": 0.3343992233276367, "learning_rate": 3.8378616020399185e-06, "loss": 0.0069, "step": 16094 }, { "epoch": 4.245877852526052, "grad_norm": 0.038796745240688324, "learning_rate": 3.837509891849116e-06, "loss": 0.0004, "step": 16096 }, { "epoch": 4.2464054874027175, "grad_norm": 0.0057428148575127125, "learning_rate": 3.837158181658313e-06, "loss": 0.0001, "step": 16098 }, { "epoch": 4.2469331222793825, "grad_norm": 0.43686163425445557, "learning_rate": 3.836806471467511e-06, "loss": 0.0068, "step": 16100 }, { "epoch": 4.247460757156048, "grad_norm": 0.0076897782273590565, "learning_rate": 3.836454761276708e-06, "loss": 0.0001, "step": 16102 }, { "epoch": 4.247988392032713, "grad_norm": 0.5804174542427063, "learning_rate": 3.836103051085905e-06, "loss": 0.0017, "step": 16104 }, { "epoch": 4.248516026909379, "grad_norm": 0.0013761776499450207, "learning_rate": 3.835751340895103e-06, "loss": 0.0003, "step": 16106 }, { "epoch": 4.249043661786044, "grad_norm": 0.7365984320640564, "learning_rate": 3.8353996307043e-06, "loss": 0.0128, "step": 16108 }, { "epoch": 4.249571296662709, "grad_norm": 0.06099476292729378, "learning_rate": 3.835047920513497e-06, "loss": 0.0034, "step": 16110 }, { "epoch": 4.250098931539375, "grad_norm": 0.10753441601991653, "learning_rate": 3.8346962103226945e-06, "loss": 0.0004, "step": 16112 }, { "epoch": 4.25062656641604, "grad_norm": 0.0028678446542471647, "learning_rate": 3.8343445001318914e-06, "loss": 0.0009, "step": 16114 }, { "epoch": 4.251154201292706, "grad_norm": 0.0035505068954080343, "learning_rate": 3.833992789941089e-06, "loss": 0.0002, "step": 16116 }, { "epoch": 4.251681836169371, "grad_norm": 0.17545664310455322, "learning_rate": 3.833641079750285e-06, "loss": 0.0005, "step": 16118 }, { "epoch": 4.2522094710460365, "grad_norm": 0.006204547360539436, "learning_rate": 3.833289369559483e-06, "loss": 0.0003, "step": 16120 }, { "epoch": 4.2527371059227015, "grad_norm": 0.0615968182682991, "learning_rate": 3.83293765936868e-06, "loss": 0.0006, "step": 16122 }, { "epoch": 4.2532647407993665, "grad_norm": 0.2695133090019226, "learning_rate": 3.832585949177878e-06, "loss": 0.0061, "step": 16124 }, { "epoch": 4.253792375676032, "grad_norm": 0.10040847957134247, "learning_rate": 3.832234238987075e-06, "loss": 0.0005, "step": 16126 }, { "epoch": 4.254320010552697, "grad_norm": 0.005914687179028988, "learning_rate": 3.831882528796272e-06, "loss": 0.0002, "step": 16128 }, { "epoch": 4.254847645429363, "grad_norm": 0.03786633163690567, "learning_rate": 3.83153081860547e-06, "loss": 0.0004, "step": 16130 }, { "epoch": 4.255375280306028, "grad_norm": 0.03375636413693428, "learning_rate": 3.8311791084146665e-06, "loss": 0.0004, "step": 16132 }, { "epoch": 4.255902915182694, "grad_norm": 0.3084258735179901, "learning_rate": 3.8308273982238635e-06, "loss": 0.0017, "step": 16134 }, { "epoch": 4.256430550059359, "grad_norm": 0.01275400910526514, "learning_rate": 3.830475688033061e-06, "loss": 0.0098, "step": 16136 }, { "epoch": 4.256958184936025, "grad_norm": 0.7739881873130798, "learning_rate": 3.830123977842258e-06, "loss": 0.0111, "step": 16138 }, { "epoch": 4.25748581981269, "grad_norm": 0.4303501844406128, "learning_rate": 3.829772267651455e-06, "loss": 0.0091, "step": 16140 }, { "epoch": 4.258013454689355, "grad_norm": 0.20702552795410156, "learning_rate": 3.829420557460652e-06, "loss": 0.0005, "step": 16142 }, { "epoch": 4.2585410895660205, "grad_norm": 0.012305847369134426, "learning_rate": 3.82906884726985e-06, "loss": 0.0002, "step": 16144 }, { "epoch": 4.2590687244426855, "grad_norm": 0.07054246962070465, "learning_rate": 3.828717137079047e-06, "loss": 0.0004, "step": 16146 }, { "epoch": 4.259596359319351, "grad_norm": 0.005656706169247627, "learning_rate": 3.828365426888244e-06, "loss": 0.0002, "step": 16148 }, { "epoch": 4.260123994196016, "grad_norm": 0.040183573961257935, "learning_rate": 3.828013716697442e-06, "loss": 0.0003, "step": 16150 }, { "epoch": 4.260651629072681, "grad_norm": 0.0076445708982646465, "learning_rate": 3.827662006506639e-06, "loss": 0.0002, "step": 16152 }, { "epoch": 4.261179263949347, "grad_norm": 0.4034319221973419, "learning_rate": 3.827310296315836e-06, "loss": 0.003, "step": 16154 }, { "epoch": 4.261706898826012, "grad_norm": 0.01621922478079796, "learning_rate": 3.8269585861250325e-06, "loss": 0.0003, "step": 16156 }, { "epoch": 4.262234533702678, "grad_norm": 0.03308821842074394, "learning_rate": 3.82660687593423e-06, "loss": 0.0004, "step": 16158 }, { "epoch": 4.262762168579343, "grad_norm": 0.03556229546666145, "learning_rate": 3.826255165743428e-06, "loss": 0.0004, "step": 16160 }, { "epoch": 4.263289803456009, "grad_norm": 0.06261956691741943, "learning_rate": 3.825903455552624e-06, "loss": 0.0004, "step": 16162 }, { "epoch": 4.263817438332674, "grad_norm": 0.011748607270419598, "learning_rate": 3.825551745361822e-06, "loss": 0.0003, "step": 16164 }, { "epoch": 4.2643450732093395, "grad_norm": 0.016723409295082092, "learning_rate": 3.825200035171019e-06, "loss": 0.0004, "step": 16166 }, { "epoch": 4.2648727080860045, "grad_norm": 0.017370648682117462, "learning_rate": 3.824848324980217e-06, "loss": 0.0002, "step": 16168 }, { "epoch": 4.2654003429626695, "grad_norm": 0.00839724950492382, "learning_rate": 3.824496614789414e-06, "loss": 0.0006, "step": 16170 }, { "epoch": 4.265927977839335, "grad_norm": 0.015190375037491322, "learning_rate": 3.824144904598611e-06, "loss": 0.0005, "step": 16172 }, { "epoch": 4.266455612716, "grad_norm": 0.12808293104171753, "learning_rate": 3.8237931944078085e-06, "loss": 0.0003, "step": 16174 }, { "epoch": 4.266983247592666, "grad_norm": 0.0613878108561039, "learning_rate": 3.823441484217005e-06, "loss": 0.0027, "step": 16176 }, { "epoch": 4.267510882469331, "grad_norm": 0.17461684346199036, "learning_rate": 3.823089774026202e-06, "loss": 0.0006, "step": 16178 }, { "epoch": 4.268038517345997, "grad_norm": 0.008882942609488964, "learning_rate": 3.822738063835399e-06, "loss": 0.0077, "step": 16180 }, { "epoch": 4.268566152222662, "grad_norm": 0.016765976324677467, "learning_rate": 3.822386353644597e-06, "loss": 0.0004, "step": 16182 }, { "epoch": 4.269093787099327, "grad_norm": 0.015459964983165264, "learning_rate": 3.822034643453794e-06, "loss": 0.0002, "step": 16184 }, { "epoch": 4.269621421975993, "grad_norm": 0.01075827144086361, "learning_rate": 3.821682933262991e-06, "loss": 0.0002, "step": 16186 }, { "epoch": 4.270149056852658, "grad_norm": 0.006554815452545881, "learning_rate": 3.821331223072189e-06, "loss": 0.0003, "step": 16188 }, { "epoch": 4.2706766917293235, "grad_norm": 0.006260414607822895, "learning_rate": 3.820979512881386e-06, "loss": 0.0009, "step": 16190 }, { "epoch": 4.2712043266059885, "grad_norm": 0.013413693755865097, "learning_rate": 3.820627802690583e-06, "loss": 0.0002, "step": 16192 }, { "epoch": 4.271731961482654, "grad_norm": 0.003656762419268489, "learning_rate": 3.8202760924997805e-06, "loss": 0.0002, "step": 16194 }, { "epoch": 4.272259596359319, "grad_norm": 0.5958022475242615, "learning_rate": 3.8199243823089775e-06, "loss": 0.0017, "step": 16196 }, { "epoch": 4.272787231235984, "grad_norm": 0.008871975354850292, "learning_rate": 3.819572672118175e-06, "loss": 0.0002, "step": 16198 }, { "epoch": 4.27331486611265, "grad_norm": 0.7663992047309875, "learning_rate": 3.819220961927371e-06, "loss": 0.0012, "step": 16200 }, { "epoch": 4.273842500989315, "grad_norm": 0.15035387873649597, "learning_rate": 3.818869251736569e-06, "loss": 0.0006, "step": 16202 }, { "epoch": 4.274370135865981, "grad_norm": 0.008472980000078678, "learning_rate": 3.818517541545766e-06, "loss": 0.0033, "step": 16204 }, { "epoch": 4.274897770742646, "grad_norm": 0.0017718159360811114, "learning_rate": 3.818165831354964e-06, "loss": 0.0005, "step": 16206 }, { "epoch": 4.275425405619312, "grad_norm": 0.018208151683211327, "learning_rate": 3.817814121164161e-06, "loss": 0.0015, "step": 16208 }, { "epoch": 4.275953040495977, "grad_norm": 0.16531722247600555, "learning_rate": 3.817462410973358e-06, "loss": 0.0006, "step": 16210 }, { "epoch": 4.2764806753726425, "grad_norm": 0.5511554479598999, "learning_rate": 3.817110700782556e-06, "loss": 0.0013, "step": 16212 }, { "epoch": 4.2770083102493075, "grad_norm": 0.0204060859978199, "learning_rate": 3.816758990591753e-06, "loss": 0.0018, "step": 16214 }, { "epoch": 4.2775359451259725, "grad_norm": 0.03460371121764183, "learning_rate": 3.8164072804009495e-06, "loss": 0.0003, "step": 16216 }, { "epoch": 4.278063580002638, "grad_norm": 0.005915848072618246, "learning_rate": 3.816055570210147e-06, "loss": 0.0004, "step": 16218 }, { "epoch": 4.278591214879303, "grad_norm": 0.0036131273955106735, "learning_rate": 3.815703860019344e-06, "loss": 0.0005, "step": 16220 }, { "epoch": 4.279118849755969, "grad_norm": 0.014284476637840271, "learning_rate": 3.815352149828541e-06, "loss": 0.0002, "step": 16222 }, { "epoch": 4.279646484632634, "grad_norm": 0.0036473253276199102, "learning_rate": 3.815000439637738e-06, "loss": 0.0013, "step": 16224 }, { "epoch": 4.2801741195093, "grad_norm": 0.022544939070940018, "learning_rate": 3.814648729446936e-06, "loss": 0.0014, "step": 16226 }, { "epoch": 4.280701754385965, "grad_norm": 0.04023168236017227, "learning_rate": 3.8142970192561334e-06, "loss": 0.0029, "step": 16228 }, { "epoch": 4.28122938926263, "grad_norm": 0.008355803787708282, "learning_rate": 3.81394530906533e-06, "loss": 0.0002, "step": 16230 }, { "epoch": 4.281757024139296, "grad_norm": 0.04556867480278015, "learning_rate": 3.8135935988745273e-06, "loss": 0.0005, "step": 16232 }, { "epoch": 4.282284659015961, "grad_norm": 0.33478567004203796, "learning_rate": 3.8132418886837247e-06, "loss": 0.0018, "step": 16234 }, { "epoch": 4.2828122938926265, "grad_norm": 0.008991200476884842, "learning_rate": 3.812890178492922e-06, "loss": 0.0008, "step": 16236 }, { "epoch": 4.2833399287692915, "grad_norm": 0.004984759259968996, "learning_rate": 3.812538468302119e-06, "loss": 0.0071, "step": 16238 }, { "epoch": 4.283867563645957, "grad_norm": 0.002673847135156393, "learning_rate": 3.8121867581113164e-06, "loss": 0.0002, "step": 16240 }, { "epoch": 4.284395198522622, "grad_norm": 0.009256202727556229, "learning_rate": 3.8118350479205137e-06, "loss": 0.0002, "step": 16242 }, { "epoch": 4.284922833399287, "grad_norm": 0.18666446208953857, "learning_rate": 3.811483337729711e-06, "loss": 0.0008, "step": 16244 }, { "epoch": 4.285450468275953, "grad_norm": 0.024363486096262932, "learning_rate": 3.811131627538908e-06, "loss": 0.0002, "step": 16246 }, { "epoch": 4.285978103152618, "grad_norm": 0.2146298587322235, "learning_rate": 3.8107799173481054e-06, "loss": 0.0071, "step": 16248 }, { "epoch": 4.286505738029284, "grad_norm": 0.008586736395955086, "learning_rate": 3.810428207157303e-06, "loss": 0.0055, "step": 16250 }, { "epoch": 4.287033372905949, "grad_norm": 0.01652555540204048, "learning_rate": 3.8100764969664994e-06, "loss": 0.0003, "step": 16252 }, { "epoch": 4.287561007782615, "grad_norm": 0.16729344427585602, "learning_rate": 3.8097247867756967e-06, "loss": 0.0005, "step": 16254 }, { "epoch": 4.28808864265928, "grad_norm": 0.012697446160018444, "learning_rate": 3.809373076584894e-06, "loss": 0.0003, "step": 16256 }, { "epoch": 4.2886162775359455, "grad_norm": 0.7117272019386292, "learning_rate": 3.8090213663940915e-06, "loss": 0.0015, "step": 16258 }, { "epoch": 4.2891439124126105, "grad_norm": 0.11799023300409317, "learning_rate": 3.8086696562032884e-06, "loss": 0.0004, "step": 16260 }, { "epoch": 4.2896715472892755, "grad_norm": 0.4542711079120636, "learning_rate": 3.808317946012486e-06, "loss": 0.0018, "step": 16262 }, { "epoch": 4.290199182165941, "grad_norm": 0.019448211416602135, "learning_rate": 3.807966235821683e-06, "loss": 0.0007, "step": 16264 }, { "epoch": 4.290726817042606, "grad_norm": 0.014461658895015717, "learning_rate": 3.8076145256308806e-06, "loss": 0.0002, "step": 16266 }, { "epoch": 4.291254451919272, "grad_norm": 0.010416229255497456, "learning_rate": 3.807262815440077e-06, "loss": 0.0002, "step": 16268 }, { "epoch": 4.291782086795937, "grad_norm": 0.015971500426530838, "learning_rate": 3.8069111052492745e-06, "loss": 0.0016, "step": 16270 }, { "epoch": 4.292309721672603, "grad_norm": 0.022452760487794876, "learning_rate": 3.8065593950584723e-06, "loss": 0.0002, "step": 16272 }, { "epoch": 4.292837356549268, "grad_norm": 0.8913946747779846, "learning_rate": 3.8062076848676696e-06, "loss": 0.0022, "step": 16274 }, { "epoch": 4.293364991425933, "grad_norm": 0.6415695548057556, "learning_rate": 3.805855974676866e-06, "loss": 0.0005, "step": 16276 }, { "epoch": 4.293892626302599, "grad_norm": 0.4110892713069916, "learning_rate": 3.8055042644860635e-06, "loss": 0.0039, "step": 16278 }, { "epoch": 4.294420261179264, "grad_norm": 0.2771661877632141, "learning_rate": 3.805152554295261e-06, "loss": 0.0071, "step": 16280 }, { "epoch": 4.2949478960559295, "grad_norm": 0.21690119802951813, "learning_rate": 3.804800844104458e-06, "loss": 0.0064, "step": 16282 }, { "epoch": 4.2954755309325945, "grad_norm": 0.027372706681489944, "learning_rate": 3.8044491339136552e-06, "loss": 0.0002, "step": 16284 }, { "epoch": 4.29600316580926, "grad_norm": 0.0374656580388546, "learning_rate": 3.8040974237228526e-06, "loss": 0.0037, "step": 16286 }, { "epoch": 4.296530800685925, "grad_norm": 0.3459298312664032, "learning_rate": 3.80374571353205e-06, "loss": 0.006, "step": 16288 }, { "epoch": 4.29705843556259, "grad_norm": 0.4566391408443451, "learning_rate": 3.8033940033412465e-06, "loss": 0.0053, "step": 16290 }, { "epoch": 4.297586070439256, "grad_norm": 0.702284574508667, "learning_rate": 3.803042293150444e-06, "loss": 0.0019, "step": 16292 }, { "epoch": 4.298113705315921, "grad_norm": 0.008432727307081223, "learning_rate": 3.8026905829596413e-06, "loss": 0.0002, "step": 16294 }, { "epoch": 4.298641340192587, "grad_norm": 0.01026888843625784, "learning_rate": 3.8023388727688387e-06, "loss": 0.0008, "step": 16296 }, { "epoch": 4.299168975069252, "grad_norm": 0.014033317565917969, "learning_rate": 3.8019871625780356e-06, "loss": 0.0002, "step": 16298 }, { "epoch": 4.299696609945918, "grad_norm": 0.21659588813781738, "learning_rate": 3.801635452387233e-06, "loss": 0.0009, "step": 16300 }, { "epoch": 4.300224244822583, "grad_norm": 0.016193266957998276, "learning_rate": 3.8012837421964304e-06, "loss": 0.0002, "step": 16302 }, { "epoch": 4.3007518796992485, "grad_norm": 0.020883288234472275, "learning_rate": 3.8009320320056277e-06, "loss": 0.0064, "step": 16304 }, { "epoch": 4.3012795145759135, "grad_norm": 0.21585284173488617, "learning_rate": 3.8005803218148247e-06, "loss": 0.0041, "step": 16306 }, { "epoch": 4.3018071494525785, "grad_norm": 0.009318094700574875, "learning_rate": 3.800228611624022e-06, "loss": 0.0002, "step": 16308 }, { "epoch": 4.302334784329244, "grad_norm": 0.05877389386296272, "learning_rate": 3.7998769014332194e-06, "loss": 0.0004, "step": 16310 }, { "epoch": 4.302862419205909, "grad_norm": 0.013483946211636066, "learning_rate": 3.799525191242416e-06, "loss": 0.0002, "step": 16312 }, { "epoch": 4.303390054082575, "grad_norm": 0.10812932252883911, "learning_rate": 3.7991734810516133e-06, "loss": 0.0027, "step": 16314 }, { "epoch": 4.30391768895924, "grad_norm": 0.12318140268325806, "learning_rate": 3.7988217708608107e-06, "loss": 0.0012, "step": 16316 }, { "epoch": 4.304445323835906, "grad_norm": 0.05793527141213417, "learning_rate": 3.798470060670008e-06, "loss": 0.0003, "step": 16318 }, { "epoch": 4.304972958712571, "grad_norm": 0.7562992572784424, "learning_rate": 3.798118350479205e-06, "loss": 0.0081, "step": 16320 }, { "epoch": 4.305500593589236, "grad_norm": 0.12408608198165894, "learning_rate": 3.7977666402884024e-06, "loss": 0.0007, "step": 16322 }, { "epoch": 4.306028228465902, "grad_norm": 0.033430445939302444, "learning_rate": 3.7974149300976e-06, "loss": 0.0005, "step": 16324 }, { "epoch": 4.306555863342567, "grad_norm": 0.00821550004184246, "learning_rate": 3.797063219906797e-06, "loss": 0.0006, "step": 16326 }, { "epoch": 4.3070834982192325, "grad_norm": 0.10040564835071564, "learning_rate": 3.7967115097159937e-06, "loss": 0.0005, "step": 16328 }, { "epoch": 4.3076111330958975, "grad_norm": 0.5646281838417053, "learning_rate": 3.7963597995251915e-06, "loss": 0.0012, "step": 16330 }, { "epoch": 4.308138767972563, "grad_norm": 0.018265537917613983, "learning_rate": 3.796008089334389e-06, "loss": 0.0081, "step": 16332 }, { "epoch": 4.308666402849228, "grad_norm": 0.1449040174484253, "learning_rate": 3.7956563791435863e-06, "loss": 0.0044, "step": 16334 }, { "epoch": 4.309194037725893, "grad_norm": 0.6108478903770447, "learning_rate": 3.7953046689527828e-06, "loss": 0.0012, "step": 16336 }, { "epoch": 4.309721672602559, "grad_norm": 0.022016536444425583, "learning_rate": 3.79495295876198e-06, "loss": 0.0001, "step": 16338 }, { "epoch": 4.310249307479224, "grad_norm": 0.3263590335845947, "learning_rate": 3.7946012485711775e-06, "loss": 0.011, "step": 16340 }, { "epoch": 4.31077694235589, "grad_norm": 0.007124297320842743, "learning_rate": 3.7942495383803745e-06, "loss": 0.0002, "step": 16342 }, { "epoch": 4.311304577232555, "grad_norm": 0.00494261272251606, "learning_rate": 3.793897828189572e-06, "loss": 0.0002, "step": 16344 }, { "epoch": 4.311832212109221, "grad_norm": 0.2193242460489273, "learning_rate": 3.7935461179987692e-06, "loss": 0.0007, "step": 16346 }, { "epoch": 4.312359846985886, "grad_norm": 0.009448423981666565, "learning_rate": 3.7931944078079666e-06, "loss": 0.0003, "step": 16348 }, { "epoch": 4.3128874818625516, "grad_norm": 0.13963063061237335, "learning_rate": 3.792842697617163e-06, "loss": 0.0006, "step": 16350 }, { "epoch": 4.3134151167392165, "grad_norm": 0.043952587991952896, "learning_rate": 3.7924909874263605e-06, "loss": 0.0012, "step": 16352 }, { "epoch": 4.3139427516158815, "grad_norm": 0.01408801693469286, "learning_rate": 3.792139277235558e-06, "loss": 0.0002, "step": 16354 }, { "epoch": 4.314470386492547, "grad_norm": 0.01648387499153614, "learning_rate": 3.7917875670447557e-06, "loss": 0.0013, "step": 16356 }, { "epoch": 4.314998021369212, "grad_norm": 0.13115915656089783, "learning_rate": 3.7914358568539522e-06, "loss": 0.0004, "step": 16358 }, { "epoch": 4.315525656245878, "grad_norm": 0.025618214160203934, "learning_rate": 3.7910841466631496e-06, "loss": 0.0023, "step": 16360 }, { "epoch": 4.316053291122543, "grad_norm": 0.005963204428553581, "learning_rate": 3.790732436472347e-06, "loss": 0.0059, "step": 16362 }, { "epoch": 4.316580925999209, "grad_norm": 0.04239794239401817, "learning_rate": 3.7903807262815444e-06, "loss": 0.0099, "step": 16364 }, { "epoch": 4.317108560875874, "grad_norm": 0.05211113765835762, "learning_rate": 3.7900290160907413e-06, "loss": 0.0003, "step": 16366 }, { "epoch": 4.317636195752539, "grad_norm": 0.004441159777343273, "learning_rate": 3.7896773058999387e-06, "loss": 0.0002, "step": 16368 }, { "epoch": 4.318163830629205, "grad_norm": 0.15162016451358795, "learning_rate": 3.789325595709136e-06, "loss": 0.0024, "step": 16370 }, { "epoch": 4.31869146550587, "grad_norm": 0.013612274080514908, "learning_rate": 3.7889738855183326e-06, "loss": 0.0003, "step": 16372 }, { "epoch": 4.3192191003825355, "grad_norm": 0.6313546895980835, "learning_rate": 3.78862217532753e-06, "loss": 0.0043, "step": 16374 }, { "epoch": 4.3197467352592005, "grad_norm": 0.14659006893634796, "learning_rate": 3.7882704651367273e-06, "loss": 0.0009, "step": 16376 }, { "epoch": 4.320274370135866, "grad_norm": 0.0078092277981340885, "learning_rate": 3.7879187549459247e-06, "loss": 0.0004, "step": 16378 }, { "epoch": 4.320802005012531, "grad_norm": 0.38758376240730286, "learning_rate": 3.7875670447551217e-06, "loss": 0.0018, "step": 16380 }, { "epoch": 4.321329639889196, "grad_norm": 0.0033848758321255445, "learning_rate": 3.787215334564319e-06, "loss": 0.0002, "step": 16382 }, { "epoch": 4.321857274765862, "grad_norm": 0.017448555678129196, "learning_rate": 3.7868636243735164e-06, "loss": 0.0002, "step": 16384 }, { "epoch": 4.322384909642527, "grad_norm": 0.09529108554124832, "learning_rate": 3.786511914182714e-06, "loss": 0.0012, "step": 16386 }, { "epoch": 4.322912544519193, "grad_norm": 0.010758920572698116, "learning_rate": 3.7861602039919107e-06, "loss": 0.0112, "step": 16388 }, { "epoch": 4.323440179395858, "grad_norm": 0.10225663334131241, "learning_rate": 3.785808493801108e-06, "loss": 0.0004, "step": 16390 }, { "epoch": 4.323967814272524, "grad_norm": 0.03867892175912857, "learning_rate": 3.7854567836103055e-06, "loss": 0.007, "step": 16392 }, { "epoch": 4.324495449149189, "grad_norm": 0.10394871979951859, "learning_rate": 3.785105073419503e-06, "loss": 0.0007, "step": 16394 }, { "epoch": 4.325023084025855, "grad_norm": 0.35863643884658813, "learning_rate": 3.7847533632286994e-06, "loss": 0.0102, "step": 16396 }, { "epoch": 4.3255507189025195, "grad_norm": 0.391236275434494, "learning_rate": 3.7844016530378968e-06, "loss": 0.0013, "step": 16398 }, { "epoch": 4.3260783537791845, "grad_norm": 0.0937899500131607, "learning_rate": 3.784049942847094e-06, "loss": 0.0012, "step": 16400 }, { "epoch": 4.32660598865585, "grad_norm": 0.3252440094947815, "learning_rate": 3.783698232656291e-06, "loss": 0.0012, "step": 16402 }, { "epoch": 4.327133623532515, "grad_norm": 0.16546350717544556, "learning_rate": 3.7833465224654885e-06, "loss": 0.0005, "step": 16404 }, { "epoch": 4.327661258409181, "grad_norm": 0.008050359785556793, "learning_rate": 3.782994812274686e-06, "loss": 0.0002, "step": 16406 }, { "epoch": 4.328188893285846, "grad_norm": 0.023392314091324806, "learning_rate": 3.7826431020838832e-06, "loss": 0.0002, "step": 16408 }, { "epoch": 4.328716528162512, "grad_norm": 0.007047195918858051, "learning_rate": 3.7822913918930798e-06, "loss": 0.0002, "step": 16410 }, { "epoch": 4.329244163039177, "grad_norm": 0.13877232372760773, "learning_rate": 3.781939681702277e-06, "loss": 0.0046, "step": 16412 }, { "epoch": 4.329771797915842, "grad_norm": 0.23001627624034882, "learning_rate": 3.781587971511475e-06, "loss": 0.0011, "step": 16414 }, { "epoch": 4.330299432792508, "grad_norm": 0.33697107434272766, "learning_rate": 3.7812362613206723e-06, "loss": 0.0007, "step": 16416 }, { "epoch": 4.330827067669173, "grad_norm": 0.004083529580384493, "learning_rate": 3.780884551129869e-06, "loss": 0.0002, "step": 16418 }, { "epoch": 4.3313547025458385, "grad_norm": 0.24817714095115662, "learning_rate": 3.7805328409390662e-06, "loss": 0.0008, "step": 16420 }, { "epoch": 4.3318823374225035, "grad_norm": 0.3841945230960846, "learning_rate": 3.7801811307482636e-06, "loss": 0.0033, "step": 16422 }, { "epoch": 4.332409972299169, "grad_norm": 0.07374857366085052, "learning_rate": 3.779829420557461e-06, "loss": 0.0003, "step": 16424 }, { "epoch": 4.332937607175834, "grad_norm": 0.009115214459598064, "learning_rate": 3.779477710366658e-06, "loss": 0.0002, "step": 16426 }, { "epoch": 4.333465242052499, "grad_norm": 0.11870554834604263, "learning_rate": 3.7791260001758553e-06, "loss": 0.0004, "step": 16428 }, { "epoch": 4.333992876929165, "grad_norm": 0.14294403791427612, "learning_rate": 3.7787742899850527e-06, "loss": 0.0046, "step": 16430 }, { "epoch": 4.33452051180583, "grad_norm": 0.03339964523911476, "learning_rate": 3.778422579794249e-06, "loss": 0.0069, "step": 16432 }, { "epoch": 4.335048146682496, "grad_norm": 0.009646146558225155, "learning_rate": 3.7780708696034466e-06, "loss": 0.0137, "step": 16434 }, { "epoch": 4.335575781559161, "grad_norm": 0.027507638558745384, "learning_rate": 3.777719159412644e-06, "loss": 0.0025, "step": 16436 }, { "epoch": 4.336103416435827, "grad_norm": 0.05440441519021988, "learning_rate": 3.7773674492218413e-06, "loss": 0.0003, "step": 16438 }, { "epoch": 4.336631051312492, "grad_norm": 0.029666360467672348, "learning_rate": 3.7770157390310383e-06, "loss": 0.0028, "step": 16440 }, { "epoch": 4.337158686189158, "grad_norm": 0.27984803915023804, "learning_rate": 3.7766640288402357e-06, "loss": 0.0016, "step": 16442 }, { "epoch": 4.3376863210658225, "grad_norm": 0.02704716846346855, "learning_rate": 3.776312318649433e-06, "loss": 0.0003, "step": 16444 }, { "epoch": 4.3382139559424875, "grad_norm": 0.22449912130832672, "learning_rate": 3.7759606084586304e-06, "loss": 0.0051, "step": 16446 }, { "epoch": 4.338741590819153, "grad_norm": 0.06791041791439056, "learning_rate": 3.7756088982678274e-06, "loss": 0.0003, "step": 16448 }, { "epoch": 4.339269225695818, "grad_norm": 0.2337476909160614, "learning_rate": 3.7752571880770247e-06, "loss": 0.0021, "step": 16450 }, { "epoch": 4.339796860572484, "grad_norm": 0.015291785821318626, "learning_rate": 3.774905477886222e-06, "loss": 0.0006, "step": 16452 }, { "epoch": 4.340324495449149, "grad_norm": 0.00751062436029315, "learning_rate": 3.7745537676954195e-06, "loss": 0.0002, "step": 16454 }, { "epoch": 4.340852130325814, "grad_norm": 0.26285985112190247, "learning_rate": 3.774202057504616e-06, "loss": 0.0071, "step": 16456 }, { "epoch": 4.34137976520248, "grad_norm": 0.0072501422837376595, "learning_rate": 3.7738503473138134e-06, "loss": 0.0003, "step": 16458 }, { "epoch": 4.341907400079145, "grad_norm": 1.3900586366653442, "learning_rate": 3.7734986371230108e-06, "loss": 0.0031, "step": 16460 }, { "epoch": 4.342435034955811, "grad_norm": 0.4609934687614441, "learning_rate": 3.7731469269322077e-06, "loss": 0.0071, "step": 16462 }, { "epoch": 4.342962669832476, "grad_norm": 0.021474359557032585, "learning_rate": 3.772795216741405e-06, "loss": 0.0003, "step": 16464 }, { "epoch": 4.3434903047091415, "grad_norm": 0.022111231461167336, "learning_rate": 3.7724435065506025e-06, "loss": 0.0028, "step": 16466 }, { "epoch": 4.3440179395858065, "grad_norm": 0.009709381498396397, "learning_rate": 3.7720917963598e-06, "loss": 0.0002, "step": 16468 }, { "epoch": 4.344545574462472, "grad_norm": 0.14906951785087585, "learning_rate": 3.7717400861689964e-06, "loss": 0.0035, "step": 16470 }, { "epoch": 4.345073209339137, "grad_norm": 0.07868121564388275, "learning_rate": 3.771388375978194e-06, "loss": 0.0035, "step": 16472 }, { "epoch": 4.345600844215802, "grad_norm": 0.009795810095965862, "learning_rate": 3.7710366657873916e-06, "loss": 0.0002, "step": 16474 }, { "epoch": 4.346128479092468, "grad_norm": 0.02428121492266655, "learning_rate": 3.770684955596589e-06, "loss": 0.0004, "step": 16476 }, { "epoch": 4.346656113969133, "grad_norm": 0.006552839186042547, "learning_rate": 3.7703332454057855e-06, "loss": 0.0022, "step": 16478 }, { "epoch": 4.347183748845799, "grad_norm": 0.004918556194752455, "learning_rate": 3.769981535214983e-06, "loss": 0.0002, "step": 16480 }, { "epoch": 4.347711383722464, "grad_norm": 0.009570869617164135, "learning_rate": 3.76962982502418e-06, "loss": 0.0002, "step": 16482 }, { "epoch": 4.34823901859913, "grad_norm": 0.009633962996304035, "learning_rate": 3.7692781148333776e-06, "loss": 0.0002, "step": 16484 }, { "epoch": 4.348766653475795, "grad_norm": 0.017523659393191338, "learning_rate": 3.7689264046425745e-06, "loss": 0.0014, "step": 16486 }, { "epoch": 4.349294288352461, "grad_norm": 0.31011509895324707, "learning_rate": 3.768574694451772e-06, "loss": 0.0007, "step": 16488 }, { "epoch": 4.3498219232291255, "grad_norm": 0.006599161308258772, "learning_rate": 3.7682229842609693e-06, "loss": 0.0029, "step": 16490 }, { "epoch": 4.3503495581057905, "grad_norm": 0.053470272570848465, "learning_rate": 3.767871274070166e-06, "loss": 0.0035, "step": 16492 }, { "epoch": 4.350877192982456, "grad_norm": 0.004440102260559797, "learning_rate": 3.767519563879363e-06, "loss": 0.0002, "step": 16494 }, { "epoch": 4.351404827859121, "grad_norm": 0.010113739408552647, "learning_rate": 3.7671678536885606e-06, "loss": 0.0004, "step": 16496 }, { "epoch": 4.351932462735787, "grad_norm": 0.18514367938041687, "learning_rate": 3.766816143497758e-06, "loss": 0.0052, "step": 16498 }, { "epoch": 4.352460097612452, "grad_norm": 0.15598565340042114, "learning_rate": 3.766464433306955e-06, "loss": 0.0088, "step": 16500 }, { "epoch": 4.352987732489117, "grad_norm": 0.40882429480552673, "learning_rate": 3.7661127231161523e-06, "loss": 0.0052, "step": 16502 }, { "epoch": 4.353515367365783, "grad_norm": 0.007939253933727741, "learning_rate": 3.7657610129253497e-06, "loss": 0.0021, "step": 16504 }, { "epoch": 4.354043002242448, "grad_norm": 0.18344607949256897, "learning_rate": 3.765409302734547e-06, "loss": 0.0005, "step": 16506 }, { "epoch": 4.354570637119114, "grad_norm": 0.026448866352438927, "learning_rate": 3.765057592543744e-06, "loss": 0.0006, "step": 16508 }, { "epoch": 4.355098271995779, "grad_norm": 0.01491778064519167, "learning_rate": 3.7647058823529414e-06, "loss": 0.0051, "step": 16510 }, { "epoch": 4.3556259068724446, "grad_norm": 0.055413197726011276, "learning_rate": 3.7643541721621387e-06, "loss": 0.0006, "step": 16512 }, { "epoch": 4.3561535417491095, "grad_norm": 0.4594123065471649, "learning_rate": 3.764002461971336e-06, "loss": 0.0044, "step": 16514 }, { "epoch": 4.356681176625775, "grad_norm": 0.35301148891448975, "learning_rate": 3.7636507517805326e-06, "loss": 0.0008, "step": 16516 }, { "epoch": 4.35720881150244, "grad_norm": 0.0633312314748764, "learning_rate": 3.76329904158973e-06, "loss": 0.0004, "step": 16518 }, { "epoch": 4.357736446379105, "grad_norm": 0.010581014677882195, "learning_rate": 3.7629473313989274e-06, "loss": 0.0002, "step": 16520 }, { "epoch": 4.358264081255771, "grad_norm": 0.0060071335174143314, "learning_rate": 3.7625956212081243e-06, "loss": 0.0002, "step": 16522 }, { "epoch": 4.358791716132436, "grad_norm": 0.041863732039928436, "learning_rate": 3.7622439110173217e-06, "loss": 0.0003, "step": 16524 }, { "epoch": 4.359319351009102, "grad_norm": 0.024421950802206993, "learning_rate": 3.761892200826519e-06, "loss": 0.002, "step": 16526 }, { "epoch": 4.359846985885767, "grad_norm": 0.008553273044526577, "learning_rate": 3.7615404906357165e-06, "loss": 0.0002, "step": 16528 }, { "epoch": 4.360374620762433, "grad_norm": 0.007225222419947386, "learning_rate": 3.7611887804449134e-06, "loss": 0.0002, "step": 16530 }, { "epoch": 4.360902255639098, "grad_norm": 0.7192826867103577, "learning_rate": 3.760837070254111e-06, "loss": 0.0064, "step": 16532 }, { "epoch": 4.361429890515763, "grad_norm": 0.006249614991247654, "learning_rate": 3.760485360063308e-06, "loss": 0.0007, "step": 16534 }, { "epoch": 4.3619575253924285, "grad_norm": 0.005714669823646545, "learning_rate": 3.7601336498725055e-06, "loss": 0.0003, "step": 16536 }, { "epoch": 4.3624851602690935, "grad_norm": 0.20397692918777466, "learning_rate": 3.759781939681702e-06, "loss": 0.0036, "step": 16538 }, { "epoch": 4.363012795145759, "grad_norm": 0.01506305206567049, "learning_rate": 3.7594302294908995e-06, "loss": 0.0002, "step": 16540 }, { "epoch": 4.363540430022424, "grad_norm": 0.021535223349928856, "learning_rate": 3.759078519300097e-06, "loss": 0.0002, "step": 16542 }, { "epoch": 4.36406806489909, "grad_norm": 0.03106827661395073, "learning_rate": 3.758726809109294e-06, "loss": 0.0004, "step": 16544 }, { "epoch": 4.364595699775755, "grad_norm": 0.011691977269947529, "learning_rate": 3.758375098918491e-06, "loss": 0.0004, "step": 16546 }, { "epoch": 4.36512333465242, "grad_norm": 0.016717100515961647, "learning_rate": 3.7580233887276885e-06, "loss": 0.0014, "step": 16548 }, { "epoch": 4.365650969529086, "grad_norm": 0.215570330619812, "learning_rate": 3.757671678536886e-06, "loss": 0.0084, "step": 16550 }, { "epoch": 4.366178604405751, "grad_norm": 0.019854312762618065, "learning_rate": 3.7573199683460824e-06, "loss": 0.0005, "step": 16552 }, { "epoch": 4.366706239282417, "grad_norm": 0.13807708024978638, "learning_rate": 3.75696825815528e-06, "loss": 0.0014, "step": 16554 }, { "epoch": 4.367233874159082, "grad_norm": 0.10278207063674927, "learning_rate": 3.756616547964477e-06, "loss": 0.0014, "step": 16556 }, { "epoch": 4.367761509035748, "grad_norm": 0.01227511279284954, "learning_rate": 3.756264837773675e-06, "loss": 0.0003, "step": 16558 }, { "epoch": 4.3682891439124125, "grad_norm": 0.056891437619924545, "learning_rate": 3.7559131275828715e-06, "loss": 0.0014, "step": 16560 }, { "epoch": 4.368816778789078, "grad_norm": 0.010194867849349976, "learning_rate": 3.755561417392069e-06, "loss": 0.0003, "step": 16562 }, { "epoch": 4.369344413665743, "grad_norm": 0.006097650621086359, "learning_rate": 3.7552097072012663e-06, "loss": 0.0011, "step": 16564 }, { "epoch": 4.369872048542408, "grad_norm": 0.006967099849134684, "learning_rate": 3.7548579970104636e-06, "loss": 0.0021, "step": 16566 }, { "epoch": 4.370399683419074, "grad_norm": 0.0081017529591918, "learning_rate": 3.7545062868196606e-06, "loss": 0.0002, "step": 16568 }, { "epoch": 4.370927318295739, "grad_norm": 0.02072901464998722, "learning_rate": 3.754154576628858e-06, "loss": 0.0003, "step": 16570 }, { "epoch": 4.371454953172405, "grad_norm": 0.01773720607161522, "learning_rate": 3.7538028664380553e-06, "loss": 0.0003, "step": 16572 }, { "epoch": 4.37198258804907, "grad_norm": 0.004876228049397469, "learning_rate": 3.7534511562472527e-06, "loss": 0.009, "step": 16574 }, { "epoch": 4.372510222925736, "grad_norm": 0.4372299909591675, "learning_rate": 3.7530994460564493e-06, "loss": 0.0014, "step": 16576 }, { "epoch": 4.373037857802401, "grad_norm": 0.007408834993839264, "learning_rate": 3.7527477358656466e-06, "loss": 0.0002, "step": 16578 }, { "epoch": 4.373565492679066, "grad_norm": 0.009722205810248852, "learning_rate": 3.752396025674844e-06, "loss": 0.0002, "step": 16580 }, { "epoch": 4.3740931275557315, "grad_norm": 0.010274361819028854, "learning_rate": 3.752044315484041e-06, "loss": 0.0004, "step": 16582 }, { "epoch": 4.3746207624323965, "grad_norm": 0.00818052887916565, "learning_rate": 3.7516926052932383e-06, "loss": 0.0004, "step": 16584 }, { "epoch": 4.375148397309062, "grad_norm": 0.03240134194493294, "learning_rate": 3.7513408951024357e-06, "loss": 0.0002, "step": 16586 }, { "epoch": 4.375676032185727, "grad_norm": 0.007717230822890997, "learning_rate": 3.750989184911633e-06, "loss": 0.0097, "step": 16588 }, { "epoch": 4.376203667062393, "grad_norm": 0.188811257481575, "learning_rate": 3.75063747472083e-06, "loss": 0.0007, "step": 16590 }, { "epoch": 4.376731301939058, "grad_norm": 0.006873037666082382, "learning_rate": 3.7502857645300274e-06, "loss": 0.0001, "step": 16592 }, { "epoch": 4.377258936815723, "grad_norm": 0.01513624470680952, "learning_rate": 3.749934054339225e-06, "loss": 0.0003, "step": 16594 }, { "epoch": 4.377786571692389, "grad_norm": 0.044714875519275665, "learning_rate": 3.749582344148422e-06, "loss": 0.0002, "step": 16596 }, { "epoch": 4.378314206569054, "grad_norm": 0.12697216868400574, "learning_rate": 3.7492306339576187e-06, "loss": 0.0005, "step": 16598 }, { "epoch": 4.37884184144572, "grad_norm": 0.3396361470222473, "learning_rate": 3.748878923766816e-06, "loss": 0.0019, "step": 16600 }, { "epoch": 4.379369476322385, "grad_norm": 0.0869160145521164, "learning_rate": 3.7485272135760134e-06, "loss": 0.0004, "step": 16602 }, { "epoch": 4.379897111199051, "grad_norm": 0.14413438737392426, "learning_rate": 3.748175503385211e-06, "loss": 0.0006, "step": 16604 }, { "epoch": 4.3804247460757155, "grad_norm": 0.0951743870973587, "learning_rate": 3.7478237931944078e-06, "loss": 0.0035, "step": 16606 }, { "epoch": 4.380952380952381, "grad_norm": 0.004902318585664034, "learning_rate": 3.747472083003605e-06, "loss": 0.0003, "step": 16608 }, { "epoch": 4.381480015829046, "grad_norm": 0.07072292268276215, "learning_rate": 3.7471203728128025e-06, "loss": 0.0006, "step": 16610 }, { "epoch": 4.382007650705711, "grad_norm": 0.11522157490253448, "learning_rate": 3.746768662621999e-06, "loss": 0.0007, "step": 16612 }, { "epoch": 4.382535285582377, "grad_norm": 0.002461787313222885, "learning_rate": 3.7464169524311964e-06, "loss": 0.0012, "step": 16614 }, { "epoch": 4.383062920459042, "grad_norm": 0.032880835235118866, "learning_rate": 3.7460652422403942e-06, "loss": 0.0021, "step": 16616 }, { "epoch": 4.383590555335708, "grad_norm": 0.005082794930785894, "learning_rate": 3.7457135320495916e-06, "loss": 0.0001, "step": 16618 }, { "epoch": 4.384118190212373, "grad_norm": 0.3471984267234802, "learning_rate": 3.745361821858788e-06, "loss": 0.0074, "step": 16620 }, { "epoch": 4.384645825089039, "grad_norm": 0.0026859662029892206, "learning_rate": 3.7450101116679855e-06, "loss": 0.0004, "step": 16622 }, { "epoch": 4.385173459965704, "grad_norm": 0.04076172783970833, "learning_rate": 3.744658401477183e-06, "loss": 0.0002, "step": 16624 }, { "epoch": 4.385701094842369, "grad_norm": 0.143970787525177, "learning_rate": 3.7443066912863803e-06, "loss": 0.0004, "step": 16626 }, { "epoch": 4.3862287297190345, "grad_norm": 0.004945780150592327, "learning_rate": 3.7439549810955772e-06, "loss": 0.0035, "step": 16628 }, { "epoch": 4.3867563645956995, "grad_norm": 0.07901127636432648, "learning_rate": 3.7436032709047746e-06, "loss": 0.0003, "step": 16630 }, { "epoch": 4.387283999472365, "grad_norm": 0.00567342946305871, "learning_rate": 3.743251560713972e-06, "loss": 0.0041, "step": 16632 }, { "epoch": 4.38781163434903, "grad_norm": 0.0557495653629303, "learning_rate": 3.7428998505231693e-06, "loss": 0.0029, "step": 16634 }, { "epoch": 4.388339269225696, "grad_norm": 0.019195128232240677, "learning_rate": 3.742548140332366e-06, "loss": 0.0002, "step": 16636 }, { "epoch": 4.388866904102361, "grad_norm": 0.05934744328260422, "learning_rate": 3.7421964301415632e-06, "loss": 0.003, "step": 16638 }, { "epoch": 4.389394538979026, "grad_norm": 0.0031030720565468073, "learning_rate": 3.7418447199507606e-06, "loss": 0.0001, "step": 16640 }, { "epoch": 4.389922173855692, "grad_norm": 0.0076874904334545135, "learning_rate": 3.7414930097599576e-06, "loss": 0.0001, "step": 16642 }, { "epoch": 4.390449808732357, "grad_norm": 0.5277312994003296, "learning_rate": 3.741141299569155e-06, "loss": 0.0066, "step": 16644 }, { "epoch": 4.390977443609023, "grad_norm": 0.024424327537417412, "learning_rate": 3.7407895893783523e-06, "loss": 0.0002, "step": 16646 }, { "epoch": 4.391505078485688, "grad_norm": 0.1718611866235733, "learning_rate": 3.7404378791875497e-06, "loss": 0.0006, "step": 16648 }, { "epoch": 4.392032713362354, "grad_norm": 0.20786647498607635, "learning_rate": 3.7400861689967467e-06, "loss": 0.0019, "step": 16650 }, { "epoch": 4.3925603482390185, "grad_norm": 0.06258576363325119, "learning_rate": 3.739734458805944e-06, "loss": 0.0033, "step": 16652 }, { "epoch": 4.393087983115684, "grad_norm": 0.06311975419521332, "learning_rate": 3.7393827486151414e-06, "loss": 0.0005, "step": 16654 }, { "epoch": 4.393615617992349, "grad_norm": 0.009238377213478088, "learning_rate": 3.7390310384243388e-06, "loss": 0.0002, "step": 16656 }, { "epoch": 4.394143252869014, "grad_norm": 0.43707001209259033, "learning_rate": 3.7386793282335353e-06, "loss": 0.0049, "step": 16658 }, { "epoch": 4.39467088774568, "grad_norm": 0.3874296247959137, "learning_rate": 3.7383276180427327e-06, "loss": 0.0023, "step": 16660 }, { "epoch": 4.395198522622345, "grad_norm": 0.028618933632969856, "learning_rate": 3.73797590785193e-06, "loss": 0.0002, "step": 16662 }, { "epoch": 4.395726157499011, "grad_norm": 0.04436352476477623, "learning_rate": 3.7376241976611274e-06, "loss": 0.0006, "step": 16664 }, { "epoch": 4.396253792375676, "grad_norm": 0.011451479978859425, "learning_rate": 3.7372724874703244e-06, "loss": 0.0002, "step": 16666 }, { "epoch": 4.396781427252342, "grad_norm": 0.02500896155834198, "learning_rate": 3.7369207772795218e-06, "loss": 0.0019, "step": 16668 }, { "epoch": 4.397309062129007, "grad_norm": 0.014806645922362804, "learning_rate": 3.736569067088719e-06, "loss": 0.0001, "step": 16670 }, { "epoch": 4.397836697005672, "grad_norm": 0.012027638033032417, "learning_rate": 3.7362173568979157e-06, "loss": 0.0002, "step": 16672 }, { "epoch": 4.3983643318823376, "grad_norm": 0.08642548322677612, "learning_rate": 3.7358656467071135e-06, "loss": 0.0021, "step": 16674 }, { "epoch": 4.3988919667590025, "grad_norm": 0.08909928053617477, "learning_rate": 3.735513936516311e-06, "loss": 0.0003, "step": 16676 }, { "epoch": 4.399419601635668, "grad_norm": 0.3049388825893402, "learning_rate": 3.7351622263255082e-06, "loss": 0.0039, "step": 16678 }, { "epoch": 4.399947236512333, "grad_norm": 0.30886492133140564, "learning_rate": 3.7348105161347048e-06, "loss": 0.0012, "step": 16680 }, { "epoch": 4.400474871388999, "grad_norm": 0.0021712463349103928, "learning_rate": 3.734458805943902e-06, "loss": 0.0001, "step": 16682 }, { "epoch": 4.401002506265664, "grad_norm": 0.041878826916217804, "learning_rate": 3.7341070957530995e-06, "loss": 0.0003, "step": 16684 }, { "epoch": 4.401530141142329, "grad_norm": 0.03652232512831688, "learning_rate": 3.733755385562297e-06, "loss": 0.0065, "step": 16686 }, { "epoch": 4.402057776018995, "grad_norm": 0.04003726318478584, "learning_rate": 3.733403675371494e-06, "loss": 0.0002, "step": 16688 }, { "epoch": 4.40258541089566, "grad_norm": 0.02399357780814171, "learning_rate": 3.733051965180691e-06, "loss": 0.0003, "step": 16690 }, { "epoch": 4.403113045772326, "grad_norm": 0.03889581561088562, "learning_rate": 3.7327002549898886e-06, "loss": 0.0072, "step": 16692 }, { "epoch": 4.403640680648991, "grad_norm": 0.028280556201934814, "learning_rate": 3.732348544799086e-06, "loss": 0.0005, "step": 16694 }, { "epoch": 4.404168315525657, "grad_norm": 0.04442038759589195, "learning_rate": 3.7319968346082825e-06, "loss": 0.0049, "step": 16696 }, { "epoch": 4.4046959504023215, "grad_norm": 0.06962963938713074, "learning_rate": 3.73164512441748e-06, "loss": 0.0026, "step": 16698 }, { "epoch": 4.405223585278987, "grad_norm": 1.4593160152435303, "learning_rate": 3.7312934142266777e-06, "loss": 0.0064, "step": 16700 }, { "epoch": 4.405751220155652, "grad_norm": 0.04719037190079689, "learning_rate": 3.730941704035874e-06, "loss": 0.0004, "step": 16702 }, { "epoch": 4.406278855032317, "grad_norm": 0.6818158030509949, "learning_rate": 3.7305899938450716e-06, "loss": 0.0029, "step": 16704 }, { "epoch": 4.406806489908983, "grad_norm": 0.19914953410625458, "learning_rate": 3.730238283654269e-06, "loss": 0.0043, "step": 16706 }, { "epoch": 4.407334124785648, "grad_norm": 0.247419074177742, "learning_rate": 3.7298865734634663e-06, "loss": 0.0016, "step": 16708 }, { "epoch": 4.407861759662314, "grad_norm": 0.10441382974386215, "learning_rate": 3.7295348632726633e-06, "loss": 0.0051, "step": 16710 }, { "epoch": 4.408389394538979, "grad_norm": 0.044875383377075195, "learning_rate": 3.7291831530818606e-06, "loss": 0.0016, "step": 16712 }, { "epoch": 4.408917029415645, "grad_norm": 0.04868781194090843, "learning_rate": 3.728831442891058e-06, "loss": 0.0003, "step": 16714 }, { "epoch": 4.40944466429231, "grad_norm": 0.15626752376556396, "learning_rate": 3.7284797327002554e-06, "loss": 0.0004, "step": 16716 }, { "epoch": 4.409972299168975, "grad_norm": 0.06594283133745193, "learning_rate": 3.728128022509452e-06, "loss": 0.0005, "step": 16718 }, { "epoch": 4.410499934045641, "grad_norm": 0.007830563001334667, "learning_rate": 3.7277763123186493e-06, "loss": 0.0002, "step": 16720 }, { "epoch": 4.4110275689223055, "grad_norm": 0.21527531743049622, "learning_rate": 3.7274246021278467e-06, "loss": 0.0044, "step": 16722 }, { "epoch": 4.411555203798971, "grad_norm": 0.5050538182258606, "learning_rate": 3.727072891937044e-06, "loss": 0.004, "step": 16724 }, { "epoch": 4.412082838675636, "grad_norm": 0.2538295388221741, "learning_rate": 3.726721181746241e-06, "loss": 0.0017, "step": 16726 }, { "epoch": 4.412610473552302, "grad_norm": 0.015619097277522087, "learning_rate": 3.7263694715554384e-06, "loss": 0.0039, "step": 16728 }, { "epoch": 4.413138108428967, "grad_norm": 0.07590503990650177, "learning_rate": 3.7260177613646358e-06, "loss": 0.0009, "step": 16730 }, { "epoch": 4.413665743305632, "grad_norm": 0.019771292805671692, "learning_rate": 3.725666051173833e-06, "loss": 0.0005, "step": 16732 }, { "epoch": 4.414193378182298, "grad_norm": 0.21105992794036865, "learning_rate": 3.72531434098303e-06, "loss": 0.0028, "step": 16734 }, { "epoch": 4.414721013058963, "grad_norm": 0.028353553265333176, "learning_rate": 3.7249626307922275e-06, "loss": 0.0002, "step": 16736 }, { "epoch": 4.415248647935629, "grad_norm": 0.011975069530308247, "learning_rate": 3.724610920601425e-06, "loss": 0.0014, "step": 16738 }, { "epoch": 4.415776282812294, "grad_norm": 0.053150929510593414, "learning_rate": 3.7242592104106214e-06, "loss": 0.0064, "step": 16740 }, { "epoch": 4.41630391768896, "grad_norm": 0.014547325670719147, "learning_rate": 3.7239075002198187e-06, "loss": 0.0008, "step": 16742 }, { "epoch": 4.4168315525656245, "grad_norm": 0.029017100110650063, "learning_rate": 3.723555790029016e-06, "loss": 0.0007, "step": 16744 }, { "epoch": 4.41735918744229, "grad_norm": 0.02827637270092964, "learning_rate": 3.7232040798382135e-06, "loss": 0.0011, "step": 16746 }, { "epoch": 4.417886822318955, "grad_norm": 0.02582143060863018, "learning_rate": 3.7228523696474104e-06, "loss": 0.0003, "step": 16748 }, { "epoch": 4.41841445719562, "grad_norm": 0.00903269462287426, "learning_rate": 3.722500659456608e-06, "loss": 0.0004, "step": 16750 }, { "epoch": 4.418942092072286, "grad_norm": 0.026321392506361008, "learning_rate": 3.722148949265805e-06, "loss": 0.0004, "step": 16752 }, { "epoch": 4.419469726948951, "grad_norm": 0.2885950803756714, "learning_rate": 3.7217972390750026e-06, "loss": 0.0015, "step": 16754 }, { "epoch": 4.419997361825617, "grad_norm": 0.009013657458126545, "learning_rate": 3.721445528884199e-06, "loss": 0.0002, "step": 16756 }, { "epoch": 4.420524996702282, "grad_norm": 0.04191753268241882, "learning_rate": 3.721093818693397e-06, "loss": 0.0004, "step": 16758 }, { "epoch": 4.421052631578947, "grad_norm": 0.00888816174119711, "learning_rate": 3.7207421085025943e-06, "loss": 0.0004, "step": 16760 }, { "epoch": 4.421580266455613, "grad_norm": 0.3570042848587036, "learning_rate": 3.7203903983117917e-06, "loss": 0.0032, "step": 16762 }, { "epoch": 4.422107901332278, "grad_norm": 0.019974205642938614, "learning_rate": 3.720038688120988e-06, "loss": 0.0002, "step": 16764 }, { "epoch": 4.422635536208944, "grad_norm": 0.5064122080802917, "learning_rate": 3.7196869779301856e-06, "loss": 0.0036, "step": 16766 }, { "epoch": 4.4231631710856085, "grad_norm": 0.031491536647081375, "learning_rate": 3.719335267739383e-06, "loss": 0.0004, "step": 16768 }, { "epoch": 4.423690805962274, "grad_norm": 0.0028551232535392046, "learning_rate": 3.71898355754858e-06, "loss": 0.0028, "step": 16770 }, { "epoch": 4.424218440838939, "grad_norm": 0.004876549821346998, "learning_rate": 3.7186318473577773e-06, "loss": 0.0002, "step": 16772 }, { "epoch": 4.424746075715605, "grad_norm": 0.07191460579633713, "learning_rate": 3.7182801371669746e-06, "loss": 0.0004, "step": 16774 }, { "epoch": 4.42527371059227, "grad_norm": 0.01978413015604019, "learning_rate": 3.717928426976172e-06, "loss": 0.0005, "step": 16776 }, { "epoch": 4.425801345468935, "grad_norm": 0.01731666922569275, "learning_rate": 3.7175767167853685e-06, "loss": 0.0049, "step": 16778 }, { "epoch": 4.426328980345601, "grad_norm": 0.12284410744905472, "learning_rate": 3.717225006594566e-06, "loss": 0.0014, "step": 16780 }, { "epoch": 4.426856615222266, "grad_norm": 0.019162006676197052, "learning_rate": 3.7168732964037633e-06, "loss": 0.0003, "step": 16782 }, { "epoch": 4.427384250098932, "grad_norm": 0.12623508274555206, "learning_rate": 3.716521586212961e-06, "loss": 0.0061, "step": 16784 }, { "epoch": 4.427911884975597, "grad_norm": 0.3537874221801758, "learning_rate": 3.7161698760221576e-06, "loss": 0.0024, "step": 16786 }, { "epoch": 4.428439519852263, "grad_norm": 0.019119461998343468, "learning_rate": 3.715818165831355e-06, "loss": 0.0003, "step": 16788 }, { "epoch": 4.4289671547289275, "grad_norm": 0.003538319608196616, "learning_rate": 3.7154664556405524e-06, "loss": 0.0002, "step": 16790 }, { "epoch": 4.429494789605593, "grad_norm": 0.004692107904702425, "learning_rate": 3.7151147454497498e-06, "loss": 0.0016, "step": 16792 }, { "epoch": 4.430022424482258, "grad_norm": 0.04311230778694153, "learning_rate": 3.7147630352589467e-06, "loss": 0.0003, "step": 16794 }, { "epoch": 4.430550059358923, "grad_norm": 0.06615298986434937, "learning_rate": 3.714411325068144e-06, "loss": 0.0014, "step": 16796 }, { "epoch": 4.431077694235589, "grad_norm": 0.010588807985186577, "learning_rate": 3.7140596148773415e-06, "loss": 0.0002, "step": 16798 }, { "epoch": 4.431605329112254, "grad_norm": 0.27289602160453796, "learning_rate": 3.713707904686538e-06, "loss": 0.0139, "step": 16800 }, { "epoch": 4.43213296398892, "grad_norm": 0.013077586889266968, "learning_rate": 3.7133561944957354e-06, "loss": 0.0002, "step": 16802 }, { "epoch": 4.432660598865585, "grad_norm": 0.010306376032531261, "learning_rate": 3.7130044843049327e-06, "loss": 0.0002, "step": 16804 }, { "epoch": 4.43318823374225, "grad_norm": 0.003836392890661955, "learning_rate": 3.71265277411413e-06, "loss": 0.006, "step": 16806 }, { "epoch": 4.433715868618916, "grad_norm": 0.034620724618434906, "learning_rate": 3.712301063923327e-06, "loss": 0.0004, "step": 16808 }, { "epoch": 4.434243503495581, "grad_norm": 0.16447332501411438, "learning_rate": 3.7119493537325244e-06, "loss": 0.0037, "step": 16810 }, { "epoch": 4.434771138372247, "grad_norm": 0.01455045398324728, "learning_rate": 3.711597643541722e-06, "loss": 0.0002, "step": 16812 }, { "epoch": 4.4352987732489115, "grad_norm": 0.03204917907714844, "learning_rate": 3.711245933350919e-06, "loss": 0.0004, "step": 16814 }, { "epoch": 4.435826408125577, "grad_norm": 0.018228642642498016, "learning_rate": 3.710894223160116e-06, "loss": 0.0002, "step": 16816 }, { "epoch": 4.436354043002242, "grad_norm": 0.016353894025087357, "learning_rate": 3.7105425129693135e-06, "loss": 0.0003, "step": 16818 }, { "epoch": 4.436881677878908, "grad_norm": 0.10100340098142624, "learning_rate": 3.710190802778511e-06, "loss": 0.0004, "step": 16820 }, { "epoch": 4.437409312755573, "grad_norm": 0.05413922294974327, "learning_rate": 3.7098390925877083e-06, "loss": 0.0003, "step": 16822 }, { "epoch": 4.437936947632238, "grad_norm": 0.012727589346468449, "learning_rate": 3.709487382396905e-06, "loss": 0.0002, "step": 16824 }, { "epoch": 4.438464582508904, "grad_norm": 0.012451578862965107, "learning_rate": 3.709135672206102e-06, "loss": 0.0002, "step": 16826 }, { "epoch": 4.438992217385569, "grad_norm": 0.04807136580348015, "learning_rate": 3.7087839620152996e-06, "loss": 0.0004, "step": 16828 }, { "epoch": 4.439519852262235, "grad_norm": 0.1826871633529663, "learning_rate": 3.7084322518244965e-06, "loss": 0.0005, "step": 16830 }, { "epoch": 4.4400474871389, "grad_norm": 0.2050265371799469, "learning_rate": 3.708080541633694e-06, "loss": 0.0024, "step": 16832 }, { "epoch": 4.440575122015566, "grad_norm": 0.10321920365095139, "learning_rate": 3.7077288314428913e-06, "loss": 0.0014, "step": 16834 }, { "epoch": 4.4411027568922306, "grad_norm": 0.19882343709468842, "learning_rate": 3.7073771212520886e-06, "loss": 0.0062, "step": 16836 }, { "epoch": 4.4416303917688955, "grad_norm": 0.060548536479473114, "learning_rate": 3.707025411061285e-06, "loss": 0.0002, "step": 16838 }, { "epoch": 4.442158026645561, "grad_norm": 0.1993180364370346, "learning_rate": 3.7066737008704825e-06, "loss": 0.0008, "step": 16840 }, { "epoch": 4.442685661522226, "grad_norm": 0.2392488420009613, "learning_rate": 3.7063219906796803e-06, "loss": 0.0039, "step": 16842 }, { "epoch": 4.443213296398892, "grad_norm": 0.007813329808413982, "learning_rate": 3.7059702804888777e-06, "loss": 0.0001, "step": 16844 }, { "epoch": 4.443740931275557, "grad_norm": 0.0052410815842449665, "learning_rate": 3.7056185702980742e-06, "loss": 0.0004, "step": 16846 }, { "epoch": 4.444268566152223, "grad_norm": 0.8080492615699768, "learning_rate": 3.7052668601072716e-06, "loss": 0.0032, "step": 16848 }, { "epoch": 4.444796201028888, "grad_norm": 0.16802728176116943, "learning_rate": 3.704915149916469e-06, "loss": 0.0004, "step": 16850 }, { "epoch": 4.445323835905553, "grad_norm": 0.010497014969587326, "learning_rate": 3.7045634397256664e-06, "loss": 0.0016, "step": 16852 }, { "epoch": 4.445851470782219, "grad_norm": 0.00618558656424284, "learning_rate": 3.7042117295348633e-06, "loss": 0.0002, "step": 16854 }, { "epoch": 4.446379105658884, "grad_norm": 0.0075432006269693375, "learning_rate": 3.7038600193440607e-06, "loss": 0.0042, "step": 16856 }, { "epoch": 4.44690674053555, "grad_norm": 0.013135206885635853, "learning_rate": 3.703508309153258e-06, "loss": 0.0002, "step": 16858 }, { "epoch": 4.4474343754122145, "grad_norm": 0.6862590909004211, "learning_rate": 3.7031565989624546e-06, "loss": 0.0033, "step": 16860 }, { "epoch": 4.44796201028888, "grad_norm": 0.04268990084528923, "learning_rate": 3.702804888771652e-06, "loss": 0.0003, "step": 16862 }, { "epoch": 4.448489645165545, "grad_norm": 0.31717178225517273, "learning_rate": 3.7024531785808494e-06, "loss": 0.001, "step": 16864 }, { "epoch": 4.449017280042211, "grad_norm": 0.40574413537979126, "learning_rate": 3.7021014683900467e-06, "loss": 0.0009, "step": 16866 }, { "epoch": 4.449544914918876, "grad_norm": 0.12475447356700897, "learning_rate": 3.7017497581992437e-06, "loss": 0.0014, "step": 16868 }, { "epoch": 4.450072549795541, "grad_norm": 0.02557365596294403, "learning_rate": 3.701398048008441e-06, "loss": 0.0004, "step": 16870 }, { "epoch": 4.450600184672207, "grad_norm": 0.009506426751613617, "learning_rate": 3.7010463378176384e-06, "loss": 0.0003, "step": 16872 }, { "epoch": 4.451127819548872, "grad_norm": 0.177870973944664, "learning_rate": 3.700694627626836e-06, "loss": 0.0085, "step": 16874 }, { "epoch": 4.451655454425538, "grad_norm": 0.024516437202692032, "learning_rate": 3.7003429174360328e-06, "loss": 0.0002, "step": 16876 }, { "epoch": 4.452183089302203, "grad_norm": 0.003617997048422694, "learning_rate": 3.69999120724523e-06, "loss": 0.0004, "step": 16878 }, { "epoch": 4.452710724178869, "grad_norm": 0.040196385234594345, "learning_rate": 3.6996394970544275e-06, "loss": 0.0074, "step": 16880 }, { "epoch": 4.453238359055534, "grad_norm": 0.16066354513168335, "learning_rate": 3.699287786863625e-06, "loss": 0.0015, "step": 16882 }, { "epoch": 4.4537659939321985, "grad_norm": 0.024545686319470406, "learning_rate": 3.6989360766728214e-06, "loss": 0.0002, "step": 16884 }, { "epoch": 4.454293628808864, "grad_norm": 1.2739779949188232, "learning_rate": 3.698584366482019e-06, "loss": 0.0046, "step": 16886 }, { "epoch": 4.454821263685529, "grad_norm": 0.009247357957065105, "learning_rate": 3.698232656291216e-06, "loss": 0.0002, "step": 16888 }, { "epoch": 4.455348898562195, "grad_norm": 0.07453419268131256, "learning_rate": 3.697880946100413e-06, "loss": 0.0011, "step": 16890 }, { "epoch": 4.45587653343886, "grad_norm": 0.17677782475948334, "learning_rate": 3.6975292359096105e-06, "loss": 0.0008, "step": 16892 }, { "epoch": 4.456404168315526, "grad_norm": 0.19624702632427216, "learning_rate": 3.697177525718808e-06, "loss": 0.0011, "step": 16894 }, { "epoch": 4.456931803192191, "grad_norm": 0.2849843204021454, "learning_rate": 3.6968258155280052e-06, "loss": 0.0036, "step": 16896 }, { "epoch": 4.457459438068856, "grad_norm": 0.07823050767183304, "learning_rate": 3.6964741053372018e-06, "loss": 0.0005, "step": 16898 }, { "epoch": 4.457987072945522, "grad_norm": 0.6586148142814636, "learning_rate": 3.6961223951463996e-06, "loss": 0.0024, "step": 16900 }, { "epoch": 4.458514707822187, "grad_norm": 0.004171679262071848, "learning_rate": 3.695770684955597e-06, "loss": 0.0005, "step": 16902 }, { "epoch": 4.459042342698853, "grad_norm": 0.05905339866876602, "learning_rate": 3.6954189747647943e-06, "loss": 0.0004, "step": 16904 }, { "epoch": 4.4595699775755175, "grad_norm": 0.017910510301589966, "learning_rate": 3.695067264573991e-06, "loss": 0.0043, "step": 16906 }, { "epoch": 4.460097612452183, "grad_norm": 0.05483406037092209, "learning_rate": 3.6947155543831882e-06, "loss": 0.0008, "step": 16908 }, { "epoch": 4.460625247328848, "grad_norm": 0.039858460426330566, "learning_rate": 3.6943638441923856e-06, "loss": 0.0003, "step": 16910 }, { "epoch": 4.461152882205514, "grad_norm": 0.021894022822380066, "learning_rate": 3.694012134001583e-06, "loss": 0.0002, "step": 16912 }, { "epoch": 4.461680517082179, "grad_norm": 0.05257827416062355, "learning_rate": 3.69366042381078e-06, "loss": 0.0079, "step": 16914 }, { "epoch": 4.462208151958844, "grad_norm": 0.3883807361125946, "learning_rate": 3.6933087136199773e-06, "loss": 0.0012, "step": 16916 }, { "epoch": 4.46273578683551, "grad_norm": 0.011725736781954765, "learning_rate": 3.6929570034291747e-06, "loss": 0.0004, "step": 16918 }, { "epoch": 4.463263421712175, "grad_norm": 0.06344228982925415, "learning_rate": 3.6926052932383712e-06, "loss": 0.0006, "step": 16920 }, { "epoch": 4.463791056588841, "grad_norm": 0.011217758990824223, "learning_rate": 3.6922535830475686e-06, "loss": 0.0002, "step": 16922 }, { "epoch": 4.464318691465506, "grad_norm": 0.24580146372318268, "learning_rate": 3.691901872856766e-06, "loss": 0.0008, "step": 16924 }, { "epoch": 4.464846326342172, "grad_norm": 0.5650198459625244, "learning_rate": 3.6915501626659638e-06, "loss": 0.0019, "step": 16926 }, { "epoch": 4.465373961218837, "grad_norm": 0.29386040568351746, "learning_rate": 3.6911984524751603e-06, "loss": 0.0111, "step": 16928 }, { "epoch": 4.4659015960955015, "grad_norm": 0.12390121072530746, "learning_rate": 3.6908467422843577e-06, "loss": 0.0037, "step": 16930 }, { "epoch": 4.466429230972167, "grad_norm": 0.002348080277442932, "learning_rate": 3.690495032093555e-06, "loss": 0.0002, "step": 16932 }, { "epoch": 4.466956865848832, "grad_norm": 0.10032326728105545, "learning_rate": 3.6901433219027524e-06, "loss": 0.0003, "step": 16934 }, { "epoch": 4.467484500725498, "grad_norm": 0.20294518768787384, "learning_rate": 3.6897916117119494e-06, "loss": 0.0008, "step": 16936 }, { "epoch": 4.468012135602163, "grad_norm": 0.01848726160824299, "learning_rate": 3.6894399015211468e-06, "loss": 0.0002, "step": 16938 }, { "epoch": 4.468539770478829, "grad_norm": 0.0066950577311217785, "learning_rate": 3.689088191330344e-06, "loss": 0.0007, "step": 16940 }, { "epoch": 4.469067405355494, "grad_norm": 0.10151255875825882, "learning_rate": 3.6887364811395415e-06, "loss": 0.0003, "step": 16942 }, { "epoch": 4.469595040232159, "grad_norm": 0.006589781958609819, "learning_rate": 3.688384770948738e-06, "loss": 0.0007, "step": 16944 }, { "epoch": 4.470122675108825, "grad_norm": 0.043132103979587555, "learning_rate": 3.6880330607579354e-06, "loss": 0.0003, "step": 16946 }, { "epoch": 4.47065030998549, "grad_norm": 0.5841728448867798, "learning_rate": 3.6876813505671328e-06, "loss": 0.0039, "step": 16948 }, { "epoch": 4.471177944862156, "grad_norm": 0.7787566781044006, "learning_rate": 3.6873296403763297e-06, "loss": 0.0083, "step": 16950 }, { "epoch": 4.4717055797388205, "grad_norm": 0.005848669912666082, "learning_rate": 3.686977930185527e-06, "loss": 0.0001, "step": 16952 }, { "epoch": 4.472233214615486, "grad_norm": 0.04122437909245491, "learning_rate": 3.6866262199947245e-06, "loss": 0.0003, "step": 16954 }, { "epoch": 4.472760849492151, "grad_norm": 0.005167060066014528, "learning_rate": 3.686274509803922e-06, "loss": 0.0002, "step": 16956 }, { "epoch": 4.473288484368817, "grad_norm": 0.006973241921514273, "learning_rate": 3.685922799613119e-06, "loss": 0.0002, "step": 16958 }, { "epoch": 4.473816119245482, "grad_norm": 0.04568824544548988, "learning_rate": 3.685571089422316e-06, "loss": 0.0003, "step": 16960 }, { "epoch": 4.474343754122147, "grad_norm": 0.16156005859375, "learning_rate": 3.6852193792315136e-06, "loss": 0.0004, "step": 16962 }, { "epoch": 4.474871388998813, "grad_norm": 0.015258805826306343, "learning_rate": 3.684867669040711e-06, "loss": 0.0002, "step": 16964 }, { "epoch": 4.475399023875478, "grad_norm": 0.19261150062084198, "learning_rate": 3.6845159588499075e-06, "loss": 0.0009, "step": 16966 }, { "epoch": 4.475926658752144, "grad_norm": 0.09300478547811508, "learning_rate": 3.684164248659105e-06, "loss": 0.0022, "step": 16968 }, { "epoch": 4.476454293628809, "grad_norm": 0.008603483438491821, "learning_rate": 3.6838125384683022e-06, "loss": 0.0063, "step": 16970 }, { "epoch": 4.476981928505475, "grad_norm": 0.010516841895878315, "learning_rate": 3.6834608282774996e-06, "loss": 0.0116, "step": 16972 }, { "epoch": 4.47750956338214, "grad_norm": 0.4977281391620636, "learning_rate": 3.6831091180866966e-06, "loss": 0.0038, "step": 16974 }, { "epoch": 4.4780371982588045, "grad_norm": 0.024075202643871307, "learning_rate": 3.682757407895894e-06, "loss": 0.0081, "step": 16976 }, { "epoch": 4.47856483313547, "grad_norm": 0.016936223953962326, "learning_rate": 3.6824056977050913e-06, "loss": 0.0002, "step": 16978 }, { "epoch": 4.479092468012135, "grad_norm": 0.1915324628353119, "learning_rate": 3.682053987514288e-06, "loss": 0.0009, "step": 16980 }, { "epoch": 4.479620102888801, "grad_norm": 0.09169300645589828, "learning_rate": 3.6817022773234852e-06, "loss": 0.001, "step": 16982 }, { "epoch": 4.480147737765466, "grad_norm": 0.17367155849933624, "learning_rate": 3.681350567132683e-06, "loss": 0.0013, "step": 16984 }, { "epoch": 4.480675372642132, "grad_norm": 0.12162040919065475, "learning_rate": 3.6809988569418804e-06, "loss": 0.0009, "step": 16986 }, { "epoch": 4.481203007518797, "grad_norm": 0.013810724020004272, "learning_rate": 3.680647146751077e-06, "loss": 0.0032, "step": 16988 }, { "epoch": 4.481730642395462, "grad_norm": 0.030554722994565964, "learning_rate": 3.6802954365602743e-06, "loss": 0.0026, "step": 16990 }, { "epoch": 4.482258277272128, "grad_norm": 0.1817506104707718, "learning_rate": 3.6799437263694717e-06, "loss": 0.0008, "step": 16992 }, { "epoch": 4.482785912148793, "grad_norm": 0.0097825787961483, "learning_rate": 3.679592016178669e-06, "loss": 0.0003, "step": 16994 }, { "epoch": 4.483313547025459, "grad_norm": 0.009544762782752514, "learning_rate": 3.679240305987866e-06, "loss": 0.0002, "step": 16996 }, { "epoch": 4.4838411819021236, "grad_norm": 0.2166886329650879, "learning_rate": 3.6788885957970634e-06, "loss": 0.0126, "step": 16998 }, { "epoch": 4.484368816778789, "grad_norm": 0.1230948269367218, "learning_rate": 3.6785368856062607e-06, "loss": 0.0004, "step": 17000 }, { "epoch": 4.484896451655454, "grad_norm": 0.013940260745584965, "learning_rate": 3.678185175415458e-06, "loss": 0.0039, "step": 17002 }, { "epoch": 4.48542408653212, "grad_norm": 0.019580231979489326, "learning_rate": 3.6778334652246547e-06, "loss": 0.0002, "step": 17004 }, { "epoch": 4.485951721408785, "grad_norm": 0.08216147124767303, "learning_rate": 3.677481755033852e-06, "loss": 0.0006, "step": 17006 }, { "epoch": 4.48647935628545, "grad_norm": 0.01774238795042038, "learning_rate": 3.6771300448430494e-06, "loss": 0.0002, "step": 17008 }, { "epoch": 4.487006991162116, "grad_norm": 0.30738770961761475, "learning_rate": 3.6767783346522464e-06, "loss": 0.0007, "step": 17010 }, { "epoch": 4.487534626038781, "grad_norm": 0.06780872493982315, "learning_rate": 3.6764266244614437e-06, "loss": 0.0005, "step": 17012 }, { "epoch": 4.488062260915447, "grad_norm": 0.44477325677871704, "learning_rate": 3.676074914270641e-06, "loss": 0.0005, "step": 17014 }, { "epoch": 4.488589895792112, "grad_norm": 0.1685747504234314, "learning_rate": 3.6757232040798385e-06, "loss": 0.0019, "step": 17016 }, { "epoch": 4.489117530668778, "grad_norm": 0.012131011113524437, "learning_rate": 3.6753714938890354e-06, "loss": 0.0048, "step": 17018 }, { "epoch": 4.489645165545443, "grad_norm": 0.0566360242664814, "learning_rate": 3.675019783698233e-06, "loss": 0.0025, "step": 17020 }, { "epoch": 4.4901728004221075, "grad_norm": 0.04813653975725174, "learning_rate": 3.67466807350743e-06, "loss": 0.0003, "step": 17022 }, { "epoch": 4.490700435298773, "grad_norm": 0.012092206627130508, "learning_rate": 3.6743163633166276e-06, "loss": 0.0002, "step": 17024 }, { "epoch": 4.491228070175438, "grad_norm": 0.08910726010799408, "learning_rate": 3.673964653125824e-06, "loss": 0.0005, "step": 17026 }, { "epoch": 4.491755705052104, "grad_norm": 0.008583664894104004, "learning_rate": 3.6736129429350215e-06, "loss": 0.003, "step": 17028 }, { "epoch": 4.492283339928769, "grad_norm": 0.045435164123773575, "learning_rate": 3.673261232744219e-06, "loss": 0.0003, "step": 17030 }, { "epoch": 4.492810974805435, "grad_norm": 0.17886966466903687, "learning_rate": 3.6729095225534162e-06, "loss": 0.0005, "step": 17032 }, { "epoch": 4.4933386096821, "grad_norm": 0.17577561736106873, "learning_rate": 3.672557812362613e-06, "loss": 0.0006, "step": 17034 }, { "epoch": 4.493866244558765, "grad_norm": 0.01536682527512312, "learning_rate": 3.6722061021718105e-06, "loss": 0.0005, "step": 17036 }, { "epoch": 4.494393879435431, "grad_norm": 0.020472880452871323, "learning_rate": 3.671854391981008e-06, "loss": 0.0002, "step": 17038 }, { "epoch": 4.494921514312096, "grad_norm": 0.00930006057024002, "learning_rate": 3.6715026817902045e-06, "loss": 0.0005, "step": 17040 }, { "epoch": 4.495449149188762, "grad_norm": 0.018302544951438904, "learning_rate": 3.6711509715994023e-06, "loss": 0.0002, "step": 17042 }, { "epoch": 4.495976784065427, "grad_norm": 0.005293836817145348, "learning_rate": 3.6707992614085996e-06, "loss": 0.0032, "step": 17044 }, { "epoch": 4.496504418942092, "grad_norm": 0.011602486483752728, "learning_rate": 3.670447551217797e-06, "loss": 0.0035, "step": 17046 }, { "epoch": 4.497032053818757, "grad_norm": 0.03206409513950348, "learning_rate": 3.6700958410269935e-06, "loss": 0.0002, "step": 17048 }, { "epoch": 4.497559688695423, "grad_norm": 0.48588404059410095, "learning_rate": 3.669744130836191e-06, "loss": 0.0045, "step": 17050 }, { "epoch": 4.498087323572088, "grad_norm": 0.21482178568840027, "learning_rate": 3.6693924206453883e-06, "loss": 0.0049, "step": 17052 }, { "epoch": 4.498614958448753, "grad_norm": 0.11268188804388046, "learning_rate": 3.6690407104545857e-06, "loss": 0.0005, "step": 17054 }, { "epoch": 4.499142593325419, "grad_norm": 0.5435635447502136, "learning_rate": 3.6686890002637826e-06, "loss": 0.0014, "step": 17056 }, { "epoch": 4.499670228202084, "grad_norm": 0.018060898408293724, "learning_rate": 3.66833729007298e-06, "loss": 0.0004, "step": 17058 }, { "epoch": 4.50019786307875, "grad_norm": 0.011595341376960278, "learning_rate": 3.6679855798821774e-06, "loss": 0.0002, "step": 17060 }, { "epoch": 4.500725497955415, "grad_norm": 0.0027895302046090364, "learning_rate": 3.6676338696913747e-06, "loss": 0.0043, "step": 17062 }, { "epoch": 4.50125313283208, "grad_norm": 0.013629961758852005, "learning_rate": 3.6672821595005713e-06, "loss": 0.0002, "step": 17064 }, { "epoch": 4.501780767708746, "grad_norm": 0.008525987155735493, "learning_rate": 3.6669304493097686e-06, "loss": 0.0001, "step": 17066 }, { "epoch": 4.5023084025854105, "grad_norm": 0.004950609989464283, "learning_rate": 3.6665787391189664e-06, "loss": 0.0072, "step": 17068 }, { "epoch": 4.502836037462076, "grad_norm": 0.007659472990781069, "learning_rate": 3.666227028928163e-06, "loss": 0.0005, "step": 17070 }, { "epoch": 4.503363672338741, "grad_norm": 0.00341258873231709, "learning_rate": 3.6658753187373603e-06, "loss": 0.0003, "step": 17072 }, { "epoch": 4.503891307215407, "grad_norm": 0.015007239766418934, "learning_rate": 3.6655236085465577e-06, "loss": 0.0023, "step": 17074 }, { "epoch": 4.504418942092072, "grad_norm": 0.021804368123412132, "learning_rate": 3.665171898355755e-06, "loss": 0.0003, "step": 17076 }, { "epoch": 4.504946576968738, "grad_norm": 0.028525209054350853, "learning_rate": 3.664820188164952e-06, "loss": 0.0003, "step": 17078 }, { "epoch": 4.505474211845403, "grad_norm": 0.09691330790519714, "learning_rate": 3.6644684779741494e-06, "loss": 0.0003, "step": 17080 }, { "epoch": 4.506001846722068, "grad_norm": 0.009441880509257317, "learning_rate": 3.664116767783347e-06, "loss": 0.0023, "step": 17082 }, { "epoch": 4.506529481598734, "grad_norm": 0.364897221326828, "learning_rate": 3.663765057592544e-06, "loss": 0.0013, "step": 17084 }, { "epoch": 4.507057116475399, "grad_norm": 0.1509970724582672, "learning_rate": 3.6634133474017407e-06, "loss": 0.0007, "step": 17086 }, { "epoch": 4.507584751352065, "grad_norm": 0.008674307726323605, "learning_rate": 3.663061637210938e-06, "loss": 0.0003, "step": 17088 }, { "epoch": 4.50811238622873, "grad_norm": 0.280819833278656, "learning_rate": 3.6627099270201355e-06, "loss": 0.0009, "step": 17090 }, { "epoch": 4.508640021105395, "grad_norm": 0.011943675577640533, "learning_rate": 3.662358216829333e-06, "loss": 0.0003, "step": 17092 }, { "epoch": 4.50916765598206, "grad_norm": 0.02321954257786274, "learning_rate": 3.66200650663853e-06, "loss": 0.0005, "step": 17094 }, { "epoch": 4.509695290858726, "grad_norm": 0.02992870658636093, "learning_rate": 3.661654796447727e-06, "loss": 0.0003, "step": 17096 }, { "epoch": 4.510222925735391, "grad_norm": 0.19172105193138123, "learning_rate": 3.6613030862569245e-06, "loss": 0.0006, "step": 17098 }, { "epoch": 4.510750560612056, "grad_norm": 0.003161999862641096, "learning_rate": 3.6609513760661215e-06, "loss": 0.0002, "step": 17100 }, { "epoch": 4.511278195488722, "grad_norm": 0.027209291234612465, "learning_rate": 3.660599665875319e-06, "loss": 0.0002, "step": 17102 }, { "epoch": 4.511805830365387, "grad_norm": 0.07004880160093307, "learning_rate": 3.6602479556845162e-06, "loss": 0.0003, "step": 17104 }, { "epoch": 4.512333465242053, "grad_norm": 0.02933340333402157, "learning_rate": 3.6598962454937136e-06, "loss": 0.0002, "step": 17106 }, { "epoch": 4.512861100118718, "grad_norm": 0.019315052777528763, "learning_rate": 3.65954453530291e-06, "loss": 0.0022, "step": 17108 }, { "epoch": 4.513388734995383, "grad_norm": 0.44151780009269714, "learning_rate": 3.6591928251121075e-06, "loss": 0.0026, "step": 17110 }, { "epoch": 4.513916369872049, "grad_norm": 0.0042256927117705345, "learning_rate": 3.658841114921305e-06, "loss": 0.0002, "step": 17112 }, { "epoch": 4.5144440047487135, "grad_norm": 0.07618512958288193, "learning_rate": 3.6584894047305023e-06, "loss": 0.0003, "step": 17114 }, { "epoch": 4.514971639625379, "grad_norm": 0.08161848783493042, "learning_rate": 3.6581376945396992e-06, "loss": 0.0004, "step": 17116 }, { "epoch": 4.515499274502044, "grad_norm": 0.031147658824920654, "learning_rate": 3.6577859843488966e-06, "loss": 0.0003, "step": 17118 }, { "epoch": 4.51602690937871, "grad_norm": 0.8779159188270569, "learning_rate": 3.657434274158094e-06, "loss": 0.02, "step": 17120 }, { "epoch": 4.516554544255375, "grad_norm": 0.01011172030121088, "learning_rate": 3.6570825639672914e-06, "loss": 0.0002, "step": 17122 }, { "epoch": 4.517082179132041, "grad_norm": 0.007730013225227594, "learning_rate": 3.656730853776488e-06, "loss": 0.0004, "step": 17124 }, { "epoch": 4.517609814008706, "grad_norm": 0.7453497052192688, "learning_rate": 3.6563791435856857e-06, "loss": 0.0077, "step": 17126 }, { "epoch": 4.518137448885371, "grad_norm": 0.005123456008732319, "learning_rate": 3.656027433394883e-06, "loss": 0.0002, "step": 17128 }, { "epoch": 4.518665083762037, "grad_norm": 0.018857000395655632, "learning_rate": 3.6556757232040796e-06, "loss": 0.0002, "step": 17130 }, { "epoch": 4.519192718638702, "grad_norm": 0.7376148104667664, "learning_rate": 3.655324013013277e-06, "loss": 0.0044, "step": 17132 }, { "epoch": 4.519720353515368, "grad_norm": 0.10342801362276077, "learning_rate": 3.6549723028224743e-06, "loss": 0.0004, "step": 17134 }, { "epoch": 4.520247988392033, "grad_norm": 0.026445217430591583, "learning_rate": 3.6546205926316717e-06, "loss": 0.0003, "step": 17136 }, { "epoch": 4.520775623268698, "grad_norm": 0.029896676540374756, "learning_rate": 3.6542688824408687e-06, "loss": 0.0009, "step": 17138 }, { "epoch": 4.521303258145363, "grad_norm": 0.053252529352903366, "learning_rate": 3.653917172250066e-06, "loss": 0.0004, "step": 17140 }, { "epoch": 4.521830893022029, "grad_norm": 0.022314881905913353, "learning_rate": 3.6535654620592634e-06, "loss": 0.0017, "step": 17142 }, { "epoch": 4.522358527898694, "grad_norm": 0.06456093490123749, "learning_rate": 3.653213751868461e-06, "loss": 0.0003, "step": 17144 }, { "epoch": 4.522886162775359, "grad_norm": 0.06365892291069031, "learning_rate": 3.6528620416776573e-06, "loss": 0.0008, "step": 17146 }, { "epoch": 4.523413797652025, "grad_norm": 0.45305779576301575, "learning_rate": 3.6525103314868547e-06, "loss": 0.0014, "step": 17148 }, { "epoch": 4.52394143252869, "grad_norm": 0.09441360086202621, "learning_rate": 3.652158621296052e-06, "loss": 0.0009, "step": 17150 }, { "epoch": 4.524469067405356, "grad_norm": 0.013364649377763271, "learning_rate": 3.65180691110525e-06, "loss": 0.0036, "step": 17152 }, { "epoch": 4.524996702282021, "grad_norm": 0.008813456632196903, "learning_rate": 3.6514552009144464e-06, "loss": 0.0017, "step": 17154 }, { "epoch": 4.525524337158686, "grad_norm": 0.010730416513979435, "learning_rate": 3.6511034907236438e-06, "loss": 0.0054, "step": 17156 }, { "epoch": 4.526051972035352, "grad_norm": 0.012977665290236473, "learning_rate": 3.650751780532841e-06, "loss": 0.0005, "step": 17158 }, { "epoch": 4.5265796069120166, "grad_norm": 1.1937466859817505, "learning_rate": 3.650400070342038e-06, "loss": 0.0023, "step": 17160 }, { "epoch": 4.527107241788682, "grad_norm": 0.04143935441970825, "learning_rate": 3.6500483601512355e-06, "loss": 0.0115, "step": 17162 }, { "epoch": 4.527634876665347, "grad_norm": 0.02881716750562191, "learning_rate": 3.649696649960433e-06, "loss": 0.0002, "step": 17164 }, { "epoch": 4.528162511542013, "grad_norm": 0.03820120170712471, "learning_rate": 3.6493449397696302e-06, "loss": 0.0003, "step": 17166 }, { "epoch": 4.528690146418678, "grad_norm": 0.016155580058693886, "learning_rate": 3.6489932295788268e-06, "loss": 0.0002, "step": 17168 }, { "epoch": 4.529217781295344, "grad_norm": 0.017823241651058197, "learning_rate": 3.648641519388024e-06, "loss": 0.0014, "step": 17170 }, { "epoch": 4.529745416172009, "grad_norm": 0.03234166279435158, "learning_rate": 3.6482898091972215e-06, "loss": 0.0004, "step": 17172 }, { "epoch": 4.530273051048674, "grad_norm": 0.12130827456712723, "learning_rate": 3.647938099006419e-06, "loss": 0.0004, "step": 17174 }, { "epoch": 4.53080068592534, "grad_norm": 0.005882737692445517, "learning_rate": 3.647586388815616e-06, "loss": 0.0001, "step": 17176 }, { "epoch": 4.531328320802005, "grad_norm": 0.0084995673969388, "learning_rate": 3.6472346786248132e-06, "loss": 0.0002, "step": 17178 }, { "epoch": 4.531855955678671, "grad_norm": 0.4868861734867096, "learning_rate": 3.6468829684340106e-06, "loss": 0.0008, "step": 17180 }, { "epoch": 4.532383590555336, "grad_norm": 0.33975183963775635, "learning_rate": 3.646531258243208e-06, "loss": 0.0024, "step": 17182 }, { "epoch": 4.532911225432001, "grad_norm": 0.0718270018696785, "learning_rate": 3.646179548052405e-06, "loss": 0.0016, "step": 17184 }, { "epoch": 4.533438860308666, "grad_norm": 0.513364851474762, "learning_rate": 3.6458278378616023e-06, "loss": 0.0034, "step": 17186 }, { "epoch": 4.533966495185332, "grad_norm": 0.011209867894649506, "learning_rate": 3.6454761276707997e-06, "loss": 0.0002, "step": 17188 }, { "epoch": 4.534494130061997, "grad_norm": 0.05842909589409828, "learning_rate": 3.645124417479996e-06, "loss": 0.0005, "step": 17190 }, { "epoch": 4.535021764938662, "grad_norm": 0.2179500013589859, "learning_rate": 3.6447727072891936e-06, "loss": 0.0008, "step": 17192 }, { "epoch": 4.535549399815328, "grad_norm": 0.41874566674232483, "learning_rate": 3.644420997098391e-06, "loss": 0.004, "step": 17194 }, { "epoch": 4.536077034691993, "grad_norm": 0.004180282354354858, "learning_rate": 3.6440692869075883e-06, "loss": 0.0004, "step": 17196 }, { "epoch": 4.536604669568659, "grad_norm": 0.09104014188051224, "learning_rate": 3.6437175767167853e-06, "loss": 0.0005, "step": 17198 }, { "epoch": 4.537132304445324, "grad_norm": 0.13527999818325043, "learning_rate": 3.6433658665259827e-06, "loss": 0.0025, "step": 17200 }, { "epoch": 4.537659939321989, "grad_norm": 0.00545934634283185, "learning_rate": 3.64301415633518e-06, "loss": 0.0003, "step": 17202 }, { "epoch": 4.538187574198655, "grad_norm": 0.010156627744436264, "learning_rate": 3.6426624461443774e-06, "loss": 0.0012, "step": 17204 }, { "epoch": 4.53871520907532, "grad_norm": 0.1085263192653656, "learning_rate": 3.642310735953574e-06, "loss": 0.0032, "step": 17206 }, { "epoch": 4.539242843951985, "grad_norm": 0.7140331864356995, "learning_rate": 3.6419590257627713e-06, "loss": 0.0004, "step": 17208 }, { "epoch": 4.53977047882865, "grad_norm": 0.013637113384902477, "learning_rate": 3.641607315571969e-06, "loss": 0.0003, "step": 17210 }, { "epoch": 4.540298113705316, "grad_norm": 0.11849811673164368, "learning_rate": 3.6412556053811665e-06, "loss": 0.0007, "step": 17212 }, { "epoch": 4.540825748581981, "grad_norm": 0.05935148894786835, "learning_rate": 3.640903895190363e-06, "loss": 0.0004, "step": 17214 }, { "epoch": 4.541353383458647, "grad_norm": 0.01121502835303545, "learning_rate": 3.6405521849995604e-06, "loss": 0.0003, "step": 17216 }, { "epoch": 4.541881018335312, "grad_norm": 0.026440542191267014, "learning_rate": 3.6402004748087578e-06, "loss": 0.0004, "step": 17218 }, { "epoch": 4.542408653211977, "grad_norm": 0.007284343242645264, "learning_rate": 3.6398487646179547e-06, "loss": 0.0002, "step": 17220 }, { "epoch": 4.542936288088643, "grad_norm": 0.10473661869764328, "learning_rate": 3.639497054427152e-06, "loss": 0.0005, "step": 17222 }, { "epoch": 4.543463922965308, "grad_norm": 0.1215500682592392, "learning_rate": 3.6391453442363495e-06, "loss": 0.0046, "step": 17224 }, { "epoch": 4.543991557841974, "grad_norm": 0.08041120320558548, "learning_rate": 3.638793634045547e-06, "loss": 0.0005, "step": 17226 }, { "epoch": 4.544519192718639, "grad_norm": 0.05060645192861557, "learning_rate": 3.6384419238547434e-06, "loss": 0.0005, "step": 17228 }, { "epoch": 4.545046827595304, "grad_norm": 0.03475019335746765, "learning_rate": 3.6380902136639408e-06, "loss": 0.0002, "step": 17230 }, { "epoch": 4.545574462471969, "grad_norm": 0.1745293140411377, "learning_rate": 3.637738503473138e-06, "loss": 0.0027, "step": 17232 }, { "epoch": 4.546102097348635, "grad_norm": 0.1885538548231125, "learning_rate": 3.6373867932823355e-06, "loss": 0.001, "step": 17234 }, { "epoch": 4.5466297322253, "grad_norm": 0.093732088804245, "learning_rate": 3.6370350830915325e-06, "loss": 0.0006, "step": 17236 }, { "epoch": 4.547157367101965, "grad_norm": 0.015060706064105034, "learning_rate": 3.63668337290073e-06, "loss": 0.0008, "step": 17238 }, { "epoch": 4.547685001978631, "grad_norm": 0.01906280219554901, "learning_rate": 3.6363316627099272e-06, "loss": 0.0009, "step": 17240 }, { "epoch": 4.548212636855296, "grad_norm": 0.006422347389161587, "learning_rate": 3.6359799525191246e-06, "loss": 0.0002, "step": 17242 }, { "epoch": 4.548740271731962, "grad_norm": 0.011579311452805996, "learning_rate": 3.6356282423283215e-06, "loss": 0.0045, "step": 17244 }, { "epoch": 4.549267906608627, "grad_norm": 0.004380648955702782, "learning_rate": 3.635276532137519e-06, "loss": 0.0019, "step": 17246 }, { "epoch": 4.549795541485292, "grad_norm": 0.041969310492277145, "learning_rate": 3.6349248219467163e-06, "loss": 0.0018, "step": 17248 }, { "epoch": 4.550323176361958, "grad_norm": 0.45945122838020325, "learning_rate": 3.634573111755913e-06, "loss": 0.0048, "step": 17250 }, { "epoch": 4.550850811238623, "grad_norm": 0.022607319056987762, "learning_rate": 3.63422140156511e-06, "loss": 0.0022, "step": 17252 }, { "epoch": 4.551378446115288, "grad_norm": 0.227174773812294, "learning_rate": 3.6338696913743076e-06, "loss": 0.0009, "step": 17254 }, { "epoch": 4.551906080991953, "grad_norm": 0.03115147538483143, "learning_rate": 3.633517981183505e-06, "loss": 0.0002, "step": 17256 }, { "epoch": 4.552433715868619, "grad_norm": 0.5587384700775146, "learning_rate": 3.633166270992702e-06, "loss": 0.0031, "step": 17258 }, { "epoch": 4.552961350745284, "grad_norm": 0.007268073502928019, "learning_rate": 3.6328145608018993e-06, "loss": 0.0004, "step": 17260 }, { "epoch": 4.55348898562195, "grad_norm": 0.02468181401491165, "learning_rate": 3.6324628506110967e-06, "loss": 0.0002, "step": 17262 }, { "epoch": 4.554016620498615, "grad_norm": 0.0027504407335072756, "learning_rate": 3.632111140420294e-06, "loss": 0.0003, "step": 17264 }, { "epoch": 4.55454425537528, "grad_norm": 0.03019443340599537, "learning_rate": 3.6317594302294906e-06, "loss": 0.0002, "step": 17266 }, { "epoch": 4.555071890251946, "grad_norm": 0.011311667039990425, "learning_rate": 3.6314077200386884e-06, "loss": 0.0018, "step": 17268 }, { "epoch": 4.555599525128611, "grad_norm": 0.0019826097413897514, "learning_rate": 3.6310560098478857e-06, "loss": 0.0001, "step": 17270 }, { "epoch": 4.556127160005277, "grad_norm": 0.084577776491642, "learning_rate": 3.630704299657083e-06, "loss": 0.0005, "step": 17272 }, { "epoch": 4.556654794881942, "grad_norm": 0.10882766544818878, "learning_rate": 3.6303525894662796e-06, "loss": 0.0061, "step": 17274 }, { "epoch": 4.5571824297586065, "grad_norm": 0.14560005068778992, "learning_rate": 3.630000879275477e-06, "loss": 0.0034, "step": 17276 }, { "epoch": 4.557710064635272, "grad_norm": 0.3237191438674927, "learning_rate": 3.6296491690846744e-06, "loss": 0.0019, "step": 17278 }, { "epoch": 4.558237699511937, "grad_norm": 0.03174613416194916, "learning_rate": 3.6292974588938713e-06, "loss": 0.0003, "step": 17280 }, { "epoch": 4.558765334388603, "grad_norm": 0.008109305053949356, "learning_rate": 3.6289457487030687e-06, "loss": 0.0002, "step": 17282 }, { "epoch": 4.559292969265268, "grad_norm": 0.011679023504257202, "learning_rate": 3.628594038512266e-06, "loss": 0.0002, "step": 17284 }, { "epoch": 4.559820604141934, "grad_norm": 0.6175535321235657, "learning_rate": 3.6282423283214635e-06, "loss": 0.0079, "step": 17286 }, { "epoch": 4.560348239018599, "grad_norm": 0.014817330054938793, "learning_rate": 3.62789061813066e-06, "loss": 0.0002, "step": 17288 }, { "epoch": 4.560875873895265, "grad_norm": 0.0344846136868, "learning_rate": 3.6275389079398574e-06, "loss": 0.0003, "step": 17290 }, { "epoch": 4.56140350877193, "grad_norm": 0.005934268236160278, "learning_rate": 3.6271871977490548e-06, "loss": 0.0012, "step": 17292 }, { "epoch": 4.561931143648595, "grad_norm": 0.010071353986859322, "learning_rate": 3.626835487558252e-06, "loss": 0.0001, "step": 17294 }, { "epoch": 4.562458778525261, "grad_norm": 0.36571481823921204, "learning_rate": 3.626483777367449e-06, "loss": 0.0008, "step": 17296 }, { "epoch": 4.562986413401926, "grad_norm": 0.4319853186607361, "learning_rate": 3.6261320671766465e-06, "loss": 0.0069, "step": 17298 }, { "epoch": 4.563514048278591, "grad_norm": 0.003944070544093847, "learning_rate": 3.625780356985844e-06, "loss": 0.0001, "step": 17300 }, { "epoch": 4.564041683155256, "grad_norm": 0.04099253565073013, "learning_rate": 3.625428646795041e-06, "loss": 0.0002, "step": 17302 }, { "epoch": 4.564569318031922, "grad_norm": 0.3319357931613922, "learning_rate": 3.625076936604238e-06, "loss": 0.0019, "step": 17304 }, { "epoch": 4.565096952908587, "grad_norm": 0.2178669422864914, "learning_rate": 3.6247252264134355e-06, "loss": 0.0006, "step": 17306 }, { "epoch": 4.565624587785253, "grad_norm": 0.3120884597301483, "learning_rate": 3.624373516222633e-06, "loss": 0.0011, "step": 17308 }, { "epoch": 4.566152222661918, "grad_norm": 0.03314727544784546, "learning_rate": 3.6240218060318294e-06, "loss": 0.0002, "step": 17310 }, { "epoch": 4.566679857538583, "grad_norm": 0.002990593435242772, "learning_rate": 3.623670095841027e-06, "loss": 0.0003, "step": 17312 }, { "epoch": 4.567207492415249, "grad_norm": 0.004251797683537006, "learning_rate": 3.623318385650224e-06, "loss": 0.0001, "step": 17314 }, { "epoch": 4.567735127291914, "grad_norm": 0.007170961704105139, "learning_rate": 3.6229666754594216e-06, "loss": 0.0025, "step": 17316 }, { "epoch": 4.56826276216858, "grad_norm": 0.005017043557018042, "learning_rate": 3.6226149652686185e-06, "loss": 0.0001, "step": 17318 }, { "epoch": 4.568790397045245, "grad_norm": 0.005992566235363483, "learning_rate": 3.622263255077816e-06, "loss": 0.0003, "step": 17320 }, { "epoch": 4.5693180319219096, "grad_norm": 0.1839148849248886, "learning_rate": 3.6219115448870133e-06, "loss": 0.0006, "step": 17322 }, { "epoch": 4.569845666798575, "grad_norm": 0.004307563882321119, "learning_rate": 3.6215598346962106e-06, "loss": 0.0002, "step": 17324 }, { "epoch": 4.57037330167524, "grad_norm": 0.09322845190763474, "learning_rate": 3.6212081245054076e-06, "loss": 0.0043, "step": 17326 }, { "epoch": 4.570900936551906, "grad_norm": 0.7408265471458435, "learning_rate": 3.620856414314605e-06, "loss": 0.0025, "step": 17328 }, { "epoch": 4.571428571428571, "grad_norm": 0.2438998818397522, "learning_rate": 3.6205047041238024e-06, "loss": 0.0015, "step": 17330 }, { "epoch": 4.571956206305237, "grad_norm": 0.0031536875758320093, "learning_rate": 3.6201529939329997e-06, "loss": 0.0002, "step": 17332 }, { "epoch": 4.572483841181902, "grad_norm": 0.0036437183152884245, "learning_rate": 3.6198012837421963e-06, "loss": 0.0002, "step": 17334 }, { "epoch": 4.573011476058568, "grad_norm": 0.11527334898710251, "learning_rate": 3.6194495735513936e-06, "loss": 0.0002, "step": 17336 }, { "epoch": 4.573539110935233, "grad_norm": 0.02361145429313183, "learning_rate": 3.619097863360591e-06, "loss": 0.0002, "step": 17338 }, { "epoch": 4.574066745811898, "grad_norm": 0.01912532187998295, "learning_rate": 3.6187461531697884e-06, "loss": 0.001, "step": 17340 }, { "epoch": 4.574594380688564, "grad_norm": 0.02490398846566677, "learning_rate": 3.6183944429789853e-06, "loss": 0.0003, "step": 17342 }, { "epoch": 4.575122015565229, "grad_norm": 0.003084319643676281, "learning_rate": 3.6180427327881827e-06, "loss": 0.0001, "step": 17344 }, { "epoch": 4.575649650441894, "grad_norm": 0.0053060054779052734, "learning_rate": 3.61769102259738e-06, "loss": 0.0002, "step": 17346 }, { "epoch": 4.576177285318559, "grad_norm": 0.20979611575603485, "learning_rate": 3.6173393124065766e-06, "loss": 0.0147, "step": 17348 }, { "epoch": 4.576704920195225, "grad_norm": 0.0023496232461184263, "learning_rate": 3.616987602215774e-06, "loss": 0.0012, "step": 17350 }, { "epoch": 4.57723255507189, "grad_norm": 0.010137178935110569, "learning_rate": 3.6166358920249714e-06, "loss": 0.0002, "step": 17352 }, { "epoch": 4.577760189948556, "grad_norm": 0.013969630002975464, "learning_rate": 3.616284181834169e-06, "loss": 0.0049, "step": 17354 }, { "epoch": 4.578287824825221, "grad_norm": 0.00954892672598362, "learning_rate": 3.6159324716433657e-06, "loss": 0.0003, "step": 17356 }, { "epoch": 4.578815459701886, "grad_norm": 0.004375574178993702, "learning_rate": 3.615580761452563e-06, "loss": 0.0002, "step": 17358 }, { "epoch": 4.579343094578552, "grad_norm": 0.2976689338684082, "learning_rate": 3.6152290512617605e-06, "loss": 0.013, "step": 17360 }, { "epoch": 4.579870729455217, "grad_norm": 0.002683008788153529, "learning_rate": 3.614877341070958e-06, "loss": 0.0001, "step": 17362 }, { "epoch": 4.580398364331883, "grad_norm": 0.0022791149094700813, "learning_rate": 3.6145256308801548e-06, "loss": 0.0008, "step": 17364 }, { "epoch": 4.580925999208548, "grad_norm": 0.009066877886652946, "learning_rate": 3.614173920689352e-06, "loss": 0.0002, "step": 17366 }, { "epoch": 4.581453634085213, "grad_norm": 0.18134230375289917, "learning_rate": 3.6138222104985495e-06, "loss": 0.0009, "step": 17368 }, { "epoch": 4.581981268961878, "grad_norm": 0.3421534597873688, "learning_rate": 3.613470500307747e-06, "loss": 0.0017, "step": 17370 }, { "epoch": 4.582508903838543, "grad_norm": 0.020112669095396996, "learning_rate": 3.6131187901169434e-06, "loss": 0.0002, "step": 17372 }, { "epoch": 4.583036538715209, "grad_norm": 0.4011353850364685, "learning_rate": 3.612767079926141e-06, "loss": 0.004, "step": 17374 }, { "epoch": 4.583564173591874, "grad_norm": 0.011150750331580639, "learning_rate": 3.612415369735338e-06, "loss": 0.0003, "step": 17376 }, { "epoch": 4.58409180846854, "grad_norm": 0.015932774171233177, "learning_rate": 3.612063659544535e-06, "loss": 0.0002, "step": 17378 }, { "epoch": 4.584619443345205, "grad_norm": 0.014930874109268188, "learning_rate": 3.6117119493537325e-06, "loss": 0.0002, "step": 17380 }, { "epoch": 4.585147078221871, "grad_norm": 0.21035781502723694, "learning_rate": 3.61136023916293e-06, "loss": 0.0006, "step": 17382 }, { "epoch": 4.585674713098536, "grad_norm": 0.027212409302592278, "learning_rate": 3.6110085289721273e-06, "loss": 0.0002, "step": 17384 }, { "epoch": 4.586202347975201, "grad_norm": 0.7650735974311829, "learning_rate": 3.6106568187813242e-06, "loss": 0.0022, "step": 17386 }, { "epoch": 4.586729982851867, "grad_norm": 0.02953837439417839, "learning_rate": 3.6103051085905216e-06, "loss": 0.003, "step": 17388 }, { "epoch": 4.587257617728532, "grad_norm": 0.010424870997667313, "learning_rate": 3.609953398399719e-06, "loss": 0.0002, "step": 17390 }, { "epoch": 4.587785252605197, "grad_norm": 0.05535602942109108, "learning_rate": 3.6096016882089163e-06, "loss": 0.0003, "step": 17392 }, { "epoch": 4.588312887481862, "grad_norm": 0.4903623163700104, "learning_rate": 3.609249978018113e-06, "loss": 0.0016, "step": 17394 }, { "epoch": 4.588840522358528, "grad_norm": 0.006828059442341328, "learning_rate": 3.6088982678273103e-06, "loss": 0.0003, "step": 17396 }, { "epoch": 4.589368157235193, "grad_norm": 0.038644492626190186, "learning_rate": 3.6085465576365076e-06, "loss": 0.0003, "step": 17398 }, { "epoch": 4.589895792111859, "grad_norm": 0.013294837437570095, "learning_rate": 3.608194847445705e-06, "loss": 0.0001, "step": 17400 }, { "epoch": 4.590423426988524, "grad_norm": 0.753032386302948, "learning_rate": 3.607843137254902e-06, "loss": 0.011, "step": 17402 }, { "epoch": 4.590951061865189, "grad_norm": 0.007220265455543995, "learning_rate": 3.6074914270640993e-06, "loss": 0.0002, "step": 17404 }, { "epoch": 4.591478696741855, "grad_norm": 0.014989666640758514, "learning_rate": 3.6071397168732967e-06, "loss": 0.0002, "step": 17406 }, { "epoch": 4.59200633161852, "grad_norm": 0.00904763862490654, "learning_rate": 3.6067880066824932e-06, "loss": 0.0029, "step": 17408 }, { "epoch": 4.592533966495186, "grad_norm": 0.02747165784239769, "learning_rate": 3.6064362964916906e-06, "loss": 0.0002, "step": 17410 }, { "epoch": 4.593061601371851, "grad_norm": 0.6296753287315369, "learning_rate": 3.6060845863008884e-06, "loss": 0.0015, "step": 17412 }, { "epoch": 4.593589236248516, "grad_norm": 0.005378753878176212, "learning_rate": 3.6057328761100858e-06, "loss": 0.0002, "step": 17414 }, { "epoch": 4.594116871125181, "grad_norm": 0.07845966517925262, "learning_rate": 3.6053811659192823e-06, "loss": 0.0038, "step": 17416 }, { "epoch": 4.594644506001846, "grad_norm": 0.2882533371448517, "learning_rate": 3.6050294557284797e-06, "loss": 0.0105, "step": 17418 }, { "epoch": 4.595172140878512, "grad_norm": 0.004153863526880741, "learning_rate": 3.604677745537677e-06, "loss": 0.0001, "step": 17420 }, { "epoch": 4.595699775755177, "grad_norm": 0.13113224506378174, "learning_rate": 3.6043260353468744e-06, "loss": 0.0003, "step": 17422 }, { "epoch": 4.596227410631843, "grad_norm": 0.10540929436683655, "learning_rate": 3.6039743251560714e-06, "loss": 0.0008, "step": 17424 }, { "epoch": 4.596755045508508, "grad_norm": 0.03211156278848648, "learning_rate": 3.6036226149652688e-06, "loss": 0.0003, "step": 17426 }, { "epoch": 4.597282680385174, "grad_norm": 0.00756262568756938, "learning_rate": 3.603270904774466e-06, "loss": 0.0002, "step": 17428 }, { "epoch": 4.597810315261839, "grad_norm": 0.01359361782670021, "learning_rate": 3.6029191945836635e-06, "loss": 0.0036, "step": 17430 }, { "epoch": 4.598337950138504, "grad_norm": 0.01103373896330595, "learning_rate": 3.60256748439286e-06, "loss": 0.0002, "step": 17432 }, { "epoch": 4.59886558501517, "grad_norm": 0.37810394167900085, "learning_rate": 3.6022157742020574e-06, "loss": 0.0037, "step": 17434 }, { "epoch": 4.599393219891835, "grad_norm": 0.3477596044540405, "learning_rate": 3.601864064011255e-06, "loss": 0.007, "step": 17436 }, { "epoch": 4.5999208547685, "grad_norm": 0.03837345913052559, "learning_rate": 3.6015123538204518e-06, "loss": 0.0007, "step": 17438 }, { "epoch": 4.600448489645165, "grad_norm": 0.05210207402706146, "learning_rate": 3.601160643629649e-06, "loss": 0.0004, "step": 17440 }, { "epoch": 4.600976124521831, "grad_norm": 0.012519423849880695, "learning_rate": 3.6008089334388465e-06, "loss": 0.0004, "step": 17442 }, { "epoch": 4.601503759398496, "grad_norm": 0.4316253960132599, "learning_rate": 3.600457223248044e-06, "loss": 0.0025, "step": 17444 }, { "epoch": 4.602031394275162, "grad_norm": 0.028540119528770447, "learning_rate": 3.600105513057241e-06, "loss": 0.0003, "step": 17446 }, { "epoch": 4.602559029151827, "grad_norm": 0.04163114354014397, "learning_rate": 3.5997538028664382e-06, "loss": 0.0013, "step": 17448 }, { "epoch": 4.603086664028492, "grad_norm": 0.712627649307251, "learning_rate": 3.5994020926756356e-06, "loss": 0.0044, "step": 17450 }, { "epoch": 4.603614298905158, "grad_norm": 0.01091526448726654, "learning_rate": 3.599050382484833e-06, "loss": 0.0004, "step": 17452 }, { "epoch": 4.604141933781823, "grad_norm": 0.014512639492750168, "learning_rate": 3.5986986722940295e-06, "loss": 0.0007, "step": 17454 }, { "epoch": 4.604669568658489, "grad_norm": 0.00863723922520876, "learning_rate": 3.598346962103227e-06, "loss": 0.0045, "step": 17456 }, { "epoch": 4.605197203535154, "grad_norm": 0.5147298574447632, "learning_rate": 3.5979952519124242e-06, "loss": 0.0015, "step": 17458 }, { "epoch": 4.605724838411819, "grad_norm": 0.008592319674789906, "learning_rate": 3.5976435417216216e-06, "loss": 0.0002, "step": 17460 }, { "epoch": 4.606252473288484, "grad_norm": 0.017343631014227867, "learning_rate": 3.5972918315308186e-06, "loss": 0.0008, "step": 17462 }, { "epoch": 4.606780108165149, "grad_norm": 0.0782710611820221, "learning_rate": 3.596940121340016e-06, "loss": 0.0023, "step": 17464 }, { "epoch": 4.607307743041815, "grad_norm": 0.2816620171070099, "learning_rate": 3.5965884111492133e-06, "loss": 0.0035, "step": 17466 }, { "epoch": 4.60783537791848, "grad_norm": 0.14194385707378387, "learning_rate": 3.59623670095841e-06, "loss": 0.0006, "step": 17468 }, { "epoch": 4.608363012795146, "grad_norm": 0.02305811271071434, "learning_rate": 3.5958849907676077e-06, "loss": 0.0148, "step": 17470 }, { "epoch": 4.608890647671811, "grad_norm": 0.49895212054252625, "learning_rate": 3.595533280576805e-06, "loss": 0.0031, "step": 17472 }, { "epoch": 4.609418282548477, "grad_norm": 0.1342875361442566, "learning_rate": 3.5951815703860024e-06, "loss": 0.0007, "step": 17474 }, { "epoch": 4.609945917425142, "grad_norm": 0.024266749620437622, "learning_rate": 3.594829860195199e-06, "loss": 0.0003, "step": 17476 }, { "epoch": 4.610473552301807, "grad_norm": 0.07956655323505402, "learning_rate": 3.5944781500043963e-06, "loss": 0.0003, "step": 17478 }, { "epoch": 4.611001187178473, "grad_norm": 0.0037674710620194674, "learning_rate": 3.5941264398135937e-06, "loss": 0.0018, "step": 17480 }, { "epoch": 4.611528822055138, "grad_norm": 0.43381965160369873, "learning_rate": 3.593774729622791e-06, "loss": 0.0024, "step": 17482 }, { "epoch": 4.612056456931803, "grad_norm": 0.0526575930416584, "learning_rate": 3.593423019431988e-06, "loss": 0.0006, "step": 17484 }, { "epoch": 4.612584091808468, "grad_norm": 0.29617759585380554, "learning_rate": 3.5930713092411854e-06, "loss": 0.0056, "step": 17486 }, { "epoch": 4.613111726685134, "grad_norm": 0.17833563685417175, "learning_rate": 3.5927195990503828e-06, "loss": 0.0004, "step": 17488 }, { "epoch": 4.613639361561799, "grad_norm": 0.08139600604772568, "learning_rate": 3.59236788885958e-06, "loss": 0.004, "step": 17490 }, { "epoch": 4.614166996438465, "grad_norm": 0.020553473383188248, "learning_rate": 3.5920161786687767e-06, "loss": 0.003, "step": 17492 }, { "epoch": 4.61469463131513, "grad_norm": 0.03131626546382904, "learning_rate": 3.591664468477974e-06, "loss": 0.0003, "step": 17494 }, { "epoch": 4.615222266191795, "grad_norm": 0.09115313738584518, "learning_rate": 3.591312758287172e-06, "loss": 0.0035, "step": 17496 }, { "epoch": 4.615749901068461, "grad_norm": 0.18944256007671356, "learning_rate": 3.5909610480963684e-06, "loss": 0.0006, "step": 17498 }, { "epoch": 4.616277535945126, "grad_norm": 0.19168619811534882, "learning_rate": 3.5906093379055657e-06, "loss": 0.0041, "step": 17500 }, { "epoch": 4.616805170821792, "grad_norm": 0.02390487678349018, "learning_rate": 3.590257627714763e-06, "loss": 0.0022, "step": 17502 }, { "epoch": 4.617332805698457, "grad_norm": 0.04020779952406883, "learning_rate": 3.5899059175239605e-06, "loss": 0.0039, "step": 17504 }, { "epoch": 4.617860440575122, "grad_norm": 0.08853394538164139, "learning_rate": 3.5895542073331575e-06, "loss": 0.0004, "step": 17506 }, { "epoch": 4.618388075451787, "grad_norm": 0.02621692419052124, "learning_rate": 3.589202497142355e-06, "loss": 0.0003, "step": 17508 }, { "epoch": 4.618915710328452, "grad_norm": 0.06476103514432907, "learning_rate": 3.588850786951552e-06, "loss": 0.0007, "step": 17510 }, { "epoch": 4.619443345205118, "grad_norm": 0.03438713774085045, "learning_rate": 3.5884990767607496e-06, "loss": 0.0002, "step": 17512 }, { "epoch": 4.619970980081783, "grad_norm": 0.006950012408196926, "learning_rate": 3.588147366569946e-06, "loss": 0.0027, "step": 17514 }, { "epoch": 4.620498614958449, "grad_norm": 0.013917201198637486, "learning_rate": 3.5877956563791435e-06, "loss": 0.0002, "step": 17516 }, { "epoch": 4.621026249835114, "grad_norm": 0.05321337282657623, "learning_rate": 3.587443946188341e-06, "loss": 0.0027, "step": 17518 }, { "epoch": 4.62155388471178, "grad_norm": 0.04955146089196205, "learning_rate": 3.5870922359975382e-06, "loss": 0.0003, "step": 17520 }, { "epoch": 4.622081519588445, "grad_norm": 0.10501457750797272, "learning_rate": 3.586740525806735e-06, "loss": 0.0004, "step": 17522 }, { "epoch": 4.62260915446511, "grad_norm": 0.008467324078083038, "learning_rate": 3.5863888156159326e-06, "loss": 0.0002, "step": 17524 }, { "epoch": 4.623136789341776, "grad_norm": 0.004749752581119537, "learning_rate": 3.58603710542513e-06, "loss": 0.0002, "step": 17526 }, { "epoch": 4.623664424218441, "grad_norm": 0.003434853395447135, "learning_rate": 3.585685395234327e-06, "loss": 0.0003, "step": 17528 }, { "epoch": 4.624192059095106, "grad_norm": 0.008989880792796612, "learning_rate": 3.5853336850435243e-06, "loss": 0.002, "step": 17530 }, { "epoch": 4.624719693971771, "grad_norm": 0.023156872019171715, "learning_rate": 3.5849819748527216e-06, "loss": 0.0005, "step": 17532 }, { "epoch": 4.625247328848437, "grad_norm": 0.08537560701370239, "learning_rate": 3.584630264661919e-06, "loss": 0.0003, "step": 17534 }, { "epoch": 4.625774963725102, "grad_norm": 0.002319296356290579, "learning_rate": 3.5842785544711156e-06, "loss": 0.0017, "step": 17536 }, { "epoch": 4.626302598601768, "grad_norm": 0.5300726890563965, "learning_rate": 3.583926844280313e-06, "loss": 0.0021, "step": 17538 }, { "epoch": 4.626830233478433, "grad_norm": 0.01258411817252636, "learning_rate": 3.5835751340895103e-06, "loss": 0.0001, "step": 17540 }, { "epoch": 4.627357868355098, "grad_norm": 0.16439718008041382, "learning_rate": 3.5832234238987077e-06, "loss": 0.0003, "step": 17542 }, { "epoch": 4.627885503231764, "grad_norm": 0.0062669492326676846, "learning_rate": 3.5828717137079046e-06, "loss": 0.0001, "step": 17544 }, { "epoch": 4.628413138108429, "grad_norm": 0.0032317261211574078, "learning_rate": 3.582520003517102e-06, "loss": 0.0002, "step": 17546 }, { "epoch": 4.628940772985095, "grad_norm": 0.008345465175807476, "learning_rate": 3.5821682933262994e-06, "loss": 0.0002, "step": 17548 }, { "epoch": 4.62946840786176, "grad_norm": 0.04455753415822983, "learning_rate": 3.5818165831354968e-06, "loss": 0.0004, "step": 17550 }, { "epoch": 4.629996042738425, "grad_norm": 0.020702602341771126, "learning_rate": 3.5814648729446933e-06, "loss": 0.0027, "step": 17552 }, { "epoch": 4.63052367761509, "grad_norm": 0.11222758889198303, "learning_rate": 3.581113162753891e-06, "loss": 0.0031, "step": 17554 }, { "epoch": 4.631051312491755, "grad_norm": 0.5673640966415405, "learning_rate": 3.5807614525630885e-06, "loss": 0.0013, "step": 17556 }, { "epoch": 4.631578947368421, "grad_norm": 0.08145107328891754, "learning_rate": 3.580409742372285e-06, "loss": 0.0003, "step": 17558 }, { "epoch": 4.632106582245086, "grad_norm": 0.18092389404773712, "learning_rate": 3.5800580321814824e-06, "loss": 0.0006, "step": 17560 }, { "epoch": 4.632634217121752, "grad_norm": 0.25509709119796753, "learning_rate": 3.5797063219906797e-06, "loss": 0.0007, "step": 17562 }, { "epoch": 4.633161851998417, "grad_norm": 0.0029977643862366676, "learning_rate": 3.579354611799877e-06, "loss": 0.0001, "step": 17564 }, { "epoch": 4.633689486875083, "grad_norm": 0.003921999596059322, "learning_rate": 3.579002901609074e-06, "loss": 0.0001, "step": 17566 }, { "epoch": 4.634217121751748, "grad_norm": 0.00220699654892087, "learning_rate": 3.5786511914182714e-06, "loss": 0.0001, "step": 17568 }, { "epoch": 4.634744756628413, "grad_norm": 0.023191476240754128, "learning_rate": 3.578299481227469e-06, "loss": 0.0002, "step": 17570 }, { "epoch": 4.635272391505079, "grad_norm": 0.00319939898326993, "learning_rate": 3.577947771036666e-06, "loss": 0.0002, "step": 17572 }, { "epoch": 4.635800026381744, "grad_norm": 0.0018823780119419098, "learning_rate": 3.5775960608458627e-06, "loss": 0.0059, "step": 17574 }, { "epoch": 4.6363276612584094, "grad_norm": 0.00467119412496686, "learning_rate": 3.57724435065506e-06, "loss": 0.0037, "step": 17576 }, { "epoch": 4.636855296135074, "grad_norm": 0.0762011855840683, "learning_rate": 3.5768926404642575e-06, "loss": 0.0003, "step": 17578 }, { "epoch": 4.63738293101174, "grad_norm": 0.7832328677177429, "learning_rate": 3.5765409302734553e-06, "loss": 0.0042, "step": 17580 }, { "epoch": 4.637910565888405, "grad_norm": 0.1565455049276352, "learning_rate": 3.576189220082652e-06, "loss": 0.0003, "step": 17582 }, { "epoch": 4.63843820076507, "grad_norm": 0.008300150744616985, "learning_rate": 3.575837509891849e-06, "loss": 0.0003, "step": 17584 }, { "epoch": 4.638965835641736, "grad_norm": 0.17850463092327118, "learning_rate": 3.5754857997010466e-06, "loss": 0.0003, "step": 17586 }, { "epoch": 4.639493470518401, "grad_norm": 0.36164575815200806, "learning_rate": 3.5751340895102435e-06, "loss": 0.0033, "step": 17588 }, { "epoch": 4.640021105395067, "grad_norm": 0.005179708823561668, "learning_rate": 3.574782379319441e-06, "loss": 0.0003, "step": 17590 }, { "epoch": 4.640548740271732, "grad_norm": 0.01564096100628376, "learning_rate": 3.5744306691286383e-06, "loss": 0.0003, "step": 17592 }, { "epoch": 4.641076375148398, "grad_norm": 0.05147085338830948, "learning_rate": 3.5740789589378356e-06, "loss": 0.0003, "step": 17594 }, { "epoch": 4.641604010025063, "grad_norm": 0.08962289243936539, "learning_rate": 3.573727248747032e-06, "loss": 0.0019, "step": 17596 }, { "epoch": 4.642131644901728, "grad_norm": 0.5351833701133728, "learning_rate": 3.5733755385562295e-06, "loss": 0.0017, "step": 17598 }, { "epoch": 4.642659279778393, "grad_norm": 0.00786581914871931, "learning_rate": 3.573023828365427e-06, "loss": 0.0011, "step": 17600 }, { "epoch": 4.643186914655058, "grad_norm": 0.013957900926470757, "learning_rate": 3.5726721181746243e-06, "loss": 0.0028, "step": 17602 }, { "epoch": 4.643714549531724, "grad_norm": 0.32658085227012634, "learning_rate": 3.5723204079838212e-06, "loss": 0.0069, "step": 17604 }, { "epoch": 4.644242184408389, "grad_norm": 0.0051723383367061615, "learning_rate": 3.5719686977930186e-06, "loss": 0.0011, "step": 17606 }, { "epoch": 4.644769819285055, "grad_norm": 0.5784115195274353, "learning_rate": 3.571616987602216e-06, "loss": 0.0019, "step": 17608 }, { "epoch": 4.64529745416172, "grad_norm": 0.5380728840827942, "learning_rate": 3.5712652774114134e-06, "loss": 0.0107, "step": 17610 }, { "epoch": 4.645825089038386, "grad_norm": 0.014691413380205631, "learning_rate": 3.5709135672206103e-06, "loss": 0.0003, "step": 17612 }, { "epoch": 4.646352723915051, "grad_norm": 0.024123365059494972, "learning_rate": 3.5705618570298077e-06, "loss": 0.0003, "step": 17614 }, { "epoch": 4.646880358791716, "grad_norm": 0.03812114894390106, "learning_rate": 3.570210146839005e-06, "loss": 0.0055, "step": 17616 }, { "epoch": 4.647407993668382, "grad_norm": 0.16654568910598755, "learning_rate": 3.5698584366482016e-06, "loss": 0.0004, "step": 17618 }, { "epoch": 4.647935628545047, "grad_norm": 0.0507921501994133, "learning_rate": 3.569506726457399e-06, "loss": 0.0004, "step": 17620 }, { "epoch": 4.6484632634217125, "grad_norm": 0.009833579882979393, "learning_rate": 3.5691550162665964e-06, "loss": 0.0002, "step": 17622 }, { "epoch": 4.648990898298377, "grad_norm": 0.007917221635580063, "learning_rate": 3.5688033060757937e-06, "loss": 0.0006, "step": 17624 }, { "epoch": 4.649518533175042, "grad_norm": 0.006225932855159044, "learning_rate": 3.5684515958849907e-06, "loss": 0.0002, "step": 17626 }, { "epoch": 4.650046168051708, "grad_norm": 0.010190276429057121, "learning_rate": 3.568099885694188e-06, "loss": 0.0033, "step": 17628 }, { "epoch": 4.650573802928373, "grad_norm": 0.5002617835998535, "learning_rate": 3.5677481755033854e-06, "loss": 0.0025, "step": 17630 }, { "epoch": 4.651101437805039, "grad_norm": 0.013055557385087013, "learning_rate": 3.567396465312583e-06, "loss": 0.0002, "step": 17632 }, { "epoch": 4.651629072681704, "grad_norm": 0.00313532049767673, "learning_rate": 3.5670447551217793e-06, "loss": 0.0001, "step": 17634 }, { "epoch": 4.65215670755837, "grad_norm": 0.006406744942069054, "learning_rate": 3.5666930449309767e-06, "loss": 0.0018, "step": 17636 }, { "epoch": 4.652684342435035, "grad_norm": 0.0052948277443647385, "learning_rate": 3.5663413347401745e-06, "loss": 0.0001, "step": 17638 }, { "epoch": 4.653211977311701, "grad_norm": 0.012621531262993813, "learning_rate": 3.565989624549372e-06, "loss": 0.0002, "step": 17640 }, { "epoch": 4.653739612188366, "grad_norm": 0.003939883317798376, "learning_rate": 3.5656379143585684e-06, "loss": 0.0002, "step": 17642 }, { "epoch": 4.654267247065031, "grad_norm": 0.028214886784553528, "learning_rate": 3.565286204167766e-06, "loss": 0.0002, "step": 17644 }, { "epoch": 4.654794881941696, "grad_norm": 0.023147623986005783, "learning_rate": 3.564934493976963e-06, "loss": 0.0001, "step": 17646 }, { "epoch": 4.655322516818361, "grad_norm": 0.015177157707512379, "learning_rate": 3.56458278378616e-06, "loss": 0.0001, "step": 17648 }, { "epoch": 4.655850151695027, "grad_norm": 0.08437996357679367, "learning_rate": 3.5642310735953575e-06, "loss": 0.0025, "step": 17650 }, { "epoch": 4.656377786571692, "grad_norm": 0.7078818678855896, "learning_rate": 3.563879363404555e-06, "loss": 0.0031, "step": 17652 }, { "epoch": 4.656905421448358, "grad_norm": 0.5862777233123779, "learning_rate": 3.5635276532137523e-06, "loss": 0.0038, "step": 17654 }, { "epoch": 4.657433056325023, "grad_norm": 0.11762794107198715, "learning_rate": 3.5631759430229488e-06, "loss": 0.0011, "step": 17656 }, { "epoch": 4.657960691201689, "grad_norm": 0.03673502802848816, "learning_rate": 3.562824232832146e-06, "loss": 0.0003, "step": 17658 }, { "epoch": 4.658488326078354, "grad_norm": 0.00499454140663147, "learning_rate": 3.5624725226413435e-06, "loss": 0.0001, "step": 17660 }, { "epoch": 4.659015960955019, "grad_norm": 0.16169029474258423, "learning_rate": 3.562120812450541e-06, "loss": 0.0005, "step": 17662 }, { "epoch": 4.659543595831685, "grad_norm": 0.09749490022659302, "learning_rate": 3.561769102259738e-06, "loss": 0.0007, "step": 17664 }, { "epoch": 4.66007123070835, "grad_norm": 0.05309244245290756, "learning_rate": 3.5614173920689352e-06, "loss": 0.0007, "step": 17666 }, { "epoch": 4.6605988655850155, "grad_norm": 0.44323045015335083, "learning_rate": 3.5610656818781326e-06, "loss": 0.0014, "step": 17668 }, { "epoch": 4.66112650046168, "grad_norm": 0.3670271337032318, "learning_rate": 3.56071397168733e-06, "loss": 0.0015, "step": 17670 }, { "epoch": 4.661654135338345, "grad_norm": 0.0058629936538636684, "learning_rate": 3.560362261496527e-06, "loss": 0.0003, "step": 17672 }, { "epoch": 4.662181770215011, "grad_norm": 0.02882654406130314, "learning_rate": 3.5600105513057243e-06, "loss": 0.0003, "step": 17674 }, { "epoch": 4.662709405091676, "grad_norm": 0.007657256908714771, "learning_rate": 3.5596588411149217e-06, "loss": 0.0001, "step": 17676 }, { "epoch": 4.663237039968342, "grad_norm": 0.016880201175808907, "learning_rate": 3.5593071309241182e-06, "loss": 0.0002, "step": 17678 }, { "epoch": 4.663764674845007, "grad_norm": 0.00735127367079258, "learning_rate": 3.5589554207333156e-06, "loss": 0.0001, "step": 17680 }, { "epoch": 4.664292309721673, "grad_norm": 0.0843573734164238, "learning_rate": 3.558603710542513e-06, "loss": 0.0004, "step": 17682 }, { "epoch": 4.664819944598338, "grad_norm": 0.1329551339149475, "learning_rate": 3.5582520003517104e-06, "loss": 0.0004, "step": 17684 }, { "epoch": 4.665347579475004, "grad_norm": 0.006658731494098902, "learning_rate": 3.5579002901609073e-06, "loss": 0.0001, "step": 17686 }, { "epoch": 4.665875214351669, "grad_norm": 0.013846651650965214, "learning_rate": 3.5575485799701047e-06, "loss": 0.0002, "step": 17688 }, { "epoch": 4.666402849228334, "grad_norm": 0.20987281203269958, "learning_rate": 3.557196869779302e-06, "loss": 0.0002, "step": 17690 }, { "epoch": 4.666930484104999, "grad_norm": 0.004830559715628624, "learning_rate": 3.5568451595884994e-06, "loss": 0.0033, "step": 17692 }, { "epoch": 4.667458118981664, "grad_norm": 0.04700477421283722, "learning_rate": 3.556493449397696e-06, "loss": 0.0002, "step": 17694 }, { "epoch": 4.66798575385833, "grad_norm": 0.002177442889660597, "learning_rate": 3.5561417392068938e-06, "loss": 0.0001, "step": 17696 }, { "epoch": 4.668513388734995, "grad_norm": 0.3997281789779663, "learning_rate": 3.555790029016091e-06, "loss": 0.005, "step": 17698 }, { "epoch": 4.669041023611661, "grad_norm": 0.018742399290204048, "learning_rate": 3.5554383188252885e-06, "loss": 0.0005, "step": 17700 }, { "epoch": 4.669568658488326, "grad_norm": 0.009380063973367214, "learning_rate": 3.555086608634485e-06, "loss": 0.0042, "step": 17702 }, { "epoch": 4.670096293364992, "grad_norm": 0.0022299280390143394, "learning_rate": 3.5547348984436824e-06, "loss": 0.0001, "step": 17704 }, { "epoch": 4.670623928241657, "grad_norm": 0.0032939512748271227, "learning_rate": 3.55438318825288e-06, "loss": 0.0002, "step": 17706 }, { "epoch": 4.671151563118322, "grad_norm": 0.015708986669778824, "learning_rate": 3.5540314780620767e-06, "loss": 0.0003, "step": 17708 }, { "epoch": 4.671679197994988, "grad_norm": 0.011158489622175694, "learning_rate": 3.553679767871274e-06, "loss": 0.0047, "step": 17710 }, { "epoch": 4.672206832871653, "grad_norm": 0.05245424062013626, "learning_rate": 3.5533280576804715e-06, "loss": 0.0004, "step": 17712 }, { "epoch": 4.6727344677483185, "grad_norm": 0.03494587913155556, "learning_rate": 3.552976347489669e-06, "loss": 0.0005, "step": 17714 }, { "epoch": 4.673262102624983, "grad_norm": 0.033177003264427185, "learning_rate": 3.5526246372988654e-06, "loss": 0.0002, "step": 17716 }, { "epoch": 4.673789737501648, "grad_norm": 0.11414383351802826, "learning_rate": 3.5522729271080628e-06, "loss": 0.0003, "step": 17718 }, { "epoch": 4.674317372378314, "grad_norm": 0.003453126410022378, "learning_rate": 3.55192121691726e-06, "loss": 0.0001, "step": 17720 }, { "epoch": 4.674845007254979, "grad_norm": 0.010813470929861069, "learning_rate": 3.551569506726458e-06, "loss": 0.0002, "step": 17722 }, { "epoch": 4.675372642131645, "grad_norm": 0.003763052402064204, "learning_rate": 3.5512177965356545e-06, "loss": 0.0002, "step": 17724 }, { "epoch": 4.67590027700831, "grad_norm": 0.26768314838409424, "learning_rate": 3.550866086344852e-06, "loss": 0.011, "step": 17726 }, { "epoch": 4.676427911884976, "grad_norm": 0.04993162304162979, "learning_rate": 3.5505143761540492e-06, "loss": 0.0029, "step": 17728 }, { "epoch": 4.676955546761641, "grad_norm": 0.002746215322986245, "learning_rate": 3.5501626659632466e-06, "loss": 0.0001, "step": 17730 }, { "epoch": 4.677483181638307, "grad_norm": 0.003951991908252239, "learning_rate": 3.5498109557724436e-06, "loss": 0.0008, "step": 17732 }, { "epoch": 4.678010816514972, "grad_norm": 0.378145694732666, "learning_rate": 3.549459245581641e-06, "loss": 0.0006, "step": 17734 }, { "epoch": 4.678538451391637, "grad_norm": 0.4226040840148926, "learning_rate": 3.5491075353908383e-06, "loss": 0.0091, "step": 17736 }, { "epoch": 4.6790660862683024, "grad_norm": 0.004869726486504078, "learning_rate": 3.548755825200035e-06, "loss": 0.0014, "step": 17738 }, { "epoch": 4.679593721144967, "grad_norm": 0.00377068854868412, "learning_rate": 3.5484041150092322e-06, "loss": 0.0002, "step": 17740 }, { "epoch": 4.680121356021633, "grad_norm": 0.016132308170199394, "learning_rate": 3.5480524048184296e-06, "loss": 0.004, "step": 17742 }, { "epoch": 4.680648990898298, "grad_norm": 0.0587674044072628, "learning_rate": 3.547700694627627e-06, "loss": 0.0003, "step": 17744 }, { "epoch": 4.681176625774964, "grad_norm": 0.10766869783401489, "learning_rate": 3.547348984436824e-06, "loss": 0.0006, "step": 17746 }, { "epoch": 4.681704260651629, "grad_norm": 0.3721432089805603, "learning_rate": 3.547173129341423e-06, "loss": 0.0035, "step": 17748 }, { "epoch": 4.682231895528295, "grad_norm": 0.10298068821430206, "learning_rate": 3.54682141915062e-06, "loss": 0.0007, "step": 17750 }, { "epoch": 4.68275953040496, "grad_norm": 0.037146225571632385, "learning_rate": 3.546469708959817e-06, "loss": 0.0003, "step": 17752 }, { "epoch": 4.683287165281625, "grad_norm": 0.520212709903717, "learning_rate": 3.5461179987690145e-06, "loss": 0.0086, "step": 17754 }, { "epoch": 4.683814800158291, "grad_norm": 0.22155962884426117, "learning_rate": 3.545766288578212e-06, "loss": 0.0008, "step": 17756 }, { "epoch": 4.684342435034956, "grad_norm": 0.0035849220585078, "learning_rate": 3.5454145783874084e-06, "loss": 0.0003, "step": 17758 }, { "epoch": 4.6848700699116215, "grad_norm": 0.041607264429330826, "learning_rate": 3.545062868196606e-06, "loss": 0.0002, "step": 17760 }, { "epoch": 4.685397704788286, "grad_norm": 0.03843379020690918, "learning_rate": 3.544711158005803e-06, "loss": 0.0086, "step": 17762 }, { "epoch": 4.685925339664951, "grad_norm": 0.10279350727796555, "learning_rate": 3.5443594478150006e-06, "loss": 0.0004, "step": 17764 }, { "epoch": 4.686452974541617, "grad_norm": 0.1758967489004135, "learning_rate": 3.5440077376241975e-06, "loss": 0.0025, "step": 17766 }, { "epoch": 4.686980609418282, "grad_norm": 0.19008412957191467, "learning_rate": 3.543656027433395e-06, "loss": 0.0009, "step": 17768 }, { "epoch": 4.687508244294948, "grad_norm": 0.005814798176288605, "learning_rate": 3.5433043172425923e-06, "loss": 0.0001, "step": 17770 }, { "epoch": 4.688035879171613, "grad_norm": 0.5209631323814392, "learning_rate": 3.5429526070517896e-06, "loss": 0.0052, "step": 17772 }, { "epoch": 4.688563514048279, "grad_norm": 0.004256241023540497, "learning_rate": 3.5426008968609866e-06, "loss": 0.001, "step": 17774 }, { "epoch": 4.689091148924944, "grad_norm": 0.02326129376888275, "learning_rate": 3.542249186670184e-06, "loss": 0.0002, "step": 17776 }, { "epoch": 4.68961878380161, "grad_norm": 0.013717934489250183, "learning_rate": 3.5418974764793813e-06, "loss": 0.0008, "step": 17778 }, { "epoch": 4.690146418678275, "grad_norm": 0.003040014300495386, "learning_rate": 3.5415457662885787e-06, "loss": 0.0005, "step": 17780 }, { "epoch": 4.69067405355494, "grad_norm": 0.018516240641474724, "learning_rate": 3.5411940560977752e-06, "loss": 0.0002, "step": 17782 }, { "epoch": 4.6912016884316055, "grad_norm": 0.012299870140850544, "learning_rate": 3.5408423459069726e-06, "loss": 0.0002, "step": 17784 }, { "epoch": 4.69172932330827, "grad_norm": 0.12631623446941376, "learning_rate": 3.54049063571617e-06, "loss": 0.0003, "step": 17786 }, { "epoch": 4.692256958184936, "grad_norm": 0.008219659328460693, "learning_rate": 3.540138925525367e-06, "loss": 0.0001, "step": 17788 }, { "epoch": 4.692784593061601, "grad_norm": 0.003357442794367671, "learning_rate": 3.5397872153345643e-06, "loss": 0.0001, "step": 17790 }, { "epoch": 4.693312227938267, "grad_norm": 0.01221825648099184, "learning_rate": 3.5394355051437617e-06, "loss": 0.0003, "step": 17792 }, { "epoch": 4.693839862814932, "grad_norm": 0.02378598041832447, "learning_rate": 3.539083794952959e-06, "loss": 0.0002, "step": 17794 }, { "epoch": 4.694367497691598, "grad_norm": 0.4857470691204071, "learning_rate": 3.5387320847621556e-06, "loss": 0.0046, "step": 17796 }, { "epoch": 4.694895132568263, "grad_norm": 0.01793508790433407, "learning_rate": 3.5383803745713534e-06, "loss": 0.0002, "step": 17798 }, { "epoch": 4.695422767444928, "grad_norm": 0.05481494218111038, "learning_rate": 3.5380286643805508e-06, "loss": 0.0008, "step": 17800 }, { "epoch": 4.695950402321594, "grad_norm": 0.005704235751181841, "learning_rate": 3.537676954189748e-06, "loss": 0.0022, "step": 17802 }, { "epoch": 4.696478037198259, "grad_norm": 0.019086161628365517, "learning_rate": 3.5373252439989447e-06, "loss": 0.0003, "step": 17804 }, { "epoch": 4.6970056720749245, "grad_norm": 0.03131133317947388, "learning_rate": 3.536973533808142e-06, "loss": 0.0003, "step": 17806 }, { "epoch": 4.697533306951589, "grad_norm": 0.1309075951576233, "learning_rate": 3.5366218236173394e-06, "loss": 0.0004, "step": 17808 }, { "epoch": 4.698060941828254, "grad_norm": 0.08624521642923355, "learning_rate": 3.536270113426537e-06, "loss": 0.0004, "step": 17810 }, { "epoch": 4.69858857670492, "grad_norm": 0.11004336178302765, "learning_rate": 3.5359184032357338e-06, "loss": 0.0047, "step": 17812 }, { "epoch": 4.699116211581585, "grad_norm": 0.003487248905003071, "learning_rate": 3.535566693044931e-06, "loss": 0.0002, "step": 17814 }, { "epoch": 4.699643846458251, "grad_norm": 0.07222560048103333, "learning_rate": 3.5352149828541285e-06, "loss": 0.0004, "step": 17816 }, { "epoch": 4.700171481334916, "grad_norm": 0.02599034085869789, "learning_rate": 3.534863272663325e-06, "loss": 0.0023, "step": 17818 }, { "epoch": 4.700699116211582, "grad_norm": 0.025540949776768684, "learning_rate": 3.5345115624725224e-06, "loss": 0.0002, "step": 17820 }, { "epoch": 4.701226751088247, "grad_norm": 0.030717365443706512, "learning_rate": 3.53415985228172e-06, "loss": 0.0002, "step": 17822 }, { "epoch": 4.701754385964913, "grad_norm": 0.010175768285989761, "learning_rate": 3.5338081420909176e-06, "loss": 0.0002, "step": 17824 }, { "epoch": 4.702282020841578, "grad_norm": 0.2225414514541626, "learning_rate": 3.533456431900114e-06, "loss": 0.0003, "step": 17826 }, { "epoch": 4.702809655718243, "grad_norm": 0.014572570100426674, "learning_rate": 3.5331047217093115e-06, "loss": 0.0002, "step": 17828 }, { "epoch": 4.7033372905949085, "grad_norm": 0.07607205957174301, "learning_rate": 3.532753011518509e-06, "loss": 0.0024, "step": 17830 }, { "epoch": 4.703864925471573, "grad_norm": 0.008547266013920307, "learning_rate": 3.5324013013277063e-06, "loss": 0.0002, "step": 17832 }, { "epoch": 4.704392560348239, "grad_norm": 0.026681913062930107, "learning_rate": 3.532049591136903e-06, "loss": 0.0003, "step": 17834 }, { "epoch": 4.704920195224904, "grad_norm": 0.08699960261583328, "learning_rate": 3.5316978809461006e-06, "loss": 0.0005, "step": 17836 }, { "epoch": 4.70544783010157, "grad_norm": 0.8245229721069336, "learning_rate": 3.531346170755298e-06, "loss": 0.0111, "step": 17838 }, { "epoch": 4.705975464978235, "grad_norm": 0.30456632375717163, "learning_rate": 3.5309944605644953e-06, "loss": 0.0016, "step": 17840 }, { "epoch": 4.706503099854901, "grad_norm": 0.007945218123495579, "learning_rate": 3.530642750373692e-06, "loss": 0.0001, "step": 17842 }, { "epoch": 4.707030734731566, "grad_norm": 0.02977529913187027, "learning_rate": 3.5302910401828892e-06, "loss": 0.0002, "step": 17844 }, { "epoch": 4.707558369608231, "grad_norm": 0.06890704482793808, "learning_rate": 3.5299393299920866e-06, "loss": 0.0008, "step": 17846 }, { "epoch": 4.708086004484897, "grad_norm": 0.4379606544971466, "learning_rate": 3.5295876198012836e-06, "loss": 0.0009, "step": 17848 }, { "epoch": 4.708613639361562, "grad_norm": 0.0047829910181462765, "learning_rate": 3.529235909610481e-06, "loss": 0.0022, "step": 17850 }, { "epoch": 4.7091412742382275, "grad_norm": 0.007519986480474472, "learning_rate": 3.5288841994196783e-06, "loss": 0.0005, "step": 17852 }, { "epoch": 4.709668909114892, "grad_norm": 0.03622119501233101, "learning_rate": 3.5285324892288757e-06, "loss": 0.0005, "step": 17854 }, { "epoch": 4.710196543991557, "grad_norm": 0.032951656728982925, "learning_rate": 3.5281807790380726e-06, "loss": 0.0002, "step": 17856 }, { "epoch": 4.710724178868223, "grad_norm": 0.003404403105378151, "learning_rate": 3.52782906884727e-06, "loss": 0.0007, "step": 17858 }, { "epoch": 4.711251813744888, "grad_norm": 0.10460801422595978, "learning_rate": 3.5274773586564674e-06, "loss": 0.0003, "step": 17860 }, { "epoch": 4.711779448621554, "grad_norm": 0.4613626003265381, "learning_rate": 3.5271256484656648e-06, "loss": 0.0009, "step": 17862 }, { "epoch": 4.712307083498219, "grad_norm": 0.05352730304002762, "learning_rate": 3.5267739382748613e-06, "loss": 0.0031, "step": 17864 }, { "epoch": 4.712834718374885, "grad_norm": 0.006565145682543516, "learning_rate": 3.5264222280840587e-06, "loss": 0.0002, "step": 17866 }, { "epoch": 4.71336235325155, "grad_norm": 0.08107179403305054, "learning_rate": 3.526070517893256e-06, "loss": 0.0033, "step": 17868 }, { "epoch": 4.713889988128216, "grad_norm": 0.07811125367879868, "learning_rate": 3.5257188077024534e-06, "loss": 0.0003, "step": 17870 }, { "epoch": 4.714417623004881, "grad_norm": 0.005974563304334879, "learning_rate": 3.5253670975116504e-06, "loss": 0.0004, "step": 17872 }, { "epoch": 4.714945257881546, "grad_norm": 0.036226026713848114, "learning_rate": 3.5250153873208478e-06, "loss": 0.003, "step": 17874 }, { "epoch": 4.7154728927582115, "grad_norm": 0.08850273489952087, "learning_rate": 3.524663677130045e-06, "loss": 0.0002, "step": 17876 }, { "epoch": 4.716000527634876, "grad_norm": 0.46618080139160156, "learning_rate": 3.5243119669392417e-06, "loss": 0.0069, "step": 17878 }, { "epoch": 4.716528162511542, "grad_norm": 0.15422223508358002, "learning_rate": 3.523960256748439e-06, "loss": 0.0003, "step": 17880 }, { "epoch": 4.717055797388207, "grad_norm": 0.004942831117659807, "learning_rate": 3.523608546557637e-06, "loss": 0.0002, "step": 17882 }, { "epoch": 4.717583432264873, "grad_norm": 0.10854528099298477, "learning_rate": 3.5232568363668342e-06, "loss": 0.0019, "step": 17884 }, { "epoch": 4.718111067141538, "grad_norm": 0.07645054161548615, "learning_rate": 3.5229051261760307e-06, "loss": 0.0002, "step": 17886 }, { "epoch": 4.718638702018204, "grad_norm": 0.03280089050531387, "learning_rate": 3.522553415985228e-06, "loss": 0.0012, "step": 17888 }, { "epoch": 4.719166336894869, "grad_norm": 0.027171125635504723, "learning_rate": 3.5222017057944255e-06, "loss": 0.0002, "step": 17890 }, { "epoch": 4.719693971771534, "grad_norm": 0.364031583070755, "learning_rate": 3.521849995603623e-06, "loss": 0.0005, "step": 17892 }, { "epoch": 4.7202216066482, "grad_norm": 0.006805961020290852, "learning_rate": 3.52149828541282e-06, "loss": 0.0002, "step": 17894 }, { "epoch": 4.720749241524865, "grad_norm": 0.07412953674793243, "learning_rate": 3.521146575222017e-06, "loss": 0.0003, "step": 17896 }, { "epoch": 4.7212768764015305, "grad_norm": 0.004932590760290623, "learning_rate": 3.5207948650312146e-06, "loss": 0.0002, "step": 17898 }, { "epoch": 4.7218045112781954, "grad_norm": 0.004731677006930113, "learning_rate": 3.520443154840412e-06, "loss": 0.0003, "step": 17900 }, { "epoch": 4.72233214615486, "grad_norm": 0.35386374592781067, "learning_rate": 3.5200914446496085e-06, "loss": 0.0004, "step": 17902 }, { "epoch": 4.722859781031526, "grad_norm": 0.1672450751066208, "learning_rate": 3.519739734458806e-06, "loss": 0.0005, "step": 17904 }, { "epoch": 4.723387415908191, "grad_norm": 1.313406229019165, "learning_rate": 3.5193880242680032e-06, "loss": 0.0022, "step": 17906 }, { "epoch": 4.723915050784857, "grad_norm": 0.0024926585610955954, "learning_rate": 3.5190363140772e-06, "loss": 0.0002, "step": 17908 }, { "epoch": 4.724442685661522, "grad_norm": 0.001677831169217825, "learning_rate": 3.5186846038863976e-06, "loss": 0.0002, "step": 17910 }, { "epoch": 4.724970320538188, "grad_norm": 0.012687490321695805, "learning_rate": 3.518332893695595e-06, "loss": 0.0002, "step": 17912 }, { "epoch": 4.725497955414853, "grad_norm": 0.0023024564143270254, "learning_rate": 3.5179811835047923e-06, "loss": 0.0002, "step": 17914 }, { "epoch": 4.726025590291519, "grad_norm": 0.00272082700394094, "learning_rate": 3.5176294733139893e-06, "loss": 0.0001, "step": 17916 }, { "epoch": 4.726553225168184, "grad_norm": 0.0043417178094387054, "learning_rate": 3.5172777631231866e-06, "loss": 0.0001, "step": 17918 }, { "epoch": 4.727080860044849, "grad_norm": 0.015040735714137554, "learning_rate": 3.516926052932384e-06, "loss": 0.0008, "step": 17920 }, { "epoch": 4.7276084949215145, "grad_norm": 0.17288917303085327, "learning_rate": 3.5165743427415814e-06, "loss": 0.0081, "step": 17922 }, { "epoch": 4.728136129798179, "grad_norm": 0.008826857432723045, "learning_rate": 3.516222632550778e-06, "loss": 0.0001, "step": 17924 }, { "epoch": 4.728663764674845, "grad_norm": 0.01988041028380394, "learning_rate": 3.5158709223599753e-06, "loss": 0.0001, "step": 17926 }, { "epoch": 4.72919139955151, "grad_norm": 0.02295723557472229, "learning_rate": 3.5155192121691727e-06, "loss": 0.0001, "step": 17928 }, { "epoch": 4.729719034428175, "grad_norm": 0.995678722858429, "learning_rate": 3.51516750197837e-06, "loss": 0.0056, "step": 17930 }, { "epoch": 4.730246669304841, "grad_norm": 0.003654533065855503, "learning_rate": 3.514815791787567e-06, "loss": 0.0001, "step": 17932 }, { "epoch": 4.730774304181506, "grad_norm": 0.194446861743927, "learning_rate": 3.5144640815967644e-06, "loss": 0.0032, "step": 17934 }, { "epoch": 4.731301939058172, "grad_norm": 0.07701268047094345, "learning_rate": 3.5141123714059618e-06, "loss": 0.0002, "step": 17936 }, { "epoch": 4.731829573934837, "grad_norm": 0.0036966558545827866, "learning_rate": 3.5137606612151583e-06, "loss": 0.0004, "step": 17938 }, { "epoch": 4.732357208811503, "grad_norm": 0.027758093550801277, "learning_rate": 3.513408951024356e-06, "loss": 0.0065, "step": 17940 }, { "epoch": 4.732884843688168, "grad_norm": 0.478809118270874, "learning_rate": 3.5130572408335535e-06, "loss": 0.0011, "step": 17942 }, { "epoch": 4.7334124785648335, "grad_norm": 0.02732202038168907, "learning_rate": 3.512705530642751e-06, "loss": 0.0003, "step": 17944 }, { "epoch": 4.7339401134414985, "grad_norm": 0.03142506629228592, "learning_rate": 3.5123538204519474e-06, "loss": 0.0009, "step": 17946 }, { "epoch": 4.734467748318163, "grad_norm": 0.11149828135967255, "learning_rate": 3.5120021102611447e-06, "loss": 0.0011, "step": 17948 }, { "epoch": 4.734995383194829, "grad_norm": 0.14897514879703522, "learning_rate": 3.511650400070342e-06, "loss": 0.0004, "step": 17950 }, { "epoch": 4.735523018071494, "grad_norm": 0.12661369144916534, "learning_rate": 3.5112986898795395e-06, "loss": 0.0003, "step": 17952 }, { "epoch": 4.73605065294816, "grad_norm": 0.08942064642906189, "learning_rate": 3.5109469796887364e-06, "loss": 0.002, "step": 17954 }, { "epoch": 4.736578287824825, "grad_norm": 0.0040490273386240005, "learning_rate": 3.510595269497934e-06, "loss": 0.0001, "step": 17956 }, { "epoch": 4.737105922701491, "grad_norm": 0.21867728233337402, "learning_rate": 3.510243559307131e-06, "loss": 0.0004, "step": 17958 }, { "epoch": 4.737633557578156, "grad_norm": 0.0035062457900494337, "learning_rate": 3.5098918491163286e-06, "loss": 0.0014, "step": 17960 }, { "epoch": 4.738161192454822, "grad_norm": 0.011118064634501934, "learning_rate": 3.509540138925525e-06, "loss": 0.002, "step": 17962 }, { "epoch": 4.738688827331487, "grad_norm": 0.05146733298897743, "learning_rate": 3.5091884287347225e-06, "loss": 0.0003, "step": 17964 }, { "epoch": 4.739216462208152, "grad_norm": 0.002673614537343383, "learning_rate": 3.5088367185439203e-06, "loss": 0.0001, "step": 17966 }, { "epoch": 4.7397440970848175, "grad_norm": 0.1565670669078827, "learning_rate": 3.508485008353117e-06, "loss": 0.0033, "step": 17968 }, { "epoch": 4.740271731961482, "grad_norm": 0.10845273733139038, "learning_rate": 3.508133298162314e-06, "loss": 0.0035, "step": 17970 }, { "epoch": 4.740799366838148, "grad_norm": 0.003975671716034412, "learning_rate": 3.5077815879715116e-06, "loss": 0.0029, "step": 17972 }, { "epoch": 4.741327001714813, "grad_norm": 0.14137379825115204, "learning_rate": 3.507429877780709e-06, "loss": 0.0005, "step": 17974 }, { "epoch": 4.741854636591478, "grad_norm": 0.02604871243238449, "learning_rate": 3.507078167589906e-06, "loss": 0.0002, "step": 17976 }, { "epoch": 4.742382271468144, "grad_norm": 0.026365328580141068, "learning_rate": 3.5067264573991033e-06, "loss": 0.0032, "step": 17978 }, { "epoch": 4.742909906344809, "grad_norm": 2.8139405250549316, "learning_rate": 3.5063747472083006e-06, "loss": 0.0011, "step": 17980 }, { "epoch": 4.743437541221475, "grad_norm": 0.052044227719306946, "learning_rate": 3.506023037017498e-06, "loss": 0.0021, "step": 17982 }, { "epoch": 4.74396517609814, "grad_norm": 0.00541065214201808, "learning_rate": 3.5056713268266945e-06, "loss": 0.0001, "step": 17984 }, { "epoch": 4.744492810974806, "grad_norm": 1.1548222303390503, "learning_rate": 3.505319616635892e-06, "loss": 0.0085, "step": 17986 }, { "epoch": 4.745020445851471, "grad_norm": 0.0027419147081673145, "learning_rate": 3.5049679064450893e-06, "loss": 0.0034, "step": 17988 }, { "epoch": 4.7455480807281365, "grad_norm": 0.03389890491962433, "learning_rate": 3.5046161962542867e-06, "loss": 0.0028, "step": 17990 }, { "epoch": 4.7460757156048015, "grad_norm": 0.36055755615234375, "learning_rate": 3.5042644860634836e-06, "loss": 0.0055, "step": 17992 }, { "epoch": 4.746603350481466, "grad_norm": 0.15897205471992493, "learning_rate": 3.503912775872681e-06, "loss": 0.0004, "step": 17994 }, { "epoch": 4.747130985358132, "grad_norm": 0.051843564957380295, "learning_rate": 3.5035610656818784e-06, "loss": 0.0011, "step": 17996 }, { "epoch": 4.747658620234797, "grad_norm": 0.015873469412326813, "learning_rate": 3.5032093554910753e-06, "loss": 0.0002, "step": 17998 }, { "epoch": 4.748186255111463, "grad_norm": 0.026631787419319153, "learning_rate": 3.5028576453002727e-06, "loss": 0.0006, "step": 18000 }, { "epoch": 4.748186255111463, "eval_loss": 0.002509024925529957, "eval_runtime": 304.7015, "eval_samples_per_second": 707.712, "eval_steps_per_second": 88.467, "step": 18000 }, { "epoch": 4.748713889988128, "grad_norm": 0.18241678178310394, "learning_rate": 3.50250593510947e-06, "loss": 0.0004, "step": 18002 }, { "epoch": 4.749241524864794, "grad_norm": 0.16077211499214172, "learning_rate": 3.5021542249186674e-06, "loss": 0.0004, "step": 18004 }, { "epoch": 4.749769159741459, "grad_norm": 0.004710849840193987, "learning_rate": 3.501802514727864e-06, "loss": 0.0001, "step": 18006 }, { "epoch": 4.750296794618125, "grad_norm": 0.010291796177625656, "learning_rate": 3.5014508045370614e-06, "loss": 0.0003, "step": 18008 }, { "epoch": 4.75082442949479, "grad_norm": 0.01876794546842575, "learning_rate": 3.5010990943462587e-06, "loss": 0.0002, "step": 18010 }, { "epoch": 4.751352064371455, "grad_norm": 0.07874179631471634, "learning_rate": 3.500747384155456e-06, "loss": 0.0002, "step": 18012 }, { "epoch": 4.7518796992481205, "grad_norm": 0.029565434902906418, "learning_rate": 3.500395673964653e-06, "loss": 0.0001, "step": 18014 }, { "epoch": 4.752407334124785, "grad_norm": 0.3145572245121002, "learning_rate": 3.5000439637738504e-06, "loss": 0.0009, "step": 18016 }, { "epoch": 4.752934969001451, "grad_norm": 0.011952213011682034, "learning_rate": 3.499692253583048e-06, "loss": 0.0001, "step": 18018 }, { "epoch": 4.753462603878116, "grad_norm": 0.00826419610530138, "learning_rate": 3.499340543392245e-06, "loss": 0.0001, "step": 18020 }, { "epoch": 4.753990238754781, "grad_norm": 0.04731660336256027, "learning_rate": 3.4989888332014417e-06, "loss": 0.0004, "step": 18022 }, { "epoch": 4.754517873631447, "grad_norm": 0.0013212825870141387, "learning_rate": 3.4986371230106395e-06, "loss": 0.0001, "step": 18024 }, { "epoch": 4.755045508508112, "grad_norm": 0.023694906383752823, "learning_rate": 3.498285412819837e-06, "loss": 0.0002, "step": 18026 }, { "epoch": 4.755573143384778, "grad_norm": 0.017743706703186035, "learning_rate": 3.4979337026290334e-06, "loss": 0.004, "step": 18028 }, { "epoch": 4.756100778261443, "grad_norm": 0.3848455250263214, "learning_rate": 3.497581992438231e-06, "loss": 0.0014, "step": 18030 }, { "epoch": 4.756628413138109, "grad_norm": 0.007105078082531691, "learning_rate": 3.497230282247428e-06, "loss": 0.0004, "step": 18032 }, { "epoch": 4.757156048014774, "grad_norm": 0.5700516700744629, "learning_rate": 3.4968785720566255e-06, "loss": 0.0015, "step": 18034 }, { "epoch": 4.7576836828914395, "grad_norm": 0.012290027923882008, "learning_rate": 3.4965268618658225e-06, "loss": 0.0043, "step": 18036 }, { "epoch": 4.7582113177681045, "grad_norm": 0.005060745868831873, "learning_rate": 3.49617515167502e-06, "loss": 0.0002, "step": 18038 }, { "epoch": 4.758738952644769, "grad_norm": 0.0023219215217977762, "learning_rate": 3.4958234414842173e-06, "loss": 0.0002, "step": 18040 }, { "epoch": 4.759266587521435, "grad_norm": 0.027274813503026962, "learning_rate": 3.4954717312934146e-06, "loss": 0.0003, "step": 18042 }, { "epoch": 4.7597942223981, "grad_norm": 0.03877600282430649, "learning_rate": 3.495120021102611e-06, "loss": 0.0003, "step": 18044 }, { "epoch": 4.760321857274766, "grad_norm": 0.40893739461898804, "learning_rate": 3.4947683109118085e-06, "loss": 0.0017, "step": 18046 }, { "epoch": 4.760849492151431, "grad_norm": 0.7800226807594299, "learning_rate": 3.494416600721006e-06, "loss": 0.0017, "step": 18048 }, { "epoch": 4.761377127028097, "grad_norm": 0.16749775409698486, "learning_rate": 3.4940648905302037e-06, "loss": 0.0005, "step": 18050 }, { "epoch": 4.761904761904762, "grad_norm": 0.5702663660049438, "learning_rate": 3.4937131803394002e-06, "loss": 0.0035, "step": 18052 }, { "epoch": 4.762432396781428, "grad_norm": 0.005922272801399231, "learning_rate": 3.4933614701485976e-06, "loss": 0.0001, "step": 18054 }, { "epoch": 4.762960031658093, "grad_norm": 0.18001878261566162, "learning_rate": 3.493009759957795e-06, "loss": 0.0004, "step": 18056 }, { "epoch": 4.763487666534758, "grad_norm": 0.002593592507764697, "learning_rate": 3.492658049766992e-06, "loss": 0.0002, "step": 18058 }, { "epoch": 4.7640153014114235, "grad_norm": 0.0714433342218399, "learning_rate": 3.4923063395761893e-06, "loss": 0.0004, "step": 18060 }, { "epoch": 4.7645429362880884, "grad_norm": 0.003258081618696451, "learning_rate": 3.4919546293853867e-06, "loss": 0.009, "step": 18062 }, { "epoch": 4.765070571164754, "grad_norm": 0.6856198310852051, "learning_rate": 3.491602919194584e-06, "loss": 0.0037, "step": 18064 }, { "epoch": 4.765598206041419, "grad_norm": 0.01361461728811264, "learning_rate": 3.4912512090037806e-06, "loss": 0.0087, "step": 18066 }, { "epoch": 4.766125840918084, "grad_norm": 0.0059590768069028854, "learning_rate": 3.490899498812978e-06, "loss": 0.0001, "step": 18068 }, { "epoch": 4.76665347579475, "grad_norm": 0.34079068899154663, "learning_rate": 3.4905477886221753e-06, "loss": 0.0055, "step": 18070 }, { "epoch": 4.767181110671415, "grad_norm": 0.011443285271525383, "learning_rate": 3.4901960784313727e-06, "loss": 0.0002, "step": 18072 }, { "epoch": 4.767708745548081, "grad_norm": 0.39781445264816284, "learning_rate": 3.4898443682405697e-06, "loss": 0.0047, "step": 18074 }, { "epoch": 4.768236380424746, "grad_norm": 0.8084171414375305, "learning_rate": 3.489492658049767e-06, "loss": 0.0021, "step": 18076 }, { "epoch": 4.768764015301412, "grad_norm": 0.008749461732804775, "learning_rate": 3.4891409478589644e-06, "loss": 0.0003, "step": 18078 }, { "epoch": 4.769291650178077, "grad_norm": 0.14360299706459045, "learning_rate": 3.488789237668162e-06, "loss": 0.0007, "step": 18080 }, { "epoch": 4.7698192850547425, "grad_norm": 0.04428786039352417, "learning_rate": 3.4884375274773588e-06, "loss": 0.0003, "step": 18082 }, { "epoch": 4.7703469199314075, "grad_norm": 0.033519092947244644, "learning_rate": 3.488085817286556e-06, "loss": 0.0003, "step": 18084 }, { "epoch": 4.770874554808072, "grad_norm": 0.03809747099876404, "learning_rate": 3.4877341070957535e-06, "loss": 0.0029, "step": 18086 }, { "epoch": 4.771402189684738, "grad_norm": 0.17638778686523438, "learning_rate": 3.48738239690495e-06, "loss": 0.001, "step": 18088 }, { "epoch": 4.771929824561403, "grad_norm": 0.3115963637828827, "learning_rate": 3.4870306867141474e-06, "loss": 0.0123, "step": 18090 }, { "epoch": 4.772457459438069, "grad_norm": 0.05478885769844055, "learning_rate": 3.4866789765233448e-06, "loss": 0.003, "step": 18092 }, { "epoch": 4.772985094314734, "grad_norm": 0.07927176356315613, "learning_rate": 3.486327266332542e-06, "loss": 0.0004, "step": 18094 }, { "epoch": 4.7735127291914, "grad_norm": 0.014687737450003624, "learning_rate": 3.485975556141739e-06, "loss": 0.0002, "step": 18096 }, { "epoch": 4.774040364068065, "grad_norm": 0.14427463710308075, "learning_rate": 3.4856238459509365e-06, "loss": 0.0013, "step": 18098 }, { "epoch": 4.774567998944731, "grad_norm": 0.007248813286423683, "learning_rate": 3.485272135760134e-06, "loss": 0.0007, "step": 18100 }, { "epoch": 4.775095633821396, "grad_norm": 0.012736822478473186, "learning_rate": 3.4849204255693312e-06, "loss": 0.0002, "step": 18102 }, { "epoch": 4.775623268698061, "grad_norm": 0.005113860126584768, "learning_rate": 3.4845687153785278e-06, "loss": 0.0002, "step": 18104 }, { "epoch": 4.7761509035747265, "grad_norm": 0.03209300711750984, "learning_rate": 3.484217005187725e-06, "loss": 0.0003, "step": 18106 }, { "epoch": 4.7766785384513915, "grad_norm": 0.009786642156541348, "learning_rate": 3.483865294996923e-06, "loss": 0.0001, "step": 18108 }, { "epoch": 4.777206173328057, "grad_norm": 0.014166830107569695, "learning_rate": 3.4835135848061203e-06, "loss": 0.0002, "step": 18110 }, { "epoch": 4.777733808204722, "grad_norm": 0.0943334624171257, "learning_rate": 3.483161874615317e-06, "loss": 0.0005, "step": 18112 }, { "epoch": 4.778261443081387, "grad_norm": 0.13624893128871918, "learning_rate": 3.4828101644245142e-06, "loss": 0.0004, "step": 18114 }, { "epoch": 4.778789077958053, "grad_norm": 0.005979696288704872, "learning_rate": 3.4824584542337116e-06, "loss": 0.0001, "step": 18116 }, { "epoch": 4.779316712834718, "grad_norm": 0.022783122956752777, "learning_rate": 3.4821067440429086e-06, "loss": 0.0112, "step": 18118 }, { "epoch": 4.779844347711384, "grad_norm": 0.004541317000985146, "learning_rate": 3.481755033852106e-06, "loss": 0.0002, "step": 18120 }, { "epoch": 4.780371982588049, "grad_norm": 0.0032956136856228113, "learning_rate": 3.4814033236613033e-06, "loss": 0.0001, "step": 18122 }, { "epoch": 4.780899617464715, "grad_norm": 0.006998805794864893, "learning_rate": 3.4810516134705007e-06, "loss": 0.0004, "step": 18124 }, { "epoch": 4.78142725234138, "grad_norm": 0.00921151228249073, "learning_rate": 3.4806999032796972e-06, "loss": 0.0002, "step": 18126 }, { "epoch": 4.7819548872180455, "grad_norm": 0.0046369777992367744, "learning_rate": 3.4803481930888946e-06, "loss": 0.0002, "step": 18128 }, { "epoch": 4.7824825220947105, "grad_norm": 0.004843158647418022, "learning_rate": 3.479996482898092e-06, "loss": 0.0038, "step": 18130 }, { "epoch": 4.783010156971375, "grad_norm": 0.04465356841683388, "learning_rate": 3.4796447727072893e-06, "loss": 0.0003, "step": 18132 }, { "epoch": 4.783537791848041, "grad_norm": 0.17758223414421082, "learning_rate": 3.4792930625164863e-06, "loss": 0.0007, "step": 18134 }, { "epoch": 4.784065426724706, "grad_norm": 0.04385454207658768, "learning_rate": 3.4789413523256837e-06, "loss": 0.0005, "step": 18136 }, { "epoch": 4.784593061601372, "grad_norm": 0.10047972947359085, "learning_rate": 3.478589642134881e-06, "loss": 0.0006, "step": 18138 }, { "epoch": 4.785120696478037, "grad_norm": 0.1680288165807724, "learning_rate": 3.4782379319440784e-06, "loss": 0.0134, "step": 18140 }, { "epoch": 4.785648331354703, "grad_norm": 0.22759731113910675, "learning_rate": 3.4778862217532754e-06, "loss": 0.0137, "step": 18142 }, { "epoch": 4.786175966231368, "grad_norm": 0.12139731645584106, "learning_rate": 3.4775345115624727e-06, "loss": 0.0005, "step": 18144 }, { "epoch": 4.786703601108034, "grad_norm": 0.024666091427206993, "learning_rate": 3.47718280137167e-06, "loss": 0.0043, "step": 18146 }, { "epoch": 4.787231235984699, "grad_norm": 0.08435170352458954, "learning_rate": 3.4768310911808667e-06, "loss": 0.0048, "step": 18148 }, { "epoch": 4.787758870861364, "grad_norm": 0.2020907700061798, "learning_rate": 3.476479380990064e-06, "loss": 0.01, "step": 18150 }, { "epoch": 4.7882865057380295, "grad_norm": 0.13968504965305328, "learning_rate": 3.4761276707992614e-06, "loss": 0.002, "step": 18152 }, { "epoch": 4.7888141406146945, "grad_norm": 0.06386607140302658, "learning_rate": 3.4757759606084588e-06, "loss": 0.0007, "step": 18154 }, { "epoch": 4.78934177549136, "grad_norm": 0.012561355717480183, "learning_rate": 3.4754242504176557e-06, "loss": 0.0003, "step": 18156 }, { "epoch": 4.789869410368025, "grad_norm": 0.04223385453224182, "learning_rate": 3.475072540226853e-06, "loss": 0.0002, "step": 18158 }, { "epoch": 4.79039704524469, "grad_norm": 0.11280576884746552, "learning_rate": 3.4747208300360505e-06, "loss": 0.0003, "step": 18160 }, { "epoch": 4.790924680121356, "grad_norm": 0.08175200968980789, "learning_rate": 3.474369119845248e-06, "loss": 0.0096, "step": 18162 }, { "epoch": 4.791452314998021, "grad_norm": 0.0029092913027852774, "learning_rate": 3.4740174096544444e-06, "loss": 0.0002, "step": 18164 }, { "epoch": 4.791979949874687, "grad_norm": 0.22109459340572357, "learning_rate": 3.473665699463642e-06, "loss": 0.0083, "step": 18166 }, { "epoch": 4.792507584751352, "grad_norm": 0.10157034546136856, "learning_rate": 3.4733139892728396e-06, "loss": 0.0005, "step": 18168 }, { "epoch": 4.793035219628018, "grad_norm": 0.0700036883354187, "learning_rate": 3.472962279082037e-06, "loss": 0.0002, "step": 18170 }, { "epoch": 4.793562854504683, "grad_norm": 0.005316259805113077, "learning_rate": 3.4726105688912335e-06, "loss": 0.0002, "step": 18172 }, { "epoch": 4.7940904893813485, "grad_norm": 0.125863179564476, "learning_rate": 3.472258858700431e-06, "loss": 0.0006, "step": 18174 }, { "epoch": 4.7946181242580135, "grad_norm": 0.04257359355688095, "learning_rate": 3.4719071485096282e-06, "loss": 0.0004, "step": 18176 }, { "epoch": 4.795145759134678, "grad_norm": 0.06440289318561554, "learning_rate": 3.471555438318825e-06, "loss": 0.0003, "step": 18178 }, { "epoch": 4.795673394011344, "grad_norm": 0.2857295274734497, "learning_rate": 3.4712037281280225e-06, "loss": 0.0013, "step": 18180 }, { "epoch": 4.796201028888009, "grad_norm": 0.02102544531226158, "learning_rate": 3.47085201793722e-06, "loss": 0.0033, "step": 18182 }, { "epoch": 4.796728663764675, "grad_norm": 0.0851234495639801, "learning_rate": 3.4705003077464173e-06, "loss": 0.0005, "step": 18184 }, { "epoch": 4.79725629864134, "grad_norm": 0.028908785432577133, "learning_rate": 3.470148597555614e-06, "loss": 0.0004, "step": 18186 }, { "epoch": 4.797783933518006, "grad_norm": 0.011748493649065495, "learning_rate": 3.469796887364811e-06, "loss": 0.0003, "step": 18188 }, { "epoch": 4.798311568394671, "grad_norm": 0.1084463819861412, "learning_rate": 3.4694451771740086e-06, "loss": 0.0003, "step": 18190 }, { "epoch": 4.798839203271337, "grad_norm": 0.012196156196296215, "learning_rate": 3.4690934669832064e-06, "loss": 0.0003, "step": 18192 }, { "epoch": 4.799366838148002, "grad_norm": 0.006102032959461212, "learning_rate": 3.468741756792403e-06, "loss": 0.0003, "step": 18194 }, { "epoch": 4.799894473024667, "grad_norm": 0.09139350801706314, "learning_rate": 3.4683900466016003e-06, "loss": 0.0003, "step": 18196 }, { "epoch": 4.8004221079013325, "grad_norm": 0.002495894208550453, "learning_rate": 3.4680383364107977e-06, "loss": 0.0001, "step": 18198 }, { "epoch": 4.8009497427779975, "grad_norm": 0.005035399459302425, "learning_rate": 3.467686626219995e-06, "loss": 0.0002, "step": 18200 }, { "epoch": 4.801477377654663, "grad_norm": 0.0037756317760795355, "learning_rate": 3.467334916029192e-06, "loss": 0.0001, "step": 18202 }, { "epoch": 4.802005012531328, "grad_norm": 0.11035028845071793, "learning_rate": 3.4669832058383894e-06, "loss": 0.0004, "step": 18204 }, { "epoch": 4.802532647407993, "grad_norm": 0.002807939425110817, "learning_rate": 3.4666314956475867e-06, "loss": 0.0003, "step": 18206 }, { "epoch": 4.803060282284659, "grad_norm": 0.0027189808897674084, "learning_rate": 3.4662797854567833e-06, "loss": 0.0001, "step": 18208 }, { "epoch": 4.803587917161324, "grad_norm": 0.001667026779614389, "learning_rate": 3.4659280752659806e-06, "loss": 0.0001, "step": 18210 }, { "epoch": 4.80411555203799, "grad_norm": 0.014106468297541142, "learning_rate": 3.465576365075178e-06, "loss": 0.0001, "step": 18212 }, { "epoch": 4.804643186914655, "grad_norm": 0.005266082473099232, "learning_rate": 3.4652246548843754e-06, "loss": 0.0001, "step": 18214 }, { "epoch": 4.805170821791321, "grad_norm": 0.006601118482649326, "learning_rate": 3.4648729446935724e-06, "loss": 0.0001, "step": 18216 }, { "epoch": 4.805698456667986, "grad_norm": 0.0021159423049539328, "learning_rate": 3.4645212345027697e-06, "loss": 0.0001, "step": 18218 }, { "epoch": 4.8062260915446515, "grad_norm": 0.005192378535866737, "learning_rate": 3.464169524311967e-06, "loss": 0.0009, "step": 18220 }, { "epoch": 4.8067537264213165, "grad_norm": 0.0032155418302863836, "learning_rate": 3.4638178141211645e-06, "loss": 0.0028, "step": 18222 }, { "epoch": 4.8072813612979814, "grad_norm": 0.011976776644587517, "learning_rate": 3.4634661039303614e-06, "loss": 0.0001, "step": 18224 }, { "epoch": 4.807808996174647, "grad_norm": 0.005332053638994694, "learning_rate": 3.463114393739559e-06, "loss": 0.0001, "step": 18226 }, { "epoch": 4.808336631051312, "grad_norm": 0.3639756143093109, "learning_rate": 3.462762683548756e-06, "loss": 0.004, "step": 18228 }, { "epoch": 4.808864265927978, "grad_norm": 0.0035726798232644796, "learning_rate": 3.4624109733579536e-06, "loss": 0.0001, "step": 18230 }, { "epoch": 4.809391900804643, "grad_norm": 0.009314402006566525, "learning_rate": 3.46205926316715e-06, "loss": 0.0001, "step": 18232 }, { "epoch": 4.809919535681309, "grad_norm": 0.1151021271944046, "learning_rate": 3.4617075529763475e-06, "loss": 0.0039, "step": 18234 }, { "epoch": 4.810447170557974, "grad_norm": 0.024464914575219154, "learning_rate": 3.461355842785545e-06, "loss": 0.0003, "step": 18236 }, { "epoch": 4.810974805434639, "grad_norm": 0.005518082529306412, "learning_rate": 3.461004132594742e-06, "loss": 0.0002, "step": 18238 }, { "epoch": 4.811502440311305, "grad_norm": 0.004421977326273918, "learning_rate": 3.460652422403939e-06, "loss": 0.0001, "step": 18240 }, { "epoch": 4.81203007518797, "grad_norm": 0.006808096077293158, "learning_rate": 3.4603007122131365e-06, "loss": 0.0002, "step": 18242 }, { "epoch": 4.8125577100646355, "grad_norm": 0.0726243183016777, "learning_rate": 3.459949002022334e-06, "loss": 0.0003, "step": 18244 }, { "epoch": 4.8130853449413005, "grad_norm": 0.17205668985843658, "learning_rate": 3.4595972918315304e-06, "loss": 0.0005, "step": 18246 }, { "epoch": 4.813612979817966, "grad_norm": 0.4490783214569092, "learning_rate": 3.459245581640728e-06, "loss": 0.001, "step": 18248 }, { "epoch": 4.814140614694631, "grad_norm": 0.4297046959400177, "learning_rate": 3.4588938714499256e-06, "loss": 0.0035, "step": 18250 }, { "epoch": 4.814668249571296, "grad_norm": 0.002368164248764515, "learning_rate": 3.458542161259123e-06, "loss": 0.0001, "step": 18252 }, { "epoch": 4.815195884447962, "grad_norm": 0.25330331921577454, "learning_rate": 3.4581904510683195e-06, "loss": 0.0066, "step": 18254 }, { "epoch": 4.815723519324627, "grad_norm": 0.011710701510310173, "learning_rate": 3.457838740877517e-06, "loss": 0.0001, "step": 18256 }, { "epoch": 4.816251154201293, "grad_norm": 0.006412793882191181, "learning_rate": 3.4574870306867143e-06, "loss": 0.0001, "step": 18258 }, { "epoch": 4.816778789077958, "grad_norm": 0.012110445648431778, "learning_rate": 3.4571353204959117e-06, "loss": 0.0001, "step": 18260 }, { "epoch": 4.817306423954624, "grad_norm": 0.010129961185157299, "learning_rate": 3.4567836103051086e-06, "loss": 0.0001, "step": 18262 }, { "epoch": 4.817834058831289, "grad_norm": 0.6268406510353088, "learning_rate": 3.456431900114306e-06, "loss": 0.0012, "step": 18264 }, { "epoch": 4.8183616937079545, "grad_norm": 0.062472108751535416, "learning_rate": 3.4560801899235034e-06, "loss": 0.0041, "step": 18266 }, { "epoch": 4.8188893285846195, "grad_norm": 0.09336777031421661, "learning_rate": 3.4557284797327007e-06, "loss": 0.0004, "step": 18268 }, { "epoch": 4.8194169634612845, "grad_norm": 0.21339336037635803, "learning_rate": 3.4553767695418973e-06, "loss": 0.0072, "step": 18270 }, { "epoch": 4.81994459833795, "grad_norm": 0.022491449490189552, "learning_rate": 3.4550250593510946e-06, "loss": 0.0005, "step": 18272 }, { "epoch": 4.820472233214615, "grad_norm": 0.0822751522064209, "learning_rate": 3.454673349160292e-06, "loss": 0.0007, "step": 18274 }, { "epoch": 4.820999868091281, "grad_norm": 0.29322612285614014, "learning_rate": 3.454321638969489e-06, "loss": 0.0038, "step": 18276 }, { "epoch": 4.821527502967946, "grad_norm": 0.0063051460310816765, "learning_rate": 3.4539699287786863e-06, "loss": 0.0029, "step": 18278 }, { "epoch": 4.822055137844611, "grad_norm": 0.008180931210517883, "learning_rate": 3.4536182185878837e-06, "loss": 0.0002, "step": 18280 }, { "epoch": 4.822582772721277, "grad_norm": 0.023388726636767387, "learning_rate": 3.453266508397081e-06, "loss": 0.0002, "step": 18282 }, { "epoch": 4.823110407597942, "grad_norm": 0.003931857645511627, "learning_rate": 3.452914798206278e-06, "loss": 0.0001, "step": 18284 }, { "epoch": 4.823638042474608, "grad_norm": 0.005611553322523832, "learning_rate": 3.4525630880154754e-06, "loss": 0.0012, "step": 18286 }, { "epoch": 4.824165677351273, "grad_norm": 0.06787663698196411, "learning_rate": 3.452211377824673e-06, "loss": 0.0003, "step": 18288 }, { "epoch": 4.8246933122279385, "grad_norm": 0.01126518752425909, "learning_rate": 3.45185966763387e-06, "loss": 0.0004, "step": 18290 }, { "epoch": 4.8252209471046035, "grad_norm": 0.07968209683895111, "learning_rate": 3.4515079574430667e-06, "loss": 0.0004, "step": 18292 }, { "epoch": 4.825748581981269, "grad_norm": 0.10382623970508575, "learning_rate": 3.451156247252264e-06, "loss": 0.0005, "step": 18294 }, { "epoch": 4.826276216857934, "grad_norm": 0.056878264993429184, "learning_rate": 3.4508045370614615e-06, "loss": 0.0004, "step": 18296 }, { "epoch": 4.826803851734599, "grad_norm": 0.03577692061662674, "learning_rate": 3.450452826870659e-06, "loss": 0.0002, "step": 18298 }, { "epoch": 4.827331486611265, "grad_norm": 0.015162313356995583, "learning_rate": 3.4501011166798558e-06, "loss": 0.0002, "step": 18300 }, { "epoch": 4.82785912148793, "grad_norm": 0.01607891544699669, "learning_rate": 3.449749406489053e-06, "loss": 0.0004, "step": 18302 }, { "epoch": 4.828386756364596, "grad_norm": 0.2108805626630783, "learning_rate": 3.4493976962982505e-06, "loss": 0.0042, "step": 18304 }, { "epoch": 4.828914391241261, "grad_norm": 0.00601853197440505, "learning_rate": 3.449045986107447e-06, "loss": 0.0042, "step": 18306 }, { "epoch": 4.829442026117927, "grad_norm": 0.018972894176840782, "learning_rate": 3.448694275916645e-06, "loss": 0.0004, "step": 18308 }, { "epoch": 4.829969660994592, "grad_norm": 0.038180556148290634, "learning_rate": 3.4483425657258422e-06, "loss": 0.0002, "step": 18310 }, { "epoch": 4.8304972958712575, "grad_norm": 0.03537289425730705, "learning_rate": 3.4479908555350396e-06, "loss": 0.0003, "step": 18312 }, { "epoch": 4.8310249307479225, "grad_norm": 0.030036352574825287, "learning_rate": 3.447639145344236e-06, "loss": 0.0001, "step": 18314 }, { "epoch": 4.8315525656245875, "grad_norm": 0.010558471083641052, "learning_rate": 3.4472874351534335e-06, "loss": 0.0003, "step": 18316 }, { "epoch": 4.832080200501253, "grad_norm": 0.0048131453804671764, "learning_rate": 3.446935724962631e-06, "loss": 0.0002, "step": 18318 }, { "epoch": 4.832607835377918, "grad_norm": 0.007145274896174669, "learning_rate": 3.4465840147718283e-06, "loss": 0.0002, "step": 18320 }, { "epoch": 4.833135470254584, "grad_norm": 0.0020557958632707596, "learning_rate": 3.4462323045810252e-06, "loss": 0.0001, "step": 18322 }, { "epoch": 4.833663105131249, "grad_norm": 0.006941491737961769, "learning_rate": 3.4458805943902226e-06, "loss": 0.0001, "step": 18324 }, { "epoch": 4.834190740007914, "grad_norm": 0.01368627231568098, "learning_rate": 3.44552888419942e-06, "loss": 0.0019, "step": 18326 }, { "epoch": 4.83471837488458, "grad_norm": 0.3233935534954071, "learning_rate": 3.4451771740086174e-06, "loss": 0.0015, "step": 18328 }, { "epoch": 4.835246009761245, "grad_norm": 0.17238007485866547, "learning_rate": 3.444825463817814e-06, "loss": 0.001, "step": 18330 }, { "epoch": 4.835773644637911, "grad_norm": 0.03652877360582352, "learning_rate": 3.4444737536270113e-06, "loss": 0.0003, "step": 18332 }, { "epoch": 4.836301279514576, "grad_norm": 0.01486257091164589, "learning_rate": 3.444122043436209e-06, "loss": 0.0001, "step": 18334 }, { "epoch": 4.8368289143912415, "grad_norm": 0.18877460062503815, "learning_rate": 3.4437703332454056e-06, "loss": 0.0007, "step": 18336 }, { "epoch": 4.8373565492679065, "grad_norm": 0.004993464797735214, "learning_rate": 3.443418623054603e-06, "loss": 0.0006, "step": 18338 }, { "epoch": 4.837884184144572, "grad_norm": 0.008895843289792538, "learning_rate": 3.4430669128638003e-06, "loss": 0.0039, "step": 18340 }, { "epoch": 4.838411819021237, "grad_norm": 0.0014425964327529073, "learning_rate": 3.4427152026729977e-06, "loss": 0.0002, "step": 18342 }, { "epoch": 4.838939453897902, "grad_norm": 0.001682461705058813, "learning_rate": 3.4423634924821947e-06, "loss": 0.0056, "step": 18344 }, { "epoch": 4.839467088774568, "grad_norm": 0.001723341178148985, "learning_rate": 3.442011782291392e-06, "loss": 0.0001, "step": 18346 }, { "epoch": 4.839994723651233, "grad_norm": 0.003367292694747448, "learning_rate": 3.4416600721005894e-06, "loss": 0.0002, "step": 18348 }, { "epoch": 4.840522358527899, "grad_norm": 1.1909947395324707, "learning_rate": 3.441308361909787e-06, "loss": 0.0038, "step": 18350 }, { "epoch": 4.841049993404564, "grad_norm": 0.09405332058668137, "learning_rate": 3.4409566517189833e-06, "loss": 0.0074, "step": 18352 }, { "epoch": 4.84157762828123, "grad_norm": 0.10288020968437195, "learning_rate": 3.4406049415281807e-06, "loss": 0.0009, "step": 18354 }, { "epoch": 4.842105263157895, "grad_norm": 0.6811575293540955, "learning_rate": 3.440253231337378e-06, "loss": 0.0017, "step": 18356 }, { "epoch": 4.8426328980345605, "grad_norm": 0.11524675786495209, "learning_rate": 3.4399015211465754e-06, "loss": 0.0039, "step": 18358 }, { "epoch": 4.8431605329112255, "grad_norm": 0.44055667519569397, "learning_rate": 3.4395498109557724e-06, "loss": 0.0009, "step": 18360 }, { "epoch": 4.8436881677878905, "grad_norm": 0.01175488531589508, "learning_rate": 3.4391981007649698e-06, "loss": 0.0002, "step": 18362 }, { "epoch": 4.844215802664556, "grad_norm": 0.06009668484330177, "learning_rate": 3.438846390574167e-06, "loss": 0.0006, "step": 18364 }, { "epoch": 4.844743437541221, "grad_norm": 0.25697600841522217, "learning_rate": 3.438494680383364e-06, "loss": 0.0006, "step": 18366 }, { "epoch": 4.845271072417887, "grad_norm": 0.059282075613737106, "learning_rate": 3.4381429701925615e-06, "loss": 0.0022, "step": 18368 }, { "epoch": 4.845798707294552, "grad_norm": 0.007842608727514744, "learning_rate": 3.437791260001759e-06, "loss": 0.0021, "step": 18370 }, { "epoch": 4.846326342171217, "grad_norm": 0.01069254893809557, "learning_rate": 3.4374395498109562e-06, "loss": 0.0002, "step": 18372 }, { "epoch": 4.846853977047883, "grad_norm": 0.04672640189528465, "learning_rate": 3.4370878396201528e-06, "loss": 0.0003, "step": 18374 }, { "epoch": 4.847381611924548, "grad_norm": 0.24888738989830017, "learning_rate": 3.43673612942935e-06, "loss": 0.001, "step": 18376 }, { "epoch": 4.847909246801214, "grad_norm": 0.05102735385298729, "learning_rate": 3.4363844192385475e-06, "loss": 0.0006, "step": 18378 }, { "epoch": 4.848436881677879, "grad_norm": 0.015634967014193535, "learning_rate": 3.436032709047745e-06, "loss": 0.0015, "step": 18380 }, { "epoch": 4.8489645165545445, "grad_norm": 0.008018233813345432, "learning_rate": 3.435680998856942e-06, "loss": 0.0001, "step": 18382 }, { "epoch": 4.8494921514312095, "grad_norm": 1.206738829612732, "learning_rate": 3.4353292886661392e-06, "loss": 0.004, "step": 18384 }, { "epoch": 4.850019786307875, "grad_norm": 0.0067365653812885284, "learning_rate": 3.4349775784753366e-06, "loss": 0.0014, "step": 18386 }, { "epoch": 4.85054742118454, "grad_norm": 0.013832874596118927, "learning_rate": 3.434625868284534e-06, "loss": 0.0006, "step": 18388 }, { "epoch": 4.851075056061205, "grad_norm": 0.01813877746462822, "learning_rate": 3.4342741580937305e-06, "loss": 0.0003, "step": 18390 }, { "epoch": 4.851602690937871, "grad_norm": 0.012887069955468178, "learning_rate": 3.4339224479029283e-06, "loss": 0.0001, "step": 18392 }, { "epoch": 4.852130325814536, "grad_norm": 0.0128894979134202, "learning_rate": 3.4335707377121257e-06, "loss": 0.0015, "step": 18394 }, { "epoch": 4.852657960691202, "grad_norm": 0.0020492710173130035, "learning_rate": 3.433219027521322e-06, "loss": 0.0001, "step": 18396 }, { "epoch": 4.853185595567867, "grad_norm": 0.4422370195388794, "learning_rate": 3.4328673173305196e-06, "loss": 0.0049, "step": 18398 }, { "epoch": 4.853713230444533, "grad_norm": 0.006692556664347649, "learning_rate": 3.432515607139717e-06, "loss": 0.0002, "step": 18400 }, { "epoch": 4.854240865321198, "grad_norm": 0.008997255004942417, "learning_rate": 3.4321638969489143e-06, "loss": 0.0003, "step": 18402 }, { "epoch": 4.8547685001978635, "grad_norm": 0.1634204387664795, "learning_rate": 3.4318121867581113e-06, "loss": 0.0119, "step": 18404 }, { "epoch": 4.8552961350745285, "grad_norm": 0.015809109434485435, "learning_rate": 3.4314604765673087e-06, "loss": 0.0003, "step": 18406 }, { "epoch": 4.8558237699511935, "grad_norm": 0.1442493051290512, "learning_rate": 3.431108766376506e-06, "loss": 0.0018, "step": 18408 }, { "epoch": 4.856351404827859, "grad_norm": 0.0029653070960193872, "learning_rate": 3.4307570561857034e-06, "loss": 0.0001, "step": 18410 }, { "epoch": 4.856879039704524, "grad_norm": 0.19335971772670746, "learning_rate": 3.4304053459949e-06, "loss": 0.0031, "step": 18412 }, { "epoch": 4.85740667458119, "grad_norm": 0.055272504687309265, "learning_rate": 3.4300536358040973e-06, "loss": 0.0003, "step": 18414 }, { "epoch": 4.857934309457855, "grad_norm": 0.04648040980100632, "learning_rate": 3.4297019256132947e-06, "loss": 0.0031, "step": 18416 }, { "epoch": 4.85846194433452, "grad_norm": 0.08837142586708069, "learning_rate": 3.429350215422492e-06, "loss": 0.0009, "step": 18418 }, { "epoch": 4.858989579211186, "grad_norm": 0.04259196296334267, "learning_rate": 3.428998505231689e-06, "loss": 0.0003, "step": 18420 }, { "epoch": 4.859517214087851, "grad_norm": 0.39335641264915466, "learning_rate": 3.4286467950408864e-06, "loss": 0.0015, "step": 18422 }, { "epoch": 4.860044848964517, "grad_norm": 0.2138194590806961, "learning_rate": 3.4282950848500838e-06, "loss": 0.0007, "step": 18424 }, { "epoch": 4.860572483841182, "grad_norm": 0.0036574515979737043, "learning_rate": 3.4279433746592807e-06, "loss": 0.0001, "step": 18426 }, { "epoch": 4.8611001187178475, "grad_norm": 1.1613819599151611, "learning_rate": 3.427591664468478e-06, "loss": 0.0056, "step": 18428 }, { "epoch": 4.8616277535945125, "grad_norm": 0.01972496137022972, "learning_rate": 3.4272399542776755e-06, "loss": 0.0002, "step": 18430 }, { "epoch": 4.862155388471178, "grad_norm": 0.28452473878860474, "learning_rate": 3.426888244086873e-06, "loss": 0.0138, "step": 18432 }, { "epoch": 4.862683023347843, "grad_norm": 0.0041730585508048534, "learning_rate": 3.4265365338960694e-06, "loss": 0.0001, "step": 18434 }, { "epoch": 4.863210658224508, "grad_norm": 0.29511842131614685, "learning_rate": 3.4261848237052668e-06, "loss": 0.0006, "step": 18436 }, { "epoch": 4.863738293101174, "grad_norm": 0.0042869155295193195, "learning_rate": 3.425833113514464e-06, "loss": 0.0001, "step": 18438 }, { "epoch": 4.864265927977839, "grad_norm": 0.06678656488656998, "learning_rate": 3.4254814033236615e-06, "loss": 0.0017, "step": 18440 }, { "epoch": 4.864793562854505, "grad_norm": 0.004775466397404671, "learning_rate": 3.4251296931328585e-06, "loss": 0.0003, "step": 18442 }, { "epoch": 4.86532119773117, "grad_norm": 0.00666404515504837, "learning_rate": 3.424777982942056e-06, "loss": 0.0078, "step": 18444 }, { "epoch": 4.865848832607836, "grad_norm": 0.004533746745437384, "learning_rate": 3.424426272751253e-06, "loss": 0.0001, "step": 18446 }, { "epoch": 4.866376467484501, "grad_norm": 0.013179251924157143, "learning_rate": 3.4240745625604506e-06, "loss": 0.0009, "step": 18448 }, { "epoch": 4.8669041023611666, "grad_norm": 0.005912558175623417, "learning_rate": 3.4237228523696475e-06, "loss": 0.0048, "step": 18450 }, { "epoch": 4.8674317372378315, "grad_norm": 0.029023200273513794, "learning_rate": 3.423371142178845e-06, "loss": 0.0002, "step": 18452 }, { "epoch": 4.8679593721144965, "grad_norm": 0.01544464286416769, "learning_rate": 3.4230194319880423e-06, "loss": 0.0009, "step": 18454 }, { "epoch": 4.868487006991162, "grad_norm": 0.10565544664859772, "learning_rate": 3.422667721797239e-06, "loss": 0.0003, "step": 18456 }, { "epoch": 4.869014641867827, "grad_norm": 0.04872157424688339, "learning_rate": 3.422316011606436e-06, "loss": 0.0003, "step": 18458 }, { "epoch": 4.869542276744493, "grad_norm": 0.01277145929634571, "learning_rate": 3.4219643014156336e-06, "loss": 0.0001, "step": 18460 }, { "epoch": 4.870069911621158, "grad_norm": 0.008379180915653706, "learning_rate": 3.421612591224831e-06, "loss": 0.0002, "step": 18462 }, { "epoch": 4.870597546497823, "grad_norm": 0.0636860802769661, "learning_rate": 3.421260881034028e-06, "loss": 0.0007, "step": 18464 }, { "epoch": 4.871125181374489, "grad_norm": 0.0070204283110797405, "learning_rate": 3.4209091708432253e-06, "loss": 0.0002, "step": 18466 }, { "epoch": 4.871652816251154, "grad_norm": 0.08071251958608627, "learning_rate": 3.4205574606524227e-06, "loss": 0.0003, "step": 18468 }, { "epoch": 4.87218045112782, "grad_norm": 0.06928785145282745, "learning_rate": 3.42020575046162e-06, "loss": 0.0008, "step": 18470 }, { "epoch": 4.872708086004485, "grad_norm": 0.016119448468089104, "learning_rate": 3.4198540402708166e-06, "loss": 0.0002, "step": 18472 }, { "epoch": 4.8732357208811505, "grad_norm": 0.021141622215509415, "learning_rate": 3.419502330080014e-06, "loss": 0.0002, "step": 18474 }, { "epoch": 4.8737633557578155, "grad_norm": 0.033380914479494095, "learning_rate": 3.4191506198892113e-06, "loss": 0.0004, "step": 18476 }, { "epoch": 4.874290990634481, "grad_norm": 0.0018662404036149383, "learning_rate": 3.418798909698409e-06, "loss": 0.0002, "step": 18478 }, { "epoch": 4.874818625511146, "grad_norm": 0.003678677137941122, "learning_rate": 3.4184471995076056e-06, "loss": 0.0001, "step": 18480 }, { "epoch": 4.875346260387811, "grad_norm": 0.006788542028516531, "learning_rate": 3.418095489316803e-06, "loss": 0.0012, "step": 18482 }, { "epoch": 4.875873895264477, "grad_norm": 0.3398001492023468, "learning_rate": 3.4177437791260004e-06, "loss": 0.0049, "step": 18484 }, { "epoch": 4.876401530141142, "grad_norm": 0.006135171744972467, "learning_rate": 3.4173920689351973e-06, "loss": 0.0004, "step": 18486 }, { "epoch": 4.876929165017808, "grad_norm": 0.24424393475055695, "learning_rate": 3.4170403587443947e-06, "loss": 0.001, "step": 18488 }, { "epoch": 4.877456799894473, "grad_norm": 0.770307183265686, "learning_rate": 3.416688648553592e-06, "loss": 0.0028, "step": 18490 }, { "epoch": 4.877984434771139, "grad_norm": 0.007901420816779137, "learning_rate": 3.4163369383627895e-06, "loss": 0.0002, "step": 18492 }, { "epoch": 4.878512069647804, "grad_norm": 0.009923248551785946, "learning_rate": 3.415985228171986e-06, "loss": 0.0001, "step": 18494 }, { "epoch": 4.87903970452447, "grad_norm": 0.30025991797447205, "learning_rate": 3.4156335179811834e-06, "loss": 0.0027, "step": 18496 }, { "epoch": 4.8795673394011345, "grad_norm": 0.026790468022227287, "learning_rate": 3.4152818077903807e-06, "loss": 0.0002, "step": 18498 }, { "epoch": 4.8800949742777995, "grad_norm": 0.04030488431453705, "learning_rate": 3.414930097599578e-06, "loss": 0.0002, "step": 18500 }, { "epoch": 4.880622609154465, "grad_norm": 0.051802828907966614, "learning_rate": 3.414578387408775e-06, "loss": 0.0004, "step": 18502 }, { "epoch": 4.88115024403113, "grad_norm": 0.18419277667999268, "learning_rate": 3.4142266772179725e-06, "loss": 0.0006, "step": 18504 }, { "epoch": 4.881677878907796, "grad_norm": 0.02229374647140503, "learning_rate": 3.41387496702717e-06, "loss": 0.0022, "step": 18506 }, { "epoch": 4.882205513784461, "grad_norm": 0.043758321553468704, "learning_rate": 3.413523256836367e-06, "loss": 0.0002, "step": 18508 }, { "epoch": 4.882733148661126, "grad_norm": 0.0019869087263941765, "learning_rate": 3.413171546645564e-06, "loss": 0.0001, "step": 18510 }, { "epoch": 4.883260783537792, "grad_norm": 0.0020946671720594168, "learning_rate": 3.4128198364547615e-06, "loss": 0.0001, "step": 18512 }, { "epoch": 4.883788418414457, "grad_norm": 0.001645789248868823, "learning_rate": 3.412468126263959e-06, "loss": 0.0002, "step": 18514 }, { "epoch": 4.884316053291123, "grad_norm": 0.05259966105222702, "learning_rate": 3.4121164160731554e-06, "loss": 0.0009, "step": 18516 }, { "epoch": 4.884843688167788, "grad_norm": 0.003866087179630995, "learning_rate": 3.411764705882353e-06, "loss": 0.0001, "step": 18518 }, { "epoch": 4.8853713230444535, "grad_norm": 0.004529758356511593, "learning_rate": 3.41141299569155e-06, "loss": 0.0001, "step": 18520 }, { "epoch": 4.8858989579211185, "grad_norm": 0.01615804247558117, "learning_rate": 3.4110612855007476e-06, "loss": 0.0003, "step": 18522 }, { "epoch": 4.886426592797784, "grad_norm": 0.09104273468255997, "learning_rate": 3.4107095753099445e-06, "loss": 0.0022, "step": 18524 }, { "epoch": 4.886954227674449, "grad_norm": 0.4031926989555359, "learning_rate": 3.410357865119142e-06, "loss": 0.0028, "step": 18526 }, { "epoch": 4.887481862551114, "grad_norm": 0.006000349298119545, "learning_rate": 3.4100061549283393e-06, "loss": 0.0002, "step": 18528 }, { "epoch": 4.88800949742778, "grad_norm": 0.2115718126296997, "learning_rate": 3.4096544447375366e-06, "loss": 0.0006, "step": 18530 }, { "epoch": 4.888537132304445, "grad_norm": 0.013705822639167309, "learning_rate": 3.409302734546733e-06, "loss": 0.0001, "step": 18532 }, { "epoch": 4.889064767181111, "grad_norm": 0.001369850360788405, "learning_rate": 3.4089510243559305e-06, "loss": 0.0002, "step": 18534 }, { "epoch": 4.889592402057776, "grad_norm": 0.006896620616316795, "learning_rate": 3.4085993141651283e-06, "loss": 0.0002, "step": 18536 }, { "epoch": 4.890120036934442, "grad_norm": 0.03209059312939644, "learning_rate": 3.4082476039743257e-06, "loss": 0.0002, "step": 18538 }, { "epoch": 4.890647671811107, "grad_norm": 0.10758519172668457, "learning_rate": 3.4078958937835223e-06, "loss": 0.0007, "step": 18540 }, { "epoch": 4.891175306687772, "grad_norm": 0.00770231569185853, "learning_rate": 3.4075441835927196e-06, "loss": 0.0035, "step": 18542 }, { "epoch": 4.8917029415644375, "grad_norm": 0.005903477314859629, "learning_rate": 3.407192473401917e-06, "loss": 0.0001, "step": 18544 }, { "epoch": 4.8922305764411025, "grad_norm": 0.18451176583766937, "learning_rate": 3.406840763211114e-06, "loss": 0.0031, "step": 18546 }, { "epoch": 4.892758211317768, "grad_norm": 0.0017671394161880016, "learning_rate": 3.4064890530203113e-06, "loss": 0.0001, "step": 18548 }, { "epoch": 4.893285846194433, "grad_norm": 0.02812226489186287, "learning_rate": 3.4061373428295087e-06, "loss": 0.0001, "step": 18550 }, { "epoch": 4.893813481071099, "grad_norm": 0.04035191237926483, "learning_rate": 3.405785632638706e-06, "loss": 0.0002, "step": 18552 }, { "epoch": 4.894341115947764, "grad_norm": 0.00851944088935852, "learning_rate": 3.4054339224479026e-06, "loss": 0.0002, "step": 18554 }, { "epoch": 4.894868750824429, "grad_norm": 0.019780931994318962, "learning_rate": 3.4050822122571e-06, "loss": 0.0002, "step": 18556 }, { "epoch": 4.895396385701095, "grad_norm": 0.003133020829409361, "learning_rate": 3.4047305020662974e-06, "loss": 0.0001, "step": 18558 }, { "epoch": 4.89592402057776, "grad_norm": 0.004127685911953449, "learning_rate": 3.4043787918754947e-06, "loss": 0.0001, "step": 18560 }, { "epoch": 4.896451655454426, "grad_norm": 0.013457000255584717, "learning_rate": 3.4040270816846917e-06, "loss": 0.0003, "step": 18562 }, { "epoch": 4.896979290331091, "grad_norm": 0.0022747814655303955, "learning_rate": 3.403675371493889e-06, "loss": 0.0002, "step": 18564 }, { "epoch": 4.8975069252077565, "grad_norm": 0.004150595050305128, "learning_rate": 3.4033236613030864e-06, "loss": 0.0001, "step": 18566 }, { "epoch": 4.8980345600844215, "grad_norm": 0.006691450718790293, "learning_rate": 3.402971951112284e-06, "loss": 0.0001, "step": 18568 }, { "epoch": 4.898562194961087, "grad_norm": 0.08326371014118195, "learning_rate": 3.4026202409214808e-06, "loss": 0.002, "step": 18570 }, { "epoch": 4.899089829837752, "grad_norm": 0.575946033000946, "learning_rate": 3.402268530730678e-06, "loss": 0.0044, "step": 18572 }, { "epoch": 4.899617464714417, "grad_norm": 0.45319297909736633, "learning_rate": 3.4019168205398755e-06, "loss": 0.0017, "step": 18574 }, { "epoch": 4.900145099591083, "grad_norm": 0.6483225226402283, "learning_rate": 3.401565110349072e-06, "loss": 0.0014, "step": 18576 }, { "epoch": 4.900672734467748, "grad_norm": 0.2555001378059387, "learning_rate": 3.4012134001582694e-06, "loss": 0.0115, "step": 18578 }, { "epoch": 4.901200369344414, "grad_norm": 0.06881218403577805, "learning_rate": 3.400861689967467e-06, "loss": 0.0005, "step": 18580 }, { "epoch": 4.901728004221079, "grad_norm": 0.4606402516365051, "learning_rate": 3.400509979776664e-06, "loss": 0.0033, "step": 18582 }, { "epoch": 4.902255639097744, "grad_norm": 0.052481234073638916, "learning_rate": 3.400158269585861e-06, "loss": 0.0003, "step": 18584 }, { "epoch": 4.90278327397441, "grad_norm": 0.9125550985336304, "learning_rate": 3.3998065593950585e-06, "loss": 0.0067, "step": 18586 }, { "epoch": 4.903310908851075, "grad_norm": 0.34404879808425903, "learning_rate": 3.399454849204256e-06, "loss": 0.0011, "step": 18588 }, { "epoch": 4.9038385437277405, "grad_norm": 0.39359089732170105, "learning_rate": 3.3991031390134533e-06, "loss": 0.0017, "step": 18590 }, { "epoch": 4.9043661786044055, "grad_norm": 0.025071999058127403, "learning_rate": 3.39875142882265e-06, "loss": 0.0001, "step": 18592 }, { "epoch": 4.904893813481071, "grad_norm": 0.1690998524427414, "learning_rate": 3.3983997186318476e-06, "loss": 0.0013, "step": 18594 }, { "epoch": 4.905421448357736, "grad_norm": 0.02898409590125084, "learning_rate": 3.398048008441045e-06, "loss": 0.0001, "step": 18596 }, { "epoch": 4.905949083234402, "grad_norm": 0.1372232586145401, "learning_rate": 3.3976962982502423e-06, "loss": 0.0006, "step": 18598 }, { "epoch": 4.906476718111067, "grad_norm": 0.0981559157371521, "learning_rate": 3.397344588059439e-06, "loss": 0.0004, "step": 18600 }, { "epoch": 4.907004352987732, "grad_norm": 0.12363046407699585, "learning_rate": 3.3969928778686362e-06, "loss": 0.0005, "step": 18602 }, { "epoch": 4.907531987864398, "grad_norm": 0.007425817660987377, "learning_rate": 3.3966411676778336e-06, "loss": 0.0003, "step": 18604 }, { "epoch": 4.908059622741063, "grad_norm": 0.44999051094055176, "learning_rate": 3.3962894574870306e-06, "loss": 0.0027, "step": 18606 }, { "epoch": 4.908587257617729, "grad_norm": 0.03874938562512398, "learning_rate": 3.395937747296228e-06, "loss": 0.0002, "step": 18608 }, { "epoch": 4.909114892494394, "grad_norm": 0.004997908137738705, "learning_rate": 3.3955860371054253e-06, "loss": 0.0001, "step": 18610 }, { "epoch": 4.9096425273710596, "grad_norm": 0.003533827606588602, "learning_rate": 3.3952343269146227e-06, "loss": 0.0002, "step": 18612 }, { "epoch": 4.9101701622477245, "grad_norm": 0.011994459666311741, "learning_rate": 3.3948826167238192e-06, "loss": 0.0001, "step": 18614 }, { "epoch": 4.91069779712439, "grad_norm": 0.00917983241379261, "learning_rate": 3.3945309065330166e-06, "loss": 0.0004, "step": 18616 }, { "epoch": 4.911225432001055, "grad_norm": 0.23949500918388367, "learning_rate": 3.394179196342214e-06, "loss": 0.0005, "step": 18618 }, { "epoch": 4.91175306687772, "grad_norm": 0.005300498567521572, "learning_rate": 3.3938274861514118e-06, "loss": 0.0002, "step": 18620 }, { "epoch": 4.912280701754386, "grad_norm": 0.41162362694740295, "learning_rate": 3.3934757759606083e-06, "loss": 0.006, "step": 18622 }, { "epoch": 4.912808336631051, "grad_norm": 0.27719080448150635, "learning_rate": 3.3931240657698057e-06, "loss": 0.0007, "step": 18624 }, { "epoch": 4.913335971507717, "grad_norm": 0.1951165348291397, "learning_rate": 3.392772355579003e-06, "loss": 0.0105, "step": 18626 }, { "epoch": 4.913863606384382, "grad_norm": 0.0135816540569067, "learning_rate": 3.3924206453882004e-06, "loss": 0.0003, "step": 18628 }, { "epoch": 4.914391241261047, "grad_norm": 0.015713391825556755, "learning_rate": 3.3920689351973974e-06, "loss": 0.0004, "step": 18630 }, { "epoch": 4.914918876137713, "grad_norm": 0.42162376642227173, "learning_rate": 3.3917172250065948e-06, "loss": 0.0005, "step": 18632 }, { "epoch": 4.915446511014378, "grad_norm": 0.25425052642822266, "learning_rate": 3.391365514815792e-06, "loss": 0.0084, "step": 18634 }, { "epoch": 4.9159741458910435, "grad_norm": 0.09913081675767899, "learning_rate": 3.3910138046249887e-06, "loss": 0.0003, "step": 18636 }, { "epoch": 4.9165017807677085, "grad_norm": 0.2698518931865692, "learning_rate": 3.390662094434186e-06, "loss": 0.0017, "step": 18638 }, { "epoch": 4.917029415644374, "grad_norm": 0.011946117505431175, "learning_rate": 3.3903103842433834e-06, "loss": 0.0003, "step": 18640 }, { "epoch": 4.917557050521039, "grad_norm": 0.014729581773281097, "learning_rate": 3.389958674052581e-06, "loss": 0.0004, "step": 18642 }, { "epoch": 4.918084685397705, "grad_norm": 0.26666495203971863, "learning_rate": 3.3896069638617778e-06, "loss": 0.0019, "step": 18644 }, { "epoch": 4.91861232027437, "grad_norm": 0.024400046095252037, "learning_rate": 3.389255253670975e-06, "loss": 0.0001, "step": 18646 }, { "epoch": 4.919139955151035, "grad_norm": 0.021625375375151634, "learning_rate": 3.3889035434801725e-06, "loss": 0.0003, "step": 18648 }, { "epoch": 4.919667590027701, "grad_norm": 0.004590705968439579, "learning_rate": 3.38855183328937e-06, "loss": 0.0002, "step": 18650 }, { "epoch": 4.920195224904366, "grad_norm": 0.2896398901939392, "learning_rate": 3.388200123098567e-06, "loss": 0.0074, "step": 18652 }, { "epoch": 4.920722859781032, "grad_norm": 0.12427442520856857, "learning_rate": 3.387848412907764e-06, "loss": 0.0006, "step": 18654 }, { "epoch": 4.921250494657697, "grad_norm": 0.006655819248408079, "learning_rate": 3.3874967027169616e-06, "loss": 0.0002, "step": 18656 }, { "epoch": 4.9217781295343626, "grad_norm": 0.02242918498814106, "learning_rate": 3.387144992526159e-06, "loss": 0.0008, "step": 18658 }, { "epoch": 4.9223057644110275, "grad_norm": 0.04695408418774605, "learning_rate": 3.3867932823353555e-06, "loss": 0.0029, "step": 18660 }, { "epoch": 4.922833399287693, "grad_norm": 0.041575342416763306, "learning_rate": 3.386441572144553e-06, "loss": 0.0004, "step": 18662 }, { "epoch": 4.923361034164358, "grad_norm": 0.012748712673783302, "learning_rate": 3.3860898619537502e-06, "loss": 0.0002, "step": 18664 }, { "epoch": 4.923888669041023, "grad_norm": 0.021067509427666664, "learning_rate": 3.385738151762947e-06, "loss": 0.002, "step": 18666 }, { "epoch": 4.924416303917689, "grad_norm": 0.010535898618400097, "learning_rate": 3.3853864415721446e-06, "loss": 0.0041, "step": 18668 }, { "epoch": 4.924943938794354, "grad_norm": 0.0035082099493592978, "learning_rate": 3.385034731381342e-06, "loss": 0.0002, "step": 18670 }, { "epoch": 4.92547157367102, "grad_norm": 0.004497067537158728, "learning_rate": 3.3846830211905393e-06, "loss": 0.0001, "step": 18672 }, { "epoch": 4.925999208547685, "grad_norm": 0.12375402450561523, "learning_rate": 3.384331310999736e-06, "loss": 0.0018, "step": 18674 }, { "epoch": 4.92652684342435, "grad_norm": 0.1361958533525467, "learning_rate": 3.3839796008089332e-06, "loss": 0.0036, "step": 18676 }, { "epoch": 4.927054478301016, "grad_norm": 0.01821497455239296, "learning_rate": 3.383627890618131e-06, "loss": 0.0001, "step": 18678 }, { "epoch": 4.927582113177681, "grad_norm": 0.0030869394540786743, "learning_rate": 3.3832761804273284e-06, "loss": 0.0002, "step": 18680 }, { "epoch": 4.9281097480543465, "grad_norm": 0.5835244655609131, "learning_rate": 3.382924470236525e-06, "loss": 0.0042, "step": 18682 }, { "epoch": 4.9286373829310115, "grad_norm": 0.00831050518900156, "learning_rate": 3.3825727600457223e-06, "loss": 0.0017, "step": 18684 }, { "epoch": 4.929165017807677, "grad_norm": 0.5500089526176453, "learning_rate": 3.3822210498549197e-06, "loss": 0.0014, "step": 18686 }, { "epoch": 4.929692652684342, "grad_norm": 0.01348783727735281, "learning_rate": 3.381869339664117e-06, "loss": 0.0002, "step": 18688 }, { "epoch": 4.930220287561008, "grad_norm": 0.0178789421916008, "learning_rate": 3.381517629473314e-06, "loss": 0.0008, "step": 18690 }, { "epoch": 4.930747922437673, "grad_norm": 0.006440753582865, "learning_rate": 3.3811659192825114e-06, "loss": 0.0003, "step": 18692 }, { "epoch": 4.931275557314338, "grad_norm": 0.867338240146637, "learning_rate": 3.3808142090917088e-06, "loss": 0.004, "step": 18694 }, { "epoch": 4.931803192191004, "grad_norm": 0.006256389897316694, "learning_rate": 3.3804624989009053e-06, "loss": 0.0001, "step": 18696 }, { "epoch": 4.932330827067669, "grad_norm": 0.1566256731748581, "learning_rate": 3.3801107887101027e-06, "loss": 0.0017, "step": 18698 }, { "epoch": 4.932858461944335, "grad_norm": 0.004034844692796469, "learning_rate": 3.3797590785193e-06, "loss": 0.0001, "step": 18700 }, { "epoch": 4.933386096821, "grad_norm": 0.022312117740511894, "learning_rate": 3.3794073683284974e-06, "loss": 0.0002, "step": 18702 }, { "epoch": 4.933913731697666, "grad_norm": 0.11224163323640823, "learning_rate": 3.3790556581376944e-06, "loss": 0.0003, "step": 18704 }, { "epoch": 4.9344413665743305, "grad_norm": 0.004916087724268436, "learning_rate": 3.3787039479468917e-06, "loss": 0.0001, "step": 18706 }, { "epoch": 4.934969001450996, "grad_norm": 0.3824150264263153, "learning_rate": 3.378352237756089e-06, "loss": 0.002, "step": 18708 }, { "epoch": 4.935496636327661, "grad_norm": 0.001210472546517849, "learning_rate": 3.3780005275652865e-06, "loss": 0.0001, "step": 18710 }, { "epoch": 4.936024271204326, "grad_norm": 0.002487865975126624, "learning_rate": 3.3776488173744834e-06, "loss": 0.0001, "step": 18712 }, { "epoch": 4.936551906080992, "grad_norm": 0.0031843092292547226, "learning_rate": 3.377297107183681e-06, "loss": 0.0015, "step": 18714 }, { "epoch": 4.937079540957657, "grad_norm": 0.3014453053474426, "learning_rate": 3.376945396992878e-06, "loss": 0.0013, "step": 18716 }, { "epoch": 4.937607175834323, "grad_norm": 0.7962591052055359, "learning_rate": 3.3765936868020756e-06, "loss": 0.016, "step": 18718 }, { "epoch": 4.938134810710988, "grad_norm": 0.0035269276704639196, "learning_rate": 3.376241976611272e-06, "loss": 0.0001, "step": 18720 }, { "epoch": 4.938662445587653, "grad_norm": 0.004007289186120033, "learning_rate": 3.3758902664204695e-06, "loss": 0.0001, "step": 18722 }, { "epoch": 4.939190080464319, "grad_norm": 0.02277037687599659, "learning_rate": 3.375538556229667e-06, "loss": 0.0004, "step": 18724 }, { "epoch": 4.939717715340984, "grad_norm": 0.12288355827331543, "learning_rate": 3.375186846038864e-06, "loss": 0.0002, "step": 18726 }, { "epoch": 4.9402453502176495, "grad_norm": 0.4615280330181122, "learning_rate": 3.374835135848061e-06, "loss": 0.0038, "step": 18728 }, { "epoch": 4.9407729850943145, "grad_norm": 0.03888964280486107, "learning_rate": 3.3744834256572586e-06, "loss": 0.0004, "step": 18730 }, { "epoch": 4.94130061997098, "grad_norm": 0.0077783274464309216, "learning_rate": 3.374131715466456e-06, "loss": 0.0002, "step": 18732 }, { "epoch": 4.941828254847645, "grad_norm": 0.03066619299352169, "learning_rate": 3.3737800052756525e-06, "loss": 0.0002, "step": 18734 }, { "epoch": 4.942355889724311, "grad_norm": 0.288173109292984, "learning_rate": 3.3734282950848503e-06, "loss": 0.0016, "step": 18736 }, { "epoch": 4.942883524600976, "grad_norm": 0.0032782957423478365, "learning_rate": 3.3730765848940476e-06, "loss": 0.0001, "step": 18738 }, { "epoch": 4.943411159477641, "grad_norm": 0.009624956175684929, "learning_rate": 3.372724874703245e-06, "loss": 0.0001, "step": 18740 }, { "epoch": 4.943938794354307, "grad_norm": 0.002912333933636546, "learning_rate": 3.3723731645124415e-06, "loss": 0.0001, "step": 18742 }, { "epoch": 4.944466429230972, "grad_norm": 0.016117045655846596, "learning_rate": 3.372021454321639e-06, "loss": 0.0001, "step": 18744 }, { "epoch": 4.944994064107638, "grad_norm": 0.0015576137229800224, "learning_rate": 3.3716697441308363e-06, "loss": 0.001, "step": 18746 }, { "epoch": 4.945521698984303, "grad_norm": 0.031739722937345505, "learning_rate": 3.3713180339400337e-06, "loss": 0.0071, "step": 18748 }, { "epoch": 4.946049333860969, "grad_norm": 0.014876465313136578, "learning_rate": 3.3709663237492306e-06, "loss": 0.0021, "step": 18750 }, { "epoch": 4.9465769687376335, "grad_norm": 0.34325486421585083, "learning_rate": 3.370614613558428e-06, "loss": 0.0007, "step": 18752 }, { "epoch": 4.947104603614299, "grad_norm": 0.2756442129611969, "learning_rate": 3.3702629033676254e-06, "loss": 0.0006, "step": 18754 }, { "epoch": 4.947632238490964, "grad_norm": 0.02486349828541279, "learning_rate": 3.369911193176822e-06, "loss": 0.0007, "step": 18756 }, { "epoch": 4.948159873367629, "grad_norm": 0.4197828471660614, "learning_rate": 3.3695594829860193e-06, "loss": 0.0134, "step": 18758 }, { "epoch": 4.948687508244295, "grad_norm": 0.007034790236502886, "learning_rate": 3.3692077727952167e-06, "loss": 0.0002, "step": 18760 }, { "epoch": 4.94921514312096, "grad_norm": 0.10644753277301788, "learning_rate": 3.3688560626044145e-06, "loss": 0.01, "step": 18762 }, { "epoch": 4.949742777997626, "grad_norm": 1.0006593465805054, "learning_rate": 3.368504352413611e-06, "loss": 0.008, "step": 18764 }, { "epoch": 4.950270412874291, "grad_norm": 0.04629570245742798, "learning_rate": 3.3681526422228084e-06, "loss": 0.0012, "step": 18766 }, { "epoch": 4.950798047750956, "grad_norm": 0.004614970181137323, "learning_rate": 3.3678009320320057e-06, "loss": 0.0002, "step": 18768 }, { "epoch": 4.951325682627622, "grad_norm": 0.07958231121301651, "learning_rate": 3.367449221841203e-06, "loss": 0.0061, "step": 18770 }, { "epoch": 4.951853317504287, "grad_norm": 0.01866612210869789, "learning_rate": 3.3670975116504e-06, "loss": 0.0008, "step": 18772 }, { "epoch": 4.9523809523809526, "grad_norm": 0.01697615720331669, "learning_rate": 3.3667458014595974e-06, "loss": 0.0018, "step": 18774 }, { "epoch": 4.9529085872576175, "grad_norm": 0.3764452636241913, "learning_rate": 3.366394091268795e-06, "loss": 0.0006, "step": 18776 }, { "epoch": 4.953436222134283, "grad_norm": 0.10961240530014038, "learning_rate": 3.366042381077992e-06, "loss": 0.0086, "step": 18778 }, { "epoch": 4.953963857010948, "grad_norm": 0.6299934983253479, "learning_rate": 3.3656906708871887e-06, "loss": 0.003, "step": 18780 }, { "epoch": 4.954491491887614, "grad_norm": 0.14394471049308777, "learning_rate": 3.365338960696386e-06, "loss": 0.003, "step": 18782 }, { "epoch": 4.955019126764279, "grad_norm": 0.1294964700937271, "learning_rate": 3.3649872505055835e-06, "loss": 0.0021, "step": 18784 }, { "epoch": 4.955546761640944, "grad_norm": 0.2417605221271515, "learning_rate": 3.3646355403147804e-06, "loss": 0.0012, "step": 18786 }, { "epoch": 4.95607439651761, "grad_norm": 0.6212524771690369, "learning_rate": 3.364283830123978e-06, "loss": 0.0025, "step": 18788 }, { "epoch": 4.956602031394275, "grad_norm": 0.03982343152165413, "learning_rate": 3.363932119933175e-06, "loss": 0.0031, "step": 18790 }, { "epoch": 4.957129666270941, "grad_norm": 0.004650580696761608, "learning_rate": 3.3635804097423726e-06, "loss": 0.0001, "step": 18792 }, { "epoch": 4.957657301147606, "grad_norm": 0.13238966464996338, "learning_rate": 3.3632286995515695e-06, "loss": 0.0004, "step": 18794 }, { "epoch": 4.958184936024272, "grad_norm": 0.002603552071377635, "learning_rate": 3.362876989360767e-06, "loss": 0.0003, "step": 18796 }, { "epoch": 4.9587125709009365, "grad_norm": 0.009418299421668053, "learning_rate": 3.3625252791699643e-06, "loss": 0.0001, "step": 18798 }, { "epoch": 4.959240205777602, "grad_norm": 0.0018148478120565414, "learning_rate": 3.3621735689791616e-06, "loss": 0.0003, "step": 18800 }, { "epoch": 4.959767840654267, "grad_norm": 0.00420738011598587, "learning_rate": 3.361821858788358e-06, "loss": 0.0002, "step": 18802 }, { "epoch": 4.960295475530932, "grad_norm": 0.007145201321691275, "learning_rate": 3.3614701485975555e-06, "loss": 0.0002, "step": 18804 }, { "epoch": 4.960823110407598, "grad_norm": 0.017948919907212257, "learning_rate": 3.361118438406753e-06, "loss": 0.0036, "step": 18806 }, { "epoch": 4.961350745284263, "grad_norm": 0.167745903134346, "learning_rate": 3.3607667282159503e-06, "loss": 0.0006, "step": 18808 }, { "epoch": 4.961878380160929, "grad_norm": 0.0015252841403707862, "learning_rate": 3.3604150180251472e-06, "loss": 0.0055, "step": 18810 }, { "epoch": 4.962406015037594, "grad_norm": 0.004940187558531761, "learning_rate": 3.3600633078343446e-06, "loss": 0.0015, "step": 18812 }, { "epoch": 4.962933649914259, "grad_norm": 0.24657316505908966, "learning_rate": 3.359711597643542e-06, "loss": 0.0004, "step": 18814 }, { "epoch": 4.963461284790925, "grad_norm": 0.002280775923281908, "learning_rate": 3.3593598874527385e-06, "loss": 0.0001, "step": 18816 }, { "epoch": 4.96398891966759, "grad_norm": 0.26610738039016724, "learning_rate": 3.359008177261936e-06, "loss": 0.001, "step": 18818 }, { "epoch": 4.9645165545442556, "grad_norm": 0.003876608097925782, "learning_rate": 3.3586564670711337e-06, "loss": 0.0003, "step": 18820 }, { "epoch": 4.9650441894209205, "grad_norm": 0.02325773425400257, "learning_rate": 3.358304756880331e-06, "loss": 0.0006, "step": 18822 }, { "epoch": 4.965571824297586, "grad_norm": 0.43332964181900024, "learning_rate": 3.3579530466895276e-06, "loss": 0.0047, "step": 18824 }, { "epoch": 4.966099459174251, "grad_norm": 0.005547536537051201, "learning_rate": 3.357601336498725e-06, "loss": 0.0001, "step": 18826 }, { "epoch": 4.966627094050917, "grad_norm": 0.03130101040005684, "learning_rate": 3.3572496263079224e-06, "loss": 0.0002, "step": 18828 }, { "epoch": 4.967154728927582, "grad_norm": 0.003874043235555291, "learning_rate": 3.3568979161171197e-06, "loss": 0.0001, "step": 18830 }, { "epoch": 4.967682363804247, "grad_norm": 0.01395034696906805, "learning_rate": 3.3565462059263167e-06, "loss": 0.0018, "step": 18832 }, { "epoch": 4.968209998680913, "grad_norm": 0.07027392834424973, "learning_rate": 3.356194495735514e-06, "loss": 0.0003, "step": 18834 }, { "epoch": 4.968737633557578, "grad_norm": 0.005055807065218687, "learning_rate": 3.3558427855447114e-06, "loss": 0.0003, "step": 18836 }, { "epoch": 4.969265268434244, "grad_norm": 0.011210449039936066, "learning_rate": 3.355491075353909e-06, "loss": 0.0003, "step": 18838 }, { "epoch": 4.969792903310909, "grad_norm": 0.02098279818892479, "learning_rate": 3.3551393651631053e-06, "loss": 0.0002, "step": 18840 }, { "epoch": 4.970320538187575, "grad_norm": 0.16688460111618042, "learning_rate": 3.3547876549723027e-06, "loss": 0.0019, "step": 18842 }, { "epoch": 4.9708481730642395, "grad_norm": 0.007800564169883728, "learning_rate": 3.3544359447815e-06, "loss": 0.0001, "step": 18844 }, { "epoch": 4.971375807940905, "grad_norm": 0.0332857184112072, "learning_rate": 3.354084234590697e-06, "loss": 0.0005, "step": 18846 }, { "epoch": 4.97190344281757, "grad_norm": 0.014097915962338448, "learning_rate": 3.3537325243998944e-06, "loss": 0.0002, "step": 18848 }, { "epoch": 4.972431077694235, "grad_norm": 0.0023251213133335114, "learning_rate": 3.353380814209092e-06, "loss": 0.0025, "step": 18850 }, { "epoch": 4.972958712570901, "grad_norm": 0.005064132157713175, "learning_rate": 3.353029104018289e-06, "loss": 0.0001, "step": 18852 }, { "epoch": 4.973486347447566, "grad_norm": 0.4522189795970917, "learning_rate": 3.352677393827486e-06, "loss": 0.0025, "step": 18854 }, { "epoch": 4.974013982324232, "grad_norm": 0.8809122443199158, "learning_rate": 3.3523256836366835e-06, "loss": 0.0052, "step": 18856 }, { "epoch": 4.974541617200897, "grad_norm": 0.004817991517484188, "learning_rate": 3.351973973445881e-06, "loss": 0.0001, "step": 18858 }, { "epoch": 4.975069252077562, "grad_norm": 0.6245167851448059, "learning_rate": 3.3516222632550782e-06, "loss": 0.0061, "step": 18860 }, { "epoch": 4.975596886954228, "grad_norm": 0.03006359376013279, "learning_rate": 3.3512705530642748e-06, "loss": 0.0002, "step": 18862 }, { "epoch": 4.976124521830893, "grad_norm": 0.2952217757701874, "learning_rate": 3.350918842873472e-06, "loss": 0.0011, "step": 18864 }, { "epoch": 4.976652156707559, "grad_norm": 0.007335249800235033, "learning_rate": 3.3505671326826695e-06, "loss": 0.0002, "step": 18866 }, { "epoch": 4.9771797915842235, "grad_norm": 0.27391013503074646, "learning_rate": 3.350215422491867e-06, "loss": 0.0072, "step": 18868 }, { "epoch": 4.977707426460889, "grad_norm": 0.0346459336578846, "learning_rate": 3.349863712301064e-06, "loss": 0.0004, "step": 18870 }, { "epoch": 4.978235061337554, "grad_norm": 0.17476578056812286, "learning_rate": 3.3495120021102612e-06, "loss": 0.0006, "step": 18872 }, { "epoch": 4.97876269621422, "grad_norm": 0.014438331127166748, "learning_rate": 3.3491602919194586e-06, "loss": 0.0003, "step": 18874 }, { "epoch": 4.979290331090885, "grad_norm": 0.024728886783123016, "learning_rate": 3.348808581728656e-06, "loss": 0.0122, "step": 18876 }, { "epoch": 4.97981796596755, "grad_norm": 0.1822051852941513, "learning_rate": 3.348456871537853e-06, "loss": 0.0007, "step": 18878 }, { "epoch": 4.980345600844216, "grad_norm": 0.10246299207210541, "learning_rate": 3.3481051613470503e-06, "loss": 0.0062, "step": 18880 }, { "epoch": 4.980873235720881, "grad_norm": 0.008027877658605576, "learning_rate": 3.3477534511562477e-06, "loss": 0.0004, "step": 18882 }, { "epoch": 4.981400870597547, "grad_norm": 0.2051309049129486, "learning_rate": 3.3474017409654442e-06, "loss": 0.001, "step": 18884 }, { "epoch": 4.981928505474212, "grad_norm": 0.019121548160910606, "learning_rate": 3.3470500307746416e-06, "loss": 0.0002, "step": 18886 }, { "epoch": 4.982456140350877, "grad_norm": 0.6946781873703003, "learning_rate": 3.346698320583839e-06, "loss": 0.003, "step": 18888 }, { "epoch": 4.9829837752275425, "grad_norm": 0.1488547921180725, "learning_rate": 3.3463466103930363e-06, "loss": 0.0004, "step": 18890 }, { "epoch": 4.9835114101042075, "grad_norm": 1.2684506177902222, "learning_rate": 3.3459949002022333e-06, "loss": 0.0036, "step": 18892 }, { "epoch": 4.984039044980873, "grad_norm": 0.02110123448073864, "learning_rate": 3.3456431900114307e-06, "loss": 0.0011, "step": 18894 }, { "epoch": 4.984566679857538, "grad_norm": 0.011003942228853703, "learning_rate": 3.345291479820628e-06, "loss": 0.0002, "step": 18896 }, { "epoch": 4.985094314734204, "grad_norm": 0.6762385964393616, "learning_rate": 3.3449397696298254e-06, "loss": 0.003, "step": 18898 }, { "epoch": 4.985621949610869, "grad_norm": 0.016250431537628174, "learning_rate": 3.344588059439022e-06, "loss": 0.0002, "step": 18900 }, { "epoch": 4.986149584487535, "grad_norm": 0.005520412232726812, "learning_rate": 3.3442363492482193e-06, "loss": 0.0001, "step": 18902 }, { "epoch": 4.9866772193642, "grad_norm": 0.08315442502498627, "learning_rate": 3.343884639057417e-06, "loss": 0.0005, "step": 18904 }, { "epoch": 4.987204854240865, "grad_norm": 0.09992721676826477, "learning_rate": 3.3435329288666145e-06, "loss": 0.0004, "step": 18906 }, { "epoch": 4.987732489117531, "grad_norm": 0.003334179986268282, "learning_rate": 3.343181218675811e-06, "loss": 0.0002, "step": 18908 }, { "epoch": 4.988260123994196, "grad_norm": 0.1073303148150444, "learning_rate": 3.3428295084850084e-06, "loss": 0.0004, "step": 18910 }, { "epoch": 4.988787758870862, "grad_norm": 0.0032748919911682606, "learning_rate": 3.3424777982942058e-06, "loss": 0.0031, "step": 18912 }, { "epoch": 4.9893153937475265, "grad_norm": 0.007873627357184887, "learning_rate": 3.3421260881034027e-06, "loss": 0.0011, "step": 18914 }, { "epoch": 4.989843028624192, "grad_norm": 0.2645203471183777, "learning_rate": 3.3417743779126e-06, "loss": 0.0041, "step": 18916 }, { "epoch": 4.990370663500857, "grad_norm": 0.024829505011439323, "learning_rate": 3.3414226677217975e-06, "loss": 0.0069, "step": 18918 }, { "epoch": 4.990898298377523, "grad_norm": 0.007255772594362497, "learning_rate": 3.341070957530995e-06, "loss": 0.0001, "step": 18920 }, { "epoch": 4.991425933254188, "grad_norm": 0.006173485424369574, "learning_rate": 3.3407192473401914e-06, "loss": 0.0004, "step": 18922 }, { "epoch": 4.991953568130853, "grad_norm": 0.014264610596001148, "learning_rate": 3.3403675371493888e-06, "loss": 0.0003, "step": 18924 }, { "epoch": 4.992481203007519, "grad_norm": 0.023221662268042564, "learning_rate": 3.340015826958586e-06, "loss": 0.0002, "step": 18926 }, { "epoch": 4.993008837884184, "grad_norm": 0.0036585908383131027, "learning_rate": 3.3396641167677835e-06, "loss": 0.0145, "step": 18928 }, { "epoch": 4.99353647276085, "grad_norm": 0.1893455684185028, "learning_rate": 3.3393124065769805e-06, "loss": 0.0018, "step": 18930 }, { "epoch": 4.994064107637515, "grad_norm": 0.03909609094262123, "learning_rate": 3.338960696386178e-06, "loss": 0.0003, "step": 18932 }, { "epoch": 4.99459174251418, "grad_norm": 0.6157294511795044, "learning_rate": 3.3386089861953752e-06, "loss": 0.0013, "step": 18934 }, { "epoch": 4.9951193773908455, "grad_norm": 0.003667277516797185, "learning_rate": 3.3382572760045726e-06, "loss": 0.0012, "step": 18936 }, { "epoch": 4.9956470122675105, "grad_norm": 0.009116364642977715, "learning_rate": 3.3379055658137696e-06, "loss": 0.0002, "step": 18938 }, { "epoch": 4.996174647144176, "grad_norm": 0.010046330280601978, "learning_rate": 3.337553855622967e-06, "loss": 0.0003, "step": 18940 }, { "epoch": 4.996702282020841, "grad_norm": 0.025145728141069412, "learning_rate": 3.3372021454321643e-06, "loss": 0.0002, "step": 18942 }, { "epoch": 4.997229916897507, "grad_norm": 0.20992891490459442, "learning_rate": 3.336850435241361e-06, "loss": 0.0007, "step": 18944 }, { "epoch": 4.997757551774172, "grad_norm": 0.4113720655441284, "learning_rate": 3.336498725050558e-06, "loss": 0.0025, "step": 18946 }, { "epoch": 4.998285186650838, "grad_norm": 0.16304123401641846, "learning_rate": 3.3361470148597556e-06, "loss": 0.0004, "step": 18948 }, { "epoch": 4.998812821527503, "grad_norm": 0.006766538601368666, "learning_rate": 3.335795304668953e-06, "loss": 0.0002, "step": 18950 }, { "epoch": 4.999340456404168, "grad_norm": 0.11366577446460724, "learning_rate": 3.33544359447815e-06, "loss": 0.0042, "step": 18952 }, { "epoch": 4.999868091280834, "grad_norm": 0.8502700328826904, "learning_rate": 3.3350918842873473e-06, "loss": 0.0021, "step": 18954 }, { "epoch": 5.0002638174383325, "grad_norm": 0.039683934301137924, "learning_rate": 3.3347401740965447e-06, "loss": 0.0003, "step": 18956 }, { "epoch": 5.000791452314998, "grad_norm": 0.043131425976753235, "learning_rate": 3.334388463905742e-06, "loss": 0.0002, "step": 18958 }, { "epoch": 5.001319087191663, "grad_norm": 0.005566669628024101, "learning_rate": 3.3340367537149386e-06, "loss": 0.0014, "step": 18960 }, { "epoch": 5.001846722068329, "grad_norm": 0.008293628692626953, "learning_rate": 3.3336850435241364e-06, "loss": 0.0001, "step": 18962 }, { "epoch": 5.002374356944994, "grad_norm": 0.1880076378583908, "learning_rate": 3.3333333333333337e-06, "loss": 0.0009, "step": 18964 }, { "epoch": 5.002901991821659, "grad_norm": 0.09991281479597092, "learning_rate": 3.332981623142531e-06, "loss": 0.001, "step": 18966 }, { "epoch": 5.003429626698325, "grad_norm": 0.025507744401693344, "learning_rate": 3.3326299129517277e-06, "loss": 0.0011, "step": 18968 }, { "epoch": 5.00395726157499, "grad_norm": 0.10101959854364395, "learning_rate": 3.332278202760925e-06, "loss": 0.0006, "step": 18970 }, { "epoch": 5.004484896451656, "grad_norm": 0.06057767942547798, "learning_rate": 3.3319264925701224e-06, "loss": 0.0025, "step": 18972 }, { "epoch": 5.005012531328321, "grad_norm": 0.011840152554214, "learning_rate": 3.3315747823793194e-06, "loss": 0.0002, "step": 18974 }, { "epoch": 5.0055401662049865, "grad_norm": 0.18819282948970795, "learning_rate": 3.3312230721885167e-06, "loss": 0.0008, "step": 18976 }, { "epoch": 5.0060678010816515, "grad_norm": 0.18029150366783142, "learning_rate": 3.330871361997714e-06, "loss": 0.0009, "step": 18978 }, { "epoch": 5.0065954359583165, "grad_norm": 0.011365145444869995, "learning_rate": 3.3305196518069115e-06, "loss": 0.0004, "step": 18980 }, { "epoch": 5.007123070834982, "grad_norm": 0.0017102311830967665, "learning_rate": 3.330167941616108e-06, "loss": 0.0004, "step": 18982 }, { "epoch": 5.007650705711647, "grad_norm": 0.02386614866554737, "learning_rate": 3.3298162314253054e-06, "loss": 0.0002, "step": 18984 }, { "epoch": 5.008178340588313, "grad_norm": 0.019486255943775177, "learning_rate": 3.3294645212345028e-06, "loss": 0.0002, "step": 18986 }, { "epoch": 5.008705975464978, "grad_norm": 0.0078022717498242855, "learning_rate": 3.3291128110437006e-06, "loss": 0.0101, "step": 18988 }, { "epoch": 5.009233610341644, "grad_norm": 0.005288041662424803, "learning_rate": 3.328761100852897e-06, "loss": 0.0024, "step": 18990 }, { "epoch": 5.009761245218309, "grad_norm": 0.003372324164956808, "learning_rate": 3.3284093906620945e-06, "loss": 0.0005, "step": 18992 }, { "epoch": 5.010288880094974, "grad_norm": 0.002873521763831377, "learning_rate": 3.328057680471292e-06, "loss": 0.0001, "step": 18994 }, { "epoch": 5.01081651497164, "grad_norm": 0.004129112232476473, "learning_rate": 3.3277059702804892e-06, "loss": 0.0044, "step": 18996 }, { "epoch": 5.011344149848305, "grad_norm": 0.00955105572938919, "learning_rate": 3.327354260089686e-06, "loss": 0.0002, "step": 18998 }, { "epoch": 5.0118717847249705, "grad_norm": 0.037003058940172195, "learning_rate": 3.3270025498988835e-06, "loss": 0.0005, "step": 19000 }, { "epoch": 5.0123994196016355, "grad_norm": 0.011724979616701603, "learning_rate": 3.326650839708081e-06, "loss": 0.0001, "step": 19002 }, { "epoch": 5.012927054478301, "grad_norm": 0.0033018682152032852, "learning_rate": 3.3262991295172775e-06, "loss": 0.002, "step": 19004 }, { "epoch": 5.013454689354966, "grad_norm": 0.010785656981170177, "learning_rate": 3.325947419326475e-06, "loss": 0.0002, "step": 19006 }, { "epoch": 5.013982324231632, "grad_norm": 0.004029314033687115, "learning_rate": 3.325595709135672e-06, "loss": 0.0001, "step": 19008 }, { "epoch": 5.014509959108297, "grad_norm": 0.006199866067618132, "learning_rate": 3.3252439989448696e-06, "loss": 0.0001, "step": 19010 }, { "epoch": 5.015037593984962, "grad_norm": 0.1177535355091095, "learning_rate": 3.3248922887540665e-06, "loss": 0.0005, "step": 19012 }, { "epoch": 5.015565228861628, "grad_norm": 0.005567936226725578, "learning_rate": 3.324540578563264e-06, "loss": 0.0017, "step": 19014 }, { "epoch": 5.016092863738293, "grad_norm": 0.0030631395056843758, "learning_rate": 3.3241888683724613e-06, "loss": 0.0001, "step": 19016 }, { "epoch": 5.016620498614959, "grad_norm": 0.04472992569208145, "learning_rate": 3.3238371581816587e-06, "loss": 0.0002, "step": 19018 }, { "epoch": 5.017148133491624, "grad_norm": 0.08806360512971878, "learning_rate": 3.3234854479908556e-06, "loss": 0.0019, "step": 19020 }, { "epoch": 5.0176757683682895, "grad_norm": 0.0014186243060976267, "learning_rate": 3.323133737800053e-06, "loss": 0.0001, "step": 19022 }, { "epoch": 5.0182034032449545, "grad_norm": 0.0037652826867997646, "learning_rate": 3.3227820276092504e-06, "loss": 0.0001, "step": 19024 }, { "epoch": 5.0187310381216195, "grad_norm": 0.398120254278183, "learning_rate": 3.3224303174184477e-06, "loss": 0.001, "step": 19026 }, { "epoch": 5.019258672998285, "grad_norm": 0.002396479481831193, "learning_rate": 3.3220786072276443e-06, "loss": 0.0001, "step": 19028 }, { "epoch": 5.01978630787495, "grad_norm": 0.0023194358218461275, "learning_rate": 3.3217268970368416e-06, "loss": 0.0002, "step": 19030 }, { "epoch": 5.020313942751616, "grad_norm": 0.004082771018147469, "learning_rate": 3.321375186846039e-06, "loss": 0.0001, "step": 19032 }, { "epoch": 5.020841577628281, "grad_norm": 0.5987210273742676, "learning_rate": 3.321023476655236e-06, "loss": 0.0032, "step": 19034 }, { "epoch": 5.021369212504947, "grad_norm": 0.029110245406627655, "learning_rate": 3.3206717664644333e-06, "loss": 0.0002, "step": 19036 }, { "epoch": 5.021896847381612, "grad_norm": 0.002222245093435049, "learning_rate": 3.3203200562736307e-06, "loss": 0.0001, "step": 19038 }, { "epoch": 5.022424482258277, "grad_norm": 0.0021205435041338205, "learning_rate": 3.319968346082828e-06, "loss": 0.0001, "step": 19040 }, { "epoch": 5.022952117134943, "grad_norm": 0.3424559533596039, "learning_rate": 3.3196166358920246e-06, "loss": 0.002, "step": 19042 }, { "epoch": 5.023479752011608, "grad_norm": 0.22212128341197968, "learning_rate": 3.319264925701222e-06, "loss": 0.0006, "step": 19044 }, { "epoch": 5.0240073868882735, "grad_norm": 0.07933001965284348, "learning_rate": 3.31891321551042e-06, "loss": 0.0028, "step": 19046 }, { "epoch": 5.0245350217649385, "grad_norm": 0.03047039732336998, "learning_rate": 3.318561505319617e-06, "loss": 0.0002, "step": 19048 }, { "epoch": 5.025062656641604, "grad_norm": 0.003867111634463072, "learning_rate": 3.3182097951288137e-06, "loss": 0.0003, "step": 19050 }, { "epoch": 5.025590291518269, "grad_norm": 0.04225653409957886, "learning_rate": 3.317858084938011e-06, "loss": 0.0002, "step": 19052 }, { "epoch": 5.026117926394934, "grad_norm": 0.003494629170745611, "learning_rate": 3.3175063747472085e-06, "loss": 0.0005, "step": 19054 }, { "epoch": 5.0266455612716, "grad_norm": 0.2177554965019226, "learning_rate": 3.317154664556406e-06, "loss": 0.0014, "step": 19056 }, { "epoch": 5.027173196148265, "grad_norm": 0.07430800050497055, "learning_rate": 3.316802954365603e-06, "loss": 0.0009, "step": 19058 }, { "epoch": 5.027700831024931, "grad_norm": 0.21630647778511047, "learning_rate": 3.3164512441748e-06, "loss": 0.0012, "step": 19060 }, { "epoch": 5.028228465901596, "grad_norm": 0.03828504681587219, "learning_rate": 3.3160995339839975e-06, "loss": 0.0004, "step": 19062 }, { "epoch": 5.028756100778262, "grad_norm": 0.13630954921245575, "learning_rate": 3.315747823793194e-06, "loss": 0.0004, "step": 19064 }, { "epoch": 5.029283735654927, "grad_norm": 0.0613778717815876, "learning_rate": 3.3153961136023914e-06, "loss": 0.0041, "step": 19066 }, { "epoch": 5.0298113705315926, "grad_norm": 0.009283439256250858, "learning_rate": 3.315044403411589e-06, "loss": 0.0008, "step": 19068 }, { "epoch": 5.0303390054082575, "grad_norm": 0.09566093981266022, "learning_rate": 3.314692693220786e-06, "loss": 0.001, "step": 19070 }, { "epoch": 5.0308666402849225, "grad_norm": 0.11684291064739227, "learning_rate": 3.314340983029983e-06, "loss": 0.0005, "step": 19072 }, { "epoch": 5.031394275161588, "grad_norm": 0.3219934105873108, "learning_rate": 3.3139892728391805e-06, "loss": 0.0011, "step": 19074 }, { "epoch": 5.031921910038253, "grad_norm": 0.18561358749866486, "learning_rate": 3.313637562648378e-06, "loss": 0.0005, "step": 19076 }, { "epoch": 5.032449544914919, "grad_norm": 0.041116684675216675, "learning_rate": 3.3132858524575753e-06, "loss": 0.0002, "step": 19078 }, { "epoch": 5.032977179791584, "grad_norm": 0.002562872366979718, "learning_rate": 3.3129341422667722e-06, "loss": 0.0002, "step": 19080 }, { "epoch": 5.03350481466825, "grad_norm": 0.004866653587669134, "learning_rate": 3.3125824320759696e-06, "loss": 0.0002, "step": 19082 }, { "epoch": 5.034032449544915, "grad_norm": 0.0035541648976504803, "learning_rate": 3.312230721885167e-06, "loss": 0.0001, "step": 19084 }, { "epoch": 5.03456008442158, "grad_norm": 0.074720598757267, "learning_rate": 3.3118790116943644e-06, "loss": 0.0002, "step": 19086 }, { "epoch": 5.035087719298246, "grad_norm": 0.005711452104151249, "learning_rate": 3.311527301503561e-06, "loss": 0.0001, "step": 19088 }, { "epoch": 5.035615354174911, "grad_norm": 0.036539945751428604, "learning_rate": 3.3111755913127583e-06, "loss": 0.0002, "step": 19090 }, { "epoch": 5.0361429890515765, "grad_norm": 0.0025812892708927393, "learning_rate": 3.3108238811219556e-06, "loss": 0.0002, "step": 19092 }, { "epoch": 5.0366706239282415, "grad_norm": 0.003934127744287252, "learning_rate": 3.3104721709311526e-06, "loss": 0.0001, "step": 19094 }, { "epoch": 5.037198258804907, "grad_norm": 0.005461468826979399, "learning_rate": 3.31012046074035e-06, "loss": 0.0001, "step": 19096 }, { "epoch": 5.037725893681572, "grad_norm": 0.001943163457326591, "learning_rate": 3.3097687505495473e-06, "loss": 0.0064, "step": 19098 }, { "epoch": 5.038253528558237, "grad_norm": 0.029944168403744698, "learning_rate": 3.3094170403587447e-06, "loss": 0.0003, "step": 19100 }, { "epoch": 5.038781163434903, "grad_norm": 0.017000891268253326, "learning_rate": 3.3090653301679412e-06, "loss": 0.0014, "step": 19102 }, { "epoch": 5.039308798311568, "grad_norm": 0.0031147864647209644, "learning_rate": 3.308713619977139e-06, "loss": 0.0001, "step": 19104 }, { "epoch": 5.039836433188234, "grad_norm": 0.043141186237335205, "learning_rate": 3.3083619097863364e-06, "loss": 0.0002, "step": 19106 }, { "epoch": 5.040364068064899, "grad_norm": 0.025539236143231392, "learning_rate": 3.308010199595534e-06, "loss": 0.0001, "step": 19108 }, { "epoch": 5.040891702941565, "grad_norm": 0.022055773064494133, "learning_rate": 3.3076584894047303e-06, "loss": 0.0002, "step": 19110 }, { "epoch": 5.04141933781823, "grad_norm": 0.034634944051504135, "learning_rate": 3.3073067792139277e-06, "loss": 0.0112, "step": 19112 }, { "epoch": 5.041946972694896, "grad_norm": 0.4269574284553528, "learning_rate": 3.306955069023125e-06, "loss": 0.0048, "step": 19114 }, { "epoch": 5.0424746075715605, "grad_norm": 0.004061069805175066, "learning_rate": 3.3066033588323225e-06, "loss": 0.0004, "step": 19116 }, { "epoch": 5.0430022424482255, "grad_norm": 0.008309041149914265, "learning_rate": 3.3062516486415194e-06, "loss": 0.0001, "step": 19118 }, { "epoch": 5.043529877324891, "grad_norm": 0.015971655026078224, "learning_rate": 3.3058999384507168e-06, "loss": 0.0001, "step": 19120 }, { "epoch": 5.044057512201556, "grad_norm": 0.016322635114192963, "learning_rate": 3.305548228259914e-06, "loss": 0.0001, "step": 19122 }, { "epoch": 5.044585147078222, "grad_norm": 0.08014455437660217, "learning_rate": 3.3051965180691107e-06, "loss": 0.0004, "step": 19124 }, { "epoch": 5.045112781954887, "grad_norm": 0.00598027091473341, "learning_rate": 3.304844807878308e-06, "loss": 0.0001, "step": 19126 }, { "epoch": 5.045640416831553, "grad_norm": 0.027941111475229263, "learning_rate": 3.3044930976875054e-06, "loss": 0.0002, "step": 19128 }, { "epoch": 5.046168051708218, "grad_norm": 0.0067240833304822445, "learning_rate": 3.304141387496703e-06, "loss": 0.0014, "step": 19130 }, { "epoch": 5.046695686584883, "grad_norm": 0.001985569717362523, "learning_rate": 3.3037896773058998e-06, "loss": 0.0002, "step": 19132 }, { "epoch": 5.047223321461549, "grad_norm": 0.05958234518766403, "learning_rate": 3.303437967115097e-06, "loss": 0.0003, "step": 19134 }, { "epoch": 5.047750956338214, "grad_norm": 0.006635978817939758, "learning_rate": 3.3030862569242945e-06, "loss": 0.0001, "step": 19136 }, { "epoch": 5.0482785912148795, "grad_norm": 0.19782738387584686, "learning_rate": 3.302734546733492e-06, "loss": 0.0065, "step": 19138 }, { "epoch": 5.0488062260915445, "grad_norm": 0.004498288035392761, "learning_rate": 3.302382836542689e-06, "loss": 0.0001, "step": 19140 }, { "epoch": 5.04933386096821, "grad_norm": 0.028340492397546768, "learning_rate": 3.3020311263518862e-06, "loss": 0.0002, "step": 19142 }, { "epoch": 5.049861495844875, "grad_norm": 0.11569562554359436, "learning_rate": 3.3016794161610836e-06, "loss": 0.0021, "step": 19144 }, { "epoch": 5.05038913072154, "grad_norm": 0.7579430937767029, "learning_rate": 3.301327705970281e-06, "loss": 0.0013, "step": 19146 }, { "epoch": 5.050916765598206, "grad_norm": 0.02195681631565094, "learning_rate": 3.3009759957794775e-06, "loss": 0.0006, "step": 19148 }, { "epoch": 5.051444400474871, "grad_norm": 0.01564309373497963, "learning_rate": 3.300624285588675e-06, "loss": 0.0002, "step": 19150 }, { "epoch": 5.051972035351537, "grad_norm": 0.05539070814847946, "learning_rate": 3.3002725753978723e-06, "loss": 0.0002, "step": 19152 }, { "epoch": 5.052499670228202, "grad_norm": 0.04671907052397728, "learning_rate": 3.299920865207069e-06, "loss": 0.0002, "step": 19154 }, { "epoch": 5.053027305104868, "grad_norm": 0.030595002695918083, "learning_rate": 3.2995691550162666e-06, "loss": 0.0002, "step": 19156 }, { "epoch": 5.053554939981533, "grad_norm": 0.004485098645091057, "learning_rate": 3.299217444825464e-06, "loss": 0.0001, "step": 19158 }, { "epoch": 5.054082574858199, "grad_norm": 0.005117990542203188, "learning_rate": 3.2988657346346613e-06, "loss": 0.0001, "step": 19160 }, { "epoch": 5.0546102097348635, "grad_norm": 0.06603167951107025, "learning_rate": 3.2985140244438583e-06, "loss": 0.0021, "step": 19162 }, { "epoch": 5.0551378446115285, "grad_norm": 0.018511822447180748, "learning_rate": 3.2981623142530557e-06, "loss": 0.0001, "step": 19164 }, { "epoch": 5.055665479488194, "grad_norm": 0.003960104193538427, "learning_rate": 3.297810604062253e-06, "loss": 0.0016, "step": 19166 }, { "epoch": 5.056193114364859, "grad_norm": 0.004938255995512009, "learning_rate": 3.2974588938714504e-06, "loss": 0.0001, "step": 19168 }, { "epoch": 5.056720749241525, "grad_norm": 0.23956286907196045, "learning_rate": 3.297107183680647e-06, "loss": 0.0072, "step": 19170 }, { "epoch": 5.05724838411819, "grad_norm": 0.5144068002700806, "learning_rate": 3.2967554734898443e-06, "loss": 0.0067, "step": 19172 }, { "epoch": 5.057776018994856, "grad_norm": 0.002589515643194318, "learning_rate": 3.2964037632990417e-06, "loss": 0.0002, "step": 19174 }, { "epoch": 5.058303653871521, "grad_norm": 0.0076522729359567165, "learning_rate": 3.296052053108239e-06, "loss": 0.0001, "step": 19176 }, { "epoch": 5.058831288748186, "grad_norm": 0.002521658781915903, "learning_rate": 3.295700342917436e-06, "loss": 0.0001, "step": 19178 }, { "epoch": 5.059358923624852, "grad_norm": 0.002933983691036701, "learning_rate": 3.2953486327266334e-06, "loss": 0.0007, "step": 19180 }, { "epoch": 5.059886558501517, "grad_norm": 0.006074841134250164, "learning_rate": 3.2949969225358308e-06, "loss": 0.0001, "step": 19182 }, { "epoch": 5.0604141933781825, "grad_norm": 0.002986686071380973, "learning_rate": 3.2946452123450273e-06, "loss": 0.0002, "step": 19184 }, { "epoch": 5.0609418282548475, "grad_norm": 0.4263380765914917, "learning_rate": 3.2942935021542247e-06, "loss": 0.0012, "step": 19186 }, { "epoch": 5.061469463131513, "grad_norm": 0.5020505785942078, "learning_rate": 3.293941791963422e-06, "loss": 0.0013, "step": 19188 }, { "epoch": 5.061997098008178, "grad_norm": 0.03308161348104477, "learning_rate": 3.29359008177262e-06, "loss": 0.0003, "step": 19190 }, { "epoch": 5.062524732884843, "grad_norm": 0.0023454264737665653, "learning_rate": 3.2932383715818164e-06, "loss": 0.0003, "step": 19192 }, { "epoch": 5.063052367761509, "grad_norm": 0.06206173822283745, "learning_rate": 3.2928866613910138e-06, "loss": 0.0107, "step": 19194 }, { "epoch": 5.063580002638174, "grad_norm": 0.013161507435142994, "learning_rate": 3.292534951200211e-06, "loss": 0.0002, "step": 19196 }, { "epoch": 5.06410763751484, "grad_norm": 0.33169519901275635, "learning_rate": 3.2921832410094085e-06, "loss": 0.001, "step": 19198 }, { "epoch": 5.064635272391505, "grad_norm": 0.018646294251084328, "learning_rate": 3.2918315308186055e-06, "loss": 0.0002, "step": 19200 }, { "epoch": 5.065162907268171, "grad_norm": 0.0029145104344934225, "learning_rate": 3.291479820627803e-06, "loss": 0.0001, "step": 19202 }, { "epoch": 5.065690542144836, "grad_norm": 0.0024969419464468956, "learning_rate": 3.2911281104370002e-06, "loss": 0.0001, "step": 19204 }, { "epoch": 5.066218177021501, "grad_norm": 0.009225775487720966, "learning_rate": 3.2907764002461976e-06, "loss": 0.0037, "step": 19206 }, { "epoch": 5.0667458118981665, "grad_norm": 0.0018621949711814523, "learning_rate": 3.290424690055394e-06, "loss": 0.0001, "step": 19208 }, { "epoch": 5.0672734467748315, "grad_norm": 0.005165813956409693, "learning_rate": 3.2900729798645915e-06, "loss": 0.0002, "step": 19210 }, { "epoch": 5.067801081651497, "grad_norm": 0.06299090385437012, "learning_rate": 3.289721269673789e-06, "loss": 0.0034, "step": 19212 }, { "epoch": 5.068328716528162, "grad_norm": 0.3991602957248688, "learning_rate": 3.289369559482986e-06, "loss": 0.0009, "step": 19214 }, { "epoch": 5.068856351404828, "grad_norm": 0.005282257683575153, "learning_rate": 3.289017849292183e-06, "loss": 0.0001, "step": 19216 }, { "epoch": 5.069383986281493, "grad_norm": 0.19954416155815125, "learning_rate": 3.2886661391013806e-06, "loss": 0.0018, "step": 19218 }, { "epoch": 5.069911621158159, "grad_norm": 0.05412499979138374, "learning_rate": 3.288314428910578e-06, "loss": 0.0003, "step": 19220 }, { "epoch": 5.070439256034824, "grad_norm": 0.3063642680644989, "learning_rate": 3.287962718719775e-06, "loss": 0.0061, "step": 19222 }, { "epoch": 5.070966890911489, "grad_norm": 0.3191189765930176, "learning_rate": 3.2876110085289723e-06, "loss": 0.0007, "step": 19224 }, { "epoch": 5.071494525788155, "grad_norm": 0.044401030987501144, "learning_rate": 3.2872592983381697e-06, "loss": 0.0002, "step": 19226 }, { "epoch": 5.07202216066482, "grad_norm": 0.014963921159505844, "learning_rate": 3.286907588147367e-06, "loss": 0.0008, "step": 19228 }, { "epoch": 5.0725497955414856, "grad_norm": 0.4121313989162445, "learning_rate": 3.2865558779565636e-06, "loss": 0.0032, "step": 19230 }, { "epoch": 5.0730774304181505, "grad_norm": 0.00640044454485178, "learning_rate": 3.286204167765761e-06, "loss": 0.0018, "step": 19232 }, { "epoch": 5.073605065294816, "grad_norm": 0.0028683480340987444, "learning_rate": 3.2858524575749583e-06, "loss": 0.0001, "step": 19234 }, { "epoch": 5.074132700171481, "grad_norm": 0.015612192451953888, "learning_rate": 3.2855007473841557e-06, "loss": 0.0002, "step": 19236 }, { "epoch": 5.074660335048146, "grad_norm": 0.006975959520787001, "learning_rate": 3.2851490371933526e-06, "loss": 0.0002, "step": 19238 }, { "epoch": 5.075187969924812, "grad_norm": 0.32287824153900146, "learning_rate": 3.28479732700255e-06, "loss": 0.0029, "step": 19240 }, { "epoch": 5.075715604801477, "grad_norm": 0.0021176086738705635, "learning_rate": 3.2844456168117474e-06, "loss": 0.003, "step": 19242 }, { "epoch": 5.076243239678143, "grad_norm": 0.045691221952438354, "learning_rate": 3.284093906620944e-06, "loss": 0.0002, "step": 19244 }, { "epoch": 5.076770874554808, "grad_norm": 0.009888016618788242, "learning_rate": 3.2837421964301413e-06, "loss": 0.0023, "step": 19246 }, { "epoch": 5.077298509431474, "grad_norm": 0.005323396530002356, "learning_rate": 3.283390486239339e-06, "loss": 0.0001, "step": 19248 }, { "epoch": 5.077826144308139, "grad_norm": 0.02307051792740822, "learning_rate": 3.2830387760485365e-06, "loss": 0.0018, "step": 19250 }, { "epoch": 5.078353779184804, "grad_norm": 0.007417148910462856, "learning_rate": 3.282687065857733e-06, "loss": 0.0017, "step": 19252 }, { "epoch": 5.0788814140614695, "grad_norm": 0.02345137670636177, "learning_rate": 3.2823353556669304e-06, "loss": 0.0002, "step": 19254 }, { "epoch": 5.0794090489381345, "grad_norm": 0.14236770570278168, "learning_rate": 3.2819836454761278e-06, "loss": 0.0069, "step": 19256 }, { "epoch": 5.0799366838148, "grad_norm": 0.27851563692092896, "learning_rate": 3.281631935285325e-06, "loss": 0.0014, "step": 19258 }, { "epoch": 5.080464318691465, "grad_norm": 0.009810270741581917, "learning_rate": 3.281280225094522e-06, "loss": 0.0002, "step": 19260 }, { "epoch": 5.080991953568131, "grad_norm": 0.6765734553337097, "learning_rate": 3.2809285149037195e-06, "loss": 0.0025, "step": 19262 }, { "epoch": 5.081519588444796, "grad_norm": 0.02558416686952114, "learning_rate": 3.280576804712917e-06, "loss": 0.0003, "step": 19264 }, { "epoch": 5.082047223321462, "grad_norm": 0.02270415797829628, "learning_rate": 3.280225094522114e-06, "loss": 0.0006, "step": 19266 }, { "epoch": 5.082574858198127, "grad_norm": 0.03646071255207062, "learning_rate": 3.2798733843313107e-06, "loss": 0.0003, "step": 19268 }, { "epoch": 5.083102493074792, "grad_norm": 0.031410444527864456, "learning_rate": 3.279521674140508e-06, "loss": 0.0005, "step": 19270 }, { "epoch": 5.083630127951458, "grad_norm": 0.010994122363626957, "learning_rate": 3.2791699639497055e-06, "loss": 0.0003, "step": 19272 }, { "epoch": 5.084157762828123, "grad_norm": 0.19500970840454102, "learning_rate": 3.2788182537589024e-06, "loss": 0.0007, "step": 19274 }, { "epoch": 5.084685397704789, "grad_norm": 0.0660582035779953, "learning_rate": 3.2784665435681e-06, "loss": 0.0004, "step": 19276 }, { "epoch": 5.0852130325814535, "grad_norm": 0.004181434400379658, "learning_rate": 3.278114833377297e-06, "loss": 0.0001, "step": 19278 }, { "epoch": 5.085740667458119, "grad_norm": 0.08288087695837021, "learning_rate": 3.2777631231864946e-06, "loss": 0.0003, "step": 19280 }, { "epoch": 5.086268302334784, "grad_norm": 0.008661758154630661, "learning_rate": 3.2774114129956915e-06, "loss": 0.0002, "step": 19282 }, { "epoch": 5.086795937211449, "grad_norm": 0.28034019470214844, "learning_rate": 3.277059702804889e-06, "loss": 0.0061, "step": 19284 }, { "epoch": 5.087323572088115, "grad_norm": 0.039105307310819626, "learning_rate": 3.2767079926140863e-06, "loss": 0.0003, "step": 19286 }, { "epoch": 5.08785120696478, "grad_norm": 0.6240517497062683, "learning_rate": 3.2763562824232836e-06, "loss": 0.0007, "step": 19288 }, { "epoch": 5.088378841841446, "grad_norm": 0.014950890094041824, "learning_rate": 3.27600457223248e-06, "loss": 0.0003, "step": 19290 }, { "epoch": 5.088906476718111, "grad_norm": 0.0036245211958885193, "learning_rate": 3.2756528620416776e-06, "loss": 0.0001, "step": 19292 }, { "epoch": 5.089434111594777, "grad_norm": 0.030896548181772232, "learning_rate": 3.275301151850875e-06, "loss": 0.0002, "step": 19294 }, { "epoch": 5.089961746471442, "grad_norm": 0.0685446485877037, "learning_rate": 3.2749494416600723e-06, "loss": 0.0003, "step": 19296 }, { "epoch": 5.090489381348107, "grad_norm": 0.031884074211120605, "learning_rate": 3.2745977314692693e-06, "loss": 0.0002, "step": 19298 }, { "epoch": 5.0910170162247725, "grad_norm": 0.005287298001348972, "learning_rate": 3.2742460212784666e-06, "loss": 0.0001, "step": 19300 }, { "epoch": 5.0915446511014375, "grad_norm": 0.21013468503952026, "learning_rate": 3.273894311087664e-06, "loss": 0.0026, "step": 19302 }, { "epoch": 5.092072285978103, "grad_norm": 0.007390388287603855, "learning_rate": 3.2735426008968605e-06, "loss": 0.0002, "step": 19304 }, { "epoch": 5.092599920854768, "grad_norm": 0.006755439564585686, "learning_rate": 3.2731908907060583e-06, "loss": 0.0031, "step": 19306 }, { "epoch": 5.093127555731434, "grad_norm": 0.0039921971037983894, "learning_rate": 3.2728391805152557e-06, "loss": 0.0001, "step": 19308 }, { "epoch": 5.093655190608099, "grad_norm": 0.003997852094471455, "learning_rate": 3.272487470324453e-06, "loss": 0.0004, "step": 19310 }, { "epoch": 5.094182825484765, "grad_norm": 0.012165054678916931, "learning_rate": 3.2721357601336496e-06, "loss": 0.0002, "step": 19312 }, { "epoch": 5.09471046036143, "grad_norm": 0.019766535609960556, "learning_rate": 3.271784049942847e-06, "loss": 0.0002, "step": 19314 }, { "epoch": 5.095238095238095, "grad_norm": 0.005980262998491526, "learning_rate": 3.2714323397520444e-06, "loss": 0.0002, "step": 19316 }, { "epoch": 5.095765730114761, "grad_norm": 0.013647359795868397, "learning_rate": 3.2710806295612417e-06, "loss": 0.0033, "step": 19318 }, { "epoch": 5.096293364991426, "grad_norm": 0.0033025043085217476, "learning_rate": 3.2707289193704387e-06, "loss": 0.0002, "step": 19320 }, { "epoch": 5.096820999868092, "grad_norm": 0.176886647939682, "learning_rate": 3.270377209179636e-06, "loss": 0.0012, "step": 19322 }, { "epoch": 5.0973486347447565, "grad_norm": 0.015660349279642105, "learning_rate": 3.2700254989888334e-06, "loss": 0.0002, "step": 19324 }, { "epoch": 5.097876269621422, "grad_norm": 0.021180719137191772, "learning_rate": 3.269673788798031e-06, "loss": 0.0015, "step": 19326 }, { "epoch": 5.098403904498087, "grad_norm": 0.05649209022521973, "learning_rate": 3.2693220786072274e-06, "loss": 0.0004, "step": 19328 }, { "epoch": 5.098931539374752, "grad_norm": 0.008532802574336529, "learning_rate": 3.2689703684164247e-06, "loss": 0.0002, "step": 19330 }, { "epoch": 5.099459174251418, "grad_norm": 0.007905816659331322, "learning_rate": 3.2686186582256225e-06, "loss": 0.0002, "step": 19332 }, { "epoch": 5.099986809128083, "grad_norm": 0.1501837968826294, "learning_rate": 3.268266948034819e-06, "loss": 0.0005, "step": 19334 }, { "epoch": 5.100514444004749, "grad_norm": 0.056788697838783264, "learning_rate": 3.2679152378440164e-06, "loss": 0.0055, "step": 19336 }, { "epoch": 5.101042078881414, "grad_norm": 0.0026667683850973845, "learning_rate": 3.267563527653214e-06, "loss": 0.0002, "step": 19338 }, { "epoch": 5.10156971375808, "grad_norm": 0.0054477122612297535, "learning_rate": 3.267211817462411e-06, "loss": 0.0002, "step": 19340 }, { "epoch": 5.102097348634745, "grad_norm": 0.38973546028137207, "learning_rate": 3.266860107271608e-06, "loss": 0.0022, "step": 19342 }, { "epoch": 5.10262498351141, "grad_norm": 0.00892886333167553, "learning_rate": 3.2665083970808055e-06, "loss": 0.0002, "step": 19344 }, { "epoch": 5.1031526183880755, "grad_norm": 0.010112590156495571, "learning_rate": 3.266156686890003e-06, "loss": 0.0002, "step": 19346 }, { "epoch": 5.1036802532647405, "grad_norm": 0.9686932563781738, "learning_rate": 3.2658049766992003e-06, "loss": 0.0016, "step": 19348 }, { "epoch": 5.104207888141406, "grad_norm": 0.03401508554816246, "learning_rate": 3.265453266508397e-06, "loss": 0.0004, "step": 19350 }, { "epoch": 5.104735523018071, "grad_norm": 0.014589368365705013, "learning_rate": 3.265101556317594e-06, "loss": 0.0002, "step": 19352 }, { "epoch": 5.105263157894737, "grad_norm": 0.030926231294870377, "learning_rate": 3.2647498461267915e-06, "loss": 0.0009, "step": 19354 }, { "epoch": 5.105790792771402, "grad_norm": 0.02358243241906166, "learning_rate": 3.264398135935989e-06, "loss": 0.0003, "step": 19356 }, { "epoch": 5.106318427648068, "grad_norm": 0.019833747297525406, "learning_rate": 3.264046425745186e-06, "loss": 0.0002, "step": 19358 }, { "epoch": 5.106846062524733, "grad_norm": 0.004149723798036575, "learning_rate": 3.2636947155543833e-06, "loss": 0.0009, "step": 19360 }, { "epoch": 5.107373697401398, "grad_norm": 0.17675729095935822, "learning_rate": 3.2633430053635806e-06, "loss": 0.0007, "step": 19362 }, { "epoch": 5.107901332278064, "grad_norm": 0.28803032636642456, "learning_rate": 3.2629912951727776e-06, "loss": 0.0014, "step": 19364 }, { "epoch": 5.108428967154729, "grad_norm": 0.05205063149333, "learning_rate": 3.262639584981975e-06, "loss": 0.0002, "step": 19366 }, { "epoch": 5.108956602031395, "grad_norm": 0.03603363409638405, "learning_rate": 3.2622878747911723e-06, "loss": 0.0011, "step": 19368 }, { "epoch": 5.1094842369080595, "grad_norm": 0.028521498665213585, "learning_rate": 3.2619361646003697e-06, "loss": 0.0003, "step": 19370 }, { "epoch": 5.110011871784725, "grad_norm": 0.489080011844635, "learning_rate": 3.2615844544095662e-06, "loss": 0.0062, "step": 19372 }, { "epoch": 5.11053950666139, "grad_norm": 0.0034507554955780506, "learning_rate": 3.2612327442187636e-06, "loss": 0.0001, "step": 19374 }, { "epoch": 5.111067141538055, "grad_norm": 0.012777306139469147, "learning_rate": 3.260881034027961e-06, "loss": 0.0008, "step": 19376 }, { "epoch": 5.111594776414721, "grad_norm": 0.002847437048330903, "learning_rate": 3.2605293238371584e-06, "loss": 0.0001, "step": 19378 }, { "epoch": 5.112122411291386, "grad_norm": 0.044145677238702774, "learning_rate": 3.2601776136463553e-06, "loss": 0.0003, "step": 19380 }, { "epoch": 5.112650046168052, "grad_norm": 0.008601346053183079, "learning_rate": 3.2598259034555527e-06, "loss": 0.0006, "step": 19382 }, { "epoch": 5.113177681044717, "grad_norm": 0.003898241091519594, "learning_rate": 3.25947419326475e-06, "loss": 0.0001, "step": 19384 }, { "epoch": 5.113705315921383, "grad_norm": 0.3362778127193451, "learning_rate": 3.2591224830739474e-06, "loss": 0.0006, "step": 19386 }, { "epoch": 5.114232950798048, "grad_norm": 0.0050134435296058655, "learning_rate": 3.258770772883144e-06, "loss": 0.0006, "step": 19388 }, { "epoch": 5.114760585674713, "grad_norm": 0.10834688693284988, "learning_rate": 3.2584190626923418e-06, "loss": 0.0044, "step": 19390 }, { "epoch": 5.1152882205513786, "grad_norm": 0.09456238150596619, "learning_rate": 3.258067352501539e-06, "loss": 0.0003, "step": 19392 }, { "epoch": 5.1158158554280435, "grad_norm": 0.008268134668469429, "learning_rate": 3.2577156423107357e-06, "loss": 0.0001, "step": 19394 }, { "epoch": 5.116343490304709, "grad_norm": 0.0029019054491072893, "learning_rate": 3.257363932119933e-06, "loss": 0.0001, "step": 19396 }, { "epoch": 5.116871125181374, "grad_norm": 0.038795214146375656, "learning_rate": 3.2570122219291304e-06, "loss": 0.0003, "step": 19398 }, { "epoch": 5.11739876005804, "grad_norm": 0.00361527968198061, "learning_rate": 3.256660511738328e-06, "loss": 0.0021, "step": 19400 }, { "epoch": 5.117926394934705, "grad_norm": 0.01297915168106556, "learning_rate": 3.2563088015475248e-06, "loss": 0.0002, "step": 19402 }, { "epoch": 5.11845402981137, "grad_norm": 0.14137119054794312, "learning_rate": 3.255957091356722e-06, "loss": 0.0004, "step": 19404 }, { "epoch": 5.118981664688036, "grad_norm": 0.33403918147087097, "learning_rate": 3.2556053811659195e-06, "loss": 0.0009, "step": 19406 }, { "epoch": 5.119509299564701, "grad_norm": 0.0030581997707486153, "learning_rate": 3.255253670975117e-06, "loss": 0.0002, "step": 19408 }, { "epoch": 5.120036934441367, "grad_norm": 0.0031932920683175325, "learning_rate": 3.2549019607843134e-06, "loss": 0.0001, "step": 19410 }, { "epoch": 5.120564569318032, "grad_norm": 0.15484727919101715, "learning_rate": 3.2545502505935108e-06, "loss": 0.0004, "step": 19412 }, { "epoch": 5.121092204194698, "grad_norm": 0.0017033240292221308, "learning_rate": 3.254198540402708e-06, "loss": 0.0008, "step": 19414 }, { "epoch": 5.1216198390713625, "grad_norm": 0.0032199097331613302, "learning_rate": 3.253846830211906e-06, "loss": 0.0001, "step": 19416 }, { "epoch": 5.122147473948028, "grad_norm": 0.18573689460754395, "learning_rate": 3.2534951200211025e-06, "loss": 0.0007, "step": 19418 }, { "epoch": 5.122675108824693, "grad_norm": 0.004033010918647051, "learning_rate": 3.2531434098303e-06, "loss": 0.0002, "step": 19420 }, { "epoch": 5.123202743701358, "grad_norm": 0.004337730817496777, "learning_rate": 3.2527916996394972e-06, "loss": 0.0002, "step": 19422 }, { "epoch": 5.123730378578024, "grad_norm": 0.019769057631492615, "learning_rate": 3.252439989448694e-06, "loss": 0.0001, "step": 19424 }, { "epoch": 5.124258013454689, "grad_norm": 0.016932209953665733, "learning_rate": 3.2520882792578916e-06, "loss": 0.0002, "step": 19426 }, { "epoch": 5.124785648331355, "grad_norm": 0.024197274819016457, "learning_rate": 3.251736569067089e-06, "loss": 0.0002, "step": 19428 }, { "epoch": 5.12531328320802, "grad_norm": 0.003057809080928564, "learning_rate": 3.2513848588762863e-06, "loss": 0.0005, "step": 19430 }, { "epoch": 5.125840918084686, "grad_norm": 0.01101876050233841, "learning_rate": 3.251033148685483e-06, "loss": 0.0001, "step": 19432 }, { "epoch": 5.126368552961351, "grad_norm": 0.28309205174446106, "learning_rate": 3.2506814384946802e-06, "loss": 0.0005, "step": 19434 }, { "epoch": 5.126896187838016, "grad_norm": 0.0025627969298511744, "learning_rate": 3.2503297283038776e-06, "loss": 0.0001, "step": 19436 }, { "epoch": 5.127423822714682, "grad_norm": 0.006702285259962082, "learning_rate": 3.249978018113075e-06, "loss": 0.0002, "step": 19438 }, { "epoch": 5.1279514575913465, "grad_norm": 0.001735421479679644, "learning_rate": 3.249626307922272e-06, "loss": 0.0001, "step": 19440 }, { "epoch": 5.128479092468012, "grad_norm": 0.003418811596930027, "learning_rate": 3.2492745977314693e-06, "loss": 0.0001, "step": 19442 }, { "epoch": 5.129006727344677, "grad_norm": 0.013702359050512314, "learning_rate": 3.2489228875406667e-06, "loss": 0.0001, "step": 19444 }, { "epoch": 5.129534362221343, "grad_norm": 0.00573482783511281, "learning_rate": 3.248571177349864e-06, "loss": 0.0002, "step": 19446 }, { "epoch": 5.130061997098008, "grad_norm": 0.002173643559217453, "learning_rate": 3.248219467159061e-06, "loss": 0.0006, "step": 19448 }, { "epoch": 5.130589631974673, "grad_norm": 0.002980646677315235, "learning_rate": 3.2478677569682584e-06, "loss": 0.0004, "step": 19450 }, { "epoch": 5.131117266851339, "grad_norm": 0.4927031993865967, "learning_rate": 3.2475160467774558e-06, "loss": 0.0008, "step": 19452 }, { "epoch": 5.131644901728004, "grad_norm": 0.0010893166763707995, "learning_rate": 3.2471643365866523e-06, "loss": 0.0001, "step": 19454 }, { "epoch": 5.13217253660467, "grad_norm": 0.0033990826923400164, "learning_rate": 3.2468126263958497e-06, "loss": 0.0001, "step": 19456 }, { "epoch": 5.132700171481335, "grad_norm": 0.003959095571190119, "learning_rate": 3.246460916205047e-06, "loss": 0.0001, "step": 19458 }, { "epoch": 5.133227806358001, "grad_norm": 0.3172599971294403, "learning_rate": 3.2461092060142444e-06, "loss": 0.0002, "step": 19460 }, { "epoch": 5.1337554412346655, "grad_norm": 0.001728441333398223, "learning_rate": 3.2457574958234414e-06, "loss": 0.0001, "step": 19462 }, { "epoch": 5.1342830761113305, "grad_norm": 0.007723859511315823, "learning_rate": 3.2454057856326387e-06, "loss": 0.0001, "step": 19464 }, { "epoch": 5.134810710987996, "grad_norm": 0.12151776999235153, "learning_rate": 3.245054075441836e-06, "loss": 0.0004, "step": 19466 }, { "epoch": 5.135338345864661, "grad_norm": 0.008234493434429169, "learning_rate": 3.2447023652510335e-06, "loss": 0.0001, "step": 19468 }, { "epoch": 5.135865980741327, "grad_norm": 0.06630922853946686, "learning_rate": 3.24435065506023e-06, "loss": 0.0002, "step": 19470 }, { "epoch": 5.136393615617992, "grad_norm": 0.02018694579601288, "learning_rate": 3.2439989448694274e-06, "loss": 0.0001, "step": 19472 }, { "epoch": 5.136921250494658, "grad_norm": 0.00500619737431407, "learning_rate": 3.243647234678625e-06, "loss": 0.0002, "step": 19474 }, { "epoch": 5.137448885371323, "grad_norm": 0.3379937708377838, "learning_rate": 3.2432955244878226e-06, "loss": 0.0048, "step": 19476 }, { "epoch": 5.137976520247989, "grad_norm": 0.0015865961322560906, "learning_rate": 3.242943814297019e-06, "loss": 0.0001, "step": 19478 }, { "epoch": 5.138504155124654, "grad_norm": 0.016766129061579704, "learning_rate": 3.2425921041062165e-06, "loss": 0.0038, "step": 19480 }, { "epoch": 5.139031790001319, "grad_norm": 0.0028402709867805243, "learning_rate": 3.242240393915414e-06, "loss": 0.0001, "step": 19482 }, { "epoch": 5.139559424877985, "grad_norm": 0.006734485272318125, "learning_rate": 3.2418886837246112e-06, "loss": 0.0004, "step": 19484 }, { "epoch": 5.1400870597546495, "grad_norm": 0.017462315037846565, "learning_rate": 3.241536973533808e-06, "loss": 0.0001, "step": 19486 }, { "epoch": 5.140614694631315, "grad_norm": 0.006429873872548342, "learning_rate": 3.2411852633430056e-06, "loss": 0.0109, "step": 19488 }, { "epoch": 5.14114232950798, "grad_norm": 0.12615562975406647, "learning_rate": 3.240833553152203e-06, "loss": 0.0041, "step": 19490 }, { "epoch": 5.141669964384646, "grad_norm": 0.0024819402024149895, "learning_rate": 3.2404818429613995e-06, "loss": 0.0047, "step": 19492 }, { "epoch": 5.142197599261311, "grad_norm": 0.1467159539461136, "learning_rate": 3.240130132770597e-06, "loss": 0.0005, "step": 19494 }, { "epoch": 5.142725234137976, "grad_norm": 0.007536160293966532, "learning_rate": 3.2397784225797942e-06, "loss": 0.0003, "step": 19496 }, { "epoch": 5.143252869014642, "grad_norm": 0.00621713837608695, "learning_rate": 3.2394267123889916e-06, "loss": 0.0004, "step": 19498 }, { "epoch": 5.143780503891307, "grad_norm": 0.03870362043380737, "learning_rate": 3.2390750021981885e-06, "loss": 0.0002, "step": 19500 }, { "epoch": 5.144308138767973, "grad_norm": 0.005792456213384867, "learning_rate": 3.238723292007386e-06, "loss": 0.0001, "step": 19502 }, { "epoch": 5.144835773644638, "grad_norm": 0.05432724952697754, "learning_rate": 3.2383715818165833e-06, "loss": 0.0026, "step": 19504 }, { "epoch": 5.145363408521304, "grad_norm": 0.015187445096671581, "learning_rate": 3.2380198716257807e-06, "loss": 0.0032, "step": 19506 }, { "epoch": 5.1458910433979685, "grad_norm": 0.04589689522981644, "learning_rate": 3.2376681614349776e-06, "loss": 0.0002, "step": 19508 }, { "epoch": 5.1464186782746335, "grad_norm": 0.13934513926506042, "learning_rate": 3.237316451244175e-06, "loss": 0.0003, "step": 19510 }, { "epoch": 5.146946313151299, "grad_norm": 0.14068284630775452, "learning_rate": 3.2369647410533724e-06, "loss": 0.0027, "step": 19512 }, { "epoch": 5.147473948027964, "grad_norm": 0.0582677386701107, "learning_rate": 3.2366130308625698e-06, "loss": 0.0003, "step": 19514 }, { "epoch": 5.14800158290463, "grad_norm": 0.013487536460161209, "learning_rate": 3.2362613206717663e-06, "loss": 0.0002, "step": 19516 }, { "epoch": 5.148529217781295, "grad_norm": 0.6557664275169373, "learning_rate": 3.2359096104809637e-06, "loss": 0.0003, "step": 19518 }, { "epoch": 5.149056852657961, "grad_norm": 0.8053545355796814, "learning_rate": 3.235557900290161e-06, "loss": 0.0055, "step": 19520 }, { "epoch": 5.149584487534626, "grad_norm": 0.02180531993508339, "learning_rate": 3.235206190099358e-06, "loss": 0.0004, "step": 19522 }, { "epoch": 5.150112122411292, "grad_norm": 0.005132298916578293, "learning_rate": 3.2348544799085554e-06, "loss": 0.0047, "step": 19524 }, { "epoch": 5.150639757287957, "grad_norm": 0.1642535775899887, "learning_rate": 3.2345027697177527e-06, "loss": 0.0006, "step": 19526 }, { "epoch": 5.151167392164622, "grad_norm": 0.006201997399330139, "learning_rate": 3.23415105952695e-06, "loss": 0.0001, "step": 19528 }, { "epoch": 5.151695027041288, "grad_norm": 0.0054079326801002026, "learning_rate": 3.2337993493361466e-06, "loss": 0.0004, "step": 19530 }, { "epoch": 5.1522226619179525, "grad_norm": 0.017124133184552193, "learning_rate": 3.2334476391453444e-06, "loss": 0.0002, "step": 19532 }, { "epoch": 5.152750296794618, "grad_norm": 0.03334104269742966, "learning_rate": 3.233095928954542e-06, "loss": 0.0004, "step": 19534 }, { "epoch": 5.153277931671283, "grad_norm": 0.16549451649188995, "learning_rate": 3.232744218763739e-06, "loss": 0.0021, "step": 19536 }, { "epoch": 5.153805566547949, "grad_norm": 0.02566898614168167, "learning_rate": 3.2323925085729357e-06, "loss": 0.0004, "step": 19538 }, { "epoch": 5.154333201424614, "grad_norm": 0.27700984477996826, "learning_rate": 3.232040798382133e-06, "loss": 0.0014, "step": 19540 }, { "epoch": 5.154860836301279, "grad_norm": 0.07767418771982193, "learning_rate": 3.2316890881913305e-06, "loss": 0.0004, "step": 19542 }, { "epoch": 5.155388471177945, "grad_norm": 0.018205732107162476, "learning_rate": 3.231337378000528e-06, "loss": 0.0002, "step": 19544 }, { "epoch": 5.15591610605461, "grad_norm": 0.008268923498690128, "learning_rate": 3.230985667809725e-06, "loss": 0.0025, "step": 19546 }, { "epoch": 5.156443740931276, "grad_norm": 0.00526302307844162, "learning_rate": 3.230633957618922e-06, "loss": 0.0003, "step": 19548 }, { "epoch": 5.156971375807941, "grad_norm": 0.054568421095609665, "learning_rate": 3.2302822474281196e-06, "loss": 0.0003, "step": 19550 }, { "epoch": 5.157499010684607, "grad_norm": 0.09517184644937515, "learning_rate": 3.229930537237316e-06, "loss": 0.0038, "step": 19552 }, { "epoch": 5.1580266455612716, "grad_norm": 0.005214410368353128, "learning_rate": 3.2295788270465135e-06, "loss": 0.0001, "step": 19554 }, { "epoch": 5.1585542804379365, "grad_norm": 0.004004824440926313, "learning_rate": 3.229227116855711e-06, "loss": 0.0001, "step": 19556 }, { "epoch": 5.159081915314602, "grad_norm": 0.062211208045482635, "learning_rate": 3.2288754066649086e-06, "loss": 0.0002, "step": 19558 }, { "epoch": 5.159609550191267, "grad_norm": 0.04174605384469032, "learning_rate": 3.228523696474105e-06, "loss": 0.0002, "step": 19560 }, { "epoch": 5.160137185067933, "grad_norm": 0.10662841796875, "learning_rate": 3.2281719862833025e-06, "loss": 0.0003, "step": 19562 }, { "epoch": 5.160664819944598, "grad_norm": 0.04880639538168907, "learning_rate": 3.2278202760925e-06, "loss": 0.0004, "step": 19564 }, { "epoch": 5.161192454821264, "grad_norm": 0.019913187250494957, "learning_rate": 3.2274685659016973e-06, "loss": 0.0002, "step": 19566 }, { "epoch": 5.161720089697929, "grad_norm": 0.00850941613316536, "learning_rate": 3.2271168557108942e-06, "loss": 0.0002, "step": 19568 }, { "epoch": 5.162247724574595, "grad_norm": 0.24258878827095032, "learning_rate": 3.2267651455200916e-06, "loss": 0.0004, "step": 19570 }, { "epoch": 5.16277535945126, "grad_norm": 0.004827488679438829, "learning_rate": 3.226413435329289e-06, "loss": 0.0001, "step": 19572 }, { "epoch": 5.163302994327925, "grad_norm": 0.01047088485211134, "learning_rate": 3.2260617251384864e-06, "loss": 0.0003, "step": 19574 }, { "epoch": 5.163830629204591, "grad_norm": 0.010710399597883224, "learning_rate": 3.225710014947683e-06, "loss": 0.0001, "step": 19576 }, { "epoch": 5.1643582640812555, "grad_norm": 0.019693417474627495, "learning_rate": 3.2253583047568803e-06, "loss": 0.0002, "step": 19578 }, { "epoch": 5.164885898957921, "grad_norm": 0.034185219556093216, "learning_rate": 3.2250065945660777e-06, "loss": 0.0021, "step": 19580 }, { "epoch": 5.165413533834586, "grad_norm": 0.01663302816450596, "learning_rate": 3.2246548843752746e-06, "loss": 0.0001, "step": 19582 }, { "epoch": 5.165941168711252, "grad_norm": 0.001906386110931635, "learning_rate": 3.224303174184472e-06, "loss": 0.0002, "step": 19584 }, { "epoch": 5.166468803587917, "grad_norm": 0.6490888595581055, "learning_rate": 3.2239514639936694e-06, "loss": 0.0018, "step": 19586 }, { "epoch": 5.166996438464582, "grad_norm": 0.004835737403482199, "learning_rate": 3.2235997538028667e-06, "loss": 0.0001, "step": 19588 }, { "epoch": 5.167524073341248, "grad_norm": 0.014635729603469372, "learning_rate": 3.2232480436120637e-06, "loss": 0.0002, "step": 19590 }, { "epoch": 5.168051708217913, "grad_norm": 0.08894426375627518, "learning_rate": 3.222896333421261e-06, "loss": 0.0002, "step": 19592 }, { "epoch": 5.168579343094579, "grad_norm": 0.009794521145522594, "learning_rate": 3.2225446232304584e-06, "loss": 0.0001, "step": 19594 }, { "epoch": 5.169106977971244, "grad_norm": 0.4794710576534271, "learning_rate": 3.222192913039656e-06, "loss": 0.0028, "step": 19596 }, { "epoch": 5.16963461284791, "grad_norm": 0.014620802365243435, "learning_rate": 3.2218412028488523e-06, "loss": 0.0001, "step": 19598 }, { "epoch": 5.1701622477245746, "grad_norm": 0.0016434931894764304, "learning_rate": 3.2214894926580497e-06, "loss": 0.0012, "step": 19600 }, { "epoch": 5.1706898826012395, "grad_norm": 0.0015250685391947627, "learning_rate": 3.221137782467247e-06, "loss": 0.0001, "step": 19602 }, { "epoch": 5.171217517477905, "grad_norm": 0.34968623518943787, "learning_rate": 3.2207860722764445e-06, "loss": 0.001, "step": 19604 }, { "epoch": 5.17174515235457, "grad_norm": 0.06039310619235039, "learning_rate": 3.2204343620856414e-06, "loss": 0.0002, "step": 19606 }, { "epoch": 5.172272787231236, "grad_norm": 0.013459050096571445, "learning_rate": 3.220082651894839e-06, "loss": 0.0001, "step": 19608 }, { "epoch": 5.172800422107901, "grad_norm": 0.56996089220047, "learning_rate": 3.219730941704036e-06, "loss": 0.0028, "step": 19610 }, { "epoch": 5.173328056984567, "grad_norm": 0.0031150116119533777, "learning_rate": 3.2193792315132327e-06, "loss": 0.0001, "step": 19612 }, { "epoch": 5.173855691861232, "grad_norm": 0.016379477456212044, "learning_rate": 3.21902752132243e-06, "loss": 0.0002, "step": 19614 }, { "epoch": 5.174383326737898, "grad_norm": 0.008900594897568226, "learning_rate": 3.218675811131628e-06, "loss": 0.0002, "step": 19616 }, { "epoch": 5.174910961614563, "grad_norm": 0.4231770932674408, "learning_rate": 3.2183241009408253e-06, "loss": 0.0033, "step": 19618 }, { "epoch": 5.175438596491228, "grad_norm": 0.33790603280067444, "learning_rate": 3.2179723907500218e-06, "loss": 0.0005, "step": 19620 }, { "epoch": 5.175966231367894, "grad_norm": 0.007024634163826704, "learning_rate": 3.217620680559219e-06, "loss": 0.0001, "step": 19622 }, { "epoch": 5.1764938662445585, "grad_norm": 0.08388370275497437, "learning_rate": 3.2172689703684165e-06, "loss": 0.0002, "step": 19624 }, { "epoch": 5.177021501121224, "grad_norm": 0.0036805206909775734, "learning_rate": 3.216917260177614e-06, "loss": 0.0002, "step": 19626 }, { "epoch": 5.177549135997889, "grad_norm": 0.0012690168805420399, "learning_rate": 3.216565549986811e-06, "loss": 0.0018, "step": 19628 }, { "epoch": 5.178076770874555, "grad_norm": 0.0015423253644257784, "learning_rate": 3.2162138397960082e-06, "loss": 0.0008, "step": 19630 }, { "epoch": 5.17860440575122, "grad_norm": 0.013925883919000626, "learning_rate": 3.2158621296052056e-06, "loss": 0.0002, "step": 19632 }, { "epoch": 5.179132040627885, "grad_norm": 0.00924247968941927, "learning_rate": 3.215510419414403e-06, "loss": 0.0001, "step": 19634 }, { "epoch": 5.179659675504551, "grad_norm": 0.0008713502902537584, "learning_rate": 3.2151587092235995e-06, "loss": 0.0001, "step": 19636 }, { "epoch": 5.180187310381216, "grad_norm": 0.0018760678358376026, "learning_rate": 3.214806999032797e-06, "loss": 0.0001, "step": 19638 }, { "epoch": 5.180714945257882, "grad_norm": 0.3553193509578705, "learning_rate": 3.2144552888419943e-06, "loss": 0.0006, "step": 19640 }, { "epoch": 5.181242580134547, "grad_norm": 0.32720935344696045, "learning_rate": 3.2141035786511912e-06, "loss": 0.001, "step": 19642 }, { "epoch": 5.181770215011213, "grad_norm": 0.03239378705620766, "learning_rate": 3.2137518684603886e-06, "loss": 0.0002, "step": 19644 }, { "epoch": 5.182297849887878, "grad_norm": 0.004994670394808054, "learning_rate": 3.213400158269586e-06, "loss": 0.0001, "step": 19646 }, { "epoch": 5.1828254847645425, "grad_norm": 0.16829130053520203, "learning_rate": 3.2130484480787834e-06, "loss": 0.0002, "step": 19648 }, { "epoch": 5.183353119641208, "grad_norm": 0.0985187441110611, "learning_rate": 3.2126967378879803e-06, "loss": 0.0016, "step": 19650 }, { "epoch": 5.183880754517873, "grad_norm": 0.005203739739954472, "learning_rate": 3.2123450276971777e-06, "loss": 0.0001, "step": 19652 }, { "epoch": 5.184408389394539, "grad_norm": 0.0015944434562698007, "learning_rate": 3.211993317506375e-06, "loss": 0.0001, "step": 19654 }, { "epoch": 5.184936024271204, "grad_norm": 0.007517294492572546, "learning_rate": 3.2116416073155724e-06, "loss": 0.0001, "step": 19656 }, { "epoch": 5.18546365914787, "grad_norm": 0.18467941880226135, "learning_rate": 3.211289897124769e-06, "loss": 0.0003, "step": 19658 }, { "epoch": 5.185991294024535, "grad_norm": 0.008099398575723171, "learning_rate": 3.2109381869339663e-06, "loss": 0.0001, "step": 19660 }, { "epoch": 5.186518928901201, "grad_norm": 0.0039904760196805, "learning_rate": 3.2105864767431637e-06, "loss": 0.0001, "step": 19662 }, { "epoch": 5.187046563777866, "grad_norm": 0.016499049961566925, "learning_rate": 3.210234766552361e-06, "loss": 0.0021, "step": 19664 }, { "epoch": 5.187574198654531, "grad_norm": 0.002085304819047451, "learning_rate": 3.209883056361558e-06, "loss": 0.0001, "step": 19666 }, { "epoch": 5.188101833531197, "grad_norm": 0.17180317640304565, "learning_rate": 3.2095313461707554e-06, "loss": 0.0083, "step": 19668 }, { "epoch": 5.1886294684078615, "grad_norm": 0.09291784465312958, "learning_rate": 3.209179635979953e-06, "loss": 0.0002, "step": 19670 }, { "epoch": 5.189157103284527, "grad_norm": 0.006164781749248505, "learning_rate": 3.2088279257891493e-06, "loss": 0.0002, "step": 19672 }, { "epoch": 5.189684738161192, "grad_norm": 0.7993720769882202, "learning_rate": 3.208476215598347e-06, "loss": 0.0016, "step": 19674 }, { "epoch": 5.190212373037858, "grad_norm": 0.007235061377286911, "learning_rate": 3.2081245054075445e-06, "loss": 0.0001, "step": 19676 }, { "epoch": 5.190740007914523, "grad_norm": 0.0015465790638700128, "learning_rate": 3.207772795216742e-06, "loss": 0.0001, "step": 19678 }, { "epoch": 5.191267642791188, "grad_norm": 0.024196693673729897, "learning_rate": 3.2074210850259384e-06, "loss": 0.0004, "step": 19680 }, { "epoch": 5.191795277667854, "grad_norm": 0.0020083060953766108, "learning_rate": 3.2070693748351358e-06, "loss": 0.0001, "step": 19682 }, { "epoch": 5.192322912544519, "grad_norm": 0.5488449931144714, "learning_rate": 3.206717664644333e-06, "loss": 0.0009, "step": 19684 }, { "epoch": 5.192850547421185, "grad_norm": 0.5960031151771545, "learning_rate": 3.2063659544535305e-06, "loss": 0.0011, "step": 19686 }, { "epoch": 5.19337818229785, "grad_norm": 0.010559934191405773, "learning_rate": 3.2060142442627275e-06, "loss": 0.0001, "step": 19688 }, { "epoch": 5.193905817174516, "grad_norm": 0.0015810636105015874, "learning_rate": 3.205662534071925e-06, "loss": 0.0017, "step": 19690 }, { "epoch": 5.194433452051181, "grad_norm": 0.7515488862991333, "learning_rate": 3.2053108238811222e-06, "loss": 0.0018, "step": 19692 }, { "epoch": 5.1949610869278455, "grad_norm": 0.7119676470756531, "learning_rate": 3.2049591136903196e-06, "loss": 0.0033, "step": 19694 }, { "epoch": 5.195488721804511, "grad_norm": 0.004953525494784117, "learning_rate": 3.204607403499516e-06, "loss": 0.0002, "step": 19696 }, { "epoch": 5.196016356681176, "grad_norm": 0.377812922000885, "learning_rate": 3.2042556933087135e-06, "loss": 0.0009, "step": 19698 }, { "epoch": 5.196543991557842, "grad_norm": 0.06439629942178726, "learning_rate": 3.2039039831179113e-06, "loss": 0.0002, "step": 19700 }, { "epoch": 5.197071626434507, "grad_norm": 0.0023627611808478832, "learning_rate": 3.203552272927108e-06, "loss": 0.0001, "step": 19702 }, { "epoch": 5.197599261311173, "grad_norm": 0.09529411792755127, "learning_rate": 3.2032005627363052e-06, "loss": 0.0004, "step": 19704 }, { "epoch": 5.198126896187838, "grad_norm": 0.017537161707878113, "learning_rate": 3.2028488525455026e-06, "loss": 0.0007, "step": 19706 }, { "epoch": 5.198654531064504, "grad_norm": 0.008965765126049519, "learning_rate": 3.2024971423547e-06, "loss": 0.0003, "step": 19708 }, { "epoch": 5.199182165941169, "grad_norm": 0.004249454941600561, "learning_rate": 3.202145432163897e-06, "loss": 0.0006, "step": 19710 }, { "epoch": 5.199709800817834, "grad_norm": 0.0046729459427297115, "learning_rate": 3.2017937219730943e-06, "loss": 0.0001, "step": 19712 }, { "epoch": 5.2002374356945, "grad_norm": 0.03368491306900978, "learning_rate": 3.2014420117822917e-06, "loss": 0.0002, "step": 19714 }, { "epoch": 5.2007650705711646, "grad_norm": 0.009233631193637848, "learning_rate": 3.201090301591489e-06, "loss": 0.0001, "step": 19716 }, { "epoch": 5.20129270544783, "grad_norm": 0.0028568485286086798, "learning_rate": 3.2007385914006856e-06, "loss": 0.0086, "step": 19718 }, { "epoch": 5.201820340324495, "grad_norm": 0.01534193567931652, "learning_rate": 3.200386881209883e-06, "loss": 0.0002, "step": 19720 }, { "epoch": 5.202347975201161, "grad_norm": 0.5309775471687317, "learning_rate": 3.2000351710190803e-06, "loss": 0.0038, "step": 19722 }, { "epoch": 5.202875610077826, "grad_norm": 0.004667941480875015, "learning_rate": 3.1996834608282777e-06, "loss": 0.0002, "step": 19724 }, { "epoch": 5.203403244954491, "grad_norm": 0.008212571032345295, "learning_rate": 3.1993317506374747e-06, "loss": 0.0001, "step": 19726 }, { "epoch": 5.203930879831157, "grad_norm": 0.46860387921333313, "learning_rate": 3.198980040446672e-06, "loss": 0.001, "step": 19728 }, { "epoch": 5.204458514707822, "grad_norm": 0.1934634894132614, "learning_rate": 3.1986283302558694e-06, "loss": 0.0069, "step": 19730 }, { "epoch": 5.204986149584488, "grad_norm": 0.011708544567227364, "learning_rate": 3.1982766200650664e-06, "loss": 0.0004, "step": 19732 }, { "epoch": 5.205513784461153, "grad_norm": 0.024100815877318382, "learning_rate": 3.1979249098742637e-06, "loss": 0.0042, "step": 19734 }, { "epoch": 5.206041419337819, "grad_norm": 0.014525776728987694, "learning_rate": 3.197573199683461e-06, "loss": 0.0128, "step": 19736 }, { "epoch": 5.206569054214484, "grad_norm": 0.04412967711687088, "learning_rate": 3.1972214894926585e-06, "loss": 0.0005, "step": 19738 }, { "epoch": 5.2070966890911485, "grad_norm": 0.07610934972763062, "learning_rate": 3.196869779301855e-06, "loss": 0.001, "step": 19740 }, { "epoch": 5.207624323967814, "grad_norm": 0.03808318451046944, "learning_rate": 3.1965180691110524e-06, "loss": 0.0004, "step": 19742 }, { "epoch": 5.208151958844479, "grad_norm": 0.019538937136530876, "learning_rate": 3.1961663589202498e-06, "loss": 0.0003, "step": 19744 }, { "epoch": 5.208679593721145, "grad_norm": 0.015252087265253067, "learning_rate": 3.195814648729447e-06, "loss": 0.0002, "step": 19746 }, { "epoch": 5.20920722859781, "grad_norm": 0.030663851648569107, "learning_rate": 3.195462938538644e-06, "loss": 0.0003, "step": 19748 }, { "epoch": 5.209734863474476, "grad_norm": 0.16614574193954468, "learning_rate": 3.1951112283478415e-06, "loss": 0.0014, "step": 19750 }, { "epoch": 5.210262498351141, "grad_norm": 0.011106369085609913, "learning_rate": 3.194759518157039e-06, "loss": 0.0002, "step": 19752 }, { "epoch": 5.210790133227806, "grad_norm": 0.013302470557391644, "learning_rate": 3.1944078079662362e-06, "loss": 0.0006, "step": 19754 }, { "epoch": 5.211317768104472, "grad_norm": 0.041710738092660904, "learning_rate": 3.1940560977754328e-06, "loss": 0.0003, "step": 19756 }, { "epoch": 5.211845402981137, "grad_norm": 0.07908377796411514, "learning_rate": 3.1937043875846306e-06, "loss": 0.0018, "step": 19758 }, { "epoch": 5.212373037857803, "grad_norm": 0.03901590034365654, "learning_rate": 3.193352677393828e-06, "loss": 0.0002, "step": 19760 }, { "epoch": 5.2129006727344676, "grad_norm": 0.0023114315699785948, "learning_rate": 3.1930009672030245e-06, "loss": 0.0002, "step": 19762 }, { "epoch": 5.213428307611133, "grad_norm": 0.009868709370493889, "learning_rate": 3.192649257012222e-06, "loss": 0.0001, "step": 19764 }, { "epoch": 5.213955942487798, "grad_norm": 0.0017561696004122496, "learning_rate": 3.192297546821419e-06, "loss": 0.0001, "step": 19766 }, { "epoch": 5.214483577364464, "grad_norm": 0.0036594695411622524, "learning_rate": 3.1919458366306166e-06, "loss": 0.0003, "step": 19768 }, { "epoch": 5.215011212241129, "grad_norm": 1.4497108459472656, "learning_rate": 3.1915941264398135e-06, "loss": 0.0007, "step": 19770 }, { "epoch": 5.215538847117794, "grad_norm": 0.0024417077656835318, "learning_rate": 3.191242416249011e-06, "loss": 0.0001, "step": 19772 }, { "epoch": 5.21606648199446, "grad_norm": 0.009085634723305702, "learning_rate": 3.1908907060582083e-06, "loss": 0.0002, "step": 19774 }, { "epoch": 5.216594116871125, "grad_norm": 0.015052353963255882, "learning_rate": 3.1905389958674057e-06, "loss": 0.0002, "step": 19776 }, { "epoch": 5.217121751747791, "grad_norm": 0.005090833175927401, "learning_rate": 3.190187285676602e-06, "loss": 0.0001, "step": 19778 }, { "epoch": 5.217649386624456, "grad_norm": 0.37802737951278687, "learning_rate": 3.1898355754857996e-06, "loss": 0.0007, "step": 19780 }, { "epoch": 5.218177021501122, "grad_norm": 0.052954208105802536, "learning_rate": 3.189483865294997e-06, "loss": 0.0002, "step": 19782 }, { "epoch": 5.218704656377787, "grad_norm": 0.008044705726206303, "learning_rate": 3.1891321551041947e-06, "loss": 0.0001, "step": 19784 }, { "epoch": 5.2192322912544515, "grad_norm": 0.009942793287336826, "learning_rate": 3.1887804449133913e-06, "loss": 0.0024, "step": 19786 }, { "epoch": 5.219759926131117, "grad_norm": 0.00437009334564209, "learning_rate": 3.1884287347225887e-06, "loss": 0.0008, "step": 19788 }, { "epoch": 5.220287561007782, "grad_norm": 0.1283257156610489, "learning_rate": 3.188077024531786e-06, "loss": 0.0007, "step": 19790 }, { "epoch": 5.220815195884448, "grad_norm": 0.001306431950069964, "learning_rate": 3.187725314340983e-06, "loss": 0.0001, "step": 19792 }, { "epoch": 5.221342830761113, "grad_norm": 0.002490402664989233, "learning_rate": 3.1873736041501804e-06, "loss": 0.0001, "step": 19794 }, { "epoch": 5.221870465637779, "grad_norm": 0.00918648112565279, "learning_rate": 3.1870218939593777e-06, "loss": 0.0001, "step": 19796 }, { "epoch": 5.222398100514444, "grad_norm": 0.055336739867925644, "learning_rate": 3.186670183768575e-06, "loss": 0.0001, "step": 19798 }, { "epoch": 5.222925735391109, "grad_norm": 0.06112392991781235, "learning_rate": 3.1863184735777716e-06, "loss": 0.0016, "step": 19800 }, { "epoch": 5.223453370267775, "grad_norm": 0.13621915876865387, "learning_rate": 3.185966763386969e-06, "loss": 0.003, "step": 19802 }, { "epoch": 5.22398100514444, "grad_norm": 0.0014914678176864982, "learning_rate": 3.1856150531961664e-06, "loss": 0.0001, "step": 19804 }, { "epoch": 5.224508640021106, "grad_norm": 0.004015233833342791, "learning_rate": 3.1852633430053638e-06, "loss": 0.0013, "step": 19806 }, { "epoch": 5.225036274897771, "grad_norm": 0.0506969578564167, "learning_rate": 3.1849116328145607e-06, "loss": 0.0002, "step": 19808 }, { "epoch": 5.225563909774436, "grad_norm": Infinity, "learning_rate": 3.184559922623758e-06, "loss": 0.0012, "step": 19810 }, { "epoch": 5.226091544651101, "grad_norm": 0.005514264106750488, "learning_rate": 3.1843840675283566e-06, "loss": 0.0016, "step": 19812 }, { "epoch": 5.226619179527766, "grad_norm": 0.009717871434986591, "learning_rate": 3.184032357337554e-06, "loss": 0.0001, "step": 19814 }, { "epoch": 5.227146814404432, "grad_norm": 0.18950092792510986, "learning_rate": 3.1836806471467513e-06, "loss": 0.0058, "step": 19816 }, { "epoch": 5.227674449281097, "grad_norm": 0.015638820827007294, "learning_rate": 3.1833289369559487e-06, "loss": 0.0001, "step": 19818 }, { "epoch": 5.228202084157763, "grad_norm": 0.0017730190884321928, "learning_rate": 3.1829772267651452e-06, "loss": 0.0001, "step": 19820 }, { "epoch": 5.228729719034428, "grad_norm": 0.07562500983476639, "learning_rate": 3.1826255165743426e-06, "loss": 0.0003, "step": 19822 }, { "epoch": 5.229257353911094, "grad_norm": 0.05501028150320053, "learning_rate": 3.18227380638354e-06, "loss": 0.0004, "step": 19824 }, { "epoch": 5.229784988787759, "grad_norm": 0.09486577659845352, "learning_rate": 3.1819220961927374e-06, "loss": 0.0006, "step": 19826 }, { "epoch": 5.230312623664425, "grad_norm": 0.006914726458489895, "learning_rate": 3.1815703860019343e-06, "loss": 0.0002, "step": 19828 }, { "epoch": 5.23084025854109, "grad_norm": 0.13896965980529785, "learning_rate": 3.1812186758111317e-06, "loss": 0.0004, "step": 19830 }, { "epoch": 5.2313678934177545, "grad_norm": 0.0062525211833417416, "learning_rate": 3.180866965620329e-06, "loss": 0.0001, "step": 19832 }, { "epoch": 5.23189552829442, "grad_norm": 0.024210968986153603, "learning_rate": 3.1805152554295264e-06, "loss": 0.0004, "step": 19834 }, { "epoch": 5.232423163171085, "grad_norm": 0.005203255917876959, "learning_rate": 3.1801635452387234e-06, "loss": 0.0005, "step": 19836 }, { "epoch": 5.232950798047751, "grad_norm": 0.03700409457087517, "learning_rate": 3.1798118350479208e-06, "loss": 0.0015, "step": 19838 }, { "epoch": 5.233478432924416, "grad_norm": 0.014053719118237495, "learning_rate": 3.179460124857118e-06, "loss": 0.0003, "step": 19840 }, { "epoch": 5.234006067801082, "grad_norm": 0.07980462908744812, "learning_rate": 3.1791084146663147e-06, "loss": 0.0021, "step": 19842 }, { "epoch": 5.234533702677747, "grad_norm": 0.3834536075592041, "learning_rate": 3.178756704475512e-06, "loss": 0.0047, "step": 19844 }, { "epoch": 5.235061337554412, "grad_norm": 0.015313912183046341, "learning_rate": 3.1784049942847094e-06, "loss": 0.0002, "step": 19846 }, { "epoch": 5.235588972431078, "grad_norm": 0.1991456001996994, "learning_rate": 3.178053284093907e-06, "loss": 0.0022, "step": 19848 }, { "epoch": 5.236116607307743, "grad_norm": 0.027449291199445724, "learning_rate": 3.1777015739031037e-06, "loss": 0.0003, "step": 19850 }, { "epoch": 5.236644242184409, "grad_norm": 0.02917012944817543, "learning_rate": 3.177349863712301e-06, "loss": 0.0001, "step": 19852 }, { "epoch": 5.237171877061074, "grad_norm": 0.005750253330916166, "learning_rate": 3.1769981535214985e-06, "loss": 0.0002, "step": 19854 }, { "epoch": 5.237699511937739, "grad_norm": 0.012018837034702301, "learning_rate": 3.176646443330696e-06, "loss": 0.0001, "step": 19856 }, { "epoch": 5.238227146814404, "grad_norm": 0.006862722337245941, "learning_rate": 3.1762947331398924e-06, "loss": 0.0028, "step": 19858 }, { "epoch": 5.238754781691069, "grad_norm": 0.11809685081243515, "learning_rate": 3.17594302294909e-06, "loss": 0.0006, "step": 19860 }, { "epoch": 5.239282416567735, "grad_norm": 0.004398743622004986, "learning_rate": 3.1755913127582876e-06, "loss": 0.0001, "step": 19862 }, { "epoch": 5.2398100514444, "grad_norm": 0.0009891303488984704, "learning_rate": 3.175239602567485e-06, "loss": 0.0002, "step": 19864 }, { "epoch": 5.240337686321066, "grad_norm": 0.10113672912120819, "learning_rate": 3.1748878923766815e-06, "loss": 0.0007, "step": 19866 }, { "epoch": 5.240865321197731, "grad_norm": 0.0022413316182792187, "learning_rate": 3.174536182185879e-06, "loss": 0.0001, "step": 19868 }, { "epoch": 5.241392956074397, "grad_norm": 0.01169448159635067, "learning_rate": 3.1741844719950762e-06, "loss": 0.0001, "step": 19870 }, { "epoch": 5.241920590951062, "grad_norm": 0.11158254742622375, "learning_rate": 3.173832761804273e-06, "loss": 0.0005, "step": 19872 }, { "epoch": 5.242448225827728, "grad_norm": 0.057958994060754776, "learning_rate": 3.1734810516134706e-06, "loss": 0.0001, "step": 19874 }, { "epoch": 5.242975860704393, "grad_norm": 0.005644058808684349, "learning_rate": 3.173129341422668e-06, "loss": 0.0001, "step": 19876 }, { "epoch": 5.2435034955810575, "grad_norm": 0.020895566791296005, "learning_rate": 3.1727776312318653e-06, "loss": 0.0001, "step": 19878 }, { "epoch": 5.244031130457723, "grad_norm": 0.001785656320862472, "learning_rate": 3.172425921041062e-06, "loss": 0.0002, "step": 19880 }, { "epoch": 5.244558765334388, "grad_norm": 0.0038102525286376476, "learning_rate": 3.1720742108502592e-06, "loss": 0.0001, "step": 19882 }, { "epoch": 5.245086400211054, "grad_norm": 0.009781795553863049, "learning_rate": 3.1717225006594566e-06, "loss": 0.0001, "step": 19884 }, { "epoch": 5.245614035087719, "grad_norm": 0.14392291009426117, "learning_rate": 3.1713707904686544e-06, "loss": 0.0016, "step": 19886 }, { "epoch": 5.246141669964385, "grad_norm": 0.006781395990401506, "learning_rate": 3.171019080277851e-06, "loss": 0.0001, "step": 19888 }, { "epoch": 5.24666930484105, "grad_norm": 0.23712973296642303, "learning_rate": 3.1706673700870483e-06, "loss": 0.0057, "step": 19890 }, { "epoch": 5.247196939717715, "grad_norm": 0.4574565887451172, "learning_rate": 3.1703156598962457e-06, "loss": 0.0015, "step": 19892 }, { "epoch": 5.247724574594381, "grad_norm": 0.015613817609846592, "learning_rate": 3.169963949705443e-06, "loss": 0.0001, "step": 19894 }, { "epoch": 5.248252209471046, "grad_norm": 0.015587994828820229, "learning_rate": 3.16961223951464e-06, "loss": 0.0002, "step": 19896 }, { "epoch": 5.248779844347712, "grad_norm": 0.01289428025484085, "learning_rate": 3.1692605293238374e-06, "loss": 0.0001, "step": 19898 }, { "epoch": 5.249307479224377, "grad_norm": 0.006037889048457146, "learning_rate": 3.1689088191330348e-06, "loss": 0.0002, "step": 19900 }, { "epoch": 5.249835114101042, "grad_norm": 0.0063100773841142654, "learning_rate": 3.1685571089422313e-06, "loss": 0.0001, "step": 19902 }, { "epoch": 5.250362748977707, "grad_norm": 0.45686736702919006, "learning_rate": 3.1682053987514287e-06, "loss": 0.001, "step": 19904 }, { "epoch": 5.250890383854372, "grad_norm": 0.18477463722229004, "learning_rate": 3.167853688560626e-06, "loss": 0.0025, "step": 19906 }, { "epoch": 5.251418018731038, "grad_norm": 0.008791016414761543, "learning_rate": 3.1675019783698234e-06, "loss": 0.0001, "step": 19908 }, { "epoch": 5.251945653607703, "grad_norm": 0.00812536757439375, "learning_rate": 3.1671502681790204e-06, "loss": 0.0001, "step": 19910 }, { "epoch": 5.252473288484369, "grad_norm": 0.023885656148195267, "learning_rate": 3.1667985579882177e-06, "loss": 0.0002, "step": 19912 }, { "epoch": 5.253000923361034, "grad_norm": 0.002905269619077444, "learning_rate": 3.166446847797415e-06, "loss": 0.0002, "step": 19914 }, { "epoch": 5.2535285582377, "grad_norm": 0.3060549795627594, "learning_rate": 3.1660951376066125e-06, "loss": 0.0025, "step": 19916 }, { "epoch": 5.254056193114365, "grad_norm": 0.0792110413312912, "learning_rate": 3.1657434274158094e-06, "loss": 0.0005, "step": 19918 }, { "epoch": 5.254583827991031, "grad_norm": 0.051791854202747345, "learning_rate": 3.165391717225007e-06, "loss": 0.0002, "step": 19920 }, { "epoch": 5.255111462867696, "grad_norm": 0.05281074345111847, "learning_rate": 3.165040007034204e-06, "loss": 0.0002, "step": 19922 }, { "epoch": 5.2556390977443606, "grad_norm": 0.003319705603644252, "learning_rate": 3.1646882968434016e-06, "loss": 0.0005, "step": 19924 }, { "epoch": 5.256166732621026, "grad_norm": 0.19686968624591827, "learning_rate": 3.164336586652598e-06, "loss": 0.0005, "step": 19926 }, { "epoch": 5.256694367497691, "grad_norm": 0.2354183793067932, "learning_rate": 3.1639848764617955e-06, "loss": 0.0064, "step": 19928 }, { "epoch": 5.257222002374357, "grad_norm": 0.061526551842689514, "learning_rate": 3.163633166270993e-06, "loss": 0.0004, "step": 19930 }, { "epoch": 5.257749637251022, "grad_norm": 0.40803074836730957, "learning_rate": 3.16328145608019e-06, "loss": 0.0054, "step": 19932 }, { "epoch": 5.258277272127688, "grad_norm": 0.29650798439979553, "learning_rate": 3.162929745889387e-06, "loss": 0.005, "step": 19934 }, { "epoch": 5.258804907004353, "grad_norm": 0.06541619449853897, "learning_rate": 3.1625780356985846e-06, "loss": 0.0006, "step": 19936 }, { "epoch": 5.259332541881018, "grad_norm": 0.5664782524108887, "learning_rate": 3.162226325507782e-06, "loss": 0.0048, "step": 19938 }, { "epoch": 5.259860176757684, "grad_norm": 0.006385027430951595, "learning_rate": 3.1618746153169785e-06, "loss": 0.0001, "step": 19940 }, { "epoch": 5.260387811634349, "grad_norm": 0.3595961034297943, "learning_rate": 3.161522905126176e-06, "loss": 0.0009, "step": 19942 }, { "epoch": 5.260915446511015, "grad_norm": 0.05542007461190224, "learning_rate": 3.1611711949353736e-06, "loss": 0.0011, "step": 19944 }, { "epoch": 5.26144308138768, "grad_norm": 0.08821289986371994, "learning_rate": 3.160819484744571e-06, "loss": 0.0008, "step": 19946 }, { "epoch": 5.261970716264345, "grad_norm": 0.05551775172352791, "learning_rate": 3.1604677745537675e-06, "loss": 0.0003, "step": 19948 }, { "epoch": 5.26249835114101, "grad_norm": 0.025658367201685905, "learning_rate": 3.160116064362965e-06, "loss": 0.0036, "step": 19950 }, { "epoch": 5.263025986017675, "grad_norm": 0.040705032646656036, "learning_rate": 3.1597643541721623e-06, "loss": 0.0003, "step": 19952 }, { "epoch": 5.263553620894341, "grad_norm": 0.002634249394759536, "learning_rate": 3.1594126439813597e-06, "loss": 0.0003, "step": 19954 }, { "epoch": 5.264081255771006, "grad_norm": 0.03723890334367752, "learning_rate": 3.1590609337905566e-06, "loss": 0.0003, "step": 19956 }, { "epoch": 5.264608890647672, "grad_norm": 0.004196057096123695, "learning_rate": 3.158709223599754e-06, "loss": 0.0001, "step": 19958 }, { "epoch": 5.265136525524337, "grad_norm": 0.01116764172911644, "learning_rate": 3.1583575134089514e-06, "loss": 0.0001, "step": 19960 }, { "epoch": 5.265664160401003, "grad_norm": 0.007989729754626751, "learning_rate": 3.158005803218148e-06, "loss": 0.0017, "step": 19962 }, { "epoch": 5.266191795277668, "grad_norm": 0.006477365270256996, "learning_rate": 3.1576540930273453e-06, "loss": 0.0002, "step": 19964 }, { "epoch": 5.266719430154334, "grad_norm": 0.004459640011191368, "learning_rate": 3.1573023828365426e-06, "loss": 0.0001, "step": 19966 }, { "epoch": 5.267247065030999, "grad_norm": 0.12716059386730194, "learning_rate": 3.15695067264574e-06, "loss": 0.0034, "step": 19968 }, { "epoch": 5.267774699907664, "grad_norm": 0.014089767821133137, "learning_rate": 3.156598962454937e-06, "loss": 0.0002, "step": 19970 }, { "epoch": 5.268302334784329, "grad_norm": 0.008613976649940014, "learning_rate": 3.1562472522641344e-06, "loss": 0.0001, "step": 19972 }, { "epoch": 5.268829969660994, "grad_norm": 0.03389643505215645, "learning_rate": 3.1558955420733317e-06, "loss": 0.0002, "step": 19974 }, { "epoch": 5.26935760453766, "grad_norm": 0.0013783497270196676, "learning_rate": 3.155543831882529e-06, "loss": 0.0004, "step": 19976 }, { "epoch": 5.269885239414325, "grad_norm": 0.0047700111754238605, "learning_rate": 3.155192121691726e-06, "loss": 0.0013, "step": 19978 }, { "epoch": 5.270412874290991, "grad_norm": 0.0031104793306440115, "learning_rate": 3.1548404115009234e-06, "loss": 0.0001, "step": 19980 }, { "epoch": 5.270940509167656, "grad_norm": 0.006164670456200838, "learning_rate": 3.154488701310121e-06, "loss": 0.0001, "step": 19982 }, { "epoch": 5.271468144044321, "grad_norm": 0.01096603274345398, "learning_rate": 3.154136991119318e-06, "loss": 0.0002, "step": 19984 }, { "epoch": 5.271995778920987, "grad_norm": 0.018546007573604584, "learning_rate": 3.1537852809285147e-06, "loss": 0.0002, "step": 19986 }, { "epoch": 5.272523413797652, "grad_norm": 0.002131555462256074, "learning_rate": 3.153433570737712e-06, "loss": 0.0001, "step": 19988 }, { "epoch": 5.273051048674318, "grad_norm": 0.002763856668025255, "learning_rate": 3.1530818605469095e-06, "loss": 0.0002, "step": 19990 }, { "epoch": 5.273578683550983, "grad_norm": 0.008812056854367256, "learning_rate": 3.1527301503561064e-06, "loss": 0.0007, "step": 19992 }, { "epoch": 5.274106318427648, "grad_norm": 0.00897153653204441, "learning_rate": 3.152378440165304e-06, "loss": 0.0001, "step": 19994 }, { "epoch": 5.274633953304313, "grad_norm": 0.30236008763313293, "learning_rate": 3.152026729974501e-06, "loss": 0.0068, "step": 19996 }, { "epoch": 5.275161588180978, "grad_norm": 0.013683860190212727, "learning_rate": 3.1516750197836985e-06, "loss": 0.0032, "step": 19998 }, { "epoch": 5.275689223057644, "grad_norm": 0.002489592880010605, "learning_rate": 3.151323309592895e-06, "loss": 0.0002, "step": 20000 }, { "epoch": 5.275689223057644, "eval_loss": 0.0019461425254121423, "eval_runtime": 304.1126, "eval_samples_per_second": 709.083, "eval_steps_per_second": 88.638, "step": 20000 }, { "epoch": 5.276216857934309, "grad_norm": 0.016122208908200264, "learning_rate": 3.150971599402093e-06, "loss": 0.0001, "step": 20002 }, { "epoch": 5.276744492810975, "grad_norm": 0.010529412887990475, "learning_rate": 3.1506198892112903e-06, "loss": 0.0001, "step": 20004 }, { "epoch": 5.27727212768764, "grad_norm": 0.006902644410729408, "learning_rate": 3.1502681790204876e-06, "loss": 0.0003, "step": 20006 }, { "epoch": 5.277799762564306, "grad_norm": 0.05674930289387703, "learning_rate": 3.149916468829684e-06, "loss": 0.0003, "step": 20008 }, { "epoch": 5.278327397440971, "grad_norm": 0.27168652415275574, "learning_rate": 3.1495647586388815e-06, "loss": 0.0036, "step": 20010 }, { "epoch": 5.278855032317637, "grad_norm": 1.212103247642517, "learning_rate": 3.149213048448079e-06, "loss": 0.0034, "step": 20012 }, { "epoch": 5.279382667194302, "grad_norm": 0.002291352953761816, "learning_rate": 3.1488613382572763e-06, "loss": 0.0002, "step": 20014 }, { "epoch": 5.279910302070967, "grad_norm": 0.20662137866020203, "learning_rate": 3.1485096280664732e-06, "loss": 0.0011, "step": 20016 }, { "epoch": 5.280437936947632, "grad_norm": 0.011075161397457123, "learning_rate": 3.1481579178756706e-06, "loss": 0.0004, "step": 20018 }, { "epoch": 5.280965571824297, "grad_norm": 0.03893759474158287, "learning_rate": 3.147806207684868e-06, "loss": 0.0003, "step": 20020 }, { "epoch": 5.281493206700963, "grad_norm": 0.0036991217639297247, "learning_rate": 3.1474544974940645e-06, "loss": 0.0003, "step": 20022 }, { "epoch": 5.282020841577628, "grad_norm": 0.3213101327419281, "learning_rate": 3.147102787303262e-06, "loss": 0.0039, "step": 20024 }, { "epoch": 5.282548476454294, "grad_norm": 0.041417594999074936, "learning_rate": 3.1467510771124593e-06, "loss": 0.0055, "step": 20026 }, { "epoch": 5.283076111330959, "grad_norm": 0.3789888918399811, "learning_rate": 3.146399366921657e-06, "loss": 0.0022, "step": 20028 }, { "epoch": 5.283603746207624, "grad_norm": 0.04583153873682022, "learning_rate": 3.1460476567308536e-06, "loss": 0.0006, "step": 20030 }, { "epoch": 5.28413138108429, "grad_norm": 0.0071450211107730865, "learning_rate": 3.145695946540051e-06, "loss": 0.0002, "step": 20032 }, { "epoch": 5.284659015960955, "grad_norm": 0.0030535683035850525, "learning_rate": 3.1453442363492483e-06, "loss": 0.0053, "step": 20034 }, { "epoch": 5.285186650837621, "grad_norm": 0.030891045928001404, "learning_rate": 3.1449925261584457e-06, "loss": 0.0002, "step": 20036 }, { "epoch": 5.285714285714286, "grad_norm": 0.052723709493875504, "learning_rate": 3.1446408159676427e-06, "loss": 0.0002, "step": 20038 }, { "epoch": 5.286241920590951, "grad_norm": 0.005892832763493061, "learning_rate": 3.14428910577684e-06, "loss": 0.0006, "step": 20040 }, { "epoch": 5.286769555467616, "grad_norm": 0.009537159465253353, "learning_rate": 3.1439373955860374e-06, "loss": 0.0001, "step": 20042 }, { "epoch": 5.287297190344281, "grad_norm": 0.00366606330499053, "learning_rate": 3.143585685395235e-06, "loss": 0.0114, "step": 20044 }, { "epoch": 5.287824825220947, "grad_norm": 0.008156551979482174, "learning_rate": 3.1432339752044313e-06, "loss": 0.0014, "step": 20046 }, { "epoch": 5.288352460097612, "grad_norm": 0.013097278773784637, "learning_rate": 3.1428822650136287e-06, "loss": 0.0002, "step": 20048 }, { "epoch": 5.288880094974278, "grad_norm": 0.07753439992666245, "learning_rate": 3.142530554822826e-06, "loss": 0.0006, "step": 20050 }, { "epoch": 5.289407729850943, "grad_norm": 0.013837953098118305, "learning_rate": 3.142178844632023e-06, "loss": 0.0003, "step": 20052 }, { "epoch": 5.289935364727609, "grad_norm": 0.07200545072555542, "learning_rate": 3.1418271344412204e-06, "loss": 0.0006, "step": 20054 }, { "epoch": 5.290462999604274, "grad_norm": 0.009670154191553593, "learning_rate": 3.1414754242504178e-06, "loss": 0.0002, "step": 20056 }, { "epoch": 5.29099063448094, "grad_norm": 0.003008197993040085, "learning_rate": 3.141123714059615e-06, "loss": 0.0001, "step": 20058 }, { "epoch": 5.291518269357605, "grad_norm": 0.005687167402356863, "learning_rate": 3.140772003868812e-06, "loss": 0.0001, "step": 20060 }, { "epoch": 5.29204590423427, "grad_norm": 0.010518805123865604, "learning_rate": 3.1404202936780095e-06, "loss": 0.0001, "step": 20062 }, { "epoch": 5.292573539110935, "grad_norm": 0.00597790814936161, "learning_rate": 3.140068583487207e-06, "loss": 0.0002, "step": 20064 }, { "epoch": 5.2931011739876, "grad_norm": 0.001588302431628108, "learning_rate": 3.1397168732964042e-06, "loss": 0.0005, "step": 20066 }, { "epoch": 5.293628808864266, "grad_norm": 1.086695909500122, "learning_rate": 3.1393651631056008e-06, "loss": 0.0054, "step": 20068 }, { "epoch": 5.294156443740931, "grad_norm": 0.007922971621155739, "learning_rate": 3.139013452914798e-06, "loss": 0.0003, "step": 20070 }, { "epoch": 5.294684078617596, "grad_norm": 0.014128990471363068, "learning_rate": 3.1386617427239955e-06, "loss": 0.0002, "step": 20072 }, { "epoch": 5.295211713494262, "grad_norm": 0.014225970022380352, "learning_rate": 3.138310032533193e-06, "loss": 0.0002, "step": 20074 }, { "epoch": 5.295739348370927, "grad_norm": 0.019103439524769783, "learning_rate": 3.13795832234239e-06, "loss": 0.0002, "step": 20076 }, { "epoch": 5.296266983247593, "grad_norm": 0.0014916762011125684, "learning_rate": 3.1376066121515872e-06, "loss": 0.0001, "step": 20078 }, { "epoch": 5.296794618124258, "grad_norm": 0.004385316278785467, "learning_rate": 3.1372549019607846e-06, "loss": 0.0036, "step": 20080 }, { "epoch": 5.297322253000924, "grad_norm": 0.17578546702861786, "learning_rate": 3.136903191769981e-06, "loss": 0.0008, "step": 20082 }, { "epoch": 5.297849887877589, "grad_norm": 0.13658280670642853, "learning_rate": 3.1365514815791785e-06, "loss": 0.0007, "step": 20084 }, { "epoch": 5.298377522754254, "grad_norm": 0.12597571313381195, "learning_rate": 3.1361997713883763e-06, "loss": 0.0012, "step": 20086 }, { "epoch": 5.298905157630919, "grad_norm": 0.012691935524344444, "learning_rate": 3.1358480611975737e-06, "loss": 0.0003, "step": 20088 }, { "epoch": 5.299432792507584, "grad_norm": 0.005911156069487333, "learning_rate": 3.1354963510067702e-06, "loss": 0.0001, "step": 20090 }, { "epoch": 5.29996042738425, "grad_norm": 0.009511969983577728, "learning_rate": 3.1351446408159676e-06, "loss": 0.0001, "step": 20092 }, { "epoch": 5.300488062260915, "grad_norm": 0.007168640848249197, "learning_rate": 3.134792930625165e-06, "loss": 0.0002, "step": 20094 }, { "epoch": 5.301015697137581, "grad_norm": 0.006553767714649439, "learning_rate": 3.1344412204343623e-06, "loss": 0.0007, "step": 20096 }, { "epoch": 5.301543332014246, "grad_norm": 0.009125554002821445, "learning_rate": 3.1340895102435593e-06, "loss": 0.0002, "step": 20098 }, { "epoch": 5.302070966890912, "grad_norm": 0.014401113614439964, "learning_rate": 3.1337378000527567e-06, "loss": 0.0099, "step": 20100 }, { "epoch": 5.302598601767577, "grad_norm": 0.08136427402496338, "learning_rate": 3.133386089861954e-06, "loss": 0.0014, "step": 20102 }, { "epoch": 5.303126236644243, "grad_norm": 0.1768854558467865, "learning_rate": 3.1330343796711514e-06, "loss": 0.0045, "step": 20104 }, { "epoch": 5.303653871520908, "grad_norm": 0.20797550678253174, "learning_rate": 3.132682669480348e-06, "loss": 0.0003, "step": 20106 }, { "epoch": 5.304181506397573, "grad_norm": 0.005052283871918917, "learning_rate": 3.1323309592895453e-06, "loss": 0.0001, "step": 20108 }, { "epoch": 5.304709141274238, "grad_norm": 0.005094411317259073, "learning_rate": 3.1319792490987427e-06, "loss": 0.0001, "step": 20110 }, { "epoch": 5.305236776150903, "grad_norm": 0.0023044648114591837, "learning_rate": 3.1316275389079397e-06, "loss": 0.0001, "step": 20112 }, { "epoch": 5.305764411027569, "grad_norm": 0.0054204571060836315, "learning_rate": 3.131275828717137e-06, "loss": 0.0002, "step": 20114 }, { "epoch": 5.306292045904234, "grad_norm": 0.007283410523086786, "learning_rate": 3.1309241185263344e-06, "loss": 0.0001, "step": 20116 }, { "epoch": 5.306819680780899, "grad_norm": 0.0029844832606613636, "learning_rate": 3.1305724083355318e-06, "loss": 0.0001, "step": 20118 }, { "epoch": 5.307347315657565, "grad_norm": 0.13258495926856995, "learning_rate": 3.1302206981447287e-06, "loss": 0.0003, "step": 20120 }, { "epoch": 5.30787495053423, "grad_norm": 0.016211507841944695, "learning_rate": 3.129868987953926e-06, "loss": 0.0011, "step": 20122 }, { "epoch": 5.308402585410896, "grad_norm": 0.0022818055003881454, "learning_rate": 3.1295172777631235e-06, "loss": 0.0001, "step": 20124 }, { "epoch": 5.308930220287561, "grad_norm": 0.004201310221105814, "learning_rate": 3.129165567572321e-06, "loss": 0.0002, "step": 20126 }, { "epoch": 5.309457855164227, "grad_norm": 0.0029867906123399734, "learning_rate": 3.1288138573815174e-06, "loss": 0.0001, "step": 20128 }, { "epoch": 5.309985490040892, "grad_norm": 0.17956089973449707, "learning_rate": 3.1284621471907148e-06, "loss": 0.0064, "step": 20130 }, { "epoch": 5.3105131249175574, "grad_norm": 0.006509259808808565, "learning_rate": 3.128110436999912e-06, "loss": 0.0008, "step": 20132 }, { "epoch": 5.311040759794222, "grad_norm": 0.006591075100004673, "learning_rate": 3.1277587268091095e-06, "loss": 0.0049, "step": 20134 }, { "epoch": 5.311568394670887, "grad_norm": 0.013453309424221516, "learning_rate": 3.1274070166183065e-06, "loss": 0.0001, "step": 20136 }, { "epoch": 5.312096029547553, "grad_norm": 0.011382093653082848, "learning_rate": 3.127055306427504e-06, "loss": 0.0037, "step": 20138 }, { "epoch": 5.312623664424218, "grad_norm": 0.011084301397204399, "learning_rate": 3.1267035962367012e-06, "loss": 0.0002, "step": 20140 }, { "epoch": 5.313151299300884, "grad_norm": 0.006472433917224407, "learning_rate": 3.1263518860458977e-06, "loss": 0.0001, "step": 20142 }, { "epoch": 5.313678934177549, "grad_norm": 0.04911481961607933, "learning_rate": 3.1260001758550955e-06, "loss": 0.0005, "step": 20144 }, { "epoch": 5.314206569054215, "grad_norm": 0.1402527242898941, "learning_rate": 3.125648465664293e-06, "loss": 0.0005, "step": 20146 }, { "epoch": 5.31473420393088, "grad_norm": 0.003420952707529068, "learning_rate": 3.1252967554734903e-06, "loss": 0.0001, "step": 20148 }, { "epoch": 5.315261838807545, "grad_norm": 0.9438023567199707, "learning_rate": 3.124945045282687e-06, "loss": 0.0039, "step": 20150 }, { "epoch": 5.315789473684211, "grad_norm": 0.21868523955345154, "learning_rate": 3.124593335091884e-06, "loss": 0.0084, "step": 20152 }, { "epoch": 5.316317108560876, "grad_norm": 0.006225903518497944, "learning_rate": 3.1242416249010816e-06, "loss": 0.0001, "step": 20154 }, { "epoch": 5.316844743437541, "grad_norm": 0.1318693906068802, "learning_rate": 3.123889914710279e-06, "loss": 0.0005, "step": 20156 }, { "epoch": 5.317372378314206, "grad_norm": 0.02866855263710022, "learning_rate": 3.123538204519476e-06, "loss": 0.0001, "step": 20158 }, { "epoch": 5.317900013190872, "grad_norm": 0.013593428768217564, "learning_rate": 3.1231864943286733e-06, "loss": 0.0001, "step": 20160 }, { "epoch": 5.318427648067537, "grad_norm": 0.005202896427363157, "learning_rate": 3.1228347841378707e-06, "loss": 0.0017, "step": 20162 }, { "epoch": 5.318955282944202, "grad_norm": 0.002560005523264408, "learning_rate": 3.122483073947068e-06, "loss": 0.0001, "step": 20164 }, { "epoch": 5.319482917820868, "grad_norm": 0.011995099484920502, "learning_rate": 3.1221313637562646e-06, "loss": 0.0001, "step": 20166 }, { "epoch": 5.320010552697533, "grad_norm": 0.0018920984584838152, "learning_rate": 3.121779653565462e-06, "loss": 0.0001, "step": 20168 }, { "epoch": 5.320538187574199, "grad_norm": 0.015805210918188095, "learning_rate": 3.1214279433746597e-06, "loss": 0.0002, "step": 20170 }, { "epoch": 5.321065822450864, "grad_norm": 0.5982699394226074, "learning_rate": 3.1210762331838563e-06, "loss": 0.011, "step": 20172 }, { "epoch": 5.32159345732753, "grad_norm": 0.005862273275852203, "learning_rate": 3.1207245229930536e-06, "loss": 0.0001, "step": 20174 }, { "epoch": 5.322121092204195, "grad_norm": 0.0024233900476247072, "learning_rate": 3.120372812802251e-06, "loss": 0.0001, "step": 20176 }, { "epoch": 5.3226487270808605, "grad_norm": 0.015487181022763252, "learning_rate": 3.1200211026114484e-06, "loss": 0.0002, "step": 20178 }, { "epoch": 5.323176361957525, "grad_norm": 0.01729806885123253, "learning_rate": 3.1196693924206453e-06, "loss": 0.0002, "step": 20180 }, { "epoch": 5.32370399683419, "grad_norm": 0.027921229600906372, "learning_rate": 3.1193176822298427e-06, "loss": 0.0004, "step": 20182 }, { "epoch": 5.324231631710856, "grad_norm": 0.0074347518384456635, "learning_rate": 3.11896597203904e-06, "loss": 0.0017, "step": 20184 }, { "epoch": 5.324759266587521, "grad_norm": 0.0038469566497951746, "learning_rate": 3.1186142618482375e-06, "loss": 0.0032, "step": 20186 }, { "epoch": 5.325286901464187, "grad_norm": 0.0068137007765471935, "learning_rate": 3.118262551657434e-06, "loss": 0.0002, "step": 20188 }, { "epoch": 5.325814536340852, "grad_norm": 0.0036937249824404716, "learning_rate": 3.1179108414666314e-06, "loss": 0.0001, "step": 20190 }, { "epoch": 5.326342171217518, "grad_norm": 0.11323093622922897, "learning_rate": 3.1175591312758288e-06, "loss": 0.0003, "step": 20192 }, { "epoch": 5.326869806094183, "grad_norm": 0.1507563441991806, "learning_rate": 3.117207421085026e-06, "loss": 0.0042, "step": 20194 }, { "epoch": 5.327397440970848, "grad_norm": 0.02273755893111229, "learning_rate": 3.116855710894223e-06, "loss": 0.0003, "step": 20196 }, { "epoch": 5.327925075847514, "grad_norm": 0.0025804704055190086, "learning_rate": 3.1165040007034205e-06, "loss": 0.0001, "step": 20198 }, { "epoch": 5.328452710724179, "grad_norm": 0.21537776291370392, "learning_rate": 3.116152290512618e-06, "loss": 0.0095, "step": 20200 }, { "epoch": 5.328980345600844, "grad_norm": 0.059495050460100174, "learning_rate": 3.115800580321815e-06, "loss": 0.0005, "step": 20202 }, { "epoch": 5.329507980477509, "grad_norm": 0.0036347396671772003, "learning_rate": 3.115448870131012e-06, "loss": 0.0003, "step": 20204 }, { "epoch": 5.330035615354175, "grad_norm": 0.011045986786484718, "learning_rate": 3.1150971599402095e-06, "loss": 0.0002, "step": 20206 }, { "epoch": 5.33056325023084, "grad_norm": 0.3413003981113434, "learning_rate": 3.114745449749407e-06, "loss": 0.0047, "step": 20208 }, { "epoch": 5.331090885107505, "grad_norm": 0.019175538793206215, "learning_rate": 3.1143937395586034e-06, "loss": 0.0006, "step": 20210 }, { "epoch": 5.331618519984171, "grad_norm": 0.19480031728744507, "learning_rate": 3.114042029367801e-06, "loss": 0.0007, "step": 20212 }, { "epoch": 5.332146154860836, "grad_norm": 0.4619605243206024, "learning_rate": 3.113690319176998e-06, "loss": 0.0017, "step": 20214 }, { "epoch": 5.332673789737502, "grad_norm": 0.01817024126648903, "learning_rate": 3.1133386089861956e-06, "loss": 0.0002, "step": 20216 }, { "epoch": 5.333201424614167, "grad_norm": 0.02746841311454773, "learning_rate": 3.1129868987953925e-06, "loss": 0.0005, "step": 20218 }, { "epoch": 5.333729059490833, "grad_norm": 0.008004304952919483, "learning_rate": 3.11263518860459e-06, "loss": 0.0001, "step": 20220 }, { "epoch": 5.334256694367498, "grad_norm": 0.006406373344361782, "learning_rate": 3.1122834784137873e-06, "loss": 0.0002, "step": 20222 }, { "epoch": 5.3347843292441635, "grad_norm": 0.007261650636792183, "learning_rate": 3.1119317682229847e-06, "loss": 0.0002, "step": 20224 }, { "epoch": 5.335311964120828, "grad_norm": 0.029534045606851578, "learning_rate": 3.111580058032181e-06, "loss": 0.0003, "step": 20226 }, { "epoch": 5.335839598997493, "grad_norm": 0.014379741623997688, "learning_rate": 3.111228347841379e-06, "loss": 0.0002, "step": 20228 }, { "epoch": 5.336367233874159, "grad_norm": 0.011332129128277302, "learning_rate": 3.1108766376505764e-06, "loss": 0.0003, "step": 20230 }, { "epoch": 5.336894868750824, "grad_norm": 0.4300891160964966, "learning_rate": 3.110524927459773e-06, "loss": 0.001, "step": 20232 }, { "epoch": 5.33742250362749, "grad_norm": 0.04293451085686684, "learning_rate": 3.1101732172689703e-06, "loss": 0.0027, "step": 20234 }, { "epoch": 5.337950138504155, "grad_norm": 0.017005857080221176, "learning_rate": 3.1098215070781676e-06, "loss": 0.0003, "step": 20236 }, { "epoch": 5.338477773380821, "grad_norm": 0.014026505872607231, "learning_rate": 3.109469796887365e-06, "loss": 0.0002, "step": 20238 }, { "epoch": 5.339005408257486, "grad_norm": 0.005087933037430048, "learning_rate": 3.109118086696562e-06, "loss": 0.0002, "step": 20240 }, { "epoch": 5.339533043134151, "grad_norm": 0.007829387672245502, "learning_rate": 3.1087663765057593e-06, "loss": 0.0003, "step": 20242 }, { "epoch": 5.340060678010817, "grad_norm": 0.19912423193454742, "learning_rate": 3.1084146663149567e-06, "loss": 0.0037, "step": 20244 }, { "epoch": 5.340588312887482, "grad_norm": 0.019110694527626038, "learning_rate": 3.108062956124154e-06, "loss": 0.0014, "step": 20246 }, { "epoch": 5.341115947764147, "grad_norm": 0.3247969448566437, "learning_rate": 3.1077112459333506e-06, "loss": 0.0012, "step": 20248 }, { "epoch": 5.341643582640812, "grad_norm": 0.224511057138443, "learning_rate": 3.107359535742548e-06, "loss": 0.0015, "step": 20250 }, { "epoch": 5.342171217517478, "grad_norm": 0.02590758167207241, "learning_rate": 3.1070078255517454e-06, "loss": 0.0003, "step": 20252 }, { "epoch": 5.342698852394143, "grad_norm": 0.08506114780902863, "learning_rate": 3.1066561153609428e-06, "loss": 0.0022, "step": 20254 }, { "epoch": 5.343226487270808, "grad_norm": 0.020291149616241455, "learning_rate": 3.1063044051701397e-06, "loss": 0.0031, "step": 20256 }, { "epoch": 5.343754122147474, "grad_norm": 0.21895858645439148, "learning_rate": 3.105952694979337e-06, "loss": 0.0007, "step": 20258 }, { "epoch": 5.344281757024139, "grad_norm": 0.011498050764203072, "learning_rate": 3.1056009847885345e-06, "loss": 0.0001, "step": 20260 }, { "epoch": 5.344809391900805, "grad_norm": 0.0026063425466418266, "learning_rate": 3.1052492745977314e-06, "loss": 0.0001, "step": 20262 }, { "epoch": 5.34533702677747, "grad_norm": 0.0036906825844198465, "learning_rate": 3.1048975644069288e-06, "loss": 0.0002, "step": 20264 }, { "epoch": 5.345864661654136, "grad_norm": 0.007645245175808668, "learning_rate": 3.104545854216126e-06, "loss": 0.0002, "step": 20266 }, { "epoch": 5.346392296530801, "grad_norm": 0.010305448435246944, "learning_rate": 3.1041941440253235e-06, "loss": 0.0002, "step": 20268 }, { "epoch": 5.3469199314074665, "grad_norm": 0.025078972801566124, "learning_rate": 3.10384243383452e-06, "loss": 0.0015, "step": 20270 }, { "epoch": 5.347447566284131, "grad_norm": 0.019089706242084503, "learning_rate": 3.1034907236437174e-06, "loss": 0.0002, "step": 20272 }, { "epoch": 5.347975201160796, "grad_norm": 0.01457407046109438, "learning_rate": 3.103139013452915e-06, "loss": 0.0001, "step": 20274 }, { "epoch": 5.348502836037462, "grad_norm": 0.0072392974980175495, "learning_rate": 3.102787303262112e-06, "loss": 0.0001, "step": 20276 }, { "epoch": 5.349030470914127, "grad_norm": 0.002766196383163333, "learning_rate": 3.102435593071309e-06, "loss": 0.0001, "step": 20278 }, { "epoch": 5.349558105790793, "grad_norm": 0.01552986167371273, "learning_rate": 3.1020838828805065e-06, "loss": 0.002, "step": 20280 }, { "epoch": 5.350085740667458, "grad_norm": 0.3548204302787781, "learning_rate": 3.101732172689704e-06, "loss": 0.0053, "step": 20282 }, { "epoch": 5.350613375544124, "grad_norm": 0.0048806858249008656, "learning_rate": 3.1013804624989013e-06, "loss": 0.0002, "step": 20284 }, { "epoch": 5.351141010420789, "grad_norm": 0.003915280103683472, "learning_rate": 3.1010287523080982e-06, "loss": 0.0001, "step": 20286 }, { "epoch": 5.351668645297454, "grad_norm": 0.004345255438238382, "learning_rate": 3.1006770421172956e-06, "loss": 0.0001, "step": 20288 }, { "epoch": 5.35219628017412, "grad_norm": 0.0714947059750557, "learning_rate": 3.100325331926493e-06, "loss": 0.0003, "step": 20290 }, { "epoch": 5.352723915050785, "grad_norm": 0.003646446857601404, "learning_rate": 3.0999736217356895e-06, "loss": 0.0033, "step": 20292 }, { "epoch": 5.3532515499274504, "grad_norm": 0.01734762266278267, "learning_rate": 3.099621911544887e-06, "loss": 0.0003, "step": 20294 }, { "epoch": 5.353779184804115, "grad_norm": 0.006451161578297615, "learning_rate": 3.0992702013540843e-06, "loss": 0.0002, "step": 20296 }, { "epoch": 5.354306819680781, "grad_norm": 0.02054118737578392, "learning_rate": 3.0989184911632816e-06, "loss": 0.0003, "step": 20298 }, { "epoch": 5.354834454557446, "grad_norm": 0.21161623299121857, "learning_rate": 3.0985667809724786e-06, "loss": 0.0019, "step": 20300 }, { "epoch": 5.355362089434111, "grad_norm": 0.00733364699408412, "learning_rate": 3.098215070781676e-06, "loss": 0.0012, "step": 20302 }, { "epoch": 5.355889724310777, "grad_norm": 0.023035570979118347, "learning_rate": 3.0978633605908733e-06, "loss": 0.0017, "step": 20304 }, { "epoch": 5.356417359187442, "grad_norm": 0.0021665231324732304, "learning_rate": 3.0975116504000707e-06, "loss": 0.0001, "step": 20306 }, { "epoch": 5.356944994064108, "grad_norm": 0.00510356854647398, "learning_rate": 3.0971599402092672e-06, "loss": 0.0037, "step": 20308 }, { "epoch": 5.357472628940773, "grad_norm": 0.19193542003631592, "learning_rate": 3.0968082300184646e-06, "loss": 0.0045, "step": 20310 }, { "epoch": 5.358000263817439, "grad_norm": 0.016870548948645592, "learning_rate": 3.096456519827662e-06, "loss": 0.0001, "step": 20312 }, { "epoch": 5.358527898694104, "grad_norm": 0.003904543351382017, "learning_rate": 3.09610480963686e-06, "loss": 0.0003, "step": 20314 }, { "epoch": 5.3590555335707695, "grad_norm": 0.005929847247898579, "learning_rate": 3.0957530994460563e-06, "loss": 0.0001, "step": 20316 }, { "epoch": 5.359583168447434, "grad_norm": 0.09232104569673538, "learning_rate": 3.0954013892552537e-06, "loss": 0.0004, "step": 20318 }, { "epoch": 5.360110803324099, "grad_norm": 0.0034175680484622717, "learning_rate": 3.095049679064451e-06, "loss": 0.0001, "step": 20320 }, { "epoch": 5.360638438200765, "grad_norm": 0.002623643260449171, "learning_rate": 3.094697968873648e-06, "loss": 0.0001, "step": 20322 }, { "epoch": 5.36116607307743, "grad_norm": 0.004325853195041418, "learning_rate": 3.0943462586828454e-06, "loss": 0.0016, "step": 20324 }, { "epoch": 5.361693707954096, "grad_norm": 0.016335023567080498, "learning_rate": 3.0939945484920428e-06, "loss": 0.0001, "step": 20326 }, { "epoch": 5.362221342830761, "grad_norm": 0.009894092567265034, "learning_rate": 3.09364283830124e-06, "loss": 0.0001, "step": 20328 }, { "epoch": 5.362748977707427, "grad_norm": 0.2483990341424942, "learning_rate": 3.0932911281104367e-06, "loss": 0.0041, "step": 20330 }, { "epoch": 5.363276612584092, "grad_norm": 0.007570057176053524, "learning_rate": 3.092939417919634e-06, "loss": 0.002, "step": 20332 }, { "epoch": 5.363804247460757, "grad_norm": 0.015471828170120716, "learning_rate": 3.0925877077288314e-06, "loss": 0.0009, "step": 20334 }, { "epoch": 5.364331882337423, "grad_norm": 0.036382999271154404, "learning_rate": 3.092235997538029e-06, "loss": 0.0002, "step": 20336 }, { "epoch": 5.364859517214088, "grad_norm": 0.0975150540471077, "learning_rate": 3.0918842873472258e-06, "loss": 0.0003, "step": 20338 }, { "epoch": 5.3653871520907535, "grad_norm": 0.003955089952796698, "learning_rate": 3.091532577156423e-06, "loss": 0.0001, "step": 20340 }, { "epoch": 5.365914786967418, "grad_norm": 0.0017141805728897452, "learning_rate": 3.0911808669656205e-06, "loss": 0.0002, "step": 20342 }, { "epoch": 5.366442421844084, "grad_norm": 0.3291282057762146, "learning_rate": 3.090829156774818e-06, "loss": 0.0023, "step": 20344 }, { "epoch": 5.366970056720749, "grad_norm": 0.0031302382703870535, "learning_rate": 3.090477446584015e-06, "loss": 0.0001, "step": 20346 }, { "epoch": 5.367497691597414, "grad_norm": 0.27396121621131897, "learning_rate": 3.0901257363932122e-06, "loss": 0.0032, "step": 20348 }, { "epoch": 5.36802532647408, "grad_norm": 0.015350278466939926, "learning_rate": 3.0897740262024096e-06, "loss": 0.0006, "step": 20350 }, { "epoch": 5.368552961350745, "grad_norm": 0.0047760773450136185, "learning_rate": 3.089422316011606e-06, "loss": 0.0002, "step": 20352 }, { "epoch": 5.369080596227411, "grad_norm": 0.0028759408742189407, "learning_rate": 3.0890706058208035e-06, "loss": 0.0002, "step": 20354 }, { "epoch": 5.369608231104076, "grad_norm": 0.027046117931604385, "learning_rate": 3.088718895630001e-06, "loss": 0.0003, "step": 20356 }, { "epoch": 5.370135865980742, "grad_norm": 0.022760745137929916, "learning_rate": 3.0883671854391982e-06, "loss": 0.0001, "step": 20358 }, { "epoch": 5.370663500857407, "grad_norm": 0.0067617385648190975, "learning_rate": 3.088015475248395e-06, "loss": 0.0001, "step": 20360 }, { "epoch": 5.3711911357340725, "grad_norm": 0.0374675989151001, "learning_rate": 3.0876637650575926e-06, "loss": 0.0004, "step": 20362 }, { "epoch": 5.371718770610737, "grad_norm": 0.005158040206879377, "learning_rate": 3.08731205486679e-06, "loss": 0.0001, "step": 20364 }, { "epoch": 5.372246405487402, "grad_norm": 0.0036461837589740753, "learning_rate": 3.0869603446759873e-06, "loss": 0.0001, "step": 20366 }, { "epoch": 5.372774040364068, "grad_norm": 0.010535189881920815, "learning_rate": 3.086608634485184e-06, "loss": 0.0008, "step": 20368 }, { "epoch": 5.373301675240733, "grad_norm": 0.3198275864124298, "learning_rate": 3.0862569242943812e-06, "loss": 0.0008, "step": 20370 }, { "epoch": 5.373829310117399, "grad_norm": 0.0026054326444864273, "learning_rate": 3.085905214103579e-06, "loss": 0.0001, "step": 20372 }, { "epoch": 5.374356944994064, "grad_norm": 0.03711225092411041, "learning_rate": 3.0855535039127764e-06, "loss": 0.0002, "step": 20374 }, { "epoch": 5.374884579870729, "grad_norm": 0.013240784406661987, "learning_rate": 3.085201793721973e-06, "loss": 0.0004, "step": 20376 }, { "epoch": 5.375412214747395, "grad_norm": 0.004660879261791706, "learning_rate": 3.0848500835311703e-06, "loss": 0.0001, "step": 20378 }, { "epoch": 5.37593984962406, "grad_norm": 0.0012045089388266206, "learning_rate": 3.0844983733403677e-06, "loss": 0.0001, "step": 20380 }, { "epoch": 5.376467484500726, "grad_norm": 0.017923539504408836, "learning_rate": 3.0841466631495646e-06, "loss": 0.0001, "step": 20382 }, { "epoch": 5.376995119377391, "grad_norm": 0.06484434753656387, "learning_rate": 3.083794952958762e-06, "loss": 0.0002, "step": 20384 }, { "epoch": 5.3775227542540565, "grad_norm": 0.2524901032447815, "learning_rate": 3.0834432427679594e-06, "loss": 0.0012, "step": 20386 }, { "epoch": 5.378050389130721, "grad_norm": 0.19258280098438263, "learning_rate": 3.0830915325771568e-06, "loss": 0.0005, "step": 20388 }, { "epoch": 5.378578024007387, "grad_norm": 0.012544971890747547, "learning_rate": 3.0827398223863533e-06, "loss": 0.0013, "step": 20390 }, { "epoch": 5.379105658884052, "grad_norm": 0.004984042141586542, "learning_rate": 3.0823881121955507e-06, "loss": 0.0002, "step": 20392 }, { "epoch": 5.379633293760717, "grad_norm": 0.0023434790782630444, "learning_rate": 3.082036402004748e-06, "loss": 0.0001, "step": 20394 }, { "epoch": 5.380160928637383, "grad_norm": 2.2432355880737305, "learning_rate": 3.0816846918139454e-06, "loss": 0.0075, "step": 20396 }, { "epoch": 5.380688563514048, "grad_norm": 0.001783018815331161, "learning_rate": 3.0813329816231424e-06, "loss": 0.0001, "step": 20398 }, { "epoch": 5.381216198390714, "grad_norm": 0.047675587236881256, "learning_rate": 3.0809812714323398e-06, "loss": 0.0003, "step": 20400 }, { "epoch": 5.381743833267379, "grad_norm": 0.027589017525315285, "learning_rate": 3.080629561241537e-06, "loss": 0.0002, "step": 20402 }, { "epoch": 5.382271468144045, "grad_norm": 0.0023991221096366644, "learning_rate": 3.0802778510507345e-06, "loss": 0.0001, "step": 20404 }, { "epoch": 5.38279910302071, "grad_norm": 0.0011617058189585805, "learning_rate": 3.0799261408599315e-06, "loss": 0.0002, "step": 20406 }, { "epoch": 5.3833267378973755, "grad_norm": 0.05007045716047287, "learning_rate": 3.079574430669129e-06, "loss": 0.0002, "step": 20408 }, { "epoch": 5.38385437277404, "grad_norm": 0.002018372993916273, "learning_rate": 3.079222720478326e-06, "loss": 0.0001, "step": 20410 }, { "epoch": 5.384382007650705, "grad_norm": 0.4333343505859375, "learning_rate": 3.0788710102875236e-06, "loss": 0.0008, "step": 20412 }, { "epoch": 5.384909642527371, "grad_norm": 0.0026064745616167784, "learning_rate": 3.07851930009672e-06, "loss": 0.001, "step": 20414 }, { "epoch": 5.385437277404036, "grad_norm": 0.0033575824927538633, "learning_rate": 3.0781675899059175e-06, "loss": 0.0001, "step": 20416 }, { "epoch": 5.385964912280702, "grad_norm": 0.005590933840721846, "learning_rate": 3.077815879715115e-06, "loss": 0.0034, "step": 20418 }, { "epoch": 5.386492547157367, "grad_norm": 0.5356491804122925, "learning_rate": 3.077464169524312e-06, "loss": 0.0034, "step": 20420 }, { "epoch": 5.387020182034032, "grad_norm": 0.001814538729377091, "learning_rate": 3.077112459333509e-06, "loss": 0.0042, "step": 20422 }, { "epoch": 5.387547816910698, "grad_norm": 0.0014554557856172323, "learning_rate": 3.0767607491427066e-06, "loss": 0.0008, "step": 20424 }, { "epoch": 5.388075451787363, "grad_norm": 0.00572094926610589, "learning_rate": 3.076409038951904e-06, "loss": 0.0002, "step": 20426 }, { "epoch": 5.388603086664029, "grad_norm": 0.29220250248908997, "learning_rate": 3.0760573287611005e-06, "loss": 0.0051, "step": 20428 }, { "epoch": 5.389130721540694, "grad_norm": 0.23213639855384827, "learning_rate": 3.0757056185702983e-06, "loss": 0.0008, "step": 20430 }, { "epoch": 5.3896583564173595, "grad_norm": 0.002450974192470312, "learning_rate": 3.0753539083794956e-06, "loss": 0.0003, "step": 20432 }, { "epoch": 5.390185991294024, "grad_norm": 0.0017570351483300328, "learning_rate": 3.075002198188693e-06, "loss": 0.0001, "step": 20434 }, { "epoch": 5.39071362617069, "grad_norm": 0.017312871292233467, "learning_rate": 3.0746504879978896e-06, "loss": 0.0002, "step": 20436 }, { "epoch": 5.391241261047355, "grad_norm": 0.0022050966508686543, "learning_rate": 3.074298777807087e-06, "loss": 0.0029, "step": 20438 }, { "epoch": 5.39176889592402, "grad_norm": 0.03146763890981674, "learning_rate": 3.0739470676162843e-06, "loss": 0.0007, "step": 20440 }, { "epoch": 5.392296530800686, "grad_norm": 0.027644352987408638, "learning_rate": 3.0735953574254817e-06, "loss": 0.0002, "step": 20442 }, { "epoch": 5.392824165677351, "grad_norm": 0.004534624516963959, "learning_rate": 3.0732436472346786e-06, "loss": 0.0001, "step": 20444 }, { "epoch": 5.393351800554017, "grad_norm": 0.0015582925407215953, "learning_rate": 3.072891937043876e-06, "loss": 0.0001, "step": 20446 }, { "epoch": 5.393879435430682, "grad_norm": 0.001934381783939898, "learning_rate": 3.0725402268530734e-06, "loss": 0.0001, "step": 20448 }, { "epoch": 5.394407070307348, "grad_norm": 0.004559091757982969, "learning_rate": 3.07218851666227e-06, "loss": 0.0001, "step": 20450 }, { "epoch": 5.394934705184013, "grad_norm": 0.03587094694375992, "learning_rate": 3.0718368064714673e-06, "loss": 0.0002, "step": 20452 }, { "epoch": 5.395462340060678, "grad_norm": 0.0032401399221271276, "learning_rate": 3.0714850962806647e-06, "loss": 0.0001, "step": 20454 }, { "epoch": 5.3959899749373434, "grad_norm": 0.10556139796972275, "learning_rate": 3.0711333860898625e-06, "loss": 0.0022, "step": 20456 }, { "epoch": 5.396517609814008, "grad_norm": 0.0016808506334200501, "learning_rate": 3.070781675899059e-06, "loss": 0.0003, "step": 20458 }, { "epoch": 5.397045244690674, "grad_norm": 0.00540305208414793, "learning_rate": 3.0704299657082564e-06, "loss": 0.003, "step": 20460 }, { "epoch": 5.397572879567339, "grad_norm": 0.3270147144794464, "learning_rate": 3.0700782555174537e-06, "loss": 0.0028, "step": 20462 }, { "epoch": 5.398100514444005, "grad_norm": 0.0016684600850567222, "learning_rate": 3.069726545326651e-06, "loss": 0.0015, "step": 20464 }, { "epoch": 5.39862814932067, "grad_norm": 0.016856499016284943, "learning_rate": 3.069374835135848e-06, "loss": 0.0001, "step": 20466 }, { "epoch": 5.399155784197335, "grad_norm": 0.12046240270137787, "learning_rate": 3.0690231249450455e-06, "loss": 0.0135, "step": 20468 }, { "epoch": 5.399683419074001, "grad_norm": 0.0028506340458989143, "learning_rate": 3.068671414754243e-06, "loss": 0.0003, "step": 20470 }, { "epoch": 5.400211053950666, "grad_norm": 0.0009935337584465742, "learning_rate": 3.06831970456344e-06, "loss": 0.0065, "step": 20472 }, { "epoch": 5.400738688827332, "grad_norm": 0.08008608967065811, "learning_rate": 3.0679679943726367e-06, "loss": 0.0003, "step": 20474 }, { "epoch": 5.401266323703997, "grad_norm": 0.0022265627048909664, "learning_rate": 3.067616284181834e-06, "loss": 0.0001, "step": 20476 }, { "epoch": 5.4017939585806625, "grad_norm": 0.0038936310447752476, "learning_rate": 3.0672645739910315e-06, "loss": 0.0001, "step": 20478 }, { "epoch": 5.402321593457327, "grad_norm": 0.0069981231354177, "learning_rate": 3.0669128638002284e-06, "loss": 0.0001, "step": 20480 }, { "epoch": 5.402849228333993, "grad_norm": 0.010941320098936558, "learning_rate": 3.066561153609426e-06, "loss": 0.013, "step": 20482 }, { "epoch": 5.403376863210658, "grad_norm": 0.07852569222450256, "learning_rate": 3.066209443418623e-06, "loss": 0.0028, "step": 20484 }, { "epoch": 5.403904498087323, "grad_norm": 0.005672744940966368, "learning_rate": 3.0658577332278206e-06, "loss": 0.0001, "step": 20486 }, { "epoch": 5.404432132963989, "grad_norm": 0.7254163026809692, "learning_rate": 3.0655060230370175e-06, "loss": 0.0033, "step": 20488 }, { "epoch": 5.404959767840654, "grad_norm": 0.08455609530210495, "learning_rate": 3.065154312846215e-06, "loss": 0.0003, "step": 20490 }, { "epoch": 5.40548740271732, "grad_norm": 0.10133984684944153, "learning_rate": 3.0648026026554123e-06, "loss": 0.0005, "step": 20492 }, { "epoch": 5.406015037593985, "grad_norm": 0.28451091051101685, "learning_rate": 3.0644508924646096e-06, "loss": 0.0118, "step": 20494 }, { "epoch": 5.406542672470651, "grad_norm": 0.0019343041349202394, "learning_rate": 3.064099182273806e-06, "loss": 0.0003, "step": 20496 }, { "epoch": 5.407070307347316, "grad_norm": 0.08445724844932556, "learning_rate": 3.0637474720830035e-06, "loss": 0.0003, "step": 20498 }, { "epoch": 5.407597942223981, "grad_norm": 0.1397736817598343, "learning_rate": 3.063395761892201e-06, "loss": 0.0004, "step": 20500 }, { "epoch": 5.4081255771006465, "grad_norm": 0.03134920448064804, "learning_rate": 3.0630440517013983e-06, "loss": 0.0002, "step": 20502 }, { "epoch": 5.408653211977311, "grad_norm": 0.10801135748624802, "learning_rate": 3.0626923415105953e-06, "loss": 0.0003, "step": 20504 }, { "epoch": 5.409180846853977, "grad_norm": 0.12240474671125412, "learning_rate": 3.0623406313197926e-06, "loss": 0.002, "step": 20506 }, { "epoch": 5.409708481730642, "grad_norm": 0.03657786175608635, "learning_rate": 3.06198892112899e-06, "loss": 0.0017, "step": 20508 }, { "epoch": 5.410236116607308, "grad_norm": 0.019792253151535988, "learning_rate": 3.0616372109381865e-06, "loss": 0.0003, "step": 20510 }, { "epoch": 5.410763751483973, "grad_norm": 0.003797851735725999, "learning_rate": 3.061285500747384e-06, "loss": 0.0055, "step": 20512 }, { "epoch": 5.411291386360638, "grad_norm": 0.01616065762937069, "learning_rate": 3.0609337905565817e-06, "loss": 0.0001, "step": 20514 }, { "epoch": 5.411819021237304, "grad_norm": 0.035941921174526215, "learning_rate": 3.060582080365779e-06, "loss": 0.0003, "step": 20516 }, { "epoch": 5.412346656113969, "grad_norm": 0.0029312854167073965, "learning_rate": 3.0602303701749756e-06, "loss": 0.0001, "step": 20518 }, { "epoch": 5.412874290990635, "grad_norm": 0.09466966241598129, "learning_rate": 3.059878659984173e-06, "loss": 0.0003, "step": 20520 }, { "epoch": 5.4134019258673, "grad_norm": 0.033459678292274475, "learning_rate": 3.0595269497933704e-06, "loss": 0.0002, "step": 20522 }, { "epoch": 5.4139295607439655, "grad_norm": 0.01148197427392006, "learning_rate": 3.0591752396025677e-06, "loss": 0.0002, "step": 20524 }, { "epoch": 5.41445719562063, "grad_norm": 0.02021436020731926, "learning_rate": 3.0588235294117647e-06, "loss": 0.0002, "step": 20526 }, { "epoch": 5.414984830497296, "grad_norm": 0.12399943917989731, "learning_rate": 3.058471819220962e-06, "loss": 0.0006, "step": 20528 }, { "epoch": 5.415512465373961, "grad_norm": 0.3497781753540039, "learning_rate": 3.0581201090301594e-06, "loss": 0.0013, "step": 20530 }, { "epoch": 5.416040100250626, "grad_norm": 0.0038834980223327875, "learning_rate": 3.057768398839357e-06, "loss": 0.0001, "step": 20532 }, { "epoch": 5.416567735127292, "grad_norm": 0.028344394639134407, "learning_rate": 3.0574166886485533e-06, "loss": 0.0002, "step": 20534 }, { "epoch": 5.417095370003957, "grad_norm": 0.007197970990091562, "learning_rate": 3.0570649784577507e-06, "loss": 0.0001, "step": 20536 }, { "epoch": 5.417623004880623, "grad_norm": 0.004523738753050566, "learning_rate": 3.056713268266948e-06, "loss": 0.0001, "step": 20538 }, { "epoch": 5.418150639757288, "grad_norm": 0.5345818996429443, "learning_rate": 3.056361558076145e-06, "loss": 0.0067, "step": 20540 }, { "epoch": 5.418678274633954, "grad_norm": 0.00951245054602623, "learning_rate": 3.0560098478853424e-06, "loss": 0.0001, "step": 20542 }, { "epoch": 5.419205909510619, "grad_norm": 0.25601041316986084, "learning_rate": 3.05565813769454e-06, "loss": 0.0027, "step": 20544 }, { "epoch": 5.419733544387284, "grad_norm": 0.3491314649581909, "learning_rate": 3.055306427503737e-06, "loss": 0.004, "step": 20546 }, { "epoch": 5.4202611792639495, "grad_norm": 0.013488521799445152, "learning_rate": 3.054954717312934e-06, "loss": 0.0001, "step": 20548 }, { "epoch": 5.420788814140614, "grad_norm": 0.2650526165962219, "learning_rate": 3.0546030071221315e-06, "loss": 0.0015, "step": 20550 }, { "epoch": 5.42131644901728, "grad_norm": 0.03122851252555847, "learning_rate": 3.054251296931329e-06, "loss": 0.0009, "step": 20552 }, { "epoch": 5.421844083893945, "grad_norm": 0.11530479788780212, "learning_rate": 3.0538995867405263e-06, "loss": 0.0003, "step": 20554 }, { "epoch": 5.422371718770611, "grad_norm": 0.6077573895454407, "learning_rate": 3.053547876549723e-06, "loss": 0.0006, "step": 20556 }, { "epoch": 5.422899353647276, "grad_norm": 0.08930092304944992, "learning_rate": 3.05319616635892e-06, "loss": 0.0005, "step": 20558 }, { "epoch": 5.423426988523941, "grad_norm": 0.07201112061738968, "learning_rate": 3.0528444561681175e-06, "loss": 0.0004, "step": 20560 }, { "epoch": 5.423954623400607, "grad_norm": 0.08099175244569778, "learning_rate": 3.052492745977315e-06, "loss": 0.0015, "step": 20562 }, { "epoch": 5.424482258277272, "grad_norm": 0.20252825319766998, "learning_rate": 3.052141035786512e-06, "loss": 0.0006, "step": 20564 }, { "epoch": 5.425009893153938, "grad_norm": 0.03334122151136398, "learning_rate": 3.0517893255957092e-06, "loss": 0.0002, "step": 20566 }, { "epoch": 5.425537528030603, "grad_norm": 0.26017823815345764, "learning_rate": 3.0514376154049066e-06, "loss": 0.0006, "step": 20568 }, { "epoch": 5.4260651629072685, "grad_norm": 0.0034816537518054247, "learning_rate": 3.051085905214103e-06, "loss": 0.0001, "step": 20570 }, { "epoch": 5.426592797783933, "grad_norm": 0.04816442355513573, "learning_rate": 3.050734195023301e-06, "loss": 0.0002, "step": 20572 }, { "epoch": 5.427120432660599, "grad_norm": 0.017500566318631172, "learning_rate": 3.0503824848324983e-06, "loss": 0.0002, "step": 20574 }, { "epoch": 5.427648067537264, "grad_norm": 0.006937580648809671, "learning_rate": 3.0500307746416957e-06, "loss": 0.0001, "step": 20576 }, { "epoch": 5.428175702413929, "grad_norm": 0.02624279446899891, "learning_rate": 3.0496790644508922e-06, "loss": 0.0002, "step": 20578 }, { "epoch": 5.428703337290595, "grad_norm": 0.015599487349390984, "learning_rate": 3.0493273542600896e-06, "loss": 0.0001, "step": 20580 }, { "epoch": 5.42923097216726, "grad_norm": 0.0011761690257117152, "learning_rate": 3.048975644069287e-06, "loss": 0.0015, "step": 20582 }, { "epoch": 5.429758607043926, "grad_norm": 0.0015391623601317406, "learning_rate": 3.0486239338784844e-06, "loss": 0.0003, "step": 20584 }, { "epoch": 5.430286241920591, "grad_norm": 0.001375610358081758, "learning_rate": 3.0482722236876813e-06, "loss": 0.0001, "step": 20586 }, { "epoch": 5.430813876797257, "grad_norm": 0.024820467457175255, "learning_rate": 3.0479205134968787e-06, "loss": 0.0002, "step": 20588 }, { "epoch": 5.431341511673922, "grad_norm": 0.0197658259421587, "learning_rate": 3.047568803306076e-06, "loss": 0.0004, "step": 20590 }, { "epoch": 5.431869146550587, "grad_norm": 0.004582608584314585, "learning_rate": 3.0472170931152734e-06, "loss": 0.0001, "step": 20592 }, { "epoch": 5.4323967814272525, "grad_norm": 0.009465600363910198, "learning_rate": 3.04686538292447e-06, "loss": 0.0001, "step": 20594 }, { "epoch": 5.432924416303917, "grad_norm": 0.01724856160581112, "learning_rate": 3.0465136727336673e-06, "loss": 0.0006, "step": 20596 }, { "epoch": 5.433452051180583, "grad_norm": 0.0027561120223253965, "learning_rate": 3.046161962542865e-06, "loss": 0.0001, "step": 20598 }, { "epoch": 5.433979686057248, "grad_norm": 0.05431057885289192, "learning_rate": 3.0458102523520617e-06, "loss": 0.0008, "step": 20600 }, { "epoch": 5.434507320933914, "grad_norm": 0.0021028639748692513, "learning_rate": 3.045458542161259e-06, "loss": 0.0001, "step": 20602 }, { "epoch": 5.435034955810579, "grad_norm": 0.00419158861041069, "learning_rate": 3.0451068319704564e-06, "loss": 0.0001, "step": 20604 }, { "epoch": 5.435562590687244, "grad_norm": 0.15155234932899475, "learning_rate": 3.044755121779654e-06, "loss": 0.0012, "step": 20606 }, { "epoch": 5.43609022556391, "grad_norm": 0.0015270705334842205, "learning_rate": 3.0444034115888507e-06, "loss": 0.0001, "step": 20608 }, { "epoch": 5.436617860440575, "grad_norm": 0.0031025295611470938, "learning_rate": 3.044051701398048e-06, "loss": 0.0001, "step": 20610 }, { "epoch": 5.437145495317241, "grad_norm": 0.0036258636973798275, "learning_rate": 3.0436999912072455e-06, "loss": 0.0003, "step": 20612 }, { "epoch": 5.437673130193906, "grad_norm": 0.11915095895528793, "learning_rate": 3.043348281016443e-06, "loss": 0.0032, "step": 20614 }, { "epoch": 5.4382007650705715, "grad_norm": 0.007201820146292448, "learning_rate": 3.0429965708256394e-06, "loss": 0.0009, "step": 20616 }, { "epoch": 5.4387283999472364, "grad_norm": 0.0040985168889164925, "learning_rate": 3.0426448606348368e-06, "loss": 0.0001, "step": 20618 }, { "epoch": 5.439256034823902, "grad_norm": 0.010180237703025341, "learning_rate": 3.042293150444034e-06, "loss": 0.0001, "step": 20620 }, { "epoch": 5.439783669700567, "grad_norm": 0.0027765866834670305, "learning_rate": 3.0419414402532315e-06, "loss": 0.0001, "step": 20622 }, { "epoch": 5.440311304577232, "grad_norm": 0.0014015543274581432, "learning_rate": 3.0415897300624285e-06, "loss": 0.0001, "step": 20624 }, { "epoch": 5.440838939453898, "grad_norm": 0.0019308411283418536, "learning_rate": 3.041238019871626e-06, "loss": 0.0001, "step": 20626 }, { "epoch": 5.441366574330563, "grad_norm": 0.3051851689815521, "learning_rate": 3.0408863096808232e-06, "loss": 0.0072, "step": 20628 }, { "epoch": 5.441894209207229, "grad_norm": 0.049596935510635376, "learning_rate": 3.04053459949002e-06, "loss": 0.0002, "step": 20630 }, { "epoch": 5.442421844083894, "grad_norm": 0.09117928892374039, "learning_rate": 3.0401828892992176e-06, "loss": 0.003, "step": 20632 }, { "epoch": 5.44294947896056, "grad_norm": 0.015215469524264336, "learning_rate": 3.039831179108415e-06, "loss": 0.0008, "step": 20634 }, { "epoch": 5.443477113837225, "grad_norm": 0.004500707145780325, "learning_rate": 3.0394794689176123e-06, "loss": 0.0001, "step": 20636 }, { "epoch": 5.44400474871389, "grad_norm": 0.017898304387927055, "learning_rate": 3.039127758726809e-06, "loss": 0.0018, "step": 20638 }, { "epoch": 5.4445323835905555, "grad_norm": 0.004978294484317303, "learning_rate": 3.0387760485360062e-06, "loss": 0.0001, "step": 20640 }, { "epoch": 5.44506001846722, "grad_norm": 0.006934655364602804, "learning_rate": 3.0384243383452036e-06, "loss": 0.0001, "step": 20642 }, { "epoch": 5.445587653343886, "grad_norm": 0.028180664405226707, "learning_rate": 3.038072628154401e-06, "loss": 0.0002, "step": 20644 }, { "epoch": 5.446115288220551, "grad_norm": 0.010535548441112041, "learning_rate": 3.037720917963598e-06, "loss": 0.0001, "step": 20646 }, { "epoch": 5.446642923097217, "grad_norm": 0.02023519016802311, "learning_rate": 3.0373692077727953e-06, "loss": 0.0018, "step": 20648 }, { "epoch": 5.447170557973882, "grad_norm": 0.8781350255012512, "learning_rate": 3.0370174975819927e-06, "loss": 0.0044, "step": 20650 }, { "epoch": 5.447698192850547, "grad_norm": 0.38413819670677185, "learning_rate": 3.03666578739119e-06, "loss": 0.0026, "step": 20652 }, { "epoch": 5.448225827727213, "grad_norm": 0.002758760703727603, "learning_rate": 3.0363140772003866e-06, "loss": 0.0002, "step": 20654 }, { "epoch": 5.448753462603878, "grad_norm": 0.021910840645432472, "learning_rate": 3.0359623670095844e-06, "loss": 0.0021, "step": 20656 }, { "epoch": 5.449281097480544, "grad_norm": 0.00794446375221014, "learning_rate": 3.0356106568187818e-06, "loss": 0.0001, "step": 20658 }, { "epoch": 5.449808732357209, "grad_norm": 0.011399188078939915, "learning_rate": 3.0352589466279783e-06, "loss": 0.0023, "step": 20660 }, { "epoch": 5.4503363672338745, "grad_norm": 0.01406603679060936, "learning_rate": 3.0349072364371757e-06, "loss": 0.0082, "step": 20662 }, { "epoch": 5.4508640021105395, "grad_norm": 0.06762347370386124, "learning_rate": 3.034555526246373e-06, "loss": 0.0029, "step": 20664 }, { "epoch": 5.451391636987205, "grad_norm": 0.05254707485437393, "learning_rate": 3.0342038160555704e-06, "loss": 0.0004, "step": 20666 }, { "epoch": 5.45191927186387, "grad_norm": 0.2668738067150116, "learning_rate": 3.0338521058647674e-06, "loss": 0.0007, "step": 20668 }, { "epoch": 5.452446906740535, "grad_norm": 0.03798529878258705, "learning_rate": 3.0335003956739647e-06, "loss": 0.0002, "step": 20670 }, { "epoch": 5.452974541617201, "grad_norm": 0.013930262066423893, "learning_rate": 3.033148685483162e-06, "loss": 0.0002, "step": 20672 }, { "epoch": 5.453502176493866, "grad_norm": 0.15129806101322174, "learning_rate": 3.0327969752923595e-06, "loss": 0.0011, "step": 20674 }, { "epoch": 5.454029811370532, "grad_norm": 0.01677946373820305, "learning_rate": 3.032445265101556e-06, "loss": 0.0004, "step": 20676 }, { "epoch": 5.454557446247197, "grad_norm": 0.008749553933739662, "learning_rate": 3.0320935549107534e-06, "loss": 0.0002, "step": 20678 }, { "epoch": 5.455085081123863, "grad_norm": 0.012952227145433426, "learning_rate": 3.0317418447199508e-06, "loss": 0.0001, "step": 20680 }, { "epoch": 5.455612716000528, "grad_norm": 0.017104730010032654, "learning_rate": 3.0313901345291486e-06, "loss": 0.0001, "step": 20682 }, { "epoch": 5.456140350877193, "grad_norm": 0.006474662572145462, "learning_rate": 3.031038424338345e-06, "loss": 0.0001, "step": 20684 }, { "epoch": 5.4566679857538585, "grad_norm": 0.12039674073457718, "learning_rate": 3.0306867141475425e-06, "loss": 0.0027, "step": 20686 }, { "epoch": 5.457195620630523, "grad_norm": 0.006974806077778339, "learning_rate": 3.03033500395674e-06, "loss": 0.0001, "step": 20688 }, { "epoch": 5.457723255507189, "grad_norm": 0.026057127863168716, "learning_rate": 3.029983293765937e-06, "loss": 0.0006, "step": 20690 }, { "epoch": 5.458250890383854, "grad_norm": 0.001982756657525897, "learning_rate": 3.029631583575134e-06, "loss": 0.0001, "step": 20692 }, { "epoch": 5.45877852526052, "grad_norm": 0.036394473165273666, "learning_rate": 3.0292798733843316e-06, "loss": 0.0022, "step": 20694 }, { "epoch": 5.459306160137185, "grad_norm": 0.002873908029869199, "learning_rate": 3.028928163193529e-06, "loss": 0.0004, "step": 20696 }, { "epoch": 5.45983379501385, "grad_norm": 0.37617698311805725, "learning_rate": 3.0285764530027255e-06, "loss": 0.0048, "step": 20698 }, { "epoch": 5.460361429890516, "grad_norm": 0.065971240401268, "learning_rate": 3.028224742811923e-06, "loss": 0.0006, "step": 20700 }, { "epoch": 5.460889064767181, "grad_norm": 0.0415046289563179, "learning_rate": 3.0278730326211202e-06, "loss": 0.0003, "step": 20702 }, { "epoch": 5.461416699643847, "grad_norm": 0.06042175740003586, "learning_rate": 3.0275213224303176e-06, "loss": 0.0002, "step": 20704 }, { "epoch": 5.461944334520512, "grad_norm": 0.029945140704512596, "learning_rate": 3.0271696122395145e-06, "loss": 0.001, "step": 20706 }, { "epoch": 5.4624719693971775, "grad_norm": 0.06796055287122726, "learning_rate": 3.026817902048712e-06, "loss": 0.003, "step": 20708 }, { "epoch": 5.4629996042738425, "grad_norm": 0.3267383873462677, "learning_rate": 3.0264661918579093e-06, "loss": 0.0006, "step": 20710 }, { "epoch": 5.463527239150508, "grad_norm": 0.006170423235744238, "learning_rate": 3.0261144816671067e-06, "loss": 0.0002, "step": 20712 }, { "epoch": 5.464054874027173, "grad_norm": 0.0869658887386322, "learning_rate": 3.0257627714763036e-06, "loss": 0.0006, "step": 20714 }, { "epoch": 5.464582508903838, "grad_norm": 0.007145601790398359, "learning_rate": 3.025411061285501e-06, "loss": 0.0002, "step": 20716 }, { "epoch": 5.465110143780504, "grad_norm": 0.008183090016245842, "learning_rate": 3.0250593510946984e-06, "loss": 0.0001, "step": 20718 }, { "epoch": 5.465637778657169, "grad_norm": 0.016515057533979416, "learning_rate": 3.024707640903895e-06, "loss": 0.0002, "step": 20720 }, { "epoch": 5.466165413533835, "grad_norm": 0.2956833243370056, "learning_rate": 3.0243559307130923e-06, "loss": 0.0006, "step": 20722 }, { "epoch": 5.4666930484105, "grad_norm": 0.038135454058647156, "learning_rate": 3.0240042205222897e-06, "loss": 0.0034, "step": 20724 }, { "epoch": 5.467220683287165, "grad_norm": 0.0817229151725769, "learning_rate": 3.023652510331487e-06, "loss": 0.0002, "step": 20726 }, { "epoch": 5.467748318163831, "grad_norm": 0.03525024279952049, "learning_rate": 3.023300800140684e-06, "loss": 0.0056, "step": 20728 }, { "epoch": 5.468275953040496, "grad_norm": 0.0021740663796663284, "learning_rate": 3.0229490899498814e-06, "loss": 0.0002, "step": 20730 }, { "epoch": 5.4688035879171615, "grad_norm": 0.011636283248662949, "learning_rate": 3.0225973797590787e-06, "loss": 0.0024, "step": 20732 }, { "epoch": 5.469331222793826, "grad_norm": 0.008619134314358234, "learning_rate": 3.022245669568276e-06, "loss": 0.0004, "step": 20734 }, { "epoch": 5.469858857670492, "grad_norm": 0.0010369623778387904, "learning_rate": 3.0218939593774726e-06, "loss": 0.0001, "step": 20736 }, { "epoch": 5.470386492547157, "grad_norm": 0.0043241954408586025, "learning_rate": 3.02154224918667e-06, "loss": 0.0139, "step": 20738 }, { "epoch": 5.470914127423823, "grad_norm": 0.003235871670767665, "learning_rate": 3.021190538995868e-06, "loss": 0.0001, "step": 20740 }, { "epoch": 5.471441762300488, "grad_norm": 0.4333297610282898, "learning_rate": 3.020838828805065e-06, "loss": 0.0028, "step": 20742 }, { "epoch": 5.471969397177153, "grad_norm": 0.01567106693983078, "learning_rate": 3.0204871186142617e-06, "loss": 0.0001, "step": 20744 }, { "epoch": 5.472497032053819, "grad_norm": 0.023544445633888245, "learning_rate": 3.020135408423459e-06, "loss": 0.0002, "step": 20746 }, { "epoch": 5.473024666930484, "grad_norm": 0.005935209803283215, "learning_rate": 3.0197836982326565e-06, "loss": 0.0001, "step": 20748 }, { "epoch": 5.47355230180715, "grad_norm": 0.0019756348337978125, "learning_rate": 3.0194319880418534e-06, "loss": 0.0001, "step": 20750 }, { "epoch": 5.474079936683815, "grad_norm": 0.13048331439495087, "learning_rate": 3.019080277851051e-06, "loss": 0.0002, "step": 20752 }, { "epoch": 5.4746075715604805, "grad_norm": 0.0028739424888044596, "learning_rate": 3.018728567660248e-06, "loss": 0.0001, "step": 20754 }, { "epoch": 5.4751352064371455, "grad_norm": 0.009052664041519165, "learning_rate": 3.0183768574694456e-06, "loss": 0.0001, "step": 20756 }, { "epoch": 5.475662841313811, "grad_norm": 0.006708852481096983, "learning_rate": 3.018025147278642e-06, "loss": 0.0001, "step": 20758 }, { "epoch": 5.476190476190476, "grad_norm": 0.016997961327433586, "learning_rate": 3.0176734370878395e-06, "loss": 0.0002, "step": 20760 }, { "epoch": 5.476718111067141, "grad_norm": 0.03005141206085682, "learning_rate": 3.017321726897037e-06, "loss": 0.0004, "step": 20762 }, { "epoch": 5.477245745943807, "grad_norm": 0.0021095150150358677, "learning_rate": 3.016970016706234e-06, "loss": 0.0001, "step": 20764 }, { "epoch": 5.477773380820472, "grad_norm": 0.0011756432941183448, "learning_rate": 3.016618306515431e-06, "loss": 0.0001, "step": 20766 }, { "epoch": 5.478301015697138, "grad_norm": 0.12149126827716827, "learning_rate": 3.0162665963246285e-06, "loss": 0.0005, "step": 20768 }, { "epoch": 5.478828650573803, "grad_norm": 0.2778417468070984, "learning_rate": 3.015914886133826e-06, "loss": 0.0037, "step": 20770 }, { "epoch": 5.479356285450468, "grad_norm": 0.005958411376923323, "learning_rate": 3.0155631759430233e-06, "loss": 0.0002, "step": 20772 }, { "epoch": 5.479883920327134, "grad_norm": 0.0025840867310762405, "learning_rate": 3.0152114657522202e-06, "loss": 0.0001, "step": 20774 }, { "epoch": 5.480411555203799, "grad_norm": 0.0028656336944550276, "learning_rate": 3.0148597555614176e-06, "loss": 0.0016, "step": 20776 }, { "epoch": 5.4809391900804645, "grad_norm": 0.002275855280458927, "learning_rate": 3.014508045370615e-06, "loss": 0.0001, "step": 20778 }, { "epoch": 5.4814668249571294, "grad_norm": 0.03478701412677765, "learning_rate": 3.0141563351798115e-06, "loss": 0.0021, "step": 20780 }, { "epoch": 5.481994459833795, "grad_norm": 0.00566618237644434, "learning_rate": 3.013804624989009e-06, "loss": 0.0003, "step": 20782 }, { "epoch": 5.48252209471046, "grad_norm": 0.606983482837677, "learning_rate": 3.0134529147982063e-06, "loss": 0.0024, "step": 20784 }, { "epoch": 5.483049729587126, "grad_norm": 0.012623145245015621, "learning_rate": 3.0131012046074036e-06, "loss": 0.0126, "step": 20786 }, { "epoch": 5.483577364463791, "grad_norm": 0.0033581301104277372, "learning_rate": 3.0127494944166006e-06, "loss": 0.0001, "step": 20788 }, { "epoch": 5.484104999340456, "grad_norm": 0.006332591641694307, "learning_rate": 3.012397784225798e-06, "loss": 0.0034, "step": 20790 }, { "epoch": 5.484632634217122, "grad_norm": 0.059007804840803146, "learning_rate": 3.0120460740349954e-06, "loss": 0.0003, "step": 20792 }, { "epoch": 5.485160269093787, "grad_norm": 0.016920488327741623, "learning_rate": 3.0116943638441927e-06, "loss": 0.0006, "step": 20794 }, { "epoch": 5.485687903970453, "grad_norm": 0.00433393707498908, "learning_rate": 3.0113426536533893e-06, "loss": 0.0001, "step": 20796 }, { "epoch": 5.486215538847118, "grad_norm": 0.02339932695031166, "learning_rate": 3.010990943462587e-06, "loss": 0.0001, "step": 20798 }, { "epoch": 5.4867431737237835, "grad_norm": 0.04345498979091644, "learning_rate": 3.0106392332717844e-06, "loss": 0.0002, "step": 20800 }, { "epoch": 5.4872708086004485, "grad_norm": 0.002056980738416314, "learning_rate": 3.010287523080982e-06, "loss": 0.0001, "step": 20802 }, { "epoch": 5.487798443477113, "grad_norm": 0.009476852603256702, "learning_rate": 3.0099358128901783e-06, "loss": 0.0002, "step": 20804 }, { "epoch": 5.488326078353779, "grad_norm": 0.2678183615207672, "learning_rate": 3.0095841026993757e-06, "loss": 0.0079, "step": 20806 }, { "epoch": 5.488853713230444, "grad_norm": 0.010982802137732506, "learning_rate": 3.009232392508573e-06, "loss": 0.0001, "step": 20808 }, { "epoch": 5.48938134810711, "grad_norm": 0.0182606503367424, "learning_rate": 3.00888068231777e-06, "loss": 0.0002, "step": 20810 }, { "epoch": 5.489908982983775, "grad_norm": 0.1475614756345749, "learning_rate": 3.0085289721269674e-06, "loss": 0.0005, "step": 20812 }, { "epoch": 5.490436617860441, "grad_norm": 0.03400508686900139, "learning_rate": 3.008177261936165e-06, "loss": 0.0004, "step": 20814 }, { "epoch": 5.490964252737106, "grad_norm": 0.008169295266270638, "learning_rate": 3.007825551745362e-06, "loss": 0.0001, "step": 20816 }, { "epoch": 5.491491887613771, "grad_norm": 0.0026046154089272022, "learning_rate": 3.0074738415545587e-06, "loss": 0.0001, "step": 20818 }, { "epoch": 5.492019522490437, "grad_norm": 0.10414716601371765, "learning_rate": 3.007122131363756e-06, "loss": 0.0003, "step": 20820 }, { "epoch": 5.492547157367102, "grad_norm": 1.4924832582473755, "learning_rate": 3.0067704211729534e-06, "loss": 0.0012, "step": 20822 }, { "epoch": 5.4930747922437675, "grad_norm": 0.001907559228129685, "learning_rate": 3.0064187109821512e-06, "loss": 0.0033, "step": 20824 }, { "epoch": 5.4936024271204325, "grad_norm": 0.016417577862739563, "learning_rate": 3.0060670007913478e-06, "loss": 0.0001, "step": 20826 }, { "epoch": 5.494130061997098, "grad_norm": 0.0031028895173221827, "learning_rate": 3.005715290600545e-06, "loss": 0.0001, "step": 20828 }, { "epoch": 5.494657696873763, "grad_norm": 0.0453183650970459, "learning_rate": 3.0053635804097425e-06, "loss": 0.0019, "step": 20830 }, { "epoch": 5.495185331750429, "grad_norm": 0.012907895259559155, "learning_rate": 3.00501187021894e-06, "loss": 0.0002, "step": 20832 }, { "epoch": 5.495712966627094, "grad_norm": 0.023018231615424156, "learning_rate": 3.004660160028137e-06, "loss": 0.0002, "step": 20834 }, { "epoch": 5.496240601503759, "grad_norm": 0.0102622015401721, "learning_rate": 3.0043084498373342e-06, "loss": 0.0002, "step": 20836 }, { "epoch": 5.496768236380425, "grad_norm": 0.009959963150322437, "learning_rate": 3.0039567396465316e-06, "loss": 0.0001, "step": 20838 }, { "epoch": 5.49729587125709, "grad_norm": 0.0040958477184176445, "learning_rate": 3.003605029455728e-06, "loss": 0.0002, "step": 20840 }, { "epoch": 5.497823506133756, "grad_norm": 0.16757230460643768, "learning_rate": 3.0032533192649255e-06, "loss": 0.0007, "step": 20842 }, { "epoch": 5.498351141010421, "grad_norm": 0.04317134618759155, "learning_rate": 3.002901609074123e-06, "loss": 0.0002, "step": 20844 }, { "epoch": 5.4988787758870865, "grad_norm": 0.003942594397813082, "learning_rate": 3.0025498988833203e-06, "loss": 0.0001, "step": 20846 }, { "epoch": 5.4994064107637515, "grad_norm": 0.012968357652425766, "learning_rate": 3.0021981886925172e-06, "loss": 0.0005, "step": 20848 }, { "epoch": 5.499934045640416, "grad_norm": 0.04594854637980461, "learning_rate": 3.0018464785017146e-06, "loss": 0.0002, "step": 20850 }, { "epoch": 5.500461680517082, "grad_norm": 0.0039009233005344868, "learning_rate": 3.001494768310912e-06, "loss": 0.0001, "step": 20852 }, { "epoch": 5.500989315393747, "grad_norm": 0.016687598079442978, "learning_rate": 3.0011430581201093e-06, "loss": 0.0003, "step": 20854 }, { "epoch": 5.501516950270413, "grad_norm": 1.422896385192871, "learning_rate": 3.0007913479293063e-06, "loss": 0.0031, "step": 20856 }, { "epoch": 5.502044585147078, "grad_norm": 0.006385954096913338, "learning_rate": 3.0004396377385037e-06, "loss": 0.0001, "step": 20858 }, { "epoch": 5.502572220023744, "grad_norm": 0.005094064865261316, "learning_rate": 3.000087927547701e-06, "loss": 0.0002, "step": 20860 }, { "epoch": 5.503099854900409, "grad_norm": 0.010821722447872162, "learning_rate": 2.999736217356898e-06, "loss": 0.0015, "step": 20862 }, { "epoch": 5.503627489777074, "grad_norm": 0.035073209553956985, "learning_rate": 2.9993845071660954e-06, "loss": 0.0002, "step": 20864 }, { "epoch": 5.50415512465374, "grad_norm": 0.007509442511945963, "learning_rate": 2.9990327969752923e-06, "loss": 0.0005, "step": 20866 }, { "epoch": 5.504682759530405, "grad_norm": 0.009658985771238804, "learning_rate": 2.9986810867844897e-06, "loss": 0.0003, "step": 20868 }, { "epoch": 5.5052103944070705, "grad_norm": 0.007172711193561554, "learning_rate": 2.998329376593687e-06, "loss": 0.0001, "step": 20870 }, { "epoch": 5.5057380292837355, "grad_norm": 0.002727760700508952, "learning_rate": 2.997977666402884e-06, "loss": 0.0003, "step": 20872 }, { "epoch": 5.506265664160401, "grad_norm": 0.008233390748500824, "learning_rate": 2.9976259562120814e-06, "loss": 0.0001, "step": 20874 }, { "epoch": 5.506793299037066, "grad_norm": 0.01951010897755623, "learning_rate": 2.9972742460212784e-06, "loss": 0.0001, "step": 20876 }, { "epoch": 5.507320933913732, "grad_norm": 0.003075327957049012, "learning_rate": 2.9969225358304757e-06, "loss": 0.0001, "step": 20878 }, { "epoch": 5.507848568790397, "grad_norm": 0.004037915263324976, "learning_rate": 2.9965708256396727e-06, "loss": 0.0017, "step": 20880 }, { "epoch": 5.508376203667062, "grad_norm": 0.016514647752046585, "learning_rate": 2.9962191154488705e-06, "loss": 0.0001, "step": 20882 }, { "epoch": 5.508903838543728, "grad_norm": 0.0018634608713909984, "learning_rate": 2.9958674052580674e-06, "loss": 0.0001, "step": 20884 }, { "epoch": 5.509431473420393, "grad_norm": 0.08232330530881882, "learning_rate": 2.995515695067265e-06, "loss": 0.0002, "step": 20886 }, { "epoch": 5.509959108297059, "grad_norm": 0.002999513642862439, "learning_rate": 2.9951639848764618e-06, "loss": 0.0001, "step": 20888 }, { "epoch": 5.510486743173724, "grad_norm": 0.01751078851521015, "learning_rate": 2.994812274685659e-06, "loss": 0.0001, "step": 20890 }, { "epoch": 5.5110143780503895, "grad_norm": 0.025785177946090698, "learning_rate": 2.994460564494856e-06, "loss": 0.0004, "step": 20892 }, { "epoch": 5.5115420129270545, "grad_norm": 0.7435897588729858, "learning_rate": 2.9941088543040535e-06, "loss": 0.0014, "step": 20894 }, { "epoch": 5.512069647803719, "grad_norm": 0.0016509384149685502, "learning_rate": 2.993757144113251e-06, "loss": 0.0042, "step": 20896 }, { "epoch": 5.512597282680385, "grad_norm": 0.013509289361536503, "learning_rate": 2.9934054339224482e-06, "loss": 0.0001, "step": 20898 }, { "epoch": 5.51312491755705, "grad_norm": 0.07347624748945236, "learning_rate": 2.993053723731645e-06, "loss": 0.0003, "step": 20900 }, { "epoch": 5.513652552433716, "grad_norm": 0.008831334300339222, "learning_rate": 2.992702013540842e-06, "loss": 0.0004, "step": 20902 }, { "epoch": 5.514180187310381, "grad_norm": 0.1331595778465271, "learning_rate": 2.9923503033500395e-06, "loss": 0.0004, "step": 20904 }, { "epoch": 5.514707822187047, "grad_norm": 0.012649470940232277, "learning_rate": 2.991998593159237e-06, "loss": 0.0001, "step": 20906 }, { "epoch": 5.515235457063712, "grad_norm": 0.006138521246612072, "learning_rate": 2.9916468829684343e-06, "loss": 0.0001, "step": 20908 }, { "epoch": 5.515763091940377, "grad_norm": 0.13611039519309998, "learning_rate": 2.991295172777631e-06, "loss": 0.0003, "step": 20910 }, { "epoch": 5.516290726817043, "grad_norm": 0.004400611389428377, "learning_rate": 2.9909434625868286e-06, "loss": 0.0006, "step": 20912 }, { "epoch": 5.516818361693708, "grad_norm": 0.002185961464419961, "learning_rate": 2.9905917523960255e-06, "loss": 0.0001, "step": 20914 }, { "epoch": 5.5173459965703735, "grad_norm": 0.26105010509490967, "learning_rate": 2.990240042205223e-06, "loss": 0.0007, "step": 20916 }, { "epoch": 5.5178736314470385, "grad_norm": 0.6677675247192383, "learning_rate": 2.9898883320144203e-06, "loss": 0.0038, "step": 20918 }, { "epoch": 5.518401266323704, "grad_norm": 0.020240383222699165, "learning_rate": 2.9895366218236177e-06, "loss": 0.0001, "step": 20920 }, { "epoch": 5.518928901200369, "grad_norm": 0.0014504005666822195, "learning_rate": 2.9891849116328146e-06, "loss": 0.0011, "step": 20922 }, { "epoch": 5.519456536077035, "grad_norm": 0.40896350145339966, "learning_rate": 2.988833201442012e-06, "loss": 0.0012, "step": 20924 }, { "epoch": 5.5199841709537, "grad_norm": 0.0008712951675988734, "learning_rate": 2.988481491251209e-06, "loss": 0.0001, "step": 20926 }, { "epoch": 5.520511805830365, "grad_norm": 0.658231794834137, "learning_rate": 2.9881297810604063e-06, "loss": 0.002, "step": 20928 }, { "epoch": 5.521039440707031, "grad_norm": 0.004954948555678129, "learning_rate": 2.9877780708696037e-06, "loss": 0.0002, "step": 20930 }, { "epoch": 5.521567075583696, "grad_norm": 0.001350316102616489, "learning_rate": 2.9874263606788007e-06, "loss": 0.0001, "step": 20932 }, { "epoch": 5.522094710460362, "grad_norm": 0.13436436653137207, "learning_rate": 2.987074650487998e-06, "loss": 0.001, "step": 20934 }, { "epoch": 5.522622345337027, "grad_norm": 0.07585926353931427, "learning_rate": 2.986722940297195e-06, "loss": 0.0002, "step": 20936 }, { "epoch": 5.523149980213692, "grad_norm": 0.30365636944770813, "learning_rate": 2.9863712301063924e-06, "loss": 0.0009, "step": 20938 }, { "epoch": 5.5236776150903575, "grad_norm": 0.0023154220543801785, "learning_rate": 2.9860195199155897e-06, "loss": 0.0003, "step": 20940 }, { "epoch": 5.5242052499670224, "grad_norm": 0.056415122002363205, "learning_rate": 2.985667809724787e-06, "loss": 0.0002, "step": 20942 }, { "epoch": 5.524732884843688, "grad_norm": 0.011673756875097752, "learning_rate": 2.985316099533984e-06, "loss": 0.0001, "step": 20944 }, { "epoch": 5.525260519720353, "grad_norm": 0.6533709168434143, "learning_rate": 2.9849643893431814e-06, "loss": 0.0047, "step": 20946 }, { "epoch": 5.525788154597019, "grad_norm": 0.003866608487442136, "learning_rate": 2.9846126791523784e-06, "loss": 0.0001, "step": 20948 }, { "epoch": 5.526315789473684, "grad_norm": 0.18139617145061493, "learning_rate": 2.9842609689615758e-06, "loss": 0.0023, "step": 20950 }, { "epoch": 5.52684342435035, "grad_norm": 0.005758008919656277, "learning_rate": 2.9839092587707727e-06, "loss": 0.0001, "step": 20952 }, { "epoch": 5.527371059227015, "grad_norm": 0.0022908432874828577, "learning_rate": 2.9835575485799705e-06, "loss": 0.0001, "step": 20954 }, { "epoch": 5.52789869410368, "grad_norm": 0.006682624574750662, "learning_rate": 2.9832058383891675e-06, "loss": 0.0001, "step": 20956 }, { "epoch": 5.528426328980346, "grad_norm": 0.026575397700071335, "learning_rate": 2.982854128198365e-06, "loss": 0.0002, "step": 20958 }, { "epoch": 5.528953963857011, "grad_norm": 0.14840762317180634, "learning_rate": 2.982502418007562e-06, "loss": 0.0013, "step": 20960 }, { "epoch": 5.5294815987336765, "grad_norm": 0.04423150047659874, "learning_rate": 2.9821507078167587e-06, "loss": 0.0002, "step": 20962 }, { "epoch": 5.5300092336103415, "grad_norm": 0.002954168478026986, "learning_rate": 2.981798997625956e-06, "loss": 0.0001, "step": 20964 }, { "epoch": 5.530536868487007, "grad_norm": 0.03710077702999115, "learning_rate": 2.9814472874351535e-06, "loss": 0.001, "step": 20966 }, { "epoch": 5.531064503363672, "grad_norm": 0.013961507007479668, "learning_rate": 2.981095577244351e-06, "loss": 0.0034, "step": 20968 }, { "epoch": 5.531592138240338, "grad_norm": 0.014701715670526028, "learning_rate": 2.980743867053548e-06, "loss": 0.0006, "step": 20970 }, { "epoch": 5.532119773117003, "grad_norm": 0.004792570602148771, "learning_rate": 2.980392156862745e-06, "loss": 0.0002, "step": 20972 }, { "epoch": 5.532647407993668, "grad_norm": 0.005785131361335516, "learning_rate": 2.980040446671942e-06, "loss": 0.0001, "step": 20974 }, { "epoch": 5.533175042870334, "grad_norm": 0.01603495515882969, "learning_rate": 2.9796887364811395e-06, "loss": 0.0001, "step": 20976 }, { "epoch": 5.533702677746999, "grad_norm": 0.026477010920643806, "learning_rate": 2.979337026290337e-06, "loss": 0.0002, "step": 20978 }, { "epoch": 5.534230312623665, "grad_norm": 0.021571118384599686, "learning_rate": 2.9789853160995343e-06, "loss": 0.0005, "step": 20980 }, { "epoch": 5.53475794750033, "grad_norm": 0.38706180453300476, "learning_rate": 2.9786336059087312e-06, "loss": 0.0044, "step": 20982 }, { "epoch": 5.535285582376995, "grad_norm": 0.007626134902238846, "learning_rate": 2.9782818957179286e-06, "loss": 0.0001, "step": 20984 }, { "epoch": 5.5358132172536605, "grad_norm": 0.2261965125799179, "learning_rate": 2.9779301855271256e-06, "loss": 0.0046, "step": 20986 }, { "epoch": 5.5363408521303255, "grad_norm": 0.013442875817418098, "learning_rate": 2.977578475336323e-06, "loss": 0.0001, "step": 20988 }, { "epoch": 5.536868487006991, "grad_norm": 0.03523557260632515, "learning_rate": 2.9772267651455203e-06, "loss": 0.0001, "step": 20990 }, { "epoch": 5.537396121883656, "grad_norm": 0.004302312154322863, "learning_rate": 2.9768750549547173e-06, "loss": 0.0001, "step": 20992 }, { "epoch": 5.537923756760322, "grad_norm": 0.7217485904693604, "learning_rate": 2.9765233447639146e-06, "loss": 0.0044, "step": 20994 }, { "epoch": 5.538451391636987, "grad_norm": 0.002368980087339878, "learning_rate": 2.9761716345731116e-06, "loss": 0.0001, "step": 20996 }, { "epoch": 5.538979026513653, "grad_norm": 0.033588651567697525, "learning_rate": 2.975819924382309e-06, "loss": 0.0004, "step": 20998 }, { "epoch": 5.539506661390318, "grad_norm": 0.023887882009148598, "learning_rate": 2.9754682141915063e-06, "loss": 0.0003, "step": 21000 }, { "epoch": 5.540034296266983, "grad_norm": 0.003670694539323449, "learning_rate": 2.9751165040007037e-06, "loss": 0.0002, "step": 21002 }, { "epoch": 5.540561931143649, "grad_norm": 1.062401533126831, "learning_rate": 2.9747647938099007e-06, "loss": 0.0061, "step": 21004 }, { "epoch": 5.541089566020314, "grad_norm": 0.013641465455293655, "learning_rate": 2.974413083619098e-06, "loss": 0.0002, "step": 21006 }, { "epoch": 5.5416172008969795, "grad_norm": 0.09879019111394882, "learning_rate": 2.974061373428295e-06, "loss": 0.0004, "step": 21008 }, { "epoch": 5.5421448357736445, "grad_norm": 0.2524011433124542, "learning_rate": 2.9737096632374924e-06, "loss": 0.0006, "step": 21010 }, { "epoch": 5.54267247065031, "grad_norm": 0.4222033619880676, "learning_rate": 2.9733579530466898e-06, "loss": 0.0033, "step": 21012 }, { "epoch": 5.543200105526975, "grad_norm": 0.03819282352924347, "learning_rate": 2.973006242855887e-06, "loss": 0.0002, "step": 21014 }, { "epoch": 5.543727740403641, "grad_norm": 0.006351366639137268, "learning_rate": 2.972654532665084e-06, "loss": 0.0001, "step": 21016 }, { "epoch": 5.544255375280306, "grad_norm": 0.04057994857430458, "learning_rate": 2.9723028224742815e-06, "loss": 0.0005, "step": 21018 }, { "epoch": 5.544783010156971, "grad_norm": 0.021031325682997704, "learning_rate": 2.9719511122834784e-06, "loss": 0.0039, "step": 21020 }, { "epoch": 5.545310645033637, "grad_norm": 0.0029722340404987335, "learning_rate": 2.9715994020926754e-06, "loss": 0.0001, "step": 21022 }, { "epoch": 5.545838279910302, "grad_norm": 0.016032377257943153, "learning_rate": 2.971247691901873e-06, "loss": 0.0049, "step": 21024 }, { "epoch": 5.546365914786968, "grad_norm": 0.0020341770723462105, "learning_rate": 2.97089598171107e-06, "loss": 0.0001, "step": 21026 }, { "epoch": 5.546893549663633, "grad_norm": 0.04397691786289215, "learning_rate": 2.9705442715202675e-06, "loss": 0.0003, "step": 21028 }, { "epoch": 5.547421184540298, "grad_norm": 0.026617594063282013, "learning_rate": 2.9701925613294644e-06, "loss": 0.0002, "step": 21030 }, { "epoch": 5.5479488194169635, "grad_norm": 0.014481757767498493, "learning_rate": 2.969840851138662e-06, "loss": 0.0001, "step": 21032 }, { "epoch": 5.5484764542936285, "grad_norm": 0.0025455940049141645, "learning_rate": 2.9694891409478588e-06, "loss": 0.0001, "step": 21034 }, { "epoch": 5.549004089170294, "grad_norm": 0.7732670903205872, "learning_rate": 2.969137430757056e-06, "loss": 0.0031, "step": 21036 }, { "epoch": 5.549531724046959, "grad_norm": 0.16746778786182404, "learning_rate": 2.9687857205662535e-06, "loss": 0.0028, "step": 21038 }, { "epoch": 5.550059358923625, "grad_norm": 0.14008627831935883, "learning_rate": 2.968434010375451e-06, "loss": 0.0003, "step": 21040 }, { "epoch": 5.55058699380029, "grad_norm": 0.6386743783950806, "learning_rate": 2.968082300184648e-06, "loss": 0.0106, "step": 21042 }, { "epoch": 5.551114628676956, "grad_norm": 0.00413613673299551, "learning_rate": 2.9677305899938452e-06, "loss": 0.0002, "step": 21044 }, { "epoch": 5.551642263553621, "grad_norm": 0.0020844379905611277, "learning_rate": 2.967378879803042e-06, "loss": 0.0001, "step": 21046 }, { "epoch": 5.552169898430286, "grad_norm": 0.5205719470977783, "learning_rate": 2.9670271696122396e-06, "loss": 0.0016, "step": 21048 }, { "epoch": 5.552697533306952, "grad_norm": 0.002136046066880226, "learning_rate": 2.966675459421437e-06, "loss": 0.0021, "step": 21050 }, { "epoch": 5.553225168183617, "grad_norm": 0.008275552652776241, "learning_rate": 2.966323749230634e-06, "loss": 0.0002, "step": 21052 }, { "epoch": 5.5537528030602825, "grad_norm": 0.004066569730639458, "learning_rate": 2.9659720390398313e-06, "loss": 0.0004, "step": 21054 }, { "epoch": 5.5542804379369475, "grad_norm": 0.20253311097621918, "learning_rate": 2.9656203288490282e-06, "loss": 0.0006, "step": 21056 }, { "epoch": 5.554808072813613, "grad_norm": 0.006659559439867735, "learning_rate": 2.9652686186582256e-06, "loss": 0.0046, "step": 21058 }, { "epoch": 5.555335707690278, "grad_norm": 0.01820243149995804, "learning_rate": 2.964916908467423e-06, "loss": 0.0009, "step": 21060 }, { "epoch": 5.555863342566944, "grad_norm": 0.1490667313337326, "learning_rate": 2.9645651982766203e-06, "loss": 0.0005, "step": 21062 }, { "epoch": 5.556390977443609, "grad_norm": 0.026334207504987717, "learning_rate": 2.9642134880858173e-06, "loss": 0.0038, "step": 21064 }, { "epoch": 5.556918612320274, "grad_norm": 0.07470513880252838, "learning_rate": 2.9638617778950147e-06, "loss": 0.0003, "step": 21066 }, { "epoch": 5.55744624719694, "grad_norm": 0.007492078933864832, "learning_rate": 2.9635100677042116e-06, "loss": 0.0002, "step": 21068 }, { "epoch": 5.557973882073605, "grad_norm": 0.5857018232345581, "learning_rate": 2.963158357513409e-06, "loss": 0.0024, "step": 21070 }, { "epoch": 5.558501516950271, "grad_norm": 0.007732801139354706, "learning_rate": 2.9628066473226064e-06, "loss": 0.0001, "step": 21072 }, { "epoch": 5.559029151826936, "grad_norm": 0.009362911805510521, "learning_rate": 2.9624549371318037e-06, "loss": 0.0004, "step": 21074 }, { "epoch": 5.559556786703601, "grad_norm": 0.00442123506218195, "learning_rate": 2.9621032269410007e-06, "loss": 0.0005, "step": 21076 }, { "epoch": 5.5600844215802665, "grad_norm": 0.006503557786345482, "learning_rate": 2.961751516750198e-06, "loss": 0.0001, "step": 21078 }, { "epoch": 5.5606120564569315, "grad_norm": 0.01492978073656559, "learning_rate": 2.961399806559395e-06, "loss": 0.0001, "step": 21080 }, { "epoch": 5.561139691333597, "grad_norm": 0.5771136283874512, "learning_rate": 2.9610480963685924e-06, "loss": 0.0054, "step": 21082 }, { "epoch": 5.561667326210262, "grad_norm": 0.03790814429521561, "learning_rate": 2.9606963861777898e-06, "loss": 0.0002, "step": 21084 }, { "epoch": 5.562194961086928, "grad_norm": 0.0028639717493206263, "learning_rate": 2.9603446759869867e-06, "loss": 0.0002, "step": 21086 }, { "epoch": 5.562722595963593, "grad_norm": 0.2570750415325165, "learning_rate": 2.959992965796184e-06, "loss": 0.0006, "step": 21088 }, { "epoch": 5.563250230840259, "grad_norm": 0.004048999398946762, "learning_rate": 2.959641255605381e-06, "loss": 0.0017, "step": 21090 }, { "epoch": 5.563777865716924, "grad_norm": 0.009773163124918938, "learning_rate": 2.9592895454145784e-06, "loss": 0.0004, "step": 21092 }, { "epoch": 5.564305500593589, "grad_norm": 0.09333355724811554, "learning_rate": 2.9589378352237754e-06, "loss": 0.0134, "step": 21094 }, { "epoch": 5.564833135470255, "grad_norm": 0.008874905295670033, "learning_rate": 2.958586125032973e-06, "loss": 0.0002, "step": 21096 }, { "epoch": 5.56536077034692, "grad_norm": 0.23482266068458557, "learning_rate": 2.95823441484217e-06, "loss": 0.0006, "step": 21098 }, { "epoch": 5.5658884052235855, "grad_norm": 0.012768339365720749, "learning_rate": 2.9578827046513675e-06, "loss": 0.0001, "step": 21100 }, { "epoch": 5.5664160401002505, "grad_norm": 0.006776558235287666, "learning_rate": 2.9575309944605645e-06, "loss": 0.0002, "step": 21102 }, { "epoch": 5.566943674976916, "grad_norm": 0.0011425691191107035, "learning_rate": 2.957179284269762e-06, "loss": 0.0001, "step": 21104 }, { "epoch": 5.567471309853581, "grad_norm": 0.11986841261386871, "learning_rate": 2.956827574078959e-06, "loss": 0.0006, "step": 21106 }, { "epoch": 5.567998944730247, "grad_norm": 0.0056150672025978565, "learning_rate": 2.956475863888156e-06, "loss": 0.0001, "step": 21108 }, { "epoch": 5.568526579606912, "grad_norm": 0.401051789522171, "learning_rate": 2.9561241536973536e-06, "loss": 0.0062, "step": 21110 }, { "epoch": 5.569054214483577, "grad_norm": 0.007931586354970932, "learning_rate": 2.9557724435065505e-06, "loss": 0.0001, "step": 21112 }, { "epoch": 5.569581849360243, "grad_norm": 0.01023838296532631, "learning_rate": 2.955420733315748e-06, "loss": 0.0002, "step": 21114 }, { "epoch": 5.570109484236908, "grad_norm": 0.01448886375874281, "learning_rate": 2.955069023124945e-06, "loss": 0.0001, "step": 21116 }, { "epoch": 5.570637119113574, "grad_norm": 0.0058131166733801365, "learning_rate": 2.954717312934142e-06, "loss": 0.0031, "step": 21118 }, { "epoch": 5.571164753990239, "grad_norm": 0.008775112219154835, "learning_rate": 2.9543656027433396e-06, "loss": 0.0018, "step": 21120 }, { "epoch": 5.571692388866904, "grad_norm": 0.01669803448021412, "learning_rate": 2.954013892552537e-06, "loss": 0.0002, "step": 21122 }, { "epoch": 5.5722200237435695, "grad_norm": 0.025274282321333885, "learning_rate": 2.953662182361734e-06, "loss": 0.0003, "step": 21124 }, { "epoch": 5.5727476586202345, "grad_norm": 0.31387078762054443, "learning_rate": 2.9533104721709313e-06, "loss": 0.0006, "step": 21126 }, { "epoch": 5.5732752934969, "grad_norm": 0.027856845408678055, "learning_rate": 2.9529587619801282e-06, "loss": 0.0043, "step": 21128 }, { "epoch": 5.573802928373565, "grad_norm": 0.009828178212046623, "learning_rate": 2.9526070517893256e-06, "loss": 0.0002, "step": 21130 }, { "epoch": 5.574330563250231, "grad_norm": 0.004473051987588406, "learning_rate": 2.952255341598523e-06, "loss": 0.0016, "step": 21132 }, { "epoch": 5.574858198126896, "grad_norm": 0.04879698157310486, "learning_rate": 2.9519036314077204e-06, "loss": 0.0002, "step": 21134 }, { "epoch": 5.575385833003562, "grad_norm": 0.42574775218963623, "learning_rate": 2.9515519212169173e-06, "loss": 0.0031, "step": 21136 }, { "epoch": 5.575913467880227, "grad_norm": 0.00137412385083735, "learning_rate": 2.9512002110261147e-06, "loss": 0.0001, "step": 21138 }, { "epoch": 5.576441102756892, "grad_norm": 0.003760832827538252, "learning_rate": 2.9508485008353116e-06, "loss": 0.0001, "step": 21140 }, { "epoch": 5.576968737633558, "grad_norm": 0.03770466148853302, "learning_rate": 2.950496790644509e-06, "loss": 0.0011, "step": 21142 }, { "epoch": 5.577496372510223, "grad_norm": 0.011423896998167038, "learning_rate": 2.9501450804537064e-06, "loss": 0.0001, "step": 21144 }, { "epoch": 5.5780240073868885, "grad_norm": 0.005466114729642868, "learning_rate": 2.9497933702629034e-06, "loss": 0.002, "step": 21146 }, { "epoch": 5.5785516422635535, "grad_norm": 0.011009423062205315, "learning_rate": 2.9494416600721007e-06, "loss": 0.0004, "step": 21148 }, { "epoch": 5.579079277140219, "grad_norm": 0.0014833657769486308, "learning_rate": 2.9490899498812977e-06, "loss": 0.0001, "step": 21150 }, { "epoch": 5.579606912016884, "grad_norm": 0.0016942688962444663, "learning_rate": 2.948738239690495e-06, "loss": 0.0001, "step": 21152 }, { "epoch": 5.58013454689355, "grad_norm": 0.006796605885028839, "learning_rate": 2.9483865294996924e-06, "loss": 0.0002, "step": 21154 }, { "epoch": 5.580662181770215, "grad_norm": 0.2540760934352875, "learning_rate": 2.94803481930889e-06, "loss": 0.0004, "step": 21156 }, { "epoch": 5.58118981664688, "grad_norm": 0.005823743995279074, "learning_rate": 2.9476831091180868e-06, "loss": 0.0001, "step": 21158 }, { "epoch": 5.581717451523546, "grad_norm": 0.04265545308589935, "learning_rate": 2.947331398927284e-06, "loss": 0.0002, "step": 21160 }, { "epoch": 5.582245086400211, "grad_norm": 0.007331442087888718, "learning_rate": 2.946979688736481e-06, "loss": 0.0001, "step": 21162 }, { "epoch": 5.582772721276877, "grad_norm": 1.200351595878601, "learning_rate": 2.9466279785456785e-06, "loss": 0.0052, "step": 21164 }, { "epoch": 5.583300356153542, "grad_norm": 0.06306202709674835, "learning_rate": 2.9462762683548754e-06, "loss": 0.0101, "step": 21166 }, { "epoch": 5.583827991030207, "grad_norm": 0.002355849603191018, "learning_rate": 2.9459245581640732e-06, "loss": 0.0002, "step": 21168 }, { "epoch": 5.5843556259068725, "grad_norm": 0.028533434495329857, "learning_rate": 2.94557284797327e-06, "loss": 0.0035, "step": 21170 }, { "epoch": 5.5848832607835375, "grad_norm": 0.01647360436618328, "learning_rate": 2.9452211377824675e-06, "loss": 0.0002, "step": 21172 }, { "epoch": 5.585410895660203, "grad_norm": 0.006095369812101126, "learning_rate": 2.9448694275916645e-06, "loss": 0.004, "step": 21174 }, { "epoch": 5.585938530536868, "grad_norm": 0.027224572375416756, "learning_rate": 2.9445177174008614e-06, "loss": 0.0009, "step": 21176 }, { "epoch": 5.586466165413534, "grad_norm": 0.0050921146757900715, "learning_rate": 2.944166007210059e-06, "loss": 0.0029, "step": 21178 }, { "epoch": 5.586993800290199, "grad_norm": 0.002627462847158313, "learning_rate": 2.943814297019256e-06, "loss": 0.0024, "step": 21180 }, { "epoch": 5.587521435166865, "grad_norm": 0.015720589086413383, "learning_rate": 2.9434625868284536e-06, "loss": 0.0002, "step": 21182 }, { "epoch": 5.58804907004353, "grad_norm": 0.0035175641532987356, "learning_rate": 2.9431108766376505e-06, "loss": 0.0002, "step": 21184 }, { "epoch": 5.588576704920195, "grad_norm": 0.006735756993293762, "learning_rate": 2.942759166446848e-06, "loss": 0.0002, "step": 21186 }, { "epoch": 5.589104339796861, "grad_norm": 0.07360895723104477, "learning_rate": 2.942407456256045e-06, "loss": 0.0003, "step": 21188 }, { "epoch": 5.589631974673526, "grad_norm": 0.5423714518547058, "learning_rate": 2.9420557460652422e-06, "loss": 0.0019, "step": 21190 }, { "epoch": 5.5901596095501915, "grad_norm": 0.021504292264580727, "learning_rate": 2.9417040358744396e-06, "loss": 0.0001, "step": 21192 }, { "epoch": 5.5906872444268565, "grad_norm": 0.2474454641342163, "learning_rate": 2.941352325683637e-06, "loss": 0.0018, "step": 21194 }, { "epoch": 5.591214879303522, "grad_norm": 0.3230714797973633, "learning_rate": 2.941000615492834e-06, "loss": 0.0016, "step": 21196 }, { "epoch": 5.591742514180187, "grad_norm": 0.19774587452411652, "learning_rate": 2.9406489053020313e-06, "loss": 0.0021, "step": 21198 }, { "epoch": 5.592270149056853, "grad_norm": 0.0936097726225853, "learning_rate": 2.9402971951112283e-06, "loss": 0.0003, "step": 21200 }, { "epoch": 5.592797783933518, "grad_norm": 0.2793867290019989, "learning_rate": 2.9399454849204256e-06, "loss": 0.002, "step": 21202 }, { "epoch": 5.593325418810183, "grad_norm": 0.0025831745006144047, "learning_rate": 2.939593774729623e-06, "loss": 0.0001, "step": 21204 }, { "epoch": 5.593853053686849, "grad_norm": 0.08036980032920837, "learning_rate": 2.93924206453882e-06, "loss": 0.0002, "step": 21206 }, { "epoch": 5.594380688563514, "grad_norm": 0.039806220680475235, "learning_rate": 2.9388903543480173e-06, "loss": 0.0002, "step": 21208 }, { "epoch": 5.59490832344018, "grad_norm": 0.007797162514179945, "learning_rate": 2.9385386441572143e-06, "loss": 0.0007, "step": 21210 }, { "epoch": 5.595435958316845, "grad_norm": 0.01296317856758833, "learning_rate": 2.9381869339664117e-06, "loss": 0.0002, "step": 21212 }, { "epoch": 5.59596359319351, "grad_norm": 0.008288347162306309, "learning_rate": 2.937835223775609e-06, "loss": 0.0001, "step": 21214 }, { "epoch": 5.5964912280701755, "grad_norm": 0.0022414380218833685, "learning_rate": 2.9374835135848064e-06, "loss": 0.0001, "step": 21216 }, { "epoch": 5.5970188629468405, "grad_norm": 0.0034866519272327423, "learning_rate": 2.9371318033940034e-06, "loss": 0.0022, "step": 21218 }, { "epoch": 5.597546497823506, "grad_norm": 0.006184196099638939, "learning_rate": 2.9367800932032008e-06, "loss": 0.0001, "step": 21220 }, { "epoch": 5.598074132700171, "grad_norm": 0.010105093009769917, "learning_rate": 2.9364283830123977e-06, "loss": 0.0001, "step": 21222 }, { "epoch": 5.598601767576837, "grad_norm": 0.08579902350902557, "learning_rate": 2.936076672821595e-06, "loss": 0.0008, "step": 21224 }, { "epoch": 5.599129402453502, "grad_norm": 0.007637712173163891, "learning_rate": 2.9357249626307925e-06, "loss": 0.0001, "step": 21226 }, { "epoch": 5.599657037330168, "grad_norm": 1.050140380859375, "learning_rate": 2.93537325243999e-06, "loss": 0.0034, "step": 21228 }, { "epoch": 5.600184672206833, "grad_norm": 0.19562344253063202, "learning_rate": 2.9350215422491868e-06, "loss": 0.0005, "step": 21230 }, { "epoch": 5.600712307083498, "grad_norm": 0.005069876089692116, "learning_rate": 2.934669832058384e-06, "loss": 0.0001, "step": 21232 }, { "epoch": 5.601239941960164, "grad_norm": 0.010526777245104313, "learning_rate": 2.934318121867581e-06, "loss": 0.0001, "step": 21234 }, { "epoch": 5.601767576836829, "grad_norm": 0.017393361777067184, "learning_rate": 2.933966411676778e-06, "loss": 0.0001, "step": 21236 }, { "epoch": 5.6022952117134945, "grad_norm": 0.003360216738656163, "learning_rate": 2.933614701485976e-06, "loss": 0.0057, "step": 21238 }, { "epoch": 5.6028228465901595, "grad_norm": 0.035606201738119125, "learning_rate": 2.933262991295173e-06, "loss": 0.0035, "step": 21240 }, { "epoch": 5.6033504814668245, "grad_norm": 0.035756804049015045, "learning_rate": 2.93291128110437e-06, "loss": 0.0002, "step": 21242 }, { "epoch": 5.60387811634349, "grad_norm": 0.08520203083753586, "learning_rate": 2.932559570913567e-06, "loss": 0.0003, "step": 21244 }, { "epoch": 5.604405751220155, "grad_norm": 0.24517998099327087, "learning_rate": 2.9322078607227645e-06, "loss": 0.0037, "step": 21246 }, { "epoch": 5.604933386096821, "grad_norm": 0.004915938246995211, "learning_rate": 2.9318561505319615e-06, "loss": 0.0001, "step": 21248 }, { "epoch": 5.605461020973486, "grad_norm": 0.3066351115703583, "learning_rate": 2.931504440341159e-06, "loss": 0.0025, "step": 21250 }, { "epoch": 5.605988655850152, "grad_norm": 0.002825974253937602, "learning_rate": 2.9311527301503562e-06, "loss": 0.0003, "step": 21252 }, { "epoch": 5.606516290726817, "grad_norm": 0.03084474243223667, "learning_rate": 2.9308010199595536e-06, "loss": 0.0002, "step": 21254 }, { "epoch": 5.607043925603483, "grad_norm": 0.34967562556266785, "learning_rate": 2.9304493097687506e-06, "loss": 0.0059, "step": 21256 }, { "epoch": 5.607571560480148, "grad_norm": 0.04439593479037285, "learning_rate": 2.930097599577948e-06, "loss": 0.0007, "step": 21258 }, { "epoch": 5.608099195356813, "grad_norm": 0.020925909280776978, "learning_rate": 2.929745889387145e-06, "loss": 0.0003, "step": 21260 }, { "epoch": 5.6086268302334785, "grad_norm": 0.13343657553195953, "learning_rate": 2.9293941791963423e-06, "loss": 0.0013, "step": 21262 }, { "epoch": 5.6091544651101435, "grad_norm": 0.23205311596393585, "learning_rate": 2.9290424690055396e-06, "loss": 0.0044, "step": 21264 }, { "epoch": 5.609682099986809, "grad_norm": 0.017495423555374146, "learning_rate": 2.9286907588147366e-06, "loss": 0.0002, "step": 21266 }, { "epoch": 5.610209734863474, "grad_norm": 0.42189928889274597, "learning_rate": 2.928339048623934e-06, "loss": 0.0029, "step": 21268 }, { "epoch": 5.61073736974014, "grad_norm": 0.27363914251327515, "learning_rate": 2.927987338433131e-06, "loss": 0.0008, "step": 21270 }, { "epoch": 5.611265004616805, "grad_norm": 1.9491989612579346, "learning_rate": 2.9276356282423283e-06, "loss": 0.0007, "step": 21272 }, { "epoch": 5.611792639493471, "grad_norm": 0.004739797208458185, "learning_rate": 2.9272839180515257e-06, "loss": 0.0002, "step": 21274 }, { "epoch": 5.612320274370136, "grad_norm": 0.18557526171207428, "learning_rate": 2.926932207860723e-06, "loss": 0.0017, "step": 21276 }, { "epoch": 5.612847909246801, "grad_norm": 0.007486825343221426, "learning_rate": 2.92658049766992e-06, "loss": 0.001, "step": 21278 }, { "epoch": 5.613375544123467, "grad_norm": 0.36845722794532776, "learning_rate": 2.9262287874791174e-06, "loss": 0.0037, "step": 21280 }, { "epoch": 5.613903179000132, "grad_norm": 0.14317847788333893, "learning_rate": 2.9258770772883143e-06, "loss": 0.0006, "step": 21282 }, { "epoch": 5.6144308138767975, "grad_norm": 0.08147422969341278, "learning_rate": 2.9255253670975117e-06, "loss": 0.0053, "step": 21284 }, { "epoch": 5.6149584487534625, "grad_norm": 0.004818194080144167, "learning_rate": 2.925173656906709e-06, "loss": 0.0001, "step": 21286 }, { "epoch": 5.6154860836301275, "grad_norm": 0.01682962290942669, "learning_rate": 2.9248219467159064e-06, "loss": 0.0002, "step": 21288 }, { "epoch": 5.616013718506793, "grad_norm": 0.2633344829082489, "learning_rate": 2.9244702365251034e-06, "loss": 0.0047, "step": 21290 }, { "epoch": 5.616541353383458, "grad_norm": 0.006361954379826784, "learning_rate": 2.9241185263343008e-06, "loss": 0.0009, "step": 21292 }, { "epoch": 5.617068988260124, "grad_norm": 0.006306163966655731, "learning_rate": 2.9237668161434977e-06, "loss": 0.0001, "step": 21294 }, { "epoch": 5.617596623136789, "grad_norm": 0.20705455541610718, "learning_rate": 2.923415105952695e-06, "loss": 0.0005, "step": 21296 }, { "epoch": 5.618124258013455, "grad_norm": 0.29925963282585144, "learning_rate": 2.9230633957618925e-06, "loss": 0.0006, "step": 21298 }, { "epoch": 5.61865189289012, "grad_norm": 0.08077902346849442, "learning_rate": 2.9227116855710894e-06, "loss": 0.0004, "step": 21300 }, { "epoch": 5.619179527766786, "grad_norm": 0.43080049753189087, "learning_rate": 2.922359975380287e-06, "loss": 0.0014, "step": 21302 }, { "epoch": 5.619707162643451, "grad_norm": 0.014726776629686356, "learning_rate": 2.9220082651894838e-06, "loss": 0.0004, "step": 21304 }, { "epoch": 5.620234797520116, "grad_norm": 0.045971114188432693, "learning_rate": 2.921656554998681e-06, "loss": 0.0002, "step": 21306 }, { "epoch": 5.6207624323967815, "grad_norm": 0.05630998685956001, "learning_rate": 2.921304844807878e-06, "loss": 0.0005, "step": 21308 }, { "epoch": 5.6212900672734465, "grad_norm": 0.0035857269540429115, "learning_rate": 2.920953134617076e-06, "loss": 0.0001, "step": 21310 }, { "epoch": 5.621817702150112, "grad_norm": 0.008727481588721275, "learning_rate": 2.920601424426273e-06, "loss": 0.0001, "step": 21312 }, { "epoch": 5.622345337026777, "grad_norm": 0.01134426798671484, "learning_rate": 2.9202497142354702e-06, "loss": 0.003, "step": 21314 }, { "epoch": 5.622872971903443, "grad_norm": 0.0052155740559101105, "learning_rate": 2.919898004044667e-06, "loss": 0.0001, "step": 21316 }, { "epoch": 5.623400606780108, "grad_norm": 0.020196687430143356, "learning_rate": 2.9195462938538645e-06, "loss": 0.0002, "step": 21318 }, { "epoch": 5.623928241656774, "grad_norm": 0.003774078795686364, "learning_rate": 2.9191945836630615e-06, "loss": 0.0001, "step": 21320 }, { "epoch": 5.624455876533439, "grad_norm": 0.009186559356749058, "learning_rate": 2.9188428734722593e-06, "loss": 0.0002, "step": 21322 }, { "epoch": 5.624983511410104, "grad_norm": 0.00702038174495101, "learning_rate": 2.9184911632814563e-06, "loss": 0.0003, "step": 21324 }, { "epoch": 5.62551114628677, "grad_norm": 0.0034248020965605974, "learning_rate": 2.918139453090653e-06, "loss": 0.0004, "step": 21326 }, { "epoch": 5.626038781163435, "grad_norm": 0.1940026730298996, "learning_rate": 2.9177877428998506e-06, "loss": 0.0068, "step": 21328 }, { "epoch": 5.6265664160401005, "grad_norm": 0.0022906200028955936, "learning_rate": 2.9174360327090475e-06, "loss": 0.0001, "step": 21330 }, { "epoch": 5.6270940509167655, "grad_norm": 0.0036101217847317457, "learning_rate": 2.917084322518245e-06, "loss": 0.0001, "step": 21332 }, { "epoch": 5.6276216857934305, "grad_norm": 0.03098858892917633, "learning_rate": 2.9167326123274423e-06, "loss": 0.0002, "step": 21334 }, { "epoch": 5.628149320670096, "grad_norm": 0.0028034080751240253, "learning_rate": 2.9163809021366397e-06, "loss": 0.0062, "step": 21336 }, { "epoch": 5.628676955546761, "grad_norm": 0.0016789669170975685, "learning_rate": 2.9160291919458366e-06, "loss": 0.0001, "step": 21338 }, { "epoch": 5.629204590423427, "grad_norm": 0.18907809257507324, "learning_rate": 2.915677481755034e-06, "loss": 0.0015, "step": 21340 }, { "epoch": 5.629732225300092, "grad_norm": 0.5018657445907593, "learning_rate": 2.915325771564231e-06, "loss": 0.0013, "step": 21342 }, { "epoch": 5.630259860176758, "grad_norm": 0.00753259239718318, "learning_rate": 2.9149740613734283e-06, "loss": 0.0001, "step": 21344 }, { "epoch": 5.630787495053423, "grad_norm": 0.050235021859407425, "learning_rate": 2.9146223511826257e-06, "loss": 0.0046, "step": 21346 }, { "epoch": 5.631315129930089, "grad_norm": 0.016450410708785057, "learning_rate": 2.914270640991823e-06, "loss": 0.0016, "step": 21348 }, { "epoch": 5.631842764806754, "grad_norm": 0.0030224097426980734, "learning_rate": 2.91391893080102e-06, "loss": 0.0054, "step": 21350 }, { "epoch": 5.632370399683419, "grad_norm": 0.010040712542831898, "learning_rate": 2.9135672206102174e-06, "loss": 0.0001, "step": 21352 }, { "epoch": 5.6328980345600845, "grad_norm": 0.010452503338456154, "learning_rate": 2.9132155104194143e-06, "loss": 0.0002, "step": 21354 }, { "epoch": 5.6334256694367495, "grad_norm": 0.005434071179479361, "learning_rate": 2.9128638002286117e-06, "loss": 0.0002, "step": 21356 }, { "epoch": 5.633953304313415, "grad_norm": 0.13137345016002655, "learning_rate": 2.912512090037809e-06, "loss": 0.0013, "step": 21358 }, { "epoch": 5.63448093919008, "grad_norm": 0.11472294479608536, "learning_rate": 2.912160379847006e-06, "loss": 0.0004, "step": 21360 }, { "epoch": 5.635008574066746, "grad_norm": 0.002084281062707305, "learning_rate": 2.9118086696562034e-06, "loss": 0.0037, "step": 21362 }, { "epoch": 5.635536208943411, "grad_norm": 0.011324268765747547, "learning_rate": 2.9114569594654004e-06, "loss": 0.0001, "step": 21364 }, { "epoch": 5.636063843820077, "grad_norm": 0.015941986814141273, "learning_rate": 2.9111052492745978e-06, "loss": 0.0005, "step": 21366 }, { "epoch": 5.636591478696742, "grad_norm": 0.08209279179573059, "learning_rate": 2.910753539083795e-06, "loss": 0.0004, "step": 21368 }, { "epoch": 5.637119113573407, "grad_norm": 0.022456562146544456, "learning_rate": 2.9104018288929925e-06, "loss": 0.0005, "step": 21370 }, { "epoch": 5.637646748450073, "grad_norm": 0.19081173837184906, "learning_rate": 2.9100501187021895e-06, "loss": 0.0008, "step": 21372 }, { "epoch": 5.638174383326738, "grad_norm": 0.17102093994617462, "learning_rate": 2.909698408511387e-06, "loss": 0.0028, "step": 21374 }, { "epoch": 5.6387020182034036, "grad_norm": 0.061179984360933304, "learning_rate": 2.9093466983205838e-06, "loss": 0.0003, "step": 21376 }, { "epoch": 5.6392296530800685, "grad_norm": 0.022819973528385162, "learning_rate": 2.908994988129781e-06, "loss": 0.0002, "step": 21378 }, { "epoch": 5.6397572879567335, "grad_norm": 0.03688124939799309, "learning_rate": 2.9086432779389785e-06, "loss": 0.0002, "step": 21380 }, { "epoch": 5.640284922833399, "grad_norm": 0.005322953220456839, "learning_rate": 2.908291567748176e-06, "loss": 0.0001, "step": 21382 }, { "epoch": 5.640812557710064, "grad_norm": 0.039749715477228165, "learning_rate": 2.907939857557373e-06, "loss": 0.0004, "step": 21384 }, { "epoch": 5.64134019258673, "grad_norm": 0.031563520431518555, "learning_rate": 2.90758814736657e-06, "loss": 0.0001, "step": 21386 }, { "epoch": 5.641867827463395, "grad_norm": 0.0030911234207451344, "learning_rate": 2.907236437175767e-06, "loss": 0.0001, "step": 21388 }, { "epoch": 5.642395462340061, "grad_norm": 0.04707719385623932, "learning_rate": 2.906884726984964e-06, "loss": 0.0001, "step": 21390 }, { "epoch": 5.642923097216726, "grad_norm": 0.4904797375202179, "learning_rate": 2.9065330167941615e-06, "loss": 0.0015, "step": 21392 }, { "epoch": 5.643450732093392, "grad_norm": 0.006914941594004631, "learning_rate": 2.906181306603359e-06, "loss": 0.0001, "step": 21394 }, { "epoch": 5.643978366970057, "grad_norm": 0.0062588476575911045, "learning_rate": 2.9058295964125563e-06, "loss": 0.0015, "step": 21396 }, { "epoch": 5.644506001846722, "grad_norm": 0.004738871473819017, "learning_rate": 2.9054778862217532e-06, "loss": 0.0001, "step": 21398 }, { "epoch": 5.6450336367233875, "grad_norm": 0.0014260256430134177, "learning_rate": 2.9051261760309506e-06, "loss": 0.0001, "step": 21400 }, { "epoch": 5.6455612716000525, "grad_norm": 0.0015069801593199372, "learning_rate": 2.9047744658401476e-06, "loss": 0.0002, "step": 21402 }, { "epoch": 5.646088906476718, "grad_norm": 0.48603251576423645, "learning_rate": 2.904422755649345e-06, "loss": 0.0024, "step": 21404 }, { "epoch": 5.646616541353383, "grad_norm": 0.0021379636600613594, "learning_rate": 2.9040710454585423e-06, "loss": 0.0009, "step": 21406 }, { "epoch": 5.647144176230049, "grad_norm": 0.007163712289184332, "learning_rate": 2.9037193352677397e-06, "loss": 0.0057, "step": 21408 }, { "epoch": 5.647671811106714, "grad_norm": 0.03677753359079361, "learning_rate": 2.9033676250769366e-06, "loss": 0.0002, "step": 21410 }, { "epoch": 5.64819944598338, "grad_norm": 0.05722200125455856, "learning_rate": 2.903015914886134e-06, "loss": 0.0005, "step": 21412 }, { "epoch": 5.648727080860045, "grad_norm": 0.00283352960832417, "learning_rate": 2.902664204695331e-06, "loss": 0.0006, "step": 21414 }, { "epoch": 5.64925471573671, "grad_norm": 0.01701815240085125, "learning_rate": 2.9023124945045283e-06, "loss": 0.0117, "step": 21416 }, { "epoch": 5.649782350613376, "grad_norm": 0.006065288092941046, "learning_rate": 2.9019607843137257e-06, "loss": 0.0003, "step": 21418 }, { "epoch": 5.650309985490041, "grad_norm": 0.13011203706264496, "learning_rate": 2.9016090741229227e-06, "loss": 0.0009, "step": 21420 }, { "epoch": 5.650837620366707, "grad_norm": 0.020103951916098595, "learning_rate": 2.90125736393212e-06, "loss": 0.0001, "step": 21422 }, { "epoch": 5.6513652552433715, "grad_norm": 0.007385965436697006, "learning_rate": 2.900905653741317e-06, "loss": 0.0041, "step": 21424 }, { "epoch": 5.6518928901200365, "grad_norm": 0.04035855829715729, "learning_rate": 2.9005539435505144e-06, "loss": 0.0014, "step": 21426 }, { "epoch": 5.652420524996702, "grad_norm": 0.010641985572874546, "learning_rate": 2.9002022333597117e-06, "loss": 0.0005, "step": 21428 }, { "epoch": 5.652948159873367, "grad_norm": 0.00396796315908432, "learning_rate": 2.899850523168909e-06, "loss": 0.0002, "step": 21430 }, { "epoch": 5.653475794750033, "grad_norm": 0.030296219512820244, "learning_rate": 2.899498812978106e-06, "loss": 0.0012, "step": 21432 }, { "epoch": 5.654003429626698, "grad_norm": 0.012029272504150867, "learning_rate": 2.8991471027873035e-06, "loss": 0.0001, "step": 21434 }, { "epoch": 5.654531064503364, "grad_norm": 0.010150106623768806, "learning_rate": 2.8987953925965004e-06, "loss": 0.0002, "step": 21436 }, { "epoch": 5.655058699380029, "grad_norm": 0.12053582817316055, "learning_rate": 2.8984436824056978e-06, "loss": 0.001, "step": 21438 }, { "epoch": 5.655586334256695, "grad_norm": 0.01071936171501875, "learning_rate": 2.898091972214895e-06, "loss": 0.0004, "step": 21440 }, { "epoch": 5.65611396913336, "grad_norm": 0.0035442663356661797, "learning_rate": 2.8977402620240925e-06, "loss": 0.0001, "step": 21442 }, { "epoch": 5.656641604010025, "grad_norm": 0.03172658383846283, "learning_rate": 2.8973885518332895e-06, "loss": 0.0002, "step": 21444 }, { "epoch": 5.6571692388866905, "grad_norm": 0.013332146219909191, "learning_rate": 2.8970368416424864e-06, "loss": 0.0002, "step": 21446 }, { "epoch": 5.6576968737633555, "grad_norm": 0.10363566875457764, "learning_rate": 2.896685131451684e-06, "loss": 0.0004, "step": 21448 }, { "epoch": 5.658224508640021, "grad_norm": 0.01215374656021595, "learning_rate": 2.8963334212608808e-06, "loss": 0.0099, "step": 21450 }, { "epoch": 5.658752143516686, "grad_norm": 0.8459118604660034, "learning_rate": 2.8959817110700786e-06, "loss": 0.0072, "step": 21452 }, { "epoch": 5.659279778393352, "grad_norm": 0.46327123045921326, "learning_rate": 2.8956300008792755e-06, "loss": 0.001, "step": 21454 }, { "epoch": 5.659807413270017, "grad_norm": 0.05890794098377228, "learning_rate": 2.895278290688473e-06, "loss": 0.0002, "step": 21456 }, { "epoch": 5.660335048146683, "grad_norm": 0.06859151273965836, "learning_rate": 2.89492658049767e-06, "loss": 0.0003, "step": 21458 }, { "epoch": 5.660862683023348, "grad_norm": 0.008457598276436329, "learning_rate": 2.8945748703068672e-06, "loss": 0.009, "step": 21460 }, { "epoch": 5.661390317900013, "grad_norm": 0.034825798124074936, "learning_rate": 2.894223160116064e-06, "loss": 0.0003, "step": 21462 }, { "epoch": 5.661917952776679, "grad_norm": 0.0339481458067894, "learning_rate": 2.8938714499252615e-06, "loss": 0.0002, "step": 21464 }, { "epoch": 5.662445587653344, "grad_norm": 0.9868984818458557, "learning_rate": 2.893519739734459e-06, "loss": 0.0049, "step": 21466 }, { "epoch": 5.66297322253001, "grad_norm": 0.0019896544981747866, "learning_rate": 2.8931680295436563e-06, "loss": 0.0001, "step": 21468 }, { "epoch": 5.6635008574066745, "grad_norm": 0.0021104090847074986, "learning_rate": 2.8928163193528533e-06, "loss": 0.0001, "step": 21470 }, { "epoch": 5.6640284922833395, "grad_norm": 0.08555036783218384, "learning_rate": 2.8924646091620506e-06, "loss": 0.0003, "step": 21472 }, { "epoch": 5.664556127160005, "grad_norm": 0.01819479465484619, "learning_rate": 2.8921128989712476e-06, "loss": 0.0019, "step": 21474 }, { "epoch": 5.66508376203667, "grad_norm": 0.0628466084599495, "learning_rate": 2.891761188780445e-06, "loss": 0.0002, "step": 21476 }, { "epoch": 5.665611396913336, "grad_norm": 0.012629472650587559, "learning_rate": 2.8914094785896423e-06, "loss": 0.0005, "step": 21478 }, { "epoch": 5.666139031790001, "grad_norm": 0.001112215919420123, "learning_rate": 2.8910577683988393e-06, "loss": 0.0001, "step": 21480 }, { "epoch": 5.666666666666667, "grad_norm": 0.5323435664176941, "learning_rate": 2.8907060582080367e-06, "loss": 0.0118, "step": 21482 }, { "epoch": 5.667194301543332, "grad_norm": 0.007486648391932249, "learning_rate": 2.8903543480172336e-06, "loss": 0.0001, "step": 21484 }, { "epoch": 5.667721936419998, "grad_norm": 0.0323738269507885, "learning_rate": 2.890002637826431e-06, "loss": 0.0002, "step": 21486 }, { "epoch": 5.668249571296663, "grad_norm": 0.44527772068977356, "learning_rate": 2.8896509276356284e-06, "loss": 0.0008, "step": 21488 }, { "epoch": 5.668777206173328, "grad_norm": 0.057341765612363815, "learning_rate": 2.8892992174448257e-06, "loss": 0.0011, "step": 21490 }, { "epoch": 5.6693048410499935, "grad_norm": 0.027614030987024307, "learning_rate": 2.8889475072540227e-06, "loss": 0.0029, "step": 21492 }, { "epoch": 5.6698324759266585, "grad_norm": 0.2298804223537445, "learning_rate": 2.88859579706322e-06, "loss": 0.0004, "step": 21494 }, { "epoch": 5.670360110803324, "grad_norm": 0.01745346374809742, "learning_rate": 2.888244086872417e-06, "loss": 0.0097, "step": 21496 }, { "epoch": 5.670887745679989, "grad_norm": 0.0016073893057182431, "learning_rate": 2.8878923766816144e-06, "loss": 0.0001, "step": 21498 }, { "epoch": 5.671415380556655, "grad_norm": 0.00825578160583973, "learning_rate": 2.8875406664908118e-06, "loss": 0.0002, "step": 21500 }, { "epoch": 5.67194301543332, "grad_norm": 0.5861929655075073, "learning_rate": 2.887188956300009e-06, "loss": 0.0007, "step": 21502 }, { "epoch": 5.672470650309986, "grad_norm": 0.0022309813648462296, "learning_rate": 2.886837246109206e-06, "loss": 0.0001, "step": 21504 }, { "epoch": 5.672998285186651, "grad_norm": 0.012612816877663136, "learning_rate": 2.8864855359184035e-06, "loss": 0.0001, "step": 21506 }, { "epoch": 5.673525920063316, "grad_norm": 0.06382744759321213, "learning_rate": 2.8861338257276004e-06, "loss": 0.0062, "step": 21508 }, { "epoch": 5.674053554939982, "grad_norm": 0.015430059283971786, "learning_rate": 2.885782115536798e-06, "loss": 0.0001, "step": 21510 }, { "epoch": 5.674581189816647, "grad_norm": 0.4003145694732666, "learning_rate": 2.885430405345995e-06, "loss": 0.0047, "step": 21512 }, { "epoch": 5.675108824693313, "grad_norm": 0.0038577772211283445, "learning_rate": 2.885078695155192e-06, "loss": 0.0002, "step": 21514 }, { "epoch": 5.6756364595699775, "grad_norm": 0.024835098534822464, "learning_rate": 2.8847269849643895e-06, "loss": 0.0002, "step": 21516 }, { "epoch": 5.6761640944466425, "grad_norm": 0.0675869733095169, "learning_rate": 2.8843752747735865e-06, "loss": 0.0002, "step": 21518 }, { "epoch": 5.676691729323308, "grad_norm": 0.004727444611489773, "learning_rate": 2.884023564582784e-06, "loss": 0.0001, "step": 21520 }, { "epoch": 5.677219364199973, "grad_norm": 0.0026886409614235163, "learning_rate": 2.883671854391981e-06, "loss": 0.0032, "step": 21522 }, { "epoch": 5.677746999076639, "grad_norm": 0.49649059772491455, "learning_rate": 2.8833201442011786e-06, "loss": 0.001, "step": 21524 }, { "epoch": 5.678274633953304, "grad_norm": 0.20249596238136292, "learning_rate": 2.8829684340103755e-06, "loss": 0.0057, "step": 21526 }, { "epoch": 5.67880226882997, "grad_norm": 0.515159010887146, "learning_rate": 2.882616723819573e-06, "loss": 0.0013, "step": 21528 }, { "epoch": 5.679329903706635, "grad_norm": 0.009707999415695667, "learning_rate": 2.88226501362877e-06, "loss": 0.0001, "step": 21530 }, { "epoch": 5.679857538583301, "grad_norm": 0.04092290997505188, "learning_rate": 2.8819133034379672e-06, "loss": 0.0015, "step": 21532 }, { "epoch": 5.680385173459966, "grad_norm": 0.006958371493965387, "learning_rate": 2.881561593247164e-06, "loss": 0.0029, "step": 21534 }, { "epoch": 5.680912808336631, "grad_norm": 0.029270697385072708, "learning_rate": 2.881209883056362e-06, "loss": 0.0003, "step": 21536 }, { "epoch": 5.6814404432132966, "grad_norm": 0.025880850851535797, "learning_rate": 2.880858172865559e-06, "loss": 0.0002, "step": 21538 }, { "epoch": 5.6819680780899615, "grad_norm": 0.012508871033787727, "learning_rate": 2.880506462674756e-06, "loss": 0.0002, "step": 21540 }, { "epoch": 5.682495712966627, "grad_norm": 0.684294581413269, "learning_rate": 2.8801547524839533e-06, "loss": 0.0031, "step": 21542 }, { "epoch": 5.683023347843292, "grad_norm": 0.0840071365237236, "learning_rate": 2.8798030422931502e-06, "loss": 0.0002, "step": 21544 }, { "epoch": 5.683550982719957, "grad_norm": 0.5331345796585083, "learning_rate": 2.8794513321023476e-06, "loss": 0.0018, "step": 21546 }, { "epoch": 5.684078617596623, "grad_norm": 0.03538508713245392, "learning_rate": 2.879099621911545e-06, "loss": 0.0003, "step": 21548 }, { "epoch": 5.684606252473288, "grad_norm": 0.1876188963651657, "learning_rate": 2.8787479117207424e-06, "loss": 0.0015, "step": 21550 }, { "epoch": 5.685133887349954, "grad_norm": 0.10529302060604095, "learning_rate": 2.8783962015299393e-06, "loss": 0.0031, "step": 21552 }, { "epoch": 5.685661522226619, "grad_norm": 0.023138677701354027, "learning_rate": 2.8780444913391367e-06, "loss": 0.0001, "step": 21554 }, { "epoch": 5.686189157103285, "grad_norm": 0.2537700831890106, "learning_rate": 2.8776927811483336e-06, "loss": 0.0007, "step": 21556 }, { "epoch": 5.68671679197995, "grad_norm": 0.0023352433927357197, "learning_rate": 2.877341070957531e-06, "loss": 0.0001, "step": 21558 }, { "epoch": 5.687244426856616, "grad_norm": 0.16337068378925323, "learning_rate": 2.8769893607667284e-06, "loss": 0.0005, "step": 21560 }, { "epoch": 5.6877720617332805, "grad_norm": 0.7162315845489502, "learning_rate": 2.8766376505759258e-06, "loss": 0.0022, "step": 21562 }, { "epoch": 5.6882996966099455, "grad_norm": 0.006579590030014515, "learning_rate": 2.8762859403851227e-06, "loss": 0.0001, "step": 21564 }, { "epoch": 5.688827331486611, "grad_norm": 0.052528779953718185, "learning_rate": 2.87593423019432e-06, "loss": 0.0002, "step": 21566 }, { "epoch": 5.689354966363276, "grad_norm": 0.017697101458907127, "learning_rate": 2.875582520003517e-06, "loss": 0.0002, "step": 21568 }, { "epoch": 5.689882601239942, "grad_norm": 0.5496494174003601, "learning_rate": 2.8752308098127144e-06, "loss": 0.0056, "step": 21570 }, { "epoch": 5.690410236116607, "grad_norm": 0.014093843288719654, "learning_rate": 2.874879099621912e-06, "loss": 0.0001, "step": 21572 }, { "epoch": 5.690937870993273, "grad_norm": 0.00987053383141756, "learning_rate": 2.8745273894311088e-06, "loss": 0.0001, "step": 21574 }, { "epoch": 5.691465505869938, "grad_norm": 0.01757821999490261, "learning_rate": 2.874175679240306e-06, "loss": 0.0002, "step": 21576 }, { "epoch": 5.691993140746604, "grad_norm": 0.003720168024301529, "learning_rate": 2.873823969049503e-06, "loss": 0.0001, "step": 21578 }, { "epoch": 5.692520775623269, "grad_norm": 0.02352476678788662, "learning_rate": 2.8734722588587005e-06, "loss": 0.0003, "step": 21580 }, { "epoch": 5.693048410499934, "grad_norm": 0.12760302424430847, "learning_rate": 2.873120548667898e-06, "loss": 0.0004, "step": 21582 }, { "epoch": 5.6935760453766, "grad_norm": 0.22765368223190308, "learning_rate": 2.872768838477095e-06, "loss": 0.0029, "step": 21584 }, { "epoch": 5.6941036802532645, "grad_norm": 0.003441013628616929, "learning_rate": 2.872417128286292e-06, "loss": 0.0003, "step": 21586 }, { "epoch": 5.69463131512993, "grad_norm": 0.010961643420159817, "learning_rate": 2.8720654180954895e-06, "loss": 0.0047, "step": 21588 }, { "epoch": 5.695158950006595, "grad_norm": 0.04731426388025284, "learning_rate": 2.8717137079046865e-06, "loss": 0.0038, "step": 21590 }, { "epoch": 5.69568658488326, "grad_norm": 0.01999617926776409, "learning_rate": 2.871361997713884e-06, "loss": 0.0002, "step": 21592 }, { "epoch": 5.696214219759926, "grad_norm": 0.5279198884963989, "learning_rate": 2.8710102875230812e-06, "loss": 0.0025, "step": 21594 }, { "epoch": 5.696741854636591, "grad_norm": 0.003097405657172203, "learning_rate": 2.8706585773322786e-06, "loss": 0.002, "step": 21596 }, { "epoch": 5.697269489513257, "grad_norm": 0.06392711400985718, "learning_rate": 2.8703068671414756e-06, "loss": 0.0029, "step": 21598 }, { "epoch": 5.697797124389922, "grad_norm": 0.054577119648456573, "learning_rate": 2.8699551569506725e-06, "loss": 0.0002, "step": 21600 }, { "epoch": 5.698324759266588, "grad_norm": 0.030652910470962524, "learning_rate": 2.86960344675987e-06, "loss": 0.0002, "step": 21602 }, { "epoch": 5.698852394143253, "grad_norm": 0.00551297701895237, "learning_rate": 2.869251736569067e-06, "loss": 0.0001, "step": 21604 }, { "epoch": 5.699380029019919, "grad_norm": 0.017626512795686722, "learning_rate": 2.8689000263782642e-06, "loss": 0.003, "step": 21606 }, { "epoch": 5.6999076638965835, "grad_norm": 0.0038272610399872065, "learning_rate": 2.8685483161874616e-06, "loss": 0.0001, "step": 21608 }, { "epoch": 5.7004352987732485, "grad_norm": 0.004053211305290461, "learning_rate": 2.868196605996659e-06, "loss": 0.0001, "step": 21610 }, { "epoch": 5.700962933649914, "grad_norm": 0.008215907029807568, "learning_rate": 2.867844895805856e-06, "loss": 0.0035, "step": 21612 }, { "epoch": 5.701490568526579, "grad_norm": 0.08136790990829468, "learning_rate": 2.8674931856150533e-06, "loss": 0.0003, "step": 21614 }, { "epoch": 5.702018203403245, "grad_norm": 0.005805574357509613, "learning_rate": 2.8671414754242503e-06, "loss": 0.0002, "step": 21616 }, { "epoch": 5.70254583827991, "grad_norm": 0.0119768176227808, "learning_rate": 2.8667897652334476e-06, "loss": 0.0007, "step": 21618 }, { "epoch": 5.703073473156576, "grad_norm": 0.0059617361985147, "learning_rate": 2.866438055042645e-06, "loss": 0.0001, "step": 21620 }, { "epoch": 5.703601108033241, "grad_norm": 0.7707934975624084, "learning_rate": 2.8660863448518424e-06, "loss": 0.0034, "step": 21622 }, { "epoch": 5.704128742909907, "grad_norm": 0.19956322014331818, "learning_rate": 2.8657346346610393e-06, "loss": 0.0002, "step": 21624 }, { "epoch": 5.704656377786572, "grad_norm": 0.019648799672722816, "learning_rate": 2.8653829244702367e-06, "loss": 0.0011, "step": 21626 }, { "epoch": 5.705184012663237, "grad_norm": 0.22713416814804077, "learning_rate": 2.8650312142794337e-06, "loss": 0.0006, "step": 21628 }, { "epoch": 5.705711647539903, "grad_norm": 0.16030964255332947, "learning_rate": 2.864679504088631e-06, "loss": 0.0006, "step": 21630 }, { "epoch": 5.7062392824165675, "grad_norm": 0.09330694377422333, "learning_rate": 2.8643277938978284e-06, "loss": 0.0022, "step": 21632 }, { "epoch": 5.706766917293233, "grad_norm": 0.00829894095659256, "learning_rate": 2.8639760837070254e-06, "loss": 0.0006, "step": 21634 }, { "epoch": 5.707294552169898, "grad_norm": 0.06716322153806686, "learning_rate": 2.8636243735162227e-06, "loss": 0.0011, "step": 21636 }, { "epoch": 5.707822187046563, "grad_norm": 0.010227765887975693, "learning_rate": 2.8632726633254197e-06, "loss": 0.0003, "step": 21638 }, { "epoch": 5.708349821923229, "grad_norm": 0.032599277794361115, "learning_rate": 2.862920953134617e-06, "loss": 0.0002, "step": 21640 }, { "epoch": 5.708877456799894, "grad_norm": 0.06147509440779686, "learning_rate": 2.8625692429438144e-06, "loss": 0.0003, "step": 21642 }, { "epoch": 5.70940509167656, "grad_norm": 0.016162659972906113, "learning_rate": 2.862217532753012e-06, "loss": 0.0002, "step": 21644 }, { "epoch": 5.709932726553225, "grad_norm": 0.00659138523042202, "learning_rate": 2.8618658225622088e-06, "loss": 0.0002, "step": 21646 }, { "epoch": 5.710460361429891, "grad_norm": 0.05114618316292763, "learning_rate": 2.861514112371406e-06, "loss": 0.0011, "step": 21648 }, { "epoch": 5.710987996306556, "grad_norm": 0.2358834445476532, "learning_rate": 2.861162402180603e-06, "loss": 0.0007, "step": 21650 }, { "epoch": 5.711515631183222, "grad_norm": 0.2071741223335266, "learning_rate": 2.8608106919898005e-06, "loss": 0.0014, "step": 21652 }, { "epoch": 5.7120432660598865, "grad_norm": 0.025359364226460457, "learning_rate": 2.860458981798998e-06, "loss": 0.0002, "step": 21654 }, { "epoch": 5.7125709009365515, "grad_norm": 0.011211828328669071, "learning_rate": 2.8601072716081952e-06, "loss": 0.0002, "step": 21656 }, { "epoch": 5.713098535813217, "grad_norm": 0.03832389414310455, "learning_rate": 2.859755561417392e-06, "loss": 0.008, "step": 21658 }, { "epoch": 5.713626170689882, "grad_norm": 0.057356588542461395, "learning_rate": 2.859403851226589e-06, "loss": 0.0003, "step": 21660 }, { "epoch": 5.714153805566548, "grad_norm": 0.0485294833779335, "learning_rate": 2.8590521410357865e-06, "loss": 0.0025, "step": 21662 }, { "epoch": 5.714681440443213, "grad_norm": 0.0075987158343195915, "learning_rate": 2.8587004308449835e-06, "loss": 0.0085, "step": 21664 }, { "epoch": 5.715209075319879, "grad_norm": 0.0867234468460083, "learning_rate": 2.8583487206541813e-06, "loss": 0.0002, "step": 21666 }, { "epoch": 5.715736710196544, "grad_norm": 0.04711690545082092, "learning_rate": 2.8579970104633782e-06, "loss": 0.0002, "step": 21668 }, { "epoch": 5.71626434507321, "grad_norm": 0.004365517757833004, "learning_rate": 2.8576453002725756e-06, "loss": 0.0001, "step": 21670 }, { "epoch": 5.716791979949875, "grad_norm": 0.005638845264911652, "learning_rate": 2.8572935900817725e-06, "loss": 0.0005, "step": 21672 }, { "epoch": 5.71731961482654, "grad_norm": 0.21796771883964539, "learning_rate": 2.85694187989097e-06, "loss": 0.0045, "step": 21674 }, { "epoch": 5.717847249703206, "grad_norm": 0.06466604024171829, "learning_rate": 2.856590169700167e-06, "loss": 0.0002, "step": 21676 }, { "epoch": 5.7183748845798705, "grad_norm": 0.0633675828576088, "learning_rate": 2.8562384595093647e-06, "loss": 0.0002, "step": 21678 }, { "epoch": 5.718902519456536, "grad_norm": 0.020404987037181854, "learning_rate": 2.8558867493185616e-06, "loss": 0.0002, "step": 21680 }, { "epoch": 5.719430154333201, "grad_norm": 0.003641392569988966, "learning_rate": 2.855535039127759e-06, "loss": 0.0001, "step": 21682 }, { "epoch": 5.719957789209866, "grad_norm": 0.2190903276205063, "learning_rate": 2.855183328936956e-06, "loss": 0.0009, "step": 21684 }, { "epoch": 5.720485424086532, "grad_norm": 0.03913670405745506, "learning_rate": 2.8548316187461533e-06, "loss": 0.0008, "step": 21686 }, { "epoch": 5.721013058963197, "grad_norm": 0.008114445954561234, "learning_rate": 2.8544799085553503e-06, "loss": 0.0001, "step": 21688 }, { "epoch": 5.721540693839863, "grad_norm": 0.09377703070640564, "learning_rate": 2.8541281983645477e-06, "loss": 0.0005, "step": 21690 }, { "epoch": 5.722068328716528, "grad_norm": 0.00473767751827836, "learning_rate": 2.853776488173745e-06, "loss": 0.0004, "step": 21692 }, { "epoch": 5.722595963593194, "grad_norm": 0.005180212203413248, "learning_rate": 2.853424777982942e-06, "loss": 0.0003, "step": 21694 }, { "epoch": 5.723123598469859, "grad_norm": 0.0023127456661313772, "learning_rate": 2.8530730677921394e-06, "loss": 0.0001, "step": 21696 }, { "epoch": 5.723651233346525, "grad_norm": 0.00652084406465292, "learning_rate": 2.8527213576013363e-06, "loss": 0.0002, "step": 21698 }, { "epoch": 5.7241788682231896, "grad_norm": 0.2878442704677582, "learning_rate": 2.8523696474105337e-06, "loss": 0.0098, "step": 21700 }, { "epoch": 5.7247065030998545, "grad_norm": 0.01573135145008564, "learning_rate": 2.852017937219731e-06, "loss": 0.0002, "step": 21702 }, { "epoch": 5.72523413797652, "grad_norm": 0.004167243838310242, "learning_rate": 2.8516662270289284e-06, "loss": 0.0001, "step": 21704 }, { "epoch": 5.725761772853185, "grad_norm": 0.010398443788290024, "learning_rate": 2.8513145168381254e-06, "loss": 0.0001, "step": 21706 }, { "epoch": 5.726289407729851, "grad_norm": 0.02808384969830513, "learning_rate": 2.8509628066473228e-06, "loss": 0.0018, "step": 21708 }, { "epoch": 5.726817042606516, "grad_norm": 0.005375288426876068, "learning_rate": 2.8506110964565197e-06, "loss": 0.0003, "step": 21710 }, { "epoch": 5.727344677483182, "grad_norm": 0.01710422709584236, "learning_rate": 2.850259386265717e-06, "loss": 0.0002, "step": 21712 }, { "epoch": 5.727872312359847, "grad_norm": 0.290901780128479, "learning_rate": 2.8499076760749145e-06, "loss": 0.0005, "step": 21714 }, { "epoch": 5.728399947236513, "grad_norm": 0.02776670642197132, "learning_rate": 2.849555965884112e-06, "loss": 0.0001, "step": 21716 }, { "epoch": 5.728927582113178, "grad_norm": 0.006704515311866999, "learning_rate": 2.849204255693309e-06, "loss": 0.0048, "step": 21718 }, { "epoch": 5.729455216989843, "grad_norm": 0.06877835839986801, "learning_rate": 2.8488525455025058e-06, "loss": 0.0004, "step": 21720 }, { "epoch": 5.729982851866509, "grad_norm": 0.007662803400307894, "learning_rate": 2.848500835311703e-06, "loss": 0.0001, "step": 21722 }, { "epoch": 5.7305104867431735, "grad_norm": 0.022718343883752823, "learning_rate": 2.8481491251209005e-06, "loss": 0.0002, "step": 21724 }, { "epoch": 5.731038121619839, "grad_norm": 0.009032237343490124, "learning_rate": 2.847797414930098e-06, "loss": 0.0001, "step": 21726 }, { "epoch": 5.731565756496504, "grad_norm": 0.003407146781682968, "learning_rate": 2.847445704739295e-06, "loss": 0.0001, "step": 21728 }, { "epoch": 5.732093391373169, "grad_norm": 0.00589369423687458, "learning_rate": 2.847093994548492e-06, "loss": 0.0001, "step": 21730 }, { "epoch": 5.732621026249835, "grad_norm": 0.07120798528194427, "learning_rate": 2.846742284357689e-06, "loss": 0.0003, "step": 21732 }, { "epoch": 5.7331486611265, "grad_norm": 0.0028591486625373363, "learning_rate": 2.8463905741668865e-06, "loss": 0.0001, "step": 21734 }, { "epoch": 5.733676296003166, "grad_norm": 0.006425521336495876, "learning_rate": 2.846038863976084e-06, "loss": 0.001, "step": 21736 }, { "epoch": 5.734203930879831, "grad_norm": 0.006439647171646357, "learning_rate": 2.8456871537852813e-06, "loss": 0.0016, "step": 21738 }, { "epoch": 5.734731565756497, "grad_norm": 0.013376748189330101, "learning_rate": 2.8453354435944782e-06, "loss": 0.0001, "step": 21740 }, { "epoch": 5.735259200633162, "grad_norm": 0.022478893399238586, "learning_rate": 2.8449837334036756e-06, "loss": 0.0001, "step": 21742 }, { "epoch": 5.735786835509828, "grad_norm": 0.16506333649158478, "learning_rate": 2.8446320232128726e-06, "loss": 0.0006, "step": 21744 }, { "epoch": 5.736314470386493, "grad_norm": 0.015955815091729164, "learning_rate": 2.84428031302207e-06, "loss": 0.0001, "step": 21746 }, { "epoch": 5.7368421052631575, "grad_norm": 0.03155853599309921, "learning_rate": 2.843928602831267e-06, "loss": 0.0002, "step": 21748 }, { "epoch": 5.737369740139823, "grad_norm": 0.011238635517656803, "learning_rate": 2.8435768926404643e-06, "loss": 0.0002, "step": 21750 }, { "epoch": 5.737897375016488, "grad_norm": 0.010095341131091118, "learning_rate": 2.8432251824496617e-06, "loss": 0.0003, "step": 21752 }, { "epoch": 5.738425009893154, "grad_norm": 0.09629720449447632, "learning_rate": 2.8428734722588586e-06, "loss": 0.0004, "step": 21754 }, { "epoch": 5.738952644769819, "grad_norm": 0.011885436251759529, "learning_rate": 2.842521762068056e-06, "loss": 0.0013, "step": 21756 }, { "epoch": 5.739480279646485, "grad_norm": 0.5632408261299133, "learning_rate": 2.842170051877253e-06, "loss": 0.0004, "step": 21758 }, { "epoch": 5.74000791452315, "grad_norm": 0.01088289637118578, "learning_rate": 2.8418183416864503e-06, "loss": 0.0001, "step": 21760 }, { "epoch": 5.740535549399816, "grad_norm": 0.02018820494413376, "learning_rate": 2.8414666314956477e-06, "loss": 0.0001, "step": 21762 }, { "epoch": 5.741063184276481, "grad_norm": 0.027334176003932953, "learning_rate": 2.841114921304845e-06, "loss": 0.0067, "step": 21764 }, { "epoch": 5.741590819153146, "grad_norm": 0.0043553500436246395, "learning_rate": 2.840763211114042e-06, "loss": 0.0004, "step": 21766 }, { "epoch": 5.742118454029812, "grad_norm": 0.002969711786136031, "learning_rate": 2.8404115009232394e-06, "loss": 0.0014, "step": 21768 }, { "epoch": 5.7426460889064765, "grad_norm": 0.6446977257728577, "learning_rate": 2.8400597907324363e-06, "loss": 0.0008, "step": 21770 }, { "epoch": 5.743173723783142, "grad_norm": 0.0038415868766605854, "learning_rate": 2.8397080805416337e-06, "loss": 0.0002, "step": 21772 }, { "epoch": 5.743701358659807, "grad_norm": 0.1437150090932846, "learning_rate": 2.839356370350831e-06, "loss": 0.0003, "step": 21774 }, { "epoch": 5.744228993536472, "grad_norm": 0.0034214386250823736, "learning_rate": 2.8390046601600285e-06, "loss": 0.0001, "step": 21776 }, { "epoch": 5.744756628413138, "grad_norm": 0.01117765810340643, "learning_rate": 2.8386529499692254e-06, "loss": 0.0001, "step": 21778 }, { "epoch": 5.745284263289803, "grad_norm": 0.02902257815003395, "learning_rate": 2.8383012397784224e-06, "loss": 0.0002, "step": 21780 }, { "epoch": 5.745811898166469, "grad_norm": 0.0010609956225380301, "learning_rate": 2.8379495295876197e-06, "loss": 0.0002, "step": 21782 }, { "epoch": 5.746339533043134, "grad_norm": 0.0013330456567928195, "learning_rate": 2.837597819396817e-06, "loss": 0.0001, "step": 21784 }, { "epoch": 5.7468671679198, "grad_norm": 0.0034464802592992783, "learning_rate": 2.8372461092060145e-06, "loss": 0.0001, "step": 21786 }, { "epoch": 5.747394802796465, "grad_norm": 0.08926410973072052, "learning_rate": 2.8368943990152115e-06, "loss": 0.0109, "step": 21788 }, { "epoch": 5.747922437673131, "grad_norm": 0.04694423824548721, "learning_rate": 2.836542688824409e-06, "loss": 0.0002, "step": 21790 }, { "epoch": 5.748450072549796, "grad_norm": 0.005577193107455969, "learning_rate": 2.8361909786336058e-06, "loss": 0.0001, "step": 21792 }, { "epoch": 5.7489777074264605, "grad_norm": 0.09213666617870331, "learning_rate": 2.835839268442803e-06, "loss": 0.0005, "step": 21794 }, { "epoch": 5.749505342303126, "grad_norm": 0.1628422886133194, "learning_rate": 2.8354875582520005e-06, "loss": 0.0006, "step": 21796 }, { "epoch": 5.750032977179791, "grad_norm": 0.09157269448041916, "learning_rate": 2.835135848061198e-06, "loss": 0.0002, "step": 21798 }, { "epoch": 5.750560612056457, "grad_norm": 0.019299108535051346, "learning_rate": 2.834784137870395e-06, "loss": 0.0016, "step": 21800 }, { "epoch": 5.751088246933122, "grad_norm": 0.013501368463039398, "learning_rate": 2.8344324276795922e-06, "loss": 0.0001, "step": 21802 }, { "epoch": 5.751615881809788, "grad_norm": 0.09823495149612427, "learning_rate": 2.834080717488789e-06, "loss": 0.0007, "step": 21804 }, { "epoch": 5.752143516686453, "grad_norm": 0.002538308734074235, "learning_rate": 2.8337290072979866e-06, "loss": 0.0001, "step": 21806 }, { "epoch": 5.752671151563119, "grad_norm": 0.0013988495338708162, "learning_rate": 2.833377297107184e-06, "loss": 0.0003, "step": 21808 }, { "epoch": 5.753198786439784, "grad_norm": 0.20931614935398102, "learning_rate": 2.8330255869163813e-06, "loss": 0.002, "step": 21810 }, { "epoch": 5.753726421316449, "grad_norm": 0.148895725607872, "learning_rate": 2.8326738767255783e-06, "loss": 0.0008, "step": 21812 }, { "epoch": 5.754254056193115, "grad_norm": 0.010763530619442463, "learning_rate": 2.8323221665347752e-06, "loss": 0.0001, "step": 21814 }, { "epoch": 5.7547816910697795, "grad_norm": 0.0021265011746436357, "learning_rate": 2.8319704563439726e-06, "loss": 0.0013, "step": 21816 }, { "epoch": 5.755309325946445, "grad_norm": 0.012457582168281078, "learning_rate": 2.8316187461531695e-06, "loss": 0.0001, "step": 21818 }, { "epoch": 5.75583696082311, "grad_norm": 0.12567543983459473, "learning_rate": 2.831267035962367e-06, "loss": 0.0002, "step": 21820 }, { "epoch": 5.756364595699775, "grad_norm": 0.012693657539784908, "learning_rate": 2.8309153257715643e-06, "loss": 0.0001, "step": 21822 }, { "epoch": 5.756892230576441, "grad_norm": 0.0019867660012096167, "learning_rate": 2.8305636155807617e-06, "loss": 0.0001, "step": 21824 }, { "epoch": 5.757419865453106, "grad_norm": 1.045509934425354, "learning_rate": 2.8302119053899586e-06, "loss": 0.0075, "step": 21826 }, { "epoch": 5.757947500329772, "grad_norm": 0.010792570188641548, "learning_rate": 2.829860195199156e-06, "loss": 0.0001, "step": 21828 }, { "epoch": 5.758475135206437, "grad_norm": 0.0006718814256601036, "learning_rate": 2.829508485008353e-06, "loss": 0.0001, "step": 21830 }, { "epoch": 5.759002770083103, "grad_norm": 0.026622306555509567, "learning_rate": 2.8291567748175503e-06, "loss": 0.0005, "step": 21832 }, { "epoch": 5.759530404959768, "grad_norm": 0.03326435014605522, "learning_rate": 2.8288050646267477e-06, "loss": 0.0085, "step": 21834 }, { "epoch": 5.760058039836434, "grad_norm": 0.0018164466600865126, "learning_rate": 2.828453354435945e-06, "loss": 0.0001, "step": 21836 }, { "epoch": 5.760585674713099, "grad_norm": 0.029414940625429153, "learning_rate": 2.828101644245142e-06, "loss": 0.0002, "step": 21838 }, { "epoch": 5.7611133095897635, "grad_norm": 0.6002362966537476, "learning_rate": 2.8277499340543394e-06, "loss": 0.0008, "step": 21840 }, { "epoch": 5.761640944466429, "grad_norm": 0.015500418841838837, "learning_rate": 2.8273982238635364e-06, "loss": 0.0002, "step": 21842 }, { "epoch": 5.762168579343094, "grad_norm": 0.010438297875225544, "learning_rate": 2.8270465136727337e-06, "loss": 0.004, "step": 21844 }, { "epoch": 5.76269621421976, "grad_norm": 0.003422815352678299, "learning_rate": 2.826694803481931e-06, "loss": 0.0001, "step": 21846 }, { "epoch": 5.763223849096425, "grad_norm": 0.07025785744190216, "learning_rate": 2.826343093291128e-06, "loss": 0.0004, "step": 21848 }, { "epoch": 5.763751483973091, "grad_norm": 0.03707880899310112, "learning_rate": 2.8259913831003254e-06, "loss": 0.0002, "step": 21850 }, { "epoch": 5.764279118849756, "grad_norm": 0.07867974787950516, "learning_rate": 2.8256396729095224e-06, "loss": 0.0017, "step": 21852 }, { "epoch": 5.764806753726421, "grad_norm": 0.0071899015456438065, "learning_rate": 2.8252879627187198e-06, "loss": 0.003, "step": 21854 }, { "epoch": 5.765334388603087, "grad_norm": 0.0013682425487786531, "learning_rate": 2.824936252527917e-06, "loss": 0.0001, "step": 21856 }, { "epoch": 5.765862023479752, "grad_norm": 0.007223512977361679, "learning_rate": 2.8245845423371145e-06, "loss": 0.0003, "step": 21858 }, { "epoch": 5.766389658356418, "grad_norm": 0.011572149582207203, "learning_rate": 2.8242328321463115e-06, "loss": 0.0014, "step": 21860 }, { "epoch": 5.7669172932330826, "grad_norm": 0.4698497951030731, "learning_rate": 2.823881121955509e-06, "loss": 0.0015, "step": 21862 }, { "epoch": 5.767444928109748, "grad_norm": 0.32304847240448, "learning_rate": 2.823529411764706e-06, "loss": 0.0005, "step": 21864 }, { "epoch": 5.767972562986413, "grad_norm": 0.002648751251399517, "learning_rate": 2.823177701573903e-06, "loss": 0.0001, "step": 21866 }, { "epoch": 5.768500197863078, "grad_norm": 0.0019512652652338147, "learning_rate": 2.8228259913831006e-06, "loss": 0.0001, "step": 21868 }, { "epoch": 5.769027832739744, "grad_norm": 0.009611689485609531, "learning_rate": 2.822474281192298e-06, "loss": 0.0032, "step": 21870 }, { "epoch": 5.769555467616409, "grad_norm": 0.01458941213786602, "learning_rate": 2.822122571001495e-06, "loss": 0.0002, "step": 21872 }, { "epoch": 5.770083102493075, "grad_norm": 0.07018227875232697, "learning_rate": 2.821770860810692e-06, "loss": 0.009, "step": 21874 }, { "epoch": 5.77061073736974, "grad_norm": 0.001445795176550746, "learning_rate": 2.8214191506198892e-06, "loss": 0.0018, "step": 21876 }, { "epoch": 5.771138372246406, "grad_norm": 0.018781643360853195, "learning_rate": 2.821067440429086e-06, "loss": 0.0003, "step": 21878 }, { "epoch": 5.771666007123071, "grad_norm": 0.02805875428020954, "learning_rate": 2.820715730238284e-06, "loss": 0.0002, "step": 21880 }, { "epoch": 5.772193641999737, "grad_norm": 1.5159341096878052, "learning_rate": 2.820364020047481e-06, "loss": 0.0067, "step": 21882 }, { "epoch": 5.772721276876402, "grad_norm": 0.013738210313022137, "learning_rate": 2.8200123098566783e-06, "loss": 0.0002, "step": 21884 }, { "epoch": 5.7732489117530665, "grad_norm": 0.01734185218811035, "learning_rate": 2.8196605996658752e-06, "loss": 0.0002, "step": 21886 }, { "epoch": 5.773776546629732, "grad_norm": 0.2860662639141083, "learning_rate": 2.8193088894750726e-06, "loss": 0.0009, "step": 21888 }, { "epoch": 5.774304181506397, "grad_norm": 0.02962561324238777, "learning_rate": 2.8189571792842696e-06, "loss": 0.0002, "step": 21890 }, { "epoch": 5.774831816383063, "grad_norm": 0.008628643117845058, "learning_rate": 2.8186054690934674e-06, "loss": 0.0003, "step": 21892 }, { "epoch": 5.775359451259728, "grad_norm": 0.4100892245769501, "learning_rate": 2.8182537589026643e-06, "loss": 0.0011, "step": 21894 }, { "epoch": 5.775887086136393, "grad_norm": 0.13480283319950104, "learning_rate": 2.8179020487118617e-06, "loss": 0.001, "step": 21896 }, { "epoch": 5.776414721013059, "grad_norm": 0.3547387719154358, "learning_rate": 2.8175503385210587e-06, "loss": 0.0006, "step": 21898 }, { "epoch": 5.776942355889724, "grad_norm": 0.030676016584038734, "learning_rate": 2.817198628330256e-06, "loss": 0.0002, "step": 21900 }, { "epoch": 5.77746999076639, "grad_norm": 0.10410359501838684, "learning_rate": 2.816846918139453e-06, "loss": 0.0003, "step": 21902 }, { "epoch": 5.777997625643055, "grad_norm": 0.4577944874763489, "learning_rate": 2.8164952079486504e-06, "loss": 0.0029, "step": 21904 }, { "epoch": 5.778525260519721, "grad_norm": 0.7214722037315369, "learning_rate": 2.8161434977578477e-06, "loss": 0.0014, "step": 21906 }, { "epoch": 5.779052895396386, "grad_norm": 0.2417197972536087, "learning_rate": 2.8157917875670447e-06, "loss": 0.0005, "step": 21908 }, { "epoch": 5.779580530273051, "grad_norm": 0.0027711859438568354, "learning_rate": 2.815440077376242e-06, "loss": 0.0065, "step": 21910 }, { "epoch": 5.780108165149716, "grad_norm": 0.06295754760503769, "learning_rate": 2.815088367185439e-06, "loss": 0.0003, "step": 21912 }, { "epoch": 5.780635800026381, "grad_norm": 0.0234343521296978, "learning_rate": 2.8147366569946364e-06, "loss": 0.0002, "step": 21914 }, { "epoch": 5.781163434903047, "grad_norm": 0.35418611764907837, "learning_rate": 2.8143849468038338e-06, "loss": 0.0009, "step": 21916 }, { "epoch": 5.781691069779712, "grad_norm": 0.00818735919892788, "learning_rate": 2.814033236613031e-06, "loss": 0.0129, "step": 21918 }, { "epoch": 5.782218704656378, "grad_norm": 0.014662651345133781, "learning_rate": 2.813681526422228e-06, "loss": 0.0042, "step": 21920 }, { "epoch": 5.782746339533043, "grad_norm": 0.005089078098535538, "learning_rate": 2.8133298162314255e-06, "loss": 0.0001, "step": 21922 }, { "epoch": 5.783273974409709, "grad_norm": 0.004949694965034723, "learning_rate": 2.8129781060406224e-06, "loss": 0.0008, "step": 21924 }, { "epoch": 5.783801609286374, "grad_norm": 0.026665475219488144, "learning_rate": 2.81262639584982e-06, "loss": 0.0001, "step": 21926 }, { "epoch": 5.78432924416304, "grad_norm": 0.01058221235871315, "learning_rate": 2.812274685659017e-06, "loss": 0.0001, "step": 21928 }, { "epoch": 5.784856879039705, "grad_norm": 0.047643646597862244, "learning_rate": 2.8119229754682145e-06, "loss": 0.0002, "step": 21930 }, { "epoch": 5.7853845139163695, "grad_norm": 0.08494799584150314, "learning_rate": 2.8115712652774115e-06, "loss": 0.0003, "step": 21932 }, { "epoch": 5.785912148793035, "grad_norm": 0.11063171178102493, "learning_rate": 2.8112195550866085e-06, "loss": 0.0025, "step": 21934 }, { "epoch": 5.7864397836697, "grad_norm": 0.01961921527981758, "learning_rate": 2.810867844895806e-06, "loss": 0.0045, "step": 21936 }, { "epoch": 5.786967418546366, "grad_norm": 0.01679936610162258, "learning_rate": 2.810516134705003e-06, "loss": 0.0002, "step": 21938 }, { "epoch": 5.787495053423031, "grad_norm": 0.014029646292328835, "learning_rate": 2.8101644245142006e-06, "loss": 0.0009, "step": 21940 }, { "epoch": 5.788022688299696, "grad_norm": 0.45854219794273376, "learning_rate": 2.8098127143233975e-06, "loss": 0.0006, "step": 21942 }, { "epoch": 5.788550323176362, "grad_norm": 0.22971071302890778, "learning_rate": 2.809461004132595e-06, "loss": 0.0064, "step": 21944 }, { "epoch": 5.789077958053027, "grad_norm": 0.34929555654525757, "learning_rate": 2.809109293941792e-06, "loss": 0.0046, "step": 21946 }, { "epoch": 5.789605592929693, "grad_norm": 0.060287006199359894, "learning_rate": 2.8087575837509892e-06, "loss": 0.0004, "step": 21948 }, { "epoch": 5.790133227806358, "grad_norm": 0.0073357028886675835, "learning_rate": 2.8084058735601866e-06, "loss": 0.0001, "step": 21950 }, { "epoch": 5.790660862683024, "grad_norm": 0.00411994056776166, "learning_rate": 2.808054163369384e-06, "loss": 0.0002, "step": 21952 }, { "epoch": 5.791188497559689, "grad_norm": 0.14850285649299622, "learning_rate": 2.807702453178581e-06, "loss": 0.0017, "step": 21954 }, { "epoch": 5.791716132436354, "grad_norm": 0.2513355612754822, "learning_rate": 2.8073507429877783e-06, "loss": 0.0007, "step": 21956 }, { "epoch": 5.792243767313019, "grad_norm": 0.007106957025825977, "learning_rate": 2.8069990327969753e-06, "loss": 0.0002, "step": 21958 }, { "epoch": 5.792771402189684, "grad_norm": 0.0075149741023778915, "learning_rate": 2.8066473226061726e-06, "loss": 0.0002, "step": 21960 }, { "epoch": 5.79329903706635, "grad_norm": 0.06874799728393555, "learning_rate": 2.8062956124153696e-06, "loss": 0.0002, "step": 21962 }, { "epoch": 5.793826671943015, "grad_norm": 0.045581258833408356, "learning_rate": 2.805943902224567e-06, "loss": 0.0002, "step": 21964 }, { "epoch": 5.794354306819681, "grad_norm": 0.18412023782730103, "learning_rate": 2.8055921920337643e-06, "loss": 0.001, "step": 21966 }, { "epoch": 5.794881941696346, "grad_norm": 0.04647982493042946, "learning_rate": 2.8052404818429613e-06, "loss": 0.0002, "step": 21968 }, { "epoch": 5.795409576573012, "grad_norm": 0.36182311177253723, "learning_rate": 2.8048887716521587e-06, "loss": 0.0019, "step": 21970 }, { "epoch": 5.795937211449677, "grad_norm": 0.0027778903022408485, "learning_rate": 2.8045370614613556e-06, "loss": 0.0018, "step": 21972 }, { "epoch": 5.796464846326343, "grad_norm": 0.8947866559028625, "learning_rate": 2.804185351270553e-06, "loss": 0.0047, "step": 21974 }, { "epoch": 5.796992481203008, "grad_norm": 0.3617337942123413, "learning_rate": 2.8038336410797504e-06, "loss": 0.0007, "step": 21976 }, { "epoch": 5.7975201160796725, "grad_norm": 0.011088536120951176, "learning_rate": 2.8034819308889478e-06, "loss": 0.0001, "step": 21978 }, { "epoch": 5.798047750956338, "grad_norm": 0.0341617576777935, "learning_rate": 2.8031302206981447e-06, "loss": 0.0004, "step": 21980 }, { "epoch": 5.798575385833003, "grad_norm": 0.05169392004609108, "learning_rate": 2.802778510507342e-06, "loss": 0.0003, "step": 21982 }, { "epoch": 5.799103020709669, "grad_norm": 0.005639202427119017, "learning_rate": 2.802426800316539e-06, "loss": 0.0001, "step": 21984 }, { "epoch": 5.799630655586334, "grad_norm": 0.04634575918316841, "learning_rate": 2.8020750901257364e-06, "loss": 0.0013, "step": 21986 }, { "epoch": 5.800158290462999, "grad_norm": 0.16105106472969055, "learning_rate": 2.801723379934934e-06, "loss": 0.0003, "step": 21988 }, { "epoch": 5.800685925339665, "grad_norm": 0.0632176399230957, "learning_rate": 2.801371669744131e-06, "loss": 0.0002, "step": 21990 }, { "epoch": 5.80121356021633, "grad_norm": 0.011179274879395962, "learning_rate": 2.801019959553328e-06, "loss": 0.0001, "step": 21992 }, { "epoch": 5.801741195092996, "grad_norm": 0.00774705084040761, "learning_rate": 2.800668249362525e-06, "loss": 0.0001, "step": 21994 }, { "epoch": 5.802268829969661, "grad_norm": 0.008354579098522663, "learning_rate": 2.8003165391717224e-06, "loss": 0.002, "step": 21996 }, { "epoch": 5.802796464846327, "grad_norm": 0.12138457596302032, "learning_rate": 2.79996482898092e-06, "loss": 0.0003, "step": 21998 }, { "epoch": 5.803324099722992, "grad_norm": 0.013789957389235497, "learning_rate": 2.799613118790117e-06, "loss": 0.0002, "step": 22000 }, { "epoch": 5.803324099722992, "eval_loss": 0.0019572784658521414, "eval_runtime": 303.668, "eval_samples_per_second": 710.121, "eval_steps_per_second": 88.768, "step": 22000 }, { "epoch": 5.803851734599657, "grad_norm": 0.6469155550003052, "learning_rate": 2.799261408599314e-06, "loss": 0.0015, "step": 22002 }, { "epoch": 5.804379369476322, "grad_norm": 0.0024800412356853485, "learning_rate": 2.7989096984085115e-06, "loss": 0.0001, "step": 22004 }, { "epoch": 5.804907004352987, "grad_norm": 0.016744500026106834, "learning_rate": 2.7985579882177085e-06, "loss": 0.0001, "step": 22006 }, { "epoch": 5.805434639229653, "grad_norm": 0.015376617200672626, "learning_rate": 2.798206278026906e-06, "loss": 0.0002, "step": 22008 }, { "epoch": 5.805962274106318, "grad_norm": 0.006947014480829239, "learning_rate": 2.7978545678361032e-06, "loss": 0.0001, "step": 22010 }, { "epoch": 5.806489908982984, "grad_norm": 0.8857398629188538, "learning_rate": 2.7975028576453006e-06, "loss": 0.0011, "step": 22012 }, { "epoch": 5.807017543859649, "grad_norm": 0.21837005019187927, "learning_rate": 2.7971511474544976e-06, "loss": 0.0013, "step": 22014 }, { "epoch": 5.807545178736315, "grad_norm": 0.005695729982107878, "learning_rate": 2.796799437263695e-06, "loss": 0.0001, "step": 22016 }, { "epoch": 5.80807281361298, "grad_norm": 0.005199280567467213, "learning_rate": 2.796447727072892e-06, "loss": 0.0002, "step": 22018 }, { "epoch": 5.808600448489646, "grad_norm": 0.004650027956813574, "learning_rate": 2.7960960168820893e-06, "loss": 0.0001, "step": 22020 }, { "epoch": 5.809128083366311, "grad_norm": 0.03144872188568115, "learning_rate": 2.7957443066912866e-06, "loss": 0.0001, "step": 22022 }, { "epoch": 5.8096557182429756, "grad_norm": 0.0073876152746379375, "learning_rate": 2.7953925965004836e-06, "loss": 0.0001, "step": 22024 }, { "epoch": 5.810183353119641, "grad_norm": 0.003539271419867873, "learning_rate": 2.795040886309681e-06, "loss": 0.0002, "step": 22026 }, { "epoch": 5.810710987996306, "grad_norm": 0.0109795443713665, "learning_rate": 2.794689176118878e-06, "loss": 0.0001, "step": 22028 }, { "epoch": 5.811238622872972, "grad_norm": 0.4006474018096924, "learning_rate": 2.7943374659280753e-06, "loss": 0.0067, "step": 22030 }, { "epoch": 5.811766257749637, "grad_norm": 0.01850126124918461, "learning_rate": 2.7939857557372722e-06, "loss": 0.0002, "step": 22032 }, { "epoch": 5.812293892626302, "grad_norm": 0.040033407509326935, "learning_rate": 2.79363404554647e-06, "loss": 0.0017, "step": 22034 }, { "epoch": 5.812821527502968, "grad_norm": 0.0024422816932201385, "learning_rate": 2.793282335355667e-06, "loss": 0.0001, "step": 22036 }, { "epoch": 5.813349162379633, "grad_norm": 0.007701881695538759, "learning_rate": 2.7929306251648644e-06, "loss": 0.0001, "step": 22038 }, { "epoch": 5.813876797256299, "grad_norm": 0.0027989193331450224, "learning_rate": 2.7925789149740613e-06, "loss": 0.0001, "step": 22040 }, { "epoch": 5.814404432132964, "grad_norm": 0.0024863819126039743, "learning_rate": 2.7922272047832587e-06, "loss": 0.0003, "step": 22042 }, { "epoch": 5.81493206700963, "grad_norm": 0.0040742759592831135, "learning_rate": 2.7918754945924557e-06, "loss": 0.0013, "step": 22044 }, { "epoch": 5.815459701886295, "grad_norm": 0.14755357801914215, "learning_rate": 2.791523784401653e-06, "loss": 0.0132, "step": 22046 }, { "epoch": 5.81598733676296, "grad_norm": 0.0018007726175710559, "learning_rate": 2.7911720742108504e-06, "loss": 0.0001, "step": 22048 }, { "epoch": 5.816514971639625, "grad_norm": 0.6003415584564209, "learning_rate": 2.7908203640200478e-06, "loss": 0.0027, "step": 22050 }, { "epoch": 5.81704260651629, "grad_norm": 0.04109002277255058, "learning_rate": 2.7904686538292447e-06, "loss": 0.0002, "step": 22052 }, { "epoch": 5.817570241392956, "grad_norm": 0.016922449693083763, "learning_rate": 2.7901169436384417e-06, "loss": 0.0001, "step": 22054 }, { "epoch": 5.818097876269621, "grad_norm": 0.18256087601184845, "learning_rate": 2.789765233447639e-06, "loss": 0.0003, "step": 22056 }, { "epoch": 5.818625511146287, "grad_norm": 0.004293479025363922, "learning_rate": 2.7894135232568364e-06, "loss": 0.0014, "step": 22058 }, { "epoch": 5.819153146022952, "grad_norm": 0.2673634886741638, "learning_rate": 2.789061813066034e-06, "loss": 0.0035, "step": 22060 }, { "epoch": 5.819680780899618, "grad_norm": 0.0021158719900995493, "learning_rate": 2.7887101028752308e-06, "loss": 0.0002, "step": 22062 }, { "epoch": 5.820208415776283, "grad_norm": 0.12095954269170761, "learning_rate": 2.788358392684428e-06, "loss": 0.0045, "step": 22064 }, { "epoch": 5.820736050652949, "grad_norm": 0.003673372557386756, "learning_rate": 2.788006682493625e-06, "loss": 0.0001, "step": 22066 }, { "epoch": 5.821263685529614, "grad_norm": 0.5070303082466125, "learning_rate": 2.7876549723028225e-06, "loss": 0.0101, "step": 22068 }, { "epoch": 5.821791320406279, "grad_norm": 0.02776186726987362, "learning_rate": 2.78730326211202e-06, "loss": 0.0001, "step": 22070 }, { "epoch": 5.822318955282944, "grad_norm": 0.2998844385147095, "learning_rate": 2.7869515519212172e-06, "loss": 0.0051, "step": 22072 }, { "epoch": 5.822846590159609, "grad_norm": 0.09204403311014175, "learning_rate": 2.786599841730414e-06, "loss": 0.0003, "step": 22074 }, { "epoch": 5.823374225036275, "grad_norm": 0.00957579631358385, "learning_rate": 2.7862481315396116e-06, "loss": 0.0002, "step": 22076 }, { "epoch": 5.82390185991294, "grad_norm": 0.009307418949902058, "learning_rate": 2.7858964213488085e-06, "loss": 0.0001, "step": 22078 }, { "epoch": 5.824429494789605, "grad_norm": 0.019775981083512306, "learning_rate": 2.785544711158006e-06, "loss": 0.0004, "step": 22080 }, { "epoch": 5.824957129666271, "grad_norm": 0.010017390362918377, "learning_rate": 2.7851930009672033e-06, "loss": 0.0001, "step": 22082 }, { "epoch": 5.825484764542936, "grad_norm": 0.016767848283052444, "learning_rate": 2.7848412907764e-06, "loss": 0.0002, "step": 22084 }, { "epoch": 5.826012399419602, "grad_norm": 0.006164263002574444, "learning_rate": 2.7844895805855976e-06, "loss": 0.0001, "step": 22086 }, { "epoch": 5.826540034296267, "grad_norm": 0.017699982970952988, "learning_rate": 2.7841378703947945e-06, "loss": 0.0001, "step": 22088 }, { "epoch": 5.827067669172933, "grad_norm": 0.04280341789126396, "learning_rate": 2.783786160203992e-06, "loss": 0.0004, "step": 22090 }, { "epoch": 5.827595304049598, "grad_norm": 0.010634133592247963, "learning_rate": 2.7834344500131893e-06, "loss": 0.0002, "step": 22092 }, { "epoch": 5.828122938926263, "grad_norm": 0.005487373564392328, "learning_rate": 2.7830827398223867e-06, "loss": 0.0009, "step": 22094 }, { "epoch": 5.828650573802928, "grad_norm": 0.3164307773113251, "learning_rate": 2.7827310296315836e-06, "loss": 0.0087, "step": 22096 }, { "epoch": 5.829178208679593, "grad_norm": 0.035672370344400406, "learning_rate": 2.782379319440781e-06, "loss": 0.0002, "step": 22098 }, { "epoch": 5.829705843556259, "grad_norm": 0.014913303777575493, "learning_rate": 2.782027609249978e-06, "loss": 0.0006, "step": 22100 }, { "epoch": 5.830233478432924, "grad_norm": 0.002870972268283367, "learning_rate": 2.7816758990591753e-06, "loss": 0.0002, "step": 22102 }, { "epoch": 5.83076111330959, "grad_norm": 0.055007144808769226, "learning_rate": 2.7813241888683723e-06, "loss": 0.0004, "step": 22104 }, { "epoch": 5.831288748186255, "grad_norm": 0.01948542147874832, "learning_rate": 2.78097247867757e-06, "loss": 0.0006, "step": 22106 }, { "epoch": 5.831816383062921, "grad_norm": 0.005729961674660444, "learning_rate": 2.780620768486767e-06, "loss": 0.0018, "step": 22108 }, { "epoch": 5.832344017939586, "grad_norm": 0.005655678454786539, "learning_rate": 2.7802690582959644e-06, "loss": 0.0005, "step": 22110 }, { "epoch": 5.832871652816252, "grad_norm": 0.007840299047529697, "learning_rate": 2.7799173481051614e-06, "loss": 0.0002, "step": 22112 }, { "epoch": 5.833399287692917, "grad_norm": 0.003033491550013423, "learning_rate": 2.7795656379143587e-06, "loss": 0.0044, "step": 22114 }, { "epoch": 5.833926922569582, "grad_norm": NaN, "learning_rate": 2.7792139277235557e-06, "loss": 0.0014, "step": 22116 }, { "epoch": 5.834454557446247, "grad_norm": 0.22639000415802002, "learning_rate": 2.7790380726281546e-06, "loss": 0.0006, "step": 22118 }, { "epoch": 5.834982192322912, "grad_norm": 0.11494427174329758, "learning_rate": 2.7786863624373515e-06, "loss": 0.0005, "step": 22120 }, { "epoch": 5.835509827199578, "grad_norm": 0.27805855870246887, "learning_rate": 2.778334652246549e-06, "loss": 0.0034, "step": 22122 }, { "epoch": 5.836037462076243, "grad_norm": 1.5541911125183105, "learning_rate": 2.7779829420557463e-06, "loss": 0.0015, "step": 22124 }, { "epoch": 5.836565096952908, "grad_norm": 0.1940673291683197, "learning_rate": 2.7776312318649437e-06, "loss": 0.0007, "step": 22126 }, { "epoch": 5.837092731829574, "grad_norm": 0.12883105874061584, "learning_rate": 2.7772795216741406e-06, "loss": 0.005, "step": 22128 }, { "epoch": 5.837620366706239, "grad_norm": 0.01215376053005457, "learning_rate": 2.776927811483338e-06, "loss": 0.001, "step": 22130 }, { "epoch": 5.838148001582905, "grad_norm": 0.22714762389659882, "learning_rate": 2.776576101292535e-06, "loss": 0.0018, "step": 22132 }, { "epoch": 5.83867563645957, "grad_norm": 0.010484127327799797, "learning_rate": 2.776224391101732e-06, "loss": 0.0001, "step": 22134 }, { "epoch": 5.839203271336236, "grad_norm": 0.012404104694724083, "learning_rate": 2.7758726809109297e-06, "loss": 0.0003, "step": 22136 }, { "epoch": 5.839730906212901, "grad_norm": 0.020791105926036835, "learning_rate": 2.7755209707201266e-06, "loss": 0.0004, "step": 22138 }, { "epoch": 5.840258541089566, "grad_norm": 0.045307666063308716, "learning_rate": 2.775169260529324e-06, "loss": 0.0001, "step": 22140 }, { "epoch": 5.840786175966231, "grad_norm": 0.02006019838154316, "learning_rate": 2.774817550338521e-06, "loss": 0.0003, "step": 22142 }, { "epoch": 5.841313810842896, "grad_norm": 0.0030745286494493484, "learning_rate": 2.7744658401477183e-06, "loss": 0.0002, "step": 22144 }, { "epoch": 5.841841445719562, "grad_norm": 0.16927918791770935, "learning_rate": 2.7741141299569153e-06, "loss": 0.0029, "step": 22146 }, { "epoch": 5.842369080596227, "grad_norm": 0.002218939596787095, "learning_rate": 2.7737624197661127e-06, "loss": 0.0001, "step": 22148 }, { "epoch": 5.842896715472893, "grad_norm": 0.01073724776506424, "learning_rate": 2.77341070957531e-06, "loss": 0.0001, "step": 22150 }, { "epoch": 5.843424350349558, "grad_norm": 0.03753441199660301, "learning_rate": 2.7730589993845074e-06, "loss": 0.0006, "step": 22152 }, { "epoch": 5.843951985226224, "grad_norm": 0.020855462178587914, "learning_rate": 2.7727072891937044e-06, "loss": 0.0007, "step": 22154 }, { "epoch": 5.844479620102889, "grad_norm": 0.011601760983467102, "learning_rate": 2.7723555790029018e-06, "loss": 0.0002, "step": 22156 }, { "epoch": 5.845007254979555, "grad_norm": 0.015945864841341972, "learning_rate": 2.7720038688120987e-06, "loss": 0.0002, "step": 22158 }, { "epoch": 5.84553488985622, "grad_norm": 0.0036104111932218075, "learning_rate": 2.771652158621296e-06, "loss": 0.0001, "step": 22160 }, { "epoch": 5.846062524732885, "grad_norm": 0.002132692839950323, "learning_rate": 2.7713004484304935e-06, "loss": 0.0011, "step": 22162 }, { "epoch": 5.84659015960955, "grad_norm": 0.003579716430976987, "learning_rate": 2.7709487382396904e-06, "loss": 0.0002, "step": 22164 }, { "epoch": 5.847117794486215, "grad_norm": 0.02026011049747467, "learning_rate": 2.770597028048888e-06, "loss": 0.0001, "step": 22166 }, { "epoch": 5.847645429362881, "grad_norm": 0.451944500207901, "learning_rate": 2.7702453178580847e-06, "loss": 0.0019, "step": 22168 }, { "epoch": 5.848173064239546, "grad_norm": 0.03793933987617493, "learning_rate": 2.769893607667282e-06, "loss": 0.0002, "step": 22170 }, { "epoch": 5.848700699116211, "grad_norm": 0.0035293379332870245, "learning_rate": 2.7695418974764795e-06, "loss": 0.0002, "step": 22172 }, { "epoch": 5.849228333992877, "grad_norm": 0.9402475357055664, "learning_rate": 2.769190187285677e-06, "loss": 0.0031, "step": 22174 }, { "epoch": 5.849755968869542, "grad_norm": 0.12820635735988617, "learning_rate": 2.768838477094874e-06, "loss": 0.0046, "step": 22176 }, { "epoch": 5.850283603746208, "grad_norm": 0.009521543979644775, "learning_rate": 2.768486766904071e-06, "loss": 0.0004, "step": 22178 }, { "epoch": 5.850811238622873, "grad_norm": 0.06175326928496361, "learning_rate": 2.768135056713268e-06, "loss": 0.0044, "step": 22180 }, { "epoch": 5.851338873499539, "grad_norm": 0.09726986289024353, "learning_rate": 2.7677833465224655e-06, "loss": 0.0005, "step": 22182 }, { "epoch": 5.851866508376204, "grad_norm": 0.008549961261451244, "learning_rate": 2.767431636331663e-06, "loss": 0.0001, "step": 22184 }, { "epoch": 5.852394143252869, "grad_norm": 0.5832505226135254, "learning_rate": 2.7670799261408603e-06, "loss": 0.0013, "step": 22186 }, { "epoch": 5.852921778129534, "grad_norm": 0.0063459910452365875, "learning_rate": 2.7667282159500572e-06, "loss": 0.0009, "step": 22188 }, { "epoch": 5.853449413006199, "grad_norm": 0.034241821616888046, "learning_rate": 2.7663765057592546e-06, "loss": 0.0002, "step": 22190 }, { "epoch": 5.853977047882865, "grad_norm": 0.01236072089523077, "learning_rate": 2.7660247955684516e-06, "loss": 0.0002, "step": 22192 }, { "epoch": 5.85450468275953, "grad_norm": 0.16003237664699554, "learning_rate": 2.765673085377649e-06, "loss": 0.0003, "step": 22194 }, { "epoch": 5.855032317636196, "grad_norm": 0.08279886096715927, "learning_rate": 2.7653213751868463e-06, "loss": 0.0005, "step": 22196 }, { "epoch": 5.855559952512861, "grad_norm": 0.17017009854316711, "learning_rate": 2.7649696649960433e-06, "loss": 0.0051, "step": 22198 }, { "epoch": 5.856087587389526, "grad_norm": 0.022811830043792725, "learning_rate": 2.7646179548052406e-06, "loss": 0.0001, "step": 22200 }, { "epoch": 5.856615222266192, "grad_norm": 0.6116649508476257, "learning_rate": 2.7642662446144376e-06, "loss": 0.0039, "step": 22202 }, { "epoch": 5.857142857142857, "grad_norm": 0.005964590236544609, "learning_rate": 2.763914534423635e-06, "loss": 0.0002, "step": 22204 }, { "epoch": 5.857670492019523, "grad_norm": 0.33215829730033875, "learning_rate": 2.763562824232832e-06, "loss": 0.0005, "step": 22206 }, { "epoch": 5.858198126896188, "grad_norm": 0.010524993762373924, "learning_rate": 2.7632111140420297e-06, "loss": 0.0002, "step": 22208 }, { "epoch": 5.858725761772853, "grad_norm": 0.00774833420291543, "learning_rate": 2.7628594038512267e-06, "loss": 0.0002, "step": 22210 }, { "epoch": 5.859253396649518, "grad_norm": 0.008883205242455006, "learning_rate": 2.762507693660424e-06, "loss": 0.0002, "step": 22212 }, { "epoch": 5.859781031526184, "grad_norm": 0.14239951968193054, "learning_rate": 2.762155983469621e-06, "loss": 0.0006, "step": 22214 }, { "epoch": 5.860308666402849, "grad_norm": 0.05916015803813934, "learning_rate": 2.7618042732788184e-06, "loss": 0.0002, "step": 22216 }, { "epoch": 5.860836301279514, "grad_norm": 0.010856647044420242, "learning_rate": 2.7614525630880153e-06, "loss": 0.0004, "step": 22218 }, { "epoch": 5.86136393615618, "grad_norm": 0.086087666451931, "learning_rate": 2.761100852897213e-06, "loss": 0.0011, "step": 22220 }, { "epoch": 5.861891571032845, "grad_norm": 0.08576309680938721, "learning_rate": 2.76074914270641e-06, "loss": 0.0004, "step": 22222 }, { "epoch": 5.862419205909511, "grad_norm": 0.0024026173632591963, "learning_rate": 2.760397432515607e-06, "loss": 0.0048, "step": 22224 }, { "epoch": 5.862946840786176, "grad_norm": 0.006311262492090464, "learning_rate": 2.7600457223248044e-06, "loss": 0.0003, "step": 22226 }, { "epoch": 5.863474475662842, "grad_norm": 0.004853322170674801, "learning_rate": 2.7596940121340014e-06, "loss": 0.0009, "step": 22228 }, { "epoch": 5.864002110539507, "grad_norm": 0.28407132625579834, "learning_rate": 2.7593423019431987e-06, "loss": 0.0026, "step": 22230 }, { "epoch": 5.8645297454161724, "grad_norm": 0.008415859192609787, "learning_rate": 2.758990591752396e-06, "loss": 0.0036, "step": 22232 }, { "epoch": 5.865057380292837, "grad_norm": 0.008925553411245346, "learning_rate": 2.7586388815615935e-06, "loss": 0.0004, "step": 22234 }, { "epoch": 5.865585015169502, "grad_norm": 0.9368447661399841, "learning_rate": 2.7582871713707904e-06, "loss": 0.003, "step": 22236 }, { "epoch": 5.866112650046168, "grad_norm": 0.026741288602352142, "learning_rate": 2.757935461179988e-06, "loss": 0.0002, "step": 22238 }, { "epoch": 5.866640284922833, "grad_norm": 0.019671471789479256, "learning_rate": 2.7575837509891848e-06, "loss": 0.0002, "step": 22240 }, { "epoch": 5.867167919799499, "grad_norm": 0.6313822269439697, "learning_rate": 2.757232040798382e-06, "loss": 0.0012, "step": 22242 }, { "epoch": 5.867695554676164, "grad_norm": 0.013806968927383423, "learning_rate": 2.7568803306075795e-06, "loss": 0.0001, "step": 22244 }, { "epoch": 5.868223189552829, "grad_norm": 0.006011425983160734, "learning_rate": 2.756528620416777e-06, "loss": 0.0001, "step": 22246 }, { "epoch": 5.868750824429495, "grad_norm": 0.005305607803165913, "learning_rate": 2.756176910225974e-06, "loss": 0.0001, "step": 22248 }, { "epoch": 5.86927845930616, "grad_norm": 0.029475105926394463, "learning_rate": 2.7558252000351712e-06, "loss": 0.0002, "step": 22250 }, { "epoch": 5.869806094182826, "grad_norm": 0.013208845630288124, "learning_rate": 2.755473489844368e-06, "loss": 0.0003, "step": 22252 }, { "epoch": 5.870333729059491, "grad_norm": 0.23154877126216888, "learning_rate": 2.7551217796535656e-06, "loss": 0.0004, "step": 22254 }, { "epoch": 5.870861363936156, "grad_norm": 0.006870236713439226, "learning_rate": 2.754770069462763e-06, "loss": 0.0001, "step": 22256 }, { "epoch": 5.871388998812821, "grad_norm": 0.4624292552471161, "learning_rate": 2.75441835927196e-06, "loss": 0.0123, "step": 22258 }, { "epoch": 5.871916633689487, "grad_norm": 0.004819077905267477, "learning_rate": 2.7540666490811573e-06, "loss": 0.0001, "step": 22260 }, { "epoch": 5.872444268566152, "grad_norm": 0.0018828200409188867, "learning_rate": 2.753714938890354e-06, "loss": 0.0001, "step": 22262 }, { "epoch": 5.872971903442817, "grad_norm": 0.0018822698621079326, "learning_rate": 2.7533632286995516e-06, "loss": 0.0025, "step": 22264 }, { "epoch": 5.873499538319483, "grad_norm": 0.005649988539516926, "learning_rate": 2.753011518508749e-06, "loss": 0.0001, "step": 22266 }, { "epoch": 5.874027173196148, "grad_norm": 0.036233797669410706, "learning_rate": 2.7526598083179463e-06, "loss": 0.0006, "step": 22268 }, { "epoch": 5.874554808072814, "grad_norm": 0.023213880136609077, "learning_rate": 2.7523080981271433e-06, "loss": 0.0002, "step": 22270 }, { "epoch": 5.875082442949479, "grad_norm": 0.014591042883694172, "learning_rate": 2.7519563879363407e-06, "loss": 0.0001, "step": 22272 }, { "epoch": 5.875610077826145, "grad_norm": 0.0029576688539236784, "learning_rate": 2.7516046777455376e-06, "loss": 0.0009, "step": 22274 }, { "epoch": 5.87613771270281, "grad_norm": 0.1047486886382103, "learning_rate": 2.751252967554735e-06, "loss": 0.0013, "step": 22276 }, { "epoch": 5.8766653475794755, "grad_norm": 0.004516515415161848, "learning_rate": 2.7509012573639324e-06, "loss": 0.0001, "step": 22278 }, { "epoch": 5.87719298245614, "grad_norm": 0.01738537847995758, "learning_rate": 2.7505495471731297e-06, "loss": 0.0019, "step": 22280 }, { "epoch": 5.877720617332805, "grad_norm": 0.0042143589816987514, "learning_rate": 2.7501978369823267e-06, "loss": 0.0001, "step": 22282 }, { "epoch": 5.878248252209471, "grad_norm": 0.012431926093995571, "learning_rate": 2.7498461267915236e-06, "loss": 0.0001, "step": 22284 }, { "epoch": 5.878775887086136, "grad_norm": 0.111357182264328, "learning_rate": 2.749494416600721e-06, "loss": 0.0033, "step": 22286 }, { "epoch": 5.879303521962802, "grad_norm": 0.3081890046596527, "learning_rate": 2.749142706409918e-06, "loss": 0.0061, "step": 22288 }, { "epoch": 5.879831156839467, "grad_norm": 0.006160663906484842, "learning_rate": 2.7487909962191154e-06, "loss": 0.0002, "step": 22290 }, { "epoch": 5.880358791716132, "grad_norm": 0.015151369385421276, "learning_rate": 2.7484392860283127e-06, "loss": 0.0001, "step": 22292 }, { "epoch": 5.880886426592798, "grad_norm": 0.17000508308410645, "learning_rate": 2.74808757583751e-06, "loss": 0.0003, "step": 22294 }, { "epoch": 5.881414061469463, "grad_norm": 0.01672527566552162, "learning_rate": 2.747735865646707e-06, "loss": 0.0001, "step": 22296 }, { "epoch": 5.881941696346129, "grad_norm": 0.0027183822821825743, "learning_rate": 2.7473841554559044e-06, "loss": 0.0003, "step": 22298 }, { "epoch": 5.882469331222794, "grad_norm": 0.0034504500217735767, "learning_rate": 2.7470324452651014e-06, "loss": 0.0001, "step": 22300 }, { "epoch": 5.882996966099459, "grad_norm": 0.017315739765763283, "learning_rate": 2.7466807350742988e-06, "loss": 0.0036, "step": 22302 }, { "epoch": 5.883524600976124, "grad_norm": 0.005087206140160561, "learning_rate": 2.746329024883496e-06, "loss": 0.004, "step": 22304 }, { "epoch": 5.88405223585279, "grad_norm": 0.05606130138039589, "learning_rate": 2.7459773146926935e-06, "loss": 0.0002, "step": 22306 }, { "epoch": 5.884579870729455, "grad_norm": 0.01057588029652834, "learning_rate": 2.7456256045018905e-06, "loss": 0.0021, "step": 22308 }, { "epoch": 5.88510750560612, "grad_norm": 0.2131873071193695, "learning_rate": 2.745273894311088e-06, "loss": 0.0056, "step": 22310 }, { "epoch": 5.885635140482786, "grad_norm": 0.003195227589458227, "learning_rate": 2.744922184120285e-06, "loss": 0.0001, "step": 22312 }, { "epoch": 5.886162775359451, "grad_norm": 0.006578185595571995, "learning_rate": 2.744570473929482e-06, "loss": 0.0001, "step": 22314 }, { "epoch": 5.886690410236117, "grad_norm": 0.013930289074778557, "learning_rate": 2.7442187637386795e-06, "loss": 0.0001, "step": 22316 }, { "epoch": 5.887218045112782, "grad_norm": 0.012044521048665047, "learning_rate": 2.7438670535478765e-06, "loss": 0.0008, "step": 22318 }, { "epoch": 5.887745679989448, "grad_norm": 0.023584231734275818, "learning_rate": 2.743515343357074e-06, "loss": 0.0001, "step": 22320 }, { "epoch": 5.888273314866113, "grad_norm": 0.02290922962129116, "learning_rate": 2.743163633166271e-06, "loss": 0.0002, "step": 22322 }, { "epoch": 5.8888009497427785, "grad_norm": 0.11285410076379776, "learning_rate": 2.742811922975468e-06, "loss": 0.0027, "step": 22324 }, { "epoch": 5.889328584619443, "grad_norm": 0.032408956438302994, "learning_rate": 2.7424602127846656e-06, "loss": 0.0002, "step": 22326 }, { "epoch": 5.889856219496108, "grad_norm": 0.007216001860797405, "learning_rate": 2.742108502593863e-06, "loss": 0.0001, "step": 22328 }, { "epoch": 5.890383854372774, "grad_norm": 0.004624437540769577, "learning_rate": 2.74175679240306e-06, "loss": 0.0001, "step": 22330 }, { "epoch": 5.890911489249439, "grad_norm": 0.17606115341186523, "learning_rate": 2.7414050822122573e-06, "loss": 0.0009, "step": 22332 }, { "epoch": 5.891439124126105, "grad_norm": 0.4241591989994049, "learning_rate": 2.7410533720214542e-06, "loss": 0.0015, "step": 22334 }, { "epoch": 5.89196675900277, "grad_norm": 0.2834128439426422, "learning_rate": 2.7407016618306516e-06, "loss": 0.0021, "step": 22336 }, { "epoch": 5.892494393879435, "grad_norm": 0.29293254017829895, "learning_rate": 2.740349951639849e-06, "loss": 0.0016, "step": 22338 }, { "epoch": 5.893022028756101, "grad_norm": 0.004269895609468222, "learning_rate": 2.7399982414490464e-06, "loss": 0.0001, "step": 22340 }, { "epoch": 5.893549663632766, "grad_norm": 0.008095599710941315, "learning_rate": 2.7396465312582433e-06, "loss": 0.0001, "step": 22342 }, { "epoch": 5.894077298509432, "grad_norm": 0.016587935388088226, "learning_rate": 2.7392948210674403e-06, "loss": 0.0001, "step": 22344 }, { "epoch": 5.894604933386097, "grad_norm": 0.007287974935024977, "learning_rate": 2.7389431108766376e-06, "loss": 0.0029, "step": 22346 }, { "epoch": 5.895132568262762, "grad_norm": 0.0019768306519836187, "learning_rate": 2.7385914006858346e-06, "loss": 0.0001, "step": 22348 }, { "epoch": 5.895660203139427, "grad_norm": 0.0068140821531414986, "learning_rate": 2.7382396904950324e-06, "loss": 0.0013, "step": 22350 }, { "epoch": 5.896187838016093, "grad_norm": 0.0025022830814123154, "learning_rate": 2.7378879803042293e-06, "loss": 0.0001, "step": 22352 }, { "epoch": 5.896715472892758, "grad_norm": 0.017747335135936737, "learning_rate": 2.7375362701134267e-06, "loss": 0.0007, "step": 22354 }, { "epoch": 5.897243107769423, "grad_norm": 0.017927637323737144, "learning_rate": 2.7371845599226237e-06, "loss": 0.0009, "step": 22356 }, { "epoch": 5.897770742646089, "grad_norm": 0.007255433592945337, "learning_rate": 2.736832849731821e-06, "loss": 0.0001, "step": 22358 }, { "epoch": 5.898298377522754, "grad_norm": 0.040898099541664124, "learning_rate": 2.736481139541018e-06, "loss": 0.0002, "step": 22360 }, { "epoch": 5.89882601239942, "grad_norm": 0.026064153760671616, "learning_rate": 2.736129429350216e-06, "loss": 0.0045, "step": 22362 }, { "epoch": 5.899353647276085, "grad_norm": 0.014826473779976368, "learning_rate": 2.7357777191594128e-06, "loss": 0.0002, "step": 22364 }, { "epoch": 5.899881282152751, "grad_norm": 0.021302828565239906, "learning_rate": 2.73542600896861e-06, "loss": 0.005, "step": 22366 }, { "epoch": 5.900408917029416, "grad_norm": 0.0037233380135148764, "learning_rate": 2.735074298777807e-06, "loss": 0.0002, "step": 22368 }, { "epoch": 5.9009365519060815, "grad_norm": 0.7210665345191956, "learning_rate": 2.7347225885870045e-06, "loss": 0.0027, "step": 22370 }, { "epoch": 5.901464186782746, "grad_norm": 0.009976253844797611, "learning_rate": 2.7343708783962014e-06, "loss": 0.0037, "step": 22372 }, { "epoch": 5.901991821659411, "grad_norm": 0.011411091312766075, "learning_rate": 2.7340191682053988e-06, "loss": 0.0002, "step": 22374 }, { "epoch": 5.902519456536077, "grad_norm": 0.024001214653253555, "learning_rate": 2.733667458014596e-06, "loss": 0.0013, "step": 22376 }, { "epoch": 5.903047091412742, "grad_norm": 0.019515875726938248, "learning_rate": 2.733315747823793e-06, "loss": 0.0003, "step": 22378 }, { "epoch": 5.903574726289408, "grad_norm": 0.010300075635313988, "learning_rate": 2.7329640376329905e-06, "loss": 0.0028, "step": 22380 }, { "epoch": 5.904102361166073, "grad_norm": 0.07360988110303879, "learning_rate": 2.7326123274421874e-06, "loss": 0.0013, "step": 22382 }, { "epoch": 5.904629996042738, "grad_norm": 0.00589577853679657, "learning_rate": 2.732260617251385e-06, "loss": 0.0001, "step": 22384 }, { "epoch": 5.905157630919404, "grad_norm": 0.2745855748653412, "learning_rate": 2.731908907060582e-06, "loss": 0.0104, "step": 22386 }, { "epoch": 5.905685265796069, "grad_norm": 0.04779978096485138, "learning_rate": 2.7315571968697796e-06, "loss": 0.0002, "step": 22388 }, { "epoch": 5.906212900672735, "grad_norm": 0.008110293187201023, "learning_rate": 2.7312054866789765e-06, "loss": 0.0001, "step": 22390 }, { "epoch": 5.9067405355494, "grad_norm": 0.016789212822914124, "learning_rate": 2.730853776488174e-06, "loss": 0.0002, "step": 22392 }, { "epoch": 5.9072681704260654, "grad_norm": 0.0248599573969841, "learning_rate": 2.730502066297371e-06, "loss": 0.0001, "step": 22394 }, { "epoch": 5.90779580530273, "grad_norm": 0.001054786960594356, "learning_rate": 2.7301503561065682e-06, "loss": 0.0001, "step": 22396 }, { "epoch": 5.908323440179396, "grad_norm": 0.002767211990430951, "learning_rate": 2.7297986459157656e-06, "loss": 0.0001, "step": 22398 }, { "epoch": 5.908851075056061, "grad_norm": 0.005243634805083275, "learning_rate": 2.729446935724963e-06, "loss": 0.0001, "step": 22400 }, { "epoch": 5.909378709932726, "grad_norm": 0.3280152976512909, "learning_rate": 2.72909522553416e-06, "loss": 0.0015, "step": 22402 }, { "epoch": 5.909906344809392, "grad_norm": 0.0038930694572627544, "learning_rate": 2.7287435153433573e-06, "loss": 0.0001, "step": 22404 }, { "epoch": 5.910433979686057, "grad_norm": 0.007161095272749662, "learning_rate": 2.7283918051525543e-06, "loss": 0.0001, "step": 22406 }, { "epoch": 5.910961614562723, "grad_norm": 0.012614910490810871, "learning_rate": 2.7280400949617516e-06, "loss": 0.0008, "step": 22408 }, { "epoch": 5.911489249439388, "grad_norm": 0.0045968759804964066, "learning_rate": 2.727688384770949e-06, "loss": 0.0005, "step": 22410 }, { "epoch": 5.912016884316054, "grad_norm": 0.0012100995518267155, "learning_rate": 2.727336674580146e-06, "loss": 0.0001, "step": 22412 }, { "epoch": 5.912544519192719, "grad_norm": 0.12268199771642685, "learning_rate": 2.7269849643893433e-06, "loss": 0.0027, "step": 22414 }, { "epoch": 5.9130721540693845, "grad_norm": 0.1279996782541275, "learning_rate": 2.7266332541985403e-06, "loss": 0.0032, "step": 22416 }, { "epoch": 5.913599788946049, "grad_norm": 0.012237762100994587, "learning_rate": 2.7262815440077377e-06, "loss": 0.0001, "step": 22418 }, { "epoch": 5.914127423822714, "grad_norm": 0.04423307999968529, "learning_rate": 2.725929833816935e-06, "loss": 0.0003, "step": 22420 }, { "epoch": 5.91465505869938, "grad_norm": 0.003814120776951313, "learning_rate": 2.7255781236261324e-06, "loss": 0.0002, "step": 22422 }, { "epoch": 5.915182693576045, "grad_norm": 0.006846580654382706, "learning_rate": 2.7252264134353294e-06, "loss": 0.0003, "step": 22424 }, { "epoch": 5.915710328452711, "grad_norm": 0.23358793556690216, "learning_rate": 2.7248747032445267e-06, "loss": 0.0009, "step": 22426 }, { "epoch": 5.916237963329376, "grad_norm": 0.022458922117948532, "learning_rate": 2.7245229930537237e-06, "loss": 0.0002, "step": 22428 }, { "epoch": 5.916765598206041, "grad_norm": 0.013537874445319176, "learning_rate": 2.724171282862921e-06, "loss": 0.0001, "step": 22430 }, { "epoch": 5.917293233082707, "grad_norm": 0.12462577223777771, "learning_rate": 2.723819572672118e-06, "loss": 0.0013, "step": 22432 }, { "epoch": 5.917820867959372, "grad_norm": 0.04610462859272957, "learning_rate": 2.723467862481316e-06, "loss": 0.0033, "step": 22434 }, { "epoch": 5.918348502836038, "grad_norm": 0.007503817323595285, "learning_rate": 2.7231161522905128e-06, "loss": 0.0001, "step": 22436 }, { "epoch": 5.918876137712703, "grad_norm": 0.008391516283154488, "learning_rate": 2.7227644420997097e-06, "loss": 0.0001, "step": 22438 }, { "epoch": 5.9194037725893685, "grad_norm": 0.0084766810759902, "learning_rate": 2.722412731908907e-06, "loss": 0.0001, "step": 22440 }, { "epoch": 5.919931407466033, "grad_norm": 0.027349000796675682, "learning_rate": 2.722061021718104e-06, "loss": 0.0002, "step": 22442 }, { "epoch": 5.920459042342699, "grad_norm": 0.0030524509493261576, "learning_rate": 2.7217093115273014e-06, "loss": 0.0001, "step": 22444 }, { "epoch": 5.920986677219364, "grad_norm": 0.08772069960832596, "learning_rate": 2.721357601336499e-06, "loss": 0.0004, "step": 22446 }, { "epoch": 5.921514312096029, "grad_norm": 0.027373936027288437, "learning_rate": 2.721005891145696e-06, "loss": 0.0003, "step": 22448 }, { "epoch": 5.922041946972695, "grad_norm": 0.005043756682425737, "learning_rate": 2.720654180954893e-06, "loss": 0.0001, "step": 22450 }, { "epoch": 5.92256958184936, "grad_norm": 0.012672953307628632, "learning_rate": 2.7203024707640905e-06, "loss": 0.0001, "step": 22452 }, { "epoch": 5.923097216726026, "grad_norm": 0.010036368854343891, "learning_rate": 2.7199507605732875e-06, "loss": 0.0001, "step": 22454 }, { "epoch": 5.923624851602691, "grad_norm": 1.1842783689498901, "learning_rate": 2.719599050382485e-06, "loss": 0.0046, "step": 22456 }, { "epoch": 5.924152486479357, "grad_norm": 0.010037297382950783, "learning_rate": 2.7192473401916822e-06, "loss": 0.0001, "step": 22458 }, { "epoch": 5.924680121356022, "grad_norm": 0.02469806931912899, "learning_rate": 2.7188956300008796e-06, "loss": 0.0005, "step": 22460 }, { "epoch": 5.9252077562326875, "grad_norm": 0.17730355262756348, "learning_rate": 2.7185439198100765e-06, "loss": 0.0006, "step": 22462 }, { "epoch": 5.925735391109352, "grad_norm": 0.033039044588804245, "learning_rate": 2.718192209619274e-06, "loss": 0.0003, "step": 22464 }, { "epoch": 5.926263025986017, "grad_norm": 0.003725331975147128, "learning_rate": 2.717840499428471e-06, "loss": 0.0002, "step": 22466 }, { "epoch": 5.926790660862683, "grad_norm": 0.01651441678404808, "learning_rate": 2.7174887892376683e-06, "loss": 0.0002, "step": 22468 }, { "epoch": 5.927318295739348, "grad_norm": 1.377548098564148, "learning_rate": 2.7171370790468656e-06, "loss": 0.0027, "step": 22470 }, { "epoch": 5.927845930616014, "grad_norm": 0.008755048736929893, "learning_rate": 2.7167853688560626e-06, "loss": 0.0001, "step": 22472 }, { "epoch": 5.928373565492679, "grad_norm": 0.004023678135126829, "learning_rate": 2.71643365866526e-06, "loss": 0.0001, "step": 22474 }, { "epoch": 5.928901200369344, "grad_norm": 0.05933342128992081, "learning_rate": 2.716081948474457e-06, "loss": 0.0004, "step": 22476 }, { "epoch": 5.92942883524601, "grad_norm": 0.008852595463395119, "learning_rate": 2.7157302382836543e-06, "loss": 0.0004, "step": 22478 }, { "epoch": 5.929956470122675, "grad_norm": 0.00943159032613039, "learning_rate": 2.7153785280928517e-06, "loss": 0.0001, "step": 22480 }, { "epoch": 5.930484104999341, "grad_norm": 0.09792781621217728, "learning_rate": 2.715026817902049e-06, "loss": 0.0003, "step": 22482 }, { "epoch": 5.931011739876006, "grad_norm": 0.10177028924226761, "learning_rate": 2.714675107711246e-06, "loss": 0.0015, "step": 22484 }, { "epoch": 5.9315393747526715, "grad_norm": 0.008843233808875084, "learning_rate": 2.7143233975204434e-06, "loss": 0.0002, "step": 22486 }, { "epoch": 5.932067009629336, "grad_norm": 0.0029400521889328957, "learning_rate": 2.7139716873296403e-06, "loss": 0.0002, "step": 22488 }, { "epoch": 5.932594644506002, "grad_norm": 1.011466383934021, "learning_rate": 2.7136199771388377e-06, "loss": 0.0019, "step": 22490 }, { "epoch": 5.933122279382667, "grad_norm": 0.14497409760951996, "learning_rate": 2.713268266948035e-06, "loss": 0.003, "step": 22492 }, { "epoch": 5.933649914259332, "grad_norm": 0.021701131016016006, "learning_rate": 2.7129165567572324e-06, "loss": 0.0034, "step": 22494 }, { "epoch": 5.934177549135998, "grad_norm": 0.004724849481135607, "learning_rate": 2.7125648465664294e-06, "loss": 0.0001, "step": 22496 }, { "epoch": 5.934705184012663, "grad_norm": 0.3604699969291687, "learning_rate": 2.7122131363756263e-06, "loss": 0.0015, "step": 22498 }, { "epoch": 5.935232818889329, "grad_norm": 0.010607135482132435, "learning_rate": 2.7118614261848237e-06, "loss": 0.0002, "step": 22500 }, { "epoch": 5.935760453765994, "grad_norm": 0.3336467742919922, "learning_rate": 2.7115097159940207e-06, "loss": 0.0014, "step": 22502 }, { "epoch": 5.93628808864266, "grad_norm": 0.006839873269200325, "learning_rate": 2.7111580058032185e-06, "loss": 0.0001, "step": 22504 }, { "epoch": 5.936815723519325, "grad_norm": 0.14870969951152802, "learning_rate": 2.7108062956124154e-06, "loss": 0.0004, "step": 22506 }, { "epoch": 5.93734335839599, "grad_norm": 0.05825088918209076, "learning_rate": 2.710454585421613e-06, "loss": 0.0002, "step": 22508 }, { "epoch": 5.937870993272655, "grad_norm": 0.005363536532968283, "learning_rate": 2.7101028752308098e-06, "loss": 0.0002, "step": 22510 }, { "epoch": 5.93839862814932, "grad_norm": 0.09506582468748093, "learning_rate": 2.709751165040007e-06, "loss": 0.0005, "step": 22512 }, { "epoch": 5.938926263025986, "grad_norm": 0.002203919691964984, "learning_rate": 2.709399454849204e-06, "loss": 0.0001, "step": 22514 }, { "epoch": 5.939453897902651, "grad_norm": 0.03999215364456177, "learning_rate": 2.7090477446584015e-06, "loss": 0.0001, "step": 22516 }, { "epoch": 5.939981532779317, "grad_norm": 0.5525211095809937, "learning_rate": 2.708696034467599e-06, "loss": 0.0029, "step": 22518 }, { "epoch": 5.940509167655982, "grad_norm": 0.014270237646996975, "learning_rate": 2.7083443242767962e-06, "loss": 0.0001, "step": 22520 }, { "epoch": 5.941036802532647, "grad_norm": 0.0026732042897492647, "learning_rate": 2.707992614085993e-06, "loss": 0.0007, "step": 22522 }, { "epoch": 5.941564437409313, "grad_norm": 0.002338240621611476, "learning_rate": 2.7076409038951905e-06, "loss": 0.0016, "step": 22524 }, { "epoch": 5.942092072285978, "grad_norm": 0.021089399233460426, "learning_rate": 2.7072891937043875e-06, "loss": 0.0001, "step": 22526 }, { "epoch": 5.942619707162644, "grad_norm": 0.014407897368073463, "learning_rate": 2.706937483513585e-06, "loss": 0.0003, "step": 22528 }, { "epoch": 5.943147342039309, "grad_norm": 0.11032325774431229, "learning_rate": 2.7065857733227822e-06, "loss": 0.003, "step": 22530 }, { "epoch": 5.9436749769159745, "grad_norm": 0.10581216961145401, "learning_rate": 2.706234063131979e-06, "loss": 0.0003, "step": 22532 }, { "epoch": 5.944202611792639, "grad_norm": 0.028529657050967216, "learning_rate": 2.7058823529411766e-06, "loss": 0.0001, "step": 22534 }, { "epoch": 5.944730246669305, "grad_norm": 0.909692645072937, "learning_rate": 2.7055306427503735e-06, "loss": 0.0062, "step": 22536 }, { "epoch": 5.94525788154597, "grad_norm": 0.015254747122526169, "learning_rate": 2.705178932559571e-06, "loss": 0.0001, "step": 22538 }, { "epoch": 5.945785516422635, "grad_norm": 0.7390628457069397, "learning_rate": 2.7048272223687683e-06, "loss": 0.0028, "step": 22540 }, { "epoch": 5.946313151299301, "grad_norm": 0.06977379322052002, "learning_rate": 2.7044755121779657e-06, "loss": 0.0001, "step": 22542 }, { "epoch": 5.946840786175966, "grad_norm": 0.459899365901947, "learning_rate": 2.7041238019871626e-06, "loss": 0.0009, "step": 22544 }, { "epoch": 5.947368421052632, "grad_norm": 0.4901150166988373, "learning_rate": 2.70377209179636e-06, "loss": 0.0075, "step": 22546 }, { "epoch": 5.947896055929297, "grad_norm": 0.0032082349061965942, "learning_rate": 2.703420381605557e-06, "loss": 0.0055, "step": 22548 }, { "epoch": 5.948423690805962, "grad_norm": 0.14672206342220306, "learning_rate": 2.7030686714147543e-06, "loss": 0.0005, "step": 22550 }, { "epoch": 5.948951325682628, "grad_norm": 0.26616910099983215, "learning_rate": 2.7027169612239517e-06, "loss": 0.0009, "step": 22552 }, { "epoch": 5.949478960559293, "grad_norm": 0.508529007434845, "learning_rate": 2.702365251033149e-06, "loss": 0.0011, "step": 22554 }, { "epoch": 5.9500065954359584, "grad_norm": 0.041139960289001465, "learning_rate": 2.702013540842346e-06, "loss": 0.0003, "step": 22556 }, { "epoch": 5.950534230312623, "grad_norm": 0.1121673732995987, "learning_rate": 2.701661830651543e-06, "loss": 0.0003, "step": 22558 }, { "epoch": 5.951061865189289, "grad_norm": 0.0067949010990560055, "learning_rate": 2.7013101204607403e-06, "loss": 0.0002, "step": 22560 }, { "epoch": 5.951589500065954, "grad_norm": 0.09014051407575607, "learning_rate": 2.7009584102699377e-06, "loss": 0.0087, "step": 22562 }, { "epoch": 5.95211713494262, "grad_norm": 0.06606544554233551, "learning_rate": 2.700606700079135e-06, "loss": 0.0006, "step": 22564 }, { "epoch": 5.952644769819285, "grad_norm": 0.005211380310356617, "learning_rate": 2.700254989888332e-06, "loss": 0.0001, "step": 22566 }, { "epoch": 5.95317240469595, "grad_norm": 0.009166748262941837, "learning_rate": 2.6999032796975294e-06, "loss": 0.0003, "step": 22568 }, { "epoch": 5.953700039572616, "grad_norm": 0.009258078411221504, "learning_rate": 2.6995515695067264e-06, "loss": 0.0001, "step": 22570 }, { "epoch": 5.954227674449281, "grad_norm": 0.018310075625777245, "learning_rate": 2.6991998593159237e-06, "loss": 0.0001, "step": 22572 }, { "epoch": 5.954755309325947, "grad_norm": 0.026835523545742035, "learning_rate": 2.6988481491251207e-06, "loss": 0.0022, "step": 22574 }, { "epoch": 5.955282944202612, "grad_norm": 0.02204003557562828, "learning_rate": 2.6984964389343185e-06, "loss": 0.0001, "step": 22576 }, { "epoch": 5.9558105790792775, "grad_norm": 0.8922990560531616, "learning_rate": 2.6981447287435155e-06, "loss": 0.0004, "step": 22578 }, { "epoch": 5.956338213955942, "grad_norm": 0.04227229580283165, "learning_rate": 2.697793018552713e-06, "loss": 0.0003, "step": 22580 }, { "epoch": 5.956865848832608, "grad_norm": 0.014423858374357224, "learning_rate": 2.6974413083619098e-06, "loss": 0.0001, "step": 22582 }, { "epoch": 5.957393483709273, "grad_norm": 0.004807505756616592, "learning_rate": 2.697089598171107e-06, "loss": 0.0001, "step": 22584 }, { "epoch": 5.957921118585938, "grad_norm": 0.17322991788387299, "learning_rate": 2.696737887980304e-06, "loss": 0.0003, "step": 22586 }, { "epoch": 5.958448753462604, "grad_norm": 0.269048273563385, "learning_rate": 2.6963861777895015e-06, "loss": 0.0005, "step": 22588 }, { "epoch": 5.958976388339269, "grad_norm": 0.0028902075719088316, "learning_rate": 2.696034467598699e-06, "loss": 0.0016, "step": 22590 }, { "epoch": 5.959504023215935, "grad_norm": 0.21188656985759735, "learning_rate": 2.695682757407896e-06, "loss": 0.0057, "step": 22592 }, { "epoch": 5.9600316580926, "grad_norm": 0.06047503277659416, "learning_rate": 2.695331047217093e-06, "loss": 0.0003, "step": 22594 }, { "epoch": 5.960559292969265, "grad_norm": 0.03453980013728142, "learning_rate": 2.69497933702629e-06, "loss": 0.0003, "step": 22596 }, { "epoch": 5.961086927845931, "grad_norm": 0.006091838236898184, "learning_rate": 2.6946276268354875e-06, "loss": 0.0002, "step": 22598 }, { "epoch": 5.961614562722596, "grad_norm": 0.007879049517214298, "learning_rate": 2.694275916644685e-06, "loss": 0.001, "step": 22600 }, { "epoch": 5.9621421975992615, "grad_norm": 0.007260581944137812, "learning_rate": 2.6939242064538823e-06, "loss": 0.0002, "step": 22602 }, { "epoch": 5.962669832475926, "grad_norm": 0.2833733558654785, "learning_rate": 2.6935724962630792e-06, "loss": 0.0005, "step": 22604 }, { "epoch": 5.963197467352592, "grad_norm": 0.00781636405736208, "learning_rate": 2.6932207860722766e-06, "loss": 0.0001, "step": 22606 }, { "epoch": 5.963725102229257, "grad_norm": 0.030242372304201126, "learning_rate": 2.6928690758814735e-06, "loss": 0.0002, "step": 22608 }, { "epoch": 5.964252737105923, "grad_norm": 0.07608921825885773, "learning_rate": 2.692517365690671e-06, "loss": 0.0003, "step": 22610 }, { "epoch": 5.964780371982588, "grad_norm": 0.00097942678257823, "learning_rate": 2.6921656554998683e-06, "loss": 0.0001, "step": 22612 }, { "epoch": 5.965308006859253, "grad_norm": 0.01945403777062893, "learning_rate": 2.6918139453090657e-06, "loss": 0.0001, "step": 22614 }, { "epoch": 5.965835641735919, "grad_norm": 0.07754354923963547, "learning_rate": 2.6914622351182626e-06, "loss": 0.0018, "step": 22616 }, { "epoch": 5.966363276612584, "grad_norm": 0.011241217143833637, "learning_rate": 2.6911105249274596e-06, "loss": 0.0001, "step": 22618 }, { "epoch": 5.96689091148925, "grad_norm": 0.009419934824109077, "learning_rate": 2.690758814736657e-06, "loss": 0.0001, "step": 22620 }, { "epoch": 5.967418546365915, "grad_norm": 0.003341011004522443, "learning_rate": 2.6904071045458543e-06, "loss": 0.0001, "step": 22622 }, { "epoch": 5.9679461812425805, "grad_norm": 0.016380509361624718, "learning_rate": 2.6900553943550517e-06, "loss": 0.0053, "step": 22624 }, { "epoch": 5.968473816119245, "grad_norm": 0.005804088898003101, "learning_rate": 2.6897036841642487e-06, "loss": 0.0001, "step": 22626 }, { "epoch": 5.969001450995911, "grad_norm": 0.021264487877488136, "learning_rate": 2.689351973973446e-06, "loss": 0.0002, "step": 22628 }, { "epoch": 5.969529085872576, "grad_norm": 0.001598490052856505, "learning_rate": 2.689000263782643e-06, "loss": 0.0002, "step": 22630 }, { "epoch": 5.970056720749241, "grad_norm": 0.08852989226579666, "learning_rate": 2.6886485535918404e-06, "loss": 0.0006, "step": 22632 }, { "epoch": 5.970584355625907, "grad_norm": 0.003680776571854949, "learning_rate": 2.6882968434010377e-06, "loss": 0.0001, "step": 22634 }, { "epoch": 5.971111990502572, "grad_norm": 0.23449914157390594, "learning_rate": 2.687945133210235e-06, "loss": 0.0015, "step": 22636 }, { "epoch": 5.971639625379238, "grad_norm": 0.034087639302015305, "learning_rate": 2.687593423019432e-06, "loss": 0.0026, "step": 22638 }, { "epoch": 5.972167260255903, "grad_norm": 0.009617478586733341, "learning_rate": 2.6872417128286294e-06, "loss": 0.0001, "step": 22640 }, { "epoch": 5.972694895132568, "grad_norm": 0.03431412950158119, "learning_rate": 2.6868900026378264e-06, "loss": 0.0002, "step": 22642 }, { "epoch": 5.973222530009234, "grad_norm": 0.04979642480611801, "learning_rate": 2.6865382924470238e-06, "loss": 0.0048, "step": 22644 }, { "epoch": 5.973750164885899, "grad_norm": 0.005857144948095083, "learning_rate": 2.6861865822562207e-06, "loss": 0.0001, "step": 22646 }, { "epoch": 5.9742777997625645, "grad_norm": 0.24348442256450653, "learning_rate": 2.685834872065418e-06, "loss": 0.001, "step": 22648 }, { "epoch": 5.974805434639229, "grad_norm": 0.24834880232810974, "learning_rate": 2.6854831618746155e-06, "loss": 0.0013, "step": 22650 }, { "epoch": 5.975333069515895, "grad_norm": 0.05487653613090515, "learning_rate": 2.6851314516838124e-06, "loss": 0.0004, "step": 22652 }, { "epoch": 5.97586070439256, "grad_norm": 0.015652574598789215, "learning_rate": 2.68477974149301e-06, "loss": 0.0002, "step": 22654 }, { "epoch": 5.976388339269226, "grad_norm": 0.02045748196542263, "learning_rate": 2.6844280313022068e-06, "loss": 0.0025, "step": 22656 }, { "epoch": 5.976915974145891, "grad_norm": 0.011377930641174316, "learning_rate": 2.684076321111404e-06, "loss": 0.0001, "step": 22658 }, { "epoch": 5.977443609022556, "grad_norm": 0.013035775162279606, "learning_rate": 2.6837246109206015e-06, "loss": 0.0004, "step": 22660 }, { "epoch": 5.977971243899222, "grad_norm": 0.20895862579345703, "learning_rate": 2.683372900729799e-06, "loss": 0.001, "step": 22662 }, { "epoch": 5.978498878775887, "grad_norm": 0.24441102147102356, "learning_rate": 2.683021190538996e-06, "loss": 0.001, "step": 22664 }, { "epoch": 5.979026513652553, "grad_norm": 0.339927613735199, "learning_rate": 2.6826694803481932e-06, "loss": 0.0037, "step": 22666 }, { "epoch": 5.979554148529218, "grad_norm": 0.020063478499650955, "learning_rate": 2.68231777015739e-06, "loss": 0.0003, "step": 22668 }, { "epoch": 5.9800817834058835, "grad_norm": 0.05738069489598274, "learning_rate": 2.6819660599665875e-06, "loss": 0.0002, "step": 22670 }, { "epoch": 5.980609418282548, "grad_norm": 0.00596831226721406, "learning_rate": 2.681614349775785e-06, "loss": 0.0002, "step": 22672 }, { "epoch": 5.981137053159214, "grad_norm": 0.06426271796226501, "learning_rate": 2.6812626395849823e-06, "loss": 0.0006, "step": 22674 }, { "epoch": 5.981664688035879, "grad_norm": 0.004108562599867582, "learning_rate": 2.6809109293941792e-06, "loss": 0.0001, "step": 22676 }, { "epoch": 5.982192322912544, "grad_norm": 0.008985188789665699, "learning_rate": 2.680559219203376e-06, "loss": 0.0003, "step": 22678 }, { "epoch": 5.98271995778921, "grad_norm": 0.004240616224706173, "learning_rate": 2.6802075090125736e-06, "loss": 0.0001, "step": 22680 }, { "epoch": 5.983247592665875, "grad_norm": 0.00470581604167819, "learning_rate": 2.679855798821771e-06, "loss": 0.0001, "step": 22682 }, { "epoch": 5.983775227542541, "grad_norm": 0.06713919341564178, "learning_rate": 2.6795040886309683e-06, "loss": 0.0003, "step": 22684 }, { "epoch": 5.984302862419206, "grad_norm": 0.007488015573471785, "learning_rate": 2.6791523784401653e-06, "loss": 0.0002, "step": 22686 }, { "epoch": 5.984830497295871, "grad_norm": 0.014110688120126724, "learning_rate": 2.6788006682493627e-06, "loss": 0.0048, "step": 22688 }, { "epoch": 5.985358132172537, "grad_norm": 0.03948267921805382, "learning_rate": 2.6784489580585596e-06, "loss": 0.0085, "step": 22690 }, { "epoch": 5.985885767049202, "grad_norm": 0.005791081115603447, "learning_rate": 2.678097247867757e-06, "loss": 0.0001, "step": 22692 }, { "epoch": 5.9864134019258675, "grad_norm": 0.0029839116614311934, "learning_rate": 2.6777455376769544e-06, "loss": 0.0001, "step": 22694 }, { "epoch": 5.986941036802532, "grad_norm": 0.08795689046382904, "learning_rate": 2.6773938274861517e-06, "loss": 0.0003, "step": 22696 }, { "epoch": 5.987468671679198, "grad_norm": 0.006001537200063467, "learning_rate": 2.6770421172953487e-06, "loss": 0.0001, "step": 22698 }, { "epoch": 5.987996306555863, "grad_norm": 0.024088382720947266, "learning_rate": 2.676690407104546e-06, "loss": 0.0015, "step": 22700 }, { "epoch": 5.988523941432529, "grad_norm": 0.005164931528270245, "learning_rate": 2.676338696913743e-06, "loss": 0.0001, "step": 22702 }, { "epoch": 5.989051576309194, "grad_norm": 0.6710535287857056, "learning_rate": 2.6759869867229404e-06, "loss": 0.0036, "step": 22704 }, { "epoch": 5.989579211185859, "grad_norm": 0.024339837953448296, "learning_rate": 2.6756352765321378e-06, "loss": 0.0024, "step": 22706 }, { "epoch": 5.990106846062525, "grad_norm": 0.10572412610054016, "learning_rate": 2.675283566341335e-06, "loss": 0.0027, "step": 22708 }, { "epoch": 5.99063448093919, "grad_norm": 0.042128995060920715, "learning_rate": 2.674931856150532e-06, "loss": 0.0007, "step": 22710 }, { "epoch": 5.991162115815856, "grad_norm": 0.02810591273009777, "learning_rate": 2.674580145959729e-06, "loss": 0.0002, "step": 22712 }, { "epoch": 5.991689750692521, "grad_norm": 0.003525890177115798, "learning_rate": 2.6742284357689264e-06, "loss": 0.0002, "step": 22714 }, { "epoch": 5.9922173855691865, "grad_norm": 0.012605281546711922, "learning_rate": 2.6738767255781234e-06, "loss": 0.0004, "step": 22716 }, { "epoch": 5.9927450204458514, "grad_norm": 0.08684269338846207, "learning_rate": 2.673525015387321e-06, "loss": 0.0041, "step": 22718 }, { "epoch": 5.993272655322517, "grad_norm": 0.0194687619805336, "learning_rate": 2.673173305196518e-06, "loss": 0.0018, "step": 22720 }, { "epoch": 5.993800290199182, "grad_norm": 0.004437274765223265, "learning_rate": 2.6728215950057155e-06, "loss": 0.0001, "step": 22722 }, { "epoch": 5.994327925075847, "grad_norm": 0.008066188544034958, "learning_rate": 2.6724698848149125e-06, "loss": 0.0006, "step": 22724 }, { "epoch": 5.994855559952513, "grad_norm": 0.733939528465271, "learning_rate": 2.67211817462411e-06, "loss": 0.005, "step": 22726 }, { "epoch": 5.995383194829178, "grad_norm": 0.09073752909898758, "learning_rate": 2.6717664644333068e-06, "loss": 0.0002, "step": 22728 }, { "epoch": 5.995910829705844, "grad_norm": 0.014905333518981934, "learning_rate": 2.671414754242504e-06, "loss": 0.0001, "step": 22730 }, { "epoch": 5.996438464582509, "grad_norm": 0.07942602783441544, "learning_rate": 2.6710630440517015e-06, "loss": 0.0005, "step": 22732 }, { "epoch": 5.996966099459174, "grad_norm": 0.0038374403957277536, "learning_rate": 2.670711333860899e-06, "loss": 0.0003, "step": 22734 }, { "epoch": 5.99749373433584, "grad_norm": 0.010085538029670715, "learning_rate": 2.670359623670096e-06, "loss": 0.0008, "step": 22736 }, { "epoch": 5.998021369212505, "grad_norm": 0.004948985762894154, "learning_rate": 2.6700079134792932e-06, "loss": 0.0012, "step": 22738 }, { "epoch": 5.9985490040891705, "grad_norm": 0.009483233094215393, "learning_rate": 2.66965620328849e-06, "loss": 0.0002, "step": 22740 }, { "epoch": 5.999076638965835, "grad_norm": 0.002210867591202259, "learning_rate": 2.6693044930976876e-06, "loss": 0.0001, "step": 22742 }, { "epoch": 5.999604273842501, "grad_norm": 0.03555840626358986, "learning_rate": 2.668952782906885e-06, "loss": 0.0002, "step": 22744 }, { "epoch": 6.0, "grad_norm": 0.09305989742279053, "learning_rate": 2.668601072716082e-06, "loss": 0.0003, "step": 22746 }, { "epoch": 6.000527634876665, "grad_norm": 0.00492745079100132, "learning_rate": 2.6682493625252793e-06, "loss": 0.0003, "step": 22748 }, { "epoch": 6.001055269753331, "grad_norm": 0.02405647747218609, "learning_rate": 2.6678976523344762e-06, "loss": 0.0012, "step": 22750 }, { "epoch": 6.001582904629996, "grad_norm": 0.013828531838953495, "learning_rate": 2.6675459421436736e-06, "loss": 0.0002, "step": 22752 }, { "epoch": 6.002110539506662, "grad_norm": 0.30145251750946045, "learning_rate": 2.667194231952871e-06, "loss": 0.0009, "step": 22754 }, { "epoch": 6.002638174383327, "grad_norm": 0.2073189914226532, "learning_rate": 2.6668425217620684e-06, "loss": 0.0011, "step": 22756 }, { "epoch": 6.003165809259992, "grad_norm": 0.003889563027769327, "learning_rate": 2.6664908115712653e-06, "loss": 0.0001, "step": 22758 }, { "epoch": 6.003693444136657, "grad_norm": 0.016549531370401382, "learning_rate": 2.6661391013804627e-06, "loss": 0.0002, "step": 22760 }, { "epoch": 6.004221079013322, "grad_norm": 0.05818986892700195, "learning_rate": 2.6657873911896596e-06, "loss": 0.0002, "step": 22762 }, { "epoch": 6.004748713889988, "grad_norm": 0.011949238367378712, "learning_rate": 2.665435680998857e-06, "loss": 0.0036, "step": 22764 }, { "epoch": 6.005276348766653, "grad_norm": 0.29194119572639465, "learning_rate": 2.6650839708080544e-06, "loss": 0.0061, "step": 22766 }, { "epoch": 6.005803983643319, "grad_norm": 0.07574667036533356, "learning_rate": 2.6647322606172518e-06, "loss": 0.0046, "step": 22768 }, { "epoch": 6.006331618519984, "grad_norm": 0.634768545627594, "learning_rate": 2.6643805504264487e-06, "loss": 0.0062, "step": 22770 }, { "epoch": 6.00685925339665, "grad_norm": 0.04700125753879547, "learning_rate": 2.6640288402356457e-06, "loss": 0.0002, "step": 22772 }, { "epoch": 6.007386888273315, "grad_norm": 0.7330460548400879, "learning_rate": 2.663677130044843e-06, "loss": 0.0013, "step": 22774 }, { "epoch": 6.007914523149981, "grad_norm": 0.09428287297487259, "learning_rate": 2.6633254198540404e-06, "loss": 0.0019, "step": 22776 }, { "epoch": 6.008442158026646, "grad_norm": 0.42680808901786804, "learning_rate": 2.662973709663238e-06, "loss": 0.001, "step": 22778 }, { "epoch": 6.008969792903311, "grad_norm": 0.004377919249236584, "learning_rate": 2.6626219994724347e-06, "loss": 0.0002, "step": 22780 }, { "epoch": 6.009497427779976, "grad_norm": 0.004746728111058474, "learning_rate": 2.662270289281632e-06, "loss": 0.0001, "step": 22782 }, { "epoch": 6.010025062656641, "grad_norm": 0.0014671820681542158, "learning_rate": 2.661918579090829e-06, "loss": 0.0001, "step": 22784 }, { "epoch": 6.010552697533307, "grad_norm": 0.001454066950827837, "learning_rate": 2.6615668689000264e-06, "loss": 0.0001, "step": 22786 }, { "epoch": 6.011080332409972, "grad_norm": 0.009015827439725399, "learning_rate": 2.6612151587092234e-06, "loss": 0.0001, "step": 22788 }, { "epoch": 6.011607967286638, "grad_norm": 0.023877661675214767, "learning_rate": 2.660863448518421e-06, "loss": 0.0002, "step": 22790 }, { "epoch": 6.012135602163303, "grad_norm": 0.9860995411872864, "learning_rate": 2.660511738327618e-06, "loss": 0.0015, "step": 22792 }, { "epoch": 6.012663237039968, "grad_norm": 0.0025183474645018578, "learning_rate": 2.6601600281368155e-06, "loss": 0.0001, "step": 22794 }, { "epoch": 6.013190871916634, "grad_norm": 0.003705438459292054, "learning_rate": 2.6598083179460125e-06, "loss": 0.0002, "step": 22796 }, { "epoch": 6.013718506793299, "grad_norm": 0.018155237659811974, "learning_rate": 2.65945660775521e-06, "loss": 0.0002, "step": 22798 }, { "epoch": 6.014246141669965, "grad_norm": 0.45334798097610474, "learning_rate": 2.659104897564407e-06, "loss": 0.0036, "step": 22800 }, { "epoch": 6.01477377654663, "grad_norm": 0.0028627216815948486, "learning_rate": 2.658753187373604e-06, "loss": 0.0001, "step": 22802 }, { "epoch": 6.015301411423295, "grad_norm": 0.034563399851322174, "learning_rate": 2.6584014771828016e-06, "loss": 0.0002, "step": 22804 }, { "epoch": 6.01582904629996, "grad_norm": 0.04517634958028793, "learning_rate": 2.6580497669919985e-06, "loss": 0.0003, "step": 22806 }, { "epoch": 6.016356681176625, "grad_norm": 0.2685965895652771, "learning_rate": 2.657698056801196e-06, "loss": 0.0032, "step": 22808 }, { "epoch": 6.016884316053291, "grad_norm": 0.19405771791934967, "learning_rate": 2.657346346610393e-06, "loss": 0.0023, "step": 22810 }, { "epoch": 6.017411950929956, "grad_norm": 0.0017188966739922762, "learning_rate": 2.6569946364195902e-06, "loss": 0.0025, "step": 22812 }, { "epoch": 6.017939585806622, "grad_norm": 0.008995946496725082, "learning_rate": 2.6566429262287876e-06, "loss": 0.0002, "step": 22814 }, { "epoch": 6.018467220683287, "grad_norm": 0.004515775945037603, "learning_rate": 2.656291216037985e-06, "loss": 0.0001, "step": 22816 }, { "epoch": 6.018994855559953, "grad_norm": 0.02044348046183586, "learning_rate": 2.655939505847182e-06, "loss": 0.0002, "step": 22818 }, { "epoch": 6.019522490436618, "grad_norm": 0.12482977658510208, "learning_rate": 2.6555877956563793e-06, "loss": 0.0009, "step": 22820 }, { "epoch": 6.020050125313283, "grad_norm": 0.011131958104670048, "learning_rate": 2.6552360854655762e-06, "loss": 0.0003, "step": 22822 }, { "epoch": 6.020577760189949, "grad_norm": 0.002799640642479062, "learning_rate": 2.6548843752747736e-06, "loss": 0.0002, "step": 22824 }, { "epoch": 6.021105395066614, "grad_norm": 0.09689559042453766, "learning_rate": 2.654532665083971e-06, "loss": 0.0002, "step": 22826 }, { "epoch": 6.021633029943279, "grad_norm": 0.01078103855252266, "learning_rate": 2.6541809548931684e-06, "loss": 0.0001, "step": 22828 }, { "epoch": 6.022160664819944, "grad_norm": 0.0037075942382216454, "learning_rate": 2.6538292447023653e-06, "loss": 0.0001, "step": 22830 }, { "epoch": 6.02268829969661, "grad_norm": 0.004866050090640783, "learning_rate": 2.6534775345115623e-06, "loss": 0.0004, "step": 22832 }, { "epoch": 6.023215934573275, "grad_norm": 0.19429512321949005, "learning_rate": 2.6531258243207597e-06, "loss": 0.0009, "step": 22834 }, { "epoch": 6.023743569449941, "grad_norm": 0.019704367965459824, "learning_rate": 2.652774114129957e-06, "loss": 0.0001, "step": 22836 }, { "epoch": 6.024271204326606, "grad_norm": 0.3097217381000519, "learning_rate": 2.6524224039391544e-06, "loss": 0.005, "step": 22838 }, { "epoch": 6.024798839203271, "grad_norm": 0.005406402982771397, "learning_rate": 2.6520706937483514e-06, "loss": 0.0019, "step": 22840 }, { "epoch": 6.025326474079937, "grad_norm": 0.02476261928677559, "learning_rate": 2.6517189835575487e-06, "loss": 0.0001, "step": 22842 }, { "epoch": 6.025854108956602, "grad_norm": 0.0345241017639637, "learning_rate": 2.6513672733667457e-06, "loss": 0.0002, "step": 22844 }, { "epoch": 6.026381743833268, "grad_norm": 0.06705056130886078, "learning_rate": 2.651015563175943e-06, "loss": 0.0004, "step": 22846 }, { "epoch": 6.026909378709933, "grad_norm": 0.03066893480718136, "learning_rate": 2.6506638529851404e-06, "loss": 0.0002, "step": 22848 }, { "epoch": 6.0274370135865984, "grad_norm": 0.025811342522501945, "learning_rate": 2.650312142794338e-06, "loss": 0.0003, "step": 22850 }, { "epoch": 6.027964648463263, "grad_norm": 0.11791753768920898, "learning_rate": 2.6499604326035348e-06, "loss": 0.0002, "step": 22852 }, { "epoch": 6.028492283339928, "grad_norm": 0.0068242186680436134, "learning_rate": 2.649608722412732e-06, "loss": 0.0011, "step": 22854 }, { "epoch": 6.029019918216594, "grad_norm": 0.03348473459482193, "learning_rate": 2.649257012221929e-06, "loss": 0.0002, "step": 22856 }, { "epoch": 6.029547553093259, "grad_norm": 1.1585869789123535, "learning_rate": 2.6489053020311265e-06, "loss": 0.0016, "step": 22858 }, { "epoch": 6.030075187969925, "grad_norm": 0.7817906141281128, "learning_rate": 2.648553591840324e-06, "loss": 0.0036, "step": 22860 }, { "epoch": 6.03060282284659, "grad_norm": 0.003877912648022175, "learning_rate": 2.648201881649521e-06, "loss": 0.0001, "step": 22862 }, { "epoch": 6.031130457723256, "grad_norm": 0.46393847465515137, "learning_rate": 2.647850171458718e-06, "loss": 0.0035, "step": 22864 }, { "epoch": 6.031658092599921, "grad_norm": 0.004401362035423517, "learning_rate": 2.647498461267915e-06, "loss": 0.0001, "step": 22866 }, { "epoch": 6.032185727476586, "grad_norm": 0.001024662866257131, "learning_rate": 2.6471467510771125e-06, "loss": 0.0001, "step": 22868 }, { "epoch": 6.032713362353252, "grad_norm": 0.0073541817255318165, "learning_rate": 2.6467950408863095e-06, "loss": 0.0002, "step": 22870 }, { "epoch": 6.033240997229917, "grad_norm": 0.6509734988212585, "learning_rate": 2.646443330695507e-06, "loss": 0.0032, "step": 22872 }, { "epoch": 6.033768632106582, "grad_norm": 0.24158884584903717, "learning_rate": 2.646091620504704e-06, "loss": 0.0006, "step": 22874 }, { "epoch": 6.034296266983247, "grad_norm": 0.694162130355835, "learning_rate": 2.6457399103139016e-06, "loss": 0.0044, "step": 22876 }, { "epoch": 6.034823901859913, "grad_norm": 0.026182960718870163, "learning_rate": 2.6453882001230985e-06, "loss": 0.0002, "step": 22878 }, { "epoch": 6.035351536736578, "grad_norm": 0.0008901031105779111, "learning_rate": 2.645036489932296e-06, "loss": 0.0001, "step": 22880 }, { "epoch": 6.035879171613244, "grad_norm": 0.00739309610798955, "learning_rate": 2.644684779741493e-06, "loss": 0.003, "step": 22882 }, { "epoch": 6.036406806489909, "grad_norm": 0.009429220110177994, "learning_rate": 2.6443330695506902e-06, "loss": 0.0003, "step": 22884 }, { "epoch": 6.036934441366574, "grad_norm": 0.22895283997058868, "learning_rate": 2.6439813593598876e-06, "loss": 0.0003, "step": 22886 }, { "epoch": 6.03746207624324, "grad_norm": 0.0369846373796463, "learning_rate": 2.643629649169085e-06, "loss": 0.0006, "step": 22888 }, { "epoch": 6.037989711119905, "grad_norm": 0.006848360877484083, "learning_rate": 2.643277938978282e-06, "loss": 0.0001, "step": 22890 }, { "epoch": 6.038517345996571, "grad_norm": 0.3144707679748535, "learning_rate": 2.642926228787479e-06, "loss": 0.0043, "step": 22892 }, { "epoch": 6.039044980873236, "grad_norm": 0.0034511121921241283, "learning_rate": 2.6425745185966763e-06, "loss": 0.0001, "step": 22894 }, { "epoch": 6.0395726157499015, "grad_norm": 0.01071006990969181, "learning_rate": 2.6422228084058737e-06, "loss": 0.0002, "step": 22896 }, { "epoch": 6.040100250626566, "grad_norm": 0.1089349314570427, "learning_rate": 2.641871098215071e-06, "loss": 0.0006, "step": 22898 }, { "epoch": 6.040627885503231, "grad_norm": 0.116389699280262, "learning_rate": 2.641519388024268e-06, "loss": 0.0003, "step": 22900 }, { "epoch": 6.041155520379897, "grad_norm": 0.11639847606420517, "learning_rate": 2.6411676778334654e-06, "loss": 0.0005, "step": 22902 }, { "epoch": 6.041683155256562, "grad_norm": 0.003185731591656804, "learning_rate": 2.6408159676426623e-06, "loss": 0.0016, "step": 22904 }, { "epoch": 6.042210790133228, "grad_norm": 0.007525259163230658, "learning_rate": 2.6404642574518597e-06, "loss": 0.0002, "step": 22906 }, { "epoch": 6.042738425009893, "grad_norm": 0.5204064249992371, "learning_rate": 2.640112547261057e-06, "loss": 0.0018, "step": 22908 }, { "epoch": 6.043266059886559, "grad_norm": 0.006439025513827801, "learning_rate": 2.6397608370702544e-06, "loss": 0.0001, "step": 22910 }, { "epoch": 6.043793694763224, "grad_norm": 0.04166216403245926, "learning_rate": 2.6394091268794514e-06, "loss": 0.0003, "step": 22912 }, { "epoch": 6.044321329639889, "grad_norm": 0.365907222032547, "learning_rate": 2.6390574166886488e-06, "loss": 0.0006, "step": 22914 }, { "epoch": 6.044848964516555, "grad_norm": 0.004268527962267399, "learning_rate": 2.6387057064978457e-06, "loss": 0.0004, "step": 22916 }, { "epoch": 6.04537659939322, "grad_norm": 0.8697079420089722, "learning_rate": 2.638353996307043e-06, "loss": 0.0117, "step": 22918 }, { "epoch": 6.045904234269885, "grad_norm": 0.07958071678876877, "learning_rate": 2.6380022861162405e-06, "loss": 0.0001, "step": 22920 }, { "epoch": 6.04643186914655, "grad_norm": 0.0068286750465631485, "learning_rate": 2.6376505759254374e-06, "loss": 0.0005, "step": 22922 }, { "epoch": 6.046959504023216, "grad_norm": 0.008348866365849972, "learning_rate": 2.637298865734635e-06, "loss": 0.0001, "step": 22924 }, { "epoch": 6.047487138899881, "grad_norm": 0.0020890270825475454, "learning_rate": 2.6369471555438317e-06, "loss": 0.0006, "step": 22926 }, { "epoch": 6.048014773776547, "grad_norm": 0.10662371665239334, "learning_rate": 2.636595445353029e-06, "loss": 0.0002, "step": 22928 }, { "epoch": 6.048542408653212, "grad_norm": 0.23805421590805054, "learning_rate": 2.636243735162226e-06, "loss": 0.0033, "step": 22930 }, { "epoch": 6.049070043529877, "grad_norm": 0.23545017838478088, "learning_rate": 2.635892024971424e-06, "loss": 0.0005, "step": 22932 }, { "epoch": 6.049597678406543, "grad_norm": 0.005410804413259029, "learning_rate": 2.635540314780621e-06, "loss": 0.0001, "step": 22934 }, { "epoch": 6.050125313283208, "grad_norm": 0.026851631700992584, "learning_rate": 2.635188604589818e-06, "loss": 0.0011, "step": 22936 }, { "epoch": 6.050652948159874, "grad_norm": 0.003655634354799986, "learning_rate": 2.634836894399015e-06, "loss": 0.0001, "step": 22938 }, { "epoch": 6.051180583036539, "grad_norm": 0.0032969925086945295, "learning_rate": 2.6344851842082125e-06, "loss": 0.0001, "step": 22940 }, { "epoch": 6.0517082179132045, "grad_norm": 0.005749303847551346, "learning_rate": 2.6341334740174095e-06, "loss": 0.0001, "step": 22942 }, { "epoch": 6.052235852789869, "grad_norm": 0.010882892645895481, "learning_rate": 2.633781763826607e-06, "loss": 0.0001, "step": 22944 }, { "epoch": 6.052763487666534, "grad_norm": 0.002894315868616104, "learning_rate": 2.6334300536358042e-06, "loss": 0.0001, "step": 22946 }, { "epoch": 6.0532911225432, "grad_norm": 0.006618115119636059, "learning_rate": 2.6330783434450016e-06, "loss": 0.0001, "step": 22948 }, { "epoch": 6.053818757419865, "grad_norm": 0.003708680160343647, "learning_rate": 2.6327266332541986e-06, "loss": 0.0001, "step": 22950 }, { "epoch": 6.054346392296531, "grad_norm": 0.2831241488456726, "learning_rate": 2.6323749230633955e-06, "loss": 0.0005, "step": 22952 }, { "epoch": 6.054874027173196, "grad_norm": 0.0016976811457425356, "learning_rate": 2.632023212872593e-06, "loss": 0.0001, "step": 22954 }, { "epoch": 6.055401662049862, "grad_norm": 0.012771605513989925, "learning_rate": 2.6316715026817903e-06, "loss": 0.0001, "step": 22956 }, { "epoch": 6.055929296926527, "grad_norm": 0.009182379581034184, "learning_rate": 2.6313197924909876e-06, "loss": 0.0001, "step": 22958 }, { "epoch": 6.056456931803192, "grad_norm": 0.015280078165233135, "learning_rate": 2.6309680823001846e-06, "loss": 0.0001, "step": 22960 }, { "epoch": 6.056984566679858, "grad_norm": 0.004011248704046011, "learning_rate": 2.630616372109382e-06, "loss": 0.0001, "step": 22962 }, { "epoch": 6.057512201556523, "grad_norm": 0.005856298841536045, "learning_rate": 2.630264661918579e-06, "loss": 0.0001, "step": 22964 }, { "epoch": 6.058039836433188, "grad_norm": 0.005945029202848673, "learning_rate": 2.6299129517277763e-06, "loss": 0.0001, "step": 22966 }, { "epoch": 6.058567471309853, "grad_norm": 0.07489421963691711, "learning_rate": 2.6295612415369737e-06, "loss": 0.0023, "step": 22968 }, { "epoch": 6.059095106186519, "grad_norm": 0.18555541336536407, "learning_rate": 2.629209531346171e-06, "loss": 0.0002, "step": 22970 }, { "epoch": 6.059622741063184, "grad_norm": 0.01795627735555172, "learning_rate": 2.628857821155368e-06, "loss": 0.0001, "step": 22972 }, { "epoch": 6.06015037593985, "grad_norm": 0.022084975615143776, "learning_rate": 2.6285061109645654e-06, "loss": 0.0002, "step": 22974 }, { "epoch": 6.060678010816515, "grad_norm": 0.019113117828965187, "learning_rate": 2.6281544007737623e-06, "loss": 0.0003, "step": 22976 }, { "epoch": 6.06120564569318, "grad_norm": 0.02854173630475998, "learning_rate": 2.6278026905829597e-06, "loss": 0.0001, "step": 22978 }, { "epoch": 6.061733280569846, "grad_norm": 0.004164600279182196, "learning_rate": 2.627450980392157e-06, "loss": 0.0032, "step": 22980 }, { "epoch": 6.062260915446511, "grad_norm": 0.002885947935283184, "learning_rate": 2.627099270201354e-06, "loss": 0.0001, "step": 22982 }, { "epoch": 6.062788550323177, "grad_norm": 0.017888521775603294, "learning_rate": 2.6267475600105514e-06, "loss": 0.0001, "step": 22984 }, { "epoch": 6.063316185199842, "grad_norm": 0.012895080260932446, "learning_rate": 2.6263958498197484e-06, "loss": 0.0001, "step": 22986 }, { "epoch": 6.0638438200765075, "grad_norm": 0.007483003195375204, "learning_rate": 2.6260441396289457e-06, "loss": 0.0001, "step": 22988 }, { "epoch": 6.064371454953172, "grad_norm": 0.003524273168295622, "learning_rate": 2.625692429438143e-06, "loss": 0.0001, "step": 22990 }, { "epoch": 6.064899089829837, "grad_norm": 0.012378595769405365, "learning_rate": 2.6253407192473405e-06, "loss": 0.0001, "step": 22992 }, { "epoch": 6.065426724706503, "grad_norm": 0.004061636980623007, "learning_rate": 2.6249890090565374e-06, "loss": 0.0001, "step": 22994 }, { "epoch": 6.065954359583168, "grad_norm": 0.0018086895579472184, "learning_rate": 2.624637298865735e-06, "loss": 0.0001, "step": 22996 }, { "epoch": 6.066481994459834, "grad_norm": 0.15728986263275146, "learning_rate": 2.6242855886749318e-06, "loss": 0.0006, "step": 22998 }, { "epoch": 6.067009629336499, "grad_norm": 0.0020790514536201954, "learning_rate": 2.623933878484129e-06, "loss": 0.0001, "step": 23000 }, { "epoch": 6.067537264213165, "grad_norm": 0.00137498346157372, "learning_rate": 2.623582168293326e-06, "loss": 0.0001, "step": 23002 }, { "epoch": 6.06806489908983, "grad_norm": 0.02795068733394146, "learning_rate": 2.623230458102524e-06, "loss": 0.0001, "step": 23004 }, { "epoch": 6.068592533966495, "grad_norm": 0.016075097024440765, "learning_rate": 2.622878747911721e-06, "loss": 0.0002, "step": 23006 }, { "epoch": 6.069120168843161, "grad_norm": 0.0022999488282948732, "learning_rate": 2.6225270377209182e-06, "loss": 0.0001, "step": 23008 }, { "epoch": 6.069647803719826, "grad_norm": 0.007172958459705114, "learning_rate": 2.622175327530115e-06, "loss": 0.0001, "step": 23010 }, { "epoch": 6.0701754385964914, "grad_norm": 0.10260460525751114, "learning_rate": 2.621823617339312e-06, "loss": 0.0014, "step": 23012 }, { "epoch": 6.070703073473156, "grad_norm": 0.6931057572364807, "learning_rate": 2.6214719071485095e-06, "loss": 0.0029, "step": 23014 }, { "epoch": 6.071230708349822, "grad_norm": 0.0021069487556815147, "learning_rate": 2.621120196957707e-06, "loss": 0.0001, "step": 23016 }, { "epoch": 6.071758343226487, "grad_norm": 0.4961918294429779, "learning_rate": 2.6207684867669043e-06, "loss": 0.0065, "step": 23018 }, { "epoch": 6.072285978103152, "grad_norm": 0.0009702967363409698, "learning_rate": 2.6204167765761012e-06, "loss": 0.0002, "step": 23020 }, { "epoch": 6.072813612979818, "grad_norm": 0.001706438371911645, "learning_rate": 2.6200650663852986e-06, "loss": 0.0001, "step": 23022 }, { "epoch": 6.073341247856483, "grad_norm": 0.0068820626474916935, "learning_rate": 2.6197133561944955e-06, "loss": 0.0001, "step": 23024 }, { "epoch": 6.073868882733149, "grad_norm": 0.09249178320169449, "learning_rate": 2.619361646003693e-06, "loss": 0.0002, "step": 23026 }, { "epoch": 6.074396517609814, "grad_norm": 0.002933600451797247, "learning_rate": 2.6190099358128903e-06, "loss": 0.0033, "step": 23028 }, { "epoch": 6.07492415248648, "grad_norm": 0.010704824700951576, "learning_rate": 2.6186582256220877e-06, "loss": 0.0001, "step": 23030 }, { "epoch": 6.075451787363145, "grad_norm": 0.020758455619215965, "learning_rate": 2.6183065154312846e-06, "loss": 0.0002, "step": 23032 }, { "epoch": 6.0759794222398105, "grad_norm": 0.29953277111053467, "learning_rate": 2.617954805240482e-06, "loss": 0.0005, "step": 23034 }, { "epoch": 6.076507057116475, "grad_norm": 0.0740845650434494, "learning_rate": 2.617603095049679e-06, "loss": 0.0063, "step": 23036 }, { "epoch": 6.07703469199314, "grad_norm": 0.2875111401081085, "learning_rate": 2.6172513848588763e-06, "loss": 0.0008, "step": 23038 }, { "epoch": 6.077562326869806, "grad_norm": 0.012901866808533669, "learning_rate": 2.6168996746680737e-06, "loss": 0.0002, "step": 23040 }, { "epoch": 6.078089961746471, "grad_norm": 0.030146615579724312, "learning_rate": 2.616547964477271e-06, "loss": 0.0004, "step": 23042 }, { "epoch": 6.078617596623137, "grad_norm": 0.06793678551912308, "learning_rate": 2.616196254286468e-06, "loss": 0.0029, "step": 23044 }, { "epoch": 6.079145231499802, "grad_norm": 0.014789558947086334, "learning_rate": 2.615844544095665e-06, "loss": 0.0023, "step": 23046 }, { "epoch": 6.079672866376468, "grad_norm": 0.01431737095117569, "learning_rate": 2.6154928339048624e-06, "loss": 0.0001, "step": 23048 }, { "epoch": 6.080200501253133, "grad_norm": 0.004266174044460058, "learning_rate": 2.6151411237140597e-06, "loss": 0.0002, "step": 23050 }, { "epoch": 6.080728136129798, "grad_norm": 0.1471543163061142, "learning_rate": 2.614789413523257e-06, "loss": 0.0027, "step": 23052 }, { "epoch": 6.081255771006464, "grad_norm": 0.052321743220090866, "learning_rate": 2.614437703332454e-06, "loss": 0.0006, "step": 23054 }, { "epoch": 6.081783405883129, "grad_norm": 0.05804084613919258, "learning_rate": 2.6140859931416514e-06, "loss": 0.0002, "step": 23056 }, { "epoch": 6.0823110407597945, "grad_norm": 0.0013069679262116551, "learning_rate": 2.6137342829508484e-06, "loss": 0.0001, "step": 23058 }, { "epoch": 6.082838675636459, "grad_norm": 0.008966470137238503, "learning_rate": 2.6133825727600458e-06, "loss": 0.0001, "step": 23060 }, { "epoch": 6.083366310513125, "grad_norm": 0.007892895489931107, "learning_rate": 2.613030862569243e-06, "loss": 0.0002, "step": 23062 }, { "epoch": 6.08389394538979, "grad_norm": 0.3797305226325989, "learning_rate": 2.6126791523784405e-06, "loss": 0.0012, "step": 23064 }, { "epoch": 6.084421580266455, "grad_norm": 0.198558509349823, "learning_rate": 2.6123274421876375e-06, "loss": 0.0006, "step": 23066 }, { "epoch": 6.084949215143121, "grad_norm": 0.001003359560854733, "learning_rate": 2.611975731996835e-06, "loss": 0.0001, "step": 23068 }, { "epoch": 6.085476850019786, "grad_norm": 0.009073992259800434, "learning_rate": 2.611624021806032e-06, "loss": 0.0001, "step": 23070 }, { "epoch": 6.086004484896452, "grad_norm": 0.0030534095130860806, "learning_rate": 2.611272311615229e-06, "loss": 0.0001, "step": 23072 }, { "epoch": 6.086532119773117, "grad_norm": 0.14882400631904602, "learning_rate": 2.6109206014244265e-06, "loss": 0.0065, "step": 23074 }, { "epoch": 6.087059754649783, "grad_norm": 0.0028996795881539583, "learning_rate": 2.6105688912336235e-06, "loss": 0.0002, "step": 23076 }, { "epoch": 6.087587389526448, "grad_norm": 0.7006451487541199, "learning_rate": 2.610217181042821e-06, "loss": 0.0065, "step": 23078 }, { "epoch": 6.0881150244031135, "grad_norm": 0.1604824960231781, "learning_rate": 2.609865470852018e-06, "loss": 0.0008, "step": 23080 }, { "epoch": 6.088642659279778, "grad_norm": 0.1443605124950409, "learning_rate": 2.609513760661215e-06, "loss": 0.0008, "step": 23082 }, { "epoch": 6.089170294156443, "grad_norm": 0.0028345687314867973, "learning_rate": 2.609162050470412e-06, "loss": 0.0001, "step": 23084 }, { "epoch": 6.089697929033109, "grad_norm": 0.2667548954486847, "learning_rate": 2.6088103402796095e-06, "loss": 0.0006, "step": 23086 }, { "epoch": 6.090225563909774, "grad_norm": 0.006923678796738386, "learning_rate": 2.608458630088807e-06, "loss": 0.0002, "step": 23088 }, { "epoch": 6.09075319878644, "grad_norm": 0.01752515323460102, "learning_rate": 2.6081069198980043e-06, "loss": 0.0001, "step": 23090 }, { "epoch": 6.091280833663105, "grad_norm": 0.21392199397087097, "learning_rate": 2.6077552097072012e-06, "loss": 0.0004, "step": 23092 }, { "epoch": 6.091808468539771, "grad_norm": 0.010113094933331013, "learning_rate": 2.6074034995163986e-06, "loss": 0.0001, "step": 23094 }, { "epoch": 6.092336103416436, "grad_norm": 0.005860926583409309, "learning_rate": 2.6070517893255956e-06, "loss": 0.0001, "step": 23096 }, { "epoch": 6.092863738293101, "grad_norm": 0.002651276532560587, "learning_rate": 2.606700079134793e-06, "loss": 0.0001, "step": 23098 }, { "epoch": 6.093391373169767, "grad_norm": 0.004074914380908012, "learning_rate": 2.6063483689439903e-06, "loss": 0.0001, "step": 23100 }, { "epoch": 6.093919008046432, "grad_norm": 0.39872869849205017, "learning_rate": 2.6059966587531877e-06, "loss": 0.0022, "step": 23102 }, { "epoch": 6.0944466429230975, "grad_norm": 0.33420175313949585, "learning_rate": 2.6056449485623846e-06, "loss": 0.0028, "step": 23104 }, { "epoch": 6.094974277799762, "grad_norm": 0.5270063877105713, "learning_rate": 2.6052932383715816e-06, "loss": 0.0007, "step": 23106 }, { "epoch": 6.095501912676428, "grad_norm": 0.8576833009719849, "learning_rate": 2.604941528180779e-06, "loss": 0.0008, "step": 23108 }, { "epoch": 6.096029547553093, "grad_norm": 0.004495244938880205, "learning_rate": 2.6045898179899764e-06, "loss": 0.0001, "step": 23110 }, { "epoch": 6.096557182429758, "grad_norm": 0.0025855996645987034, "learning_rate": 2.6042381077991737e-06, "loss": 0.0002, "step": 23112 }, { "epoch": 6.097084817306424, "grad_norm": 0.0063085234723985195, "learning_rate": 2.6038863976083707e-06, "loss": 0.0059, "step": 23114 }, { "epoch": 6.097612452183089, "grad_norm": 0.008556846529245377, "learning_rate": 2.603534687417568e-06, "loss": 0.0001, "step": 23116 }, { "epoch": 6.098140087059755, "grad_norm": 0.0033393055200576782, "learning_rate": 2.603182977226765e-06, "loss": 0.0011, "step": 23118 }, { "epoch": 6.09866772193642, "grad_norm": 0.016545768827199936, "learning_rate": 2.6028312670359624e-06, "loss": 0.0002, "step": 23120 }, { "epoch": 6.099195356813086, "grad_norm": 1.628388524055481, "learning_rate": 2.6024795568451598e-06, "loss": 0.0026, "step": 23122 }, { "epoch": 6.099722991689751, "grad_norm": 0.026176869869232178, "learning_rate": 2.602127846654357e-06, "loss": 0.0002, "step": 23124 }, { "epoch": 6.100250626566416, "grad_norm": 0.06280318647623062, "learning_rate": 2.601776136463554e-06, "loss": 0.0002, "step": 23126 }, { "epoch": 6.100778261443081, "grad_norm": 0.009957005269825459, "learning_rate": 2.6014244262727515e-06, "loss": 0.0003, "step": 23128 }, { "epoch": 6.101305896319746, "grad_norm": 0.004448591265827417, "learning_rate": 2.6010727160819484e-06, "loss": 0.0001, "step": 23130 }, { "epoch": 6.101833531196412, "grad_norm": 0.010481719858944416, "learning_rate": 2.600721005891146e-06, "loss": 0.0027, "step": 23132 }, { "epoch": 6.102361166073077, "grad_norm": 0.005048608873039484, "learning_rate": 2.600369295700343e-06, "loss": 0.0003, "step": 23134 }, { "epoch": 6.102888800949743, "grad_norm": 0.01043496374040842, "learning_rate": 2.60001758550954e-06, "loss": 0.0048, "step": 23136 }, { "epoch": 6.103416435826408, "grad_norm": 0.19084078073501587, "learning_rate": 2.5996658753187375e-06, "loss": 0.0008, "step": 23138 }, { "epoch": 6.103944070703074, "grad_norm": 0.018992681056261063, "learning_rate": 2.5993141651279344e-06, "loss": 0.0001, "step": 23140 }, { "epoch": 6.104471705579739, "grad_norm": 0.0015073124086484313, "learning_rate": 2.598962454937132e-06, "loss": 0.0001, "step": 23142 }, { "epoch": 6.104999340456404, "grad_norm": 0.0024311919696629047, "learning_rate": 2.5986107447463288e-06, "loss": 0.0001, "step": 23144 }, { "epoch": 6.10552697533307, "grad_norm": 0.37725088000297546, "learning_rate": 2.5982590345555266e-06, "loss": 0.0049, "step": 23146 }, { "epoch": 6.106054610209735, "grad_norm": 0.14442747831344604, "learning_rate": 2.5979073243647235e-06, "loss": 0.0002, "step": 23148 }, { "epoch": 6.1065822450864005, "grad_norm": 0.031280726194381714, "learning_rate": 2.597555614173921e-06, "loss": 0.0004, "step": 23150 }, { "epoch": 6.107109879963065, "grad_norm": 0.002345639280974865, "learning_rate": 2.597203903983118e-06, "loss": 0.0004, "step": 23152 }, { "epoch": 6.107637514839731, "grad_norm": 0.004129132721573114, "learning_rate": 2.5968521937923152e-06, "loss": 0.0001, "step": 23154 }, { "epoch": 6.108165149716396, "grad_norm": 0.0017456519417464733, "learning_rate": 2.596500483601512e-06, "loss": 0.0014, "step": 23156 }, { "epoch": 6.108692784593061, "grad_norm": 0.838391125202179, "learning_rate": 2.59614877341071e-06, "loss": 0.0033, "step": 23158 }, { "epoch": 6.109220419469727, "grad_norm": 0.003109677927568555, "learning_rate": 2.595797063219907e-06, "loss": 0.0027, "step": 23160 }, { "epoch": 6.109748054346392, "grad_norm": 0.0038119792006909847, "learning_rate": 2.5954453530291043e-06, "loss": 0.0013, "step": 23162 }, { "epoch": 6.110275689223058, "grad_norm": 0.00191014027222991, "learning_rate": 2.5950936428383013e-06, "loss": 0.0001, "step": 23164 }, { "epoch": 6.110803324099723, "grad_norm": 0.003193710697814822, "learning_rate": 2.5947419326474982e-06, "loss": 0.0001, "step": 23166 }, { "epoch": 6.111330958976389, "grad_norm": 0.018298417329788208, "learning_rate": 2.5943902224566956e-06, "loss": 0.0002, "step": 23168 }, { "epoch": 6.111858593853054, "grad_norm": 0.0016169280279427767, "learning_rate": 2.594038512265893e-06, "loss": 0.0003, "step": 23170 }, { "epoch": 6.112386228729719, "grad_norm": 0.0013000572798773646, "learning_rate": 2.5936868020750903e-06, "loss": 0.0001, "step": 23172 }, { "epoch": 6.1129138636063844, "grad_norm": 0.003137928433716297, "learning_rate": 2.5933350918842873e-06, "loss": 0.0007, "step": 23174 }, { "epoch": 6.113441498483049, "grad_norm": 0.005675771739333868, "learning_rate": 2.5929833816934847e-06, "loss": 0.0001, "step": 23176 }, { "epoch": 6.113969133359715, "grad_norm": 0.006443183869123459, "learning_rate": 2.5926316715026816e-06, "loss": 0.0001, "step": 23178 }, { "epoch": 6.11449676823638, "grad_norm": 0.017506001517176628, "learning_rate": 2.592279961311879e-06, "loss": 0.0018, "step": 23180 }, { "epoch": 6.115024403113046, "grad_norm": 0.12173517793416977, "learning_rate": 2.5919282511210764e-06, "loss": 0.0002, "step": 23182 }, { "epoch": 6.115552037989711, "grad_norm": 0.028388887643814087, "learning_rate": 2.5915765409302738e-06, "loss": 0.0034, "step": 23184 }, { "epoch": 6.116079672866377, "grad_norm": 0.23497694730758667, "learning_rate": 2.5912248307394707e-06, "loss": 0.0004, "step": 23186 }, { "epoch": 6.116607307743042, "grad_norm": 0.05730912834405899, "learning_rate": 2.590873120548668e-06, "loss": 0.0037, "step": 23188 }, { "epoch": 6.117134942619707, "grad_norm": 0.02474335953593254, "learning_rate": 2.590521410357865e-06, "loss": 0.0005, "step": 23190 }, { "epoch": 6.117662577496373, "grad_norm": 0.34999239444732666, "learning_rate": 2.5901697001670624e-06, "loss": 0.0014, "step": 23192 }, { "epoch": 6.118190212373038, "grad_norm": 0.0023434513714164495, "learning_rate": 2.5898179899762598e-06, "loss": 0.0001, "step": 23194 }, { "epoch": 6.1187178472497035, "grad_norm": 0.025106903165578842, "learning_rate": 2.5894662797854567e-06, "loss": 0.0002, "step": 23196 }, { "epoch": 6.119245482126368, "grad_norm": 0.005237163044512272, "learning_rate": 2.589114569594654e-06, "loss": 0.0001, "step": 23198 }, { "epoch": 6.119773117003034, "grad_norm": 0.005010145716369152, "learning_rate": 2.588762859403851e-06, "loss": 0.0001, "step": 23200 }, { "epoch": 6.120300751879699, "grad_norm": 0.001453834818676114, "learning_rate": 2.5884111492130484e-06, "loss": 0.0001, "step": 23202 }, { "epoch": 6.120828386756364, "grad_norm": 0.021978802978992462, "learning_rate": 2.588059439022246e-06, "loss": 0.0001, "step": 23204 }, { "epoch": 6.12135602163303, "grad_norm": 0.008301082998514175, "learning_rate": 2.587707728831443e-06, "loss": 0.0001, "step": 23206 }, { "epoch": 6.121883656509695, "grad_norm": 0.019652049988508224, "learning_rate": 2.58735601864064e-06, "loss": 0.0001, "step": 23208 }, { "epoch": 6.122411291386361, "grad_norm": 0.13223032653331757, "learning_rate": 2.5870043084498375e-06, "loss": 0.001, "step": 23210 }, { "epoch": 6.122938926263026, "grad_norm": 0.0011103851720690727, "learning_rate": 2.5866525982590345e-06, "loss": 0.0001, "step": 23212 }, { "epoch": 6.123466561139692, "grad_norm": 0.012693332508206367, "learning_rate": 2.586300888068232e-06, "loss": 0.0001, "step": 23214 }, { "epoch": 6.123994196016357, "grad_norm": 0.006448271684348583, "learning_rate": 2.5859491778774292e-06, "loss": 0.0001, "step": 23216 }, { "epoch": 6.124521830893022, "grad_norm": 0.004803441930562258, "learning_rate": 2.5855974676866266e-06, "loss": 0.0004, "step": 23218 }, { "epoch": 6.1250494657696875, "grad_norm": 0.10940619558095932, "learning_rate": 2.5852457574958236e-06, "loss": 0.0112, "step": 23220 }, { "epoch": 6.125577100646352, "grad_norm": 0.0026940167881548405, "learning_rate": 2.584894047305021e-06, "loss": 0.002, "step": 23222 }, { "epoch": 6.126104735523018, "grad_norm": 0.0009030470973812044, "learning_rate": 2.584542337114218e-06, "loss": 0.0035, "step": 23224 }, { "epoch": 6.126632370399683, "grad_norm": 0.0024488677736371756, "learning_rate": 2.584190626923415e-06, "loss": 0.0001, "step": 23226 }, { "epoch": 6.127160005276349, "grad_norm": 0.002644289517775178, "learning_rate": 2.583838916732612e-06, "loss": 0.0001, "step": 23228 }, { "epoch": 6.127687640153014, "grad_norm": 0.003378194523975253, "learning_rate": 2.5834872065418096e-06, "loss": 0.0001, "step": 23230 }, { "epoch": 6.12821527502968, "grad_norm": 0.00957066286355257, "learning_rate": 2.583135496351007e-06, "loss": 0.0001, "step": 23232 }, { "epoch": 6.128742909906345, "grad_norm": 0.005559791345149279, "learning_rate": 2.582783786160204e-06, "loss": 0.0001, "step": 23234 }, { "epoch": 6.12927054478301, "grad_norm": 0.006135452538728714, "learning_rate": 2.5824320759694013e-06, "loss": 0.0001, "step": 23236 }, { "epoch": 6.129798179659676, "grad_norm": 0.00768340565264225, "learning_rate": 2.5820803657785982e-06, "loss": 0.0003, "step": 23238 }, { "epoch": 6.130325814536341, "grad_norm": 0.14377425611019135, "learning_rate": 2.5817286555877956e-06, "loss": 0.0004, "step": 23240 }, { "epoch": 6.1308534494130065, "grad_norm": 0.22158344089984894, "learning_rate": 2.581376945396993e-06, "loss": 0.0049, "step": 23242 }, { "epoch": 6.131381084289671, "grad_norm": 0.008125015534460545, "learning_rate": 2.5810252352061904e-06, "loss": 0.0001, "step": 23244 }, { "epoch": 6.131908719166337, "grad_norm": 0.006096248980611563, "learning_rate": 2.5806735250153873e-06, "loss": 0.0001, "step": 23246 }, { "epoch": 6.132436354043002, "grad_norm": 0.004977000877261162, "learning_rate": 2.5803218148245847e-06, "loss": 0.0001, "step": 23248 }, { "epoch": 6.132963988919667, "grad_norm": 0.3773101270198822, "learning_rate": 2.5799701046337816e-06, "loss": 0.0006, "step": 23250 }, { "epoch": 6.133491623796333, "grad_norm": 0.002775122644379735, "learning_rate": 2.579618394442979e-06, "loss": 0.0025, "step": 23252 }, { "epoch": 6.134019258672998, "grad_norm": 0.02715391106903553, "learning_rate": 2.5792666842521764e-06, "loss": 0.0001, "step": 23254 }, { "epoch": 6.134546893549664, "grad_norm": 0.002388527151197195, "learning_rate": 2.5789149740613734e-06, "loss": 0.0001, "step": 23256 }, { "epoch": 6.135074528426329, "grad_norm": 0.019528251141309738, "learning_rate": 2.5785632638705707e-06, "loss": 0.0001, "step": 23258 }, { "epoch": 6.135602163302995, "grad_norm": 0.17123638093471527, "learning_rate": 2.5782115536797677e-06, "loss": 0.0003, "step": 23260 }, { "epoch": 6.13612979817966, "grad_norm": 0.010978850536048412, "learning_rate": 2.577859843488965e-06, "loss": 0.0002, "step": 23262 }, { "epoch": 6.136657433056325, "grad_norm": 0.0038939029909670353, "learning_rate": 2.5775081332981624e-06, "loss": 0.0001, "step": 23264 }, { "epoch": 6.1371850679329905, "grad_norm": 0.008871525526046753, "learning_rate": 2.57715642310736e-06, "loss": 0.0001, "step": 23266 }, { "epoch": 6.137712702809655, "grad_norm": 0.004263410810381174, "learning_rate": 2.5768047129165568e-06, "loss": 0.0001, "step": 23268 }, { "epoch": 6.138240337686321, "grad_norm": 0.01704450510442257, "learning_rate": 2.576453002725754e-06, "loss": 0.0001, "step": 23270 }, { "epoch": 6.138767972562986, "grad_norm": 0.0026572737842798233, "learning_rate": 2.576101292534951e-06, "loss": 0.0001, "step": 23272 }, { "epoch": 6.139295607439652, "grad_norm": 0.12256001681089401, "learning_rate": 2.5757495823441485e-06, "loss": 0.0103, "step": 23274 }, { "epoch": 6.139823242316317, "grad_norm": 0.07509907335042953, "learning_rate": 2.575397872153346e-06, "loss": 0.0022, "step": 23276 }, { "epoch": 6.140350877192983, "grad_norm": 0.001837834483012557, "learning_rate": 2.5750461619625432e-06, "loss": 0.0004, "step": 23278 }, { "epoch": 6.140878512069648, "grad_norm": 0.009354970417916775, "learning_rate": 2.57469445177174e-06, "loss": 0.0001, "step": 23280 }, { "epoch": 6.141406146946313, "grad_norm": 0.009237313643097878, "learning_rate": 2.5743427415809375e-06, "loss": 0.0002, "step": 23282 }, { "epoch": 6.141933781822979, "grad_norm": 0.008719643577933311, "learning_rate": 2.5739910313901345e-06, "loss": 0.0003, "step": 23284 }, { "epoch": 6.142461416699644, "grad_norm": 0.003547854721546173, "learning_rate": 2.5736393211993315e-06, "loss": 0.0056, "step": 23286 }, { "epoch": 6.1429890515763095, "grad_norm": 0.02165074087679386, "learning_rate": 2.5732876110085292e-06, "loss": 0.0002, "step": 23288 }, { "epoch": 6.143516686452974, "grad_norm": 0.05093333125114441, "learning_rate": 2.572935900817726e-06, "loss": 0.0001, "step": 23290 }, { "epoch": 6.14404432132964, "grad_norm": 0.0034431908279657364, "learning_rate": 2.5725841906269236e-06, "loss": 0.0001, "step": 23292 }, { "epoch": 6.144571956206305, "grad_norm": 0.026609789580106735, "learning_rate": 2.5722324804361205e-06, "loss": 0.0024, "step": 23294 }, { "epoch": 6.14509959108297, "grad_norm": 0.015018275007605553, "learning_rate": 2.571880770245318e-06, "loss": 0.0002, "step": 23296 }, { "epoch": 6.145627225959636, "grad_norm": 0.09679887443780899, "learning_rate": 2.571529060054515e-06, "loss": 0.0004, "step": 23298 }, { "epoch": 6.146154860836301, "grad_norm": 0.009953909553587437, "learning_rate": 2.5711773498637122e-06, "loss": 0.0002, "step": 23300 }, { "epoch": 6.146682495712967, "grad_norm": 0.004995525348931551, "learning_rate": 2.5708256396729096e-06, "loss": 0.0002, "step": 23302 }, { "epoch": 6.147210130589632, "grad_norm": 0.3175485134124756, "learning_rate": 2.570473929482107e-06, "loss": 0.0018, "step": 23304 }, { "epoch": 6.147737765466298, "grad_norm": 0.00585057120770216, "learning_rate": 2.570122219291304e-06, "loss": 0.0001, "step": 23306 }, { "epoch": 6.148265400342963, "grad_norm": 0.0656430721282959, "learning_rate": 2.5697705091005013e-06, "loss": 0.0097, "step": 23308 }, { "epoch": 6.148793035219628, "grad_norm": 0.005259951576590538, "learning_rate": 2.5694187989096983e-06, "loss": 0.0001, "step": 23310 }, { "epoch": 6.1493206700962935, "grad_norm": 0.004327061120420694, "learning_rate": 2.5690670887188956e-06, "loss": 0.0001, "step": 23312 }, { "epoch": 6.149848304972958, "grad_norm": 0.004133015405386686, "learning_rate": 2.568715378528093e-06, "loss": 0.0004, "step": 23314 }, { "epoch": 6.150375939849624, "grad_norm": 0.14538414776325226, "learning_rate": 2.56836366833729e-06, "loss": 0.0004, "step": 23316 }, { "epoch": 6.150903574726289, "grad_norm": 0.03948052227497101, "learning_rate": 2.5680119581464873e-06, "loss": 0.0002, "step": 23318 }, { "epoch": 6.151431209602955, "grad_norm": 0.019099755212664604, "learning_rate": 2.5676602479556843e-06, "loss": 0.0001, "step": 23320 }, { "epoch": 6.15195884447962, "grad_norm": 0.007421310991048813, "learning_rate": 2.5673085377648817e-06, "loss": 0.0001, "step": 23322 }, { "epoch": 6.152486479356286, "grad_norm": 0.08819644153118134, "learning_rate": 2.566956827574079e-06, "loss": 0.0004, "step": 23324 }, { "epoch": 6.153014114232951, "grad_norm": 0.5247269868850708, "learning_rate": 2.5666051173832764e-06, "loss": 0.0027, "step": 23326 }, { "epoch": 6.153541749109616, "grad_norm": 0.005034327507019043, "learning_rate": 2.5662534071924734e-06, "loss": 0.0001, "step": 23328 }, { "epoch": 6.154069383986282, "grad_norm": 0.026557231321930885, "learning_rate": 2.5659016970016708e-06, "loss": 0.0059, "step": 23330 }, { "epoch": 6.154597018862947, "grad_norm": 0.02725721336901188, "learning_rate": 2.5655499868108677e-06, "loss": 0.0002, "step": 23332 }, { "epoch": 6.1551246537396125, "grad_norm": 0.0494476780295372, "learning_rate": 2.565198276620065e-06, "loss": 0.0008, "step": 23334 }, { "epoch": 6.1556522886162774, "grad_norm": 0.05313917621970177, "learning_rate": 2.5648465664292625e-06, "loss": 0.0002, "step": 23336 }, { "epoch": 6.156179923492943, "grad_norm": 0.006942293141037226, "learning_rate": 2.56449485623846e-06, "loss": 0.0001, "step": 23338 }, { "epoch": 6.156707558369608, "grad_norm": 0.01979653351008892, "learning_rate": 2.5641431460476568e-06, "loss": 0.0001, "step": 23340 }, { "epoch": 6.157235193246273, "grad_norm": 0.6022982597351074, "learning_rate": 2.563791435856854e-06, "loss": 0.0067, "step": 23342 }, { "epoch": 6.157762828122939, "grad_norm": 0.13882772624492645, "learning_rate": 2.563439725666051e-06, "loss": 0.0007, "step": 23344 }, { "epoch": 6.158290462999604, "grad_norm": 0.0926324799656868, "learning_rate": 2.5630880154752485e-06, "loss": 0.0004, "step": 23346 }, { "epoch": 6.15881809787627, "grad_norm": 0.1280309110879898, "learning_rate": 2.562736305284446e-06, "loss": 0.0003, "step": 23348 }, { "epoch": 6.159345732752935, "grad_norm": 0.8482315540313721, "learning_rate": 2.562384595093643e-06, "loss": 0.0025, "step": 23350 }, { "epoch": 6.159873367629601, "grad_norm": 0.23555463552474976, "learning_rate": 2.56203288490284e-06, "loss": 0.0004, "step": 23352 }, { "epoch": 6.160401002506266, "grad_norm": 0.009439940564334393, "learning_rate": 2.561681174712037e-06, "loss": 0.0001, "step": 23354 }, { "epoch": 6.160928637382931, "grad_norm": 0.09063389152288437, "learning_rate": 2.5613294645212345e-06, "loss": 0.0004, "step": 23356 }, { "epoch": 6.1614562722595965, "grad_norm": 0.06620486080646515, "learning_rate": 2.5609777543304315e-06, "loss": 0.0002, "step": 23358 }, { "epoch": 6.161983907136261, "grad_norm": 0.027072235941886902, "learning_rate": 2.5606260441396293e-06, "loss": 0.0001, "step": 23360 }, { "epoch": 6.162511542012927, "grad_norm": 0.04398869350552559, "learning_rate": 2.5602743339488262e-06, "loss": 0.0001, "step": 23362 }, { "epoch": 6.163039176889592, "grad_norm": 0.0033021410927176476, "learning_rate": 2.5599226237580236e-06, "loss": 0.0005, "step": 23364 }, { "epoch": 6.163566811766258, "grad_norm": 0.005024549085646868, "learning_rate": 2.5595709135672206e-06, "loss": 0.0001, "step": 23366 }, { "epoch": 6.164094446642923, "grad_norm": 0.0953163430094719, "learning_rate": 2.559219203376418e-06, "loss": 0.0023, "step": 23368 }, { "epoch": 6.164622081519588, "grad_norm": 0.005804297048598528, "learning_rate": 2.558867493185615e-06, "loss": 0.0001, "step": 23370 }, { "epoch": 6.165149716396254, "grad_norm": 0.0019899634644389153, "learning_rate": 2.5585157829948127e-06, "loss": 0.0013, "step": 23372 }, { "epoch": 6.165677351272919, "grad_norm": 0.0759904757142067, "learning_rate": 2.5581640728040096e-06, "loss": 0.0017, "step": 23374 }, { "epoch": 6.166204986149585, "grad_norm": 0.005949961952865124, "learning_rate": 2.557812362613207e-06, "loss": 0.0005, "step": 23376 }, { "epoch": 6.16673262102625, "grad_norm": 0.08568727970123291, "learning_rate": 2.557460652422404e-06, "loss": 0.0005, "step": 23378 }, { "epoch": 6.1672602559029155, "grad_norm": 0.002131668385118246, "learning_rate": 2.557108942231601e-06, "loss": 0.0001, "step": 23380 }, { "epoch": 6.1677878907795805, "grad_norm": 0.2810046970844269, "learning_rate": 2.5567572320407983e-06, "loss": 0.0067, "step": 23382 }, { "epoch": 6.168315525656246, "grad_norm": 0.0022872479166835546, "learning_rate": 2.5564055218499957e-06, "loss": 0.0001, "step": 23384 }, { "epoch": 6.168843160532911, "grad_norm": 0.008600304834544659, "learning_rate": 2.556053811659193e-06, "loss": 0.0001, "step": 23386 }, { "epoch": 6.169370795409576, "grad_norm": 0.00073112768586725, "learning_rate": 2.55570210146839e-06, "loss": 0.0001, "step": 23388 }, { "epoch": 6.169898430286242, "grad_norm": 0.004213486332446337, "learning_rate": 2.5553503912775874e-06, "loss": 0.0001, "step": 23390 }, { "epoch": 6.170426065162907, "grad_norm": 0.024915147572755814, "learning_rate": 2.5549986810867843e-06, "loss": 0.0001, "step": 23392 }, { "epoch": 6.170953700039573, "grad_norm": 0.0048879110254347324, "learning_rate": 2.5546469708959817e-06, "loss": 0.0001, "step": 23394 }, { "epoch": 6.171481334916238, "grad_norm": 0.011015848256647587, "learning_rate": 2.554295260705179e-06, "loss": 0.0001, "step": 23396 }, { "epoch": 6.172008969792904, "grad_norm": 0.21260714530944824, "learning_rate": 2.5539435505143765e-06, "loss": 0.012, "step": 23398 }, { "epoch": 6.172536604669569, "grad_norm": 0.003754952223971486, "learning_rate": 2.5535918403235734e-06, "loss": 0.0001, "step": 23400 }, { "epoch": 6.173064239546234, "grad_norm": 0.00430180411785841, "learning_rate": 2.5532401301327708e-06, "loss": 0.0001, "step": 23402 }, { "epoch": 6.1735918744228995, "grad_norm": 0.20275139808654785, "learning_rate": 2.5528884199419677e-06, "loss": 0.0004, "step": 23404 }, { "epoch": 6.174119509299564, "grad_norm": 0.005839088000357151, "learning_rate": 2.552536709751165e-06, "loss": 0.0001, "step": 23406 }, { "epoch": 6.17464714417623, "grad_norm": 0.023157941177487373, "learning_rate": 2.5521849995603625e-06, "loss": 0.0001, "step": 23408 }, { "epoch": 6.175174779052895, "grad_norm": 0.12041417509317398, "learning_rate": 2.5518332893695594e-06, "loss": 0.0003, "step": 23410 }, { "epoch": 6.175702413929561, "grad_norm": 0.0670049786567688, "learning_rate": 2.551481579178757e-06, "loss": 0.0002, "step": 23412 }, { "epoch": 6.176230048806226, "grad_norm": 0.011008371599018574, "learning_rate": 2.5511298689879538e-06, "loss": 0.0001, "step": 23414 }, { "epoch": 6.176757683682891, "grad_norm": 0.010585985146462917, "learning_rate": 2.550778158797151e-06, "loss": 0.0001, "step": 23416 }, { "epoch": 6.177285318559557, "grad_norm": 0.006655486766248941, "learning_rate": 2.5504264486063485e-06, "loss": 0.0001, "step": 23418 }, { "epoch": 6.177812953436222, "grad_norm": 0.0013780365698039532, "learning_rate": 2.550074738415546e-06, "loss": 0.0001, "step": 23420 }, { "epoch": 6.178340588312888, "grad_norm": 0.0041112350299954414, "learning_rate": 2.549723028224743e-06, "loss": 0.0011, "step": 23422 }, { "epoch": 6.178868223189553, "grad_norm": 0.0017346187960356474, "learning_rate": 2.5493713180339402e-06, "loss": 0.0001, "step": 23424 }, { "epoch": 6.1793958580662185, "grad_norm": 0.005663829389959574, "learning_rate": 2.549019607843137e-06, "loss": 0.0001, "step": 23426 }, { "epoch": 6.1799234929428835, "grad_norm": 0.2541101574897766, "learning_rate": 2.5486678976523345e-06, "loss": 0.0064, "step": 23428 }, { "epoch": 6.180451127819548, "grad_norm": 0.4331907033920288, "learning_rate": 2.548316187461532e-06, "loss": 0.0007, "step": 23430 }, { "epoch": 6.180978762696214, "grad_norm": 0.001451111282221973, "learning_rate": 2.5479644772707293e-06, "loss": 0.0001, "step": 23432 }, { "epoch": 6.181506397572879, "grad_norm": 0.005540730431675911, "learning_rate": 2.5476127670799263e-06, "loss": 0.0023, "step": 23434 }, { "epoch": 6.182034032449545, "grad_norm": 0.004599760752171278, "learning_rate": 2.5472610568891236e-06, "loss": 0.0002, "step": 23436 }, { "epoch": 6.18256166732621, "grad_norm": 0.00975702702999115, "learning_rate": 2.5469093466983206e-06, "loss": 0.0001, "step": 23438 }, { "epoch": 6.183089302202876, "grad_norm": 0.0014738013269379735, "learning_rate": 2.5465576365075175e-06, "loss": 0.0001, "step": 23440 }, { "epoch": 6.183616937079541, "grad_norm": 0.22296369075775146, "learning_rate": 2.546205926316715e-06, "loss": 0.0008, "step": 23442 }, { "epoch": 6.184144571956207, "grad_norm": 0.5223017930984497, "learning_rate": 2.5458542161259123e-06, "loss": 0.0019, "step": 23444 }, { "epoch": 6.184672206832872, "grad_norm": 0.35409921407699585, "learning_rate": 2.5455025059351097e-06, "loss": 0.0062, "step": 23446 }, { "epoch": 6.185199841709537, "grad_norm": 0.5678011178970337, "learning_rate": 2.5451507957443066e-06, "loss": 0.0008, "step": 23448 }, { "epoch": 6.1857274765862025, "grad_norm": 0.020560406148433685, "learning_rate": 2.544799085553504e-06, "loss": 0.0001, "step": 23450 }, { "epoch": 6.186255111462867, "grad_norm": 0.04740668460726738, "learning_rate": 2.544447375362701e-06, "loss": 0.0002, "step": 23452 }, { "epoch": 6.186782746339533, "grad_norm": 0.001517162541858852, "learning_rate": 2.5440956651718983e-06, "loss": 0.0001, "step": 23454 }, { "epoch": 6.187310381216198, "grad_norm": 0.018507272005081177, "learning_rate": 2.5437439549810957e-06, "loss": 0.0001, "step": 23456 }, { "epoch": 6.187838016092864, "grad_norm": 0.0022614223416894674, "learning_rate": 2.543392244790293e-06, "loss": 0.0001, "step": 23458 }, { "epoch": 6.188365650969529, "grad_norm": 0.03254012018442154, "learning_rate": 2.54304053459949e-06, "loss": 0.0001, "step": 23460 }, { "epoch": 6.188893285846194, "grad_norm": 0.36198922991752625, "learning_rate": 2.5426888244086874e-06, "loss": 0.0004, "step": 23462 }, { "epoch": 6.18942092072286, "grad_norm": 0.4796197712421417, "learning_rate": 2.5423371142178843e-06, "loss": 0.0028, "step": 23464 }, { "epoch": 6.189948555599525, "grad_norm": 0.0035195646341890097, "learning_rate": 2.5419854040270817e-06, "loss": 0.0001, "step": 23466 }, { "epoch": 6.190476190476191, "grad_norm": 0.14670994877815247, "learning_rate": 2.541633693836279e-06, "loss": 0.0007, "step": 23468 }, { "epoch": 6.191003825352856, "grad_norm": 0.1589188277721405, "learning_rate": 2.541281983645476e-06, "loss": 0.0004, "step": 23470 }, { "epoch": 6.1915314602295215, "grad_norm": 0.1843455582857132, "learning_rate": 2.5409302734546734e-06, "loss": 0.0003, "step": 23472 }, { "epoch": 6.1920590951061865, "grad_norm": 0.026337426155805588, "learning_rate": 2.5405785632638704e-06, "loss": 0.0002, "step": 23474 }, { "epoch": 6.192586729982851, "grad_norm": 0.21627134084701538, "learning_rate": 2.5402268530730678e-06, "loss": 0.0025, "step": 23476 }, { "epoch": 6.193114364859517, "grad_norm": 0.005521752871572971, "learning_rate": 2.539875142882265e-06, "loss": 0.0001, "step": 23478 }, { "epoch": 6.193641999736182, "grad_norm": 0.38766878843307495, "learning_rate": 2.5395234326914625e-06, "loss": 0.0035, "step": 23480 }, { "epoch": 6.194169634612848, "grad_norm": 0.005745342932641506, "learning_rate": 2.5391717225006595e-06, "loss": 0.0001, "step": 23482 }, { "epoch": 6.194697269489513, "grad_norm": 0.006839680951088667, "learning_rate": 2.538820012309857e-06, "loss": 0.0001, "step": 23484 }, { "epoch": 6.195224904366179, "grad_norm": 0.13742482662200928, "learning_rate": 2.538468302119054e-06, "loss": 0.0003, "step": 23486 }, { "epoch": 6.195752539242844, "grad_norm": 0.002901933854445815, "learning_rate": 2.538116591928251e-06, "loss": 0.0003, "step": 23488 }, { "epoch": 6.19628017411951, "grad_norm": 0.0015294376062229276, "learning_rate": 2.5377648817374485e-06, "loss": 0.0001, "step": 23490 }, { "epoch": 6.196807808996175, "grad_norm": 0.06339596956968307, "learning_rate": 2.537413171546646e-06, "loss": 0.0001, "step": 23492 }, { "epoch": 6.19733544387284, "grad_norm": 0.006953933276236057, "learning_rate": 2.537061461355843e-06, "loss": 0.0001, "step": 23494 }, { "epoch": 6.1978630787495055, "grad_norm": 0.00538665009662509, "learning_rate": 2.5367097511650402e-06, "loss": 0.0005, "step": 23496 }, { "epoch": 6.1983907136261704, "grad_norm": 0.040996842086315155, "learning_rate": 2.536358040974237e-06, "loss": 0.0103, "step": 23498 }, { "epoch": 6.198918348502836, "grad_norm": 0.5276994109153748, "learning_rate": 2.536006330783434e-06, "loss": 0.0083, "step": 23500 }, { "epoch": 6.199445983379501, "grad_norm": 0.020750857889652252, "learning_rate": 2.535654620592632e-06, "loss": 0.0027, "step": 23502 }, { "epoch": 6.199973618256167, "grad_norm": 0.0015021563740447164, "learning_rate": 2.535302910401829e-06, "loss": 0.0008, "step": 23504 }, { "epoch": 6.200501253132832, "grad_norm": 0.0009465448674745858, "learning_rate": 2.5349512002110263e-06, "loss": 0.0002, "step": 23506 }, { "epoch": 6.201028888009497, "grad_norm": 0.0038342904299497604, "learning_rate": 2.5345994900202232e-06, "loss": 0.0001, "step": 23508 }, { "epoch": 6.201556522886163, "grad_norm": 0.007982318289577961, "learning_rate": 2.5342477798294206e-06, "loss": 0.0003, "step": 23510 }, { "epoch": 6.202084157762828, "grad_norm": 0.003898613853380084, "learning_rate": 2.5338960696386176e-06, "loss": 0.0001, "step": 23512 }, { "epoch": 6.202611792639494, "grad_norm": 0.006964511703699827, "learning_rate": 2.5335443594478154e-06, "loss": 0.0001, "step": 23514 }, { "epoch": 6.203139427516159, "grad_norm": 0.005212926771491766, "learning_rate": 2.5331926492570123e-06, "loss": 0.0001, "step": 23516 }, { "epoch": 6.2036670623928245, "grad_norm": 0.0484875924885273, "learning_rate": 2.5328409390662097e-06, "loss": 0.0002, "step": 23518 }, { "epoch": 6.2041946972694895, "grad_norm": 0.014624010771512985, "learning_rate": 2.5324892288754066e-06, "loss": 0.0001, "step": 23520 }, { "epoch": 6.204722332146154, "grad_norm": 0.005835252348333597, "learning_rate": 2.532137518684604e-06, "loss": 0.0001, "step": 23522 }, { "epoch": 6.20524996702282, "grad_norm": 0.06488819420337677, "learning_rate": 2.531785808493801e-06, "loss": 0.0002, "step": 23524 }, { "epoch": 6.205777601899485, "grad_norm": 0.0022906309459358454, "learning_rate": 2.5314340983029983e-06, "loss": 0.0001, "step": 23526 }, { "epoch": 6.206305236776151, "grad_norm": 0.014928937889635563, "learning_rate": 2.5310823881121957e-06, "loss": 0.0001, "step": 23528 }, { "epoch": 6.206832871652816, "grad_norm": 1.2472025156021118, "learning_rate": 2.5307306779213927e-06, "loss": 0.0012, "step": 23530 }, { "epoch": 6.207360506529482, "grad_norm": 0.009555241093039513, "learning_rate": 2.53037896773059e-06, "loss": 0.0001, "step": 23532 }, { "epoch": 6.207888141406147, "grad_norm": 0.015915175899863243, "learning_rate": 2.530027257539787e-06, "loss": 0.0001, "step": 23534 }, { "epoch": 6.208415776282813, "grad_norm": 0.0021278257481753826, "learning_rate": 2.5296755473489844e-06, "loss": 0.0001, "step": 23536 }, { "epoch": 6.208943411159478, "grad_norm": 0.019693417474627495, "learning_rate": 2.5293238371581818e-06, "loss": 0.0001, "step": 23538 }, { "epoch": 6.209471046036143, "grad_norm": 0.0047835176810622215, "learning_rate": 2.528972126967379e-06, "loss": 0.0001, "step": 23540 }, { "epoch": 6.2099986809128085, "grad_norm": 0.0143427150323987, "learning_rate": 2.528620416776576e-06, "loss": 0.0001, "step": 23542 }, { "epoch": 6.2105263157894735, "grad_norm": 0.4753772020339966, "learning_rate": 2.5282687065857735e-06, "loss": 0.0016, "step": 23544 }, { "epoch": 6.211053950666139, "grad_norm": 0.08826509863138199, "learning_rate": 2.5279169963949704e-06, "loss": 0.0016, "step": 23546 }, { "epoch": 6.211581585542804, "grad_norm": 0.09516122937202454, "learning_rate": 2.5275652862041678e-06, "loss": 0.0005, "step": 23548 }, { "epoch": 6.21210922041947, "grad_norm": 0.17742061614990234, "learning_rate": 2.527213576013365e-06, "loss": 0.0046, "step": 23550 }, { "epoch": 6.212636855296135, "grad_norm": 0.0324615053832531, "learning_rate": 2.5268618658225625e-06, "loss": 0.0024, "step": 23552 }, { "epoch": 6.2131644901728, "grad_norm": 0.0015195851447060704, "learning_rate": 2.5265101556317595e-06, "loss": 0.0001, "step": 23554 }, { "epoch": 6.213692125049466, "grad_norm": 0.008824172429740429, "learning_rate": 2.526158445440957e-06, "loss": 0.0002, "step": 23556 }, { "epoch": 6.214219759926131, "grad_norm": 0.3802284896373749, "learning_rate": 2.525806735250154e-06, "loss": 0.0006, "step": 23558 }, { "epoch": 6.214747394802797, "grad_norm": 0.028617139905691147, "learning_rate": 2.525455025059351e-06, "loss": 0.0001, "step": 23560 }, { "epoch": 6.215275029679462, "grad_norm": 0.004023576620966196, "learning_rate": 2.5251033148685486e-06, "loss": 0.0005, "step": 23562 }, { "epoch": 6.2158026645561275, "grad_norm": 0.0021863963920623064, "learning_rate": 2.5247516046777455e-06, "loss": 0.0004, "step": 23564 }, { "epoch": 6.2163302994327925, "grad_norm": 0.0020606990437954664, "learning_rate": 2.524399894486943e-06, "loss": 0.0002, "step": 23566 }, { "epoch": 6.216857934309457, "grad_norm": 0.05825618654489517, "learning_rate": 2.52404818429614e-06, "loss": 0.0002, "step": 23568 }, { "epoch": 6.217385569186123, "grad_norm": 0.0036065455060452223, "learning_rate": 2.5236964741053372e-06, "loss": 0.0001, "step": 23570 }, { "epoch": 6.217913204062788, "grad_norm": 0.21719856560230255, "learning_rate": 2.5233447639145346e-06, "loss": 0.0005, "step": 23572 }, { "epoch": 6.218440838939454, "grad_norm": 0.023393407464027405, "learning_rate": 2.522993053723732e-06, "loss": 0.0016, "step": 23574 }, { "epoch": 6.218968473816119, "grad_norm": 0.0062077902257442474, "learning_rate": 2.522641343532929e-06, "loss": 0.0001, "step": 23576 }, { "epoch": 6.219496108692785, "grad_norm": 0.012025423347949982, "learning_rate": 2.5222896333421263e-06, "loss": 0.0001, "step": 23578 }, { "epoch": 6.22002374356945, "grad_norm": 0.3715076148509979, "learning_rate": 2.5219379231513233e-06, "loss": 0.001, "step": 23580 }, { "epoch": 6.220551378446116, "grad_norm": 0.007079086732119322, "learning_rate": 2.5215862129605206e-06, "loss": 0.0002, "step": 23582 }, { "epoch": 6.221079013322781, "grad_norm": 0.5021139979362488, "learning_rate": 2.5212345027697176e-06, "loss": 0.0035, "step": 23584 }, { "epoch": 6.221606648199446, "grad_norm": 0.09872614592313766, "learning_rate": 2.5208827925789154e-06, "loss": 0.0019, "step": 23586 }, { "epoch": 6.2221342830761115, "grad_norm": 0.031688135117292404, "learning_rate": 2.5205310823881123e-06, "loss": 0.0002, "step": 23588 }, { "epoch": 6.2226619179527765, "grad_norm": 0.015170375816524029, "learning_rate": 2.5201793721973093e-06, "loss": 0.0001, "step": 23590 }, { "epoch": 6.223189552829442, "grad_norm": 0.12343383580446243, "learning_rate": 2.5198276620065067e-06, "loss": 0.0005, "step": 23592 }, { "epoch": 6.223717187706107, "grad_norm": 0.020339185371994972, "learning_rate": 2.5194759518157036e-06, "loss": 0.0001, "step": 23594 }, { "epoch": 6.224244822582773, "grad_norm": 0.006989162415266037, "learning_rate": 2.519124241624901e-06, "loss": 0.0001, "step": 23596 }, { "epoch": 6.224772457459438, "grad_norm": 0.004835797939449549, "learning_rate": 2.5187725314340984e-06, "loss": 0.0001, "step": 23598 }, { "epoch": 6.225300092336103, "grad_norm": 0.047337912023067474, "learning_rate": 2.5184208212432957e-06, "loss": 0.0002, "step": 23600 }, { "epoch": 6.225827727212769, "grad_norm": 0.002250660676509142, "learning_rate": 2.5180691110524927e-06, "loss": 0.0001, "step": 23602 }, { "epoch": 6.226355362089434, "grad_norm": 0.34558457136154175, "learning_rate": 2.51771740086169e-06, "loss": 0.0005, "step": 23604 }, { "epoch": 6.2268829969661, "grad_norm": 0.0038277029525488615, "learning_rate": 2.517365690670887e-06, "loss": 0.0001, "step": 23606 }, { "epoch": 6.227410631842765, "grad_norm": 0.0019718939438462257, "learning_rate": 2.5170139804800844e-06, "loss": 0.0003, "step": 23608 }, { "epoch": 6.2279382667194305, "grad_norm": 0.0017028189031407237, "learning_rate": 2.5166622702892818e-06, "loss": 0.0001, "step": 23610 }, { "epoch": 6.2284659015960955, "grad_norm": 0.0007547807181254029, "learning_rate": 2.516310560098479e-06, "loss": 0.0001, "step": 23612 }, { "epoch": 6.22899353647276, "grad_norm": 0.0020900056697428226, "learning_rate": 2.515958849907676e-06, "loss": 0.0001, "step": 23614 }, { "epoch": 6.229521171349426, "grad_norm": 0.05461885407567024, "learning_rate": 2.5156071397168735e-06, "loss": 0.0002, "step": 23616 }, { "epoch": 6.230048806226091, "grad_norm": 0.07483621686697006, "learning_rate": 2.5152554295260704e-06, "loss": 0.0017, "step": 23618 }, { "epoch": 6.230576441102757, "grad_norm": 0.11675973236560822, "learning_rate": 2.514903719335268e-06, "loss": 0.0038, "step": 23620 }, { "epoch": 6.231104075979422, "grad_norm": 0.005807127803564072, "learning_rate": 2.514552009144465e-06, "loss": 0.0001, "step": 23622 }, { "epoch": 6.231631710856088, "grad_norm": 0.009762940928339958, "learning_rate": 2.514200298953662e-06, "loss": 0.0005, "step": 23624 }, { "epoch": 6.232159345732753, "grad_norm": 0.00450563058257103, "learning_rate": 2.5138485887628595e-06, "loss": 0.0009, "step": 23626 }, { "epoch": 6.232686980609419, "grad_norm": 0.2502373456954956, "learning_rate": 2.5134968785720565e-06, "loss": 0.0005, "step": 23628 }, { "epoch": 6.233214615486084, "grad_norm": 0.002443052129819989, "learning_rate": 2.513145168381254e-06, "loss": 0.0002, "step": 23630 }, { "epoch": 6.233742250362749, "grad_norm": 0.0005832897732034326, "learning_rate": 2.5127934581904512e-06, "loss": 0.0005, "step": 23632 }, { "epoch": 6.2342698852394145, "grad_norm": 0.0040614488534629345, "learning_rate": 2.5124417479996486e-06, "loss": 0.0001, "step": 23634 }, { "epoch": 6.2347975201160795, "grad_norm": 0.0011200036387890577, "learning_rate": 2.5120900378088455e-06, "loss": 0.0001, "step": 23636 }, { "epoch": 6.235325154992745, "grad_norm": 0.008274574764072895, "learning_rate": 2.511738327618043e-06, "loss": 0.0001, "step": 23638 }, { "epoch": 6.23585278986941, "grad_norm": 0.0029606225434690714, "learning_rate": 2.51138661742724e-06, "loss": 0.0001, "step": 23640 }, { "epoch": 6.236380424746076, "grad_norm": 0.0011247940128669143, "learning_rate": 2.5110349072364372e-06, "loss": 0.0001, "step": 23642 }, { "epoch": 6.236908059622741, "grad_norm": 0.001228720648214221, "learning_rate": 2.5106831970456346e-06, "loss": 0.0001, "step": 23644 }, { "epoch": 6.237435694499406, "grad_norm": 0.0016466950764879584, "learning_rate": 2.510331486854832e-06, "loss": 0.0016, "step": 23646 }, { "epoch": 6.237963329376072, "grad_norm": 0.008964253589510918, "learning_rate": 2.509979776664029e-06, "loss": 0.0001, "step": 23648 }, { "epoch": 6.238490964252737, "grad_norm": 0.009128320962190628, "learning_rate": 2.5096280664732263e-06, "loss": 0.0001, "step": 23650 }, { "epoch": 6.239018599129403, "grad_norm": 0.3416026532649994, "learning_rate": 2.5092763562824233e-06, "loss": 0.0006, "step": 23652 }, { "epoch": 6.239546234006068, "grad_norm": 0.005181680433452129, "learning_rate": 2.5089246460916202e-06, "loss": 0.0001, "step": 23654 }, { "epoch": 6.2400738688827335, "grad_norm": 0.445309042930603, "learning_rate": 2.508572935900818e-06, "loss": 0.002, "step": 23656 }, { "epoch": 6.2406015037593985, "grad_norm": 0.00706753833219409, "learning_rate": 2.508221225710015e-06, "loss": 0.0005, "step": 23658 }, { "epoch": 6.2411291386360634, "grad_norm": 0.03496916964650154, "learning_rate": 2.5078695155192124e-06, "loss": 0.0002, "step": 23660 }, { "epoch": 6.241656773512729, "grad_norm": 0.005025513470172882, "learning_rate": 2.5075178053284093e-06, "loss": 0.0001, "step": 23662 }, { "epoch": 6.242184408389394, "grad_norm": 0.8776650428771973, "learning_rate": 2.5071660951376067e-06, "loss": 0.0021, "step": 23664 }, { "epoch": 6.24271204326606, "grad_norm": 0.030463624745607376, "learning_rate": 2.5068143849468036e-06, "loss": 0.0005, "step": 23666 }, { "epoch": 6.243239678142725, "grad_norm": 0.01688973233103752, "learning_rate": 2.506462674756001e-06, "loss": 0.0001, "step": 23668 }, { "epoch": 6.243767313019391, "grad_norm": 0.45530983805656433, "learning_rate": 2.5061109645651984e-06, "loss": 0.0031, "step": 23670 }, { "epoch": 6.244294947896056, "grad_norm": 0.001093700877390802, "learning_rate": 2.5057592543743958e-06, "loss": 0.0001, "step": 23672 }, { "epoch": 6.244822582772722, "grad_norm": 0.001601468538865447, "learning_rate": 2.5054075441835927e-06, "loss": 0.0001, "step": 23674 }, { "epoch": 6.245350217649387, "grad_norm": 0.4901675581932068, "learning_rate": 2.50505583399279e-06, "loss": 0.0035, "step": 23676 }, { "epoch": 6.245877852526052, "grad_norm": 0.00217942101880908, "learning_rate": 2.504704123801987e-06, "loss": 0.0001, "step": 23678 }, { "epoch": 6.2464054874027175, "grad_norm": 0.11253155767917633, "learning_rate": 2.5043524136111844e-06, "loss": 0.0003, "step": 23680 }, { "epoch": 6.2469331222793825, "grad_norm": 0.5937380194664001, "learning_rate": 2.504000703420382e-06, "loss": 0.0088, "step": 23682 }, { "epoch": 6.247460757156048, "grad_norm": 0.11009589582681656, "learning_rate": 2.5036489932295788e-06, "loss": 0.0002, "step": 23684 }, { "epoch": 6.247988392032713, "grad_norm": 0.0038101649843156338, "learning_rate": 2.503297283038776e-06, "loss": 0.0001, "step": 23686 }, { "epoch": 6.248516026909379, "grad_norm": 0.0021331554744392633, "learning_rate": 2.502945572847973e-06, "loss": 0.0001, "step": 23688 }, { "epoch": 6.249043661786044, "grad_norm": 0.08977736532688141, "learning_rate": 2.5025938626571705e-06, "loss": 0.0014, "step": 23690 }, { "epoch": 6.249571296662709, "grad_norm": 0.004945346154272556, "learning_rate": 2.502242152466368e-06, "loss": 0.0014, "step": 23692 }, { "epoch": 6.250098931539375, "grad_norm": 0.014649537391960621, "learning_rate": 2.501890442275565e-06, "loss": 0.0001, "step": 23694 }, { "epoch": 6.25062656641604, "grad_norm": 0.08974394202232361, "learning_rate": 2.501538732084762e-06, "loss": 0.0005, "step": 23696 }, { "epoch": 6.251154201292706, "grad_norm": 0.08376859873533249, "learning_rate": 2.5011870218939595e-06, "loss": 0.0006, "step": 23698 }, { "epoch": 6.251681836169371, "grad_norm": 0.0022423965856432915, "learning_rate": 2.5008353117031565e-06, "loss": 0.0002, "step": 23700 }, { "epoch": 6.2522094710460365, "grad_norm": 0.6665648221969604, "learning_rate": 2.500483601512354e-06, "loss": 0.0008, "step": 23702 }, { "epoch": 6.2527371059227015, "grad_norm": 0.0038731342647224665, "learning_rate": 2.5001318913215512e-06, "loss": 0.0001, "step": 23704 }, { "epoch": 6.2532647407993665, "grad_norm": 0.006023276597261429, "learning_rate": 2.4997801811307486e-06, "loss": 0.0001, "step": 23706 }, { "epoch": 6.253792375676032, "grad_norm": 0.16263645887374878, "learning_rate": 2.4994284709399456e-06, "loss": 0.0002, "step": 23708 }, { "epoch": 6.254320010552697, "grad_norm": 0.03130555897951126, "learning_rate": 2.499076760749143e-06, "loss": 0.0001, "step": 23710 }, { "epoch": 6.254847645429363, "grad_norm": 0.1707611232995987, "learning_rate": 2.49872505055834e-06, "loss": 0.0059, "step": 23712 }, { "epoch": 6.255375280306028, "grad_norm": 0.013148213736712933, "learning_rate": 2.4983733403675373e-06, "loss": 0.0001, "step": 23714 }, { "epoch": 6.255902915182694, "grad_norm": 0.011575739830732346, "learning_rate": 2.4980216301767346e-06, "loss": 0.0001, "step": 23716 }, { "epoch": 6.256430550059359, "grad_norm": 0.0381401926279068, "learning_rate": 2.4976699199859316e-06, "loss": 0.0002, "step": 23718 }, { "epoch": 6.256958184936025, "grad_norm": 0.007529224269092083, "learning_rate": 2.497318209795129e-06, "loss": 0.0001, "step": 23720 }, { "epoch": 6.25748581981269, "grad_norm": 0.045980069786310196, "learning_rate": 2.496966499604326e-06, "loss": 0.0002, "step": 23722 }, { "epoch": 6.258013454689355, "grad_norm": 0.017527000978589058, "learning_rate": 2.4966147894135233e-06, "loss": 0.0001, "step": 23724 }, { "epoch": 6.2585410895660205, "grad_norm": 0.01160190999507904, "learning_rate": 2.4962630792227203e-06, "loss": 0.0003, "step": 23726 }, { "epoch": 6.2590687244426855, "grad_norm": 0.025811562314629555, "learning_rate": 2.495911369031918e-06, "loss": 0.0002, "step": 23728 }, { "epoch": 6.259596359319351, "grad_norm": 0.0030615876894444227, "learning_rate": 2.495559658841115e-06, "loss": 0.0001, "step": 23730 }, { "epoch": 6.260123994196016, "grad_norm": 0.06204782426357269, "learning_rate": 2.4952079486503124e-06, "loss": 0.0002, "step": 23732 }, { "epoch": 6.260651629072681, "grad_norm": 0.0834478884935379, "learning_rate": 2.4948562384595093e-06, "loss": 0.0012, "step": 23734 }, { "epoch": 6.261179263949347, "grad_norm": 0.5208181142807007, "learning_rate": 2.4945045282687067e-06, "loss": 0.0057, "step": 23736 }, { "epoch": 6.261706898826012, "grad_norm": 0.0019513285951688886, "learning_rate": 2.4941528180779037e-06, "loss": 0.0002, "step": 23738 }, { "epoch": 6.262234533702678, "grad_norm": 0.0010325481416657567, "learning_rate": 2.493801107887101e-06, "loss": 0.0001, "step": 23740 }, { "epoch": 6.262762168579343, "grad_norm": 0.9687163829803467, "learning_rate": 2.4934493976962984e-06, "loss": 0.0014, "step": 23742 }, { "epoch": 6.263289803456009, "grad_norm": 0.0024049957282841206, "learning_rate": 2.4930976875054954e-06, "loss": 0.0001, "step": 23744 }, { "epoch": 6.263817438332674, "grad_norm": 0.005895869340747595, "learning_rate": 2.4927459773146927e-06, "loss": 0.0052, "step": 23746 }, { "epoch": 6.2643450732093395, "grad_norm": 0.11660785228013992, "learning_rate": 2.4923942671238897e-06, "loss": 0.0028, "step": 23748 }, { "epoch": 6.2648727080860045, "grad_norm": 0.10803331434726715, "learning_rate": 2.492042556933087e-06, "loss": 0.0042, "step": 23750 }, { "epoch": 6.2654003429626695, "grad_norm": 0.004935794975608587, "learning_rate": 2.4916908467422845e-06, "loss": 0.0001, "step": 23752 }, { "epoch": 6.265927977839335, "grad_norm": 0.1768513172864914, "learning_rate": 2.491339136551482e-06, "loss": 0.0015, "step": 23754 }, { "epoch": 6.266455612716, "grad_norm": 0.022802390158176422, "learning_rate": 2.4909874263606788e-06, "loss": 0.0003, "step": 23756 }, { "epoch": 6.266983247592666, "grad_norm": 0.004166888538748026, "learning_rate": 2.490635716169876e-06, "loss": 0.0027, "step": 23758 }, { "epoch": 6.267510882469331, "grad_norm": 0.01054465677589178, "learning_rate": 2.490284005979073e-06, "loss": 0.0002, "step": 23760 }, { "epoch": 6.268038517345997, "grad_norm": 0.02110126242041588, "learning_rate": 2.4899322957882705e-06, "loss": 0.0002, "step": 23762 }, { "epoch": 6.268566152222662, "grad_norm": 0.3111984133720398, "learning_rate": 2.489580585597468e-06, "loss": 0.001, "step": 23764 }, { "epoch": 6.269093787099327, "grad_norm": 0.39650824666023254, "learning_rate": 2.4892288754066652e-06, "loss": 0.0031, "step": 23766 }, { "epoch": 6.269621421975993, "grad_norm": 0.0015937373973429203, "learning_rate": 2.488877165215862e-06, "loss": 0.0002, "step": 23768 }, { "epoch": 6.270149056852658, "grad_norm": 0.004741957876831293, "learning_rate": 2.4885254550250596e-06, "loss": 0.0001, "step": 23770 }, { "epoch": 6.2706766917293235, "grad_norm": 0.039336126297712326, "learning_rate": 2.4881737448342565e-06, "loss": 0.0016, "step": 23772 }, { "epoch": 6.2712043266059885, "grad_norm": 0.01494282204657793, "learning_rate": 2.487822034643454e-06, "loss": 0.0001, "step": 23774 }, { "epoch": 6.271731961482654, "grad_norm": 0.005671584513038397, "learning_rate": 2.4874703244526513e-06, "loss": 0.0002, "step": 23776 }, { "epoch": 6.272259596359319, "grad_norm": 0.08144963532686234, "learning_rate": 2.4871186142618482e-06, "loss": 0.0006, "step": 23778 }, { "epoch": 6.272787231235984, "grad_norm": 0.02288791909813881, "learning_rate": 2.4867669040710456e-06, "loss": 0.0038, "step": 23780 }, { "epoch": 6.27331486611265, "grad_norm": 0.003277147188782692, "learning_rate": 2.4864151938802425e-06, "loss": 0.0001, "step": 23782 }, { "epoch": 6.273842500989315, "grad_norm": 0.001340297283604741, "learning_rate": 2.48606348368944e-06, "loss": 0.0002, "step": 23784 }, { "epoch": 6.274370135865981, "grad_norm": 0.017414113506674767, "learning_rate": 2.4857117734986373e-06, "loss": 0.0038, "step": 23786 }, { "epoch": 6.274897770742646, "grad_norm": 0.11056714504957199, "learning_rate": 2.4853600633078347e-06, "loss": 0.0005, "step": 23788 }, { "epoch": 6.275425405619312, "grad_norm": 0.032549481838941574, "learning_rate": 2.4850083531170316e-06, "loss": 0.0002, "step": 23790 }, { "epoch": 6.275953040495977, "grad_norm": 0.0027196453884243965, "learning_rate": 2.484656642926229e-06, "loss": 0.0005, "step": 23792 }, { "epoch": 6.2764806753726425, "grad_norm": 0.04238707199692726, "learning_rate": 2.484304932735426e-06, "loss": 0.0002, "step": 23794 }, { "epoch": 6.2770083102493075, "grad_norm": 0.0253720935434103, "learning_rate": 2.4839532225446233e-06, "loss": 0.0001, "step": 23796 }, { "epoch": 6.2775359451259725, "grad_norm": 0.034369271248579025, "learning_rate": 2.4836015123538203e-06, "loss": 0.0002, "step": 23798 }, { "epoch": 6.278063580002638, "grad_norm": 0.017586398869752884, "learning_rate": 2.483249802163018e-06, "loss": 0.0002, "step": 23800 }, { "epoch": 6.278591214879303, "grad_norm": 0.015325441025197506, "learning_rate": 2.482898091972215e-06, "loss": 0.0001, "step": 23802 }, { "epoch": 6.279118849755969, "grad_norm": 0.02889890968799591, "learning_rate": 2.482546381781412e-06, "loss": 0.0002, "step": 23804 }, { "epoch": 6.279646484632634, "grad_norm": 0.017077501863241196, "learning_rate": 2.4821946715906094e-06, "loss": 0.0003, "step": 23806 }, { "epoch": 6.2801741195093, "grad_norm": 0.13493715226650238, "learning_rate": 2.4818429613998063e-06, "loss": 0.0144, "step": 23808 }, { "epoch": 6.280701754385965, "grad_norm": 0.003339479211717844, "learning_rate": 2.4814912512090037e-06, "loss": 0.0001, "step": 23810 }, { "epoch": 6.28122938926263, "grad_norm": 0.07246597856283188, "learning_rate": 2.481139541018201e-06, "loss": 0.0005, "step": 23812 }, { "epoch": 6.281757024139296, "grad_norm": 0.02280305325984955, "learning_rate": 2.4807878308273984e-06, "loss": 0.0002, "step": 23814 }, { "epoch": 6.282284659015961, "grad_norm": 0.15660347044467926, "learning_rate": 2.4804361206365954e-06, "loss": 0.0048, "step": 23816 }, { "epoch": 6.2828122938926265, "grad_norm": 0.005072265863418579, "learning_rate": 2.4800844104457928e-06, "loss": 0.0001, "step": 23818 }, { "epoch": 6.2833399287692915, "grad_norm": 0.023929351940751076, "learning_rate": 2.4797327002549897e-06, "loss": 0.0001, "step": 23820 }, { "epoch": 6.283867563645957, "grad_norm": 0.11641842871904373, "learning_rate": 2.479380990064187e-06, "loss": 0.0002, "step": 23822 }, { "epoch": 6.284395198522622, "grad_norm": 1.0754852294921875, "learning_rate": 2.4790292798733845e-06, "loss": 0.0011, "step": 23824 }, { "epoch": 6.284922833399287, "grad_norm": 0.0062712449580430984, "learning_rate": 2.478677569682582e-06, "loss": 0.0003, "step": 23826 }, { "epoch": 6.285450468275953, "grad_norm": 0.002142425160855055, "learning_rate": 2.478325859491779e-06, "loss": 0.0001, "step": 23828 }, { "epoch": 6.285978103152618, "grad_norm": 0.0015455374959856272, "learning_rate": 2.477974149300976e-06, "loss": 0.0001, "step": 23830 }, { "epoch": 6.286505738029284, "grad_norm": 0.028106940910220146, "learning_rate": 2.477622439110173e-06, "loss": 0.0002, "step": 23832 }, { "epoch": 6.287033372905949, "grad_norm": 0.0328318290412426, "learning_rate": 2.4772707289193705e-06, "loss": 0.0002, "step": 23834 }, { "epoch": 6.287561007782615, "grad_norm": 0.01819530315697193, "learning_rate": 2.476919018728568e-06, "loss": 0.0001, "step": 23836 }, { "epoch": 6.28808864265928, "grad_norm": 0.008601411245763302, "learning_rate": 2.476567308537765e-06, "loss": 0.0002, "step": 23838 }, { "epoch": 6.2886162775359455, "grad_norm": 0.06136098876595497, "learning_rate": 2.4762155983469622e-06, "loss": 0.0001, "step": 23840 }, { "epoch": 6.2891439124126105, "grad_norm": 0.04897403344511986, "learning_rate": 2.475863888156159e-06, "loss": 0.0002, "step": 23842 }, { "epoch": 6.2896715472892755, "grad_norm": 0.00929255224764347, "learning_rate": 2.4755121779653565e-06, "loss": 0.0001, "step": 23844 }, { "epoch": 6.290199182165941, "grad_norm": 0.44476306438446045, "learning_rate": 2.475160467774554e-06, "loss": 0.0017, "step": 23846 }, { "epoch": 6.290726817042606, "grad_norm": 0.1092483177781105, "learning_rate": 2.4748087575837513e-06, "loss": 0.0024, "step": 23848 }, { "epoch": 6.291254451919272, "grad_norm": 0.016809873282909393, "learning_rate": 2.4744570473929482e-06, "loss": 0.0001, "step": 23850 }, { "epoch": 6.291782086795937, "grad_norm": 0.0361567884683609, "learning_rate": 2.4741053372021456e-06, "loss": 0.0001, "step": 23852 }, { "epoch": 6.292309721672603, "grad_norm": 0.04490155726671219, "learning_rate": 2.4737536270113426e-06, "loss": 0.0009, "step": 23854 }, { "epoch": 6.292837356549268, "grad_norm": 0.08581513166427612, "learning_rate": 2.47340191682054e-06, "loss": 0.0071, "step": 23856 }, { "epoch": 6.293364991425933, "grad_norm": 0.001283691730350256, "learning_rate": 2.4730502066297373e-06, "loss": 0.0001, "step": 23858 }, { "epoch": 6.293892626302599, "grad_norm": 0.0038261101581156254, "learning_rate": 2.4726984964389347e-06, "loss": 0.0015, "step": 23860 }, { "epoch": 6.294420261179264, "grad_norm": 0.003761905711144209, "learning_rate": 2.4723467862481317e-06, "loss": 0.0001, "step": 23862 }, { "epoch": 6.2949478960559295, "grad_norm": 0.015470130369067192, "learning_rate": 2.4719950760573286e-06, "loss": 0.0002, "step": 23864 }, { "epoch": 6.2954755309325945, "grad_norm": 0.04697898402810097, "learning_rate": 2.471643365866526e-06, "loss": 0.0001, "step": 23866 }, { "epoch": 6.29600316580926, "grad_norm": 0.0009280676604248583, "learning_rate": 2.471291655675723e-06, "loss": 0.0009, "step": 23868 }, { "epoch": 6.296530800685925, "grad_norm": 0.013449003919959068, "learning_rate": 2.4709399454849207e-06, "loss": 0.0001, "step": 23870 }, { "epoch": 6.29705843556259, "grad_norm": 0.0014422605745494366, "learning_rate": 2.4705882352941177e-06, "loss": 0.0001, "step": 23872 }, { "epoch": 6.297586070439256, "grad_norm": 0.0025701678823679686, "learning_rate": 2.470236525103315e-06, "loss": 0.0001, "step": 23874 }, { "epoch": 6.298113705315921, "grad_norm": 0.1400076448917389, "learning_rate": 2.469884814912512e-06, "loss": 0.0004, "step": 23876 }, { "epoch": 6.298641340192587, "grad_norm": 0.0029634658712893724, "learning_rate": 2.4695331047217094e-06, "loss": 0.0001, "step": 23878 }, { "epoch": 6.299168975069252, "grad_norm": 0.0028306462336331606, "learning_rate": 2.4691813945309063e-06, "loss": 0.0001, "step": 23880 }, { "epoch": 6.299696609945918, "grad_norm": 0.43696460127830505, "learning_rate": 2.4688296843401037e-06, "loss": 0.0017, "step": 23882 }, { "epoch": 6.300224244822583, "grad_norm": 0.006358352489769459, "learning_rate": 2.468477974149301e-06, "loss": 0.0001, "step": 23884 }, { "epoch": 6.3007518796992485, "grad_norm": 0.004789266269654036, "learning_rate": 2.4681262639584985e-06, "loss": 0.0001, "step": 23886 }, { "epoch": 6.3012795145759135, "grad_norm": 0.003072519786655903, "learning_rate": 2.4677745537676954e-06, "loss": 0.0001, "step": 23888 }, { "epoch": 6.3018071494525785, "grad_norm": 0.0012091273674741387, "learning_rate": 2.467422843576893e-06, "loss": 0.0001, "step": 23890 }, { "epoch": 6.302334784329244, "grad_norm": 0.029027286916971207, "learning_rate": 2.4670711333860897e-06, "loss": 0.0002, "step": 23892 }, { "epoch": 6.302862419205909, "grad_norm": 0.13323576748371124, "learning_rate": 2.466719423195287e-06, "loss": 0.0005, "step": 23894 }, { "epoch": 6.303390054082575, "grad_norm": 0.15495912730693817, "learning_rate": 2.4663677130044845e-06, "loss": 0.0004, "step": 23896 }, { "epoch": 6.30391768895924, "grad_norm": 0.045052941888570786, "learning_rate": 2.4660160028136815e-06, "loss": 0.0001, "step": 23898 }, { "epoch": 6.304445323835906, "grad_norm": 0.058555230498313904, "learning_rate": 2.465664292622879e-06, "loss": 0.0016, "step": 23900 }, { "epoch": 6.304972958712571, "grad_norm": 0.001142057590186596, "learning_rate": 2.4653125824320758e-06, "loss": 0.0001, "step": 23902 }, { "epoch": 6.305500593589236, "grad_norm": 0.006550799589604139, "learning_rate": 2.464960872241273e-06, "loss": 0.0001, "step": 23904 }, { "epoch": 6.306028228465902, "grad_norm": 0.02311841771006584, "learning_rate": 2.4646091620504705e-06, "loss": 0.0001, "step": 23906 }, { "epoch": 6.306555863342567, "grad_norm": 0.006800278555601835, "learning_rate": 2.464257451859668e-06, "loss": 0.0012, "step": 23908 }, { "epoch": 6.3070834982192325, "grad_norm": 0.0021173465065658092, "learning_rate": 2.463905741668865e-06, "loss": 0.0001, "step": 23910 }, { "epoch": 6.3076111330958975, "grad_norm": 0.09238944947719574, "learning_rate": 2.4635540314780622e-06, "loss": 0.0004, "step": 23912 }, { "epoch": 6.308138767972563, "grad_norm": 0.001715353922918439, "learning_rate": 2.463202321287259e-06, "loss": 0.0012, "step": 23914 }, { "epoch": 6.308666402849228, "grad_norm": 0.056800760328769684, "learning_rate": 2.4628506110964566e-06, "loss": 0.0032, "step": 23916 }, { "epoch": 6.309194037725893, "grad_norm": 0.005491773597896099, "learning_rate": 2.462498900905654e-06, "loss": 0.0001, "step": 23918 }, { "epoch": 6.309721672602559, "grad_norm": 0.02501230686903, "learning_rate": 2.4621471907148513e-06, "loss": 0.0001, "step": 23920 }, { "epoch": 6.310249307479224, "grad_norm": 0.003008587984368205, "learning_rate": 2.4617954805240483e-06, "loss": 0.0029, "step": 23922 }, { "epoch": 6.31077694235589, "grad_norm": 0.002392539521679282, "learning_rate": 2.4614437703332452e-06, "loss": 0.0001, "step": 23924 }, { "epoch": 6.311304577232555, "grad_norm": 0.00389477820135653, "learning_rate": 2.4610920601424426e-06, "loss": 0.0001, "step": 23926 }, { "epoch": 6.311832212109221, "grad_norm": 0.0045869373716413975, "learning_rate": 2.46074034995164e-06, "loss": 0.0001, "step": 23928 }, { "epoch": 6.312359846985886, "grad_norm": 0.13843348622322083, "learning_rate": 2.4603886397608373e-06, "loss": 0.0002, "step": 23930 }, { "epoch": 6.3128874818625516, "grad_norm": 0.08041729032993317, "learning_rate": 2.4600369295700343e-06, "loss": 0.0001, "step": 23932 }, { "epoch": 6.3134151167392165, "grad_norm": 0.0029460955411195755, "learning_rate": 2.4596852193792317e-06, "loss": 0.0001, "step": 23934 }, { "epoch": 6.3139427516158815, "grad_norm": 0.18497484922409058, "learning_rate": 2.4593335091884286e-06, "loss": 0.0004, "step": 23936 }, { "epoch": 6.314470386492547, "grad_norm": 0.000765194883570075, "learning_rate": 2.458981798997626e-06, "loss": 0.0067, "step": 23938 }, { "epoch": 6.314998021369212, "grad_norm": 0.199469655752182, "learning_rate": 2.458630088806823e-06, "loss": 0.0003, "step": 23940 }, { "epoch": 6.315525656245878, "grad_norm": 0.009587457403540611, "learning_rate": 2.4582783786160208e-06, "loss": 0.0001, "step": 23942 }, { "epoch": 6.316053291122543, "grad_norm": 0.03574644774198532, "learning_rate": 2.4579266684252177e-06, "loss": 0.0001, "step": 23944 }, { "epoch": 6.316580925999209, "grad_norm": 0.022325143218040466, "learning_rate": 2.457574958234415e-06, "loss": 0.0001, "step": 23946 }, { "epoch": 6.317108560875874, "grad_norm": 0.0040003033354878426, "learning_rate": 2.457223248043612e-06, "loss": 0.0001, "step": 23948 }, { "epoch": 6.317636195752539, "grad_norm": 0.4989176392555237, "learning_rate": 2.4568715378528094e-06, "loss": 0.0027, "step": 23950 }, { "epoch": 6.318163830629205, "grad_norm": 0.007344273384660482, "learning_rate": 2.4565198276620064e-06, "loss": 0.0001, "step": 23952 }, { "epoch": 6.31869146550587, "grad_norm": 0.028423041105270386, "learning_rate": 2.456168117471204e-06, "loss": 0.0004, "step": 23954 }, { "epoch": 6.3192191003825355, "grad_norm": 0.007557904813438654, "learning_rate": 2.455816407280401e-06, "loss": 0.0021, "step": 23956 }, { "epoch": 6.3197467352592005, "grad_norm": 0.02023189887404442, "learning_rate": 2.455464697089598e-06, "loss": 0.0001, "step": 23958 }, { "epoch": 6.320274370135866, "grad_norm": 0.010444892570376396, "learning_rate": 2.4551129868987954e-06, "loss": 0.0008, "step": 23960 }, { "epoch": 6.320802005012531, "grad_norm": 0.022624611854553223, "learning_rate": 2.4547612767079924e-06, "loss": 0.0001, "step": 23962 }, { "epoch": 6.321329639889196, "grad_norm": 0.004400306846946478, "learning_rate": 2.4544095665171898e-06, "loss": 0.0004, "step": 23964 }, { "epoch": 6.321857274765862, "grad_norm": 0.6290735006332397, "learning_rate": 2.454057856326387e-06, "loss": 0.005, "step": 23966 }, { "epoch": 6.322384909642527, "grad_norm": 0.003955783322453499, "learning_rate": 2.4537061461355845e-06, "loss": 0.0002, "step": 23968 }, { "epoch": 6.322912544519193, "grad_norm": 0.006983554922044277, "learning_rate": 2.4533544359447815e-06, "loss": 0.0001, "step": 23970 }, { "epoch": 6.323440179395858, "grad_norm": 0.005417536478489637, "learning_rate": 2.453002725753979e-06, "loss": 0.0001, "step": 23972 }, { "epoch": 6.323967814272524, "grad_norm": 0.002039453247562051, "learning_rate": 2.452651015563176e-06, "loss": 0.0002, "step": 23974 }, { "epoch": 6.324495449149189, "grad_norm": 0.003178546205163002, "learning_rate": 2.452299305372373e-06, "loss": 0.0002, "step": 23976 }, { "epoch": 6.325023084025855, "grad_norm": 0.016416829079389572, "learning_rate": 2.4519475951815706e-06, "loss": 0.0001, "step": 23978 }, { "epoch": 6.3255507189025195, "grad_norm": 0.03685123473405838, "learning_rate": 2.451595884990768e-06, "loss": 0.0002, "step": 23980 }, { "epoch": 6.3260783537791845, "grad_norm": 0.9582915306091309, "learning_rate": 2.451244174799965e-06, "loss": 0.0063, "step": 23982 }, { "epoch": 6.32660598865585, "grad_norm": 0.10800822079181671, "learning_rate": 2.4508924646091623e-06, "loss": 0.0034, "step": 23984 }, { "epoch": 6.327133623532515, "grad_norm": 0.001651568105444312, "learning_rate": 2.4505407544183592e-06, "loss": 0.0001, "step": 23986 }, { "epoch": 6.327661258409181, "grad_norm": 0.30306729674339294, "learning_rate": 2.4501890442275566e-06, "loss": 0.0005, "step": 23988 }, { "epoch": 6.328188893285846, "grad_norm": 0.00877034105360508, "learning_rate": 2.449837334036754e-06, "loss": 0.0001, "step": 23990 }, { "epoch": 6.328716528162512, "grad_norm": 0.0027883255388587713, "learning_rate": 2.449485623845951e-06, "loss": 0.0001, "step": 23992 }, { "epoch": 6.329244163039177, "grad_norm": 0.10391736030578613, "learning_rate": 2.4491339136551483e-06, "loss": 0.0108, "step": 23994 }, { "epoch": 6.329771797915842, "grad_norm": 0.005604529287666082, "learning_rate": 2.4487822034643452e-06, "loss": 0.0001, "step": 23996 }, { "epoch": 6.330299432792508, "grad_norm": 0.054082684218883514, "learning_rate": 2.4484304932735426e-06, "loss": 0.0002, "step": 23998 }, { "epoch": 6.330827067669173, "grad_norm": 0.01737794280052185, "learning_rate": 2.44807878308274e-06, "loss": 0.0002, "step": 24000 }, { "epoch": 6.330827067669173, "eval_loss": 0.002064550993964076, "eval_runtime": 302.7128, "eval_samples_per_second": 712.362, "eval_steps_per_second": 89.048, "step": 24000 }, { "epoch": 6.3313547025458385, "grad_norm": 0.006569819059222937, "learning_rate": 2.4477270728919374e-06, "loss": 0.0003, "step": 24002 }, { "epoch": 6.3318823374225035, "grad_norm": 0.03398379310965538, "learning_rate": 2.4473753627011343e-06, "loss": 0.0001, "step": 24004 }, { "epoch": 6.332409972299169, "grad_norm": 0.007338919211179018, "learning_rate": 2.4470236525103317e-06, "loss": 0.001, "step": 24006 }, { "epoch": 6.332937607175834, "grad_norm": 0.14694194495677948, "learning_rate": 2.4466719423195287e-06, "loss": 0.0003, "step": 24008 }, { "epoch": 6.333465242052499, "grad_norm": 0.002635578392073512, "learning_rate": 2.446320232128726e-06, "loss": 0.0002, "step": 24010 }, { "epoch": 6.333992876929165, "grad_norm": 0.004988787230104208, "learning_rate": 2.4459685219379234e-06, "loss": 0.0002, "step": 24012 }, { "epoch": 6.33452051180583, "grad_norm": 0.016564909368753433, "learning_rate": 2.4456168117471208e-06, "loss": 0.0001, "step": 24014 }, { "epoch": 6.335048146682496, "grad_norm": 0.006219524424523115, "learning_rate": 2.4452651015563177e-06, "loss": 0.0023, "step": 24016 }, { "epoch": 6.335575781559161, "grad_norm": 0.0030422366689890623, "learning_rate": 2.4449133913655147e-06, "loss": 0.0006, "step": 24018 }, { "epoch": 6.336103416435827, "grad_norm": 0.017861679196357727, "learning_rate": 2.444561681174712e-06, "loss": 0.0001, "step": 24020 }, { "epoch": 6.336631051312492, "grad_norm": 0.01424019131809473, "learning_rate": 2.444209970983909e-06, "loss": 0.0001, "step": 24022 }, { "epoch": 6.337158686189158, "grad_norm": 0.0010116840712726116, "learning_rate": 2.4438582607931064e-06, "loss": 0.0001, "step": 24024 }, { "epoch": 6.3376863210658225, "grad_norm": 0.008087895810604095, "learning_rate": 2.4435065506023038e-06, "loss": 0.0006, "step": 24026 }, { "epoch": 6.3382139559424875, "grad_norm": 0.5661208629608154, "learning_rate": 2.443154840411501e-06, "loss": 0.0029, "step": 24028 }, { "epoch": 6.338741590819153, "grad_norm": 0.0029558492824435234, "learning_rate": 2.442803130220698e-06, "loss": 0.0001, "step": 24030 }, { "epoch": 6.339269225695818, "grad_norm": 0.001241528894752264, "learning_rate": 2.4424514200298955e-06, "loss": 0.0001, "step": 24032 }, { "epoch": 6.339796860572484, "grad_norm": 0.0018865364836528897, "learning_rate": 2.4420997098390924e-06, "loss": 0.0001, "step": 24034 }, { "epoch": 6.340324495449149, "grad_norm": 0.0050595905631780624, "learning_rate": 2.44174799964829e-06, "loss": 0.0004, "step": 24036 }, { "epoch": 6.340852130325814, "grad_norm": 0.00382232409901917, "learning_rate": 2.441396289457487e-06, "loss": 0.0001, "step": 24038 }, { "epoch": 6.34137976520248, "grad_norm": 0.37702614068984985, "learning_rate": 2.4410445792666846e-06, "loss": 0.0021, "step": 24040 }, { "epoch": 6.341907400079145, "grad_norm": 0.012221941724419594, "learning_rate": 2.4406928690758815e-06, "loss": 0.0001, "step": 24042 }, { "epoch": 6.342435034955811, "grad_norm": 0.017860950902104378, "learning_rate": 2.440341158885079e-06, "loss": 0.0001, "step": 24044 }, { "epoch": 6.342962669832476, "grad_norm": 0.010094054043293, "learning_rate": 2.439989448694276e-06, "loss": 0.0002, "step": 24046 }, { "epoch": 6.3434903047091415, "grad_norm": 0.010705332271754742, "learning_rate": 2.439637738503473e-06, "loss": 0.0002, "step": 24048 }, { "epoch": 6.3440179395858065, "grad_norm": 0.005347505211830139, "learning_rate": 2.4392860283126706e-06, "loss": 0.0025, "step": 24050 }, { "epoch": 6.344545574462472, "grad_norm": 0.5007676482200623, "learning_rate": 2.4389343181218675e-06, "loss": 0.0061, "step": 24052 }, { "epoch": 6.345073209339137, "grad_norm": 0.04881604015827179, "learning_rate": 2.438582607931065e-06, "loss": 0.0002, "step": 24054 }, { "epoch": 6.345600844215802, "grad_norm": 0.007741925306618214, "learning_rate": 2.438230897740262e-06, "loss": 0.0043, "step": 24056 }, { "epoch": 6.346128479092468, "grad_norm": 0.038219790905714035, "learning_rate": 2.4378791875494592e-06, "loss": 0.0001, "step": 24058 }, { "epoch": 6.346656113969133, "grad_norm": 0.7061755657196045, "learning_rate": 2.4375274773586566e-06, "loss": 0.0008, "step": 24060 }, { "epoch": 6.347183748845799, "grad_norm": 0.03420773148536682, "learning_rate": 2.437175767167854e-06, "loss": 0.0002, "step": 24062 }, { "epoch": 6.347711383722464, "grad_norm": 0.25843513011932373, "learning_rate": 2.436824056977051e-06, "loss": 0.0006, "step": 24064 }, { "epoch": 6.34823901859913, "grad_norm": 0.003531355643644929, "learning_rate": 2.4364723467862483e-06, "loss": 0.0001, "step": 24066 }, { "epoch": 6.348766653475795, "grad_norm": 0.012648047879338264, "learning_rate": 2.4361206365954453e-06, "loss": 0.0001, "step": 24068 }, { "epoch": 6.349294288352461, "grad_norm": 0.09526146948337555, "learning_rate": 2.4357689264046426e-06, "loss": 0.0019, "step": 24070 }, { "epoch": 6.3498219232291255, "grad_norm": 0.00459143565967679, "learning_rate": 2.43541721621384e-06, "loss": 0.0001, "step": 24072 }, { "epoch": 6.3503495581057905, "grad_norm": 0.005412059370428324, "learning_rate": 2.4350655060230374e-06, "loss": 0.0001, "step": 24074 }, { "epoch": 6.350877192982456, "grad_norm": 0.6411781907081604, "learning_rate": 2.4347137958322344e-06, "loss": 0.0011, "step": 24076 }, { "epoch": 6.351404827859121, "grad_norm": 0.07459572702646255, "learning_rate": 2.4343620856414313e-06, "loss": 0.0011, "step": 24078 }, { "epoch": 6.351932462735787, "grad_norm": 0.0012616468593478203, "learning_rate": 2.4340103754506287e-06, "loss": 0.0001, "step": 24080 }, { "epoch": 6.352460097612452, "grad_norm": 0.0008265355136245489, "learning_rate": 2.4336586652598256e-06, "loss": 0.0001, "step": 24082 }, { "epoch": 6.352987732489117, "grad_norm": 0.0017106373561546206, "learning_rate": 2.4333069550690234e-06, "loss": 0.0001, "step": 24084 }, { "epoch": 6.353515367365783, "grad_norm": 0.02082224190235138, "learning_rate": 2.4329552448782204e-06, "loss": 0.0004, "step": 24086 }, { "epoch": 6.354043002242448, "grad_norm": 0.0010111982701346278, "learning_rate": 2.4326035346874178e-06, "loss": 0.0001, "step": 24088 }, { "epoch": 6.354570637119114, "grad_norm": 0.003271704539656639, "learning_rate": 2.4322518244966147e-06, "loss": 0.013, "step": 24090 }, { "epoch": 6.355098271995779, "grad_norm": 0.0019796097185462713, "learning_rate": 2.431900114305812e-06, "loss": 0.0001, "step": 24092 }, { "epoch": 6.3556259068724446, "grad_norm": 0.37743207812309265, "learning_rate": 2.431548404115009e-06, "loss": 0.0067, "step": 24094 }, { "epoch": 6.3561535417491095, "grad_norm": 0.19982542097568512, "learning_rate": 2.4311966939242064e-06, "loss": 0.0004, "step": 24096 }, { "epoch": 6.356681176625775, "grad_norm": 0.002028973074629903, "learning_rate": 2.430844983733404e-06, "loss": 0.0023, "step": 24098 }, { "epoch": 6.35720881150244, "grad_norm": 0.5021629333496094, "learning_rate": 2.430493273542601e-06, "loss": 0.0031, "step": 24100 }, { "epoch": 6.357736446379105, "grad_norm": 0.12511472404003143, "learning_rate": 2.430141563351798e-06, "loss": 0.0004, "step": 24102 }, { "epoch": 6.358264081255771, "grad_norm": 0.003696132218465209, "learning_rate": 2.4297898531609955e-06, "loss": 0.0001, "step": 24104 }, { "epoch": 6.358791716132436, "grad_norm": 0.0034351134672760963, "learning_rate": 2.4294381429701924e-06, "loss": 0.0001, "step": 24106 }, { "epoch": 6.359319351009102, "grad_norm": 0.8717883229255676, "learning_rate": 2.42908643277939e-06, "loss": 0.0034, "step": 24108 }, { "epoch": 6.359846985885767, "grad_norm": 0.0015094259288161993, "learning_rate": 2.428734722588587e-06, "loss": 0.0001, "step": 24110 }, { "epoch": 6.360374620762433, "grad_norm": 0.0017025263514369726, "learning_rate": 2.428383012397784e-06, "loss": 0.0001, "step": 24112 }, { "epoch": 6.360902255639098, "grad_norm": 0.003667554585263133, "learning_rate": 2.4280313022069815e-06, "loss": 0.0001, "step": 24114 }, { "epoch": 6.361429890515763, "grad_norm": 0.33489105105400085, "learning_rate": 2.4276795920161785e-06, "loss": 0.0008, "step": 24116 }, { "epoch": 6.3619575253924285, "grad_norm": 0.0053121852688491344, "learning_rate": 2.427327881825376e-06, "loss": 0.0001, "step": 24118 }, { "epoch": 6.3624851602690935, "grad_norm": 0.0373038612306118, "learning_rate": 2.4269761716345732e-06, "loss": 0.0001, "step": 24120 }, { "epoch": 6.363012795145759, "grad_norm": 0.02174742892384529, "learning_rate": 2.4266244614437706e-06, "loss": 0.0005, "step": 24122 }, { "epoch": 6.363540430022424, "grad_norm": 0.05800651013851166, "learning_rate": 2.4262727512529676e-06, "loss": 0.0064, "step": 24124 }, { "epoch": 6.36406806489909, "grad_norm": 0.03536314144730568, "learning_rate": 2.425921041062165e-06, "loss": 0.0004, "step": 24126 }, { "epoch": 6.364595699775755, "grad_norm": 0.001449194154702127, "learning_rate": 2.425569330871362e-06, "loss": 0.0017, "step": 24128 }, { "epoch": 6.36512333465242, "grad_norm": 0.17844988405704498, "learning_rate": 2.4252176206805593e-06, "loss": 0.0009, "step": 24130 }, { "epoch": 6.365650969529086, "grad_norm": 0.1610037237405777, "learning_rate": 2.4248659104897566e-06, "loss": 0.0004, "step": 24132 }, { "epoch": 6.366178604405751, "grad_norm": 0.12005294859409332, "learning_rate": 2.424514200298954e-06, "loss": 0.0007, "step": 24134 }, { "epoch": 6.366706239282417, "grad_norm": 0.010807872749865055, "learning_rate": 2.424162490108151e-06, "loss": 0.0001, "step": 24136 }, { "epoch": 6.367233874159082, "grad_norm": 0.002864086301997304, "learning_rate": 2.423810779917348e-06, "loss": 0.0002, "step": 24138 }, { "epoch": 6.367761509035748, "grad_norm": 0.011280419304966927, "learning_rate": 2.4234590697265453e-06, "loss": 0.0001, "step": 24140 }, { "epoch": 6.3682891439124125, "grad_norm": 0.00368869979865849, "learning_rate": 2.4231073595357427e-06, "loss": 0.0016, "step": 24142 }, { "epoch": 6.368816778789078, "grad_norm": 0.09198351204395294, "learning_rate": 2.42275564934494e-06, "loss": 0.0005, "step": 24144 }, { "epoch": 6.369344413665743, "grad_norm": 0.007959800772368908, "learning_rate": 2.422403939154137e-06, "loss": 0.0001, "step": 24146 }, { "epoch": 6.369872048542408, "grad_norm": 0.007565653882920742, "learning_rate": 2.4220522289633344e-06, "loss": 0.0001, "step": 24148 }, { "epoch": 6.370399683419074, "grad_norm": 0.009469871409237385, "learning_rate": 2.4217005187725313e-06, "loss": 0.0001, "step": 24150 }, { "epoch": 6.370927318295739, "grad_norm": 0.005124155897647142, "learning_rate": 2.4213488085817287e-06, "loss": 0.0001, "step": 24152 }, { "epoch": 6.371454953172405, "grad_norm": 0.003810752648860216, "learning_rate": 2.4209970983909257e-06, "loss": 0.0001, "step": 24154 }, { "epoch": 6.37198258804907, "grad_norm": 0.2689377963542938, "learning_rate": 2.4206453882001235e-06, "loss": 0.0006, "step": 24156 }, { "epoch": 6.372510222925736, "grad_norm": 0.49045708775520325, "learning_rate": 2.4202936780093204e-06, "loss": 0.0058, "step": 24158 }, { "epoch": 6.373037857802401, "grad_norm": 0.0022769940551370382, "learning_rate": 2.4199419678185178e-06, "loss": 0.0001, "step": 24160 }, { "epoch": 6.373565492679066, "grad_norm": 0.017044706270098686, "learning_rate": 2.4195902576277147e-06, "loss": 0.0057, "step": 24162 }, { "epoch": 6.3740931275557315, "grad_norm": 0.009780483320355415, "learning_rate": 2.419238547436912e-06, "loss": 0.0002, "step": 24164 }, { "epoch": 6.3746207624323965, "grad_norm": 0.039096761494874954, "learning_rate": 2.418886837246109e-06, "loss": 0.0002, "step": 24166 }, { "epoch": 6.375148397309062, "grad_norm": 0.008754352107644081, "learning_rate": 2.4185351270553064e-06, "loss": 0.0001, "step": 24168 }, { "epoch": 6.375676032185727, "grad_norm": 0.07739361375570297, "learning_rate": 2.418183416864504e-06, "loss": 0.0005, "step": 24170 }, { "epoch": 6.376203667062393, "grad_norm": 0.007956109941005707, "learning_rate": 2.4178317066737008e-06, "loss": 0.0001, "step": 24172 }, { "epoch": 6.376731301939058, "grad_norm": 0.008320354856550694, "learning_rate": 2.417479996482898e-06, "loss": 0.0001, "step": 24174 }, { "epoch": 6.377258936815723, "grad_norm": 0.5043384432792664, "learning_rate": 2.417128286292095e-06, "loss": 0.0035, "step": 24176 }, { "epoch": 6.377786571692389, "grad_norm": 0.009527179412543774, "learning_rate": 2.4167765761012925e-06, "loss": 0.0007, "step": 24178 }, { "epoch": 6.378314206569054, "grad_norm": 0.11933081597089767, "learning_rate": 2.41642486591049e-06, "loss": 0.0003, "step": 24180 }, { "epoch": 6.37884184144572, "grad_norm": 0.003083015326410532, "learning_rate": 2.4160731557196872e-06, "loss": 0.0001, "step": 24182 }, { "epoch": 6.379369476322385, "grad_norm": 0.023400476202368736, "learning_rate": 2.415721445528884e-06, "loss": 0.0001, "step": 24184 }, { "epoch": 6.379897111199051, "grad_norm": 0.0038102613762021065, "learning_rate": 2.4153697353380816e-06, "loss": 0.0001, "step": 24186 }, { "epoch": 6.3804247460757155, "grad_norm": 0.01782459206879139, "learning_rate": 2.4150180251472785e-06, "loss": 0.0001, "step": 24188 }, { "epoch": 6.380952380952381, "grad_norm": 0.07656212896108627, "learning_rate": 2.414666314956476e-06, "loss": 0.0004, "step": 24190 }, { "epoch": 6.381480015829046, "grad_norm": 0.04008113965392113, "learning_rate": 2.4143146047656733e-06, "loss": 0.0003, "step": 24192 }, { "epoch": 6.382007650705711, "grad_norm": 0.018782731145620346, "learning_rate": 2.4139628945748706e-06, "loss": 0.0001, "step": 24194 }, { "epoch": 6.382535285582377, "grad_norm": 0.012326487340033054, "learning_rate": 2.4136111843840676e-06, "loss": 0.0001, "step": 24196 }, { "epoch": 6.383062920459042, "grad_norm": 0.14743168652057648, "learning_rate": 2.4132594741932645e-06, "loss": 0.0005, "step": 24198 }, { "epoch": 6.383590555335708, "grad_norm": 0.024092258885502815, "learning_rate": 2.412907764002462e-06, "loss": 0.0001, "step": 24200 }, { "epoch": 6.384118190212373, "grad_norm": 0.004189283587038517, "learning_rate": 2.4125560538116593e-06, "loss": 0.0001, "step": 24202 }, { "epoch": 6.384645825089039, "grad_norm": 0.0027111589442938566, "learning_rate": 2.4122043436208567e-06, "loss": 0.0018, "step": 24204 }, { "epoch": 6.385173459965704, "grad_norm": 0.007719642948359251, "learning_rate": 2.4118526334300536e-06, "loss": 0.0002, "step": 24206 }, { "epoch": 6.385701094842369, "grad_norm": 0.010591979138553143, "learning_rate": 2.411500923239251e-06, "loss": 0.0039, "step": 24208 }, { "epoch": 6.3862287297190345, "grad_norm": 0.06652751564979553, "learning_rate": 2.411149213048448e-06, "loss": 0.0001, "step": 24210 }, { "epoch": 6.3867563645956995, "grad_norm": 0.019192982465028763, "learning_rate": 2.4107975028576453e-06, "loss": 0.0026, "step": 24212 }, { "epoch": 6.387283999472365, "grad_norm": 0.02218114398419857, "learning_rate": 2.4104457926668427e-06, "loss": 0.0001, "step": 24214 }, { "epoch": 6.38781163434903, "grad_norm": 0.009115158580243587, "learning_rate": 2.41009408247604e-06, "loss": 0.0002, "step": 24216 }, { "epoch": 6.388339269225696, "grad_norm": 0.005942982621490955, "learning_rate": 2.409742372285237e-06, "loss": 0.0001, "step": 24218 }, { "epoch": 6.388866904102361, "grad_norm": 0.037543006241321564, "learning_rate": 2.4093906620944344e-06, "loss": 0.0002, "step": 24220 }, { "epoch": 6.389394538979026, "grad_norm": 0.01292413379997015, "learning_rate": 2.4090389519036314e-06, "loss": 0.0016, "step": 24222 }, { "epoch": 6.389922173855692, "grad_norm": 0.06564980000257492, "learning_rate": 2.4086872417128287e-06, "loss": 0.0002, "step": 24224 }, { "epoch": 6.390449808732357, "grad_norm": 0.007242615334689617, "learning_rate": 2.408335531522026e-06, "loss": 0.0001, "step": 24226 }, { "epoch": 6.390977443609023, "grad_norm": 0.0017357559408992529, "learning_rate": 2.407983821331223e-06, "loss": 0.0002, "step": 24228 }, { "epoch": 6.391505078485688, "grad_norm": 0.18061715364456177, "learning_rate": 2.4076321111404204e-06, "loss": 0.0006, "step": 24230 }, { "epoch": 6.392032713362354, "grad_norm": 0.007555573713034391, "learning_rate": 2.4072804009496174e-06, "loss": 0.0002, "step": 24232 }, { "epoch": 6.3925603482390185, "grad_norm": 0.038607243448495865, "learning_rate": 2.4069286907588148e-06, "loss": 0.0001, "step": 24234 }, { "epoch": 6.393087983115684, "grad_norm": 0.051996972411870956, "learning_rate": 2.4065769805680117e-06, "loss": 0.0002, "step": 24236 }, { "epoch": 6.393615617992349, "grad_norm": 0.08383344858884811, "learning_rate": 2.406225270377209e-06, "loss": 0.0002, "step": 24238 }, { "epoch": 6.394143252869014, "grad_norm": 0.004689889494329691, "learning_rate": 2.4058735601864065e-06, "loss": 0.0006, "step": 24240 }, { "epoch": 6.39467088774568, "grad_norm": 0.0060361819341778755, "learning_rate": 2.405521849995604e-06, "loss": 0.0001, "step": 24242 }, { "epoch": 6.395198522622345, "grad_norm": 0.00202885246835649, "learning_rate": 2.405170139804801e-06, "loss": 0.0001, "step": 24244 }, { "epoch": 6.395726157499011, "grad_norm": 0.18709765374660492, "learning_rate": 2.404818429613998e-06, "loss": 0.0002, "step": 24246 }, { "epoch": 6.396253792375676, "grad_norm": 0.003312777727842331, "learning_rate": 2.404466719423195e-06, "loss": 0.0001, "step": 24248 }, { "epoch": 6.396781427252342, "grad_norm": 0.1398593634366989, "learning_rate": 2.4041150092323925e-06, "loss": 0.0006, "step": 24250 }, { "epoch": 6.397309062129007, "grad_norm": 0.006135057657957077, "learning_rate": 2.40376329904159e-06, "loss": 0.0001, "step": 24252 }, { "epoch": 6.397836697005672, "grad_norm": 0.0030578160658478737, "learning_rate": 2.4034115888507873e-06, "loss": 0.0001, "step": 24254 }, { "epoch": 6.3983643318823376, "grad_norm": 0.018418362364172935, "learning_rate": 2.403059878659984e-06, "loss": 0.0002, "step": 24256 }, { "epoch": 6.3988919667590025, "grad_norm": 0.003372115781530738, "learning_rate": 2.4027081684691816e-06, "loss": 0.0001, "step": 24258 }, { "epoch": 6.399419601635668, "grad_norm": 0.000818107568193227, "learning_rate": 2.4023564582783785e-06, "loss": 0.0001, "step": 24260 }, { "epoch": 6.399947236512333, "grad_norm": 0.0014875595225021243, "learning_rate": 2.402004748087576e-06, "loss": 0.0001, "step": 24262 }, { "epoch": 6.400474871388999, "grad_norm": 0.0016873121494427323, "learning_rate": 2.4016530378967733e-06, "loss": 0.0001, "step": 24264 }, { "epoch": 6.401002506265664, "grad_norm": 0.006628656294196844, "learning_rate": 2.4013013277059702e-06, "loss": 0.0004, "step": 24266 }, { "epoch": 6.401530141142329, "grad_norm": 0.001861134311184287, "learning_rate": 2.4009496175151676e-06, "loss": 0.0002, "step": 24268 }, { "epoch": 6.402057776018995, "grad_norm": 0.0009170022676698864, "learning_rate": 2.4005979073243646e-06, "loss": 0.0001, "step": 24270 }, { "epoch": 6.40258541089566, "grad_norm": 0.05328226089477539, "learning_rate": 2.400246197133562e-06, "loss": 0.0001, "step": 24272 }, { "epoch": 6.403113045772326, "grad_norm": 0.002794130938127637, "learning_rate": 2.3998944869427593e-06, "loss": 0.0002, "step": 24274 }, { "epoch": 6.403640680648991, "grad_norm": 0.0033511545043438673, "learning_rate": 2.3995427767519567e-06, "loss": 0.0001, "step": 24276 }, { "epoch": 6.404168315525657, "grad_norm": 0.0026138576213270426, "learning_rate": 2.3991910665611536e-06, "loss": 0.0002, "step": 24278 }, { "epoch": 6.4046959504023215, "grad_norm": 0.03489018231630325, "learning_rate": 2.398839356370351e-06, "loss": 0.0002, "step": 24280 }, { "epoch": 6.405223585278987, "grad_norm": 0.04476863518357277, "learning_rate": 2.398487646179548e-06, "loss": 0.0001, "step": 24282 }, { "epoch": 6.405751220155652, "grad_norm": 0.003720424836501479, "learning_rate": 2.3981359359887453e-06, "loss": 0.0001, "step": 24284 }, { "epoch": 6.406278855032317, "grad_norm": 0.023506004363298416, "learning_rate": 2.3977842257979427e-06, "loss": 0.0002, "step": 24286 }, { "epoch": 6.406806489908983, "grad_norm": 0.042251814156770706, "learning_rate": 2.39743251560714e-06, "loss": 0.0002, "step": 24288 }, { "epoch": 6.407334124785648, "grad_norm": 0.14559121429920197, "learning_rate": 2.397080805416337e-06, "loss": 0.0004, "step": 24290 }, { "epoch": 6.407861759662314, "grad_norm": 0.1309085488319397, "learning_rate": 2.396729095225534e-06, "loss": 0.0004, "step": 24292 }, { "epoch": 6.408389394538979, "grad_norm": 0.022502638399600983, "learning_rate": 2.3963773850347314e-06, "loss": 0.0002, "step": 24294 }, { "epoch": 6.408917029415645, "grad_norm": 0.003141846740618348, "learning_rate": 2.3960256748439283e-06, "loss": 0.0001, "step": 24296 }, { "epoch": 6.40944466429231, "grad_norm": 0.0018559073796495795, "learning_rate": 2.395673964653126e-06, "loss": 0.0001, "step": 24298 }, { "epoch": 6.409972299168975, "grad_norm": 0.6195290684700012, "learning_rate": 2.395322254462323e-06, "loss": 0.0018, "step": 24300 }, { "epoch": 6.410499934045641, "grad_norm": 0.5939748883247375, "learning_rate": 2.3949705442715205e-06, "loss": 0.0039, "step": 24302 }, { "epoch": 6.4110275689223055, "grad_norm": 0.007471121847629547, "learning_rate": 2.3946188340807174e-06, "loss": 0.0002, "step": 24304 }, { "epoch": 6.411555203798971, "grad_norm": 0.13274499773979187, "learning_rate": 2.394267123889915e-06, "loss": 0.0005, "step": 24306 }, { "epoch": 6.412082838675636, "grad_norm": 0.3859129250049591, "learning_rate": 2.3939154136991117e-06, "loss": 0.0005, "step": 24308 }, { "epoch": 6.412610473552302, "grad_norm": 1.1923606395721436, "learning_rate": 2.3935637035083095e-06, "loss": 0.0054, "step": 24310 }, { "epoch": 6.413138108428967, "grad_norm": 0.012376271188259125, "learning_rate": 2.3932119933175065e-06, "loss": 0.0018, "step": 24312 }, { "epoch": 6.413665743305632, "grad_norm": 0.0034206791315227747, "learning_rate": 2.392860283126704e-06, "loss": 0.0006, "step": 24314 }, { "epoch": 6.414193378182298, "grad_norm": 0.030675286427140236, "learning_rate": 2.392508572935901e-06, "loss": 0.0002, "step": 24316 }, { "epoch": 6.414721013058963, "grad_norm": 0.015597477555274963, "learning_rate": 2.392156862745098e-06, "loss": 0.0005, "step": 24318 }, { "epoch": 6.415248647935629, "grad_norm": 0.06860864907503128, "learning_rate": 2.391805152554295e-06, "loss": 0.0002, "step": 24320 }, { "epoch": 6.415776282812294, "grad_norm": 0.31921347975730896, "learning_rate": 2.3914534423634925e-06, "loss": 0.0011, "step": 24322 }, { "epoch": 6.41630391768896, "grad_norm": 0.09248476475477219, "learning_rate": 2.39110173217269e-06, "loss": 0.0002, "step": 24324 }, { "epoch": 6.4168315525656245, "grad_norm": 0.09283416718244553, "learning_rate": 2.390750021981887e-06, "loss": 0.0003, "step": 24326 }, { "epoch": 6.41735918744229, "grad_norm": 0.0022925189696252346, "learning_rate": 2.3903983117910842e-06, "loss": 0.0002, "step": 24328 }, { "epoch": 6.417886822318955, "grad_norm": 0.004847338423132896, "learning_rate": 2.390046601600281e-06, "loss": 0.002, "step": 24330 }, { "epoch": 6.41841445719562, "grad_norm": 0.11292438209056854, "learning_rate": 2.3896948914094786e-06, "loss": 0.0013, "step": 24332 }, { "epoch": 6.418942092072286, "grad_norm": 0.03244318440556526, "learning_rate": 2.389343181218676e-06, "loss": 0.0001, "step": 24334 }, { "epoch": 6.419469726948951, "grad_norm": 0.1783829927444458, "learning_rate": 2.3889914710278733e-06, "loss": 0.0062, "step": 24336 }, { "epoch": 6.419997361825617, "grad_norm": 0.00309918075799942, "learning_rate": 2.3886397608370703e-06, "loss": 0.0002, "step": 24338 }, { "epoch": 6.420524996702282, "grad_norm": 0.1350833922624588, "learning_rate": 2.3882880506462676e-06, "loss": 0.003, "step": 24340 }, { "epoch": 6.421052631578947, "grad_norm": 0.0023836023174226284, "learning_rate": 2.3879363404554646e-06, "loss": 0.0001, "step": 24342 }, { "epoch": 6.421580266455613, "grad_norm": 0.04615175724029541, "learning_rate": 2.387584630264662e-06, "loss": 0.0036, "step": 24344 }, { "epoch": 6.422107901332278, "grad_norm": 0.0106732826679945, "learning_rate": 2.3872329200738593e-06, "loss": 0.0002, "step": 24346 }, { "epoch": 6.422635536208944, "grad_norm": 0.6747729778289795, "learning_rate": 2.3868812098830567e-06, "loss": 0.0064, "step": 24348 }, { "epoch": 6.4231631710856085, "grad_norm": 0.02536788210272789, "learning_rate": 2.3865294996922537e-06, "loss": 0.0014, "step": 24350 }, { "epoch": 6.423690805962274, "grad_norm": 0.050276994705200195, "learning_rate": 2.3861777895014506e-06, "loss": 0.0002, "step": 24352 }, { "epoch": 6.424218440838939, "grad_norm": 0.2465272843837738, "learning_rate": 2.385826079310648e-06, "loss": 0.001, "step": 24354 }, { "epoch": 6.424746075715605, "grad_norm": 0.02370557375252247, "learning_rate": 2.3854743691198454e-06, "loss": 0.0002, "step": 24356 }, { "epoch": 6.42527371059227, "grad_norm": 0.04154595732688904, "learning_rate": 2.3851226589290427e-06, "loss": 0.0002, "step": 24358 }, { "epoch": 6.425801345468935, "grad_norm": 0.1769532859325409, "learning_rate": 2.3847709487382397e-06, "loss": 0.0004, "step": 24360 }, { "epoch": 6.426328980345601, "grad_norm": 0.1500728875398636, "learning_rate": 2.384419238547437e-06, "loss": 0.0004, "step": 24362 }, { "epoch": 6.426856615222266, "grad_norm": 0.026287643238902092, "learning_rate": 2.384067528356634e-06, "loss": 0.0012, "step": 24364 }, { "epoch": 6.427384250098932, "grad_norm": 0.09424195438623428, "learning_rate": 2.3837158181658314e-06, "loss": 0.0003, "step": 24366 }, { "epoch": 6.427911884975597, "grad_norm": 0.10357566922903061, "learning_rate": 2.3833641079750288e-06, "loss": 0.0046, "step": 24368 }, { "epoch": 6.428439519852263, "grad_norm": 0.11300700157880783, "learning_rate": 2.383012397784226e-06, "loss": 0.0033, "step": 24370 }, { "epoch": 6.4289671547289275, "grad_norm": 0.002254159189760685, "learning_rate": 2.382660687593423e-06, "loss": 0.0001, "step": 24372 }, { "epoch": 6.429494789605593, "grad_norm": 0.11644255369901657, "learning_rate": 2.3823089774026205e-06, "loss": 0.0033, "step": 24374 }, { "epoch": 6.430022424482258, "grad_norm": 0.009672172367572784, "learning_rate": 2.3819572672118174e-06, "loss": 0.0002, "step": 24376 }, { "epoch": 6.430550059358923, "grad_norm": 0.002002500230446458, "learning_rate": 2.381605557021015e-06, "loss": 0.0001, "step": 24378 }, { "epoch": 6.431077694235589, "grad_norm": 0.00459685642272234, "learning_rate": 2.3812538468302118e-06, "loss": 0.0001, "step": 24380 }, { "epoch": 6.431605329112254, "grad_norm": 0.011271066963672638, "learning_rate": 2.380902136639409e-06, "loss": 0.0002, "step": 24382 }, { "epoch": 6.43213296398892, "grad_norm": 0.00452108308672905, "learning_rate": 2.3805504264486065e-06, "loss": 0.0004, "step": 24384 }, { "epoch": 6.432660598865585, "grad_norm": 0.003438010811805725, "learning_rate": 2.3801987162578035e-06, "loss": 0.0078, "step": 24386 }, { "epoch": 6.43318823374225, "grad_norm": 0.008286985568702221, "learning_rate": 2.379847006067001e-06, "loss": 0.0001, "step": 24388 }, { "epoch": 6.433715868618916, "grad_norm": 0.9169626235961914, "learning_rate": 2.379495295876198e-06, "loss": 0.002, "step": 24390 }, { "epoch": 6.434243503495581, "grad_norm": 0.3776134252548218, "learning_rate": 2.379143585685395e-06, "loss": 0.001, "step": 24392 }, { "epoch": 6.434771138372247, "grad_norm": 0.0029361085034906864, "learning_rate": 2.3787918754945926e-06, "loss": 0.0001, "step": 24394 }, { "epoch": 6.4352987732489115, "grad_norm": 0.004859859589487314, "learning_rate": 2.37844016530379e-06, "loss": 0.0001, "step": 24396 }, { "epoch": 6.435826408125577, "grad_norm": 0.014424318447709084, "learning_rate": 2.378088455112987e-06, "loss": 0.0001, "step": 24398 }, { "epoch": 6.436354043002242, "grad_norm": 0.03531263396143913, "learning_rate": 2.3777367449221843e-06, "loss": 0.0003, "step": 24400 }, { "epoch": 6.436881677878908, "grad_norm": 0.025937994942069054, "learning_rate": 2.377385034731381e-06, "loss": 0.0001, "step": 24402 }, { "epoch": 6.437409312755573, "grad_norm": 0.013678055256605148, "learning_rate": 2.3770333245405786e-06, "loss": 0.0001, "step": 24404 }, { "epoch": 6.437936947632238, "grad_norm": 0.01058882661163807, "learning_rate": 2.376681614349776e-06, "loss": 0.0002, "step": 24406 }, { "epoch": 6.438464582508904, "grad_norm": 0.28343167901039124, "learning_rate": 2.3763299041589733e-06, "loss": 0.0005, "step": 24408 }, { "epoch": 6.438992217385569, "grad_norm": 0.11463800817728043, "learning_rate": 2.3759781939681703e-06, "loss": 0.0002, "step": 24410 }, { "epoch": 6.439519852262235, "grad_norm": 0.009792066179215908, "learning_rate": 2.3756264837773672e-06, "loss": 0.0001, "step": 24412 }, { "epoch": 6.4400474871389, "grad_norm": 0.005294619593769312, "learning_rate": 2.3752747735865646e-06, "loss": 0.0001, "step": 24414 }, { "epoch": 6.440575122015566, "grad_norm": 0.015442072413861752, "learning_rate": 2.374923063395762e-06, "loss": 0.0001, "step": 24416 }, { "epoch": 6.4411027568922306, "grad_norm": 0.03621479123830795, "learning_rate": 2.3745713532049594e-06, "loss": 0.0002, "step": 24418 }, { "epoch": 6.4416303917688955, "grad_norm": 0.014685380272567272, "learning_rate": 2.3742196430141563e-06, "loss": 0.0001, "step": 24420 }, { "epoch": 6.442158026645561, "grad_norm": 0.010307634249329567, "learning_rate": 2.3738679328233537e-06, "loss": 0.0016, "step": 24422 }, { "epoch": 6.442685661522226, "grad_norm": 0.01693759299814701, "learning_rate": 2.3735162226325506e-06, "loss": 0.0002, "step": 24424 }, { "epoch": 6.443213296398892, "grad_norm": 0.002420326229184866, "learning_rate": 2.373164512441748e-06, "loss": 0.0001, "step": 24426 }, { "epoch": 6.443740931275557, "grad_norm": 0.0022009231615811586, "learning_rate": 2.3728128022509454e-06, "loss": 0.0002, "step": 24428 }, { "epoch": 6.444268566152223, "grad_norm": 0.001618977403268218, "learning_rate": 2.3724610920601428e-06, "loss": 0.0001, "step": 24430 }, { "epoch": 6.444796201028888, "grad_norm": 0.0019453956047073007, "learning_rate": 2.3721093818693397e-06, "loss": 0.0001, "step": 24432 }, { "epoch": 6.445323835905553, "grad_norm": 0.0029280995950102806, "learning_rate": 2.371757671678537e-06, "loss": 0.0032, "step": 24434 }, { "epoch": 6.445851470782219, "grad_norm": 0.0022023245692253113, "learning_rate": 2.371405961487734e-06, "loss": 0.0001, "step": 24436 }, { "epoch": 6.446379105658884, "grad_norm": 0.03851255774497986, "learning_rate": 2.3710542512969314e-06, "loss": 0.0002, "step": 24438 }, { "epoch": 6.44690674053555, "grad_norm": 0.005204461514949799, "learning_rate": 2.370702541106129e-06, "loss": 0.0038, "step": 24440 }, { "epoch": 6.4474343754122145, "grad_norm": 0.003234823467209935, "learning_rate": 2.3703508309153258e-06, "loss": 0.0004, "step": 24442 }, { "epoch": 6.44796201028888, "grad_norm": 0.18775632977485657, "learning_rate": 2.369999120724523e-06, "loss": 0.0003, "step": 24444 }, { "epoch": 6.448489645165545, "grad_norm": 0.002068927511572838, "learning_rate": 2.36964741053372e-06, "loss": 0.0001, "step": 24446 }, { "epoch": 6.449017280042211, "grad_norm": 0.19914333522319794, "learning_rate": 2.3692957003429175e-06, "loss": 0.0003, "step": 24448 }, { "epoch": 6.449544914918876, "grad_norm": 0.007562971208244562, "learning_rate": 2.3689439901521144e-06, "loss": 0.0001, "step": 24450 }, { "epoch": 6.450072549795541, "grad_norm": 0.1427297443151474, "learning_rate": 2.368592279961312e-06, "loss": 0.0025, "step": 24452 }, { "epoch": 6.450600184672207, "grad_norm": 0.027955396100878716, "learning_rate": 2.368240569770509e-06, "loss": 0.0004, "step": 24454 }, { "epoch": 6.451127819548872, "grad_norm": 0.14875134825706482, "learning_rate": 2.3678888595797065e-06, "loss": 0.0006, "step": 24456 }, { "epoch": 6.451655454425538, "grad_norm": 0.005865011364221573, "learning_rate": 2.3675371493889035e-06, "loss": 0.0001, "step": 24458 }, { "epoch": 6.452183089302203, "grad_norm": 0.02177192084491253, "learning_rate": 2.367185439198101e-06, "loss": 0.0001, "step": 24460 }, { "epoch": 6.452710724178869, "grad_norm": 0.05390071123838425, "learning_rate": 2.366833729007298e-06, "loss": 0.0002, "step": 24462 }, { "epoch": 6.453238359055534, "grad_norm": 0.006339055951684713, "learning_rate": 2.366482018816495e-06, "loss": 0.0001, "step": 24464 }, { "epoch": 6.4537659939321985, "grad_norm": 0.28558865189552307, "learning_rate": 2.3661303086256926e-06, "loss": 0.0008, "step": 24466 }, { "epoch": 6.454293628808864, "grad_norm": 0.012591088190674782, "learning_rate": 2.36577859843489e-06, "loss": 0.0001, "step": 24468 }, { "epoch": 6.454821263685529, "grad_norm": 0.914025604724884, "learning_rate": 2.365426888244087e-06, "loss": 0.0032, "step": 24470 }, { "epoch": 6.455348898562195, "grad_norm": 0.17330382764339447, "learning_rate": 2.365075178053284e-06, "loss": 0.0003, "step": 24472 }, { "epoch": 6.45587653343886, "grad_norm": 0.001249258522875607, "learning_rate": 2.3647234678624812e-06, "loss": 0.0001, "step": 24474 }, { "epoch": 6.456404168315526, "grad_norm": 0.38589608669281006, "learning_rate": 2.3643717576716786e-06, "loss": 0.0047, "step": 24476 }, { "epoch": 6.456931803192191, "grad_norm": 0.0042247469536960125, "learning_rate": 2.364020047480876e-06, "loss": 0.0001, "step": 24478 }, { "epoch": 6.457459438068856, "grad_norm": 0.0027644005604088306, "learning_rate": 2.363668337290073e-06, "loss": 0.0001, "step": 24480 }, { "epoch": 6.457987072945522, "grad_norm": 0.0020545220468193293, "learning_rate": 2.3633166270992703e-06, "loss": 0.0001, "step": 24482 }, { "epoch": 6.458514707822187, "grad_norm": 0.007745794951915741, "learning_rate": 2.3629649169084673e-06, "loss": 0.0002, "step": 24484 }, { "epoch": 6.459042342698853, "grad_norm": 0.015563352033495903, "learning_rate": 2.3626132067176646e-06, "loss": 0.0001, "step": 24486 }, { "epoch": 6.4595699775755175, "grad_norm": 0.005551987327635288, "learning_rate": 2.362261496526862e-06, "loss": 0.0001, "step": 24488 }, { "epoch": 6.460097612452183, "grad_norm": 0.052189286798238754, "learning_rate": 2.3619097863360594e-06, "loss": 0.0004, "step": 24490 }, { "epoch": 6.460625247328848, "grad_norm": 0.03821150213479996, "learning_rate": 2.3615580761452563e-06, "loss": 0.0002, "step": 24492 }, { "epoch": 6.461152882205514, "grad_norm": 0.0053030080161988735, "learning_rate": 2.3612063659544537e-06, "loss": 0.0001, "step": 24494 }, { "epoch": 6.461680517082179, "grad_norm": 0.018844755366444588, "learning_rate": 2.3608546557636507e-06, "loss": 0.0001, "step": 24496 }, { "epoch": 6.462208151958844, "grad_norm": 0.002076726173982024, "learning_rate": 2.360502945572848e-06, "loss": 0.0002, "step": 24498 }, { "epoch": 6.46273578683551, "grad_norm": 0.0013881383929401636, "learning_rate": 2.3601512353820454e-06, "loss": 0.0001, "step": 24500 }, { "epoch": 6.463263421712175, "grad_norm": 0.0019287812756374478, "learning_rate": 2.3597995251912424e-06, "loss": 0.0001, "step": 24502 }, { "epoch": 6.463791056588841, "grad_norm": 0.40014541149139404, "learning_rate": 2.3594478150004398e-06, "loss": 0.0006, "step": 24504 }, { "epoch": 6.464318691465506, "grad_norm": 0.12246216088533401, "learning_rate": 2.3590961048096367e-06, "loss": 0.0002, "step": 24506 }, { "epoch": 6.464846326342172, "grad_norm": 0.6003112196922302, "learning_rate": 2.358744394618834e-06, "loss": 0.0007, "step": 24508 }, { "epoch": 6.465373961218837, "grad_norm": 0.019159119576215744, "learning_rate": 2.358392684428031e-06, "loss": 0.0014, "step": 24510 }, { "epoch": 6.4659015960955015, "grad_norm": 0.0005608104984275997, "learning_rate": 2.358040974237229e-06, "loss": 0.0062, "step": 24512 }, { "epoch": 6.466429230972167, "grad_norm": 0.06907230615615845, "learning_rate": 2.3576892640464258e-06, "loss": 0.0002, "step": 24514 }, { "epoch": 6.466956865848832, "grad_norm": 0.030454937368631363, "learning_rate": 2.357337553855623e-06, "loss": 0.0002, "step": 24516 }, { "epoch": 6.467484500725498, "grad_norm": 0.0012516326969489455, "learning_rate": 2.35698584366482e-06, "loss": 0.0015, "step": 24518 }, { "epoch": 6.468012135602163, "grad_norm": 0.008747225627303123, "learning_rate": 2.3566341334740175e-06, "loss": 0.0001, "step": 24520 }, { "epoch": 6.468539770478829, "grad_norm": 0.00821169838309288, "learning_rate": 2.3562824232832144e-06, "loss": 0.0001, "step": 24522 }, { "epoch": 6.469067405355494, "grad_norm": 0.001471090130507946, "learning_rate": 2.3559307130924122e-06, "loss": 0.0001, "step": 24524 }, { "epoch": 6.469595040232159, "grad_norm": 0.002946541178971529, "learning_rate": 2.355579002901609e-06, "loss": 0.0001, "step": 24526 }, { "epoch": 6.470122675108825, "grad_norm": 0.006958620622754097, "learning_rate": 2.3552272927108066e-06, "loss": 0.0001, "step": 24528 }, { "epoch": 6.47065030998549, "grad_norm": 0.0008872178150340915, "learning_rate": 2.3548755825200035e-06, "loss": 0.0001, "step": 24530 }, { "epoch": 6.471177944862156, "grad_norm": 0.0007833848358131945, "learning_rate": 2.3545238723292005e-06, "loss": 0.0001, "step": 24532 }, { "epoch": 6.4717055797388205, "grad_norm": 0.9903832077980042, "learning_rate": 2.354172162138398e-06, "loss": 0.0022, "step": 24534 }, { "epoch": 6.472233214615486, "grad_norm": 0.010925150476396084, "learning_rate": 2.3538204519475952e-06, "loss": 0.0001, "step": 24536 }, { "epoch": 6.472760849492151, "grad_norm": 1.2682344913482666, "learning_rate": 2.3534687417567926e-06, "loss": 0.0019, "step": 24538 }, { "epoch": 6.473288484368817, "grad_norm": 0.07830589264631271, "learning_rate": 2.3531170315659896e-06, "loss": 0.0002, "step": 24540 }, { "epoch": 6.473816119245482, "grad_norm": 0.002577394712716341, "learning_rate": 2.352765321375187e-06, "loss": 0.0003, "step": 24542 }, { "epoch": 6.474343754122147, "grad_norm": 0.001434379257261753, "learning_rate": 2.352413611184384e-06, "loss": 0.0001, "step": 24544 }, { "epoch": 6.474871388998813, "grad_norm": 0.018631765618920326, "learning_rate": 2.3520619009935813e-06, "loss": 0.0001, "step": 24546 }, { "epoch": 6.475399023875478, "grad_norm": 0.00251315557397902, "learning_rate": 2.3517101908027786e-06, "loss": 0.0001, "step": 24548 }, { "epoch": 6.475926658752144, "grad_norm": 0.1487383246421814, "learning_rate": 2.351358480611976e-06, "loss": 0.0015, "step": 24550 }, { "epoch": 6.476454293628809, "grad_norm": 0.008144965395331383, "learning_rate": 2.351006770421173e-06, "loss": 0.0001, "step": 24552 }, { "epoch": 6.476981928505475, "grad_norm": 0.42692744731903076, "learning_rate": 2.3506550602303703e-06, "loss": 0.0007, "step": 24554 }, { "epoch": 6.47750956338214, "grad_norm": 0.00295701390132308, "learning_rate": 2.3503033500395673e-06, "loss": 0.0001, "step": 24556 }, { "epoch": 6.4780371982588045, "grad_norm": 0.2283526211977005, "learning_rate": 2.3499516398487647e-06, "loss": 0.0004, "step": 24558 }, { "epoch": 6.47856483313547, "grad_norm": 0.3487521708011627, "learning_rate": 2.349599929657962e-06, "loss": 0.0146, "step": 24560 }, { "epoch": 6.479092468012135, "grad_norm": 0.01074075885117054, "learning_rate": 2.3492482194671594e-06, "loss": 0.0001, "step": 24562 }, { "epoch": 6.479620102888801, "grad_norm": 0.05994730070233345, "learning_rate": 2.3488965092763564e-06, "loss": 0.0002, "step": 24564 }, { "epoch": 6.480147737765466, "grad_norm": 0.005835828371345997, "learning_rate": 2.3485447990855533e-06, "loss": 0.0001, "step": 24566 }, { "epoch": 6.480675372642132, "grad_norm": 0.89158034324646, "learning_rate": 2.3481930888947507e-06, "loss": 0.006, "step": 24568 }, { "epoch": 6.481203007518797, "grad_norm": 0.10587507486343384, "learning_rate": 2.347841378703948e-06, "loss": 0.0057, "step": 24570 }, { "epoch": 6.481730642395462, "grad_norm": 0.14369580149650574, "learning_rate": 2.3474896685131454e-06, "loss": 0.0002, "step": 24572 }, { "epoch": 6.482258277272128, "grad_norm": 0.0039043715223670006, "learning_rate": 2.3471379583223424e-06, "loss": 0.0001, "step": 24574 }, { "epoch": 6.482785912148793, "grad_norm": 0.6197925806045532, "learning_rate": 2.3467862481315398e-06, "loss": 0.0021, "step": 24576 }, { "epoch": 6.483313547025459, "grad_norm": 0.0070785460993647575, "learning_rate": 2.3464345379407367e-06, "loss": 0.0002, "step": 24578 }, { "epoch": 6.4838411819021236, "grad_norm": 0.5505849123001099, "learning_rate": 2.346082827749934e-06, "loss": 0.0009, "step": 24580 }, { "epoch": 6.484368816778789, "grad_norm": 0.0029033622704446316, "learning_rate": 2.3457311175591315e-06, "loss": 0.0003, "step": 24582 }, { "epoch": 6.484896451655454, "grad_norm": 0.030263187363743782, "learning_rate": 2.345379407368329e-06, "loss": 0.0002, "step": 24584 }, { "epoch": 6.48542408653212, "grad_norm": 0.023641418665647507, "learning_rate": 2.345027697177526e-06, "loss": 0.0002, "step": 24586 }, { "epoch": 6.485951721408785, "grad_norm": 0.06406548619270325, "learning_rate": 2.344675986986723e-06, "loss": 0.0013, "step": 24588 }, { "epoch": 6.48647935628545, "grad_norm": 0.20003430545330048, "learning_rate": 2.34432427679592e-06, "loss": 0.0005, "step": 24590 }, { "epoch": 6.487006991162116, "grad_norm": 0.004332163371145725, "learning_rate": 2.3439725666051175e-06, "loss": 0.0001, "step": 24592 }, { "epoch": 6.487534626038781, "grad_norm": 0.11507558077573776, "learning_rate": 2.3436208564143145e-06, "loss": 0.0002, "step": 24594 }, { "epoch": 6.488062260915447, "grad_norm": 0.0035847288090735674, "learning_rate": 2.343269146223512e-06, "loss": 0.0001, "step": 24596 }, { "epoch": 6.488589895792112, "grad_norm": 1.0288872718811035, "learning_rate": 2.3429174360327092e-06, "loss": 0.002, "step": 24598 }, { "epoch": 6.489117530668778, "grad_norm": 0.04895925149321556, "learning_rate": 2.342565725841906e-06, "loss": 0.0005, "step": 24600 }, { "epoch": 6.489645165545443, "grad_norm": 0.020392734557390213, "learning_rate": 2.3422140156511035e-06, "loss": 0.0002, "step": 24602 }, { "epoch": 6.4901728004221075, "grad_norm": 0.06299719959497452, "learning_rate": 2.3418623054603005e-06, "loss": 0.0002, "step": 24604 }, { "epoch": 6.490700435298773, "grad_norm": 0.012502988800406456, "learning_rate": 2.341510595269498e-06, "loss": 0.0001, "step": 24606 }, { "epoch": 6.491228070175438, "grad_norm": 0.540534496307373, "learning_rate": 2.3411588850786953e-06, "loss": 0.0059, "step": 24608 }, { "epoch": 6.491755705052104, "grad_norm": 0.0058235591277480125, "learning_rate": 2.3408071748878926e-06, "loss": 0.0001, "step": 24610 }, { "epoch": 6.492283339928769, "grad_norm": 0.004045591223984957, "learning_rate": 2.3404554646970896e-06, "loss": 0.003, "step": 24612 }, { "epoch": 6.492810974805435, "grad_norm": 0.0032643231097608805, "learning_rate": 2.340103754506287e-06, "loss": 0.0001, "step": 24614 }, { "epoch": 6.4933386096821, "grad_norm": 0.0010865051299333572, "learning_rate": 2.339752044315484e-06, "loss": 0.0001, "step": 24616 }, { "epoch": 6.493866244558765, "grad_norm": 0.006126099266111851, "learning_rate": 2.3394003341246813e-06, "loss": 0.0001, "step": 24618 }, { "epoch": 6.494393879435431, "grad_norm": 0.9831401109695435, "learning_rate": 2.3390486239338787e-06, "loss": 0.0026, "step": 24620 }, { "epoch": 6.494921514312096, "grad_norm": 0.0011095140362158418, "learning_rate": 2.338696913743076e-06, "loss": 0.0001, "step": 24622 }, { "epoch": 6.495449149188762, "grad_norm": 0.002134512411430478, "learning_rate": 2.338345203552273e-06, "loss": 0.0001, "step": 24624 }, { "epoch": 6.495976784065427, "grad_norm": 0.12712222337722778, "learning_rate": 2.33799349336147e-06, "loss": 0.0003, "step": 24626 }, { "epoch": 6.496504418942092, "grad_norm": 0.006086100824177265, "learning_rate": 2.3376417831706673e-06, "loss": 0.0005, "step": 24628 }, { "epoch": 6.497032053818757, "grad_norm": 0.0027557460125535727, "learning_rate": 2.3372900729798647e-06, "loss": 0.0001, "step": 24630 }, { "epoch": 6.497559688695423, "grad_norm": 0.14828604459762573, "learning_rate": 2.336938362789062e-06, "loss": 0.0004, "step": 24632 }, { "epoch": 6.498087323572088, "grad_norm": 0.29535630345344543, "learning_rate": 2.336586652598259e-06, "loss": 0.0038, "step": 24634 }, { "epoch": 6.498614958448753, "grad_norm": 0.32135429978370667, "learning_rate": 2.3362349424074564e-06, "loss": 0.013, "step": 24636 }, { "epoch": 6.499142593325419, "grad_norm": 0.2996337413787842, "learning_rate": 2.3358832322166533e-06, "loss": 0.0003, "step": 24638 }, { "epoch": 6.499670228202084, "grad_norm": 1.4192415475845337, "learning_rate": 2.3355315220258507e-06, "loss": 0.0075, "step": 24640 }, { "epoch": 6.50019786307875, "grad_norm": 0.0968925803899765, "learning_rate": 2.335179811835048e-06, "loss": 0.0003, "step": 24642 }, { "epoch": 6.500725497955415, "grad_norm": 0.10784435272216797, "learning_rate": 2.3348281016442455e-06, "loss": 0.0003, "step": 24644 }, { "epoch": 6.50125313283208, "grad_norm": 0.4262966513633728, "learning_rate": 2.3344763914534424e-06, "loss": 0.0004, "step": 24646 }, { "epoch": 6.501780767708746, "grad_norm": 0.008481400087475777, "learning_rate": 2.33412468126264e-06, "loss": 0.0001, "step": 24648 }, { "epoch": 6.5023084025854105, "grad_norm": 0.00522503349930048, "learning_rate": 2.3337729710718368e-06, "loss": 0.0001, "step": 24650 }, { "epoch": 6.502836037462076, "grad_norm": 0.09964394569396973, "learning_rate": 2.333421260881034e-06, "loss": 0.0017, "step": 24652 }, { "epoch": 6.503363672338741, "grad_norm": 0.007211057003587484, "learning_rate": 2.3330695506902315e-06, "loss": 0.0001, "step": 24654 }, { "epoch": 6.503891307215407, "grad_norm": 0.11914221197366714, "learning_rate": 2.3327178404994285e-06, "loss": 0.0011, "step": 24656 }, { "epoch": 6.504418942092072, "grad_norm": 0.008091996423900127, "learning_rate": 2.332366130308626e-06, "loss": 0.0002, "step": 24658 }, { "epoch": 6.504946576968738, "grad_norm": 0.0791073814034462, "learning_rate": 2.3320144201178228e-06, "loss": 0.0017, "step": 24660 }, { "epoch": 6.505474211845403, "grad_norm": 0.009207709692418575, "learning_rate": 2.33166270992702e-06, "loss": 0.0001, "step": 24662 }, { "epoch": 6.506001846722068, "grad_norm": 0.4431487023830414, "learning_rate": 2.331310999736217e-06, "loss": 0.0028, "step": 24664 }, { "epoch": 6.506529481598734, "grad_norm": 0.1744399070739746, "learning_rate": 2.330959289545415e-06, "loss": 0.0007, "step": 24666 }, { "epoch": 6.507057116475399, "grad_norm": 0.0027594738639891148, "learning_rate": 2.330607579354612e-06, "loss": 0.0001, "step": 24668 }, { "epoch": 6.507584751352065, "grad_norm": 0.0031042080372571945, "learning_rate": 2.3302558691638092e-06, "loss": 0.0027, "step": 24670 }, { "epoch": 6.50811238622873, "grad_norm": 0.00870050210505724, "learning_rate": 2.329904158973006e-06, "loss": 0.0042, "step": 24672 }, { "epoch": 6.508640021105395, "grad_norm": 1.385365605354309, "learning_rate": 2.3295524487822036e-06, "loss": 0.0025, "step": 24674 }, { "epoch": 6.50916765598206, "grad_norm": 0.016652416437864304, "learning_rate": 2.3292007385914005e-06, "loss": 0.0001, "step": 24676 }, { "epoch": 6.509695290858726, "grad_norm": 0.08093398064374924, "learning_rate": 2.328849028400598e-06, "loss": 0.0025, "step": 24678 }, { "epoch": 6.510222925735391, "grad_norm": 0.3088972866535187, "learning_rate": 2.3284973182097953e-06, "loss": 0.0025, "step": 24680 }, { "epoch": 6.510750560612056, "grad_norm": 0.34067806601524353, "learning_rate": 2.3281456080189927e-06, "loss": 0.0007, "step": 24682 }, { "epoch": 6.511278195488722, "grad_norm": 0.03222653269767761, "learning_rate": 2.3277938978281896e-06, "loss": 0.0002, "step": 24684 }, { "epoch": 6.511805830365387, "grad_norm": 0.07830186933279037, "learning_rate": 2.3274421876373866e-06, "loss": 0.0015, "step": 24686 }, { "epoch": 6.512333465242053, "grad_norm": 0.0028657251968979836, "learning_rate": 2.327090477446584e-06, "loss": 0.0001, "step": 24688 }, { "epoch": 6.512861100118718, "grad_norm": 0.001600444782525301, "learning_rate": 2.3267387672557813e-06, "loss": 0.0001, "step": 24690 }, { "epoch": 6.513388734995383, "grad_norm": 0.013842570595443249, "learning_rate": 2.3263870570649787e-06, "loss": 0.0001, "step": 24692 }, { "epoch": 6.513916369872049, "grad_norm": 0.1591746211051941, "learning_rate": 2.3260353468741756e-06, "loss": 0.0014, "step": 24694 }, { "epoch": 6.5144440047487135, "grad_norm": 0.04158637300133705, "learning_rate": 2.325683636683373e-06, "loss": 0.0002, "step": 24696 }, { "epoch": 6.514971639625379, "grad_norm": 0.01135702058672905, "learning_rate": 2.32533192649257e-06, "loss": 0.0001, "step": 24698 }, { "epoch": 6.515499274502044, "grad_norm": 0.001344640040770173, "learning_rate": 2.3249802163017673e-06, "loss": 0.0001, "step": 24700 }, { "epoch": 6.51602690937871, "grad_norm": 0.0014788967091590166, "learning_rate": 2.3246285061109647e-06, "loss": 0.0001, "step": 24702 }, { "epoch": 6.516554544255375, "grad_norm": 0.03737911954522133, "learning_rate": 2.324276795920162e-06, "loss": 0.0021, "step": 24704 }, { "epoch": 6.517082179132041, "grad_norm": 0.12125089019536972, "learning_rate": 2.323925085729359e-06, "loss": 0.0002, "step": 24706 }, { "epoch": 6.517609814008706, "grad_norm": 0.14003904163837433, "learning_rate": 2.3235733755385564e-06, "loss": 0.0003, "step": 24708 }, { "epoch": 6.518137448885371, "grad_norm": 0.007760449778288603, "learning_rate": 2.3232216653477534e-06, "loss": 0.0001, "step": 24710 }, { "epoch": 6.518665083762037, "grad_norm": 0.02012372575700283, "learning_rate": 2.3228699551569507e-06, "loss": 0.0001, "step": 24712 }, { "epoch": 6.519192718638702, "grad_norm": 0.14119265973567963, "learning_rate": 2.322518244966148e-06, "loss": 0.0047, "step": 24714 }, { "epoch": 6.519720353515368, "grad_norm": 0.018135836347937584, "learning_rate": 2.322166534775345e-06, "loss": 0.0001, "step": 24716 }, { "epoch": 6.520247988392033, "grad_norm": 0.020900344476103783, "learning_rate": 2.3218148245845425e-06, "loss": 0.0005, "step": 24718 }, { "epoch": 6.520775623268698, "grad_norm": 0.004359798040241003, "learning_rate": 2.3214631143937394e-06, "loss": 0.0044, "step": 24720 }, { "epoch": 6.521303258145363, "grad_norm": 0.006887100636959076, "learning_rate": 2.3211114042029368e-06, "loss": 0.0052, "step": 24722 }, { "epoch": 6.521830893022029, "grad_norm": 0.5324274897575378, "learning_rate": 2.320759694012134e-06, "loss": 0.001, "step": 24724 }, { "epoch": 6.522358527898694, "grad_norm": 0.6577027440071106, "learning_rate": 2.3204079838213315e-06, "loss": 0.002, "step": 24726 }, { "epoch": 6.522886162775359, "grad_norm": 0.5510252714157104, "learning_rate": 2.3200562736305285e-06, "loss": 0.0012, "step": 24728 }, { "epoch": 6.523413797652025, "grad_norm": 0.0012294388143345714, "learning_rate": 2.319704563439726e-06, "loss": 0.0001, "step": 24730 }, { "epoch": 6.52394143252869, "grad_norm": 0.01891196332871914, "learning_rate": 2.319352853248923e-06, "loss": 0.0001, "step": 24732 }, { "epoch": 6.524469067405356, "grad_norm": 0.003278717864304781, "learning_rate": 2.31900114305812e-06, "loss": 0.0022, "step": 24734 }, { "epoch": 6.524996702282021, "grad_norm": 0.007079073693603277, "learning_rate": 2.318649432867317e-06, "loss": 0.0002, "step": 24736 }, { "epoch": 6.525524337158686, "grad_norm": 0.017110157757997513, "learning_rate": 2.318297722676515e-06, "loss": 0.0003, "step": 24738 }, { "epoch": 6.526051972035352, "grad_norm": 0.0021904276218265295, "learning_rate": 2.317946012485712e-06, "loss": 0.0001, "step": 24740 }, { "epoch": 6.5265796069120166, "grad_norm": 0.060273997485637665, "learning_rate": 2.3175943022949093e-06, "loss": 0.0002, "step": 24742 }, { "epoch": 6.527107241788682, "grad_norm": 0.01960133947432041, "learning_rate": 2.3172425921041062e-06, "loss": 0.0001, "step": 24744 }, { "epoch": 6.527634876665347, "grad_norm": 0.013907826505601406, "learning_rate": 2.316890881913303e-06, "loss": 0.0001, "step": 24746 }, { "epoch": 6.528162511542013, "grad_norm": 0.004407146479934454, "learning_rate": 2.3165391717225005e-06, "loss": 0.0001, "step": 24748 }, { "epoch": 6.528690146418678, "grad_norm": 0.039029546082019806, "learning_rate": 2.316187461531698e-06, "loss": 0.0027, "step": 24750 }, { "epoch": 6.529217781295344, "grad_norm": 0.019300436601042747, "learning_rate": 2.3158357513408953e-06, "loss": 0.0001, "step": 24752 }, { "epoch": 6.529745416172009, "grad_norm": 0.010313049890100956, "learning_rate": 2.3154840411500923e-06, "loss": 0.0095, "step": 24754 }, { "epoch": 6.530273051048674, "grad_norm": 0.013692209497094154, "learning_rate": 2.3151323309592896e-06, "loss": 0.0001, "step": 24756 }, { "epoch": 6.53080068592534, "grad_norm": 0.6303525567054749, "learning_rate": 2.3147806207684866e-06, "loss": 0.001, "step": 24758 }, { "epoch": 6.531328320802005, "grad_norm": 0.013183945789933205, "learning_rate": 2.314428910577684e-06, "loss": 0.0001, "step": 24760 }, { "epoch": 6.531855955678671, "grad_norm": 0.002137075876817107, "learning_rate": 2.3140772003868813e-06, "loss": 0.0006, "step": 24762 }, { "epoch": 6.532383590555336, "grad_norm": 0.006035694386810064, "learning_rate": 2.3137254901960787e-06, "loss": 0.0001, "step": 24764 }, { "epoch": 6.532911225432001, "grad_norm": 0.37339404225349426, "learning_rate": 2.3133737800052757e-06, "loss": 0.007, "step": 24766 }, { "epoch": 6.533438860308666, "grad_norm": 0.04175891727209091, "learning_rate": 2.313022069814473e-06, "loss": 0.0003, "step": 24768 }, { "epoch": 6.533966495185332, "grad_norm": 0.005465970374643803, "learning_rate": 2.31267035962367e-06, "loss": 0.0001, "step": 24770 }, { "epoch": 6.534494130061997, "grad_norm": 0.5903480052947998, "learning_rate": 2.3123186494328674e-06, "loss": 0.0015, "step": 24772 }, { "epoch": 6.535021764938662, "grad_norm": 1.1951351165771484, "learning_rate": 2.3119669392420647e-06, "loss": 0.0015, "step": 24774 }, { "epoch": 6.535549399815328, "grad_norm": 0.1960376650094986, "learning_rate": 2.3116152290512617e-06, "loss": 0.0005, "step": 24776 }, { "epoch": 6.536077034691993, "grad_norm": 0.1357962191104889, "learning_rate": 2.311263518860459e-06, "loss": 0.0003, "step": 24778 }, { "epoch": 6.536604669568659, "grad_norm": 0.004082909785211086, "learning_rate": 2.310911808669656e-06, "loss": 0.0001, "step": 24780 }, { "epoch": 6.537132304445324, "grad_norm": 0.010139388963580132, "learning_rate": 2.3105600984788534e-06, "loss": 0.0001, "step": 24782 }, { "epoch": 6.537659939321989, "grad_norm": 0.004402786493301392, "learning_rate": 2.3102083882880508e-06, "loss": 0.0001, "step": 24784 }, { "epoch": 6.538187574198655, "grad_norm": 0.0025410945527255535, "learning_rate": 2.309856678097248e-06, "loss": 0.0021, "step": 24786 }, { "epoch": 6.53871520907532, "grad_norm": 0.01278635673224926, "learning_rate": 2.309504967906445e-06, "loss": 0.001, "step": 24788 }, { "epoch": 6.539242843951985, "grad_norm": 0.20393946766853333, "learning_rate": 2.3091532577156425e-06, "loss": 0.0005, "step": 24790 }, { "epoch": 6.53977047882865, "grad_norm": 0.0033949962817132473, "learning_rate": 2.3088015475248394e-06, "loss": 0.0001, "step": 24792 }, { "epoch": 6.540298113705316, "grad_norm": 0.004918168764561415, "learning_rate": 2.308449837334037e-06, "loss": 0.0002, "step": 24794 }, { "epoch": 6.540825748581981, "grad_norm": 0.0021959515288472176, "learning_rate": 2.308098127143234e-06, "loss": 0.0002, "step": 24796 }, { "epoch": 6.541353383458647, "grad_norm": 0.00563664548099041, "learning_rate": 2.3077464169524316e-06, "loss": 0.0001, "step": 24798 }, { "epoch": 6.541881018335312, "grad_norm": 0.059688907116651535, "learning_rate": 2.3073947067616285e-06, "loss": 0.0002, "step": 24800 }, { "epoch": 6.542408653211977, "grad_norm": 0.004714615643024445, "learning_rate": 2.307042996570826e-06, "loss": 0.0003, "step": 24802 }, { "epoch": 6.542936288088643, "grad_norm": 0.025426115840673447, "learning_rate": 2.306691286380023e-06, "loss": 0.0001, "step": 24804 }, { "epoch": 6.543463922965308, "grad_norm": 0.05052274838089943, "learning_rate": 2.30633957618922e-06, "loss": 0.0002, "step": 24806 }, { "epoch": 6.543991557841974, "grad_norm": 0.027371959760785103, "learning_rate": 2.305987865998417e-06, "loss": 0.0003, "step": 24808 }, { "epoch": 6.544519192718639, "grad_norm": 0.004965769127011299, "learning_rate": 2.3056361558076145e-06, "loss": 0.0001, "step": 24810 }, { "epoch": 6.545046827595304, "grad_norm": 0.03608398884534836, "learning_rate": 2.305284445616812e-06, "loss": 0.0001, "step": 24812 }, { "epoch": 6.545574462471969, "grad_norm": 0.15642699599266052, "learning_rate": 2.304932735426009e-06, "loss": 0.0007, "step": 24814 }, { "epoch": 6.546102097348635, "grad_norm": 0.004189171362668276, "learning_rate": 2.3045810252352062e-06, "loss": 0.0001, "step": 24816 }, { "epoch": 6.5466297322253, "grad_norm": 0.03764388710260391, "learning_rate": 2.304229315044403e-06, "loss": 0.0002, "step": 24818 }, { "epoch": 6.547157367101965, "grad_norm": 0.022270478308200836, "learning_rate": 2.3038776048536006e-06, "loss": 0.0002, "step": 24820 }, { "epoch": 6.547685001978631, "grad_norm": 0.5172247290611267, "learning_rate": 2.303525894662798e-06, "loss": 0.0009, "step": 24822 }, { "epoch": 6.548212636855296, "grad_norm": 0.02862609550356865, "learning_rate": 2.3031741844719953e-06, "loss": 0.0001, "step": 24824 }, { "epoch": 6.548740271731962, "grad_norm": 0.0023077945224940777, "learning_rate": 2.3028224742811923e-06, "loss": 0.0001, "step": 24826 }, { "epoch": 6.549267906608627, "grad_norm": 0.004338637460023165, "learning_rate": 2.3024707640903897e-06, "loss": 0.0001, "step": 24828 }, { "epoch": 6.549795541485292, "grad_norm": 0.0024836035445332527, "learning_rate": 2.3021190538995866e-06, "loss": 0.0004, "step": 24830 }, { "epoch": 6.550323176361958, "grad_norm": 0.0030868048779666424, "learning_rate": 2.301767343708784e-06, "loss": 0.0001, "step": 24832 }, { "epoch": 6.550850811238623, "grad_norm": 0.0029414461459964514, "learning_rate": 2.3014156335179814e-06, "loss": 0.0001, "step": 24834 }, { "epoch": 6.551378446115288, "grad_norm": 0.017973028123378754, "learning_rate": 2.3010639233271783e-06, "loss": 0.0001, "step": 24836 }, { "epoch": 6.551906080991953, "grad_norm": 0.001566405058838427, "learning_rate": 2.3007122131363757e-06, "loss": 0.0001, "step": 24838 }, { "epoch": 6.552433715868619, "grad_norm": 0.0019003766356036067, "learning_rate": 2.3003605029455726e-06, "loss": 0.0001, "step": 24840 }, { "epoch": 6.552961350745284, "grad_norm": 0.1819448620080948, "learning_rate": 2.30000879275477e-06, "loss": 0.0059, "step": 24842 }, { "epoch": 6.55348898562195, "grad_norm": 0.002177236834540963, "learning_rate": 2.2996570825639674e-06, "loss": 0.0001, "step": 24844 }, { "epoch": 6.554016620498615, "grad_norm": 0.00918660406023264, "learning_rate": 2.2993053723731648e-06, "loss": 0.0001, "step": 24846 }, { "epoch": 6.55454425537528, "grad_norm": 0.016852114349603653, "learning_rate": 2.2989536621823617e-06, "loss": 0.0001, "step": 24848 }, { "epoch": 6.555071890251946, "grad_norm": 0.0017727320082485676, "learning_rate": 2.298601951991559e-06, "loss": 0.0001, "step": 24850 }, { "epoch": 6.555599525128611, "grad_norm": 0.018934834748506546, "learning_rate": 2.298250241800756e-06, "loss": 0.0001, "step": 24852 }, { "epoch": 6.556127160005277, "grad_norm": 0.005145934876054525, "learning_rate": 2.2978985316099534e-06, "loss": 0.0001, "step": 24854 }, { "epoch": 6.556654794881942, "grad_norm": 0.0059177628718316555, "learning_rate": 2.297546821419151e-06, "loss": 0.0003, "step": 24856 }, { "epoch": 6.5571824297586065, "grad_norm": 0.7147813439369202, "learning_rate": 2.297195111228348e-06, "loss": 0.0019, "step": 24858 }, { "epoch": 6.557710064635272, "grad_norm": 0.0034838514402508736, "learning_rate": 2.296843401037545e-06, "loss": 0.0001, "step": 24860 }, { "epoch": 6.558237699511937, "grad_norm": 0.00611259276047349, "learning_rate": 2.2964916908467425e-06, "loss": 0.0002, "step": 24862 }, { "epoch": 6.558765334388603, "grad_norm": 0.006368127651512623, "learning_rate": 2.2961399806559395e-06, "loss": 0.0031, "step": 24864 }, { "epoch": 6.559292969265268, "grad_norm": 0.0032368835527449846, "learning_rate": 2.295788270465137e-06, "loss": 0.0001, "step": 24866 }, { "epoch": 6.559820604141934, "grad_norm": 0.002107650740072131, "learning_rate": 2.295436560274334e-06, "loss": 0.0001, "step": 24868 }, { "epoch": 6.560348239018599, "grad_norm": 0.001545066712424159, "learning_rate": 2.295084850083531e-06, "loss": 0.0003, "step": 24870 }, { "epoch": 6.560875873895265, "grad_norm": 0.1179480254650116, "learning_rate": 2.2947331398927285e-06, "loss": 0.0003, "step": 24872 }, { "epoch": 6.56140350877193, "grad_norm": 0.01948234625160694, "learning_rate": 2.2943814297019255e-06, "loss": 0.0002, "step": 24874 }, { "epoch": 6.561931143648595, "grad_norm": 0.670997679233551, "learning_rate": 2.294029719511123e-06, "loss": 0.0046, "step": 24876 }, { "epoch": 6.562458778525261, "grad_norm": 0.008368685841560364, "learning_rate": 2.29367800932032e-06, "loss": 0.0001, "step": 24878 }, { "epoch": 6.562986413401926, "grad_norm": 0.0014024991542100906, "learning_rate": 2.2933262991295176e-06, "loss": 0.0001, "step": 24880 }, { "epoch": 6.563514048278591, "grad_norm": 0.034048181027173996, "learning_rate": 2.2929745889387146e-06, "loss": 0.0008, "step": 24882 }, { "epoch": 6.564041683155256, "grad_norm": 0.0015752509934827685, "learning_rate": 2.292622878747912e-06, "loss": 0.0001, "step": 24884 }, { "epoch": 6.564569318031922, "grad_norm": 0.005629846826195717, "learning_rate": 2.292271168557109e-06, "loss": 0.003, "step": 24886 }, { "epoch": 6.565096952908587, "grad_norm": 0.23543772101402283, "learning_rate": 2.2919194583663063e-06, "loss": 0.0008, "step": 24888 }, { "epoch": 6.565624587785253, "grad_norm": 0.0011890858877450228, "learning_rate": 2.2915677481755032e-06, "loss": 0.0048, "step": 24890 }, { "epoch": 6.566152222661918, "grad_norm": 0.01687162183225155, "learning_rate": 2.2912160379847006e-06, "loss": 0.0001, "step": 24892 }, { "epoch": 6.566679857538583, "grad_norm": 0.002366309752687812, "learning_rate": 2.290864327793898e-06, "loss": 0.0002, "step": 24894 }, { "epoch": 6.567207492415249, "grad_norm": 0.036028072237968445, "learning_rate": 2.2905126176030954e-06, "loss": 0.0003, "step": 24896 }, { "epoch": 6.567735127291914, "grad_norm": 0.27601271867752075, "learning_rate": 2.2901609074122923e-06, "loss": 0.0009, "step": 24898 }, { "epoch": 6.56826276216858, "grad_norm": 0.015246260911226273, "learning_rate": 2.2898091972214893e-06, "loss": 0.0007, "step": 24900 }, { "epoch": 6.568790397045245, "grad_norm": 0.002546842908486724, "learning_rate": 2.2894574870306866e-06, "loss": 0.0001, "step": 24902 }, { "epoch": 6.5693180319219096, "grad_norm": 0.0323968231678009, "learning_rate": 2.289105776839884e-06, "loss": 0.0002, "step": 24904 }, { "epoch": 6.569845666798575, "grad_norm": 0.022756526246666908, "learning_rate": 2.2887540666490814e-06, "loss": 0.0001, "step": 24906 }, { "epoch": 6.57037330167524, "grad_norm": 0.04530736058950424, "learning_rate": 2.2884023564582783e-06, "loss": 0.0002, "step": 24908 }, { "epoch": 6.570900936551906, "grad_norm": 0.001726936548948288, "learning_rate": 2.2880506462674757e-06, "loss": 0.0001, "step": 24910 }, { "epoch": 6.571428571428571, "grad_norm": 0.0332476943731308, "learning_rate": 2.2876989360766727e-06, "loss": 0.0001, "step": 24912 }, { "epoch": 6.571956206305237, "grad_norm": 0.00677643995732069, "learning_rate": 2.28734722588587e-06, "loss": 0.0028, "step": 24914 }, { "epoch": 6.572483841181902, "grad_norm": 0.01781499944627285, "learning_rate": 2.2869955156950674e-06, "loss": 0.0003, "step": 24916 }, { "epoch": 6.573011476058568, "grad_norm": 0.0020740891341120005, "learning_rate": 2.286643805504265e-06, "loss": 0.0002, "step": 24918 }, { "epoch": 6.573539110935233, "grad_norm": 0.015945475548505783, "learning_rate": 2.2862920953134617e-06, "loss": 0.0001, "step": 24920 }, { "epoch": 6.574066745811898, "grad_norm": 0.01078153308480978, "learning_rate": 2.285940385122659e-06, "loss": 0.0001, "step": 24922 }, { "epoch": 6.574594380688564, "grad_norm": 0.012317730113863945, "learning_rate": 2.285588674931856e-06, "loss": 0.0012, "step": 24924 }, { "epoch": 6.575122015565229, "grad_norm": 0.07957704365253448, "learning_rate": 2.2852369647410534e-06, "loss": 0.0019, "step": 24926 }, { "epoch": 6.575649650441894, "grad_norm": 0.02253839187324047, "learning_rate": 2.284885254550251e-06, "loss": 0.0003, "step": 24928 }, { "epoch": 6.576177285318559, "grad_norm": 0.002621488878503442, "learning_rate": 2.2845335443594478e-06, "loss": 0.0001, "step": 24930 }, { "epoch": 6.576704920195225, "grad_norm": 0.03677641972899437, "learning_rate": 2.284181834168645e-06, "loss": 0.0001, "step": 24932 }, { "epoch": 6.57723255507189, "grad_norm": 0.015226524323225021, "learning_rate": 2.283830123977842e-06, "loss": 0.0001, "step": 24934 }, { "epoch": 6.577760189948556, "grad_norm": 0.0022728738840669394, "learning_rate": 2.2834784137870395e-06, "loss": 0.0135, "step": 24936 }, { "epoch": 6.578287824825221, "grad_norm": 0.005068502388894558, "learning_rate": 2.283126703596237e-06, "loss": 0.0001, "step": 24938 }, { "epoch": 6.578815459701886, "grad_norm": 0.002802544040605426, "learning_rate": 2.2827749934054342e-06, "loss": 0.0008, "step": 24940 }, { "epoch": 6.579343094578552, "grad_norm": 0.004057647660374641, "learning_rate": 2.282423283214631e-06, "loss": 0.0001, "step": 24942 }, { "epoch": 6.579870729455217, "grad_norm": 0.004686489701271057, "learning_rate": 2.2820715730238286e-06, "loss": 0.0001, "step": 24944 }, { "epoch": 6.580398364331883, "grad_norm": 0.022661607712507248, "learning_rate": 2.2817198628330255e-06, "loss": 0.0001, "step": 24946 }, { "epoch": 6.580925999208548, "grad_norm": 0.0015863986918702722, "learning_rate": 2.281368152642223e-06, "loss": 0.0001, "step": 24948 }, { "epoch": 6.581453634085213, "grad_norm": 0.0007639514515176415, "learning_rate": 2.28101644245142e-06, "loss": 0.0001, "step": 24950 }, { "epoch": 6.581981268961878, "grad_norm": 0.03945903107523918, "learning_rate": 2.2806647322606176e-06, "loss": 0.0003, "step": 24952 }, { "epoch": 6.582508903838543, "grad_norm": 0.1006418988108635, "learning_rate": 2.2803130220698146e-06, "loss": 0.0004, "step": 24954 }, { "epoch": 6.583036538715209, "grad_norm": 0.0028382448945194483, "learning_rate": 2.279961311879012e-06, "loss": 0.0033, "step": 24956 }, { "epoch": 6.583564173591874, "grad_norm": 0.026420623064041138, "learning_rate": 2.279609601688209e-06, "loss": 0.0001, "step": 24958 }, { "epoch": 6.58409180846854, "grad_norm": 0.0017067515291273594, "learning_rate": 2.279257891497406e-06, "loss": 0.0001, "step": 24960 }, { "epoch": 6.584619443345205, "grad_norm": 0.0065148877911269665, "learning_rate": 2.2789061813066032e-06, "loss": 0.0001, "step": 24962 }, { "epoch": 6.585147078221871, "grad_norm": 0.006093451287597418, "learning_rate": 2.2785544711158006e-06, "loss": 0.0001, "step": 24964 }, { "epoch": 6.585674713098536, "grad_norm": 0.015486658550798893, "learning_rate": 2.278202760924998e-06, "loss": 0.0001, "step": 24966 }, { "epoch": 6.586202347975201, "grad_norm": 0.38648462295532227, "learning_rate": 2.277851050734195e-06, "loss": 0.0063, "step": 24968 }, { "epoch": 6.586729982851867, "grad_norm": 0.408443421125412, "learning_rate": 2.2774993405433923e-06, "loss": 0.0017, "step": 24970 }, { "epoch": 6.587257617728532, "grad_norm": 0.02258377894759178, "learning_rate": 2.2771476303525893e-06, "loss": 0.0001, "step": 24972 }, { "epoch": 6.587785252605197, "grad_norm": 0.0052404385060071945, "learning_rate": 2.2767959201617867e-06, "loss": 0.0001, "step": 24974 }, { "epoch": 6.588312887481862, "grad_norm": 0.04014383256435394, "learning_rate": 2.276444209970984e-06, "loss": 0.0004, "step": 24976 }, { "epoch": 6.588840522358528, "grad_norm": 0.005946105811744928, "learning_rate": 2.2760924997801814e-06, "loss": 0.0002, "step": 24978 }, { "epoch": 6.589368157235193, "grad_norm": 0.018365686759352684, "learning_rate": 2.2757407895893784e-06, "loss": 0.0018, "step": 24980 }, { "epoch": 6.589895792111859, "grad_norm": 0.19415946304798126, "learning_rate": 2.2753890793985757e-06, "loss": 0.0025, "step": 24982 }, { "epoch": 6.590423426988524, "grad_norm": 0.005325102247297764, "learning_rate": 2.2750373692077727e-06, "loss": 0.0001, "step": 24984 }, { "epoch": 6.590951061865189, "grad_norm": 0.020516742020845413, "learning_rate": 2.27468565901697e-06, "loss": 0.0003, "step": 24986 }, { "epoch": 6.591478696741855, "grad_norm": 0.0035135832149535418, "learning_rate": 2.2743339488261674e-06, "loss": 0.0002, "step": 24988 }, { "epoch": 6.59200633161852, "grad_norm": 0.008606934919953346, "learning_rate": 2.2739822386353644e-06, "loss": 0.0001, "step": 24990 }, { "epoch": 6.592533966495186, "grad_norm": 0.4456424117088318, "learning_rate": 2.2736305284445618e-06, "loss": 0.0009, "step": 24992 }, { "epoch": 6.593061601371851, "grad_norm": 0.004951914772391319, "learning_rate": 2.2732788182537587e-06, "loss": 0.0001, "step": 24994 }, { "epoch": 6.593589236248516, "grad_norm": 0.0052743894048035145, "learning_rate": 2.272927108062956e-06, "loss": 0.0005, "step": 24996 }, { "epoch": 6.594116871125181, "grad_norm": 0.033454205840826035, "learning_rate": 2.2725753978721535e-06, "loss": 0.0001, "step": 24998 }, { "epoch": 6.594644506001846, "grad_norm": 0.01418271753937006, "learning_rate": 2.272223687681351e-06, "loss": 0.0002, "step": 25000 }, { "epoch": 6.595172140878512, "grad_norm": 0.1566755175590515, "learning_rate": 2.271871977490548e-06, "loss": 0.0022, "step": 25002 }, { "epoch": 6.595699775755177, "grad_norm": 0.0016476697055622935, "learning_rate": 2.271520267299745e-06, "loss": 0.0001, "step": 25004 }, { "epoch": 6.596227410631843, "grad_norm": 0.0026743903290480375, "learning_rate": 2.271168557108942e-06, "loss": 0.0001, "step": 25006 }, { "epoch": 6.596755045508508, "grad_norm": 0.03339619189500809, "learning_rate": 2.2708168469181395e-06, "loss": 0.0002, "step": 25008 }, { "epoch": 6.597282680385174, "grad_norm": 0.03300803527235985, "learning_rate": 2.270465136727337e-06, "loss": 0.0001, "step": 25010 }, { "epoch": 6.597810315261839, "grad_norm": 0.006503596436232328, "learning_rate": 2.2701134265365343e-06, "loss": 0.0001, "step": 25012 }, { "epoch": 6.598337950138504, "grad_norm": 0.03644294664263725, "learning_rate": 2.269761716345731e-06, "loss": 0.0002, "step": 25014 }, { "epoch": 6.59886558501517, "grad_norm": 0.08155340701341629, "learning_rate": 2.2694100061549286e-06, "loss": 0.0004, "step": 25016 }, { "epoch": 6.599393219891835, "grad_norm": 0.010166474618017673, "learning_rate": 2.2690582959641255e-06, "loss": 0.0001, "step": 25018 }, { "epoch": 6.5999208547685, "grad_norm": 0.00662489328533411, "learning_rate": 2.2687065857733225e-06, "loss": 0.0001, "step": 25020 }, { "epoch": 6.600448489645165, "grad_norm": 0.14347833395004272, "learning_rate": 2.2683548755825203e-06, "loss": 0.0002, "step": 25022 }, { "epoch": 6.600976124521831, "grad_norm": 0.009369607083499432, "learning_rate": 2.2680031653917172e-06, "loss": 0.0031, "step": 25024 }, { "epoch": 6.601503759398496, "grad_norm": 0.11378468573093414, "learning_rate": 2.2676514552009146e-06, "loss": 0.0002, "step": 25026 }, { "epoch": 6.602031394275162, "grad_norm": 0.0036901026032865047, "learning_rate": 2.2672997450101116e-06, "loss": 0.0001, "step": 25028 }, { "epoch": 6.602559029151827, "grad_norm": 0.004511919338256121, "learning_rate": 2.266948034819309e-06, "loss": 0.0001, "step": 25030 }, { "epoch": 6.603086664028492, "grad_norm": 0.0017991603817790747, "learning_rate": 2.266596324628506e-06, "loss": 0.0024, "step": 25032 }, { "epoch": 6.603614298905158, "grad_norm": 0.025598665699362755, "learning_rate": 2.2662446144377033e-06, "loss": 0.0001, "step": 25034 }, { "epoch": 6.604141933781823, "grad_norm": 0.002094130264595151, "learning_rate": 2.2658929042469006e-06, "loss": 0.0001, "step": 25036 }, { "epoch": 6.604669568658489, "grad_norm": 0.007187253795564175, "learning_rate": 2.265541194056098e-06, "loss": 0.0001, "step": 25038 }, { "epoch": 6.605197203535154, "grad_norm": 0.0055595701560378075, "learning_rate": 2.265189483865295e-06, "loss": 0.0044, "step": 25040 }, { "epoch": 6.605724838411819, "grad_norm": 0.014368679374456406, "learning_rate": 2.2648377736744924e-06, "loss": 0.0009, "step": 25042 }, { "epoch": 6.606252473288484, "grad_norm": 0.03386814892292023, "learning_rate": 2.2644860634836893e-06, "loss": 0.0001, "step": 25044 }, { "epoch": 6.606780108165149, "grad_norm": 0.009864460676908493, "learning_rate": 2.2641343532928867e-06, "loss": 0.0002, "step": 25046 }, { "epoch": 6.607307743041815, "grad_norm": 0.17131763696670532, "learning_rate": 2.263782643102084e-06, "loss": 0.0003, "step": 25048 }, { "epoch": 6.60783537791848, "grad_norm": 0.01041149441152811, "learning_rate": 2.263430932911281e-06, "loss": 0.0001, "step": 25050 }, { "epoch": 6.608363012795146, "grad_norm": 0.21577797830104828, "learning_rate": 2.2630792227204784e-06, "loss": 0.0015, "step": 25052 }, { "epoch": 6.608890647671811, "grad_norm": 0.07650193572044373, "learning_rate": 2.2627275125296753e-06, "loss": 0.0002, "step": 25054 }, { "epoch": 6.609418282548477, "grad_norm": 0.05970126762986183, "learning_rate": 2.2623758023388727e-06, "loss": 0.0002, "step": 25056 }, { "epoch": 6.609945917425142, "grad_norm": 0.24989727139472961, "learning_rate": 2.26202409214807e-06, "loss": 0.0068, "step": 25058 }, { "epoch": 6.610473552301807, "grad_norm": 0.5174506306648254, "learning_rate": 2.2616723819572675e-06, "loss": 0.0005, "step": 25060 }, { "epoch": 6.611001187178473, "grad_norm": 0.002668547909706831, "learning_rate": 2.2613206717664644e-06, "loss": 0.0001, "step": 25062 }, { "epoch": 6.611528822055138, "grad_norm": 0.008500274270772934, "learning_rate": 2.260968961575662e-06, "loss": 0.0002, "step": 25064 }, { "epoch": 6.612056456931803, "grad_norm": 0.0018961814930662513, "learning_rate": 2.2606172513848587e-06, "loss": 0.0001, "step": 25066 }, { "epoch": 6.612584091808468, "grad_norm": 0.04413291811943054, "learning_rate": 2.260265541194056e-06, "loss": 0.0001, "step": 25068 }, { "epoch": 6.613111726685134, "grad_norm": 0.6231346130371094, "learning_rate": 2.2599138310032535e-06, "loss": 0.0047, "step": 25070 }, { "epoch": 6.613639361561799, "grad_norm": 0.4408589005470276, "learning_rate": 2.259562120812451e-06, "loss": 0.0006, "step": 25072 }, { "epoch": 6.614166996438465, "grad_norm": 0.7591952085494995, "learning_rate": 2.259210410621648e-06, "loss": 0.0092, "step": 25074 }, { "epoch": 6.61469463131513, "grad_norm": 0.006281024310737848, "learning_rate": 2.258858700430845e-06, "loss": 0.0001, "step": 25076 }, { "epoch": 6.615222266191795, "grad_norm": 0.028795961290597916, "learning_rate": 2.258506990240042e-06, "loss": 0.0002, "step": 25078 }, { "epoch": 6.615749901068461, "grad_norm": 0.01730194129049778, "learning_rate": 2.2581552800492395e-06, "loss": 0.0026, "step": 25080 }, { "epoch": 6.616277535945126, "grad_norm": 0.023842716589570045, "learning_rate": 2.257803569858437e-06, "loss": 0.0001, "step": 25082 }, { "epoch": 6.616805170821792, "grad_norm": 0.006101137492805719, "learning_rate": 2.257451859667634e-06, "loss": 0.0001, "step": 25084 }, { "epoch": 6.617332805698457, "grad_norm": 0.09302935004234314, "learning_rate": 2.2571001494768312e-06, "loss": 0.0002, "step": 25086 }, { "epoch": 6.617860440575122, "grad_norm": 0.03236936405301094, "learning_rate": 2.256748439286028e-06, "loss": 0.0002, "step": 25088 }, { "epoch": 6.618388075451787, "grad_norm": 0.001997058978304267, "learning_rate": 2.2563967290952256e-06, "loss": 0.0001, "step": 25090 }, { "epoch": 6.618915710328452, "grad_norm": 0.015143131837248802, "learning_rate": 2.2560450189044225e-06, "loss": 0.0003, "step": 25092 }, { "epoch": 6.619443345205118, "grad_norm": 0.0017432362074032426, "learning_rate": 2.2556933087136203e-06, "loss": 0.0001, "step": 25094 }, { "epoch": 6.619970980081783, "grad_norm": 0.010603692382574081, "learning_rate": 2.2553415985228173e-06, "loss": 0.0001, "step": 25096 }, { "epoch": 6.620498614958449, "grad_norm": 0.0019833575934171677, "learning_rate": 2.2549898883320146e-06, "loss": 0.0001, "step": 25098 }, { "epoch": 6.621026249835114, "grad_norm": 0.001356900087557733, "learning_rate": 2.2546381781412116e-06, "loss": 0.0002, "step": 25100 }, { "epoch": 6.62155388471178, "grad_norm": 0.0069925859570503235, "learning_rate": 2.254286467950409e-06, "loss": 0.0001, "step": 25102 }, { "epoch": 6.622081519588445, "grad_norm": 0.0024285395629704, "learning_rate": 2.253934757759606e-06, "loss": 0.0001, "step": 25104 }, { "epoch": 6.62260915446511, "grad_norm": 0.014531601220369339, "learning_rate": 2.2535830475688037e-06, "loss": 0.0001, "step": 25106 }, { "epoch": 6.623136789341776, "grad_norm": 0.061107952147722244, "learning_rate": 2.2532313373780007e-06, "loss": 0.0002, "step": 25108 }, { "epoch": 6.623664424218441, "grad_norm": 0.010379553772509098, "learning_rate": 2.2528796271871976e-06, "loss": 0.0051, "step": 25110 }, { "epoch": 6.624192059095106, "grad_norm": 0.16378721594810486, "learning_rate": 2.252527916996395e-06, "loss": 0.0044, "step": 25112 }, { "epoch": 6.624719693971771, "grad_norm": 0.14643216133117676, "learning_rate": 2.252176206805592e-06, "loss": 0.0003, "step": 25114 }, { "epoch": 6.625247328848437, "grad_norm": 0.009572647511959076, "learning_rate": 2.2518244966147893e-06, "loss": 0.0001, "step": 25116 }, { "epoch": 6.625774963725102, "grad_norm": 0.012273963540792465, "learning_rate": 2.2514727864239867e-06, "loss": 0.0001, "step": 25118 }, { "epoch": 6.626302598601768, "grad_norm": 0.35759371519088745, "learning_rate": 2.251121076233184e-06, "loss": 0.0045, "step": 25120 }, { "epoch": 6.626830233478433, "grad_norm": 0.05741220712661743, "learning_rate": 2.250769366042381e-06, "loss": 0.0002, "step": 25122 }, { "epoch": 6.627357868355098, "grad_norm": 0.011330678127706051, "learning_rate": 2.2504176558515784e-06, "loss": 0.0002, "step": 25124 }, { "epoch": 6.627885503231764, "grad_norm": 0.6213687658309937, "learning_rate": 2.2500659456607754e-06, "loss": 0.0004, "step": 25126 }, { "epoch": 6.628413138108429, "grad_norm": 0.013991067185997963, "learning_rate": 2.2497142354699727e-06, "loss": 0.0001, "step": 25128 }, { "epoch": 6.628940772985095, "grad_norm": 0.09993631392717361, "learning_rate": 2.24936252527917e-06, "loss": 0.0003, "step": 25130 }, { "epoch": 6.62946840786176, "grad_norm": 0.0030004859436303377, "learning_rate": 2.2490108150883675e-06, "loss": 0.0001, "step": 25132 }, { "epoch": 6.629996042738425, "grad_norm": 0.07522979378700256, "learning_rate": 2.2486591048975644e-06, "loss": 0.0034, "step": 25134 }, { "epoch": 6.63052367761509, "grad_norm": 0.16121706366539001, "learning_rate": 2.248307394706762e-06, "loss": 0.0004, "step": 25136 }, { "epoch": 6.631051312491755, "grad_norm": 0.018710805103182793, "learning_rate": 2.2479556845159588e-06, "loss": 0.0001, "step": 25138 }, { "epoch": 6.631578947368421, "grad_norm": 0.21721409261226654, "learning_rate": 2.247603974325156e-06, "loss": 0.0007, "step": 25140 }, { "epoch": 6.632106582245086, "grad_norm": 0.02476445771753788, "learning_rate": 2.2472522641343535e-06, "loss": 0.0002, "step": 25142 }, { "epoch": 6.632634217121752, "grad_norm": 0.0030318014323711395, "learning_rate": 2.2469005539435505e-06, "loss": 0.0002, "step": 25144 }, { "epoch": 6.633161851998417, "grad_norm": 0.0065431660041213036, "learning_rate": 2.246548843752748e-06, "loss": 0.0028, "step": 25146 }, { "epoch": 6.633689486875083, "grad_norm": 0.0037981607019901276, "learning_rate": 2.246197133561945e-06, "loss": 0.0037, "step": 25148 }, { "epoch": 6.634217121751748, "grad_norm": 0.00512727489694953, "learning_rate": 2.245845423371142e-06, "loss": 0.0005, "step": 25150 }, { "epoch": 6.634744756628413, "grad_norm": 0.002648410852998495, "learning_rate": 2.2454937131803396e-06, "loss": 0.0001, "step": 25152 }, { "epoch": 6.635272391505079, "grad_norm": 0.0013157484354451299, "learning_rate": 2.245142002989537e-06, "loss": 0.0001, "step": 25154 }, { "epoch": 6.635800026381744, "grad_norm": 0.0165381021797657, "learning_rate": 2.244790292798734e-06, "loss": 0.0008, "step": 25156 }, { "epoch": 6.6363276612584094, "grad_norm": 0.010942978784441948, "learning_rate": 2.2444385826079313e-06, "loss": 0.0002, "step": 25158 }, { "epoch": 6.636855296135074, "grad_norm": 0.22390542924404144, "learning_rate": 2.2440868724171282e-06, "loss": 0.0004, "step": 25160 }, { "epoch": 6.63738293101174, "grad_norm": 0.0151955122128129, "learning_rate": 2.2437351622263256e-06, "loss": 0.0001, "step": 25162 }, { "epoch": 6.637910565888405, "grad_norm": 0.011130303144454956, "learning_rate": 2.243383452035523e-06, "loss": 0.0001, "step": 25164 }, { "epoch": 6.63843820076507, "grad_norm": 0.001054677995853126, "learning_rate": 2.2430317418447203e-06, "loss": 0.0001, "step": 25166 }, { "epoch": 6.638965835641736, "grad_norm": 0.0019453467102721334, "learning_rate": 2.2426800316539173e-06, "loss": 0.0001, "step": 25168 }, { "epoch": 6.639493470518401, "grad_norm": 0.038916755467653275, "learning_rate": 2.2423283214631147e-06, "loss": 0.0001, "step": 25170 }, { "epoch": 6.640021105395067, "grad_norm": 0.0035347372759133577, "learning_rate": 2.2419766112723116e-06, "loss": 0.0008, "step": 25172 }, { "epoch": 6.640548740271732, "grad_norm": 0.0813651829957962, "learning_rate": 2.2416249010815086e-06, "loss": 0.002, "step": 25174 }, { "epoch": 6.641076375148398, "grad_norm": 0.05705543980002403, "learning_rate": 2.241273190890706e-06, "loss": 0.0002, "step": 25176 }, { "epoch": 6.641604010025063, "grad_norm": 0.0017133142100647092, "learning_rate": 2.2409214806999033e-06, "loss": 0.0001, "step": 25178 }, { "epoch": 6.642131644901728, "grad_norm": 0.007638080045580864, "learning_rate": 2.2405697705091007e-06, "loss": 0.0001, "step": 25180 }, { "epoch": 6.642659279778393, "grad_norm": 0.011119011789560318, "learning_rate": 2.2402180603182977e-06, "loss": 0.0001, "step": 25182 }, { "epoch": 6.643186914655058, "grad_norm": 0.016723839566111565, "learning_rate": 2.239866350127495e-06, "loss": 0.0001, "step": 25184 }, { "epoch": 6.643714549531724, "grad_norm": 0.001358303939923644, "learning_rate": 2.239514639936692e-06, "loss": 0.0001, "step": 25186 }, { "epoch": 6.644242184408389, "grad_norm": 0.001004787627607584, "learning_rate": 2.2391629297458894e-06, "loss": 0.0001, "step": 25188 }, { "epoch": 6.644769819285055, "grad_norm": 0.000610293704085052, "learning_rate": 2.2388112195550867e-06, "loss": 0.0001, "step": 25190 }, { "epoch": 6.64529745416172, "grad_norm": 0.0016808313084766269, "learning_rate": 2.238459509364284e-06, "loss": 0.0001, "step": 25192 }, { "epoch": 6.645825089038386, "grad_norm": 0.0025261440314352512, "learning_rate": 2.238107799173481e-06, "loss": 0.0001, "step": 25194 }, { "epoch": 6.646352723915051, "grad_norm": 0.20213842391967773, "learning_rate": 2.2377560889826784e-06, "loss": 0.0006, "step": 25196 }, { "epoch": 6.646880358791716, "grad_norm": 0.009278210811316967, "learning_rate": 2.2374043787918754e-06, "loss": 0.0001, "step": 25198 }, { "epoch": 6.647407993668382, "grad_norm": 0.05814163014292717, "learning_rate": 2.2370526686010728e-06, "loss": 0.0001, "step": 25200 }, { "epoch": 6.647935628545047, "grad_norm": 0.0014878881629556417, "learning_rate": 2.23670095841027e-06, "loss": 0.0001, "step": 25202 }, { "epoch": 6.6484632634217125, "grad_norm": 0.00638541067019105, "learning_rate": 2.236349248219467e-06, "loss": 0.0001, "step": 25204 }, { "epoch": 6.648990898298377, "grad_norm": 0.004146184306591749, "learning_rate": 2.2359975380286645e-06, "loss": 0.0001, "step": 25206 }, { "epoch": 6.649518533175042, "grad_norm": 0.036070361733436584, "learning_rate": 2.2356458278378614e-06, "loss": 0.0021, "step": 25208 }, { "epoch": 6.650046168051708, "grad_norm": 0.005387904588133097, "learning_rate": 2.235294117647059e-06, "loss": 0.0001, "step": 25210 }, { "epoch": 6.650573802928373, "grad_norm": 0.0013254546793177724, "learning_rate": 2.234942407456256e-06, "loss": 0.0026, "step": 25212 }, { "epoch": 6.651101437805039, "grad_norm": 0.3395769000053406, "learning_rate": 2.2345906972654535e-06, "loss": 0.0058, "step": 25214 }, { "epoch": 6.651629072681704, "grad_norm": 0.6366004347801208, "learning_rate": 2.2342389870746505e-06, "loss": 0.0015, "step": 25216 }, { "epoch": 6.65215670755837, "grad_norm": 0.002677993383258581, "learning_rate": 2.233887276883848e-06, "loss": 0.0001, "step": 25218 }, { "epoch": 6.652684342435035, "grad_norm": 0.001342460629530251, "learning_rate": 2.233535566693045e-06, "loss": 0.0001, "step": 25220 }, { "epoch": 6.653211977311701, "grad_norm": 0.02433677762746811, "learning_rate": 2.233183856502242e-06, "loss": 0.0008, "step": 25222 }, { "epoch": 6.653739612188366, "grad_norm": 0.0028876308351755142, "learning_rate": 2.2328321463114396e-06, "loss": 0.0001, "step": 25224 }, { "epoch": 6.654267247065031, "grad_norm": 0.002252599224448204, "learning_rate": 2.232480436120637e-06, "loss": 0.0001, "step": 25226 }, { "epoch": 6.654794881941696, "grad_norm": 0.038208600133657455, "learning_rate": 2.232128725929834e-06, "loss": 0.0034, "step": 25228 }, { "epoch": 6.655322516818361, "grad_norm": 0.004243816714733839, "learning_rate": 2.2317770157390313e-06, "loss": 0.0038, "step": 25230 }, { "epoch": 6.655850151695027, "grad_norm": 1.7621636390686035, "learning_rate": 2.2314253055482282e-06, "loss": 0.0069, "step": 25232 }, { "epoch": 6.656377786571692, "grad_norm": 0.0067041190341115, "learning_rate": 2.231073595357425e-06, "loss": 0.0003, "step": 25234 }, { "epoch": 6.656905421448358, "grad_norm": 0.017061829566955566, "learning_rate": 2.230721885166623e-06, "loss": 0.0001, "step": 25236 }, { "epoch": 6.657433056325023, "grad_norm": 0.008295160718262196, "learning_rate": 2.23037017497582e-06, "loss": 0.0001, "step": 25238 }, { "epoch": 6.657960691201689, "grad_norm": 0.3895286023616791, "learning_rate": 2.2300184647850173e-06, "loss": 0.0009, "step": 25240 }, { "epoch": 6.658488326078354, "grad_norm": 0.03744891285896301, "learning_rate": 2.2296667545942143e-06, "loss": 0.0002, "step": 25242 }, { "epoch": 6.659015960955019, "grad_norm": 0.04487629979848862, "learning_rate": 2.2293150444034116e-06, "loss": 0.0002, "step": 25244 }, { "epoch": 6.659543595831685, "grad_norm": 0.018918069079518318, "learning_rate": 2.2289633342126086e-06, "loss": 0.0002, "step": 25246 }, { "epoch": 6.66007123070835, "grad_norm": 0.04204045608639717, "learning_rate": 2.228611624021806e-06, "loss": 0.0001, "step": 25248 }, { "epoch": 6.6605988655850155, "grad_norm": 0.07940749824047089, "learning_rate": 2.2282599138310033e-06, "loss": 0.0002, "step": 25250 }, { "epoch": 6.66112650046168, "grad_norm": 1.6262685060501099, "learning_rate": 2.2279082036402007e-06, "loss": 0.0033, "step": 25252 }, { "epoch": 6.661654135338345, "grad_norm": 0.14919623732566833, "learning_rate": 2.2275564934493977e-06, "loss": 0.0003, "step": 25254 }, { "epoch": 6.662181770215011, "grad_norm": 0.010930788703262806, "learning_rate": 2.227204783258595e-06, "loss": 0.0001, "step": 25256 }, { "epoch": 6.662709405091676, "grad_norm": 0.01088373176753521, "learning_rate": 2.226853073067792e-06, "loss": 0.0001, "step": 25258 }, { "epoch": 6.663237039968342, "grad_norm": 0.001457139034755528, "learning_rate": 2.2265013628769894e-06, "loss": 0.0053, "step": 25260 }, { "epoch": 6.663764674845007, "grad_norm": 0.0011102849384769797, "learning_rate": 2.2261496526861868e-06, "loss": 0.0078, "step": 25262 }, { "epoch": 6.664292309721673, "grad_norm": 0.0026323029305785894, "learning_rate": 2.2257979424953837e-06, "loss": 0.0001, "step": 25264 }, { "epoch": 6.664819944598338, "grad_norm": 0.010931986384093761, "learning_rate": 2.225446232304581e-06, "loss": 0.0001, "step": 25266 }, { "epoch": 6.665347579475004, "grad_norm": 0.004092552233487368, "learning_rate": 2.225094522113778e-06, "loss": 0.0003, "step": 25268 }, { "epoch": 6.665875214351669, "grad_norm": 0.002364298328757286, "learning_rate": 2.2247428119229754e-06, "loss": 0.0001, "step": 25270 }, { "epoch": 6.666402849228334, "grad_norm": 0.6835140585899353, "learning_rate": 2.224391101732173e-06, "loss": 0.0046, "step": 25272 }, { "epoch": 6.666930484104999, "grad_norm": 0.0010925012174993753, "learning_rate": 2.22403939154137e-06, "loss": 0.0001, "step": 25274 }, { "epoch": 6.667458118981664, "grad_norm": 0.0022618588991463184, "learning_rate": 2.223687681350567e-06, "loss": 0.0001, "step": 25276 }, { "epoch": 6.66798575385833, "grad_norm": 0.0019254550570622087, "learning_rate": 2.2233359711597645e-06, "loss": 0.0001, "step": 25278 }, { "epoch": 6.668513388734995, "grad_norm": 0.22347386181354523, "learning_rate": 2.2229842609689614e-06, "loss": 0.0008, "step": 25280 }, { "epoch": 6.669041023611661, "grad_norm": 0.01957411877810955, "learning_rate": 2.222632550778159e-06, "loss": 0.0001, "step": 25282 }, { "epoch": 6.669568658488326, "grad_norm": 0.002566161099821329, "learning_rate": 2.222280840587356e-06, "loss": 0.0001, "step": 25284 }, { "epoch": 6.670096293364992, "grad_norm": 0.0016948606353253126, "learning_rate": 2.2219291303965536e-06, "loss": 0.0001, "step": 25286 }, { "epoch": 6.670623928241657, "grad_norm": 0.0019129043212160468, "learning_rate": 2.2215774202057505e-06, "loss": 0.0001, "step": 25288 }, { "epoch": 6.671151563118322, "grad_norm": 0.01682506687939167, "learning_rate": 2.221225710014948e-06, "loss": 0.0003, "step": 25290 }, { "epoch": 6.671679197994988, "grad_norm": 0.6796988844871521, "learning_rate": 2.220873999824145e-06, "loss": 0.0027, "step": 25292 }, { "epoch": 6.672206832871653, "grad_norm": 0.001420779386535287, "learning_rate": 2.2205222896333422e-06, "loss": 0.0001, "step": 25294 }, { "epoch": 6.6727344677483185, "grad_norm": 0.050813112407922745, "learning_rate": 2.2201705794425396e-06, "loss": 0.0002, "step": 25296 }, { "epoch": 6.673262102624983, "grad_norm": 0.024705292657017708, "learning_rate": 2.2198188692517366e-06, "loss": 0.0001, "step": 25298 }, { "epoch": 6.673789737501648, "grad_norm": 0.004392167553305626, "learning_rate": 2.219467159060934e-06, "loss": 0.0001, "step": 25300 }, { "epoch": 6.674317372378314, "grad_norm": 0.006979052443057299, "learning_rate": 2.219115448870131e-06, "loss": 0.0001, "step": 25302 }, { "epoch": 6.674845007254979, "grad_norm": 0.00428793765604496, "learning_rate": 2.2187637386793283e-06, "loss": 0.0001, "step": 25304 }, { "epoch": 6.675372642131645, "grad_norm": 0.0010353174293413758, "learning_rate": 2.2184120284885252e-06, "loss": 0.0001, "step": 25306 }, { "epoch": 6.67590027700831, "grad_norm": 0.016565702855587006, "learning_rate": 2.218060318297723e-06, "loss": 0.0001, "step": 25308 }, { "epoch": 6.676427911884976, "grad_norm": 0.4869631826877594, "learning_rate": 2.21770860810692e-06, "loss": 0.0051, "step": 25310 }, { "epoch": 6.676955546761641, "grad_norm": 0.09728643298149109, "learning_rate": 2.2173568979161173e-06, "loss": 0.0031, "step": 25312 }, { "epoch": 6.677483181638307, "grad_norm": 0.31913086771965027, "learning_rate": 2.2170051877253143e-06, "loss": 0.0007, "step": 25314 }, { "epoch": 6.678010816514972, "grad_norm": 0.013363005593419075, "learning_rate": 2.2166534775345117e-06, "loss": 0.0001, "step": 25316 }, { "epoch": 6.678538451391637, "grad_norm": 0.004232555162161589, "learning_rate": 2.2163017673437086e-06, "loss": 0.0001, "step": 25318 }, { "epoch": 6.6790660862683024, "grad_norm": 0.016700906679034233, "learning_rate": 2.2159500571529064e-06, "loss": 0.0001, "step": 25320 }, { "epoch": 6.679593721144967, "grad_norm": 0.0026396350003778934, "learning_rate": 2.2155983469621034e-06, "loss": 0.0009, "step": 25322 }, { "epoch": 6.680121356021633, "grad_norm": 0.06882402300834656, "learning_rate": 2.2152466367713003e-06, "loss": 0.0002, "step": 25324 }, { "epoch": 6.680648990898298, "grad_norm": 0.00880223885178566, "learning_rate": 2.2148949265804977e-06, "loss": 0.0053, "step": 25326 }, { "epoch": 6.681176625774964, "grad_norm": 0.03243421018123627, "learning_rate": 2.2145432163896947e-06, "loss": 0.0002, "step": 25328 }, { "epoch": 6.681704260651629, "grad_norm": 0.01491599902510643, "learning_rate": 2.214191506198892e-06, "loss": 0.0002, "step": 25330 }, { "epoch": 6.682231895528295, "grad_norm": 0.2606843411922455, "learning_rate": 2.2138397960080894e-06, "loss": 0.0005, "step": 25332 }, { "epoch": 6.68275953040496, "grad_norm": 0.092954620718956, "learning_rate": 2.2134880858172868e-06, "loss": 0.0022, "step": 25334 }, { "epoch": 6.683287165281625, "grad_norm": 0.00683532515540719, "learning_rate": 2.2131363756264837e-06, "loss": 0.0001, "step": 25336 }, { "epoch": 6.683814800158291, "grad_norm": 0.0029479756485670805, "learning_rate": 2.212784665435681e-06, "loss": 0.0001, "step": 25338 }, { "epoch": 6.684342435034956, "grad_norm": 0.016336288303136826, "learning_rate": 2.212432955244878e-06, "loss": 0.0002, "step": 25340 }, { "epoch": 6.6848700699116215, "grad_norm": 0.06681805104017258, "learning_rate": 2.2120812450540754e-06, "loss": 0.0018, "step": 25342 }, { "epoch": 6.685397704788286, "grad_norm": 0.009821174666285515, "learning_rate": 2.211729534863273e-06, "loss": 0.0003, "step": 25344 }, { "epoch": 6.685925339664951, "grad_norm": 0.013078884221613407, "learning_rate": 2.21137782467247e-06, "loss": 0.0001, "step": 25346 }, { "epoch": 6.686452974541617, "grad_norm": 0.004651060327887535, "learning_rate": 2.211026114481667e-06, "loss": 0.0001, "step": 25348 }, { "epoch": 6.686980609418282, "grad_norm": 0.0028051023837178946, "learning_rate": 2.2106744042908645e-06, "loss": 0.0001, "step": 25350 }, { "epoch": 6.687508244294948, "grad_norm": 0.0031055565923452377, "learning_rate": 2.2103226941000615e-06, "loss": 0.0001, "step": 25352 }, { "epoch": 6.688035879171613, "grad_norm": 0.004621447529643774, "learning_rate": 2.209970983909259e-06, "loss": 0.0001, "step": 25354 }, { "epoch": 6.688563514048279, "grad_norm": 0.01607828587293625, "learning_rate": 2.2096192737184562e-06, "loss": 0.0001, "step": 25356 }, { "epoch": 6.689091148924944, "grad_norm": 0.002342631807550788, "learning_rate": 2.209267563527653e-06, "loss": 0.0001, "step": 25358 }, { "epoch": 6.68961878380161, "grad_norm": 0.6656019687652588, "learning_rate": 2.2089158533368506e-06, "loss": 0.0007, "step": 25360 }, { "epoch": 6.690146418678275, "grad_norm": 0.003061227034777403, "learning_rate": 2.2085641431460475e-06, "loss": 0.0001, "step": 25362 }, { "epoch": 6.69067405355494, "grad_norm": 0.13538582623004913, "learning_rate": 2.208212432955245e-06, "loss": 0.0004, "step": 25364 }, { "epoch": 6.6912016884316055, "grad_norm": 0.022857626900076866, "learning_rate": 2.2078607227644423e-06, "loss": 0.0002, "step": 25366 }, { "epoch": 6.69172932330827, "grad_norm": 0.042399656027555466, "learning_rate": 2.2075090125736396e-06, "loss": 0.0002, "step": 25368 }, { "epoch": 6.692256958184936, "grad_norm": 0.03178839385509491, "learning_rate": 2.2071573023828366e-06, "loss": 0.0002, "step": 25370 }, { "epoch": 6.692784593061601, "grad_norm": 0.004508879501372576, "learning_rate": 2.206805592192034e-06, "loss": 0.0001, "step": 25372 }, { "epoch": 6.693312227938267, "grad_norm": 0.36757034063339233, "learning_rate": 2.206453882001231e-06, "loss": 0.0011, "step": 25374 }, { "epoch": 6.693839862814932, "grad_norm": 0.037605270743370056, "learning_rate": 2.2061021718104283e-06, "loss": 0.0002, "step": 25376 }, { "epoch": 6.694367497691598, "grad_norm": 0.005494963377714157, "learning_rate": 2.2057504616196257e-06, "loss": 0.0001, "step": 25378 }, { "epoch": 6.694895132568263, "grad_norm": 0.3520253300666809, "learning_rate": 2.205398751428823e-06, "loss": 0.0046, "step": 25380 }, { "epoch": 6.695422767444928, "grad_norm": 0.0016269789775833488, "learning_rate": 2.20504704123802e-06, "loss": 0.0001, "step": 25382 }, { "epoch": 6.695950402321594, "grad_norm": 0.018204346299171448, "learning_rate": 2.204695331047217e-06, "loss": 0.0001, "step": 25384 }, { "epoch": 6.696478037198259, "grad_norm": 0.009404120035469532, "learning_rate": 2.2043436208564143e-06, "loss": 0.0001, "step": 25386 }, { "epoch": 6.6970056720749245, "grad_norm": 0.07633911073207855, "learning_rate": 2.2039919106656113e-06, "loss": 0.0008, "step": 25388 }, { "epoch": 6.697533306951589, "grad_norm": 0.0025448875967413187, "learning_rate": 2.2036402004748086e-06, "loss": 0.0001, "step": 25390 }, { "epoch": 6.698060941828254, "grad_norm": 0.04227033630013466, "learning_rate": 2.203288490284006e-06, "loss": 0.003, "step": 25392 }, { "epoch": 6.69858857670492, "grad_norm": 0.2896161377429962, "learning_rate": 2.2029367800932034e-06, "loss": 0.0029, "step": 25394 }, { "epoch": 6.699116211581585, "grad_norm": 0.017407888546586037, "learning_rate": 2.2025850699024004e-06, "loss": 0.0003, "step": 25396 }, { "epoch": 6.699643846458251, "grad_norm": 0.0038638918194919825, "learning_rate": 2.2022333597115977e-06, "loss": 0.0001, "step": 25398 }, { "epoch": 6.700171481334916, "grad_norm": 0.08462801575660706, "learning_rate": 2.2018816495207947e-06, "loss": 0.0018, "step": 25400 }, { "epoch": 6.700699116211582, "grad_norm": 0.030916763469576836, "learning_rate": 2.201529939329992e-06, "loss": 0.0002, "step": 25402 }, { "epoch": 6.701226751088247, "grad_norm": 0.01064396183937788, "learning_rate": 2.2011782291391894e-06, "loss": 0.0001, "step": 25404 }, { "epoch": 6.701754385964913, "grad_norm": 0.08245538175106049, "learning_rate": 2.200826518948387e-06, "loss": 0.0002, "step": 25406 }, { "epoch": 6.702282020841578, "grad_norm": 0.0008850188460201025, "learning_rate": 2.2004748087575838e-06, "loss": 0.0014, "step": 25408 }, { "epoch": 6.702809655718243, "grad_norm": 0.025911061093211174, "learning_rate": 2.200123098566781e-06, "loss": 0.0002, "step": 25410 }, { "epoch": 6.7033372905949085, "grad_norm": 0.001241758232936263, "learning_rate": 2.199771388375978e-06, "loss": 0.0001, "step": 25412 }, { "epoch": 6.703864925471573, "grad_norm": 0.34042835235595703, "learning_rate": 2.1994196781851755e-06, "loss": 0.0031, "step": 25414 }, { "epoch": 6.704392560348239, "grad_norm": 0.04976823925971985, "learning_rate": 2.199067967994373e-06, "loss": 0.0003, "step": 25416 }, { "epoch": 6.704920195224904, "grad_norm": 0.006039894185960293, "learning_rate": 2.19871625780357e-06, "loss": 0.0001, "step": 25418 }, { "epoch": 6.70544783010157, "grad_norm": 0.01795286126434803, "learning_rate": 2.198364547612767e-06, "loss": 0.0007, "step": 25420 }, { "epoch": 6.705975464978235, "grad_norm": 0.008145052939653397, "learning_rate": 2.198012837421964e-06, "loss": 0.0001, "step": 25422 }, { "epoch": 6.706503099854901, "grad_norm": 0.003549798857420683, "learning_rate": 2.1976611272311615e-06, "loss": 0.0001, "step": 25424 }, { "epoch": 6.707030734731566, "grad_norm": 0.01181965135037899, "learning_rate": 2.197309417040359e-06, "loss": 0.0012, "step": 25426 }, { "epoch": 6.707558369608231, "grad_norm": 0.0014499693643301725, "learning_rate": 2.1969577068495562e-06, "loss": 0.0001, "step": 25428 }, { "epoch": 6.708086004484897, "grad_norm": 0.004020622931420803, "learning_rate": 2.196605996658753e-06, "loss": 0.0021, "step": 25430 }, { "epoch": 6.708613639361562, "grad_norm": 0.010169806890189648, "learning_rate": 2.1962542864679506e-06, "loss": 0.0001, "step": 25432 }, { "epoch": 6.7091412742382275, "grad_norm": 0.0012770711909979582, "learning_rate": 2.1959025762771475e-06, "loss": 0.0004, "step": 25434 }, { "epoch": 6.709668909114892, "grad_norm": 0.01431481447070837, "learning_rate": 2.195550866086345e-06, "loss": 0.0004, "step": 25436 }, { "epoch": 6.710196543991557, "grad_norm": 0.00683228112757206, "learning_rate": 2.1951991558955423e-06, "loss": 0.0001, "step": 25438 }, { "epoch": 6.710724178868223, "grad_norm": 0.0054540326818823814, "learning_rate": 2.1948474457047397e-06, "loss": 0.0005, "step": 25440 }, { "epoch": 6.711251813744888, "grad_norm": 0.030562683939933777, "learning_rate": 2.1944957355139366e-06, "loss": 0.0007, "step": 25442 }, { "epoch": 6.711779448621554, "grad_norm": 0.2542096972465515, "learning_rate": 2.1941440253231336e-06, "loss": 0.0003, "step": 25444 }, { "epoch": 6.712307083498219, "grad_norm": 0.02782304212450981, "learning_rate": 2.193792315132331e-06, "loss": 0.0002, "step": 25446 }, { "epoch": 6.712834718374885, "grad_norm": 0.004307001829147339, "learning_rate": 2.193440604941528e-06, "loss": 0.0001, "step": 25448 }, { "epoch": 6.71336235325155, "grad_norm": 0.11836110055446625, "learning_rate": 2.1930888947507257e-06, "loss": 0.0011, "step": 25450 }, { "epoch": 6.713889988128216, "grad_norm": 0.007527939043939114, "learning_rate": 2.1927371845599226e-06, "loss": 0.0001, "step": 25452 }, { "epoch": 6.714417623004881, "grad_norm": 0.3439792990684509, "learning_rate": 2.19238547436912e-06, "loss": 0.0022, "step": 25454 }, { "epoch": 6.714945257881546, "grad_norm": 0.0034420222509652376, "learning_rate": 2.192033764178317e-06, "loss": 0.0001, "step": 25456 }, { "epoch": 6.7154728927582115, "grad_norm": 0.0010485207894816995, "learning_rate": 2.1916820539875143e-06, "loss": 0.0001, "step": 25458 }, { "epoch": 6.716000527634876, "grad_norm": 0.0356484055519104, "learning_rate": 2.1913303437967113e-06, "loss": 0.0002, "step": 25460 }, { "epoch": 6.716528162511542, "grad_norm": 0.09459470212459564, "learning_rate": 2.190978633605909e-06, "loss": 0.0002, "step": 25462 }, { "epoch": 6.717055797388207, "grad_norm": 0.04459404945373535, "learning_rate": 2.190626923415106e-06, "loss": 0.0005, "step": 25464 }, { "epoch": 6.717583432264873, "grad_norm": 0.06807012856006622, "learning_rate": 2.1902752132243034e-06, "loss": 0.0003, "step": 25466 }, { "epoch": 6.718111067141538, "grad_norm": 0.06055976822972298, "learning_rate": 2.1899235030335004e-06, "loss": 0.0002, "step": 25468 }, { "epoch": 6.718638702018204, "grad_norm": 0.4798453450202942, "learning_rate": 2.1895717928426978e-06, "loss": 0.0015, "step": 25470 }, { "epoch": 6.719166336894869, "grad_norm": 0.09867528825998306, "learning_rate": 2.1892200826518947e-06, "loss": 0.0017, "step": 25472 }, { "epoch": 6.719693971771534, "grad_norm": 0.16878077387809753, "learning_rate": 2.188868372461092e-06, "loss": 0.0005, "step": 25474 }, { "epoch": 6.7202216066482, "grad_norm": 0.029000524431467056, "learning_rate": 2.1885166622702895e-06, "loss": 0.0002, "step": 25476 }, { "epoch": 6.720749241524865, "grad_norm": 0.0057109687477350235, "learning_rate": 2.1881649520794864e-06, "loss": 0.0002, "step": 25478 }, { "epoch": 6.7212768764015305, "grad_norm": 0.060238759964704514, "learning_rate": 2.1878132418886838e-06, "loss": 0.0006, "step": 25480 }, { "epoch": 6.7218045112781954, "grad_norm": 0.00627940334379673, "learning_rate": 2.1874615316978807e-06, "loss": 0.004, "step": 25482 }, { "epoch": 6.72233214615486, "grad_norm": 0.019599171355366707, "learning_rate": 2.187109821507078e-06, "loss": 0.0001, "step": 25484 }, { "epoch": 6.722859781031526, "grad_norm": 0.001320160343311727, "learning_rate": 2.1867581113162755e-06, "loss": 0.0001, "step": 25486 }, { "epoch": 6.723387415908191, "grad_norm": 0.06570825725793839, "learning_rate": 2.186406401125473e-06, "loss": 0.0003, "step": 25488 }, { "epoch": 6.723915050784857, "grad_norm": 0.004768031183630228, "learning_rate": 2.18605469093467e-06, "loss": 0.0001, "step": 25490 }, { "epoch": 6.724442685661522, "grad_norm": 0.16394899785518646, "learning_rate": 2.185702980743867e-06, "loss": 0.001, "step": 25492 }, { "epoch": 6.724970320538188, "grad_norm": 0.0023960170801728964, "learning_rate": 2.185351270553064e-06, "loss": 0.0001, "step": 25494 }, { "epoch": 6.725497955414853, "grad_norm": 0.004093721508979797, "learning_rate": 2.1849995603622615e-06, "loss": 0.0001, "step": 25496 }, { "epoch": 6.726025590291519, "grad_norm": 0.007468594238162041, "learning_rate": 2.184647850171459e-06, "loss": 0.0001, "step": 25498 }, { "epoch": 6.726553225168184, "grad_norm": 0.00912255235016346, "learning_rate": 2.1842961399806563e-06, "loss": 0.0001, "step": 25500 }, { "epoch": 6.727080860044849, "grad_norm": 0.021027229726314545, "learning_rate": 2.1839444297898532e-06, "loss": 0.0001, "step": 25502 }, { "epoch": 6.7276084949215145, "grad_norm": 0.0030549399089068174, "learning_rate": 2.1835927195990506e-06, "loss": 0.0002, "step": 25504 }, { "epoch": 6.728136129798179, "grad_norm": 0.07942583411931992, "learning_rate": 2.1832410094082476e-06, "loss": 0.002, "step": 25506 }, { "epoch": 6.728663764674845, "grad_norm": 0.001507923356257379, "learning_rate": 2.182889299217445e-06, "loss": 0.0001, "step": 25508 }, { "epoch": 6.72919139955151, "grad_norm": 0.004852922633290291, "learning_rate": 2.1825375890266423e-06, "loss": 0.0001, "step": 25510 }, { "epoch": 6.729719034428175, "grad_norm": 0.02334371767938137, "learning_rate": 2.1821858788358393e-06, "loss": 0.0001, "step": 25512 }, { "epoch": 6.730246669304841, "grad_norm": 0.47380101680755615, "learning_rate": 2.1818341686450366e-06, "loss": 0.002, "step": 25514 }, { "epoch": 6.730774304181506, "grad_norm": 0.1205802634358406, "learning_rate": 2.1814824584542336e-06, "loss": 0.0004, "step": 25516 }, { "epoch": 6.731301939058172, "grad_norm": 0.02267593890428543, "learning_rate": 2.181130748263431e-06, "loss": 0.0002, "step": 25518 }, { "epoch": 6.731829573934837, "grad_norm": 0.14874188601970673, "learning_rate": 2.1807790380726283e-06, "loss": 0.0003, "step": 25520 }, { "epoch": 6.732357208811503, "grad_norm": 0.0024485790636390448, "learning_rate": 2.1804273278818257e-06, "loss": 0.0001, "step": 25522 }, { "epoch": 6.732884843688168, "grad_norm": 0.603962779045105, "learning_rate": 2.1800756176910227e-06, "loss": 0.0011, "step": 25524 }, { "epoch": 6.7334124785648335, "grad_norm": 0.004520490299910307, "learning_rate": 2.17972390750022e-06, "loss": 0.0001, "step": 25526 }, { "epoch": 6.7339401134414985, "grad_norm": 0.015108448453247547, "learning_rate": 2.179372197309417e-06, "loss": 0.0001, "step": 25528 }, { "epoch": 6.734467748318163, "grad_norm": 0.027307918295264244, "learning_rate": 2.1790204871186144e-06, "loss": 0.0001, "step": 25530 }, { "epoch": 6.734995383194829, "grad_norm": 0.01508890651166439, "learning_rate": 2.1786687769278113e-06, "loss": 0.0002, "step": 25532 }, { "epoch": 6.735523018071494, "grad_norm": 0.036485686898231506, "learning_rate": 2.178317066737009e-06, "loss": 0.0006, "step": 25534 }, { "epoch": 6.73605065294816, "grad_norm": 0.04346833750605583, "learning_rate": 2.177965356546206e-06, "loss": 0.0003, "step": 25536 }, { "epoch": 6.736578287824825, "grad_norm": 0.04823825880885124, "learning_rate": 2.177613646355403e-06, "loss": 0.0002, "step": 25538 }, { "epoch": 6.737105922701491, "grad_norm": 0.005349102430045605, "learning_rate": 2.1772619361646004e-06, "loss": 0.0001, "step": 25540 }, { "epoch": 6.737633557578156, "grad_norm": 1.5310754776000977, "learning_rate": 2.1769102259737974e-06, "loss": 0.0024, "step": 25542 }, { "epoch": 6.738161192454822, "grad_norm": 0.01962387189269066, "learning_rate": 2.1765585157829947e-06, "loss": 0.0038, "step": 25544 }, { "epoch": 6.738688827331487, "grad_norm": 0.04347217082977295, "learning_rate": 2.176206805592192e-06, "loss": 0.0003, "step": 25546 }, { "epoch": 6.739216462208152, "grad_norm": 0.06716787070035934, "learning_rate": 2.1758550954013895e-06, "loss": 0.0003, "step": 25548 }, { "epoch": 6.7397440970848175, "grad_norm": 0.0013085015816614032, "learning_rate": 2.1755033852105864e-06, "loss": 0.0001, "step": 25550 }, { "epoch": 6.740271731961482, "grad_norm": 0.005654558073729277, "learning_rate": 2.175151675019784e-06, "loss": 0.0001, "step": 25552 }, { "epoch": 6.740799366838148, "grad_norm": 0.018653690814971924, "learning_rate": 2.1747999648289808e-06, "loss": 0.0003, "step": 25554 }, { "epoch": 6.741327001714813, "grad_norm": 0.016571471467614174, "learning_rate": 2.174448254638178e-06, "loss": 0.0001, "step": 25556 }, { "epoch": 6.741854636591478, "grad_norm": 0.4218987822532654, "learning_rate": 2.1740965444473755e-06, "loss": 0.0018, "step": 25558 }, { "epoch": 6.742382271468144, "grad_norm": 0.0034368420019745827, "learning_rate": 2.173744834256573e-06, "loss": 0.0001, "step": 25560 }, { "epoch": 6.742909906344809, "grad_norm": 0.003112730337306857, "learning_rate": 2.17339312406577e-06, "loss": 0.0002, "step": 25562 }, { "epoch": 6.743437541221475, "grad_norm": 0.0017130442429333925, "learning_rate": 2.1730414138749672e-06, "loss": 0.001, "step": 25564 }, { "epoch": 6.74396517609814, "grad_norm": 0.0021629638504236937, "learning_rate": 2.172689703684164e-06, "loss": 0.0001, "step": 25566 }, { "epoch": 6.744492810974806, "grad_norm": 0.0030055155511945486, "learning_rate": 2.1723379934933615e-06, "loss": 0.0001, "step": 25568 }, { "epoch": 6.745020445851471, "grad_norm": 0.0006243981188163161, "learning_rate": 2.171986283302559e-06, "loss": 0.0014, "step": 25570 }, { "epoch": 6.7455480807281365, "grad_norm": 0.002368098357692361, "learning_rate": 2.171634573111756e-06, "loss": 0.0001, "step": 25572 }, { "epoch": 6.7460757156048015, "grad_norm": 0.37790247797966003, "learning_rate": 2.1712828629209533e-06, "loss": 0.0067, "step": 25574 }, { "epoch": 6.746603350481466, "grad_norm": 0.0046087345108389854, "learning_rate": 2.17093115273015e-06, "loss": 0.0001, "step": 25576 }, { "epoch": 6.747130985358132, "grad_norm": 0.3858441412448883, "learning_rate": 2.1705794425393476e-06, "loss": 0.0021, "step": 25578 }, { "epoch": 6.747658620234797, "grad_norm": 0.39256447553634644, "learning_rate": 2.170227732348545e-06, "loss": 0.0012, "step": 25580 }, { "epoch": 6.748186255111463, "grad_norm": 0.042143382132053375, "learning_rate": 2.1698760221577423e-06, "loss": 0.0002, "step": 25582 }, { "epoch": 6.748713889988128, "grad_norm": 0.009068178944289684, "learning_rate": 2.1695243119669393e-06, "loss": 0.0001, "step": 25584 }, { "epoch": 6.749241524864794, "grad_norm": 0.32928717136383057, "learning_rate": 2.1691726017761367e-06, "loss": 0.0016, "step": 25586 }, { "epoch": 6.749769159741459, "grad_norm": 0.1895148605108261, "learning_rate": 2.1688208915853336e-06, "loss": 0.0024, "step": 25588 }, { "epoch": 6.750296794618125, "grad_norm": 0.0017620862927287817, "learning_rate": 2.168469181394531e-06, "loss": 0.0001, "step": 25590 }, { "epoch": 6.75082442949479, "grad_norm": 0.35882386565208435, "learning_rate": 2.1681174712037284e-06, "loss": 0.0005, "step": 25592 }, { "epoch": 6.751352064371455, "grad_norm": 0.0028494156431406736, "learning_rate": 2.1677657610129257e-06, "loss": 0.0001, "step": 25594 }, { "epoch": 6.7518796992481205, "grad_norm": 0.25592297315597534, "learning_rate": 2.1674140508221227e-06, "loss": 0.0053, "step": 25596 }, { "epoch": 6.752407334124785, "grad_norm": 0.08212630450725555, "learning_rate": 2.1670623406313196e-06, "loss": 0.0024, "step": 25598 }, { "epoch": 6.752934969001451, "grad_norm": 0.004188014194369316, "learning_rate": 2.166710630440517e-06, "loss": 0.0001, "step": 25600 }, { "epoch": 6.753462603878116, "grad_norm": 0.06578280031681061, "learning_rate": 2.166358920249714e-06, "loss": 0.0002, "step": 25602 }, { "epoch": 6.753990238754781, "grad_norm": 0.5473745465278625, "learning_rate": 2.1660072100589113e-06, "loss": 0.002, "step": 25604 }, { "epoch": 6.754517873631447, "grad_norm": 0.011829346418380737, "learning_rate": 2.1656554998681087e-06, "loss": 0.0002, "step": 25606 }, { "epoch": 6.755045508508112, "grad_norm": 0.37861067056655884, "learning_rate": 2.165303789677306e-06, "loss": 0.0003, "step": 25608 }, { "epoch": 6.755573143384778, "grad_norm": 0.005357587244361639, "learning_rate": 2.164952079486503e-06, "loss": 0.0001, "step": 25610 }, { "epoch": 6.756100778261443, "grad_norm": 0.0008396547054871917, "learning_rate": 2.1646003692957004e-06, "loss": 0.0001, "step": 25612 }, { "epoch": 6.756628413138109, "grad_norm": 0.011835088022053242, "learning_rate": 2.1642486591048974e-06, "loss": 0.0033, "step": 25614 }, { "epoch": 6.757156048014774, "grad_norm": 0.030967816710472107, "learning_rate": 2.1638969489140948e-06, "loss": 0.0001, "step": 25616 }, { "epoch": 6.7576836828914395, "grad_norm": 0.005635552573949099, "learning_rate": 2.163545238723292e-06, "loss": 0.0015, "step": 25618 }, { "epoch": 6.7582113177681045, "grad_norm": 0.01604701392352581, "learning_rate": 2.1631935285324895e-06, "loss": 0.0001, "step": 25620 }, { "epoch": 6.758738952644769, "grad_norm": 0.1642289161682129, "learning_rate": 2.1628418183416865e-06, "loss": 0.0018, "step": 25622 }, { "epoch": 6.759266587521435, "grad_norm": 0.0017809777054935694, "learning_rate": 2.162490108150884e-06, "loss": 0.0001, "step": 25624 }, { "epoch": 6.7597942223981, "grad_norm": 0.008343297988176346, "learning_rate": 2.162138397960081e-06, "loss": 0.0014, "step": 25626 }, { "epoch": 6.760321857274766, "grad_norm": 0.014138398692011833, "learning_rate": 2.161786687769278e-06, "loss": 0.0014, "step": 25628 }, { "epoch": 6.760849492151431, "grad_norm": 0.006013864651322365, "learning_rate": 2.1614349775784755e-06, "loss": 0.0001, "step": 25630 }, { "epoch": 6.761377127028097, "grad_norm": 0.05377817153930664, "learning_rate": 2.1610832673876725e-06, "loss": 0.0004, "step": 25632 }, { "epoch": 6.761904761904762, "grad_norm": 0.009986615739762783, "learning_rate": 2.16073155719687e-06, "loss": 0.0002, "step": 25634 }, { "epoch": 6.762432396781428, "grad_norm": 0.022327501326799393, "learning_rate": 2.160379847006067e-06, "loss": 0.0001, "step": 25636 }, { "epoch": 6.762960031658093, "grad_norm": 0.0029041022062301636, "learning_rate": 2.160028136815264e-06, "loss": 0.0001, "step": 25638 }, { "epoch": 6.763487666534758, "grad_norm": 0.007098169066011906, "learning_rate": 2.1596764266244616e-06, "loss": 0.0001, "step": 25640 }, { "epoch": 6.7640153014114235, "grad_norm": 0.10646305978298187, "learning_rate": 2.159324716433659e-06, "loss": 0.0002, "step": 25642 }, { "epoch": 6.7645429362880884, "grad_norm": 0.009467017836868763, "learning_rate": 2.158973006242856e-06, "loss": 0.0049, "step": 25644 }, { "epoch": 6.765070571164754, "grad_norm": 0.08308165520429611, "learning_rate": 2.1586212960520533e-06, "loss": 0.0017, "step": 25646 }, { "epoch": 6.765598206041419, "grad_norm": 0.814525306224823, "learning_rate": 2.1582695858612502e-06, "loss": 0.0035, "step": 25648 }, { "epoch": 6.766125840918084, "grad_norm": 0.006004608701914549, "learning_rate": 2.1579178756704476e-06, "loss": 0.0002, "step": 25650 }, { "epoch": 6.76665347579475, "grad_norm": 0.004144763108342886, "learning_rate": 2.157566165479645e-06, "loss": 0.0006, "step": 25652 }, { "epoch": 6.767181110671415, "grad_norm": 0.0026624652091413736, "learning_rate": 2.1572144552888424e-06, "loss": 0.0001, "step": 25654 }, { "epoch": 6.767708745548081, "grad_norm": 0.00701114209368825, "learning_rate": 2.1568627450980393e-06, "loss": 0.0004, "step": 25656 }, { "epoch": 6.768236380424746, "grad_norm": 0.010623669251799583, "learning_rate": 2.1565110349072363e-06, "loss": 0.0001, "step": 25658 }, { "epoch": 6.768764015301412, "grad_norm": 0.2738482654094696, "learning_rate": 2.1561593247164336e-06, "loss": 0.003, "step": 25660 }, { "epoch": 6.769291650178077, "grad_norm": 0.07497154176235199, "learning_rate": 2.1558076145256306e-06, "loss": 0.0003, "step": 25662 }, { "epoch": 6.7698192850547425, "grad_norm": 0.009484464302659035, "learning_rate": 2.1554559043348284e-06, "loss": 0.0002, "step": 25664 }, { "epoch": 6.7703469199314075, "grad_norm": 0.2741134762763977, "learning_rate": 2.1551041941440253e-06, "loss": 0.0031, "step": 25666 }, { "epoch": 6.770874554808072, "grad_norm": 0.6580122113227844, "learning_rate": 2.1547524839532227e-06, "loss": 0.0017, "step": 25668 }, { "epoch": 6.771402189684738, "grad_norm": 0.0064002457074820995, "learning_rate": 2.1544007737624197e-06, "loss": 0.0001, "step": 25670 }, { "epoch": 6.771929824561403, "grad_norm": 0.002682363847270608, "learning_rate": 2.154049063571617e-06, "loss": 0.0023, "step": 25672 }, { "epoch": 6.772457459438069, "grad_norm": 0.004264492075890303, "learning_rate": 2.153697353380814e-06, "loss": 0.0017, "step": 25674 }, { "epoch": 6.772985094314734, "grad_norm": 0.0038867341354489326, "learning_rate": 2.153345643190012e-06, "loss": 0.0001, "step": 25676 }, { "epoch": 6.7735127291914, "grad_norm": 0.043851181864738464, "learning_rate": 2.1529939329992087e-06, "loss": 0.0002, "step": 25678 }, { "epoch": 6.774040364068065, "grad_norm": 0.175606369972229, "learning_rate": 2.152642222808406e-06, "loss": 0.0058, "step": 25680 }, { "epoch": 6.774567998944731, "grad_norm": 0.12784650921821594, "learning_rate": 2.152290512617603e-06, "loss": 0.006, "step": 25682 }, { "epoch": 6.775095633821396, "grad_norm": 0.04361672326922417, "learning_rate": 2.1519388024268005e-06, "loss": 0.0005, "step": 25684 }, { "epoch": 6.775623268698061, "grad_norm": 0.5141222476959229, "learning_rate": 2.1515870922359974e-06, "loss": 0.0191, "step": 25686 }, { "epoch": 6.7761509035747265, "grad_norm": 0.0038943311665207148, "learning_rate": 2.1512353820451948e-06, "loss": 0.0001, "step": 25688 }, { "epoch": 6.7766785384513915, "grad_norm": 0.0017929880414158106, "learning_rate": 2.150883671854392e-06, "loss": 0.0002, "step": 25690 }, { "epoch": 6.777206173328057, "grad_norm": 0.02972925454378128, "learning_rate": 2.150531961663589e-06, "loss": 0.0002, "step": 25692 }, { "epoch": 6.777733808204722, "grad_norm": 0.004262148402631283, "learning_rate": 2.1501802514727865e-06, "loss": 0.0001, "step": 25694 }, { "epoch": 6.778261443081387, "grad_norm": 0.9329048991203308, "learning_rate": 2.1498285412819834e-06, "loss": 0.0012, "step": 25696 }, { "epoch": 6.778789077958053, "grad_norm": 0.0043469080701470375, "learning_rate": 2.149476831091181e-06, "loss": 0.0029, "step": 25698 }, { "epoch": 6.779316712834718, "grad_norm": 0.006443345453590155, "learning_rate": 2.149125120900378e-06, "loss": 0.0001, "step": 25700 }, { "epoch": 6.779844347711384, "grad_norm": 0.10210391879081726, "learning_rate": 2.1487734107095756e-06, "loss": 0.0039, "step": 25702 }, { "epoch": 6.780371982588049, "grad_norm": 0.03790484741330147, "learning_rate": 2.1484217005187725e-06, "loss": 0.0004, "step": 25704 }, { "epoch": 6.780899617464715, "grad_norm": 0.00716007174924016, "learning_rate": 2.14806999032797e-06, "loss": 0.0001, "step": 25706 }, { "epoch": 6.78142725234138, "grad_norm": 0.23316040635108948, "learning_rate": 2.147718280137167e-06, "loss": 0.0006, "step": 25708 }, { "epoch": 6.7819548872180455, "grad_norm": 0.521525502204895, "learning_rate": 2.1473665699463642e-06, "loss": 0.0001, "step": 25710 }, { "epoch": 6.7824825220947105, "grad_norm": 0.2246771603822708, "learning_rate": 2.1470148597555616e-06, "loss": 0.0047, "step": 25712 }, { "epoch": 6.783010156971375, "grad_norm": 0.0028471655678004026, "learning_rate": 2.146663149564759e-06, "loss": 0.0001, "step": 25714 }, { "epoch": 6.783537791848041, "grad_norm": 0.007522133179008961, "learning_rate": 2.146311439373956e-06, "loss": 0.0001, "step": 25716 }, { "epoch": 6.784065426724706, "grad_norm": 0.9915517568588257, "learning_rate": 2.145959729183153e-06, "loss": 0.0061, "step": 25718 }, { "epoch": 6.784593061601372, "grad_norm": 0.00601558480411768, "learning_rate": 2.1456080189923503e-06, "loss": 0.0001, "step": 25720 }, { "epoch": 6.785120696478037, "grad_norm": 0.05406584590673447, "learning_rate": 2.1452563088015476e-06, "loss": 0.0003, "step": 25722 }, { "epoch": 6.785648331354703, "grad_norm": 0.0029890232253819704, "learning_rate": 2.144904598610745e-06, "loss": 0.0001, "step": 25724 }, { "epoch": 6.786175966231368, "grad_norm": 0.39136186242103577, "learning_rate": 2.144552888419942e-06, "loss": 0.0051, "step": 25726 }, { "epoch": 6.786703601108034, "grad_norm": 0.07478583604097366, "learning_rate": 2.1442011782291393e-06, "loss": 0.0019, "step": 25728 }, { "epoch": 6.787231235984699, "grad_norm": 0.0021919505670666695, "learning_rate": 2.1438494680383363e-06, "loss": 0.0001, "step": 25730 }, { "epoch": 6.787758870861364, "grad_norm": 0.00922513846307993, "learning_rate": 2.1434977578475337e-06, "loss": 0.0001, "step": 25732 }, { "epoch": 6.7882865057380295, "grad_norm": 0.0035349589306861162, "learning_rate": 2.143146047656731e-06, "loss": 0.0001, "step": 25734 }, { "epoch": 6.7888141406146945, "grad_norm": 0.02903483249247074, "learning_rate": 2.1427943374659284e-06, "loss": 0.0001, "step": 25736 }, { "epoch": 6.78934177549136, "grad_norm": 0.03354673832654953, "learning_rate": 2.1424426272751254e-06, "loss": 0.0003, "step": 25738 }, { "epoch": 6.789869410368025, "grad_norm": 0.011640969663858414, "learning_rate": 2.1420909170843227e-06, "loss": 0.0002, "step": 25740 }, { "epoch": 6.79039704524469, "grad_norm": 0.0070824576541781425, "learning_rate": 2.1417392068935197e-06, "loss": 0.0001, "step": 25742 }, { "epoch": 6.790924680121356, "grad_norm": 0.06950830668210983, "learning_rate": 2.141387496702717e-06, "loss": 0.0013, "step": 25744 }, { "epoch": 6.791452314998021, "grad_norm": 0.040573593229055405, "learning_rate": 2.141035786511914e-06, "loss": 0.0002, "step": 25746 }, { "epoch": 6.791979949874687, "grad_norm": 0.05248093232512474, "learning_rate": 2.1406840763211114e-06, "loss": 0.002, "step": 25748 }, { "epoch": 6.792507584751352, "grad_norm": 0.004935337696224451, "learning_rate": 2.1403323661303088e-06, "loss": 0.003, "step": 25750 }, { "epoch": 6.793035219628018, "grad_norm": 0.612267017364502, "learning_rate": 2.1399806559395057e-06, "loss": 0.004, "step": 25752 }, { "epoch": 6.793562854504683, "grad_norm": 0.07812018692493439, "learning_rate": 2.139628945748703e-06, "loss": 0.0003, "step": 25754 }, { "epoch": 6.7940904893813485, "grad_norm": 0.01423229742795229, "learning_rate": 2.1392772355579e-06, "loss": 0.0125, "step": 25756 }, { "epoch": 6.7946181242580135, "grad_norm": 0.0029370367992669344, "learning_rate": 2.1389255253670974e-06, "loss": 0.0001, "step": 25758 }, { "epoch": 6.795145759134678, "grad_norm": 0.02959461696445942, "learning_rate": 2.138573815176295e-06, "loss": 0.0002, "step": 25760 }, { "epoch": 6.795673394011344, "grad_norm": 0.007050636224448681, "learning_rate": 2.138222104985492e-06, "loss": 0.0001, "step": 25762 }, { "epoch": 6.796201028888009, "grad_norm": 0.6557292342185974, "learning_rate": 2.137870394794689e-06, "loss": 0.0022, "step": 25764 }, { "epoch": 6.796728663764675, "grad_norm": 0.007444508373737335, "learning_rate": 2.1375186846038865e-06, "loss": 0.0001, "step": 25766 }, { "epoch": 6.79725629864134, "grad_norm": 0.012216786853969097, "learning_rate": 2.1371669744130835e-06, "loss": 0.0096, "step": 25768 }, { "epoch": 6.797783933518006, "grad_norm": 0.007894116453826427, "learning_rate": 2.136815264222281e-06, "loss": 0.0007, "step": 25770 }, { "epoch": 6.798311568394671, "grad_norm": 0.009082332253456116, "learning_rate": 2.1364635540314782e-06, "loss": 0.0002, "step": 25772 }, { "epoch": 6.798839203271337, "grad_norm": 0.024223249405622482, "learning_rate": 2.1361118438406756e-06, "loss": 0.0002, "step": 25774 }, { "epoch": 6.799366838148002, "grad_norm": 0.023764219135046005, "learning_rate": 2.1357601336498725e-06, "loss": 0.0002, "step": 25776 }, { "epoch": 6.799894473024667, "grad_norm": 0.004414340015500784, "learning_rate": 2.1354084234590695e-06, "loss": 0.0001, "step": 25778 }, { "epoch": 6.8004221079013325, "grad_norm": 0.1798204928636551, "learning_rate": 2.135056713268267e-06, "loss": 0.0004, "step": 25780 }, { "epoch": 6.8009497427779975, "grad_norm": 0.14252811670303345, "learning_rate": 2.1347050030774642e-06, "loss": 0.0004, "step": 25782 }, { "epoch": 6.801477377654663, "grad_norm": 0.015026597306132317, "learning_rate": 2.1343532928866616e-06, "loss": 0.0002, "step": 25784 }, { "epoch": 6.802005012531328, "grad_norm": 0.012247084639966488, "learning_rate": 2.1340015826958586e-06, "loss": 0.0001, "step": 25786 }, { "epoch": 6.802532647407993, "grad_norm": 0.38635310530662537, "learning_rate": 2.133649872505056e-06, "loss": 0.0054, "step": 25788 }, { "epoch": 6.803060282284659, "grad_norm": 0.10272029042243958, "learning_rate": 2.133298162314253e-06, "loss": 0.0037, "step": 25790 }, { "epoch": 6.803587917161324, "grad_norm": 0.01220758631825447, "learning_rate": 2.1329464521234503e-06, "loss": 0.0001, "step": 25792 }, { "epoch": 6.80411555203799, "grad_norm": 0.09187991917133331, "learning_rate": 2.1325947419326477e-06, "loss": 0.0003, "step": 25794 }, { "epoch": 6.804643186914655, "grad_norm": 0.002668116707354784, "learning_rate": 2.132243031741845e-06, "loss": 0.0002, "step": 25796 }, { "epoch": 6.805170821791321, "grad_norm": 0.017284154891967773, "learning_rate": 2.131891321551042e-06, "loss": 0.0001, "step": 25798 }, { "epoch": 6.805698456667986, "grad_norm": 0.0017542182467877865, "learning_rate": 2.1315396113602394e-06, "loss": 0.0001, "step": 25800 }, { "epoch": 6.8062260915446515, "grad_norm": 0.019982753321528435, "learning_rate": 2.1311879011694363e-06, "loss": 0.0002, "step": 25802 }, { "epoch": 6.8067537264213165, "grad_norm": 0.00781379546970129, "learning_rate": 2.1308361909786337e-06, "loss": 0.0002, "step": 25804 }, { "epoch": 6.8072813612979814, "grad_norm": 0.007497009821236134, "learning_rate": 2.130484480787831e-06, "loss": 0.0002, "step": 25806 }, { "epoch": 6.807808996174647, "grad_norm": 0.009289777837693691, "learning_rate": 2.1301327705970284e-06, "loss": 0.0001, "step": 25808 }, { "epoch": 6.808336631051312, "grad_norm": 0.004595339763909578, "learning_rate": 2.1297810604062254e-06, "loss": 0.0003, "step": 25810 }, { "epoch": 6.808864265927978, "grad_norm": 0.02492353692650795, "learning_rate": 2.1294293502154223e-06, "loss": 0.0002, "step": 25812 }, { "epoch": 6.809391900804643, "grad_norm": 0.010846294462680817, "learning_rate": 2.1290776400246197e-06, "loss": 0.0002, "step": 25814 }, { "epoch": 6.809919535681309, "grad_norm": 0.004434171132743359, "learning_rate": 2.1287259298338167e-06, "loss": 0.0037, "step": 25816 }, { "epoch": 6.810447170557974, "grad_norm": 0.022200800478458405, "learning_rate": 2.1283742196430145e-06, "loss": 0.0001, "step": 25818 }, { "epoch": 6.810974805434639, "grad_norm": 0.0019880798645317554, "learning_rate": 2.1280225094522114e-06, "loss": 0.0023, "step": 25820 }, { "epoch": 6.811502440311305, "grad_norm": 0.02433975413441658, "learning_rate": 2.127670799261409e-06, "loss": 0.0002, "step": 25822 }, { "epoch": 6.81203007518797, "grad_norm": 1.128056287765503, "learning_rate": 2.1273190890706058e-06, "loss": 0.0076, "step": 25824 }, { "epoch": 6.8125577100646355, "grad_norm": 0.001881107222288847, "learning_rate": 2.126967378879803e-06, "loss": 0.0001, "step": 25826 }, { "epoch": 6.8130853449413005, "grad_norm": 0.0013682700227946043, "learning_rate": 2.126615668689e-06, "loss": 0.0001, "step": 25828 }, { "epoch": 6.813612979817966, "grad_norm": 0.0008777063339948654, "learning_rate": 2.1262639584981975e-06, "loss": 0.0001, "step": 25830 }, { "epoch": 6.814140614694631, "grad_norm": 0.02206927165389061, "learning_rate": 2.125912248307395e-06, "loss": 0.0002, "step": 25832 }, { "epoch": 6.814668249571296, "grad_norm": 0.6608272194862366, "learning_rate": 2.125560538116592e-06, "loss": 0.0079, "step": 25834 }, { "epoch": 6.815195884447962, "grad_norm": 0.0010768963256850839, "learning_rate": 2.125208827925789e-06, "loss": 0.0001, "step": 25836 }, { "epoch": 6.815723519324627, "grad_norm": 0.0045422958210110664, "learning_rate": 2.1248571177349865e-06, "loss": 0.0001, "step": 25838 }, { "epoch": 6.816251154201293, "grad_norm": 0.0053296079859137535, "learning_rate": 2.1245054075441835e-06, "loss": 0.0001, "step": 25840 }, { "epoch": 6.816778789077958, "grad_norm": 0.0020031218882650137, "learning_rate": 2.124153697353381e-06, "loss": 0.0002, "step": 25842 }, { "epoch": 6.817306423954624, "grad_norm": 0.6096267700195312, "learning_rate": 2.1238019871625782e-06, "loss": 0.0032, "step": 25844 }, { "epoch": 6.817834058831289, "grad_norm": 0.0042910645715892315, "learning_rate": 2.123450276971775e-06, "loss": 0.0033, "step": 25846 }, { "epoch": 6.8183616937079545, "grad_norm": 0.003670548787340522, "learning_rate": 2.1230985667809726e-06, "loss": 0.0004, "step": 25848 }, { "epoch": 6.8188893285846195, "grad_norm": 0.0044053690508008, "learning_rate": 2.1227468565901695e-06, "loss": 0.0001, "step": 25850 }, { "epoch": 6.8194169634612845, "grad_norm": 0.005748304072767496, "learning_rate": 2.122395146399367e-06, "loss": 0.0001, "step": 25852 }, { "epoch": 6.81994459833795, "grad_norm": 0.003611015621572733, "learning_rate": 2.1220434362085643e-06, "loss": 0.0001, "step": 25854 }, { "epoch": 6.820472233214615, "grad_norm": 0.0029996486846357584, "learning_rate": 2.1216917260177616e-06, "loss": 0.0001, "step": 25856 }, { "epoch": 6.820999868091281, "grad_norm": 0.007228253409266472, "learning_rate": 2.1213400158269586e-06, "loss": 0.0001, "step": 25858 }, { "epoch": 6.821527502967946, "grad_norm": 0.016071124002337456, "learning_rate": 2.120988305636156e-06, "loss": 0.0001, "step": 25860 }, { "epoch": 6.822055137844611, "grad_norm": 0.05513782799243927, "learning_rate": 2.120636595445353e-06, "loss": 0.0046, "step": 25862 }, { "epoch": 6.822582772721277, "grad_norm": 0.03829468786716461, "learning_rate": 2.1202848852545503e-06, "loss": 0.0002, "step": 25864 }, { "epoch": 6.823110407597942, "grad_norm": 0.0020255278795957565, "learning_rate": 2.1199331750637477e-06, "loss": 0.0001, "step": 25866 }, { "epoch": 6.823638042474608, "grad_norm": 0.27171215415000916, "learning_rate": 2.119581464872945e-06, "loss": 0.002, "step": 25868 }, { "epoch": 6.824165677351273, "grad_norm": 0.0018132852856069803, "learning_rate": 2.119229754682142e-06, "loss": 0.0001, "step": 25870 }, { "epoch": 6.8246933122279385, "grad_norm": 0.004298571962863207, "learning_rate": 2.118878044491339e-06, "loss": 0.0001, "step": 25872 }, { "epoch": 6.8252209471046035, "grad_norm": 0.014087776653468609, "learning_rate": 2.1185263343005363e-06, "loss": 0.0003, "step": 25874 }, { "epoch": 6.825748581981269, "grad_norm": 0.012287108227610588, "learning_rate": 2.1181746241097337e-06, "loss": 0.0001, "step": 25876 }, { "epoch": 6.826276216857934, "grad_norm": 0.11075418442487717, "learning_rate": 2.117822913918931e-06, "loss": 0.0003, "step": 25878 }, { "epoch": 6.826803851734599, "grad_norm": 0.0035945973359048367, "learning_rate": 2.117471203728128e-06, "loss": 0.0001, "step": 25880 }, { "epoch": 6.827331486611265, "grad_norm": 0.00434160465374589, "learning_rate": 2.1171194935373254e-06, "loss": 0.0001, "step": 25882 }, { "epoch": 6.82785912148793, "grad_norm": 0.008203660137951374, "learning_rate": 2.1167677833465224e-06, "loss": 0.0001, "step": 25884 }, { "epoch": 6.828386756364596, "grad_norm": 0.014838744886219501, "learning_rate": 2.1164160731557197e-06, "loss": 0.0001, "step": 25886 }, { "epoch": 6.828914391241261, "grad_norm": 0.03621704503893852, "learning_rate": 2.1160643629649167e-06, "loss": 0.0002, "step": 25888 }, { "epoch": 6.829442026117927, "grad_norm": 0.11528493463993073, "learning_rate": 2.1157126527741145e-06, "loss": 0.0006, "step": 25890 }, { "epoch": 6.829969660994592, "grad_norm": 0.08638186752796173, "learning_rate": 2.1153609425833114e-06, "loss": 0.0001, "step": 25892 }, { "epoch": 6.8304972958712575, "grad_norm": 0.009589307941496372, "learning_rate": 2.115009232392509e-06, "loss": 0.0001, "step": 25894 }, { "epoch": 6.8310249307479225, "grad_norm": 0.00479314848780632, "learning_rate": 2.1146575222017058e-06, "loss": 0.0001, "step": 25896 }, { "epoch": 6.8315525656245875, "grad_norm": 0.11973585933446884, "learning_rate": 2.114305812010903e-06, "loss": 0.0003, "step": 25898 }, { "epoch": 6.832080200501253, "grad_norm": 0.012696697376668453, "learning_rate": 2.1139541018201e-06, "loss": 0.0002, "step": 25900 }, { "epoch": 6.832607835377918, "grad_norm": 0.0008144189487211406, "learning_rate": 2.1136023916292975e-06, "loss": 0.0001, "step": 25902 }, { "epoch": 6.833135470254584, "grad_norm": 0.0021951200906187296, "learning_rate": 2.113250681438495e-06, "loss": 0.0001, "step": 25904 }, { "epoch": 6.833663105131249, "grad_norm": 0.015842532739043236, "learning_rate": 2.112898971247692e-06, "loss": 0.0001, "step": 25906 }, { "epoch": 6.834190740007914, "grad_norm": 0.007089794613420963, "learning_rate": 2.112547261056889e-06, "loss": 0.0001, "step": 25908 }, { "epoch": 6.83471837488458, "grad_norm": 0.009596366435289383, "learning_rate": 2.112195550866086e-06, "loss": 0.0003, "step": 25910 }, { "epoch": 6.835246009761245, "grad_norm": 0.002852768637239933, "learning_rate": 2.1118438406752835e-06, "loss": 0.0001, "step": 25912 }, { "epoch": 6.835773644637911, "grad_norm": 0.006270970683544874, "learning_rate": 2.111492130484481e-06, "loss": 0.0001, "step": 25914 }, { "epoch": 6.836301279514576, "grad_norm": 0.18603400886058807, "learning_rate": 2.1111404202936783e-06, "loss": 0.0118, "step": 25916 }, { "epoch": 6.8368289143912415, "grad_norm": 0.28170493245124817, "learning_rate": 2.1107887101028752e-06, "loss": 0.0042, "step": 25918 }, { "epoch": 6.8373565492679065, "grad_norm": 0.00431123236194253, "learning_rate": 2.1104369999120726e-06, "loss": 0.0001, "step": 25920 }, { "epoch": 6.837884184144572, "grad_norm": 0.026224106550216675, "learning_rate": 2.1100852897212695e-06, "loss": 0.0001, "step": 25922 }, { "epoch": 6.838411819021237, "grad_norm": 0.024371450766921043, "learning_rate": 2.109733579530467e-06, "loss": 0.0001, "step": 25924 }, { "epoch": 6.838939453897902, "grad_norm": 0.31644105911254883, "learning_rate": 2.1093818693396643e-06, "loss": 0.0014, "step": 25926 }, { "epoch": 6.839467088774568, "grad_norm": 0.001705373520962894, "learning_rate": 2.1090301591488617e-06, "loss": 0.0001, "step": 25928 }, { "epoch": 6.839994723651233, "grad_norm": 0.023660898208618164, "learning_rate": 2.1086784489580586e-06, "loss": 0.0008, "step": 25930 }, { "epoch": 6.840522358527899, "grad_norm": 0.0315406434237957, "learning_rate": 2.1083267387672556e-06, "loss": 0.0001, "step": 25932 }, { "epoch": 6.841049993404564, "grad_norm": 0.5379896759986877, "learning_rate": 2.107975028576453e-06, "loss": 0.0007, "step": 25934 }, { "epoch": 6.84157762828123, "grad_norm": 0.06748300790786743, "learning_rate": 2.1076233183856503e-06, "loss": 0.0002, "step": 25936 }, { "epoch": 6.842105263157895, "grad_norm": 0.020921751856803894, "learning_rate": 2.1072716081948477e-06, "loss": 0.0002, "step": 25938 }, { "epoch": 6.8426328980345605, "grad_norm": 0.004476651549339294, "learning_rate": 2.1069198980040447e-06, "loss": 0.0001, "step": 25940 }, { "epoch": 6.8431605329112255, "grad_norm": 0.40711963176727295, "learning_rate": 2.106568187813242e-06, "loss": 0.0008, "step": 25942 }, { "epoch": 6.8436881677878905, "grad_norm": 0.013616909272968769, "learning_rate": 2.106216477622439e-06, "loss": 0.0001, "step": 25944 }, { "epoch": 6.844215802664556, "grad_norm": 0.004324107430875301, "learning_rate": 2.1058647674316364e-06, "loss": 0.0001, "step": 25946 }, { "epoch": 6.844743437541221, "grad_norm": 0.009507881477475166, "learning_rate": 2.1055130572408337e-06, "loss": 0.0001, "step": 25948 }, { "epoch": 6.845271072417887, "grad_norm": 0.0030799475498497486, "learning_rate": 2.105161347050031e-06, "loss": 0.0001, "step": 25950 }, { "epoch": 6.845798707294552, "grad_norm": 0.012438895180821419, "learning_rate": 2.104809636859228e-06, "loss": 0.0001, "step": 25952 }, { "epoch": 6.846326342171217, "grad_norm": 1.0311790704727173, "learning_rate": 2.1044579266684254e-06, "loss": 0.0013, "step": 25954 }, { "epoch": 6.846853977047883, "grad_norm": 0.01839045248925686, "learning_rate": 2.1041062164776224e-06, "loss": 0.0001, "step": 25956 }, { "epoch": 6.847381611924548, "grad_norm": 0.002306840382516384, "learning_rate": 2.1037545062868198e-06, "loss": 0.0001, "step": 25958 }, { "epoch": 6.847909246801214, "grad_norm": 0.10825276374816895, "learning_rate": 2.1034027960960167e-06, "loss": 0.0003, "step": 25960 }, { "epoch": 6.848436881677879, "grad_norm": 0.006983993109315634, "learning_rate": 2.103051085905214e-06, "loss": 0.0001, "step": 25962 }, { "epoch": 6.8489645165545445, "grad_norm": 0.001767090754583478, "learning_rate": 2.1026993757144115e-06, "loss": 0.0003, "step": 25964 }, { "epoch": 6.8494921514312095, "grad_norm": 0.004837543237954378, "learning_rate": 2.1023476655236084e-06, "loss": 0.0001, "step": 25966 }, { "epoch": 6.850019786307875, "grad_norm": 0.011954707093536854, "learning_rate": 2.101995955332806e-06, "loss": 0.0001, "step": 25968 }, { "epoch": 6.85054742118454, "grad_norm": 0.021441904827952385, "learning_rate": 2.1016442451420028e-06, "loss": 0.0001, "step": 25970 }, { "epoch": 6.851075056061205, "grad_norm": 0.0041161784902215, "learning_rate": 2.1012925349512e-06, "loss": 0.0023, "step": 25972 }, { "epoch": 6.851602690937871, "grad_norm": 0.11108405143022537, "learning_rate": 2.1009408247603975e-06, "loss": 0.0003, "step": 25974 }, { "epoch": 6.852130325814536, "grad_norm": 0.01599280722439289, "learning_rate": 2.100589114569595e-06, "loss": 0.0001, "step": 25976 }, { "epoch": 6.852657960691202, "grad_norm": 0.0014160730643197894, "learning_rate": 2.100237404378792e-06, "loss": 0.0001, "step": 25978 }, { "epoch": 6.853185595567867, "grad_norm": 0.009258677251636982, "learning_rate": 2.099885694187989e-06, "loss": 0.0001, "step": 25980 }, { "epoch": 6.853713230444533, "grad_norm": 0.013079619035124779, "learning_rate": 2.099533983997186e-06, "loss": 0.0001, "step": 25982 }, { "epoch": 6.854240865321198, "grad_norm": 0.09857025742530823, "learning_rate": 2.0991822738063835e-06, "loss": 0.0004, "step": 25984 }, { "epoch": 6.8547685001978635, "grad_norm": 0.005104263313114643, "learning_rate": 2.098830563615581e-06, "loss": 0.0001, "step": 25986 }, { "epoch": 6.8552961350745285, "grad_norm": 0.003487833309918642, "learning_rate": 2.0984788534247783e-06, "loss": 0.0003, "step": 25988 }, { "epoch": 6.8558237699511935, "grad_norm": 0.0013922156067565084, "learning_rate": 2.0981271432339752e-06, "loss": 0.0001, "step": 25990 }, { "epoch": 6.856351404827859, "grad_norm": 0.01435695681720972, "learning_rate": 2.097775433043172e-06, "loss": 0.0001, "step": 25992 }, { "epoch": 6.856879039704524, "grad_norm": 0.02368946745991707, "learning_rate": 2.0974237228523696e-06, "loss": 0.0001, "step": 25994 }, { "epoch": 6.85740667458119, "grad_norm": 0.04506992921233177, "learning_rate": 2.097072012661567e-06, "loss": 0.0002, "step": 25996 }, { "epoch": 6.857934309457855, "grad_norm": 0.0016658940585330129, "learning_rate": 2.0967203024707643e-06, "loss": 0.0003, "step": 25998 }, { "epoch": 6.85846194433452, "grad_norm": 0.007267135661095381, "learning_rate": 2.0963685922799613e-06, "loss": 0.0047, "step": 26000 }, { "epoch": 6.85846194433452, "eval_loss": 0.0016246421728283167, "eval_runtime": 303.7907, "eval_samples_per_second": 709.834, "eval_steps_per_second": 88.732, "step": 26000 } ], "logging_steps": 2, "max_steps": 37910, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5028218940447785e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }