diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7225259507237302, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012042099178728837, + "grad_norm": 3.4532573223114014, + "learning_rate": 1.9258545979778525e-09, + "loss": 1.2769, + "step": 5 + }, + { + "epoch": 0.00024084198357457673, + "grad_norm": 3.2849907875061035, + "learning_rate": 4.333172845450169e-09, + "loss": 1.2562, + "step": 10 + }, + { + "epoch": 0.00036126297536186506, + "grad_norm": 3.0698747634887695, + "learning_rate": 6.740491092922484e-09, + "loss": 1.2727, + "step": 15 + }, + { + "epoch": 0.00048168396714915346, + "grad_norm": 3.3526523113250732, + "learning_rate": 9.1478093403948e-09, + "loss": 1.2471, + "step": 20 + }, + { + "epoch": 0.0006021049589364418, + "grad_norm": 3.5746824741363525, + "learning_rate": 1.1555127587867116e-08, + "loss": 1.2575, + "step": 25 + }, + { + "epoch": 0.0007225259507237301, + "grad_norm": 3.2632179260253906, + "learning_rate": 1.3962445835339432e-08, + "loss": 1.1786, + "step": 30 + }, + { + "epoch": 0.0008429469425110186, + "grad_norm": 3.0086328983306885, + "learning_rate": 1.636976408281175e-08, + "loss": 1.4168, + "step": 35 + }, + { + "epoch": 0.0009633679342983069, + "grad_norm": 3.5532827377319336, + "learning_rate": 1.8777082330284063e-08, + "loss": 1.2169, + "step": 40 + }, + { + "epoch": 0.0010837889260855952, + "grad_norm": 3.0027787685394287, + "learning_rate": 2.118440057775638e-08, + "loss": 1.2513, + "step": 45 + }, + { + "epoch": 0.0012042099178728835, + "grad_norm": 3.6059303283691406, + "learning_rate": 2.3591718825228694e-08, + "loss": 1.2505, + "step": 50 + }, + { + "epoch": 0.0013246309096601719, + "grad_norm": 2.860373020172119, + "learning_rate": 2.599903707270101e-08, + "loss": 1.2457, + "step": 55 + }, + { + "epoch": 0.0014450519014474602, + "grad_norm": 3.1921334266662598, + "learning_rate": 2.8406355320173326e-08, + "loss": 1.2627, + "step": 60 + }, + { + "epoch": 0.0015654728932347486, + "grad_norm": 3.080982208251953, + "learning_rate": 3.081367356764564e-08, + "loss": 1.1995, + "step": 65 + }, + { + "epoch": 0.0016858938850220371, + "grad_norm": 3.90539288520813, + "learning_rate": 3.3220991815117954e-08, + "loss": 1.3055, + "step": 70 + }, + { + "epoch": 0.0018063148768093255, + "grad_norm": 3.3694944381713867, + "learning_rate": 3.5628310062590275e-08, + "loss": 1.2396, + "step": 75 + }, + { + "epoch": 0.0019267358685966139, + "grad_norm": 3.218294143676758, + "learning_rate": 3.803562831006259e-08, + "loss": 1.2292, + "step": 80 + }, + { + "epoch": 0.002047156860383902, + "grad_norm": 3.063572406768799, + "learning_rate": 4.04429465575349e-08, + "loss": 1.253, + "step": 85 + }, + { + "epoch": 0.0021675778521711903, + "grad_norm": 3.4931082725524902, + "learning_rate": 4.285026480500722e-08, + "loss": 1.2709, + "step": 90 + }, + { + "epoch": 0.002287998843958479, + "grad_norm": 3.7916605472564697, + "learning_rate": 4.525758305247954e-08, + "loss": 1.2878, + "step": 95 + }, + { + "epoch": 0.002408419835745767, + "grad_norm": 2.3481788635253906, + "learning_rate": 4.766490129995185e-08, + "loss": 1.2535, + "step": 100 + }, + { + "epoch": 0.0025288408275330556, + "grad_norm": 2.8467800617218018, + "learning_rate": 5.0072219547424166e-08, + "loss": 1.2573, + "step": 105 + }, + { + "epoch": 0.0026492618193203438, + "grad_norm": 3.1305465698242188, + "learning_rate": 5.2479537794896487e-08, + "loss": 1.2112, + "step": 110 + }, + { + "epoch": 0.0027696828111076323, + "grad_norm": 2.8494081497192383, + "learning_rate": 5.48868560423688e-08, + "loss": 1.2003, + "step": 115 + }, + { + "epoch": 0.0028901038028949205, + "grad_norm": 3.01526141166687, + "learning_rate": 5.7294174289841115e-08, + "loss": 1.2025, + "step": 120 + }, + { + "epoch": 0.003010524794682209, + "grad_norm": 3.138826370239258, + "learning_rate": 5.970149253731343e-08, + "loss": 1.2197, + "step": 125 + }, + { + "epoch": 0.003130945786469497, + "grad_norm": 3.5005788803100586, + "learning_rate": 6.210881078478574e-08, + "loss": 1.2479, + "step": 130 + }, + { + "epoch": 0.0032513667782567857, + "grad_norm": 3.51312255859375, + "learning_rate": 6.451612903225806e-08, + "loss": 1.2142, + "step": 135 + }, + { + "epoch": 0.0033717877700440743, + "grad_norm": 2.6358306407928467, + "learning_rate": 6.692344727973037e-08, + "loss": 1.2545, + "step": 140 + }, + { + "epoch": 0.0034922087618313624, + "grad_norm": 3.6043667793273926, + "learning_rate": 6.93307655272027e-08, + "loss": 1.2427, + "step": 145 + }, + { + "epoch": 0.003612629753618651, + "grad_norm": 3.161194324493408, + "learning_rate": 7.173808377467501e-08, + "loss": 1.2686, + "step": 150 + }, + { + "epoch": 0.003733050745405939, + "grad_norm": 3.4229471683502197, + "learning_rate": 7.414540202214733e-08, + "loss": 1.3099, + "step": 155 + }, + { + "epoch": 0.0038534717371932277, + "grad_norm": 3.3181447982788086, + "learning_rate": 7.655272026961963e-08, + "loss": 1.2736, + "step": 160 + }, + { + "epoch": 0.003973892728980516, + "grad_norm": 3.1612415313720703, + "learning_rate": 7.896003851709195e-08, + "loss": 1.2001, + "step": 165 + }, + { + "epoch": 0.004094313720767804, + "grad_norm": 2.884935140609741, + "learning_rate": 8.136735676456427e-08, + "loss": 1.2524, + "step": 170 + }, + { + "epoch": 0.0042147347125550925, + "grad_norm": 2.7151708602905273, + "learning_rate": 8.377467501203658e-08, + "loss": 1.2547, + "step": 175 + }, + { + "epoch": 0.004335155704342381, + "grad_norm": 3.2015910148620605, + "learning_rate": 8.618199325950891e-08, + "loss": 1.2492, + "step": 180 + }, + { + "epoch": 0.00445557669612967, + "grad_norm": 3.480416774749756, + "learning_rate": 8.858931150698122e-08, + "loss": 1.2558, + "step": 185 + }, + { + "epoch": 0.004575997687916958, + "grad_norm": 3.1630730628967285, + "learning_rate": 9.099662975445354e-08, + "loss": 1.2078, + "step": 190 + }, + { + "epoch": 0.004696418679704246, + "grad_norm": 3.1624763011932373, + "learning_rate": 9.340394800192584e-08, + "loss": 1.2021, + "step": 195 + }, + { + "epoch": 0.004816839671491534, + "grad_norm": 2.6630239486694336, + "learning_rate": 9.581126624939817e-08, + "loss": 1.1915, + "step": 200 + }, + { + "epoch": 0.004937260663278823, + "grad_norm": 2.7018954753875732, + "learning_rate": 9.821858449687048e-08, + "loss": 1.2762, + "step": 205 + }, + { + "epoch": 0.005057681655066111, + "grad_norm": 3.0080630779266357, + "learning_rate": 1.006259027443428e-07, + "loss": 1.2456, + "step": 210 + }, + { + "epoch": 0.005178102646853399, + "grad_norm": 2.8256232738494873, + "learning_rate": 1.0303322099181512e-07, + "loss": 1.218, + "step": 215 + }, + { + "epoch": 0.0052985236386406875, + "grad_norm": 3.1638636589050293, + "learning_rate": 1.0544053923928744e-07, + "loss": 1.2508, + "step": 220 + }, + { + "epoch": 0.0054189446304279765, + "grad_norm": 2.64296555519104, + "learning_rate": 1.0784785748675975e-07, + "loss": 1.2768, + "step": 225 + }, + { + "epoch": 0.005539365622215265, + "grad_norm": 2.2767012119293213, + "learning_rate": 1.1025517573423205e-07, + "loss": 1.1927, + "step": 230 + }, + { + "epoch": 0.005659786614002553, + "grad_norm": 2.666334629058838, + "learning_rate": 1.1266249398170438e-07, + "loss": 1.1981, + "step": 235 + }, + { + "epoch": 0.005780207605789841, + "grad_norm": 3.0129833221435547, + "learning_rate": 1.1506981222917669e-07, + "loss": 1.1702, + "step": 240 + }, + { + "epoch": 0.00590062859757713, + "grad_norm": 2.534895896911621, + "learning_rate": 1.17477130476649e-07, + "loss": 1.2273, + "step": 245 + }, + { + "epoch": 0.006021049589364418, + "grad_norm": 3.0159912109375, + "learning_rate": 1.1988444872412132e-07, + "loss": 1.1828, + "step": 250 + }, + { + "epoch": 0.006141470581151706, + "grad_norm": 2.4064695835113525, + "learning_rate": 1.2229176697159363e-07, + "loss": 1.1927, + "step": 255 + }, + { + "epoch": 0.006261891572938994, + "grad_norm": 2.506269693374634, + "learning_rate": 1.2469908521906595e-07, + "loss": 1.2452, + "step": 260 + }, + { + "epoch": 0.006382312564726283, + "grad_norm": 2.26957631111145, + "learning_rate": 1.271064034665383e-07, + "loss": 1.2155, + "step": 265 + }, + { + "epoch": 0.0065027335565135715, + "grad_norm": 2.604719638824463, + "learning_rate": 1.295137217140106e-07, + "loss": 1.1983, + "step": 270 + }, + { + "epoch": 0.00662315454830086, + "grad_norm": 2.3419127464294434, + "learning_rate": 1.3192103996148292e-07, + "loss": 1.2137, + "step": 275 + }, + { + "epoch": 0.006743575540088149, + "grad_norm": 2.5202889442443848, + "learning_rate": 1.343283582089552e-07, + "loss": 1.2272, + "step": 280 + }, + { + "epoch": 0.006863996531875437, + "grad_norm": 2.3413245677948, + "learning_rate": 1.3673567645642752e-07, + "loss": 1.2049, + "step": 285 + }, + { + "epoch": 0.006984417523662725, + "grad_norm": 2.6377112865448, + "learning_rate": 1.3914299470389983e-07, + "loss": 1.2673, + "step": 290 + }, + { + "epoch": 0.007104838515450013, + "grad_norm": 2.182490825653076, + "learning_rate": 1.4155031295137215e-07, + "loss": 1.2062, + "step": 295 + }, + { + "epoch": 0.007225259507237302, + "grad_norm": 2.1873981952667236, + "learning_rate": 1.439576311988445e-07, + "loss": 1.1677, + "step": 300 + }, + { + "epoch": 0.00734568049902459, + "grad_norm": 2.436870574951172, + "learning_rate": 1.463649494463168e-07, + "loss": 1.1813, + "step": 305 + }, + { + "epoch": 0.007466101490811878, + "grad_norm": 2.0647244453430176, + "learning_rate": 1.4877226769378912e-07, + "loss": 1.1143, + "step": 310 + }, + { + "epoch": 0.007586522482599166, + "grad_norm": 2.045997381210327, + "learning_rate": 1.5117958594126143e-07, + "loss": 1.2223, + "step": 315 + }, + { + "epoch": 0.007706943474386455, + "grad_norm": 2.0606868267059326, + "learning_rate": 1.5358690418873374e-07, + "loss": 1.184, + "step": 320 + }, + { + "epoch": 0.007827364466173744, + "grad_norm": 2.2403106689453125, + "learning_rate": 1.5599422243620606e-07, + "loss": 1.1599, + "step": 325 + }, + { + "epoch": 0.007947785457961033, + "grad_norm": 1.9159278869628906, + "learning_rate": 1.5840154068367837e-07, + "loss": 1.1863, + "step": 330 + }, + { + "epoch": 0.00806820644974832, + "grad_norm": 2.374685525894165, + "learning_rate": 1.608088589311507e-07, + "loss": 1.2206, + "step": 335 + }, + { + "epoch": 0.008188627441535609, + "grad_norm": 2.1852266788482666, + "learning_rate": 1.6321617717862303e-07, + "loss": 1.165, + "step": 340 + }, + { + "epoch": 0.008309048433322896, + "grad_norm": 2.137631416320801, + "learning_rate": 1.6562349542609534e-07, + "loss": 1.177, + "step": 345 + }, + { + "epoch": 0.008429469425110185, + "grad_norm": 2.00748348236084, + "learning_rate": 1.6803081367356763e-07, + "loss": 1.0761, + "step": 350 + }, + { + "epoch": 0.008549890416897474, + "grad_norm": 1.9672129154205322, + "learning_rate": 1.7043813192103994e-07, + "loss": 1.1491, + "step": 355 + }, + { + "epoch": 0.008670311408684761, + "grad_norm": 2.3764731884002686, + "learning_rate": 1.7284545016851226e-07, + "loss": 1.2437, + "step": 360 + }, + { + "epoch": 0.00879073240047205, + "grad_norm": 2.0609636306762695, + "learning_rate": 1.7525276841598457e-07, + "loss": 1.1461, + "step": 365 + }, + { + "epoch": 0.00891115339225934, + "grad_norm": 1.9100985527038574, + "learning_rate": 1.7766008666345688e-07, + "loss": 1.1708, + "step": 370 + }, + { + "epoch": 0.009031574384046627, + "grad_norm": 1.9871859550476074, + "learning_rate": 1.8006740491092923e-07, + "loss": 1.1183, + "step": 375 + }, + { + "epoch": 0.009151995375833916, + "grad_norm": 2.0929617881774902, + "learning_rate": 1.8247472315840154e-07, + "loss": 1.2154, + "step": 380 + }, + { + "epoch": 0.009272416367621203, + "grad_norm": 2.0678470134735107, + "learning_rate": 1.8488204140587385e-07, + "loss": 1.1872, + "step": 385 + }, + { + "epoch": 0.009392837359408492, + "grad_norm": 2.0919220447540283, + "learning_rate": 1.8728935965334617e-07, + "loss": 1.1782, + "step": 390 + }, + { + "epoch": 0.009513258351195781, + "grad_norm": 1.8138477802276611, + "learning_rate": 1.8969667790081848e-07, + "loss": 1.1245, + "step": 395 + }, + { + "epoch": 0.009633679342983068, + "grad_norm": 2.0728824138641357, + "learning_rate": 1.921039961482908e-07, + "loss": 1.1498, + "step": 400 + }, + { + "epoch": 0.009754100334770357, + "grad_norm": 2.1343467235565186, + "learning_rate": 1.945113143957631e-07, + "loss": 1.2057, + "step": 405 + }, + { + "epoch": 0.009874521326557646, + "grad_norm": 1.8334988355636597, + "learning_rate": 1.9691863264323545e-07, + "loss": 1.1354, + "step": 410 + }, + { + "epoch": 0.009994942318344933, + "grad_norm": 1.8179374933242798, + "learning_rate": 1.9932595089070774e-07, + "loss": 1.172, + "step": 415 + }, + { + "epoch": 0.010115363310132222, + "grad_norm": 1.8935538530349731, + "learning_rate": 2.0173326913818005e-07, + "loss": 1.2059, + "step": 420 + }, + { + "epoch": 0.010235784301919511, + "grad_norm": 2.025412082672119, + "learning_rate": 2.0414058738565237e-07, + "loss": 1.1687, + "step": 425 + }, + { + "epoch": 0.010356205293706799, + "grad_norm": 1.9169843196868896, + "learning_rate": 2.0654790563312468e-07, + "loss": 1.2163, + "step": 430 + }, + { + "epoch": 0.010476626285494088, + "grad_norm": 1.9890685081481934, + "learning_rate": 2.08955223880597e-07, + "loss": 1.1798, + "step": 435 + }, + { + "epoch": 0.010597047277281375, + "grad_norm": 1.7384068965911865, + "learning_rate": 2.113625421280693e-07, + "loss": 1.103, + "step": 440 + }, + { + "epoch": 0.010717468269068664, + "grad_norm": 1.8030136823654175, + "learning_rate": 2.1376986037554165e-07, + "loss": 1.2152, + "step": 445 + }, + { + "epoch": 0.010837889260855953, + "grad_norm": 1.7656060457229614, + "learning_rate": 2.1617717862301396e-07, + "loss": 1.1988, + "step": 450 + }, + { + "epoch": 0.01095831025264324, + "grad_norm": 2.1653716564178467, + "learning_rate": 2.1858449687048628e-07, + "loss": 1.1923, + "step": 455 + }, + { + "epoch": 0.01107873124443053, + "grad_norm": 2.042724132537842, + "learning_rate": 2.209918151179586e-07, + "loss": 1.216, + "step": 460 + }, + { + "epoch": 0.011199152236217818, + "grad_norm": 1.65592360496521, + "learning_rate": 2.233991333654309e-07, + "loss": 1.1506, + "step": 465 + }, + { + "epoch": 0.011319573228005106, + "grad_norm": 1.6511238813400269, + "learning_rate": 2.2580645161290322e-07, + "loss": 1.1547, + "step": 470 + }, + { + "epoch": 0.011439994219792395, + "grad_norm": 1.8964343070983887, + "learning_rate": 2.2821376986037553e-07, + "loss": 1.1387, + "step": 475 + }, + { + "epoch": 0.011560415211579682, + "grad_norm": 1.615464210510254, + "learning_rate": 2.3062108810784782e-07, + "loss": 1.1492, + "step": 480 + }, + { + "epoch": 0.01168083620336697, + "grad_norm": 2.1037256717681885, + "learning_rate": 2.3302840635532016e-07, + "loss": 1.1664, + "step": 485 + }, + { + "epoch": 0.01180125719515426, + "grad_norm": 1.4680767059326172, + "learning_rate": 2.3543572460279248e-07, + "loss": 1.1456, + "step": 490 + }, + { + "epoch": 0.011921678186941547, + "grad_norm": 1.7871757745742798, + "learning_rate": 2.378430428502648e-07, + "loss": 1.1777, + "step": 495 + }, + { + "epoch": 0.012042099178728836, + "grad_norm": 1.7084572315216064, + "learning_rate": 2.402503610977371e-07, + "loss": 1.1512, + "step": 500 + }, + { + "epoch": 0.012162520170516125, + "grad_norm": 1.815544605255127, + "learning_rate": 2.426576793452094e-07, + "loss": 1.153, + "step": 505 + }, + { + "epoch": 0.012282941162303412, + "grad_norm": 1.6143004894256592, + "learning_rate": 2.4506499759268173e-07, + "loss": 1.131, + "step": 510 + }, + { + "epoch": 0.012403362154090701, + "grad_norm": 2.2032082080841064, + "learning_rate": 2.4747231584015405e-07, + "loss": 1.1916, + "step": 515 + }, + { + "epoch": 0.012523783145877989, + "grad_norm": 1.9207594394683838, + "learning_rate": 2.4987963408762636e-07, + "loss": 1.1398, + "step": 520 + }, + { + "epoch": 0.012644204137665278, + "grad_norm": 1.954811930656433, + "learning_rate": 2.522869523350987e-07, + "loss": 1.162, + "step": 525 + }, + { + "epoch": 0.012764625129452567, + "grad_norm": 1.676818609237671, + "learning_rate": 2.54694270582571e-07, + "loss": 1.1526, + "step": 530 + }, + { + "epoch": 0.012885046121239854, + "grad_norm": 1.7171730995178223, + "learning_rate": 2.5710158883004335e-07, + "loss": 1.1389, + "step": 535 + }, + { + "epoch": 0.013005467113027143, + "grad_norm": 1.7267348766326904, + "learning_rate": 2.5950890707751567e-07, + "loss": 1.1892, + "step": 540 + }, + { + "epoch": 0.013125888104814432, + "grad_norm": 1.7431941032409668, + "learning_rate": 2.61916225324988e-07, + "loss": 1.0527, + "step": 545 + }, + { + "epoch": 0.01324630909660172, + "grad_norm": 1.5393019914627075, + "learning_rate": 2.643235435724603e-07, + "loss": 1.1091, + "step": 550 + }, + { + "epoch": 0.013366730088389008, + "grad_norm": 1.6455602645874023, + "learning_rate": 2.667308618199326e-07, + "loss": 1.108, + "step": 555 + }, + { + "epoch": 0.013487151080176297, + "grad_norm": 1.696184754371643, + "learning_rate": 2.691381800674049e-07, + "loss": 1.1133, + "step": 560 + }, + { + "epoch": 0.013607572071963584, + "grad_norm": 1.7459622621536255, + "learning_rate": 2.7154549831487724e-07, + "loss": 1.1074, + "step": 565 + }, + { + "epoch": 0.013727993063750873, + "grad_norm": 1.5549553632736206, + "learning_rate": 2.7395281656234955e-07, + "loss": 1.1855, + "step": 570 + }, + { + "epoch": 0.01384841405553816, + "grad_norm": 1.6191879510879517, + "learning_rate": 2.7636013480982187e-07, + "loss": 1.075, + "step": 575 + }, + { + "epoch": 0.01396883504732545, + "grad_norm": 1.6304470300674438, + "learning_rate": 2.787674530572942e-07, + "loss": 1.1123, + "step": 580 + }, + { + "epoch": 0.014089256039112739, + "grad_norm": 1.7658319473266602, + "learning_rate": 2.8117477130476644e-07, + "loss": 1.1768, + "step": 585 + }, + { + "epoch": 0.014209677030900026, + "grad_norm": 1.5940040349960327, + "learning_rate": 2.8358208955223876e-07, + "loss": 1.1377, + "step": 590 + }, + { + "epoch": 0.014330098022687315, + "grad_norm": 1.5074690580368042, + "learning_rate": 2.8598940779971107e-07, + "loss": 1.1538, + "step": 595 + }, + { + "epoch": 0.014450519014474604, + "grad_norm": 1.7436341047286987, + "learning_rate": 2.883967260471834e-07, + "loss": 1.0854, + "step": 600 + }, + { + "epoch": 0.014570940006261891, + "grad_norm": 1.6860382556915283, + "learning_rate": 2.908040442946557e-07, + "loss": 1.1191, + "step": 605 + }, + { + "epoch": 0.01469136099804918, + "grad_norm": 1.799471378326416, + "learning_rate": 2.9321136254212807e-07, + "loss": 1.1518, + "step": 610 + }, + { + "epoch": 0.014811781989836468, + "grad_norm": 1.7893131971359253, + "learning_rate": 2.956186807896004e-07, + "loss": 1.1027, + "step": 615 + }, + { + "epoch": 0.014932202981623757, + "grad_norm": 1.6366047859191895, + "learning_rate": 2.980259990370727e-07, + "loss": 1.1171, + "step": 620 + }, + { + "epoch": 0.015052623973411046, + "grad_norm": 1.6394248008728027, + "learning_rate": 3.00433317284545e-07, + "loss": 1.101, + "step": 625 + }, + { + "epoch": 0.015173044965198333, + "grad_norm": 1.658291220664978, + "learning_rate": 3.028406355320173e-07, + "loss": 1.144, + "step": 630 + }, + { + "epoch": 0.015293465956985622, + "grad_norm": 1.7101342678070068, + "learning_rate": 3.0524795377948964e-07, + "loss": 1.0963, + "step": 635 + }, + { + "epoch": 0.01541388694877291, + "grad_norm": 2.0829336643218994, + "learning_rate": 3.0765527202696195e-07, + "loss": 1.1316, + "step": 640 + }, + { + "epoch": 0.015534307940560198, + "grad_norm": 1.643818736076355, + "learning_rate": 3.1006259027443426e-07, + "loss": 1.1402, + "step": 645 + }, + { + "epoch": 0.015654728932347487, + "grad_norm": 1.5390375852584839, + "learning_rate": 3.124699085219066e-07, + "loss": 1.1286, + "step": 650 + }, + { + "epoch": 0.015775149924134774, + "grad_norm": 1.5295696258544922, + "learning_rate": 3.148772267693789e-07, + "loss": 1.0913, + "step": 655 + }, + { + "epoch": 0.015895570915922065, + "grad_norm": 1.7846722602844238, + "learning_rate": 3.172845450168512e-07, + "loss": 1.1475, + "step": 660 + }, + { + "epoch": 0.016015991907709352, + "grad_norm": 1.5267643928527832, + "learning_rate": 3.196918632643235e-07, + "loss": 1.147, + "step": 665 + }, + { + "epoch": 0.01613641289949664, + "grad_norm": 1.5753979682922363, + "learning_rate": 3.2209918151179583e-07, + "loss": 1.1521, + "step": 670 + }, + { + "epoch": 0.016256833891283927, + "grad_norm": 1.597781777381897, + "learning_rate": 3.2450649975926815e-07, + "loss": 1.1069, + "step": 675 + }, + { + "epoch": 0.016377254883071218, + "grad_norm": 1.9727338552474976, + "learning_rate": 3.2691381800674046e-07, + "loss": 1.1125, + "step": 680 + }, + { + "epoch": 0.016497675874858505, + "grad_norm": 1.6481128931045532, + "learning_rate": 3.2932113625421283e-07, + "loss": 1.1188, + "step": 685 + }, + { + "epoch": 0.016618096866645792, + "grad_norm": 1.6717129945755005, + "learning_rate": 3.3172845450168514e-07, + "loss": 1.1382, + "step": 690 + }, + { + "epoch": 0.016738517858433083, + "grad_norm": 1.7000445127487183, + "learning_rate": 3.3413577274915746e-07, + "loss": 1.0947, + "step": 695 + }, + { + "epoch": 0.01685893885022037, + "grad_norm": 1.8228042125701904, + "learning_rate": 3.3654309099662977e-07, + "loss": 1.109, + "step": 700 + }, + { + "epoch": 0.016979359842007657, + "grad_norm": 1.5345516204833984, + "learning_rate": 3.389504092441021e-07, + "loss": 1.1233, + "step": 705 + }, + { + "epoch": 0.017099780833794948, + "grad_norm": 1.7012462615966797, + "learning_rate": 3.413577274915744e-07, + "loss": 1.091, + "step": 710 + }, + { + "epoch": 0.017220201825582235, + "grad_norm": 1.6733700037002563, + "learning_rate": 3.437650457390467e-07, + "loss": 1.1274, + "step": 715 + }, + { + "epoch": 0.017340622817369523, + "grad_norm": 1.6725404262542725, + "learning_rate": 3.46172363986519e-07, + "loss": 1.1021, + "step": 720 + }, + { + "epoch": 0.017461043809156813, + "grad_norm": 1.902208924293518, + "learning_rate": 3.485796822339913e-07, + "loss": 1.1609, + "step": 725 + }, + { + "epoch": 0.0175814648009441, + "grad_norm": 1.8290274143218994, + "learning_rate": 3.509870004814636e-07, + "loss": 1.0894, + "step": 730 + }, + { + "epoch": 0.017701885792731388, + "grad_norm": 1.551406979560852, + "learning_rate": 3.533943187289359e-07, + "loss": 1.0974, + "step": 735 + }, + { + "epoch": 0.01782230678451868, + "grad_norm": 1.6504721641540527, + "learning_rate": 3.5580163697640823e-07, + "loss": 1.0495, + "step": 740 + }, + { + "epoch": 0.017942727776305966, + "grad_norm": 1.7599427700042725, + "learning_rate": 3.5820895522388055e-07, + "loss": 1.0976, + "step": 745 + }, + { + "epoch": 0.018063148768093253, + "grad_norm": 1.696821689605713, + "learning_rate": 3.6061627347135286e-07, + "loss": 1.1485, + "step": 750 + }, + { + "epoch": 0.018183569759880544, + "grad_norm": 1.7811964750289917, + "learning_rate": 3.6302359171882523e-07, + "loss": 1.0991, + "step": 755 + }, + { + "epoch": 0.01830399075166783, + "grad_norm": 1.7591949701309204, + "learning_rate": 3.6543090996629754e-07, + "loss": 1.1346, + "step": 760 + }, + { + "epoch": 0.01842441174345512, + "grad_norm": 1.4767632484436035, + "learning_rate": 3.6783822821376986e-07, + "loss": 1.0937, + "step": 765 + }, + { + "epoch": 0.018544832735242406, + "grad_norm": 1.8599060773849487, + "learning_rate": 3.7024554646124217e-07, + "loss": 1.0742, + "step": 770 + }, + { + "epoch": 0.018665253727029697, + "grad_norm": 1.5535491704940796, + "learning_rate": 3.726528647087145e-07, + "loss": 1.1141, + "step": 775 + }, + { + "epoch": 0.018785674718816984, + "grad_norm": 1.7345106601715088, + "learning_rate": 3.750601829561868e-07, + "loss": 1.0477, + "step": 780 + }, + { + "epoch": 0.01890609571060427, + "grad_norm": 1.6146178245544434, + "learning_rate": 3.774675012036591e-07, + "loss": 1.0765, + "step": 785 + }, + { + "epoch": 0.019026516702391562, + "grad_norm": 1.61898672580719, + "learning_rate": 3.798748194511314e-07, + "loss": 1.0599, + "step": 790 + }, + { + "epoch": 0.01914693769417885, + "grad_norm": 1.8047658205032349, + "learning_rate": 3.8228213769860374e-07, + "loss": 1.1565, + "step": 795 + }, + { + "epoch": 0.019267358685966136, + "grad_norm": 1.7502000331878662, + "learning_rate": 3.8468945594607605e-07, + "loss": 1.1029, + "step": 800 + }, + { + "epoch": 0.019387779677753427, + "grad_norm": 1.7901040315628052, + "learning_rate": 3.8709677419354837e-07, + "loss": 1.0321, + "step": 805 + }, + { + "epoch": 0.019508200669540714, + "grad_norm": 1.710308313369751, + "learning_rate": 3.895040924410207e-07, + "loss": 1.1007, + "step": 810 + }, + { + "epoch": 0.019628621661328, + "grad_norm": 1.6469848155975342, + "learning_rate": 3.91911410688493e-07, + "loss": 1.0919, + "step": 815 + }, + { + "epoch": 0.019749042653115292, + "grad_norm": 1.9024361371994019, + "learning_rate": 3.943187289359653e-07, + "loss": 1.1408, + "step": 820 + }, + { + "epoch": 0.01986946364490258, + "grad_norm": 1.4699581861495972, + "learning_rate": 3.967260471834376e-07, + "loss": 1.0799, + "step": 825 + }, + { + "epoch": 0.019989884636689867, + "grad_norm": 1.7625906467437744, + "learning_rate": 3.9913336543091e-07, + "loss": 1.0956, + "step": 830 + }, + { + "epoch": 0.020110305628477158, + "grad_norm": 1.45587158203125, + "learning_rate": 4.015406836783823e-07, + "loss": 1.1026, + "step": 835 + }, + { + "epoch": 0.020230726620264445, + "grad_norm": 1.5896087884902954, + "learning_rate": 4.039480019258546e-07, + "loss": 1.1299, + "step": 840 + }, + { + "epoch": 0.020351147612051732, + "grad_norm": 1.5886921882629395, + "learning_rate": 4.0635532017332693e-07, + "loss": 1.0504, + "step": 845 + }, + { + "epoch": 0.020471568603839023, + "grad_norm": 1.5642247200012207, + "learning_rate": 4.0876263842079925e-07, + "loss": 1.0425, + "step": 850 + }, + { + "epoch": 0.02059198959562631, + "grad_norm": 1.7490164041519165, + "learning_rate": 4.111699566682715e-07, + "loss": 1.047, + "step": 855 + }, + { + "epoch": 0.020712410587413597, + "grad_norm": 1.6565054655075073, + "learning_rate": 4.135772749157438e-07, + "loss": 1.0785, + "step": 860 + }, + { + "epoch": 0.020832831579200885, + "grad_norm": 1.748234510421753, + "learning_rate": 4.1598459316321614e-07, + "loss": 1.09, + "step": 865 + }, + { + "epoch": 0.020953252570988175, + "grad_norm": 1.5565547943115234, + "learning_rate": 4.1839191141068845e-07, + "loss": 1.043, + "step": 870 + }, + { + "epoch": 0.021073673562775463, + "grad_norm": 1.482420802116394, + "learning_rate": 4.2079922965816076e-07, + "loss": 1.0658, + "step": 875 + }, + { + "epoch": 0.02119409455456275, + "grad_norm": 1.6103140115737915, + "learning_rate": 4.232065479056331e-07, + "loss": 1.0916, + "step": 880 + }, + { + "epoch": 0.02131451554635004, + "grad_norm": 1.6495821475982666, + "learning_rate": 4.256138661531054e-07, + "loss": 1.0948, + "step": 885 + }, + { + "epoch": 0.021434936538137328, + "grad_norm": 1.6755701303482056, + "learning_rate": 4.280211844005777e-07, + "loss": 1.0383, + "step": 890 + }, + { + "epoch": 0.021555357529924615, + "grad_norm": 1.5431663990020752, + "learning_rate": 4.3042850264805e-07, + "loss": 1.1197, + "step": 895 + }, + { + "epoch": 0.021675778521711906, + "grad_norm": 1.7741978168487549, + "learning_rate": 4.3283582089552234e-07, + "loss": 1.0617, + "step": 900 + }, + { + "epoch": 0.021796199513499193, + "grad_norm": 1.4438221454620361, + "learning_rate": 4.352431391429947e-07, + "loss": 1.0777, + "step": 905 + }, + { + "epoch": 0.02191662050528648, + "grad_norm": 1.625065565109253, + "learning_rate": 4.37650457390467e-07, + "loss": 1.01, + "step": 910 + }, + { + "epoch": 0.02203704149707377, + "grad_norm": 1.5660688877105713, + "learning_rate": 4.4005777563793933e-07, + "loss": 1.0914, + "step": 915 + }, + { + "epoch": 0.02215746248886106, + "grad_norm": 1.5976072549819946, + "learning_rate": 4.4246509388541164e-07, + "loss": 1.0577, + "step": 920 + }, + { + "epoch": 0.022277883480648346, + "grad_norm": 1.7111833095550537, + "learning_rate": 4.4487241213288396e-07, + "loss": 1.0814, + "step": 925 + }, + { + "epoch": 0.022398304472435637, + "grad_norm": 1.7777079343795776, + "learning_rate": 4.4727973038035627e-07, + "loss": 1.0339, + "step": 930 + }, + { + "epoch": 0.022518725464222924, + "grad_norm": 1.6317858695983887, + "learning_rate": 4.496870486278286e-07, + "loss": 1.0387, + "step": 935 + }, + { + "epoch": 0.02263914645601021, + "grad_norm": 1.6453614234924316, + "learning_rate": 4.520943668753009e-07, + "loss": 1.078, + "step": 940 + }, + { + "epoch": 0.0227595674477975, + "grad_norm": 1.5312925577163696, + "learning_rate": 4.545016851227732e-07, + "loss": 1.0616, + "step": 945 + }, + { + "epoch": 0.02287998843958479, + "grad_norm": 1.4723235368728638, + "learning_rate": 4.5690900337024553e-07, + "loss": 1.054, + "step": 950 + }, + { + "epoch": 0.023000409431372076, + "grad_norm": 1.6957050561904907, + "learning_rate": 4.5931632161771784e-07, + "loss": 1.0612, + "step": 955 + }, + { + "epoch": 0.023120830423159364, + "grad_norm": 1.5759074687957764, + "learning_rate": 4.6172363986519016e-07, + "loss": 1.005, + "step": 960 + }, + { + "epoch": 0.023241251414946654, + "grad_norm": 1.7994741201400757, + "learning_rate": 4.6413095811266247e-07, + "loss": 1.115, + "step": 965 + }, + { + "epoch": 0.02336167240673394, + "grad_norm": 1.5642887353897095, + "learning_rate": 4.665382763601348e-07, + "loss": 1.1111, + "step": 970 + }, + { + "epoch": 0.02348209339852123, + "grad_norm": 1.8060336112976074, + "learning_rate": 4.6894559460760715e-07, + "loss": 1.0129, + "step": 975 + }, + { + "epoch": 0.02360251439030852, + "grad_norm": 1.6655142307281494, + "learning_rate": 4.7135291285507947e-07, + "loss": 1.0414, + "step": 980 + }, + { + "epoch": 0.023722935382095807, + "grad_norm": 1.7621194124221802, + "learning_rate": 4.737602311025518e-07, + "loss": 1.1035, + "step": 985 + }, + { + "epoch": 0.023843356373883094, + "grad_norm": 1.5201829671859741, + "learning_rate": 4.7616754935002404e-07, + "loss": 1.0852, + "step": 990 + }, + { + "epoch": 0.023963777365670385, + "grad_norm": 1.6027559041976929, + "learning_rate": 4.785748675974964e-07, + "loss": 1.0891, + "step": 995 + }, + { + "epoch": 0.024084198357457672, + "grad_norm": 1.7190868854522705, + "learning_rate": 4.809821858449687e-07, + "loss": 1.092, + "step": 1000 + }, + { + "epoch": 0.02420461934924496, + "grad_norm": 1.8872677087783813, + "learning_rate": 4.83389504092441e-07, + "loss": 1.0614, + "step": 1005 + }, + { + "epoch": 0.02432504034103225, + "grad_norm": 1.4623500108718872, + "learning_rate": 4.857968223399134e-07, + "loss": 1.0281, + "step": 1010 + }, + { + "epoch": 0.024445461332819537, + "grad_norm": 1.6613128185272217, + "learning_rate": 4.882041405873856e-07, + "loss": 1.0937, + "step": 1015 + }, + { + "epoch": 0.024565882324606825, + "grad_norm": 1.533061146736145, + "learning_rate": 4.90611458834858e-07, + "loss": 1.0469, + "step": 1020 + }, + { + "epoch": 0.024686303316394115, + "grad_norm": 1.6154255867004395, + "learning_rate": 4.930187770823302e-07, + "loss": 1.0317, + "step": 1025 + }, + { + "epoch": 0.024806724308181403, + "grad_norm": 2.1915018558502197, + "learning_rate": 4.954260953298026e-07, + "loss": 1.0589, + "step": 1030 + }, + { + "epoch": 0.02492714529996869, + "grad_norm": 1.678594708442688, + "learning_rate": 4.978334135772749e-07, + "loss": 0.9996, + "step": 1035 + }, + { + "epoch": 0.025047566291755977, + "grad_norm": 2.124476909637451, + "learning_rate": 5.002407318247472e-07, + "loss": 1.0824, + "step": 1040 + }, + { + "epoch": 0.025167987283543268, + "grad_norm": 1.5182719230651855, + "learning_rate": 5.026480500722196e-07, + "loss": 1.0825, + "step": 1045 + }, + { + "epoch": 0.025288408275330555, + "grad_norm": 2.0846803188323975, + "learning_rate": 5.050553683196919e-07, + "loss": 1.037, + "step": 1050 + }, + { + "epoch": 0.025408829267117843, + "grad_norm": 1.5982701778411865, + "learning_rate": 5.074626865671642e-07, + "loss": 0.988, + "step": 1055 + }, + { + "epoch": 0.025529250258905133, + "grad_norm": 1.4213273525238037, + "learning_rate": 5.098700048146365e-07, + "loss": 1.0748, + "step": 1060 + }, + { + "epoch": 0.02564967125069242, + "grad_norm": 1.5166436433792114, + "learning_rate": 5.122773230621088e-07, + "loss": 1.0678, + "step": 1065 + }, + { + "epoch": 0.025770092242479708, + "grad_norm": 1.5300348997116089, + "learning_rate": 5.146846413095811e-07, + "loss": 1.0731, + "step": 1070 + }, + { + "epoch": 0.025890513234267, + "grad_norm": 1.6487699747085571, + "learning_rate": 5.170919595570534e-07, + "loss": 1.0892, + "step": 1075 + }, + { + "epoch": 0.026010934226054286, + "grad_norm": 1.641617774963379, + "learning_rate": 5.194992778045257e-07, + "loss": 1.0668, + "step": 1080 + }, + { + "epoch": 0.026131355217841573, + "grad_norm": 1.632634162902832, + "learning_rate": 5.21906596051998e-07, + "loss": 1.1131, + "step": 1085 + }, + { + "epoch": 0.026251776209628864, + "grad_norm": 1.6787406206130981, + "learning_rate": 5.243139142994704e-07, + "loss": 1.0292, + "step": 1090 + }, + { + "epoch": 0.02637219720141615, + "grad_norm": 1.5313917398452759, + "learning_rate": 5.267212325469426e-07, + "loss": 1.0467, + "step": 1095 + }, + { + "epoch": 0.02649261819320344, + "grad_norm": 1.5146945714950562, + "learning_rate": 5.29128550794415e-07, + "loss": 1.097, + "step": 1100 + }, + { + "epoch": 0.02661303918499073, + "grad_norm": 1.5477287769317627, + "learning_rate": 5.315358690418873e-07, + "loss": 1.0578, + "step": 1105 + }, + { + "epoch": 0.026733460176778016, + "grad_norm": 1.5698777437210083, + "learning_rate": 5.339431872893596e-07, + "loss": 1.0324, + "step": 1110 + }, + { + "epoch": 0.026853881168565304, + "grad_norm": 1.6078376770019531, + "learning_rate": 5.363505055368319e-07, + "loss": 1.0931, + "step": 1115 + }, + { + "epoch": 0.026974302160352594, + "grad_norm": 1.6052172183990479, + "learning_rate": 5.387578237843043e-07, + "loss": 0.9986, + "step": 1120 + }, + { + "epoch": 0.02709472315213988, + "grad_norm": 1.492550015449524, + "learning_rate": 5.411651420317765e-07, + "loss": 1.0325, + "step": 1125 + }, + { + "epoch": 0.02721514414392717, + "grad_norm": 1.6259959936141968, + "learning_rate": 5.435724602792489e-07, + "loss": 1.0123, + "step": 1130 + }, + { + "epoch": 0.027335565135714456, + "grad_norm": 1.3507508039474487, + "learning_rate": 5.459797785267211e-07, + "loss": 1.0907, + "step": 1135 + }, + { + "epoch": 0.027455986127501747, + "grad_norm": 1.743194818496704, + "learning_rate": 5.483870967741935e-07, + "loss": 1.0432, + "step": 1140 + }, + { + "epoch": 0.027576407119289034, + "grad_norm": 1.5899051427841187, + "learning_rate": 5.507944150216658e-07, + "loss": 1.0519, + "step": 1145 + }, + { + "epoch": 0.02769682811107632, + "grad_norm": 1.5947670936584473, + "learning_rate": 5.532017332691381e-07, + "loss": 1.1119, + "step": 1150 + }, + { + "epoch": 0.027817249102863612, + "grad_norm": 1.6205607652664185, + "learning_rate": 5.556090515166104e-07, + "loss": 1.0232, + "step": 1155 + }, + { + "epoch": 0.0279376700946509, + "grad_norm": 1.6683999300003052, + "learning_rate": 5.580163697640829e-07, + "loss": 1.0367, + "step": 1160 + }, + { + "epoch": 0.028058091086438187, + "grad_norm": 1.691907525062561, + "learning_rate": 5.604236880115551e-07, + "loss": 1.0982, + "step": 1165 + }, + { + "epoch": 0.028178512078225477, + "grad_norm": 1.7351024150848389, + "learning_rate": 5.628310062590275e-07, + "loss": 1.0043, + "step": 1170 + }, + { + "epoch": 0.028298933070012765, + "grad_norm": 1.624670386314392, + "learning_rate": 5.652383245064998e-07, + "loss": 1.0359, + "step": 1175 + }, + { + "epoch": 0.028419354061800052, + "grad_norm": 1.611996054649353, + "learning_rate": 5.676456427539721e-07, + "loss": 1.0703, + "step": 1180 + }, + { + "epoch": 0.028539775053587343, + "grad_norm": 1.5940327644348145, + "learning_rate": 5.700529610014444e-07, + "loss": 1.0034, + "step": 1185 + }, + { + "epoch": 0.02866019604537463, + "grad_norm": 1.5508549213409424, + "learning_rate": 5.724602792489168e-07, + "loss": 0.9925, + "step": 1190 + }, + { + "epoch": 0.028780617037161917, + "grad_norm": 1.7031129598617554, + "learning_rate": 5.74867597496389e-07, + "loss": 1.0722, + "step": 1195 + }, + { + "epoch": 0.028901038028949208, + "grad_norm": 1.542175531387329, + "learning_rate": 5.772749157438613e-07, + "loss": 1.0558, + "step": 1200 + }, + { + "epoch": 0.029021459020736495, + "grad_norm": 1.6221057176589966, + "learning_rate": 5.796822339913337e-07, + "loss": 0.9962, + "step": 1205 + }, + { + "epoch": 0.029141880012523783, + "grad_norm": 1.7214235067367554, + "learning_rate": 5.820895522388059e-07, + "loss": 1.0642, + "step": 1210 + }, + { + "epoch": 0.02926230100431107, + "grad_norm": 1.7093576192855835, + "learning_rate": 5.844968704862783e-07, + "loss": 1.0311, + "step": 1215 + }, + { + "epoch": 0.02938272199609836, + "grad_norm": 1.5216920375823975, + "learning_rate": 5.869041887337505e-07, + "loss": 1.0486, + "step": 1220 + }, + { + "epoch": 0.029503142987885648, + "grad_norm": 1.604498028755188, + "learning_rate": 5.893115069812229e-07, + "loss": 1.0437, + "step": 1225 + }, + { + "epoch": 0.029623563979672935, + "grad_norm": 1.562648057937622, + "learning_rate": 5.917188252286952e-07, + "loss": 1.1011, + "step": 1230 + }, + { + "epoch": 0.029743984971460226, + "grad_norm": 1.6788482666015625, + "learning_rate": 5.941261434761675e-07, + "loss": 1.0703, + "step": 1235 + }, + { + "epoch": 0.029864405963247513, + "grad_norm": 1.5976691246032715, + "learning_rate": 5.965334617236398e-07, + "loss": 1.0202, + "step": 1240 + }, + { + "epoch": 0.0299848269550348, + "grad_norm": 1.937235713005066, + "learning_rate": 5.989407799711122e-07, + "loss": 1.0541, + "step": 1245 + }, + { + "epoch": 0.03010524794682209, + "grad_norm": 1.5281552076339722, + "learning_rate": 6.013480982185844e-07, + "loss": 1.0317, + "step": 1250 + }, + { + "epoch": 0.03022566893860938, + "grad_norm": 1.8217495679855347, + "learning_rate": 6.037554164660568e-07, + "loss": 1.0604, + "step": 1255 + }, + { + "epoch": 0.030346089930396666, + "grad_norm": 1.394348382949829, + "learning_rate": 6.061627347135291e-07, + "loss": 1.0598, + "step": 1260 + }, + { + "epoch": 0.030466510922183956, + "grad_norm": 1.6202285289764404, + "learning_rate": 6.085700529610014e-07, + "loss": 1.0583, + "step": 1265 + }, + { + "epoch": 0.030586931913971244, + "grad_norm": 1.7116400003433228, + "learning_rate": 6.109773712084737e-07, + "loss": 1.0783, + "step": 1270 + }, + { + "epoch": 0.03070735290575853, + "grad_norm": 1.7912710905075073, + "learning_rate": 6.13384689455946e-07, + "loss": 1.0799, + "step": 1275 + }, + { + "epoch": 0.03082777389754582, + "grad_norm": 1.6622925996780396, + "learning_rate": 6.157920077034183e-07, + "loss": 1.0426, + "step": 1280 + }, + { + "epoch": 0.03094819488933311, + "grad_norm": 1.9057449102401733, + "learning_rate": 6.181993259508907e-07, + "loss": 1.064, + "step": 1285 + }, + { + "epoch": 0.031068615881120396, + "grad_norm": 1.7786000967025757, + "learning_rate": 6.206066441983629e-07, + "loss": 1.062, + "step": 1290 + }, + { + "epoch": 0.031189036872907687, + "grad_norm": 1.4491337537765503, + "learning_rate": 6.230139624458353e-07, + "loss": 1.0659, + "step": 1295 + }, + { + "epoch": 0.031309457864694974, + "grad_norm": 1.662503957748413, + "learning_rate": 6.254212806933076e-07, + "loss": 1.0827, + "step": 1300 + }, + { + "epoch": 0.03142987885648226, + "grad_norm": 1.7556451559066772, + "learning_rate": 6.2782859894078e-07, + "loss": 1.0, + "step": 1305 + }, + { + "epoch": 0.03155029984826955, + "grad_norm": 1.7393630743026733, + "learning_rate": 6.302359171882523e-07, + "loss": 1.046, + "step": 1310 + }, + { + "epoch": 0.031670720840056836, + "grad_norm": 1.614888310432434, + "learning_rate": 6.326432354357247e-07, + "loss": 1.04, + "step": 1315 + }, + { + "epoch": 0.03179114183184413, + "grad_norm": 1.588411808013916, + "learning_rate": 6.350505536831969e-07, + "loss": 1.1276, + "step": 1320 + }, + { + "epoch": 0.03191156282363142, + "grad_norm": 1.5537513494491577, + "learning_rate": 6.374578719306693e-07, + "loss": 1.0437, + "step": 1325 + }, + { + "epoch": 0.032031983815418705, + "grad_norm": 1.452207326889038, + "learning_rate": 6.398651901781416e-07, + "loss": 1.0394, + "step": 1330 + }, + { + "epoch": 0.03215240480720599, + "grad_norm": 1.6652417182922363, + "learning_rate": 6.422725084256138e-07, + "loss": 1.021, + "step": 1335 + }, + { + "epoch": 0.03227282579899328, + "grad_norm": 1.5968703031539917, + "learning_rate": 6.446798266730862e-07, + "loss": 1.0361, + "step": 1340 + }, + { + "epoch": 0.03239324679078057, + "grad_norm": 1.6294167041778564, + "learning_rate": 6.470871449205584e-07, + "loss": 1.0288, + "step": 1345 + }, + { + "epoch": 0.032513667782567854, + "grad_norm": 2.108187675476074, + "learning_rate": 6.494944631680308e-07, + "loss": 1.1072, + "step": 1350 + }, + { + "epoch": 0.03263408877435515, + "grad_norm": 1.5419865846633911, + "learning_rate": 6.519017814155031e-07, + "loss": 0.9997, + "step": 1355 + }, + { + "epoch": 0.032754509766142435, + "grad_norm": 1.8960027694702148, + "learning_rate": 6.543090996629754e-07, + "loss": 1.0657, + "step": 1360 + }, + { + "epoch": 0.03287493075792972, + "grad_norm": 1.8205565214157104, + "learning_rate": 6.567164179104477e-07, + "loss": 1.0377, + "step": 1365 + }, + { + "epoch": 0.03299535174971701, + "grad_norm": 1.4616715908050537, + "learning_rate": 6.591237361579201e-07, + "loss": 1.0416, + "step": 1370 + }, + { + "epoch": 0.0331157727415043, + "grad_norm": 1.6263893842697144, + "learning_rate": 6.615310544053923e-07, + "loss": 1.0303, + "step": 1375 + }, + { + "epoch": 0.033236193733291584, + "grad_norm": 1.5705087184906006, + "learning_rate": 6.639383726528647e-07, + "loss": 1.0414, + "step": 1380 + }, + { + "epoch": 0.03335661472507888, + "grad_norm": 1.559043526649475, + "learning_rate": 6.66345690900337e-07, + "loss": 1.0892, + "step": 1385 + }, + { + "epoch": 0.033477035716866166, + "grad_norm": 1.5846097469329834, + "learning_rate": 6.687530091478093e-07, + "loss": 1.0132, + "step": 1390 + }, + { + "epoch": 0.03359745670865345, + "grad_norm": 1.6560180187225342, + "learning_rate": 6.711603273952816e-07, + "loss": 1.0384, + "step": 1395 + }, + { + "epoch": 0.03371787770044074, + "grad_norm": 1.416507601737976, + "learning_rate": 6.73567645642754e-07, + "loss": 0.9696, + "step": 1400 + }, + { + "epoch": 0.03383829869222803, + "grad_norm": 1.463869333267212, + "learning_rate": 6.759749638902262e-07, + "loss": 1.0099, + "step": 1405 + }, + { + "epoch": 0.033958719684015315, + "grad_norm": 1.4500248432159424, + "learning_rate": 6.783822821376986e-07, + "loss": 1.0786, + "step": 1410 + }, + { + "epoch": 0.03407914067580261, + "grad_norm": 1.8260598182678223, + "learning_rate": 6.807896003851708e-07, + "loss": 1.0813, + "step": 1415 + }, + { + "epoch": 0.034199561667589896, + "grad_norm": 1.6602531671524048, + "learning_rate": 6.831969186326432e-07, + "loss": 0.9867, + "step": 1420 + }, + { + "epoch": 0.034319982659377184, + "grad_norm": 1.6317486763000488, + "learning_rate": 6.856042368801155e-07, + "loss": 0.9797, + "step": 1425 + }, + { + "epoch": 0.03444040365116447, + "grad_norm": 1.5111490488052368, + "learning_rate": 6.880115551275878e-07, + "loss": 1.0456, + "step": 1430 + }, + { + "epoch": 0.03456082464295176, + "grad_norm": 1.5642411708831787, + "learning_rate": 6.904188733750601e-07, + "loss": 0.9404, + "step": 1435 + }, + { + "epoch": 0.034681245634739045, + "grad_norm": 1.9178805351257324, + "learning_rate": 6.928261916225325e-07, + "loss": 1.0056, + "step": 1440 + }, + { + "epoch": 0.03480166662652633, + "grad_norm": 1.6757012605667114, + "learning_rate": 6.952335098700047e-07, + "loss": 1.0829, + "step": 1445 + }, + { + "epoch": 0.03492208761831363, + "grad_norm": 1.7237210273742676, + "learning_rate": 6.976408281174772e-07, + "loss": 1.0273, + "step": 1450 + }, + { + "epoch": 0.035042508610100914, + "grad_norm": 2.0043551921844482, + "learning_rate": 7.000481463649495e-07, + "loss": 1.0238, + "step": 1455 + }, + { + "epoch": 0.0351629296018882, + "grad_norm": 1.5664793252944946, + "learning_rate": 7.024554646124218e-07, + "loss": 0.9788, + "step": 1460 + }, + { + "epoch": 0.03528335059367549, + "grad_norm": 1.595779299736023, + "learning_rate": 7.048627828598941e-07, + "loss": 1.0589, + "step": 1465 + }, + { + "epoch": 0.035403771585462776, + "grad_norm": 1.4906885623931885, + "learning_rate": 7.072701011073664e-07, + "loss": 1.0575, + "step": 1470 + }, + { + "epoch": 0.03552419257725006, + "grad_norm": 1.6824171543121338, + "learning_rate": 7.096774193548387e-07, + "loss": 1.074, + "step": 1475 + }, + { + "epoch": 0.03564461356903736, + "grad_norm": 1.4851865768432617, + "learning_rate": 7.12084737602311e-07, + "loss": 1.0499, + "step": 1480 + }, + { + "epoch": 0.035765034560824645, + "grad_norm": 1.6125143766403198, + "learning_rate": 7.144920558497833e-07, + "loss": 1.0288, + "step": 1485 + }, + { + "epoch": 0.03588545555261193, + "grad_norm": 1.6875344514846802, + "learning_rate": 7.168993740972556e-07, + "loss": 1.0351, + "step": 1490 + }, + { + "epoch": 0.03600587654439922, + "grad_norm": 1.6106306314468384, + "learning_rate": 7.19306692344728e-07, + "loss": 1.0325, + "step": 1495 + }, + { + "epoch": 0.03612629753618651, + "grad_norm": 1.4459638595581055, + "learning_rate": 7.217140105922002e-07, + "loss": 1.0201, + "step": 1500 + }, + { + "epoch": 0.036246718527973794, + "grad_norm": 1.626741647720337, + "learning_rate": 7.241213288396726e-07, + "loss": 1.0355, + "step": 1505 + }, + { + "epoch": 0.03636713951976109, + "grad_norm": 1.4596201181411743, + "learning_rate": 7.265286470871449e-07, + "loss": 1.0033, + "step": 1510 + }, + { + "epoch": 0.036487560511548375, + "grad_norm": 1.3791264295578003, + "learning_rate": 7.289359653346172e-07, + "loss": 1.0504, + "step": 1515 + }, + { + "epoch": 0.03660798150333566, + "grad_norm": 1.399931788444519, + "learning_rate": 7.313432835820895e-07, + "loss": 1.0174, + "step": 1520 + }, + { + "epoch": 0.03672840249512295, + "grad_norm": 1.4853743314743042, + "learning_rate": 7.337506018295619e-07, + "loss": 1.0472, + "step": 1525 + }, + { + "epoch": 0.03684882348691024, + "grad_norm": 1.5996602773666382, + "learning_rate": 7.361579200770341e-07, + "loss": 1.0267, + "step": 1530 + }, + { + "epoch": 0.036969244478697524, + "grad_norm": 1.67296302318573, + "learning_rate": 7.385652383245065e-07, + "loss": 1.0361, + "step": 1535 + }, + { + "epoch": 0.03708966547048481, + "grad_norm": 1.6642286777496338, + "learning_rate": 7.409725565719787e-07, + "loss": 1.0457, + "step": 1540 + }, + { + "epoch": 0.037210086462272106, + "grad_norm": 1.4878441095352173, + "learning_rate": 7.433798748194511e-07, + "loss": 1.0817, + "step": 1545 + }, + { + "epoch": 0.03733050745405939, + "grad_norm": 1.816658854484558, + "learning_rate": 7.457871930669234e-07, + "loss": 1.0329, + "step": 1550 + }, + { + "epoch": 0.03745092844584668, + "grad_norm": 1.480438470840454, + "learning_rate": 7.481945113143957e-07, + "loss": 1.0022, + "step": 1555 + }, + { + "epoch": 0.03757134943763397, + "grad_norm": 1.4642038345336914, + "learning_rate": 7.50601829561868e-07, + "loss": 1.0261, + "step": 1560 + }, + { + "epoch": 0.037691770429421255, + "grad_norm": 1.5570502281188965, + "learning_rate": 7.530091478093404e-07, + "loss": 1.0731, + "step": 1565 + }, + { + "epoch": 0.03781219142120854, + "grad_norm": 1.5301499366760254, + "learning_rate": 7.554164660568126e-07, + "loss": 1.0498, + "step": 1570 + }, + { + "epoch": 0.037932612412995836, + "grad_norm": 1.3333464860916138, + "learning_rate": 7.57823784304285e-07, + "loss": 0.9946, + "step": 1575 + }, + { + "epoch": 0.038053033404783124, + "grad_norm": 1.6789193153381348, + "learning_rate": 7.602311025517573e-07, + "loss": 0.9817, + "step": 1580 + }, + { + "epoch": 0.03817345439657041, + "grad_norm": 1.4923834800720215, + "learning_rate": 7.626384207992296e-07, + "loss": 1.0567, + "step": 1585 + }, + { + "epoch": 0.0382938753883577, + "grad_norm": 1.7668758630752563, + "learning_rate": 7.650457390467019e-07, + "loss": 0.984, + "step": 1590 + }, + { + "epoch": 0.038414296380144985, + "grad_norm": 1.4018738269805908, + "learning_rate": 7.674530572941742e-07, + "loss": 0.9654, + "step": 1595 + }, + { + "epoch": 0.03853471737193227, + "grad_norm": 1.733635425567627, + "learning_rate": 7.698603755416466e-07, + "loss": 1.0245, + "step": 1600 + }, + { + "epoch": 0.03865513836371957, + "grad_norm": 1.5820351839065552, + "learning_rate": 7.722676937891189e-07, + "loss": 1.0502, + "step": 1605 + }, + { + "epoch": 0.038775559355506854, + "grad_norm": 1.7299373149871826, + "learning_rate": 7.746750120365913e-07, + "loss": 1.0013, + "step": 1610 + }, + { + "epoch": 0.03889598034729414, + "grad_norm": 1.5306358337402344, + "learning_rate": 7.770823302840635e-07, + "loss": 1.0434, + "step": 1615 + }, + { + "epoch": 0.03901640133908143, + "grad_norm": 1.6972306966781616, + "learning_rate": 7.794896485315359e-07, + "loss": 1.0443, + "step": 1620 + }, + { + "epoch": 0.039136822330868716, + "grad_norm": 1.8630884885787964, + "learning_rate": 7.818969667790081e-07, + "loss": 0.9966, + "step": 1625 + }, + { + "epoch": 0.039257243322656, + "grad_norm": 1.4289274215698242, + "learning_rate": 7.843042850264805e-07, + "loss": 1.0051, + "step": 1630 + }, + { + "epoch": 0.03937766431444329, + "grad_norm": 1.3741828203201294, + "learning_rate": 7.867116032739528e-07, + "loss": 0.9737, + "step": 1635 + }, + { + "epoch": 0.039498085306230585, + "grad_norm": 1.6088579893112183, + "learning_rate": 7.891189215214251e-07, + "loss": 1.0336, + "step": 1640 + }, + { + "epoch": 0.03961850629801787, + "grad_norm": 1.3409250974655151, + "learning_rate": 7.915262397688974e-07, + "loss": 1.0477, + "step": 1645 + }, + { + "epoch": 0.03973892728980516, + "grad_norm": 1.3570778369903564, + "learning_rate": 7.939335580163698e-07, + "loss": 0.9611, + "step": 1650 + }, + { + "epoch": 0.03985934828159245, + "grad_norm": 1.4607723951339722, + "learning_rate": 7.96340876263842e-07, + "loss": 1.0476, + "step": 1655 + }, + { + "epoch": 0.039979769273379734, + "grad_norm": 1.4103513956069946, + "learning_rate": 7.987481945113144e-07, + "loss": 1.0038, + "step": 1660 + }, + { + "epoch": 0.04010019026516702, + "grad_norm": 1.7692028284072876, + "learning_rate": 8.011555127587867e-07, + "loss": 1.0486, + "step": 1665 + }, + { + "epoch": 0.040220611256954315, + "grad_norm": 1.7130166292190552, + "learning_rate": 8.03562831006259e-07, + "loss": 1.0355, + "step": 1670 + }, + { + "epoch": 0.0403410322487416, + "grad_norm": 1.7267684936523438, + "learning_rate": 8.059701492537313e-07, + "loss": 0.9746, + "step": 1675 + }, + { + "epoch": 0.04046145324052889, + "grad_norm": 1.5484259128570557, + "learning_rate": 8.083774675012036e-07, + "loss": 1.08, + "step": 1680 + }, + { + "epoch": 0.04058187423231618, + "grad_norm": 1.6294459104537964, + "learning_rate": 8.107847857486759e-07, + "loss": 1.0619, + "step": 1685 + }, + { + "epoch": 0.040702295224103464, + "grad_norm": 1.4222997426986694, + "learning_rate": 8.131921039961483e-07, + "loss": 1.0091, + "step": 1690 + }, + { + "epoch": 0.04082271621589075, + "grad_norm": 1.7014527320861816, + "learning_rate": 8.155994222436205e-07, + "loss": 1.0323, + "step": 1695 + }, + { + "epoch": 0.040943137207678046, + "grad_norm": 1.7980411052703857, + "learning_rate": 8.180067404910929e-07, + "loss": 1.064, + "step": 1700 + }, + { + "epoch": 0.04106355819946533, + "grad_norm": 1.468579888343811, + "learning_rate": 8.204140587385652e-07, + "loss": 0.9758, + "step": 1705 + }, + { + "epoch": 0.04118397919125262, + "grad_norm": 1.6334229707717896, + "learning_rate": 8.228213769860375e-07, + "loss": 1.0022, + "step": 1710 + }, + { + "epoch": 0.04130440018303991, + "grad_norm": 1.5733845233917236, + "learning_rate": 8.252286952335098e-07, + "loss": 1.0361, + "step": 1715 + }, + { + "epoch": 0.041424821174827195, + "grad_norm": 1.8664427995681763, + "learning_rate": 8.276360134809822e-07, + "loss": 1.0545, + "step": 1720 + }, + { + "epoch": 0.04154524216661448, + "grad_norm": 1.562538743019104, + "learning_rate": 8.300433317284544e-07, + "loss": 1.0639, + "step": 1725 + }, + { + "epoch": 0.04166566315840177, + "grad_norm": 1.882519245147705, + "learning_rate": 8.324506499759267e-07, + "loss": 0.9948, + "step": 1730 + }, + { + "epoch": 0.041786084150189064, + "grad_norm": 1.424924373626709, + "learning_rate": 8.34857968223399e-07, + "loss": 1.0339, + "step": 1735 + }, + { + "epoch": 0.04190650514197635, + "grad_norm": 1.9680640697479248, + "learning_rate": 8.372652864708713e-07, + "loss": 1.0502, + "step": 1740 + }, + { + "epoch": 0.04202692613376364, + "grad_norm": 1.8039034605026245, + "learning_rate": 8.396726047183438e-07, + "loss": 1.0537, + "step": 1745 + }, + { + "epoch": 0.042147347125550925, + "grad_norm": 1.5843199491500854, + "learning_rate": 8.42079922965816e-07, + "loss": 1.0155, + "step": 1750 + }, + { + "epoch": 0.04226776811733821, + "grad_norm": 1.2623687982559204, + "learning_rate": 8.444872412132884e-07, + "loss": 0.992, + "step": 1755 + }, + { + "epoch": 0.0423881891091255, + "grad_norm": 2.0166735649108887, + "learning_rate": 8.468945594607607e-07, + "loss": 1.0148, + "step": 1760 + }, + { + "epoch": 0.042508610100912794, + "grad_norm": 1.8024189472198486, + "learning_rate": 8.49301877708233e-07, + "loss": 1.0526, + "step": 1765 + }, + { + "epoch": 0.04262903109270008, + "grad_norm": 1.444460391998291, + "learning_rate": 8.517091959557053e-07, + "loss": 1.0287, + "step": 1770 + }, + { + "epoch": 0.04274945208448737, + "grad_norm": 1.9382981061935425, + "learning_rate": 8.541165142031777e-07, + "loss": 1.0336, + "step": 1775 + }, + { + "epoch": 0.042869873076274656, + "grad_norm": 1.5555084943771362, + "learning_rate": 8.565238324506499e-07, + "loss": 1.0116, + "step": 1780 + }, + { + "epoch": 0.04299029406806194, + "grad_norm": 1.4860131740570068, + "learning_rate": 8.589311506981223e-07, + "loss": 1.0343, + "step": 1785 + }, + { + "epoch": 0.04311071505984923, + "grad_norm": 1.5422332286834717, + "learning_rate": 8.613384689455946e-07, + "loss": 1.0307, + "step": 1790 + }, + { + "epoch": 0.04323113605163652, + "grad_norm": 1.5868923664093018, + "learning_rate": 8.637457871930669e-07, + "loss": 1.0291, + "step": 1795 + }, + { + "epoch": 0.04335155704342381, + "grad_norm": 1.5933997631072998, + "learning_rate": 8.661531054405392e-07, + "loss": 1.0708, + "step": 1800 + }, + { + "epoch": 0.0434719780352111, + "grad_norm": 1.8912562131881714, + "learning_rate": 8.685604236880116e-07, + "loss": 1.0921, + "step": 1805 + }, + { + "epoch": 0.04359239902699839, + "grad_norm": 1.5516586303710938, + "learning_rate": 8.709677419354838e-07, + "loss": 1.0165, + "step": 1810 + }, + { + "epoch": 0.043712820018785674, + "grad_norm": 1.5522100925445557, + "learning_rate": 8.733750601829562e-07, + "loss": 1.0371, + "step": 1815 + }, + { + "epoch": 0.04383324101057296, + "grad_norm": 1.924184799194336, + "learning_rate": 8.757823784304284e-07, + "loss": 1.0591, + "step": 1820 + }, + { + "epoch": 0.04395366200236025, + "grad_norm": 1.5833402872085571, + "learning_rate": 8.781896966779008e-07, + "loss": 1.0504, + "step": 1825 + }, + { + "epoch": 0.04407408299414754, + "grad_norm": 1.7274823188781738, + "learning_rate": 8.805970149253731e-07, + "loss": 1.0242, + "step": 1830 + }, + { + "epoch": 0.04419450398593483, + "grad_norm": 1.5076647996902466, + "learning_rate": 8.830043331728454e-07, + "loss": 0.9765, + "step": 1835 + }, + { + "epoch": 0.04431492497772212, + "grad_norm": 1.5085982084274292, + "learning_rate": 8.854116514203177e-07, + "loss": 1.0185, + "step": 1840 + }, + { + "epoch": 0.044435345969509404, + "grad_norm": 1.6915234327316284, + "learning_rate": 8.878189696677901e-07, + "loss": 1.0761, + "step": 1845 + }, + { + "epoch": 0.04455576696129669, + "grad_norm": 1.6578340530395508, + "learning_rate": 8.902262879152623e-07, + "loss": 1.0643, + "step": 1850 + }, + { + "epoch": 0.04467618795308398, + "grad_norm": 1.6258350610733032, + "learning_rate": 8.926336061627347e-07, + "loss": 1.0495, + "step": 1855 + }, + { + "epoch": 0.04479660894487127, + "grad_norm": 1.6774884462356567, + "learning_rate": 8.95040924410207e-07, + "loss": 1.0767, + "step": 1860 + }, + { + "epoch": 0.04491702993665856, + "grad_norm": 1.6079150438308716, + "learning_rate": 8.974482426576792e-07, + "loss": 0.9649, + "step": 1865 + }, + { + "epoch": 0.04503745092844585, + "grad_norm": 1.6209561824798584, + "learning_rate": 8.998555609051516e-07, + "loss": 0.9927, + "step": 1870 + }, + { + "epoch": 0.045157871920233135, + "grad_norm": 1.641926884651184, + "learning_rate": 9.022628791526238e-07, + "loss": 1.029, + "step": 1875 + }, + { + "epoch": 0.04527829291202042, + "grad_norm": 1.6065603494644165, + "learning_rate": 9.046701974000962e-07, + "loss": 1.0018, + "step": 1880 + }, + { + "epoch": 0.04539871390380771, + "grad_norm": 1.5539517402648926, + "learning_rate": 9.070775156475685e-07, + "loss": 1.0798, + "step": 1885 + }, + { + "epoch": 0.045519134895595, + "grad_norm": 1.6610114574432373, + "learning_rate": 9.094848338950409e-07, + "loss": 1.0386, + "step": 1890 + }, + { + "epoch": 0.04563955588738229, + "grad_norm": 1.5111974477767944, + "learning_rate": 9.118921521425132e-07, + "loss": 1.0406, + "step": 1895 + }, + { + "epoch": 0.04575997687916958, + "grad_norm": 2.0480804443359375, + "learning_rate": 9.142994703899856e-07, + "loss": 1.0007, + "step": 1900 + }, + { + "epoch": 0.045880397870956866, + "grad_norm": 1.7117605209350586, + "learning_rate": 9.167067886374578e-07, + "loss": 0.9986, + "step": 1905 + }, + { + "epoch": 0.04600081886274415, + "grad_norm": 1.505280613899231, + "learning_rate": 9.191141068849302e-07, + "loss": 1.0267, + "step": 1910 + }, + { + "epoch": 0.04612123985453144, + "grad_norm": 1.8697929382324219, + "learning_rate": 9.215214251324025e-07, + "loss": 1.0041, + "step": 1915 + }, + { + "epoch": 0.04624166084631873, + "grad_norm": 1.4194835424423218, + "learning_rate": 9.239287433798748e-07, + "loss": 1.0155, + "step": 1920 + }, + { + "epoch": 0.04636208183810602, + "grad_norm": 1.8681567907333374, + "learning_rate": 9.263360616273471e-07, + "loss": 1.0041, + "step": 1925 + }, + { + "epoch": 0.04648250282989331, + "grad_norm": 1.5799516439437866, + "learning_rate": 9.287433798748195e-07, + "loss": 0.996, + "step": 1930 + }, + { + "epoch": 0.046602923821680596, + "grad_norm": 1.585576057434082, + "learning_rate": 9.311506981222917e-07, + "loss": 1.0328, + "step": 1935 + }, + { + "epoch": 0.04672334481346788, + "grad_norm": 1.6536821126937866, + "learning_rate": 9.335580163697641e-07, + "loss": 1.0476, + "step": 1940 + }, + { + "epoch": 0.04684376580525517, + "grad_norm": 1.5552594661712646, + "learning_rate": 9.359653346172363e-07, + "loss": 1.0186, + "step": 1945 + }, + { + "epoch": 0.04696418679704246, + "grad_norm": 1.459959864616394, + "learning_rate": 9.383726528647087e-07, + "loss": 1.0039, + "step": 1950 + }, + { + "epoch": 0.04708460778882975, + "grad_norm": 1.63764226436615, + "learning_rate": 9.40779971112181e-07, + "loss": 1.0606, + "step": 1955 + }, + { + "epoch": 0.04720502878061704, + "grad_norm": 1.6388280391693115, + "learning_rate": 9.431872893596533e-07, + "loss": 0.9829, + "step": 1960 + }, + { + "epoch": 0.04732544977240433, + "grad_norm": 1.6173887252807617, + "learning_rate": 9.455946076071256e-07, + "loss": 1.0445, + "step": 1965 + }, + { + "epoch": 0.047445870764191614, + "grad_norm": 1.4738174676895142, + "learning_rate": 9.48001925854598e-07, + "loss": 1.0223, + "step": 1970 + }, + { + "epoch": 0.0475662917559789, + "grad_norm": 1.7925279140472412, + "learning_rate": 9.504092441020702e-07, + "loss": 1.0145, + "step": 1975 + }, + { + "epoch": 0.04768671274776619, + "grad_norm": 1.609230399131775, + "learning_rate": 9.528165623495426e-07, + "loss": 1.0355, + "step": 1980 + }, + { + "epoch": 0.047807133739553476, + "grad_norm": 1.5456503629684448, + "learning_rate": 9.552238805970149e-07, + "loss": 0.9816, + "step": 1985 + }, + { + "epoch": 0.04792755473134077, + "grad_norm": 1.7006056308746338, + "learning_rate": 9.576311988444872e-07, + "loss": 1.0325, + "step": 1990 + }, + { + "epoch": 0.04804797572312806, + "grad_norm": 2.056192636489868, + "learning_rate": 9.600385170919594e-07, + "loss": 1.0202, + "step": 1995 + }, + { + "epoch": 0.048168396714915344, + "grad_norm": 1.6051387786865234, + "learning_rate": 9.624458353394317e-07, + "loss": 0.9992, + "step": 2000 + }, + { + "epoch": 0.04828881770670263, + "grad_norm": 1.550959587097168, + "learning_rate": 9.648531535869041e-07, + "loss": 0.9809, + "step": 2005 + }, + { + "epoch": 0.04840923869848992, + "grad_norm": 1.8543497323989868, + "learning_rate": 9.672604718343765e-07, + "loss": 1.0177, + "step": 2010 + }, + { + "epoch": 0.048529659690277206, + "grad_norm": 1.6538995504379272, + "learning_rate": 9.696677900818486e-07, + "loss": 1.0336, + "step": 2015 + }, + { + "epoch": 0.0486500806820645, + "grad_norm": 1.763832926750183, + "learning_rate": 9.72075108329321e-07, + "loss": 0.9986, + "step": 2020 + }, + { + "epoch": 0.04877050167385179, + "grad_norm": 1.794128656387329, + "learning_rate": 9.744824265767934e-07, + "loss": 1.0349, + "step": 2025 + }, + { + "epoch": 0.048890922665639075, + "grad_norm": 1.6576578617095947, + "learning_rate": 9.768897448242657e-07, + "loss": 1.0408, + "step": 2030 + }, + { + "epoch": 0.04901134365742636, + "grad_norm": 1.539139986038208, + "learning_rate": 9.792970630717381e-07, + "loss": 0.9983, + "step": 2035 + }, + { + "epoch": 0.04913176464921365, + "grad_norm": 1.5743350982666016, + "learning_rate": 9.817043813192105e-07, + "loss": 1.0335, + "step": 2040 + }, + { + "epoch": 0.04925218564100094, + "grad_norm": 1.6326196193695068, + "learning_rate": 9.841116995666826e-07, + "loss": 1.0018, + "step": 2045 + }, + { + "epoch": 0.04937260663278823, + "grad_norm": 1.6246923208236694, + "learning_rate": 9.86519017814155e-07, + "loss": 0.9942, + "step": 2050 + }, + { + "epoch": 0.04949302762457552, + "grad_norm": 1.564854621887207, + "learning_rate": 9.889263360616274e-07, + "loss": 1.0211, + "step": 2055 + }, + { + "epoch": 0.049613448616362806, + "grad_norm": 1.5778380632400513, + "learning_rate": 9.913336543090997e-07, + "loss": 1.0246, + "step": 2060 + }, + { + "epoch": 0.04973386960815009, + "grad_norm": 1.7073304653167725, + "learning_rate": 9.937409725565719e-07, + "loss": 1.0102, + "step": 2065 + }, + { + "epoch": 0.04985429059993738, + "grad_norm": 1.4603596925735474, + "learning_rate": 9.961482908040443e-07, + "loss": 0.9871, + "step": 2070 + }, + { + "epoch": 0.04997471159172467, + "grad_norm": 1.6223366260528564, + "learning_rate": 9.985556090515166e-07, + "loss": 1.0269, + "step": 2075 + }, + { + "epoch": 0.050095132583511955, + "grad_norm": 1.8474642038345337, + "learning_rate": 9.999492952033262e-07, + "loss": 0.9398, + "step": 2080 + }, + { + "epoch": 0.05021555357529925, + "grad_norm": 1.6487759351730347, + "learning_rate": 9.998225332116418e-07, + "loss": 0.9937, + "step": 2085 + }, + { + "epoch": 0.050335974567086536, + "grad_norm": 1.697744607925415, + "learning_rate": 9.996957712199575e-07, + "loss": 0.9984, + "step": 2090 + }, + { + "epoch": 0.05045639555887382, + "grad_norm": 1.4020692110061646, + "learning_rate": 9.995690092282729e-07, + "loss": 0.9828, + "step": 2095 + }, + { + "epoch": 0.05057681655066111, + "grad_norm": 1.589512586593628, + "learning_rate": 9.994422472365885e-07, + "loss": 0.9955, + "step": 2100 + }, + { + "epoch": 0.0506972375424484, + "grad_norm": 1.4988205432891846, + "learning_rate": 9.993154852449042e-07, + "loss": 1.0136, + "step": 2105 + }, + { + "epoch": 0.050817658534235685, + "grad_norm": 1.6529805660247803, + "learning_rate": 9.991887232532196e-07, + "loss": 0.9957, + "step": 2110 + }, + { + "epoch": 0.05093807952602298, + "grad_norm": 1.5760482549667358, + "learning_rate": 9.990619612615353e-07, + "loss": 0.9742, + "step": 2115 + }, + { + "epoch": 0.05105850051781027, + "grad_norm": 1.7098240852355957, + "learning_rate": 9.98935199269851e-07, + "loss": 0.9826, + "step": 2120 + }, + { + "epoch": 0.051178921509597554, + "grad_norm": 1.4694714546203613, + "learning_rate": 9.988084372781663e-07, + "loss": 1.0453, + "step": 2125 + }, + { + "epoch": 0.05129934250138484, + "grad_norm": 1.719585657119751, + "learning_rate": 9.98681675286482e-07, + "loss": 1.0561, + "step": 2130 + }, + { + "epoch": 0.05141976349317213, + "grad_norm": 1.5589990615844727, + "learning_rate": 9.985549132947976e-07, + "loss": 0.9995, + "step": 2135 + }, + { + "epoch": 0.051540184484959416, + "grad_norm": 1.570941686630249, + "learning_rate": 9.984281513031133e-07, + "loss": 1.0074, + "step": 2140 + }, + { + "epoch": 0.05166060547674671, + "grad_norm": 1.4263135194778442, + "learning_rate": 9.98301389311429e-07, + "loss": 0.9939, + "step": 2145 + }, + { + "epoch": 0.051781026468534, + "grad_norm": 1.6503669023513794, + "learning_rate": 9.981746273197443e-07, + "loss": 0.9384, + "step": 2150 + }, + { + "epoch": 0.051901447460321284, + "grad_norm": 1.5674190521240234, + "learning_rate": 9.9804786532806e-07, + "loss": 1.0988, + "step": 2155 + }, + { + "epoch": 0.05202186845210857, + "grad_norm": 1.8552207946777344, + "learning_rate": 9.979211033363756e-07, + "loss": 0.9471, + "step": 2160 + }, + { + "epoch": 0.05214228944389586, + "grad_norm": 1.5317758321762085, + "learning_rate": 9.977943413446913e-07, + "loss": 1.0026, + "step": 2165 + }, + { + "epoch": 0.052262710435683146, + "grad_norm": 1.5184342861175537, + "learning_rate": 9.976675793530067e-07, + "loss": 0.9657, + "step": 2170 + }, + { + "epoch": 0.052383131427470433, + "grad_norm": 2.0550878047943115, + "learning_rate": 9.975408173613224e-07, + "loss": 1.0551, + "step": 2175 + }, + { + "epoch": 0.05250355241925773, + "grad_norm": 1.670849084854126, + "learning_rate": 9.97414055369638e-07, + "loss": 1.0307, + "step": 2180 + }, + { + "epoch": 0.052623973411045015, + "grad_norm": 1.8367807865142822, + "learning_rate": 9.972872933779534e-07, + "loss": 1.0552, + "step": 2185 + }, + { + "epoch": 0.0527443944028323, + "grad_norm": 1.5710960626602173, + "learning_rate": 9.97160531386269e-07, + "loss": 1.026, + "step": 2190 + }, + { + "epoch": 0.05286481539461959, + "grad_norm": 1.4356889724731445, + "learning_rate": 9.970337693945847e-07, + "loss": 0.9868, + "step": 2195 + }, + { + "epoch": 0.05298523638640688, + "grad_norm": 1.4902724027633667, + "learning_rate": 9.969070074029002e-07, + "loss": 1.0166, + "step": 2200 + }, + { + "epoch": 0.053105657378194164, + "grad_norm": 1.6974903345108032, + "learning_rate": 9.967802454112158e-07, + "loss": 1.0271, + "step": 2205 + }, + { + "epoch": 0.05322607836998146, + "grad_norm": 1.4314632415771484, + "learning_rate": 9.966534834195315e-07, + "loss": 0.983, + "step": 2210 + }, + { + "epoch": 0.053346499361768746, + "grad_norm": 1.472601294517517, + "learning_rate": 9.96526721427847e-07, + "loss": 1.024, + "step": 2215 + }, + { + "epoch": 0.05346692035355603, + "grad_norm": 1.6418883800506592, + "learning_rate": 9.963999594361625e-07, + "loss": 1.0011, + "step": 2220 + }, + { + "epoch": 0.05358734134534332, + "grad_norm": 1.492514729499817, + "learning_rate": 9.962731974444782e-07, + "loss": 1.0287, + "step": 2225 + }, + { + "epoch": 0.05370776233713061, + "grad_norm": 1.6566858291625977, + "learning_rate": 9.961464354527938e-07, + "loss": 1.0115, + "step": 2230 + }, + { + "epoch": 0.053828183328917895, + "grad_norm": 2.257582187652588, + "learning_rate": 9.960196734611095e-07, + "loss": 1.0276, + "step": 2235 + }, + { + "epoch": 0.05394860432070519, + "grad_norm": 1.6120405197143555, + "learning_rate": 9.95892911469425e-07, + "loss": 1.0478, + "step": 2240 + }, + { + "epoch": 0.054069025312492476, + "grad_norm": 1.6881752014160156, + "learning_rate": 9.957661494777405e-07, + "loss": 1.0088, + "step": 2245 + }, + { + "epoch": 0.05418944630427976, + "grad_norm": 1.7539931535720825, + "learning_rate": 9.956393874860562e-07, + "loss": 1.0064, + "step": 2250 + }, + { + "epoch": 0.05430986729606705, + "grad_norm": 1.7022444009780884, + "learning_rate": 9.955126254943718e-07, + "loss": 1.0547, + "step": 2255 + }, + { + "epoch": 0.05443028828785434, + "grad_norm": 1.4357643127441406, + "learning_rate": 9.953858635026873e-07, + "loss": 1.0532, + "step": 2260 + }, + { + "epoch": 0.054550709279641625, + "grad_norm": 1.4944684505462646, + "learning_rate": 9.95259101511003e-07, + "loss": 0.996, + "step": 2265 + }, + { + "epoch": 0.05467113027142891, + "grad_norm": 1.4375295639038086, + "learning_rate": 9.951323395193186e-07, + "loss": 1.0292, + "step": 2270 + }, + { + "epoch": 0.05479155126321621, + "grad_norm": 1.7429416179656982, + "learning_rate": 9.95005577527634e-07, + "loss": 0.9918, + "step": 2275 + }, + { + "epoch": 0.054911972255003494, + "grad_norm": 1.5450338125228882, + "learning_rate": 9.948788155359496e-07, + "loss": 1.0145, + "step": 2280 + }, + { + "epoch": 0.05503239324679078, + "grad_norm": 1.7032642364501953, + "learning_rate": 9.947520535442653e-07, + "loss": 1.0643, + "step": 2285 + }, + { + "epoch": 0.05515281423857807, + "grad_norm": 1.4787014722824097, + "learning_rate": 9.946252915525807e-07, + "loss": 0.972, + "step": 2290 + }, + { + "epoch": 0.055273235230365356, + "grad_norm": 1.6551034450531006, + "learning_rate": 9.944985295608964e-07, + "loss": 1.0394, + "step": 2295 + }, + { + "epoch": 0.05539365622215264, + "grad_norm": 1.5106158256530762, + "learning_rate": 9.94371767569212e-07, + "loss": 1.0251, + "step": 2300 + }, + { + "epoch": 0.05551407721393994, + "grad_norm": 1.7415740489959717, + "learning_rate": 9.942450055775276e-07, + "loss": 1.0066, + "step": 2305 + }, + { + "epoch": 0.055634498205727224, + "grad_norm": 1.625821590423584, + "learning_rate": 9.94118243585843e-07, + "loss": 0.9502, + "step": 2310 + }, + { + "epoch": 0.05575491919751451, + "grad_norm": 1.480774164199829, + "learning_rate": 9.939914815941587e-07, + "loss": 0.9622, + "step": 2315 + }, + { + "epoch": 0.0558753401893018, + "grad_norm": 1.4830437898635864, + "learning_rate": 9.938647196024744e-07, + "loss": 0.9566, + "step": 2320 + }, + { + "epoch": 0.055995761181089086, + "grad_norm": 1.5242736339569092, + "learning_rate": 9.9373795761079e-07, + "loss": 0.9981, + "step": 2325 + }, + { + "epoch": 0.056116182172876374, + "grad_norm": 1.4055622816085815, + "learning_rate": 9.936111956191057e-07, + "loss": 1.0229, + "step": 2330 + }, + { + "epoch": 0.05623660316466366, + "grad_norm": 2.0330963134765625, + "learning_rate": 9.93484433627421e-07, + "loss": 1.0014, + "step": 2335 + }, + { + "epoch": 0.056357024156450955, + "grad_norm": 1.824340581893921, + "learning_rate": 9.933576716357367e-07, + "loss": 0.9525, + "step": 2340 + }, + { + "epoch": 0.05647744514823824, + "grad_norm": 1.5875630378723145, + "learning_rate": 9.932309096440524e-07, + "loss": 0.9849, + "step": 2345 + }, + { + "epoch": 0.05659786614002553, + "grad_norm": 1.5911989212036133, + "learning_rate": 9.931041476523678e-07, + "loss": 1.0125, + "step": 2350 + }, + { + "epoch": 0.05671828713181282, + "grad_norm": 1.567030906677246, + "learning_rate": 9.929773856606835e-07, + "loss": 1.0127, + "step": 2355 + }, + { + "epoch": 0.056838708123600104, + "grad_norm": 1.8666136264801025, + "learning_rate": 9.92850623668999e-07, + "loss": 1.0164, + "step": 2360 + }, + { + "epoch": 0.05695912911538739, + "grad_norm": 1.4268726110458374, + "learning_rate": 9.927238616773145e-07, + "loss": 0.9955, + "step": 2365 + }, + { + "epoch": 0.057079550107174686, + "grad_norm": 1.6934856176376343, + "learning_rate": 9.925970996856302e-07, + "loss": 0.978, + "step": 2370 + }, + { + "epoch": 0.05719997109896197, + "grad_norm": 1.5638610124588013, + "learning_rate": 9.924703376939458e-07, + "loss": 1.0284, + "step": 2375 + }, + { + "epoch": 0.05732039209074926, + "grad_norm": 1.6345804929733276, + "learning_rate": 9.923435757022613e-07, + "loss": 1.0061, + "step": 2380 + }, + { + "epoch": 0.05744081308253655, + "grad_norm": 1.7583974599838257, + "learning_rate": 9.92216813710577e-07, + "loss": 1.0355, + "step": 2385 + }, + { + "epoch": 0.057561234074323835, + "grad_norm": 1.576433539390564, + "learning_rate": 9.920900517188925e-07, + "loss": 0.9556, + "step": 2390 + }, + { + "epoch": 0.05768165506611112, + "grad_norm": 1.4995986223220825, + "learning_rate": 9.919632897272082e-07, + "loss": 0.9899, + "step": 2395 + }, + { + "epoch": 0.057802076057898416, + "grad_norm": 1.8596760034561157, + "learning_rate": 9.918365277355238e-07, + "loss": 1.0413, + "step": 2400 + }, + { + "epoch": 0.0579224970496857, + "grad_norm": 2.1781084537506104, + "learning_rate": 9.917097657438393e-07, + "loss": 1.0664, + "step": 2405 + }, + { + "epoch": 0.05804291804147299, + "grad_norm": 1.719193458557129, + "learning_rate": 9.91583003752155e-07, + "loss": 1.0398, + "step": 2410 + }, + { + "epoch": 0.05816333903326028, + "grad_norm": 1.5527453422546387, + "learning_rate": 9.914562417604706e-07, + "loss": 0.8899, + "step": 2415 + }, + { + "epoch": 0.058283760025047565, + "grad_norm": 2.198544979095459, + "learning_rate": 9.913294797687862e-07, + "loss": 1.0026, + "step": 2420 + }, + { + "epoch": 0.05840418101683485, + "grad_norm": 1.8652313947677612, + "learning_rate": 9.912027177771016e-07, + "loss": 1.0415, + "step": 2425 + }, + { + "epoch": 0.05852460200862214, + "grad_norm": 1.5211336612701416, + "learning_rate": 9.910759557854173e-07, + "loss": 0.9695, + "step": 2430 + }, + { + "epoch": 0.058645023000409434, + "grad_norm": 1.603398323059082, + "learning_rate": 9.90949193793733e-07, + "loss": 0.973, + "step": 2435 + }, + { + "epoch": 0.05876544399219672, + "grad_norm": 1.6345428228378296, + "learning_rate": 9.908224318020484e-07, + "loss": 0.9492, + "step": 2440 + }, + { + "epoch": 0.05888586498398401, + "grad_norm": 1.4910589456558228, + "learning_rate": 9.90695669810364e-07, + "loss": 0.9742, + "step": 2445 + }, + { + "epoch": 0.059006285975771296, + "grad_norm": 1.7999168634414673, + "learning_rate": 9.905689078186796e-07, + "loss": 0.9763, + "step": 2450 + }, + { + "epoch": 0.05912670696755858, + "grad_norm": 1.7445992231369019, + "learning_rate": 9.90442145826995e-07, + "loss": 1.0366, + "step": 2455 + }, + { + "epoch": 0.05924712795934587, + "grad_norm": 1.6648802757263184, + "learning_rate": 9.903153838353107e-07, + "loss": 1.0165, + "step": 2460 + }, + { + "epoch": 0.059367548951133164, + "grad_norm": 1.5104074478149414, + "learning_rate": 9.901886218436264e-07, + "loss": 0.9897, + "step": 2465 + }, + { + "epoch": 0.05948796994292045, + "grad_norm": 1.7375459671020508, + "learning_rate": 9.900618598519418e-07, + "loss": 1.0129, + "step": 2470 + }, + { + "epoch": 0.05960839093470774, + "grad_norm": 1.4074209928512573, + "learning_rate": 9.899350978602574e-07, + "loss": 1.0411, + "step": 2475 + }, + { + "epoch": 0.059728811926495026, + "grad_norm": 1.6192816495895386, + "learning_rate": 9.89808335868573e-07, + "loss": 0.995, + "step": 2480 + }, + { + "epoch": 0.059849232918282314, + "grad_norm": 1.6760953664779663, + "learning_rate": 9.896815738768887e-07, + "loss": 1.0046, + "step": 2485 + }, + { + "epoch": 0.0599696539100696, + "grad_norm": 1.579380989074707, + "learning_rate": 9.895548118852044e-07, + "loss": 0.9926, + "step": 2490 + }, + { + "epoch": 0.060090074901856895, + "grad_norm": 1.7261121273040771, + "learning_rate": 9.894280498935198e-07, + "loss": 0.9732, + "step": 2495 + }, + { + "epoch": 0.06021049589364418, + "grad_norm": 1.5446956157684326, + "learning_rate": 9.893012879018355e-07, + "loss": 1.0288, + "step": 2500 + }, + { + "epoch": 0.06033091688543147, + "grad_norm": 1.4820080995559692, + "learning_rate": 9.89174525910151e-07, + "loss": 1.0583, + "step": 2505 + }, + { + "epoch": 0.06045133787721876, + "grad_norm": 1.5800060033798218, + "learning_rate": 9.890477639184667e-07, + "loss": 1.0293, + "step": 2510 + }, + { + "epoch": 0.060571758869006044, + "grad_norm": 1.6209781169891357, + "learning_rate": 9.889210019267822e-07, + "loss": 1.0297, + "step": 2515 + }, + { + "epoch": 0.06069217986079333, + "grad_norm": 1.3738642930984497, + "learning_rate": 9.887942399350978e-07, + "loss": 1.0049, + "step": 2520 + }, + { + "epoch": 0.06081260085258062, + "grad_norm": 1.6053427457809448, + "learning_rate": 9.886674779434135e-07, + "loss": 0.9941, + "step": 2525 + }, + { + "epoch": 0.06093302184436791, + "grad_norm": 1.4712995290756226, + "learning_rate": 9.88540715951729e-07, + "loss": 0.9638, + "step": 2530 + }, + { + "epoch": 0.0610534428361552, + "grad_norm": 1.7111667394638062, + "learning_rate": 9.884139539600445e-07, + "loss": 1.0174, + "step": 2535 + }, + { + "epoch": 0.06117386382794249, + "grad_norm": 1.4965132474899292, + "learning_rate": 9.882871919683602e-07, + "loss": 0.9886, + "step": 2540 + }, + { + "epoch": 0.061294284819729775, + "grad_norm": 1.615824580192566, + "learning_rate": 9.881604299766756e-07, + "loss": 0.9891, + "step": 2545 + }, + { + "epoch": 0.06141470581151706, + "grad_norm": 1.4951409101486206, + "learning_rate": 9.880336679849913e-07, + "loss": 1.0106, + "step": 2550 + }, + { + "epoch": 0.06153512680330435, + "grad_norm": 1.630304217338562, + "learning_rate": 9.87906905993307e-07, + "loss": 0.937, + "step": 2555 + }, + { + "epoch": 0.06165554779509164, + "grad_norm": 1.622524380683899, + "learning_rate": 9.877801440016226e-07, + "loss": 0.9855, + "step": 2560 + }, + { + "epoch": 0.06177596878687893, + "grad_norm": 1.665725588798523, + "learning_rate": 9.87653382009938e-07, + "loss": 1.0499, + "step": 2565 + }, + { + "epoch": 0.06189638977866622, + "grad_norm": 1.747391700744629, + "learning_rate": 9.875266200182536e-07, + "loss": 0.9747, + "step": 2570 + }, + { + "epoch": 0.062016810770453505, + "grad_norm": 1.7850977182388306, + "learning_rate": 9.873998580265693e-07, + "loss": 0.9773, + "step": 2575 + }, + { + "epoch": 0.06213723176224079, + "grad_norm": 1.7369146347045898, + "learning_rate": 9.87273096034885e-07, + "loss": 1.0316, + "step": 2580 + }, + { + "epoch": 0.06225765275402808, + "grad_norm": 1.810365915298462, + "learning_rate": 9.871463340432006e-07, + "loss": 0.9694, + "step": 2585 + }, + { + "epoch": 0.062378073745815374, + "grad_norm": 1.6777886152267456, + "learning_rate": 9.87019572051516e-07, + "loss": 0.9939, + "step": 2590 + }, + { + "epoch": 0.06249849473760266, + "grad_norm": 1.7758405208587646, + "learning_rate": 9.868928100598316e-07, + "loss": 1.0482, + "step": 2595 + }, + { + "epoch": 0.06261891572938995, + "grad_norm": 1.699005365371704, + "learning_rate": 9.867660480681473e-07, + "loss": 1.0163, + "step": 2600 + }, + { + "epoch": 0.06273933672117724, + "grad_norm": 1.5089378356933594, + "learning_rate": 9.866392860764627e-07, + "loss": 0.9912, + "step": 2605 + }, + { + "epoch": 0.06285975771296452, + "grad_norm": 1.4108537435531616, + "learning_rate": 9.865125240847784e-07, + "loss": 1.0056, + "step": 2610 + }, + { + "epoch": 0.06298017870475181, + "grad_norm": 1.6560865640640259, + "learning_rate": 9.86385762093094e-07, + "loss": 1.024, + "step": 2615 + }, + { + "epoch": 0.0631005996965391, + "grad_norm": 1.54850172996521, + "learning_rate": 9.862590001014094e-07, + "loss": 1.0085, + "step": 2620 + }, + { + "epoch": 0.06322102068832638, + "grad_norm": 1.6445883512496948, + "learning_rate": 9.86132238109725e-07, + "loss": 1.0119, + "step": 2625 + }, + { + "epoch": 0.06334144168011367, + "grad_norm": 1.811767816543579, + "learning_rate": 9.860054761180407e-07, + "loss": 0.9318, + "step": 2630 + }, + { + "epoch": 0.06346186267190096, + "grad_norm": 1.6554980278015137, + "learning_rate": 9.858787141263562e-07, + "loss": 1.0083, + "step": 2635 + }, + { + "epoch": 0.06358228366368826, + "grad_norm": 1.773821473121643, + "learning_rate": 9.857519521346718e-07, + "loss": 1.0159, + "step": 2640 + }, + { + "epoch": 0.06370270465547555, + "grad_norm": 1.6003626585006714, + "learning_rate": 9.856251901429875e-07, + "loss": 0.9948, + "step": 2645 + }, + { + "epoch": 0.06382312564726283, + "grad_norm": 1.6465872526168823, + "learning_rate": 9.85498428151303e-07, + "loss": 1.0038, + "step": 2650 + }, + { + "epoch": 0.06394354663905012, + "grad_norm": 1.4264850616455078, + "learning_rate": 9.853716661596185e-07, + "loss": 1.0058, + "step": 2655 + }, + { + "epoch": 0.06406396763083741, + "grad_norm": 1.4336827993392944, + "learning_rate": 9.852449041679342e-07, + "loss": 1.0192, + "step": 2660 + }, + { + "epoch": 0.0641843886226247, + "grad_norm": 1.6807986497879028, + "learning_rate": 9.851181421762498e-07, + "loss": 0.9417, + "step": 2665 + }, + { + "epoch": 0.06430480961441198, + "grad_norm": 1.5016063451766968, + "learning_rate": 9.849913801845655e-07, + "loss": 0.9898, + "step": 2670 + }, + { + "epoch": 0.06442523060619927, + "grad_norm": 1.5583468675613403, + "learning_rate": 9.848646181928811e-07, + "loss": 1.0263, + "step": 2675 + }, + { + "epoch": 0.06454565159798656, + "grad_norm": 1.9095042943954468, + "learning_rate": 9.847378562011966e-07, + "loss": 0.9524, + "step": 2680 + }, + { + "epoch": 0.06466607258977385, + "grad_norm": 1.6604355573654175, + "learning_rate": 9.846110942095122e-07, + "loss": 0.9932, + "step": 2685 + }, + { + "epoch": 0.06478649358156113, + "grad_norm": 1.5880812406539917, + "learning_rate": 9.844843322178278e-07, + "loss": 0.9439, + "step": 2690 + }, + { + "epoch": 0.06490691457334842, + "grad_norm": 1.512092113494873, + "learning_rate": 9.843575702261433e-07, + "loss": 0.9756, + "step": 2695 + }, + { + "epoch": 0.06502733556513571, + "grad_norm": 1.510452151298523, + "learning_rate": 9.84230808234459e-07, + "loss": 0.9915, + "step": 2700 + }, + { + "epoch": 0.06514775655692301, + "grad_norm": 1.5895094871520996, + "learning_rate": 9.841040462427746e-07, + "loss": 1.0063, + "step": 2705 + }, + { + "epoch": 0.0652681775487103, + "grad_norm": 1.780158519744873, + "learning_rate": 9.8397728425109e-07, + "loss": 1.0395, + "step": 2710 + }, + { + "epoch": 0.06538859854049758, + "grad_norm": 1.534387469291687, + "learning_rate": 9.838505222594056e-07, + "loss": 0.9337, + "step": 2715 + }, + { + "epoch": 0.06550901953228487, + "grad_norm": 1.6160880327224731, + "learning_rate": 9.837237602677213e-07, + "loss": 0.9846, + "step": 2720 + }, + { + "epoch": 0.06562944052407216, + "grad_norm": 1.616960883140564, + "learning_rate": 9.835969982760367e-07, + "loss": 1.0247, + "step": 2725 + }, + { + "epoch": 0.06574986151585945, + "grad_norm": 1.6236367225646973, + "learning_rate": 9.834702362843524e-07, + "loss": 0.9983, + "step": 2730 + }, + { + "epoch": 0.06587028250764673, + "grad_norm": 2.1704113483428955, + "learning_rate": 9.83343474292668e-07, + "loss": 1.0474, + "step": 2735 + }, + { + "epoch": 0.06599070349943402, + "grad_norm": 1.4857268333435059, + "learning_rate": 9.832167123009837e-07, + "loss": 0.9496, + "step": 2740 + }, + { + "epoch": 0.06611112449122131, + "grad_norm": 1.6319400072097778, + "learning_rate": 9.830899503092993e-07, + "loss": 0.972, + "step": 2745 + }, + { + "epoch": 0.0662315454830086, + "grad_norm": 1.406339168548584, + "learning_rate": 9.829631883176147e-07, + "loss": 0.9842, + "step": 2750 + }, + { + "epoch": 0.06635196647479588, + "grad_norm": 1.6158806085586548, + "learning_rate": 9.828364263259304e-07, + "loss": 0.9952, + "step": 2755 + }, + { + "epoch": 0.06647238746658317, + "grad_norm": 1.6224943399429321, + "learning_rate": 9.82709664334246e-07, + "loss": 0.961, + "step": 2760 + }, + { + "epoch": 0.06659280845837047, + "grad_norm": 1.630590558052063, + "learning_rate": 9.825829023425617e-07, + "loss": 1.0236, + "step": 2765 + }, + { + "epoch": 0.06671322945015776, + "grad_norm": 1.5676782131195068, + "learning_rate": 9.82456140350877e-07, + "loss": 0.9279, + "step": 2770 + }, + { + "epoch": 0.06683365044194504, + "grad_norm": 1.5544155836105347, + "learning_rate": 9.823293783591927e-07, + "loss": 0.9843, + "step": 2775 + }, + { + "epoch": 0.06695407143373233, + "grad_norm": 1.5769226551055908, + "learning_rate": 9.822026163675084e-07, + "loss": 1.0193, + "step": 2780 + }, + { + "epoch": 0.06707449242551962, + "grad_norm": 1.502962350845337, + "learning_rate": 9.820758543758238e-07, + "loss": 1.0003, + "step": 2785 + }, + { + "epoch": 0.0671949134173069, + "grad_norm": 1.9905481338500977, + "learning_rate": 9.819490923841395e-07, + "loss": 0.9875, + "step": 2790 + }, + { + "epoch": 0.0673153344090942, + "grad_norm": 1.6774263381958008, + "learning_rate": 9.818223303924551e-07, + "loss": 1.0727, + "step": 2795 + }, + { + "epoch": 0.06743575540088148, + "grad_norm": 1.4511786699295044, + "learning_rate": 9.816955684007708e-07, + "loss": 0.9355, + "step": 2800 + }, + { + "epoch": 0.06755617639266877, + "grad_norm": 1.5723068714141846, + "learning_rate": 9.815688064090862e-07, + "loss": 0.9647, + "step": 2805 + }, + { + "epoch": 0.06767659738445606, + "grad_norm": 1.5582698583602905, + "learning_rate": 9.814420444174018e-07, + "loss": 0.9908, + "step": 2810 + }, + { + "epoch": 0.06779701837624334, + "grad_norm": 1.5775694847106934, + "learning_rate": 9.813152824257175e-07, + "loss": 0.9866, + "step": 2815 + }, + { + "epoch": 0.06791743936803063, + "grad_norm": 1.4566597938537598, + "learning_rate": 9.81188520434033e-07, + "loss": 1.0293, + "step": 2820 + }, + { + "epoch": 0.06803786035981792, + "grad_norm": 1.9800313711166382, + "learning_rate": 9.810617584423486e-07, + "loss": 0.8688, + "step": 2825 + }, + { + "epoch": 0.06815828135160522, + "grad_norm": 1.5676807165145874, + "learning_rate": 9.809349964506642e-07, + "loss": 1.027, + "step": 2830 + }, + { + "epoch": 0.0682787023433925, + "grad_norm": 1.5596281290054321, + "learning_rate": 9.808082344589798e-07, + "loss": 0.9814, + "step": 2835 + }, + { + "epoch": 0.06839912333517979, + "grad_norm": 1.5297009944915771, + "learning_rate": 9.806814724672953e-07, + "loss": 0.9466, + "step": 2840 + }, + { + "epoch": 0.06851954432696708, + "grad_norm": 1.5337249040603638, + "learning_rate": 9.80554710475611e-07, + "loss": 0.9952, + "step": 2845 + }, + { + "epoch": 0.06863996531875437, + "grad_norm": 1.7600308656692505, + "learning_rate": 9.804279484839266e-07, + "loss": 0.9941, + "step": 2850 + }, + { + "epoch": 0.06876038631054165, + "grad_norm": 1.5565025806427002, + "learning_rate": 9.803011864922422e-07, + "loss": 0.9536, + "step": 2855 + }, + { + "epoch": 0.06888080730232894, + "grad_norm": 1.4753719568252563, + "learning_rate": 9.801744245005579e-07, + "loss": 0.9562, + "step": 2860 + }, + { + "epoch": 0.06900122829411623, + "grad_norm": 1.6010030508041382, + "learning_rate": 9.800476625088733e-07, + "loss": 1.0187, + "step": 2865 + }, + { + "epoch": 0.06912164928590352, + "grad_norm": 1.723764181137085, + "learning_rate": 9.79920900517189e-07, + "loss": 0.9999, + "step": 2870 + }, + { + "epoch": 0.0692420702776908, + "grad_norm": 1.480420470237732, + "learning_rate": 9.797941385255046e-07, + "loss": 1.0608, + "step": 2875 + }, + { + "epoch": 0.06936249126947809, + "grad_norm": 1.699066400527954, + "learning_rate": 9.7966737653382e-07, + "loss": 1.0086, + "step": 2880 + }, + { + "epoch": 0.06948291226126538, + "grad_norm": 1.6982381343841553, + "learning_rate": 9.795406145421357e-07, + "loss": 0.9616, + "step": 2885 + }, + { + "epoch": 0.06960333325305267, + "grad_norm": 1.5258110761642456, + "learning_rate": 9.794138525504513e-07, + "loss": 0.9569, + "step": 2890 + }, + { + "epoch": 0.06972375424483997, + "grad_norm": 1.3778504133224487, + "learning_rate": 9.792870905587667e-07, + "loss": 0.949, + "step": 2895 + }, + { + "epoch": 0.06984417523662725, + "grad_norm": 1.5082603693008423, + "learning_rate": 9.791603285670824e-07, + "loss": 0.9212, + "step": 2900 + }, + { + "epoch": 0.06996459622841454, + "grad_norm": 1.683454155921936, + "learning_rate": 9.79033566575398e-07, + "loss": 1.0048, + "step": 2905 + }, + { + "epoch": 0.07008501722020183, + "grad_norm": 1.6343798637390137, + "learning_rate": 9.789068045837135e-07, + "loss": 0.9412, + "step": 2910 + }, + { + "epoch": 0.07020543821198912, + "grad_norm": 2.1352267265319824, + "learning_rate": 9.78780042592029e-07, + "loss": 1.0182, + "step": 2915 + }, + { + "epoch": 0.0703258592037764, + "grad_norm": 1.613693118095398, + "learning_rate": 9.786532806003447e-07, + "loss": 0.9761, + "step": 2920 + }, + { + "epoch": 0.07044628019556369, + "grad_norm": 1.7342922687530518, + "learning_rate": 9.785265186086604e-07, + "loss": 0.9448, + "step": 2925 + }, + { + "epoch": 0.07056670118735098, + "grad_norm": 1.664732575416565, + "learning_rate": 9.78399756616976e-07, + "loss": 1.026, + "step": 2930 + }, + { + "epoch": 0.07068712217913826, + "grad_norm": 1.4946379661560059, + "learning_rate": 9.782729946252915e-07, + "loss": 0.935, + "step": 2935 + }, + { + "epoch": 0.07080754317092555, + "grad_norm": 1.7126940488815308, + "learning_rate": 9.781462326336071e-07, + "loss": 0.9844, + "step": 2940 + }, + { + "epoch": 0.07092796416271284, + "grad_norm": 1.847761869430542, + "learning_rate": 9.780194706419228e-07, + "loss": 1.0524, + "step": 2945 + }, + { + "epoch": 0.07104838515450013, + "grad_norm": 1.6522741317749023, + "learning_rate": 9.778927086502384e-07, + "loss": 0.9787, + "step": 2950 + }, + { + "epoch": 0.07116880614628743, + "grad_norm": 1.5920339822769165, + "learning_rate": 9.777659466585538e-07, + "loss": 0.9595, + "step": 2955 + }, + { + "epoch": 0.07128922713807472, + "grad_norm": 1.5285476446151733, + "learning_rate": 9.776391846668695e-07, + "loss": 0.9854, + "step": 2960 + }, + { + "epoch": 0.071409648129862, + "grad_norm": 1.3394943475723267, + "learning_rate": 9.775124226751851e-07, + "loss": 0.956, + "step": 2965 + }, + { + "epoch": 0.07153006912164929, + "grad_norm": 1.6529779434204102, + "learning_rate": 9.773856606835006e-07, + "loss": 1.0041, + "step": 2970 + }, + { + "epoch": 0.07165049011343658, + "grad_norm": 1.643385887145996, + "learning_rate": 9.772588986918162e-07, + "loss": 1.0442, + "step": 2975 + }, + { + "epoch": 0.07177091110522386, + "grad_norm": 1.5586971044540405, + "learning_rate": 9.771321367001318e-07, + "loss": 0.9751, + "step": 2980 + }, + { + "epoch": 0.07189133209701115, + "grad_norm": 1.46442711353302, + "learning_rate": 9.770053747084473e-07, + "loss": 1.0208, + "step": 2985 + }, + { + "epoch": 0.07201175308879844, + "grad_norm": 1.5164238214492798, + "learning_rate": 9.76878612716763e-07, + "loss": 1.0347, + "step": 2990 + }, + { + "epoch": 0.07213217408058573, + "grad_norm": 1.6594327688217163, + "learning_rate": 9.767518507250786e-07, + "loss": 1.0027, + "step": 2995 + }, + { + "epoch": 0.07225259507237301, + "grad_norm": 1.7102785110473633, + "learning_rate": 9.766250887333942e-07, + "loss": 1.0035, + "step": 3000 + }, + { + "epoch": 0.0723730160641603, + "grad_norm": 1.5333776473999023, + "learning_rate": 9.764983267417096e-07, + "loss": 0.9626, + "step": 3005 + }, + { + "epoch": 0.07249343705594759, + "grad_norm": 2.1526007652282715, + "learning_rate": 9.763715647500253e-07, + "loss": 0.9312, + "step": 3010 + }, + { + "epoch": 0.07261385804773487, + "grad_norm": 1.6529592275619507, + "learning_rate": 9.76244802758341e-07, + "loss": 0.9714, + "step": 3015 + }, + { + "epoch": 0.07273427903952218, + "grad_norm": 1.569183349609375, + "learning_rate": 9.761180407666566e-07, + "loss": 0.9682, + "step": 3020 + }, + { + "epoch": 0.07285470003130946, + "grad_norm": 1.4431354999542236, + "learning_rate": 9.75991278774972e-07, + "loss": 1.0084, + "step": 3025 + }, + { + "epoch": 0.07297512102309675, + "grad_norm": 1.7939332723617554, + "learning_rate": 9.758645167832877e-07, + "loss": 0.9695, + "step": 3030 + }, + { + "epoch": 0.07309554201488404, + "grad_norm": 1.7346773147583008, + "learning_rate": 9.757377547916033e-07, + "loss": 0.9873, + "step": 3035 + }, + { + "epoch": 0.07321596300667133, + "grad_norm": 1.540472388267517, + "learning_rate": 9.75610992799919e-07, + "loss": 0.9653, + "step": 3040 + }, + { + "epoch": 0.07333638399845861, + "grad_norm": 1.637434720993042, + "learning_rate": 9.754842308082344e-07, + "loss": 0.9974, + "step": 3045 + }, + { + "epoch": 0.0734568049902459, + "grad_norm": 1.5349056720733643, + "learning_rate": 9.7535746881655e-07, + "loss": 0.9389, + "step": 3050 + }, + { + "epoch": 0.07357722598203319, + "grad_norm": 1.4215797185897827, + "learning_rate": 9.752307068248657e-07, + "loss": 0.9786, + "step": 3055 + }, + { + "epoch": 0.07369764697382047, + "grad_norm": 1.6199102401733398, + "learning_rate": 9.75103944833181e-07, + "loss": 0.9213, + "step": 3060 + }, + { + "epoch": 0.07381806796560776, + "grad_norm": 1.6182783842086792, + "learning_rate": 9.749771828414967e-07, + "loss": 0.9973, + "step": 3065 + }, + { + "epoch": 0.07393848895739505, + "grad_norm": 1.4855315685272217, + "learning_rate": 9.748504208498124e-07, + "loss": 0.9413, + "step": 3070 + }, + { + "epoch": 0.07405890994918234, + "grad_norm": 1.6804499626159668, + "learning_rate": 9.747236588581278e-07, + "loss": 0.9306, + "step": 3075 + }, + { + "epoch": 0.07417933094096962, + "grad_norm": 1.6445069313049316, + "learning_rate": 9.745968968664435e-07, + "loss": 0.9883, + "step": 3080 + }, + { + "epoch": 0.07429975193275692, + "grad_norm": 1.3424928188323975, + "learning_rate": 9.744701348747591e-07, + "loss": 0.9506, + "step": 3085 + }, + { + "epoch": 0.07442017292454421, + "grad_norm": 1.3568273782730103, + "learning_rate": 9.743433728830748e-07, + "loss": 1.0278, + "step": 3090 + }, + { + "epoch": 0.0745405939163315, + "grad_norm": 1.8795121908187866, + "learning_rate": 9.742166108913902e-07, + "loss": 0.9812, + "step": 3095 + }, + { + "epoch": 0.07466101490811879, + "grad_norm": 1.6932713985443115, + "learning_rate": 9.740898488997058e-07, + "loss": 0.9878, + "step": 3100 + }, + { + "epoch": 0.07478143589990607, + "grad_norm": 1.490128517150879, + "learning_rate": 9.739630869080215e-07, + "loss": 0.927, + "step": 3105 + }, + { + "epoch": 0.07490185689169336, + "grad_norm": 1.718519687652588, + "learning_rate": 9.738363249163371e-07, + "loss": 1.0131, + "step": 3110 + }, + { + "epoch": 0.07502227788348065, + "grad_norm": 1.6648001670837402, + "learning_rate": 9.737095629246528e-07, + "loss": 0.9911, + "step": 3115 + }, + { + "epoch": 0.07514269887526794, + "grad_norm": 1.541821837425232, + "learning_rate": 9.735828009329682e-07, + "loss": 1.0154, + "step": 3120 + }, + { + "epoch": 0.07526311986705522, + "grad_norm": 1.4682984352111816, + "learning_rate": 9.734560389412838e-07, + "loss": 0.981, + "step": 3125 + }, + { + "epoch": 0.07538354085884251, + "grad_norm": 1.4823180437088013, + "learning_rate": 9.733292769495995e-07, + "loss": 1.0265, + "step": 3130 + }, + { + "epoch": 0.0755039618506298, + "grad_norm": 1.359682559967041, + "learning_rate": 9.73202514957915e-07, + "loss": 0.9137, + "step": 3135 + }, + { + "epoch": 0.07562438284241708, + "grad_norm": 1.5946718454360962, + "learning_rate": 9.730757529662306e-07, + "loss": 0.9695, + "step": 3140 + }, + { + "epoch": 0.07574480383420437, + "grad_norm": 1.5661665201187134, + "learning_rate": 9.729489909745462e-07, + "loss": 0.9655, + "step": 3145 + }, + { + "epoch": 0.07586522482599167, + "grad_norm": 1.5131149291992188, + "learning_rate": 9.728222289828616e-07, + "loss": 0.9829, + "step": 3150 + }, + { + "epoch": 0.07598564581777896, + "grad_norm": 1.6737961769104004, + "learning_rate": 9.726954669911773e-07, + "loss": 0.9591, + "step": 3155 + }, + { + "epoch": 0.07610606680956625, + "grad_norm": 1.9588265419006348, + "learning_rate": 9.72568704999493e-07, + "loss": 0.9509, + "step": 3160 + }, + { + "epoch": 0.07622648780135353, + "grad_norm": 1.6424431800842285, + "learning_rate": 9.724419430078084e-07, + "loss": 0.9529, + "step": 3165 + }, + { + "epoch": 0.07634690879314082, + "grad_norm": 1.5378398895263672, + "learning_rate": 9.72315181016124e-07, + "loss": 1.002, + "step": 3170 + }, + { + "epoch": 0.07646732978492811, + "grad_norm": 1.6808853149414062, + "learning_rate": 9.721884190244397e-07, + "loss": 1.004, + "step": 3175 + }, + { + "epoch": 0.0765877507767154, + "grad_norm": 1.6004494428634644, + "learning_rate": 9.720616570327553e-07, + "loss": 1.0206, + "step": 3180 + }, + { + "epoch": 0.07670817176850268, + "grad_norm": 1.5458874702453613, + "learning_rate": 9.71934895041071e-07, + "loss": 0.972, + "step": 3185 + }, + { + "epoch": 0.07682859276028997, + "grad_norm": 1.4363664388656616, + "learning_rate": 9.718081330493864e-07, + "loss": 1.0765, + "step": 3190 + }, + { + "epoch": 0.07694901375207726, + "grad_norm": 1.6539244651794434, + "learning_rate": 9.71681371057702e-07, + "loss": 1.0203, + "step": 3195 + }, + { + "epoch": 0.07706943474386455, + "grad_norm": 1.4915926456451416, + "learning_rate": 9.715546090660177e-07, + "loss": 0.9945, + "step": 3200 + }, + { + "epoch": 0.07718985573565183, + "grad_norm": 1.519199013710022, + "learning_rate": 9.714278470743333e-07, + "loss": 0.9774, + "step": 3205 + }, + { + "epoch": 0.07731027672743913, + "grad_norm": 1.5417412519454956, + "learning_rate": 9.713010850826488e-07, + "loss": 0.9993, + "step": 3210 + }, + { + "epoch": 0.07743069771922642, + "grad_norm": 1.4990864992141724, + "learning_rate": 9.711743230909644e-07, + "loss": 0.9397, + "step": 3215 + }, + { + "epoch": 0.07755111871101371, + "grad_norm": 2.056589126586914, + "learning_rate": 9.7104756109928e-07, + "loss": 1.0334, + "step": 3220 + }, + { + "epoch": 0.077671539702801, + "grad_norm": 1.4858630895614624, + "learning_rate": 9.709207991075955e-07, + "loss": 0.9464, + "step": 3225 + }, + { + "epoch": 0.07779196069458828, + "grad_norm": 1.5432004928588867, + "learning_rate": 9.707940371159111e-07, + "loss": 0.9859, + "step": 3230 + }, + { + "epoch": 0.07791238168637557, + "grad_norm": 1.7187660932540894, + "learning_rate": 9.706672751242268e-07, + "loss": 0.995, + "step": 3235 + }, + { + "epoch": 0.07803280267816286, + "grad_norm": 1.5984174013137817, + "learning_rate": 9.705405131325422e-07, + "loss": 1.0028, + "step": 3240 + }, + { + "epoch": 0.07815322366995014, + "grad_norm": 1.5194472074508667, + "learning_rate": 9.704137511408578e-07, + "loss": 1.0039, + "step": 3245 + }, + { + "epoch": 0.07827364466173743, + "grad_norm": 1.6343015432357788, + "learning_rate": 9.702869891491735e-07, + "loss": 1.0423, + "step": 3250 + }, + { + "epoch": 0.07839406565352472, + "grad_norm": 1.5536481142044067, + "learning_rate": 9.70160227157489e-07, + "loss": 0.9829, + "step": 3255 + }, + { + "epoch": 0.078514486645312, + "grad_norm": 1.6728906631469727, + "learning_rate": 9.700334651658046e-07, + "loss": 0.9838, + "step": 3260 + }, + { + "epoch": 0.0786349076370993, + "grad_norm": 1.785743236541748, + "learning_rate": 9.699067031741202e-07, + "loss": 1.0017, + "step": 3265 + }, + { + "epoch": 0.07875532862888658, + "grad_norm": 1.5994573831558228, + "learning_rate": 9.697799411824359e-07, + "loss": 1.0219, + "step": 3270 + }, + { + "epoch": 0.07887574962067388, + "grad_norm": 1.6276570558547974, + "learning_rate": 9.696531791907515e-07, + "loss": 0.9886, + "step": 3275 + }, + { + "epoch": 0.07899617061246117, + "grad_norm": 1.6569063663482666, + "learning_rate": 9.69526417199067e-07, + "loss": 0.9907, + "step": 3280 + }, + { + "epoch": 0.07911659160424846, + "grad_norm": 1.6754655838012695, + "learning_rate": 9.693996552073826e-07, + "loss": 0.9841, + "step": 3285 + }, + { + "epoch": 0.07923701259603574, + "grad_norm": 1.7406693696975708, + "learning_rate": 9.692728932156982e-07, + "loss": 1.0043, + "step": 3290 + }, + { + "epoch": 0.07935743358782303, + "grad_norm": 1.4916058778762817, + "learning_rate": 9.691461312240139e-07, + "loss": 0.9928, + "step": 3295 + }, + { + "epoch": 0.07947785457961032, + "grad_norm": 1.5197107791900635, + "learning_rate": 9.690193692323293e-07, + "loss": 0.9679, + "step": 3300 + }, + { + "epoch": 0.0795982755713976, + "grad_norm": 1.7245734930038452, + "learning_rate": 9.68892607240645e-07, + "loss": 0.9643, + "step": 3305 + }, + { + "epoch": 0.0797186965631849, + "grad_norm": 1.7159137725830078, + "learning_rate": 9.687658452489606e-07, + "loss": 0.9778, + "step": 3310 + }, + { + "epoch": 0.07983911755497218, + "grad_norm": 1.564124345779419, + "learning_rate": 9.68639083257276e-07, + "loss": 0.9566, + "step": 3315 + }, + { + "epoch": 0.07995953854675947, + "grad_norm": 1.5709251165390015, + "learning_rate": 9.685123212655917e-07, + "loss": 0.9781, + "step": 3320 + }, + { + "epoch": 0.08007995953854675, + "grad_norm": 1.6757510900497437, + "learning_rate": 9.683855592739073e-07, + "loss": 1.0377, + "step": 3325 + }, + { + "epoch": 0.08020038053033404, + "grad_norm": 1.6469229459762573, + "learning_rate": 9.682587972822227e-07, + "loss": 0.9878, + "step": 3330 + }, + { + "epoch": 0.08032080152212133, + "grad_norm": 1.445010781288147, + "learning_rate": 9.681320352905384e-07, + "loss": 0.9342, + "step": 3335 + }, + { + "epoch": 0.08044122251390863, + "grad_norm": 2.0901291370391846, + "learning_rate": 9.68005273298854e-07, + "loss": 0.9842, + "step": 3340 + }, + { + "epoch": 0.08056164350569592, + "grad_norm": 1.6954982280731201, + "learning_rate": 9.678785113071697e-07, + "loss": 0.9938, + "step": 3345 + }, + { + "epoch": 0.0806820644974832, + "grad_norm": 1.504270076751709, + "learning_rate": 9.677517493154851e-07, + "loss": 0.906, + "step": 3350 + }, + { + "epoch": 0.08080248548927049, + "grad_norm": 1.5847922563552856, + "learning_rate": 9.676249873238008e-07, + "loss": 0.939, + "step": 3355 + }, + { + "epoch": 0.08092290648105778, + "grad_norm": 1.6328890323638916, + "learning_rate": 9.674982253321164e-07, + "loss": 0.9739, + "step": 3360 + }, + { + "epoch": 0.08104332747284507, + "grad_norm": 1.4796109199523926, + "learning_rate": 9.67371463340432e-07, + "loss": 0.9276, + "step": 3365 + }, + { + "epoch": 0.08116374846463235, + "grad_norm": 1.4965159893035889, + "learning_rate": 9.672447013487477e-07, + "loss": 0.9853, + "step": 3370 + }, + { + "epoch": 0.08128416945641964, + "grad_norm": 1.587471842765808, + "learning_rate": 9.671179393570631e-07, + "loss": 0.9829, + "step": 3375 + }, + { + "epoch": 0.08140459044820693, + "grad_norm": 1.473659873008728, + "learning_rate": 9.669911773653788e-07, + "loss": 0.9669, + "step": 3380 + }, + { + "epoch": 0.08152501143999422, + "grad_norm": 1.6593642234802246, + "learning_rate": 9.668644153736944e-07, + "loss": 0.9811, + "step": 3385 + }, + { + "epoch": 0.0816454324317815, + "grad_norm": 1.6420273780822754, + "learning_rate": 9.667376533820098e-07, + "loss": 0.9429, + "step": 3390 + }, + { + "epoch": 0.08176585342356879, + "grad_norm": 1.4429707527160645, + "learning_rate": 9.666108913903255e-07, + "loss": 0.8926, + "step": 3395 + }, + { + "epoch": 0.08188627441535609, + "grad_norm": 1.7320809364318848, + "learning_rate": 9.664841293986411e-07, + "loss": 1.0105, + "step": 3400 + }, + { + "epoch": 0.08200669540714338, + "grad_norm": 1.5669728517532349, + "learning_rate": 9.663573674069566e-07, + "loss": 1.0101, + "step": 3405 + }, + { + "epoch": 0.08212711639893067, + "grad_norm": 1.6901319026947021, + "learning_rate": 9.662306054152722e-07, + "loss": 0.9212, + "step": 3410 + }, + { + "epoch": 0.08224753739071795, + "grad_norm": 1.5522770881652832, + "learning_rate": 9.661038434235879e-07, + "loss": 1.0259, + "step": 3415 + }, + { + "epoch": 0.08236795838250524, + "grad_norm": 1.5578824281692505, + "learning_rate": 9.659770814319033e-07, + "loss": 0.9705, + "step": 3420 + }, + { + "epoch": 0.08248837937429253, + "grad_norm": 1.5385942459106445, + "learning_rate": 9.65850319440219e-07, + "loss": 0.9389, + "step": 3425 + }, + { + "epoch": 0.08260880036607982, + "grad_norm": 1.453081488609314, + "learning_rate": 9.657235574485346e-07, + "loss": 0.917, + "step": 3430 + }, + { + "epoch": 0.0827292213578671, + "grad_norm": 1.5530083179473877, + "learning_rate": 9.655967954568502e-07, + "loss": 0.989, + "step": 3435 + }, + { + "epoch": 0.08284964234965439, + "grad_norm": 1.7280933856964111, + "learning_rate": 9.654700334651657e-07, + "loss": 0.9389, + "step": 3440 + }, + { + "epoch": 0.08297006334144168, + "grad_norm": 1.5677049160003662, + "learning_rate": 9.653432714734813e-07, + "loss": 0.9811, + "step": 3445 + }, + { + "epoch": 0.08309048433322896, + "grad_norm": 1.63473379611969, + "learning_rate": 9.65216509481797e-07, + "loss": 0.9095, + "step": 3450 + }, + { + "epoch": 0.08321090532501625, + "grad_norm": 1.622157335281372, + "learning_rate": 9.650897474901126e-07, + "loss": 0.9601, + "step": 3455 + }, + { + "epoch": 0.08333132631680354, + "grad_norm": 1.4803037643432617, + "learning_rate": 9.649629854984282e-07, + "loss": 0.9547, + "step": 3460 + }, + { + "epoch": 0.08345174730859084, + "grad_norm": 1.5026421546936035, + "learning_rate": 9.648362235067437e-07, + "loss": 1.0317, + "step": 3465 + }, + { + "epoch": 0.08357216830037813, + "grad_norm": 1.8309855461120605, + "learning_rate": 9.647094615150593e-07, + "loss": 0.956, + "step": 3470 + }, + { + "epoch": 0.08369258929216541, + "grad_norm": 1.4832587242126465, + "learning_rate": 9.64582699523375e-07, + "loss": 0.939, + "step": 3475 + }, + { + "epoch": 0.0838130102839527, + "grad_norm": 1.4295638799667358, + "learning_rate": 9.644559375316904e-07, + "loss": 1.0362, + "step": 3480 + }, + { + "epoch": 0.08393343127573999, + "grad_norm": 1.5575592517852783, + "learning_rate": 9.64329175540006e-07, + "loss": 1.0136, + "step": 3485 + }, + { + "epoch": 0.08405385226752728, + "grad_norm": 1.6031485795974731, + "learning_rate": 9.642024135483217e-07, + "loss": 0.9677, + "step": 3490 + }, + { + "epoch": 0.08417427325931456, + "grad_norm": 1.580514669418335, + "learning_rate": 9.640756515566371e-07, + "loss": 0.96, + "step": 3495 + }, + { + "epoch": 0.08429469425110185, + "grad_norm": 1.5536184310913086, + "learning_rate": 9.639488895649528e-07, + "loss": 0.9376, + "step": 3500 + }, + { + "epoch": 0.08441511524288914, + "grad_norm": 1.4722299575805664, + "learning_rate": 9.638221275732684e-07, + "loss": 0.9752, + "step": 3505 + }, + { + "epoch": 0.08453553623467643, + "grad_norm": 1.56247878074646, + "learning_rate": 9.636953655815838e-07, + "loss": 0.9566, + "step": 3510 + }, + { + "epoch": 0.08465595722646371, + "grad_norm": 1.4400372505187988, + "learning_rate": 9.635686035898995e-07, + "loss": 0.9702, + "step": 3515 + }, + { + "epoch": 0.084776378218251, + "grad_norm": 1.4799162149429321, + "learning_rate": 9.634418415982151e-07, + "loss": 0.9713, + "step": 3520 + }, + { + "epoch": 0.08489679921003829, + "grad_norm": 1.4603667259216309, + "learning_rate": 9.633150796065308e-07, + "loss": 0.9461, + "step": 3525 + }, + { + "epoch": 0.08501722020182559, + "grad_norm": 1.6365811824798584, + "learning_rate": 9.631883176148464e-07, + "loss": 0.9602, + "step": 3530 + }, + { + "epoch": 0.08513764119361288, + "grad_norm": 1.5613988637924194, + "learning_rate": 9.630615556231618e-07, + "loss": 0.9674, + "step": 3535 + }, + { + "epoch": 0.08525806218540016, + "grad_norm": 1.7290339469909668, + "learning_rate": 9.629347936314775e-07, + "loss": 0.9317, + "step": 3540 + }, + { + "epoch": 0.08537848317718745, + "grad_norm": 1.7438316345214844, + "learning_rate": 9.628080316397931e-07, + "loss": 0.957, + "step": 3545 + }, + { + "epoch": 0.08549890416897474, + "grad_norm": 1.7681242227554321, + "learning_rate": 9.626812696481088e-07, + "loss": 0.9307, + "step": 3550 + }, + { + "epoch": 0.08561932516076202, + "grad_norm": 1.491039752960205, + "learning_rate": 9.625545076564242e-07, + "loss": 0.9786, + "step": 3555 + }, + { + "epoch": 0.08573974615254931, + "grad_norm": 1.6438604593276978, + "learning_rate": 9.624277456647399e-07, + "loss": 0.9276, + "step": 3560 + }, + { + "epoch": 0.0858601671443366, + "grad_norm": 1.577164649963379, + "learning_rate": 9.623009836730555e-07, + "loss": 1.0229, + "step": 3565 + }, + { + "epoch": 0.08598058813612389, + "grad_norm": 1.5834629535675049, + "learning_rate": 9.62174221681371e-07, + "loss": 1.0192, + "step": 3570 + }, + { + "epoch": 0.08610100912791117, + "grad_norm": 1.7259495258331299, + "learning_rate": 9.620474596896866e-07, + "loss": 0.9634, + "step": 3575 + }, + { + "epoch": 0.08622143011969846, + "grad_norm": 1.504324197769165, + "learning_rate": 9.619206976980022e-07, + "loss": 0.9599, + "step": 3580 + }, + { + "epoch": 0.08634185111148575, + "grad_norm": 1.4879755973815918, + "learning_rate": 9.617939357063177e-07, + "loss": 0.9769, + "step": 3585 + }, + { + "epoch": 0.08646227210327304, + "grad_norm": 1.4818052053451538, + "learning_rate": 9.616671737146333e-07, + "loss": 0.9764, + "step": 3590 + }, + { + "epoch": 0.08658269309506034, + "grad_norm": 1.4573898315429688, + "learning_rate": 9.61540411722949e-07, + "loss": 0.9719, + "step": 3595 + }, + { + "epoch": 0.08670311408684762, + "grad_norm": 1.512553334236145, + "learning_rate": 9.614136497312644e-07, + "loss": 0.9713, + "step": 3600 + }, + { + "epoch": 0.08682353507863491, + "grad_norm": 1.518481969833374, + "learning_rate": 9.6128688773958e-07, + "loss": 1.0032, + "step": 3605 + }, + { + "epoch": 0.0869439560704222, + "grad_norm": 1.7753887176513672, + "learning_rate": 9.611601257478957e-07, + "loss": 0.9889, + "step": 3610 + }, + { + "epoch": 0.08706437706220949, + "grad_norm": 1.4080522060394287, + "learning_rate": 9.610333637562113e-07, + "loss": 0.9666, + "step": 3615 + }, + { + "epoch": 0.08718479805399677, + "grad_norm": 1.6740397214889526, + "learning_rate": 9.60906601764527e-07, + "loss": 0.9517, + "step": 3620 + }, + { + "epoch": 0.08730521904578406, + "grad_norm": 1.6367727518081665, + "learning_rate": 9.607798397728424e-07, + "loss": 0.9712, + "step": 3625 + }, + { + "epoch": 0.08742564003757135, + "grad_norm": 1.6772522926330566, + "learning_rate": 9.60653077781158e-07, + "loss": 0.9832, + "step": 3630 + }, + { + "epoch": 0.08754606102935863, + "grad_norm": 1.4191092252731323, + "learning_rate": 9.605263157894737e-07, + "loss": 0.947, + "step": 3635 + }, + { + "epoch": 0.08766648202114592, + "grad_norm": 1.4476778507232666, + "learning_rate": 9.603995537977893e-07, + "loss": 0.957, + "step": 3640 + }, + { + "epoch": 0.08778690301293321, + "grad_norm": 1.9380329847335815, + "learning_rate": 9.602727918061048e-07, + "loss": 1.0579, + "step": 3645 + }, + { + "epoch": 0.0879073240047205, + "grad_norm": 1.3948523998260498, + "learning_rate": 9.601460298144204e-07, + "loss": 0.9996, + "step": 3650 + }, + { + "epoch": 0.0880277449965078, + "grad_norm": 1.8436353206634521, + "learning_rate": 9.60019267822736e-07, + "loss": 0.9496, + "step": 3655 + }, + { + "epoch": 0.08814816598829509, + "grad_norm": 1.5903769731521606, + "learning_rate": 9.598925058310515e-07, + "loss": 0.993, + "step": 3660 + }, + { + "epoch": 0.08826858698008237, + "grad_norm": 1.574457049369812, + "learning_rate": 9.597657438393671e-07, + "loss": 1.0448, + "step": 3665 + }, + { + "epoch": 0.08838900797186966, + "grad_norm": 1.6137624979019165, + "learning_rate": 9.596389818476828e-07, + "loss": 1.0311, + "step": 3670 + }, + { + "epoch": 0.08850942896365695, + "grad_norm": 1.7495733499526978, + "learning_rate": 9.595122198559982e-07, + "loss": 1.0024, + "step": 3675 + }, + { + "epoch": 0.08862984995544423, + "grad_norm": 1.5811641216278076, + "learning_rate": 9.593854578643138e-07, + "loss": 0.9636, + "step": 3680 + }, + { + "epoch": 0.08875027094723152, + "grad_norm": 1.7564260959625244, + "learning_rate": 9.592586958726295e-07, + "loss": 0.9737, + "step": 3685 + }, + { + "epoch": 0.08887069193901881, + "grad_norm": 1.8012256622314453, + "learning_rate": 9.591319338809451e-07, + "loss": 0.991, + "step": 3690 + }, + { + "epoch": 0.0889911129308061, + "grad_norm": 1.920172095298767, + "learning_rate": 9.590051718892606e-07, + "loss": 1.0374, + "step": 3695 + }, + { + "epoch": 0.08911153392259338, + "grad_norm": 1.4977744817733765, + "learning_rate": 9.588784098975762e-07, + "loss": 0.9509, + "step": 3700 + }, + { + "epoch": 0.08923195491438067, + "grad_norm": 1.7665150165557861, + "learning_rate": 9.587516479058919e-07, + "loss": 0.9735, + "step": 3705 + }, + { + "epoch": 0.08935237590616796, + "grad_norm": 1.5105448961257935, + "learning_rate": 9.586248859142075e-07, + "loss": 1.0022, + "step": 3710 + }, + { + "epoch": 0.08947279689795525, + "grad_norm": 1.488258957862854, + "learning_rate": 9.584981239225232e-07, + "loss": 1.0177, + "step": 3715 + }, + { + "epoch": 0.08959321788974255, + "grad_norm": 1.6075270175933838, + "learning_rate": 9.583713619308386e-07, + "loss": 0.9024, + "step": 3720 + }, + { + "epoch": 0.08971363888152983, + "grad_norm": 1.5839999914169312, + "learning_rate": 9.582445999391542e-07, + "loss": 0.9843, + "step": 3725 + }, + { + "epoch": 0.08983405987331712, + "grad_norm": 1.4419506788253784, + "learning_rate": 9.581178379474699e-07, + "loss": 0.9383, + "step": 3730 + }, + { + "epoch": 0.08995448086510441, + "grad_norm": 1.660599946975708, + "learning_rate": 9.579910759557853e-07, + "loss": 0.9855, + "step": 3735 + }, + { + "epoch": 0.0900749018568917, + "grad_norm": 1.6551530361175537, + "learning_rate": 9.57864313964101e-07, + "loss": 0.9816, + "step": 3740 + }, + { + "epoch": 0.09019532284867898, + "grad_norm": 1.4094029664993286, + "learning_rate": 9.577375519724166e-07, + "loss": 0.9733, + "step": 3745 + }, + { + "epoch": 0.09031574384046627, + "grad_norm": 1.5762144327163696, + "learning_rate": 9.57610789980732e-07, + "loss": 0.9718, + "step": 3750 + }, + { + "epoch": 0.09043616483225356, + "grad_norm": 1.6266120672225952, + "learning_rate": 9.574840279890477e-07, + "loss": 0.9761, + "step": 3755 + }, + { + "epoch": 0.09055658582404084, + "grad_norm": 1.5777689218521118, + "learning_rate": 9.573572659973633e-07, + "loss": 1.0, + "step": 3760 + }, + { + "epoch": 0.09067700681582813, + "grad_norm": 1.5066728591918945, + "learning_rate": 9.572305040056788e-07, + "loss": 0.9302, + "step": 3765 + }, + { + "epoch": 0.09079742780761542, + "grad_norm": 1.7050273418426514, + "learning_rate": 9.571037420139944e-07, + "loss": 0.953, + "step": 3770 + }, + { + "epoch": 0.0909178487994027, + "grad_norm": 1.7422499656677246, + "learning_rate": 9.5697698002231e-07, + "loss": 1.003, + "step": 3775 + }, + { + "epoch": 0.09103826979119, + "grad_norm": 1.5633741617202759, + "learning_rate": 9.568502180306257e-07, + "loss": 0.9682, + "step": 3780 + }, + { + "epoch": 0.0911586907829773, + "grad_norm": 1.7610095739364624, + "learning_rate": 9.567234560389411e-07, + "loss": 0.9223, + "step": 3785 + }, + { + "epoch": 0.09127911177476458, + "grad_norm": 1.6257444620132446, + "learning_rate": 9.565966940472568e-07, + "loss": 0.9416, + "step": 3790 + }, + { + "epoch": 0.09139953276655187, + "grad_norm": 1.455300211906433, + "learning_rate": 9.564699320555724e-07, + "loss": 1.0258, + "step": 3795 + }, + { + "epoch": 0.09151995375833916, + "grad_norm": 1.596089243888855, + "learning_rate": 9.56343170063888e-07, + "loss": 1.001, + "step": 3800 + }, + { + "epoch": 0.09164037475012644, + "grad_norm": 1.5101442337036133, + "learning_rate": 9.562164080722037e-07, + "loss": 1.0037, + "step": 3805 + }, + { + "epoch": 0.09176079574191373, + "grad_norm": 1.5162341594696045, + "learning_rate": 9.560896460805191e-07, + "loss": 1.0177, + "step": 3810 + }, + { + "epoch": 0.09188121673370102, + "grad_norm": 1.6944547891616821, + "learning_rate": 9.559628840888348e-07, + "loss": 0.9239, + "step": 3815 + }, + { + "epoch": 0.0920016377254883, + "grad_norm": 1.5876063108444214, + "learning_rate": 9.558361220971504e-07, + "loss": 0.9869, + "step": 3820 + }, + { + "epoch": 0.09212205871727559, + "grad_norm": 1.616463303565979, + "learning_rate": 9.55709360105466e-07, + "loss": 1.03, + "step": 3825 + }, + { + "epoch": 0.09224247970906288, + "grad_norm": 1.567016363143921, + "learning_rate": 9.555825981137815e-07, + "loss": 0.9364, + "step": 3830 + }, + { + "epoch": 0.09236290070085017, + "grad_norm": 1.412610650062561, + "learning_rate": 9.554558361220971e-07, + "loss": 0.9713, + "step": 3835 + }, + { + "epoch": 0.09248332169263745, + "grad_norm": 1.6522778272628784, + "learning_rate": 9.553290741304128e-07, + "loss": 0.9959, + "step": 3840 + }, + { + "epoch": 0.09260374268442476, + "grad_norm": 1.5945355892181396, + "learning_rate": 9.552023121387282e-07, + "loss": 0.9766, + "step": 3845 + }, + { + "epoch": 0.09272416367621204, + "grad_norm": 1.6672556400299072, + "learning_rate": 9.550755501470439e-07, + "loss": 0.9743, + "step": 3850 + }, + { + "epoch": 0.09284458466799933, + "grad_norm": 1.7128037214279175, + "learning_rate": 9.549487881553595e-07, + "loss": 0.982, + "step": 3855 + }, + { + "epoch": 0.09296500565978662, + "grad_norm": 1.6894267797470093, + "learning_rate": 9.54822026163675e-07, + "loss": 0.9418, + "step": 3860 + }, + { + "epoch": 0.0930854266515739, + "grad_norm": 1.9653635025024414, + "learning_rate": 9.546952641719906e-07, + "loss": 1.0066, + "step": 3865 + }, + { + "epoch": 0.09320584764336119, + "grad_norm": 1.4464094638824463, + "learning_rate": 9.545685021803062e-07, + "loss": 0.9741, + "step": 3870 + }, + { + "epoch": 0.09332626863514848, + "grad_norm": 1.7934719324111938, + "learning_rate": 9.544417401886219e-07, + "loss": 0.9716, + "step": 3875 + }, + { + "epoch": 0.09344668962693577, + "grad_norm": 1.5995069742202759, + "learning_rate": 9.543149781969373e-07, + "loss": 0.9481, + "step": 3880 + }, + { + "epoch": 0.09356711061872305, + "grad_norm": 1.2907156944274902, + "learning_rate": 9.54188216205253e-07, + "loss": 0.8987, + "step": 3885 + }, + { + "epoch": 0.09368753161051034, + "grad_norm": 1.5202016830444336, + "learning_rate": 9.540614542135686e-07, + "loss": 0.897, + "step": 3890 + }, + { + "epoch": 0.09380795260229763, + "grad_norm": 1.5255939960479736, + "learning_rate": 9.539346922218842e-07, + "loss": 0.9924, + "step": 3895 + }, + { + "epoch": 0.09392837359408492, + "grad_norm": 1.9490069150924683, + "learning_rate": 9.538079302301999e-07, + "loss": 1.005, + "step": 3900 + }, + { + "epoch": 0.0940487945858722, + "grad_norm": 1.5038361549377441, + "learning_rate": 9.536811682385153e-07, + "loss": 1.0007, + "step": 3905 + }, + { + "epoch": 0.0941692155776595, + "grad_norm": 1.6892212629318237, + "learning_rate": 9.53554406246831e-07, + "loss": 0.9309, + "step": 3910 + }, + { + "epoch": 0.09428963656944679, + "grad_norm": 1.5403926372528076, + "learning_rate": 9.534276442551465e-07, + "loss": 0.9412, + "step": 3915 + }, + { + "epoch": 0.09441005756123408, + "grad_norm": 1.5528895854949951, + "learning_rate": 9.53300882263462e-07, + "loss": 0.9814, + "step": 3920 + }, + { + "epoch": 0.09453047855302137, + "grad_norm": 1.7345143556594849, + "learning_rate": 9.531741202717777e-07, + "loss": 1.0062, + "step": 3925 + }, + { + "epoch": 0.09465089954480865, + "grad_norm": 1.5540204048156738, + "learning_rate": 9.530473582800933e-07, + "loss": 0.9077, + "step": 3930 + }, + { + "epoch": 0.09477132053659594, + "grad_norm": 1.5319840908050537, + "learning_rate": 9.529205962884088e-07, + "loss": 0.9675, + "step": 3935 + }, + { + "epoch": 0.09489174152838323, + "grad_norm": 1.5255876779556274, + "learning_rate": 9.527938342967244e-07, + "loss": 0.9512, + "step": 3940 + }, + { + "epoch": 0.09501216252017052, + "grad_norm": 1.393852710723877, + "learning_rate": 9.526670723050401e-07, + "loss": 0.9679, + "step": 3945 + }, + { + "epoch": 0.0951325835119578, + "grad_norm": 1.5980697870254517, + "learning_rate": 9.525403103133556e-07, + "loss": 0.9291, + "step": 3950 + }, + { + "epoch": 0.09525300450374509, + "grad_norm": 1.8581165075302124, + "learning_rate": 9.524135483216712e-07, + "loss": 0.8802, + "step": 3955 + }, + { + "epoch": 0.09537342549553238, + "grad_norm": 1.8296480178833008, + "learning_rate": 9.522867863299868e-07, + "loss": 0.9591, + "step": 3960 + }, + { + "epoch": 0.09549384648731966, + "grad_norm": 1.7010616064071655, + "learning_rate": 9.521600243383023e-07, + "loss": 0.9367, + "step": 3965 + }, + { + "epoch": 0.09561426747910695, + "grad_norm": 1.459513545036316, + "learning_rate": 9.52033262346618e-07, + "loss": 0.9193, + "step": 3970 + }, + { + "epoch": 0.09573468847089425, + "grad_norm": 1.5484166145324707, + "learning_rate": 9.519065003549336e-07, + "loss": 0.97, + "step": 3975 + }, + { + "epoch": 0.09585510946268154, + "grad_norm": 1.3898035287857056, + "learning_rate": 9.517797383632491e-07, + "loss": 1.0005, + "step": 3980 + }, + { + "epoch": 0.09597553045446883, + "grad_norm": 1.6612249612808228, + "learning_rate": 9.516529763715647e-07, + "loss": 0.9713, + "step": 3985 + }, + { + "epoch": 0.09609595144625611, + "grad_norm": 1.581580638885498, + "learning_rate": 9.515262143798803e-07, + "loss": 0.9966, + "step": 3990 + }, + { + "epoch": 0.0962163724380434, + "grad_norm": 1.6020958423614502, + "learning_rate": 9.513994523881959e-07, + "loss": 0.9582, + "step": 3995 + }, + { + "epoch": 0.09633679342983069, + "grad_norm": 1.6202304363250732, + "learning_rate": 9.512726903965115e-07, + "loss": 0.9673, + "step": 4000 + }, + { + "epoch": 0.09645721442161798, + "grad_norm": 1.4806082248687744, + "learning_rate": 9.51145928404827e-07, + "loss": 0.9414, + "step": 4005 + }, + { + "epoch": 0.09657763541340526, + "grad_norm": 1.4153348207473755, + "learning_rate": 9.510191664131426e-07, + "loss": 1.0094, + "step": 4010 + }, + { + "epoch": 0.09669805640519255, + "grad_norm": 1.5099906921386719, + "learning_rate": 9.508924044214582e-07, + "loss": 1.0002, + "step": 4015 + }, + { + "epoch": 0.09681847739697984, + "grad_norm": 1.4537502527236938, + "learning_rate": 9.507656424297739e-07, + "loss": 0.9604, + "step": 4020 + }, + { + "epoch": 0.09693889838876713, + "grad_norm": 1.5901180505752563, + "learning_rate": 9.506388804380894e-07, + "loss": 0.9957, + "step": 4025 + }, + { + "epoch": 0.09705931938055441, + "grad_norm": 1.530016303062439, + "learning_rate": 9.50512118446405e-07, + "loss": 0.9687, + "step": 4030 + }, + { + "epoch": 0.0971797403723417, + "grad_norm": 1.5487666130065918, + "learning_rate": 9.503853564547206e-07, + "loss": 0.964, + "step": 4035 + }, + { + "epoch": 0.097300161364129, + "grad_norm": 1.649968147277832, + "learning_rate": 9.502585944630361e-07, + "loss": 0.995, + "step": 4040 + }, + { + "epoch": 0.09742058235591629, + "grad_norm": 1.5910958051681519, + "learning_rate": 9.501318324713518e-07, + "loss": 0.9521, + "step": 4045 + }, + { + "epoch": 0.09754100334770358, + "grad_norm": 1.509109377861023, + "learning_rate": 9.500050704796674e-07, + "loss": 0.9757, + "step": 4050 + }, + { + "epoch": 0.09766142433949086, + "grad_norm": 1.4290871620178223, + "learning_rate": 9.498783084879829e-07, + "loss": 0.9602, + "step": 4055 + }, + { + "epoch": 0.09778184533127815, + "grad_norm": 1.5802230834960938, + "learning_rate": 9.497515464962985e-07, + "loss": 0.9717, + "step": 4060 + }, + { + "epoch": 0.09790226632306544, + "grad_norm": 1.4167861938476562, + "learning_rate": 9.496247845046142e-07, + "loss": 0.9509, + "step": 4065 + }, + { + "epoch": 0.09802268731485272, + "grad_norm": 1.5323904752731323, + "learning_rate": 9.494980225129297e-07, + "loss": 0.9557, + "step": 4070 + }, + { + "epoch": 0.09814310830664001, + "grad_norm": 1.2644950151443481, + "learning_rate": 9.493712605212452e-07, + "loss": 0.9239, + "step": 4075 + }, + { + "epoch": 0.0982635292984273, + "grad_norm": 1.574794054031372, + "learning_rate": 9.492444985295609e-07, + "loss": 0.9739, + "step": 4080 + }, + { + "epoch": 0.09838395029021459, + "grad_norm": 1.424659252166748, + "learning_rate": 9.491177365378764e-07, + "loss": 0.9867, + "step": 4085 + }, + { + "epoch": 0.09850437128200187, + "grad_norm": 1.5145343542099, + "learning_rate": 9.489909745461921e-07, + "loss": 0.9996, + "step": 4090 + }, + { + "epoch": 0.09862479227378916, + "grad_norm": 1.4513949155807495, + "learning_rate": 9.488642125545077e-07, + "loss": 0.9537, + "step": 4095 + }, + { + "epoch": 0.09874521326557646, + "grad_norm": 1.501413345336914, + "learning_rate": 9.487374505628231e-07, + "loss": 0.9569, + "step": 4100 + }, + { + "epoch": 0.09886563425736375, + "grad_norm": 1.536135196685791, + "learning_rate": 9.486106885711388e-07, + "loss": 0.9226, + "step": 4105 + }, + { + "epoch": 0.09898605524915104, + "grad_norm": 1.2886914014816284, + "learning_rate": 9.484839265794544e-07, + "loss": 0.9593, + "step": 4110 + }, + { + "epoch": 0.09910647624093832, + "grad_norm": 1.841726303100586, + "learning_rate": 9.4835716458777e-07, + "loss": 1.0258, + "step": 4115 + }, + { + "epoch": 0.09922689723272561, + "grad_norm": 1.5284498929977417, + "learning_rate": 9.482304025960855e-07, + "loss": 0.9678, + "step": 4120 + }, + { + "epoch": 0.0993473182245129, + "grad_norm": 3.6049418449401855, + "learning_rate": 9.481036406044011e-07, + "loss": 0.9033, + "step": 4125 + }, + { + "epoch": 0.09946773921630019, + "grad_norm": 1.560465931892395, + "learning_rate": 9.479768786127167e-07, + "loss": 0.9126, + "step": 4130 + }, + { + "epoch": 0.09958816020808747, + "grad_norm": 1.5048307180404663, + "learning_rate": 9.478501166210323e-07, + "loss": 0.9472, + "step": 4135 + }, + { + "epoch": 0.09970858119987476, + "grad_norm": 1.4856950044631958, + "learning_rate": 9.47723354629348e-07, + "loss": 0.9714, + "step": 4140 + }, + { + "epoch": 0.09982900219166205, + "grad_norm": 1.6258323192596436, + "learning_rate": 9.475965926376634e-07, + "loss": 0.9564, + "step": 4145 + }, + { + "epoch": 0.09994942318344933, + "grad_norm": 1.552347183227539, + "learning_rate": 9.474698306459791e-07, + "loss": 0.9473, + "step": 4150 + }, + { + "epoch": 0.10006984417523662, + "grad_norm": 1.8179153203964233, + "learning_rate": 9.473430686542947e-07, + "loss": 0.9837, + "step": 4155 + }, + { + "epoch": 0.10019026516702391, + "grad_norm": 1.4746826887130737, + "learning_rate": 9.472163066626102e-07, + "loss": 0.9202, + "step": 4160 + }, + { + "epoch": 0.10031068615881121, + "grad_norm": 1.7761483192443848, + "learning_rate": 9.470895446709259e-07, + "loss": 1.0175, + "step": 4165 + }, + { + "epoch": 0.1004311071505985, + "grad_norm": 1.3966078758239746, + "learning_rate": 9.469627826792414e-07, + "loss": 0.9678, + "step": 4170 + }, + { + "epoch": 0.10055152814238578, + "grad_norm": 1.6143893003463745, + "learning_rate": 9.46836020687557e-07, + "loss": 0.9992, + "step": 4175 + }, + { + "epoch": 0.10067194913417307, + "grad_norm": 1.681153655052185, + "learning_rate": 9.467092586958726e-07, + "loss": 0.965, + "step": 4180 + }, + { + "epoch": 0.10079237012596036, + "grad_norm": 1.4772725105285645, + "learning_rate": 9.465824967041883e-07, + "loss": 1.012, + "step": 4185 + }, + { + "epoch": 0.10091279111774765, + "grad_norm": 1.9717448949813843, + "learning_rate": 9.464557347125037e-07, + "loss": 1.0125, + "step": 4190 + }, + { + "epoch": 0.10103321210953493, + "grad_norm": 1.4508270025253296, + "learning_rate": 9.463289727208193e-07, + "loss": 0.9695, + "step": 4195 + }, + { + "epoch": 0.10115363310132222, + "grad_norm": 1.4963538646697998, + "learning_rate": 9.46202210729135e-07, + "loss": 1.0119, + "step": 4200 + }, + { + "epoch": 0.10127405409310951, + "grad_norm": 1.5520143508911133, + "learning_rate": 9.460754487374505e-07, + "loss": 0.9696, + "step": 4205 + }, + { + "epoch": 0.1013944750848968, + "grad_norm": 1.6408535242080688, + "learning_rate": 9.459486867457662e-07, + "loss": 0.9551, + "step": 4210 + }, + { + "epoch": 0.10151489607668408, + "grad_norm": 1.5083121061325073, + "learning_rate": 9.458219247540817e-07, + "loss": 0.9692, + "step": 4215 + }, + { + "epoch": 0.10163531706847137, + "grad_norm": 1.5416884422302246, + "learning_rate": 9.456951627623972e-07, + "loss": 0.9981, + "step": 4220 + }, + { + "epoch": 0.10175573806025866, + "grad_norm": 1.4579458236694336, + "learning_rate": 9.455684007707129e-07, + "loss": 0.9631, + "step": 4225 + }, + { + "epoch": 0.10187615905204596, + "grad_norm": 1.7778942584991455, + "learning_rate": 9.454416387790285e-07, + "loss": 0.9359, + "step": 4230 + }, + { + "epoch": 0.10199658004383325, + "grad_norm": 1.5496573448181152, + "learning_rate": 9.45314876787344e-07, + "loss": 0.9948, + "step": 4235 + }, + { + "epoch": 0.10211700103562053, + "grad_norm": 1.6221665143966675, + "learning_rate": 9.451881147956596e-07, + "loss": 0.9882, + "step": 4240 + }, + { + "epoch": 0.10223742202740782, + "grad_norm": 2.9152276515960693, + "learning_rate": 9.450613528039752e-07, + "loss": 0.9812, + "step": 4245 + }, + { + "epoch": 0.10235784301919511, + "grad_norm": 1.5360208749771118, + "learning_rate": 9.449345908122908e-07, + "loss": 0.944, + "step": 4250 + }, + { + "epoch": 0.1024782640109824, + "grad_norm": 1.5541127920150757, + "learning_rate": 9.448078288206064e-07, + "loss": 0.9994, + "step": 4255 + }, + { + "epoch": 0.10259868500276968, + "grad_norm": 1.4846186637878418, + "learning_rate": 9.44681066828922e-07, + "loss": 0.9377, + "step": 4260 + }, + { + "epoch": 0.10271910599455697, + "grad_norm": 1.5796730518341064, + "learning_rate": 9.445543048372375e-07, + "loss": 0.903, + "step": 4265 + }, + { + "epoch": 0.10283952698634426, + "grad_norm": 1.6794935464859009, + "learning_rate": 9.444275428455532e-07, + "loss": 0.9928, + "step": 4270 + }, + { + "epoch": 0.10295994797813154, + "grad_norm": 1.5276776552200317, + "learning_rate": 9.443007808538688e-07, + "loss": 0.9501, + "step": 4275 + }, + { + "epoch": 0.10308036896991883, + "grad_norm": 1.5344417095184326, + "learning_rate": 9.441740188621843e-07, + "loss": 0.9711, + "step": 4280 + }, + { + "epoch": 0.10320078996170612, + "grad_norm": 1.7467256784439087, + "learning_rate": 9.440472568704999e-07, + "loss": 0.9535, + "step": 4285 + }, + { + "epoch": 0.10332121095349342, + "grad_norm": 1.38759183883667, + "learning_rate": 9.439204948788155e-07, + "loss": 0.9555, + "step": 4290 + }, + { + "epoch": 0.10344163194528071, + "grad_norm": 1.624312162399292, + "learning_rate": 9.437937328871311e-07, + "loss": 1.0224, + "step": 4295 + }, + { + "epoch": 0.103562052937068, + "grad_norm": 1.673833966255188, + "learning_rate": 9.436669708954467e-07, + "loss": 0.9612, + "step": 4300 + }, + { + "epoch": 0.10368247392885528, + "grad_norm": 1.6893752813339233, + "learning_rate": 9.435402089037622e-07, + "loss": 0.9885, + "step": 4305 + }, + { + "epoch": 0.10380289492064257, + "grad_norm": 1.6171820163726807, + "learning_rate": 9.434134469120778e-07, + "loss": 1.0103, + "step": 4310 + }, + { + "epoch": 0.10392331591242986, + "grad_norm": 1.706410527229309, + "learning_rate": 9.432866849203934e-07, + "loss": 0.9841, + "step": 4315 + }, + { + "epoch": 0.10404373690421714, + "grad_norm": 1.5108826160430908, + "learning_rate": 9.431599229287091e-07, + "loss": 0.9814, + "step": 4320 + }, + { + "epoch": 0.10416415789600443, + "grad_norm": 1.5877901315689087, + "learning_rate": 9.430331609370246e-07, + "loss": 0.9791, + "step": 4325 + }, + { + "epoch": 0.10428457888779172, + "grad_norm": 1.650411605834961, + "learning_rate": 9.429063989453401e-07, + "loss": 0.9648, + "step": 4330 + }, + { + "epoch": 0.104404999879579, + "grad_norm": 1.6145753860473633, + "learning_rate": 9.427796369536558e-07, + "loss": 0.9916, + "step": 4335 + }, + { + "epoch": 0.10452542087136629, + "grad_norm": 1.7058037519454956, + "learning_rate": 9.426528749619713e-07, + "loss": 0.9741, + "step": 4340 + }, + { + "epoch": 0.10464584186315358, + "grad_norm": 1.5574986934661865, + "learning_rate": 9.42526112970287e-07, + "loss": 0.9752, + "step": 4345 + }, + { + "epoch": 0.10476626285494087, + "grad_norm": 1.51565420627594, + "learning_rate": 9.423993509786026e-07, + "loss": 0.968, + "step": 4350 + }, + { + "epoch": 0.10488668384672817, + "grad_norm": 1.8395243883132935, + "learning_rate": 9.422725889869181e-07, + "loss": 0.953, + "step": 4355 + }, + { + "epoch": 0.10500710483851546, + "grad_norm": 1.5606211423873901, + "learning_rate": 9.421458269952337e-07, + "loss": 0.957, + "step": 4360 + }, + { + "epoch": 0.10512752583030274, + "grad_norm": 1.6414927244186401, + "learning_rate": 9.420190650035493e-07, + "loss": 0.9739, + "step": 4365 + }, + { + "epoch": 0.10524794682209003, + "grad_norm": 1.4883346557617188, + "learning_rate": 9.418923030118649e-07, + "loss": 0.9875, + "step": 4370 + }, + { + "epoch": 0.10536836781387732, + "grad_norm": 1.480233073234558, + "learning_rate": 9.417655410201804e-07, + "loss": 0.9975, + "step": 4375 + }, + { + "epoch": 0.1054887888056646, + "grad_norm": 1.6190452575683594, + "learning_rate": 9.416387790284961e-07, + "loss": 1.0012, + "step": 4380 + }, + { + "epoch": 0.10560920979745189, + "grad_norm": 1.6386826038360596, + "learning_rate": 9.415120170368116e-07, + "loss": 0.9849, + "step": 4385 + }, + { + "epoch": 0.10572963078923918, + "grad_norm": 1.881450891494751, + "learning_rate": 9.413852550451272e-07, + "loss": 0.923, + "step": 4390 + }, + { + "epoch": 0.10585005178102647, + "grad_norm": 1.536907434463501, + "learning_rate": 9.412584930534429e-07, + "loss": 0.9716, + "step": 4395 + }, + { + "epoch": 0.10597047277281375, + "grad_norm": 2.033087968826294, + "learning_rate": 9.411317310617583e-07, + "loss": 0.945, + "step": 4400 + }, + { + "epoch": 0.10609089376460104, + "grad_norm": 1.5700300931930542, + "learning_rate": 9.41004969070074e-07, + "loss": 0.9874, + "step": 4405 + }, + { + "epoch": 0.10621131475638833, + "grad_norm": 1.6118327379226685, + "learning_rate": 9.408782070783896e-07, + "loss": 0.9513, + "step": 4410 + }, + { + "epoch": 0.10633173574817562, + "grad_norm": 1.5397894382476807, + "learning_rate": 9.407514450867052e-07, + "loss": 0.9445, + "step": 4415 + }, + { + "epoch": 0.10645215673996292, + "grad_norm": 1.7848899364471436, + "learning_rate": 9.406246830950207e-07, + "loss": 0.9626, + "step": 4420 + }, + { + "epoch": 0.1065725777317502, + "grad_norm": 1.5513091087341309, + "learning_rate": 9.404979211033363e-07, + "loss": 0.9744, + "step": 4425 + }, + { + "epoch": 0.10669299872353749, + "grad_norm": 1.535009741783142, + "learning_rate": 9.403711591116519e-07, + "loss": 0.9383, + "step": 4430 + }, + { + "epoch": 0.10681341971532478, + "grad_norm": 1.59181547164917, + "learning_rate": 9.402443971199675e-07, + "loss": 0.9882, + "step": 4435 + }, + { + "epoch": 0.10693384070711207, + "grad_norm": 1.7125526666641235, + "learning_rate": 9.401176351282832e-07, + "loss": 0.979, + "step": 4440 + }, + { + "epoch": 0.10705426169889935, + "grad_norm": 1.619505524635315, + "learning_rate": 9.399908731365986e-07, + "loss": 0.9604, + "step": 4445 + }, + { + "epoch": 0.10717468269068664, + "grad_norm": 1.9453794956207275, + "learning_rate": 9.398641111449142e-07, + "loss": 0.9507, + "step": 4450 + }, + { + "epoch": 0.10729510368247393, + "grad_norm": 1.5830106735229492, + "learning_rate": 9.397373491532299e-07, + "loss": 0.9718, + "step": 4455 + }, + { + "epoch": 0.10741552467426121, + "grad_norm": 1.6598644256591797, + "learning_rate": 9.396105871615454e-07, + "loss": 1.0022, + "step": 4460 + }, + { + "epoch": 0.1075359456660485, + "grad_norm": 1.6971837282180786, + "learning_rate": 9.394838251698611e-07, + "loss": 1.0149, + "step": 4465 + }, + { + "epoch": 0.10765636665783579, + "grad_norm": 1.7452025413513184, + "learning_rate": 9.393570631781766e-07, + "loss": 0.9395, + "step": 4470 + }, + { + "epoch": 0.10777678764962308, + "grad_norm": 1.655214786529541, + "learning_rate": 9.392303011864922e-07, + "loss": 0.9422, + "step": 4475 + }, + { + "epoch": 0.10789720864141038, + "grad_norm": 1.542333960533142, + "learning_rate": 9.391035391948078e-07, + "loss": 1.021, + "step": 4480 + }, + { + "epoch": 0.10801762963319766, + "grad_norm": 1.7552878856658936, + "learning_rate": 9.389767772031234e-07, + "loss": 0.9563, + "step": 4485 + }, + { + "epoch": 0.10813805062498495, + "grad_norm": 1.8255192041397095, + "learning_rate": 9.388500152114389e-07, + "loss": 0.9953, + "step": 4490 + }, + { + "epoch": 0.10825847161677224, + "grad_norm": 1.5070478916168213, + "learning_rate": 9.387232532197545e-07, + "loss": 0.9908, + "step": 4495 + }, + { + "epoch": 0.10837889260855953, + "grad_norm": 1.4872255325317383, + "learning_rate": 9.385964912280702e-07, + "loss": 0.9305, + "step": 4500 + }, + { + "epoch": 0.10849931360034681, + "grad_norm": 1.501749873161316, + "learning_rate": 9.384697292363857e-07, + "loss": 0.9861, + "step": 4505 + }, + { + "epoch": 0.1086197345921341, + "grad_norm": 1.7855017185211182, + "learning_rate": 9.383429672447013e-07, + "loss": 0.9322, + "step": 4510 + }, + { + "epoch": 0.10874015558392139, + "grad_norm": 1.3943707942962646, + "learning_rate": 9.382162052530169e-07, + "loss": 0.9588, + "step": 4515 + }, + { + "epoch": 0.10886057657570868, + "grad_norm": 1.5438886880874634, + "learning_rate": 9.380894432613324e-07, + "loss": 0.9358, + "step": 4520 + }, + { + "epoch": 0.10898099756749596, + "grad_norm": 1.761030673980713, + "learning_rate": 9.379626812696481e-07, + "loss": 0.9704, + "step": 4525 + }, + { + "epoch": 0.10910141855928325, + "grad_norm": 1.877665638923645, + "learning_rate": 9.378359192779637e-07, + "loss": 1.0007, + "step": 4530 + }, + { + "epoch": 0.10922183955107054, + "grad_norm": 1.6068402528762817, + "learning_rate": 9.377091572862791e-07, + "loss": 0.9647, + "step": 4535 + }, + { + "epoch": 0.10934226054285782, + "grad_norm": 1.6051069498062134, + "learning_rate": 9.375823952945948e-07, + "loss": 0.9498, + "step": 4540 + }, + { + "epoch": 0.10946268153464513, + "grad_norm": 1.457535982131958, + "learning_rate": 9.374556333029104e-07, + "loss": 0.9663, + "step": 4545 + }, + { + "epoch": 0.10958310252643241, + "grad_norm": 1.6678305864334106, + "learning_rate": 9.37328871311226e-07, + "loss": 1.0178, + "step": 4550 + }, + { + "epoch": 0.1097035235182197, + "grad_norm": 1.4807989597320557, + "learning_rate": 9.372021093195416e-07, + "loss": 0.8829, + "step": 4555 + }, + { + "epoch": 0.10982394451000699, + "grad_norm": 1.5890085697174072, + "learning_rate": 9.370753473278572e-07, + "loss": 0.9951, + "step": 4560 + }, + { + "epoch": 0.10994436550179428, + "grad_norm": 1.8150923252105713, + "learning_rate": 9.369485853361727e-07, + "loss": 0.8929, + "step": 4565 + }, + { + "epoch": 0.11006478649358156, + "grad_norm": 1.5665993690490723, + "learning_rate": 9.368218233444883e-07, + "loss": 0.9729, + "step": 4570 + }, + { + "epoch": 0.11018520748536885, + "grad_norm": 1.9737279415130615, + "learning_rate": 9.36695061352804e-07, + "loss": 1.0065, + "step": 4575 + }, + { + "epoch": 0.11030562847715614, + "grad_norm": 1.4127687215805054, + "learning_rate": 9.365682993611195e-07, + "loss": 0.9853, + "step": 4580 + }, + { + "epoch": 0.11042604946894342, + "grad_norm": 1.557056188583374, + "learning_rate": 9.364415373694351e-07, + "loss": 0.9623, + "step": 4585 + }, + { + "epoch": 0.11054647046073071, + "grad_norm": 1.6299985647201538, + "learning_rate": 9.363147753777507e-07, + "loss": 0.9772, + "step": 4590 + }, + { + "epoch": 0.110666891452518, + "grad_norm": 1.655799388885498, + "learning_rate": 9.361880133860662e-07, + "loss": 1.0002, + "step": 4595 + }, + { + "epoch": 0.11078731244430529, + "grad_norm": 1.546984314918518, + "learning_rate": 9.360612513943819e-07, + "loss": 0.8815, + "step": 4600 + }, + { + "epoch": 0.11090773343609257, + "grad_norm": 1.4920119047164917, + "learning_rate": 9.359344894026974e-07, + "loss": 0.9835, + "step": 4605 + }, + { + "epoch": 0.11102815442787987, + "grad_norm": 1.5333186388015747, + "learning_rate": 9.35807727411013e-07, + "loss": 0.9534, + "step": 4610 + }, + { + "epoch": 0.11114857541966716, + "grad_norm": 1.6916743516921997, + "learning_rate": 9.356809654193286e-07, + "loss": 0.9304, + "step": 4615 + }, + { + "epoch": 0.11126899641145445, + "grad_norm": 1.8019620180130005, + "learning_rate": 9.355542034276443e-07, + "loss": 1.0045, + "step": 4620 + }, + { + "epoch": 0.11138941740324174, + "grad_norm": 1.4818156957626343, + "learning_rate": 9.354274414359598e-07, + "loss": 0.9803, + "step": 4625 + }, + { + "epoch": 0.11150983839502902, + "grad_norm": 1.5333963632583618, + "learning_rate": 9.353006794442753e-07, + "loss": 0.9324, + "step": 4630 + }, + { + "epoch": 0.11163025938681631, + "grad_norm": 1.7001012563705444, + "learning_rate": 9.35173917452591e-07, + "loss": 0.8934, + "step": 4635 + }, + { + "epoch": 0.1117506803786036, + "grad_norm": 1.4808404445648193, + "learning_rate": 9.350471554609065e-07, + "loss": 0.9245, + "step": 4640 + }, + { + "epoch": 0.11187110137039089, + "grad_norm": 1.511756181716919, + "learning_rate": 9.349203934692222e-07, + "loss": 0.9419, + "step": 4645 + }, + { + "epoch": 0.11199152236217817, + "grad_norm": 1.5101591348648071, + "learning_rate": 9.347936314775378e-07, + "loss": 0.9783, + "step": 4650 + }, + { + "epoch": 0.11211194335396546, + "grad_norm": 1.513121247291565, + "learning_rate": 9.346668694858532e-07, + "loss": 1.0224, + "step": 4655 + }, + { + "epoch": 0.11223236434575275, + "grad_norm": 1.4988608360290527, + "learning_rate": 9.345401074941689e-07, + "loss": 0.9689, + "step": 4660 + }, + { + "epoch": 0.11235278533754003, + "grad_norm": 1.5721975564956665, + "learning_rate": 9.344133455024845e-07, + "loss": 0.9555, + "step": 4665 + }, + { + "epoch": 0.11247320632932732, + "grad_norm": 1.696729302406311, + "learning_rate": 9.342865835108001e-07, + "loss": 0.9495, + "step": 4670 + }, + { + "epoch": 0.11259362732111462, + "grad_norm": 1.4797080755233765, + "learning_rate": 9.341598215191156e-07, + "loss": 0.9938, + "step": 4675 + }, + { + "epoch": 0.11271404831290191, + "grad_norm": 1.5340688228607178, + "learning_rate": 9.340330595274313e-07, + "loss": 0.9271, + "step": 4680 + }, + { + "epoch": 0.1128344693046892, + "grad_norm": 1.5662153959274292, + "learning_rate": 9.339062975357468e-07, + "loss": 0.98, + "step": 4685 + }, + { + "epoch": 0.11295489029647648, + "grad_norm": 1.5172141790390015, + "learning_rate": 9.337795355440624e-07, + "loss": 0.958, + "step": 4690 + }, + { + "epoch": 0.11307531128826377, + "grad_norm": 1.6660382747650146, + "learning_rate": 9.336527735523781e-07, + "loss": 0.9115, + "step": 4695 + }, + { + "epoch": 0.11319573228005106, + "grad_norm": 1.740270972251892, + "learning_rate": 9.335260115606935e-07, + "loss": 0.9381, + "step": 4700 + }, + { + "epoch": 0.11331615327183835, + "grad_norm": 1.401108980178833, + "learning_rate": 9.333992495690092e-07, + "loss": 0.9772, + "step": 4705 + }, + { + "epoch": 0.11343657426362563, + "grad_norm": 1.6223112344741821, + "learning_rate": 9.332724875773248e-07, + "loss": 0.9462, + "step": 4710 + }, + { + "epoch": 0.11355699525541292, + "grad_norm": 1.654981255531311, + "learning_rate": 9.331457255856403e-07, + "loss": 0.9887, + "step": 4715 + }, + { + "epoch": 0.11367741624720021, + "grad_norm": 1.5551520586013794, + "learning_rate": 9.330189635939559e-07, + "loss": 0.9886, + "step": 4720 + }, + { + "epoch": 0.1137978372389875, + "grad_norm": 2.040355920791626, + "learning_rate": 9.328922016022715e-07, + "loss": 0.9203, + "step": 4725 + }, + { + "epoch": 0.11391825823077478, + "grad_norm": 1.7608866691589355, + "learning_rate": 9.327654396105871e-07, + "loss": 0.962, + "step": 4730 + }, + { + "epoch": 0.11403867922256208, + "grad_norm": 2.0532608032226562, + "learning_rate": 9.326386776189027e-07, + "loss": 0.9713, + "step": 4735 + }, + { + "epoch": 0.11415910021434937, + "grad_norm": 1.3905706405639648, + "learning_rate": 9.325119156272184e-07, + "loss": 0.9415, + "step": 4740 + }, + { + "epoch": 0.11427952120613666, + "grad_norm": 1.5101875066757202, + "learning_rate": 9.323851536355338e-07, + "loss": 1.0173, + "step": 4745 + }, + { + "epoch": 0.11439994219792395, + "grad_norm": 1.4829469919204712, + "learning_rate": 9.322583916438494e-07, + "loss": 1.0155, + "step": 4750 + }, + { + "epoch": 0.11452036318971123, + "grad_norm": 1.5244263410568237, + "learning_rate": 9.321316296521651e-07, + "loss": 0.9863, + "step": 4755 + }, + { + "epoch": 0.11464078418149852, + "grad_norm": 1.638397216796875, + "learning_rate": 9.320048676604806e-07, + "loss": 1.004, + "step": 4760 + }, + { + "epoch": 0.11476120517328581, + "grad_norm": 1.5108238458633423, + "learning_rate": 9.318781056687963e-07, + "loss": 0.9795, + "step": 4765 + }, + { + "epoch": 0.1148816261650731, + "grad_norm": 1.6074522733688354, + "learning_rate": 9.317513436771118e-07, + "loss": 1.0123, + "step": 4770 + }, + { + "epoch": 0.11500204715686038, + "grad_norm": 1.4743024110794067, + "learning_rate": 9.316245816854273e-07, + "loss": 0.9394, + "step": 4775 + }, + { + "epoch": 0.11512246814864767, + "grad_norm": 1.5913125276565552, + "learning_rate": 9.31497819693743e-07, + "loss": 0.9358, + "step": 4780 + }, + { + "epoch": 0.11524288914043496, + "grad_norm": 1.4869250059127808, + "learning_rate": 9.313710577020586e-07, + "loss": 0.9499, + "step": 4785 + }, + { + "epoch": 0.11536331013222224, + "grad_norm": 1.6868833303451538, + "learning_rate": 9.312442957103741e-07, + "loss": 0.9614, + "step": 4790 + }, + { + "epoch": 0.11548373112400953, + "grad_norm": 1.4460880756378174, + "learning_rate": 9.311175337186897e-07, + "loss": 0.8808, + "step": 4795 + }, + { + "epoch": 0.11560415211579683, + "grad_norm": 1.3952046632766724, + "learning_rate": 9.309907717270054e-07, + "loss": 0.9999, + "step": 4800 + }, + { + "epoch": 0.11572457310758412, + "grad_norm": 1.459100365638733, + "learning_rate": 9.308640097353209e-07, + "loss": 0.9508, + "step": 4805 + }, + { + "epoch": 0.1158449940993714, + "grad_norm": 1.4393701553344727, + "learning_rate": 9.307372477436365e-07, + "loss": 0.9974, + "step": 4810 + }, + { + "epoch": 0.1159654150911587, + "grad_norm": 1.6273918151855469, + "learning_rate": 9.306104857519521e-07, + "loss": 0.9248, + "step": 4815 + }, + { + "epoch": 0.11608583608294598, + "grad_norm": 1.478814959526062, + "learning_rate": 9.304837237602676e-07, + "loss": 0.9576, + "step": 4820 + }, + { + "epoch": 0.11620625707473327, + "grad_norm": 1.8840432167053223, + "learning_rate": 9.303569617685833e-07, + "loss": 1.0513, + "step": 4825 + }, + { + "epoch": 0.11632667806652056, + "grad_norm": 1.6974865198135376, + "learning_rate": 9.302301997768989e-07, + "loss": 0.9699, + "step": 4830 + }, + { + "epoch": 0.11644709905830784, + "grad_norm": 1.5943163633346558, + "learning_rate": 9.301034377852143e-07, + "loss": 0.946, + "step": 4835 + }, + { + "epoch": 0.11656752005009513, + "grad_norm": 1.5880036354064941, + "learning_rate": 9.2997667579353e-07, + "loss": 0.9744, + "step": 4840 + }, + { + "epoch": 0.11668794104188242, + "grad_norm": 1.8085670471191406, + "learning_rate": 9.298499138018456e-07, + "loss": 0.9736, + "step": 4845 + }, + { + "epoch": 0.1168083620336697, + "grad_norm": 1.504788875579834, + "learning_rate": 9.297231518101612e-07, + "loss": 0.9505, + "step": 4850 + }, + { + "epoch": 0.11692878302545699, + "grad_norm": 1.4562904834747314, + "learning_rate": 9.295963898184768e-07, + "loss": 0.9379, + "step": 4855 + }, + { + "epoch": 0.11704920401724428, + "grad_norm": 1.4931610822677612, + "learning_rate": 9.294696278267923e-07, + "loss": 0.9242, + "step": 4860 + }, + { + "epoch": 0.11716962500903158, + "grad_norm": 1.4892380237579346, + "learning_rate": 9.29342865835108e-07, + "loss": 0.9444, + "step": 4865 + }, + { + "epoch": 0.11729004600081887, + "grad_norm": 1.8332098722457886, + "learning_rate": 9.292161038434235e-07, + "loss": 1.0237, + "step": 4870 + }, + { + "epoch": 0.11741046699260616, + "grad_norm": 1.5724458694458008, + "learning_rate": 9.290893418517392e-07, + "loss": 0.8696, + "step": 4875 + }, + { + "epoch": 0.11753088798439344, + "grad_norm": 1.5204848051071167, + "learning_rate": 9.289625798600548e-07, + "loss": 0.9876, + "step": 4880 + }, + { + "epoch": 0.11765130897618073, + "grad_norm": 1.428585410118103, + "learning_rate": 9.288358178683703e-07, + "loss": 0.935, + "step": 4885 + }, + { + "epoch": 0.11777172996796802, + "grad_norm": 1.7297282218933105, + "learning_rate": 9.287090558766859e-07, + "loss": 0.9904, + "step": 4890 + }, + { + "epoch": 0.1178921509597553, + "grad_norm": 1.5793259143829346, + "learning_rate": 9.285822938850015e-07, + "loss": 0.9694, + "step": 4895 + }, + { + "epoch": 0.11801257195154259, + "grad_norm": 1.7904987335205078, + "learning_rate": 9.284555318933171e-07, + "loss": 0.9852, + "step": 4900 + }, + { + "epoch": 0.11813299294332988, + "grad_norm": 1.4578253030776978, + "learning_rate": 9.283287699016326e-07, + "loss": 0.9786, + "step": 4905 + }, + { + "epoch": 0.11825341393511717, + "grad_norm": 1.439958930015564, + "learning_rate": 9.282020079099483e-07, + "loss": 0.9291, + "step": 4910 + }, + { + "epoch": 0.11837383492690445, + "grad_norm": 1.6253753900527954, + "learning_rate": 9.280752459182638e-07, + "loss": 0.9948, + "step": 4915 + }, + { + "epoch": 0.11849425591869174, + "grad_norm": 1.617333173751831, + "learning_rate": 9.279484839265794e-07, + "loss": 0.9229, + "step": 4920 + }, + { + "epoch": 0.11861467691047904, + "grad_norm": 1.7600502967834473, + "learning_rate": 9.278217219348951e-07, + "loss": 0.9895, + "step": 4925 + }, + { + "epoch": 0.11873509790226633, + "grad_norm": 1.636748194694519, + "learning_rate": 9.276949599432105e-07, + "loss": 0.9766, + "step": 4930 + }, + { + "epoch": 0.11885551889405362, + "grad_norm": 1.6138620376586914, + "learning_rate": 9.275681979515262e-07, + "loss": 0.9108, + "step": 4935 + }, + { + "epoch": 0.1189759398858409, + "grad_norm": 1.5964850187301636, + "learning_rate": 9.274414359598418e-07, + "loss": 0.9817, + "step": 4940 + }, + { + "epoch": 0.11909636087762819, + "grad_norm": 1.6752839088439941, + "learning_rate": 9.273146739681574e-07, + "loss": 0.9545, + "step": 4945 + }, + { + "epoch": 0.11921678186941548, + "grad_norm": 1.6496039628982544, + "learning_rate": 9.271879119764729e-07, + "loss": 0.9907, + "step": 4950 + }, + { + "epoch": 0.11933720286120277, + "grad_norm": 1.4864082336425781, + "learning_rate": 9.270611499847885e-07, + "loss": 0.9827, + "step": 4955 + }, + { + "epoch": 0.11945762385299005, + "grad_norm": 1.483738660812378, + "learning_rate": 9.269343879931041e-07, + "loss": 0.9609, + "step": 4960 + }, + { + "epoch": 0.11957804484477734, + "grad_norm": 1.645350694656372, + "learning_rate": 9.268076260014197e-07, + "loss": 0.9546, + "step": 4965 + }, + { + "epoch": 0.11969846583656463, + "grad_norm": 1.473766803741455, + "learning_rate": 9.266808640097354e-07, + "loss": 0.9515, + "step": 4970 + }, + { + "epoch": 0.11981888682835191, + "grad_norm": 1.4186217784881592, + "learning_rate": 9.265541020180508e-07, + "loss": 1.0048, + "step": 4975 + }, + { + "epoch": 0.1199393078201392, + "grad_norm": 1.6372003555297852, + "learning_rate": 9.264273400263664e-07, + "loss": 0.9712, + "step": 4980 + }, + { + "epoch": 0.12005972881192649, + "grad_norm": 2.0134482383728027, + "learning_rate": 9.263005780346821e-07, + "loss": 0.9544, + "step": 4985 + }, + { + "epoch": 0.12018014980371379, + "grad_norm": 1.6138123273849487, + "learning_rate": 9.261738160429976e-07, + "loss": 0.9704, + "step": 4990 + }, + { + "epoch": 0.12030057079550108, + "grad_norm": 1.5256341695785522, + "learning_rate": 9.260470540513133e-07, + "loss": 0.9477, + "step": 4995 + }, + { + "epoch": 0.12042099178728836, + "grad_norm": 1.558856725692749, + "learning_rate": 9.259202920596288e-07, + "loss": 0.9431, + "step": 5000 + }, + { + "epoch": 0.12054141277907565, + "grad_norm": 1.8143106698989868, + "learning_rate": 9.257935300679444e-07, + "loss": 0.9618, + "step": 5005 + }, + { + "epoch": 0.12066183377086294, + "grad_norm": 1.6698336601257324, + "learning_rate": 9.2566676807626e-07, + "loss": 0.9523, + "step": 5010 + }, + { + "epoch": 0.12078225476265023, + "grad_norm": 1.6376757621765137, + "learning_rate": 9.255400060845756e-07, + "loss": 0.8967, + "step": 5015 + }, + { + "epoch": 0.12090267575443751, + "grad_norm": 1.6007685661315918, + "learning_rate": 9.254132440928911e-07, + "loss": 0.9103, + "step": 5020 + }, + { + "epoch": 0.1210230967462248, + "grad_norm": 1.5988738536834717, + "learning_rate": 9.252864821012067e-07, + "loss": 0.9777, + "step": 5025 + }, + { + "epoch": 0.12114351773801209, + "grad_norm": 1.5461716651916504, + "learning_rate": 9.251597201095224e-07, + "loss": 0.9269, + "step": 5030 + }, + { + "epoch": 0.12126393872979938, + "grad_norm": 1.6856049299240112, + "learning_rate": 9.250329581178379e-07, + "loss": 0.9482, + "step": 5035 + }, + { + "epoch": 0.12138435972158666, + "grad_norm": 1.4944417476654053, + "learning_rate": 9.249061961261535e-07, + "loss": 0.9646, + "step": 5040 + }, + { + "epoch": 0.12150478071337395, + "grad_norm": 1.87632155418396, + "learning_rate": 9.247794341344691e-07, + "loss": 0.9717, + "step": 5045 + }, + { + "epoch": 0.12162520170516124, + "grad_norm": 1.8691729307174683, + "learning_rate": 9.246526721427846e-07, + "loss": 0.929, + "step": 5050 + }, + { + "epoch": 0.12174562269694854, + "grad_norm": 1.5722233057022095, + "learning_rate": 9.245259101511003e-07, + "loss": 0.9637, + "step": 5055 + }, + { + "epoch": 0.12186604368873583, + "grad_norm": 1.6120997667312622, + "learning_rate": 9.243991481594159e-07, + "loss": 1.0072, + "step": 5060 + }, + { + "epoch": 0.12198646468052311, + "grad_norm": 1.6073572635650635, + "learning_rate": 9.242723861677313e-07, + "loss": 0.9322, + "step": 5065 + }, + { + "epoch": 0.1221068856723104, + "grad_norm": 1.6781420707702637, + "learning_rate": 9.24145624176047e-07, + "loss": 0.9434, + "step": 5070 + }, + { + "epoch": 0.12222730666409769, + "grad_norm": 1.8128712177276611, + "learning_rate": 9.240188621843626e-07, + "loss": 0.918, + "step": 5075 + }, + { + "epoch": 0.12234772765588497, + "grad_norm": 1.5147145986557007, + "learning_rate": 9.238921001926782e-07, + "loss": 0.9762, + "step": 5080 + }, + { + "epoch": 0.12246814864767226, + "grad_norm": 1.45526123046875, + "learning_rate": 9.237653382009938e-07, + "loss": 0.9586, + "step": 5085 + }, + { + "epoch": 0.12258856963945955, + "grad_norm": 1.5992141962051392, + "learning_rate": 9.236385762093094e-07, + "loss": 0.933, + "step": 5090 + }, + { + "epoch": 0.12270899063124684, + "grad_norm": 1.419451117515564, + "learning_rate": 9.235118142176249e-07, + "loss": 0.9963, + "step": 5095 + }, + { + "epoch": 0.12282941162303412, + "grad_norm": 1.6753170490264893, + "learning_rate": 9.233850522259405e-07, + "loss": 0.9369, + "step": 5100 + }, + { + "epoch": 0.12294983261482141, + "grad_norm": 1.876473307609558, + "learning_rate": 9.232582902342562e-07, + "loss": 0.9491, + "step": 5105 + }, + { + "epoch": 0.1230702536066087, + "grad_norm": 1.5998787879943848, + "learning_rate": 9.231315282425717e-07, + "loss": 0.9891, + "step": 5110 + }, + { + "epoch": 0.12319067459839599, + "grad_norm": 1.7187938690185547, + "learning_rate": 9.230047662508873e-07, + "loss": 0.9506, + "step": 5115 + }, + { + "epoch": 0.12331109559018329, + "grad_norm": 1.3768280744552612, + "learning_rate": 9.228780042592029e-07, + "loss": 0.9974, + "step": 5120 + }, + { + "epoch": 0.12343151658197057, + "grad_norm": 1.4900585412979126, + "learning_rate": 9.227512422675184e-07, + "loss": 0.9339, + "step": 5125 + }, + { + "epoch": 0.12355193757375786, + "grad_norm": 1.7107560634613037, + "learning_rate": 9.226244802758341e-07, + "loss": 0.9153, + "step": 5130 + }, + { + "epoch": 0.12367235856554515, + "grad_norm": 2.090656042098999, + "learning_rate": 9.224977182841496e-07, + "loss": 0.895, + "step": 5135 + }, + { + "epoch": 0.12379277955733244, + "grad_norm": 1.7037839889526367, + "learning_rate": 9.223709562924652e-07, + "loss": 1.0013, + "step": 5140 + }, + { + "epoch": 0.12391320054911972, + "grad_norm": 1.5398280620574951, + "learning_rate": 9.222441943007808e-07, + "loss": 0.9445, + "step": 5145 + }, + { + "epoch": 0.12403362154090701, + "grad_norm": 1.6887266635894775, + "learning_rate": 9.221174323090965e-07, + "loss": 0.9767, + "step": 5150 + }, + { + "epoch": 0.1241540425326943, + "grad_norm": 1.401342511177063, + "learning_rate": 9.21990670317412e-07, + "loss": 0.8865, + "step": 5155 + }, + { + "epoch": 0.12427446352448158, + "grad_norm": 1.3718047142028809, + "learning_rate": 9.218639083257275e-07, + "loss": 0.9621, + "step": 5160 + }, + { + "epoch": 0.12439488451626887, + "grad_norm": 1.6709517240524292, + "learning_rate": 9.217371463340432e-07, + "loss": 0.967, + "step": 5165 + }, + { + "epoch": 0.12451530550805616, + "grad_norm": 1.510709285736084, + "learning_rate": 9.216103843423587e-07, + "loss": 0.9674, + "step": 5170 + }, + { + "epoch": 0.12463572649984345, + "grad_norm": 1.467331886291504, + "learning_rate": 9.214836223506744e-07, + "loss": 0.9637, + "step": 5175 + }, + { + "epoch": 0.12475614749163075, + "grad_norm": 1.5031909942626953, + "learning_rate": 9.2135686035899e-07, + "loss": 0.9485, + "step": 5180 + }, + { + "epoch": 0.12487656848341804, + "grad_norm": 1.759961485862732, + "learning_rate": 9.212300983673054e-07, + "loss": 1.0108, + "step": 5185 + }, + { + "epoch": 0.12499698947520532, + "grad_norm": 1.519850492477417, + "learning_rate": 9.211033363756211e-07, + "loss": 0.9442, + "step": 5190 + }, + { + "epoch": 0.1251174104669926, + "grad_norm": 1.8199149370193481, + "learning_rate": 9.209765743839367e-07, + "loss": 0.9235, + "step": 5195 + }, + { + "epoch": 0.1252378314587799, + "grad_norm": 1.6225138902664185, + "learning_rate": 9.208498123922523e-07, + "loss": 0.9856, + "step": 5200 + }, + { + "epoch": 0.12535825245056717, + "grad_norm": 1.6960610151290894, + "learning_rate": 9.207230504005678e-07, + "loss": 1.0328, + "step": 5205 + }, + { + "epoch": 0.12547867344235447, + "grad_norm": 1.9315990209579468, + "learning_rate": 9.205962884088835e-07, + "loss": 0.9509, + "step": 5210 + }, + { + "epoch": 0.12559909443414177, + "grad_norm": 1.6579340696334839, + "learning_rate": 9.20469526417199e-07, + "loss": 0.9352, + "step": 5215 + }, + { + "epoch": 0.12571951542592905, + "grad_norm": 1.6070151329040527, + "learning_rate": 9.203427644255146e-07, + "loss": 0.9809, + "step": 5220 + }, + { + "epoch": 0.12583993641771635, + "grad_norm": 1.4353399276733398, + "learning_rate": 9.202160024338303e-07, + "loss": 0.9512, + "step": 5225 + }, + { + "epoch": 0.12596035740950362, + "grad_norm": 1.5951476097106934, + "learning_rate": 9.200892404421457e-07, + "loss": 0.9959, + "step": 5230 + }, + { + "epoch": 0.12608077840129092, + "grad_norm": 1.600804090499878, + "learning_rate": 9.199624784504614e-07, + "loss": 0.9749, + "step": 5235 + }, + { + "epoch": 0.1262011993930782, + "grad_norm": 1.6272754669189453, + "learning_rate": 9.19835716458777e-07, + "loss": 0.9287, + "step": 5240 + }, + { + "epoch": 0.1263216203848655, + "grad_norm": 1.556373953819275, + "learning_rate": 9.197089544670925e-07, + "loss": 0.9435, + "step": 5245 + }, + { + "epoch": 0.12644204137665277, + "grad_norm": 1.7332128286361694, + "learning_rate": 9.195821924754081e-07, + "loss": 0.9235, + "step": 5250 + }, + { + "epoch": 0.12656246236844007, + "grad_norm": 1.6399269104003906, + "learning_rate": 9.194554304837237e-07, + "loss": 0.9436, + "step": 5255 + }, + { + "epoch": 0.12668288336022734, + "grad_norm": 1.756186842918396, + "learning_rate": 9.193286684920393e-07, + "loss": 0.9137, + "step": 5260 + }, + { + "epoch": 0.12680330435201465, + "grad_norm": 1.9386248588562012, + "learning_rate": 9.192019065003549e-07, + "loss": 0.9831, + "step": 5265 + }, + { + "epoch": 0.12692372534380192, + "grad_norm": 1.8038384914398193, + "learning_rate": 9.190751445086706e-07, + "loss": 0.8885, + "step": 5270 + }, + { + "epoch": 0.12704414633558922, + "grad_norm": 1.5861470699310303, + "learning_rate": 9.18948382516986e-07, + "loss": 0.9518, + "step": 5275 + }, + { + "epoch": 0.12716456732737652, + "grad_norm": 1.5452990531921387, + "learning_rate": 9.188216205253016e-07, + "loss": 1.0352, + "step": 5280 + }, + { + "epoch": 0.1272849883191638, + "grad_norm": 1.6214449405670166, + "learning_rate": 9.186948585336173e-07, + "loss": 1.0022, + "step": 5285 + }, + { + "epoch": 0.1274054093109511, + "grad_norm": 1.5629680156707764, + "learning_rate": 9.185680965419328e-07, + "loss": 0.9569, + "step": 5290 + }, + { + "epoch": 0.12752583030273837, + "grad_norm": 1.5142781734466553, + "learning_rate": 9.184413345502485e-07, + "loss": 0.9449, + "step": 5295 + }, + { + "epoch": 0.12764625129452567, + "grad_norm": 1.5961127281188965, + "learning_rate": 9.18314572558564e-07, + "loss": 0.9837, + "step": 5300 + }, + { + "epoch": 0.12776667228631294, + "grad_norm": 1.583109974861145, + "learning_rate": 9.181878105668795e-07, + "loss": 0.9623, + "step": 5305 + }, + { + "epoch": 0.12788709327810024, + "grad_norm": 1.740462303161621, + "learning_rate": 9.180610485751952e-07, + "loss": 0.9572, + "step": 5310 + }, + { + "epoch": 0.12800751426988752, + "grad_norm": 1.6290134191513062, + "learning_rate": 9.179342865835108e-07, + "loss": 0.9361, + "step": 5315 + }, + { + "epoch": 0.12812793526167482, + "grad_norm": 1.3808825016021729, + "learning_rate": 9.178075245918263e-07, + "loss": 0.9961, + "step": 5320 + }, + { + "epoch": 0.1282483562534621, + "grad_norm": 1.8814772367477417, + "learning_rate": 9.176807626001419e-07, + "loss": 0.9583, + "step": 5325 + }, + { + "epoch": 0.1283687772452494, + "grad_norm": 1.5854055881500244, + "learning_rate": 9.175540006084576e-07, + "loss": 0.9677, + "step": 5330 + }, + { + "epoch": 0.12848919823703667, + "grad_norm": 1.688438892364502, + "learning_rate": 9.174272386167731e-07, + "loss": 0.9858, + "step": 5335 + }, + { + "epoch": 0.12860961922882397, + "grad_norm": 1.4895740747451782, + "learning_rate": 9.173004766250887e-07, + "loss": 0.9622, + "step": 5340 + }, + { + "epoch": 0.12873004022061127, + "grad_norm": 1.4580119848251343, + "learning_rate": 9.171737146334043e-07, + "loss": 0.9619, + "step": 5345 + }, + { + "epoch": 0.12885046121239854, + "grad_norm": 1.5655674934387207, + "learning_rate": 9.170469526417198e-07, + "loss": 0.9412, + "step": 5350 + }, + { + "epoch": 0.12897088220418584, + "grad_norm": 1.5745370388031006, + "learning_rate": 9.169201906500355e-07, + "loss": 0.9373, + "step": 5355 + }, + { + "epoch": 0.12909130319597312, + "grad_norm": 1.4132782220840454, + "learning_rate": 9.167934286583511e-07, + "loss": 0.9671, + "step": 5360 + }, + { + "epoch": 0.12921172418776042, + "grad_norm": 1.5725817680358887, + "learning_rate": 9.166666666666665e-07, + "loss": 0.9964, + "step": 5365 + }, + { + "epoch": 0.1293321451795477, + "grad_norm": 1.7641382217407227, + "learning_rate": 9.165399046749822e-07, + "loss": 0.9692, + "step": 5370 + }, + { + "epoch": 0.129452566171335, + "grad_norm": 1.7528362274169922, + "learning_rate": 9.164131426832978e-07, + "loss": 0.938, + "step": 5375 + }, + { + "epoch": 0.12957298716312227, + "grad_norm": 1.7025678157806396, + "learning_rate": 9.162863806916134e-07, + "loss": 0.9336, + "step": 5380 + }, + { + "epoch": 0.12969340815490957, + "grad_norm": 1.630113959312439, + "learning_rate": 9.16159618699929e-07, + "loss": 1.0092, + "step": 5385 + }, + { + "epoch": 0.12981382914669684, + "grad_norm": 1.58848237991333, + "learning_rate": 9.160328567082445e-07, + "loss": 0.9254, + "step": 5390 + }, + { + "epoch": 0.12993425013848414, + "grad_norm": 1.4365260601043701, + "learning_rate": 9.159060947165601e-07, + "loss": 0.9769, + "step": 5395 + }, + { + "epoch": 0.13005467113027142, + "grad_norm": 1.4236555099487305, + "learning_rate": 9.157793327248757e-07, + "loss": 0.9208, + "step": 5400 + }, + { + "epoch": 0.13017509212205872, + "grad_norm": 1.6881691217422485, + "learning_rate": 9.156525707331914e-07, + "loss": 0.9772, + "step": 5405 + }, + { + "epoch": 0.13029551311384602, + "grad_norm": 1.568904995918274, + "learning_rate": 9.155258087415069e-07, + "loss": 0.9121, + "step": 5410 + }, + { + "epoch": 0.1304159341056333, + "grad_norm": 1.721828818321228, + "learning_rate": 9.153990467498225e-07, + "loss": 0.9717, + "step": 5415 + }, + { + "epoch": 0.1305363550974206, + "grad_norm": 1.6432464122772217, + "learning_rate": 9.152722847581381e-07, + "loss": 0.9182, + "step": 5420 + }, + { + "epoch": 0.13065677608920787, + "grad_norm": 1.5352879762649536, + "learning_rate": 9.151455227664536e-07, + "loss": 0.9659, + "step": 5425 + }, + { + "epoch": 0.13077719708099517, + "grad_norm": 1.8074719905853271, + "learning_rate": 9.150187607747693e-07, + "loss": 0.9903, + "step": 5430 + }, + { + "epoch": 0.13089761807278244, + "grad_norm": 1.5905964374542236, + "learning_rate": 9.148919987830848e-07, + "loss": 0.9566, + "step": 5435 + }, + { + "epoch": 0.13101803906456974, + "grad_norm": 1.4441380500793457, + "learning_rate": 9.147652367914004e-07, + "loss": 0.9314, + "step": 5440 + }, + { + "epoch": 0.13113846005635701, + "grad_norm": 1.55857253074646, + "learning_rate": 9.14638474799716e-07, + "loss": 0.9121, + "step": 5445 + }, + { + "epoch": 0.13125888104814432, + "grad_norm": 1.6368721723556519, + "learning_rate": 9.145117128080317e-07, + "loss": 0.9487, + "step": 5450 + }, + { + "epoch": 0.1313793020399316, + "grad_norm": 1.7417676448822021, + "learning_rate": 9.143849508163472e-07, + "loss": 0.9368, + "step": 5455 + }, + { + "epoch": 0.1314997230317189, + "grad_norm": 1.3386549949645996, + "learning_rate": 9.142581888246627e-07, + "loss": 0.9817, + "step": 5460 + }, + { + "epoch": 0.1316201440235062, + "grad_norm": 1.568506121635437, + "learning_rate": 9.141314268329784e-07, + "loss": 0.9796, + "step": 5465 + }, + { + "epoch": 0.13174056501529346, + "grad_norm": 1.5980567932128906, + "learning_rate": 9.140046648412939e-07, + "loss": 0.9429, + "step": 5470 + }, + { + "epoch": 0.13186098600708077, + "grad_norm": 1.4643741846084595, + "learning_rate": 9.138779028496096e-07, + "loss": 0.9005, + "step": 5475 + }, + { + "epoch": 0.13198140699886804, + "grad_norm": 1.727617621421814, + "learning_rate": 9.137511408579252e-07, + "loss": 1.002, + "step": 5480 + }, + { + "epoch": 0.13210182799065534, + "grad_norm": 1.4297009706497192, + "learning_rate": 9.136243788662406e-07, + "loss": 0.9769, + "step": 5485 + }, + { + "epoch": 0.13222224898244261, + "grad_norm": 1.4668093919754028, + "learning_rate": 9.134976168745563e-07, + "loss": 1.007, + "step": 5490 + }, + { + "epoch": 0.13234266997422992, + "grad_norm": 1.4713921546936035, + "learning_rate": 9.133708548828719e-07, + "loss": 0.923, + "step": 5495 + }, + { + "epoch": 0.1324630909660172, + "grad_norm": 1.6667219400405884, + "learning_rate": 9.132440928911875e-07, + "loss": 0.9222, + "step": 5500 + }, + { + "epoch": 0.1325835119578045, + "grad_norm": 1.664129376411438, + "learning_rate": 9.13117330899503e-07, + "loss": 0.9041, + "step": 5505 + }, + { + "epoch": 0.13270393294959176, + "grad_norm": 1.6742534637451172, + "learning_rate": 9.129905689078186e-07, + "loss": 0.9022, + "step": 5510 + }, + { + "epoch": 0.13282435394137906, + "grad_norm": 1.4654879570007324, + "learning_rate": 9.128638069161342e-07, + "loss": 0.95, + "step": 5515 + }, + { + "epoch": 0.13294477493316634, + "grad_norm": 1.748233675956726, + "learning_rate": 9.127370449244498e-07, + "loss": 0.957, + "step": 5520 + }, + { + "epoch": 0.13306519592495364, + "grad_norm": 1.5935348272323608, + "learning_rate": 9.126102829327655e-07, + "loss": 0.9468, + "step": 5525 + }, + { + "epoch": 0.13318561691674094, + "grad_norm": 1.6781808137893677, + "learning_rate": 9.124835209410809e-07, + "loss": 0.9734, + "step": 5530 + }, + { + "epoch": 0.1333060379085282, + "grad_norm": 1.6739495992660522, + "learning_rate": 9.123567589493966e-07, + "loss": 0.9759, + "step": 5535 + }, + { + "epoch": 0.13342645890031551, + "grad_norm": 1.5237581729888916, + "learning_rate": 9.122299969577122e-07, + "loss": 0.9928, + "step": 5540 + }, + { + "epoch": 0.1335468798921028, + "grad_norm": 1.8118046522140503, + "learning_rate": 9.121032349660277e-07, + "loss": 0.9387, + "step": 5545 + }, + { + "epoch": 0.1336673008838901, + "grad_norm": 1.53839910030365, + "learning_rate": 9.119764729743433e-07, + "loss": 0.9461, + "step": 5550 + }, + { + "epoch": 0.13378772187567736, + "grad_norm": 1.5928133726119995, + "learning_rate": 9.118497109826589e-07, + "loss": 0.9554, + "step": 5555 + }, + { + "epoch": 0.13390814286746466, + "grad_norm": 1.980698585510254, + "learning_rate": 9.117229489909745e-07, + "loss": 0.9315, + "step": 5560 + }, + { + "epoch": 0.13402856385925194, + "grad_norm": 1.5155107975006104, + "learning_rate": 9.115961869992901e-07, + "loss": 0.9696, + "step": 5565 + }, + { + "epoch": 0.13414898485103924, + "grad_norm": 1.5934728384017944, + "learning_rate": 9.114694250076057e-07, + "loss": 0.9984, + "step": 5570 + }, + { + "epoch": 0.1342694058428265, + "grad_norm": 1.5468522310256958, + "learning_rate": 9.113426630159212e-07, + "loss": 0.931, + "step": 5575 + }, + { + "epoch": 0.1343898268346138, + "grad_norm": 1.7637958526611328, + "learning_rate": 9.112159010242368e-07, + "loss": 0.9437, + "step": 5580 + }, + { + "epoch": 0.13451024782640109, + "grad_norm": 1.5114047527313232, + "learning_rate": 9.110891390325525e-07, + "loss": 0.9994, + "step": 5585 + }, + { + "epoch": 0.1346306688181884, + "grad_norm": 1.611918330192566, + "learning_rate": 9.10962377040868e-07, + "loss": 0.9207, + "step": 5590 + }, + { + "epoch": 0.1347510898099757, + "grad_norm": 1.6683274507522583, + "learning_rate": 9.108356150491837e-07, + "loss": 0.9548, + "step": 5595 + }, + { + "epoch": 0.13487151080176296, + "grad_norm": 1.4041019678115845, + "learning_rate": 9.107088530574992e-07, + "loss": 0.9524, + "step": 5600 + }, + { + "epoch": 0.13499193179355026, + "grad_norm": 1.4725638628005981, + "learning_rate": 9.105820910658147e-07, + "loss": 0.9636, + "step": 5605 + }, + { + "epoch": 0.13511235278533754, + "grad_norm": 1.5536869764328003, + "learning_rate": 9.104553290741304e-07, + "loss": 0.9784, + "step": 5610 + }, + { + "epoch": 0.13523277377712484, + "grad_norm": 1.4738366603851318, + "learning_rate": 9.10328567082446e-07, + "loss": 0.9641, + "step": 5615 + }, + { + "epoch": 0.1353531947689121, + "grad_norm": 1.5258055925369263, + "learning_rate": 9.102018050907615e-07, + "loss": 0.9735, + "step": 5620 + }, + { + "epoch": 0.1354736157606994, + "grad_norm": 1.5292253494262695, + "learning_rate": 9.100750430990771e-07, + "loss": 0.9347, + "step": 5625 + }, + { + "epoch": 0.13559403675248669, + "grad_norm": 1.5491033792495728, + "learning_rate": 9.099482811073927e-07, + "loss": 1.0053, + "step": 5630 + }, + { + "epoch": 0.135714457744274, + "grad_norm": 1.5524201393127441, + "learning_rate": 9.098215191157083e-07, + "loss": 0.9651, + "step": 5635 + }, + { + "epoch": 0.13583487873606126, + "grad_norm": 1.6256749629974365, + "learning_rate": 9.096947571240239e-07, + "loss": 0.9522, + "step": 5640 + }, + { + "epoch": 0.13595529972784856, + "grad_norm": 1.6116081476211548, + "learning_rate": 9.095679951323395e-07, + "loss": 0.9779, + "step": 5645 + }, + { + "epoch": 0.13607572071963583, + "grad_norm": 1.7412595748901367, + "learning_rate": 9.09441233140655e-07, + "loss": 0.9511, + "step": 5650 + }, + { + "epoch": 0.13619614171142314, + "grad_norm": 1.6832035779953003, + "learning_rate": 9.093144711489706e-07, + "loss": 0.9324, + "step": 5655 + }, + { + "epoch": 0.13631656270321044, + "grad_norm": 1.6584666967391968, + "learning_rate": 9.091877091572863e-07, + "loss": 0.9868, + "step": 5660 + }, + { + "epoch": 0.1364369836949977, + "grad_norm": 1.8463393449783325, + "learning_rate": 9.090609471656017e-07, + "loss": 0.989, + "step": 5665 + }, + { + "epoch": 0.136557404686785, + "grad_norm": 1.6989035606384277, + "learning_rate": 9.089341851739174e-07, + "loss": 0.9642, + "step": 5670 + }, + { + "epoch": 0.13667782567857228, + "grad_norm": 1.5193052291870117, + "learning_rate": 9.08807423182233e-07, + "loss": 0.9344, + "step": 5675 + }, + { + "epoch": 0.13679824667035959, + "grad_norm": 1.4853129386901855, + "learning_rate": 9.086806611905486e-07, + "loss": 0.9027, + "step": 5680 + }, + { + "epoch": 0.13691866766214686, + "grad_norm": 1.9148509502410889, + "learning_rate": 9.085538991988642e-07, + "loss": 0.9396, + "step": 5685 + }, + { + "epoch": 0.13703908865393416, + "grad_norm": 1.6306005716323853, + "learning_rate": 9.084271372071797e-07, + "loss": 0.9338, + "step": 5690 + }, + { + "epoch": 0.13715950964572143, + "grad_norm": 1.5270458459854126, + "learning_rate": 9.083003752154953e-07, + "loss": 0.8957, + "step": 5695 + }, + { + "epoch": 0.13727993063750873, + "grad_norm": 1.4417778253555298, + "learning_rate": 9.081736132238109e-07, + "loss": 0.9709, + "step": 5700 + }, + { + "epoch": 0.137400351629296, + "grad_norm": 1.6104899644851685, + "learning_rate": 9.080468512321266e-07, + "loss": 0.9205, + "step": 5705 + }, + { + "epoch": 0.1375207726210833, + "grad_norm": 1.6175036430358887, + "learning_rate": 9.079200892404421e-07, + "loss": 0.9469, + "step": 5710 + }, + { + "epoch": 0.13764119361287058, + "grad_norm": 1.5673390626907349, + "learning_rate": 9.077933272487576e-07, + "loss": 0.9618, + "step": 5715 + }, + { + "epoch": 0.13776161460465788, + "grad_norm": 1.7446295022964478, + "learning_rate": 9.076665652570733e-07, + "loss": 0.925, + "step": 5720 + }, + { + "epoch": 0.13788203559644518, + "grad_norm": 1.4070531129837036, + "learning_rate": 9.075398032653888e-07, + "loss": 0.9859, + "step": 5725 + }, + { + "epoch": 0.13800245658823246, + "grad_norm": 1.5243618488311768, + "learning_rate": 9.074130412737045e-07, + "loss": 0.9412, + "step": 5730 + }, + { + "epoch": 0.13812287758001976, + "grad_norm": 1.5630604028701782, + "learning_rate": 9.0728627928202e-07, + "loss": 0.9375, + "step": 5735 + }, + { + "epoch": 0.13824329857180703, + "grad_norm": 1.7350032329559326, + "learning_rate": 9.071595172903356e-07, + "loss": 0.9522, + "step": 5740 + }, + { + "epoch": 0.13836371956359433, + "grad_norm": 1.8089555501937866, + "learning_rate": 9.070327552986512e-07, + "loss": 0.9314, + "step": 5745 + }, + { + "epoch": 0.1384841405553816, + "grad_norm": 1.511834740638733, + "learning_rate": 9.069059933069668e-07, + "loss": 1.0104, + "step": 5750 + }, + { + "epoch": 0.1386045615471689, + "grad_norm": 1.5103285312652588, + "learning_rate": 9.067792313152824e-07, + "loss": 0.9809, + "step": 5755 + }, + { + "epoch": 0.13872498253895618, + "grad_norm": 1.5690333843231201, + "learning_rate": 9.066524693235979e-07, + "loss": 0.996, + "step": 5760 + }, + { + "epoch": 0.13884540353074348, + "grad_norm": 1.577641248703003, + "learning_rate": 9.065257073319136e-07, + "loss": 0.9487, + "step": 5765 + }, + { + "epoch": 0.13896582452253076, + "grad_norm": 1.4639778137207031, + "learning_rate": 9.063989453402291e-07, + "loss": 0.971, + "step": 5770 + }, + { + "epoch": 0.13908624551431806, + "grad_norm": 1.6108697652816772, + "learning_rate": 9.062721833485447e-07, + "loss": 0.9708, + "step": 5775 + }, + { + "epoch": 0.13920666650610533, + "grad_norm": 1.603767991065979, + "learning_rate": 9.061454213568604e-07, + "loss": 0.8838, + "step": 5780 + }, + { + "epoch": 0.13932708749789263, + "grad_norm": 1.5478912591934204, + "learning_rate": 9.060186593651758e-07, + "loss": 0.9728, + "step": 5785 + }, + { + "epoch": 0.13944750848967993, + "grad_norm": 1.4290289878845215, + "learning_rate": 9.058918973734915e-07, + "loss": 0.863, + "step": 5790 + }, + { + "epoch": 0.1395679294814672, + "grad_norm": 1.6638075113296509, + "learning_rate": 9.057651353818071e-07, + "loss": 0.9495, + "step": 5795 + }, + { + "epoch": 0.1396883504732545, + "grad_norm": 1.6155606508255005, + "learning_rate": 9.056383733901227e-07, + "loss": 0.9387, + "step": 5800 + }, + { + "epoch": 0.13980877146504178, + "grad_norm": 1.5489352941513062, + "learning_rate": 9.055116113984382e-07, + "loss": 0.9324, + "step": 5805 + }, + { + "epoch": 0.13992919245682908, + "grad_norm": 1.4846972227096558, + "learning_rate": 9.053848494067538e-07, + "loss": 0.9736, + "step": 5810 + }, + { + "epoch": 0.14004961344861636, + "grad_norm": 1.772792935371399, + "learning_rate": 9.052580874150694e-07, + "loss": 0.9563, + "step": 5815 + }, + { + "epoch": 0.14017003444040366, + "grad_norm": 1.4382880926132202, + "learning_rate": 9.05131325423385e-07, + "loss": 0.9554, + "step": 5820 + }, + { + "epoch": 0.14029045543219093, + "grad_norm": 1.7088966369628906, + "learning_rate": 9.050045634317007e-07, + "loss": 1.0169, + "step": 5825 + }, + { + "epoch": 0.14041087642397823, + "grad_norm": 1.3709522485733032, + "learning_rate": 9.048778014400161e-07, + "loss": 0.9589, + "step": 5830 + }, + { + "epoch": 0.1405312974157655, + "grad_norm": 1.8668538331985474, + "learning_rate": 9.047510394483317e-07, + "loss": 0.9706, + "step": 5835 + }, + { + "epoch": 0.1406517184075528, + "grad_norm": 1.7845852375030518, + "learning_rate": 9.046242774566474e-07, + "loss": 0.9407, + "step": 5840 + }, + { + "epoch": 0.14077213939934008, + "grad_norm": 1.5803017616271973, + "learning_rate": 9.044975154649629e-07, + "loss": 0.9663, + "step": 5845 + }, + { + "epoch": 0.14089256039112738, + "grad_norm": 1.6961511373519897, + "learning_rate": 9.043707534732785e-07, + "loss": 0.9783, + "step": 5850 + }, + { + "epoch": 0.14101298138291468, + "grad_norm": 1.4206011295318604, + "learning_rate": 9.042439914815941e-07, + "loss": 0.9357, + "step": 5855 + }, + { + "epoch": 0.14113340237470196, + "grad_norm": 1.9127111434936523, + "learning_rate": 9.041172294899096e-07, + "loss": 0.9295, + "step": 5860 + }, + { + "epoch": 0.14125382336648926, + "grad_norm": 1.4716366529464722, + "learning_rate": 9.039904674982253e-07, + "loss": 0.9509, + "step": 5865 + }, + { + "epoch": 0.14137424435827653, + "grad_norm": 1.47152841091156, + "learning_rate": 9.038637055065409e-07, + "loss": 0.9803, + "step": 5870 + }, + { + "epoch": 0.14149466535006383, + "grad_norm": 1.5282686948776245, + "learning_rate": 9.037369435148564e-07, + "loss": 0.9496, + "step": 5875 + }, + { + "epoch": 0.1416150863418511, + "grad_norm": 1.6607931852340698, + "learning_rate": 9.03610181523172e-07, + "loss": 0.9748, + "step": 5880 + }, + { + "epoch": 0.1417355073336384, + "grad_norm": 1.5049351453781128, + "learning_rate": 9.034834195314877e-07, + "loss": 0.9323, + "step": 5885 + }, + { + "epoch": 0.14185592832542568, + "grad_norm": 1.4687178134918213, + "learning_rate": 9.033566575398033e-07, + "loss": 0.953, + "step": 5890 + }, + { + "epoch": 0.14197634931721298, + "grad_norm": 1.6209310293197632, + "learning_rate": 9.032298955481188e-07, + "loss": 0.927, + "step": 5895 + }, + { + "epoch": 0.14209677030900025, + "grad_norm": 1.742845892906189, + "learning_rate": 9.031031335564344e-07, + "loss": 0.9633, + "step": 5900 + }, + { + "epoch": 0.14221719130078755, + "grad_norm": 1.5639408826828003, + "learning_rate": 9.0297637156475e-07, + "loss": 0.9433, + "step": 5905 + }, + { + "epoch": 0.14233761229257486, + "grad_norm": 1.7477506399154663, + "learning_rate": 9.028496095730656e-07, + "loss": 0.9144, + "step": 5910 + }, + { + "epoch": 0.14245803328436213, + "grad_norm": 1.8129380941390991, + "learning_rate": 9.027228475813812e-07, + "loss": 0.9531, + "step": 5915 + }, + { + "epoch": 0.14257845427614943, + "grad_norm": 1.5080280303955078, + "learning_rate": 9.025960855896967e-07, + "loss": 0.987, + "step": 5920 + }, + { + "epoch": 0.1426988752679367, + "grad_norm": 1.4849168062210083, + "learning_rate": 9.024693235980123e-07, + "loss": 0.9883, + "step": 5925 + }, + { + "epoch": 0.142819296259724, + "grad_norm": 1.9817742109298706, + "learning_rate": 9.023425616063279e-07, + "loss": 0.9211, + "step": 5930 + }, + { + "epoch": 0.14293971725151128, + "grad_norm": 1.593101143836975, + "learning_rate": 9.022157996146436e-07, + "loss": 0.9607, + "step": 5935 + }, + { + "epoch": 0.14306013824329858, + "grad_norm": 2.151980400085449, + "learning_rate": 9.020890376229591e-07, + "loss": 0.9711, + "step": 5940 + }, + { + "epoch": 0.14318055923508585, + "grad_norm": 1.5875486135482788, + "learning_rate": 9.019622756312747e-07, + "loss": 0.9618, + "step": 5945 + }, + { + "epoch": 0.14330098022687315, + "grad_norm": 1.7066106796264648, + "learning_rate": 9.018355136395903e-07, + "loss": 0.9255, + "step": 5950 + }, + { + "epoch": 0.14342140121866043, + "grad_norm": 1.424856424331665, + "learning_rate": 9.017087516479058e-07, + "loss": 0.9177, + "step": 5955 + }, + { + "epoch": 0.14354182221044773, + "grad_norm": 1.345221996307373, + "learning_rate": 9.015819896562215e-07, + "loss": 0.9523, + "step": 5960 + }, + { + "epoch": 0.143662243202235, + "grad_norm": 1.4497827291488647, + "learning_rate": 9.014552276645371e-07, + "loss": 0.9444, + "step": 5965 + }, + { + "epoch": 0.1437826641940223, + "grad_norm": 1.830645203590393, + "learning_rate": 9.013284656728526e-07, + "loss": 0.9451, + "step": 5970 + }, + { + "epoch": 0.1439030851858096, + "grad_norm": 1.7493947744369507, + "learning_rate": 9.012017036811682e-07, + "loss": 0.9562, + "step": 5975 + }, + { + "epoch": 0.14402350617759688, + "grad_norm": 1.5937539339065552, + "learning_rate": 9.010749416894839e-07, + "loss": 1.0019, + "step": 5980 + }, + { + "epoch": 0.14414392716938418, + "grad_norm": 1.4270418882369995, + "learning_rate": 9.009481796977994e-07, + "loss": 0.9903, + "step": 5985 + }, + { + "epoch": 0.14426434816117145, + "grad_norm": 1.5541683435440063, + "learning_rate": 9.008214177061149e-07, + "loss": 0.9484, + "step": 5990 + }, + { + "epoch": 0.14438476915295875, + "grad_norm": 1.7125719785690308, + "learning_rate": 9.006946557144306e-07, + "loss": 0.9652, + "step": 5995 + }, + { + "epoch": 0.14450519014474603, + "grad_norm": 1.6315988302230835, + "learning_rate": 9.005678937227461e-07, + "loss": 0.9524, + "step": 6000 + }, + { + "epoch": 0.14462561113653333, + "grad_norm": 1.493573546409607, + "learning_rate": 9.004411317310618e-07, + "loss": 0.8963, + "step": 6005 + }, + { + "epoch": 0.1447460321283206, + "grad_norm": 1.3541276454925537, + "learning_rate": 9.003143697393774e-07, + "loss": 1.0076, + "step": 6010 + }, + { + "epoch": 0.1448664531201079, + "grad_norm": 1.7591745853424072, + "learning_rate": 9.001876077476928e-07, + "loss": 0.9745, + "step": 6015 + }, + { + "epoch": 0.14498687411189518, + "grad_norm": 1.5763760805130005, + "learning_rate": 9.000608457560085e-07, + "loss": 0.972, + "step": 6020 + }, + { + "epoch": 0.14510729510368248, + "grad_norm": 1.6166871786117554, + "learning_rate": 8.999340837643241e-07, + "loss": 0.9752, + "step": 6025 + }, + { + "epoch": 0.14522771609546975, + "grad_norm": 1.7187212705612183, + "learning_rate": 8.998073217726397e-07, + "loss": 0.9813, + "step": 6030 + }, + { + "epoch": 0.14534813708725705, + "grad_norm": 1.5862038135528564, + "learning_rate": 8.996805597809552e-07, + "loss": 0.9084, + "step": 6035 + }, + { + "epoch": 0.14546855807904435, + "grad_norm": 1.7693798542022705, + "learning_rate": 8.995537977892708e-07, + "loss": 0.8757, + "step": 6040 + }, + { + "epoch": 0.14558897907083163, + "grad_norm": 1.809888482093811, + "learning_rate": 8.994270357975864e-07, + "loss": 0.937, + "step": 6045 + }, + { + "epoch": 0.14570940006261893, + "grad_norm": 1.5466762781143188, + "learning_rate": 8.99300273805902e-07, + "loss": 0.9364, + "step": 6050 + }, + { + "epoch": 0.1458298210544062, + "grad_norm": 1.6745027303695679, + "learning_rate": 8.991735118142177e-07, + "loss": 0.9831, + "step": 6055 + }, + { + "epoch": 0.1459502420461935, + "grad_norm": 1.58736252784729, + "learning_rate": 8.990467498225331e-07, + "loss": 0.971, + "step": 6060 + }, + { + "epoch": 0.14607066303798077, + "grad_norm": 1.7206737995147705, + "learning_rate": 8.989199878308488e-07, + "loss": 0.9727, + "step": 6065 + }, + { + "epoch": 0.14619108402976808, + "grad_norm": 1.7877613306045532, + "learning_rate": 8.987932258391644e-07, + "loss": 0.9706, + "step": 6070 + }, + { + "epoch": 0.14631150502155535, + "grad_norm": 1.5204682350158691, + "learning_rate": 8.986664638474799e-07, + "loss": 0.8874, + "step": 6075 + }, + { + "epoch": 0.14643192601334265, + "grad_norm": 1.5084939002990723, + "learning_rate": 8.985397018557956e-07, + "loss": 0.9579, + "step": 6080 + }, + { + "epoch": 0.14655234700512992, + "grad_norm": 1.596987247467041, + "learning_rate": 8.984129398641111e-07, + "loss": 0.9857, + "step": 6085 + }, + { + "epoch": 0.14667276799691722, + "grad_norm": 1.6701748371124268, + "learning_rate": 8.982861778724267e-07, + "loss": 0.9154, + "step": 6090 + }, + { + "epoch": 0.1467931889887045, + "grad_norm": 1.676477074623108, + "learning_rate": 8.981594158807423e-07, + "loss": 0.8887, + "step": 6095 + }, + { + "epoch": 0.1469136099804918, + "grad_norm": 1.4097827672958374, + "learning_rate": 8.98032653889058e-07, + "loss": 0.9449, + "step": 6100 + }, + { + "epoch": 0.1470340309722791, + "grad_norm": 1.5947428941726685, + "learning_rate": 8.979058918973734e-07, + "loss": 0.9222, + "step": 6105 + }, + { + "epoch": 0.14715445196406637, + "grad_norm": 1.4588384628295898, + "learning_rate": 8.97779129905689e-07, + "loss": 0.9933, + "step": 6110 + }, + { + "epoch": 0.14727487295585368, + "grad_norm": 1.5385102033615112, + "learning_rate": 8.976523679140047e-07, + "loss": 0.9369, + "step": 6115 + }, + { + "epoch": 0.14739529394764095, + "grad_norm": 1.8313229084014893, + "learning_rate": 8.975256059223202e-07, + "loss": 0.9278, + "step": 6120 + }, + { + "epoch": 0.14751571493942825, + "grad_norm": 1.6547642946243286, + "learning_rate": 8.973988439306359e-07, + "loss": 0.9718, + "step": 6125 + }, + { + "epoch": 0.14763613593121552, + "grad_norm": 1.3734008073806763, + "learning_rate": 8.972720819389514e-07, + "loss": 0.8818, + "step": 6130 + }, + { + "epoch": 0.14775655692300282, + "grad_norm": 1.4598811864852905, + "learning_rate": 8.971453199472669e-07, + "loss": 0.9302, + "step": 6135 + }, + { + "epoch": 0.1478769779147901, + "grad_norm": 1.5441845655441284, + "learning_rate": 8.970185579555826e-07, + "loss": 0.8982, + "step": 6140 + }, + { + "epoch": 0.1479973989065774, + "grad_norm": 1.6037344932556152, + "learning_rate": 8.968917959638982e-07, + "loss": 0.9638, + "step": 6145 + }, + { + "epoch": 0.14811781989836467, + "grad_norm": 1.66959810256958, + "learning_rate": 8.967650339722137e-07, + "loss": 0.9568, + "step": 6150 + }, + { + "epoch": 0.14823824089015197, + "grad_norm": 1.5276808738708496, + "learning_rate": 8.966382719805293e-07, + "loss": 0.9364, + "step": 6155 + }, + { + "epoch": 0.14835866188193925, + "grad_norm": 1.5752981901168823, + "learning_rate": 8.965115099888449e-07, + "loss": 0.9797, + "step": 6160 + }, + { + "epoch": 0.14847908287372655, + "grad_norm": 1.6811892986297607, + "learning_rate": 8.963847479971605e-07, + "loss": 0.9202, + "step": 6165 + }, + { + "epoch": 0.14859950386551385, + "grad_norm": 1.6610900163650513, + "learning_rate": 8.962579860054761e-07, + "loss": 0.9118, + "step": 6170 + }, + { + "epoch": 0.14871992485730112, + "grad_norm": 1.4799003601074219, + "learning_rate": 8.961312240137917e-07, + "loss": 0.9738, + "step": 6175 + }, + { + "epoch": 0.14884034584908842, + "grad_norm": 1.5117777585983276, + "learning_rate": 8.960044620221072e-07, + "loss": 0.9589, + "step": 6180 + }, + { + "epoch": 0.1489607668408757, + "grad_norm": 1.5448501110076904, + "learning_rate": 8.958777000304228e-07, + "loss": 0.9391, + "step": 6185 + }, + { + "epoch": 0.149081187832663, + "grad_norm": 1.6339643001556396, + "learning_rate": 8.957509380387385e-07, + "loss": 0.947, + "step": 6190 + }, + { + "epoch": 0.14920160882445027, + "grad_norm": 1.4847960472106934, + "learning_rate": 8.95624176047054e-07, + "loss": 1.0043, + "step": 6195 + }, + { + "epoch": 0.14932202981623757, + "grad_norm": 1.7087690830230713, + "learning_rate": 8.954974140553696e-07, + "loss": 1.0199, + "step": 6200 + }, + { + "epoch": 0.14944245080802485, + "grad_norm": 1.7362369298934937, + "learning_rate": 8.953706520636852e-07, + "loss": 0.906, + "step": 6205 + }, + { + "epoch": 0.14956287179981215, + "grad_norm": 1.414970874786377, + "learning_rate": 8.952438900720008e-07, + "loss": 0.9886, + "step": 6210 + }, + { + "epoch": 0.14968329279159942, + "grad_norm": 1.4704195261001587, + "learning_rate": 8.951171280803164e-07, + "loss": 0.9758, + "step": 6215 + }, + { + "epoch": 0.14980371378338672, + "grad_norm": 1.5782198905944824, + "learning_rate": 8.949903660886319e-07, + "loss": 1.0132, + "step": 6220 + }, + { + "epoch": 0.149924134775174, + "grad_norm": 1.6264985799789429, + "learning_rate": 8.948636040969475e-07, + "loss": 0.9184, + "step": 6225 + }, + { + "epoch": 0.1500445557669613, + "grad_norm": 1.6577425003051758, + "learning_rate": 8.947368421052631e-07, + "loss": 0.9571, + "step": 6230 + }, + { + "epoch": 0.1501649767587486, + "grad_norm": 1.3714252710342407, + "learning_rate": 8.946100801135788e-07, + "loss": 0.9219, + "step": 6235 + }, + { + "epoch": 0.15028539775053587, + "grad_norm": 1.5763870477676392, + "learning_rate": 8.944833181218943e-07, + "loss": 0.9337, + "step": 6240 + }, + { + "epoch": 0.15040581874232317, + "grad_norm": 1.5407581329345703, + "learning_rate": 8.943565561302098e-07, + "loss": 0.9042, + "step": 6245 + }, + { + "epoch": 0.15052623973411045, + "grad_norm": 1.387267827987671, + "learning_rate": 8.942297941385255e-07, + "loss": 0.9513, + "step": 6250 + }, + { + "epoch": 0.15064666072589775, + "grad_norm": 1.731074571609497, + "learning_rate": 8.94103032146841e-07, + "loss": 0.9562, + "step": 6255 + }, + { + "epoch": 0.15076708171768502, + "grad_norm": 1.5412241220474243, + "learning_rate": 8.939762701551567e-07, + "loss": 0.9866, + "step": 6260 + }, + { + "epoch": 0.15088750270947232, + "grad_norm": 1.3752002716064453, + "learning_rate": 8.938495081634723e-07, + "loss": 0.9786, + "step": 6265 + }, + { + "epoch": 0.1510079237012596, + "grad_norm": 1.5093090534210205, + "learning_rate": 8.937227461717878e-07, + "loss": 0.8847, + "step": 6270 + }, + { + "epoch": 0.1511283446930469, + "grad_norm": 1.5734307765960693, + "learning_rate": 8.935959841801034e-07, + "loss": 0.9767, + "step": 6275 + }, + { + "epoch": 0.15124876568483417, + "grad_norm": 1.5351828336715698, + "learning_rate": 8.93469222188419e-07, + "loss": 0.9378, + "step": 6280 + }, + { + "epoch": 0.15136918667662147, + "grad_norm": 1.6842786073684692, + "learning_rate": 8.933424601967346e-07, + "loss": 0.9786, + "step": 6285 + }, + { + "epoch": 0.15148960766840874, + "grad_norm": 1.781619906425476, + "learning_rate": 8.932156982050501e-07, + "loss": 0.9514, + "step": 6290 + }, + { + "epoch": 0.15161002866019604, + "grad_norm": 1.6095762252807617, + "learning_rate": 8.930889362133658e-07, + "loss": 0.9219, + "step": 6295 + }, + { + "epoch": 0.15173044965198335, + "grad_norm": 1.392642855644226, + "learning_rate": 8.929621742216813e-07, + "loss": 0.9286, + "step": 6300 + }, + { + "epoch": 0.15185087064377062, + "grad_norm": 1.4566751718521118, + "learning_rate": 8.928354122299969e-07, + "loss": 0.9776, + "step": 6305 + }, + { + "epoch": 0.15197129163555792, + "grad_norm": 1.5674692392349243, + "learning_rate": 8.927086502383126e-07, + "loss": 0.9211, + "step": 6310 + }, + { + "epoch": 0.1520917126273452, + "grad_norm": 1.6276681423187256, + "learning_rate": 8.92581888246628e-07, + "loss": 0.961, + "step": 6315 + }, + { + "epoch": 0.1522121336191325, + "grad_norm": 1.4062598943710327, + "learning_rate": 8.924551262549437e-07, + "loss": 0.9144, + "step": 6320 + }, + { + "epoch": 0.15233255461091977, + "grad_norm": 1.5248562097549438, + "learning_rate": 8.923283642632593e-07, + "loss": 0.986, + "step": 6325 + }, + { + "epoch": 0.15245297560270707, + "grad_norm": 1.496514916419983, + "learning_rate": 8.922016022715749e-07, + "loss": 0.9464, + "step": 6330 + }, + { + "epoch": 0.15257339659449434, + "grad_norm": 1.688599944114685, + "learning_rate": 8.920748402798904e-07, + "loss": 0.9531, + "step": 6335 + }, + { + "epoch": 0.15269381758628164, + "grad_norm": 1.6141420602798462, + "learning_rate": 8.91948078288206e-07, + "loss": 0.9509, + "step": 6340 + }, + { + "epoch": 0.15281423857806892, + "grad_norm": 1.5606366395950317, + "learning_rate": 8.918213162965216e-07, + "loss": 1.0047, + "step": 6345 + }, + { + "epoch": 0.15293465956985622, + "grad_norm": 1.5859097242355347, + "learning_rate": 8.916945543048372e-07, + "loss": 0.9504, + "step": 6350 + }, + { + "epoch": 0.15305508056164352, + "grad_norm": 1.4285775423049927, + "learning_rate": 8.915677923131529e-07, + "loss": 0.9887, + "step": 6355 + }, + { + "epoch": 0.1531755015534308, + "grad_norm": 1.529550313949585, + "learning_rate": 8.914410303214683e-07, + "loss": 0.9791, + "step": 6360 + }, + { + "epoch": 0.1532959225452181, + "grad_norm": 1.4174224138259888, + "learning_rate": 8.913142683297839e-07, + "loss": 0.9772, + "step": 6365 + }, + { + "epoch": 0.15341634353700537, + "grad_norm": 1.4871257543563843, + "learning_rate": 8.911875063380996e-07, + "loss": 0.8747, + "step": 6370 + }, + { + "epoch": 0.15353676452879267, + "grad_norm": 1.4553872346878052, + "learning_rate": 8.910607443464151e-07, + "loss": 0.9605, + "step": 6375 + }, + { + "epoch": 0.15365718552057994, + "grad_norm": 1.5385500192642212, + "learning_rate": 8.909339823547307e-07, + "loss": 0.9069, + "step": 6380 + }, + { + "epoch": 0.15377760651236724, + "grad_norm": 1.5096920728683472, + "learning_rate": 8.908072203630463e-07, + "loss": 0.9155, + "step": 6385 + }, + { + "epoch": 0.15389802750415452, + "grad_norm": 1.5188795328140259, + "learning_rate": 8.906804583713618e-07, + "loss": 0.9971, + "step": 6390 + }, + { + "epoch": 0.15401844849594182, + "grad_norm": 1.3419337272644043, + "learning_rate": 8.905536963796775e-07, + "loss": 0.9307, + "step": 6395 + }, + { + "epoch": 0.1541388694877291, + "grad_norm": 1.4751125574111938, + "learning_rate": 8.904269343879931e-07, + "loss": 0.8893, + "step": 6400 + }, + { + "epoch": 0.1542592904795164, + "grad_norm": 1.5324145555496216, + "learning_rate": 8.903001723963086e-07, + "loss": 0.966, + "step": 6405 + }, + { + "epoch": 0.15437971147130367, + "grad_norm": 1.554923176765442, + "learning_rate": 8.901734104046242e-07, + "loss": 0.9722, + "step": 6410 + }, + { + "epoch": 0.15450013246309097, + "grad_norm": 1.5499253273010254, + "learning_rate": 8.900466484129399e-07, + "loss": 0.9327, + "step": 6415 + }, + { + "epoch": 0.15462055345487827, + "grad_norm": 1.539252519607544, + "learning_rate": 8.899198864212554e-07, + "loss": 0.9854, + "step": 6420 + }, + { + "epoch": 0.15474097444666554, + "grad_norm": 1.5881555080413818, + "learning_rate": 8.89793124429571e-07, + "loss": 1.0085, + "step": 6425 + }, + { + "epoch": 0.15486139543845284, + "grad_norm": 1.6186283826828003, + "learning_rate": 8.896663624378866e-07, + "loss": 0.9865, + "step": 6430 + }, + { + "epoch": 0.15498181643024012, + "grad_norm": 1.7926065921783447, + "learning_rate": 8.895396004462021e-07, + "loss": 1.0098, + "step": 6435 + }, + { + "epoch": 0.15510223742202742, + "grad_norm": 1.7826013565063477, + "learning_rate": 8.894128384545178e-07, + "loss": 0.9576, + "step": 6440 + }, + { + "epoch": 0.1552226584138147, + "grad_norm": 1.736943006515503, + "learning_rate": 8.892860764628334e-07, + "loss": 0.9785, + "step": 6445 + }, + { + "epoch": 0.155343079405602, + "grad_norm": 1.822184681892395, + "learning_rate": 8.891593144711488e-07, + "loss": 0.9646, + "step": 6450 + }, + { + "epoch": 0.15546350039738926, + "grad_norm": 1.5065011978149414, + "learning_rate": 8.890325524794645e-07, + "loss": 0.9547, + "step": 6455 + }, + { + "epoch": 0.15558392138917657, + "grad_norm": 1.5549193620681763, + "learning_rate": 8.889057904877801e-07, + "loss": 0.9702, + "step": 6460 + }, + { + "epoch": 0.15570434238096384, + "grad_norm": 1.4823460578918457, + "learning_rate": 8.887790284960957e-07, + "loss": 0.9393, + "step": 6465 + }, + { + "epoch": 0.15582476337275114, + "grad_norm": 1.5357725620269775, + "learning_rate": 8.886522665044113e-07, + "loss": 1.0052, + "step": 6470 + }, + { + "epoch": 0.15594518436453841, + "grad_norm": 1.395545244216919, + "learning_rate": 8.885255045127269e-07, + "loss": 0.927, + "step": 6475 + }, + { + "epoch": 0.15606560535632572, + "grad_norm": 1.5250643491744995, + "learning_rate": 8.883987425210424e-07, + "loss": 0.984, + "step": 6480 + }, + { + "epoch": 0.15618602634811302, + "grad_norm": 1.5090745687484741, + "learning_rate": 8.88271980529358e-07, + "loss": 0.9602, + "step": 6485 + }, + { + "epoch": 0.1563064473399003, + "grad_norm": 1.5585459470748901, + "learning_rate": 8.881452185376737e-07, + "loss": 0.9441, + "step": 6490 + }, + { + "epoch": 0.1564268683316876, + "grad_norm": 1.7480950355529785, + "learning_rate": 8.880184565459891e-07, + "loss": 0.9616, + "step": 6495 + }, + { + "epoch": 0.15654728932347486, + "grad_norm": 1.503509759902954, + "learning_rate": 8.878916945543048e-07, + "loss": 0.9515, + "step": 6500 + }, + { + "epoch": 0.15666771031526217, + "grad_norm": 1.5634033679962158, + "learning_rate": 8.877649325626204e-07, + "loss": 1.0264, + "step": 6505 + }, + { + "epoch": 0.15678813130704944, + "grad_norm": 1.530869960784912, + "learning_rate": 8.876381705709359e-07, + "loss": 0.984, + "step": 6510 + }, + { + "epoch": 0.15690855229883674, + "grad_norm": 1.5853240489959717, + "learning_rate": 8.875114085792516e-07, + "loss": 1.0086, + "step": 6515 + }, + { + "epoch": 0.157028973290624, + "grad_norm": 1.508705496788025, + "learning_rate": 8.873846465875671e-07, + "loss": 0.9516, + "step": 6520 + }, + { + "epoch": 0.15714939428241131, + "grad_norm": 1.5178998708724976, + "learning_rate": 8.872578845958827e-07, + "loss": 0.9062, + "step": 6525 + }, + { + "epoch": 0.1572698152741986, + "grad_norm": 1.529740810394287, + "learning_rate": 8.871311226041983e-07, + "loss": 0.9721, + "step": 6530 + }, + { + "epoch": 0.1573902362659859, + "grad_norm": 1.5887424945831299, + "learning_rate": 8.87004360612514e-07, + "loss": 0.9015, + "step": 6535 + }, + { + "epoch": 0.15751065725777316, + "grad_norm": 2.0315704345703125, + "learning_rate": 8.868775986208295e-07, + "loss": 0.9583, + "step": 6540 + }, + { + "epoch": 0.15763107824956046, + "grad_norm": 1.5251991748809814, + "learning_rate": 8.86750836629145e-07, + "loss": 0.9607, + "step": 6545 + }, + { + "epoch": 0.15775149924134776, + "grad_norm": 1.5378426313400269, + "learning_rate": 8.866240746374607e-07, + "loss": 1.0108, + "step": 6550 + }, + { + "epoch": 0.15787192023313504, + "grad_norm": 1.4061293601989746, + "learning_rate": 8.864973126457762e-07, + "loss": 0.8955, + "step": 6555 + }, + { + "epoch": 0.15799234122492234, + "grad_norm": 1.381935477256775, + "learning_rate": 8.863705506540919e-07, + "loss": 0.9266, + "step": 6560 + }, + { + "epoch": 0.1581127622167096, + "grad_norm": 1.6147645711898804, + "learning_rate": 8.862437886624074e-07, + "loss": 0.9526, + "step": 6565 + }, + { + "epoch": 0.1582331832084969, + "grad_norm": 1.4142693281173706, + "learning_rate": 8.861170266707229e-07, + "loss": 0.9508, + "step": 6570 + }, + { + "epoch": 0.1583536042002842, + "grad_norm": 1.6666898727416992, + "learning_rate": 8.859902646790386e-07, + "loss": 0.9025, + "step": 6575 + }, + { + "epoch": 0.1584740251920715, + "grad_norm": 1.609081745147705, + "learning_rate": 8.858635026873542e-07, + "loss": 0.9337, + "step": 6580 + }, + { + "epoch": 0.15859444618385876, + "grad_norm": 1.6501929759979248, + "learning_rate": 8.857367406956698e-07, + "loss": 0.8925, + "step": 6585 + }, + { + "epoch": 0.15871486717564606, + "grad_norm": 1.5381691455841064, + "learning_rate": 8.856099787039853e-07, + "loss": 0.9307, + "step": 6590 + }, + { + "epoch": 0.15883528816743334, + "grad_norm": 1.5835388898849487, + "learning_rate": 8.85483216712301e-07, + "loss": 0.9817, + "step": 6595 + }, + { + "epoch": 0.15895570915922064, + "grad_norm": 3.076845407485962, + "learning_rate": 8.853564547206165e-07, + "loss": 0.9501, + "step": 6600 + }, + { + "epoch": 0.1590761301510079, + "grad_norm": 1.6130303144454956, + "learning_rate": 8.852296927289321e-07, + "loss": 0.9755, + "step": 6605 + }, + { + "epoch": 0.1591965511427952, + "grad_norm": 1.55154287815094, + "learning_rate": 8.851029307372478e-07, + "loss": 0.9794, + "step": 6610 + }, + { + "epoch": 0.1593169721345825, + "grad_norm": 1.5113273859024048, + "learning_rate": 8.849761687455632e-07, + "loss": 0.9276, + "step": 6615 + }, + { + "epoch": 0.1594373931263698, + "grad_norm": 1.5491209030151367, + "learning_rate": 8.848494067538789e-07, + "loss": 1.0023, + "step": 6620 + }, + { + "epoch": 0.1595578141181571, + "grad_norm": 1.6405673027038574, + "learning_rate": 8.847226447621945e-07, + "loss": 0.9533, + "step": 6625 + }, + { + "epoch": 0.15967823510994436, + "grad_norm": 1.6780866384506226, + "learning_rate": 8.8459588277051e-07, + "loss": 0.959, + "step": 6630 + }, + { + "epoch": 0.15979865610173166, + "grad_norm": 1.3100441694259644, + "learning_rate": 8.844691207788256e-07, + "loss": 1.0109, + "step": 6635 + }, + { + "epoch": 0.15991907709351894, + "grad_norm": 1.7152460813522339, + "learning_rate": 8.843423587871412e-07, + "loss": 0.9075, + "step": 6640 + }, + { + "epoch": 0.16003949808530624, + "grad_norm": 1.6447054147720337, + "learning_rate": 8.842155967954568e-07, + "loss": 0.937, + "step": 6645 + }, + { + "epoch": 0.1601599190770935, + "grad_norm": 1.7059388160705566, + "learning_rate": 8.840888348037724e-07, + "loss": 0.9517, + "step": 6650 + }, + { + "epoch": 0.1602803400688808, + "grad_norm": 1.547312617301941, + "learning_rate": 8.839620728120881e-07, + "loss": 0.8843, + "step": 6655 + }, + { + "epoch": 0.16040076106066808, + "grad_norm": 1.666043758392334, + "learning_rate": 8.838353108204035e-07, + "loss": 0.9763, + "step": 6660 + }, + { + "epoch": 0.16052118205245539, + "grad_norm": 1.5424047708511353, + "learning_rate": 8.837085488287191e-07, + "loss": 0.9772, + "step": 6665 + }, + { + "epoch": 0.16064160304424266, + "grad_norm": 1.7461053133010864, + "learning_rate": 8.835817868370348e-07, + "loss": 0.9384, + "step": 6670 + }, + { + "epoch": 0.16076202403602996, + "grad_norm": 1.5185924768447876, + "learning_rate": 8.834550248453503e-07, + "loss": 0.9413, + "step": 6675 + }, + { + "epoch": 0.16088244502781726, + "grad_norm": 1.5015673637390137, + "learning_rate": 8.833282628536659e-07, + "loss": 0.9503, + "step": 6680 + }, + { + "epoch": 0.16100286601960453, + "grad_norm": 1.4696515798568726, + "learning_rate": 8.832015008619815e-07, + "loss": 0.9829, + "step": 6685 + }, + { + "epoch": 0.16112328701139184, + "grad_norm": 1.4103220701217651, + "learning_rate": 8.83074738870297e-07, + "loss": 0.9143, + "step": 6690 + }, + { + "epoch": 0.1612437080031791, + "grad_norm": 1.5549726486206055, + "learning_rate": 8.829479768786127e-07, + "loss": 0.9381, + "step": 6695 + }, + { + "epoch": 0.1613641289949664, + "grad_norm": 1.6166776418685913, + "learning_rate": 8.828212148869283e-07, + "loss": 0.9271, + "step": 6700 + }, + { + "epoch": 0.16148454998675368, + "grad_norm": 1.652779459953308, + "learning_rate": 8.826944528952438e-07, + "loss": 0.9363, + "step": 6705 + }, + { + "epoch": 0.16160497097854098, + "grad_norm": 1.5943504571914673, + "learning_rate": 8.825676909035594e-07, + "loss": 0.9475, + "step": 6710 + }, + { + "epoch": 0.16172539197032826, + "grad_norm": 1.6199977397918701, + "learning_rate": 8.82440928911875e-07, + "loss": 0.9631, + "step": 6715 + }, + { + "epoch": 0.16184581296211556, + "grad_norm": 1.6312978267669678, + "learning_rate": 8.823141669201906e-07, + "loss": 0.9328, + "step": 6720 + }, + { + "epoch": 0.16196623395390283, + "grad_norm": 1.6854356527328491, + "learning_rate": 8.821874049285062e-07, + "loss": 1.0102, + "step": 6725 + }, + { + "epoch": 0.16208665494569013, + "grad_norm": 1.5627775192260742, + "learning_rate": 8.820606429368218e-07, + "loss": 1.0122, + "step": 6730 + }, + { + "epoch": 0.1622070759374774, + "grad_norm": 1.4499491453170776, + "learning_rate": 8.819338809451373e-07, + "loss": 0.9514, + "step": 6735 + }, + { + "epoch": 0.1623274969292647, + "grad_norm": 1.5392065048217773, + "learning_rate": 8.81807118953453e-07, + "loss": 0.9317, + "step": 6740 + }, + { + "epoch": 0.162447917921052, + "grad_norm": 1.5162497758865356, + "learning_rate": 8.816803569617686e-07, + "loss": 0.9393, + "step": 6745 + }, + { + "epoch": 0.16256833891283928, + "grad_norm": 1.8237184286117554, + "learning_rate": 8.81553594970084e-07, + "loss": 0.9112, + "step": 6750 + }, + { + "epoch": 0.16268875990462658, + "grad_norm": 1.511694073677063, + "learning_rate": 8.814268329783997e-07, + "loss": 0.9365, + "step": 6755 + }, + { + "epoch": 0.16280918089641386, + "grad_norm": 1.4274612665176392, + "learning_rate": 8.813000709867153e-07, + "loss": 0.9378, + "step": 6760 + }, + { + "epoch": 0.16292960188820116, + "grad_norm": 1.5141350030899048, + "learning_rate": 8.811733089950309e-07, + "loss": 0.9561, + "step": 6765 + }, + { + "epoch": 0.16305002287998843, + "grad_norm": 1.6848194599151611, + "learning_rate": 8.810465470033465e-07, + "loss": 0.9606, + "step": 6770 + }, + { + "epoch": 0.16317044387177573, + "grad_norm": 1.487057089805603, + "learning_rate": 8.80919785011662e-07, + "loss": 0.9161, + "step": 6775 + }, + { + "epoch": 0.163290864863563, + "grad_norm": 1.4446853399276733, + "learning_rate": 8.807930230199776e-07, + "loss": 0.9692, + "step": 6780 + }, + { + "epoch": 0.1634112858553503, + "grad_norm": 2.0542047023773193, + "learning_rate": 8.806662610282932e-07, + "loss": 0.9392, + "step": 6785 + }, + { + "epoch": 0.16353170684713758, + "grad_norm": 1.7039285898208618, + "learning_rate": 8.805394990366089e-07, + "loss": 0.9278, + "step": 6790 + }, + { + "epoch": 0.16365212783892488, + "grad_norm": 1.584222435951233, + "learning_rate": 8.804127370449243e-07, + "loss": 0.932, + "step": 6795 + }, + { + "epoch": 0.16377254883071218, + "grad_norm": 1.6305924654006958, + "learning_rate": 8.8028597505324e-07, + "loss": 0.9655, + "step": 6800 + }, + { + "epoch": 0.16389296982249946, + "grad_norm": 1.6200332641601562, + "learning_rate": 8.801592130615556e-07, + "loss": 0.96, + "step": 6805 + }, + { + "epoch": 0.16401339081428676, + "grad_norm": 1.4420777559280396, + "learning_rate": 8.800324510698711e-07, + "loss": 0.9659, + "step": 6810 + }, + { + "epoch": 0.16413381180607403, + "grad_norm": 1.7470130920410156, + "learning_rate": 8.799056890781868e-07, + "loss": 0.924, + "step": 6815 + }, + { + "epoch": 0.16425423279786133, + "grad_norm": 1.611938714981079, + "learning_rate": 8.797789270865023e-07, + "loss": 0.981, + "step": 6820 + }, + { + "epoch": 0.1643746537896486, + "grad_norm": 1.7520917654037476, + "learning_rate": 8.796521650948179e-07, + "loss": 0.9833, + "step": 6825 + }, + { + "epoch": 0.1644950747814359, + "grad_norm": 1.7744758129119873, + "learning_rate": 8.795254031031335e-07, + "loss": 0.9853, + "step": 6830 + }, + { + "epoch": 0.16461549577322318, + "grad_norm": 1.465274691581726, + "learning_rate": 8.793986411114491e-07, + "loss": 0.9643, + "step": 6835 + }, + { + "epoch": 0.16473591676501048, + "grad_norm": 1.4554154872894287, + "learning_rate": 8.792718791197647e-07, + "loss": 0.9115, + "step": 6840 + }, + { + "epoch": 0.16485633775679776, + "grad_norm": 1.8319108486175537, + "learning_rate": 8.791451171280802e-07, + "loss": 0.9758, + "step": 6845 + }, + { + "epoch": 0.16497675874858506, + "grad_norm": 1.3700926303863525, + "learning_rate": 8.790183551363959e-07, + "loss": 0.987, + "step": 6850 + }, + { + "epoch": 0.16509717974037233, + "grad_norm": 1.4417375326156616, + "learning_rate": 8.788915931447114e-07, + "loss": 0.9503, + "step": 6855 + }, + { + "epoch": 0.16521760073215963, + "grad_norm": 1.8359936475753784, + "learning_rate": 8.787648311530271e-07, + "loss": 0.939, + "step": 6860 + }, + { + "epoch": 0.16533802172394693, + "grad_norm": 1.694645643234253, + "learning_rate": 8.786380691613426e-07, + "loss": 0.9382, + "step": 6865 + }, + { + "epoch": 0.1654584427157342, + "grad_norm": 1.778012752532959, + "learning_rate": 8.785113071696581e-07, + "loss": 0.9851, + "step": 6870 + }, + { + "epoch": 0.1655788637075215, + "grad_norm": 1.6528106927871704, + "learning_rate": 8.783845451779738e-07, + "loss": 0.9749, + "step": 6875 + }, + { + "epoch": 0.16569928469930878, + "grad_norm": 1.4038870334625244, + "learning_rate": 8.782577831862894e-07, + "loss": 0.9657, + "step": 6880 + }, + { + "epoch": 0.16581970569109608, + "grad_norm": 1.967231035232544, + "learning_rate": 8.78131021194605e-07, + "loss": 0.9531, + "step": 6885 + }, + { + "epoch": 0.16594012668288335, + "grad_norm": 1.5113967657089233, + "learning_rate": 8.780042592029205e-07, + "loss": 0.9842, + "step": 6890 + }, + { + "epoch": 0.16606054767467066, + "grad_norm": 1.4383405447006226, + "learning_rate": 8.778774972112361e-07, + "loss": 0.9339, + "step": 6895 + }, + { + "epoch": 0.16618096866645793, + "grad_norm": 1.5188686847686768, + "learning_rate": 8.777507352195517e-07, + "loss": 0.994, + "step": 6900 + }, + { + "epoch": 0.16630138965824523, + "grad_norm": 1.4522814750671387, + "learning_rate": 8.776239732278673e-07, + "loss": 0.9889, + "step": 6905 + }, + { + "epoch": 0.1664218106500325, + "grad_norm": 1.40193510055542, + "learning_rate": 8.77497211236183e-07, + "loss": 0.9585, + "step": 6910 + }, + { + "epoch": 0.1665422316418198, + "grad_norm": 1.6679961681365967, + "learning_rate": 8.773704492444985e-07, + "loss": 0.9333, + "step": 6915 + }, + { + "epoch": 0.16666265263360708, + "grad_norm": 1.6849275827407837, + "learning_rate": 8.77243687252814e-07, + "loss": 0.9663, + "step": 6920 + }, + { + "epoch": 0.16678307362539438, + "grad_norm": 1.5537941455841064, + "learning_rate": 8.771169252611297e-07, + "loss": 0.969, + "step": 6925 + }, + { + "epoch": 0.16690349461718168, + "grad_norm": 1.6917616128921509, + "learning_rate": 8.769901632694453e-07, + "loss": 0.9102, + "step": 6930 + }, + { + "epoch": 0.16702391560896895, + "grad_norm": 1.4796494245529175, + "learning_rate": 8.768634012777608e-07, + "loss": 0.9696, + "step": 6935 + }, + { + "epoch": 0.16714433660075625, + "grad_norm": 1.639575481414795, + "learning_rate": 8.767366392860764e-07, + "loss": 0.9919, + "step": 6940 + }, + { + "epoch": 0.16726475759254353, + "grad_norm": 1.4652765989303589, + "learning_rate": 8.766098772943921e-07, + "loss": 0.9667, + "step": 6945 + }, + { + "epoch": 0.16738517858433083, + "grad_norm": 1.4565825462341309, + "learning_rate": 8.764831153027076e-07, + "loss": 0.9219, + "step": 6950 + }, + { + "epoch": 0.1675055995761181, + "grad_norm": 1.7221384048461914, + "learning_rate": 8.763563533110232e-07, + "loss": 0.9487, + "step": 6955 + }, + { + "epoch": 0.1676260205679054, + "grad_norm": 1.753455638885498, + "learning_rate": 8.762295913193388e-07, + "loss": 0.91, + "step": 6960 + }, + { + "epoch": 0.16774644155969268, + "grad_norm": 1.534593939781189, + "learning_rate": 8.761028293276543e-07, + "loss": 0.9343, + "step": 6965 + }, + { + "epoch": 0.16786686255147998, + "grad_norm": 1.6360231637954712, + "learning_rate": 8.7597606733597e-07, + "loss": 0.9446, + "step": 6970 + }, + { + "epoch": 0.16798728354326725, + "grad_norm": 1.6140836477279663, + "learning_rate": 8.758493053442856e-07, + "loss": 0.9353, + "step": 6975 + }, + { + "epoch": 0.16810770453505455, + "grad_norm": 1.7073837518692017, + "learning_rate": 8.75722543352601e-07, + "loss": 0.9966, + "step": 6980 + }, + { + "epoch": 0.16822812552684183, + "grad_norm": 1.6094011068344116, + "learning_rate": 8.755957813609167e-07, + "loss": 0.9944, + "step": 6985 + }, + { + "epoch": 0.16834854651862913, + "grad_norm": 1.502415418624878, + "learning_rate": 8.754690193692323e-07, + "loss": 0.9864, + "step": 6990 + }, + { + "epoch": 0.16846896751041643, + "grad_norm": 1.5931711196899414, + "learning_rate": 8.753422573775479e-07, + "loss": 0.8975, + "step": 6995 + }, + { + "epoch": 0.1685893885022037, + "grad_norm": 1.5415033102035522, + "learning_rate": 8.752154953858635e-07, + "loss": 0.9557, + "step": 7000 + }, + { + "epoch": 0.168709809493991, + "grad_norm": 1.4606413841247559, + "learning_rate": 8.750887333941791e-07, + "loss": 0.9077, + "step": 7005 + }, + { + "epoch": 0.16883023048577828, + "grad_norm": 1.534355640411377, + "learning_rate": 8.749619714024946e-07, + "loss": 0.9229, + "step": 7010 + }, + { + "epoch": 0.16895065147756558, + "grad_norm": 1.5767483711242676, + "learning_rate": 8.748352094108102e-07, + "loss": 1.0249, + "step": 7015 + }, + { + "epoch": 0.16907107246935285, + "grad_norm": 1.371443748474121, + "learning_rate": 8.747084474191259e-07, + "loss": 0.9365, + "step": 7020 + }, + { + "epoch": 0.16919149346114015, + "grad_norm": 1.5228232145309448, + "learning_rate": 8.745816854274414e-07, + "loss": 0.9641, + "step": 7025 + }, + { + "epoch": 0.16931191445292743, + "grad_norm": 1.520755648612976, + "learning_rate": 8.74454923435757e-07, + "loss": 0.9721, + "step": 7030 + }, + { + "epoch": 0.16943233544471473, + "grad_norm": 1.4273567199707031, + "learning_rate": 8.743281614440726e-07, + "loss": 0.902, + "step": 7035 + }, + { + "epoch": 0.169552756436502, + "grad_norm": 1.4603309631347656, + "learning_rate": 8.742013994523881e-07, + "loss": 0.9388, + "step": 7040 + }, + { + "epoch": 0.1696731774282893, + "grad_norm": 3.162548303604126, + "learning_rate": 8.740746374607038e-07, + "loss": 0.9625, + "step": 7045 + }, + { + "epoch": 0.16979359842007657, + "grad_norm": 1.5650300979614258, + "learning_rate": 8.739478754690193e-07, + "loss": 0.9908, + "step": 7050 + }, + { + "epoch": 0.16991401941186388, + "grad_norm": 1.647933006286621, + "learning_rate": 8.738211134773349e-07, + "loss": 0.9675, + "step": 7055 + }, + { + "epoch": 0.17003444040365118, + "grad_norm": 1.657855749130249, + "learning_rate": 8.736943514856505e-07, + "loss": 0.9611, + "step": 7060 + }, + { + "epoch": 0.17015486139543845, + "grad_norm": 1.6878466606140137, + "learning_rate": 8.735675894939662e-07, + "loss": 0.9491, + "step": 7065 + }, + { + "epoch": 0.17027528238722575, + "grad_norm": 1.745612621307373, + "learning_rate": 8.734408275022817e-07, + "loss": 0.9741, + "step": 7070 + }, + { + "epoch": 0.17039570337901302, + "grad_norm": 1.6257210969924927, + "learning_rate": 8.733140655105972e-07, + "loss": 0.9407, + "step": 7075 + }, + { + "epoch": 0.17051612437080033, + "grad_norm": 1.6022166013717651, + "learning_rate": 8.731873035189129e-07, + "loss": 0.8677, + "step": 7080 + }, + { + "epoch": 0.1706365453625876, + "grad_norm": 1.5448520183563232, + "learning_rate": 8.730605415272284e-07, + "loss": 0.985, + "step": 7085 + }, + { + "epoch": 0.1707569663543749, + "grad_norm": 1.5920443534851074, + "learning_rate": 8.729337795355441e-07, + "loss": 0.9544, + "step": 7090 + }, + { + "epoch": 0.17087738734616217, + "grad_norm": 1.6187429428100586, + "learning_rate": 8.728070175438597e-07, + "loss": 0.961, + "step": 7095 + }, + { + "epoch": 0.17099780833794948, + "grad_norm": 1.5467497110366821, + "learning_rate": 8.726802555521751e-07, + "loss": 0.9439, + "step": 7100 + }, + { + "epoch": 0.17111822932973675, + "grad_norm": 1.6352099180221558, + "learning_rate": 8.725534935604908e-07, + "loss": 0.945, + "step": 7105 + }, + { + "epoch": 0.17123865032152405, + "grad_norm": 1.3417030572891235, + "learning_rate": 8.724267315688064e-07, + "loss": 0.9185, + "step": 7110 + }, + { + "epoch": 0.17135907131331132, + "grad_norm": 1.594738245010376, + "learning_rate": 8.72299969577122e-07, + "loss": 0.9201, + "step": 7115 + }, + { + "epoch": 0.17147949230509862, + "grad_norm": 1.7222914695739746, + "learning_rate": 8.721732075854375e-07, + "loss": 0.927, + "step": 7120 + }, + { + "epoch": 0.17159991329688593, + "grad_norm": 2.0674643516540527, + "learning_rate": 8.720464455937532e-07, + "loss": 0.9905, + "step": 7125 + }, + { + "epoch": 0.1717203342886732, + "grad_norm": 1.612656831741333, + "learning_rate": 8.719196836020687e-07, + "loss": 0.9274, + "step": 7130 + }, + { + "epoch": 0.1718407552804605, + "grad_norm": 1.4258354902267456, + "learning_rate": 8.717929216103843e-07, + "loss": 0.971, + "step": 7135 + }, + { + "epoch": 0.17196117627224777, + "grad_norm": 1.4199203252792358, + "learning_rate": 8.716661596187e-07, + "loss": 0.9387, + "step": 7140 + }, + { + "epoch": 0.17208159726403507, + "grad_norm": 1.5075029134750366, + "learning_rate": 8.715393976270154e-07, + "loss": 0.9702, + "step": 7145 + }, + { + "epoch": 0.17220201825582235, + "grad_norm": 1.5064207315444946, + "learning_rate": 8.714126356353311e-07, + "loss": 0.8867, + "step": 7150 + }, + { + "epoch": 0.17232243924760965, + "grad_norm": 1.5721375942230225, + "learning_rate": 8.712858736436467e-07, + "loss": 0.956, + "step": 7155 + }, + { + "epoch": 0.17244286023939692, + "grad_norm": 1.6020773649215698, + "learning_rate": 8.711591116519622e-07, + "loss": 0.94, + "step": 7160 + }, + { + "epoch": 0.17256328123118422, + "grad_norm": 1.4800697565078735, + "learning_rate": 8.710323496602778e-07, + "loss": 0.9859, + "step": 7165 + }, + { + "epoch": 0.1726837022229715, + "grad_norm": 1.772230625152588, + "learning_rate": 8.709055876685934e-07, + "loss": 0.9597, + "step": 7170 + }, + { + "epoch": 0.1728041232147588, + "grad_norm": 1.602545142173767, + "learning_rate": 8.70778825676909e-07, + "loss": 0.9412, + "step": 7175 + }, + { + "epoch": 0.17292454420654607, + "grad_norm": 1.5732465982437134, + "learning_rate": 8.706520636852246e-07, + "loss": 0.9165, + "step": 7180 + }, + { + "epoch": 0.17304496519833337, + "grad_norm": 1.5865917205810547, + "learning_rate": 8.705253016935403e-07, + "loss": 0.9101, + "step": 7185 + }, + { + "epoch": 0.17316538619012067, + "grad_norm": 1.4857765436172485, + "learning_rate": 8.703985397018557e-07, + "loss": 0.9447, + "step": 7190 + }, + { + "epoch": 0.17328580718190795, + "grad_norm": 1.5675077438354492, + "learning_rate": 8.702717777101713e-07, + "loss": 0.9187, + "step": 7195 + }, + { + "epoch": 0.17340622817369525, + "grad_norm": 1.5084619522094727, + "learning_rate": 8.70145015718487e-07, + "loss": 0.966, + "step": 7200 + }, + { + "epoch": 0.17352664916548252, + "grad_norm": 1.5520141124725342, + "learning_rate": 8.700182537268025e-07, + "loss": 0.9421, + "step": 7205 + }, + { + "epoch": 0.17364707015726982, + "grad_norm": 1.9939416646957397, + "learning_rate": 8.698914917351182e-07, + "loss": 0.9815, + "step": 7210 + }, + { + "epoch": 0.1737674911490571, + "grad_norm": 1.4251264333724976, + "learning_rate": 8.697647297434337e-07, + "loss": 0.9374, + "step": 7215 + }, + { + "epoch": 0.1738879121408444, + "grad_norm": 1.5417747497558594, + "learning_rate": 8.696379677517492e-07, + "loss": 0.9117, + "step": 7220 + }, + { + "epoch": 0.17400833313263167, + "grad_norm": 1.4149068593978882, + "learning_rate": 8.695112057600649e-07, + "loss": 0.897, + "step": 7225 + }, + { + "epoch": 0.17412875412441897, + "grad_norm": 1.4917889833450317, + "learning_rate": 8.693844437683805e-07, + "loss": 0.9394, + "step": 7230 + }, + { + "epoch": 0.17424917511620625, + "grad_norm": 1.6822994947433472, + "learning_rate": 8.69257681776696e-07, + "loss": 0.8761, + "step": 7235 + }, + { + "epoch": 0.17436959610799355, + "grad_norm": 1.4337525367736816, + "learning_rate": 8.691309197850116e-07, + "loss": 0.8911, + "step": 7240 + }, + { + "epoch": 0.17449001709978085, + "grad_norm": 1.5922350883483887, + "learning_rate": 8.690041577933273e-07, + "loss": 0.9328, + "step": 7245 + }, + { + "epoch": 0.17461043809156812, + "grad_norm": 1.6190274953842163, + "learning_rate": 8.688773958016428e-07, + "loss": 0.952, + "step": 7250 + }, + { + "epoch": 0.17473085908335542, + "grad_norm": 1.870335578918457, + "learning_rate": 8.687506338099584e-07, + "loss": 0.9857, + "step": 7255 + }, + { + "epoch": 0.1748512800751427, + "grad_norm": 1.4689422845840454, + "learning_rate": 8.68623871818274e-07, + "loss": 0.8954, + "step": 7260 + }, + { + "epoch": 0.17497170106693, + "grad_norm": 1.566745400428772, + "learning_rate": 8.684971098265895e-07, + "loss": 0.9357, + "step": 7265 + }, + { + "epoch": 0.17509212205871727, + "grad_norm": 1.6396536827087402, + "learning_rate": 8.683703478349052e-07, + "loss": 0.9431, + "step": 7270 + }, + { + "epoch": 0.17521254305050457, + "grad_norm": 1.8179210424423218, + "learning_rate": 8.682435858432208e-07, + "loss": 0.9463, + "step": 7275 + }, + { + "epoch": 0.17533296404229184, + "grad_norm": 2.6301631927490234, + "learning_rate": 8.681168238515362e-07, + "loss": 0.9387, + "step": 7280 + }, + { + "epoch": 0.17545338503407915, + "grad_norm": 1.613966941833496, + "learning_rate": 8.679900618598519e-07, + "loss": 0.9599, + "step": 7285 + }, + { + "epoch": 0.17557380602586642, + "grad_norm": 1.6236636638641357, + "learning_rate": 8.678632998681675e-07, + "loss": 0.9078, + "step": 7290 + }, + { + "epoch": 0.17569422701765372, + "grad_norm": 1.417366862297058, + "learning_rate": 8.677365378764831e-07, + "loss": 0.9107, + "step": 7295 + }, + { + "epoch": 0.175814648009441, + "grad_norm": 1.6419947147369385, + "learning_rate": 8.676097758847987e-07, + "loss": 0.9224, + "step": 7300 + }, + { + "epoch": 0.1759350690012283, + "grad_norm": 1.635498285293579, + "learning_rate": 8.674830138931142e-07, + "loss": 1.0329, + "step": 7305 + }, + { + "epoch": 0.1760554899930156, + "grad_norm": 1.461501121520996, + "learning_rate": 8.673562519014298e-07, + "loss": 0.9704, + "step": 7310 + }, + { + "epoch": 0.17617591098480287, + "grad_norm": 2.0301854610443115, + "learning_rate": 8.672294899097454e-07, + "loss": 0.957, + "step": 7315 + }, + { + "epoch": 0.17629633197659017, + "grad_norm": 1.6653895378112793, + "learning_rate": 8.671027279180611e-07, + "loss": 1.0023, + "step": 7320 + }, + { + "epoch": 0.17641675296837744, + "grad_norm": 1.5719858407974243, + "learning_rate": 8.669759659263766e-07, + "loss": 0.9648, + "step": 7325 + }, + { + "epoch": 0.17653717396016474, + "grad_norm": 1.5789759159088135, + "learning_rate": 8.668492039346922e-07, + "loss": 0.9642, + "step": 7330 + }, + { + "epoch": 0.17665759495195202, + "grad_norm": 1.499147891998291, + "learning_rate": 8.667224419430078e-07, + "loss": 0.9344, + "step": 7335 + }, + { + "epoch": 0.17677801594373932, + "grad_norm": 1.4714581966400146, + "learning_rate": 8.665956799513233e-07, + "loss": 0.9036, + "step": 7340 + }, + { + "epoch": 0.1768984369355266, + "grad_norm": 1.5490918159484863, + "learning_rate": 8.66468917959639e-07, + "loss": 1.0156, + "step": 7345 + }, + { + "epoch": 0.1770188579273139, + "grad_norm": 1.5557063817977905, + "learning_rate": 8.663421559679545e-07, + "loss": 0.9306, + "step": 7350 + }, + { + "epoch": 0.17713927891910117, + "grad_norm": 1.6066994667053223, + "learning_rate": 8.662153939762701e-07, + "loss": 0.9709, + "step": 7355 + }, + { + "epoch": 0.17725969991088847, + "grad_norm": 1.4699970483779907, + "learning_rate": 8.660886319845857e-07, + "loss": 0.9337, + "step": 7360 + }, + { + "epoch": 0.17738012090267574, + "grad_norm": 1.4439870119094849, + "learning_rate": 8.659618699929013e-07, + "loss": 0.9177, + "step": 7365 + }, + { + "epoch": 0.17750054189446304, + "grad_norm": 1.50325608253479, + "learning_rate": 8.658351080012169e-07, + "loss": 0.9252, + "step": 7370 + }, + { + "epoch": 0.17762096288625034, + "grad_norm": 1.4546009302139282, + "learning_rate": 8.657083460095324e-07, + "loss": 0.9483, + "step": 7375 + }, + { + "epoch": 0.17774138387803762, + "grad_norm": 1.5476328134536743, + "learning_rate": 8.655815840178481e-07, + "loss": 0.9219, + "step": 7380 + }, + { + "epoch": 0.17786180486982492, + "grad_norm": 1.4623545408248901, + "learning_rate": 8.654548220261636e-07, + "loss": 0.9562, + "step": 7385 + }, + { + "epoch": 0.1779822258616122, + "grad_norm": 1.5667556524276733, + "learning_rate": 8.653280600344793e-07, + "loss": 0.9165, + "step": 7390 + }, + { + "epoch": 0.1781026468533995, + "grad_norm": 1.402410626411438, + "learning_rate": 8.652012980427949e-07, + "loss": 0.9527, + "step": 7395 + }, + { + "epoch": 0.17822306784518677, + "grad_norm": 1.6034663915634155, + "learning_rate": 8.650745360511103e-07, + "loss": 0.9819, + "step": 7400 + }, + { + "epoch": 0.17834348883697407, + "grad_norm": 1.5540601015090942, + "learning_rate": 8.64947774059426e-07, + "loss": 0.9365, + "step": 7405 + }, + { + "epoch": 0.17846390982876134, + "grad_norm": 1.5132516622543335, + "learning_rate": 8.648210120677416e-07, + "loss": 0.9918, + "step": 7410 + }, + { + "epoch": 0.17858433082054864, + "grad_norm": 1.5507590770721436, + "learning_rate": 8.646942500760572e-07, + "loss": 0.9234, + "step": 7415 + }, + { + "epoch": 0.17870475181233592, + "grad_norm": 1.5630336999893188, + "learning_rate": 8.645674880843727e-07, + "loss": 0.94, + "step": 7420 + }, + { + "epoch": 0.17882517280412322, + "grad_norm": 1.7237917184829712, + "learning_rate": 8.644407260926883e-07, + "loss": 0.9653, + "step": 7425 + }, + { + "epoch": 0.1789455937959105, + "grad_norm": 1.4693753719329834, + "learning_rate": 8.643139641010039e-07, + "loss": 0.9563, + "step": 7430 + }, + { + "epoch": 0.1790660147876978, + "grad_norm": 1.636465072631836, + "learning_rate": 8.641872021093195e-07, + "loss": 0.9552, + "step": 7435 + }, + { + "epoch": 0.1791864357794851, + "grad_norm": 1.3864063024520874, + "learning_rate": 8.640604401176352e-07, + "loss": 0.9553, + "step": 7440 + }, + { + "epoch": 0.17930685677127237, + "grad_norm": 1.6570584774017334, + "learning_rate": 8.639336781259506e-07, + "loss": 0.9229, + "step": 7445 + }, + { + "epoch": 0.17942727776305967, + "grad_norm": 1.4525080919265747, + "learning_rate": 8.638069161342662e-07, + "loss": 0.9361, + "step": 7450 + }, + { + "epoch": 0.17954769875484694, + "grad_norm": 1.5122401714324951, + "learning_rate": 8.636801541425819e-07, + "loss": 0.9189, + "step": 7455 + }, + { + "epoch": 0.17966811974663424, + "grad_norm": 1.5133171081542969, + "learning_rate": 8.635533921508974e-07, + "loss": 0.9435, + "step": 7460 + }, + { + "epoch": 0.17978854073842152, + "grad_norm": 1.5368260145187378, + "learning_rate": 8.63426630159213e-07, + "loss": 0.9743, + "step": 7465 + }, + { + "epoch": 0.17990896173020882, + "grad_norm": 1.7313311100006104, + "learning_rate": 8.632998681675286e-07, + "loss": 0.916, + "step": 7470 + }, + { + "epoch": 0.1800293827219961, + "grad_norm": 1.5005427598953247, + "learning_rate": 8.631731061758442e-07, + "loss": 0.9366, + "step": 7475 + }, + { + "epoch": 0.1801498037137834, + "grad_norm": 1.3697391748428345, + "learning_rate": 8.630463441841598e-07, + "loss": 0.9251, + "step": 7480 + }, + { + "epoch": 0.18027022470557066, + "grad_norm": 1.6520494222640991, + "learning_rate": 8.629195821924754e-07, + "loss": 0.9587, + "step": 7485 + }, + { + "epoch": 0.18039064569735797, + "grad_norm": 1.641788363456726, + "learning_rate": 8.627928202007909e-07, + "loss": 0.9348, + "step": 7490 + }, + { + "epoch": 0.18051106668914524, + "grad_norm": 1.6810346841812134, + "learning_rate": 8.626660582091065e-07, + "loss": 0.9282, + "step": 7495 + }, + { + "epoch": 0.18063148768093254, + "grad_norm": 1.5090734958648682, + "learning_rate": 8.625392962174222e-07, + "loss": 0.9644, + "step": 7500 + }, + { + "epoch": 0.18075190867271984, + "grad_norm": 1.4409652948379517, + "learning_rate": 8.624125342257377e-07, + "loss": 0.8875, + "step": 7505 + }, + { + "epoch": 0.18087232966450711, + "grad_norm": 1.6661747694015503, + "learning_rate": 8.622857722340534e-07, + "loss": 0.9867, + "step": 7510 + }, + { + "epoch": 0.18099275065629442, + "grad_norm": 1.784266471862793, + "learning_rate": 8.621590102423689e-07, + "loss": 0.9067, + "step": 7515 + }, + { + "epoch": 0.1811131716480817, + "grad_norm": 1.497228741645813, + "learning_rate": 8.620322482506844e-07, + "loss": 0.9663, + "step": 7520 + }, + { + "epoch": 0.181233592639869, + "grad_norm": 1.8661032915115356, + "learning_rate": 8.619054862590001e-07, + "loss": 0.9239, + "step": 7525 + }, + { + "epoch": 0.18135401363165626, + "grad_norm": 1.5789985656738281, + "learning_rate": 8.617787242673157e-07, + "loss": 1.0048, + "step": 7530 + }, + { + "epoch": 0.18147443462344356, + "grad_norm": 1.6937061548233032, + "learning_rate": 8.616519622756312e-07, + "loss": 0.9206, + "step": 7535 + }, + { + "epoch": 0.18159485561523084, + "grad_norm": 1.5689666271209717, + "learning_rate": 8.615252002839468e-07, + "loss": 0.9583, + "step": 7540 + }, + { + "epoch": 0.18171527660701814, + "grad_norm": 1.628058671951294, + "learning_rate": 8.613984382922624e-07, + "loss": 0.9669, + "step": 7545 + }, + { + "epoch": 0.1818356975988054, + "grad_norm": 1.5606167316436768, + "learning_rate": 8.61271676300578e-07, + "loss": 0.9188, + "step": 7550 + }, + { + "epoch": 0.1819561185905927, + "grad_norm": 1.4785312414169312, + "learning_rate": 8.611449143088936e-07, + "loss": 0.9194, + "step": 7555 + }, + { + "epoch": 0.18207653958238, + "grad_norm": 1.4869170188903809, + "learning_rate": 8.610181523172092e-07, + "loss": 0.9779, + "step": 7560 + }, + { + "epoch": 0.1821969605741673, + "grad_norm": 1.5507047176361084, + "learning_rate": 8.608913903255247e-07, + "loss": 0.9377, + "step": 7565 + }, + { + "epoch": 0.1823173815659546, + "grad_norm": 1.3956730365753174, + "learning_rate": 8.607646283338403e-07, + "loss": 0.9261, + "step": 7570 + }, + { + "epoch": 0.18243780255774186, + "grad_norm": 1.5166481733322144, + "learning_rate": 8.60637866342156e-07, + "loss": 0.9878, + "step": 7575 + }, + { + "epoch": 0.18255822354952916, + "grad_norm": 1.4868241548538208, + "learning_rate": 8.605111043504714e-07, + "loss": 0.8817, + "step": 7580 + }, + { + "epoch": 0.18267864454131644, + "grad_norm": 1.660776138305664, + "learning_rate": 8.603843423587871e-07, + "loss": 0.9763, + "step": 7585 + }, + { + "epoch": 0.18279906553310374, + "grad_norm": 1.8276817798614502, + "learning_rate": 8.602575803671027e-07, + "loss": 0.9639, + "step": 7590 + }, + { + "epoch": 0.182919486524891, + "grad_norm": 1.818725347518921, + "learning_rate": 8.601308183754183e-07, + "loss": 0.9588, + "step": 7595 + }, + { + "epoch": 0.1830399075166783, + "grad_norm": 1.521095871925354, + "learning_rate": 8.600040563837339e-07, + "loss": 0.9631, + "step": 7600 + }, + { + "epoch": 0.1831603285084656, + "grad_norm": 1.6158627271652222, + "learning_rate": 8.598772943920494e-07, + "loss": 0.9082, + "step": 7605 + }, + { + "epoch": 0.1832807495002529, + "grad_norm": 1.6289167404174805, + "learning_rate": 8.59750532400365e-07, + "loss": 0.9901, + "step": 7610 + }, + { + "epoch": 0.18340117049204016, + "grad_norm": 1.6304922103881836, + "learning_rate": 8.596237704086806e-07, + "loss": 0.9861, + "step": 7615 + }, + { + "epoch": 0.18352159148382746, + "grad_norm": 1.5539021492004395, + "learning_rate": 8.594970084169963e-07, + "loss": 0.9694, + "step": 7620 + }, + { + "epoch": 0.18364201247561474, + "grad_norm": 1.6585330963134766, + "learning_rate": 8.593702464253118e-07, + "loss": 0.9466, + "step": 7625 + }, + { + "epoch": 0.18376243346740204, + "grad_norm": 1.7435808181762695, + "learning_rate": 8.592434844336273e-07, + "loss": 0.9547, + "step": 7630 + }, + { + "epoch": 0.18388285445918934, + "grad_norm": 1.4508938789367676, + "learning_rate": 8.59116722441943e-07, + "loss": 0.9705, + "step": 7635 + }, + { + "epoch": 0.1840032754509766, + "grad_norm": 1.6086461544036865, + "learning_rate": 8.589899604502585e-07, + "loss": 0.8958, + "step": 7640 + }, + { + "epoch": 0.1841236964427639, + "grad_norm": 1.4307724237442017, + "learning_rate": 8.588631984585742e-07, + "loss": 0.9562, + "step": 7645 + }, + { + "epoch": 0.18424411743455119, + "grad_norm": 1.552018642425537, + "learning_rate": 8.587364364668897e-07, + "loss": 0.933, + "step": 7650 + }, + { + "epoch": 0.1843645384263385, + "grad_norm": 1.5186680555343628, + "learning_rate": 8.586096744752052e-07, + "loss": 0.9275, + "step": 7655 + }, + { + "epoch": 0.18448495941812576, + "grad_norm": 1.5889519453048706, + "learning_rate": 8.584829124835209e-07, + "loss": 0.9499, + "step": 7660 + }, + { + "epoch": 0.18460538040991306, + "grad_norm": 1.6468712091445923, + "learning_rate": 8.583561504918365e-07, + "loss": 0.9632, + "step": 7665 + }, + { + "epoch": 0.18472580140170033, + "grad_norm": 1.4589017629623413, + "learning_rate": 8.582293885001521e-07, + "loss": 0.9375, + "step": 7670 + }, + { + "epoch": 0.18484622239348764, + "grad_norm": 1.484344482421875, + "learning_rate": 8.581026265084676e-07, + "loss": 0.9317, + "step": 7675 + }, + { + "epoch": 0.1849666433852749, + "grad_norm": 1.5107464790344238, + "learning_rate": 8.579758645167833e-07, + "loss": 0.9496, + "step": 7680 + }, + { + "epoch": 0.1850870643770622, + "grad_norm": 1.4735205173492432, + "learning_rate": 8.578491025250988e-07, + "loss": 0.9515, + "step": 7685 + }, + { + "epoch": 0.1852074853688495, + "grad_norm": 1.405005931854248, + "learning_rate": 8.577223405334144e-07, + "loss": 0.9233, + "step": 7690 + }, + { + "epoch": 0.18532790636063678, + "grad_norm": 1.4967010021209717, + "learning_rate": 8.575955785417301e-07, + "loss": 0.9095, + "step": 7695 + }, + { + "epoch": 0.18544832735242409, + "grad_norm": 1.5843182802200317, + "learning_rate": 8.574688165500455e-07, + "loss": 0.9234, + "step": 7700 + }, + { + "epoch": 0.18556874834421136, + "grad_norm": 1.563092827796936, + "learning_rate": 8.573420545583612e-07, + "loss": 0.9395, + "step": 7705 + }, + { + "epoch": 0.18568916933599866, + "grad_norm": 1.7785170078277588, + "learning_rate": 8.572152925666768e-07, + "loss": 0.9331, + "step": 7710 + }, + { + "epoch": 0.18580959032778593, + "grad_norm": 1.48356294631958, + "learning_rate": 8.570885305749924e-07, + "loss": 0.9174, + "step": 7715 + }, + { + "epoch": 0.18593001131957324, + "grad_norm": 1.706422209739685, + "learning_rate": 8.569617685833079e-07, + "loss": 0.9301, + "step": 7720 + }, + { + "epoch": 0.1860504323113605, + "grad_norm": 1.7067279815673828, + "learning_rate": 8.568350065916235e-07, + "loss": 0.9196, + "step": 7725 + }, + { + "epoch": 0.1861708533031478, + "grad_norm": 1.6843154430389404, + "learning_rate": 8.567082445999391e-07, + "loss": 0.9272, + "step": 7730 + }, + { + "epoch": 0.18629127429493508, + "grad_norm": 1.6786079406738281, + "learning_rate": 8.565814826082547e-07, + "loss": 0.9272, + "step": 7735 + }, + { + "epoch": 0.18641169528672238, + "grad_norm": 1.850604772567749, + "learning_rate": 8.564547206165704e-07, + "loss": 0.9236, + "step": 7740 + }, + { + "epoch": 0.18653211627850966, + "grad_norm": 1.6428070068359375, + "learning_rate": 8.563279586248858e-07, + "loss": 0.9401, + "step": 7745 + }, + { + "epoch": 0.18665253727029696, + "grad_norm": 1.5928632020950317, + "learning_rate": 8.562011966332014e-07, + "loss": 0.9904, + "step": 7750 + }, + { + "epoch": 0.18677295826208426, + "grad_norm": 1.7247638702392578, + "learning_rate": 8.560744346415171e-07, + "loss": 0.9355, + "step": 7755 + }, + { + "epoch": 0.18689337925387153, + "grad_norm": 1.6918985843658447, + "learning_rate": 8.559476726498326e-07, + "loss": 0.9348, + "step": 7760 + }, + { + "epoch": 0.18701380024565883, + "grad_norm": 1.6758086681365967, + "learning_rate": 8.558209106581482e-07, + "loss": 0.9397, + "step": 7765 + }, + { + "epoch": 0.1871342212374461, + "grad_norm": 1.3978140354156494, + "learning_rate": 8.556941486664638e-07, + "loss": 0.9691, + "step": 7770 + }, + { + "epoch": 0.1872546422292334, + "grad_norm": 1.5308520793914795, + "learning_rate": 8.555673866747793e-07, + "loss": 0.9205, + "step": 7775 + }, + { + "epoch": 0.18737506322102068, + "grad_norm": 1.3909852504730225, + "learning_rate": 8.55440624683095e-07, + "loss": 0.9421, + "step": 7780 + }, + { + "epoch": 0.18749548421280798, + "grad_norm": 1.5220245122909546, + "learning_rate": 8.553138626914106e-07, + "loss": 0.9511, + "step": 7785 + }, + { + "epoch": 0.18761590520459526, + "grad_norm": 1.8534610271453857, + "learning_rate": 8.551871006997261e-07, + "loss": 0.9701, + "step": 7790 + }, + { + "epoch": 0.18773632619638256, + "grad_norm": 1.5921791791915894, + "learning_rate": 8.550603387080417e-07, + "loss": 0.9832, + "step": 7795 + }, + { + "epoch": 0.18785674718816983, + "grad_norm": 1.5496329069137573, + "learning_rate": 8.549335767163574e-07, + "loss": 0.9646, + "step": 7800 + }, + { + "epoch": 0.18797716817995713, + "grad_norm": 1.608414888381958, + "learning_rate": 8.548068147246729e-07, + "loss": 0.9486, + "step": 7805 + }, + { + "epoch": 0.1880975891717444, + "grad_norm": 1.4896926879882812, + "learning_rate": 8.546800527329884e-07, + "loss": 0.9662, + "step": 7810 + }, + { + "epoch": 0.1882180101635317, + "grad_norm": 1.7178910970687866, + "learning_rate": 8.545532907413041e-07, + "loss": 0.949, + "step": 7815 + }, + { + "epoch": 0.188338431155319, + "grad_norm": 1.6171796321868896, + "learning_rate": 8.544265287496196e-07, + "loss": 0.9203, + "step": 7820 + }, + { + "epoch": 0.18845885214710628, + "grad_norm": 1.8074618577957153, + "learning_rate": 8.542997667579353e-07, + "loss": 0.925, + "step": 7825 + }, + { + "epoch": 0.18857927313889358, + "grad_norm": 1.5316404104232788, + "learning_rate": 8.541730047662509e-07, + "loss": 0.9347, + "step": 7830 + }, + { + "epoch": 0.18869969413068086, + "grad_norm": 1.57062566280365, + "learning_rate": 8.540462427745663e-07, + "loss": 0.9009, + "step": 7835 + }, + { + "epoch": 0.18882011512246816, + "grad_norm": 1.4911035299301147, + "learning_rate": 8.53919480782882e-07, + "loss": 0.9158, + "step": 7840 + }, + { + "epoch": 0.18894053611425543, + "grad_norm": 1.526751160621643, + "learning_rate": 8.537927187911976e-07, + "loss": 0.9323, + "step": 7845 + }, + { + "epoch": 0.18906095710604273, + "grad_norm": 1.5808849334716797, + "learning_rate": 8.536659567995132e-07, + "loss": 0.9654, + "step": 7850 + }, + { + "epoch": 0.18918137809783, + "grad_norm": 1.5629630088806152, + "learning_rate": 8.535391948078288e-07, + "loss": 0.9316, + "step": 7855 + }, + { + "epoch": 0.1893017990896173, + "grad_norm": 1.66767156124115, + "learning_rate": 8.534124328161444e-07, + "loss": 0.9788, + "step": 7860 + }, + { + "epoch": 0.18942222008140458, + "grad_norm": 1.495460033416748, + "learning_rate": 8.532856708244599e-07, + "loss": 0.8883, + "step": 7865 + }, + { + "epoch": 0.18954264107319188, + "grad_norm": 1.5910550355911255, + "learning_rate": 8.531589088327755e-07, + "loss": 0.9311, + "step": 7870 + }, + { + "epoch": 0.18966306206497915, + "grad_norm": 1.4263969659805298, + "learning_rate": 8.530321468410912e-07, + "loss": 0.9181, + "step": 7875 + }, + { + "epoch": 0.18978348305676646, + "grad_norm": 1.524699330329895, + "learning_rate": 8.529053848494066e-07, + "loss": 0.9138, + "step": 7880 + }, + { + "epoch": 0.18990390404855376, + "grad_norm": 1.612310767173767, + "learning_rate": 8.527786228577223e-07, + "loss": 0.9042, + "step": 7885 + }, + { + "epoch": 0.19002432504034103, + "grad_norm": 1.527094841003418, + "learning_rate": 8.526518608660379e-07, + "loss": 0.9604, + "step": 7890 + }, + { + "epoch": 0.19014474603212833, + "grad_norm": 1.7105774879455566, + "learning_rate": 8.525250988743534e-07, + "loss": 0.975, + "step": 7895 + }, + { + "epoch": 0.1902651670239156, + "grad_norm": 1.6466574668884277, + "learning_rate": 8.523983368826691e-07, + "loss": 0.944, + "step": 7900 + }, + { + "epoch": 0.1903855880157029, + "grad_norm": 1.4922170639038086, + "learning_rate": 8.522715748909846e-07, + "loss": 0.9308, + "step": 7905 + }, + { + "epoch": 0.19050600900749018, + "grad_norm": 1.6497938632965088, + "learning_rate": 8.521448128993002e-07, + "loss": 0.9541, + "step": 7910 + }, + { + "epoch": 0.19062642999927748, + "grad_norm": 1.5365254878997803, + "learning_rate": 8.520180509076158e-07, + "loss": 0.9478, + "step": 7915 + }, + { + "epoch": 0.19074685099106475, + "grad_norm": 1.4882341623306274, + "learning_rate": 8.518912889159315e-07, + "loss": 0.984, + "step": 7920 + }, + { + "epoch": 0.19086727198285205, + "grad_norm": 1.5878196954727173, + "learning_rate": 8.517645269242469e-07, + "loss": 0.9844, + "step": 7925 + }, + { + "epoch": 0.19098769297463933, + "grad_norm": 1.6832592487335205, + "learning_rate": 8.516377649325625e-07, + "loss": 0.9382, + "step": 7930 + }, + { + "epoch": 0.19110811396642663, + "grad_norm": 1.73088800907135, + "learning_rate": 8.515110029408782e-07, + "loss": 0.9062, + "step": 7935 + }, + { + "epoch": 0.1912285349582139, + "grad_norm": 1.6562163829803467, + "learning_rate": 8.513842409491938e-07, + "loss": 0.9283, + "step": 7940 + }, + { + "epoch": 0.1913489559500012, + "grad_norm": 1.4478557109832764, + "learning_rate": 8.512574789575094e-07, + "loss": 0.9725, + "step": 7945 + }, + { + "epoch": 0.1914693769417885, + "grad_norm": 1.61317777633667, + "learning_rate": 8.511307169658249e-07, + "loss": 0.9291, + "step": 7950 + }, + { + "epoch": 0.19158979793357578, + "grad_norm": 1.579496145248413, + "learning_rate": 8.510039549741405e-07, + "loss": 0.9744, + "step": 7955 + }, + { + "epoch": 0.19171021892536308, + "grad_norm": 1.7063086032867432, + "learning_rate": 8.508771929824561e-07, + "loss": 0.9523, + "step": 7960 + }, + { + "epoch": 0.19183063991715035, + "grad_norm": 1.4805172681808472, + "learning_rate": 8.507504309907717e-07, + "loss": 0.9667, + "step": 7965 + }, + { + "epoch": 0.19195106090893765, + "grad_norm": 1.4771111011505127, + "learning_rate": 8.506236689990874e-07, + "loss": 0.9738, + "step": 7970 + }, + { + "epoch": 0.19207148190072493, + "grad_norm": 1.4734350442886353, + "learning_rate": 8.504969070074028e-07, + "loss": 0.9327, + "step": 7975 + }, + { + "epoch": 0.19219190289251223, + "grad_norm": 1.5960177183151245, + "learning_rate": 8.503701450157185e-07, + "loss": 0.9299, + "step": 7980 + }, + { + "epoch": 0.1923123238842995, + "grad_norm": 1.7017934322357178, + "learning_rate": 8.502433830240341e-07, + "loss": 0.9444, + "step": 7985 + }, + { + "epoch": 0.1924327448760868, + "grad_norm": 1.5495920181274414, + "learning_rate": 8.501166210323496e-07, + "loss": 0.9296, + "step": 7990 + }, + { + "epoch": 0.19255316586787408, + "grad_norm": 1.8156256675720215, + "learning_rate": 8.499898590406652e-07, + "loss": 0.9251, + "step": 7995 + }, + { + "epoch": 0.19267358685966138, + "grad_norm": 1.6993896961212158, + "learning_rate": 8.498630970489808e-07, + "loss": 0.9591, + "step": 8000 + }, + { + "epoch": 0.19279400785144865, + "grad_norm": 1.6826540231704712, + "learning_rate": 8.497363350572964e-07, + "loss": 0.9337, + "step": 8005 + }, + { + "epoch": 0.19291442884323595, + "grad_norm": 1.5303553342819214, + "learning_rate": 8.49609573065612e-07, + "loss": 0.9317, + "step": 8010 + }, + { + "epoch": 0.19303484983502325, + "grad_norm": 1.7423923015594482, + "learning_rate": 8.494828110739276e-07, + "loss": 0.9019, + "step": 8015 + }, + { + "epoch": 0.19315527082681053, + "grad_norm": 1.838249921798706, + "learning_rate": 8.493560490822431e-07, + "loss": 0.988, + "step": 8020 + }, + { + "epoch": 0.19327569181859783, + "grad_norm": 1.5801361799240112, + "learning_rate": 8.492292870905587e-07, + "loss": 0.9737, + "step": 8025 + }, + { + "epoch": 0.1933961128103851, + "grad_norm": 1.6283152103424072, + "learning_rate": 8.491025250988744e-07, + "loss": 0.9168, + "step": 8030 + }, + { + "epoch": 0.1935165338021724, + "grad_norm": 1.3490309715270996, + "learning_rate": 8.489757631071899e-07, + "loss": 0.9394, + "step": 8035 + }, + { + "epoch": 0.19363695479395968, + "grad_norm": 1.5663366317749023, + "learning_rate": 8.488490011155056e-07, + "loss": 0.906, + "step": 8040 + }, + { + "epoch": 0.19375737578574698, + "grad_norm": 1.4401100873947144, + "learning_rate": 8.487222391238211e-07, + "loss": 0.9805, + "step": 8045 + }, + { + "epoch": 0.19387779677753425, + "grad_norm": 1.521753191947937, + "learning_rate": 8.485954771321366e-07, + "loss": 0.9311, + "step": 8050 + }, + { + "epoch": 0.19399821776932155, + "grad_norm": 1.6854158639907837, + "learning_rate": 8.484687151404523e-07, + "loss": 0.9865, + "step": 8055 + }, + { + "epoch": 0.19411863876110882, + "grad_norm": 1.4435174465179443, + "learning_rate": 8.483419531487679e-07, + "loss": 0.9494, + "step": 8060 + }, + { + "epoch": 0.19423905975289613, + "grad_norm": 1.5255597829818726, + "learning_rate": 8.482151911570834e-07, + "loss": 0.9435, + "step": 8065 + }, + { + "epoch": 0.1943594807446834, + "grad_norm": 1.3990375995635986, + "learning_rate": 8.48088429165399e-07, + "loss": 0.9662, + "step": 8070 + }, + { + "epoch": 0.1944799017364707, + "grad_norm": 1.5630102157592773, + "learning_rate": 8.479616671737146e-07, + "loss": 0.9869, + "step": 8075 + }, + { + "epoch": 0.194600322728258, + "grad_norm": 1.6120853424072266, + "learning_rate": 8.478349051820302e-07, + "loss": 0.9512, + "step": 8080 + }, + { + "epoch": 0.19472074372004528, + "grad_norm": 1.422527551651001, + "learning_rate": 8.477081431903458e-07, + "loss": 0.9422, + "step": 8085 + }, + { + "epoch": 0.19484116471183258, + "grad_norm": 1.715245246887207, + "learning_rate": 8.475813811986614e-07, + "loss": 0.9268, + "step": 8090 + }, + { + "epoch": 0.19496158570361985, + "grad_norm": 1.4960979223251343, + "learning_rate": 8.474546192069769e-07, + "loss": 0.9693, + "step": 8095 + }, + { + "epoch": 0.19508200669540715, + "grad_norm": 1.811741828918457, + "learning_rate": 8.473278572152925e-07, + "loss": 0.9967, + "step": 8100 + }, + { + "epoch": 0.19520242768719442, + "grad_norm": 1.8520315885543823, + "learning_rate": 8.472010952236082e-07, + "loss": 0.9514, + "step": 8105 + }, + { + "epoch": 0.19532284867898173, + "grad_norm": 1.4953160285949707, + "learning_rate": 8.470743332319236e-07, + "loss": 0.9414, + "step": 8110 + }, + { + "epoch": 0.195443269670769, + "grad_norm": 1.5047308206558228, + "learning_rate": 8.469475712402393e-07, + "loss": 0.9273, + "step": 8115 + }, + { + "epoch": 0.1955636906625563, + "grad_norm": 1.5107359886169434, + "learning_rate": 8.468208092485549e-07, + "loss": 0.9351, + "step": 8120 + }, + { + "epoch": 0.19568411165434357, + "grad_norm": 1.3560688495635986, + "learning_rate": 8.466940472568705e-07, + "loss": 0.9283, + "step": 8125 + }, + { + "epoch": 0.19580453264613087, + "grad_norm": 2.0308735370635986, + "learning_rate": 8.465672852651861e-07, + "loss": 0.9829, + "step": 8130 + }, + { + "epoch": 0.19592495363791818, + "grad_norm": 1.367697834968567, + "learning_rate": 8.464405232735016e-07, + "loss": 0.9557, + "step": 8135 + }, + { + "epoch": 0.19604537462970545, + "grad_norm": 1.5483380556106567, + "learning_rate": 8.463137612818172e-07, + "loss": 0.9502, + "step": 8140 + }, + { + "epoch": 0.19616579562149275, + "grad_norm": 1.533666729927063, + "learning_rate": 8.461869992901328e-07, + "loss": 0.9107, + "step": 8145 + }, + { + "epoch": 0.19628621661328002, + "grad_norm": 1.575505018234253, + "learning_rate": 8.460602372984485e-07, + "loss": 0.9449, + "step": 8150 + }, + { + "epoch": 0.19640663760506732, + "grad_norm": 1.605459213256836, + "learning_rate": 8.45933475306764e-07, + "loss": 0.9894, + "step": 8155 + }, + { + "epoch": 0.1965270585968546, + "grad_norm": 1.5962629318237305, + "learning_rate": 8.458067133150795e-07, + "loss": 0.8916, + "step": 8160 + }, + { + "epoch": 0.1966474795886419, + "grad_norm": 1.4973393678665161, + "learning_rate": 8.456799513233952e-07, + "loss": 0.9511, + "step": 8165 + }, + { + "epoch": 0.19676790058042917, + "grad_norm": 1.5095032453536987, + "learning_rate": 8.455531893317107e-07, + "loss": 0.9788, + "step": 8170 + }, + { + "epoch": 0.19688832157221647, + "grad_norm": 1.900133490562439, + "learning_rate": 8.454264273400264e-07, + "loss": 0.9538, + "step": 8175 + }, + { + "epoch": 0.19700874256400375, + "grad_norm": 1.530402421951294, + "learning_rate": 8.452996653483419e-07, + "loss": 0.9698, + "step": 8180 + }, + { + "epoch": 0.19712916355579105, + "grad_norm": 1.551681637763977, + "learning_rate": 8.451729033566574e-07, + "loss": 0.8823, + "step": 8185 + }, + { + "epoch": 0.19724958454757832, + "grad_norm": 1.61061429977417, + "learning_rate": 8.450461413649731e-07, + "loss": 0.9481, + "step": 8190 + }, + { + "epoch": 0.19737000553936562, + "grad_norm": 1.6920380592346191, + "learning_rate": 8.449193793732887e-07, + "loss": 0.9234, + "step": 8195 + }, + { + "epoch": 0.19749042653115292, + "grad_norm": 1.539293885231018, + "learning_rate": 8.447926173816043e-07, + "loss": 0.9832, + "step": 8200 + }, + { + "epoch": 0.1976108475229402, + "grad_norm": 1.5610792636871338, + "learning_rate": 8.446658553899198e-07, + "loss": 0.9322, + "step": 8205 + }, + { + "epoch": 0.1977312685147275, + "grad_norm": 1.4349342584609985, + "learning_rate": 8.445390933982355e-07, + "loss": 0.9551, + "step": 8210 + }, + { + "epoch": 0.19785168950651477, + "grad_norm": 1.6934919357299805, + "learning_rate": 8.44412331406551e-07, + "loss": 0.9032, + "step": 8215 + }, + { + "epoch": 0.19797211049830207, + "grad_norm": 1.657364845275879, + "learning_rate": 8.442855694148666e-07, + "loss": 0.9226, + "step": 8220 + }, + { + "epoch": 0.19809253149008935, + "grad_norm": 1.3018603324890137, + "learning_rate": 8.441588074231823e-07, + "loss": 0.9307, + "step": 8225 + }, + { + "epoch": 0.19821295248187665, + "grad_norm": 1.4067305326461792, + "learning_rate": 8.440320454314977e-07, + "loss": 0.9819, + "step": 8230 + }, + { + "epoch": 0.19833337347366392, + "grad_norm": 1.7177774906158447, + "learning_rate": 8.439052834398134e-07, + "loss": 0.9877, + "step": 8235 + }, + { + "epoch": 0.19845379446545122, + "grad_norm": 1.5552080869674683, + "learning_rate": 8.43778521448129e-07, + "loss": 0.9211, + "step": 8240 + }, + { + "epoch": 0.1985742154572385, + "grad_norm": 1.6212966442108154, + "learning_rate": 8.436517594564446e-07, + "loss": 0.882, + "step": 8245 + }, + { + "epoch": 0.1986946364490258, + "grad_norm": 1.4554401636123657, + "learning_rate": 8.435249974647601e-07, + "loss": 0.9842, + "step": 8250 + }, + { + "epoch": 0.19881505744081307, + "grad_norm": 1.6188297271728516, + "learning_rate": 8.433982354730757e-07, + "loss": 0.9164, + "step": 8255 + }, + { + "epoch": 0.19893547843260037, + "grad_norm": 1.503028154373169, + "learning_rate": 8.432714734813913e-07, + "loss": 0.9838, + "step": 8260 + }, + { + "epoch": 0.19905589942438767, + "grad_norm": 1.3847625255584717, + "learning_rate": 8.431447114897069e-07, + "loss": 0.9386, + "step": 8265 + }, + { + "epoch": 0.19917632041617495, + "grad_norm": 1.653942346572876, + "learning_rate": 8.430179494980226e-07, + "loss": 0.9559, + "step": 8270 + }, + { + "epoch": 0.19929674140796225, + "grad_norm": 1.5789453983306885, + "learning_rate": 8.42891187506338e-07, + "loss": 0.9565, + "step": 8275 + }, + { + "epoch": 0.19941716239974952, + "grad_norm": 1.4539376497268677, + "learning_rate": 8.427644255146536e-07, + "loss": 0.9697, + "step": 8280 + }, + { + "epoch": 0.19953758339153682, + "grad_norm": 1.746583342552185, + "learning_rate": 8.426376635229693e-07, + "loss": 1.0047, + "step": 8285 + }, + { + "epoch": 0.1996580043833241, + "grad_norm": 2.238571882247925, + "learning_rate": 8.425109015312848e-07, + "loss": 0.9372, + "step": 8290 + }, + { + "epoch": 0.1997784253751114, + "grad_norm": 1.5720535516738892, + "learning_rate": 8.423841395396004e-07, + "loss": 0.8936, + "step": 8295 + }, + { + "epoch": 0.19989884636689867, + "grad_norm": 1.8832333087921143, + "learning_rate": 8.42257377547916e-07, + "loss": 1.0061, + "step": 8300 + }, + { + "epoch": 0.20001926735868597, + "grad_norm": 1.635061264038086, + "learning_rate": 8.421306155562315e-07, + "loss": 0.98, + "step": 8305 + }, + { + "epoch": 0.20013968835047324, + "grad_norm": 1.7162737846374512, + "learning_rate": 8.420038535645472e-07, + "loss": 0.9042, + "step": 8310 + }, + { + "epoch": 0.20026010934226054, + "grad_norm": 1.5249680280685425, + "learning_rate": 8.418770915728628e-07, + "loss": 0.9422, + "step": 8315 + }, + { + "epoch": 0.20038053033404782, + "grad_norm": 1.5159083604812622, + "learning_rate": 8.417503295811783e-07, + "loss": 0.9585, + "step": 8320 + }, + { + "epoch": 0.20050095132583512, + "grad_norm": 1.547275424003601, + "learning_rate": 8.416235675894939e-07, + "loss": 0.9712, + "step": 8325 + }, + { + "epoch": 0.20062137231762242, + "grad_norm": 1.4447563886642456, + "learning_rate": 8.414968055978096e-07, + "loss": 0.9383, + "step": 8330 + }, + { + "epoch": 0.2007417933094097, + "grad_norm": 1.4032526016235352, + "learning_rate": 8.413700436061251e-07, + "loss": 0.987, + "step": 8335 + }, + { + "epoch": 0.200862214301197, + "grad_norm": 1.5580028295516968, + "learning_rate": 8.412432816144407e-07, + "loss": 0.8971, + "step": 8340 + }, + { + "epoch": 0.20098263529298427, + "grad_norm": 1.5366116762161255, + "learning_rate": 8.411165196227563e-07, + "loss": 0.9864, + "step": 8345 + }, + { + "epoch": 0.20110305628477157, + "grad_norm": 1.590053677558899, + "learning_rate": 8.409897576310718e-07, + "loss": 0.9343, + "step": 8350 + }, + { + "epoch": 0.20122347727655884, + "grad_norm": 1.5261359214782715, + "learning_rate": 8.408629956393875e-07, + "loss": 0.9035, + "step": 8355 + }, + { + "epoch": 0.20134389826834614, + "grad_norm": 1.4733154773712158, + "learning_rate": 8.407362336477031e-07, + "loss": 0.9255, + "step": 8360 + }, + { + "epoch": 0.20146431926013342, + "grad_norm": 1.9009312391281128, + "learning_rate": 8.406094716560185e-07, + "loss": 0.979, + "step": 8365 + }, + { + "epoch": 0.20158474025192072, + "grad_norm": 1.560454249382019, + "learning_rate": 8.404827096643342e-07, + "loss": 0.9264, + "step": 8370 + }, + { + "epoch": 0.201705161243708, + "grad_norm": 1.6878447532653809, + "learning_rate": 8.403559476726498e-07, + "loss": 0.9107, + "step": 8375 + }, + { + "epoch": 0.2018255822354953, + "grad_norm": 1.3901766538619995, + "learning_rate": 8.402291856809654e-07, + "loss": 0.9724, + "step": 8380 + }, + { + "epoch": 0.20194600322728257, + "grad_norm": 1.402201771736145, + "learning_rate": 8.40102423689281e-07, + "loss": 0.9326, + "step": 8385 + }, + { + "epoch": 0.20206642421906987, + "grad_norm": 1.4999525547027588, + "learning_rate": 8.399756616975966e-07, + "loss": 0.983, + "step": 8390 + }, + { + "epoch": 0.20218684521085717, + "grad_norm": 1.554654598236084, + "learning_rate": 8.398488997059121e-07, + "loss": 0.9594, + "step": 8395 + }, + { + "epoch": 0.20230726620264444, + "grad_norm": 1.4805643558502197, + "learning_rate": 8.397221377142277e-07, + "loss": 0.9716, + "step": 8400 + }, + { + "epoch": 0.20242768719443174, + "grad_norm": 1.4133025407791138, + "learning_rate": 8.395953757225434e-07, + "loss": 0.9541, + "step": 8405 + }, + { + "epoch": 0.20254810818621902, + "grad_norm": 1.8007771968841553, + "learning_rate": 8.394686137308588e-07, + "loss": 0.9827, + "step": 8410 + }, + { + "epoch": 0.20266852917800632, + "grad_norm": 1.5795798301696777, + "learning_rate": 8.393418517391745e-07, + "loss": 0.9792, + "step": 8415 + }, + { + "epoch": 0.2027889501697936, + "grad_norm": 1.633016586303711, + "learning_rate": 8.392150897474901e-07, + "loss": 0.9379, + "step": 8420 + }, + { + "epoch": 0.2029093711615809, + "grad_norm": 1.6277592182159424, + "learning_rate": 8.390883277558056e-07, + "loss": 0.9478, + "step": 8425 + }, + { + "epoch": 0.20302979215336817, + "grad_norm": 1.718751311302185, + "learning_rate": 8.389615657641213e-07, + "loss": 0.944, + "step": 8430 + }, + { + "epoch": 0.20315021314515547, + "grad_norm": 1.5914260149002075, + "learning_rate": 8.388348037724368e-07, + "loss": 0.9741, + "step": 8435 + }, + { + "epoch": 0.20327063413694274, + "grad_norm": 1.4470100402832031, + "learning_rate": 8.387080417807524e-07, + "loss": 0.9075, + "step": 8440 + }, + { + "epoch": 0.20339105512873004, + "grad_norm": 1.3547005653381348, + "learning_rate": 8.38581279789068e-07, + "loss": 0.9711, + "step": 8445 + }, + { + "epoch": 0.20351147612051732, + "grad_norm": 1.6130958795547485, + "learning_rate": 8.384545177973837e-07, + "loss": 0.8619, + "step": 8450 + }, + { + "epoch": 0.20363189711230462, + "grad_norm": 1.448756456375122, + "learning_rate": 8.383277558056992e-07, + "loss": 0.9311, + "step": 8455 + }, + { + "epoch": 0.20375231810409192, + "grad_norm": 1.7305628061294556, + "learning_rate": 8.382009938140147e-07, + "loss": 0.9222, + "step": 8460 + }, + { + "epoch": 0.2038727390958792, + "grad_norm": 1.4965406656265259, + "learning_rate": 8.380742318223304e-07, + "loss": 0.956, + "step": 8465 + }, + { + "epoch": 0.2039931600876665, + "grad_norm": 1.5525379180908203, + "learning_rate": 8.379474698306459e-07, + "loss": 0.9343, + "step": 8470 + }, + { + "epoch": 0.20411358107945377, + "grad_norm": 1.8234738111495972, + "learning_rate": 8.378207078389616e-07, + "loss": 0.9319, + "step": 8475 + }, + { + "epoch": 0.20423400207124107, + "grad_norm": 1.8297779560089111, + "learning_rate": 8.376939458472771e-07, + "loss": 0.9049, + "step": 8480 + }, + { + "epoch": 0.20435442306302834, + "grad_norm": 1.5790259838104248, + "learning_rate": 8.375671838555926e-07, + "loss": 0.9588, + "step": 8485 + }, + { + "epoch": 0.20447484405481564, + "grad_norm": 1.7919198274612427, + "learning_rate": 8.374404218639083e-07, + "loss": 0.9558, + "step": 8490 + }, + { + "epoch": 0.20459526504660291, + "grad_norm": 1.4658918380737305, + "learning_rate": 8.373136598722239e-07, + "loss": 0.9135, + "step": 8495 + }, + { + "epoch": 0.20471568603839022, + "grad_norm": 1.581298828125, + "learning_rate": 8.371868978805395e-07, + "loss": 0.931, + "step": 8500 + }, + { + "epoch": 0.2048361070301775, + "grad_norm": 1.5665123462677002, + "learning_rate": 8.37060135888855e-07, + "loss": 0.9686, + "step": 8505 + }, + { + "epoch": 0.2049565280219648, + "grad_norm": 1.6349343061447144, + "learning_rate": 8.369333738971707e-07, + "loss": 0.9041, + "step": 8510 + }, + { + "epoch": 0.20507694901375206, + "grad_norm": 1.7548354864120483, + "learning_rate": 8.368066119054862e-07, + "loss": 0.9142, + "step": 8515 + }, + { + "epoch": 0.20519737000553936, + "grad_norm": 1.7395234107971191, + "learning_rate": 8.366798499138018e-07, + "loss": 0.8966, + "step": 8520 + }, + { + "epoch": 0.20531779099732667, + "grad_norm": 1.6489882469177246, + "learning_rate": 8.365530879221175e-07, + "loss": 0.9579, + "step": 8525 + }, + { + "epoch": 0.20543821198911394, + "grad_norm": 1.679075837135315, + "learning_rate": 8.364263259304329e-07, + "loss": 0.9257, + "step": 8530 + }, + { + "epoch": 0.20555863298090124, + "grad_norm": 1.3792853355407715, + "learning_rate": 8.362995639387486e-07, + "loss": 0.9361, + "step": 8535 + }, + { + "epoch": 0.2056790539726885, + "grad_norm": 1.6504968404769897, + "learning_rate": 8.361728019470642e-07, + "loss": 0.9498, + "step": 8540 + }, + { + "epoch": 0.20579947496447581, + "grad_norm": 1.645012378692627, + "learning_rate": 8.360460399553797e-07, + "loss": 0.9963, + "step": 8545 + }, + { + "epoch": 0.2059198959562631, + "grad_norm": 1.3971391916275024, + "learning_rate": 8.359192779636953e-07, + "loss": 0.9941, + "step": 8550 + }, + { + "epoch": 0.2060403169480504, + "grad_norm": 1.675577998161316, + "learning_rate": 8.357925159720109e-07, + "loss": 0.9227, + "step": 8555 + }, + { + "epoch": 0.20616073793983766, + "grad_norm": 1.5664169788360596, + "learning_rate": 8.356657539803265e-07, + "loss": 0.9335, + "step": 8560 + }, + { + "epoch": 0.20628115893162496, + "grad_norm": 1.7027219533920288, + "learning_rate": 8.355389919886421e-07, + "loss": 0.961, + "step": 8565 + }, + { + "epoch": 0.20640157992341224, + "grad_norm": 1.5615417957305908, + "learning_rate": 8.354122299969578e-07, + "loss": 0.9518, + "step": 8570 + }, + { + "epoch": 0.20652200091519954, + "grad_norm": 1.8420103788375854, + "learning_rate": 8.352854680052732e-07, + "loss": 0.9987, + "step": 8575 + }, + { + "epoch": 0.20664242190698684, + "grad_norm": 1.625141978263855, + "learning_rate": 8.351587060135888e-07, + "loss": 0.8968, + "step": 8580 + }, + { + "epoch": 0.2067628428987741, + "grad_norm": 1.6060346364974976, + "learning_rate": 8.350319440219045e-07, + "loss": 0.9154, + "step": 8585 + }, + { + "epoch": 0.20688326389056141, + "grad_norm": 1.5125768184661865, + "learning_rate": 8.3490518203022e-07, + "loss": 0.9727, + "step": 8590 + }, + { + "epoch": 0.2070036848823487, + "grad_norm": 1.5166090726852417, + "learning_rate": 8.347784200385356e-07, + "loss": 0.9022, + "step": 8595 + }, + { + "epoch": 0.207124105874136, + "grad_norm": 1.4007935523986816, + "learning_rate": 8.346516580468512e-07, + "loss": 0.948, + "step": 8600 + }, + { + "epoch": 0.20724452686592326, + "grad_norm": 1.7391657829284668, + "learning_rate": 8.345248960551667e-07, + "loss": 0.8898, + "step": 8605 + }, + { + "epoch": 0.20736494785771056, + "grad_norm": 1.3972952365875244, + "learning_rate": 8.343981340634824e-07, + "loss": 0.8994, + "step": 8610 + }, + { + "epoch": 0.20748536884949784, + "grad_norm": 1.6144566535949707, + "learning_rate": 8.34271372071798e-07, + "loss": 0.9528, + "step": 8615 + }, + { + "epoch": 0.20760578984128514, + "grad_norm": 1.5001790523529053, + "learning_rate": 8.341446100801135e-07, + "loss": 0.9638, + "step": 8620 + }, + { + "epoch": 0.2077262108330724, + "grad_norm": 1.884637713432312, + "learning_rate": 8.340178480884291e-07, + "loss": 0.9093, + "step": 8625 + }, + { + "epoch": 0.2078466318248597, + "grad_norm": 1.5699843168258667, + "learning_rate": 8.338910860967447e-07, + "loss": 0.973, + "step": 8630 + }, + { + "epoch": 0.20796705281664699, + "grad_norm": 1.5710140466690063, + "learning_rate": 8.337643241050603e-07, + "loss": 0.9331, + "step": 8635 + }, + { + "epoch": 0.2080874738084343, + "grad_norm": 1.4831452369689941, + "learning_rate": 8.336375621133759e-07, + "loss": 0.9384, + "step": 8640 + }, + { + "epoch": 0.2082078948002216, + "grad_norm": 1.5688238143920898, + "learning_rate": 8.335108001216915e-07, + "loss": 0.9716, + "step": 8645 + }, + { + "epoch": 0.20832831579200886, + "grad_norm": 1.4611543416976929, + "learning_rate": 8.33384038130007e-07, + "loss": 0.8993, + "step": 8650 + }, + { + "epoch": 0.20844873678379616, + "grad_norm": 1.4965218305587769, + "learning_rate": 8.332572761383227e-07, + "loss": 0.9188, + "step": 8655 + }, + { + "epoch": 0.20856915777558344, + "grad_norm": 1.3796385526657104, + "learning_rate": 8.331305141466383e-07, + "loss": 0.8892, + "step": 8660 + }, + { + "epoch": 0.20868957876737074, + "grad_norm": 1.4881573915481567, + "learning_rate": 8.330037521549537e-07, + "loss": 0.8978, + "step": 8665 + }, + { + "epoch": 0.208809999759158, + "grad_norm": 1.588682770729065, + "learning_rate": 8.328769901632694e-07, + "loss": 0.9164, + "step": 8670 + }, + { + "epoch": 0.2089304207509453, + "grad_norm": 1.5392111539840698, + "learning_rate": 8.32750228171585e-07, + "loss": 0.9245, + "step": 8675 + }, + { + "epoch": 0.20905084174273258, + "grad_norm": 2.1696298122406006, + "learning_rate": 8.326234661799006e-07, + "loss": 0.9343, + "step": 8680 + }, + { + "epoch": 0.20917126273451989, + "grad_norm": 1.838498830795288, + "learning_rate": 8.324967041882162e-07, + "loss": 0.9351, + "step": 8685 + }, + { + "epoch": 0.20929168372630716, + "grad_norm": 1.9327776432037354, + "learning_rate": 8.323699421965317e-07, + "loss": 0.9452, + "step": 8690 + }, + { + "epoch": 0.20941210471809446, + "grad_norm": 1.5329734086990356, + "learning_rate": 8.322431802048473e-07, + "loss": 0.9778, + "step": 8695 + }, + { + "epoch": 0.20953252570988173, + "grad_norm": 1.5288251638412476, + "learning_rate": 8.321164182131629e-07, + "loss": 0.9253, + "step": 8700 + }, + { + "epoch": 0.20965294670166904, + "grad_norm": 1.4873708486557007, + "learning_rate": 8.319896562214786e-07, + "loss": 0.9109, + "step": 8705 + }, + { + "epoch": 0.20977336769345634, + "grad_norm": 1.5548856258392334, + "learning_rate": 8.31862894229794e-07, + "loss": 0.9195, + "step": 8710 + }, + { + "epoch": 0.2098937886852436, + "grad_norm": 1.4828455448150635, + "learning_rate": 8.317361322381096e-07, + "loss": 0.881, + "step": 8715 + }, + { + "epoch": 0.2100142096770309, + "grad_norm": 1.46074378490448, + "learning_rate": 8.316093702464253e-07, + "loss": 0.9704, + "step": 8720 + }, + { + "epoch": 0.21013463066881818, + "grad_norm": 1.7580933570861816, + "learning_rate": 8.314826082547408e-07, + "loss": 0.9254, + "step": 8725 + }, + { + "epoch": 0.21025505166060549, + "grad_norm": 1.614861011505127, + "learning_rate": 8.313558462630565e-07, + "loss": 0.9465, + "step": 8730 + }, + { + "epoch": 0.21037547265239276, + "grad_norm": 1.5930465459823608, + "learning_rate": 8.31229084271372e-07, + "loss": 0.9391, + "step": 8735 + }, + { + "epoch": 0.21049589364418006, + "grad_norm": 1.4870792627334595, + "learning_rate": 8.311023222796876e-07, + "loss": 0.8878, + "step": 8740 + }, + { + "epoch": 0.21061631463596733, + "grad_norm": 1.4152858257293701, + "learning_rate": 8.309755602880032e-07, + "loss": 0.9364, + "step": 8745 + }, + { + "epoch": 0.21073673562775463, + "grad_norm": 1.6814806461334229, + "learning_rate": 8.308487982963188e-07, + "loss": 0.92, + "step": 8750 + }, + { + "epoch": 0.2108571566195419, + "grad_norm": 1.6177480220794678, + "learning_rate": 8.307220363046344e-07, + "loss": 0.9439, + "step": 8755 + }, + { + "epoch": 0.2109775776113292, + "grad_norm": 1.4960063695907593, + "learning_rate": 8.305952743129499e-07, + "loss": 0.8827, + "step": 8760 + }, + { + "epoch": 0.21109799860311648, + "grad_norm": 1.6099573373794556, + "learning_rate": 8.304685123212656e-07, + "loss": 0.961, + "step": 8765 + }, + { + "epoch": 0.21121841959490378, + "grad_norm": 1.7052769660949707, + "learning_rate": 8.303417503295811e-07, + "loss": 0.9841, + "step": 8770 + }, + { + "epoch": 0.21133884058669108, + "grad_norm": 1.85787034034729, + "learning_rate": 8.302149883378968e-07, + "loss": 0.93, + "step": 8775 + }, + { + "epoch": 0.21145926157847836, + "grad_norm": 1.7271051406860352, + "learning_rate": 8.300882263462123e-07, + "loss": 0.9026, + "step": 8780 + }, + { + "epoch": 0.21157968257026566, + "grad_norm": 1.6201260089874268, + "learning_rate": 8.299614643545278e-07, + "loss": 0.9182, + "step": 8785 + }, + { + "epoch": 0.21170010356205293, + "grad_norm": 2.0843045711517334, + "learning_rate": 8.298347023628435e-07, + "loss": 1.0306, + "step": 8790 + }, + { + "epoch": 0.21182052455384023, + "grad_norm": 1.541614055633545, + "learning_rate": 8.297079403711591e-07, + "loss": 0.8982, + "step": 8795 + }, + { + "epoch": 0.2119409455456275, + "grad_norm": 1.7830393314361572, + "learning_rate": 8.295811783794747e-07, + "loss": 0.9132, + "step": 8800 + }, + { + "epoch": 0.2120613665374148, + "grad_norm": 1.6531665325164795, + "learning_rate": 8.294544163877902e-07, + "loss": 0.9826, + "step": 8805 + }, + { + "epoch": 0.21218178752920208, + "grad_norm": 1.539405107498169, + "learning_rate": 8.293276543961058e-07, + "loss": 0.9498, + "step": 8810 + }, + { + "epoch": 0.21230220852098938, + "grad_norm": 1.5371087789535522, + "learning_rate": 8.292008924044214e-07, + "loss": 0.8884, + "step": 8815 + }, + { + "epoch": 0.21242262951277666, + "grad_norm": 1.4828464984893799, + "learning_rate": 8.29074130412737e-07, + "loss": 0.9002, + "step": 8820 + }, + { + "epoch": 0.21254305050456396, + "grad_norm": 1.4968286752700806, + "learning_rate": 8.289473684210527e-07, + "loss": 0.9189, + "step": 8825 + }, + { + "epoch": 0.21266347149635123, + "grad_norm": 1.496974229812622, + "learning_rate": 8.288206064293681e-07, + "loss": 0.951, + "step": 8830 + }, + { + "epoch": 0.21278389248813853, + "grad_norm": 1.4990928173065186, + "learning_rate": 8.286938444376837e-07, + "loss": 0.977, + "step": 8835 + }, + { + "epoch": 0.21290431347992583, + "grad_norm": 1.6481170654296875, + "learning_rate": 8.285670824459994e-07, + "loss": 0.9582, + "step": 8840 + }, + { + "epoch": 0.2130247344717131, + "grad_norm": 1.5998990535736084, + "learning_rate": 8.284403204543149e-07, + "loss": 0.8715, + "step": 8845 + }, + { + "epoch": 0.2131451554635004, + "grad_norm": 1.5779908895492554, + "learning_rate": 8.283135584626305e-07, + "loss": 0.9167, + "step": 8850 + }, + { + "epoch": 0.21326557645528768, + "grad_norm": 1.4358510971069336, + "learning_rate": 8.281867964709461e-07, + "loss": 0.9238, + "step": 8855 + }, + { + "epoch": 0.21338599744707498, + "grad_norm": 1.5531116724014282, + "learning_rate": 8.280600344792617e-07, + "loss": 0.9415, + "step": 8860 + }, + { + "epoch": 0.21350641843886226, + "grad_norm": 1.5850262641906738, + "learning_rate": 8.279332724875773e-07, + "loss": 0.9163, + "step": 8865 + }, + { + "epoch": 0.21362683943064956, + "grad_norm": 1.3992431163787842, + "learning_rate": 8.278065104958929e-07, + "loss": 0.9199, + "step": 8870 + }, + { + "epoch": 0.21374726042243683, + "grad_norm": 1.2640268802642822, + "learning_rate": 8.276797485042084e-07, + "loss": 0.874, + "step": 8875 + }, + { + "epoch": 0.21386768141422413, + "grad_norm": 1.5796748399734497, + "learning_rate": 8.27552986512524e-07, + "loss": 0.8909, + "step": 8880 + }, + { + "epoch": 0.2139881024060114, + "grad_norm": 1.6204979419708252, + "learning_rate": 8.274262245208397e-07, + "loss": 0.8746, + "step": 8885 + }, + { + "epoch": 0.2141085233977987, + "grad_norm": 1.566311001777649, + "learning_rate": 8.272994625291552e-07, + "loss": 0.9035, + "step": 8890 + }, + { + "epoch": 0.21422894438958598, + "grad_norm": 1.482667326927185, + "learning_rate": 8.271727005374707e-07, + "loss": 0.9214, + "step": 8895 + }, + { + "epoch": 0.21434936538137328, + "grad_norm": 1.520044207572937, + "learning_rate": 8.270459385457864e-07, + "loss": 0.9096, + "step": 8900 + }, + { + "epoch": 0.21446978637316058, + "grad_norm": 1.5712969303131104, + "learning_rate": 8.269191765541019e-07, + "loss": 0.9333, + "step": 8905 + }, + { + "epoch": 0.21459020736494785, + "grad_norm": 1.6662484407424927, + "learning_rate": 8.267924145624176e-07, + "loss": 0.9577, + "step": 8910 + }, + { + "epoch": 0.21471062835673516, + "grad_norm": 1.487890362739563, + "learning_rate": 8.266656525707332e-07, + "loss": 0.9055, + "step": 8915 + }, + { + "epoch": 0.21483104934852243, + "grad_norm": 1.7255772352218628, + "learning_rate": 8.265388905790486e-07, + "loss": 0.9385, + "step": 8920 + }, + { + "epoch": 0.21495147034030973, + "grad_norm": 1.6064256429672241, + "learning_rate": 8.264121285873643e-07, + "loss": 0.9491, + "step": 8925 + }, + { + "epoch": 0.215071891332097, + "grad_norm": 1.5699553489685059, + "learning_rate": 8.262853665956799e-07, + "loss": 0.9202, + "step": 8930 + }, + { + "epoch": 0.2151923123238843, + "grad_norm": 1.543715238571167, + "learning_rate": 8.261586046039955e-07, + "loss": 0.9249, + "step": 8935 + }, + { + "epoch": 0.21531273331567158, + "grad_norm": 1.4051569700241089, + "learning_rate": 8.260318426123111e-07, + "loss": 0.9502, + "step": 8940 + }, + { + "epoch": 0.21543315430745888, + "grad_norm": 1.690550446510315, + "learning_rate": 8.259050806206267e-07, + "loss": 0.8807, + "step": 8945 + }, + { + "epoch": 0.21555357529924615, + "grad_norm": 1.3593227863311768, + "learning_rate": 8.257783186289422e-07, + "loss": 0.9861, + "step": 8950 + }, + { + "epoch": 0.21567399629103345, + "grad_norm": 1.652435064315796, + "learning_rate": 8.256515566372578e-07, + "loss": 0.9384, + "step": 8955 + }, + { + "epoch": 0.21579441728282076, + "grad_norm": 1.7051680088043213, + "learning_rate": 8.255247946455735e-07, + "loss": 0.9455, + "step": 8960 + }, + { + "epoch": 0.21591483827460803, + "grad_norm": 1.766378402709961, + "learning_rate": 8.25398032653889e-07, + "loss": 0.9533, + "step": 8965 + }, + { + "epoch": 0.21603525926639533, + "grad_norm": 1.6049004793167114, + "learning_rate": 8.252712706622046e-07, + "loss": 0.9435, + "step": 8970 + }, + { + "epoch": 0.2161556802581826, + "grad_norm": 1.7170352935791016, + "learning_rate": 8.251445086705202e-07, + "loss": 0.9371, + "step": 8975 + }, + { + "epoch": 0.2162761012499699, + "grad_norm": 1.506013035774231, + "learning_rate": 8.250177466788359e-07, + "loss": 0.8851, + "step": 8980 + }, + { + "epoch": 0.21639652224175718, + "grad_norm": 1.8347394466400146, + "learning_rate": 8.248909846871514e-07, + "loss": 0.9822, + "step": 8985 + }, + { + "epoch": 0.21651694323354448, + "grad_norm": 1.7963132858276367, + "learning_rate": 8.247642226954669e-07, + "loss": 0.9288, + "step": 8990 + }, + { + "epoch": 0.21663736422533175, + "grad_norm": 1.444070816040039, + "learning_rate": 8.246374607037826e-07, + "loss": 0.9085, + "step": 8995 + }, + { + "epoch": 0.21675778521711905, + "grad_norm": 1.5620007514953613, + "learning_rate": 8.245106987120981e-07, + "loss": 0.9667, + "step": 9000 + }, + { + "epoch": 0.21687820620890633, + "grad_norm": 1.4637254476547241, + "learning_rate": 8.243839367204138e-07, + "loss": 0.9519, + "step": 9005 + }, + { + "epoch": 0.21699862720069363, + "grad_norm": 1.7866486310958862, + "learning_rate": 8.242571747287294e-07, + "loss": 0.9099, + "step": 9010 + }, + { + "epoch": 0.2171190481924809, + "grad_norm": 1.5120880603790283, + "learning_rate": 8.241304127370448e-07, + "loss": 0.889, + "step": 9015 + }, + { + "epoch": 0.2172394691842682, + "grad_norm": 1.47304105758667, + "learning_rate": 8.240036507453605e-07, + "loss": 0.9659, + "step": 9020 + }, + { + "epoch": 0.2173598901760555, + "grad_norm": 1.4226914644241333, + "learning_rate": 8.238768887536761e-07, + "loss": 0.9371, + "step": 9025 + }, + { + "epoch": 0.21748031116784278, + "grad_norm": 1.4126009941101074, + "learning_rate": 8.237501267619917e-07, + "loss": 0.9464, + "step": 9030 + }, + { + "epoch": 0.21760073215963008, + "grad_norm": 1.5451222658157349, + "learning_rate": 8.236233647703072e-07, + "loss": 0.8829, + "step": 9035 + }, + { + "epoch": 0.21772115315141735, + "grad_norm": 1.4735143184661865, + "learning_rate": 8.234966027786229e-07, + "loss": 0.9358, + "step": 9040 + }, + { + "epoch": 0.21784157414320465, + "grad_norm": 1.7014250755310059, + "learning_rate": 8.233698407869384e-07, + "loss": 0.9346, + "step": 9045 + }, + { + "epoch": 0.21796199513499193, + "grad_norm": 1.54008150100708, + "learning_rate": 8.23243078795254e-07, + "loss": 0.9538, + "step": 9050 + }, + { + "epoch": 0.21808241612677923, + "grad_norm": 1.5994731187820435, + "learning_rate": 8.231163168035697e-07, + "loss": 0.9334, + "step": 9055 + }, + { + "epoch": 0.2182028371185665, + "grad_norm": 1.5312918424606323, + "learning_rate": 8.229895548118851e-07, + "loss": 0.9295, + "step": 9060 + }, + { + "epoch": 0.2183232581103538, + "grad_norm": 1.4426707029342651, + "learning_rate": 8.228627928202008e-07, + "loss": 0.9179, + "step": 9065 + }, + { + "epoch": 0.21844367910214108, + "grad_norm": 1.4576219320297241, + "learning_rate": 8.227360308285164e-07, + "loss": 0.9565, + "step": 9070 + }, + { + "epoch": 0.21856410009392838, + "grad_norm": 1.8166043758392334, + "learning_rate": 8.226092688368319e-07, + "loss": 0.9101, + "step": 9075 + }, + { + "epoch": 0.21868452108571565, + "grad_norm": 1.6293615102767944, + "learning_rate": 8.224825068451475e-07, + "loss": 0.8985, + "step": 9080 + }, + { + "epoch": 0.21880494207750295, + "grad_norm": 1.313899278640747, + "learning_rate": 8.223557448534631e-07, + "loss": 0.9632, + "step": 9085 + }, + { + "epoch": 0.21892536306929025, + "grad_norm": 1.5151069164276123, + "learning_rate": 8.222289828617787e-07, + "loss": 0.9532, + "step": 9090 + }, + { + "epoch": 0.21904578406107753, + "grad_norm": 1.794693946838379, + "learning_rate": 8.221022208700943e-07, + "loss": 0.9792, + "step": 9095 + }, + { + "epoch": 0.21916620505286483, + "grad_norm": 1.683070421218872, + "learning_rate": 8.2197545887841e-07, + "loss": 0.9168, + "step": 9100 + }, + { + "epoch": 0.2192866260446521, + "grad_norm": 1.560676097869873, + "learning_rate": 8.218486968867254e-07, + "loss": 0.9712, + "step": 9105 + }, + { + "epoch": 0.2194070470364394, + "grad_norm": 1.6398364305496216, + "learning_rate": 8.21721934895041e-07, + "loss": 0.9973, + "step": 9110 + }, + { + "epoch": 0.21952746802822667, + "grad_norm": 1.6339504718780518, + "learning_rate": 8.215951729033567e-07, + "loss": 0.876, + "step": 9115 + }, + { + "epoch": 0.21964788902001398, + "grad_norm": 2.2558040618896484, + "learning_rate": 8.214684109116722e-07, + "loss": 0.985, + "step": 9120 + }, + { + "epoch": 0.21976831001180125, + "grad_norm": 1.5891695022583008, + "learning_rate": 8.213416489199878e-07, + "loss": 0.9204, + "step": 9125 + }, + { + "epoch": 0.21988873100358855, + "grad_norm": 1.626607894897461, + "learning_rate": 8.212148869283034e-07, + "loss": 0.9663, + "step": 9130 + }, + { + "epoch": 0.22000915199537582, + "grad_norm": 1.8692668676376343, + "learning_rate": 8.210881249366189e-07, + "loss": 0.9775, + "step": 9135 + }, + { + "epoch": 0.22012957298716312, + "grad_norm": 1.653274416923523, + "learning_rate": 8.209613629449346e-07, + "loss": 0.9183, + "step": 9140 + }, + { + "epoch": 0.2202499939789504, + "grad_norm": 1.647321105003357, + "learning_rate": 8.208346009532502e-07, + "loss": 0.9363, + "step": 9145 + }, + { + "epoch": 0.2203704149707377, + "grad_norm": 1.404857873916626, + "learning_rate": 8.207078389615657e-07, + "loss": 0.8812, + "step": 9150 + }, + { + "epoch": 0.220490835962525, + "grad_norm": 1.4598122835159302, + "learning_rate": 8.205810769698813e-07, + "loss": 0.9517, + "step": 9155 + }, + { + "epoch": 0.22061125695431227, + "grad_norm": 1.5059850215911865, + "learning_rate": 8.20454314978197e-07, + "loss": 0.886, + "step": 9160 + }, + { + "epoch": 0.22073167794609957, + "grad_norm": 1.731574535369873, + "learning_rate": 8.203275529865125e-07, + "loss": 0.9316, + "step": 9165 + }, + { + "epoch": 0.22085209893788685, + "grad_norm": 1.4054409265518188, + "learning_rate": 8.202007909948281e-07, + "loss": 0.9281, + "step": 9170 + }, + { + "epoch": 0.22097251992967415, + "grad_norm": 1.78800630569458, + "learning_rate": 8.200740290031437e-07, + "loss": 0.954, + "step": 9175 + }, + { + "epoch": 0.22109294092146142, + "grad_norm": 1.5038881301879883, + "learning_rate": 8.199472670114592e-07, + "loss": 0.9794, + "step": 9180 + }, + { + "epoch": 0.22121336191324872, + "grad_norm": 1.5534361600875854, + "learning_rate": 8.198205050197749e-07, + "loss": 0.9235, + "step": 9185 + }, + { + "epoch": 0.221333782905036, + "grad_norm": 1.6785494089126587, + "learning_rate": 8.196937430280905e-07, + "loss": 0.9028, + "step": 9190 + }, + { + "epoch": 0.2214542038968233, + "grad_norm": 1.5974268913269043, + "learning_rate": 8.195669810364059e-07, + "loss": 0.9303, + "step": 9195 + }, + { + "epoch": 0.22157462488861057, + "grad_norm": 1.4408422708511353, + "learning_rate": 8.194402190447216e-07, + "loss": 0.8687, + "step": 9200 + }, + { + "epoch": 0.22169504588039787, + "grad_norm": 1.5255826711654663, + "learning_rate": 8.193134570530372e-07, + "loss": 0.9982, + "step": 9205 + }, + { + "epoch": 0.22181546687218515, + "grad_norm": 1.4327969551086426, + "learning_rate": 8.191866950613528e-07, + "loss": 0.9542, + "step": 9210 + }, + { + "epoch": 0.22193588786397245, + "grad_norm": 1.6425248384475708, + "learning_rate": 8.190599330696684e-07, + "loss": 0.8825, + "step": 9215 + }, + { + "epoch": 0.22205630885575975, + "grad_norm": 1.54902184009552, + "learning_rate": 8.189331710779839e-07, + "loss": 0.9572, + "step": 9220 + }, + { + "epoch": 0.22217672984754702, + "grad_norm": 1.4737131595611572, + "learning_rate": 8.188064090862995e-07, + "loss": 0.9314, + "step": 9225 + }, + { + "epoch": 0.22229715083933432, + "grad_norm": 1.696393609046936, + "learning_rate": 8.186796470946151e-07, + "loss": 0.9247, + "step": 9230 + }, + { + "epoch": 0.2224175718311216, + "grad_norm": 1.816698431968689, + "learning_rate": 8.185528851029308e-07, + "loss": 0.9583, + "step": 9235 + }, + { + "epoch": 0.2225379928229089, + "grad_norm": 1.5856852531433105, + "learning_rate": 8.184261231112462e-07, + "loss": 0.9045, + "step": 9240 + }, + { + "epoch": 0.22265841381469617, + "grad_norm": 1.691108226776123, + "learning_rate": 8.182993611195619e-07, + "loss": 0.9046, + "step": 9245 + }, + { + "epoch": 0.22277883480648347, + "grad_norm": 1.5667667388916016, + "learning_rate": 8.181725991278775e-07, + "loss": 0.959, + "step": 9250 + }, + { + "epoch": 0.22289925579827075, + "grad_norm": 1.6249210834503174, + "learning_rate": 8.18045837136193e-07, + "loss": 0.982, + "step": 9255 + }, + { + "epoch": 0.22301967679005805, + "grad_norm": 1.4573348760604858, + "learning_rate": 8.179190751445087e-07, + "loss": 0.9418, + "step": 9260 + }, + { + "epoch": 0.22314009778184532, + "grad_norm": 1.4732943773269653, + "learning_rate": 8.177923131528242e-07, + "loss": 0.9852, + "step": 9265 + }, + { + "epoch": 0.22326051877363262, + "grad_norm": 1.6084719896316528, + "learning_rate": 8.176655511611398e-07, + "loss": 0.9526, + "step": 9270 + }, + { + "epoch": 0.2233809397654199, + "grad_norm": 1.8238649368286133, + "learning_rate": 8.175387891694554e-07, + "loss": 0.8735, + "step": 9275 + }, + { + "epoch": 0.2235013607572072, + "grad_norm": 1.6604654788970947, + "learning_rate": 8.17412027177771e-07, + "loss": 0.9414, + "step": 9280 + }, + { + "epoch": 0.2236217817489945, + "grad_norm": 1.5664421319961548, + "learning_rate": 8.172852651860866e-07, + "loss": 0.9376, + "step": 9285 + }, + { + "epoch": 0.22374220274078177, + "grad_norm": 1.5238959789276123, + "learning_rate": 8.171585031944021e-07, + "loss": 0.9551, + "step": 9290 + }, + { + "epoch": 0.22386262373256907, + "grad_norm": 1.7010811567306519, + "learning_rate": 8.170317412027178e-07, + "loss": 0.9277, + "step": 9295 + }, + { + "epoch": 0.22398304472435634, + "grad_norm": 1.8692560195922852, + "learning_rate": 8.169049792110333e-07, + "loss": 0.9838, + "step": 9300 + }, + { + "epoch": 0.22410346571614365, + "grad_norm": 1.478436827659607, + "learning_rate": 8.16778217219349e-07, + "loss": 0.9273, + "step": 9305 + }, + { + "epoch": 0.22422388670793092, + "grad_norm": 1.5085464715957642, + "learning_rate": 8.166514552276645e-07, + "loss": 0.9042, + "step": 9310 + }, + { + "epoch": 0.22434430769971822, + "grad_norm": 1.468819260597229, + "learning_rate": 8.1652469323598e-07, + "loss": 0.9537, + "step": 9315 + }, + { + "epoch": 0.2244647286915055, + "grad_norm": 1.4283561706542969, + "learning_rate": 8.163979312442957e-07, + "loss": 0.8855, + "step": 9320 + }, + { + "epoch": 0.2245851496832928, + "grad_norm": 1.5676054954528809, + "learning_rate": 8.162711692526113e-07, + "loss": 0.9456, + "step": 9325 + }, + { + "epoch": 0.22470557067508007, + "grad_norm": 1.615062952041626, + "learning_rate": 8.161444072609269e-07, + "loss": 0.9407, + "step": 9330 + }, + { + "epoch": 0.22482599166686737, + "grad_norm": 1.4833568334579468, + "learning_rate": 8.160176452692424e-07, + "loss": 0.9134, + "step": 9335 + }, + { + "epoch": 0.22494641265865464, + "grad_norm": 1.451080083847046, + "learning_rate": 8.15890883277558e-07, + "loss": 0.887, + "step": 9340 + }, + { + "epoch": 0.22506683365044194, + "grad_norm": 1.5666835308074951, + "learning_rate": 8.157641212858736e-07, + "loss": 0.958, + "step": 9345 + }, + { + "epoch": 0.22518725464222925, + "grad_norm": 1.4987585544586182, + "learning_rate": 8.156373592941892e-07, + "loss": 0.8869, + "step": 9350 + }, + { + "epoch": 0.22530767563401652, + "grad_norm": 1.6401864290237427, + "learning_rate": 8.155105973025049e-07, + "loss": 0.9254, + "step": 9355 + }, + { + "epoch": 0.22542809662580382, + "grad_norm": 3.9416544437408447, + "learning_rate": 8.153838353108203e-07, + "loss": 0.8688, + "step": 9360 + }, + { + "epoch": 0.2255485176175911, + "grad_norm": 3.244033098220825, + "learning_rate": 8.15257073319136e-07, + "loss": 0.8505, + "step": 9365 + }, + { + "epoch": 0.2256689386093784, + "grad_norm": 1.533593773841858, + "learning_rate": 8.151303113274516e-07, + "loss": 0.9816, + "step": 9370 + }, + { + "epoch": 0.22578935960116567, + "grad_norm": 1.6429873704910278, + "learning_rate": 8.150035493357671e-07, + "loss": 0.992, + "step": 9375 + }, + { + "epoch": 0.22590978059295297, + "grad_norm": 1.38799250125885, + "learning_rate": 8.148767873440827e-07, + "loss": 0.9864, + "step": 9380 + }, + { + "epoch": 0.22603020158474024, + "grad_norm": 1.5810966491699219, + "learning_rate": 8.147500253523983e-07, + "loss": 0.9026, + "step": 9385 + }, + { + "epoch": 0.22615062257652754, + "grad_norm": 1.738532543182373, + "learning_rate": 8.146232633607139e-07, + "loss": 0.9626, + "step": 9390 + }, + { + "epoch": 0.22627104356831482, + "grad_norm": 1.605966329574585, + "learning_rate": 8.144965013690295e-07, + "loss": 0.8974, + "step": 9395 + }, + { + "epoch": 0.22639146456010212, + "grad_norm": 1.7122306823730469, + "learning_rate": 8.143697393773451e-07, + "loss": 0.9531, + "step": 9400 + }, + { + "epoch": 0.22651188555188942, + "grad_norm": 1.4698325395584106, + "learning_rate": 8.142429773856606e-07, + "loss": 0.9264, + "step": 9405 + }, + { + "epoch": 0.2266323065436767, + "grad_norm": 1.5234861373901367, + "learning_rate": 8.141162153939762e-07, + "loss": 0.9472, + "step": 9410 + }, + { + "epoch": 0.226752727535464, + "grad_norm": 1.6981227397918701, + "learning_rate": 8.139894534022919e-07, + "loss": 0.9419, + "step": 9415 + }, + { + "epoch": 0.22687314852725127, + "grad_norm": 1.6163617372512817, + "learning_rate": 8.138626914106074e-07, + "loss": 0.9259, + "step": 9420 + }, + { + "epoch": 0.22699356951903857, + "grad_norm": 1.4136073589324951, + "learning_rate": 8.137359294189229e-07, + "loss": 0.9281, + "step": 9425 + }, + { + "epoch": 0.22711399051082584, + "grad_norm": 1.4144577980041504, + "learning_rate": 8.136091674272386e-07, + "loss": 0.9359, + "step": 9430 + }, + { + "epoch": 0.22723441150261314, + "grad_norm": 1.7528362274169922, + "learning_rate": 8.134824054355541e-07, + "loss": 0.9475, + "step": 9435 + }, + { + "epoch": 0.22735483249440042, + "grad_norm": 1.4299169778823853, + "learning_rate": 8.133556434438698e-07, + "loss": 0.9244, + "step": 9440 + }, + { + "epoch": 0.22747525348618772, + "grad_norm": 1.508216381072998, + "learning_rate": 8.132288814521854e-07, + "loss": 0.9444, + "step": 9445 + }, + { + "epoch": 0.227595674477975, + "grad_norm": 1.595263957977295, + "learning_rate": 8.131021194605008e-07, + "loss": 0.9829, + "step": 9450 + }, + { + "epoch": 0.2277160954697623, + "grad_norm": 1.7147514820098877, + "learning_rate": 8.129753574688165e-07, + "loss": 0.9648, + "step": 9455 + }, + { + "epoch": 0.22783651646154957, + "grad_norm": 1.4773589372634888, + "learning_rate": 8.128485954771321e-07, + "loss": 0.9089, + "step": 9460 + }, + { + "epoch": 0.22795693745333687, + "grad_norm": 1.7763646841049194, + "learning_rate": 8.127218334854477e-07, + "loss": 0.8933, + "step": 9465 + }, + { + "epoch": 0.22807735844512417, + "grad_norm": 1.7241098880767822, + "learning_rate": 8.125950714937633e-07, + "loss": 0.9405, + "step": 9470 + }, + { + "epoch": 0.22819777943691144, + "grad_norm": 1.6956807374954224, + "learning_rate": 8.124683095020789e-07, + "loss": 0.9229, + "step": 9475 + }, + { + "epoch": 0.22831820042869874, + "grad_norm": 1.601848840713501, + "learning_rate": 8.123415475103944e-07, + "loss": 0.9073, + "step": 9480 + }, + { + "epoch": 0.22843862142048602, + "grad_norm": 1.5310746431350708, + "learning_rate": 8.1221478551871e-07, + "loss": 0.9413, + "step": 9485 + }, + { + "epoch": 0.22855904241227332, + "grad_norm": 1.630176305770874, + "learning_rate": 8.120880235270257e-07, + "loss": 0.9737, + "step": 9490 + }, + { + "epoch": 0.2286794634040606, + "grad_norm": 1.371692180633545, + "learning_rate": 8.119612615353411e-07, + "loss": 0.9324, + "step": 9495 + }, + { + "epoch": 0.2287998843958479, + "grad_norm": 1.6516149044036865, + "learning_rate": 8.118344995436568e-07, + "loss": 0.9379, + "step": 9500 + }, + { + "epoch": 0.22892030538763516, + "grad_norm": 1.6909898519515991, + "learning_rate": 8.117077375519724e-07, + "loss": 0.9307, + "step": 9505 + }, + { + "epoch": 0.22904072637942247, + "grad_norm": 1.8337432146072388, + "learning_rate": 8.11580975560288e-07, + "loss": 0.9343, + "step": 9510 + }, + { + "epoch": 0.22916114737120974, + "grad_norm": 1.6035581827163696, + "learning_rate": 8.114542135686036e-07, + "loss": 0.9654, + "step": 9515 + }, + { + "epoch": 0.22928156836299704, + "grad_norm": 1.5184273719787598, + "learning_rate": 8.113274515769191e-07, + "loss": 0.8989, + "step": 9520 + }, + { + "epoch": 0.2294019893547843, + "grad_norm": 1.56526780128479, + "learning_rate": 8.112006895852347e-07, + "loss": 0.9954, + "step": 9525 + }, + { + "epoch": 0.22952241034657161, + "grad_norm": 1.6989341974258423, + "learning_rate": 8.110739275935503e-07, + "loss": 0.9337, + "step": 9530 + }, + { + "epoch": 0.22964283133835892, + "grad_norm": 1.5808727741241455, + "learning_rate": 8.10947165601866e-07, + "loss": 0.936, + "step": 9535 + }, + { + "epoch": 0.2297632523301462, + "grad_norm": 1.6222801208496094, + "learning_rate": 8.108204036101814e-07, + "loss": 0.9086, + "step": 9540 + }, + { + "epoch": 0.2298836733219335, + "grad_norm": 1.6468894481658936, + "learning_rate": 8.10693641618497e-07, + "loss": 0.904, + "step": 9545 + }, + { + "epoch": 0.23000409431372076, + "grad_norm": 1.5979857444763184, + "learning_rate": 8.105668796268127e-07, + "loss": 0.902, + "step": 9550 + }, + { + "epoch": 0.23012451530550806, + "grad_norm": 1.7124053239822388, + "learning_rate": 8.104401176351282e-07, + "loss": 0.9355, + "step": 9555 + }, + { + "epoch": 0.23024493629729534, + "grad_norm": 1.5121243000030518, + "learning_rate": 8.103133556434439e-07, + "loss": 0.96, + "step": 9560 + }, + { + "epoch": 0.23036535728908264, + "grad_norm": 1.6419447660446167, + "learning_rate": 8.101865936517594e-07, + "loss": 0.913, + "step": 9565 + }, + { + "epoch": 0.2304857782808699, + "grad_norm": 1.5802937746047974, + "learning_rate": 8.100598316600749e-07, + "loss": 0.9036, + "step": 9570 + }, + { + "epoch": 0.23060619927265721, + "grad_norm": 1.6120250225067139, + "learning_rate": 8.099330696683906e-07, + "loss": 1.0045, + "step": 9575 + }, + { + "epoch": 0.2307266202644445, + "grad_norm": 1.6308773756027222, + "learning_rate": 8.098063076767062e-07, + "loss": 0.9348, + "step": 9580 + }, + { + "epoch": 0.2308470412562318, + "grad_norm": 1.5396705865859985, + "learning_rate": 8.096795456850218e-07, + "loss": 0.903, + "step": 9585 + }, + { + "epoch": 0.23096746224801906, + "grad_norm": 1.4627436399459839, + "learning_rate": 8.095527836933373e-07, + "loss": 0.9531, + "step": 9590 + }, + { + "epoch": 0.23108788323980636, + "grad_norm": 1.521868109703064, + "learning_rate": 8.09426021701653e-07, + "loss": 0.8989, + "step": 9595 + }, + { + "epoch": 0.23120830423159366, + "grad_norm": 1.6635419130325317, + "learning_rate": 8.092992597099685e-07, + "loss": 0.9897, + "step": 9600 + }, + { + "epoch": 0.23132872522338094, + "grad_norm": 1.906615972518921, + "learning_rate": 8.091724977182841e-07, + "loss": 0.9235, + "step": 9605 + }, + { + "epoch": 0.23144914621516824, + "grad_norm": 1.4365830421447754, + "learning_rate": 8.090457357265997e-07, + "loss": 0.9304, + "step": 9610 + }, + { + "epoch": 0.2315695672069555, + "grad_norm": 1.482728362083435, + "learning_rate": 8.089189737349152e-07, + "loss": 0.9914, + "step": 9615 + }, + { + "epoch": 0.2316899881987428, + "grad_norm": 1.4797344207763672, + "learning_rate": 8.087922117432309e-07, + "loss": 0.9271, + "step": 9620 + }, + { + "epoch": 0.2318104091905301, + "grad_norm": 1.562041163444519, + "learning_rate": 8.086654497515465e-07, + "loss": 0.9269, + "step": 9625 + }, + { + "epoch": 0.2319308301823174, + "grad_norm": 1.515610694885254, + "learning_rate": 8.08538687759862e-07, + "loss": 0.9559, + "step": 9630 + }, + { + "epoch": 0.23205125117410466, + "grad_norm": 1.6904336214065552, + "learning_rate": 8.084119257681776e-07, + "loss": 0.9103, + "step": 9635 + }, + { + "epoch": 0.23217167216589196, + "grad_norm": 1.5302565097808838, + "learning_rate": 8.082851637764932e-07, + "loss": 0.9336, + "step": 9640 + }, + { + "epoch": 0.23229209315767924, + "grad_norm": 1.7606114149093628, + "learning_rate": 8.081584017848088e-07, + "loss": 0.8815, + "step": 9645 + }, + { + "epoch": 0.23241251414946654, + "grad_norm": 1.400367259979248, + "learning_rate": 8.080316397931244e-07, + "loss": 0.9513, + "step": 9650 + }, + { + "epoch": 0.2325329351412538, + "grad_norm": 1.7276761531829834, + "learning_rate": 8.079048778014401e-07, + "loss": 0.9295, + "step": 9655 + }, + { + "epoch": 0.2326533561330411, + "grad_norm": 1.7191526889801025, + "learning_rate": 8.077781158097555e-07, + "loss": 0.8973, + "step": 9660 + }, + { + "epoch": 0.2327737771248284, + "grad_norm": 1.4676998853683472, + "learning_rate": 8.076513538180711e-07, + "loss": 0.8989, + "step": 9665 + }, + { + "epoch": 0.23289419811661569, + "grad_norm": 1.5539352893829346, + "learning_rate": 8.075245918263868e-07, + "loss": 0.9475, + "step": 9670 + }, + { + "epoch": 0.233014619108403, + "grad_norm": 1.6929291486740112, + "learning_rate": 8.073978298347023e-07, + "loss": 0.943, + "step": 9675 + }, + { + "epoch": 0.23313504010019026, + "grad_norm": 1.5347380638122559, + "learning_rate": 8.072710678430179e-07, + "loss": 0.9108, + "step": 9680 + }, + { + "epoch": 0.23325546109197756, + "grad_norm": 1.5121805667877197, + "learning_rate": 8.071443058513335e-07, + "loss": 0.9501, + "step": 9685 + }, + { + "epoch": 0.23337588208376484, + "grad_norm": 1.4723517894744873, + "learning_rate": 8.07017543859649e-07, + "loss": 0.9007, + "step": 9690 + }, + { + "epoch": 0.23349630307555214, + "grad_norm": 1.4446206092834473, + "learning_rate": 8.068907818679647e-07, + "loss": 0.95, + "step": 9695 + }, + { + "epoch": 0.2336167240673394, + "grad_norm": 1.6316677331924438, + "learning_rate": 8.067640198762803e-07, + "loss": 0.9776, + "step": 9700 + }, + { + "epoch": 0.2337371450591267, + "grad_norm": 1.6081719398498535, + "learning_rate": 8.066372578845958e-07, + "loss": 0.9209, + "step": 9705 + }, + { + "epoch": 0.23385756605091398, + "grad_norm": 1.6020768880844116, + "learning_rate": 8.065104958929114e-07, + "loss": 0.9266, + "step": 9710 + }, + { + "epoch": 0.23397798704270129, + "grad_norm": 1.450314998626709, + "learning_rate": 8.063837339012271e-07, + "loss": 0.9394, + "step": 9715 + }, + { + "epoch": 0.23409840803448856, + "grad_norm": 1.5475916862487793, + "learning_rate": 8.062569719095426e-07, + "loss": 0.8923, + "step": 9720 + }, + { + "epoch": 0.23421882902627586, + "grad_norm": 1.7144578695297241, + "learning_rate": 8.061302099178581e-07, + "loss": 0.9136, + "step": 9725 + }, + { + "epoch": 0.23433925001806316, + "grad_norm": 1.5156981945037842, + "learning_rate": 8.060034479261738e-07, + "loss": 0.9153, + "step": 9730 + }, + { + "epoch": 0.23445967100985043, + "grad_norm": 1.4762510061264038, + "learning_rate": 8.058766859344893e-07, + "loss": 0.9274, + "step": 9735 + }, + { + "epoch": 0.23458009200163774, + "grad_norm": 1.5367364883422852, + "learning_rate": 8.05749923942805e-07, + "loss": 0.9553, + "step": 9740 + }, + { + "epoch": 0.234700512993425, + "grad_norm": 1.4681675434112549, + "learning_rate": 8.056231619511206e-07, + "loss": 0.9167, + "step": 9745 + }, + { + "epoch": 0.2348209339852123, + "grad_norm": 1.4918434619903564, + "learning_rate": 8.05496399959436e-07, + "loss": 0.963, + "step": 9750 + }, + { + "epoch": 0.23494135497699958, + "grad_norm": 1.7512080669403076, + "learning_rate": 8.053696379677517e-07, + "loss": 0.9161, + "step": 9755 + }, + { + "epoch": 0.23506177596878688, + "grad_norm": 1.8454680442810059, + "learning_rate": 8.052428759760673e-07, + "loss": 0.9616, + "step": 9760 + }, + { + "epoch": 0.23518219696057416, + "grad_norm": 1.601298213005066, + "learning_rate": 8.051161139843829e-07, + "loss": 0.9286, + "step": 9765 + }, + { + "epoch": 0.23530261795236146, + "grad_norm": 1.5492949485778809, + "learning_rate": 8.049893519926985e-07, + "loss": 0.9517, + "step": 9770 + }, + { + "epoch": 0.23542303894414873, + "grad_norm": 2.002368688583374, + "learning_rate": 8.04862590001014e-07, + "loss": 0.9469, + "step": 9775 + }, + { + "epoch": 0.23554345993593603, + "grad_norm": 1.533035159111023, + "learning_rate": 8.047358280093296e-07, + "loss": 0.9728, + "step": 9780 + }, + { + "epoch": 0.2356638809277233, + "grad_norm": 1.7091615200042725, + "learning_rate": 8.046090660176452e-07, + "loss": 0.915, + "step": 9785 + }, + { + "epoch": 0.2357843019195106, + "grad_norm": 1.8280829191207886, + "learning_rate": 8.044823040259609e-07, + "loss": 0.9154, + "step": 9790 + }, + { + "epoch": 0.2359047229112979, + "grad_norm": 1.37148916721344, + "learning_rate": 8.043555420342763e-07, + "loss": 0.9121, + "step": 9795 + }, + { + "epoch": 0.23602514390308518, + "grad_norm": 1.4839376211166382, + "learning_rate": 8.04228780042592e-07, + "loss": 0.8989, + "step": 9800 + }, + { + "epoch": 0.23614556489487248, + "grad_norm": 1.599485993385315, + "learning_rate": 8.041020180509076e-07, + "loss": 0.9382, + "step": 9805 + }, + { + "epoch": 0.23626598588665976, + "grad_norm": 1.343124270439148, + "learning_rate": 8.039752560592231e-07, + "loss": 0.9264, + "step": 9810 + }, + { + "epoch": 0.23638640687844706, + "grad_norm": 1.5841120481491089, + "learning_rate": 8.038484940675388e-07, + "loss": 0.9461, + "step": 9815 + }, + { + "epoch": 0.23650682787023433, + "grad_norm": 1.5181254148483276, + "learning_rate": 8.037217320758543e-07, + "loss": 0.9149, + "step": 9820 + }, + { + "epoch": 0.23662724886202163, + "grad_norm": 1.5661225318908691, + "learning_rate": 8.035949700841699e-07, + "loss": 0.9626, + "step": 9825 + }, + { + "epoch": 0.2367476698538089, + "grad_norm": 1.4676679372787476, + "learning_rate": 8.034682080924855e-07, + "loss": 0.968, + "step": 9830 + }, + { + "epoch": 0.2368680908455962, + "grad_norm": 1.5806342363357544, + "learning_rate": 8.033414461008012e-07, + "loss": 0.9607, + "step": 9835 + }, + { + "epoch": 0.23698851183738348, + "grad_norm": 1.432308316230774, + "learning_rate": 8.032146841091166e-07, + "loss": 0.9342, + "step": 9840 + }, + { + "epoch": 0.23710893282917078, + "grad_norm": 1.5676426887512207, + "learning_rate": 8.030879221174322e-07, + "loss": 0.96, + "step": 9845 + }, + { + "epoch": 0.23722935382095808, + "grad_norm": 1.8210023641586304, + "learning_rate": 8.029611601257479e-07, + "loss": 0.9001, + "step": 9850 + }, + { + "epoch": 0.23734977481274536, + "grad_norm": 1.5412278175354004, + "learning_rate": 8.028343981340634e-07, + "loss": 0.931, + "step": 9855 + }, + { + "epoch": 0.23747019580453266, + "grad_norm": 1.582090139389038, + "learning_rate": 8.027076361423791e-07, + "loss": 0.893, + "step": 9860 + }, + { + "epoch": 0.23759061679631993, + "grad_norm": 1.480709433555603, + "learning_rate": 8.025808741506946e-07, + "loss": 0.8721, + "step": 9865 + }, + { + "epoch": 0.23771103778810723, + "grad_norm": 1.7517716884613037, + "learning_rate": 8.024541121590101e-07, + "loss": 0.8898, + "step": 9870 + }, + { + "epoch": 0.2378314587798945, + "grad_norm": 1.5809065103530884, + "learning_rate": 8.023273501673258e-07, + "loss": 0.9582, + "step": 9875 + }, + { + "epoch": 0.2379518797716818, + "grad_norm": 1.604315161705017, + "learning_rate": 8.022005881756414e-07, + "loss": 0.9441, + "step": 9880 + }, + { + "epoch": 0.23807230076346908, + "grad_norm": 1.5863564014434814, + "learning_rate": 8.02073826183957e-07, + "loss": 0.9584, + "step": 9885 + }, + { + "epoch": 0.23819272175525638, + "grad_norm": 1.621246337890625, + "learning_rate": 8.019470641922725e-07, + "loss": 0.9417, + "step": 9890 + }, + { + "epoch": 0.23831314274704365, + "grad_norm": 2.1980340480804443, + "learning_rate": 8.018203022005881e-07, + "loss": 0.9585, + "step": 9895 + }, + { + "epoch": 0.23843356373883096, + "grad_norm": 1.7182064056396484, + "learning_rate": 8.016935402089037e-07, + "loss": 0.9086, + "step": 9900 + }, + { + "epoch": 0.23855398473061823, + "grad_norm": 1.6237313747406006, + "learning_rate": 8.015667782172193e-07, + "loss": 0.9415, + "step": 9905 + }, + { + "epoch": 0.23867440572240553, + "grad_norm": 1.5629863739013672, + "learning_rate": 8.014400162255349e-07, + "loss": 0.8983, + "step": 9910 + }, + { + "epoch": 0.23879482671419283, + "grad_norm": 1.561966896057129, + "learning_rate": 8.013132542338504e-07, + "loss": 0.9293, + "step": 9915 + }, + { + "epoch": 0.2389152477059801, + "grad_norm": 1.643320083618164, + "learning_rate": 8.011864922421661e-07, + "loss": 0.9716, + "step": 9920 + }, + { + "epoch": 0.2390356686977674, + "grad_norm": 1.4515066146850586, + "learning_rate": 8.010597302504817e-07, + "loss": 0.9052, + "step": 9925 + }, + { + "epoch": 0.23915608968955468, + "grad_norm": 1.4326725006103516, + "learning_rate": 8.009329682587972e-07, + "loss": 0.9318, + "step": 9930 + }, + { + "epoch": 0.23927651068134198, + "grad_norm": 1.5089689493179321, + "learning_rate": 8.008062062671128e-07, + "loss": 0.9239, + "step": 9935 + }, + { + "epoch": 0.23939693167312925, + "grad_norm": 1.687284231185913, + "learning_rate": 8.006794442754284e-07, + "loss": 0.9622, + "step": 9940 + }, + { + "epoch": 0.23951735266491656, + "grad_norm": 1.4424015283584595, + "learning_rate": 8.00552682283744e-07, + "loss": 0.928, + "step": 9945 + }, + { + "epoch": 0.23963777365670383, + "grad_norm": 1.5909708738327026, + "learning_rate": 8.004259202920596e-07, + "loss": 0.9228, + "step": 9950 + }, + { + "epoch": 0.23975819464849113, + "grad_norm": 1.5277528762817383, + "learning_rate": 8.002991583003752e-07, + "loss": 0.9105, + "step": 9955 + }, + { + "epoch": 0.2398786156402784, + "grad_norm": 1.580113172531128, + "learning_rate": 8.001723963086907e-07, + "loss": 0.9575, + "step": 9960 + }, + { + "epoch": 0.2399990366320657, + "grad_norm": 1.5940556526184082, + "learning_rate": 8.000456343170063e-07, + "loss": 0.9305, + "step": 9965 + }, + { + "epoch": 0.24011945762385298, + "grad_norm": 1.5910226106643677, + "learning_rate": 7.99918872325322e-07, + "loss": 1.0218, + "step": 9970 + }, + { + "epoch": 0.24023987861564028, + "grad_norm": 1.6788511276245117, + "learning_rate": 7.997921103336375e-07, + "loss": 0.9411, + "step": 9975 + }, + { + "epoch": 0.24036029960742758, + "grad_norm": 1.4881618022918701, + "learning_rate": 7.99665348341953e-07, + "loss": 0.8998, + "step": 9980 + }, + { + "epoch": 0.24048072059921485, + "grad_norm": 2.371474266052246, + "learning_rate": 7.995385863502687e-07, + "loss": 0.8955, + "step": 9985 + }, + { + "epoch": 0.24060114159100215, + "grad_norm": 1.70774245262146, + "learning_rate": 7.994118243585843e-07, + "loss": 0.987, + "step": 9990 + }, + { + "epoch": 0.24072156258278943, + "grad_norm": 1.4561922550201416, + "learning_rate": 7.992850623668999e-07, + "loss": 0.8907, + "step": 9995 + }, + { + "epoch": 0.24084198357457673, + "grad_norm": 2.2680671215057373, + "learning_rate": 7.991583003752155e-07, + "loss": 0.9029, + "step": 10000 + }, + { + "epoch": 0.240962404566364, + "grad_norm": 1.4901586771011353, + "learning_rate": 7.990315383835311e-07, + "loss": 0.9067, + "step": 10005 + }, + { + "epoch": 0.2410828255581513, + "grad_norm": 1.5713847875595093, + "learning_rate": 7.989047763918466e-07, + "loss": 0.9342, + "step": 10010 + }, + { + "epoch": 0.24120324654993858, + "grad_norm": 1.7380549907684326, + "learning_rate": 7.987780144001622e-07, + "loss": 0.9784, + "step": 10015 + }, + { + "epoch": 0.24132366754172588, + "grad_norm": 1.6125047206878662, + "learning_rate": 7.986512524084779e-07, + "loss": 0.9062, + "step": 10020 + }, + { + "epoch": 0.24144408853351315, + "grad_norm": 1.498434066772461, + "learning_rate": 7.985244904167933e-07, + "loss": 0.9183, + "step": 10025 + }, + { + "epoch": 0.24156450952530045, + "grad_norm": 1.5576609373092651, + "learning_rate": 7.98397728425109e-07, + "loss": 0.9168, + "step": 10030 + }, + { + "epoch": 0.24168493051708773, + "grad_norm": 1.649990439414978, + "learning_rate": 7.982709664334246e-07, + "loss": 0.972, + "step": 10035 + }, + { + "epoch": 0.24180535150887503, + "grad_norm": 1.5411405563354492, + "learning_rate": 7.981442044417402e-07, + "loss": 0.9177, + "step": 10040 + }, + { + "epoch": 0.24192577250066233, + "grad_norm": 1.6077088117599487, + "learning_rate": 7.980174424500558e-07, + "loss": 0.8901, + "step": 10045 + }, + { + "epoch": 0.2420461934924496, + "grad_norm": 1.8830534219741821, + "learning_rate": 7.978906804583713e-07, + "loss": 0.9117, + "step": 10050 + }, + { + "epoch": 0.2421666144842369, + "grad_norm": 1.7061887979507446, + "learning_rate": 7.977639184666869e-07, + "loss": 0.886, + "step": 10055 + }, + { + "epoch": 0.24228703547602418, + "grad_norm": 1.4365276098251343, + "learning_rate": 7.976371564750025e-07, + "loss": 0.8753, + "step": 10060 + }, + { + "epoch": 0.24240745646781148, + "grad_norm": 1.485558032989502, + "learning_rate": 7.975103944833182e-07, + "loss": 0.9064, + "step": 10065 + }, + { + "epoch": 0.24252787745959875, + "grad_norm": 1.5110009908676147, + "learning_rate": 7.973836324916337e-07, + "loss": 0.9667, + "step": 10070 + }, + { + "epoch": 0.24264829845138605, + "grad_norm": 1.6972064971923828, + "learning_rate": 7.972568704999492e-07, + "loss": 0.9475, + "step": 10075 + }, + { + "epoch": 0.24276871944317333, + "grad_norm": 2.102038621902466, + "learning_rate": 7.971301085082649e-07, + "loss": 0.9135, + "step": 10080 + }, + { + "epoch": 0.24288914043496063, + "grad_norm": 1.4368513822555542, + "learning_rate": 7.970033465165804e-07, + "loss": 0.9098, + "step": 10085 + }, + { + "epoch": 0.2430095614267479, + "grad_norm": 1.52772057056427, + "learning_rate": 7.968765845248961e-07, + "loss": 0.8988, + "step": 10090 + }, + { + "epoch": 0.2431299824185352, + "grad_norm": 1.4237383604049683, + "learning_rate": 7.967498225332116e-07, + "loss": 0.9366, + "step": 10095 + }, + { + "epoch": 0.24325040341032247, + "grad_norm": 1.4499160051345825, + "learning_rate": 7.966230605415271e-07, + "loss": 0.9305, + "step": 10100 + }, + { + "epoch": 0.24337082440210978, + "grad_norm": 1.6376252174377441, + "learning_rate": 7.964962985498428e-07, + "loss": 0.9434, + "step": 10105 + }, + { + "epoch": 0.24349124539389708, + "grad_norm": 1.7884248495101929, + "learning_rate": 7.963695365581584e-07, + "loss": 0.9995, + "step": 10110 + }, + { + "epoch": 0.24361166638568435, + "grad_norm": 1.5183720588684082, + "learning_rate": 7.96242774566474e-07, + "loss": 0.9227, + "step": 10115 + }, + { + "epoch": 0.24373208737747165, + "grad_norm": 1.5874552726745605, + "learning_rate": 7.961160125747895e-07, + "loss": 0.9153, + "step": 10120 + }, + { + "epoch": 0.24385250836925892, + "grad_norm": 1.6614394187927246, + "learning_rate": 7.959892505831052e-07, + "loss": 0.978, + "step": 10125 + }, + { + "epoch": 0.24397292936104623, + "grad_norm": 1.417792558670044, + "learning_rate": 7.958624885914207e-07, + "loss": 0.9653, + "step": 10130 + }, + { + "epoch": 0.2440933503528335, + "grad_norm": 1.610689401626587, + "learning_rate": 7.957357265997363e-07, + "loss": 0.9511, + "step": 10135 + }, + { + "epoch": 0.2442137713446208, + "grad_norm": 1.515844702720642, + "learning_rate": 7.95608964608052e-07, + "loss": 0.9546, + "step": 10140 + }, + { + "epoch": 0.24433419233640807, + "grad_norm": 1.5553603172302246, + "learning_rate": 7.954822026163674e-07, + "loss": 0.9197, + "step": 10145 + }, + { + "epoch": 0.24445461332819537, + "grad_norm": 1.635810375213623, + "learning_rate": 7.953554406246831e-07, + "loss": 0.8908, + "step": 10150 + }, + { + "epoch": 0.24457503431998265, + "grad_norm": 1.8756006956100464, + "learning_rate": 7.952286786329987e-07, + "loss": 0.9145, + "step": 10155 + }, + { + "epoch": 0.24469545531176995, + "grad_norm": 1.562914490699768, + "learning_rate": 7.951019166413142e-07, + "loss": 0.9254, + "step": 10160 + }, + { + "epoch": 0.24481587630355722, + "grad_norm": 1.4989361763000488, + "learning_rate": 7.949751546496298e-07, + "loss": 0.9285, + "step": 10165 + }, + { + "epoch": 0.24493629729534452, + "grad_norm": 1.4218214750289917, + "learning_rate": 7.948483926579454e-07, + "loss": 0.9399, + "step": 10170 + }, + { + "epoch": 0.24505671828713183, + "grad_norm": 1.5706713199615479, + "learning_rate": 7.94721630666261e-07, + "loss": 0.8914, + "step": 10175 + }, + { + "epoch": 0.2451771392789191, + "grad_norm": 1.4235652685165405, + "learning_rate": 7.945948686745766e-07, + "loss": 0.8733, + "step": 10180 + }, + { + "epoch": 0.2452975602707064, + "grad_norm": 1.735122561454773, + "learning_rate": 7.944681066828923e-07, + "loss": 0.9394, + "step": 10185 + }, + { + "epoch": 0.24541798126249367, + "grad_norm": 1.5530369281768799, + "learning_rate": 7.943413446912077e-07, + "loss": 0.9979, + "step": 10190 + }, + { + "epoch": 0.24553840225428097, + "grad_norm": 1.6291742324829102, + "learning_rate": 7.942145826995233e-07, + "loss": 0.9256, + "step": 10195 + }, + { + "epoch": 0.24565882324606825, + "grad_norm": 1.5617204904556274, + "learning_rate": 7.94087820707839e-07, + "loss": 0.9474, + "step": 10200 + }, + { + "epoch": 0.24577924423785555, + "grad_norm": 1.5999999046325684, + "learning_rate": 7.939610587161545e-07, + "loss": 0.9339, + "step": 10205 + }, + { + "epoch": 0.24589966522964282, + "grad_norm": 1.3905305862426758, + "learning_rate": 7.938342967244701e-07, + "loss": 0.8518, + "step": 10210 + }, + { + "epoch": 0.24602008622143012, + "grad_norm": 1.569653034210205, + "learning_rate": 7.937075347327857e-07, + "loss": 0.9487, + "step": 10215 + }, + { + "epoch": 0.2461405072132174, + "grad_norm": 1.578831434249878, + "learning_rate": 7.935807727411012e-07, + "loss": 0.9662, + "step": 10220 + }, + { + "epoch": 0.2462609282050047, + "grad_norm": 1.528179407119751, + "learning_rate": 7.934540107494169e-07, + "loss": 0.9369, + "step": 10225 + }, + { + "epoch": 0.24638134919679197, + "grad_norm": 1.5490789413452148, + "learning_rate": 7.933272487577325e-07, + "loss": 0.9889, + "step": 10230 + }, + { + "epoch": 0.24650177018857927, + "grad_norm": 1.6813278198242188, + "learning_rate": 7.93200486766048e-07, + "loss": 0.9001, + "step": 10235 + }, + { + "epoch": 0.24662219118036657, + "grad_norm": 1.4886800050735474, + "learning_rate": 7.930737247743636e-07, + "loss": 0.923, + "step": 10240 + }, + { + "epoch": 0.24674261217215385, + "grad_norm": 1.586333990097046, + "learning_rate": 7.929469627826793e-07, + "loss": 0.951, + "step": 10245 + }, + { + "epoch": 0.24686303316394115, + "grad_norm": 1.4950419664382935, + "learning_rate": 7.928202007909948e-07, + "loss": 0.9323, + "step": 10250 + }, + { + "epoch": 0.24698345415572842, + "grad_norm": 1.5455451011657715, + "learning_rate": 7.926934387993104e-07, + "loss": 0.9356, + "step": 10255 + }, + { + "epoch": 0.24710387514751572, + "grad_norm": 1.5218327045440674, + "learning_rate": 7.92566676807626e-07, + "loss": 0.9828, + "step": 10260 + }, + { + "epoch": 0.247224296139303, + "grad_norm": 1.6908859014511108, + "learning_rate": 7.924399148159415e-07, + "loss": 0.9144, + "step": 10265 + }, + { + "epoch": 0.2473447171310903, + "grad_norm": 1.6348868608474731, + "learning_rate": 7.923131528242572e-07, + "loss": 0.9528, + "step": 10270 + }, + { + "epoch": 0.24746513812287757, + "grad_norm": 1.3725874423980713, + "learning_rate": 7.921863908325728e-07, + "loss": 0.8838, + "step": 10275 + }, + { + "epoch": 0.24758555911466487, + "grad_norm": 1.5570913553237915, + "learning_rate": 7.920596288408882e-07, + "loss": 0.9295, + "step": 10280 + }, + { + "epoch": 0.24770598010645214, + "grad_norm": 1.600831151008606, + "learning_rate": 7.919328668492039e-07, + "loss": 0.9013, + "step": 10285 + }, + { + "epoch": 0.24782640109823945, + "grad_norm": 1.507856011390686, + "learning_rate": 7.918061048575195e-07, + "loss": 0.9489, + "step": 10290 + }, + { + "epoch": 0.24794682209002675, + "grad_norm": 1.6118507385253906, + "learning_rate": 7.916793428658351e-07, + "loss": 0.9631, + "step": 10295 + }, + { + "epoch": 0.24806724308181402, + "grad_norm": 1.5403980016708374, + "learning_rate": 7.915525808741507e-07, + "loss": 0.8982, + "step": 10300 + }, + { + "epoch": 0.24818766407360132, + "grad_norm": 1.4400405883789062, + "learning_rate": 7.914258188824663e-07, + "loss": 0.8841, + "step": 10305 + }, + { + "epoch": 0.2483080850653886, + "grad_norm": 1.6470706462860107, + "learning_rate": 7.912990568907818e-07, + "loss": 0.895, + "step": 10310 + }, + { + "epoch": 0.2484285060571759, + "grad_norm": 1.5725401639938354, + "learning_rate": 7.911722948990974e-07, + "loss": 0.9149, + "step": 10315 + }, + { + "epoch": 0.24854892704896317, + "grad_norm": 1.4755620956420898, + "learning_rate": 7.910455329074131e-07, + "loss": 0.9526, + "step": 10320 + }, + { + "epoch": 0.24866934804075047, + "grad_norm": 1.624035120010376, + "learning_rate": 7.909187709157285e-07, + "loss": 0.9191, + "step": 10325 + }, + { + "epoch": 0.24878976903253774, + "grad_norm": 1.6740853786468506, + "learning_rate": 7.907920089240442e-07, + "loss": 0.8927, + "step": 10330 + }, + { + "epoch": 0.24891019002432505, + "grad_norm": 1.940212607383728, + "learning_rate": 7.906652469323598e-07, + "loss": 0.9221, + "step": 10335 + }, + { + "epoch": 0.24903061101611232, + "grad_norm": 1.5053337812423706, + "learning_rate": 7.905384849406753e-07, + "loss": 0.8715, + "step": 10340 + }, + { + "epoch": 0.24915103200789962, + "grad_norm": 1.6379318237304688, + "learning_rate": 7.90411722948991e-07, + "loss": 0.9527, + "step": 10345 + }, + { + "epoch": 0.2492714529996869, + "grad_norm": 1.5555362701416016, + "learning_rate": 7.902849609573065e-07, + "loss": 0.9096, + "step": 10350 + }, + { + "epoch": 0.2493918739914742, + "grad_norm": 1.6231192350387573, + "learning_rate": 7.901581989656221e-07, + "loss": 0.9344, + "step": 10355 + }, + { + "epoch": 0.2495122949832615, + "grad_norm": 2.2351765632629395, + "learning_rate": 7.900314369739377e-07, + "loss": 0.9563, + "step": 10360 + }, + { + "epoch": 0.24963271597504877, + "grad_norm": 1.3904240131378174, + "learning_rate": 7.899046749822534e-07, + "loss": 0.9499, + "step": 10365 + }, + { + "epoch": 0.24975313696683607, + "grad_norm": 2.1326162815093994, + "learning_rate": 7.897779129905689e-07, + "loss": 0.922, + "step": 10370 + }, + { + "epoch": 0.24987355795862334, + "grad_norm": 1.416149377822876, + "learning_rate": 7.896511509988844e-07, + "loss": 0.9077, + "step": 10375 + }, + { + "epoch": 0.24999397895041064, + "grad_norm": 1.636558175086975, + "learning_rate": 7.895243890072001e-07, + "loss": 0.9489, + "step": 10380 + }, + { + "epoch": 0.2501143999421979, + "grad_norm": 1.5372464656829834, + "learning_rate": 7.893976270155156e-07, + "loss": 0.9633, + "step": 10385 + }, + { + "epoch": 0.2502348209339852, + "grad_norm": 1.4980571269989014, + "learning_rate": 7.892708650238313e-07, + "loss": 0.9452, + "step": 10390 + }, + { + "epoch": 0.2503552419257725, + "grad_norm": 1.3831257820129395, + "learning_rate": 7.891441030321468e-07, + "loss": 0.9252, + "step": 10395 + }, + { + "epoch": 0.2504756629175598, + "grad_norm": 1.4749293327331543, + "learning_rate": 7.890173410404623e-07, + "loss": 0.9759, + "step": 10400 + }, + { + "epoch": 0.25059608390934707, + "grad_norm": 1.7515690326690674, + "learning_rate": 7.88890579048778e-07, + "loss": 1.0089, + "step": 10405 + }, + { + "epoch": 0.25071650490113434, + "grad_norm": 1.518198013305664, + "learning_rate": 7.887638170570936e-07, + "loss": 0.9502, + "step": 10410 + }, + { + "epoch": 0.25083692589292167, + "grad_norm": 1.4665954113006592, + "learning_rate": 7.886370550654092e-07, + "loss": 0.9368, + "step": 10415 + }, + { + "epoch": 0.25095734688470894, + "grad_norm": 1.6156924962997437, + "learning_rate": 7.885102930737247e-07, + "loss": 0.9023, + "step": 10420 + }, + { + "epoch": 0.2510777678764962, + "grad_norm": 1.5096485614776611, + "learning_rate": 7.883835310820403e-07, + "loss": 0.9287, + "step": 10425 + }, + { + "epoch": 0.25119818886828355, + "grad_norm": 1.555070400238037, + "learning_rate": 7.882567690903559e-07, + "loss": 0.9033, + "step": 10430 + }, + { + "epoch": 0.2513186098600708, + "grad_norm": 1.5974732637405396, + "learning_rate": 7.881300070986715e-07, + "loss": 0.9553, + "step": 10435 + }, + { + "epoch": 0.2514390308518581, + "grad_norm": 1.684948205947876, + "learning_rate": 7.880032451069872e-07, + "loss": 0.9587, + "step": 10440 + }, + { + "epoch": 0.25155945184364537, + "grad_norm": 1.6145825386047363, + "learning_rate": 7.878764831153026e-07, + "loss": 0.9151, + "step": 10445 + }, + { + "epoch": 0.2516798728354327, + "grad_norm": 1.5612765550613403, + "learning_rate": 7.877497211236183e-07, + "loss": 0.9207, + "step": 10450 + }, + { + "epoch": 0.25180029382721997, + "grad_norm": 1.639469027519226, + "learning_rate": 7.876229591319339e-07, + "loss": 0.9319, + "step": 10455 + }, + { + "epoch": 0.25192071481900724, + "grad_norm": 1.57554292678833, + "learning_rate": 7.874961971402494e-07, + "loss": 0.8741, + "step": 10460 + }, + { + "epoch": 0.2520411358107945, + "grad_norm": 1.6669059991836548, + "learning_rate": 7.87369435148565e-07, + "loss": 0.9171, + "step": 10465 + }, + { + "epoch": 0.25216155680258184, + "grad_norm": 1.6609137058258057, + "learning_rate": 7.872426731568806e-07, + "loss": 0.947, + "step": 10470 + }, + { + "epoch": 0.2522819777943691, + "grad_norm": 1.5579173564910889, + "learning_rate": 7.871159111651962e-07, + "loss": 0.9089, + "step": 10475 + }, + { + "epoch": 0.2524023987861564, + "grad_norm": 1.6061656475067139, + "learning_rate": 7.869891491735118e-07, + "loss": 0.9144, + "step": 10480 + }, + { + "epoch": 0.25252281977794366, + "grad_norm": 1.5641448497772217, + "learning_rate": 7.868623871818275e-07, + "loss": 0.9066, + "step": 10485 + }, + { + "epoch": 0.252643240769731, + "grad_norm": 1.3542393445968628, + "learning_rate": 7.867356251901429e-07, + "loss": 0.9641, + "step": 10490 + }, + { + "epoch": 0.25276366176151827, + "grad_norm": 1.599341869354248, + "learning_rate": 7.866088631984585e-07, + "loss": 0.8958, + "step": 10495 + }, + { + "epoch": 0.25288408275330554, + "grad_norm": 1.5220602750778198, + "learning_rate": 7.864821012067742e-07, + "loss": 0.9044, + "step": 10500 + }, + { + "epoch": 0.25300450374509287, + "grad_norm": 1.562246561050415, + "learning_rate": 7.863553392150897e-07, + "loss": 0.8578, + "step": 10505 + }, + { + "epoch": 0.25312492473688014, + "grad_norm": 1.6219570636749268, + "learning_rate": 7.862285772234053e-07, + "loss": 0.9255, + "step": 10510 + }, + { + "epoch": 0.2532453457286674, + "grad_norm": 1.7990187406539917, + "learning_rate": 7.861018152317209e-07, + "loss": 0.9217, + "step": 10515 + }, + { + "epoch": 0.2533657667204547, + "grad_norm": 1.477607250213623, + "learning_rate": 7.859750532400364e-07, + "loss": 0.9559, + "step": 10520 + }, + { + "epoch": 0.253486187712242, + "grad_norm": 1.5525449514389038, + "learning_rate": 7.858482912483521e-07, + "loss": 0.8984, + "step": 10525 + }, + { + "epoch": 0.2536066087040293, + "grad_norm": 1.4910857677459717, + "learning_rate": 7.857215292566677e-07, + "loss": 0.8982, + "step": 10530 + }, + { + "epoch": 0.25372702969581656, + "grad_norm": 1.5159627199172974, + "learning_rate": 7.855947672649832e-07, + "loss": 0.9465, + "step": 10535 + }, + { + "epoch": 0.25384745068760384, + "grad_norm": 1.797110676765442, + "learning_rate": 7.854680052732988e-07, + "loss": 0.9194, + "step": 10540 + }, + { + "epoch": 0.25396787167939117, + "grad_norm": 1.5768470764160156, + "learning_rate": 7.853412432816144e-07, + "loss": 0.9684, + "step": 10545 + }, + { + "epoch": 0.25408829267117844, + "grad_norm": 1.5663419961929321, + "learning_rate": 7.8521448128993e-07, + "loss": 0.9466, + "step": 10550 + }, + { + "epoch": 0.2542087136629657, + "grad_norm": 1.6284027099609375, + "learning_rate": 7.850877192982455e-07, + "loss": 0.9652, + "step": 10555 + }, + { + "epoch": 0.25432913465475304, + "grad_norm": 1.629702091217041, + "learning_rate": 7.849609573065612e-07, + "loss": 0.9266, + "step": 10560 + }, + { + "epoch": 0.2544495556465403, + "grad_norm": 1.8558638095855713, + "learning_rate": 7.848341953148767e-07, + "loss": 0.9305, + "step": 10565 + }, + { + "epoch": 0.2545699766383276, + "grad_norm": 1.5336354970932007, + "learning_rate": 7.847074333231924e-07, + "loss": 0.9098, + "step": 10570 + }, + { + "epoch": 0.25469039763011486, + "grad_norm": 1.4641679525375366, + "learning_rate": 7.84580671331508e-07, + "loss": 0.9908, + "step": 10575 + }, + { + "epoch": 0.2548108186219022, + "grad_norm": 1.5101847648620605, + "learning_rate": 7.844539093398234e-07, + "loss": 0.9182, + "step": 10580 + }, + { + "epoch": 0.25493123961368946, + "grad_norm": 1.5551209449768066, + "learning_rate": 7.843271473481391e-07, + "loss": 0.9541, + "step": 10585 + }, + { + "epoch": 0.25505166060547674, + "grad_norm": 1.5243884325027466, + "learning_rate": 7.842003853564547e-07, + "loss": 0.9704, + "step": 10590 + }, + { + "epoch": 0.255172081597264, + "grad_norm": 1.464455485343933, + "learning_rate": 7.840736233647703e-07, + "loss": 0.8812, + "step": 10595 + }, + { + "epoch": 0.25529250258905134, + "grad_norm": 1.4903156757354736, + "learning_rate": 7.839468613730859e-07, + "loss": 0.8868, + "step": 10600 + }, + { + "epoch": 0.2554129235808386, + "grad_norm": 1.581946611404419, + "learning_rate": 7.838200993814014e-07, + "loss": 0.8508, + "step": 10605 + }, + { + "epoch": 0.2555333445726259, + "grad_norm": 1.4839967489242554, + "learning_rate": 7.83693337389717e-07, + "loss": 0.9232, + "step": 10610 + }, + { + "epoch": 0.2556537655644132, + "grad_norm": 1.7015328407287598, + "learning_rate": 7.835665753980326e-07, + "loss": 0.9166, + "step": 10615 + }, + { + "epoch": 0.2557741865562005, + "grad_norm": 1.6252477169036865, + "learning_rate": 7.834398134063483e-07, + "loss": 0.9375, + "step": 10620 + }, + { + "epoch": 0.25589460754798776, + "grad_norm": 1.5471183061599731, + "learning_rate": 7.833130514146637e-07, + "loss": 0.918, + "step": 10625 + }, + { + "epoch": 0.25601502853977504, + "grad_norm": 1.4048386812210083, + "learning_rate": 7.831862894229793e-07, + "loss": 0.9263, + "step": 10630 + }, + { + "epoch": 0.25613544953156236, + "grad_norm": 1.5877938270568848, + "learning_rate": 7.83059527431295e-07, + "loss": 0.9965, + "step": 10635 + }, + { + "epoch": 0.25625587052334964, + "grad_norm": 1.8080618381500244, + "learning_rate": 7.829327654396105e-07, + "loss": 0.9158, + "step": 10640 + }, + { + "epoch": 0.2563762915151369, + "grad_norm": 1.533905267715454, + "learning_rate": 7.828060034479262e-07, + "loss": 0.9173, + "step": 10645 + }, + { + "epoch": 0.2564967125069242, + "grad_norm": 1.73739492893219, + "learning_rate": 7.826792414562417e-07, + "loss": 0.9389, + "step": 10650 + }, + { + "epoch": 0.2566171334987115, + "grad_norm": 1.486688494682312, + "learning_rate": 7.825524794645573e-07, + "loss": 0.959, + "step": 10655 + }, + { + "epoch": 0.2567375544904988, + "grad_norm": 1.5035080909729004, + "learning_rate": 7.824257174728729e-07, + "loss": 0.9039, + "step": 10660 + }, + { + "epoch": 0.25685797548228606, + "grad_norm": 1.618054986000061, + "learning_rate": 7.822989554811885e-07, + "loss": 0.9412, + "step": 10665 + }, + { + "epoch": 0.25697839647407333, + "grad_norm": 1.6609909534454346, + "learning_rate": 7.82172193489504e-07, + "loss": 0.8911, + "step": 10670 + }, + { + "epoch": 0.25709881746586066, + "grad_norm": 1.6681872606277466, + "learning_rate": 7.820454314978196e-07, + "loss": 0.9248, + "step": 10675 + }, + { + "epoch": 0.25721923845764794, + "grad_norm": 1.5016465187072754, + "learning_rate": 7.819186695061353e-07, + "loss": 0.9441, + "step": 10680 + }, + { + "epoch": 0.2573396594494352, + "grad_norm": 1.589369297027588, + "learning_rate": 7.817919075144508e-07, + "loss": 0.9639, + "step": 10685 + }, + { + "epoch": 0.25746008044122254, + "grad_norm": 1.712123155593872, + "learning_rate": 7.816651455227664e-07, + "loss": 0.9627, + "step": 10690 + }, + { + "epoch": 0.2575805014330098, + "grad_norm": 1.4570205211639404, + "learning_rate": 7.81538383531082e-07, + "loss": 0.8918, + "step": 10695 + }, + { + "epoch": 0.2577009224247971, + "grad_norm": 1.4842736721038818, + "learning_rate": 7.814116215393975e-07, + "loss": 0.9268, + "step": 10700 + }, + { + "epoch": 0.25782134341658436, + "grad_norm": 1.4311498403549194, + "learning_rate": 7.812848595477132e-07, + "loss": 0.9513, + "step": 10705 + }, + { + "epoch": 0.2579417644083717, + "grad_norm": 1.791871190071106, + "learning_rate": 7.811580975560288e-07, + "loss": 0.9687, + "step": 10710 + }, + { + "epoch": 0.25806218540015896, + "grad_norm": 1.471676230430603, + "learning_rate": 7.810313355643444e-07, + "loss": 0.9213, + "step": 10715 + }, + { + "epoch": 0.25818260639194623, + "grad_norm": 1.6426857709884644, + "learning_rate": 7.809045735726599e-07, + "loss": 0.9445, + "step": 10720 + }, + { + "epoch": 0.2583030273837335, + "grad_norm": 1.7955909967422485, + "learning_rate": 7.807778115809755e-07, + "loss": 0.8952, + "step": 10725 + }, + { + "epoch": 0.25842344837552084, + "grad_norm": 1.5316821336746216, + "learning_rate": 7.806510495892911e-07, + "loss": 0.8781, + "step": 10730 + }, + { + "epoch": 0.2585438693673081, + "grad_norm": 1.5494688749313354, + "learning_rate": 7.805242875976067e-07, + "loss": 0.9542, + "step": 10735 + }, + { + "epoch": 0.2586642903590954, + "grad_norm": 1.5220109224319458, + "learning_rate": 7.803975256059223e-07, + "loss": 0.9104, + "step": 10740 + }, + { + "epoch": 0.2587847113508827, + "grad_norm": 1.5227622985839844, + "learning_rate": 7.802707636142378e-07, + "loss": 0.9549, + "step": 10745 + }, + { + "epoch": 0.25890513234267, + "grad_norm": 1.5300692319869995, + "learning_rate": 7.801440016225534e-07, + "loss": 0.9111, + "step": 10750 + }, + { + "epoch": 0.25902555333445726, + "grad_norm": 1.4417011737823486, + "learning_rate": 7.800172396308691e-07, + "loss": 0.8647, + "step": 10755 + }, + { + "epoch": 0.25914597432624453, + "grad_norm": 1.893277645111084, + "learning_rate": 7.798904776391846e-07, + "loss": 0.9442, + "step": 10760 + }, + { + "epoch": 0.25926639531803186, + "grad_norm": 1.6196260452270508, + "learning_rate": 7.797637156475002e-07, + "loss": 0.9151, + "step": 10765 + }, + { + "epoch": 0.25938681630981913, + "grad_norm": 1.7305158376693726, + "learning_rate": 7.796369536558158e-07, + "loss": 0.9412, + "step": 10770 + }, + { + "epoch": 0.2595072373016064, + "grad_norm": 1.6203160285949707, + "learning_rate": 7.795101916641314e-07, + "loss": 0.9353, + "step": 10775 + }, + { + "epoch": 0.2596276582933937, + "grad_norm": 1.3651759624481201, + "learning_rate": 7.79383429672447e-07, + "loss": 0.9142, + "step": 10780 + }, + { + "epoch": 0.259748079285181, + "grad_norm": 1.571115493774414, + "learning_rate": 7.792566676807626e-07, + "loss": 0.9168, + "step": 10785 + }, + { + "epoch": 0.2598685002769683, + "grad_norm": 1.6559398174285889, + "learning_rate": 7.791299056890781e-07, + "loss": 0.9628, + "step": 10790 + }, + { + "epoch": 0.25998892126875556, + "grad_norm": 1.8212372064590454, + "learning_rate": 7.790031436973937e-07, + "loss": 0.9273, + "step": 10795 + }, + { + "epoch": 0.26010934226054283, + "grad_norm": 1.6014947891235352, + "learning_rate": 7.788763817057094e-07, + "loss": 0.9179, + "step": 10800 + }, + { + "epoch": 0.26022976325233016, + "grad_norm": 2.0242059230804443, + "learning_rate": 7.787496197140249e-07, + "loss": 0.9219, + "step": 10805 + }, + { + "epoch": 0.26035018424411743, + "grad_norm": 1.4445858001708984, + "learning_rate": 7.786228577223404e-07, + "loss": 0.9384, + "step": 10810 + }, + { + "epoch": 0.2604706052359047, + "grad_norm": 1.4920601844787598, + "learning_rate": 7.784960957306561e-07, + "loss": 0.9833, + "step": 10815 + }, + { + "epoch": 0.26059102622769204, + "grad_norm": 1.5067740678787231, + "learning_rate": 7.783693337389716e-07, + "loss": 0.8776, + "step": 10820 + }, + { + "epoch": 0.2607114472194793, + "grad_norm": 1.4836264848709106, + "learning_rate": 7.782425717472873e-07, + "loss": 0.9002, + "step": 10825 + }, + { + "epoch": 0.2608318682112666, + "grad_norm": 1.4572845697402954, + "learning_rate": 7.781158097556029e-07, + "loss": 0.9199, + "step": 10830 + }, + { + "epoch": 0.26095228920305386, + "grad_norm": 1.5106346607208252, + "learning_rate": 7.779890477639183e-07, + "loss": 0.8979, + "step": 10835 + }, + { + "epoch": 0.2610727101948412, + "grad_norm": 1.5475988388061523, + "learning_rate": 7.77862285772234e-07, + "loss": 0.9266, + "step": 10840 + }, + { + "epoch": 0.26119313118662846, + "grad_norm": 1.5073845386505127, + "learning_rate": 7.777355237805496e-07, + "loss": 0.9067, + "step": 10845 + }, + { + "epoch": 0.26131355217841573, + "grad_norm": 1.6924643516540527, + "learning_rate": 7.776087617888652e-07, + "loss": 0.9067, + "step": 10850 + }, + { + "epoch": 0.261433973170203, + "grad_norm": 1.3849414587020874, + "learning_rate": 7.774819997971807e-07, + "loss": 0.9451, + "step": 10855 + }, + { + "epoch": 0.26155439416199033, + "grad_norm": 1.6200286149978638, + "learning_rate": 7.773552378054964e-07, + "loss": 0.9167, + "step": 10860 + }, + { + "epoch": 0.2616748151537776, + "grad_norm": 1.5841484069824219, + "learning_rate": 7.772284758138119e-07, + "loss": 0.9103, + "step": 10865 + }, + { + "epoch": 0.2617952361455649, + "grad_norm": 1.4382582902908325, + "learning_rate": 7.771017138221275e-07, + "loss": 0.9266, + "step": 10870 + }, + { + "epoch": 0.2619156571373522, + "grad_norm": 1.5426464080810547, + "learning_rate": 7.769749518304432e-07, + "loss": 0.8977, + "step": 10875 + }, + { + "epoch": 0.2620360781291395, + "grad_norm": 3.087616443634033, + "learning_rate": 7.768481898387586e-07, + "loss": 0.9505, + "step": 10880 + }, + { + "epoch": 0.26215649912092676, + "grad_norm": 1.3389363288879395, + "learning_rate": 7.767214278470743e-07, + "loss": 0.9173, + "step": 10885 + }, + { + "epoch": 0.26227692011271403, + "grad_norm": 1.723882794380188, + "learning_rate": 7.765946658553899e-07, + "loss": 0.9164, + "step": 10890 + }, + { + "epoch": 0.26239734110450136, + "grad_norm": 1.563045620918274, + "learning_rate": 7.764679038637054e-07, + "loss": 0.8643, + "step": 10895 + }, + { + "epoch": 0.26251776209628863, + "grad_norm": 1.4408845901489258, + "learning_rate": 7.763411418720211e-07, + "loss": 0.9516, + "step": 10900 + }, + { + "epoch": 0.2626381830880759, + "grad_norm": 1.5494115352630615, + "learning_rate": 7.762143798803366e-07, + "loss": 0.9362, + "step": 10905 + }, + { + "epoch": 0.2627586040798632, + "grad_norm": 1.701547384262085, + "learning_rate": 7.760876178886522e-07, + "loss": 0.9754, + "step": 10910 + }, + { + "epoch": 0.2628790250716505, + "grad_norm": 1.7622368335723877, + "learning_rate": 7.759608558969678e-07, + "loss": 0.9356, + "step": 10915 + }, + { + "epoch": 0.2629994460634378, + "grad_norm": 1.4967761039733887, + "learning_rate": 7.758340939052835e-07, + "loss": 0.9762, + "step": 10920 + }, + { + "epoch": 0.26311986705522505, + "grad_norm": 1.4345545768737793, + "learning_rate": 7.757073319135989e-07, + "loss": 0.9089, + "step": 10925 + }, + { + "epoch": 0.2632402880470124, + "grad_norm": 1.3745805025100708, + "learning_rate": 7.755805699219145e-07, + "loss": 0.9402, + "step": 10930 + }, + { + "epoch": 0.26336070903879966, + "grad_norm": 1.5181835889816284, + "learning_rate": 7.754538079302302e-07, + "loss": 0.9539, + "step": 10935 + }, + { + "epoch": 0.26348113003058693, + "grad_norm": 1.359045386314392, + "learning_rate": 7.753270459385457e-07, + "loss": 0.8787, + "step": 10940 + }, + { + "epoch": 0.2636015510223742, + "grad_norm": 1.57675302028656, + "learning_rate": 7.752002839468614e-07, + "loss": 0.9159, + "step": 10945 + }, + { + "epoch": 0.26372197201416153, + "grad_norm": 1.686819314956665, + "learning_rate": 7.750735219551769e-07, + "loss": 0.9341, + "step": 10950 + }, + { + "epoch": 0.2638423930059488, + "grad_norm": 1.331228256225586, + "learning_rate": 7.749467599634924e-07, + "loss": 0.9419, + "step": 10955 + }, + { + "epoch": 0.2639628139977361, + "grad_norm": 1.553196907043457, + "learning_rate": 7.748199979718081e-07, + "loss": 0.9446, + "step": 10960 + }, + { + "epoch": 0.26408323498952335, + "grad_norm": 1.725075125694275, + "learning_rate": 7.746932359801237e-07, + "loss": 0.9115, + "step": 10965 + }, + { + "epoch": 0.2642036559813107, + "grad_norm": 1.7521941661834717, + "learning_rate": 7.745664739884392e-07, + "loss": 0.9618, + "step": 10970 + }, + { + "epoch": 0.26432407697309795, + "grad_norm": 1.7503281831741333, + "learning_rate": 7.744397119967548e-07, + "loss": 0.9227, + "step": 10975 + }, + { + "epoch": 0.26444449796488523, + "grad_norm": 1.615695595741272, + "learning_rate": 7.743129500050705e-07, + "loss": 0.9125, + "step": 10980 + }, + { + "epoch": 0.2645649189566725, + "grad_norm": 1.4781994819641113, + "learning_rate": 7.74186188013386e-07, + "loss": 0.9136, + "step": 10985 + }, + { + "epoch": 0.26468533994845983, + "grad_norm": 1.5761228799819946, + "learning_rate": 7.740594260217016e-07, + "loss": 0.9014, + "step": 10990 + }, + { + "epoch": 0.2648057609402471, + "grad_norm": 1.7451871633529663, + "learning_rate": 7.739326640300172e-07, + "loss": 0.9402, + "step": 10995 + }, + { + "epoch": 0.2649261819320344, + "grad_norm": 1.504990577697754, + "learning_rate": 7.738059020383327e-07, + "loss": 0.9071, + "step": 11000 + }, + { + "epoch": 0.2650466029238217, + "grad_norm": 1.5758501291275024, + "learning_rate": 7.736791400466484e-07, + "loss": 0.9766, + "step": 11005 + }, + { + "epoch": 0.265167023915609, + "grad_norm": 1.4429376125335693, + "learning_rate": 7.73552378054964e-07, + "loss": 0.9063, + "step": 11010 + }, + { + "epoch": 0.26528744490739625, + "grad_norm": 1.5816013813018799, + "learning_rate": 7.734256160632797e-07, + "loss": 0.9162, + "step": 11015 + }, + { + "epoch": 0.2654078658991835, + "grad_norm": 1.6807634830474854, + "learning_rate": 7.732988540715951e-07, + "loss": 0.938, + "step": 11020 + }, + { + "epoch": 0.26552828689097085, + "grad_norm": 1.447342038154602, + "learning_rate": 7.731720920799107e-07, + "loss": 0.9361, + "step": 11025 + }, + { + "epoch": 0.26564870788275813, + "grad_norm": 1.5453587770462036, + "learning_rate": 7.730453300882264e-07, + "loss": 0.9549, + "step": 11030 + }, + { + "epoch": 0.2657691288745454, + "grad_norm": 1.658867597579956, + "learning_rate": 7.729185680965419e-07, + "loss": 0.9225, + "step": 11035 + }, + { + "epoch": 0.2658895498663327, + "grad_norm": 1.4685916900634766, + "learning_rate": 7.727918061048575e-07, + "loss": 0.8776, + "step": 11040 + }, + { + "epoch": 0.26600997085812, + "grad_norm": 1.5075732469558716, + "learning_rate": 7.726650441131731e-07, + "loss": 0.9686, + "step": 11045 + }, + { + "epoch": 0.2661303918499073, + "grad_norm": 1.5110318660736084, + "learning_rate": 7.725382821214886e-07, + "loss": 0.9034, + "step": 11050 + }, + { + "epoch": 0.26625081284169455, + "grad_norm": 1.489485740661621, + "learning_rate": 7.724115201298043e-07, + "loss": 0.9587, + "step": 11055 + }, + { + "epoch": 0.2663712338334819, + "grad_norm": 1.6650078296661377, + "learning_rate": 7.722847581381199e-07, + "loss": 0.9401, + "step": 11060 + }, + { + "epoch": 0.26649165482526915, + "grad_norm": 1.5072864294052124, + "learning_rate": 7.721579961464354e-07, + "loss": 0.8722, + "step": 11065 + }, + { + "epoch": 0.2666120758170564, + "grad_norm": 1.5710713863372803, + "learning_rate": 7.72031234154751e-07, + "loss": 0.9457, + "step": 11070 + }, + { + "epoch": 0.2667324968088437, + "grad_norm": 1.441070795059204, + "learning_rate": 7.719044721630666e-07, + "loss": 0.9234, + "step": 11075 + }, + { + "epoch": 0.26685291780063103, + "grad_norm": 1.599685549736023, + "learning_rate": 7.717777101713822e-07, + "loss": 0.939, + "step": 11080 + }, + { + "epoch": 0.2669733387924183, + "grad_norm": 1.5153743028640747, + "learning_rate": 7.716509481796978e-07, + "loss": 0.9616, + "step": 11085 + }, + { + "epoch": 0.2670937597842056, + "grad_norm": 1.5871284008026123, + "learning_rate": 7.715241861880134e-07, + "loss": 0.9913, + "step": 11090 + }, + { + "epoch": 0.26721418077599285, + "grad_norm": 1.5965393781661987, + "learning_rate": 7.713974241963289e-07, + "loss": 0.9124, + "step": 11095 + }, + { + "epoch": 0.2673346017677802, + "grad_norm": 1.5728003978729248, + "learning_rate": 7.712706622046446e-07, + "loss": 0.9236, + "step": 11100 + }, + { + "epoch": 0.26745502275956745, + "grad_norm": 1.4616875648498535, + "learning_rate": 7.711439002129602e-07, + "loss": 0.941, + "step": 11105 + }, + { + "epoch": 0.2675754437513547, + "grad_norm": 1.7237117290496826, + "learning_rate": 7.710171382212756e-07, + "loss": 0.9377, + "step": 11110 + }, + { + "epoch": 0.267695864743142, + "grad_norm": 1.513432502746582, + "learning_rate": 7.708903762295913e-07, + "loss": 0.9646, + "step": 11115 + }, + { + "epoch": 0.2678162857349293, + "grad_norm": 1.4135527610778809, + "learning_rate": 7.707636142379069e-07, + "loss": 0.9422, + "step": 11120 + }, + { + "epoch": 0.2679367067267166, + "grad_norm": 1.4501285552978516, + "learning_rate": 7.706368522462225e-07, + "loss": 0.9505, + "step": 11125 + }, + { + "epoch": 0.2680571277185039, + "grad_norm": 1.7850518226623535, + "learning_rate": 7.705100902545381e-07, + "loss": 0.9199, + "step": 11130 + }, + { + "epoch": 0.2681775487102912, + "grad_norm": 1.5002425909042358, + "learning_rate": 7.703833282628536e-07, + "loss": 0.9142, + "step": 11135 + }, + { + "epoch": 0.2682979697020785, + "grad_norm": 1.6325747966766357, + "learning_rate": 7.702565662711692e-07, + "loss": 0.9115, + "step": 11140 + }, + { + "epoch": 0.26841839069386575, + "grad_norm": 1.522478699684143, + "learning_rate": 7.701298042794848e-07, + "loss": 0.9923, + "step": 11145 + }, + { + "epoch": 0.268538811685653, + "grad_norm": 1.7870874404907227, + "learning_rate": 7.700030422878005e-07, + "loss": 0.9462, + "step": 11150 + }, + { + "epoch": 0.26865923267744035, + "grad_norm": 1.5112918615341187, + "learning_rate": 7.698762802961159e-07, + "loss": 0.8391, + "step": 11155 + }, + { + "epoch": 0.2687796536692276, + "grad_norm": 1.603967308998108, + "learning_rate": 7.697495183044315e-07, + "loss": 0.9103, + "step": 11160 + }, + { + "epoch": 0.2689000746610149, + "grad_norm": 1.631317377090454, + "learning_rate": 7.696227563127472e-07, + "loss": 0.9129, + "step": 11165 + }, + { + "epoch": 0.26902049565280217, + "grad_norm": 1.62999427318573, + "learning_rate": 7.694959943210627e-07, + "loss": 0.9031, + "step": 11170 + }, + { + "epoch": 0.2691409166445895, + "grad_norm": 1.3878083229064941, + "learning_rate": 7.693692323293784e-07, + "loss": 0.919, + "step": 11175 + }, + { + "epoch": 0.2692613376363768, + "grad_norm": 1.516343355178833, + "learning_rate": 7.692424703376939e-07, + "loss": 0.9456, + "step": 11180 + }, + { + "epoch": 0.26938175862816405, + "grad_norm": 1.4829399585723877, + "learning_rate": 7.691157083460095e-07, + "loss": 0.915, + "step": 11185 + }, + { + "epoch": 0.2695021796199514, + "grad_norm": 1.9153403043746948, + "learning_rate": 7.689889463543251e-07, + "loss": 0.9071, + "step": 11190 + }, + { + "epoch": 0.26962260061173865, + "grad_norm": 1.5353426933288574, + "learning_rate": 7.688621843626407e-07, + "loss": 0.9368, + "step": 11195 + }, + { + "epoch": 0.2697430216035259, + "grad_norm": 1.4356238842010498, + "learning_rate": 7.687354223709563e-07, + "loss": 0.9369, + "step": 11200 + }, + { + "epoch": 0.2698634425953132, + "grad_norm": 1.9013326168060303, + "learning_rate": 7.686086603792718e-07, + "loss": 0.9614, + "step": 11205 + }, + { + "epoch": 0.2699838635871005, + "grad_norm": 1.7938717603683472, + "learning_rate": 7.684818983875875e-07, + "loss": 0.9126, + "step": 11210 + }, + { + "epoch": 0.2701042845788878, + "grad_norm": 1.5766910314559937, + "learning_rate": 7.68355136395903e-07, + "loss": 0.9517, + "step": 11215 + }, + { + "epoch": 0.27022470557067507, + "grad_norm": 1.5627079010009766, + "learning_rate": 7.682283744042186e-07, + "loss": 0.9558, + "step": 11220 + }, + { + "epoch": 0.27034512656246235, + "grad_norm": 1.4922775030136108, + "learning_rate": 7.681016124125342e-07, + "loss": 0.9098, + "step": 11225 + }, + { + "epoch": 0.2704655475542497, + "grad_norm": 1.735756278038025, + "learning_rate": 7.679748504208497e-07, + "loss": 0.9373, + "step": 11230 + }, + { + "epoch": 0.27058596854603695, + "grad_norm": 1.7026416063308716, + "learning_rate": 7.678480884291654e-07, + "loss": 0.9146, + "step": 11235 + }, + { + "epoch": 0.2707063895378242, + "grad_norm": 1.3560659885406494, + "learning_rate": 7.67721326437481e-07, + "loss": 0.9555, + "step": 11240 + }, + { + "epoch": 0.2708268105296115, + "grad_norm": 1.432887315750122, + "learning_rate": 7.675945644457966e-07, + "loss": 0.9551, + "step": 11245 + }, + { + "epoch": 0.2709472315213988, + "grad_norm": 1.6378475427627563, + "learning_rate": 7.674678024541121e-07, + "loss": 0.8897, + "step": 11250 + }, + { + "epoch": 0.2710676525131861, + "grad_norm": 1.5451323986053467, + "learning_rate": 7.673410404624277e-07, + "loss": 0.9499, + "step": 11255 + }, + { + "epoch": 0.27118807350497337, + "grad_norm": 1.508756160736084, + "learning_rate": 7.672142784707433e-07, + "loss": 0.9058, + "step": 11260 + }, + { + "epoch": 0.2713084944967607, + "grad_norm": 1.5754610300064087, + "learning_rate": 7.670875164790589e-07, + "loss": 0.9651, + "step": 11265 + }, + { + "epoch": 0.271428915488548, + "grad_norm": 1.5599875450134277, + "learning_rate": 7.669607544873746e-07, + "loss": 0.9101, + "step": 11270 + }, + { + "epoch": 0.27154933648033525, + "grad_norm": 1.5504896640777588, + "learning_rate": 7.6683399249569e-07, + "loss": 0.9511, + "step": 11275 + }, + { + "epoch": 0.2716697574721225, + "grad_norm": 1.6675881147384644, + "learning_rate": 7.667072305040056e-07, + "loss": 0.9192, + "step": 11280 + }, + { + "epoch": 0.27179017846390985, + "grad_norm": 1.3830749988555908, + "learning_rate": 7.665804685123213e-07, + "loss": 0.922, + "step": 11285 + }, + { + "epoch": 0.2719105994556971, + "grad_norm": 1.6686891317367554, + "learning_rate": 7.664537065206368e-07, + "loss": 0.9337, + "step": 11290 + }, + { + "epoch": 0.2720310204474844, + "grad_norm": 1.8631843328475952, + "learning_rate": 7.663269445289524e-07, + "loss": 0.9199, + "step": 11295 + }, + { + "epoch": 0.27215144143927167, + "grad_norm": 1.4798780679702759, + "learning_rate": 7.66200182537268e-07, + "loss": 0.958, + "step": 11300 + }, + { + "epoch": 0.272271862431059, + "grad_norm": 1.3518550395965576, + "learning_rate": 7.660734205455836e-07, + "loss": 0.8929, + "step": 11305 + }, + { + "epoch": 0.27239228342284627, + "grad_norm": 1.6929333209991455, + "learning_rate": 7.659466585538992e-07, + "loss": 0.916, + "step": 11310 + }, + { + "epoch": 0.27251270441463354, + "grad_norm": 1.4561303853988647, + "learning_rate": 7.658198965622148e-07, + "loss": 0.9634, + "step": 11315 + }, + { + "epoch": 0.2726331254064209, + "grad_norm": 1.59073007106781, + "learning_rate": 7.656931345705303e-07, + "loss": 0.9048, + "step": 11320 + }, + { + "epoch": 0.27275354639820815, + "grad_norm": 1.526777744293213, + "learning_rate": 7.655663725788459e-07, + "loss": 0.9254, + "step": 11325 + }, + { + "epoch": 0.2728739673899954, + "grad_norm": 1.476694107055664, + "learning_rate": 7.654396105871616e-07, + "loss": 0.9151, + "step": 11330 + }, + { + "epoch": 0.2729943883817827, + "grad_norm": 1.5337563753128052, + "learning_rate": 7.653128485954771e-07, + "loss": 0.9102, + "step": 11335 + }, + { + "epoch": 0.27311480937357, + "grad_norm": 1.5907397270202637, + "learning_rate": 7.651860866037926e-07, + "loss": 0.9202, + "step": 11340 + }, + { + "epoch": 0.2732352303653573, + "grad_norm": 1.7442792654037476, + "learning_rate": 7.650593246121083e-07, + "loss": 0.9326, + "step": 11345 + }, + { + "epoch": 0.27335565135714457, + "grad_norm": 1.5198930501937866, + "learning_rate": 7.649325626204238e-07, + "loss": 0.9246, + "step": 11350 + }, + { + "epoch": 0.27347607234893184, + "grad_norm": 1.7051467895507812, + "learning_rate": 7.648058006287395e-07, + "loss": 0.9402, + "step": 11355 + }, + { + "epoch": 0.27359649334071917, + "grad_norm": 1.5837024450302124, + "learning_rate": 7.646790386370551e-07, + "loss": 0.8917, + "step": 11360 + }, + { + "epoch": 0.27371691433250644, + "grad_norm": 2.134157419204712, + "learning_rate": 7.645522766453705e-07, + "loss": 0.9517, + "step": 11365 + }, + { + "epoch": 0.2738373353242937, + "grad_norm": 1.6315033435821533, + "learning_rate": 7.644255146536862e-07, + "loss": 0.9348, + "step": 11370 + }, + { + "epoch": 0.27395775631608105, + "grad_norm": 1.4641317129135132, + "learning_rate": 7.642987526620018e-07, + "loss": 0.9088, + "step": 11375 + }, + { + "epoch": 0.2740781773078683, + "grad_norm": 1.6204805374145508, + "learning_rate": 7.641719906703174e-07, + "loss": 0.9057, + "step": 11380 + }, + { + "epoch": 0.2741985982996556, + "grad_norm": 1.5819618701934814, + "learning_rate": 7.64045228678633e-07, + "loss": 0.9122, + "step": 11385 + }, + { + "epoch": 0.27431901929144287, + "grad_norm": 1.5845000743865967, + "learning_rate": 7.639184666869486e-07, + "loss": 0.9032, + "step": 11390 + }, + { + "epoch": 0.2744394402832302, + "grad_norm": 1.9233126640319824, + "learning_rate": 7.637917046952641e-07, + "loss": 0.9111, + "step": 11395 + }, + { + "epoch": 0.27455986127501747, + "grad_norm": 1.5648930072784424, + "learning_rate": 7.636649427035797e-07, + "loss": 0.9242, + "step": 11400 + }, + { + "epoch": 0.27468028226680474, + "grad_norm": 1.3369144201278687, + "learning_rate": 7.635381807118954e-07, + "loss": 0.8929, + "step": 11405 + }, + { + "epoch": 0.274800703258592, + "grad_norm": 1.5446151494979858, + "learning_rate": 7.634114187202108e-07, + "loss": 0.9359, + "step": 11410 + }, + { + "epoch": 0.27492112425037935, + "grad_norm": 1.5810915231704712, + "learning_rate": 7.632846567285265e-07, + "loss": 0.9423, + "step": 11415 + }, + { + "epoch": 0.2750415452421666, + "grad_norm": 1.6881409883499146, + "learning_rate": 7.631578947368421e-07, + "loss": 0.9508, + "step": 11420 + }, + { + "epoch": 0.2751619662339539, + "grad_norm": 1.5262564420700073, + "learning_rate": 7.630311327451576e-07, + "loss": 0.9414, + "step": 11425 + }, + { + "epoch": 0.27528238722574117, + "grad_norm": 1.4334099292755127, + "learning_rate": 7.629043707534733e-07, + "loss": 0.9007, + "step": 11430 + }, + { + "epoch": 0.2754028082175285, + "grad_norm": 1.3691715002059937, + "learning_rate": 7.627776087617888e-07, + "loss": 0.8882, + "step": 11435 + }, + { + "epoch": 0.27552322920931577, + "grad_norm": 1.717207670211792, + "learning_rate": 7.626508467701044e-07, + "loss": 0.8711, + "step": 11440 + }, + { + "epoch": 0.27564365020110304, + "grad_norm": 1.5524109601974487, + "learning_rate": 7.6252408477842e-07, + "loss": 0.8835, + "step": 11445 + }, + { + "epoch": 0.27576407119289037, + "grad_norm": 1.6486324071884155, + "learning_rate": 7.623973227867357e-07, + "loss": 0.8512, + "step": 11450 + }, + { + "epoch": 0.27588449218467764, + "grad_norm": 1.8091490268707275, + "learning_rate": 7.622705607950511e-07, + "loss": 0.9348, + "step": 11455 + }, + { + "epoch": 0.2760049131764649, + "grad_norm": 1.6140121221542358, + "learning_rate": 7.621437988033667e-07, + "loss": 0.9328, + "step": 11460 + }, + { + "epoch": 0.2761253341682522, + "grad_norm": 1.4895670413970947, + "learning_rate": 7.620170368116824e-07, + "loss": 0.8896, + "step": 11465 + }, + { + "epoch": 0.2762457551600395, + "grad_norm": 1.525490403175354, + "learning_rate": 7.618902748199979e-07, + "loss": 0.969, + "step": 11470 + }, + { + "epoch": 0.2763661761518268, + "grad_norm": 1.5126816034317017, + "learning_rate": 7.617635128283136e-07, + "loss": 0.9383, + "step": 11475 + }, + { + "epoch": 0.27648659714361407, + "grad_norm": 1.466437578201294, + "learning_rate": 7.616367508366291e-07, + "loss": 0.9023, + "step": 11480 + }, + { + "epoch": 0.27660701813540134, + "grad_norm": 1.477810263633728, + "learning_rate": 7.615099888449446e-07, + "loss": 0.9952, + "step": 11485 + }, + { + "epoch": 0.27672743912718867, + "grad_norm": 1.444549798965454, + "learning_rate": 7.613832268532603e-07, + "loss": 0.969, + "step": 11490 + }, + { + "epoch": 0.27684786011897594, + "grad_norm": 1.5766974687576294, + "learning_rate": 7.612564648615759e-07, + "loss": 0.8708, + "step": 11495 + }, + { + "epoch": 0.2769682811107632, + "grad_norm": 1.6157164573669434, + "learning_rate": 7.611297028698915e-07, + "loss": 0.9892, + "step": 11500 + }, + { + "epoch": 0.27708870210255054, + "grad_norm": 1.6714296340942383, + "learning_rate": 7.61002940878207e-07, + "loss": 0.8914, + "step": 11505 + }, + { + "epoch": 0.2772091230943378, + "grad_norm": 1.6638739109039307, + "learning_rate": 7.608761788865227e-07, + "loss": 0.9758, + "step": 11510 + }, + { + "epoch": 0.2773295440861251, + "grad_norm": 1.5034291744232178, + "learning_rate": 7.607494168948382e-07, + "loss": 0.9094, + "step": 11515 + }, + { + "epoch": 0.27744996507791236, + "grad_norm": 1.6141936779022217, + "learning_rate": 7.606226549031538e-07, + "loss": 0.973, + "step": 11520 + }, + { + "epoch": 0.2775703860696997, + "grad_norm": 1.4471684694290161, + "learning_rate": 7.604958929114694e-07, + "loss": 0.934, + "step": 11525 + }, + { + "epoch": 0.27769080706148697, + "grad_norm": 1.5534322261810303, + "learning_rate": 7.603691309197849e-07, + "loss": 0.935, + "step": 11530 + }, + { + "epoch": 0.27781122805327424, + "grad_norm": 1.5422011613845825, + "learning_rate": 7.602423689281006e-07, + "loss": 0.8949, + "step": 11535 + }, + { + "epoch": 0.2779316490450615, + "grad_norm": 1.6572320461273193, + "learning_rate": 7.601156069364162e-07, + "loss": 0.9403, + "step": 11540 + }, + { + "epoch": 0.27805207003684884, + "grad_norm": 1.4555169343948364, + "learning_rate": 7.599888449447317e-07, + "loss": 0.9631, + "step": 11545 + }, + { + "epoch": 0.2781724910286361, + "grad_norm": 1.4223170280456543, + "learning_rate": 7.598620829530473e-07, + "loss": 0.9567, + "step": 11550 + }, + { + "epoch": 0.2782929120204234, + "grad_norm": 1.6159312725067139, + "learning_rate": 7.597353209613629e-07, + "loss": 0.9088, + "step": 11555 + }, + { + "epoch": 0.27841333301221066, + "grad_norm": 1.6263511180877686, + "learning_rate": 7.596085589696785e-07, + "loss": 0.9317, + "step": 11560 + }, + { + "epoch": 0.278533754003998, + "grad_norm": 1.8005136251449585, + "learning_rate": 7.594817969779941e-07, + "loss": 0.9596, + "step": 11565 + }, + { + "epoch": 0.27865417499578526, + "grad_norm": 1.4669032096862793, + "learning_rate": 7.593550349863098e-07, + "loss": 0.9032, + "step": 11570 + }, + { + "epoch": 0.27877459598757254, + "grad_norm": 1.738063097000122, + "learning_rate": 7.592282729946252e-07, + "loss": 0.8592, + "step": 11575 + }, + { + "epoch": 0.27889501697935987, + "grad_norm": 1.4538829326629639, + "learning_rate": 7.591015110029408e-07, + "loss": 0.8801, + "step": 11580 + }, + { + "epoch": 0.27901543797114714, + "grad_norm": 1.568163514137268, + "learning_rate": 7.589747490112565e-07, + "loss": 0.9436, + "step": 11585 + }, + { + "epoch": 0.2791358589629344, + "grad_norm": 1.8308765888214111, + "learning_rate": 7.58847987019572e-07, + "loss": 0.8998, + "step": 11590 + }, + { + "epoch": 0.2792562799547217, + "grad_norm": 1.699054479598999, + "learning_rate": 7.587212250278876e-07, + "loss": 0.9425, + "step": 11595 + }, + { + "epoch": 0.279376700946509, + "grad_norm": 1.5109950304031372, + "learning_rate": 7.585944630362032e-07, + "loss": 0.9135, + "step": 11600 + }, + { + "epoch": 0.2794971219382963, + "grad_norm": 1.4689977169036865, + "learning_rate": 7.584677010445187e-07, + "loss": 0.9104, + "step": 11605 + }, + { + "epoch": 0.27961754293008356, + "grad_norm": 1.4785994291305542, + "learning_rate": 7.583409390528344e-07, + "loss": 0.9532, + "step": 11610 + }, + { + "epoch": 0.27973796392187084, + "grad_norm": 1.7106760740280151, + "learning_rate": 7.5821417706115e-07, + "loss": 0.9276, + "step": 11615 + }, + { + "epoch": 0.27985838491365816, + "grad_norm": 1.4372215270996094, + "learning_rate": 7.580874150694655e-07, + "loss": 0.8959, + "step": 11620 + }, + { + "epoch": 0.27997880590544544, + "grad_norm": 1.5852820873260498, + "learning_rate": 7.579606530777811e-07, + "loss": 0.9093, + "step": 11625 + }, + { + "epoch": 0.2800992268972327, + "grad_norm": 1.6567399501800537, + "learning_rate": 7.578338910860968e-07, + "loss": 0.8969, + "step": 11630 + }, + { + "epoch": 0.28021964788902004, + "grad_norm": 1.4735957384109497, + "learning_rate": 7.577071290944123e-07, + "loss": 0.9294, + "step": 11635 + }, + { + "epoch": 0.2803400688808073, + "grad_norm": 1.548314094543457, + "learning_rate": 7.575803671027278e-07, + "loss": 0.9493, + "step": 11640 + }, + { + "epoch": 0.2804604898725946, + "grad_norm": 1.645434021949768, + "learning_rate": 7.574536051110435e-07, + "loss": 0.9196, + "step": 11645 + }, + { + "epoch": 0.28058091086438186, + "grad_norm": 1.3293720483779907, + "learning_rate": 7.57326843119359e-07, + "loss": 0.9154, + "step": 11650 + }, + { + "epoch": 0.2807013318561692, + "grad_norm": 1.407025694847107, + "learning_rate": 7.572000811276747e-07, + "loss": 0.9089, + "step": 11655 + }, + { + "epoch": 0.28082175284795646, + "grad_norm": 1.4637197256088257, + "learning_rate": 7.570733191359903e-07, + "loss": 0.9169, + "step": 11660 + }, + { + "epoch": 0.28094217383974374, + "grad_norm": 1.6624650955200195, + "learning_rate": 7.569465571443057e-07, + "loss": 0.9073, + "step": 11665 + }, + { + "epoch": 0.281062594831531, + "grad_norm": 1.5180774927139282, + "learning_rate": 7.568197951526214e-07, + "loss": 0.9752, + "step": 11670 + }, + { + "epoch": 0.28118301582331834, + "grad_norm": 1.5979740619659424, + "learning_rate": 7.56693033160937e-07, + "loss": 0.9037, + "step": 11675 + }, + { + "epoch": 0.2813034368151056, + "grad_norm": 1.5818020105361938, + "learning_rate": 7.565662711692526e-07, + "loss": 0.9473, + "step": 11680 + }, + { + "epoch": 0.2814238578068929, + "grad_norm": 1.7185585498809814, + "learning_rate": 7.564395091775682e-07, + "loss": 0.9279, + "step": 11685 + }, + { + "epoch": 0.28154427879868016, + "grad_norm": 1.6393964290618896, + "learning_rate": 7.563127471858837e-07, + "loss": 0.8911, + "step": 11690 + }, + { + "epoch": 0.2816646997904675, + "grad_norm": 1.623339056968689, + "learning_rate": 7.561859851941993e-07, + "loss": 0.9645, + "step": 11695 + }, + { + "epoch": 0.28178512078225476, + "grad_norm": 1.5230019092559814, + "learning_rate": 7.560592232025149e-07, + "loss": 0.9363, + "step": 11700 + }, + { + "epoch": 0.28190554177404203, + "grad_norm": 1.6126532554626465, + "learning_rate": 7.559324612108306e-07, + "loss": 0.9706, + "step": 11705 + }, + { + "epoch": 0.28202596276582936, + "grad_norm": 1.8683366775512695, + "learning_rate": 7.55805699219146e-07, + "loss": 0.9447, + "step": 11710 + }, + { + "epoch": 0.28214638375761664, + "grad_norm": 1.5551618337631226, + "learning_rate": 7.556789372274617e-07, + "loss": 0.9209, + "step": 11715 + }, + { + "epoch": 0.2822668047494039, + "grad_norm": 1.4535892009735107, + "learning_rate": 7.555521752357773e-07, + "loss": 0.9299, + "step": 11720 + }, + { + "epoch": 0.2823872257411912, + "grad_norm": 2.7465555667877197, + "learning_rate": 7.554254132440928e-07, + "loss": 0.9016, + "step": 11725 + }, + { + "epoch": 0.2825076467329785, + "grad_norm": 1.6401972770690918, + "learning_rate": 7.552986512524085e-07, + "loss": 0.8983, + "step": 11730 + }, + { + "epoch": 0.2826280677247658, + "grad_norm": 1.655357003211975, + "learning_rate": 7.55171889260724e-07, + "loss": 0.9406, + "step": 11735 + }, + { + "epoch": 0.28274848871655306, + "grad_norm": 1.4839775562286377, + "learning_rate": 7.550451272690396e-07, + "loss": 0.9209, + "step": 11740 + }, + { + "epoch": 0.28286890970834033, + "grad_norm": 1.6104551553726196, + "learning_rate": 7.549183652773552e-07, + "loss": 0.9115, + "step": 11745 + }, + { + "epoch": 0.28298933070012766, + "grad_norm": 1.6464667320251465, + "learning_rate": 7.547916032856709e-07, + "loss": 0.9345, + "step": 11750 + }, + { + "epoch": 0.28310975169191493, + "grad_norm": 1.4681965112686157, + "learning_rate": 7.546648412939863e-07, + "loss": 0.9639, + "step": 11755 + }, + { + "epoch": 0.2832301726837022, + "grad_norm": 1.4175541400909424, + "learning_rate": 7.545380793023019e-07, + "loss": 0.938, + "step": 11760 + }, + { + "epoch": 0.28335059367548954, + "grad_norm": 1.8389374017715454, + "learning_rate": 7.544113173106176e-07, + "loss": 0.9199, + "step": 11765 + }, + { + "epoch": 0.2834710146672768, + "grad_norm": 1.4778343439102173, + "learning_rate": 7.542845553189331e-07, + "loss": 0.9656, + "step": 11770 + }, + { + "epoch": 0.2835914356590641, + "grad_norm": 1.595827341079712, + "learning_rate": 7.541577933272488e-07, + "loss": 0.9093, + "step": 11775 + }, + { + "epoch": 0.28371185665085136, + "grad_norm": 1.5474449396133423, + "learning_rate": 7.540310313355643e-07, + "loss": 0.9245, + "step": 11780 + }, + { + "epoch": 0.2838322776426387, + "grad_norm": 1.8880990743637085, + "learning_rate": 7.539042693438798e-07, + "loss": 0.9379, + "step": 11785 + }, + { + "epoch": 0.28395269863442596, + "grad_norm": 1.5933219194412231, + "learning_rate": 7.537775073521955e-07, + "loss": 0.9089, + "step": 11790 + }, + { + "epoch": 0.28407311962621323, + "grad_norm": 1.6476610898971558, + "learning_rate": 7.536507453605111e-07, + "loss": 0.947, + "step": 11795 + }, + { + "epoch": 0.2841935406180005, + "grad_norm": 1.3580931425094604, + "learning_rate": 7.535239833688267e-07, + "loss": 0.935, + "step": 11800 + }, + { + "epoch": 0.28431396160978784, + "grad_norm": 1.3850847482681274, + "learning_rate": 7.533972213771422e-07, + "loss": 0.9051, + "step": 11805 + }, + { + "epoch": 0.2844343826015751, + "grad_norm": 1.4011439085006714, + "learning_rate": 7.532704593854578e-07, + "loss": 0.8799, + "step": 11810 + }, + { + "epoch": 0.2845548035933624, + "grad_norm": 1.6950008869171143, + "learning_rate": 7.531436973937734e-07, + "loss": 0.9325, + "step": 11815 + }, + { + "epoch": 0.2846752245851497, + "grad_norm": 1.5124635696411133, + "learning_rate": 7.53016935402089e-07, + "loss": 0.9198, + "step": 11820 + }, + { + "epoch": 0.284795645576937, + "grad_norm": 1.481153964996338, + "learning_rate": 7.528901734104046e-07, + "loss": 0.9484, + "step": 11825 + }, + { + "epoch": 0.28491606656872426, + "grad_norm": 1.6342970132827759, + "learning_rate": 7.527634114187201e-07, + "loss": 0.9259, + "step": 11830 + }, + { + "epoch": 0.28503648756051153, + "grad_norm": 1.6247309446334839, + "learning_rate": 7.526366494270358e-07, + "loss": 0.9414, + "step": 11835 + }, + { + "epoch": 0.28515690855229886, + "grad_norm": 1.7598034143447876, + "learning_rate": 7.525098874353514e-07, + "loss": 0.9578, + "step": 11840 + }, + { + "epoch": 0.28527732954408613, + "grad_norm": 1.5112534761428833, + "learning_rate": 7.523831254436669e-07, + "loss": 0.9242, + "step": 11845 + }, + { + "epoch": 0.2853977505358734, + "grad_norm": 1.48063063621521, + "learning_rate": 7.522563634519825e-07, + "loss": 0.9513, + "step": 11850 + }, + { + "epoch": 0.2855181715276607, + "grad_norm": 1.4742642641067505, + "learning_rate": 7.521296014602981e-07, + "loss": 0.9896, + "step": 11855 + }, + { + "epoch": 0.285638592519448, + "grad_norm": 1.7979505062103271, + "learning_rate": 7.520028394686137e-07, + "loss": 0.9469, + "step": 11860 + }, + { + "epoch": 0.2857590135112353, + "grad_norm": 1.6058517694473267, + "learning_rate": 7.518760774769293e-07, + "loss": 0.9196, + "step": 11865 + }, + { + "epoch": 0.28587943450302256, + "grad_norm": 1.6035789251327515, + "learning_rate": 7.51749315485245e-07, + "loss": 0.9473, + "step": 11870 + }, + { + "epoch": 0.28599985549480983, + "grad_norm": 1.6348637342453003, + "learning_rate": 7.516225534935604e-07, + "loss": 0.9306, + "step": 11875 + }, + { + "epoch": 0.28612027648659716, + "grad_norm": 1.7751243114471436, + "learning_rate": 7.51495791501876e-07, + "loss": 0.967, + "step": 11880 + }, + { + "epoch": 0.28624069747838443, + "grad_norm": 1.681087613105774, + "learning_rate": 7.513690295101917e-07, + "loss": 0.9591, + "step": 11885 + }, + { + "epoch": 0.2863611184701717, + "grad_norm": 1.6238495111465454, + "learning_rate": 7.512422675185072e-07, + "loss": 0.9472, + "step": 11890 + }, + { + "epoch": 0.28648153946195903, + "grad_norm": 1.5437794923782349, + "learning_rate": 7.511155055268227e-07, + "loss": 0.928, + "step": 11895 + }, + { + "epoch": 0.2866019604537463, + "grad_norm": 1.565635323524475, + "learning_rate": 7.509887435351384e-07, + "loss": 0.9666, + "step": 11900 + }, + { + "epoch": 0.2867223814455336, + "grad_norm": 1.6704597473144531, + "learning_rate": 7.508619815434539e-07, + "loss": 0.9399, + "step": 11905 + }, + { + "epoch": 0.28684280243732085, + "grad_norm": 1.522324800491333, + "learning_rate": 7.507352195517696e-07, + "loss": 0.9282, + "step": 11910 + }, + { + "epoch": 0.2869632234291082, + "grad_norm": 1.4775621891021729, + "learning_rate": 7.506084575600852e-07, + "loss": 0.8827, + "step": 11915 + }, + { + "epoch": 0.28708364442089546, + "grad_norm": 1.5191771984100342, + "learning_rate": 7.504816955684007e-07, + "loss": 0.887, + "step": 11920 + }, + { + "epoch": 0.28720406541268273, + "grad_norm": 1.6595885753631592, + "learning_rate": 7.503549335767163e-07, + "loss": 0.9344, + "step": 11925 + }, + { + "epoch": 0.28732448640447, + "grad_norm": 1.7235771417617798, + "learning_rate": 7.502281715850319e-07, + "loss": 0.967, + "step": 11930 + }, + { + "epoch": 0.28744490739625733, + "grad_norm": 1.4997406005859375, + "learning_rate": 7.501014095933475e-07, + "loss": 0.9614, + "step": 11935 + }, + { + "epoch": 0.2875653283880446, + "grad_norm": 1.6935820579528809, + "learning_rate": 7.49974647601663e-07, + "loss": 0.9519, + "step": 11940 + }, + { + "epoch": 0.2876857493798319, + "grad_norm": 1.5619289875030518, + "learning_rate": 7.498478856099787e-07, + "loss": 0.8883, + "step": 11945 + }, + { + "epoch": 0.2878061703716192, + "grad_norm": 1.4663188457489014, + "learning_rate": 7.497211236182942e-07, + "loss": 0.9096, + "step": 11950 + }, + { + "epoch": 0.2879265913634065, + "grad_norm": 1.4758613109588623, + "learning_rate": 7.495943616266098e-07, + "loss": 0.9472, + "step": 11955 + }, + { + "epoch": 0.28804701235519375, + "grad_norm": 1.6588231325149536, + "learning_rate": 7.494675996349255e-07, + "loss": 0.8967, + "step": 11960 + }, + { + "epoch": 0.28816743334698103, + "grad_norm": 1.676204800605774, + "learning_rate": 7.493408376432409e-07, + "loss": 0.9067, + "step": 11965 + }, + { + "epoch": 0.28828785433876836, + "grad_norm": 1.6175498962402344, + "learning_rate": 7.492140756515566e-07, + "loss": 0.9315, + "step": 11970 + }, + { + "epoch": 0.28840827533055563, + "grad_norm": 1.8844404220581055, + "learning_rate": 7.490873136598722e-07, + "loss": 0.9453, + "step": 11975 + }, + { + "epoch": 0.2885286963223429, + "grad_norm": 1.6034772396087646, + "learning_rate": 7.489605516681878e-07, + "loss": 0.9564, + "step": 11980 + }, + { + "epoch": 0.2886491173141302, + "grad_norm": 1.7410510778427124, + "learning_rate": 7.488337896765033e-07, + "loss": 0.952, + "step": 11985 + }, + { + "epoch": 0.2887695383059175, + "grad_norm": 1.5769463777542114, + "learning_rate": 7.487070276848189e-07, + "loss": 0.9601, + "step": 11990 + }, + { + "epoch": 0.2888899592977048, + "grad_norm": 1.3784805536270142, + "learning_rate": 7.485802656931345e-07, + "loss": 0.9725, + "step": 11995 + }, + { + "epoch": 0.28901038028949205, + "grad_norm": 1.6138496398925781, + "learning_rate": 7.484535037014501e-07, + "loss": 0.8706, + "step": 12000 + }, + { + "epoch": 0.2891308012812793, + "grad_norm": 1.6166455745697021, + "learning_rate": 7.483267417097658e-07, + "loss": 0.9151, + "step": 12005 + }, + { + "epoch": 0.28925122227306665, + "grad_norm": 1.4987436532974243, + "learning_rate": 7.481999797180812e-07, + "loss": 0.95, + "step": 12010 + }, + { + "epoch": 0.28937164326485393, + "grad_norm": 1.3807342052459717, + "learning_rate": 7.480732177263968e-07, + "loss": 0.8848, + "step": 12015 + }, + { + "epoch": 0.2894920642566412, + "grad_norm": 1.5243955850601196, + "learning_rate": 7.479464557347125e-07, + "loss": 0.9122, + "step": 12020 + }, + { + "epoch": 0.28961248524842853, + "grad_norm": 1.4616668224334717, + "learning_rate": 7.47819693743028e-07, + "loss": 0.971, + "step": 12025 + }, + { + "epoch": 0.2897329062402158, + "grad_norm": 1.71182119846344, + "learning_rate": 7.476929317513437e-07, + "loss": 0.9404, + "step": 12030 + }, + { + "epoch": 0.2898533272320031, + "grad_norm": 1.7082267999649048, + "learning_rate": 7.475661697596592e-07, + "loss": 0.8648, + "step": 12035 + }, + { + "epoch": 0.28997374822379035, + "grad_norm": 1.566318154335022, + "learning_rate": 7.474394077679748e-07, + "loss": 0.9647, + "step": 12040 + }, + { + "epoch": 0.2900941692155777, + "grad_norm": 1.5414106845855713, + "learning_rate": 7.473126457762904e-07, + "loss": 0.927, + "step": 12045 + }, + { + "epoch": 0.29021459020736495, + "grad_norm": 1.4850926399230957, + "learning_rate": 7.47185883784606e-07, + "loss": 0.9644, + "step": 12050 + }, + { + "epoch": 0.2903350111991522, + "grad_norm": 1.467564582824707, + "learning_rate": 7.470591217929216e-07, + "loss": 0.9193, + "step": 12055 + }, + { + "epoch": 0.2904554321909395, + "grad_norm": 1.3166462182998657, + "learning_rate": 7.469323598012371e-07, + "loss": 0.9112, + "step": 12060 + }, + { + "epoch": 0.29057585318272683, + "grad_norm": 1.6410185098648071, + "learning_rate": 7.468055978095528e-07, + "loss": 0.9226, + "step": 12065 + }, + { + "epoch": 0.2906962741745141, + "grad_norm": 1.5618233680725098, + "learning_rate": 7.466788358178684e-07, + "loss": 0.9354, + "step": 12070 + }, + { + "epoch": 0.2908166951663014, + "grad_norm": 1.4823178052902222, + "learning_rate": 7.465520738261839e-07, + "loss": 0.9549, + "step": 12075 + }, + { + "epoch": 0.2909371161580887, + "grad_norm": 1.6021852493286133, + "learning_rate": 7.464253118344995e-07, + "loss": 0.9438, + "step": 12080 + }, + { + "epoch": 0.291057537149876, + "grad_norm": 1.3809022903442383, + "learning_rate": 7.462985498428151e-07, + "loss": 0.9323, + "step": 12085 + }, + { + "epoch": 0.29117795814166325, + "grad_norm": 1.5327422618865967, + "learning_rate": 7.461717878511307e-07, + "loss": 0.9262, + "step": 12090 + }, + { + "epoch": 0.2912983791334505, + "grad_norm": 1.8754101991653442, + "learning_rate": 7.460450258594463e-07, + "loss": 0.9813, + "step": 12095 + }, + { + "epoch": 0.29141880012523785, + "grad_norm": 1.5512574911117554, + "learning_rate": 7.45918263867762e-07, + "loss": 0.9313, + "step": 12100 + }, + { + "epoch": 0.2915392211170251, + "grad_norm": 1.3583546876907349, + "learning_rate": 7.457915018760774e-07, + "loss": 0.9547, + "step": 12105 + }, + { + "epoch": 0.2916596421088124, + "grad_norm": 1.700033187866211, + "learning_rate": 7.45664739884393e-07, + "loss": 0.9332, + "step": 12110 + }, + { + "epoch": 0.2917800631005997, + "grad_norm": 1.4785394668579102, + "learning_rate": 7.455379778927087e-07, + "loss": 0.9547, + "step": 12115 + }, + { + "epoch": 0.291900484092387, + "grad_norm": 1.5970126390457153, + "learning_rate": 7.454112159010242e-07, + "loss": 0.9075, + "step": 12120 + }, + { + "epoch": 0.2920209050841743, + "grad_norm": 1.6902731657028198, + "learning_rate": 7.452844539093398e-07, + "loss": 0.8942, + "step": 12125 + }, + { + "epoch": 0.29214132607596155, + "grad_norm": 1.7784210443496704, + "learning_rate": 7.451576919176554e-07, + "loss": 0.9149, + "step": 12130 + }, + { + "epoch": 0.2922617470677488, + "grad_norm": 1.578518033027649, + "learning_rate": 7.450309299259709e-07, + "loss": 0.8796, + "step": 12135 + }, + { + "epoch": 0.29238216805953615, + "grad_norm": 1.5227102041244507, + "learning_rate": 7.449041679342866e-07, + "loss": 0.9589, + "step": 12140 + }, + { + "epoch": 0.2925025890513234, + "grad_norm": 1.3288702964782715, + "learning_rate": 7.447774059426022e-07, + "loss": 0.9111, + "step": 12145 + }, + { + "epoch": 0.2926230100431107, + "grad_norm": 1.8506990671157837, + "learning_rate": 7.446506439509177e-07, + "loss": 0.8961, + "step": 12150 + }, + { + "epoch": 0.292743431034898, + "grad_norm": 1.7175493240356445, + "learning_rate": 7.445238819592333e-07, + "loss": 0.8911, + "step": 12155 + }, + { + "epoch": 0.2928638520266853, + "grad_norm": 1.4701457023620605, + "learning_rate": 7.44397119967549e-07, + "loss": 0.9094, + "step": 12160 + }, + { + "epoch": 0.2929842730184726, + "grad_norm": 1.4599250555038452, + "learning_rate": 7.442703579758645e-07, + "loss": 0.9085, + "step": 12165 + }, + { + "epoch": 0.29310469401025985, + "grad_norm": 1.5617862939834595, + "learning_rate": 7.4414359598418e-07, + "loss": 0.9693, + "step": 12170 + }, + { + "epoch": 0.2932251150020472, + "grad_norm": 1.5455378293991089, + "learning_rate": 7.440168339924957e-07, + "loss": 0.9117, + "step": 12175 + }, + { + "epoch": 0.29334553599383445, + "grad_norm": 1.577228307723999, + "learning_rate": 7.438900720008112e-07, + "loss": 0.9267, + "step": 12180 + }, + { + "epoch": 0.2934659569856217, + "grad_norm": 1.498317003250122, + "learning_rate": 7.437633100091269e-07, + "loss": 0.9736, + "step": 12185 + }, + { + "epoch": 0.293586377977409, + "grad_norm": 1.5183560848236084, + "learning_rate": 7.436365480174425e-07, + "loss": 0.9262, + "step": 12190 + }, + { + "epoch": 0.2937067989691963, + "grad_norm": 1.533486247062683, + "learning_rate": 7.435097860257579e-07, + "loss": 0.9778, + "step": 12195 + }, + { + "epoch": 0.2938272199609836, + "grad_norm": 1.602159023284912, + "learning_rate": 7.433830240340736e-07, + "loss": 0.9087, + "step": 12200 + }, + { + "epoch": 0.29394764095277087, + "grad_norm": 1.51285719871521, + "learning_rate": 7.432562620423892e-07, + "loss": 0.9398, + "step": 12205 + }, + { + "epoch": 0.2940680619445582, + "grad_norm": 1.9584453105926514, + "learning_rate": 7.431295000507048e-07, + "loss": 0.9404, + "step": 12210 + }, + { + "epoch": 0.2941884829363455, + "grad_norm": 1.566316843032837, + "learning_rate": 7.430027380590204e-07, + "loss": 0.8733, + "step": 12215 + }, + { + "epoch": 0.29430890392813275, + "grad_norm": 1.4096190929412842, + "learning_rate": 7.42875976067336e-07, + "loss": 0.8925, + "step": 12220 + }, + { + "epoch": 0.29442932491992, + "grad_norm": 1.6779398918151855, + "learning_rate": 7.427492140756515e-07, + "loss": 0.9056, + "step": 12225 + }, + { + "epoch": 0.29454974591170735, + "grad_norm": 1.5147545337677002, + "learning_rate": 7.426224520839671e-07, + "loss": 0.8572, + "step": 12230 + }, + { + "epoch": 0.2946701669034946, + "grad_norm": 1.5745984315872192, + "learning_rate": 7.424956900922828e-07, + "loss": 0.9432, + "step": 12235 + }, + { + "epoch": 0.2947905878952819, + "grad_norm": 1.381061315536499, + "learning_rate": 7.423689281005982e-07, + "loss": 0.9103, + "step": 12240 + }, + { + "epoch": 0.29491100888706917, + "grad_norm": 1.6397510766983032, + "learning_rate": 7.422421661089139e-07, + "loss": 0.9932, + "step": 12245 + }, + { + "epoch": 0.2950314298788565, + "grad_norm": 1.387656807899475, + "learning_rate": 7.421154041172295e-07, + "loss": 0.9609, + "step": 12250 + }, + { + "epoch": 0.2951518508706438, + "grad_norm": 1.619707465171814, + "learning_rate": 7.41988642125545e-07, + "loss": 0.9056, + "step": 12255 + }, + { + "epoch": 0.29527227186243105, + "grad_norm": 1.9158905744552612, + "learning_rate": 7.418618801338607e-07, + "loss": 0.987, + "step": 12260 + }, + { + "epoch": 0.2953926928542184, + "grad_norm": 1.4746280908584595, + "learning_rate": 7.417351181421762e-07, + "loss": 0.9398, + "step": 12265 + }, + { + "epoch": 0.29551311384600565, + "grad_norm": 1.4023077487945557, + "learning_rate": 7.416083561504918e-07, + "loss": 0.9538, + "step": 12270 + }, + { + "epoch": 0.2956335348377929, + "grad_norm": 1.656822681427002, + "learning_rate": 7.414815941588074e-07, + "loss": 0.9088, + "step": 12275 + }, + { + "epoch": 0.2957539558295802, + "grad_norm": 1.841209888458252, + "learning_rate": 7.41354832167123e-07, + "loss": 0.9502, + "step": 12280 + }, + { + "epoch": 0.2958743768213675, + "grad_norm": 1.6077429056167603, + "learning_rate": 7.412280701754385e-07, + "loss": 0.8769, + "step": 12285 + }, + { + "epoch": 0.2959947978131548, + "grad_norm": 1.752013087272644, + "learning_rate": 7.411013081837541e-07, + "loss": 0.8655, + "step": 12290 + }, + { + "epoch": 0.29611521880494207, + "grad_norm": 1.6575478315353394, + "learning_rate": 7.409745461920698e-07, + "loss": 0.9049, + "step": 12295 + }, + { + "epoch": 0.29623563979672934, + "grad_norm": 1.7278908491134644, + "learning_rate": 7.408477842003853e-07, + "loss": 0.8964, + "step": 12300 + }, + { + "epoch": 0.2963560607885167, + "grad_norm": 1.4890385866165161, + "learning_rate": 7.40721022208701e-07, + "loss": 0.9283, + "step": 12305 + }, + { + "epoch": 0.29647648178030395, + "grad_norm": 1.5512890815734863, + "learning_rate": 7.405942602170165e-07, + "loss": 0.9398, + "step": 12310 + }, + { + "epoch": 0.2965969027720912, + "grad_norm": 1.420446753501892, + "learning_rate": 7.40467498225332e-07, + "loss": 0.9375, + "step": 12315 + }, + { + "epoch": 0.2967173237638785, + "grad_norm": 1.551830768585205, + "learning_rate": 7.403407362336477e-07, + "loss": 0.9162, + "step": 12320 + }, + { + "epoch": 0.2968377447556658, + "grad_norm": 1.4006000757217407, + "learning_rate": 7.402139742419633e-07, + "loss": 0.937, + "step": 12325 + }, + { + "epoch": 0.2969581657474531, + "grad_norm": 1.6034669876098633, + "learning_rate": 7.400872122502789e-07, + "loss": 0.9142, + "step": 12330 + }, + { + "epoch": 0.29707858673924037, + "grad_norm": 1.5593807697296143, + "learning_rate": 7.399604502585944e-07, + "loss": 0.949, + "step": 12335 + }, + { + "epoch": 0.2971990077310277, + "grad_norm": 1.5804564952850342, + "learning_rate": 7.3983368826691e-07, + "loss": 0.8958, + "step": 12340 + }, + { + "epoch": 0.29731942872281497, + "grad_norm": 1.548058271408081, + "learning_rate": 7.397069262752256e-07, + "loss": 0.9193, + "step": 12345 + }, + { + "epoch": 0.29743984971460224, + "grad_norm": 1.42953622341156, + "learning_rate": 7.395801642835412e-07, + "loss": 0.8941, + "step": 12350 + }, + { + "epoch": 0.2975602707063895, + "grad_norm": 1.4340226650238037, + "learning_rate": 7.394534022918568e-07, + "loss": 0.8934, + "step": 12355 + }, + { + "epoch": 0.29768069169817685, + "grad_norm": 2.00081205368042, + "learning_rate": 7.393266403001723e-07, + "loss": 0.9587, + "step": 12360 + }, + { + "epoch": 0.2978011126899641, + "grad_norm": 1.5126882791519165, + "learning_rate": 7.39199878308488e-07, + "loss": 0.9114, + "step": 12365 + }, + { + "epoch": 0.2979215336817514, + "grad_norm": 2.0931479930877686, + "learning_rate": 7.390731163168036e-07, + "loss": 0.9315, + "step": 12370 + }, + { + "epoch": 0.29804195467353867, + "grad_norm": 1.5068527460098267, + "learning_rate": 7.389463543251191e-07, + "loss": 0.966, + "step": 12375 + }, + { + "epoch": 0.298162375665326, + "grad_norm": 1.7116059064865112, + "learning_rate": 7.388195923334347e-07, + "loss": 0.9632, + "step": 12380 + }, + { + "epoch": 0.29828279665711327, + "grad_norm": 1.4662964344024658, + "learning_rate": 7.386928303417503e-07, + "loss": 0.9452, + "step": 12385 + }, + { + "epoch": 0.29840321764890054, + "grad_norm": 2.1836767196655273, + "learning_rate": 7.385660683500659e-07, + "loss": 0.9214, + "step": 12390 + }, + { + "epoch": 0.29852363864068787, + "grad_norm": 1.7127418518066406, + "learning_rate": 7.384393063583815e-07, + "loss": 0.9549, + "step": 12395 + }, + { + "epoch": 0.29864405963247515, + "grad_norm": 2.890199661254883, + "learning_rate": 7.383125443666971e-07, + "loss": 0.9384, + "step": 12400 + }, + { + "epoch": 0.2987644806242624, + "grad_norm": 1.4727007150650024, + "learning_rate": 7.381857823750126e-07, + "loss": 0.9184, + "step": 12405 + }, + { + "epoch": 0.2988849016160497, + "grad_norm": 1.4787065982818604, + "learning_rate": 7.380590203833282e-07, + "loss": 0.9226, + "step": 12410 + }, + { + "epoch": 0.299005322607837, + "grad_norm": 1.5422483682632446, + "learning_rate": 7.379322583916439e-07, + "loss": 0.926, + "step": 12415 + }, + { + "epoch": 0.2991257435996243, + "grad_norm": 1.5323668718338013, + "learning_rate": 7.378054963999594e-07, + "loss": 0.8974, + "step": 12420 + }, + { + "epoch": 0.29924616459141157, + "grad_norm": 1.539766550064087, + "learning_rate": 7.37678734408275e-07, + "loss": 0.9356, + "step": 12425 + }, + { + "epoch": 0.29936658558319884, + "grad_norm": 1.5156326293945312, + "learning_rate": 7.375519724165906e-07, + "loss": 0.9192, + "step": 12430 + }, + { + "epoch": 0.29948700657498617, + "grad_norm": 1.5541387796401978, + "learning_rate": 7.374252104249061e-07, + "loss": 0.9117, + "step": 12435 + }, + { + "epoch": 0.29960742756677344, + "grad_norm": 1.493133544921875, + "learning_rate": 7.372984484332218e-07, + "loss": 0.9163, + "step": 12440 + }, + { + "epoch": 0.2997278485585607, + "grad_norm": 1.497205376625061, + "learning_rate": 7.371716864415374e-07, + "loss": 0.9502, + "step": 12445 + }, + { + "epoch": 0.299848269550348, + "grad_norm": 1.4042925834655762, + "learning_rate": 7.370449244498529e-07, + "loss": 0.9456, + "step": 12450 + }, + { + "epoch": 0.2999686905421353, + "grad_norm": 1.5551249980926514, + "learning_rate": 7.369181624581685e-07, + "loss": 0.9541, + "step": 12455 + }, + { + "epoch": 0.3000891115339226, + "grad_norm": 1.5501880645751953, + "learning_rate": 7.367914004664841e-07, + "loss": 0.9713, + "step": 12460 + }, + { + "epoch": 0.30020953252570987, + "grad_norm": 1.3974348306655884, + "learning_rate": 7.366646384747997e-07, + "loss": 0.935, + "step": 12465 + }, + { + "epoch": 0.3003299535174972, + "grad_norm": 1.6827278137207031, + "learning_rate": 7.365378764831152e-07, + "loss": 0.9721, + "step": 12470 + }, + { + "epoch": 0.30045037450928447, + "grad_norm": 1.3156495094299316, + "learning_rate": 7.364111144914309e-07, + "loss": 0.9203, + "step": 12475 + }, + { + "epoch": 0.30057079550107174, + "grad_norm": 1.7625117301940918, + "learning_rate": 7.362843524997464e-07, + "loss": 0.9274, + "step": 12480 + }, + { + "epoch": 0.300691216492859, + "grad_norm": 1.4372049570083618, + "learning_rate": 7.36157590508062e-07, + "loss": 0.925, + "step": 12485 + }, + { + "epoch": 0.30081163748464634, + "grad_norm": 1.524330496788025, + "learning_rate": 7.360308285163777e-07, + "loss": 0.8828, + "step": 12490 + }, + { + "epoch": 0.3009320584764336, + "grad_norm": 1.4534857273101807, + "learning_rate": 7.359040665246931e-07, + "loss": 0.9535, + "step": 12495 + }, + { + "epoch": 0.3010524794682209, + "grad_norm": 1.603531837463379, + "learning_rate": 7.357773045330088e-07, + "loss": 0.9084, + "step": 12500 + }, + { + "epoch": 0.30117290046000816, + "grad_norm": 1.6054507493972778, + "learning_rate": 7.356505425413244e-07, + "loss": 0.9002, + "step": 12505 + }, + { + "epoch": 0.3012933214517955, + "grad_norm": 1.4217936992645264, + "learning_rate": 7.3552378054964e-07, + "loss": 0.9002, + "step": 12510 + }, + { + "epoch": 0.30141374244358277, + "grad_norm": 1.3850017786026, + "learning_rate": 7.353970185579556e-07, + "loss": 0.9223, + "step": 12515 + }, + { + "epoch": 0.30153416343537004, + "grad_norm": 1.639332890510559, + "learning_rate": 7.352702565662711e-07, + "loss": 0.8923, + "step": 12520 + }, + { + "epoch": 0.30165458442715737, + "grad_norm": 1.4566148519515991, + "learning_rate": 7.351434945745867e-07, + "loss": 0.9832, + "step": 12525 + }, + { + "epoch": 0.30177500541894464, + "grad_norm": 1.6120548248291016, + "learning_rate": 7.350167325829023e-07, + "loss": 0.9499, + "step": 12530 + }, + { + "epoch": 0.3018954264107319, + "grad_norm": 1.4781494140625, + "learning_rate": 7.34889970591218e-07, + "loss": 0.9534, + "step": 12535 + }, + { + "epoch": 0.3020158474025192, + "grad_norm": 1.6403977870941162, + "learning_rate": 7.347632085995334e-07, + "loss": 0.9437, + "step": 12540 + }, + { + "epoch": 0.3021362683943065, + "grad_norm": 1.8433960676193237, + "learning_rate": 7.34636446607849e-07, + "loss": 0.9331, + "step": 12545 + }, + { + "epoch": 0.3022566893860938, + "grad_norm": 1.4172022342681885, + "learning_rate": 7.345096846161647e-07, + "loss": 0.9138, + "step": 12550 + }, + { + "epoch": 0.30237711037788106, + "grad_norm": 1.539608359336853, + "learning_rate": 7.343829226244802e-07, + "loss": 0.965, + "step": 12555 + }, + { + "epoch": 0.30249753136966834, + "grad_norm": 1.4750511646270752, + "learning_rate": 7.342561606327959e-07, + "loss": 0.9426, + "step": 12560 + }, + { + "epoch": 0.30261795236145567, + "grad_norm": 1.6245813369750977, + "learning_rate": 7.341293986411114e-07, + "loss": 0.9602, + "step": 12565 + }, + { + "epoch": 0.30273837335324294, + "grad_norm": 1.6092920303344727, + "learning_rate": 7.34002636649427e-07, + "loss": 0.9319, + "step": 12570 + }, + { + "epoch": 0.3028587943450302, + "grad_norm": 1.5803841352462769, + "learning_rate": 7.338758746577426e-07, + "loss": 0.9257, + "step": 12575 + }, + { + "epoch": 0.3029792153368175, + "grad_norm": 1.3965332508087158, + "learning_rate": 7.337491126660582e-07, + "loss": 0.9672, + "step": 12580 + }, + { + "epoch": 0.3030996363286048, + "grad_norm": 1.4964956045150757, + "learning_rate": 7.336223506743737e-07, + "loss": 0.9366, + "step": 12585 + }, + { + "epoch": 0.3032200573203921, + "grad_norm": 1.4547592401504517, + "learning_rate": 7.334955886826893e-07, + "loss": 0.966, + "step": 12590 + }, + { + "epoch": 0.30334047831217936, + "grad_norm": 1.5723978281021118, + "learning_rate": 7.33368826691005e-07, + "loss": 0.9346, + "step": 12595 + }, + { + "epoch": 0.3034608993039667, + "grad_norm": 1.736131191253662, + "learning_rate": 7.332420646993205e-07, + "loss": 0.9028, + "step": 12600 + }, + { + "epoch": 0.30358132029575396, + "grad_norm": 1.5331920385360718, + "learning_rate": 7.331153027076361e-07, + "loss": 0.951, + "step": 12605 + }, + { + "epoch": 0.30370174128754124, + "grad_norm": 1.4614812135696411, + "learning_rate": 7.329885407159517e-07, + "loss": 0.9038, + "step": 12610 + }, + { + "epoch": 0.3038221622793285, + "grad_norm": 1.489661693572998, + "learning_rate": 7.328617787242672e-07, + "loss": 0.9479, + "step": 12615 + }, + { + "epoch": 0.30394258327111584, + "grad_norm": 1.7694226503372192, + "learning_rate": 7.327350167325829e-07, + "loss": 0.9877, + "step": 12620 + }, + { + "epoch": 0.3040630042629031, + "grad_norm": 1.6576279401779175, + "learning_rate": 7.326082547408985e-07, + "loss": 0.9251, + "step": 12625 + }, + { + "epoch": 0.3041834252546904, + "grad_norm": 1.6266634464263916, + "learning_rate": 7.324814927492141e-07, + "loss": 0.9477, + "step": 12630 + }, + { + "epoch": 0.30430384624647766, + "grad_norm": 1.8012726306915283, + "learning_rate": 7.323547307575296e-07, + "loss": 0.9414, + "step": 12635 + }, + { + "epoch": 0.304424267238265, + "grad_norm": 1.5876845121383667, + "learning_rate": 7.322279687658452e-07, + "loss": 0.8884, + "step": 12640 + }, + { + "epoch": 0.30454468823005226, + "grad_norm": 1.5976060628890991, + "learning_rate": 7.321012067741608e-07, + "loss": 0.9195, + "step": 12645 + }, + { + "epoch": 0.30466510922183954, + "grad_norm": 1.7384992837905884, + "learning_rate": 7.319744447824764e-07, + "loss": 0.8957, + "step": 12650 + }, + { + "epoch": 0.30478553021362687, + "grad_norm": 1.5005650520324707, + "learning_rate": 7.31847682790792e-07, + "loss": 0.8999, + "step": 12655 + }, + { + "epoch": 0.30490595120541414, + "grad_norm": 1.669329047203064, + "learning_rate": 7.317209207991075e-07, + "loss": 0.9812, + "step": 12660 + }, + { + "epoch": 0.3050263721972014, + "grad_norm": 2.4113380908966064, + "learning_rate": 7.315941588074231e-07, + "loss": 0.937, + "step": 12665 + }, + { + "epoch": 0.3051467931889887, + "grad_norm": 1.5560503005981445, + "learning_rate": 7.314673968157388e-07, + "loss": 0.9361, + "step": 12670 + }, + { + "epoch": 0.305267214180776, + "grad_norm": 1.586330533027649, + "learning_rate": 7.313406348240543e-07, + "loss": 0.8831, + "step": 12675 + }, + { + "epoch": 0.3053876351725633, + "grad_norm": 1.5432112216949463, + "learning_rate": 7.312138728323699e-07, + "loss": 0.9674, + "step": 12680 + }, + { + "epoch": 0.30550805616435056, + "grad_norm": 1.4705675840377808, + "learning_rate": 7.310871108406855e-07, + "loss": 0.9631, + "step": 12685 + }, + { + "epoch": 0.30562847715613783, + "grad_norm": 1.5707398653030396, + "learning_rate": 7.30960348849001e-07, + "loss": 0.9109, + "step": 12690 + }, + { + "epoch": 0.30574889814792516, + "grad_norm": 1.442728042602539, + "learning_rate": 7.308335868573167e-07, + "loss": 0.939, + "step": 12695 + }, + { + "epoch": 0.30586931913971244, + "grad_norm": 1.4560580253601074, + "learning_rate": 7.307068248656323e-07, + "loss": 0.9516, + "step": 12700 + }, + { + "epoch": 0.3059897401314997, + "grad_norm": 1.3931323289871216, + "learning_rate": 7.305800628739478e-07, + "loss": 0.9051, + "step": 12705 + }, + { + "epoch": 0.30611016112328704, + "grad_norm": 1.5336275100708008, + "learning_rate": 7.304533008822634e-07, + "loss": 0.9185, + "step": 12710 + }, + { + "epoch": 0.3062305821150743, + "grad_norm": 1.623144507408142, + "learning_rate": 7.303265388905791e-07, + "loss": 0.9126, + "step": 12715 + }, + { + "epoch": 0.3063510031068616, + "grad_norm": 1.3647394180297852, + "learning_rate": 7.301997768988946e-07, + "loss": 0.9169, + "step": 12720 + }, + { + "epoch": 0.30647142409864886, + "grad_norm": 1.5183027982711792, + "learning_rate": 7.300730149072101e-07, + "loss": 0.8957, + "step": 12725 + }, + { + "epoch": 0.3065918450904362, + "grad_norm": 1.7214516401290894, + "learning_rate": 7.299462529155258e-07, + "loss": 0.8836, + "step": 12730 + }, + { + "epoch": 0.30671226608222346, + "grad_norm": 1.7252259254455566, + "learning_rate": 7.298194909238413e-07, + "loss": 0.8943, + "step": 12735 + }, + { + "epoch": 0.30683268707401073, + "grad_norm": 1.7761805057525635, + "learning_rate": 7.29692728932157e-07, + "loss": 0.9436, + "step": 12740 + }, + { + "epoch": 0.306953108065798, + "grad_norm": 1.742750883102417, + "learning_rate": 7.295659669404726e-07, + "loss": 0.9304, + "step": 12745 + }, + { + "epoch": 0.30707352905758534, + "grad_norm": 1.8857698440551758, + "learning_rate": 7.29439204948788e-07, + "loss": 0.9366, + "step": 12750 + }, + { + "epoch": 0.3071939500493726, + "grad_norm": 1.5318527221679688, + "learning_rate": 7.293124429571037e-07, + "loss": 0.9186, + "step": 12755 + }, + { + "epoch": 0.3073143710411599, + "grad_norm": 1.5236737728118896, + "learning_rate": 7.291856809654193e-07, + "loss": 0.9085, + "step": 12760 + }, + { + "epoch": 0.30743479203294716, + "grad_norm": 1.7529305219650269, + "learning_rate": 7.290589189737349e-07, + "loss": 0.8909, + "step": 12765 + }, + { + "epoch": 0.3075552130247345, + "grad_norm": 1.4512908458709717, + "learning_rate": 7.289321569820504e-07, + "loss": 0.9263, + "step": 12770 + }, + { + "epoch": 0.30767563401652176, + "grad_norm": 1.7084128856658936, + "learning_rate": 7.288053949903661e-07, + "loss": 0.9624, + "step": 12775 + }, + { + "epoch": 0.30779605500830903, + "grad_norm": 1.4374417066574097, + "learning_rate": 7.286786329986816e-07, + "loss": 0.9476, + "step": 12780 + }, + { + "epoch": 0.30791647600009636, + "grad_norm": 1.4318782091140747, + "learning_rate": 7.285518710069972e-07, + "loss": 0.9191, + "step": 12785 + }, + { + "epoch": 0.30803689699188364, + "grad_norm": 1.5451619625091553, + "learning_rate": 7.284251090153129e-07, + "loss": 0.8945, + "step": 12790 + }, + { + "epoch": 0.3081573179836709, + "grad_norm": 1.6542329788208008, + "learning_rate": 7.282983470236283e-07, + "loss": 0.8807, + "step": 12795 + }, + { + "epoch": 0.3082777389754582, + "grad_norm": 1.6969656944274902, + "learning_rate": 7.28171585031944e-07, + "loss": 0.9141, + "step": 12800 + }, + { + "epoch": 0.3083981599672455, + "grad_norm": 1.467103123664856, + "learning_rate": 7.280448230402596e-07, + "loss": 0.9163, + "step": 12805 + }, + { + "epoch": 0.3085185809590328, + "grad_norm": 1.5035455226898193, + "learning_rate": 7.279180610485751e-07, + "loss": 0.9495, + "step": 12810 + }, + { + "epoch": 0.30863900195082006, + "grad_norm": 1.504870891571045, + "learning_rate": 7.277912990568908e-07, + "loss": 0.8743, + "step": 12815 + }, + { + "epoch": 0.30875942294260733, + "grad_norm": 1.5313233137130737, + "learning_rate": 7.276645370652063e-07, + "loss": 0.907, + "step": 12820 + }, + { + "epoch": 0.30887984393439466, + "grad_norm": 1.7336504459381104, + "learning_rate": 7.275377750735219e-07, + "loss": 0.9576, + "step": 12825 + }, + { + "epoch": 0.30900026492618193, + "grad_norm": 1.4466485977172852, + "learning_rate": 7.274110130818375e-07, + "loss": 0.932, + "step": 12830 + }, + { + "epoch": 0.3091206859179692, + "grad_norm": 1.5821729898452759, + "learning_rate": 7.272842510901532e-07, + "loss": 0.8947, + "step": 12835 + }, + { + "epoch": 0.30924110690975654, + "grad_norm": 1.4757659435272217, + "learning_rate": 7.271574890984686e-07, + "loss": 0.9697, + "step": 12840 + }, + { + "epoch": 0.3093615279015438, + "grad_norm": 1.3930412530899048, + "learning_rate": 7.270307271067842e-07, + "loss": 0.8832, + "step": 12845 + }, + { + "epoch": 0.3094819488933311, + "grad_norm": 1.5699188709259033, + "learning_rate": 7.269039651150999e-07, + "loss": 0.9627, + "step": 12850 + }, + { + "epoch": 0.30960236988511836, + "grad_norm": 1.5719423294067383, + "learning_rate": 7.267772031234154e-07, + "loss": 0.9564, + "step": 12855 + }, + { + "epoch": 0.3097227908769057, + "grad_norm": 1.3833011388778687, + "learning_rate": 7.266504411317311e-07, + "loss": 0.8952, + "step": 12860 + }, + { + "epoch": 0.30984321186869296, + "grad_norm": 1.3970744609832764, + "learning_rate": 7.265236791400466e-07, + "loss": 0.881, + "step": 12865 + }, + { + "epoch": 0.30996363286048023, + "grad_norm": 1.5236992835998535, + "learning_rate": 7.263969171483621e-07, + "loss": 0.9615, + "step": 12870 + }, + { + "epoch": 0.3100840538522675, + "grad_norm": 1.4951473474502563, + "learning_rate": 7.262701551566778e-07, + "loss": 0.9406, + "step": 12875 + }, + { + "epoch": 0.31020447484405483, + "grad_norm": 1.4671101570129395, + "learning_rate": 7.261433931649934e-07, + "loss": 0.9372, + "step": 12880 + }, + { + "epoch": 0.3103248958358421, + "grad_norm": 1.4430850744247437, + "learning_rate": 7.260166311733089e-07, + "loss": 0.9654, + "step": 12885 + }, + { + "epoch": 0.3104453168276294, + "grad_norm": 1.451224684715271, + "learning_rate": 7.258898691816245e-07, + "loss": 0.9165, + "step": 12890 + }, + { + "epoch": 0.31056573781941665, + "grad_norm": 1.6193621158599854, + "learning_rate": 7.257631071899402e-07, + "loss": 0.9557, + "step": 12895 + }, + { + "epoch": 0.310686158811204, + "grad_norm": 1.431778073310852, + "learning_rate": 7.256363451982557e-07, + "loss": 0.9045, + "step": 12900 + }, + { + "epoch": 0.31080657980299126, + "grad_norm": 1.5868239402770996, + "learning_rate": 7.255095832065713e-07, + "loss": 0.9615, + "step": 12905 + }, + { + "epoch": 0.31092700079477853, + "grad_norm": 1.6796597242355347, + "learning_rate": 7.253828212148869e-07, + "loss": 0.9218, + "step": 12910 + }, + { + "epoch": 0.31104742178656586, + "grad_norm": 1.586971402168274, + "learning_rate": 7.252560592232024e-07, + "loss": 0.9413, + "step": 12915 + }, + { + "epoch": 0.31116784277835313, + "grad_norm": 1.555291771888733, + "learning_rate": 7.251292972315181e-07, + "loss": 0.8944, + "step": 12920 + }, + { + "epoch": 0.3112882637701404, + "grad_norm": 1.624487042427063, + "learning_rate": 7.250025352398337e-07, + "loss": 0.9409, + "step": 12925 + }, + { + "epoch": 0.3114086847619277, + "grad_norm": 1.8175469636917114, + "learning_rate": 7.248757732481492e-07, + "loss": 0.8911, + "step": 12930 + }, + { + "epoch": 0.311529105753715, + "grad_norm": 1.4849705696105957, + "learning_rate": 7.247490112564648e-07, + "loss": 0.927, + "step": 12935 + }, + { + "epoch": 0.3116495267455023, + "grad_norm": 1.5028043985366821, + "learning_rate": 7.246222492647804e-07, + "loss": 0.9222, + "step": 12940 + }, + { + "epoch": 0.31176994773728955, + "grad_norm": 1.5761449337005615, + "learning_rate": 7.24495487273096e-07, + "loss": 0.9601, + "step": 12945 + }, + { + "epoch": 0.31189036872907683, + "grad_norm": 1.5496820211410522, + "learning_rate": 7.243687252814116e-07, + "loss": 0.9189, + "step": 12950 + }, + { + "epoch": 0.31201078972086416, + "grad_norm": 1.502267599105835, + "learning_rate": 7.242419632897271e-07, + "loss": 0.8692, + "step": 12955 + }, + { + "epoch": 0.31213121071265143, + "grad_norm": 1.561307668685913, + "learning_rate": 7.241152012980427e-07, + "loss": 0.9263, + "step": 12960 + }, + { + "epoch": 0.3122516317044387, + "grad_norm": 1.597617268562317, + "learning_rate": 7.239884393063583e-07, + "loss": 0.888, + "step": 12965 + }, + { + "epoch": 0.31237205269622603, + "grad_norm": 1.7717195749282837, + "learning_rate": 7.23861677314674e-07, + "loss": 0.893, + "step": 12970 + }, + { + "epoch": 0.3124924736880133, + "grad_norm": 1.3559247255325317, + "learning_rate": 7.237349153229895e-07, + "loss": 0.9217, + "step": 12975 + }, + { + "epoch": 0.3126128946798006, + "grad_norm": 1.5530009269714355, + "learning_rate": 7.236081533313051e-07, + "loss": 0.9354, + "step": 12980 + }, + { + "epoch": 0.31273331567158785, + "grad_norm": 1.5920103788375854, + "learning_rate": 7.234813913396207e-07, + "loss": 0.906, + "step": 12985 + }, + { + "epoch": 0.3128537366633752, + "grad_norm": 1.5397907495498657, + "learning_rate": 7.233546293479362e-07, + "loss": 0.8861, + "step": 12990 + }, + { + "epoch": 0.31297415765516245, + "grad_norm": 1.7374628782272339, + "learning_rate": 7.232278673562519e-07, + "loss": 0.9622, + "step": 12995 + }, + { + "epoch": 0.31309457864694973, + "grad_norm": 1.6311469078063965, + "learning_rate": 7.231011053645675e-07, + "loss": 0.9128, + "step": 13000 + }, + { + "epoch": 0.313214999638737, + "grad_norm": 1.3971755504608154, + "learning_rate": 7.22974343372883e-07, + "loss": 0.9522, + "step": 13005 + }, + { + "epoch": 0.31333542063052433, + "grad_norm": 1.5353749990463257, + "learning_rate": 7.228475813811986e-07, + "loss": 0.9066, + "step": 13010 + }, + { + "epoch": 0.3134558416223116, + "grad_norm": 1.4577460289001465, + "learning_rate": 7.227208193895143e-07, + "loss": 0.9182, + "step": 13015 + }, + { + "epoch": 0.3135762626140989, + "grad_norm": 1.6157535314559937, + "learning_rate": 7.225940573978298e-07, + "loss": 0.904, + "step": 13020 + }, + { + "epoch": 0.31369668360588615, + "grad_norm": 1.75568687915802, + "learning_rate": 7.224672954061453e-07, + "loss": 0.9106, + "step": 13025 + }, + { + "epoch": 0.3138171045976735, + "grad_norm": 1.5280760526657104, + "learning_rate": 7.22340533414461e-07, + "loss": 0.923, + "step": 13030 + }, + { + "epoch": 0.31393752558946075, + "grad_norm": 1.4735020399093628, + "learning_rate": 7.222137714227765e-07, + "loss": 0.914, + "step": 13035 + }, + { + "epoch": 0.314057946581248, + "grad_norm": 1.794739842414856, + "learning_rate": 7.220870094310922e-07, + "loss": 0.9401, + "step": 13040 + }, + { + "epoch": 0.31417836757303536, + "grad_norm": 1.6184120178222656, + "learning_rate": 7.219602474394078e-07, + "loss": 0.9148, + "step": 13045 + }, + { + "epoch": 0.31429878856482263, + "grad_norm": 1.521453857421875, + "learning_rate": 7.218334854477232e-07, + "loss": 0.9631, + "step": 13050 + }, + { + "epoch": 0.3144192095566099, + "grad_norm": 1.5748240947723389, + "learning_rate": 7.217067234560389e-07, + "loss": 0.933, + "step": 13055 + }, + { + "epoch": 0.3145396305483972, + "grad_norm": 1.590860366821289, + "learning_rate": 7.215799614643545e-07, + "loss": 0.8997, + "step": 13060 + }, + { + "epoch": 0.3146600515401845, + "grad_norm": 1.5484305620193481, + "learning_rate": 7.214531994726701e-07, + "loss": 0.9044, + "step": 13065 + }, + { + "epoch": 0.3147804725319718, + "grad_norm": 1.5679107904434204, + "learning_rate": 7.213264374809856e-07, + "loss": 0.8809, + "step": 13070 + }, + { + "epoch": 0.31490089352375905, + "grad_norm": 1.368302345275879, + "learning_rate": 7.211996754893012e-07, + "loss": 0.9278, + "step": 13075 + }, + { + "epoch": 0.3150213145155463, + "grad_norm": 1.55647873878479, + "learning_rate": 7.210729134976169e-07, + "loss": 0.9217, + "step": 13080 + }, + { + "epoch": 0.31514173550733365, + "grad_norm": 1.5681729316711426, + "learning_rate": 7.209461515059324e-07, + "loss": 0.9554, + "step": 13085 + }, + { + "epoch": 0.3152621564991209, + "grad_norm": 1.511460781097412, + "learning_rate": 7.208193895142481e-07, + "loss": 0.9247, + "step": 13090 + }, + { + "epoch": 0.3153825774909082, + "grad_norm": 1.5184860229492188, + "learning_rate": 7.206926275225636e-07, + "loss": 0.8765, + "step": 13095 + }, + { + "epoch": 0.31550299848269553, + "grad_norm": 1.7019352912902832, + "learning_rate": 7.205658655308792e-07, + "loss": 0.9122, + "step": 13100 + }, + { + "epoch": 0.3156234194744828, + "grad_norm": 1.5765482187271118, + "learning_rate": 7.204391035391948e-07, + "loss": 0.9133, + "step": 13105 + }, + { + "epoch": 0.3157438404662701, + "grad_norm": 1.5094279050827026, + "learning_rate": 7.203123415475104e-07, + "loss": 0.9337, + "step": 13110 + }, + { + "epoch": 0.31586426145805735, + "grad_norm": 1.4930437803268433, + "learning_rate": 7.20185579555826e-07, + "loss": 0.9392, + "step": 13115 + }, + { + "epoch": 0.3159846824498447, + "grad_norm": 1.6247256994247437, + "learning_rate": 7.200588175641415e-07, + "loss": 0.9644, + "step": 13120 + }, + { + "epoch": 0.31610510344163195, + "grad_norm": 2.296663522720337, + "learning_rate": 7.199320555724572e-07, + "loss": 0.9872, + "step": 13125 + }, + { + "epoch": 0.3162255244334192, + "grad_norm": 1.671225666999817, + "learning_rate": 7.198052935807727e-07, + "loss": 0.9924, + "step": 13130 + }, + { + "epoch": 0.3163459454252065, + "grad_norm": 1.6024426221847534, + "learning_rate": 7.196785315890883e-07, + "loss": 0.9553, + "step": 13135 + }, + { + "epoch": 0.3164663664169938, + "grad_norm": 1.5475425720214844, + "learning_rate": 7.195517695974039e-07, + "loss": 0.913, + "step": 13140 + }, + { + "epoch": 0.3165867874087811, + "grad_norm": 1.7463332414627075, + "learning_rate": 7.194250076057194e-07, + "loss": 0.9436, + "step": 13145 + }, + { + "epoch": 0.3167072084005684, + "grad_norm": 1.461932897567749, + "learning_rate": 7.192982456140351e-07, + "loss": 0.9176, + "step": 13150 + }, + { + "epoch": 0.3168276293923557, + "grad_norm": 1.4646632671356201, + "learning_rate": 7.191714836223507e-07, + "loss": 0.9003, + "step": 13155 + }, + { + "epoch": 0.316948050384143, + "grad_norm": 1.5874431133270264, + "learning_rate": 7.190447216306663e-07, + "loss": 0.9587, + "step": 13160 + }, + { + "epoch": 0.31706847137593025, + "grad_norm": 1.8062708377838135, + "learning_rate": 7.189179596389818e-07, + "loss": 0.921, + "step": 13165 + }, + { + "epoch": 0.3171888923677175, + "grad_norm": 1.5187337398529053, + "learning_rate": 7.187911976472974e-07, + "loss": 0.9286, + "step": 13170 + }, + { + "epoch": 0.31730931335950485, + "grad_norm": 1.4751627445220947, + "learning_rate": 7.18664435655613e-07, + "loss": 0.8909, + "step": 13175 + }, + { + "epoch": 0.3174297343512921, + "grad_norm": 1.4710909128189087, + "learning_rate": 7.185376736639286e-07, + "loss": 0.8811, + "step": 13180 + }, + { + "epoch": 0.3175501553430794, + "grad_norm": 1.7347071170806885, + "learning_rate": 7.184109116722443e-07, + "loss": 0.9272, + "step": 13185 + }, + { + "epoch": 0.31767057633486667, + "grad_norm": 1.5981749296188354, + "learning_rate": 7.182841496805597e-07, + "loss": 0.9722, + "step": 13190 + }, + { + "epoch": 0.317790997326654, + "grad_norm": 1.7842888832092285, + "learning_rate": 7.181573876888753e-07, + "loss": 0.9258, + "step": 13195 + }, + { + "epoch": 0.3179114183184413, + "grad_norm": 1.4398106336593628, + "learning_rate": 7.18030625697191e-07, + "loss": 0.8923, + "step": 13200 + }, + { + "epoch": 0.31803183931022855, + "grad_norm": 1.4745246171951294, + "learning_rate": 7.179038637055065e-07, + "loss": 0.9105, + "step": 13205 + }, + { + "epoch": 0.3181522603020158, + "grad_norm": 1.5541917085647583, + "learning_rate": 7.177771017138221e-07, + "loss": 0.8613, + "step": 13210 + }, + { + "epoch": 0.31827268129380315, + "grad_norm": 1.4687206745147705, + "learning_rate": 7.176503397221377e-07, + "loss": 0.9219, + "step": 13215 + }, + { + "epoch": 0.3183931022855904, + "grad_norm": 1.6779239177703857, + "learning_rate": 7.175235777304532e-07, + "loss": 0.8582, + "step": 13220 + }, + { + "epoch": 0.3185135232773777, + "grad_norm": 1.4923917055130005, + "learning_rate": 7.173968157387689e-07, + "loss": 0.9351, + "step": 13225 + }, + { + "epoch": 0.318633944269165, + "grad_norm": 1.669708490371704, + "learning_rate": 7.172700537470845e-07, + "loss": 0.973, + "step": 13230 + }, + { + "epoch": 0.3187543652609523, + "grad_norm": 1.9078813791275024, + "learning_rate": 7.171432917554e-07, + "loss": 0.9391, + "step": 13235 + }, + { + "epoch": 0.3188747862527396, + "grad_norm": 1.4691177606582642, + "learning_rate": 7.170165297637156e-07, + "loss": 0.9232, + "step": 13240 + }, + { + "epoch": 0.31899520724452685, + "grad_norm": 1.5904854536056519, + "learning_rate": 7.168897677720313e-07, + "loss": 0.9105, + "step": 13245 + }, + { + "epoch": 0.3191156282363142, + "grad_norm": 1.7188918590545654, + "learning_rate": 7.167630057803468e-07, + "loss": 0.884, + "step": 13250 + }, + { + "epoch": 0.31923604922810145, + "grad_norm": 1.4673360586166382, + "learning_rate": 7.166362437886623e-07, + "loss": 0.9632, + "step": 13255 + }, + { + "epoch": 0.3193564702198887, + "grad_norm": 1.5699816942214966, + "learning_rate": 7.16509481796978e-07, + "loss": 0.9771, + "step": 13260 + }, + { + "epoch": 0.319476891211676, + "grad_norm": 1.598327398300171, + "learning_rate": 7.163827198052935e-07, + "loss": 0.9507, + "step": 13265 + }, + { + "epoch": 0.3195973122034633, + "grad_norm": 1.6673810482025146, + "learning_rate": 7.162559578136092e-07, + "loss": 0.9501, + "step": 13270 + }, + { + "epoch": 0.3197177331952506, + "grad_norm": 1.5441638231277466, + "learning_rate": 7.161291958219248e-07, + "loss": 0.9359, + "step": 13275 + }, + { + "epoch": 0.31983815418703787, + "grad_norm": 1.3806934356689453, + "learning_rate": 7.160024338302402e-07, + "loss": 0.8852, + "step": 13280 + }, + { + "epoch": 0.3199585751788252, + "grad_norm": 1.5654473304748535, + "learning_rate": 7.158756718385559e-07, + "loss": 0.9018, + "step": 13285 + }, + { + "epoch": 0.3200789961706125, + "grad_norm": 1.5995250940322876, + "learning_rate": 7.157489098468715e-07, + "loss": 0.9273, + "step": 13290 + }, + { + "epoch": 0.32019941716239975, + "grad_norm": 1.6036864519119263, + "learning_rate": 7.156221478551871e-07, + "loss": 0.9197, + "step": 13295 + }, + { + "epoch": 0.320319838154187, + "grad_norm": 1.4564919471740723, + "learning_rate": 7.154953858635027e-07, + "loss": 0.9494, + "step": 13300 + }, + { + "epoch": 0.32044025914597435, + "grad_norm": 1.441354513168335, + "learning_rate": 7.153686238718183e-07, + "loss": 0.9084, + "step": 13305 + }, + { + "epoch": 0.3205606801377616, + "grad_norm": 1.6930876970291138, + "learning_rate": 7.152418618801338e-07, + "loss": 0.9256, + "step": 13310 + }, + { + "epoch": 0.3206811011295489, + "grad_norm": 1.5789000988006592, + "learning_rate": 7.151150998884494e-07, + "loss": 0.918, + "step": 13315 + }, + { + "epoch": 0.32080152212133617, + "grad_norm": 1.6201459169387817, + "learning_rate": 7.149883378967651e-07, + "loss": 0.8801, + "step": 13320 + }, + { + "epoch": 0.3209219431131235, + "grad_norm": 1.62224543094635, + "learning_rate": 7.148615759050805e-07, + "loss": 0.8973, + "step": 13325 + }, + { + "epoch": 0.32104236410491077, + "grad_norm": 1.5501794815063477, + "learning_rate": 7.147348139133962e-07, + "loss": 0.9312, + "step": 13330 + }, + { + "epoch": 0.32116278509669804, + "grad_norm": 1.4192733764648438, + "learning_rate": 7.146080519217118e-07, + "loss": 0.9125, + "step": 13335 + }, + { + "epoch": 0.3212832060884853, + "grad_norm": 1.6199774742126465, + "learning_rate": 7.144812899300273e-07, + "loss": 0.9498, + "step": 13340 + }, + { + "epoch": 0.32140362708027265, + "grad_norm": 1.7698010206222534, + "learning_rate": 7.14354527938343e-07, + "loss": 0.9673, + "step": 13345 + }, + { + "epoch": 0.3215240480720599, + "grad_norm": 1.632602572441101, + "learning_rate": 7.142277659466585e-07, + "loss": 0.8909, + "step": 13350 + }, + { + "epoch": 0.3216444690638472, + "grad_norm": 1.4607270956039429, + "learning_rate": 7.141010039549741e-07, + "loss": 0.8579, + "step": 13355 + }, + { + "epoch": 0.3217648900556345, + "grad_norm": 1.6992939710617065, + "learning_rate": 7.139742419632897e-07, + "loss": 0.9084, + "step": 13360 + }, + { + "epoch": 0.3218853110474218, + "grad_norm": 1.440292239189148, + "learning_rate": 7.138474799716054e-07, + "loss": 0.8978, + "step": 13365 + }, + { + "epoch": 0.32200573203920907, + "grad_norm": 1.411969542503357, + "learning_rate": 7.137207179799208e-07, + "loss": 0.9203, + "step": 13370 + }, + { + "epoch": 0.32212615303099634, + "grad_norm": 1.7011206150054932, + "learning_rate": 7.135939559882364e-07, + "loss": 0.9135, + "step": 13375 + }, + { + "epoch": 0.32224657402278367, + "grad_norm": 1.639039158821106, + "learning_rate": 7.134671939965521e-07, + "loss": 0.9575, + "step": 13380 + }, + { + "epoch": 0.32236699501457095, + "grad_norm": 1.7481632232666016, + "learning_rate": 7.133404320048676e-07, + "loss": 0.9525, + "step": 13385 + }, + { + "epoch": 0.3224874160063582, + "grad_norm": 1.509905219078064, + "learning_rate": 7.132136700131833e-07, + "loss": 0.9806, + "step": 13390 + }, + { + "epoch": 0.3226078369981455, + "grad_norm": 1.5179061889648438, + "learning_rate": 7.130869080214988e-07, + "loss": 0.9796, + "step": 13395 + }, + { + "epoch": 0.3227282579899328, + "grad_norm": 1.6003575325012207, + "learning_rate": 7.129601460298143e-07, + "loss": 0.9339, + "step": 13400 + }, + { + "epoch": 0.3228486789817201, + "grad_norm": 1.5749802589416504, + "learning_rate": 7.1283338403813e-07, + "loss": 0.9196, + "step": 13405 + }, + { + "epoch": 0.32296909997350737, + "grad_norm": 1.4835052490234375, + "learning_rate": 7.127066220464456e-07, + "loss": 0.9415, + "step": 13410 + }, + { + "epoch": 0.3230895209652947, + "grad_norm": 1.6921086311340332, + "learning_rate": 7.125798600547611e-07, + "loss": 0.8908, + "step": 13415 + }, + { + "epoch": 0.32320994195708197, + "grad_norm": 1.3966692686080933, + "learning_rate": 7.124530980630767e-07, + "loss": 0.9145, + "step": 13420 + }, + { + "epoch": 0.32333036294886924, + "grad_norm": 1.4754492044448853, + "learning_rate": 7.123263360713924e-07, + "loss": 0.898, + "step": 13425 + }, + { + "epoch": 0.3234507839406565, + "grad_norm": 1.5174185037612915, + "learning_rate": 7.121995740797079e-07, + "loss": 0.8697, + "step": 13430 + }, + { + "epoch": 0.32357120493244385, + "grad_norm": 1.631089210510254, + "learning_rate": 7.120728120880235e-07, + "loss": 0.9433, + "step": 13435 + }, + { + "epoch": 0.3236916259242311, + "grad_norm": 1.4840291738510132, + "learning_rate": 7.119460500963391e-07, + "loss": 0.932, + "step": 13440 + }, + { + "epoch": 0.3238120469160184, + "grad_norm": 1.6019710302352905, + "learning_rate": 7.118192881046546e-07, + "loss": 0.9055, + "step": 13445 + }, + { + "epoch": 0.32393246790780567, + "grad_norm": 1.71696937084198, + "learning_rate": 7.116925261129703e-07, + "loss": 0.9434, + "step": 13450 + }, + { + "epoch": 0.324052888899593, + "grad_norm": 1.519882082939148, + "learning_rate": 7.115657641212859e-07, + "loss": 0.9167, + "step": 13455 + }, + { + "epoch": 0.32417330989138027, + "grad_norm": 1.4360700845718384, + "learning_rate": 7.114390021296014e-07, + "loss": 0.9351, + "step": 13460 + }, + { + "epoch": 0.32429373088316754, + "grad_norm": 1.5094038248062134, + "learning_rate": 7.11312240137917e-07, + "loss": 0.9051, + "step": 13465 + }, + { + "epoch": 0.3244141518749548, + "grad_norm": 1.4464014768600464, + "learning_rate": 7.111854781462326e-07, + "loss": 0.9252, + "step": 13470 + }, + { + "epoch": 0.32453457286674214, + "grad_norm": 1.5232619047164917, + "learning_rate": 7.110587161545482e-07, + "loss": 0.8695, + "step": 13475 + }, + { + "epoch": 0.3246549938585294, + "grad_norm": 1.4510326385498047, + "learning_rate": 7.109319541628638e-07, + "loss": 0.9222, + "step": 13480 + }, + { + "epoch": 0.3247754148503167, + "grad_norm": 1.5552394390106201, + "learning_rate": 7.108051921711793e-07, + "loss": 0.8811, + "step": 13485 + }, + { + "epoch": 0.324895835842104, + "grad_norm": 1.6308211088180542, + "learning_rate": 7.106784301794949e-07, + "loss": 0.8948, + "step": 13490 + }, + { + "epoch": 0.3250162568338913, + "grad_norm": 1.4746640920639038, + "learning_rate": 7.105516681878105e-07, + "loss": 0.9311, + "step": 13495 + }, + { + "epoch": 0.32513667782567857, + "grad_norm": 1.5032947063446045, + "learning_rate": 7.104249061961262e-07, + "loss": 0.9029, + "step": 13500 + }, + { + "epoch": 0.32525709881746584, + "grad_norm": 1.6702075004577637, + "learning_rate": 7.102981442044417e-07, + "loss": 0.9339, + "step": 13505 + }, + { + "epoch": 0.32537751980925317, + "grad_norm": 1.8243368864059448, + "learning_rate": 7.101713822127573e-07, + "loss": 0.9171, + "step": 13510 + }, + { + "epoch": 0.32549794080104044, + "grad_norm": 1.6092954874038696, + "learning_rate": 7.100446202210729e-07, + "loss": 0.9102, + "step": 13515 + }, + { + "epoch": 0.3256183617928277, + "grad_norm": 1.7775665521621704, + "learning_rate": 7.099178582293884e-07, + "loss": 0.9326, + "step": 13520 + }, + { + "epoch": 0.325738782784615, + "grad_norm": 1.5000969171524048, + "learning_rate": 7.097910962377041e-07, + "loss": 0.8927, + "step": 13525 + }, + { + "epoch": 0.3258592037764023, + "grad_norm": 2.0762746334075928, + "learning_rate": 7.096643342460197e-07, + "loss": 0.8871, + "step": 13530 + }, + { + "epoch": 0.3259796247681896, + "grad_norm": 1.554837942123413, + "learning_rate": 7.095375722543352e-07, + "loss": 0.9493, + "step": 13535 + }, + { + "epoch": 0.32610004575997686, + "grad_norm": 1.3311814069747925, + "learning_rate": 7.094108102626508e-07, + "loss": 0.9113, + "step": 13540 + }, + { + "epoch": 0.3262204667517642, + "grad_norm": 1.598663568496704, + "learning_rate": 7.092840482709665e-07, + "loss": 0.9669, + "step": 13545 + }, + { + "epoch": 0.32634088774355147, + "grad_norm": 1.6727991104125977, + "learning_rate": 7.09157286279282e-07, + "loss": 0.8797, + "step": 13550 + }, + { + "epoch": 0.32646130873533874, + "grad_norm": 1.5206135511398315, + "learning_rate": 7.090305242875975e-07, + "loss": 0.9103, + "step": 13555 + }, + { + "epoch": 0.326581729727126, + "grad_norm": 1.4802076816558838, + "learning_rate": 7.089037622959132e-07, + "loss": 0.9366, + "step": 13560 + }, + { + "epoch": 0.32670215071891334, + "grad_norm": 1.6493333578109741, + "learning_rate": 7.087770003042287e-07, + "loss": 0.8865, + "step": 13565 + }, + { + "epoch": 0.3268225717107006, + "grad_norm": 1.5122278928756714, + "learning_rate": 7.086502383125444e-07, + "loss": 0.9311, + "step": 13570 + }, + { + "epoch": 0.3269429927024879, + "grad_norm": 1.5008400678634644, + "learning_rate": 7.0852347632086e-07, + "loss": 0.9301, + "step": 13575 + }, + { + "epoch": 0.32706341369427516, + "grad_norm": 1.6388431787490845, + "learning_rate": 7.083967143291754e-07, + "loss": 0.9092, + "step": 13580 + }, + { + "epoch": 0.3271838346860625, + "grad_norm": 1.8835076093673706, + "learning_rate": 7.082699523374911e-07, + "loss": 0.8953, + "step": 13585 + }, + { + "epoch": 0.32730425567784976, + "grad_norm": 1.381044864654541, + "learning_rate": 7.081431903458067e-07, + "loss": 0.9351, + "step": 13590 + }, + { + "epoch": 0.32742467666963704, + "grad_norm": 1.7384233474731445, + "learning_rate": 7.080164283541223e-07, + "loss": 0.8984, + "step": 13595 + }, + { + "epoch": 0.32754509766142437, + "grad_norm": 1.2871006727218628, + "learning_rate": 7.078896663624378e-07, + "loss": 0.8785, + "step": 13600 + }, + { + "epoch": 0.32766551865321164, + "grad_norm": 1.4323610067367554, + "learning_rate": 7.077629043707534e-07, + "loss": 0.9649, + "step": 13605 + }, + { + "epoch": 0.3277859396449989, + "grad_norm": 1.8085834980010986, + "learning_rate": 7.07636142379069e-07, + "loss": 0.956, + "step": 13610 + }, + { + "epoch": 0.3279063606367862, + "grad_norm": 1.494591236114502, + "learning_rate": 7.075093803873846e-07, + "loss": 0.8899, + "step": 13615 + }, + { + "epoch": 0.3280267816285735, + "grad_norm": 1.4227632284164429, + "learning_rate": 7.073826183957003e-07, + "loss": 0.948, + "step": 13620 + }, + { + "epoch": 0.3281472026203608, + "grad_norm": 1.6277165412902832, + "learning_rate": 7.072558564040157e-07, + "loss": 0.9471, + "step": 13625 + }, + { + "epoch": 0.32826762361214806, + "grad_norm": 1.4908299446105957, + "learning_rate": 7.071290944123314e-07, + "loss": 0.9433, + "step": 13630 + }, + { + "epoch": 0.32838804460393534, + "grad_norm": 1.476320505142212, + "learning_rate": 7.07002332420647e-07, + "loss": 0.9705, + "step": 13635 + }, + { + "epoch": 0.32850846559572267, + "grad_norm": 1.515441656112671, + "learning_rate": 7.068755704289625e-07, + "loss": 0.9863, + "step": 13640 + }, + { + "epoch": 0.32862888658750994, + "grad_norm": 1.490149736404419, + "learning_rate": 7.067488084372782e-07, + "loss": 0.911, + "step": 13645 + }, + { + "epoch": 0.3287493075792972, + "grad_norm": 1.5658856630325317, + "learning_rate": 7.066220464455937e-07, + "loss": 0.942, + "step": 13650 + }, + { + "epoch": 0.3288697285710845, + "grad_norm": 1.8892005681991577, + "learning_rate": 7.064952844539093e-07, + "loss": 0.9446, + "step": 13655 + }, + { + "epoch": 0.3289901495628718, + "grad_norm": 1.5404114723205566, + "learning_rate": 7.063685224622249e-07, + "loss": 0.926, + "step": 13660 + }, + { + "epoch": 0.3291105705546591, + "grad_norm": 1.7159450054168701, + "learning_rate": 7.062417604705405e-07, + "loss": 0.9088, + "step": 13665 + }, + { + "epoch": 0.32923099154644636, + "grad_norm": 1.5221284627914429, + "learning_rate": 7.06114998478856e-07, + "loss": 0.9194, + "step": 13670 + }, + { + "epoch": 0.3293514125382337, + "grad_norm": 1.5362894535064697, + "learning_rate": 7.059882364871716e-07, + "loss": 0.9113, + "step": 13675 + }, + { + "epoch": 0.32947183353002096, + "grad_norm": 1.454923391342163, + "learning_rate": 7.058614744954873e-07, + "loss": 0.8754, + "step": 13680 + }, + { + "epoch": 0.32959225452180824, + "grad_norm": 1.5516047477722168, + "learning_rate": 7.057347125038028e-07, + "loss": 0.914, + "step": 13685 + }, + { + "epoch": 0.3297126755135955, + "grad_norm": 1.3942458629608154, + "learning_rate": 7.056079505121185e-07, + "loss": 0.9446, + "step": 13690 + }, + { + "epoch": 0.32983309650538284, + "grad_norm": 1.4205833673477173, + "learning_rate": 7.05481188520434e-07, + "loss": 0.9282, + "step": 13695 + }, + { + "epoch": 0.3299535174971701, + "grad_norm": 1.4916503429412842, + "learning_rate": 7.053544265287495e-07, + "loss": 0.8933, + "step": 13700 + }, + { + "epoch": 0.3300739384889574, + "grad_norm": 1.4028732776641846, + "learning_rate": 7.052276645370652e-07, + "loss": 0.921, + "step": 13705 + }, + { + "epoch": 0.33019435948074466, + "grad_norm": 1.5253241062164307, + "learning_rate": 7.051009025453808e-07, + "loss": 0.942, + "step": 13710 + }, + { + "epoch": 0.330314780472532, + "grad_norm": 1.5972962379455566, + "learning_rate": 7.049741405536963e-07, + "loss": 0.9471, + "step": 13715 + }, + { + "epoch": 0.33043520146431926, + "grad_norm": 1.4718220233917236, + "learning_rate": 7.048473785620119e-07, + "loss": 0.9449, + "step": 13720 + }, + { + "epoch": 0.33055562245610653, + "grad_norm": 1.5282396078109741, + "learning_rate": 7.047206165703275e-07, + "loss": 0.9375, + "step": 13725 + }, + { + "epoch": 0.33067604344789386, + "grad_norm": 1.509559988975525, + "learning_rate": 7.045938545786431e-07, + "loss": 0.9044, + "step": 13730 + }, + { + "epoch": 0.33079646443968114, + "grad_norm": 1.661907434463501, + "learning_rate": 7.044670925869587e-07, + "loss": 0.9749, + "step": 13735 + }, + { + "epoch": 0.3309168854314684, + "grad_norm": 1.8139874935150146, + "learning_rate": 7.043403305952743e-07, + "loss": 0.8787, + "step": 13740 + }, + { + "epoch": 0.3310373064232557, + "grad_norm": 1.4727627038955688, + "learning_rate": 7.042135686035898e-07, + "loss": 0.8722, + "step": 13745 + }, + { + "epoch": 0.331157727415043, + "grad_norm": 1.3403947353363037, + "learning_rate": 7.040868066119054e-07, + "loss": 0.9231, + "step": 13750 + }, + { + "epoch": 0.3312781484068303, + "grad_norm": 1.4505927562713623, + "learning_rate": 7.039600446202211e-07, + "loss": 0.8517, + "step": 13755 + }, + { + "epoch": 0.33139856939861756, + "grad_norm": 1.5560884475708008, + "learning_rate": 7.038332826285366e-07, + "loss": 0.9006, + "step": 13760 + }, + { + "epoch": 0.33151899039040483, + "grad_norm": 1.6754164695739746, + "learning_rate": 7.037065206368522e-07, + "loss": 0.9583, + "step": 13765 + }, + { + "epoch": 0.33163941138219216, + "grad_norm": 1.5612949132919312, + "learning_rate": 7.035797586451678e-07, + "loss": 0.9091, + "step": 13770 + }, + { + "epoch": 0.33175983237397944, + "grad_norm": 1.49845290184021, + "learning_rate": 7.034529966534834e-07, + "loss": 0.9324, + "step": 13775 + }, + { + "epoch": 0.3318802533657667, + "grad_norm": 1.5018976926803589, + "learning_rate": 7.03326234661799e-07, + "loss": 0.9384, + "step": 13780 + }, + { + "epoch": 0.332000674357554, + "grad_norm": 1.5326520204544067, + "learning_rate": 7.031994726701145e-07, + "loss": 0.8921, + "step": 13785 + }, + { + "epoch": 0.3321210953493413, + "grad_norm": 1.4276926517486572, + "learning_rate": 7.030727106784301e-07, + "loss": 0.9469, + "step": 13790 + }, + { + "epoch": 0.3322415163411286, + "grad_norm": 1.6255600452423096, + "learning_rate": 7.029459486867457e-07, + "loss": 0.9313, + "step": 13795 + }, + { + "epoch": 0.33236193733291586, + "grad_norm": 1.544748067855835, + "learning_rate": 7.028191866950614e-07, + "loss": 0.9412, + "step": 13800 + }, + { + "epoch": 0.3324823583247032, + "grad_norm": 1.56434965133667, + "learning_rate": 7.026924247033769e-07, + "loss": 0.9771, + "step": 13805 + }, + { + "epoch": 0.33260277931649046, + "grad_norm": 1.5131580829620361, + "learning_rate": 7.025656627116924e-07, + "loss": 0.91, + "step": 13810 + }, + { + "epoch": 0.33272320030827773, + "grad_norm": 1.5495069026947021, + "learning_rate": 7.024389007200081e-07, + "loss": 0.921, + "step": 13815 + }, + { + "epoch": 0.332843621300065, + "grad_norm": 1.6015516519546509, + "learning_rate": 7.023121387283236e-07, + "loss": 0.8956, + "step": 13820 + }, + { + "epoch": 0.33296404229185234, + "grad_norm": 1.6354554891586304, + "learning_rate": 7.021853767366393e-07, + "loss": 0.9254, + "step": 13825 + }, + { + "epoch": 0.3330844632836396, + "grad_norm": 1.6220519542694092, + "learning_rate": 7.020586147449549e-07, + "loss": 0.9387, + "step": 13830 + }, + { + "epoch": 0.3332048842754269, + "grad_norm": 1.7443767786026, + "learning_rate": 7.019318527532704e-07, + "loss": 0.9206, + "step": 13835 + }, + { + "epoch": 0.33332530526721416, + "grad_norm": 1.4180527925491333, + "learning_rate": 7.01805090761586e-07, + "loss": 0.9414, + "step": 13840 + }, + { + "epoch": 0.3334457262590015, + "grad_norm": 1.4445128440856934, + "learning_rate": 7.016783287699016e-07, + "loss": 0.9222, + "step": 13845 + }, + { + "epoch": 0.33356614725078876, + "grad_norm": 1.522389531135559, + "learning_rate": 7.015515667782172e-07, + "loss": 0.9301, + "step": 13850 + }, + { + "epoch": 0.33368656824257603, + "grad_norm": 1.559030294418335, + "learning_rate": 7.014248047865327e-07, + "loss": 0.9337, + "step": 13855 + }, + { + "epoch": 0.33380698923436336, + "grad_norm": 1.6620439291000366, + "learning_rate": 7.012980427948484e-07, + "loss": 0.8898, + "step": 13860 + }, + { + "epoch": 0.33392741022615063, + "grad_norm": 1.3690235614776611, + "learning_rate": 7.011712808031639e-07, + "loss": 0.939, + "step": 13865 + }, + { + "epoch": 0.3340478312179379, + "grad_norm": 1.57186758518219, + "learning_rate": 7.010445188114795e-07, + "loss": 0.9139, + "step": 13870 + }, + { + "epoch": 0.3341682522097252, + "grad_norm": 1.8457096815109253, + "learning_rate": 7.009177568197952e-07, + "loss": 0.9582, + "step": 13875 + }, + { + "epoch": 0.3342886732015125, + "grad_norm": 1.5629444122314453, + "learning_rate": 7.007909948281106e-07, + "loss": 0.9404, + "step": 13880 + }, + { + "epoch": 0.3344090941932998, + "grad_norm": 1.7172342538833618, + "learning_rate": 7.006642328364263e-07, + "loss": 0.8399, + "step": 13885 + }, + { + "epoch": 0.33452951518508706, + "grad_norm": 1.6273506879806519, + "learning_rate": 7.005374708447419e-07, + "loss": 0.8748, + "step": 13890 + }, + { + "epoch": 0.33464993617687433, + "grad_norm": 1.6080079078674316, + "learning_rate": 7.004107088530575e-07, + "loss": 0.9539, + "step": 13895 + }, + { + "epoch": 0.33477035716866166, + "grad_norm": 1.5368893146514893, + "learning_rate": 7.00283946861373e-07, + "loss": 0.9064, + "step": 13900 + }, + { + "epoch": 0.33489077816044893, + "grad_norm": 1.4585624933242798, + "learning_rate": 7.001571848696886e-07, + "loss": 0.9208, + "step": 13905 + }, + { + "epoch": 0.3350111991522362, + "grad_norm": 1.532059669494629, + "learning_rate": 7.000304228780042e-07, + "loss": 0.9143, + "step": 13910 + }, + { + "epoch": 0.3351316201440235, + "grad_norm": 1.5856126546859741, + "learning_rate": 6.999036608863198e-07, + "loss": 0.9381, + "step": 13915 + }, + { + "epoch": 0.3352520411358108, + "grad_norm": 1.5512603521347046, + "learning_rate": 6.997768988946355e-07, + "loss": 0.9217, + "step": 13920 + }, + { + "epoch": 0.3353724621275981, + "grad_norm": 1.4288643598556519, + "learning_rate": 6.996501369029509e-07, + "loss": 0.881, + "step": 13925 + }, + { + "epoch": 0.33549288311938535, + "grad_norm": 1.4540021419525146, + "learning_rate": 6.995233749112665e-07, + "loss": 0.893, + "step": 13930 + }, + { + "epoch": 0.3356133041111727, + "grad_norm": 1.457465410232544, + "learning_rate": 6.993966129195822e-07, + "loss": 0.9204, + "step": 13935 + }, + { + "epoch": 0.33573372510295996, + "grad_norm": 1.5519061088562012, + "learning_rate": 6.992698509278977e-07, + "loss": 0.9038, + "step": 13940 + }, + { + "epoch": 0.33585414609474723, + "grad_norm": 1.4745047092437744, + "learning_rate": 6.991430889362134e-07, + "loss": 0.9281, + "step": 13945 + }, + { + "epoch": 0.3359745670865345, + "grad_norm": 1.4566423892974854, + "learning_rate": 6.990163269445289e-07, + "loss": 0.8641, + "step": 13950 + }, + { + "epoch": 0.33609498807832183, + "grad_norm": 1.702518105506897, + "learning_rate": 6.988895649528444e-07, + "loss": 0.9277, + "step": 13955 + }, + { + "epoch": 0.3362154090701091, + "grad_norm": 1.5833892822265625, + "learning_rate": 6.987628029611601e-07, + "loss": 0.8841, + "step": 13960 + }, + { + "epoch": 0.3363358300618964, + "grad_norm": 1.4968024492263794, + "learning_rate": 6.986360409694757e-07, + "loss": 0.9054, + "step": 13965 + }, + { + "epoch": 0.33645625105368365, + "grad_norm": 1.5771639347076416, + "learning_rate": 6.985092789777912e-07, + "loss": 0.9448, + "step": 13970 + }, + { + "epoch": 0.336576672045471, + "grad_norm": 1.413447618484497, + "learning_rate": 6.983825169861068e-07, + "loss": 0.8788, + "step": 13975 + }, + { + "epoch": 0.33669709303725825, + "grad_norm": 1.5518314838409424, + "learning_rate": 6.982557549944225e-07, + "loss": 0.8855, + "step": 13980 + }, + { + "epoch": 0.33681751402904553, + "grad_norm": 1.6350971460342407, + "learning_rate": 6.98128993002738e-07, + "loss": 0.9442, + "step": 13985 + }, + { + "epoch": 0.33693793502083286, + "grad_norm": 1.3646329641342163, + "learning_rate": 6.980022310110536e-07, + "loss": 0.912, + "step": 13990 + }, + { + "epoch": 0.33705835601262013, + "grad_norm": 1.8672513961791992, + "learning_rate": 6.978754690193692e-07, + "loss": 0.9439, + "step": 13995 + }, + { + "epoch": 0.3371787770044074, + "grad_norm": 1.4845424890518188, + "learning_rate": 6.977487070276847e-07, + "loss": 0.9131, + "step": 14000 + }, + { + "epoch": 0.3372991979961947, + "grad_norm": 1.4043781757354736, + "learning_rate": 6.976219450360004e-07, + "loss": 0.9178, + "step": 14005 + }, + { + "epoch": 0.337419618987982, + "grad_norm": 1.4757076501846313, + "learning_rate": 6.97495183044316e-07, + "loss": 0.925, + "step": 14010 + }, + { + "epoch": 0.3375400399797693, + "grad_norm": 1.5454431772232056, + "learning_rate": 6.973684210526314e-07, + "loss": 0.9657, + "step": 14015 + }, + { + "epoch": 0.33766046097155655, + "grad_norm": 1.502697229385376, + "learning_rate": 6.972416590609471e-07, + "loss": 0.9258, + "step": 14020 + }, + { + "epoch": 0.3377808819633438, + "grad_norm": 1.6740292310714722, + "learning_rate": 6.971148970692627e-07, + "loss": 0.9542, + "step": 14025 + }, + { + "epoch": 0.33790130295513116, + "grad_norm": 1.615235447883606, + "learning_rate": 6.969881350775783e-07, + "loss": 0.9191, + "step": 14030 + }, + { + "epoch": 0.33802172394691843, + "grad_norm": 1.4535822868347168, + "learning_rate": 6.968613730858939e-07, + "loss": 0.9537, + "step": 14035 + }, + { + "epoch": 0.3381421449387057, + "grad_norm": 1.4042657613754272, + "learning_rate": 6.967346110942095e-07, + "loss": 0.9199, + "step": 14040 + }, + { + "epoch": 0.33826256593049303, + "grad_norm": 1.8075425624847412, + "learning_rate": 6.96607849102525e-07, + "loss": 0.924, + "step": 14045 + }, + { + "epoch": 0.3383829869222803, + "grad_norm": 1.711000680923462, + "learning_rate": 6.964810871108406e-07, + "loss": 0.9265, + "step": 14050 + }, + { + "epoch": 0.3385034079140676, + "grad_norm": 1.5463098287582397, + "learning_rate": 6.963543251191563e-07, + "loss": 0.9224, + "step": 14055 + }, + { + "epoch": 0.33862382890585485, + "grad_norm": 1.6100355386734009, + "learning_rate": 6.962275631274718e-07, + "loss": 0.9438, + "step": 14060 + }, + { + "epoch": 0.3387442498976422, + "grad_norm": 1.7484734058380127, + "learning_rate": 6.961008011357874e-07, + "loss": 0.9099, + "step": 14065 + }, + { + "epoch": 0.33886467088942945, + "grad_norm": 1.6006505489349365, + "learning_rate": 6.95974039144103e-07, + "loss": 0.9214, + "step": 14070 + }, + { + "epoch": 0.3389850918812167, + "grad_norm": 1.4968926906585693, + "learning_rate": 6.958472771524185e-07, + "loss": 0.942, + "step": 14075 + }, + { + "epoch": 0.339105512873004, + "grad_norm": 1.4391860961914062, + "learning_rate": 6.957205151607342e-07, + "loss": 0.9237, + "step": 14080 + }, + { + "epoch": 0.33922593386479133, + "grad_norm": 1.981737732887268, + "learning_rate": 6.955937531690497e-07, + "loss": 0.9219, + "step": 14085 + }, + { + "epoch": 0.3393463548565786, + "grad_norm": 2.0144782066345215, + "learning_rate": 6.954669911773653e-07, + "loss": 0.953, + "step": 14090 + }, + { + "epoch": 0.3394667758483659, + "grad_norm": 1.4978262186050415, + "learning_rate": 6.953402291856809e-07, + "loss": 0.8834, + "step": 14095 + }, + { + "epoch": 0.33958719684015315, + "grad_norm": 1.5561473369598389, + "learning_rate": 6.952134671939966e-07, + "loss": 0.8911, + "step": 14100 + }, + { + "epoch": 0.3397076178319405, + "grad_norm": 1.5292071104049683, + "learning_rate": 6.950867052023122e-07, + "loss": 0.9516, + "step": 14105 + }, + { + "epoch": 0.33982803882372775, + "grad_norm": 1.6524728536605835, + "learning_rate": 6.949599432106276e-07, + "loss": 0.9669, + "step": 14110 + }, + { + "epoch": 0.339948459815515, + "grad_norm": 1.861893653869629, + "learning_rate": 6.948331812189433e-07, + "loss": 0.8984, + "step": 14115 + }, + { + "epoch": 0.34006888080730235, + "grad_norm": 1.8193475008010864, + "learning_rate": 6.947064192272589e-07, + "loss": 0.9429, + "step": 14120 + }, + { + "epoch": 0.3401893017990896, + "grad_norm": 1.6743884086608887, + "learning_rate": 6.945796572355745e-07, + "loss": 0.9687, + "step": 14125 + }, + { + "epoch": 0.3403097227908769, + "grad_norm": 1.641255497932434, + "learning_rate": 6.944528952438901e-07, + "loss": 0.9117, + "step": 14130 + }, + { + "epoch": 0.3404301437826642, + "grad_norm": 1.6246029138565063, + "learning_rate": 6.943261332522056e-07, + "loss": 0.9413, + "step": 14135 + }, + { + "epoch": 0.3405505647744515, + "grad_norm": 1.459546685218811, + "learning_rate": 6.941993712605212e-07, + "loss": 0.9272, + "step": 14140 + }, + { + "epoch": 0.3406709857662388, + "grad_norm": 1.6321953535079956, + "learning_rate": 6.940726092688368e-07, + "loss": 0.938, + "step": 14145 + }, + { + "epoch": 0.34079140675802605, + "grad_norm": 1.5907163619995117, + "learning_rate": 6.939458472771525e-07, + "loss": 0.9595, + "step": 14150 + }, + { + "epoch": 0.3409118277498133, + "grad_norm": 1.456050157546997, + "learning_rate": 6.938190852854679e-07, + "loss": 0.9371, + "step": 14155 + }, + { + "epoch": 0.34103224874160065, + "grad_norm": 1.67795729637146, + "learning_rate": 6.936923232937836e-07, + "loss": 0.9938, + "step": 14160 + }, + { + "epoch": 0.3411526697333879, + "grad_norm": 1.7361115217208862, + "learning_rate": 6.935655613020992e-07, + "loss": 0.9387, + "step": 14165 + }, + { + "epoch": 0.3412730907251752, + "grad_norm": 1.8264391422271729, + "learning_rate": 6.934387993104147e-07, + "loss": 0.9648, + "step": 14170 + }, + { + "epoch": 0.3413935117169625, + "grad_norm": 1.6677944660186768, + "learning_rate": 6.933120373187304e-07, + "loss": 0.9156, + "step": 14175 + }, + { + "epoch": 0.3415139327087498, + "grad_norm": 1.4905892610549927, + "learning_rate": 6.931852753270459e-07, + "loss": 0.8835, + "step": 14180 + }, + { + "epoch": 0.3416343537005371, + "grad_norm": 1.5072036981582642, + "learning_rate": 6.930585133353615e-07, + "loss": 0.9174, + "step": 14185 + }, + { + "epoch": 0.34175477469232435, + "grad_norm": 1.64654541015625, + "learning_rate": 6.929317513436771e-07, + "loss": 0.9023, + "step": 14190 + }, + { + "epoch": 0.3418751956841117, + "grad_norm": 1.5622669458389282, + "learning_rate": 6.928049893519927e-07, + "loss": 0.9345, + "step": 14195 + }, + { + "epoch": 0.34199561667589895, + "grad_norm": 1.562902808189392, + "learning_rate": 6.926782273603082e-07, + "loss": 0.8823, + "step": 14200 + }, + { + "epoch": 0.3421160376676862, + "grad_norm": 1.4731025695800781, + "learning_rate": 6.925514653686238e-07, + "loss": 0.915, + "step": 14205 + }, + { + "epoch": 0.3422364586594735, + "grad_norm": 1.7186492681503296, + "learning_rate": 6.924247033769395e-07, + "loss": 0.9277, + "step": 14210 + }, + { + "epoch": 0.3423568796512608, + "grad_norm": 1.7451112270355225, + "learning_rate": 6.92297941385255e-07, + "loss": 0.9411, + "step": 14215 + }, + { + "epoch": 0.3424773006430481, + "grad_norm": 1.4915034770965576, + "learning_rate": 6.921711793935707e-07, + "loss": 0.9142, + "step": 14220 + }, + { + "epoch": 0.3425977216348354, + "grad_norm": 1.48759925365448, + "learning_rate": 6.920444174018862e-07, + "loss": 0.9394, + "step": 14225 + }, + { + "epoch": 0.34271814262662265, + "grad_norm": 1.5974241495132446, + "learning_rate": 6.919176554102017e-07, + "loss": 0.8681, + "step": 14230 + }, + { + "epoch": 0.34283856361841, + "grad_norm": 1.517372965812683, + "learning_rate": 6.917908934185174e-07, + "loss": 0.8845, + "step": 14235 + }, + { + "epoch": 0.34295898461019725, + "grad_norm": 1.6111204624176025, + "learning_rate": 6.91664131426833e-07, + "loss": 0.9087, + "step": 14240 + }, + { + "epoch": 0.3430794056019845, + "grad_norm": 1.4942491054534912, + "learning_rate": 6.915373694351486e-07, + "loss": 0.9885, + "step": 14245 + }, + { + "epoch": 0.34319982659377185, + "grad_norm": 1.5210905075073242, + "learning_rate": 6.914106074434641e-07, + "loss": 0.9222, + "step": 14250 + }, + { + "epoch": 0.3433202475855591, + "grad_norm": 1.6447272300720215, + "learning_rate": 6.912838454517797e-07, + "loss": 0.9016, + "step": 14255 + }, + { + "epoch": 0.3434406685773464, + "grad_norm": 1.4414374828338623, + "learning_rate": 6.911570834600953e-07, + "loss": 0.9526, + "step": 14260 + }, + { + "epoch": 0.34356108956913367, + "grad_norm": 1.6271892786026, + "learning_rate": 6.910303214684109e-07, + "loss": 0.9561, + "step": 14265 + }, + { + "epoch": 0.343681510560921, + "grad_norm": 1.5519522428512573, + "learning_rate": 6.909035594767265e-07, + "loss": 0.8733, + "step": 14270 + }, + { + "epoch": 0.3438019315527083, + "grad_norm": 1.8349835872650146, + "learning_rate": 6.90776797485042e-07, + "loss": 0.8791, + "step": 14275 + }, + { + "epoch": 0.34392235254449555, + "grad_norm": 1.4303919076919556, + "learning_rate": 6.906500354933577e-07, + "loss": 0.9214, + "step": 14280 + }, + { + "epoch": 0.3440427735362828, + "grad_norm": 1.4979641437530518, + "learning_rate": 6.905232735016733e-07, + "loss": 0.8764, + "step": 14285 + }, + { + "epoch": 0.34416319452807015, + "grad_norm": 1.6719623804092407, + "learning_rate": 6.903965115099888e-07, + "loss": 0.93, + "step": 14290 + }, + { + "epoch": 0.3442836155198574, + "grad_norm": 1.5701115131378174, + "learning_rate": 6.902697495183044e-07, + "loss": 0.8932, + "step": 14295 + }, + { + "epoch": 0.3444040365116447, + "grad_norm": 1.6487075090408325, + "learning_rate": 6.9014298752662e-07, + "loss": 0.9413, + "step": 14300 + }, + { + "epoch": 0.344524457503432, + "grad_norm": 1.575084924697876, + "learning_rate": 6.900162255349356e-07, + "loss": 0.9203, + "step": 14305 + }, + { + "epoch": 0.3446448784952193, + "grad_norm": 1.462643027305603, + "learning_rate": 6.898894635432512e-07, + "loss": 0.9005, + "step": 14310 + }, + { + "epoch": 0.34476529948700657, + "grad_norm": 1.448420763015747, + "learning_rate": 6.897627015515668e-07, + "loss": 0.9337, + "step": 14315 + }, + { + "epoch": 0.34488572047879384, + "grad_norm": 1.6062064170837402, + "learning_rate": 6.896359395598823e-07, + "loss": 0.9557, + "step": 14320 + }, + { + "epoch": 0.3450061414705812, + "grad_norm": 1.5350664854049683, + "learning_rate": 6.895091775681979e-07, + "loss": 0.9215, + "step": 14325 + }, + { + "epoch": 0.34512656246236845, + "grad_norm": 1.4015288352966309, + "learning_rate": 6.893824155765136e-07, + "loss": 0.9488, + "step": 14330 + }, + { + "epoch": 0.3452469834541557, + "grad_norm": 1.4283367395401, + "learning_rate": 6.892556535848291e-07, + "loss": 0.9227, + "step": 14335 + }, + { + "epoch": 0.345367404445943, + "grad_norm": 1.5763317346572876, + "learning_rate": 6.891288915931446e-07, + "loss": 0.9458, + "step": 14340 + }, + { + "epoch": 0.3454878254377303, + "grad_norm": 1.4506820440292358, + "learning_rate": 6.890021296014603e-07, + "loss": 0.9104, + "step": 14345 + }, + { + "epoch": 0.3456082464295176, + "grad_norm": 1.586639165878296, + "learning_rate": 6.888753676097758e-07, + "loss": 0.9296, + "step": 14350 + }, + { + "epoch": 0.34572866742130487, + "grad_norm": 1.5703184604644775, + "learning_rate": 6.887486056180915e-07, + "loss": 0.9198, + "step": 14355 + }, + { + "epoch": 0.34584908841309214, + "grad_norm": 1.518678903579712, + "learning_rate": 6.886218436264071e-07, + "loss": 0.9037, + "step": 14360 + }, + { + "epoch": 0.34596950940487947, + "grad_norm": 1.5263680219650269, + "learning_rate": 6.884950816347226e-07, + "loss": 0.9364, + "step": 14365 + }, + { + "epoch": 0.34608993039666675, + "grad_norm": 1.6340490579605103, + "learning_rate": 6.883683196430382e-07, + "loss": 0.9527, + "step": 14370 + }, + { + "epoch": 0.346210351388454, + "grad_norm": 1.4462147951126099, + "learning_rate": 6.882415576513538e-07, + "loss": 0.8904, + "step": 14375 + }, + { + "epoch": 0.34633077238024135, + "grad_norm": 1.6212999820709229, + "learning_rate": 6.881147956596694e-07, + "loss": 0.942, + "step": 14380 + }, + { + "epoch": 0.3464511933720286, + "grad_norm": 1.5607647895812988, + "learning_rate": 6.879880336679849e-07, + "loss": 0.9361, + "step": 14385 + }, + { + "epoch": 0.3465716143638159, + "grad_norm": 1.485691785812378, + "learning_rate": 6.878612716763006e-07, + "loss": 0.896, + "step": 14390 + }, + { + "epoch": 0.34669203535560317, + "grad_norm": 1.5243165493011475, + "learning_rate": 6.877345096846161e-07, + "loss": 0.9551, + "step": 14395 + }, + { + "epoch": 0.3468124563473905, + "grad_norm": 1.5864940881729126, + "learning_rate": 6.876077476929317e-07, + "loss": 0.8871, + "step": 14400 + }, + { + "epoch": 0.34693287733917777, + "grad_norm": 1.4279605150222778, + "learning_rate": 6.874809857012474e-07, + "loss": 0.9501, + "step": 14405 + }, + { + "epoch": 0.34705329833096504, + "grad_norm": 1.622212290763855, + "learning_rate": 6.873542237095628e-07, + "loss": 0.8701, + "step": 14410 + }, + { + "epoch": 0.3471737193227523, + "grad_norm": 1.7826794385910034, + "learning_rate": 6.872274617178785e-07, + "loss": 0.9761, + "step": 14415 + }, + { + "epoch": 0.34729414031453965, + "grad_norm": 1.470369577407837, + "learning_rate": 6.871006997261941e-07, + "loss": 0.8671, + "step": 14420 + }, + { + "epoch": 0.3474145613063269, + "grad_norm": 1.6227402687072754, + "learning_rate": 6.869739377345097e-07, + "loss": 0.9453, + "step": 14425 + }, + { + "epoch": 0.3475349822981142, + "grad_norm": 1.3772656917572021, + "learning_rate": 6.868471757428253e-07, + "loss": 0.9403, + "step": 14430 + }, + { + "epoch": 0.3476554032899015, + "grad_norm": 1.508234977722168, + "learning_rate": 6.867204137511408e-07, + "loss": 0.8758, + "step": 14435 + }, + { + "epoch": 0.3477758242816888, + "grad_norm": 1.557151198387146, + "learning_rate": 6.865936517594564e-07, + "loss": 0.944, + "step": 14440 + }, + { + "epoch": 0.34789624527347607, + "grad_norm": 1.6642489433288574, + "learning_rate": 6.86466889767772e-07, + "loss": 0.9506, + "step": 14445 + }, + { + "epoch": 0.34801666626526334, + "grad_norm": 1.665263056755066, + "learning_rate": 6.863401277760877e-07, + "loss": 0.9669, + "step": 14450 + }, + { + "epoch": 0.34813708725705067, + "grad_norm": 1.4337249994277954, + "learning_rate": 6.862133657844031e-07, + "loss": 0.9121, + "step": 14455 + }, + { + "epoch": 0.34825750824883794, + "grad_norm": 1.5387197732925415, + "learning_rate": 6.860866037927187e-07, + "loss": 0.918, + "step": 14460 + }, + { + "epoch": 0.3483779292406252, + "grad_norm": 1.3670315742492676, + "learning_rate": 6.859598418010344e-07, + "loss": 0.9419, + "step": 14465 + }, + { + "epoch": 0.3484983502324125, + "grad_norm": 1.4968301057815552, + "learning_rate": 6.858330798093499e-07, + "loss": 0.9089, + "step": 14470 + }, + { + "epoch": 0.3486187712241998, + "grad_norm": 1.4888393878936768, + "learning_rate": 6.857063178176656e-07, + "loss": 0.9157, + "step": 14475 + }, + { + "epoch": 0.3487391922159871, + "grad_norm": 1.4342647790908813, + "learning_rate": 6.855795558259811e-07, + "loss": 0.9848, + "step": 14480 + }, + { + "epoch": 0.34885961320777437, + "grad_norm": 1.4552973508834839, + "learning_rate": 6.854527938342966e-07, + "loss": 0.9555, + "step": 14485 + }, + { + "epoch": 0.3489800341995617, + "grad_norm": 1.503014087677002, + "learning_rate": 6.853260318426123e-07, + "loss": 0.9381, + "step": 14490 + }, + { + "epoch": 0.34910045519134897, + "grad_norm": 1.4178707599639893, + "learning_rate": 6.851992698509279e-07, + "loss": 0.9311, + "step": 14495 + }, + { + "epoch": 0.34922087618313624, + "grad_norm": 1.550925612449646, + "learning_rate": 6.850725078592434e-07, + "loss": 0.9277, + "step": 14500 + }, + { + "epoch": 0.3493412971749235, + "grad_norm": 1.6314831972122192, + "learning_rate": 6.84945745867559e-07, + "loss": 0.9252, + "step": 14505 + }, + { + "epoch": 0.34946171816671084, + "grad_norm": 1.5699819326400757, + "learning_rate": 6.848189838758747e-07, + "loss": 0.9059, + "step": 14510 + }, + { + "epoch": 0.3495821391584981, + "grad_norm": 1.4622288942337036, + "learning_rate": 6.846922218841902e-07, + "loss": 0.9703, + "step": 14515 + }, + { + "epoch": 0.3497025601502854, + "grad_norm": 1.5557868480682373, + "learning_rate": 6.845654598925058e-07, + "loss": 0.8926, + "step": 14520 + }, + { + "epoch": 0.34982298114207266, + "grad_norm": 1.5330698490142822, + "learning_rate": 6.844386979008214e-07, + "loss": 0.904, + "step": 14525 + }, + { + "epoch": 0.34994340213386, + "grad_norm": 1.5403659343719482, + "learning_rate": 6.843119359091369e-07, + "loss": 0.9054, + "step": 14530 + }, + { + "epoch": 0.35006382312564727, + "grad_norm": 1.4800559282302856, + "learning_rate": 6.841851739174526e-07, + "loss": 0.9392, + "step": 14535 + }, + { + "epoch": 0.35018424411743454, + "grad_norm": 1.378165602684021, + "learning_rate": 6.840584119257682e-07, + "loss": 0.9487, + "step": 14540 + }, + { + "epoch": 0.3503046651092218, + "grad_norm": 1.4545567035675049, + "learning_rate": 6.839316499340838e-07, + "loss": 0.9389, + "step": 14545 + }, + { + "epoch": 0.35042508610100914, + "grad_norm": 1.4694862365722656, + "learning_rate": 6.838048879423993e-07, + "loss": 0.9267, + "step": 14550 + }, + { + "epoch": 0.3505455070927964, + "grad_norm": 1.4605337381362915, + "learning_rate": 6.836781259507149e-07, + "loss": 0.8976, + "step": 14555 + }, + { + "epoch": 0.3506659280845837, + "grad_norm": 1.4146809577941895, + "learning_rate": 6.835513639590305e-07, + "loss": 0.938, + "step": 14560 + }, + { + "epoch": 0.350786349076371, + "grad_norm": 1.5950515270233154, + "learning_rate": 6.834246019673461e-07, + "loss": 0.9015, + "step": 14565 + }, + { + "epoch": 0.3509067700681583, + "grad_norm": 1.5792133808135986, + "learning_rate": 6.832978399756617e-07, + "loss": 0.8922, + "step": 14570 + }, + { + "epoch": 0.35102719105994556, + "grad_norm": 1.5362197160720825, + "learning_rate": 6.831710779839772e-07, + "loss": 0.9435, + "step": 14575 + }, + { + "epoch": 0.35114761205173284, + "grad_norm": 1.4740729331970215, + "learning_rate": 6.830443159922928e-07, + "loss": 0.926, + "step": 14580 + }, + { + "epoch": 0.35126803304352017, + "grad_norm": 1.5383095741271973, + "learning_rate": 6.829175540006085e-07, + "loss": 0.8675, + "step": 14585 + }, + { + "epoch": 0.35138845403530744, + "grad_norm": 1.6412146091461182, + "learning_rate": 6.82790792008924e-07, + "loss": 0.9271, + "step": 14590 + }, + { + "epoch": 0.3515088750270947, + "grad_norm": 1.4079954624176025, + "learning_rate": 6.826640300172396e-07, + "loss": 0.9501, + "step": 14595 + }, + { + "epoch": 0.351629296018882, + "grad_norm": 1.6104514598846436, + "learning_rate": 6.825372680255552e-07, + "loss": 0.9345, + "step": 14600 + }, + { + "epoch": 0.3517497170106693, + "grad_norm": 1.4540212154388428, + "learning_rate": 6.824105060338707e-07, + "loss": 0.9067, + "step": 14605 + }, + { + "epoch": 0.3518701380024566, + "grad_norm": 1.5886861085891724, + "learning_rate": 6.822837440421864e-07, + "loss": 0.9966, + "step": 14610 + }, + { + "epoch": 0.35199055899424386, + "grad_norm": 1.532599687576294, + "learning_rate": 6.82156982050502e-07, + "loss": 0.9118, + "step": 14615 + }, + { + "epoch": 0.3521109799860312, + "grad_norm": 1.4757113456726074, + "learning_rate": 6.820302200588175e-07, + "loss": 0.8775, + "step": 14620 + }, + { + "epoch": 0.35223140097781847, + "grad_norm": 1.3537869453430176, + "learning_rate": 6.819034580671331e-07, + "loss": 0.938, + "step": 14625 + }, + { + "epoch": 0.35235182196960574, + "grad_norm": 1.5233502388000488, + "learning_rate": 6.817766960754488e-07, + "loss": 0.9157, + "step": 14630 + }, + { + "epoch": 0.352472242961393, + "grad_norm": 1.6560343503952026, + "learning_rate": 6.816499340837643e-07, + "loss": 0.9447, + "step": 14635 + }, + { + "epoch": 0.35259266395318034, + "grad_norm": 1.452549695968628, + "learning_rate": 6.815231720920798e-07, + "loss": 0.8794, + "step": 14640 + }, + { + "epoch": 0.3527130849449676, + "grad_norm": 1.5324472188949585, + "learning_rate": 6.813964101003955e-07, + "loss": 0.9094, + "step": 14645 + }, + { + "epoch": 0.3528335059367549, + "grad_norm": 1.5266990661621094, + "learning_rate": 6.81269648108711e-07, + "loss": 0.9036, + "step": 14650 + }, + { + "epoch": 0.35295392692854216, + "grad_norm": 1.6903965473175049, + "learning_rate": 6.811428861170267e-07, + "loss": 0.9105, + "step": 14655 + }, + { + "epoch": 0.3530743479203295, + "grad_norm": 1.6886303424835205, + "learning_rate": 6.810161241253423e-07, + "loss": 0.9129, + "step": 14660 + }, + { + "epoch": 0.35319476891211676, + "grad_norm": 1.5531803369522095, + "learning_rate": 6.808893621336577e-07, + "loss": 0.9767, + "step": 14665 + }, + { + "epoch": 0.35331518990390404, + "grad_norm": 1.5922292470932007, + "learning_rate": 6.807626001419734e-07, + "loss": 0.9539, + "step": 14670 + }, + { + "epoch": 0.3534356108956913, + "grad_norm": 1.556394100189209, + "learning_rate": 6.80635838150289e-07, + "loss": 0.8524, + "step": 14675 + }, + { + "epoch": 0.35355603188747864, + "grad_norm": 1.4371875524520874, + "learning_rate": 6.805090761586046e-07, + "loss": 0.895, + "step": 14680 + }, + { + "epoch": 0.3536764528792659, + "grad_norm": 1.4753456115722656, + "learning_rate": 6.803823141669201e-07, + "loss": 0.9195, + "step": 14685 + }, + { + "epoch": 0.3537968738710532, + "grad_norm": 1.5675758123397827, + "learning_rate": 6.802555521752358e-07, + "loss": 0.8769, + "step": 14690 + }, + { + "epoch": 0.3539172948628405, + "grad_norm": 1.6256442070007324, + "learning_rate": 6.801287901835513e-07, + "loss": 0.8808, + "step": 14695 + }, + { + "epoch": 0.3540377158546278, + "grad_norm": 1.9164952039718628, + "learning_rate": 6.800020281918669e-07, + "loss": 0.9165, + "step": 14700 + }, + { + "epoch": 0.35415813684641506, + "grad_norm": 1.5501030683517456, + "learning_rate": 6.798752662001826e-07, + "loss": 0.8797, + "step": 14705 + }, + { + "epoch": 0.35427855783820233, + "grad_norm": 1.5340889692306519, + "learning_rate": 6.79748504208498e-07, + "loss": 0.9298, + "step": 14710 + }, + { + "epoch": 0.35439897882998966, + "grad_norm": 1.3141673803329468, + "learning_rate": 6.796217422168137e-07, + "loss": 0.9389, + "step": 14715 + }, + { + "epoch": 0.35451939982177694, + "grad_norm": 1.4161008596420288, + "learning_rate": 6.794949802251293e-07, + "loss": 0.9145, + "step": 14720 + }, + { + "epoch": 0.3546398208135642, + "grad_norm": 1.6499849557876587, + "learning_rate": 6.793682182334448e-07, + "loss": 0.9219, + "step": 14725 + }, + { + "epoch": 0.3547602418053515, + "grad_norm": 1.5385875701904297, + "learning_rate": 6.792414562417605e-07, + "loss": 0.9265, + "step": 14730 + }, + { + "epoch": 0.3548806627971388, + "grad_norm": 1.5018668174743652, + "learning_rate": 6.79114694250076e-07, + "loss": 1.0083, + "step": 14735 + }, + { + "epoch": 0.3550010837889261, + "grad_norm": 1.418845534324646, + "learning_rate": 6.789879322583916e-07, + "loss": 0.9134, + "step": 14740 + }, + { + "epoch": 0.35512150478071336, + "grad_norm": 1.488181471824646, + "learning_rate": 6.788611702667072e-07, + "loss": 0.9387, + "step": 14745 + }, + { + "epoch": 0.3552419257725007, + "grad_norm": 1.5587797164916992, + "learning_rate": 6.787344082750229e-07, + "loss": 0.9402, + "step": 14750 + }, + { + "epoch": 0.35536234676428796, + "grad_norm": 1.431835651397705, + "learning_rate": 6.786076462833383e-07, + "loss": 0.9412, + "step": 14755 + }, + { + "epoch": 0.35548276775607524, + "grad_norm": 1.7869625091552734, + "learning_rate": 6.784808842916539e-07, + "loss": 0.9299, + "step": 14760 + }, + { + "epoch": 0.3556031887478625, + "grad_norm": 1.3361440896987915, + "learning_rate": 6.783541222999696e-07, + "loss": 0.9497, + "step": 14765 + }, + { + "epoch": 0.35572360973964984, + "grad_norm": 1.3490222692489624, + "learning_rate": 6.782273603082851e-07, + "loss": 0.8876, + "step": 14770 + }, + { + "epoch": 0.3558440307314371, + "grad_norm": 1.5653305053710938, + "learning_rate": 6.781005983166008e-07, + "loss": 0.9391, + "step": 14775 + }, + { + "epoch": 0.3559644517232244, + "grad_norm": 1.583202600479126, + "learning_rate": 6.779738363249163e-07, + "loss": 0.9037, + "step": 14780 + }, + { + "epoch": 0.35608487271501166, + "grad_norm": 1.6768081188201904, + "learning_rate": 6.778470743332318e-07, + "loss": 0.943, + "step": 14785 + }, + { + "epoch": 0.356205293706799, + "grad_norm": 1.8261566162109375, + "learning_rate": 6.777203123415475e-07, + "loss": 0.9512, + "step": 14790 + }, + { + "epoch": 0.35632571469858626, + "grad_norm": 1.6274113655090332, + "learning_rate": 6.775935503498631e-07, + "loss": 0.9726, + "step": 14795 + }, + { + "epoch": 0.35644613569037353, + "grad_norm": 1.8121517896652222, + "learning_rate": 6.774667883581786e-07, + "loss": 0.8796, + "step": 14800 + }, + { + "epoch": 0.3565665566821608, + "grad_norm": 1.510184407234192, + "learning_rate": 6.773400263664942e-07, + "loss": 0.874, + "step": 14805 + }, + { + "epoch": 0.35668697767394814, + "grad_norm": 1.5640634298324585, + "learning_rate": 6.772132643748099e-07, + "loss": 0.9704, + "step": 14810 + }, + { + "epoch": 0.3568073986657354, + "grad_norm": 1.549045443534851, + "learning_rate": 6.770865023831254e-07, + "loss": 0.9432, + "step": 14815 + }, + { + "epoch": 0.3569278196575227, + "grad_norm": 1.5475493669509888, + "learning_rate": 6.76959740391441e-07, + "loss": 0.8924, + "step": 14820 + }, + { + "epoch": 0.35704824064931, + "grad_norm": 1.4325554370880127, + "learning_rate": 6.768329783997566e-07, + "loss": 0.8884, + "step": 14825 + }, + { + "epoch": 0.3571686616410973, + "grad_norm": 1.6210287809371948, + "learning_rate": 6.767062164080721e-07, + "loss": 0.9236, + "step": 14830 + }, + { + "epoch": 0.35728908263288456, + "grad_norm": 1.5211275815963745, + "learning_rate": 6.765794544163878e-07, + "loss": 0.9077, + "step": 14835 + }, + { + "epoch": 0.35740950362467183, + "grad_norm": 1.7513643503189087, + "learning_rate": 6.764526924247034e-07, + "loss": 0.8607, + "step": 14840 + }, + { + "epoch": 0.35752992461645916, + "grad_norm": 1.4754881858825684, + "learning_rate": 6.763259304330188e-07, + "loss": 0.9212, + "step": 14845 + }, + { + "epoch": 0.35765034560824643, + "grad_norm": 1.642807126045227, + "learning_rate": 6.761991684413345e-07, + "loss": 0.8888, + "step": 14850 + }, + { + "epoch": 0.3577707666000337, + "grad_norm": 1.4399112462997437, + "learning_rate": 6.760724064496501e-07, + "loss": 0.891, + "step": 14855 + }, + { + "epoch": 0.357891187591821, + "grad_norm": 1.6988621950149536, + "learning_rate": 6.759456444579657e-07, + "loss": 0.9361, + "step": 14860 + }, + { + "epoch": 0.3580116085836083, + "grad_norm": 1.4203370809555054, + "learning_rate": 6.758188824662813e-07, + "loss": 0.9133, + "step": 14865 + }, + { + "epoch": 0.3581320295753956, + "grad_norm": 1.418886661529541, + "learning_rate": 6.756921204745968e-07, + "loss": 0.9059, + "step": 14870 + }, + { + "epoch": 0.35825245056718286, + "grad_norm": 1.404044508934021, + "learning_rate": 6.755653584829124e-07, + "loss": 0.9271, + "step": 14875 + }, + { + "epoch": 0.3583728715589702, + "grad_norm": 1.5734264850616455, + "learning_rate": 6.75438596491228e-07, + "loss": 0.8857, + "step": 14880 + }, + { + "epoch": 0.35849329255075746, + "grad_norm": 1.4540066719055176, + "learning_rate": 6.753118344995437e-07, + "loss": 0.8624, + "step": 14885 + }, + { + "epoch": 0.35861371354254473, + "grad_norm": 1.578904390335083, + "learning_rate": 6.751850725078592e-07, + "loss": 0.9649, + "step": 14890 + }, + { + "epoch": 0.358734134534332, + "grad_norm": 1.5617300271987915, + "learning_rate": 6.750583105161748e-07, + "loss": 0.9249, + "step": 14895 + }, + { + "epoch": 0.35885455552611933, + "grad_norm": 1.649563193321228, + "learning_rate": 6.749315485244904e-07, + "loss": 0.9314, + "step": 14900 + }, + { + "epoch": 0.3589749765179066, + "grad_norm": 1.5435034036636353, + "learning_rate": 6.748047865328059e-07, + "loss": 0.9417, + "step": 14905 + }, + { + "epoch": 0.3590953975096939, + "grad_norm": 1.6014434099197388, + "learning_rate": 6.746780245411216e-07, + "loss": 0.9236, + "step": 14910 + }, + { + "epoch": 0.35921581850148115, + "grad_norm": 1.627638339996338, + "learning_rate": 6.745512625494371e-07, + "loss": 0.8733, + "step": 14915 + }, + { + "epoch": 0.3593362394932685, + "grad_norm": 1.7317113876342773, + "learning_rate": 6.744245005577527e-07, + "loss": 0.986, + "step": 14920 + }, + { + "epoch": 0.35945666048505576, + "grad_norm": 1.584110975265503, + "learning_rate": 6.742977385660683e-07, + "loss": 0.8878, + "step": 14925 + }, + { + "epoch": 0.35957708147684303, + "grad_norm": 1.5106953382492065, + "learning_rate": 6.74170976574384e-07, + "loss": 0.9117, + "step": 14930 + }, + { + "epoch": 0.35969750246863036, + "grad_norm": 1.4878908395767212, + "learning_rate": 6.740442145826995e-07, + "loss": 0.8723, + "step": 14935 + }, + { + "epoch": 0.35981792346041763, + "grad_norm": 1.6115652322769165, + "learning_rate": 6.73917452591015e-07, + "loss": 0.9179, + "step": 14940 + }, + { + "epoch": 0.3599383444522049, + "grad_norm": 1.5578006505966187, + "learning_rate": 6.737906905993307e-07, + "loss": 0.9446, + "step": 14945 + }, + { + "epoch": 0.3600587654439922, + "grad_norm": 1.6315845251083374, + "learning_rate": 6.736639286076462e-07, + "loss": 0.9004, + "step": 14950 + }, + { + "epoch": 0.3601791864357795, + "grad_norm": 1.4576815366744995, + "learning_rate": 6.735371666159619e-07, + "loss": 0.9585, + "step": 14955 + }, + { + "epoch": 0.3602996074275668, + "grad_norm": 1.4762134552001953, + "learning_rate": 6.734104046242775e-07, + "loss": 0.9007, + "step": 14960 + }, + { + "epoch": 0.36042002841935405, + "grad_norm": 1.4411206245422363, + "learning_rate": 6.732836426325929e-07, + "loss": 0.8957, + "step": 14965 + }, + { + "epoch": 0.36054044941114133, + "grad_norm": 1.505435824394226, + "learning_rate": 6.731568806409086e-07, + "loss": 0.9294, + "step": 14970 + }, + { + "epoch": 0.36066087040292866, + "grad_norm": 1.5263447761535645, + "learning_rate": 6.730301186492242e-07, + "loss": 0.9304, + "step": 14975 + }, + { + "epoch": 0.36078129139471593, + "grad_norm": 1.568605661392212, + "learning_rate": 6.729033566575398e-07, + "loss": 0.9157, + "step": 14980 + }, + { + "epoch": 0.3609017123865032, + "grad_norm": 1.693253517150879, + "learning_rate": 6.727765946658553e-07, + "loss": 0.9702, + "step": 14985 + }, + { + "epoch": 0.3610221333782905, + "grad_norm": 1.3735846281051636, + "learning_rate": 6.726498326741709e-07, + "loss": 0.8727, + "step": 14990 + }, + { + "epoch": 0.3611425543700778, + "grad_norm": 1.50700843334198, + "learning_rate": 6.725230706824865e-07, + "loss": 0.9141, + "step": 14995 + }, + { + "epoch": 0.3612629753618651, + "grad_norm": 1.4231067895889282, + "learning_rate": 6.723963086908021e-07, + "loss": 0.9359, + "step": 15000 + }, + { + "epoch": 0.36138339635365235, + "grad_norm": 1.4903537034988403, + "learning_rate": 6.722695466991178e-07, + "loss": 0.954, + "step": 15005 + }, + { + "epoch": 0.3615038173454397, + "grad_norm": 1.5555487871170044, + "learning_rate": 6.721427847074332e-07, + "loss": 0.8628, + "step": 15010 + }, + { + "epoch": 0.36162423833722696, + "grad_norm": 1.661105751991272, + "learning_rate": 6.720160227157488e-07, + "loss": 0.9473, + "step": 15015 + }, + { + "epoch": 0.36174465932901423, + "grad_norm": 1.5630122423171997, + "learning_rate": 6.718892607240645e-07, + "loss": 0.9535, + "step": 15020 + }, + { + "epoch": 0.3618650803208015, + "grad_norm": 1.8373959064483643, + "learning_rate": 6.7176249873238e-07, + "loss": 0.9357, + "step": 15025 + }, + { + "epoch": 0.36198550131258883, + "grad_norm": 1.462262749671936, + "learning_rate": 6.716357367406956e-07, + "loss": 0.8968, + "step": 15030 + }, + { + "epoch": 0.3621059223043761, + "grad_norm": 1.5995755195617676, + "learning_rate": 6.715089747490112e-07, + "loss": 0.9252, + "step": 15035 + }, + { + "epoch": 0.3622263432961634, + "grad_norm": 1.420731782913208, + "learning_rate": 6.713822127573268e-07, + "loss": 0.8925, + "step": 15040 + }, + { + "epoch": 0.36234676428795065, + "grad_norm": 1.6841161251068115, + "learning_rate": 6.712554507656424e-07, + "loss": 0.8825, + "step": 15045 + }, + { + "epoch": 0.362467185279738, + "grad_norm": 1.5006433725357056, + "learning_rate": 6.71128688773958e-07, + "loss": 0.934, + "step": 15050 + }, + { + "epoch": 0.36258760627152525, + "grad_norm": 1.5156694650650024, + "learning_rate": 6.710019267822735e-07, + "loss": 0.8896, + "step": 15055 + }, + { + "epoch": 0.3627080272633125, + "grad_norm": 1.449398159980774, + "learning_rate": 6.708751647905891e-07, + "loss": 0.8704, + "step": 15060 + }, + { + "epoch": 0.36282844825509986, + "grad_norm": 1.5191737413406372, + "learning_rate": 6.707484027989048e-07, + "loss": 0.959, + "step": 15065 + }, + { + "epoch": 0.36294886924688713, + "grad_norm": 1.6532554626464844, + "learning_rate": 6.706216408072203e-07, + "loss": 0.9403, + "step": 15070 + }, + { + "epoch": 0.3630692902386744, + "grad_norm": 1.6268415451049805, + "learning_rate": 6.70494878815536e-07, + "loss": 0.931, + "step": 15075 + }, + { + "epoch": 0.3631897112304617, + "grad_norm": 1.616729974746704, + "learning_rate": 6.703681168238515e-07, + "loss": 0.9523, + "step": 15080 + }, + { + "epoch": 0.363310132222249, + "grad_norm": 1.5711734294891357, + "learning_rate": 6.70241354832167e-07, + "loss": 0.8738, + "step": 15085 + }, + { + "epoch": 0.3634305532140363, + "grad_norm": 1.3926578760147095, + "learning_rate": 6.701145928404827e-07, + "loss": 0.8539, + "step": 15090 + }, + { + "epoch": 0.36355097420582355, + "grad_norm": 1.7388372421264648, + "learning_rate": 6.699878308487983e-07, + "loss": 0.9459, + "step": 15095 + }, + { + "epoch": 0.3636713951976108, + "grad_norm": 1.4930264949798584, + "learning_rate": 6.698610688571138e-07, + "loss": 0.9285, + "step": 15100 + }, + { + "epoch": 0.36379181618939815, + "grad_norm": 1.6159045696258545, + "learning_rate": 6.697343068654294e-07, + "loss": 0.8258, + "step": 15105 + }, + { + "epoch": 0.3639122371811854, + "grad_norm": 1.646911382675171, + "learning_rate": 6.69607544873745e-07, + "loss": 0.9706, + "step": 15110 + }, + { + "epoch": 0.3640326581729727, + "grad_norm": 1.7125273942947388, + "learning_rate": 6.694807828820606e-07, + "loss": 0.8925, + "step": 15115 + }, + { + "epoch": 0.36415307916476, + "grad_norm": 1.4787230491638184, + "learning_rate": 6.693540208903762e-07, + "loss": 0.8243, + "step": 15120 + }, + { + "epoch": 0.3642735001565473, + "grad_norm": 1.6219500303268433, + "learning_rate": 6.692272588986918e-07, + "loss": 0.9227, + "step": 15125 + }, + { + "epoch": 0.3643939211483346, + "grad_norm": 1.4867146015167236, + "learning_rate": 6.691004969070074e-07, + "loss": 0.8787, + "step": 15130 + }, + { + "epoch": 0.36451434214012185, + "grad_norm": 1.5020312070846558, + "learning_rate": 6.689737349153229e-07, + "loss": 0.9214, + "step": 15135 + }, + { + "epoch": 0.3646347631319092, + "grad_norm": 1.6953963041305542, + "learning_rate": 6.688469729236386e-07, + "loss": 0.9409, + "step": 15140 + }, + { + "epoch": 0.36475518412369645, + "grad_norm": 1.7892584800720215, + "learning_rate": 6.687202109319542e-07, + "loss": 0.894, + "step": 15145 + }, + { + "epoch": 0.3648756051154837, + "grad_norm": 1.7647579908370972, + "learning_rate": 6.685934489402697e-07, + "loss": 0.9145, + "step": 15150 + }, + { + "epoch": 0.364996026107271, + "grad_norm": 1.6124937534332275, + "learning_rate": 6.684666869485853e-07, + "loss": 0.961, + "step": 15155 + }, + { + "epoch": 0.3651164470990583, + "grad_norm": 1.5672036409378052, + "learning_rate": 6.68339924956901e-07, + "loss": 0.9143, + "step": 15160 + }, + { + "epoch": 0.3652368680908456, + "grad_norm": 1.5922040939331055, + "learning_rate": 6.682131629652165e-07, + "loss": 0.9337, + "step": 15165 + }, + { + "epoch": 0.3653572890826329, + "grad_norm": 1.4634770154953003, + "learning_rate": 6.68086400973532e-07, + "loss": 0.9371, + "step": 15170 + }, + { + "epoch": 0.36547771007442015, + "grad_norm": 1.745644211769104, + "learning_rate": 6.679596389818477e-07, + "loss": 0.894, + "step": 15175 + }, + { + "epoch": 0.3655981310662075, + "grad_norm": 1.4401402473449707, + "learning_rate": 6.678328769901632e-07, + "loss": 0.9414, + "step": 15180 + }, + { + "epoch": 0.36571855205799475, + "grad_norm": 1.5754197835922241, + "learning_rate": 6.677061149984789e-07, + "loss": 0.9305, + "step": 15185 + }, + { + "epoch": 0.365838973049782, + "grad_norm": 1.81437349319458, + "learning_rate": 6.675793530067945e-07, + "loss": 0.9554, + "step": 15190 + }, + { + "epoch": 0.36595939404156935, + "grad_norm": 1.5831947326660156, + "learning_rate": 6.674525910151099e-07, + "loss": 0.9147, + "step": 15195 + }, + { + "epoch": 0.3660798150333566, + "grad_norm": 1.520748257637024, + "learning_rate": 6.673258290234256e-07, + "loss": 0.8806, + "step": 15200 + }, + { + "epoch": 0.3662002360251439, + "grad_norm": 1.6147689819335938, + "learning_rate": 6.671990670317412e-07, + "loss": 0.9263, + "step": 15205 + }, + { + "epoch": 0.3663206570169312, + "grad_norm": 1.4286832809448242, + "learning_rate": 6.670723050400568e-07, + "loss": 0.9172, + "step": 15210 + }, + { + "epoch": 0.3664410780087185, + "grad_norm": 1.776050090789795, + "learning_rate": 6.669455430483723e-07, + "loss": 0.8837, + "step": 15215 + }, + { + "epoch": 0.3665614990005058, + "grad_norm": 1.509376883506775, + "learning_rate": 6.66818781056688e-07, + "loss": 0.8783, + "step": 15220 + }, + { + "epoch": 0.36668191999229305, + "grad_norm": 1.715714454650879, + "learning_rate": 6.666920190650035e-07, + "loss": 0.8801, + "step": 15225 + }, + { + "epoch": 0.3668023409840803, + "grad_norm": 1.6566847562789917, + "learning_rate": 6.665652570733191e-07, + "loss": 0.8783, + "step": 15230 + }, + { + "epoch": 0.36692276197586765, + "grad_norm": 1.720525860786438, + "learning_rate": 6.664384950816348e-07, + "loss": 0.9263, + "step": 15235 + }, + { + "epoch": 0.3670431829676549, + "grad_norm": 1.5244266986846924, + "learning_rate": 6.663117330899502e-07, + "loss": 0.8521, + "step": 15240 + }, + { + "epoch": 0.3671636039594422, + "grad_norm": 1.6485753059387207, + "learning_rate": 6.661849710982659e-07, + "loss": 0.8937, + "step": 15245 + }, + { + "epoch": 0.36728402495122947, + "grad_norm": 1.4862918853759766, + "learning_rate": 6.660582091065815e-07, + "loss": 0.9009, + "step": 15250 + }, + { + "epoch": 0.3674044459430168, + "grad_norm": 1.5611529350280762, + "learning_rate": 6.65931447114897e-07, + "loss": 0.8937, + "step": 15255 + }, + { + "epoch": 0.3675248669348041, + "grad_norm": 1.604556918144226, + "learning_rate": 6.658046851232127e-07, + "loss": 0.9168, + "step": 15260 + }, + { + "epoch": 0.36764528792659135, + "grad_norm": 1.7058712244033813, + "learning_rate": 6.656779231315282e-07, + "loss": 0.9182, + "step": 15265 + }, + { + "epoch": 0.3677657089183787, + "grad_norm": 1.5210764408111572, + "learning_rate": 6.655511611398438e-07, + "loss": 0.9549, + "step": 15270 + }, + { + "epoch": 0.36788612991016595, + "grad_norm": 1.4679126739501953, + "learning_rate": 6.654243991481594e-07, + "loss": 0.9445, + "step": 15275 + }, + { + "epoch": 0.3680065509019532, + "grad_norm": 1.5327095985412598, + "learning_rate": 6.652976371564751e-07, + "loss": 0.9276, + "step": 15280 + }, + { + "epoch": 0.3681269718937405, + "grad_norm": 1.4916969537734985, + "learning_rate": 6.651708751647905e-07, + "loss": 0.9069, + "step": 15285 + }, + { + "epoch": 0.3682473928855278, + "grad_norm": 1.508493185043335, + "learning_rate": 6.650441131731061e-07, + "loss": 0.9226, + "step": 15290 + }, + { + "epoch": 0.3683678138773151, + "grad_norm": 1.6520406007766724, + "learning_rate": 6.649173511814218e-07, + "loss": 0.8933, + "step": 15295 + }, + { + "epoch": 0.36848823486910237, + "grad_norm": 1.682572603225708, + "learning_rate": 6.647905891897373e-07, + "loss": 0.895, + "step": 15300 + }, + { + "epoch": 0.36860865586088964, + "grad_norm": 1.3576946258544922, + "learning_rate": 6.64663827198053e-07, + "loss": 0.8832, + "step": 15305 + }, + { + "epoch": 0.368729076852677, + "grad_norm": 1.4632619619369507, + "learning_rate": 6.645370652063685e-07, + "loss": 0.9272, + "step": 15310 + }, + { + "epoch": 0.36884949784446425, + "grad_norm": 1.3913495540618896, + "learning_rate": 6.64410303214684e-07, + "loss": 0.8963, + "step": 15315 + }, + { + "epoch": 0.3689699188362515, + "grad_norm": 1.4010612964630127, + "learning_rate": 6.642835412229997e-07, + "loss": 0.8897, + "step": 15320 + }, + { + "epoch": 0.36909033982803885, + "grad_norm": 1.4855096340179443, + "learning_rate": 6.641567792313153e-07, + "loss": 0.9235, + "step": 15325 + }, + { + "epoch": 0.3692107608198261, + "grad_norm": 1.5315731763839722, + "learning_rate": 6.640300172396308e-07, + "loss": 0.8996, + "step": 15330 + }, + { + "epoch": 0.3693311818116134, + "grad_norm": 1.444700002670288, + "learning_rate": 6.639032552479464e-07, + "loss": 0.8574, + "step": 15335 + }, + { + "epoch": 0.36945160280340067, + "grad_norm": 1.5219303369522095, + "learning_rate": 6.63776493256262e-07, + "loss": 0.8845, + "step": 15340 + }, + { + "epoch": 0.369572023795188, + "grad_norm": 1.5101958513259888, + "learning_rate": 6.636497312645776e-07, + "loss": 0.8634, + "step": 15345 + }, + { + "epoch": 0.36969244478697527, + "grad_norm": 1.630250334739685, + "learning_rate": 6.635229692728932e-07, + "loss": 0.908, + "step": 15350 + }, + { + "epoch": 0.36981286577876255, + "grad_norm": 1.55610990524292, + "learning_rate": 6.633962072812088e-07, + "loss": 0.9107, + "step": 15355 + }, + { + "epoch": 0.3699332867705498, + "grad_norm": 1.470426082611084, + "learning_rate": 6.632694452895243e-07, + "loss": 0.9563, + "step": 15360 + }, + { + "epoch": 0.37005370776233715, + "grad_norm": 1.4882392883300781, + "learning_rate": 6.6314268329784e-07, + "loss": 0.9542, + "step": 15365 + }, + { + "epoch": 0.3701741287541244, + "grad_norm": 1.565896987915039, + "learning_rate": 6.630159213061556e-07, + "loss": 0.973, + "step": 15370 + }, + { + "epoch": 0.3702945497459117, + "grad_norm": 1.5221936702728271, + "learning_rate": 6.628891593144711e-07, + "loss": 0.9164, + "step": 15375 + }, + { + "epoch": 0.370414970737699, + "grad_norm": 1.369215726852417, + "learning_rate": 6.627623973227867e-07, + "loss": 0.8889, + "step": 15380 + }, + { + "epoch": 0.3705353917294863, + "grad_norm": 1.3607157468795776, + "learning_rate": 6.626356353311023e-07, + "loss": 0.9203, + "step": 15385 + }, + { + "epoch": 0.37065581272127357, + "grad_norm": 1.4368348121643066, + "learning_rate": 6.625088733394179e-07, + "loss": 0.9226, + "step": 15390 + }, + { + "epoch": 0.37077623371306084, + "grad_norm": 1.6463371515274048, + "learning_rate": 6.623821113477335e-07, + "loss": 0.8857, + "step": 15395 + }, + { + "epoch": 0.37089665470484817, + "grad_norm": 1.6441726684570312, + "learning_rate": 6.62255349356049e-07, + "loss": 0.8931, + "step": 15400 + }, + { + "epoch": 0.37101707569663545, + "grad_norm": 1.3902664184570312, + "learning_rate": 6.621285873643646e-07, + "loss": 0.9283, + "step": 15405 + }, + { + "epoch": 0.3711374966884227, + "grad_norm": 1.4455829858779907, + "learning_rate": 6.620018253726802e-07, + "loss": 0.933, + "step": 15410 + }, + { + "epoch": 0.37125791768021, + "grad_norm": 1.4055064916610718, + "learning_rate": 6.618750633809959e-07, + "loss": 0.8331, + "step": 15415 + }, + { + "epoch": 0.3713783386719973, + "grad_norm": 1.5223290920257568, + "learning_rate": 6.617483013893114e-07, + "loss": 0.8939, + "step": 15420 + }, + { + "epoch": 0.3714987596637846, + "grad_norm": 1.7608566284179688, + "learning_rate": 6.61621539397627e-07, + "loss": 0.9388, + "step": 15425 + }, + { + "epoch": 0.37161918065557187, + "grad_norm": 1.5901376008987427, + "learning_rate": 6.614947774059426e-07, + "loss": 0.8834, + "step": 15430 + }, + { + "epoch": 0.37173960164735914, + "grad_norm": 1.581833839416504, + "learning_rate": 6.613680154142581e-07, + "loss": 0.9965, + "step": 15435 + }, + { + "epoch": 0.37186002263914647, + "grad_norm": 1.6572521924972534, + "learning_rate": 6.612412534225738e-07, + "loss": 0.8962, + "step": 15440 + }, + { + "epoch": 0.37198044363093374, + "grad_norm": 1.6382677555084229, + "learning_rate": 6.611144914308894e-07, + "loss": 0.9106, + "step": 15445 + }, + { + "epoch": 0.372100864622721, + "grad_norm": 1.4641860723495483, + "learning_rate": 6.609877294392049e-07, + "loss": 0.9181, + "step": 15450 + }, + { + "epoch": 0.37222128561450835, + "grad_norm": 1.5975430011749268, + "learning_rate": 6.608609674475205e-07, + "loss": 0.9305, + "step": 15455 + }, + { + "epoch": 0.3723417066062956, + "grad_norm": 1.375770092010498, + "learning_rate": 6.607342054558361e-07, + "loss": 0.9122, + "step": 15460 + }, + { + "epoch": 0.3724621275980829, + "grad_norm": 1.7418895959854126, + "learning_rate": 6.606074434641517e-07, + "loss": 0.8588, + "step": 15465 + }, + { + "epoch": 0.37258254858987017, + "grad_norm": 1.4999524354934692, + "learning_rate": 6.604806814724672e-07, + "loss": 0.953, + "step": 15470 + }, + { + "epoch": 0.3727029695816575, + "grad_norm": 1.607024908065796, + "learning_rate": 6.603539194807829e-07, + "loss": 0.9159, + "step": 15475 + }, + { + "epoch": 0.37282339057344477, + "grad_norm": 1.7114670276641846, + "learning_rate": 6.602271574890984e-07, + "loss": 0.9333, + "step": 15480 + }, + { + "epoch": 0.37294381156523204, + "grad_norm": 1.398600697517395, + "learning_rate": 6.601003954974141e-07, + "loss": 0.9032, + "step": 15485 + }, + { + "epoch": 0.3730642325570193, + "grad_norm": 1.5683618783950806, + "learning_rate": 6.599736335057297e-07, + "loss": 0.8666, + "step": 15490 + }, + { + "epoch": 0.37318465354880664, + "grad_norm": 1.7809679508209229, + "learning_rate": 6.598468715140451e-07, + "loss": 0.9364, + "step": 15495 + }, + { + "epoch": 0.3733050745405939, + "grad_norm": 1.5396602153778076, + "learning_rate": 6.597201095223608e-07, + "loss": 0.9189, + "step": 15500 + }, + { + "epoch": 0.3734254955323812, + "grad_norm": 1.375185251235962, + "learning_rate": 6.595933475306764e-07, + "loss": 0.9043, + "step": 15505 + }, + { + "epoch": 0.3735459165241685, + "grad_norm": 1.5217154026031494, + "learning_rate": 6.59466585538992e-07, + "loss": 0.9204, + "step": 15510 + }, + { + "epoch": 0.3736663375159558, + "grad_norm": 1.5808666944503784, + "learning_rate": 6.593398235473075e-07, + "loss": 0.9414, + "step": 15515 + }, + { + "epoch": 0.37378675850774307, + "grad_norm": 1.6650584936141968, + "learning_rate": 6.592130615556231e-07, + "loss": 0.8927, + "step": 15520 + }, + { + "epoch": 0.37390717949953034, + "grad_norm": 1.3403691053390503, + "learning_rate": 6.590862995639387e-07, + "loss": 0.9146, + "step": 15525 + }, + { + "epoch": 0.37402760049131767, + "grad_norm": 1.6455671787261963, + "learning_rate": 6.589595375722543e-07, + "loss": 0.887, + "step": 15530 + }, + { + "epoch": 0.37414802148310494, + "grad_norm": 1.5723843574523926, + "learning_rate": 6.5883277558057e-07, + "loss": 0.885, + "step": 15535 + }, + { + "epoch": 0.3742684424748922, + "grad_norm": 1.5224273204803467, + "learning_rate": 6.587060135888854e-07, + "loss": 0.9342, + "step": 15540 + }, + { + "epoch": 0.3743888634666795, + "grad_norm": 1.4680192470550537, + "learning_rate": 6.58579251597201e-07, + "loss": 0.915, + "step": 15545 + }, + { + "epoch": 0.3745092844584668, + "grad_norm": 1.478745937347412, + "learning_rate": 6.584524896055167e-07, + "loss": 0.9552, + "step": 15550 + }, + { + "epoch": 0.3746297054502541, + "grad_norm": 1.7124029397964478, + "learning_rate": 6.583257276138322e-07, + "loss": 0.9157, + "step": 15555 + }, + { + "epoch": 0.37475012644204136, + "grad_norm": 1.7640318870544434, + "learning_rate": 6.581989656221479e-07, + "loss": 0.9328, + "step": 15560 + }, + { + "epoch": 0.37487054743382864, + "grad_norm": 1.5102007389068604, + "learning_rate": 6.580722036304634e-07, + "loss": 0.9801, + "step": 15565 + }, + { + "epoch": 0.37499096842561597, + "grad_norm": 1.603365182876587, + "learning_rate": 6.57945441638779e-07, + "loss": 0.9126, + "step": 15570 + }, + { + "epoch": 0.37511138941740324, + "grad_norm": 1.7390234470367432, + "learning_rate": 6.578186796470946e-07, + "loss": 0.9553, + "step": 15575 + }, + { + "epoch": 0.3752318104091905, + "grad_norm": 1.5153967142105103, + "learning_rate": 6.576919176554102e-07, + "loss": 0.9007, + "step": 15580 + }, + { + "epoch": 0.37535223140097784, + "grad_norm": 1.5532500743865967, + "learning_rate": 6.575651556637257e-07, + "loss": 0.9155, + "step": 15585 + }, + { + "epoch": 0.3754726523927651, + "grad_norm": 1.6289955377578735, + "learning_rate": 6.574383936720413e-07, + "loss": 0.9619, + "step": 15590 + }, + { + "epoch": 0.3755930733845524, + "grad_norm": 1.5017445087432861, + "learning_rate": 6.57311631680357e-07, + "loss": 0.9287, + "step": 15595 + }, + { + "epoch": 0.37571349437633966, + "grad_norm": 1.3886905908584595, + "learning_rate": 6.571848696886725e-07, + "loss": 0.9289, + "step": 15600 + }, + { + "epoch": 0.375833915368127, + "grad_norm": 1.5643943548202515, + "learning_rate": 6.570581076969882e-07, + "loss": 0.9097, + "step": 15605 + }, + { + "epoch": 0.37595433635991427, + "grad_norm": 1.5563030242919922, + "learning_rate": 6.569313457053037e-07, + "loss": 0.8883, + "step": 15610 + }, + { + "epoch": 0.37607475735170154, + "grad_norm": 1.6308798789978027, + "learning_rate": 6.568045837136192e-07, + "loss": 0.9332, + "step": 15615 + }, + { + "epoch": 0.3761951783434888, + "grad_norm": 1.3513928651809692, + "learning_rate": 6.566778217219349e-07, + "loss": 0.9116, + "step": 15620 + }, + { + "epoch": 0.37631559933527614, + "grad_norm": 1.5536242723464966, + "learning_rate": 6.565510597302505e-07, + "loss": 0.9038, + "step": 15625 + }, + { + "epoch": 0.3764360203270634, + "grad_norm": 1.635443925857544, + "learning_rate": 6.56424297738566e-07, + "loss": 0.9306, + "step": 15630 + }, + { + "epoch": 0.3765564413188507, + "grad_norm": 1.6803452968597412, + "learning_rate": 6.562975357468816e-07, + "loss": 0.9142, + "step": 15635 + }, + { + "epoch": 0.376676862310638, + "grad_norm": 1.5191351175308228, + "learning_rate": 6.561707737551972e-07, + "loss": 0.9241, + "step": 15640 + }, + { + "epoch": 0.3767972833024253, + "grad_norm": 1.50296950340271, + "learning_rate": 6.560440117635128e-07, + "loss": 0.8873, + "step": 15645 + }, + { + "epoch": 0.37691770429421256, + "grad_norm": 1.504440426826477, + "learning_rate": 6.559172497718284e-07, + "loss": 0.9058, + "step": 15650 + }, + { + "epoch": 0.37703812528599984, + "grad_norm": 2.1617422103881836, + "learning_rate": 6.55790487780144e-07, + "loss": 0.967, + "step": 15655 + }, + { + "epoch": 0.37715854627778717, + "grad_norm": 1.6108896732330322, + "learning_rate": 6.556637257884595e-07, + "loss": 0.9132, + "step": 15660 + }, + { + "epoch": 0.37727896726957444, + "grad_norm": 1.5929207801818848, + "learning_rate": 6.555369637967751e-07, + "loss": 0.8985, + "step": 15665 + }, + { + "epoch": 0.3773993882613617, + "grad_norm": 1.4225010871887207, + "learning_rate": 6.554102018050908e-07, + "loss": 0.902, + "step": 15670 + }, + { + "epoch": 0.377519809253149, + "grad_norm": 1.5468358993530273, + "learning_rate": 6.552834398134063e-07, + "loss": 0.8869, + "step": 15675 + }, + { + "epoch": 0.3776402302449363, + "grad_norm": 1.748754858970642, + "learning_rate": 6.551566778217219e-07, + "loss": 0.9002, + "step": 15680 + }, + { + "epoch": 0.3777606512367236, + "grad_norm": 1.3694506883621216, + "learning_rate": 6.550299158300375e-07, + "loss": 0.9047, + "step": 15685 + }, + { + "epoch": 0.37788107222851086, + "grad_norm": 1.686044454574585, + "learning_rate": 6.549031538383531e-07, + "loss": 0.8809, + "step": 15690 + }, + { + "epoch": 0.37800149322029813, + "grad_norm": 1.3558310270309448, + "learning_rate": 6.547763918466687e-07, + "loss": 0.8826, + "step": 15695 + }, + { + "epoch": 0.37812191421208546, + "grad_norm": 1.455237865447998, + "learning_rate": 6.546496298549842e-07, + "loss": 0.8968, + "step": 15700 + }, + { + "epoch": 0.37824233520387274, + "grad_norm": 1.6350035667419434, + "learning_rate": 6.545228678632998e-07, + "loss": 0.9855, + "step": 15705 + }, + { + "epoch": 0.37836275619566, + "grad_norm": 1.4560292959213257, + "learning_rate": 6.543961058716154e-07, + "loss": 0.9257, + "step": 15710 + }, + { + "epoch": 0.37848317718744734, + "grad_norm": 1.6068018674850464, + "learning_rate": 6.542693438799311e-07, + "loss": 0.8658, + "step": 15715 + }, + { + "epoch": 0.3786035981792346, + "grad_norm": 1.4421308040618896, + "learning_rate": 6.541425818882466e-07, + "loss": 0.9409, + "step": 15720 + }, + { + "epoch": 0.3787240191710219, + "grad_norm": 1.6204299926757812, + "learning_rate": 6.540158198965621e-07, + "loss": 0.9064, + "step": 15725 + }, + { + "epoch": 0.37884444016280916, + "grad_norm": 1.4492141008377075, + "learning_rate": 6.538890579048778e-07, + "loss": 0.8868, + "step": 15730 + }, + { + "epoch": 0.3789648611545965, + "grad_norm": 1.4784122705459595, + "learning_rate": 6.537622959131933e-07, + "loss": 0.8887, + "step": 15735 + }, + { + "epoch": 0.37908528214638376, + "grad_norm": 1.5144078731536865, + "learning_rate": 6.53635533921509e-07, + "loss": 0.8895, + "step": 15740 + }, + { + "epoch": 0.37920570313817104, + "grad_norm": 1.4604182243347168, + "learning_rate": 6.535087719298246e-07, + "loss": 0.908, + "step": 15745 + }, + { + "epoch": 0.3793261241299583, + "grad_norm": 1.8133037090301514, + "learning_rate": 6.5338200993814e-07, + "loss": 0.8743, + "step": 15750 + }, + { + "epoch": 0.37944654512174564, + "grad_norm": 1.9104950428009033, + "learning_rate": 6.532552479464557e-07, + "loss": 0.9707, + "step": 15755 + }, + { + "epoch": 0.3795669661135329, + "grad_norm": 1.2851866483688354, + "learning_rate": 6.531284859547713e-07, + "loss": 0.8677, + "step": 15760 + }, + { + "epoch": 0.3796873871053202, + "grad_norm": 1.5550357103347778, + "learning_rate": 6.530017239630869e-07, + "loss": 0.9423, + "step": 15765 + }, + { + "epoch": 0.3798078080971075, + "grad_norm": 1.375746250152588, + "learning_rate": 6.528749619714024e-07, + "loss": 0.8621, + "step": 15770 + }, + { + "epoch": 0.3799282290888948, + "grad_norm": 1.5696123838424683, + "learning_rate": 6.527481999797181e-07, + "loss": 0.8835, + "step": 15775 + }, + { + "epoch": 0.38004865008068206, + "grad_norm": 1.5719820261001587, + "learning_rate": 6.526214379880336e-07, + "loss": 0.9317, + "step": 15780 + }, + { + "epoch": 0.38016907107246933, + "grad_norm": 1.4429283142089844, + "learning_rate": 6.524946759963492e-07, + "loss": 0.862, + "step": 15785 + }, + { + "epoch": 0.38028949206425666, + "grad_norm": 1.5603541135787964, + "learning_rate": 6.523679140046649e-07, + "loss": 0.9097, + "step": 15790 + }, + { + "epoch": 0.38040991305604394, + "grad_norm": 1.5130277872085571, + "learning_rate": 6.522411520129803e-07, + "loss": 0.9088, + "step": 15795 + }, + { + "epoch": 0.3805303340478312, + "grad_norm": 1.3764976263046265, + "learning_rate": 6.52114390021296e-07, + "loss": 0.8709, + "step": 15800 + }, + { + "epoch": 0.3806507550396185, + "grad_norm": 1.3912783861160278, + "learning_rate": 6.519876280296116e-07, + "loss": 0.9016, + "step": 15805 + }, + { + "epoch": 0.3807711760314058, + "grad_norm": 1.5324950218200684, + "learning_rate": 6.518608660379272e-07, + "loss": 0.913, + "step": 15810 + }, + { + "epoch": 0.3808915970231931, + "grad_norm": 1.5858607292175293, + "learning_rate": 6.517341040462427e-07, + "loss": 0.8764, + "step": 15815 + }, + { + "epoch": 0.38101201801498036, + "grad_norm": 1.5081491470336914, + "learning_rate": 6.516073420545583e-07, + "loss": 0.9308, + "step": 15820 + }, + { + "epoch": 0.3811324390067677, + "grad_norm": 1.554904580116272, + "learning_rate": 6.514805800628739e-07, + "loss": 0.9474, + "step": 15825 + }, + { + "epoch": 0.38125285999855496, + "grad_norm": 1.6217893362045288, + "learning_rate": 6.513538180711895e-07, + "loss": 0.887, + "step": 15830 + }, + { + "epoch": 0.38137328099034223, + "grad_norm": 1.6178269386291504, + "learning_rate": 6.512270560795052e-07, + "loss": 0.958, + "step": 15835 + }, + { + "epoch": 0.3814937019821295, + "grad_norm": 1.3969887495040894, + "learning_rate": 6.511002940878206e-07, + "loss": 0.948, + "step": 15840 + }, + { + "epoch": 0.38161412297391684, + "grad_norm": 1.5228673219680786, + "learning_rate": 6.509735320961362e-07, + "loss": 0.8732, + "step": 15845 + }, + { + "epoch": 0.3817345439657041, + "grad_norm": 1.427930235862732, + "learning_rate": 6.508467701044519e-07, + "loss": 0.9315, + "step": 15850 + }, + { + "epoch": 0.3818549649574914, + "grad_norm": 1.573591947555542, + "learning_rate": 6.507200081127674e-07, + "loss": 0.9077, + "step": 15855 + }, + { + "epoch": 0.38197538594927866, + "grad_norm": 1.5548583269119263, + "learning_rate": 6.505932461210831e-07, + "loss": 0.9134, + "step": 15860 + }, + { + "epoch": 0.382095806941066, + "grad_norm": 1.7557604312896729, + "learning_rate": 6.504664841293986e-07, + "loss": 0.9059, + "step": 15865 + }, + { + "epoch": 0.38221622793285326, + "grad_norm": 1.349873661994934, + "learning_rate": 6.503397221377141e-07, + "loss": 0.8972, + "step": 15870 + }, + { + "epoch": 0.38233664892464053, + "grad_norm": 1.7548439502716064, + "learning_rate": 6.502129601460298e-07, + "loss": 0.9036, + "step": 15875 + }, + { + "epoch": 0.3824570699164278, + "grad_norm": 1.759466290473938, + "learning_rate": 6.500861981543454e-07, + "loss": 0.9172, + "step": 15880 + }, + { + "epoch": 0.38257749090821513, + "grad_norm": 1.6105694770812988, + "learning_rate": 6.499594361626609e-07, + "loss": 0.89, + "step": 15885 + }, + { + "epoch": 0.3826979119000024, + "grad_norm": 1.4921594858169556, + "learning_rate": 6.498326741709765e-07, + "loss": 0.958, + "step": 15890 + }, + { + "epoch": 0.3828183328917897, + "grad_norm": 1.6352791786193848, + "learning_rate": 6.497059121792922e-07, + "loss": 0.8882, + "step": 15895 + }, + { + "epoch": 0.382938753883577, + "grad_norm": 1.5943946838378906, + "learning_rate": 6.495791501876077e-07, + "loss": 0.9128, + "step": 15900 + }, + { + "epoch": 0.3830591748753643, + "grad_norm": 1.497174859046936, + "learning_rate": 6.494523881959233e-07, + "loss": 0.9548, + "step": 15905 + }, + { + "epoch": 0.38317959586715156, + "grad_norm": 1.540085792541504, + "learning_rate": 6.493256262042389e-07, + "loss": 0.9293, + "step": 15910 + }, + { + "epoch": 0.38330001685893883, + "grad_norm": 1.5195233821868896, + "learning_rate": 6.491988642125544e-07, + "loss": 0.9311, + "step": 15915 + }, + { + "epoch": 0.38342043785072616, + "grad_norm": 1.3735129833221436, + "learning_rate": 6.490721022208701e-07, + "loss": 0.8887, + "step": 15920 + }, + { + "epoch": 0.38354085884251343, + "grad_norm": 1.440384030342102, + "learning_rate": 6.489453402291857e-07, + "loss": 0.8948, + "step": 15925 + }, + { + "epoch": 0.3836612798343007, + "grad_norm": 1.7172589302062988, + "learning_rate": 6.488185782375011e-07, + "loss": 0.8894, + "step": 15930 + }, + { + "epoch": 0.383781700826088, + "grad_norm": 1.529138207435608, + "learning_rate": 6.486918162458168e-07, + "loss": 0.9409, + "step": 15935 + }, + { + "epoch": 0.3839021218178753, + "grad_norm": 1.5386652946472168, + "learning_rate": 6.485650542541324e-07, + "loss": 0.9094, + "step": 15940 + }, + { + "epoch": 0.3840225428096626, + "grad_norm": 1.4735944271087646, + "learning_rate": 6.48438292262448e-07, + "loss": 0.8712, + "step": 15945 + }, + { + "epoch": 0.38414296380144985, + "grad_norm": 1.6935973167419434, + "learning_rate": 6.483115302707636e-07, + "loss": 0.8893, + "step": 15950 + }, + { + "epoch": 0.3842633847932372, + "grad_norm": 1.518571138381958, + "learning_rate": 6.481847682790792e-07, + "loss": 0.9172, + "step": 15955 + }, + { + "epoch": 0.38438380578502446, + "grad_norm": 1.3189793825149536, + "learning_rate": 6.480580062873947e-07, + "loss": 0.9205, + "step": 15960 + }, + { + "epoch": 0.38450422677681173, + "grad_norm": 1.464625358581543, + "learning_rate": 6.479312442957103e-07, + "loss": 0.8986, + "step": 15965 + }, + { + "epoch": 0.384624647768599, + "grad_norm": 1.9872775077819824, + "learning_rate": 6.47804482304026e-07, + "loss": 0.892, + "step": 15970 + }, + { + "epoch": 0.38474506876038633, + "grad_norm": 1.6058225631713867, + "learning_rate": 6.476777203123415e-07, + "loss": 0.9443, + "step": 15975 + }, + { + "epoch": 0.3848654897521736, + "grad_norm": 1.6584371328353882, + "learning_rate": 6.475509583206571e-07, + "loss": 0.8896, + "step": 15980 + }, + { + "epoch": 0.3849859107439609, + "grad_norm": 1.4320168495178223, + "learning_rate": 6.474241963289727e-07, + "loss": 0.8685, + "step": 15985 + }, + { + "epoch": 0.38510633173574815, + "grad_norm": 1.446475863456726, + "learning_rate": 6.472974343372882e-07, + "loss": 0.9195, + "step": 15990 + }, + { + "epoch": 0.3852267527275355, + "grad_norm": 1.3913068771362305, + "learning_rate": 6.471706723456039e-07, + "loss": 0.9159, + "step": 15995 + }, + { + "epoch": 0.38534717371932276, + "grad_norm": 1.4390864372253418, + "learning_rate": 6.470439103539194e-07, + "loss": 0.9049, + "step": 16000 + }, + { + "epoch": 0.38546759471111003, + "grad_norm": 1.8681691884994507, + "learning_rate": 6.46917148362235e-07, + "loss": 0.9117, + "step": 16005 + }, + { + "epoch": 0.3855880157028973, + "grad_norm": 1.505843997001648, + "learning_rate": 6.467903863705506e-07, + "loss": 0.9035, + "step": 16010 + }, + { + "epoch": 0.38570843669468463, + "grad_norm": 1.7058168649673462, + "learning_rate": 6.466636243788663e-07, + "loss": 0.8936, + "step": 16015 + }, + { + "epoch": 0.3858288576864719, + "grad_norm": 1.4852770566940308, + "learning_rate": 6.465368623871818e-07, + "loss": 0.9428, + "step": 16020 + }, + { + "epoch": 0.3859492786782592, + "grad_norm": 1.5499645471572876, + "learning_rate": 6.464101003954973e-07, + "loss": 0.9151, + "step": 16025 + }, + { + "epoch": 0.3860696996700465, + "grad_norm": 1.4708459377288818, + "learning_rate": 6.46283338403813e-07, + "loss": 0.9137, + "step": 16030 + }, + { + "epoch": 0.3861901206618338, + "grad_norm": 1.685199499130249, + "learning_rate": 6.461565764121285e-07, + "loss": 0.9266, + "step": 16035 + }, + { + "epoch": 0.38631054165362105, + "grad_norm": 2.1729648113250732, + "learning_rate": 6.460298144204442e-07, + "loss": 0.9238, + "step": 16040 + }, + { + "epoch": 0.3864309626454083, + "grad_norm": 1.7583898305892944, + "learning_rate": 6.459030524287598e-07, + "loss": 0.938, + "step": 16045 + }, + { + "epoch": 0.38655138363719566, + "grad_norm": 1.462072730064392, + "learning_rate": 6.457762904370752e-07, + "loss": 0.8998, + "step": 16050 + }, + { + "epoch": 0.38667180462898293, + "grad_norm": 1.5906111001968384, + "learning_rate": 6.456495284453909e-07, + "loss": 0.975, + "step": 16055 + }, + { + "epoch": 0.3867922256207702, + "grad_norm": 1.4166845083236694, + "learning_rate": 6.455227664537065e-07, + "loss": 0.8913, + "step": 16060 + }, + { + "epoch": 0.3869126466125575, + "grad_norm": 1.5081733465194702, + "learning_rate": 6.453960044620221e-07, + "loss": 0.9391, + "step": 16065 + }, + { + "epoch": 0.3870330676043448, + "grad_norm": 1.5960620641708374, + "learning_rate": 6.452692424703376e-07, + "loss": 0.9146, + "step": 16070 + }, + { + "epoch": 0.3871534885961321, + "grad_norm": 1.5286779403686523, + "learning_rate": 6.451424804786533e-07, + "loss": 0.9371, + "step": 16075 + }, + { + "epoch": 0.38727390958791935, + "grad_norm": 1.838586688041687, + "learning_rate": 6.450157184869688e-07, + "loss": 0.9116, + "step": 16080 + }, + { + "epoch": 0.3873943305797067, + "grad_norm": 1.3697798252105713, + "learning_rate": 6.448889564952844e-07, + "loss": 0.9188, + "step": 16085 + }, + { + "epoch": 0.38751475157149395, + "grad_norm": 1.5102379322052002, + "learning_rate": 6.447621945036001e-07, + "loss": 0.9338, + "step": 16090 + }, + { + "epoch": 0.3876351725632812, + "grad_norm": 1.5454723834991455, + "learning_rate": 6.446354325119155e-07, + "loss": 0.8167, + "step": 16095 + }, + { + "epoch": 0.3877555935550685, + "grad_norm": 1.4021121263504028, + "learning_rate": 6.445086705202312e-07, + "loss": 0.9009, + "step": 16100 + }, + { + "epoch": 0.38787601454685583, + "grad_norm": 1.4518719911575317, + "learning_rate": 6.443819085285468e-07, + "loss": 0.896, + "step": 16105 + }, + { + "epoch": 0.3879964355386431, + "grad_norm": 1.5615367889404297, + "learning_rate": 6.442551465368623e-07, + "loss": 0.8825, + "step": 16110 + }, + { + "epoch": 0.3881168565304304, + "grad_norm": 1.4544663429260254, + "learning_rate": 6.441283845451779e-07, + "loss": 0.9124, + "step": 16115 + }, + { + "epoch": 0.38823727752221765, + "grad_norm": 1.5126712322235107, + "learning_rate": 6.440016225534935e-07, + "loss": 0.9434, + "step": 16120 + }, + { + "epoch": 0.388357698514005, + "grad_norm": 1.5839651823043823, + "learning_rate": 6.438748605618091e-07, + "loss": 0.9702, + "step": 16125 + }, + { + "epoch": 0.38847811950579225, + "grad_norm": 2.018829107284546, + "learning_rate": 6.437480985701247e-07, + "loss": 0.9184, + "step": 16130 + }, + { + "epoch": 0.3885985404975795, + "grad_norm": 1.6250749826431274, + "learning_rate": 6.436213365784404e-07, + "loss": 0.9592, + "step": 16135 + }, + { + "epoch": 0.3887189614893668, + "grad_norm": 1.336804747581482, + "learning_rate": 6.434945745867558e-07, + "loss": 0.8839, + "step": 16140 + }, + { + "epoch": 0.3888393824811541, + "grad_norm": 1.6236604452133179, + "learning_rate": 6.433678125950714e-07, + "loss": 0.9167, + "step": 16145 + }, + { + "epoch": 0.3889598034729414, + "grad_norm": 1.5479767322540283, + "learning_rate": 6.432410506033871e-07, + "loss": 0.8679, + "step": 16150 + }, + { + "epoch": 0.3890802244647287, + "grad_norm": 1.5190855264663696, + "learning_rate": 6.431142886117027e-07, + "loss": 0.8982, + "step": 16155 + }, + { + "epoch": 0.389200645456516, + "grad_norm": 1.6366859674453735, + "learning_rate": 6.429875266200183e-07, + "loss": 0.9002, + "step": 16160 + }, + { + "epoch": 0.3893210664483033, + "grad_norm": 1.5403122901916504, + "learning_rate": 6.428607646283338e-07, + "loss": 0.8571, + "step": 16165 + }, + { + "epoch": 0.38944148744009055, + "grad_norm": 1.5745781660079956, + "learning_rate": 6.427340026366494e-07, + "loss": 0.9317, + "step": 16170 + }, + { + "epoch": 0.3895619084318778, + "grad_norm": 1.6703364849090576, + "learning_rate": 6.42607240644965e-07, + "loss": 0.927, + "step": 16175 + }, + { + "epoch": 0.38968232942366515, + "grad_norm": 1.4843415021896362, + "learning_rate": 6.424804786532806e-07, + "loss": 0.9106, + "step": 16180 + }, + { + "epoch": 0.3898027504154524, + "grad_norm": 1.4242477416992188, + "learning_rate": 6.423537166615962e-07, + "loss": 0.9481, + "step": 16185 + }, + { + "epoch": 0.3899231714072397, + "grad_norm": 1.34717857837677, + "learning_rate": 6.422269546699117e-07, + "loss": 0.9421, + "step": 16190 + }, + { + "epoch": 0.390043592399027, + "grad_norm": 1.734257698059082, + "learning_rate": 6.421001926782273e-07, + "loss": 0.9215, + "step": 16195 + }, + { + "epoch": 0.3901640133908143, + "grad_norm": 1.5538983345031738, + "learning_rate": 6.41973430686543e-07, + "loss": 0.9245, + "step": 16200 + }, + { + "epoch": 0.3902844343826016, + "grad_norm": 1.4236520528793335, + "learning_rate": 6.418466686948585e-07, + "loss": 0.9123, + "step": 16205 + }, + { + "epoch": 0.39040485537438885, + "grad_norm": 1.6775470972061157, + "learning_rate": 6.417199067031741e-07, + "loss": 0.9514, + "step": 16210 + }, + { + "epoch": 0.3905252763661762, + "grad_norm": 1.5894907712936401, + "learning_rate": 6.415931447114897e-07, + "loss": 0.8613, + "step": 16215 + }, + { + "epoch": 0.39064569735796345, + "grad_norm": 1.8615303039550781, + "learning_rate": 6.414663827198053e-07, + "loss": 0.8585, + "step": 16220 + }, + { + "epoch": 0.3907661183497507, + "grad_norm": 1.587877631187439, + "learning_rate": 6.413396207281209e-07, + "loss": 0.9591, + "step": 16225 + }, + { + "epoch": 0.390886539341538, + "grad_norm": 1.5446151494979858, + "learning_rate": 6.412128587364364e-07, + "loss": 0.9316, + "step": 16230 + }, + { + "epoch": 0.3910069603333253, + "grad_norm": 1.3091187477111816, + "learning_rate": 6.41086096744752e-07, + "loss": 0.9587, + "step": 16235 + }, + { + "epoch": 0.3911273813251126, + "grad_norm": 1.5175570249557495, + "learning_rate": 6.409593347530676e-07, + "loss": 0.9147, + "step": 16240 + }, + { + "epoch": 0.3912478023168999, + "grad_norm": 1.386093020439148, + "learning_rate": 6.408325727613833e-07, + "loss": 0.9803, + "step": 16245 + }, + { + "epoch": 0.39136822330868715, + "grad_norm": 1.4947230815887451, + "learning_rate": 6.407058107696988e-07, + "loss": 0.879, + "step": 16250 + }, + { + "epoch": 0.3914886443004745, + "grad_norm": 1.4012850522994995, + "learning_rate": 6.405790487780143e-07, + "loss": 0.8928, + "step": 16255 + }, + { + "epoch": 0.39160906529226175, + "grad_norm": 1.508278727531433, + "learning_rate": 6.4045228678633e-07, + "loss": 0.8992, + "step": 16260 + }, + { + "epoch": 0.391729486284049, + "grad_norm": 1.5345510244369507, + "learning_rate": 6.403255247946455e-07, + "loss": 0.9355, + "step": 16265 + }, + { + "epoch": 0.39184990727583635, + "grad_norm": 1.515647292137146, + "learning_rate": 6.401987628029612e-07, + "loss": 0.9122, + "step": 16270 + }, + { + "epoch": 0.3919703282676236, + "grad_norm": 1.5095194578170776, + "learning_rate": 6.400720008112768e-07, + "loss": 0.9869, + "step": 16275 + }, + { + "epoch": 0.3920907492594109, + "grad_norm": 1.6890863180160522, + "learning_rate": 6.399452388195922e-07, + "loss": 0.9405, + "step": 16280 + }, + { + "epoch": 0.39221117025119817, + "grad_norm": 1.5294418334960938, + "learning_rate": 6.398184768279079e-07, + "loss": 0.9011, + "step": 16285 + }, + { + "epoch": 0.3923315912429855, + "grad_norm": 1.4933300018310547, + "learning_rate": 6.396917148362235e-07, + "loss": 0.9059, + "step": 16290 + }, + { + "epoch": 0.3924520122347728, + "grad_norm": 1.4482684135437012, + "learning_rate": 6.395649528445391e-07, + "loss": 0.8861, + "step": 16295 + }, + { + "epoch": 0.39257243322656005, + "grad_norm": 1.3621864318847656, + "learning_rate": 6.394381908528546e-07, + "loss": 0.9167, + "step": 16300 + }, + { + "epoch": 0.3926928542183473, + "grad_norm": 1.451184868812561, + "learning_rate": 6.393114288611703e-07, + "loss": 0.9173, + "step": 16305 + }, + { + "epoch": 0.39281327521013465, + "grad_norm": 1.487359642982483, + "learning_rate": 6.391846668694858e-07, + "loss": 0.9596, + "step": 16310 + }, + { + "epoch": 0.3929336962019219, + "grad_norm": 1.5361380577087402, + "learning_rate": 6.390579048778014e-07, + "loss": 0.934, + "step": 16315 + }, + { + "epoch": 0.3930541171937092, + "grad_norm": 1.566884160041809, + "learning_rate": 6.389311428861171e-07, + "loss": 0.8644, + "step": 16320 + }, + { + "epoch": 0.39317453818549647, + "grad_norm": 1.6171588897705078, + "learning_rate": 6.388043808944325e-07, + "loss": 0.9374, + "step": 16325 + }, + { + "epoch": 0.3932949591772838, + "grad_norm": 1.5238803625106812, + "learning_rate": 6.386776189027482e-07, + "loss": 0.9359, + "step": 16330 + }, + { + "epoch": 0.39341538016907107, + "grad_norm": 1.6145451068878174, + "learning_rate": 6.385508569110638e-07, + "loss": 0.914, + "step": 16335 + }, + { + "epoch": 0.39353580116085835, + "grad_norm": 1.5774810314178467, + "learning_rate": 6.384240949193794e-07, + "loss": 0.8834, + "step": 16340 + }, + { + "epoch": 0.3936562221526457, + "grad_norm": 1.6705057621002197, + "learning_rate": 6.382973329276949e-07, + "loss": 0.8951, + "step": 16345 + }, + { + "epoch": 0.39377664314443295, + "grad_norm": 1.5039398670196533, + "learning_rate": 6.381705709360105e-07, + "loss": 0.9229, + "step": 16350 + }, + { + "epoch": 0.3938970641362202, + "grad_norm": 1.585025429725647, + "learning_rate": 6.380438089443261e-07, + "loss": 0.9039, + "step": 16355 + }, + { + "epoch": 0.3940174851280075, + "grad_norm": 1.6906098127365112, + "learning_rate": 6.379170469526417e-07, + "loss": 0.9368, + "step": 16360 + }, + { + "epoch": 0.3941379061197948, + "grad_norm": 1.5157690048217773, + "learning_rate": 6.377902849609574e-07, + "loss": 0.8906, + "step": 16365 + }, + { + "epoch": 0.3942583271115821, + "grad_norm": 1.4066121578216553, + "learning_rate": 6.376635229692728e-07, + "loss": 0.9337, + "step": 16370 + }, + { + "epoch": 0.39437874810336937, + "grad_norm": 1.4881788492202759, + "learning_rate": 6.375367609775884e-07, + "loss": 0.9241, + "step": 16375 + }, + { + "epoch": 0.39449916909515664, + "grad_norm": 1.500671148300171, + "learning_rate": 6.374099989859041e-07, + "loss": 0.97, + "step": 16380 + }, + { + "epoch": 0.39461959008694397, + "grad_norm": 1.5400736331939697, + "learning_rate": 6.372832369942196e-07, + "loss": 0.9208, + "step": 16385 + }, + { + "epoch": 0.39474001107873125, + "grad_norm": 1.468634009361267, + "learning_rate": 6.371564750025353e-07, + "loss": 0.9247, + "step": 16390 + }, + { + "epoch": 0.3948604320705185, + "grad_norm": 1.510955572128296, + "learning_rate": 6.370297130108508e-07, + "loss": 0.9041, + "step": 16395 + }, + { + "epoch": 0.39498085306230585, + "grad_norm": 1.528254747390747, + "learning_rate": 6.369029510191663e-07, + "loss": 0.9339, + "step": 16400 + }, + { + "epoch": 0.3951012740540931, + "grad_norm": 1.5330232381820679, + "learning_rate": 6.36776189027482e-07, + "loss": 0.897, + "step": 16405 + }, + { + "epoch": 0.3952216950458804, + "grad_norm": 1.4764940738677979, + "learning_rate": 6.366494270357976e-07, + "loss": 0.9394, + "step": 16410 + }, + { + "epoch": 0.39534211603766767, + "grad_norm": 1.4831757545471191, + "learning_rate": 6.365226650441131e-07, + "loss": 0.9418, + "step": 16415 + }, + { + "epoch": 0.395462537029455, + "grad_norm": 1.6943193674087524, + "learning_rate": 6.363959030524287e-07, + "loss": 0.8861, + "step": 16420 + }, + { + "epoch": 0.39558295802124227, + "grad_norm": 1.3982515335083008, + "learning_rate": 6.362691410607444e-07, + "loss": 0.791, + "step": 16425 + }, + { + "epoch": 0.39570337901302954, + "grad_norm": 1.4633845090866089, + "learning_rate": 6.361423790690599e-07, + "loss": 0.9638, + "step": 16430 + }, + { + "epoch": 0.3958238000048168, + "grad_norm": 1.807855486869812, + "learning_rate": 6.360156170773755e-07, + "loss": 0.941, + "step": 16435 + }, + { + "epoch": 0.39594422099660415, + "grad_norm": 1.523313283920288, + "learning_rate": 6.358888550856911e-07, + "loss": 0.9431, + "step": 16440 + }, + { + "epoch": 0.3960646419883914, + "grad_norm": 1.3463391065597534, + "learning_rate": 6.357620930940066e-07, + "loss": 0.9069, + "step": 16445 + }, + { + "epoch": 0.3961850629801787, + "grad_norm": 1.6524784564971924, + "learning_rate": 6.356353311023223e-07, + "loss": 0.9588, + "step": 16450 + }, + { + "epoch": 0.39630548397196597, + "grad_norm": 1.4540703296661377, + "learning_rate": 6.355085691106379e-07, + "loss": 0.8533, + "step": 16455 + }, + { + "epoch": 0.3964259049637533, + "grad_norm": 1.5041921138763428, + "learning_rate": 6.353818071189533e-07, + "loss": 0.9025, + "step": 16460 + }, + { + "epoch": 0.39654632595554057, + "grad_norm": 1.4892079830169678, + "learning_rate": 6.35255045127269e-07, + "loss": 0.948, + "step": 16465 + }, + { + "epoch": 0.39666674694732784, + "grad_norm": 1.909855604171753, + "learning_rate": 6.351282831355846e-07, + "loss": 0.9106, + "step": 16470 + }, + { + "epoch": 0.39678716793911517, + "grad_norm": 1.4165951013565063, + "learning_rate": 6.350015211439002e-07, + "loss": 0.9051, + "step": 16475 + }, + { + "epoch": 0.39690758893090244, + "grad_norm": 1.558835506439209, + "learning_rate": 6.348747591522158e-07, + "loss": 0.8895, + "step": 16480 + }, + { + "epoch": 0.3970280099226897, + "grad_norm": 1.5130789279937744, + "learning_rate": 6.347479971605314e-07, + "loss": 0.9011, + "step": 16485 + }, + { + "epoch": 0.397148430914477, + "grad_norm": 1.735569953918457, + "learning_rate": 6.346212351688469e-07, + "loss": 0.9343, + "step": 16490 + }, + { + "epoch": 0.3972688519062643, + "grad_norm": 1.4600125551223755, + "learning_rate": 6.344944731771625e-07, + "loss": 0.9466, + "step": 16495 + }, + { + "epoch": 0.3973892728980516, + "grad_norm": 1.612957239151001, + "learning_rate": 6.343677111854782e-07, + "loss": 0.9414, + "step": 16500 + }, + { + "epoch": 0.39750969388983887, + "grad_norm": 1.5158154964447021, + "learning_rate": 6.342409491937937e-07, + "loss": 0.9412, + "step": 16505 + }, + { + "epoch": 0.39763011488162614, + "grad_norm": 1.6656270027160645, + "learning_rate": 6.341141872021093e-07, + "loss": 0.8693, + "step": 16510 + }, + { + "epoch": 0.39775053587341347, + "grad_norm": 1.6811892986297607, + "learning_rate": 6.339874252104249e-07, + "loss": 0.9543, + "step": 16515 + }, + { + "epoch": 0.39787095686520074, + "grad_norm": 1.6302272081375122, + "learning_rate": 6.338606632187404e-07, + "loss": 0.9084, + "step": 16520 + }, + { + "epoch": 0.397991377856988, + "grad_norm": 1.7181906700134277, + "learning_rate": 6.337339012270561e-07, + "loss": 0.8922, + "step": 16525 + }, + { + "epoch": 0.39811179884877534, + "grad_norm": 1.523653268814087, + "learning_rate": 6.336071392353716e-07, + "loss": 0.8399, + "step": 16530 + }, + { + "epoch": 0.3982322198405626, + "grad_norm": 1.7446633577346802, + "learning_rate": 6.334803772436872e-07, + "loss": 0.9446, + "step": 16535 + }, + { + "epoch": 0.3983526408323499, + "grad_norm": 1.4260302782058716, + "learning_rate": 6.333536152520028e-07, + "loss": 0.9209, + "step": 16540 + }, + { + "epoch": 0.39847306182413716, + "grad_norm": 1.572892189025879, + "learning_rate": 6.332268532603185e-07, + "loss": 0.899, + "step": 16545 + }, + { + "epoch": 0.3985934828159245, + "grad_norm": 1.5617256164550781, + "learning_rate": 6.33100091268634e-07, + "loss": 0.909, + "step": 16550 + }, + { + "epoch": 0.39871390380771177, + "grad_norm": 1.680518388748169, + "learning_rate": 6.329733292769495e-07, + "loss": 0.9296, + "step": 16555 + }, + { + "epoch": 0.39883432479949904, + "grad_norm": 1.9393974542617798, + "learning_rate": 6.328465672852652e-07, + "loss": 0.8945, + "step": 16560 + }, + { + "epoch": 0.3989547457912863, + "grad_norm": 1.7089143991470337, + "learning_rate": 6.327198052935807e-07, + "loss": 0.9152, + "step": 16565 + }, + { + "epoch": 0.39907516678307364, + "grad_norm": 1.5115058422088623, + "learning_rate": 6.325930433018964e-07, + "loss": 0.8943, + "step": 16570 + }, + { + "epoch": 0.3991955877748609, + "grad_norm": 1.4281383752822876, + "learning_rate": 6.32466281310212e-07, + "loss": 0.9097, + "step": 16575 + }, + { + "epoch": 0.3993160087666482, + "grad_norm": 1.453593373298645, + "learning_rate": 6.323395193185274e-07, + "loss": 0.9352, + "step": 16580 + }, + { + "epoch": 0.39943642975843546, + "grad_norm": 1.7239865064620972, + "learning_rate": 6.322127573268431e-07, + "loss": 0.9259, + "step": 16585 + }, + { + "epoch": 0.3995568507502228, + "grad_norm": 1.3729093074798584, + "learning_rate": 6.320859953351587e-07, + "loss": 0.9445, + "step": 16590 + }, + { + "epoch": 0.39967727174201007, + "grad_norm": 1.594544529914856, + "learning_rate": 6.319592333434743e-07, + "loss": 0.9346, + "step": 16595 + }, + { + "epoch": 0.39979769273379734, + "grad_norm": 1.5372705459594727, + "learning_rate": 6.318324713517898e-07, + "loss": 0.937, + "step": 16600 + }, + { + "epoch": 0.39991811372558467, + "grad_norm": 1.740492343902588, + "learning_rate": 6.317057093601055e-07, + "loss": 0.955, + "step": 16605 + }, + { + "epoch": 0.40003853471737194, + "grad_norm": 1.7151694297790527, + "learning_rate": 6.31578947368421e-07, + "loss": 0.9433, + "step": 16610 + }, + { + "epoch": 0.4001589557091592, + "grad_norm": 1.511307954788208, + "learning_rate": 6.314521853767366e-07, + "loss": 0.8717, + "step": 16615 + }, + { + "epoch": 0.4002793767009465, + "grad_norm": 1.4371942281723022, + "learning_rate": 6.313254233850523e-07, + "loss": 0.9438, + "step": 16620 + }, + { + "epoch": 0.4003997976927338, + "grad_norm": 1.4800249338150024, + "learning_rate": 6.311986613933677e-07, + "loss": 0.92, + "step": 16625 + }, + { + "epoch": 0.4005202186845211, + "grad_norm": 1.5603761672973633, + "learning_rate": 6.310718994016834e-07, + "loss": 0.9322, + "step": 16630 + }, + { + "epoch": 0.40064063967630836, + "grad_norm": 1.6178046464920044, + "learning_rate": 6.30945137409999e-07, + "loss": 0.8687, + "step": 16635 + }, + { + "epoch": 0.40076106066809564, + "grad_norm": 1.4481092691421509, + "learning_rate": 6.308183754183145e-07, + "loss": 0.9405, + "step": 16640 + }, + { + "epoch": 0.40088148165988297, + "grad_norm": 1.5825382471084595, + "learning_rate": 6.306916134266301e-07, + "loss": 0.8999, + "step": 16645 + }, + { + "epoch": 0.40100190265167024, + "grad_norm": 1.569267749786377, + "learning_rate": 6.305648514349457e-07, + "loss": 0.8915, + "step": 16650 + }, + { + "epoch": 0.4011223236434575, + "grad_norm": 1.5229368209838867, + "learning_rate": 6.304380894432613e-07, + "loss": 0.9088, + "step": 16655 + }, + { + "epoch": 0.40124274463524484, + "grad_norm": 1.4642391204833984, + "learning_rate": 6.303113274515769e-07, + "loss": 0.9022, + "step": 16660 + }, + { + "epoch": 0.4013631656270321, + "grad_norm": 1.8505054712295532, + "learning_rate": 6.301845654598926e-07, + "loss": 0.9742, + "step": 16665 + }, + { + "epoch": 0.4014835866188194, + "grad_norm": 1.8605384826660156, + "learning_rate": 6.30057803468208e-07, + "loss": 0.8914, + "step": 16670 + }, + { + "epoch": 0.40160400761060666, + "grad_norm": 1.5216584205627441, + "learning_rate": 6.299310414765236e-07, + "loss": 0.9265, + "step": 16675 + }, + { + "epoch": 0.401724428602394, + "grad_norm": 1.6303132772445679, + "learning_rate": 6.298042794848393e-07, + "loss": 0.9063, + "step": 16680 + }, + { + "epoch": 0.40184484959418126, + "grad_norm": 1.4981777667999268, + "learning_rate": 6.296775174931548e-07, + "loss": 0.8603, + "step": 16685 + }, + { + "epoch": 0.40196527058596854, + "grad_norm": 1.6867685317993164, + "learning_rate": 6.295507555014705e-07, + "loss": 0.9356, + "step": 16690 + }, + { + "epoch": 0.4020856915777558, + "grad_norm": 1.4873652458190918, + "learning_rate": 6.29423993509786e-07, + "loss": 0.9087, + "step": 16695 + }, + { + "epoch": 0.40220611256954314, + "grad_norm": 1.558084487915039, + "learning_rate": 6.292972315181015e-07, + "loss": 0.9398, + "step": 16700 + }, + { + "epoch": 0.4023265335613304, + "grad_norm": 1.403309941291809, + "learning_rate": 6.291704695264172e-07, + "loss": 0.8758, + "step": 16705 + }, + { + "epoch": 0.4024469545531177, + "grad_norm": 1.7472859621047974, + "learning_rate": 6.290437075347328e-07, + "loss": 0.9222, + "step": 16710 + }, + { + "epoch": 0.402567375544905, + "grad_norm": 1.427155613899231, + "learning_rate": 6.289169455430483e-07, + "loss": 0.8999, + "step": 16715 + }, + { + "epoch": 0.4026877965366923, + "grad_norm": 1.5862841606140137, + "learning_rate": 6.287901835513639e-07, + "loss": 0.8766, + "step": 16720 + }, + { + "epoch": 0.40280821752847956, + "grad_norm": 1.6798174381256104, + "learning_rate": 6.286634215596795e-07, + "loss": 0.9728, + "step": 16725 + }, + { + "epoch": 0.40292863852026684, + "grad_norm": 1.4473795890808105, + "learning_rate": 6.285366595679951e-07, + "loss": 0.898, + "step": 16730 + }, + { + "epoch": 0.40304905951205416, + "grad_norm": 1.7120901346206665, + "learning_rate": 6.284098975763107e-07, + "loss": 0.8887, + "step": 16735 + }, + { + "epoch": 0.40316948050384144, + "grad_norm": 1.587019920349121, + "learning_rate": 6.282831355846263e-07, + "loss": 0.933, + "step": 16740 + }, + { + "epoch": 0.4032899014956287, + "grad_norm": 1.3541581630706787, + "learning_rate": 6.281563735929418e-07, + "loss": 0.8703, + "step": 16745 + }, + { + "epoch": 0.403410322487416, + "grad_norm": 1.5999221801757812, + "learning_rate": 6.280296116012575e-07, + "loss": 0.8818, + "step": 16750 + }, + { + "epoch": 0.4035307434792033, + "grad_norm": 1.6486579179763794, + "learning_rate": 6.279028496095731e-07, + "loss": 0.9107, + "step": 16755 + }, + { + "epoch": 0.4036511644709906, + "grad_norm": 1.8040155172348022, + "learning_rate": 6.277760876178885e-07, + "loss": 0.9138, + "step": 16760 + }, + { + "epoch": 0.40377158546277786, + "grad_norm": 1.6447283029556274, + "learning_rate": 6.276493256262042e-07, + "loss": 0.9212, + "step": 16765 + }, + { + "epoch": 0.40389200645456513, + "grad_norm": 1.5155274868011475, + "learning_rate": 6.275225636345198e-07, + "loss": 0.8841, + "step": 16770 + }, + { + "epoch": 0.40401242744635246, + "grad_norm": 1.705776333808899, + "learning_rate": 6.273958016428354e-07, + "loss": 0.9116, + "step": 16775 + }, + { + "epoch": 0.40413284843813974, + "grad_norm": 1.5871044397354126, + "learning_rate": 6.27269039651151e-07, + "loss": 0.9305, + "step": 16780 + }, + { + "epoch": 0.404253269429927, + "grad_norm": 1.5335018634796143, + "learning_rate": 6.271422776594665e-07, + "loss": 0.9243, + "step": 16785 + }, + { + "epoch": 0.40437369042171434, + "grad_norm": 1.5292437076568604, + "learning_rate": 6.270155156677821e-07, + "loss": 0.8909, + "step": 16790 + }, + { + "epoch": 0.4044941114135016, + "grad_norm": 1.6406742334365845, + "learning_rate": 6.268887536760977e-07, + "loss": 0.9501, + "step": 16795 + }, + { + "epoch": 0.4046145324052889, + "grad_norm": 1.462023377418518, + "learning_rate": 6.267619916844134e-07, + "loss": 0.8919, + "step": 16800 + }, + { + "epoch": 0.40473495339707616, + "grad_norm": 1.5037622451782227, + "learning_rate": 6.266352296927289e-07, + "loss": 0.9235, + "step": 16805 + }, + { + "epoch": 0.4048553743888635, + "grad_norm": 1.6942980289459229, + "learning_rate": 6.265084677010445e-07, + "loss": 0.9217, + "step": 16810 + }, + { + "epoch": 0.40497579538065076, + "grad_norm": 1.740375280380249, + "learning_rate": 6.263817057093601e-07, + "loss": 0.9559, + "step": 16815 + }, + { + "epoch": 0.40509621637243803, + "grad_norm": 1.5765180587768555, + "learning_rate": 6.262549437176756e-07, + "loss": 0.9235, + "step": 16820 + }, + { + "epoch": 0.4052166373642253, + "grad_norm": 1.4993369579315186, + "learning_rate": 6.261281817259913e-07, + "loss": 0.9422, + "step": 16825 + }, + { + "epoch": 0.40533705835601264, + "grad_norm": 1.3874576091766357, + "learning_rate": 6.260014197343068e-07, + "loss": 0.9643, + "step": 16830 + }, + { + "epoch": 0.4054574793477999, + "grad_norm": 1.4534552097320557, + "learning_rate": 6.258746577426224e-07, + "loss": 0.908, + "step": 16835 + }, + { + "epoch": 0.4055779003395872, + "grad_norm": 1.5588668584823608, + "learning_rate": 6.25747895750938e-07, + "loss": 0.8767, + "step": 16840 + }, + { + "epoch": 0.4056983213313745, + "grad_norm": 1.445541501045227, + "learning_rate": 6.256211337592536e-07, + "loss": 0.8934, + "step": 16845 + }, + { + "epoch": 0.4058187423231618, + "grad_norm": 1.5574666261672974, + "learning_rate": 6.254943717675692e-07, + "loss": 0.92, + "step": 16850 + }, + { + "epoch": 0.40593916331494906, + "grad_norm": 1.498324990272522, + "learning_rate": 6.253676097758847e-07, + "loss": 0.8976, + "step": 16855 + }, + { + "epoch": 0.40605958430673633, + "grad_norm": 1.4403085708618164, + "learning_rate": 6.252408477842004e-07, + "loss": 0.9248, + "step": 16860 + }, + { + "epoch": 0.40618000529852366, + "grad_norm": 1.6509935855865479, + "learning_rate": 6.251140857925159e-07, + "loss": 0.9398, + "step": 16865 + }, + { + "epoch": 0.40630042629031093, + "grad_norm": 1.5441958904266357, + "learning_rate": 6.249873238008316e-07, + "loss": 0.9183, + "step": 16870 + }, + { + "epoch": 0.4064208472820982, + "grad_norm": 1.4594789743423462, + "learning_rate": 6.248605618091472e-07, + "loss": 0.9768, + "step": 16875 + }, + { + "epoch": 0.4065412682738855, + "grad_norm": 1.571266770362854, + "learning_rate": 6.247337998174626e-07, + "loss": 0.8789, + "step": 16880 + }, + { + "epoch": 0.4066616892656728, + "grad_norm": 1.6715950965881348, + "learning_rate": 6.246070378257783e-07, + "loss": 0.8904, + "step": 16885 + }, + { + "epoch": 0.4067821102574601, + "grad_norm": 1.7333935499191284, + "learning_rate": 6.244802758340939e-07, + "loss": 0.9722, + "step": 16890 + }, + { + "epoch": 0.40690253124924736, + "grad_norm": 1.4930369853973389, + "learning_rate": 6.243535138424095e-07, + "loss": 0.879, + "step": 16895 + }, + { + "epoch": 0.40702295224103463, + "grad_norm": 1.5462515354156494, + "learning_rate": 6.24226751850725e-07, + "loss": 0.8793, + "step": 16900 + }, + { + "epoch": 0.40714337323282196, + "grad_norm": 1.6213178634643555, + "learning_rate": 6.240999898590406e-07, + "loss": 0.9315, + "step": 16905 + }, + { + "epoch": 0.40726379422460923, + "grad_norm": 1.5512969493865967, + "learning_rate": 6.239732278673562e-07, + "loss": 0.9116, + "step": 16910 + }, + { + "epoch": 0.4073842152163965, + "grad_norm": 1.5854614973068237, + "learning_rate": 6.238464658756718e-07, + "loss": 0.9105, + "step": 16915 + }, + { + "epoch": 0.40750463620818383, + "grad_norm": 1.4322234392166138, + "learning_rate": 6.237197038839875e-07, + "loss": 0.9013, + "step": 16920 + }, + { + "epoch": 0.4076250571999711, + "grad_norm": 1.5661823749542236, + "learning_rate": 6.235929418923029e-07, + "loss": 0.887, + "step": 16925 + }, + { + "epoch": 0.4077454781917584, + "grad_norm": 1.6415965557098389, + "learning_rate": 6.234661799006185e-07, + "loss": 0.8904, + "step": 16930 + }, + { + "epoch": 0.40786589918354565, + "grad_norm": 1.5648971796035767, + "learning_rate": 6.233394179089342e-07, + "loss": 0.9167, + "step": 16935 + }, + { + "epoch": 0.407986320175333, + "grad_norm": 1.5005626678466797, + "learning_rate": 6.232126559172497e-07, + "loss": 0.937, + "step": 16940 + }, + { + "epoch": 0.40810674116712026, + "grad_norm": 1.5273611545562744, + "learning_rate": 6.230858939255653e-07, + "loss": 0.9338, + "step": 16945 + }, + { + "epoch": 0.40822716215890753, + "grad_norm": 1.5377509593963623, + "learning_rate": 6.229591319338809e-07, + "loss": 0.9068, + "step": 16950 + }, + { + "epoch": 0.4083475831506948, + "grad_norm": 1.4508482217788696, + "learning_rate": 6.228323699421965e-07, + "loss": 0.9001, + "step": 16955 + }, + { + "epoch": 0.40846800414248213, + "grad_norm": 1.6953697204589844, + "learning_rate": 6.227056079505121e-07, + "loss": 0.9195, + "step": 16960 + }, + { + "epoch": 0.4085884251342694, + "grad_norm": 1.6012778282165527, + "learning_rate": 6.225788459588277e-07, + "loss": 0.8474, + "step": 16965 + }, + { + "epoch": 0.4087088461260567, + "grad_norm": 1.5923460721969604, + "learning_rate": 6.224520839671432e-07, + "loss": 0.9111, + "step": 16970 + }, + { + "epoch": 0.408829267117844, + "grad_norm": 1.6092580556869507, + "learning_rate": 6.223253219754588e-07, + "loss": 0.8893, + "step": 16975 + }, + { + "epoch": 0.4089496881096313, + "grad_norm": 1.7636964321136475, + "learning_rate": 6.221985599837745e-07, + "loss": 0.9521, + "step": 16980 + }, + { + "epoch": 0.40907010910141856, + "grad_norm": 1.5564132928848267, + "learning_rate": 6.2207179799209e-07, + "loss": 0.9026, + "step": 16985 + }, + { + "epoch": 0.40919053009320583, + "grad_norm": 1.6354960203170776, + "learning_rate": 6.219450360004056e-07, + "loss": 0.9423, + "step": 16990 + }, + { + "epoch": 0.40931095108499316, + "grad_norm": 1.5429902076721191, + "learning_rate": 6.218182740087212e-07, + "loss": 0.9144, + "step": 16995 + }, + { + "epoch": 0.40943137207678043, + "grad_norm": 2.275571823120117, + "learning_rate": 6.216915120170367e-07, + "loss": 0.9458, + "step": 17000 + }, + { + "epoch": 0.4095517930685677, + "grad_norm": 1.4017913341522217, + "learning_rate": 6.215647500253524e-07, + "loss": 0.9419, + "step": 17005 + }, + { + "epoch": 0.409672214060355, + "grad_norm": 1.5098503828048706, + "learning_rate": 6.21437988033668e-07, + "loss": 0.9175, + "step": 17010 + }, + { + "epoch": 0.4097926350521423, + "grad_norm": 1.4720624685287476, + "learning_rate": 6.213112260419834e-07, + "loss": 0.8629, + "step": 17015 + }, + { + "epoch": 0.4099130560439296, + "grad_norm": 1.7012861967086792, + "learning_rate": 6.211844640502991e-07, + "loss": 0.9614, + "step": 17020 + }, + { + "epoch": 0.41003347703571685, + "grad_norm": 1.732938528060913, + "learning_rate": 6.210577020586147e-07, + "loss": 0.9399, + "step": 17025 + }, + { + "epoch": 0.4101538980275041, + "grad_norm": 1.5244909524917603, + "learning_rate": 6.209309400669303e-07, + "loss": 0.8768, + "step": 17030 + }, + { + "epoch": 0.41027431901929146, + "grad_norm": 1.501592755317688, + "learning_rate": 6.208041780752459e-07, + "loss": 0.8713, + "step": 17035 + }, + { + "epoch": 0.41039474001107873, + "grad_norm": 1.55648934841156, + "learning_rate": 6.206774160835615e-07, + "loss": 0.9467, + "step": 17040 + }, + { + "epoch": 0.410515161002866, + "grad_norm": 1.4854168891906738, + "learning_rate": 6.20550654091877e-07, + "loss": 0.9206, + "step": 17045 + }, + { + "epoch": 0.41063558199465333, + "grad_norm": 1.598010540008545, + "learning_rate": 6.204238921001926e-07, + "loss": 0.892, + "step": 17050 + }, + { + "epoch": 0.4107560029864406, + "grad_norm": 1.6800447702407837, + "learning_rate": 6.202971301085083e-07, + "loss": 0.8823, + "step": 17055 + }, + { + "epoch": 0.4108764239782279, + "grad_norm": 1.541235327720642, + "learning_rate": 6.201703681168237e-07, + "loss": 0.9197, + "step": 17060 + }, + { + "epoch": 0.41099684497001515, + "grad_norm": 1.558349370956421, + "learning_rate": 6.200436061251394e-07, + "loss": 0.889, + "step": 17065 + }, + { + "epoch": 0.4111172659618025, + "grad_norm": 1.6211495399475098, + "learning_rate": 6.19916844133455e-07, + "loss": 0.9126, + "step": 17070 + }, + { + "epoch": 0.41123768695358975, + "grad_norm": 1.5009198188781738, + "learning_rate": 6.197900821417706e-07, + "loss": 0.9916, + "step": 17075 + }, + { + "epoch": 0.411358107945377, + "grad_norm": 1.7271745204925537, + "learning_rate": 6.196633201500862e-07, + "loss": 0.8982, + "step": 17080 + }, + { + "epoch": 0.4114785289371643, + "grad_norm": 1.4661312103271484, + "learning_rate": 6.195365581584017e-07, + "loss": 0.9097, + "step": 17085 + }, + { + "epoch": 0.41159894992895163, + "grad_norm": 1.656012773513794, + "learning_rate": 6.194097961667173e-07, + "loss": 0.9434, + "step": 17090 + }, + { + "epoch": 0.4117193709207389, + "grad_norm": 1.7758303880691528, + "learning_rate": 6.192830341750329e-07, + "loss": 0.9637, + "step": 17095 + }, + { + "epoch": 0.4118397919125262, + "grad_norm": 1.5642658472061157, + "learning_rate": 6.191562721833486e-07, + "loss": 0.9356, + "step": 17100 + }, + { + "epoch": 0.4119602129043135, + "grad_norm": 1.6727337837219238, + "learning_rate": 6.190295101916641e-07, + "loss": 0.958, + "step": 17105 + }, + { + "epoch": 0.4120806338961008, + "grad_norm": 1.5685449838638306, + "learning_rate": 6.189027481999796e-07, + "loss": 0.8907, + "step": 17110 + }, + { + "epoch": 0.41220105488788805, + "grad_norm": 1.5111569166183472, + "learning_rate": 6.187759862082953e-07, + "loss": 0.9169, + "step": 17115 + }, + { + "epoch": 0.4123214758796753, + "grad_norm": 1.5179859399795532, + "learning_rate": 6.186492242166108e-07, + "loss": 0.8996, + "step": 17120 + }, + { + "epoch": 0.41244189687146265, + "grad_norm": 1.6612423658370972, + "learning_rate": 6.185224622249265e-07, + "loss": 0.9161, + "step": 17125 + }, + { + "epoch": 0.4125623178632499, + "grad_norm": 1.5869808197021484, + "learning_rate": 6.18395700233242e-07, + "loss": 0.9005, + "step": 17130 + }, + { + "epoch": 0.4126827388550372, + "grad_norm": 1.490541696548462, + "learning_rate": 6.182689382415575e-07, + "loss": 0.949, + "step": 17135 + }, + { + "epoch": 0.4128031598468245, + "grad_norm": 1.481579065322876, + "learning_rate": 6.181421762498732e-07, + "loss": 0.8829, + "step": 17140 + }, + { + "epoch": 0.4129235808386118, + "grad_norm": 1.6850212812423706, + "learning_rate": 6.180154142581888e-07, + "loss": 0.8974, + "step": 17145 + }, + { + "epoch": 0.4130440018303991, + "grad_norm": 1.5239510536193848, + "learning_rate": 6.178886522665044e-07, + "loss": 0.9171, + "step": 17150 + }, + { + "epoch": 0.41316442282218635, + "grad_norm": 1.6153520345687866, + "learning_rate": 6.177618902748199e-07, + "loss": 0.9529, + "step": 17155 + }, + { + "epoch": 0.4132848438139737, + "grad_norm": 1.5189660787582397, + "learning_rate": 6.176351282831356e-07, + "loss": 0.9021, + "step": 17160 + }, + { + "epoch": 0.41340526480576095, + "grad_norm": 1.436489462852478, + "learning_rate": 6.175083662914511e-07, + "loss": 0.9482, + "step": 17165 + }, + { + "epoch": 0.4135256857975482, + "grad_norm": 1.5382148027420044, + "learning_rate": 6.173816042997667e-07, + "loss": 0.8656, + "step": 17170 + }, + { + "epoch": 0.4136461067893355, + "grad_norm": 1.4603832960128784, + "learning_rate": 6.172548423080824e-07, + "loss": 0.9581, + "step": 17175 + }, + { + "epoch": 0.41376652778112283, + "grad_norm": 1.5139708518981934, + "learning_rate": 6.171280803163979e-07, + "loss": 0.8897, + "step": 17180 + }, + { + "epoch": 0.4138869487729101, + "grad_norm": 1.4182488918304443, + "learning_rate": 6.170013183247135e-07, + "loss": 0.9158, + "step": 17185 + }, + { + "epoch": 0.4140073697646974, + "grad_norm": 1.9594801664352417, + "learning_rate": 6.168745563330291e-07, + "loss": 0.9053, + "step": 17190 + }, + { + "epoch": 0.41412779075648465, + "grad_norm": 1.588879108428955, + "learning_rate": 6.167477943413448e-07, + "loss": 0.8981, + "step": 17195 + }, + { + "epoch": 0.414248211748272, + "grad_norm": 1.7074207067489624, + "learning_rate": 6.166210323496602e-07, + "loss": 0.8977, + "step": 17200 + }, + { + "epoch": 0.41436863274005925, + "grad_norm": 1.6250323057174683, + "learning_rate": 6.164942703579758e-07, + "loss": 0.8861, + "step": 17205 + }, + { + "epoch": 0.4144890537318465, + "grad_norm": 1.850350022315979, + "learning_rate": 6.163675083662915e-07, + "loss": 0.8796, + "step": 17210 + }, + { + "epoch": 0.4146094747236338, + "grad_norm": 1.6797027587890625, + "learning_rate": 6.16240746374607e-07, + "loss": 0.8986, + "step": 17215 + }, + { + "epoch": 0.4147298957154211, + "grad_norm": 1.584898591041565, + "learning_rate": 6.161139843829227e-07, + "loss": 0.951, + "step": 17220 + }, + { + "epoch": 0.4148503167072084, + "grad_norm": 1.5470921993255615, + "learning_rate": 6.159872223912382e-07, + "loss": 0.9126, + "step": 17225 + }, + { + "epoch": 0.4149707376989957, + "grad_norm": 1.5832595825195312, + "learning_rate": 6.158604603995537e-07, + "loss": 0.9547, + "step": 17230 + }, + { + "epoch": 0.415091158690783, + "grad_norm": 1.6913233995437622, + "learning_rate": 6.157336984078694e-07, + "loss": 0.8602, + "step": 17235 + }, + { + "epoch": 0.4152115796825703, + "grad_norm": 1.8051854372024536, + "learning_rate": 6.15606936416185e-07, + "loss": 0.9028, + "step": 17240 + }, + { + "epoch": 0.41533200067435755, + "grad_norm": 1.5922828912734985, + "learning_rate": 6.154801744245005e-07, + "loss": 0.9325, + "step": 17245 + }, + { + "epoch": 0.4154524216661448, + "grad_norm": 1.8161146640777588, + "learning_rate": 6.153534124328161e-07, + "loss": 0.9126, + "step": 17250 + }, + { + "epoch": 0.41557284265793215, + "grad_norm": 1.5556491613388062, + "learning_rate": 6.152266504411317e-07, + "loss": 0.9627, + "step": 17255 + }, + { + "epoch": 0.4156932636497194, + "grad_norm": 1.4069287776947021, + "learning_rate": 6.150998884494473e-07, + "loss": 0.9157, + "step": 17260 + }, + { + "epoch": 0.4158136846415067, + "grad_norm": 1.5378355979919434, + "learning_rate": 6.149731264577629e-07, + "loss": 0.9711, + "step": 17265 + }, + { + "epoch": 0.41593410563329397, + "grad_norm": 1.4258906841278076, + "learning_rate": 6.148463644660785e-07, + "loss": 0.8693, + "step": 17270 + }, + { + "epoch": 0.4160545266250813, + "grad_norm": 1.4657279253005981, + "learning_rate": 6.14719602474394e-07, + "loss": 0.9234, + "step": 17275 + }, + { + "epoch": 0.4161749476168686, + "grad_norm": 1.731824517250061, + "learning_rate": 6.145928404827097e-07, + "loss": 0.8964, + "step": 17280 + }, + { + "epoch": 0.41629536860865585, + "grad_norm": 1.8693464994430542, + "learning_rate": 6.144660784910253e-07, + "loss": 0.9262, + "step": 17285 + }, + { + "epoch": 0.4164157896004432, + "grad_norm": 1.500510811805725, + "learning_rate": 6.143393164993408e-07, + "loss": 0.9205, + "step": 17290 + }, + { + "epoch": 0.41653621059223045, + "grad_norm": 1.625565767288208, + "learning_rate": 6.142125545076564e-07, + "loss": 0.9148, + "step": 17295 + }, + { + "epoch": 0.4166566315840177, + "grad_norm": 1.5997830629348755, + "learning_rate": 6.14085792515972e-07, + "loss": 0.9111, + "step": 17300 + }, + { + "epoch": 0.416777052575805, + "grad_norm": 1.6735154390335083, + "learning_rate": 6.139590305242876e-07, + "loss": 0.9583, + "step": 17305 + }, + { + "epoch": 0.4168974735675923, + "grad_norm": 1.5538638830184937, + "learning_rate": 6.138322685326032e-07, + "loss": 0.9632, + "step": 17310 + }, + { + "epoch": 0.4170178945593796, + "grad_norm": 1.4466016292572021, + "learning_rate": 6.137055065409187e-07, + "loss": 0.9088, + "step": 17315 + }, + { + "epoch": 0.41713831555116687, + "grad_norm": 1.5244786739349365, + "learning_rate": 6.135787445492343e-07, + "loss": 0.9279, + "step": 17320 + }, + { + "epoch": 0.41725873654295415, + "grad_norm": 1.305046796798706, + "learning_rate": 6.134519825575499e-07, + "loss": 0.9045, + "step": 17325 + }, + { + "epoch": 0.4173791575347415, + "grad_norm": 1.670288324356079, + "learning_rate": 6.133252205658656e-07, + "loss": 0.8701, + "step": 17330 + }, + { + "epoch": 0.41749957852652875, + "grad_norm": 1.6538128852844238, + "learning_rate": 6.131984585741811e-07, + "loss": 0.9362, + "step": 17335 + }, + { + "epoch": 0.417619999518316, + "grad_norm": 1.5970174074172974, + "learning_rate": 6.130716965824967e-07, + "loss": 0.9376, + "step": 17340 + }, + { + "epoch": 0.4177404205101033, + "grad_norm": 1.530699372291565, + "learning_rate": 6.129449345908123e-07, + "loss": 0.9649, + "step": 17345 + }, + { + "epoch": 0.4178608415018906, + "grad_norm": 1.6150037050247192, + "learning_rate": 6.128181725991278e-07, + "loss": 0.9158, + "step": 17350 + }, + { + "epoch": 0.4179812624936779, + "grad_norm": 1.4796768426895142, + "learning_rate": 6.126914106074435e-07, + "loss": 0.9045, + "step": 17355 + }, + { + "epoch": 0.41810168348546517, + "grad_norm": 1.542307734489441, + "learning_rate": 6.125646486157591e-07, + "loss": 0.8482, + "step": 17360 + }, + { + "epoch": 0.4182221044772525, + "grad_norm": 1.6469286680221558, + "learning_rate": 6.124378866240746e-07, + "loss": 0.929, + "step": 17365 + }, + { + "epoch": 0.41834252546903977, + "grad_norm": 1.5979429483413696, + "learning_rate": 6.123111246323902e-07, + "loss": 0.9167, + "step": 17370 + }, + { + "epoch": 0.41846294646082705, + "grad_norm": 1.4680538177490234, + "learning_rate": 6.121843626407058e-07, + "loss": 0.8667, + "step": 17375 + }, + { + "epoch": 0.4185833674526143, + "grad_norm": 1.752623200416565, + "learning_rate": 6.120576006490214e-07, + "loss": 0.9389, + "step": 17380 + }, + { + "epoch": 0.41870378844440165, + "grad_norm": 1.4785820245742798, + "learning_rate": 6.119308386573369e-07, + "loss": 0.8957, + "step": 17385 + }, + { + "epoch": 0.4188242094361889, + "grad_norm": 1.513867735862732, + "learning_rate": 6.118040766656526e-07, + "loss": 0.9167, + "step": 17390 + }, + { + "epoch": 0.4189446304279762, + "grad_norm": 1.5811598300933838, + "learning_rate": 6.116773146739681e-07, + "loss": 0.9019, + "step": 17395 + }, + { + "epoch": 0.41906505141976347, + "grad_norm": 1.643618106842041, + "learning_rate": 6.115505526822838e-07, + "loss": 0.9424, + "step": 17400 + }, + { + "epoch": 0.4191854724115508, + "grad_norm": 1.3657090663909912, + "learning_rate": 6.114237906905994e-07, + "loss": 0.9588, + "step": 17405 + }, + { + "epoch": 0.41930589340333807, + "grad_norm": 1.5041183233261108, + "learning_rate": 6.112970286989148e-07, + "loss": 0.8784, + "step": 17410 + }, + { + "epoch": 0.41942631439512534, + "grad_norm": 1.5289888381958008, + "learning_rate": 6.111702667072305e-07, + "loss": 0.8804, + "step": 17415 + }, + { + "epoch": 0.4195467353869127, + "grad_norm": 1.574815034866333, + "learning_rate": 6.110435047155461e-07, + "loss": 0.9201, + "step": 17420 + }, + { + "epoch": 0.41966715637869995, + "grad_norm": 1.6267809867858887, + "learning_rate": 6.109167427238617e-07, + "loss": 0.8833, + "step": 17425 + }, + { + "epoch": 0.4197875773704872, + "grad_norm": 1.4753605127334595, + "learning_rate": 6.107899807321772e-07, + "loss": 0.9439, + "step": 17430 + }, + { + "epoch": 0.4199079983622745, + "grad_norm": 1.5669854879379272, + "learning_rate": 6.106632187404928e-07, + "loss": 0.9109, + "step": 17435 + }, + { + "epoch": 0.4200284193540618, + "grad_norm": 1.5805938243865967, + "learning_rate": 6.105364567488084e-07, + "loss": 0.9097, + "step": 17440 + }, + { + "epoch": 0.4201488403458491, + "grad_norm": 1.501623272895813, + "learning_rate": 6.10409694757124e-07, + "loss": 0.9498, + "step": 17445 + }, + { + "epoch": 0.42026926133763637, + "grad_norm": 1.611153483390808, + "learning_rate": 6.102829327654397e-07, + "loss": 0.9448, + "step": 17450 + }, + { + "epoch": 0.42038968232942364, + "grad_norm": 1.4031232595443726, + "learning_rate": 6.101561707737551e-07, + "loss": 0.9303, + "step": 17455 + }, + { + "epoch": 0.42051010332121097, + "grad_norm": 1.5860594511032104, + "learning_rate": 6.100294087820707e-07, + "loss": 0.9008, + "step": 17460 + }, + { + "epoch": 0.42063052431299824, + "grad_norm": 1.665428876876831, + "learning_rate": 6.099026467903864e-07, + "loss": 0.9014, + "step": 17465 + }, + { + "epoch": 0.4207509453047855, + "grad_norm": 1.5833650827407837, + "learning_rate": 6.097758847987019e-07, + "loss": 0.9349, + "step": 17470 + }, + { + "epoch": 0.4208713662965728, + "grad_norm": 1.6934082508087158, + "learning_rate": 6.096491228070176e-07, + "loss": 0.9126, + "step": 17475 + }, + { + "epoch": 0.4209917872883601, + "grad_norm": 1.7227505445480347, + "learning_rate": 6.095223608153331e-07, + "loss": 1.0, + "step": 17480 + }, + { + "epoch": 0.4211122082801474, + "grad_norm": 1.4861100912094116, + "learning_rate": 6.093955988236487e-07, + "loss": 0.9235, + "step": 17485 + }, + { + "epoch": 0.42123262927193467, + "grad_norm": 1.6223926544189453, + "learning_rate": 6.092688368319643e-07, + "loss": 0.9327, + "step": 17490 + }, + { + "epoch": 0.421353050263722, + "grad_norm": 1.5731300115585327, + "learning_rate": 6.091420748402799e-07, + "loss": 0.9212, + "step": 17495 + }, + { + "epoch": 0.42147347125550927, + "grad_norm": 1.519948959350586, + "learning_rate": 6.090153128485954e-07, + "loss": 0.8683, + "step": 17500 + }, + { + "epoch": 0.42159389224729654, + "grad_norm": 1.383150339126587, + "learning_rate": 6.08888550856911e-07, + "loss": 0.9131, + "step": 17505 + }, + { + "epoch": 0.4217143132390838, + "grad_norm": 1.4093159437179565, + "learning_rate": 6.087617888652267e-07, + "loss": 0.9419, + "step": 17510 + }, + { + "epoch": 0.42183473423087114, + "grad_norm": 1.5490443706512451, + "learning_rate": 6.086350268735422e-07, + "loss": 0.9289, + "step": 17515 + }, + { + "epoch": 0.4219551552226584, + "grad_norm": 1.570052981376648, + "learning_rate": 6.085082648818578e-07, + "loss": 0.9024, + "step": 17520 + }, + { + "epoch": 0.4220755762144457, + "grad_norm": 1.583708643913269, + "learning_rate": 6.083815028901734e-07, + "loss": 0.8999, + "step": 17525 + }, + { + "epoch": 0.42219599720623296, + "grad_norm": 1.38755464553833, + "learning_rate": 6.082547408984889e-07, + "loss": 0.9087, + "step": 17530 + }, + { + "epoch": 0.4223164181980203, + "grad_norm": 1.5139927864074707, + "learning_rate": 6.081279789068046e-07, + "loss": 0.9185, + "step": 17535 + }, + { + "epoch": 0.42243683918980757, + "grad_norm": 1.3745379447937012, + "learning_rate": 6.080012169151202e-07, + "loss": 0.9573, + "step": 17540 + }, + { + "epoch": 0.42255726018159484, + "grad_norm": 1.3747448921203613, + "learning_rate": 6.078744549234356e-07, + "loss": 0.9007, + "step": 17545 + }, + { + "epoch": 0.42267768117338217, + "grad_norm": 1.657645344734192, + "learning_rate": 6.077476929317513e-07, + "loss": 0.927, + "step": 17550 + }, + { + "epoch": 0.42279810216516944, + "grad_norm": 1.5407798290252686, + "learning_rate": 6.076209309400669e-07, + "loss": 0.8979, + "step": 17555 + }, + { + "epoch": 0.4229185231569567, + "grad_norm": 1.5311154127120972, + "learning_rate": 6.074941689483825e-07, + "loss": 0.8862, + "step": 17560 + }, + { + "epoch": 0.423038944148744, + "grad_norm": 1.4764913320541382, + "learning_rate": 6.073674069566981e-07, + "loss": 0.8988, + "step": 17565 + }, + { + "epoch": 0.4231593651405313, + "grad_norm": 1.3411235809326172, + "learning_rate": 6.072406449650137e-07, + "loss": 0.9283, + "step": 17570 + }, + { + "epoch": 0.4232797861323186, + "grad_norm": 1.579624891281128, + "learning_rate": 6.071138829733292e-07, + "loss": 0.9296, + "step": 17575 + }, + { + "epoch": 0.42340020712410587, + "grad_norm": 1.8971341848373413, + "learning_rate": 6.069871209816448e-07, + "loss": 0.9012, + "step": 17580 + }, + { + "epoch": 0.42352062811589314, + "grad_norm": 1.5456421375274658, + "learning_rate": 6.068603589899605e-07, + "loss": 0.8915, + "step": 17585 + }, + { + "epoch": 0.42364104910768047, + "grad_norm": 1.4029710292816162, + "learning_rate": 6.067335969982759e-07, + "loss": 0.9787, + "step": 17590 + }, + { + "epoch": 0.42376147009946774, + "grad_norm": 1.7143163681030273, + "learning_rate": 6.066068350065916e-07, + "loss": 0.8726, + "step": 17595 + }, + { + "epoch": 0.423881891091255, + "grad_norm": 1.680330514907837, + "learning_rate": 6.064800730149072e-07, + "loss": 0.9316, + "step": 17600 + }, + { + "epoch": 0.42400231208304234, + "grad_norm": 1.5830532312393188, + "learning_rate": 6.063533110232228e-07, + "loss": 0.9317, + "step": 17605 + }, + { + "epoch": 0.4241227330748296, + "grad_norm": 1.6646406650543213, + "learning_rate": 6.062265490315384e-07, + "loss": 0.9463, + "step": 17610 + }, + { + "epoch": 0.4242431540666169, + "grad_norm": 1.6631529331207275, + "learning_rate": 6.060997870398539e-07, + "loss": 0.9289, + "step": 17615 + }, + { + "epoch": 0.42436357505840416, + "grad_norm": 1.5877068042755127, + "learning_rate": 6.059730250481695e-07, + "loss": 0.8933, + "step": 17620 + }, + { + "epoch": 0.4244839960501915, + "grad_norm": 1.5696178674697876, + "learning_rate": 6.058462630564851e-07, + "loss": 0.9762, + "step": 17625 + }, + { + "epoch": 0.42460441704197877, + "grad_norm": 2.281834363937378, + "learning_rate": 6.057195010648008e-07, + "loss": 0.8999, + "step": 17630 + }, + { + "epoch": 0.42472483803376604, + "grad_norm": 1.560080647468567, + "learning_rate": 6.055927390731163e-07, + "loss": 0.9323, + "step": 17635 + }, + { + "epoch": 0.4248452590255533, + "grad_norm": 1.710875153541565, + "learning_rate": 6.054659770814318e-07, + "loss": 0.9317, + "step": 17640 + }, + { + "epoch": 0.42496568001734064, + "grad_norm": 1.432875156402588, + "learning_rate": 6.053392150897475e-07, + "loss": 0.9254, + "step": 17645 + }, + { + "epoch": 0.4250861010091279, + "grad_norm": 1.6327581405639648, + "learning_rate": 6.05212453098063e-07, + "loss": 0.8601, + "step": 17650 + }, + { + "epoch": 0.4252065220009152, + "grad_norm": 1.535698413848877, + "learning_rate": 6.050856911063787e-07, + "loss": 0.9295, + "step": 17655 + }, + { + "epoch": 0.42532694299270246, + "grad_norm": 1.5619745254516602, + "learning_rate": 6.049589291146942e-07, + "loss": 0.9188, + "step": 17660 + }, + { + "epoch": 0.4254473639844898, + "grad_norm": 1.8153740167617798, + "learning_rate": 6.048321671230097e-07, + "loss": 0.9126, + "step": 17665 + }, + { + "epoch": 0.42556778497627706, + "grad_norm": 1.6192477941513062, + "learning_rate": 6.047054051313254e-07, + "loss": 0.8888, + "step": 17670 + }, + { + "epoch": 0.42568820596806434, + "grad_norm": 1.4542292356491089, + "learning_rate": 6.04578643139641e-07, + "loss": 0.9421, + "step": 17675 + }, + { + "epoch": 0.42580862695985167, + "grad_norm": 1.72112238407135, + "learning_rate": 6.044518811479566e-07, + "loss": 0.8955, + "step": 17680 + }, + { + "epoch": 0.42592904795163894, + "grad_norm": 1.7958778142929077, + "learning_rate": 6.043251191562721e-07, + "loss": 0.8612, + "step": 17685 + }, + { + "epoch": 0.4260494689434262, + "grad_norm": 1.5801275968551636, + "learning_rate": 6.041983571645878e-07, + "loss": 0.9085, + "step": 17690 + }, + { + "epoch": 0.4261698899352135, + "grad_norm": 1.7030131816864014, + "learning_rate": 6.040715951729033e-07, + "loss": 0.8984, + "step": 17695 + }, + { + "epoch": 0.4262903109270008, + "grad_norm": 1.5439563989639282, + "learning_rate": 6.039448331812189e-07, + "loss": 0.882, + "step": 17700 + }, + { + "epoch": 0.4264107319187881, + "grad_norm": 1.5502954721450806, + "learning_rate": 6.038180711895346e-07, + "loss": 0.8829, + "step": 17705 + }, + { + "epoch": 0.42653115291057536, + "grad_norm": 1.400022029876709, + "learning_rate": 6.0369130919785e-07, + "loss": 0.9075, + "step": 17710 + }, + { + "epoch": 0.42665157390236264, + "grad_norm": 1.654332160949707, + "learning_rate": 6.035645472061657e-07, + "loss": 0.9059, + "step": 17715 + }, + { + "epoch": 0.42677199489414996, + "grad_norm": 1.4520788192749023, + "learning_rate": 6.034377852144813e-07, + "loss": 0.8819, + "step": 17720 + }, + { + "epoch": 0.42689241588593724, + "grad_norm": 1.511816143989563, + "learning_rate": 6.033110232227968e-07, + "loss": 0.888, + "step": 17725 + }, + { + "epoch": 0.4270128368777245, + "grad_norm": 1.648866891860962, + "learning_rate": 6.031842612311124e-07, + "loss": 0.9111, + "step": 17730 + }, + { + "epoch": 0.42713325786951184, + "grad_norm": 1.6627436876296997, + "learning_rate": 6.03057499239428e-07, + "loss": 0.9159, + "step": 17735 + }, + { + "epoch": 0.4272536788612991, + "grad_norm": 1.4436696767807007, + "learning_rate": 6.029307372477436e-07, + "loss": 0.8866, + "step": 17740 + }, + { + "epoch": 0.4273740998530864, + "grad_norm": 1.6138030290603638, + "learning_rate": 6.028039752560592e-07, + "loss": 0.9277, + "step": 17745 + }, + { + "epoch": 0.42749452084487366, + "grad_norm": 1.652036190032959, + "learning_rate": 6.026772132643749e-07, + "loss": 0.9126, + "step": 17750 + }, + { + "epoch": 0.427614941836661, + "grad_norm": 1.5107982158660889, + "learning_rate": 6.025504512726903e-07, + "loss": 0.9208, + "step": 17755 + }, + { + "epoch": 0.42773536282844826, + "grad_norm": 1.5715196132659912, + "learning_rate": 6.024236892810059e-07, + "loss": 0.9569, + "step": 17760 + }, + { + "epoch": 0.42785578382023554, + "grad_norm": 1.4496690034866333, + "learning_rate": 6.022969272893216e-07, + "loss": 0.9336, + "step": 17765 + }, + { + "epoch": 0.4279762048120228, + "grad_norm": 1.6113660335540771, + "learning_rate": 6.021701652976371e-07, + "loss": 0.9136, + "step": 17770 + }, + { + "epoch": 0.42809662580381014, + "grad_norm": 1.7576104402542114, + "learning_rate": 6.020434033059527e-07, + "loss": 0.9168, + "step": 17775 + }, + { + "epoch": 0.4282170467955974, + "grad_norm": 1.6658543348312378, + "learning_rate": 6.019166413142683e-07, + "loss": 0.913, + "step": 17780 + }, + { + "epoch": 0.4283374677873847, + "grad_norm": 1.5523247718811035, + "learning_rate": 6.017898793225838e-07, + "loss": 0.9239, + "step": 17785 + }, + { + "epoch": 0.42845788877917196, + "grad_norm": 1.594376802444458, + "learning_rate": 6.016631173308995e-07, + "loss": 0.9631, + "step": 17790 + }, + { + "epoch": 0.4285783097709593, + "grad_norm": 1.3364665508270264, + "learning_rate": 6.015363553392151e-07, + "loss": 0.8864, + "step": 17795 + }, + { + "epoch": 0.42869873076274656, + "grad_norm": 1.6243319511413574, + "learning_rate": 6.014095933475306e-07, + "loss": 0.8928, + "step": 17800 + }, + { + "epoch": 0.42881915175453383, + "grad_norm": 2.047043561935425, + "learning_rate": 6.012828313558462e-07, + "loss": 0.9565, + "step": 17805 + }, + { + "epoch": 0.42893957274632116, + "grad_norm": 1.4199920892715454, + "learning_rate": 6.011560693641619e-07, + "loss": 0.8812, + "step": 17810 + }, + { + "epoch": 0.42905999373810844, + "grad_norm": 1.5130432844161987, + "learning_rate": 6.010293073724774e-07, + "loss": 0.8514, + "step": 17815 + }, + { + "epoch": 0.4291804147298957, + "grad_norm": 1.6105554103851318, + "learning_rate": 6.00902545380793e-07, + "loss": 0.8948, + "step": 17820 + }, + { + "epoch": 0.429300835721683, + "grad_norm": 1.5266790390014648, + "learning_rate": 6.007757833891086e-07, + "loss": 0.9541, + "step": 17825 + }, + { + "epoch": 0.4294212567134703, + "grad_norm": 1.5769493579864502, + "learning_rate": 6.006490213974241e-07, + "loss": 0.8985, + "step": 17830 + }, + { + "epoch": 0.4295416777052576, + "grad_norm": 1.99751615524292, + "learning_rate": 6.005222594057398e-07, + "loss": 0.9314, + "step": 17835 + }, + { + "epoch": 0.42966209869704486, + "grad_norm": 1.7150274515151978, + "learning_rate": 6.003954974140554e-07, + "loss": 0.9215, + "step": 17840 + }, + { + "epoch": 0.42978251968883213, + "grad_norm": 1.5975068807601929, + "learning_rate": 6.002687354223708e-07, + "loss": 0.908, + "step": 17845 + }, + { + "epoch": 0.42990294068061946, + "grad_norm": 1.5267257690429688, + "learning_rate": 6.001419734306865e-07, + "loss": 0.8962, + "step": 17850 + }, + { + "epoch": 0.43002336167240673, + "grad_norm": 1.561611533164978, + "learning_rate": 6.000152114390021e-07, + "loss": 0.9718, + "step": 17855 + }, + { + "epoch": 0.430143782664194, + "grad_norm": 1.575334072113037, + "learning_rate": 5.998884494473177e-07, + "loss": 0.9145, + "step": 17860 + }, + { + "epoch": 0.43026420365598134, + "grad_norm": 1.507455587387085, + "learning_rate": 5.997616874556333e-07, + "loss": 0.9244, + "step": 17865 + }, + { + "epoch": 0.4303846246477686, + "grad_norm": 1.6253801584243774, + "learning_rate": 5.996349254639489e-07, + "loss": 0.8836, + "step": 17870 + }, + { + "epoch": 0.4305050456395559, + "grad_norm": 1.6473288536071777, + "learning_rate": 5.995081634722644e-07, + "loss": 0.9189, + "step": 17875 + }, + { + "epoch": 0.43062546663134316, + "grad_norm": 1.3467274904251099, + "learning_rate": 5.9938140148058e-07, + "loss": 0.8914, + "step": 17880 + }, + { + "epoch": 0.4307458876231305, + "grad_norm": 1.6535570621490479, + "learning_rate": 5.992546394888957e-07, + "loss": 0.8545, + "step": 17885 + }, + { + "epoch": 0.43086630861491776, + "grad_norm": 1.9367197751998901, + "learning_rate": 5.991278774972111e-07, + "loss": 0.9876, + "step": 17890 + }, + { + "epoch": 0.43098672960670503, + "grad_norm": 1.5257887840270996, + "learning_rate": 5.990011155055268e-07, + "loss": 0.8616, + "step": 17895 + }, + { + "epoch": 0.4311071505984923, + "grad_norm": 1.6853076219558716, + "learning_rate": 5.988743535138424e-07, + "loss": 0.9274, + "step": 17900 + }, + { + "epoch": 0.43122757159027963, + "grad_norm": 1.6795798540115356, + "learning_rate": 5.987475915221579e-07, + "loss": 0.8983, + "step": 17905 + }, + { + "epoch": 0.4313479925820669, + "grad_norm": 1.533904790878296, + "learning_rate": 5.986208295304736e-07, + "loss": 0.8607, + "step": 17910 + }, + { + "epoch": 0.4314684135738542, + "grad_norm": 1.7242625951766968, + "learning_rate": 5.984940675387891e-07, + "loss": 0.9448, + "step": 17915 + }, + { + "epoch": 0.4315888345656415, + "grad_norm": 1.5450302362442017, + "learning_rate": 5.983673055471047e-07, + "loss": 0.9315, + "step": 17920 + }, + { + "epoch": 0.4317092555574288, + "grad_norm": 1.547723650932312, + "learning_rate": 5.982405435554203e-07, + "loss": 0.9453, + "step": 17925 + }, + { + "epoch": 0.43182967654921606, + "grad_norm": 1.5352214574813843, + "learning_rate": 5.98113781563736e-07, + "loss": 0.9061, + "step": 17930 + }, + { + "epoch": 0.43195009754100333, + "grad_norm": 1.7504684925079346, + "learning_rate": 5.979870195720515e-07, + "loss": 0.903, + "step": 17935 + }, + { + "epoch": 0.43207051853279066, + "grad_norm": 1.4188649654388428, + "learning_rate": 5.97860257580367e-07, + "loss": 0.91, + "step": 17940 + }, + { + "epoch": 0.43219093952457793, + "grad_norm": 1.8634692430496216, + "learning_rate": 5.977334955886827e-07, + "loss": 0.9519, + "step": 17945 + }, + { + "epoch": 0.4323113605163652, + "grad_norm": 1.4302127361297607, + "learning_rate": 5.976067335969982e-07, + "loss": 0.9287, + "step": 17950 + }, + { + "epoch": 0.4324317815081525, + "grad_norm": 1.4653289318084717, + "learning_rate": 5.974799716053139e-07, + "loss": 0.9013, + "step": 17955 + }, + { + "epoch": 0.4325522024999398, + "grad_norm": 1.4174020290374756, + "learning_rate": 5.973532096136294e-07, + "loss": 0.9004, + "step": 17960 + }, + { + "epoch": 0.4326726234917271, + "grad_norm": 1.780843734741211, + "learning_rate": 5.972264476219449e-07, + "loss": 0.884, + "step": 17965 + }, + { + "epoch": 0.43279304448351436, + "grad_norm": 1.7866368293762207, + "learning_rate": 5.970996856302606e-07, + "loss": 0.8893, + "step": 17970 + }, + { + "epoch": 0.43291346547530163, + "grad_norm": 1.6399894952774048, + "learning_rate": 5.969729236385762e-07, + "loss": 0.8958, + "step": 17975 + }, + { + "epoch": 0.43303388646708896, + "grad_norm": 1.3787689208984375, + "learning_rate": 5.968461616468918e-07, + "loss": 0.9251, + "step": 17980 + }, + { + "epoch": 0.43315430745887623, + "grad_norm": 1.4486820697784424, + "learning_rate": 5.967193996552073e-07, + "loss": 0.9001, + "step": 17985 + }, + { + "epoch": 0.4332747284506635, + "grad_norm": 1.5622378587722778, + "learning_rate": 5.96592637663523e-07, + "loss": 0.9648, + "step": 17990 + }, + { + "epoch": 0.43339514944245083, + "grad_norm": 1.6893956661224365, + "learning_rate": 5.964658756718385e-07, + "loss": 0.9027, + "step": 17995 + }, + { + "epoch": 0.4335155704342381, + "grad_norm": 1.5421922206878662, + "learning_rate": 5.963391136801541e-07, + "loss": 0.9626, + "step": 18000 + }, + { + "epoch": 0.4336359914260254, + "grad_norm": 1.4266217947006226, + "learning_rate": 5.962123516884698e-07, + "loss": 0.8913, + "step": 18005 + }, + { + "epoch": 0.43375641241781265, + "grad_norm": 1.4913381338119507, + "learning_rate": 5.960855896967852e-07, + "loss": 0.9139, + "step": 18010 + }, + { + "epoch": 0.4338768334096, + "grad_norm": 1.609383225440979, + "learning_rate": 5.959588277051009e-07, + "loss": 0.8989, + "step": 18015 + }, + { + "epoch": 0.43399725440138726, + "grad_norm": 1.436481237411499, + "learning_rate": 5.958320657134165e-07, + "loss": 0.9932, + "step": 18020 + }, + { + "epoch": 0.43411767539317453, + "grad_norm": 1.5978612899780273, + "learning_rate": 5.95705303721732e-07, + "loss": 0.8818, + "step": 18025 + }, + { + "epoch": 0.4342380963849618, + "grad_norm": 1.6705412864685059, + "learning_rate": 5.955785417300476e-07, + "loss": 0.8584, + "step": 18030 + }, + { + "epoch": 0.43435851737674913, + "grad_norm": 1.4328734874725342, + "learning_rate": 5.954517797383632e-07, + "loss": 0.9302, + "step": 18035 + }, + { + "epoch": 0.4344789383685364, + "grad_norm": 1.2172200679779053, + "learning_rate": 5.953250177466788e-07, + "loss": 0.9303, + "step": 18040 + }, + { + "epoch": 0.4345993593603237, + "grad_norm": 1.629767656326294, + "learning_rate": 5.951982557549944e-07, + "loss": 0.8776, + "step": 18045 + }, + { + "epoch": 0.434719780352111, + "grad_norm": 1.515710711479187, + "learning_rate": 5.9507149376331e-07, + "loss": 0.8947, + "step": 18050 + }, + { + "epoch": 0.4348402013438983, + "grad_norm": 1.75547456741333, + "learning_rate": 5.949447317716255e-07, + "loss": 0.9042, + "step": 18055 + }, + { + "epoch": 0.43496062233568555, + "grad_norm": 1.578229308128357, + "learning_rate": 5.948179697799411e-07, + "loss": 0.9092, + "step": 18060 + }, + { + "epoch": 0.4350810433274728, + "grad_norm": 1.6973870992660522, + "learning_rate": 5.946912077882568e-07, + "loss": 0.9316, + "step": 18065 + }, + { + "epoch": 0.43520146431926016, + "grad_norm": 1.6119117736816406, + "learning_rate": 5.945644457965723e-07, + "loss": 0.8674, + "step": 18070 + }, + { + "epoch": 0.43532188531104743, + "grad_norm": 1.3468626737594604, + "learning_rate": 5.944376838048879e-07, + "loss": 0.8576, + "step": 18075 + }, + { + "epoch": 0.4354423063028347, + "grad_norm": 1.451310396194458, + "learning_rate": 5.943109218132035e-07, + "loss": 0.9071, + "step": 18080 + }, + { + "epoch": 0.435562727294622, + "grad_norm": 1.6123762130737305, + "learning_rate": 5.94184159821519e-07, + "loss": 0.9343, + "step": 18085 + }, + { + "epoch": 0.4356831482864093, + "grad_norm": 1.454443335533142, + "learning_rate": 5.940573978298347e-07, + "loss": 0.9151, + "step": 18090 + }, + { + "epoch": 0.4358035692781966, + "grad_norm": 1.4933301210403442, + "learning_rate": 5.939306358381503e-07, + "loss": 0.9507, + "step": 18095 + }, + { + "epoch": 0.43592399026998385, + "grad_norm": 1.517813801765442, + "learning_rate": 5.938038738464658e-07, + "loss": 0.9592, + "step": 18100 + }, + { + "epoch": 0.4360444112617711, + "grad_norm": 1.5954922437667847, + "learning_rate": 5.936771118547814e-07, + "loss": 0.9175, + "step": 18105 + }, + { + "epoch": 0.43616483225355845, + "grad_norm": 1.4719862937927246, + "learning_rate": 5.93550349863097e-07, + "loss": 0.9304, + "step": 18110 + }, + { + "epoch": 0.4362852532453457, + "grad_norm": 1.4959850311279297, + "learning_rate": 5.934235878714126e-07, + "loss": 0.9394, + "step": 18115 + }, + { + "epoch": 0.436405674237133, + "grad_norm": 1.2718183994293213, + "learning_rate": 5.932968258797282e-07, + "loss": 0.9027, + "step": 18120 + }, + { + "epoch": 0.43652609522892033, + "grad_norm": 1.5270932912826538, + "learning_rate": 5.931700638880438e-07, + "loss": 0.9114, + "step": 18125 + }, + { + "epoch": 0.4366465162207076, + "grad_norm": 1.6102068424224854, + "learning_rate": 5.930433018963593e-07, + "loss": 0.9323, + "step": 18130 + }, + { + "epoch": 0.4367669372124949, + "grad_norm": 1.6144814491271973, + "learning_rate": 5.92916539904675e-07, + "loss": 0.9071, + "step": 18135 + }, + { + "epoch": 0.43688735820428215, + "grad_norm": 1.586299180984497, + "learning_rate": 5.927897779129906e-07, + "loss": 0.9377, + "step": 18140 + }, + { + "epoch": 0.4370077791960695, + "grad_norm": 1.690378189086914, + "learning_rate": 5.92663015921306e-07, + "loss": 0.8917, + "step": 18145 + }, + { + "epoch": 0.43712820018785675, + "grad_norm": 1.6998260021209717, + "learning_rate": 5.925362539296217e-07, + "loss": 0.9332, + "step": 18150 + }, + { + "epoch": 0.437248621179644, + "grad_norm": 1.5584924221038818, + "learning_rate": 5.924094919379373e-07, + "loss": 0.9188, + "step": 18155 + }, + { + "epoch": 0.4373690421714313, + "grad_norm": 1.5578699111938477, + "learning_rate": 5.922827299462529e-07, + "loss": 0.9049, + "step": 18160 + }, + { + "epoch": 0.43748946316321863, + "grad_norm": 1.6602588891983032, + "learning_rate": 5.921559679545685e-07, + "loss": 0.9397, + "step": 18165 + }, + { + "epoch": 0.4376098841550059, + "grad_norm": 1.7947601079940796, + "learning_rate": 5.92029205962884e-07, + "loss": 0.9206, + "step": 18170 + }, + { + "epoch": 0.4377303051467932, + "grad_norm": 1.5118459463119507, + "learning_rate": 5.919024439711996e-07, + "loss": 0.9207, + "step": 18175 + }, + { + "epoch": 0.4378507261385805, + "grad_norm": 1.6038711071014404, + "learning_rate": 5.917756819795152e-07, + "loss": 0.8964, + "step": 18180 + }, + { + "epoch": 0.4379711471303678, + "grad_norm": 1.8039369583129883, + "learning_rate": 5.916489199878309e-07, + "loss": 0.9169, + "step": 18185 + }, + { + "epoch": 0.43809156812215505, + "grad_norm": 1.5942789316177368, + "learning_rate": 5.915221579961463e-07, + "loss": 0.9819, + "step": 18190 + }, + { + "epoch": 0.4382119891139423, + "grad_norm": 1.330572485923767, + "learning_rate": 5.91395396004462e-07, + "loss": 0.9146, + "step": 18195 + }, + { + "epoch": 0.43833241010572965, + "grad_norm": 1.6067678928375244, + "learning_rate": 5.912686340127776e-07, + "loss": 0.8957, + "step": 18200 + }, + { + "epoch": 0.4384528310975169, + "grad_norm": 1.4917579889297485, + "learning_rate": 5.911418720210932e-07, + "loss": 0.9636, + "step": 18205 + }, + { + "epoch": 0.4385732520893042, + "grad_norm": 1.3807231187820435, + "learning_rate": 5.910151100294088e-07, + "loss": 0.8647, + "step": 18210 + }, + { + "epoch": 0.4386936730810915, + "grad_norm": 1.4772977828979492, + "learning_rate": 5.908883480377243e-07, + "loss": 0.9232, + "step": 18215 + }, + { + "epoch": 0.4388140940728788, + "grad_norm": 1.711417317390442, + "learning_rate": 5.9076158604604e-07, + "loss": 0.9004, + "step": 18220 + }, + { + "epoch": 0.4389345150646661, + "grad_norm": 1.4783941507339478, + "learning_rate": 5.906348240543555e-07, + "loss": 0.9263, + "step": 18225 + }, + { + "epoch": 0.43905493605645335, + "grad_norm": 1.916579008102417, + "learning_rate": 5.905080620626711e-07, + "loss": 0.8874, + "step": 18230 + }, + { + "epoch": 0.4391753570482406, + "grad_norm": 1.3718454837799072, + "learning_rate": 5.903813000709868e-07, + "loss": 0.9168, + "step": 18235 + }, + { + "epoch": 0.43929577804002795, + "grad_norm": 1.5684967041015625, + "learning_rate": 5.902545380793022e-07, + "loss": 0.8782, + "step": 18240 + }, + { + "epoch": 0.4394161990318152, + "grad_norm": 1.6263731718063354, + "learning_rate": 5.901277760876179e-07, + "loss": 0.9195, + "step": 18245 + }, + { + "epoch": 0.4395366200236025, + "grad_norm": 1.4979406595230103, + "learning_rate": 5.900010140959335e-07, + "loss": 0.9013, + "step": 18250 + }, + { + "epoch": 0.4396570410153898, + "grad_norm": 1.8347595930099487, + "learning_rate": 5.89874252104249e-07, + "loss": 0.9248, + "step": 18255 + }, + { + "epoch": 0.4397774620071771, + "grad_norm": 1.6108922958374023, + "learning_rate": 5.897474901125646e-07, + "loss": 0.8857, + "step": 18260 + }, + { + "epoch": 0.4398978829989644, + "grad_norm": 1.4353135824203491, + "learning_rate": 5.896207281208802e-07, + "loss": 0.9357, + "step": 18265 + }, + { + "epoch": 0.44001830399075165, + "grad_norm": 1.3888901472091675, + "learning_rate": 5.894939661291958e-07, + "loss": 0.9444, + "step": 18270 + }, + { + "epoch": 0.440138724982539, + "grad_norm": 1.522700548171997, + "learning_rate": 5.893672041375114e-07, + "loss": 0.8662, + "step": 18275 + }, + { + "epoch": 0.44025914597432625, + "grad_norm": 1.5019195079803467, + "learning_rate": 5.892404421458271e-07, + "loss": 0.955, + "step": 18280 + }, + { + "epoch": 0.4403795669661135, + "grad_norm": 1.7402147054672241, + "learning_rate": 5.891136801541425e-07, + "loss": 0.9712, + "step": 18285 + }, + { + "epoch": 0.4404999879579008, + "grad_norm": 1.4262351989746094, + "learning_rate": 5.889869181624581e-07, + "loss": 0.9333, + "step": 18290 + }, + { + "epoch": 0.4406204089496881, + "grad_norm": 1.7041842937469482, + "learning_rate": 5.888601561707738e-07, + "loss": 0.8435, + "step": 18295 + }, + { + "epoch": 0.4407408299414754, + "grad_norm": 1.5107301473617554, + "learning_rate": 5.887333941790893e-07, + "loss": 0.9032, + "step": 18300 + }, + { + "epoch": 0.44086125093326267, + "grad_norm": 1.543336033821106, + "learning_rate": 5.88606632187405e-07, + "loss": 0.9219, + "step": 18305 + }, + { + "epoch": 0.44098167192505, + "grad_norm": 1.5038975477218628, + "learning_rate": 5.884798701957205e-07, + "loss": 0.8895, + "step": 18310 + }, + { + "epoch": 0.4411020929168373, + "grad_norm": 1.7226147651672363, + "learning_rate": 5.88353108204036e-07, + "loss": 0.917, + "step": 18315 + }, + { + "epoch": 0.44122251390862455, + "grad_norm": 1.7164808511734009, + "learning_rate": 5.882263462123517e-07, + "loss": 0.9058, + "step": 18320 + }, + { + "epoch": 0.4413429349004118, + "grad_norm": 1.4979898929595947, + "learning_rate": 5.880995842206673e-07, + "loss": 0.9308, + "step": 18325 + }, + { + "epoch": 0.44146335589219915, + "grad_norm": 1.5202665328979492, + "learning_rate": 5.879728222289828e-07, + "loss": 0.9014, + "step": 18330 + }, + { + "epoch": 0.4415837768839864, + "grad_norm": 1.4686599969863892, + "learning_rate": 5.878460602372984e-07, + "loss": 0.9461, + "step": 18335 + }, + { + "epoch": 0.4417041978757737, + "grad_norm": 1.7539727687835693, + "learning_rate": 5.877192982456141e-07, + "loss": 0.9511, + "step": 18340 + }, + { + "epoch": 0.44182461886756097, + "grad_norm": 1.5274763107299805, + "learning_rate": 5.875925362539296e-07, + "loss": 0.9521, + "step": 18345 + }, + { + "epoch": 0.4419450398593483, + "grad_norm": 1.4581748247146606, + "learning_rate": 5.874657742622452e-07, + "loss": 0.9155, + "step": 18350 + }, + { + "epoch": 0.44206546085113557, + "grad_norm": 1.493125319480896, + "learning_rate": 5.873390122705608e-07, + "loss": 0.8899, + "step": 18355 + }, + { + "epoch": 0.44218588184292285, + "grad_norm": 1.47279953956604, + "learning_rate": 5.872122502788763e-07, + "loss": 0.8576, + "step": 18360 + }, + { + "epoch": 0.4423063028347102, + "grad_norm": 1.8338985443115234, + "learning_rate": 5.87085488287192e-07, + "loss": 0.9497, + "step": 18365 + }, + { + "epoch": 0.44242672382649745, + "grad_norm": 1.5390070676803589, + "learning_rate": 5.869587262955076e-07, + "loss": 0.9115, + "step": 18370 + }, + { + "epoch": 0.4425471448182847, + "grad_norm": 1.5464487075805664, + "learning_rate": 5.86831964303823e-07, + "loss": 0.9, + "step": 18375 + }, + { + "epoch": 0.442667565810072, + "grad_norm": 1.4387696981430054, + "learning_rate": 5.867052023121387e-07, + "loss": 0.9589, + "step": 18380 + }, + { + "epoch": 0.4427879868018593, + "grad_norm": 1.3581068515777588, + "learning_rate": 5.865784403204543e-07, + "loss": 0.8743, + "step": 18385 + }, + { + "epoch": 0.4429084077936466, + "grad_norm": 1.4792227745056152, + "learning_rate": 5.864516783287699e-07, + "loss": 0.9316, + "step": 18390 + }, + { + "epoch": 0.44302882878543387, + "grad_norm": 1.7097699642181396, + "learning_rate": 5.863249163370855e-07, + "loss": 0.8889, + "step": 18395 + }, + { + "epoch": 0.44314924977722114, + "grad_norm": 1.5005133152008057, + "learning_rate": 5.86198154345401e-07, + "loss": 0.9554, + "step": 18400 + }, + { + "epoch": 0.4432696707690085, + "grad_norm": 1.667360782623291, + "learning_rate": 5.860713923537166e-07, + "loss": 0.8946, + "step": 18405 + }, + { + "epoch": 0.44339009176079575, + "grad_norm": 1.39607572555542, + "learning_rate": 5.859446303620322e-07, + "loss": 0.9718, + "step": 18410 + }, + { + "epoch": 0.443510512752583, + "grad_norm": 1.274971842765808, + "learning_rate": 5.858178683703479e-07, + "loss": 0.9119, + "step": 18415 + }, + { + "epoch": 0.4436309337443703, + "grad_norm": 1.655137538909912, + "learning_rate": 5.856911063786634e-07, + "loss": 0.8827, + "step": 18420 + }, + { + "epoch": 0.4437513547361576, + "grad_norm": 1.6791355609893799, + "learning_rate": 5.85564344386979e-07, + "loss": 0.9148, + "step": 18425 + }, + { + "epoch": 0.4438717757279449, + "grad_norm": 1.470255732536316, + "learning_rate": 5.854375823952946e-07, + "loss": 0.9326, + "step": 18430 + }, + { + "epoch": 0.44399219671973217, + "grad_norm": 1.4369548559188843, + "learning_rate": 5.853108204036101e-07, + "loss": 0.9058, + "step": 18435 + }, + { + "epoch": 0.4441126177115195, + "grad_norm": 1.660200834274292, + "learning_rate": 5.851840584119258e-07, + "loss": 0.9008, + "step": 18440 + }, + { + "epoch": 0.44423303870330677, + "grad_norm": 1.6829956769943237, + "learning_rate": 5.850572964202413e-07, + "loss": 0.9556, + "step": 18445 + }, + { + "epoch": 0.44435345969509404, + "grad_norm": 1.4490734338760376, + "learning_rate": 5.849305344285569e-07, + "loss": 0.9025, + "step": 18450 + }, + { + "epoch": 0.4444738806868813, + "grad_norm": 1.6446168422698975, + "learning_rate": 5.848037724368725e-07, + "loss": 0.9281, + "step": 18455 + }, + { + "epoch": 0.44459430167866865, + "grad_norm": 1.7542520761489868, + "learning_rate": 5.846770104451882e-07, + "loss": 0.9345, + "step": 18460 + }, + { + "epoch": 0.4447147226704559, + "grad_norm": 1.385473608970642, + "learning_rate": 5.845502484535037e-07, + "loss": 0.86, + "step": 18465 + }, + { + "epoch": 0.4448351436622432, + "grad_norm": 1.7109313011169434, + "learning_rate": 5.844234864618192e-07, + "loss": 0.8704, + "step": 18470 + }, + { + "epoch": 0.44495556465403047, + "grad_norm": 1.594736099243164, + "learning_rate": 5.842967244701349e-07, + "loss": 0.9048, + "step": 18475 + }, + { + "epoch": 0.4450759856458178, + "grad_norm": 1.4663512706756592, + "learning_rate": 5.841699624784504e-07, + "loss": 0.9278, + "step": 18480 + }, + { + "epoch": 0.44519640663760507, + "grad_norm": 1.726747751235962, + "learning_rate": 5.840432004867661e-07, + "loss": 0.8576, + "step": 18485 + }, + { + "epoch": 0.44531682762939234, + "grad_norm": 1.5041638612747192, + "learning_rate": 5.839164384950817e-07, + "loss": 0.9079, + "step": 18490 + }, + { + "epoch": 0.44543724862117967, + "grad_norm": 1.6056970357894897, + "learning_rate": 5.837896765033971e-07, + "loss": 0.9491, + "step": 18495 + }, + { + "epoch": 0.44555766961296694, + "grad_norm": 1.565638780593872, + "learning_rate": 5.836629145117128e-07, + "loss": 0.9341, + "step": 18500 + }, + { + "epoch": 0.4456780906047542, + "grad_norm": 1.3103954792022705, + "learning_rate": 5.835361525200284e-07, + "loss": 0.9042, + "step": 18505 + }, + { + "epoch": 0.4457985115965415, + "grad_norm": 1.373991847038269, + "learning_rate": 5.83409390528344e-07, + "loss": 0.8608, + "step": 18510 + }, + { + "epoch": 0.4459189325883288, + "grad_norm": 1.3458672761917114, + "learning_rate": 5.832826285366595e-07, + "loss": 0.9019, + "step": 18515 + }, + { + "epoch": 0.4460393535801161, + "grad_norm": 1.499133586883545, + "learning_rate": 5.831558665449751e-07, + "loss": 0.902, + "step": 18520 + }, + { + "epoch": 0.44615977457190337, + "grad_norm": 1.6567200422286987, + "learning_rate": 5.830291045532907e-07, + "loss": 0.9113, + "step": 18525 + }, + { + "epoch": 0.44628019556369064, + "grad_norm": 1.7166463136672974, + "learning_rate": 5.829023425616063e-07, + "loss": 0.888, + "step": 18530 + }, + { + "epoch": 0.44640061655547797, + "grad_norm": 1.7077884674072266, + "learning_rate": 5.82775580569922e-07, + "loss": 0.9058, + "step": 18535 + }, + { + "epoch": 0.44652103754726524, + "grad_norm": 1.6900447607040405, + "learning_rate": 5.826488185782374e-07, + "loss": 0.8517, + "step": 18540 + }, + { + "epoch": 0.4466414585390525, + "grad_norm": 1.6718276739120483, + "learning_rate": 5.825220565865531e-07, + "loss": 0.8974, + "step": 18545 + }, + { + "epoch": 0.4467618795308398, + "grad_norm": 1.6190602779388428, + "learning_rate": 5.823952945948687e-07, + "loss": 0.9342, + "step": 18550 + }, + { + "epoch": 0.4468823005226271, + "grad_norm": 1.8204940557479858, + "learning_rate": 5.822685326031842e-07, + "loss": 0.8827, + "step": 18555 + }, + { + "epoch": 0.4470027215144144, + "grad_norm": 1.5450371503829956, + "learning_rate": 5.821417706114998e-07, + "loss": 0.9504, + "step": 18560 + }, + { + "epoch": 0.44712314250620167, + "grad_norm": 1.7189927101135254, + "learning_rate": 5.820150086198154e-07, + "loss": 0.921, + "step": 18565 + }, + { + "epoch": 0.447243563497989, + "grad_norm": 1.7274240255355835, + "learning_rate": 5.81888246628131e-07, + "loss": 0.9178, + "step": 18570 + }, + { + "epoch": 0.44736398448977627, + "grad_norm": 1.5280604362487793, + "learning_rate": 5.817614846364466e-07, + "loss": 0.9305, + "step": 18575 + }, + { + "epoch": 0.44748440548156354, + "grad_norm": 1.4250680208206177, + "learning_rate": 5.816347226447623e-07, + "loss": 0.8913, + "step": 18580 + }, + { + "epoch": 0.4476048264733508, + "grad_norm": 1.6027624607086182, + "learning_rate": 5.815079606530777e-07, + "loss": 0.9009, + "step": 18585 + }, + { + "epoch": 0.44772524746513814, + "grad_norm": 1.576629877090454, + "learning_rate": 5.813811986613933e-07, + "loss": 0.9222, + "step": 18590 + }, + { + "epoch": 0.4478456684569254, + "grad_norm": 1.5910590887069702, + "learning_rate": 5.81254436669709e-07, + "loss": 0.9767, + "step": 18595 + }, + { + "epoch": 0.4479660894487127, + "grad_norm": 1.580986499786377, + "learning_rate": 5.811276746780245e-07, + "loss": 0.8606, + "step": 18600 + }, + { + "epoch": 0.44808651044049996, + "grad_norm": 1.5624583959579468, + "learning_rate": 5.810009126863402e-07, + "loss": 0.8826, + "step": 18605 + }, + { + "epoch": 0.4482069314322873, + "grad_norm": 1.5432230234146118, + "learning_rate": 5.808741506946557e-07, + "loss": 0.9025, + "step": 18610 + }, + { + "epoch": 0.44832735242407457, + "grad_norm": 1.5583560466766357, + "learning_rate": 5.807473887029712e-07, + "loss": 0.8883, + "step": 18615 + }, + { + "epoch": 0.44844777341586184, + "grad_norm": 1.5831446647644043, + "learning_rate": 5.806206267112869e-07, + "loss": 0.9434, + "step": 18620 + }, + { + "epoch": 0.44856819440764917, + "grad_norm": 1.3928394317626953, + "learning_rate": 5.804938647196025e-07, + "loss": 0.8973, + "step": 18625 + }, + { + "epoch": 0.44868861539943644, + "grad_norm": 1.4622230529785156, + "learning_rate": 5.80367102727918e-07, + "loss": 0.8864, + "step": 18630 + }, + { + "epoch": 0.4488090363912237, + "grad_norm": 1.3747080564498901, + "learning_rate": 5.802403407362336e-07, + "loss": 0.8946, + "step": 18635 + }, + { + "epoch": 0.448929457383011, + "grad_norm": 1.7349574565887451, + "learning_rate": 5.801135787445492e-07, + "loss": 0.953, + "step": 18640 + }, + { + "epoch": 0.4490498783747983, + "grad_norm": 1.4037730693817139, + "learning_rate": 5.799868167528648e-07, + "loss": 0.8879, + "step": 18645 + }, + { + "epoch": 0.4491702993665856, + "grad_norm": 1.5784122943878174, + "learning_rate": 5.798600547611804e-07, + "loss": 0.9634, + "step": 18650 + }, + { + "epoch": 0.44929072035837286, + "grad_norm": 1.639102816581726, + "learning_rate": 5.79733292769496e-07, + "loss": 0.9171, + "step": 18655 + }, + { + "epoch": 0.44941114135016014, + "grad_norm": 1.5633991956710815, + "learning_rate": 5.796065307778115e-07, + "loss": 0.9123, + "step": 18660 + }, + { + "epoch": 0.44953156234194747, + "grad_norm": 1.5898741483688354, + "learning_rate": 5.794797687861272e-07, + "loss": 0.873, + "step": 18665 + }, + { + "epoch": 0.44965198333373474, + "grad_norm": 1.4367605447769165, + "learning_rate": 5.793530067944428e-07, + "loss": 0.846, + "step": 18670 + }, + { + "epoch": 0.449772404325522, + "grad_norm": 1.4805914163589478, + "learning_rate": 5.792262448027582e-07, + "loss": 0.9828, + "step": 18675 + }, + { + "epoch": 0.4498928253173093, + "grad_norm": 1.7379878759384155, + "learning_rate": 5.790994828110739e-07, + "loss": 0.9296, + "step": 18680 + }, + { + "epoch": 0.4500132463090966, + "grad_norm": 1.6005162000656128, + "learning_rate": 5.789727208193895e-07, + "loss": 0.953, + "step": 18685 + }, + { + "epoch": 0.4501336673008839, + "grad_norm": 1.7036206722259521, + "learning_rate": 5.788459588277051e-07, + "loss": 0.8372, + "step": 18690 + }, + { + "epoch": 0.45025408829267116, + "grad_norm": 1.5740010738372803, + "learning_rate": 5.787191968360207e-07, + "loss": 0.9642, + "step": 18695 + }, + { + "epoch": 0.4503745092844585, + "grad_norm": 1.5430411100387573, + "learning_rate": 5.785924348443362e-07, + "loss": 0.9323, + "step": 18700 + }, + { + "epoch": 0.45049493027624576, + "grad_norm": 1.446671724319458, + "learning_rate": 5.784656728526518e-07, + "loss": 0.9183, + "step": 18705 + }, + { + "epoch": 0.45061535126803304, + "grad_norm": 1.4905126094818115, + "learning_rate": 5.783389108609674e-07, + "loss": 0.897, + "step": 18710 + }, + { + "epoch": 0.4507357722598203, + "grad_norm": 1.5193527936935425, + "learning_rate": 5.782121488692831e-07, + "loss": 0.9159, + "step": 18715 + }, + { + "epoch": 0.45085619325160764, + "grad_norm": 1.6405127048492432, + "learning_rate": 5.780853868775986e-07, + "loss": 0.8922, + "step": 18720 + }, + { + "epoch": 0.4509766142433949, + "grad_norm": 1.4062474966049194, + "learning_rate": 5.779586248859141e-07, + "loss": 0.9168, + "step": 18725 + }, + { + "epoch": 0.4510970352351822, + "grad_norm": 1.3942046165466309, + "learning_rate": 5.778318628942298e-07, + "loss": 0.843, + "step": 18730 + }, + { + "epoch": 0.45121745622696946, + "grad_norm": 1.5190876722335815, + "learning_rate": 5.777051009025453e-07, + "loss": 0.895, + "step": 18735 + }, + { + "epoch": 0.4513378772187568, + "grad_norm": 1.42783784866333, + "learning_rate": 5.77578338910861e-07, + "loss": 0.8943, + "step": 18740 + }, + { + "epoch": 0.45145829821054406, + "grad_norm": 1.7751065492630005, + "learning_rate": 5.774515769191765e-07, + "loss": 0.8743, + "step": 18745 + }, + { + "epoch": 0.45157871920233134, + "grad_norm": 1.5588613748550415, + "learning_rate": 5.773248149274921e-07, + "loss": 0.9128, + "step": 18750 + }, + { + "epoch": 0.45169914019411866, + "grad_norm": 1.5651437044143677, + "learning_rate": 5.771980529358077e-07, + "loss": 0.8912, + "step": 18755 + }, + { + "epoch": 0.45181956118590594, + "grad_norm": 1.7240246534347534, + "learning_rate": 5.770712909441233e-07, + "loss": 0.9365, + "step": 18760 + }, + { + "epoch": 0.4519399821776932, + "grad_norm": 1.4902751445770264, + "learning_rate": 5.769445289524389e-07, + "loss": 0.8875, + "step": 18765 + }, + { + "epoch": 0.4520604031694805, + "grad_norm": 1.8112143278121948, + "learning_rate": 5.768177669607544e-07, + "loss": 0.9054, + "step": 18770 + }, + { + "epoch": 0.4521808241612678, + "grad_norm": 2.001507520675659, + "learning_rate": 5.766910049690701e-07, + "loss": 0.9174, + "step": 18775 + }, + { + "epoch": 0.4523012451530551, + "grad_norm": 1.568212628364563, + "learning_rate": 5.765642429773856e-07, + "loss": 0.9049, + "step": 18780 + }, + { + "epoch": 0.45242166614484236, + "grad_norm": 1.4313302040100098, + "learning_rate": 5.764374809857012e-07, + "loss": 0.9175, + "step": 18785 + }, + { + "epoch": 0.45254208713662963, + "grad_norm": 1.4737589359283447, + "learning_rate": 5.763107189940169e-07, + "loss": 0.8932, + "step": 18790 + }, + { + "epoch": 0.45266250812841696, + "grad_norm": 1.3767342567443848, + "learning_rate": 5.761839570023323e-07, + "loss": 0.9753, + "step": 18795 + }, + { + "epoch": 0.45278292912020424, + "grad_norm": 1.583419919013977, + "learning_rate": 5.76057195010648e-07, + "loss": 0.9356, + "step": 18800 + }, + { + "epoch": 0.4529033501119915, + "grad_norm": 1.5111521482467651, + "learning_rate": 5.759304330189636e-07, + "loss": 0.9451, + "step": 18805 + }, + { + "epoch": 0.45302377110377884, + "grad_norm": 1.6245722770690918, + "learning_rate": 5.758036710272792e-07, + "loss": 0.9008, + "step": 18810 + }, + { + "epoch": 0.4531441920955661, + "grad_norm": 1.539433240890503, + "learning_rate": 5.756769090355947e-07, + "loss": 0.9434, + "step": 18815 + }, + { + "epoch": 0.4532646130873534, + "grad_norm": 1.4418680667877197, + "learning_rate": 5.755501470439103e-07, + "loss": 0.9049, + "step": 18820 + }, + { + "epoch": 0.45338503407914066, + "grad_norm": 1.708088994026184, + "learning_rate": 5.754233850522259e-07, + "loss": 0.9196, + "step": 18825 + }, + { + "epoch": 0.453505455070928, + "grad_norm": 1.5235261917114258, + "learning_rate": 5.752966230605415e-07, + "loss": 0.9526, + "step": 18830 + }, + { + "epoch": 0.45362587606271526, + "grad_norm": 1.585042119026184, + "learning_rate": 5.751698610688572e-07, + "loss": 0.8745, + "step": 18835 + }, + { + "epoch": 0.45374629705450253, + "grad_norm": 1.689333200454712, + "learning_rate": 5.750430990771726e-07, + "loss": 0.9088, + "step": 18840 + }, + { + "epoch": 0.4538667180462898, + "grad_norm": 1.4670780897140503, + "learning_rate": 5.749163370854882e-07, + "loss": 0.9259, + "step": 18845 + }, + { + "epoch": 0.45398713903807714, + "grad_norm": 1.5117682218551636, + "learning_rate": 5.747895750938039e-07, + "loss": 0.8814, + "step": 18850 + }, + { + "epoch": 0.4541075600298644, + "grad_norm": 1.6144859790802002, + "learning_rate": 5.746628131021194e-07, + "loss": 0.9731, + "step": 18855 + }, + { + "epoch": 0.4542279810216517, + "grad_norm": 1.5023188591003418, + "learning_rate": 5.74536051110435e-07, + "loss": 0.8978, + "step": 18860 + }, + { + "epoch": 0.45434840201343896, + "grad_norm": 1.6316559314727783, + "learning_rate": 5.744092891187506e-07, + "loss": 0.9299, + "step": 18865 + }, + { + "epoch": 0.4544688230052263, + "grad_norm": 1.8118706941604614, + "learning_rate": 5.742825271270662e-07, + "loss": 0.8923, + "step": 18870 + }, + { + "epoch": 0.45458924399701356, + "grad_norm": 1.6119071245193481, + "learning_rate": 5.741557651353818e-07, + "loss": 0.8898, + "step": 18875 + }, + { + "epoch": 0.45470966498880083, + "grad_norm": 1.597812294960022, + "learning_rate": 5.740290031436974e-07, + "loss": 0.9185, + "step": 18880 + }, + { + "epoch": 0.45483008598058816, + "grad_norm": 1.587369680404663, + "learning_rate": 5.739022411520129e-07, + "loss": 0.9109, + "step": 18885 + }, + { + "epoch": 0.45495050697237543, + "grad_norm": 1.6539413928985596, + "learning_rate": 5.737754791603285e-07, + "loss": 0.8844, + "step": 18890 + }, + { + "epoch": 0.4550709279641627, + "grad_norm": 1.4326918125152588, + "learning_rate": 5.736487171686442e-07, + "loss": 0.9227, + "step": 18895 + }, + { + "epoch": 0.45519134895595, + "grad_norm": 1.6746175289154053, + "learning_rate": 5.735219551769597e-07, + "loss": 0.9478, + "step": 18900 + }, + { + "epoch": 0.4553117699477373, + "grad_norm": 1.4657456874847412, + "learning_rate": 5.733951931852753e-07, + "loss": 0.9131, + "step": 18905 + }, + { + "epoch": 0.4554321909395246, + "grad_norm": 1.613221287727356, + "learning_rate": 5.732684311935909e-07, + "loss": 0.9123, + "step": 18910 + }, + { + "epoch": 0.45555261193131186, + "grad_norm": 1.4709984064102173, + "learning_rate": 5.731416692019064e-07, + "loss": 0.856, + "step": 18915 + }, + { + "epoch": 0.45567303292309913, + "grad_norm": 1.6130681037902832, + "learning_rate": 5.730149072102221e-07, + "loss": 0.9035, + "step": 18920 + }, + { + "epoch": 0.45579345391488646, + "grad_norm": 1.5863080024719238, + "learning_rate": 5.728881452185377e-07, + "loss": 0.9249, + "step": 18925 + }, + { + "epoch": 0.45591387490667373, + "grad_norm": 1.593340516090393, + "learning_rate": 5.727613832268531e-07, + "loss": 0.9131, + "step": 18930 + }, + { + "epoch": 0.456034295898461, + "grad_norm": 1.613885521888733, + "learning_rate": 5.726346212351688e-07, + "loss": 0.914, + "step": 18935 + }, + { + "epoch": 0.45615471689024834, + "grad_norm": 1.6788082122802734, + "learning_rate": 5.725078592434844e-07, + "loss": 0.8911, + "step": 18940 + }, + { + "epoch": 0.4562751378820356, + "grad_norm": 1.7046750783920288, + "learning_rate": 5.723810972518e-07, + "loss": 0.9332, + "step": 18945 + }, + { + "epoch": 0.4563955588738229, + "grad_norm": 1.4113725423812866, + "learning_rate": 5.722543352601156e-07, + "loss": 0.9386, + "step": 18950 + }, + { + "epoch": 0.45651597986561016, + "grad_norm": 1.8018156290054321, + "learning_rate": 5.721275732684312e-07, + "loss": 0.9314, + "step": 18955 + }, + { + "epoch": 0.4566364008573975, + "grad_norm": 1.3819090127944946, + "learning_rate": 5.720008112767467e-07, + "loss": 0.9241, + "step": 18960 + }, + { + "epoch": 0.45675682184918476, + "grad_norm": 1.5696793794631958, + "learning_rate": 5.718740492850623e-07, + "loss": 0.9194, + "step": 18965 + }, + { + "epoch": 0.45687724284097203, + "grad_norm": 1.608102560043335, + "learning_rate": 5.71747287293378e-07, + "loss": 0.9204, + "step": 18970 + }, + { + "epoch": 0.4569976638327593, + "grad_norm": 1.5484436750411987, + "learning_rate": 5.716205253016934e-07, + "loss": 0.9414, + "step": 18975 + }, + { + "epoch": 0.45711808482454663, + "grad_norm": 1.551183819770813, + "learning_rate": 5.714937633100091e-07, + "loss": 0.9455, + "step": 18980 + }, + { + "epoch": 0.4572385058163339, + "grad_norm": 1.6090680360794067, + "learning_rate": 5.713670013183247e-07, + "loss": 0.9278, + "step": 18985 + }, + { + "epoch": 0.4573589268081212, + "grad_norm": 1.5775976181030273, + "learning_rate": 5.712402393266402e-07, + "loss": 0.9173, + "step": 18990 + }, + { + "epoch": 0.45747934779990845, + "grad_norm": 1.6444989442825317, + "learning_rate": 5.711134773349559e-07, + "loss": 0.9548, + "step": 18995 + }, + { + "epoch": 0.4575997687916958, + "grad_norm": 1.5032627582550049, + "learning_rate": 5.709867153432714e-07, + "loss": 0.9373, + "step": 19000 + }, + { + "epoch": 0.45772018978348306, + "grad_norm": 1.5852969884872437, + "learning_rate": 5.70859953351587e-07, + "loss": 0.926, + "step": 19005 + }, + { + "epoch": 0.45784061077527033, + "grad_norm": 1.5551520586013794, + "learning_rate": 5.707331913599026e-07, + "loss": 0.9585, + "step": 19010 + }, + { + "epoch": 0.45796103176705766, + "grad_norm": 1.562559962272644, + "learning_rate": 5.706064293682183e-07, + "loss": 0.9485, + "step": 19015 + }, + { + "epoch": 0.45808145275884493, + "grad_norm": 1.5225250720977783, + "learning_rate": 5.704796673765337e-07, + "loss": 0.8889, + "step": 19020 + }, + { + "epoch": 0.4582018737506322, + "grad_norm": 1.78077232837677, + "learning_rate": 5.703529053848493e-07, + "loss": 0.9894, + "step": 19025 + }, + { + "epoch": 0.4583222947424195, + "grad_norm": 1.660193681716919, + "learning_rate": 5.70226143393165e-07, + "loss": 0.9268, + "step": 19030 + }, + { + "epoch": 0.4584427157342068, + "grad_norm": 1.4802565574645996, + "learning_rate": 5.700993814014805e-07, + "loss": 0.9387, + "step": 19035 + }, + { + "epoch": 0.4585631367259941, + "grad_norm": 1.4176965951919556, + "learning_rate": 5.699726194097962e-07, + "loss": 0.917, + "step": 19040 + }, + { + "epoch": 0.45868355771778135, + "grad_norm": 1.7834274768829346, + "learning_rate": 5.698458574181117e-07, + "loss": 0.9135, + "step": 19045 + }, + { + "epoch": 0.4588039787095686, + "grad_norm": 1.7946574687957764, + "learning_rate": 5.697190954264272e-07, + "loss": 0.9362, + "step": 19050 + }, + { + "epoch": 0.45892439970135596, + "grad_norm": 1.5921515226364136, + "learning_rate": 5.695923334347429e-07, + "loss": 0.8743, + "step": 19055 + }, + { + "epoch": 0.45904482069314323, + "grad_norm": 1.4635932445526123, + "learning_rate": 5.694655714430585e-07, + "loss": 0.9383, + "step": 19060 + }, + { + "epoch": 0.4591652416849305, + "grad_norm": 1.5557299852371216, + "learning_rate": 5.693388094513741e-07, + "loss": 0.9065, + "step": 19065 + }, + { + "epoch": 0.45928566267671783, + "grad_norm": 1.3830097913742065, + "learning_rate": 5.692120474596896e-07, + "loss": 0.8941, + "step": 19070 + }, + { + "epoch": 0.4594060836685051, + "grad_norm": 1.644334316253662, + "learning_rate": 5.690852854680053e-07, + "loss": 0.903, + "step": 19075 + }, + { + "epoch": 0.4595265046602924, + "grad_norm": 1.4195899963378906, + "learning_rate": 5.689585234763208e-07, + "loss": 0.9194, + "step": 19080 + }, + { + "epoch": 0.45964692565207965, + "grad_norm": 1.5357506275177002, + "learning_rate": 5.688317614846364e-07, + "loss": 0.9624, + "step": 19085 + }, + { + "epoch": 0.459767346643867, + "grad_norm": 1.6892377138137817, + "learning_rate": 5.68704999492952e-07, + "loss": 0.9293, + "step": 19090 + }, + { + "epoch": 0.45988776763565425, + "grad_norm": 1.3714441061019897, + "learning_rate": 5.685782375012675e-07, + "loss": 0.8866, + "step": 19095 + }, + { + "epoch": 0.4600081886274415, + "grad_norm": 1.8560410737991333, + "learning_rate": 5.684514755095832e-07, + "loss": 0.9763, + "step": 19100 + }, + { + "epoch": 0.4601286096192288, + "grad_norm": 1.7453593015670776, + "learning_rate": 5.683247135178988e-07, + "loss": 0.914, + "step": 19105 + }, + { + "epoch": 0.46024903061101613, + "grad_norm": 1.5820306539535522, + "learning_rate": 5.681979515262143e-07, + "loss": 0.9245, + "step": 19110 + }, + { + "epoch": 0.4603694516028034, + "grad_norm": 1.48934006690979, + "learning_rate": 5.680711895345299e-07, + "loss": 0.8885, + "step": 19115 + }, + { + "epoch": 0.4604898725945907, + "grad_norm": 1.542831301689148, + "learning_rate": 5.679444275428455e-07, + "loss": 0.9049, + "step": 19120 + }, + { + "epoch": 0.46061029358637795, + "grad_norm": 1.5863815546035767, + "learning_rate": 5.678176655511611e-07, + "loss": 0.9294, + "step": 19125 + }, + { + "epoch": 0.4607307145781653, + "grad_norm": 1.705835223197937, + "learning_rate": 5.676909035594767e-07, + "loss": 0.9287, + "step": 19130 + }, + { + "epoch": 0.46085113556995255, + "grad_norm": 1.6327176094055176, + "learning_rate": 5.675641415677924e-07, + "loss": 0.9219, + "step": 19135 + }, + { + "epoch": 0.4609715565617398, + "grad_norm": 1.6016877889633179, + "learning_rate": 5.674373795761078e-07, + "loss": 0.8602, + "step": 19140 + }, + { + "epoch": 0.46109197755352715, + "grad_norm": 1.5826456546783447, + "learning_rate": 5.673106175844234e-07, + "loss": 0.9342, + "step": 19145 + }, + { + "epoch": 0.46121239854531443, + "grad_norm": 1.7967777252197266, + "learning_rate": 5.671838555927391e-07, + "loss": 0.8953, + "step": 19150 + }, + { + "epoch": 0.4613328195371017, + "grad_norm": 1.5923316478729248, + "learning_rate": 5.670570936010546e-07, + "loss": 0.9272, + "step": 19155 + }, + { + "epoch": 0.461453240528889, + "grad_norm": 1.5057430267333984, + "learning_rate": 5.669303316093702e-07, + "loss": 0.9219, + "step": 19160 + }, + { + "epoch": 0.4615736615206763, + "grad_norm": 1.4920909404754639, + "learning_rate": 5.668035696176858e-07, + "loss": 0.8777, + "step": 19165 + }, + { + "epoch": 0.4616940825124636, + "grad_norm": 1.5344513654708862, + "learning_rate": 5.666768076260013e-07, + "loss": 0.8931, + "step": 19170 + }, + { + "epoch": 0.46181450350425085, + "grad_norm": 1.4009016752243042, + "learning_rate": 5.66550045634317e-07, + "loss": 0.8792, + "step": 19175 + }, + { + "epoch": 0.4619349244960381, + "grad_norm": 1.6597226858139038, + "learning_rate": 5.664232836426326e-07, + "loss": 0.9244, + "step": 19180 + }, + { + "epoch": 0.46205534548782545, + "grad_norm": 1.7540236711502075, + "learning_rate": 5.662965216509481e-07, + "loss": 0.8937, + "step": 19185 + }, + { + "epoch": 0.4621757664796127, + "grad_norm": 1.5170707702636719, + "learning_rate": 5.661697596592637e-07, + "loss": 0.9194, + "step": 19190 + }, + { + "epoch": 0.4622961874714, + "grad_norm": 1.8222031593322754, + "learning_rate": 5.660429976675794e-07, + "loss": 0.9276, + "step": 19195 + }, + { + "epoch": 0.46241660846318733, + "grad_norm": 1.5705348253250122, + "learning_rate": 5.659162356758949e-07, + "loss": 0.9343, + "step": 19200 + }, + { + "epoch": 0.4625370294549746, + "grad_norm": 1.4817367792129517, + "learning_rate": 5.657894736842104e-07, + "loss": 0.9395, + "step": 19205 + }, + { + "epoch": 0.4626574504467619, + "grad_norm": 1.5779122114181519, + "learning_rate": 5.656627116925261e-07, + "loss": 0.8933, + "step": 19210 + }, + { + "epoch": 0.46277787143854915, + "grad_norm": 1.3455822467803955, + "learning_rate": 5.655359497008416e-07, + "loss": 0.9075, + "step": 19215 + }, + { + "epoch": 0.4628982924303365, + "grad_norm": 1.6021500825881958, + "learning_rate": 5.654091877091573e-07, + "loss": 0.9553, + "step": 19220 + }, + { + "epoch": 0.46301871342212375, + "grad_norm": 1.4690455198287964, + "learning_rate": 5.652824257174729e-07, + "loss": 0.9146, + "step": 19225 + }, + { + "epoch": 0.463139134413911, + "grad_norm": 1.4307332038879395, + "learning_rate": 5.651556637257883e-07, + "loss": 0.9641, + "step": 19230 + }, + { + "epoch": 0.4632595554056983, + "grad_norm": 1.579603672027588, + "learning_rate": 5.65028901734104e-07, + "loss": 0.8914, + "step": 19235 + }, + { + "epoch": 0.4633799763974856, + "grad_norm": 1.7392204999923706, + "learning_rate": 5.649021397424196e-07, + "loss": 0.8679, + "step": 19240 + }, + { + "epoch": 0.4635003973892729, + "grad_norm": 1.5015277862548828, + "learning_rate": 5.647753777507353e-07, + "loss": 0.9354, + "step": 19245 + }, + { + "epoch": 0.4636208183810602, + "grad_norm": 1.5179589986801147, + "learning_rate": 5.646486157590508e-07, + "loss": 0.9798, + "step": 19250 + }, + { + "epoch": 0.4637412393728475, + "grad_norm": 1.3988590240478516, + "learning_rate": 5.645218537673663e-07, + "loss": 0.9186, + "step": 19255 + }, + { + "epoch": 0.4638616603646348, + "grad_norm": 1.6146354675292969, + "learning_rate": 5.64395091775682e-07, + "loss": 0.8371, + "step": 19260 + }, + { + "epoch": 0.46398208135642205, + "grad_norm": 1.4624241590499878, + "learning_rate": 5.642683297839975e-07, + "loss": 0.9093, + "step": 19265 + }, + { + "epoch": 0.4641025023482093, + "grad_norm": 1.6220847368240356, + "learning_rate": 5.641415677923132e-07, + "loss": 0.8582, + "step": 19270 + }, + { + "epoch": 0.46422292333999665, + "grad_norm": 1.5948646068572998, + "learning_rate": 5.640148058006287e-07, + "loss": 0.937, + "step": 19275 + }, + { + "epoch": 0.4643433443317839, + "grad_norm": 1.5547462701797485, + "learning_rate": 5.638880438089443e-07, + "loss": 0.8907, + "step": 19280 + }, + { + "epoch": 0.4644637653235712, + "grad_norm": 1.6174482107162476, + "learning_rate": 5.637612818172599e-07, + "loss": 0.9369, + "step": 19285 + }, + { + "epoch": 0.46458418631535847, + "grad_norm": 1.4465388059616089, + "learning_rate": 5.636345198255755e-07, + "loss": 0.9182, + "step": 19290 + }, + { + "epoch": 0.4647046073071458, + "grad_norm": 1.8930598497390747, + "learning_rate": 5.635077578338911e-07, + "loss": 0.8905, + "step": 19295 + }, + { + "epoch": 0.4648250282989331, + "grad_norm": 1.3349248170852661, + "learning_rate": 5.633809958422066e-07, + "loss": 0.9202, + "step": 19300 + }, + { + "epoch": 0.46494544929072035, + "grad_norm": 1.38579523563385, + "learning_rate": 5.632542338505223e-07, + "loss": 0.9545, + "step": 19305 + }, + { + "epoch": 0.4650658702825076, + "grad_norm": 1.6983599662780762, + "learning_rate": 5.631274718588378e-07, + "loss": 0.9331, + "step": 19310 + }, + { + "epoch": 0.46518629127429495, + "grad_norm": 1.6366914510726929, + "learning_rate": 5.630007098671535e-07, + "loss": 0.9384, + "step": 19315 + }, + { + "epoch": 0.4653067122660822, + "grad_norm": 1.8956844806671143, + "learning_rate": 5.628739478754691e-07, + "loss": 0.8964, + "step": 19320 + }, + { + "epoch": 0.4654271332578695, + "grad_norm": 1.6888880729675293, + "learning_rate": 5.627471858837845e-07, + "loss": 0.8892, + "step": 19325 + }, + { + "epoch": 0.4655475542496568, + "grad_norm": 1.5253064632415771, + "learning_rate": 5.626204238921002e-07, + "loss": 0.8463, + "step": 19330 + }, + { + "epoch": 0.4656679752414441, + "grad_norm": 1.632535696029663, + "learning_rate": 5.624936619004158e-07, + "loss": 0.886, + "step": 19335 + }, + { + "epoch": 0.46578839623323137, + "grad_norm": 1.5638158321380615, + "learning_rate": 5.623668999087314e-07, + "loss": 0.95, + "step": 19340 + }, + { + "epoch": 0.46590881722501865, + "grad_norm": 1.78264582157135, + "learning_rate": 5.622401379170469e-07, + "loss": 0.9205, + "step": 19345 + }, + { + "epoch": 0.466029238216806, + "grad_norm": 1.6770012378692627, + "learning_rate": 5.621133759253625e-07, + "loss": 0.9276, + "step": 19350 + }, + { + "epoch": 0.46614965920859325, + "grad_norm": 1.6042697429656982, + "learning_rate": 5.619866139336781e-07, + "loss": 0.8913, + "step": 19355 + }, + { + "epoch": 0.4662700802003805, + "grad_norm": 1.4511951208114624, + "learning_rate": 5.618598519419937e-07, + "loss": 0.8983, + "step": 19360 + }, + { + "epoch": 0.4663905011921678, + "grad_norm": 1.4056437015533447, + "learning_rate": 5.617330899503094e-07, + "loss": 0.8683, + "step": 19365 + }, + { + "epoch": 0.4665109221839551, + "grad_norm": 1.493322491645813, + "learning_rate": 5.616063279586248e-07, + "loss": 0.916, + "step": 19370 + }, + { + "epoch": 0.4666313431757424, + "grad_norm": 1.4477388858795166, + "learning_rate": 5.614795659669404e-07, + "loss": 0.9469, + "step": 19375 + }, + { + "epoch": 0.46675176416752967, + "grad_norm": 1.498809576034546, + "learning_rate": 5.613528039752561e-07, + "loss": 0.9008, + "step": 19380 + }, + { + "epoch": 0.466872185159317, + "grad_norm": 1.6682100296020508, + "learning_rate": 5.612260419835716e-07, + "loss": 0.9416, + "step": 19385 + }, + { + "epoch": 0.4669926061511043, + "grad_norm": 1.6538997888565063, + "learning_rate": 5.610992799918872e-07, + "loss": 0.8469, + "step": 19390 + }, + { + "epoch": 0.46711302714289155, + "grad_norm": 1.5839837789535522, + "learning_rate": 5.609725180002028e-07, + "loss": 0.8992, + "step": 19395 + }, + { + "epoch": 0.4672334481346788, + "grad_norm": 1.6330792903900146, + "learning_rate": 5.608457560085184e-07, + "loss": 0.9026, + "step": 19400 + }, + { + "epoch": 0.46735386912646615, + "grad_norm": 1.4835282564163208, + "learning_rate": 5.60718994016834e-07, + "loss": 0.9264, + "step": 19405 + }, + { + "epoch": 0.4674742901182534, + "grad_norm": 1.6376011371612549, + "learning_rate": 5.605922320251496e-07, + "loss": 0.91, + "step": 19410 + }, + { + "epoch": 0.4675947111100407, + "grad_norm": 1.3809139728546143, + "learning_rate": 5.604654700334651e-07, + "loss": 0.908, + "step": 19415 + }, + { + "epoch": 0.46771513210182797, + "grad_norm": 1.5058355331420898, + "learning_rate": 5.603387080417807e-07, + "loss": 0.9446, + "step": 19420 + }, + { + "epoch": 0.4678355530936153, + "grad_norm": 1.6111805438995361, + "learning_rate": 5.602119460500964e-07, + "loss": 0.8859, + "step": 19425 + }, + { + "epoch": 0.46795597408540257, + "grad_norm": 1.508823037147522, + "learning_rate": 5.600851840584119e-07, + "loss": 0.9019, + "step": 19430 + }, + { + "epoch": 0.46807639507718984, + "grad_norm": 1.765920639038086, + "learning_rate": 5.599584220667275e-07, + "loss": 0.8805, + "step": 19435 + }, + { + "epoch": 0.4681968160689771, + "grad_norm": 1.5230178833007812, + "learning_rate": 5.598316600750431e-07, + "loss": 0.8594, + "step": 19440 + }, + { + "epoch": 0.46831723706076445, + "grad_norm": 1.5808008909225464, + "learning_rate": 5.597048980833586e-07, + "loss": 0.8729, + "step": 19445 + }, + { + "epoch": 0.4684376580525517, + "grad_norm": 1.6315981149673462, + "learning_rate": 5.595781360916743e-07, + "loss": 0.8992, + "step": 19450 + }, + { + "epoch": 0.468558079044339, + "grad_norm": 1.5495814085006714, + "learning_rate": 5.594513740999899e-07, + "loss": 0.9098, + "step": 19455 + }, + { + "epoch": 0.4686785000361263, + "grad_norm": 1.717214822769165, + "learning_rate": 5.593246121083053e-07, + "loss": 0.959, + "step": 19460 + }, + { + "epoch": 0.4687989210279136, + "grad_norm": 1.468532919883728, + "learning_rate": 5.59197850116621e-07, + "loss": 0.8781, + "step": 19465 + }, + { + "epoch": 0.46891934201970087, + "grad_norm": 1.4898240566253662, + "learning_rate": 5.590710881249366e-07, + "loss": 0.91, + "step": 19470 + }, + { + "epoch": 0.46903976301148814, + "grad_norm": 1.5860544443130493, + "learning_rate": 5.589443261332522e-07, + "loss": 0.8995, + "step": 19475 + }, + { + "epoch": 0.46916018400327547, + "grad_norm": 1.4536211490631104, + "learning_rate": 5.588175641415678e-07, + "loss": 0.9219, + "step": 19480 + }, + { + "epoch": 0.46928060499506274, + "grad_norm": 1.3934434652328491, + "learning_rate": 5.586908021498834e-07, + "loss": 0.8859, + "step": 19485 + }, + { + "epoch": 0.46940102598685, + "grad_norm": 1.4026354551315308, + "learning_rate": 5.585640401581989e-07, + "loss": 0.9145, + "step": 19490 + }, + { + "epoch": 0.4695214469786373, + "grad_norm": 1.5729193687438965, + "learning_rate": 5.584372781665145e-07, + "loss": 0.9152, + "step": 19495 + }, + { + "epoch": 0.4696418679704246, + "grad_norm": 1.5934133529663086, + "learning_rate": 5.583105161748302e-07, + "loss": 0.9541, + "step": 19500 + }, + { + "epoch": 0.4697622889622119, + "grad_norm": 1.3092372417449951, + "learning_rate": 5.581837541831456e-07, + "loss": 0.9333, + "step": 19505 + }, + { + "epoch": 0.46988270995399917, + "grad_norm": 1.6447803974151611, + "learning_rate": 5.580569921914613e-07, + "loss": 0.8509, + "step": 19510 + }, + { + "epoch": 0.4700031309457865, + "grad_norm": 1.5421675443649292, + "learning_rate": 5.579302301997769e-07, + "loss": 0.8606, + "step": 19515 + }, + { + "epoch": 0.47012355193757377, + "grad_norm": 1.5920692682266235, + "learning_rate": 5.578034682080924e-07, + "loss": 0.8919, + "step": 19520 + }, + { + "epoch": 0.47024397292936104, + "grad_norm": 1.4933526515960693, + "learning_rate": 5.576767062164081e-07, + "loss": 0.946, + "step": 19525 + }, + { + "epoch": 0.4703643939211483, + "grad_norm": 1.5667372941970825, + "learning_rate": 5.575499442247236e-07, + "loss": 0.8823, + "step": 19530 + }, + { + "epoch": 0.47048481491293564, + "grad_norm": 1.3437079191207886, + "learning_rate": 5.574231822330392e-07, + "loss": 0.9155, + "step": 19535 + }, + { + "epoch": 0.4706052359047229, + "grad_norm": 1.6043516397476196, + "learning_rate": 5.572964202413548e-07, + "loss": 0.9311, + "step": 19540 + }, + { + "epoch": 0.4707256568965102, + "grad_norm": 1.5555745363235474, + "learning_rate": 5.571696582496705e-07, + "loss": 0.9066, + "step": 19545 + }, + { + "epoch": 0.47084607788829747, + "grad_norm": 1.4690513610839844, + "learning_rate": 5.57042896257986e-07, + "loss": 0.915, + "step": 19550 + }, + { + "epoch": 0.4709664988800848, + "grad_norm": 1.412894368171692, + "learning_rate": 5.569161342663015e-07, + "loss": 0.9205, + "step": 19555 + }, + { + "epoch": 0.47108691987187207, + "grad_norm": 1.5198075771331787, + "learning_rate": 5.567893722746172e-07, + "loss": 0.9057, + "step": 19560 + }, + { + "epoch": 0.47120734086365934, + "grad_norm": 1.4156620502471924, + "learning_rate": 5.566626102829327e-07, + "loss": 0.9071, + "step": 19565 + }, + { + "epoch": 0.4713277618554466, + "grad_norm": 1.5950437784194946, + "learning_rate": 5.565358482912484e-07, + "loss": 0.926, + "step": 19570 + }, + { + "epoch": 0.47144818284723394, + "grad_norm": 1.900969386100769, + "learning_rate": 5.564090862995639e-07, + "loss": 0.8585, + "step": 19575 + }, + { + "epoch": 0.4715686038390212, + "grad_norm": 2.091481924057007, + "learning_rate": 5.562823243078794e-07, + "loss": 0.8834, + "step": 19580 + }, + { + "epoch": 0.4716890248308085, + "grad_norm": 1.8308582305908203, + "learning_rate": 5.561555623161951e-07, + "loss": 0.9513, + "step": 19585 + }, + { + "epoch": 0.4718094458225958, + "grad_norm": 1.667382001876831, + "learning_rate": 5.560288003245107e-07, + "loss": 0.8976, + "step": 19590 + }, + { + "epoch": 0.4719298668143831, + "grad_norm": 1.412461757659912, + "learning_rate": 5.559020383328263e-07, + "loss": 0.9328, + "step": 19595 + }, + { + "epoch": 0.47205028780617037, + "grad_norm": 1.6457659006118774, + "learning_rate": 5.557752763411418e-07, + "loss": 0.8995, + "step": 19600 + }, + { + "epoch": 0.47217070879795764, + "grad_norm": 1.5809874534606934, + "learning_rate": 5.556485143494575e-07, + "loss": 0.9108, + "step": 19605 + }, + { + "epoch": 0.47229112978974497, + "grad_norm": 1.529252290725708, + "learning_rate": 5.55521752357773e-07, + "loss": 0.8897, + "step": 19610 + }, + { + "epoch": 0.47241155078153224, + "grad_norm": 1.672222375869751, + "learning_rate": 5.553949903660886e-07, + "loss": 0.8975, + "step": 19615 + }, + { + "epoch": 0.4725319717733195, + "grad_norm": 1.5824137926101685, + "learning_rate": 5.552682283744043e-07, + "loss": 0.9066, + "step": 19620 + }, + { + "epoch": 0.4726523927651068, + "grad_norm": 1.9257863759994507, + "learning_rate": 5.551414663827197e-07, + "loss": 0.9294, + "step": 19625 + }, + { + "epoch": 0.4727728137568941, + "grad_norm": 1.6060237884521484, + "learning_rate": 5.550147043910354e-07, + "loss": 0.9267, + "step": 19630 + }, + { + "epoch": 0.4728932347486814, + "grad_norm": 1.5399199724197388, + "learning_rate": 5.54887942399351e-07, + "loss": 0.8888, + "step": 19635 + }, + { + "epoch": 0.47301365574046866, + "grad_norm": 1.5686084032058716, + "learning_rate": 5.547611804076665e-07, + "loss": 0.9176, + "step": 19640 + }, + { + "epoch": 0.473134076732256, + "grad_norm": 1.4253500699996948, + "learning_rate": 5.546344184159821e-07, + "loss": 0.9271, + "step": 19645 + }, + { + "epoch": 0.47325449772404327, + "grad_norm": 1.5969947576522827, + "learning_rate": 5.545076564242977e-07, + "loss": 0.9251, + "step": 19650 + }, + { + "epoch": 0.47337491871583054, + "grad_norm": 1.586877465248108, + "learning_rate": 5.543808944326133e-07, + "loss": 0.8853, + "step": 19655 + }, + { + "epoch": 0.4734953397076178, + "grad_norm": 1.579201340675354, + "learning_rate": 5.542541324409289e-07, + "loss": 0.8718, + "step": 19660 + }, + { + "epoch": 0.47361576069940514, + "grad_norm": 1.5453004837036133, + "learning_rate": 5.541273704492446e-07, + "loss": 0.8859, + "step": 19665 + }, + { + "epoch": 0.4737361816911924, + "grad_norm": 1.5748382806777954, + "learning_rate": 5.5400060845756e-07, + "loss": 0.8919, + "step": 19670 + }, + { + "epoch": 0.4738566026829797, + "grad_norm": 1.6768748760223389, + "learning_rate": 5.538738464658756e-07, + "loss": 0.8696, + "step": 19675 + }, + { + "epoch": 0.47397702367476696, + "grad_norm": 1.5177021026611328, + "learning_rate": 5.537470844741913e-07, + "loss": 0.8602, + "step": 19680 + }, + { + "epoch": 0.4740974446665543, + "grad_norm": 1.3657416105270386, + "learning_rate": 5.536203224825068e-07, + "loss": 0.8537, + "step": 19685 + }, + { + "epoch": 0.47421786565834156, + "grad_norm": 1.484498143196106, + "learning_rate": 5.534935604908224e-07, + "loss": 0.9626, + "step": 19690 + }, + { + "epoch": 0.47433828665012884, + "grad_norm": 1.3851714134216309, + "learning_rate": 5.53366798499138e-07, + "loss": 0.9084, + "step": 19695 + }, + { + "epoch": 0.47445870764191617, + "grad_norm": 1.6817436218261719, + "learning_rate": 5.532400365074535e-07, + "loss": 0.8925, + "step": 19700 + }, + { + "epoch": 0.47457912863370344, + "grad_norm": 1.6906062364578247, + "learning_rate": 5.531132745157692e-07, + "loss": 0.937, + "step": 19705 + }, + { + "epoch": 0.4746995496254907, + "grad_norm": 1.4443438053131104, + "learning_rate": 5.529865125240848e-07, + "loss": 0.9102, + "step": 19710 + }, + { + "epoch": 0.474819970617278, + "grad_norm": 1.527987003326416, + "learning_rate": 5.528597505324003e-07, + "loss": 0.8944, + "step": 19715 + }, + { + "epoch": 0.4749403916090653, + "grad_norm": 1.8593183755874634, + "learning_rate": 5.527329885407159e-07, + "loss": 0.8978, + "step": 19720 + }, + { + "epoch": 0.4750608126008526, + "grad_norm": 1.5174137353897095, + "learning_rate": 5.526062265490316e-07, + "loss": 0.8817, + "step": 19725 + }, + { + "epoch": 0.47518123359263986, + "grad_norm": 1.453525424003601, + "learning_rate": 5.524794645573471e-07, + "loss": 0.9118, + "step": 19730 + }, + { + "epoch": 0.47530165458442714, + "grad_norm": 1.5764050483703613, + "learning_rate": 5.523527025656627e-07, + "loss": 0.9359, + "step": 19735 + }, + { + "epoch": 0.47542207557621446, + "grad_norm": 1.7837824821472168, + "learning_rate": 5.522259405739783e-07, + "loss": 0.9578, + "step": 19740 + }, + { + "epoch": 0.47554249656800174, + "grad_norm": 1.5909477472305298, + "learning_rate": 5.520991785822938e-07, + "loss": 0.9389, + "step": 19745 + }, + { + "epoch": 0.475662917559789, + "grad_norm": 1.485216498374939, + "learning_rate": 5.519724165906095e-07, + "loss": 0.8666, + "step": 19750 + }, + { + "epoch": 0.4757833385515763, + "grad_norm": 1.3735274076461792, + "learning_rate": 5.518456545989251e-07, + "loss": 0.8216, + "step": 19755 + }, + { + "epoch": 0.4759037595433636, + "grad_norm": 1.5103178024291992, + "learning_rate": 5.517188926072405e-07, + "loss": 0.8766, + "step": 19760 + }, + { + "epoch": 0.4760241805351509, + "grad_norm": 1.46016263961792, + "learning_rate": 5.515921306155562e-07, + "loss": 0.9262, + "step": 19765 + }, + { + "epoch": 0.47614460152693816, + "grad_norm": 1.7239071130752563, + "learning_rate": 5.514653686238718e-07, + "loss": 0.9191, + "step": 19770 + }, + { + "epoch": 0.4762650225187255, + "grad_norm": 1.5743353366851807, + "learning_rate": 5.513386066321874e-07, + "loss": 0.9159, + "step": 19775 + }, + { + "epoch": 0.47638544351051276, + "grad_norm": 1.7983384132385254, + "learning_rate": 5.51211844640503e-07, + "loss": 0.925, + "step": 19780 + }, + { + "epoch": 0.47650586450230004, + "grad_norm": 1.5955098867416382, + "learning_rate": 5.510850826488185e-07, + "loss": 0.9405, + "step": 19785 + }, + { + "epoch": 0.4766262854940873, + "grad_norm": 1.530182123184204, + "learning_rate": 5.509583206571341e-07, + "loss": 0.9166, + "step": 19790 + }, + { + "epoch": 0.47674670648587464, + "grad_norm": 1.518515944480896, + "learning_rate": 5.508315586654497e-07, + "loss": 0.9451, + "step": 19795 + }, + { + "epoch": 0.4768671274776619, + "grad_norm": 1.643058180809021, + "learning_rate": 5.507047966737654e-07, + "loss": 0.8849, + "step": 19800 + }, + { + "epoch": 0.4769875484694492, + "grad_norm": 1.6911545991897583, + "learning_rate": 5.505780346820808e-07, + "loss": 0.8837, + "step": 19805 + }, + { + "epoch": 0.47710796946123646, + "grad_norm": 1.3822886943817139, + "learning_rate": 5.504512726903965e-07, + "loss": 0.8948, + "step": 19810 + }, + { + "epoch": 0.4772283904530238, + "grad_norm": 1.6200188398361206, + "learning_rate": 5.503245106987121e-07, + "loss": 0.9426, + "step": 19815 + }, + { + "epoch": 0.47734881144481106, + "grad_norm": 1.4352916479110718, + "learning_rate": 5.501977487070276e-07, + "loss": 0.8842, + "step": 19820 + }, + { + "epoch": 0.47746923243659833, + "grad_norm": 1.4828038215637207, + "learning_rate": 5.500709867153433e-07, + "loss": 0.8714, + "step": 19825 + }, + { + "epoch": 0.47758965342838566, + "grad_norm": 1.4750306606292725, + "learning_rate": 5.499442247236588e-07, + "loss": 0.9484, + "step": 19830 + }, + { + "epoch": 0.47771007442017294, + "grad_norm": 1.624162197113037, + "learning_rate": 5.498174627319744e-07, + "loss": 0.912, + "step": 19835 + }, + { + "epoch": 0.4778304954119602, + "grad_norm": 1.9018586874008179, + "learning_rate": 5.4969070074029e-07, + "loss": 0.8916, + "step": 19840 + }, + { + "epoch": 0.4779509164037475, + "grad_norm": 1.5208109617233276, + "learning_rate": 5.495639387486057e-07, + "loss": 0.9171, + "step": 19845 + }, + { + "epoch": 0.4780713373955348, + "grad_norm": 1.521644115447998, + "learning_rate": 5.494371767569212e-07, + "loss": 0.9251, + "step": 19850 + }, + { + "epoch": 0.4781917583873221, + "grad_norm": 1.4694710969924927, + "learning_rate": 5.493104147652367e-07, + "loss": 0.8663, + "step": 19855 + }, + { + "epoch": 0.47831217937910936, + "grad_norm": 1.6884132623672485, + "learning_rate": 5.491836527735524e-07, + "loss": 0.8857, + "step": 19860 + }, + { + "epoch": 0.47843260037089663, + "grad_norm": 1.7484136819839478, + "learning_rate": 5.490568907818679e-07, + "loss": 0.9107, + "step": 19865 + }, + { + "epoch": 0.47855302136268396, + "grad_norm": 1.915028691291809, + "learning_rate": 5.489301287901836e-07, + "loss": 0.8986, + "step": 19870 + }, + { + "epoch": 0.47867344235447123, + "grad_norm": 1.6282302141189575, + "learning_rate": 5.488033667984991e-07, + "loss": 0.91, + "step": 19875 + }, + { + "epoch": 0.4787938633462585, + "grad_norm": 1.9310438632965088, + "learning_rate": 5.486766048068146e-07, + "loss": 0.9392, + "step": 19880 + }, + { + "epoch": 0.4789142843380458, + "grad_norm": 1.624096393585205, + "learning_rate": 5.485498428151303e-07, + "loss": 0.8752, + "step": 19885 + }, + { + "epoch": 0.4790347053298331, + "grad_norm": 1.579325556755066, + "learning_rate": 5.484230808234459e-07, + "loss": 0.8679, + "step": 19890 + }, + { + "epoch": 0.4791551263216204, + "grad_norm": 1.6065541505813599, + "learning_rate": 5.482963188317615e-07, + "loss": 0.903, + "step": 19895 + }, + { + "epoch": 0.47927554731340766, + "grad_norm": 1.509092092514038, + "learning_rate": 5.48169556840077e-07, + "loss": 0.8833, + "step": 19900 + }, + { + "epoch": 0.479395968305195, + "grad_norm": 1.625239372253418, + "learning_rate": 5.480427948483926e-07, + "loss": 0.8796, + "step": 19905 + }, + { + "epoch": 0.47951638929698226, + "grad_norm": 1.5438594818115234, + "learning_rate": 5.479160328567082e-07, + "loss": 0.8598, + "step": 19910 + }, + { + "epoch": 0.47963681028876953, + "grad_norm": 1.5186994075775146, + "learning_rate": 5.477892708650238e-07, + "loss": 0.9471, + "step": 19915 + }, + { + "epoch": 0.4797572312805568, + "grad_norm": 1.6406010389328003, + "learning_rate": 5.476625088733395e-07, + "loss": 0.8856, + "step": 19920 + }, + { + "epoch": 0.47987765227234414, + "grad_norm": 1.6568893194198608, + "learning_rate": 5.475357468816549e-07, + "loss": 0.9892, + "step": 19925 + }, + { + "epoch": 0.4799980732641314, + "grad_norm": 1.6029942035675049, + "learning_rate": 5.474089848899706e-07, + "loss": 0.8746, + "step": 19930 + }, + { + "epoch": 0.4801184942559187, + "grad_norm": 1.5155408382415771, + "learning_rate": 5.472822228982862e-07, + "loss": 0.8756, + "step": 19935 + }, + { + "epoch": 0.48023891524770596, + "grad_norm": 1.720414638519287, + "learning_rate": 5.471554609066017e-07, + "loss": 0.8976, + "step": 19940 + }, + { + "epoch": 0.4803593362394933, + "grad_norm": 1.5202049016952515, + "learning_rate": 5.470286989149173e-07, + "loss": 0.9006, + "step": 19945 + }, + { + "epoch": 0.48047975723128056, + "grad_norm": 1.6243274211883545, + "learning_rate": 5.469019369232329e-07, + "loss": 0.9025, + "step": 19950 + }, + { + "epoch": 0.48060017822306783, + "grad_norm": 1.5468374490737915, + "learning_rate": 5.467751749315485e-07, + "loss": 0.891, + "step": 19955 + }, + { + "epoch": 0.48072059921485516, + "grad_norm": 1.7020221948623657, + "learning_rate": 5.466484129398641e-07, + "loss": 0.9049, + "step": 19960 + }, + { + "epoch": 0.48084102020664243, + "grad_norm": 1.6207133531570435, + "learning_rate": 5.465216509481797e-07, + "loss": 0.9273, + "step": 19965 + }, + { + "epoch": 0.4809614411984297, + "grad_norm": 1.7263379096984863, + "learning_rate": 5.463948889564952e-07, + "loss": 0.904, + "step": 19970 + }, + { + "epoch": 0.481081862190217, + "grad_norm": 1.5589679479599, + "learning_rate": 5.462681269648108e-07, + "loss": 0.9133, + "step": 19975 + }, + { + "epoch": 0.4812022831820043, + "grad_norm": 1.868116021156311, + "learning_rate": 5.461413649731265e-07, + "loss": 0.9224, + "step": 19980 + }, + { + "epoch": 0.4813227041737916, + "grad_norm": 1.5733035802841187, + "learning_rate": 5.46014602981442e-07, + "loss": 0.9143, + "step": 19985 + }, + { + "epoch": 0.48144312516557886, + "grad_norm": 1.7124382257461548, + "learning_rate": 5.458878409897575e-07, + "loss": 0.8175, + "step": 19990 + }, + { + "epoch": 0.48156354615736613, + "grad_norm": 1.5227895975112915, + "learning_rate": 5.457610789980732e-07, + "loss": 0.8792, + "step": 19995 + }, + { + "epoch": 0.48168396714915346, + "grad_norm": 1.7614829540252686, + "learning_rate": 5.456343170063887e-07, + "loss": 0.9365, + "step": 20000 + }, + { + "epoch": 0.48180438814094073, + "grad_norm": 1.6550016403198242, + "learning_rate": 5.455075550147044e-07, + "loss": 0.9716, + "step": 20005 + }, + { + "epoch": 0.481924809132728, + "grad_norm": 1.5123074054718018, + "learning_rate": 5.4538079302302e-07, + "loss": 0.8834, + "step": 20010 + }, + { + "epoch": 0.4820452301245153, + "grad_norm": 1.648113489151001, + "learning_rate": 5.452540310313355e-07, + "loss": 0.9213, + "step": 20015 + }, + { + "epoch": 0.4821656511163026, + "grad_norm": 1.904733657836914, + "learning_rate": 5.451272690396511e-07, + "loss": 0.8647, + "step": 20020 + }, + { + "epoch": 0.4822860721080899, + "grad_norm": 1.5189578533172607, + "learning_rate": 5.450005070479667e-07, + "loss": 0.8972, + "step": 20025 + }, + { + "epoch": 0.48240649309987715, + "grad_norm": 1.5393345355987549, + "learning_rate": 5.448737450562823e-07, + "loss": 0.8859, + "step": 20030 + }, + { + "epoch": 0.4825269140916645, + "grad_norm": 1.5878273248672485, + "learning_rate": 5.447469830645979e-07, + "loss": 0.9554, + "step": 20035 + }, + { + "epoch": 0.48264733508345176, + "grad_norm": 1.5807021856307983, + "learning_rate": 5.446202210729135e-07, + "loss": 0.9277, + "step": 20040 + }, + { + "epoch": 0.48276775607523903, + "grad_norm": 1.415024995803833, + "learning_rate": 5.44493459081229e-07, + "loss": 0.8944, + "step": 20045 + }, + { + "epoch": 0.4828881770670263, + "grad_norm": 1.553511381149292, + "learning_rate": 5.443666970895446e-07, + "loss": 0.9404, + "step": 20050 + }, + { + "epoch": 0.48300859805881363, + "grad_norm": 1.4661325216293335, + "learning_rate": 5.442399350978603e-07, + "loss": 0.8628, + "step": 20055 + }, + { + "epoch": 0.4831290190506009, + "grad_norm": 1.4524530172348022, + "learning_rate": 5.441131731061757e-07, + "loss": 0.8834, + "step": 20060 + }, + { + "epoch": 0.4832494400423882, + "grad_norm": 1.4855055809020996, + "learning_rate": 5.439864111144914e-07, + "loss": 0.899, + "step": 20065 + }, + { + "epoch": 0.48336986103417545, + "grad_norm": 1.4527695178985596, + "learning_rate": 5.43859649122807e-07, + "loss": 0.9224, + "step": 20070 + }, + { + "epoch": 0.4834902820259628, + "grad_norm": 1.781144380569458, + "learning_rate": 5.437328871311226e-07, + "loss": 0.8588, + "step": 20075 + }, + { + "epoch": 0.48361070301775005, + "grad_norm": 1.6836347579956055, + "learning_rate": 5.436061251394382e-07, + "loss": 0.9171, + "step": 20080 + }, + { + "epoch": 0.4837311240095373, + "grad_norm": 1.5334843397140503, + "learning_rate": 5.434793631477537e-07, + "loss": 0.9702, + "step": 20085 + }, + { + "epoch": 0.48385154500132466, + "grad_norm": 1.467098593711853, + "learning_rate": 5.433526011560693e-07, + "loss": 0.9252, + "step": 20090 + }, + { + "epoch": 0.48397196599311193, + "grad_norm": 1.5818877220153809, + "learning_rate": 5.432258391643849e-07, + "loss": 0.9293, + "step": 20095 + }, + { + "epoch": 0.4840923869848992, + "grad_norm": 1.555888056755066, + "learning_rate": 5.430990771727006e-07, + "loss": 0.8787, + "step": 20100 + }, + { + "epoch": 0.4842128079766865, + "grad_norm": 1.4838998317718506, + "learning_rate": 5.42972315181016e-07, + "loss": 0.9016, + "step": 20105 + }, + { + "epoch": 0.4843332289684738, + "grad_norm": 1.4996261596679688, + "learning_rate": 5.428455531893316e-07, + "loss": 0.9398, + "step": 20110 + }, + { + "epoch": 0.4844536499602611, + "grad_norm": 2.0929925441741943, + "learning_rate": 5.427187911976473e-07, + "loss": 0.9469, + "step": 20115 + }, + { + "epoch": 0.48457407095204835, + "grad_norm": 1.6425294876098633, + "learning_rate": 5.425920292059628e-07, + "loss": 0.8702, + "step": 20120 + }, + { + "epoch": 0.4846944919438356, + "grad_norm": 1.707254409790039, + "learning_rate": 5.424652672142785e-07, + "loss": 0.9226, + "step": 20125 + }, + { + "epoch": 0.48481491293562295, + "grad_norm": 1.6218721866607666, + "learning_rate": 5.42338505222594e-07, + "loss": 0.917, + "step": 20130 + }, + { + "epoch": 0.48493533392741023, + "grad_norm": 1.6185005903244019, + "learning_rate": 5.422117432309096e-07, + "loss": 0.872, + "step": 20135 + }, + { + "epoch": 0.4850557549191975, + "grad_norm": 1.5673400163650513, + "learning_rate": 5.420849812392252e-07, + "loss": 0.8663, + "step": 20140 + }, + { + "epoch": 0.48517617591098483, + "grad_norm": 1.6191776990890503, + "learning_rate": 5.419582192475408e-07, + "loss": 0.9139, + "step": 20145 + }, + { + "epoch": 0.4852965969027721, + "grad_norm": 1.4292807579040527, + "learning_rate": 5.418314572558564e-07, + "loss": 0.9326, + "step": 20150 + }, + { + "epoch": 0.4854170178945594, + "grad_norm": 2.3301453590393066, + "learning_rate": 5.417046952641719e-07, + "loss": 0.9048, + "step": 20155 + }, + { + "epoch": 0.48553743888634665, + "grad_norm": 1.5287058353424072, + "learning_rate": 5.415779332724876e-07, + "loss": 0.952, + "step": 20160 + }, + { + "epoch": 0.485657859878134, + "grad_norm": 1.7118383646011353, + "learning_rate": 5.414511712808031e-07, + "loss": 0.9114, + "step": 20165 + }, + { + "epoch": 0.48577828086992125, + "grad_norm": 1.7365562915802002, + "learning_rate": 5.413244092891187e-07, + "loss": 0.8634, + "step": 20170 + }, + { + "epoch": 0.4858987018617085, + "grad_norm": 1.5098973512649536, + "learning_rate": 5.411976472974343e-07, + "loss": 0.8725, + "step": 20175 + }, + { + "epoch": 0.4860191228534958, + "grad_norm": 2.2831051349639893, + "learning_rate": 5.410708853057498e-07, + "loss": 0.9016, + "step": 20180 + }, + { + "epoch": 0.48613954384528313, + "grad_norm": 1.7645684480667114, + "learning_rate": 5.409441233140655e-07, + "loss": 0.9091, + "step": 20185 + }, + { + "epoch": 0.4862599648370704, + "grad_norm": 1.58481764793396, + "learning_rate": 5.408173613223811e-07, + "loss": 0.9248, + "step": 20190 + }, + { + "epoch": 0.4863803858288577, + "grad_norm": 1.5158801078796387, + "learning_rate": 5.406905993306967e-07, + "loss": 0.8814, + "step": 20195 + }, + { + "epoch": 0.48650080682064495, + "grad_norm": 1.5112053155899048, + "learning_rate": 5.405638373390122e-07, + "loss": 0.9759, + "step": 20200 + }, + { + "epoch": 0.4866212278124323, + "grad_norm": 1.5802388191223145, + "learning_rate": 5.404370753473278e-07, + "loss": 0.8858, + "step": 20205 + }, + { + "epoch": 0.48674164880421955, + "grad_norm": 1.7204270362854004, + "learning_rate": 5.403103133556434e-07, + "loss": 0.9064, + "step": 20210 + }, + { + "epoch": 0.4868620697960068, + "grad_norm": 1.6286576986312866, + "learning_rate": 5.40183551363959e-07, + "loss": 0.9187, + "step": 20215 + }, + { + "epoch": 0.48698249078779415, + "grad_norm": 1.6453478336334229, + "learning_rate": 5.400567893722747e-07, + "loss": 0.9228, + "step": 20220 + }, + { + "epoch": 0.4871029117795814, + "grad_norm": 1.5130614042282104, + "learning_rate": 5.399300273805901e-07, + "loss": 0.9352, + "step": 20225 + }, + { + "epoch": 0.4872233327713687, + "grad_norm": 1.6289459466934204, + "learning_rate": 5.398032653889057e-07, + "loss": 0.8983, + "step": 20230 + }, + { + "epoch": 0.487343753763156, + "grad_norm": 1.3544524908065796, + "learning_rate": 5.396765033972214e-07, + "loss": 0.9375, + "step": 20235 + }, + { + "epoch": 0.4874641747549433, + "grad_norm": 1.5215047597885132, + "learning_rate": 5.395497414055369e-07, + "loss": 0.9086, + "step": 20240 + }, + { + "epoch": 0.4875845957467306, + "grad_norm": 1.5918240547180176, + "learning_rate": 5.394229794138525e-07, + "loss": 0.8656, + "step": 20245 + }, + { + "epoch": 0.48770501673851785, + "grad_norm": 1.523152470588684, + "learning_rate": 5.392962174221681e-07, + "loss": 0.9532, + "step": 20250 + }, + { + "epoch": 0.4878254377303051, + "grad_norm": 1.923143744468689, + "learning_rate": 5.391694554304836e-07, + "loss": 0.8834, + "step": 20255 + }, + { + "epoch": 0.48794585872209245, + "grad_norm": 1.4190473556518555, + "learning_rate": 5.390426934387993e-07, + "loss": 0.8735, + "step": 20260 + }, + { + "epoch": 0.4880662797138797, + "grad_norm": 1.439806342124939, + "learning_rate": 5.389159314471149e-07, + "loss": 0.8817, + "step": 20265 + }, + { + "epoch": 0.488186700705667, + "grad_norm": 1.634311318397522, + "learning_rate": 5.387891694554305e-07, + "loss": 0.9512, + "step": 20270 + }, + { + "epoch": 0.4883071216974543, + "grad_norm": 1.5653709173202515, + "learning_rate": 5.38662407463746e-07, + "loss": 0.9068, + "step": 20275 + }, + { + "epoch": 0.4884275426892416, + "grad_norm": 1.5077821016311646, + "learning_rate": 5.385356454720617e-07, + "loss": 0.8538, + "step": 20280 + }, + { + "epoch": 0.4885479636810289, + "grad_norm": 1.911816120147705, + "learning_rate": 5.384088834803773e-07, + "loss": 0.9354, + "step": 20285 + }, + { + "epoch": 0.48866838467281615, + "grad_norm": 1.5661500692367554, + "learning_rate": 5.382821214886927e-07, + "loss": 0.8956, + "step": 20290 + }, + { + "epoch": 0.4887888056646035, + "grad_norm": 1.4549514055252075, + "learning_rate": 5.381553594970084e-07, + "loss": 0.8628, + "step": 20295 + }, + { + "epoch": 0.48890922665639075, + "grad_norm": 1.549526572227478, + "learning_rate": 5.38028597505324e-07, + "loss": 0.9558, + "step": 20300 + }, + { + "epoch": 0.489029647648178, + "grad_norm": 1.5282357931137085, + "learning_rate": 5.379018355136396e-07, + "loss": 0.9305, + "step": 20305 + }, + { + "epoch": 0.4891500686399653, + "grad_norm": 1.5047615766525269, + "learning_rate": 5.377750735219552e-07, + "loss": 0.8253, + "step": 20310 + }, + { + "epoch": 0.4892704896317526, + "grad_norm": 1.5259456634521484, + "learning_rate": 5.376483115302708e-07, + "loss": 0.8936, + "step": 20315 + }, + { + "epoch": 0.4893909106235399, + "grad_norm": 1.5735572576522827, + "learning_rate": 5.375215495385863e-07, + "loss": 0.9149, + "step": 20320 + }, + { + "epoch": 0.48951133161532717, + "grad_norm": 1.8593881130218506, + "learning_rate": 5.373947875469019e-07, + "loss": 0.8886, + "step": 20325 + }, + { + "epoch": 0.48963175260711445, + "grad_norm": 1.5736616849899292, + "learning_rate": 5.372680255552176e-07, + "loss": 0.9432, + "step": 20330 + }, + { + "epoch": 0.4897521735989018, + "grad_norm": 1.6624977588653564, + "learning_rate": 5.371412635635331e-07, + "loss": 0.9397, + "step": 20335 + }, + { + "epoch": 0.48987259459068905, + "grad_norm": 1.5041385889053345, + "learning_rate": 5.370145015718487e-07, + "loss": 0.8978, + "step": 20340 + }, + { + "epoch": 0.4899930155824763, + "grad_norm": 1.363087773323059, + "learning_rate": 5.368877395801643e-07, + "loss": 0.8678, + "step": 20345 + }, + { + "epoch": 0.49011343657426365, + "grad_norm": 1.5875787734985352, + "learning_rate": 5.367609775884798e-07, + "loss": 0.8805, + "step": 20350 + }, + { + "epoch": 0.4902338575660509, + "grad_norm": 1.5088090896606445, + "learning_rate": 5.366342155967955e-07, + "loss": 0.913, + "step": 20355 + }, + { + "epoch": 0.4903542785578382, + "grad_norm": 1.4936821460723877, + "learning_rate": 5.36507453605111e-07, + "loss": 0.9112, + "step": 20360 + }, + { + "epoch": 0.49047469954962547, + "grad_norm": 1.4798657894134521, + "learning_rate": 5.363806916134266e-07, + "loss": 0.9366, + "step": 20365 + }, + { + "epoch": 0.4905951205414128, + "grad_norm": 1.4514061212539673, + "learning_rate": 5.362539296217422e-07, + "loss": 0.9174, + "step": 20370 + }, + { + "epoch": 0.4907155415332001, + "grad_norm": 1.5838004350662231, + "learning_rate": 5.361271676300579e-07, + "loss": 0.9367, + "step": 20375 + }, + { + "epoch": 0.49083596252498735, + "grad_norm": 1.5880588293075562, + "learning_rate": 5.360004056383734e-07, + "loss": 0.8806, + "step": 20380 + }, + { + "epoch": 0.4909563835167746, + "grad_norm": 1.36306893825531, + "learning_rate": 5.358736436466889e-07, + "loss": 0.8844, + "step": 20385 + }, + { + "epoch": 0.49107680450856195, + "grad_norm": 1.5225731134414673, + "learning_rate": 5.357468816550046e-07, + "loss": 0.9375, + "step": 20390 + }, + { + "epoch": 0.4911972255003492, + "grad_norm": 1.4811948537826538, + "learning_rate": 5.356201196633201e-07, + "loss": 0.8907, + "step": 20395 + }, + { + "epoch": 0.4913176464921365, + "grad_norm": 1.5076699256896973, + "learning_rate": 5.354933576716358e-07, + "loss": 0.9257, + "step": 20400 + }, + { + "epoch": 0.4914380674839238, + "grad_norm": 1.6767728328704834, + "learning_rate": 5.353665956799513e-07, + "loss": 0.8771, + "step": 20405 + }, + { + "epoch": 0.4915584884757111, + "grad_norm": 1.5127720832824707, + "learning_rate": 5.352398336882668e-07, + "loss": 0.9205, + "step": 20410 + }, + { + "epoch": 0.49167890946749837, + "grad_norm": 1.4718294143676758, + "learning_rate": 5.351130716965825e-07, + "loss": 0.941, + "step": 20415 + }, + { + "epoch": 0.49179933045928564, + "grad_norm": 1.371346116065979, + "learning_rate": 5.349863097048981e-07, + "loss": 0.8647, + "step": 20420 + }, + { + "epoch": 0.491919751451073, + "grad_norm": 1.5402333736419678, + "learning_rate": 5.348595477132137e-07, + "loss": 0.8228, + "step": 20425 + }, + { + "epoch": 0.49204017244286025, + "grad_norm": 1.554704189300537, + "learning_rate": 5.347327857215292e-07, + "loss": 0.8427, + "step": 20430 + }, + { + "epoch": 0.4921605934346475, + "grad_norm": 1.4905873537063599, + "learning_rate": 5.346060237298448e-07, + "loss": 0.9247, + "step": 20435 + }, + { + "epoch": 0.4922810144264348, + "grad_norm": 1.5101425647735596, + "learning_rate": 5.344792617381604e-07, + "loss": 0.8644, + "step": 20440 + }, + { + "epoch": 0.4924014354182221, + "grad_norm": 1.3385801315307617, + "learning_rate": 5.34352499746476e-07, + "loss": 0.8604, + "step": 20445 + }, + { + "epoch": 0.4925218564100094, + "grad_norm": 1.6283985376358032, + "learning_rate": 5.342257377547917e-07, + "loss": 0.8637, + "step": 20450 + }, + { + "epoch": 0.49264227740179667, + "grad_norm": 1.491113305091858, + "learning_rate": 5.340989757631071e-07, + "loss": 0.9338, + "step": 20455 + }, + { + "epoch": 0.49276269839358394, + "grad_norm": 1.8177359104156494, + "learning_rate": 5.339722137714228e-07, + "loss": 0.8777, + "step": 20460 + }, + { + "epoch": 0.49288311938537127, + "grad_norm": 1.573534607887268, + "learning_rate": 5.338454517797384e-07, + "loss": 0.9414, + "step": 20465 + }, + { + "epoch": 0.49300354037715854, + "grad_norm": 1.958213448524475, + "learning_rate": 5.337186897880539e-07, + "loss": 0.9488, + "step": 20470 + }, + { + "epoch": 0.4931239613689458, + "grad_norm": 1.4002881050109863, + "learning_rate": 5.335919277963695e-07, + "loss": 0.9283, + "step": 20475 + }, + { + "epoch": 0.49324438236073315, + "grad_norm": 1.6372034549713135, + "learning_rate": 5.334651658046851e-07, + "loss": 0.9578, + "step": 20480 + }, + { + "epoch": 0.4933648033525204, + "grad_norm": 1.5431723594665527, + "learning_rate": 5.333384038130007e-07, + "loss": 0.893, + "step": 20485 + }, + { + "epoch": 0.4934852243443077, + "grad_norm": 2.078256368637085, + "learning_rate": 5.332116418213163e-07, + "loss": 0.9235, + "step": 20490 + }, + { + "epoch": 0.49360564533609497, + "grad_norm": 1.4664630889892578, + "learning_rate": 5.33084879829632e-07, + "loss": 0.8957, + "step": 20495 + }, + { + "epoch": 0.4937260663278823, + "grad_norm": 1.4795653820037842, + "learning_rate": 5.329581178379474e-07, + "loss": 0.8877, + "step": 20500 + }, + { + "epoch": 0.49384648731966957, + "grad_norm": 1.6357628107070923, + "learning_rate": 5.32831355846263e-07, + "loss": 0.9041, + "step": 20505 + }, + { + "epoch": 0.49396690831145684, + "grad_norm": 1.5398471355438232, + "learning_rate": 5.327045938545787e-07, + "loss": 0.9487, + "step": 20510 + }, + { + "epoch": 0.4940873293032441, + "grad_norm": 1.6561620235443115, + "learning_rate": 5.325778318628942e-07, + "loss": 0.8507, + "step": 20515 + }, + { + "epoch": 0.49420775029503144, + "grad_norm": 1.5499076843261719, + "learning_rate": 5.324510698712097e-07, + "loss": 0.8637, + "step": 20520 + }, + { + "epoch": 0.4943281712868187, + "grad_norm": 1.4668582677841187, + "learning_rate": 5.323243078795254e-07, + "loss": 0.8989, + "step": 20525 + }, + { + "epoch": 0.494448592278606, + "grad_norm": 1.350682020187378, + "learning_rate": 5.321975458878409e-07, + "loss": 0.8597, + "step": 20530 + }, + { + "epoch": 0.4945690132703933, + "grad_norm": 1.4209243059158325, + "learning_rate": 5.320707838961566e-07, + "loss": 0.961, + "step": 20535 + }, + { + "epoch": 0.4946894342621806, + "grad_norm": 1.4936949014663696, + "learning_rate": 5.319440219044722e-07, + "loss": 0.8972, + "step": 20540 + }, + { + "epoch": 0.49480985525396787, + "grad_norm": 1.6538289785385132, + "learning_rate": 5.318172599127877e-07, + "loss": 0.9161, + "step": 20545 + }, + { + "epoch": 0.49493027624575514, + "grad_norm": 1.5872825384140015, + "learning_rate": 5.316904979211033e-07, + "loss": 0.856, + "step": 20550 + }, + { + "epoch": 0.49505069723754247, + "grad_norm": 1.6348376274108887, + "learning_rate": 5.315637359294189e-07, + "loss": 0.944, + "step": 20555 + }, + { + "epoch": 0.49517111822932974, + "grad_norm": 1.436761498451233, + "learning_rate": 5.314369739377345e-07, + "loss": 0.8888, + "step": 20560 + }, + { + "epoch": 0.495291539221117, + "grad_norm": 1.2941811084747314, + "learning_rate": 5.313102119460501e-07, + "loss": 0.8922, + "step": 20565 + }, + { + "epoch": 0.4954119602129043, + "grad_norm": 1.4106999635696411, + "learning_rate": 5.311834499543657e-07, + "loss": 0.9333, + "step": 20570 + }, + { + "epoch": 0.4955323812046916, + "grad_norm": 1.6659363508224487, + "learning_rate": 5.310566879626812e-07, + "loss": 0.9821, + "step": 20575 + }, + { + "epoch": 0.4956528021964789, + "grad_norm": 1.6132570505142212, + "learning_rate": 5.309299259709969e-07, + "loss": 0.8684, + "step": 20580 + }, + { + "epoch": 0.49577322318826617, + "grad_norm": 1.6882359981536865, + "learning_rate": 5.308031639793125e-07, + "loss": 0.8775, + "step": 20585 + }, + { + "epoch": 0.4958936441800535, + "grad_norm": 1.4514377117156982, + "learning_rate": 5.306764019876279e-07, + "loss": 0.95, + "step": 20590 + }, + { + "epoch": 0.49601406517184077, + "grad_norm": 1.6146961450576782, + "learning_rate": 5.305496399959436e-07, + "loss": 0.9243, + "step": 20595 + }, + { + "epoch": 0.49613448616362804, + "grad_norm": 1.6913578510284424, + "learning_rate": 5.304228780042592e-07, + "loss": 0.9379, + "step": 20600 + }, + { + "epoch": 0.4962549071554153, + "grad_norm": 1.521320104598999, + "learning_rate": 5.302961160125748e-07, + "loss": 0.8975, + "step": 20605 + }, + { + "epoch": 0.49637532814720264, + "grad_norm": 1.677592158317566, + "learning_rate": 5.301693540208904e-07, + "loss": 0.8753, + "step": 20610 + }, + { + "epoch": 0.4964957491389899, + "grad_norm": 1.498340129852295, + "learning_rate": 5.300425920292059e-07, + "loss": 0.872, + "step": 20615 + }, + { + "epoch": 0.4966161701307772, + "grad_norm": 1.684953212738037, + "learning_rate": 5.299158300375215e-07, + "loss": 0.952, + "step": 20620 + }, + { + "epoch": 0.49673659112256446, + "grad_norm": 1.5729172229766846, + "learning_rate": 5.297890680458371e-07, + "loss": 0.9443, + "step": 20625 + }, + { + "epoch": 0.4968570121143518, + "grad_norm": 1.7612193822860718, + "learning_rate": 5.296623060541528e-07, + "loss": 0.9308, + "step": 20630 + }, + { + "epoch": 0.49697743310613907, + "grad_norm": 1.7964433431625366, + "learning_rate": 5.295355440624682e-07, + "loss": 0.9247, + "step": 20635 + }, + { + "epoch": 0.49709785409792634, + "grad_norm": 1.82650625705719, + "learning_rate": 5.294087820707838e-07, + "loss": 0.95, + "step": 20640 + }, + { + "epoch": 0.4972182750897136, + "grad_norm": 1.417760968208313, + "learning_rate": 5.292820200790995e-07, + "loss": 0.9364, + "step": 20645 + }, + { + "epoch": 0.49733869608150094, + "grad_norm": 1.361054539680481, + "learning_rate": 5.29155258087415e-07, + "loss": 0.9345, + "step": 20650 + }, + { + "epoch": 0.4974591170732882, + "grad_norm": 1.4298704862594604, + "learning_rate": 5.290284960957307e-07, + "loss": 0.9849, + "step": 20655 + }, + { + "epoch": 0.4975795380650755, + "grad_norm": 1.7009376287460327, + "learning_rate": 5.289017341040462e-07, + "loss": 0.9051, + "step": 20660 + }, + { + "epoch": 0.4976999590568628, + "grad_norm": 1.4122412204742432, + "learning_rate": 5.287749721123618e-07, + "loss": 0.8759, + "step": 20665 + }, + { + "epoch": 0.4978203800486501, + "grad_norm": 1.7190061807632446, + "learning_rate": 5.286482101206774e-07, + "loss": 0.9077, + "step": 20670 + }, + { + "epoch": 0.49794080104043736, + "grad_norm": 1.5603126287460327, + "learning_rate": 5.28521448128993e-07, + "loss": 0.86, + "step": 20675 + }, + { + "epoch": 0.49806122203222464, + "grad_norm": 1.577416181564331, + "learning_rate": 5.283946861373086e-07, + "loss": 0.9753, + "step": 20680 + }, + { + "epoch": 0.49818164302401197, + "grad_norm": 1.4858267307281494, + "learning_rate": 5.282679241456241e-07, + "loss": 0.8542, + "step": 20685 + }, + { + "epoch": 0.49830206401579924, + "grad_norm": 1.597087025642395, + "learning_rate": 5.281411621539398e-07, + "loss": 0.8425, + "step": 20690 + }, + { + "epoch": 0.4984224850075865, + "grad_norm": 1.4735974073410034, + "learning_rate": 5.280144001622553e-07, + "loss": 0.9159, + "step": 20695 + }, + { + "epoch": 0.4985429059993738, + "grad_norm": 1.483817458152771, + "learning_rate": 5.27887638170571e-07, + "loss": 0.8468, + "step": 20700 + }, + { + "epoch": 0.4986633269911611, + "grad_norm": 2.0631678104400635, + "learning_rate": 5.277608761788865e-07, + "loss": 0.8393, + "step": 20705 + }, + { + "epoch": 0.4987837479829484, + "grad_norm": 1.443610668182373, + "learning_rate": 5.27634114187202e-07, + "loss": 0.9125, + "step": 20710 + }, + { + "epoch": 0.49890416897473566, + "grad_norm": 1.6772816181182861, + "learning_rate": 5.275073521955177e-07, + "loss": 0.8757, + "step": 20715 + }, + { + "epoch": 0.499024589966523, + "grad_norm": 1.4628548622131348, + "learning_rate": 5.273805902038333e-07, + "loss": 0.9609, + "step": 20720 + }, + { + "epoch": 0.49914501095831026, + "grad_norm": 1.4909286499023438, + "learning_rate": 5.272538282121489e-07, + "loss": 0.9024, + "step": 20725 + }, + { + "epoch": 0.49926543195009754, + "grad_norm": 1.5248208045959473, + "learning_rate": 5.271270662204644e-07, + "loss": 0.8609, + "step": 20730 + }, + { + "epoch": 0.4993858529418848, + "grad_norm": 1.7877882719039917, + "learning_rate": 5.2700030422878e-07, + "loss": 0.8867, + "step": 20735 + }, + { + "epoch": 0.49950627393367214, + "grad_norm": 1.4456939697265625, + "learning_rate": 5.268735422370956e-07, + "loss": 0.8803, + "step": 20740 + }, + { + "epoch": 0.4996266949254594, + "grad_norm": 1.5736900568008423, + "learning_rate": 5.267467802454112e-07, + "loss": 0.864, + "step": 20745 + }, + { + "epoch": 0.4997471159172467, + "grad_norm": 1.497660756111145, + "learning_rate": 5.266200182537269e-07, + "loss": 0.9403, + "step": 20750 + }, + { + "epoch": 0.49986753690903396, + "grad_norm": 1.4500926733016968, + "learning_rate": 5.264932562620423e-07, + "loss": 0.9268, + "step": 20755 + }, + { + "epoch": 0.4999879579008213, + "grad_norm": 1.5371147394180298, + "learning_rate": 5.263664942703579e-07, + "loss": 0.9218, + "step": 20760 + }, + { + "epoch": 0.5001083788926085, + "grad_norm": 1.5035649538040161, + "learning_rate": 5.262397322786736e-07, + "loss": 0.8771, + "step": 20765 + }, + { + "epoch": 0.5002287998843958, + "grad_norm": 1.7202868461608887, + "learning_rate": 5.261129702869891e-07, + "loss": 0.9059, + "step": 20770 + }, + { + "epoch": 0.5003492208761832, + "grad_norm": 1.4313465356826782, + "learning_rate": 5.259862082953047e-07, + "loss": 0.9434, + "step": 20775 + }, + { + "epoch": 0.5004696418679704, + "grad_norm": 1.7266535758972168, + "learning_rate": 5.258594463036203e-07, + "loss": 0.9265, + "step": 20780 + }, + { + "epoch": 0.5005900628597577, + "grad_norm": 1.4797693490982056, + "learning_rate": 5.257326843119358e-07, + "loss": 0.9025, + "step": 20785 + }, + { + "epoch": 0.500710483851545, + "grad_norm": 1.5850337743759155, + "learning_rate": 5.256059223202515e-07, + "loss": 0.8846, + "step": 20790 + }, + { + "epoch": 0.5008309048433323, + "grad_norm": 1.5374903678894043, + "learning_rate": 5.254791603285671e-07, + "loss": 0.9484, + "step": 20795 + }, + { + "epoch": 0.5009513258351196, + "grad_norm": 1.6088073253631592, + "learning_rate": 5.253523983368826e-07, + "loss": 0.8937, + "step": 20800 + }, + { + "epoch": 0.5010717468269069, + "grad_norm": 1.4605728387832642, + "learning_rate": 5.252256363451982e-07, + "loss": 0.8833, + "step": 20805 + }, + { + "epoch": 0.5011921678186941, + "grad_norm": 1.5764379501342773, + "learning_rate": 5.250988743535139e-07, + "loss": 0.8859, + "step": 20810 + }, + { + "epoch": 0.5013125888104815, + "grad_norm": 1.687082290649414, + "learning_rate": 5.249721123618294e-07, + "loss": 0.9462, + "step": 20815 + }, + { + "epoch": 0.5014330098022687, + "grad_norm": 1.431099534034729, + "learning_rate": 5.248453503701449e-07, + "loss": 0.8679, + "step": 20820 + }, + { + "epoch": 0.501553430794056, + "grad_norm": 1.5310856103897095, + "learning_rate": 5.247185883784606e-07, + "loss": 0.929, + "step": 20825 + }, + { + "epoch": 0.5016738517858433, + "grad_norm": 1.4208393096923828, + "learning_rate": 5.245918263867761e-07, + "loss": 0.8908, + "step": 20830 + }, + { + "epoch": 0.5017942727776306, + "grad_norm": 1.3968501091003418, + "learning_rate": 5.244650643950918e-07, + "loss": 0.8749, + "step": 20835 + }, + { + "epoch": 0.5019146937694179, + "grad_norm": 1.4458131790161133, + "learning_rate": 5.243383024034074e-07, + "loss": 0.9871, + "step": 20840 + }, + { + "epoch": 0.5020351147612052, + "grad_norm": 1.638501763343811, + "learning_rate": 5.242115404117228e-07, + "loss": 0.9494, + "step": 20845 + }, + { + "epoch": 0.5021555357529924, + "grad_norm": 2.0048091411590576, + "learning_rate": 5.240847784200385e-07, + "loss": 0.8665, + "step": 20850 + }, + { + "epoch": 0.5022759567447798, + "grad_norm": 1.4994688034057617, + "learning_rate": 5.239580164283541e-07, + "loss": 0.9034, + "step": 20855 + }, + { + "epoch": 0.5023963777365671, + "grad_norm": 1.4207297563552856, + "learning_rate": 5.238312544366697e-07, + "loss": 0.9173, + "step": 20860 + }, + { + "epoch": 0.5025167987283543, + "grad_norm": 1.6375930309295654, + "learning_rate": 5.237044924449853e-07, + "loss": 0.9082, + "step": 20865 + }, + { + "epoch": 0.5026372197201416, + "grad_norm": 1.6961311101913452, + "learning_rate": 5.235777304533009e-07, + "loss": 0.9142, + "step": 20870 + }, + { + "epoch": 0.5027576407119289, + "grad_norm": 1.500553011894226, + "learning_rate": 5.234509684616164e-07, + "loss": 0.8992, + "step": 20875 + }, + { + "epoch": 0.5028780617037162, + "grad_norm": 1.6316986083984375, + "learning_rate": 5.23324206469932e-07, + "loss": 0.8745, + "step": 20880 + }, + { + "epoch": 0.5029984826955035, + "grad_norm": 1.5603423118591309, + "learning_rate": 5.231974444782477e-07, + "loss": 0.913, + "step": 20885 + }, + { + "epoch": 0.5031189036872907, + "grad_norm": 1.532484531402588, + "learning_rate": 5.230706824865631e-07, + "loss": 0.8834, + "step": 20890 + }, + { + "epoch": 0.5032393246790781, + "grad_norm": 1.8932911157608032, + "learning_rate": 5.229439204948788e-07, + "loss": 0.904, + "step": 20895 + }, + { + "epoch": 0.5033597456708654, + "grad_norm": 1.5125609636306763, + "learning_rate": 5.228171585031944e-07, + "loss": 0.8856, + "step": 20900 + }, + { + "epoch": 0.5034801666626526, + "grad_norm": 1.4216543436050415, + "learning_rate": 5.226903965115099e-07, + "loss": 0.9319, + "step": 20905 + }, + { + "epoch": 0.5036005876544399, + "grad_norm": 1.739990234375, + "learning_rate": 5.225636345198256e-07, + "loss": 0.8709, + "step": 20910 + }, + { + "epoch": 0.5037210086462273, + "grad_norm": 1.427313208580017, + "learning_rate": 5.224368725281411e-07, + "loss": 0.8995, + "step": 20915 + }, + { + "epoch": 0.5038414296380145, + "grad_norm": 1.5396627187728882, + "learning_rate": 5.223101105364567e-07, + "loss": 0.9248, + "step": 20920 + }, + { + "epoch": 0.5039618506298018, + "grad_norm": 1.3887165784835815, + "learning_rate": 5.221833485447723e-07, + "loss": 0.895, + "step": 20925 + }, + { + "epoch": 0.504082271621589, + "grad_norm": 1.5276862382888794, + "learning_rate": 5.22056586553088e-07, + "loss": 0.8515, + "step": 20930 + }, + { + "epoch": 0.5042026926133764, + "grad_norm": 1.6163544654846191, + "learning_rate": 5.219298245614034e-07, + "loss": 0.8576, + "step": 20935 + }, + { + "epoch": 0.5043231136051637, + "grad_norm": 1.6514025926589966, + "learning_rate": 5.21803062569719e-07, + "loss": 0.9141, + "step": 20940 + }, + { + "epoch": 0.5044435345969509, + "grad_norm": 1.6270771026611328, + "learning_rate": 5.216763005780347e-07, + "loss": 0.8765, + "step": 20945 + }, + { + "epoch": 0.5045639555887382, + "grad_norm": 1.5601426362991333, + "learning_rate": 5.215495385863502e-07, + "loss": 0.8455, + "step": 20950 + }, + { + "epoch": 0.5046843765805256, + "grad_norm": 1.6472203731536865, + "learning_rate": 5.214227765946659e-07, + "loss": 0.8874, + "step": 20955 + }, + { + "epoch": 0.5048047975723128, + "grad_norm": 1.4407867193222046, + "learning_rate": 5.212960146029814e-07, + "loss": 0.902, + "step": 20960 + }, + { + "epoch": 0.5049252185641001, + "grad_norm": 1.5756430625915527, + "learning_rate": 5.211692526112969e-07, + "loss": 0.9313, + "step": 20965 + }, + { + "epoch": 0.5050456395558873, + "grad_norm": 1.4176348447799683, + "learning_rate": 5.210424906196126e-07, + "loss": 0.9203, + "step": 20970 + }, + { + "epoch": 0.5051660605476747, + "grad_norm": 1.592527151107788, + "learning_rate": 5.209157286279282e-07, + "loss": 0.9171, + "step": 20975 + }, + { + "epoch": 0.505286481539462, + "grad_norm": 1.5845403671264648, + "learning_rate": 5.207889666362438e-07, + "loss": 0.9015, + "step": 20980 + }, + { + "epoch": 0.5054069025312492, + "grad_norm": 1.458234190940857, + "learning_rate": 5.206622046445593e-07, + "loss": 0.8887, + "step": 20985 + }, + { + "epoch": 0.5055273235230365, + "grad_norm": 1.497634768486023, + "learning_rate": 5.20535442652875e-07, + "loss": 0.8902, + "step": 20990 + }, + { + "epoch": 0.5056477445148239, + "grad_norm": 1.5629130601882935, + "learning_rate": 5.204086806611905e-07, + "loss": 0.9415, + "step": 20995 + }, + { + "epoch": 0.5057681655066111, + "grad_norm": 1.4125767946243286, + "learning_rate": 5.202819186695061e-07, + "loss": 0.892, + "step": 21000 + }, + { + "epoch": 0.5058885864983984, + "grad_norm": 1.7816734313964844, + "learning_rate": 5.201551566778217e-07, + "loss": 0.8771, + "step": 21005 + }, + { + "epoch": 0.5060090074901857, + "grad_norm": 1.4958962202072144, + "learning_rate": 5.200283946861372e-07, + "loss": 0.9123, + "step": 21010 + }, + { + "epoch": 0.506129428481973, + "grad_norm": 1.4297374486923218, + "learning_rate": 5.199016326944529e-07, + "loss": 0.9241, + "step": 21015 + }, + { + "epoch": 0.5062498494737603, + "grad_norm": 1.6796749830245972, + "learning_rate": 5.197748707027685e-07, + "loss": 0.9057, + "step": 21020 + }, + { + "epoch": 0.5063702704655475, + "grad_norm": 1.3399157524108887, + "learning_rate": 5.19648108711084e-07, + "loss": 0.8854, + "step": 21025 + }, + { + "epoch": 0.5064906914573348, + "grad_norm": 1.734200358390808, + "learning_rate": 5.195213467193996e-07, + "loss": 0.9212, + "step": 21030 + }, + { + "epoch": 0.5066111124491222, + "grad_norm": 1.377709150314331, + "learning_rate": 5.193945847277152e-07, + "loss": 0.8889, + "step": 21035 + }, + { + "epoch": 0.5067315334409094, + "grad_norm": 1.5945323705673218, + "learning_rate": 5.192678227360308e-07, + "loss": 0.9097, + "step": 21040 + }, + { + "epoch": 0.5068519544326967, + "grad_norm": 1.528894305229187, + "learning_rate": 5.191410607443464e-07, + "loss": 0.9147, + "step": 21045 + }, + { + "epoch": 0.506972375424484, + "grad_norm": 1.6350053548812866, + "learning_rate": 5.190142987526621e-07, + "loss": 0.9181, + "step": 21050 + }, + { + "epoch": 0.5070927964162713, + "grad_norm": 1.4047951698303223, + "learning_rate": 5.188875367609775e-07, + "loss": 0.9403, + "step": 21055 + }, + { + "epoch": 0.5072132174080586, + "grad_norm": 1.8590868711471558, + "learning_rate": 5.187607747692931e-07, + "loss": 0.8864, + "step": 21060 + }, + { + "epoch": 0.5073336383998459, + "grad_norm": 1.7068983316421509, + "learning_rate": 5.186340127776088e-07, + "loss": 0.9501, + "step": 21065 + }, + { + "epoch": 0.5074540593916331, + "grad_norm": 1.4411805868148804, + "learning_rate": 5.185072507859243e-07, + "loss": 0.9195, + "step": 21070 + }, + { + "epoch": 0.5075744803834205, + "grad_norm": 1.4354585409164429, + "learning_rate": 5.183804887942399e-07, + "loss": 0.8736, + "step": 21075 + }, + { + "epoch": 0.5076949013752077, + "grad_norm": 1.5002541542053223, + "learning_rate": 5.182537268025555e-07, + "loss": 0.8807, + "step": 21080 + }, + { + "epoch": 0.507815322366995, + "grad_norm": 1.4930588006973267, + "learning_rate": 5.18126964810871e-07, + "loss": 0.9046, + "step": 21085 + }, + { + "epoch": 0.5079357433587823, + "grad_norm": 1.43183171749115, + "learning_rate": 5.180002028191867e-07, + "loss": 0.9073, + "step": 21090 + }, + { + "epoch": 0.5080561643505696, + "grad_norm": 1.4301148653030396, + "learning_rate": 5.178734408275023e-07, + "loss": 0.9289, + "step": 21095 + }, + { + "epoch": 0.5081765853423569, + "grad_norm": 1.5547271966934204, + "learning_rate": 5.177466788358178e-07, + "loss": 0.917, + "step": 21100 + }, + { + "epoch": 0.5082970063341442, + "grad_norm": 1.5416436195373535, + "learning_rate": 5.176199168441334e-07, + "loss": 0.9489, + "step": 21105 + }, + { + "epoch": 0.5084174273259314, + "grad_norm": 1.5894789695739746, + "learning_rate": 5.17493154852449e-07, + "loss": 0.8787, + "step": 21110 + }, + { + "epoch": 0.5085378483177188, + "grad_norm": 1.686223030090332, + "learning_rate": 5.173663928607646e-07, + "loss": 0.912, + "step": 21115 + }, + { + "epoch": 0.5086582693095061, + "grad_norm": 1.5424811840057373, + "learning_rate": 5.172396308690801e-07, + "loss": 0.8377, + "step": 21120 + }, + { + "epoch": 0.5087786903012933, + "grad_norm": 1.7151985168457031, + "learning_rate": 5.171128688773958e-07, + "loss": 0.9181, + "step": 21125 + }, + { + "epoch": 0.5088991112930806, + "grad_norm": 3.097501277923584, + "learning_rate": 5.169861068857113e-07, + "loss": 0.8949, + "step": 21130 + }, + { + "epoch": 0.5090195322848678, + "grad_norm": 1.4970147609710693, + "learning_rate": 5.16859344894027e-07, + "loss": 0.8966, + "step": 21135 + }, + { + "epoch": 0.5091399532766552, + "grad_norm": 1.6402392387390137, + "learning_rate": 5.167325829023426e-07, + "loss": 0.9161, + "step": 21140 + }, + { + "epoch": 0.5092603742684425, + "grad_norm": 1.5018306970596313, + "learning_rate": 5.16605820910658e-07, + "loss": 0.8831, + "step": 21145 + }, + { + "epoch": 0.5093807952602297, + "grad_norm": 1.8322926759719849, + "learning_rate": 5.164790589189737e-07, + "loss": 0.8392, + "step": 21150 + }, + { + "epoch": 0.509501216252017, + "grad_norm": 1.645729899406433, + "learning_rate": 5.163522969272893e-07, + "loss": 0.901, + "step": 21155 + }, + { + "epoch": 0.5096216372438044, + "grad_norm": 1.7000799179077148, + "learning_rate": 5.162255349356049e-07, + "loss": 0.8458, + "step": 21160 + }, + { + "epoch": 0.5097420582355916, + "grad_norm": 1.3836746215820312, + "learning_rate": 5.160987729439205e-07, + "loss": 0.9419, + "step": 21165 + }, + { + "epoch": 0.5098624792273789, + "grad_norm": 1.404828429222107, + "learning_rate": 5.15972010952236e-07, + "loss": 0.9054, + "step": 21170 + }, + { + "epoch": 0.5099829002191663, + "grad_norm": 1.502503514289856, + "learning_rate": 5.158452489605516e-07, + "loss": 0.9748, + "step": 21175 + }, + { + "epoch": 0.5101033212109535, + "grad_norm": 1.533842921257019, + "learning_rate": 5.157184869688672e-07, + "loss": 0.9296, + "step": 21180 + }, + { + "epoch": 0.5102237422027408, + "grad_norm": 1.3932472467422485, + "learning_rate": 5.155917249771829e-07, + "loss": 0.9343, + "step": 21185 + }, + { + "epoch": 0.510344163194528, + "grad_norm": 1.562790870666504, + "learning_rate": 5.154649629854983e-07, + "loss": 0.9493, + "step": 21190 + }, + { + "epoch": 0.5104645841863154, + "grad_norm": 1.5711618661880493, + "learning_rate": 5.15338200993814e-07, + "loss": 0.9587, + "step": 21195 + }, + { + "epoch": 0.5105850051781027, + "grad_norm": 1.5852607488632202, + "learning_rate": 5.152114390021296e-07, + "loss": 0.9167, + "step": 21200 + }, + { + "epoch": 0.5107054261698899, + "grad_norm": 1.6820813417434692, + "learning_rate": 5.150846770104451e-07, + "loss": 0.8955, + "step": 21205 + }, + { + "epoch": 0.5108258471616772, + "grad_norm": 1.5433590412139893, + "learning_rate": 5.149579150187608e-07, + "loss": 0.9351, + "step": 21210 + }, + { + "epoch": 0.5109462681534646, + "grad_norm": 1.5106788873672485, + "learning_rate": 5.148311530270763e-07, + "loss": 0.8874, + "step": 21215 + }, + { + "epoch": 0.5110666891452518, + "grad_norm": 1.558750867843628, + "learning_rate": 5.147043910353919e-07, + "loss": 0.9364, + "step": 21220 + }, + { + "epoch": 0.5111871101370391, + "grad_norm": 1.4906758069992065, + "learning_rate": 5.145776290437075e-07, + "loss": 0.868, + "step": 21225 + }, + { + "epoch": 0.5113075311288264, + "grad_norm": 1.4711549282073975, + "learning_rate": 5.144508670520231e-07, + "loss": 0.9563, + "step": 21230 + }, + { + "epoch": 0.5114279521206136, + "grad_norm": 1.7329277992248535, + "learning_rate": 5.143241050603386e-07, + "loss": 0.9388, + "step": 21235 + }, + { + "epoch": 0.511548373112401, + "grad_norm": 1.5840879678726196, + "learning_rate": 5.141973430686542e-07, + "loss": 0.924, + "step": 21240 + }, + { + "epoch": 0.5116687941041882, + "grad_norm": 1.4744466543197632, + "learning_rate": 5.140705810769699e-07, + "loss": 0.9193, + "step": 21245 + }, + { + "epoch": 0.5117892150959755, + "grad_norm": 1.7825807332992554, + "learning_rate": 5.139438190852854e-07, + "loss": 0.9507, + "step": 21250 + }, + { + "epoch": 0.5119096360877629, + "grad_norm": 1.3288898468017578, + "learning_rate": 5.138170570936011e-07, + "loss": 0.9705, + "step": 21255 + }, + { + "epoch": 0.5120300570795501, + "grad_norm": 1.7006186246871948, + "learning_rate": 5.136902951019166e-07, + "loss": 0.9218, + "step": 21260 + }, + { + "epoch": 0.5121504780713374, + "grad_norm": 1.5173015594482422, + "learning_rate": 5.135635331102321e-07, + "loss": 0.9247, + "step": 21265 + }, + { + "epoch": 0.5122708990631247, + "grad_norm": 1.546424150466919, + "learning_rate": 5.134367711185478e-07, + "loss": 0.9388, + "step": 21270 + }, + { + "epoch": 0.512391320054912, + "grad_norm": 1.7654190063476562, + "learning_rate": 5.133100091268634e-07, + "loss": 0.941, + "step": 21275 + }, + { + "epoch": 0.5125117410466993, + "grad_norm": 1.3392126560211182, + "learning_rate": 5.13183247135179e-07, + "loss": 0.8702, + "step": 21280 + }, + { + "epoch": 0.5126321620384865, + "grad_norm": 2.080995798110962, + "learning_rate": 5.130564851434945e-07, + "loss": 0.9056, + "step": 21285 + }, + { + "epoch": 0.5127525830302738, + "grad_norm": 1.460829257965088, + "learning_rate": 5.129297231518101e-07, + "loss": 0.8453, + "step": 21290 + }, + { + "epoch": 0.5128730040220612, + "grad_norm": 1.692646861076355, + "learning_rate": 5.128029611601258e-07, + "loss": 0.9056, + "step": 21295 + }, + { + "epoch": 0.5129934250138484, + "grad_norm": 1.5984737873077393, + "learning_rate": 5.126761991684413e-07, + "loss": 0.8921, + "step": 21300 + }, + { + "epoch": 0.5131138460056357, + "grad_norm": 1.5816010236740112, + "learning_rate": 5.125494371767569e-07, + "loss": 0.9293, + "step": 21305 + }, + { + "epoch": 0.513234266997423, + "grad_norm": 1.5962992906570435, + "learning_rate": 5.124226751850725e-07, + "loss": 0.8892, + "step": 21310 + }, + { + "epoch": 0.5133546879892102, + "grad_norm": 1.524151086807251, + "learning_rate": 5.12295913193388e-07, + "loss": 0.89, + "step": 21315 + }, + { + "epoch": 0.5134751089809976, + "grad_norm": 1.9870415925979614, + "learning_rate": 5.121691512017037e-07, + "loss": 0.9538, + "step": 21320 + }, + { + "epoch": 0.5135955299727849, + "grad_norm": 1.774303674697876, + "learning_rate": 5.120423892100193e-07, + "loss": 0.8695, + "step": 21325 + }, + { + "epoch": 0.5137159509645721, + "grad_norm": 1.5197960138320923, + "learning_rate": 5.119156272183348e-07, + "loss": 0.859, + "step": 21330 + }, + { + "epoch": 0.5138363719563595, + "grad_norm": 1.4730110168457031, + "learning_rate": 5.117888652266504e-07, + "loss": 0.8801, + "step": 21335 + }, + { + "epoch": 0.5139567929481467, + "grad_norm": 1.5757942199707031, + "learning_rate": 5.116621032349661e-07, + "loss": 0.8986, + "step": 21340 + }, + { + "epoch": 0.514077213939934, + "grad_norm": 1.787201166152954, + "learning_rate": 5.115353412432816e-07, + "loss": 0.9594, + "step": 21345 + }, + { + "epoch": 0.5141976349317213, + "grad_norm": 1.6020995378494263, + "learning_rate": 5.114085792515972e-07, + "loss": 0.8921, + "step": 21350 + }, + { + "epoch": 0.5143180559235085, + "grad_norm": 1.7841721773147583, + "learning_rate": 5.112818172599128e-07, + "loss": 0.8728, + "step": 21355 + }, + { + "epoch": 0.5144384769152959, + "grad_norm": 1.4041569232940674, + "learning_rate": 5.111550552682283e-07, + "loss": 0.86, + "step": 21360 + }, + { + "epoch": 0.5145588979070832, + "grad_norm": 1.733370065689087, + "learning_rate": 5.11028293276544e-07, + "loss": 0.9056, + "step": 21365 + }, + { + "epoch": 0.5146793188988704, + "grad_norm": 1.4316951036453247, + "learning_rate": 5.109015312848596e-07, + "loss": 0.9368, + "step": 21370 + }, + { + "epoch": 0.5147997398906577, + "grad_norm": 1.5830496549606323, + "learning_rate": 5.10774769293175e-07, + "loss": 0.9564, + "step": 21375 + }, + { + "epoch": 0.5149201608824451, + "grad_norm": 1.5182487964630127, + "learning_rate": 5.106480073014907e-07, + "loss": 0.9204, + "step": 21380 + }, + { + "epoch": 0.5150405818742323, + "grad_norm": 1.5096020698547363, + "learning_rate": 5.105212453098063e-07, + "loss": 0.8937, + "step": 21385 + }, + { + "epoch": 0.5151610028660196, + "grad_norm": 1.4812935590744019, + "learning_rate": 5.103944833181219e-07, + "loss": 0.9174, + "step": 21390 + }, + { + "epoch": 0.5152814238578068, + "grad_norm": 1.6584185361862183, + "learning_rate": 5.102677213264375e-07, + "loss": 0.9187, + "step": 21395 + }, + { + "epoch": 0.5154018448495942, + "grad_norm": 1.5996214151382446, + "learning_rate": 5.101409593347531e-07, + "loss": 0.9131, + "step": 21400 + }, + { + "epoch": 0.5155222658413815, + "grad_norm": 1.6251389980316162, + "learning_rate": 5.100141973430686e-07, + "loss": 0.8343, + "step": 21405 + }, + { + "epoch": 0.5156426868331687, + "grad_norm": 1.4660563468933105, + "learning_rate": 5.098874353513842e-07, + "loss": 0.927, + "step": 21410 + }, + { + "epoch": 0.515763107824956, + "grad_norm": 1.6889156103134155, + "learning_rate": 5.097606733596999e-07, + "loss": 0.8799, + "step": 21415 + }, + { + "epoch": 0.5158835288167434, + "grad_norm": 1.6388341188430786, + "learning_rate": 5.096339113680153e-07, + "loss": 0.9416, + "step": 21420 + }, + { + "epoch": 0.5160039498085306, + "grad_norm": 1.6233501434326172, + "learning_rate": 5.09507149376331e-07, + "loss": 0.8411, + "step": 21425 + }, + { + "epoch": 0.5161243708003179, + "grad_norm": 1.8281123638153076, + "learning_rate": 5.093803873846466e-07, + "loss": 0.9372, + "step": 21430 + }, + { + "epoch": 0.5162447917921053, + "grad_norm": 1.526695728302002, + "learning_rate": 5.092536253929621e-07, + "loss": 0.9745, + "step": 21435 + }, + { + "epoch": 0.5163652127838925, + "grad_norm": 1.6644231081008911, + "learning_rate": 5.091268634012778e-07, + "loss": 0.9452, + "step": 21440 + }, + { + "epoch": 0.5164856337756798, + "grad_norm": 1.6076133251190186, + "learning_rate": 5.090001014095933e-07, + "loss": 0.8995, + "step": 21445 + }, + { + "epoch": 0.516606054767467, + "grad_norm": 1.7203131914138794, + "learning_rate": 5.088733394179089e-07, + "loss": 0.9067, + "step": 21450 + }, + { + "epoch": 0.5167264757592543, + "grad_norm": 1.5742437839508057, + "learning_rate": 5.087465774262245e-07, + "loss": 0.87, + "step": 21455 + }, + { + "epoch": 0.5168468967510417, + "grad_norm": 1.6953926086425781, + "learning_rate": 5.086198154345402e-07, + "loss": 0.8837, + "step": 21460 + }, + { + "epoch": 0.5169673177428289, + "grad_norm": 1.421783208847046, + "learning_rate": 5.084930534428557e-07, + "loss": 0.8819, + "step": 21465 + }, + { + "epoch": 0.5170877387346162, + "grad_norm": 1.4581353664398193, + "learning_rate": 5.083662914511712e-07, + "loss": 0.938, + "step": 21470 + }, + { + "epoch": 0.5172081597264035, + "grad_norm": 1.642289638519287, + "learning_rate": 5.082395294594869e-07, + "loss": 0.9865, + "step": 21475 + }, + { + "epoch": 0.5173285807181908, + "grad_norm": 1.6251811981201172, + "learning_rate": 5.081127674678024e-07, + "loss": 0.924, + "step": 21480 + }, + { + "epoch": 0.5174490017099781, + "grad_norm": 1.596927285194397, + "learning_rate": 5.079860054761181e-07, + "loss": 0.9044, + "step": 21485 + }, + { + "epoch": 0.5175694227017654, + "grad_norm": 1.4459487199783325, + "learning_rate": 5.078592434844336e-07, + "loss": 0.9021, + "step": 21490 + }, + { + "epoch": 0.5176898436935526, + "grad_norm": 1.5127309560775757, + "learning_rate": 5.077324814927491e-07, + "loss": 0.9085, + "step": 21495 + }, + { + "epoch": 0.51781026468534, + "grad_norm": 1.7187678813934326, + "learning_rate": 5.076057195010648e-07, + "loss": 0.9297, + "step": 21500 + }, + { + "epoch": 0.5179306856771272, + "grad_norm": 1.4774119853973389, + "learning_rate": 5.074789575093804e-07, + "loss": 0.9093, + "step": 21505 + }, + { + "epoch": 0.5180511066689145, + "grad_norm": 1.88589608669281, + "learning_rate": 5.07352195517696e-07, + "loss": 0.8903, + "step": 21510 + }, + { + "epoch": 0.5181715276607018, + "grad_norm": 1.450732946395874, + "learning_rate": 5.072254335260115e-07, + "loss": 0.8562, + "step": 21515 + }, + { + "epoch": 0.5182919486524891, + "grad_norm": 1.2906776666641235, + "learning_rate": 5.070986715343272e-07, + "loss": 0.8162, + "step": 21520 + }, + { + "epoch": 0.5184123696442764, + "grad_norm": 1.682756781578064, + "learning_rate": 5.069719095426427e-07, + "loss": 0.9344, + "step": 21525 + }, + { + "epoch": 0.5185327906360637, + "grad_norm": 1.6122827529907227, + "learning_rate": 5.068451475509583e-07, + "loss": 0.9121, + "step": 21530 + }, + { + "epoch": 0.5186532116278509, + "grad_norm": 1.6736096143722534, + "learning_rate": 5.06718385559274e-07, + "loss": 0.9253, + "step": 21535 + }, + { + "epoch": 0.5187736326196383, + "grad_norm": 1.7354846000671387, + "learning_rate": 5.065916235675894e-07, + "loss": 0.9529, + "step": 21540 + }, + { + "epoch": 0.5188940536114256, + "grad_norm": 1.8778737783432007, + "learning_rate": 5.064648615759051e-07, + "loss": 0.859, + "step": 21545 + }, + { + "epoch": 0.5190144746032128, + "grad_norm": 1.4433398246765137, + "learning_rate": 5.063380995842207e-07, + "loss": 0.8193, + "step": 21550 + }, + { + "epoch": 0.5191348955950001, + "grad_norm": 1.7733087539672852, + "learning_rate": 5.062113375925362e-07, + "loss": 0.8426, + "step": 21555 + }, + { + "epoch": 0.5192553165867874, + "grad_norm": 1.736220359802246, + "learning_rate": 5.060845756008518e-07, + "loss": 0.9707, + "step": 21560 + }, + { + "epoch": 0.5193757375785747, + "grad_norm": 1.5831776857376099, + "learning_rate": 5.059578136091674e-07, + "loss": 0.956, + "step": 21565 + }, + { + "epoch": 0.519496158570362, + "grad_norm": 1.6207410097122192, + "learning_rate": 5.05831051617483e-07, + "loss": 0.87, + "step": 21570 + }, + { + "epoch": 0.5196165795621492, + "grad_norm": 1.3546632528305054, + "learning_rate": 5.057042896257986e-07, + "loss": 0.9305, + "step": 21575 + }, + { + "epoch": 0.5197370005539366, + "grad_norm": 1.4451007843017578, + "learning_rate": 5.055775276341143e-07, + "loss": 0.9327, + "step": 21580 + }, + { + "epoch": 0.5198574215457239, + "grad_norm": 1.677097201347351, + "learning_rate": 5.054507656424297e-07, + "loss": 0.8981, + "step": 21585 + }, + { + "epoch": 0.5199778425375111, + "grad_norm": 1.4165575504302979, + "learning_rate": 5.053240036507453e-07, + "loss": 0.9142, + "step": 21590 + }, + { + "epoch": 0.5200982635292984, + "grad_norm": 1.5407862663269043, + "learning_rate": 5.05197241659061e-07, + "loss": 0.8778, + "step": 21595 + }, + { + "epoch": 0.5202186845210857, + "grad_norm": 1.4036144018173218, + "learning_rate": 5.050704796673765e-07, + "loss": 0.9203, + "step": 21600 + }, + { + "epoch": 0.520339105512873, + "grad_norm": 1.6382545232772827, + "learning_rate": 5.049437176756921e-07, + "loss": 0.9628, + "step": 21605 + }, + { + "epoch": 0.5204595265046603, + "grad_norm": 1.9323283433914185, + "learning_rate": 5.048169556840077e-07, + "loss": 0.9123, + "step": 21610 + }, + { + "epoch": 0.5205799474964475, + "grad_norm": 1.5797611474990845, + "learning_rate": 5.046901936923232e-07, + "loss": 0.9022, + "step": 21615 + }, + { + "epoch": 0.5207003684882349, + "grad_norm": 1.5481973886489868, + "learning_rate": 5.045634317006389e-07, + "loss": 0.9458, + "step": 21620 + }, + { + "epoch": 0.5208207894800222, + "grad_norm": 1.4047622680664062, + "learning_rate": 5.044366697089545e-07, + "loss": 0.8682, + "step": 21625 + }, + { + "epoch": 0.5209412104718094, + "grad_norm": 1.4203743934631348, + "learning_rate": 5.0430990771727e-07, + "loss": 0.8861, + "step": 21630 + }, + { + "epoch": 0.5210616314635967, + "grad_norm": 1.6246775388717651, + "learning_rate": 5.041831457255856e-07, + "loss": 0.8425, + "step": 21635 + }, + { + "epoch": 0.5211820524553841, + "grad_norm": 1.6451424360275269, + "learning_rate": 5.040563837339013e-07, + "loss": 0.9304, + "step": 21640 + }, + { + "epoch": 0.5213024734471713, + "grad_norm": 1.7852120399475098, + "learning_rate": 5.039296217422168e-07, + "loss": 0.8965, + "step": 21645 + }, + { + "epoch": 0.5214228944389586, + "grad_norm": 1.5709748268127441, + "learning_rate": 5.038028597505324e-07, + "loss": 0.9198, + "step": 21650 + }, + { + "epoch": 0.5215433154307458, + "grad_norm": 2.4693374633789062, + "learning_rate": 5.03676097758848e-07, + "loss": 0.8776, + "step": 21655 + }, + { + "epoch": 0.5216637364225332, + "grad_norm": 1.5472397804260254, + "learning_rate": 5.035493357671635e-07, + "loss": 0.8557, + "step": 21660 + }, + { + "epoch": 0.5217841574143205, + "grad_norm": 1.5521728992462158, + "learning_rate": 5.034225737754792e-07, + "loss": 0.9405, + "step": 21665 + }, + { + "epoch": 0.5219045784061077, + "grad_norm": 1.5593839883804321, + "learning_rate": 5.032958117837948e-07, + "loss": 0.8569, + "step": 21670 + }, + { + "epoch": 0.522024999397895, + "grad_norm": 1.4785000085830688, + "learning_rate": 5.031690497921102e-07, + "loss": 0.8832, + "step": 21675 + }, + { + "epoch": 0.5221454203896824, + "grad_norm": 1.4843261241912842, + "learning_rate": 5.030422878004259e-07, + "loss": 0.8979, + "step": 21680 + }, + { + "epoch": 0.5222658413814696, + "grad_norm": 1.536954641342163, + "learning_rate": 5.029155258087415e-07, + "loss": 0.9145, + "step": 21685 + }, + { + "epoch": 0.5223862623732569, + "grad_norm": 1.4595369100570679, + "learning_rate": 5.027887638170571e-07, + "loss": 0.8905, + "step": 21690 + }, + { + "epoch": 0.5225066833650442, + "grad_norm": 1.3696085214614868, + "learning_rate": 5.026620018253727e-07, + "loss": 0.9039, + "step": 21695 + }, + { + "epoch": 0.5226271043568315, + "grad_norm": 1.6402760744094849, + "learning_rate": 5.025352398336882e-07, + "loss": 0.8909, + "step": 21700 + }, + { + "epoch": 0.5227475253486188, + "grad_norm": 1.538196325302124, + "learning_rate": 5.024084778420038e-07, + "loss": 0.9148, + "step": 21705 + }, + { + "epoch": 0.522867946340406, + "grad_norm": 1.3932260274887085, + "learning_rate": 5.022817158503194e-07, + "loss": 0.8892, + "step": 21710 + }, + { + "epoch": 0.5229883673321933, + "grad_norm": 1.6184591054916382, + "learning_rate": 5.021549538586351e-07, + "loss": 0.8681, + "step": 21715 + }, + { + "epoch": 0.5231087883239807, + "grad_norm": 1.3606884479522705, + "learning_rate": 5.020281918669505e-07, + "loss": 0.8935, + "step": 21720 + }, + { + "epoch": 0.5232292093157679, + "grad_norm": 1.7561999559402466, + "learning_rate": 5.019014298752662e-07, + "loss": 0.8992, + "step": 21725 + }, + { + "epoch": 0.5233496303075552, + "grad_norm": 1.3053646087646484, + "learning_rate": 5.017746678835818e-07, + "loss": 0.9059, + "step": 21730 + }, + { + "epoch": 0.5234700512993425, + "grad_norm": 1.863433599472046, + "learning_rate": 5.016479058918973e-07, + "loss": 0.9344, + "step": 21735 + }, + { + "epoch": 0.5235904722911298, + "grad_norm": 1.3806565999984741, + "learning_rate": 5.01521143900213e-07, + "loss": 0.9454, + "step": 21740 + }, + { + "epoch": 0.5237108932829171, + "grad_norm": 2.0067007541656494, + "learning_rate": 5.013943819085285e-07, + "loss": 0.8896, + "step": 21745 + }, + { + "epoch": 0.5238313142747044, + "grad_norm": 1.4963665008544922, + "learning_rate": 5.012676199168441e-07, + "loss": 0.8821, + "step": 21750 + }, + { + "epoch": 0.5239517352664916, + "grad_norm": 1.5632950067520142, + "learning_rate": 5.011408579251597e-07, + "loss": 0.9355, + "step": 21755 + }, + { + "epoch": 0.524072156258279, + "grad_norm": 1.5998642444610596, + "learning_rate": 5.010140959334753e-07, + "loss": 0.9616, + "step": 21760 + }, + { + "epoch": 0.5241925772500662, + "grad_norm": 1.5908117294311523, + "learning_rate": 5.008873339417909e-07, + "loss": 0.8835, + "step": 21765 + }, + { + "epoch": 0.5243129982418535, + "grad_norm": 1.4726042747497559, + "learning_rate": 5.007605719501064e-07, + "loss": 0.8774, + "step": 21770 + }, + { + "epoch": 0.5244334192336408, + "grad_norm": 1.5189387798309326, + "learning_rate": 5.006338099584221e-07, + "loss": 0.9001, + "step": 21775 + }, + { + "epoch": 0.5245538402254281, + "grad_norm": 1.428139090538025, + "learning_rate": 5.005070479667376e-07, + "loss": 0.9265, + "step": 21780 + }, + { + "epoch": 0.5246742612172154, + "grad_norm": 1.4064525365829468, + "learning_rate": 5.003802859750533e-07, + "loss": 0.8685, + "step": 21785 + }, + { + "epoch": 0.5247946822090027, + "grad_norm": 1.557692289352417, + "learning_rate": 5.002535239833688e-07, + "loss": 0.9168, + "step": 21790 + }, + { + "epoch": 0.5249151032007899, + "grad_norm": 1.5677871704101562, + "learning_rate": 5.001267619916843e-07, + "loss": 0.9341, + "step": 21795 + }, + { + "epoch": 0.5250355241925773, + "grad_norm": 1.5455026626586914, + "learning_rate": 5e-07, + "loss": 0.8812, + "step": 21800 + }, + { + "epoch": 0.5251559451843646, + "grad_norm": 1.6961249113082886, + "learning_rate": 4.998732380083155e-07, + "loss": 0.9275, + "step": 21805 + }, + { + "epoch": 0.5252763661761518, + "grad_norm": 1.4484466314315796, + "learning_rate": 4.997464760166312e-07, + "loss": 0.8577, + "step": 21810 + }, + { + "epoch": 0.5253967871679391, + "grad_norm": 1.5019750595092773, + "learning_rate": 4.996197140249467e-07, + "loss": 0.9201, + "step": 21815 + }, + { + "epoch": 0.5255172081597264, + "grad_norm": 1.5143229961395264, + "learning_rate": 4.994929520332623e-07, + "loss": 0.9347, + "step": 21820 + }, + { + "epoch": 0.5256376291515137, + "grad_norm": 1.7485090494155884, + "learning_rate": 4.993661900415779e-07, + "loss": 0.9222, + "step": 21825 + }, + { + "epoch": 0.525758050143301, + "grad_norm": 1.5002058744430542, + "learning_rate": 4.992394280498935e-07, + "loss": 0.882, + "step": 21830 + }, + { + "epoch": 0.5258784711350882, + "grad_norm": 2.1091864109039307, + "learning_rate": 4.991126660582091e-07, + "loss": 0.9634, + "step": 21835 + }, + { + "epoch": 0.5259988921268756, + "grad_norm": 1.8817534446716309, + "learning_rate": 4.989859040665247e-07, + "loss": 0.9068, + "step": 21840 + }, + { + "epoch": 0.5261193131186629, + "grad_norm": 1.529942274093628, + "learning_rate": 4.988591420748403e-07, + "loss": 0.8842, + "step": 21845 + }, + { + "epoch": 0.5262397341104501, + "grad_norm": 1.7217942476272583, + "learning_rate": 4.987323800831558e-07, + "loss": 0.9049, + "step": 21850 + }, + { + "epoch": 0.5263601551022374, + "grad_norm": 1.5230528116226196, + "learning_rate": 4.986056180914714e-07, + "loss": 0.9461, + "step": 21855 + }, + { + "epoch": 0.5264805760940248, + "grad_norm": 1.4534242153167725, + "learning_rate": 4.98478856099787e-07, + "loss": 0.8935, + "step": 21860 + }, + { + "epoch": 0.526600997085812, + "grad_norm": 1.3972665071487427, + "learning_rate": 4.983520941081026e-07, + "loss": 0.8875, + "step": 21865 + }, + { + "epoch": 0.5267214180775993, + "grad_norm": 1.5731639862060547, + "learning_rate": 4.982253321164182e-07, + "loss": 0.8769, + "step": 21870 + }, + { + "epoch": 0.5268418390693865, + "grad_norm": 1.5789008140563965, + "learning_rate": 4.980985701247338e-07, + "loss": 0.9617, + "step": 21875 + }, + { + "epoch": 0.5269622600611739, + "grad_norm": 1.4749755859375, + "learning_rate": 4.979718081330493e-07, + "loss": 0.8825, + "step": 21880 + }, + { + "epoch": 0.5270826810529612, + "grad_norm": 2.0449745655059814, + "learning_rate": 4.97845046141365e-07, + "loss": 0.9582, + "step": 21885 + }, + { + "epoch": 0.5272031020447484, + "grad_norm": 1.6935410499572754, + "learning_rate": 4.977182841496805e-07, + "loss": 0.9129, + "step": 21890 + }, + { + "epoch": 0.5273235230365357, + "grad_norm": 1.5887279510498047, + "learning_rate": 4.975915221579961e-07, + "loss": 0.8921, + "step": 21895 + }, + { + "epoch": 0.5274439440283231, + "grad_norm": 1.8212227821350098, + "learning_rate": 4.974647601663117e-07, + "loss": 0.9361, + "step": 21900 + }, + { + "epoch": 0.5275643650201103, + "grad_norm": 1.4673494100570679, + "learning_rate": 4.973379981746272e-07, + "loss": 0.919, + "step": 21905 + }, + { + "epoch": 0.5276847860118976, + "grad_norm": 1.492056131362915, + "learning_rate": 4.972112361829429e-07, + "loss": 0.9292, + "step": 21910 + }, + { + "epoch": 0.5278052070036848, + "grad_norm": 1.6951390504837036, + "learning_rate": 4.970844741912585e-07, + "loss": 0.9186, + "step": 21915 + }, + { + "epoch": 0.5279256279954722, + "grad_norm": 1.5264896154403687, + "learning_rate": 4.969577121995741e-07, + "loss": 0.8546, + "step": 21920 + }, + { + "epoch": 0.5280460489872595, + "grad_norm": 1.3826159238815308, + "learning_rate": 4.968309502078896e-07, + "loss": 0.9198, + "step": 21925 + }, + { + "epoch": 0.5281664699790467, + "grad_norm": 1.53154718875885, + "learning_rate": 4.967041882162053e-07, + "loss": 0.8998, + "step": 21930 + }, + { + "epoch": 0.528286890970834, + "grad_norm": 1.5544826984405518, + "learning_rate": 4.965774262245208e-07, + "loss": 0.8952, + "step": 21935 + }, + { + "epoch": 0.5284073119626214, + "grad_norm": 1.7008835077285767, + "learning_rate": 4.964506642328363e-07, + "loss": 0.9436, + "step": 21940 + }, + { + "epoch": 0.5285277329544086, + "grad_norm": 1.7531218528747559, + "learning_rate": 4.96323902241152e-07, + "loss": 0.9292, + "step": 21945 + }, + { + "epoch": 0.5286481539461959, + "grad_norm": 1.6096489429473877, + "learning_rate": 4.961971402494675e-07, + "loss": 0.9339, + "step": 21950 + }, + { + "epoch": 0.5287685749379832, + "grad_norm": 1.3603724241256714, + "learning_rate": 4.960703782577832e-07, + "loss": 0.8547, + "step": 21955 + }, + { + "epoch": 0.5288889959297705, + "grad_norm": 1.8208463191986084, + "learning_rate": 4.959436162660988e-07, + "loss": 0.9059, + "step": 21960 + }, + { + "epoch": 0.5290094169215578, + "grad_norm": 1.6965110301971436, + "learning_rate": 4.958168542744143e-07, + "loss": 0.8995, + "step": 21965 + }, + { + "epoch": 0.529129837913345, + "grad_norm": 1.5166840553283691, + "learning_rate": 4.956900922827299e-07, + "loss": 0.8476, + "step": 21970 + }, + { + "epoch": 0.5292502589051323, + "grad_norm": 1.7423069477081299, + "learning_rate": 4.955633302910455e-07, + "loss": 0.9335, + "step": 21975 + }, + { + "epoch": 0.5293706798969197, + "grad_norm": 1.5373573303222656, + "learning_rate": 4.954365682993611e-07, + "loss": 0.9236, + "step": 21980 + }, + { + "epoch": 0.5294911008887069, + "grad_norm": 1.5119625329971313, + "learning_rate": 4.953098063076766e-07, + "loss": 0.8502, + "step": 21985 + }, + { + "epoch": 0.5296115218804942, + "grad_norm": 1.5202735662460327, + "learning_rate": 4.951830443159923e-07, + "loss": 0.8701, + "step": 21990 + }, + { + "epoch": 0.5297319428722815, + "grad_norm": 1.4449344873428345, + "learning_rate": 4.950562823243079e-07, + "loss": 0.9292, + "step": 21995 + }, + { + "epoch": 0.5298523638640688, + "grad_norm": 1.5788203477859497, + "learning_rate": 4.949295203326234e-07, + "loss": 0.889, + "step": 22000 + }, + { + "epoch": 0.5299727848558561, + "grad_norm": 1.403855562210083, + "learning_rate": 4.948027583409391e-07, + "loss": 0.8862, + "step": 22005 + }, + { + "epoch": 0.5300932058476434, + "grad_norm": 1.394984245300293, + "learning_rate": 4.946759963492546e-07, + "loss": 0.9096, + "step": 22010 + }, + { + "epoch": 0.5302136268394306, + "grad_norm": 1.433779239654541, + "learning_rate": 4.945492343575702e-07, + "loss": 0.8892, + "step": 22015 + }, + { + "epoch": 0.530334047831218, + "grad_norm": 1.5095500946044922, + "learning_rate": 4.944224723658858e-07, + "loss": 0.9021, + "step": 22020 + }, + { + "epoch": 0.5304544688230052, + "grad_norm": 1.5243232250213623, + "learning_rate": 4.942957103742013e-07, + "loss": 0.9025, + "step": 22025 + }, + { + "epoch": 0.5305748898147925, + "grad_norm": 1.395520806312561, + "learning_rate": 4.94168948382517e-07, + "loss": 0.914, + "step": 22030 + }, + { + "epoch": 0.5306953108065798, + "grad_norm": 1.6437811851501465, + "learning_rate": 4.940421863908325e-07, + "loss": 0.9334, + "step": 22035 + }, + { + "epoch": 0.530815731798367, + "grad_norm": 1.5893141031265259, + "learning_rate": 4.939154243991482e-07, + "loss": 0.8897, + "step": 22040 + }, + { + "epoch": 0.5309361527901544, + "grad_norm": 1.482459545135498, + "learning_rate": 4.937886624074637e-07, + "loss": 0.866, + "step": 22045 + }, + { + "epoch": 0.5310565737819417, + "grad_norm": 1.573954701423645, + "learning_rate": 4.936619004157794e-07, + "loss": 0.8911, + "step": 22050 + }, + { + "epoch": 0.5311769947737289, + "grad_norm": 1.6320579051971436, + "learning_rate": 4.935351384240949e-07, + "loss": 0.9018, + "step": 22055 + }, + { + "epoch": 0.5312974157655163, + "grad_norm": 1.6011643409729004, + "learning_rate": 4.934083764324105e-07, + "loss": 0.9071, + "step": 22060 + }, + { + "epoch": 0.5314178367573036, + "grad_norm": 2.10659122467041, + "learning_rate": 4.932816144407261e-07, + "loss": 0.9777, + "step": 22065 + }, + { + "epoch": 0.5315382577490908, + "grad_norm": 1.4913204908370972, + "learning_rate": 4.931548524490416e-07, + "loss": 0.9108, + "step": 22070 + }, + { + "epoch": 0.5316586787408781, + "grad_norm": 1.5963008403778076, + "learning_rate": 4.930280904573573e-07, + "loss": 0.8917, + "step": 22075 + }, + { + "epoch": 0.5317790997326654, + "grad_norm": 1.4318112134933472, + "learning_rate": 4.929013284656728e-07, + "loss": 0.8998, + "step": 22080 + }, + { + "epoch": 0.5318995207244527, + "grad_norm": 1.4206163883209229, + "learning_rate": 4.927745664739884e-07, + "loss": 0.917, + "step": 22085 + }, + { + "epoch": 0.53201994171624, + "grad_norm": 1.5320185422897339, + "learning_rate": 4.92647804482304e-07, + "loss": 0.9307, + "step": 22090 + }, + { + "epoch": 0.5321403627080272, + "grad_norm": 1.8943982124328613, + "learning_rate": 4.925210424906196e-07, + "loss": 0.8849, + "step": 22095 + }, + { + "epoch": 0.5322607836998146, + "grad_norm": 1.48078453540802, + "learning_rate": 4.923942804989352e-07, + "loss": 0.8307, + "step": 22100 + }, + { + "epoch": 0.5323812046916019, + "grad_norm": 1.756567120552063, + "learning_rate": 4.922675185072508e-07, + "loss": 0.8657, + "step": 22105 + }, + { + "epoch": 0.5325016256833891, + "grad_norm": 1.6934877634048462, + "learning_rate": 4.921407565155664e-07, + "loss": 0.9453, + "step": 22110 + }, + { + "epoch": 0.5326220466751764, + "grad_norm": 1.4306222200393677, + "learning_rate": 4.920139945238819e-07, + "loss": 0.9266, + "step": 22115 + }, + { + "epoch": 0.5327424676669638, + "grad_norm": 1.571732759475708, + "learning_rate": 4.918872325321975e-07, + "loss": 0.8887, + "step": 22120 + }, + { + "epoch": 0.532862888658751, + "grad_norm": 1.6166703701019287, + "learning_rate": 4.917604705405131e-07, + "loss": 0.9261, + "step": 22125 + }, + { + "epoch": 0.5329833096505383, + "grad_norm": 1.8382987976074219, + "learning_rate": 4.916337085488287e-07, + "loss": 0.8377, + "step": 22130 + }, + { + "epoch": 0.5331037306423255, + "grad_norm": 2.1283352375030518, + "learning_rate": 4.915069465571443e-07, + "loss": 0.9282, + "step": 22135 + }, + { + "epoch": 0.5332241516341129, + "grad_norm": 1.431143879890442, + "learning_rate": 4.913801845654599e-07, + "loss": 0.8822, + "step": 22140 + }, + { + "epoch": 0.5333445726259002, + "grad_norm": 1.5645533800125122, + "learning_rate": 4.912534225737754e-07, + "loss": 0.8755, + "step": 22145 + }, + { + "epoch": 0.5334649936176874, + "grad_norm": 1.641961693763733, + "learning_rate": 4.911266605820911e-07, + "loss": 0.9234, + "step": 22150 + }, + { + "epoch": 0.5335854146094747, + "grad_norm": 1.3797447681427002, + "learning_rate": 4.909998985904066e-07, + "loss": 0.964, + "step": 22155 + }, + { + "epoch": 0.5337058356012621, + "grad_norm": 1.3773528337478638, + "learning_rate": 4.908731365987222e-07, + "loss": 0.8885, + "step": 22160 + }, + { + "epoch": 0.5338262565930493, + "grad_norm": 1.5127993822097778, + "learning_rate": 4.907463746070378e-07, + "loss": 0.9015, + "step": 22165 + }, + { + "epoch": 0.5339466775848366, + "grad_norm": 1.6455553770065308, + "learning_rate": 4.906196126153533e-07, + "loss": 0.8727, + "step": 22170 + }, + { + "epoch": 0.5340670985766238, + "grad_norm": 1.4866843223571777, + "learning_rate": 4.90492850623669e-07, + "loss": 0.9068, + "step": 22175 + }, + { + "epoch": 0.5341875195684112, + "grad_norm": 1.6697514057159424, + "learning_rate": 4.903660886319846e-07, + "loss": 0.9244, + "step": 22180 + }, + { + "epoch": 0.5343079405601985, + "grad_norm": 1.9051423072814941, + "learning_rate": 4.902393266403002e-07, + "loss": 0.9537, + "step": 22185 + }, + { + "epoch": 0.5344283615519857, + "grad_norm": 1.4561595916748047, + "learning_rate": 4.901125646486157e-07, + "loss": 0.8634, + "step": 22190 + }, + { + "epoch": 0.534548782543773, + "grad_norm": 1.6183537244796753, + "learning_rate": 4.899858026569314e-07, + "loss": 0.8765, + "step": 22195 + }, + { + "epoch": 0.5346692035355604, + "grad_norm": 1.6155229806900024, + "learning_rate": 4.898590406652469e-07, + "loss": 0.9336, + "step": 22200 + }, + { + "epoch": 0.5347896245273476, + "grad_norm": 1.5258883237838745, + "learning_rate": 4.897322786735624e-07, + "loss": 0.9099, + "step": 22205 + }, + { + "epoch": 0.5349100455191349, + "grad_norm": 1.4723219871520996, + "learning_rate": 4.896055166818781e-07, + "loss": 0.8694, + "step": 22210 + }, + { + "epoch": 0.5350304665109222, + "grad_norm": 1.4346950054168701, + "learning_rate": 4.894787546901937e-07, + "loss": 0.914, + "step": 22215 + }, + { + "epoch": 0.5351508875027094, + "grad_norm": 1.5123918056488037, + "learning_rate": 4.893519926985093e-07, + "loss": 0.8903, + "step": 22220 + }, + { + "epoch": 0.5352713084944968, + "grad_norm": 1.8437974452972412, + "learning_rate": 4.892252307068249e-07, + "loss": 0.9109, + "step": 22225 + }, + { + "epoch": 0.535391729486284, + "grad_norm": 1.5314289331436157, + "learning_rate": 4.890984687151404e-07, + "loss": 0.8496, + "step": 22230 + }, + { + "epoch": 0.5355121504780713, + "grad_norm": 1.5800902843475342, + "learning_rate": 4.88971706723456e-07, + "loss": 0.9294, + "step": 22235 + }, + { + "epoch": 0.5356325714698587, + "grad_norm": 1.6109490394592285, + "learning_rate": 4.888449447317716e-07, + "loss": 0.8759, + "step": 22240 + }, + { + "epoch": 0.5357529924616459, + "grad_norm": 1.414548397064209, + "learning_rate": 4.887181827400872e-07, + "loss": 0.9122, + "step": 22245 + }, + { + "epoch": 0.5358734134534332, + "grad_norm": 1.4882311820983887, + "learning_rate": 4.885914207484027e-07, + "loss": 0.8801, + "step": 22250 + }, + { + "epoch": 0.5359938344452205, + "grad_norm": 1.5739588737487793, + "learning_rate": 4.884646587567184e-07, + "loss": 0.8628, + "step": 22255 + }, + { + "epoch": 0.5361142554370077, + "grad_norm": 1.5703134536743164, + "learning_rate": 4.88337896765034e-07, + "loss": 0.8883, + "step": 22260 + }, + { + "epoch": 0.5362346764287951, + "grad_norm": 1.537899374961853, + "learning_rate": 4.882111347733495e-07, + "loss": 0.938, + "step": 22265 + }, + { + "epoch": 0.5363550974205824, + "grad_norm": 1.603467345237732, + "learning_rate": 4.880843727816652e-07, + "loss": 0.907, + "step": 22270 + }, + { + "epoch": 0.5364755184123696, + "grad_norm": 1.553680419921875, + "learning_rate": 4.879576107899807e-07, + "loss": 0.9072, + "step": 22275 + }, + { + "epoch": 0.536595939404157, + "grad_norm": 1.4864248037338257, + "learning_rate": 4.878308487982963e-07, + "loss": 0.8833, + "step": 22280 + }, + { + "epoch": 0.5367163603959442, + "grad_norm": 1.5189486742019653, + "learning_rate": 4.877040868066119e-07, + "loss": 0.9643, + "step": 22285 + }, + { + "epoch": 0.5368367813877315, + "grad_norm": 1.6031887531280518, + "learning_rate": 4.875773248149274e-07, + "loss": 0.9135, + "step": 22290 + }, + { + "epoch": 0.5369572023795188, + "grad_norm": 1.531436562538147, + "learning_rate": 4.874505628232431e-07, + "loss": 0.9054, + "step": 22295 + }, + { + "epoch": 0.537077623371306, + "grad_norm": 1.7289807796478271, + "learning_rate": 4.873238008315586e-07, + "loss": 0.9326, + "step": 22300 + }, + { + "epoch": 0.5371980443630934, + "grad_norm": 1.4896320104599, + "learning_rate": 4.871970388398743e-07, + "loss": 0.8859, + "step": 22305 + }, + { + "epoch": 0.5373184653548807, + "grad_norm": 1.5204687118530273, + "learning_rate": 4.870702768481898e-07, + "loss": 0.8812, + "step": 22310 + }, + { + "epoch": 0.5374388863466679, + "grad_norm": 1.5427957773208618, + "learning_rate": 4.869435148565055e-07, + "loss": 0.9262, + "step": 22315 + }, + { + "epoch": 0.5375593073384553, + "grad_norm": 1.3800618648529053, + "learning_rate": 4.86816752864821e-07, + "loss": 0.8908, + "step": 22320 + }, + { + "epoch": 0.5376797283302426, + "grad_norm": 1.6934906244277954, + "learning_rate": 4.866899908731365e-07, + "loss": 0.9171, + "step": 22325 + }, + { + "epoch": 0.5378001493220298, + "grad_norm": 1.7136369943618774, + "learning_rate": 4.865632288814522e-07, + "loss": 0.9061, + "step": 22330 + }, + { + "epoch": 0.5379205703138171, + "grad_norm": 2.0289530754089355, + "learning_rate": 4.864364668897677e-07, + "loss": 0.8473, + "step": 22335 + }, + { + "epoch": 0.5380409913056043, + "grad_norm": 1.7243914604187012, + "learning_rate": 4.863097048980834e-07, + "loss": 0.9861, + "step": 22340 + }, + { + "epoch": 0.5381614122973917, + "grad_norm": 1.466810941696167, + "learning_rate": 4.861829429063989e-07, + "loss": 0.9319, + "step": 22345 + }, + { + "epoch": 0.538281833289179, + "grad_norm": 1.6713786125183105, + "learning_rate": 4.860561809147145e-07, + "loss": 0.9378, + "step": 22350 + }, + { + "epoch": 0.5384022542809662, + "grad_norm": 1.7098300457000732, + "learning_rate": 4.859294189230301e-07, + "loss": 0.9032, + "step": 22355 + }, + { + "epoch": 0.5385226752727535, + "grad_norm": 1.8780438899993896, + "learning_rate": 4.858026569313457e-07, + "loss": 0.8889, + "step": 22360 + }, + { + "epoch": 0.5386430962645409, + "grad_norm": 1.4092459678649902, + "learning_rate": 4.856758949396613e-07, + "loss": 0.9362, + "step": 22365 + }, + { + "epoch": 0.5387635172563281, + "grad_norm": 1.4840173721313477, + "learning_rate": 4.855491329479768e-07, + "loss": 0.9139, + "step": 22370 + }, + { + "epoch": 0.5388839382481154, + "grad_norm": 1.5067224502563477, + "learning_rate": 4.854223709562925e-07, + "loss": 0.9317, + "step": 22375 + }, + { + "epoch": 0.5390043592399028, + "grad_norm": 1.5796935558319092, + "learning_rate": 4.85295608964608e-07, + "loss": 0.8784, + "step": 22380 + }, + { + "epoch": 0.53912478023169, + "grad_norm": 2.1127724647521973, + "learning_rate": 4.851688469729236e-07, + "loss": 0.852, + "step": 22385 + }, + { + "epoch": 0.5392452012234773, + "grad_norm": 1.3739427328109741, + "learning_rate": 4.850420849812392e-07, + "loss": 0.9154, + "step": 22390 + }, + { + "epoch": 0.5393656222152645, + "grad_norm": 1.535537600517273, + "learning_rate": 4.849153229895548e-07, + "loss": 0.9482, + "step": 22395 + }, + { + "epoch": 0.5394860432070518, + "grad_norm": 1.9030694961547852, + "learning_rate": 4.847885609978704e-07, + "loss": 0.9118, + "step": 22400 + }, + { + "epoch": 0.5396064641988392, + "grad_norm": 1.6682997941970825, + "learning_rate": 4.84661799006186e-07, + "loss": 0.909, + "step": 22405 + }, + { + "epoch": 0.5397268851906264, + "grad_norm": 1.4649995565414429, + "learning_rate": 4.845350370145015e-07, + "loss": 0.9428, + "step": 22410 + }, + { + "epoch": 0.5398473061824137, + "grad_norm": 1.4420379400253296, + "learning_rate": 4.844082750228171e-07, + "loss": 0.892, + "step": 22415 + }, + { + "epoch": 0.539967727174201, + "grad_norm": 1.4790561199188232, + "learning_rate": 4.842815130311327e-07, + "loss": 0.9114, + "step": 22420 + }, + { + "epoch": 0.5400881481659883, + "grad_norm": 1.670272946357727, + "learning_rate": 4.841547510394483e-07, + "loss": 0.92, + "step": 22425 + }, + { + "epoch": 0.5402085691577756, + "grad_norm": 1.5772608518600464, + "learning_rate": 4.840279890477639e-07, + "loss": 0.9178, + "step": 22430 + }, + { + "epoch": 0.5403289901495629, + "grad_norm": 1.4250884056091309, + "learning_rate": 4.839012270560794e-07, + "loss": 0.9318, + "step": 22435 + }, + { + "epoch": 0.5404494111413501, + "grad_norm": 1.5450215339660645, + "learning_rate": 4.837744650643951e-07, + "loss": 0.904, + "step": 22440 + }, + { + "epoch": 0.5405698321331375, + "grad_norm": 1.6371772289276123, + "learning_rate": 4.836477030727106e-07, + "loss": 0.9654, + "step": 22445 + }, + { + "epoch": 0.5406902531249247, + "grad_norm": 1.6197057962417603, + "learning_rate": 4.835209410810263e-07, + "loss": 0.8841, + "step": 22450 + }, + { + "epoch": 0.540810674116712, + "grad_norm": 1.5857040882110596, + "learning_rate": 4.833941790893418e-07, + "loss": 0.8552, + "step": 22455 + }, + { + "epoch": 0.5409310951084993, + "grad_norm": 1.353580355644226, + "learning_rate": 4.832674170976574e-07, + "loss": 0.9054, + "step": 22460 + }, + { + "epoch": 0.5410515161002866, + "grad_norm": 1.7279608249664307, + "learning_rate": 4.83140655105973e-07, + "loss": 0.9183, + "step": 22465 + }, + { + "epoch": 0.5411719370920739, + "grad_norm": 1.3793567419052124, + "learning_rate": 4.830138931142885e-07, + "loss": 0.931, + "step": 22470 + }, + { + "epoch": 0.5412923580838612, + "grad_norm": 1.5536220073699951, + "learning_rate": 4.828871311226042e-07, + "loss": 0.9399, + "step": 22475 + }, + { + "epoch": 0.5414127790756484, + "grad_norm": 1.6547752618789673, + "learning_rate": 4.827603691309198e-07, + "loss": 0.9144, + "step": 22480 + }, + { + "epoch": 0.5415332000674358, + "grad_norm": 1.457759976387024, + "learning_rate": 4.826336071392354e-07, + "loss": 0.9018, + "step": 22485 + }, + { + "epoch": 0.541653621059223, + "grad_norm": 1.404378056526184, + "learning_rate": 4.825068451475509e-07, + "loss": 0.8929, + "step": 22490 + }, + { + "epoch": 0.5417740420510103, + "grad_norm": 1.51789391040802, + "learning_rate": 4.823800831558665e-07, + "loss": 0.8866, + "step": 22495 + }, + { + "epoch": 0.5418944630427976, + "grad_norm": 1.4357305765151978, + "learning_rate": 4.822533211641821e-07, + "loss": 0.9547, + "step": 22500 + }, + { + "epoch": 0.5420148840345849, + "grad_norm": 1.4671056270599365, + "learning_rate": 4.821265591724976e-07, + "loss": 0.8463, + "step": 22505 + }, + { + "epoch": 0.5421353050263722, + "grad_norm": 1.8944841623306274, + "learning_rate": 4.819997971808133e-07, + "loss": 0.9059, + "step": 22510 + }, + { + "epoch": 0.5422557260181595, + "grad_norm": 1.6028144359588623, + "learning_rate": 4.818730351891288e-07, + "loss": 0.8956, + "step": 22515 + }, + { + "epoch": 0.5423761470099467, + "grad_norm": 1.5150758028030396, + "learning_rate": 4.817462731974445e-07, + "loss": 0.9154, + "step": 22520 + }, + { + "epoch": 0.5424965680017341, + "grad_norm": 1.5906963348388672, + "learning_rate": 4.816195112057601e-07, + "loss": 0.883, + "step": 22525 + }, + { + "epoch": 0.5426169889935214, + "grad_norm": 1.5873745679855347, + "learning_rate": 4.814927492140756e-07, + "loss": 0.9274, + "step": 22530 + }, + { + "epoch": 0.5427374099853086, + "grad_norm": 1.822314739227295, + "learning_rate": 4.813659872223912e-07, + "loss": 0.9055, + "step": 22535 + }, + { + "epoch": 0.542857830977096, + "grad_norm": 1.9757715463638306, + "learning_rate": 4.812392252307068e-07, + "loss": 0.9248, + "step": 22540 + }, + { + "epoch": 0.5429782519688832, + "grad_norm": 1.5905802249908447, + "learning_rate": 4.811124632390224e-07, + "loss": 0.861, + "step": 22545 + }, + { + "epoch": 0.5430986729606705, + "grad_norm": 1.6692482233047485, + "learning_rate": 4.809857012473379e-07, + "loss": 0.9657, + "step": 22550 + }, + { + "epoch": 0.5432190939524578, + "grad_norm": 1.6076393127441406, + "learning_rate": 4.808589392556535e-07, + "loss": 0.957, + "step": 22555 + }, + { + "epoch": 0.543339514944245, + "grad_norm": 1.3963638544082642, + "learning_rate": 4.807321772639692e-07, + "loss": 0.8874, + "step": 22560 + }, + { + "epoch": 0.5434599359360324, + "grad_norm": 1.4064629077911377, + "learning_rate": 4.806054152722847e-07, + "loss": 0.8713, + "step": 22565 + }, + { + "epoch": 0.5435803569278197, + "grad_norm": 1.4674707651138306, + "learning_rate": 4.804786532806004e-07, + "loss": 0.92, + "step": 22570 + }, + { + "epoch": 0.5437007779196069, + "grad_norm": 1.6980865001678467, + "learning_rate": 4.803518912889159e-07, + "loss": 0.8332, + "step": 22575 + }, + { + "epoch": 0.5438211989113942, + "grad_norm": 1.4662402868270874, + "learning_rate": 4.802251292972316e-07, + "loss": 0.8667, + "step": 22580 + }, + { + "epoch": 0.5439416199031816, + "grad_norm": 1.815271019935608, + "learning_rate": 4.800983673055471e-07, + "loss": 0.9332, + "step": 22585 + }, + { + "epoch": 0.5440620408949688, + "grad_norm": 1.5627825260162354, + "learning_rate": 4.799716053138626e-07, + "loss": 0.8746, + "step": 22590 + }, + { + "epoch": 0.5441824618867561, + "grad_norm": 1.4710911512374878, + "learning_rate": 4.798448433221783e-07, + "loss": 0.9136, + "step": 22595 + }, + { + "epoch": 0.5443028828785433, + "grad_norm": 1.5587544441223145, + "learning_rate": 4.797180813304938e-07, + "loss": 0.9304, + "step": 22600 + }, + { + "epoch": 0.5444233038703307, + "grad_norm": 1.7592480182647705, + "learning_rate": 4.795913193388095e-07, + "loss": 0.8881, + "step": 22605 + }, + { + "epoch": 0.544543724862118, + "grad_norm": 1.6748290061950684, + "learning_rate": 4.79464557347125e-07, + "loss": 0.9363, + "step": 22610 + }, + { + "epoch": 0.5446641458539052, + "grad_norm": 1.4647150039672852, + "learning_rate": 4.793377953554406e-07, + "loss": 0.8691, + "step": 22615 + }, + { + "epoch": 0.5447845668456925, + "grad_norm": 1.3892542123794556, + "learning_rate": 4.792110333637562e-07, + "loss": 0.8757, + "step": 22620 + }, + { + "epoch": 0.5449049878374799, + "grad_norm": 1.6088064908981323, + "learning_rate": 4.790842713720718e-07, + "loss": 0.8947, + "step": 22625 + }, + { + "epoch": 0.5450254088292671, + "grad_norm": 1.6441105604171753, + "learning_rate": 4.789575093803874e-07, + "loss": 0.9049, + "step": 22630 + }, + { + "epoch": 0.5451458298210544, + "grad_norm": 1.6489001512527466, + "learning_rate": 4.788307473887029e-07, + "loss": 0.8746, + "step": 22635 + }, + { + "epoch": 0.5452662508128417, + "grad_norm": 1.6179732084274292, + "learning_rate": 4.787039853970186e-07, + "loss": 0.931, + "step": 22640 + }, + { + "epoch": 0.545386671804629, + "grad_norm": 1.3718429803848267, + "learning_rate": 4.785772234053341e-07, + "loss": 0.8971, + "step": 22645 + }, + { + "epoch": 0.5455070927964163, + "grad_norm": 1.6162413358688354, + "learning_rate": 4.784504614136497e-07, + "loss": 0.9371, + "step": 22650 + }, + { + "epoch": 0.5456275137882035, + "grad_norm": 1.4348382949829102, + "learning_rate": 4.783236994219653e-07, + "loss": 0.9229, + "step": 22655 + }, + { + "epoch": 0.5457479347799908, + "grad_norm": 1.6404297351837158, + "learning_rate": 4.781969374302809e-07, + "loss": 0.8652, + "step": 22660 + }, + { + "epoch": 0.5458683557717782, + "grad_norm": 1.6286433935165405, + "learning_rate": 4.780701754385965e-07, + "loss": 0.8742, + "step": 22665 + }, + { + "epoch": 0.5459887767635654, + "grad_norm": 1.5594221353530884, + "learning_rate": 4.779434134469121e-07, + "loss": 0.9247, + "step": 22670 + }, + { + "epoch": 0.5461091977553527, + "grad_norm": 1.4674808979034424, + "learning_rate": 4.778166514552276e-07, + "loss": 0.8623, + "step": 22675 + }, + { + "epoch": 0.54622961874714, + "grad_norm": 1.5083880424499512, + "learning_rate": 4.776898894635432e-07, + "loss": 0.9244, + "step": 22680 + }, + { + "epoch": 0.5463500397389273, + "grad_norm": 1.5525587797164917, + "learning_rate": 4.775631274718588e-07, + "loss": 0.9286, + "step": 22685 + }, + { + "epoch": 0.5464704607307146, + "grad_norm": 1.4932695627212524, + "learning_rate": 4.774363654801744e-07, + "loss": 0.9448, + "step": 22690 + }, + { + "epoch": 0.5465908817225019, + "grad_norm": 1.7255370616912842, + "learning_rate": 4.7730960348849e-07, + "loss": 0.9156, + "step": 22695 + }, + { + "epoch": 0.5467113027142891, + "grad_norm": 1.6489670276641846, + "learning_rate": 4.771828414968055e-07, + "loss": 0.8368, + "step": 22700 + }, + { + "epoch": 0.5468317237060765, + "grad_norm": 1.7103477716445923, + "learning_rate": 4.770560795051212e-07, + "loss": 0.9328, + "step": 22705 + }, + { + "epoch": 0.5469521446978637, + "grad_norm": 1.6121007204055786, + "learning_rate": 4.769293175134367e-07, + "loss": 0.877, + "step": 22710 + }, + { + "epoch": 0.547072565689651, + "grad_norm": 1.7308216094970703, + "learning_rate": 4.768025555217524e-07, + "loss": 0.9461, + "step": 22715 + }, + { + "epoch": 0.5471929866814383, + "grad_norm": 1.5269876718521118, + "learning_rate": 4.766757935300679e-07, + "loss": 0.8507, + "step": 22720 + }, + { + "epoch": 0.5473134076732256, + "grad_norm": 1.4367316961288452, + "learning_rate": 4.765490315383835e-07, + "loss": 0.9051, + "step": 22725 + }, + { + "epoch": 0.5474338286650129, + "grad_norm": 1.6838974952697754, + "learning_rate": 4.764222695466991e-07, + "loss": 0.9335, + "step": 22730 + }, + { + "epoch": 0.5475542496568002, + "grad_norm": 1.4725620746612549, + "learning_rate": 4.762955075550147e-07, + "loss": 0.9563, + "step": 22735 + }, + { + "epoch": 0.5476746706485874, + "grad_norm": 1.7399842739105225, + "learning_rate": 4.7616874556333023e-07, + "loss": 0.918, + "step": 22740 + }, + { + "epoch": 0.5477950916403748, + "grad_norm": 1.464184045791626, + "learning_rate": 4.7604198357164587e-07, + "loss": 0.9001, + "step": 22745 + }, + { + "epoch": 0.5479155126321621, + "grad_norm": 1.5431313514709473, + "learning_rate": 4.7591522157996147e-07, + "loss": 0.9322, + "step": 22750 + }, + { + "epoch": 0.5480359336239493, + "grad_norm": 1.7318702936172485, + "learning_rate": 4.75788459588277e-07, + "loss": 0.9075, + "step": 22755 + }, + { + "epoch": 0.5481563546157366, + "grad_norm": 1.4880088567733765, + "learning_rate": 4.7566169759659265e-07, + "loss": 0.8943, + "step": 22760 + }, + { + "epoch": 0.5482767756075239, + "grad_norm": 1.956777811050415, + "learning_rate": 4.755349356049082e-07, + "loss": 0.8973, + "step": 22765 + }, + { + "epoch": 0.5483971965993112, + "grad_norm": 1.4007956981658936, + "learning_rate": 4.754081736132238e-07, + "loss": 0.9259, + "step": 22770 + }, + { + "epoch": 0.5485176175910985, + "grad_norm": 1.6305876970291138, + "learning_rate": 4.7528141162153937e-07, + "loss": 0.9025, + "step": 22775 + }, + { + "epoch": 0.5486380385828857, + "grad_norm": 1.549984097480774, + "learning_rate": 4.7515464962985496e-07, + "loss": 0.9122, + "step": 22780 + }, + { + "epoch": 0.5487584595746731, + "grad_norm": 1.472762107849121, + "learning_rate": 4.7502788763817055e-07, + "loss": 0.8993, + "step": 22785 + }, + { + "epoch": 0.5488788805664604, + "grad_norm": 1.4677027463912964, + "learning_rate": 4.7490112564648615e-07, + "loss": 0.9269, + "step": 22790 + }, + { + "epoch": 0.5489993015582476, + "grad_norm": 1.7056957483291626, + "learning_rate": 4.7477436365480174e-07, + "loss": 0.8521, + "step": 22795 + }, + { + "epoch": 0.5491197225500349, + "grad_norm": 1.5326414108276367, + "learning_rate": 4.746476016631173e-07, + "loss": 0.9362, + "step": 22800 + }, + { + "epoch": 0.5492401435418222, + "grad_norm": 1.6755796670913696, + "learning_rate": 4.745208396714329e-07, + "loss": 0.9232, + "step": 22805 + }, + { + "epoch": 0.5493605645336095, + "grad_norm": 1.5694706439971924, + "learning_rate": 4.7439407767974846e-07, + "loss": 0.9571, + "step": 22810 + }, + { + "epoch": 0.5494809855253968, + "grad_norm": 1.4975767135620117, + "learning_rate": 4.7426731568806405e-07, + "loss": 0.8897, + "step": 22815 + }, + { + "epoch": 0.549601406517184, + "grad_norm": 1.4253288507461548, + "learning_rate": 4.741405536963797e-07, + "loss": 0.9265, + "step": 22820 + }, + { + "epoch": 0.5497218275089714, + "grad_norm": 1.4730470180511475, + "learning_rate": 4.7401379170469524e-07, + "loss": 0.9216, + "step": 22825 + }, + { + "epoch": 0.5498422485007587, + "grad_norm": 1.5416759252548218, + "learning_rate": 4.7388702971301083e-07, + "loss": 0.934, + "step": 22830 + }, + { + "epoch": 0.5499626694925459, + "grad_norm": 1.5281325578689575, + "learning_rate": 4.737602677213264e-07, + "loss": 0.877, + "step": 22835 + }, + { + "epoch": 0.5500830904843332, + "grad_norm": 1.6388013362884521, + "learning_rate": 4.73633505729642e-07, + "loss": 0.9129, + "step": 22840 + }, + { + "epoch": 0.5502035114761206, + "grad_norm": 1.5889734029769897, + "learning_rate": 4.7350674373795755e-07, + "loss": 0.9524, + "step": 22845 + }, + { + "epoch": 0.5503239324679078, + "grad_norm": 1.3709381818771362, + "learning_rate": 4.733799817462732e-07, + "loss": 0.8604, + "step": 22850 + }, + { + "epoch": 0.5504443534596951, + "grad_norm": 1.5920021533966064, + "learning_rate": 4.732532197545888e-07, + "loss": 0.9046, + "step": 22855 + }, + { + "epoch": 0.5505647744514823, + "grad_norm": 1.5857954025268555, + "learning_rate": 4.731264577629043e-07, + "loss": 0.9432, + "step": 22860 + }, + { + "epoch": 0.5506851954432697, + "grad_norm": 1.6190216541290283, + "learning_rate": 4.7299969577121997e-07, + "loss": 0.9056, + "step": 22865 + }, + { + "epoch": 0.550805616435057, + "grad_norm": 1.6539288759231567, + "learning_rate": 4.728729337795355e-07, + "loss": 0.8823, + "step": 22870 + }, + { + "epoch": 0.5509260374268442, + "grad_norm": 1.4087973833084106, + "learning_rate": 4.727461717878511e-07, + "loss": 0.8828, + "step": 22875 + }, + { + "epoch": 0.5510464584186315, + "grad_norm": 1.6573415994644165, + "learning_rate": 4.726194097961667e-07, + "loss": 0.8531, + "step": 22880 + }, + { + "epoch": 0.5511668794104189, + "grad_norm": 1.5957608222961426, + "learning_rate": 4.724926478044823e-07, + "loss": 0.9273, + "step": 22885 + }, + { + "epoch": 0.5512873004022061, + "grad_norm": 1.4989583492279053, + "learning_rate": 4.723658858127978e-07, + "loss": 0.9151, + "step": 22890 + }, + { + "epoch": 0.5514077213939934, + "grad_norm": 1.7770851850509644, + "learning_rate": 4.7223912382111347e-07, + "loss": 0.9288, + "step": 22895 + }, + { + "epoch": 0.5515281423857807, + "grad_norm": 1.4110249280929565, + "learning_rate": 4.7211236182942906e-07, + "loss": 0.9065, + "step": 22900 + }, + { + "epoch": 0.551648563377568, + "grad_norm": 1.5001226663589478, + "learning_rate": 4.719855998377446e-07, + "loss": 0.917, + "step": 22905 + }, + { + "epoch": 0.5517689843693553, + "grad_norm": 1.3509827852249146, + "learning_rate": 4.7185883784606024e-07, + "loss": 0.9521, + "step": 22910 + }, + { + "epoch": 0.5518894053611425, + "grad_norm": 1.6468544006347656, + "learning_rate": 4.717320758543758e-07, + "loss": 0.8757, + "step": 22915 + }, + { + "epoch": 0.5520098263529298, + "grad_norm": 1.6438676118850708, + "learning_rate": 4.716053138626914e-07, + "loss": 0.8843, + "step": 22920 + }, + { + "epoch": 0.5521302473447172, + "grad_norm": 1.6613961458206177, + "learning_rate": 4.7147855187100697e-07, + "loss": 0.946, + "step": 22925 + }, + { + "epoch": 0.5522506683365044, + "grad_norm": 1.5499963760375977, + "learning_rate": 4.7135178987932256e-07, + "loss": 0.89, + "step": 22930 + }, + { + "epoch": 0.5523710893282917, + "grad_norm": 1.6481000185012817, + "learning_rate": 4.7122502788763815e-07, + "loss": 0.926, + "step": 22935 + }, + { + "epoch": 0.552491510320079, + "grad_norm": 1.503867506980896, + "learning_rate": 4.7109826589595374e-07, + "loss": 0.9033, + "step": 22940 + }, + { + "epoch": 0.5526119313118663, + "grad_norm": 1.8247016668319702, + "learning_rate": 4.7097150390426933e-07, + "loss": 0.8992, + "step": 22945 + }, + { + "epoch": 0.5527323523036536, + "grad_norm": 1.4254915714263916, + "learning_rate": 4.7084474191258487e-07, + "loss": 0.8789, + "step": 22950 + }, + { + "epoch": 0.5528527732954409, + "grad_norm": 1.5164680480957031, + "learning_rate": 4.707179799209005e-07, + "loss": 0.9925, + "step": 22955 + }, + { + "epoch": 0.5529731942872281, + "grad_norm": 1.5180974006652832, + "learning_rate": 4.7059121792921606e-07, + "loss": 0.9097, + "step": 22960 + }, + { + "epoch": 0.5530936152790155, + "grad_norm": 1.6972721815109253, + "learning_rate": 4.7046445593753165e-07, + "loss": 0.855, + "step": 22965 + }, + { + "epoch": 0.5532140362708027, + "grad_norm": 1.5323702096939087, + "learning_rate": 4.703376939458473e-07, + "loss": 0.931, + "step": 22970 + }, + { + "epoch": 0.55333445726259, + "grad_norm": 1.7028616666793823, + "learning_rate": 4.7021093195416283e-07, + "loss": 0.8891, + "step": 22975 + }, + { + "epoch": 0.5534548782543773, + "grad_norm": 1.4747989177703857, + "learning_rate": 4.700841699624784e-07, + "loss": 0.9096, + "step": 22980 + }, + { + "epoch": 0.5535752992461646, + "grad_norm": 1.4280892610549927, + "learning_rate": 4.69957407970794e-07, + "loss": 0.8631, + "step": 22985 + }, + { + "epoch": 0.5536957202379519, + "grad_norm": 1.6729850769042969, + "learning_rate": 4.698306459791096e-07, + "loss": 0.9157, + "step": 22990 + }, + { + "epoch": 0.5538161412297392, + "grad_norm": 1.6169942617416382, + "learning_rate": 4.6970388398742515e-07, + "loss": 0.8625, + "step": 22995 + }, + { + "epoch": 0.5539365622215264, + "grad_norm": 1.4130114316940308, + "learning_rate": 4.695771219957408e-07, + "loss": 0.9, + "step": 23000 + }, + { + "epoch": 0.5540569832133138, + "grad_norm": 1.5743272304534912, + "learning_rate": 4.694503600040564e-07, + "loss": 0.8981, + "step": 23005 + }, + { + "epoch": 0.5541774042051011, + "grad_norm": 1.4807997941970825, + "learning_rate": 4.693235980123719e-07, + "loss": 0.874, + "step": 23010 + }, + { + "epoch": 0.5542978251968883, + "grad_norm": 1.649823784828186, + "learning_rate": 4.6919683602068757e-07, + "loss": 0.8946, + "step": 23015 + }, + { + "epoch": 0.5544182461886756, + "grad_norm": 1.410031795501709, + "learning_rate": 4.690700740290031e-07, + "loss": 0.9026, + "step": 23020 + }, + { + "epoch": 0.5545386671804629, + "grad_norm": 1.7367534637451172, + "learning_rate": 4.689433120373187e-07, + "loss": 0.9166, + "step": 23025 + }, + { + "epoch": 0.5546590881722502, + "grad_norm": 1.5176085233688354, + "learning_rate": 4.688165500456343e-07, + "loss": 0.9129, + "step": 23030 + }, + { + "epoch": 0.5547795091640375, + "grad_norm": 1.437880039215088, + "learning_rate": 4.686897880539499e-07, + "loss": 0.9501, + "step": 23035 + }, + { + "epoch": 0.5548999301558247, + "grad_norm": 1.7201894521713257, + "learning_rate": 4.685630260622654e-07, + "loss": 0.8532, + "step": 23040 + }, + { + "epoch": 0.5550203511476121, + "grad_norm": 1.4754242897033691, + "learning_rate": 4.6843626407058106e-07, + "loss": 0.8962, + "step": 23045 + }, + { + "epoch": 0.5551407721393994, + "grad_norm": 1.5195088386535645, + "learning_rate": 4.6830950207889665e-07, + "loss": 0.8434, + "step": 23050 + }, + { + "epoch": 0.5552611931311866, + "grad_norm": 1.5234872102737427, + "learning_rate": 4.681827400872122e-07, + "loss": 0.9074, + "step": 23055 + }, + { + "epoch": 0.5553816141229739, + "grad_norm": 1.3597567081451416, + "learning_rate": 4.6805597809552784e-07, + "loss": 0.8834, + "step": 23060 + }, + { + "epoch": 0.5555020351147612, + "grad_norm": 1.5739555358886719, + "learning_rate": 4.679292161038434e-07, + "loss": 0.8921, + "step": 23065 + }, + { + "epoch": 0.5556224561065485, + "grad_norm": 1.5108563899993896, + "learning_rate": 4.6780245411215897e-07, + "loss": 0.9429, + "step": 23070 + }, + { + "epoch": 0.5557428770983358, + "grad_norm": 1.4870704412460327, + "learning_rate": 4.6767569212047456e-07, + "loss": 0.8813, + "step": 23075 + }, + { + "epoch": 0.555863298090123, + "grad_norm": 1.5405627489089966, + "learning_rate": 4.6754893012879015e-07, + "loss": 0.9435, + "step": 23080 + }, + { + "epoch": 0.5559837190819104, + "grad_norm": 1.615705966949463, + "learning_rate": 4.674221681371058e-07, + "loss": 0.9322, + "step": 23085 + }, + { + "epoch": 0.5561041400736977, + "grad_norm": 1.6554853916168213, + "learning_rate": 4.6729540614542134e-07, + "loss": 0.8981, + "step": 23090 + }, + { + "epoch": 0.5562245610654849, + "grad_norm": 1.5215091705322266, + "learning_rate": 4.6716864415373693e-07, + "loss": 0.8765, + "step": 23095 + }, + { + "epoch": 0.5563449820572722, + "grad_norm": 2.1298627853393555, + "learning_rate": 4.670418821620525e-07, + "loss": 0.877, + "step": 23100 + }, + { + "epoch": 0.5564654030490596, + "grad_norm": 1.5979092121124268, + "learning_rate": 4.669151201703681e-07, + "loss": 0.9118, + "step": 23105 + }, + { + "epoch": 0.5565858240408468, + "grad_norm": 1.449164867401123, + "learning_rate": 4.6678835817868365e-07, + "loss": 0.8959, + "step": 23110 + }, + { + "epoch": 0.5567062450326341, + "grad_norm": 1.5024070739746094, + "learning_rate": 4.666615961869993e-07, + "loss": 0.9059, + "step": 23115 + }, + { + "epoch": 0.5568266660244213, + "grad_norm": 1.5619574785232544, + "learning_rate": 4.665348341953149e-07, + "loss": 0.9053, + "step": 23120 + }, + { + "epoch": 0.5569470870162087, + "grad_norm": 1.5824140310287476, + "learning_rate": 4.664080722036304e-07, + "loss": 0.8958, + "step": 23125 + }, + { + "epoch": 0.557067508007996, + "grad_norm": 1.613577127456665, + "learning_rate": 4.6628131021194607e-07, + "loss": 0.8602, + "step": 23130 + }, + { + "epoch": 0.5571879289997832, + "grad_norm": 1.4480857849121094, + "learning_rate": 4.661545482202616e-07, + "loss": 0.8349, + "step": 23135 + }, + { + "epoch": 0.5573083499915705, + "grad_norm": 1.4946420192718506, + "learning_rate": 4.660277862285772e-07, + "loss": 0.8484, + "step": 23140 + }, + { + "epoch": 0.5574287709833579, + "grad_norm": 1.4564869403839111, + "learning_rate": 4.659010242368928e-07, + "loss": 0.8754, + "step": 23145 + }, + { + "epoch": 0.5575491919751451, + "grad_norm": 1.4581509828567505, + "learning_rate": 4.657742622452084e-07, + "loss": 0.8587, + "step": 23150 + }, + { + "epoch": 0.5576696129669324, + "grad_norm": 1.5620570182800293, + "learning_rate": 4.65647500253524e-07, + "loss": 0.9003, + "step": 23155 + }, + { + "epoch": 0.5577900339587197, + "grad_norm": 1.7160062789916992, + "learning_rate": 4.6552073826183957e-07, + "loss": 0.9365, + "step": 23160 + }, + { + "epoch": 0.557910454950507, + "grad_norm": 1.7763422727584839, + "learning_rate": 4.6539397627015516e-07, + "loss": 0.9056, + "step": 23165 + }, + { + "epoch": 0.5580308759422943, + "grad_norm": 1.4742810726165771, + "learning_rate": 4.652672142784707e-07, + "loss": 0.915, + "step": 23170 + }, + { + "epoch": 0.5581512969340815, + "grad_norm": 1.7074137926101685, + "learning_rate": 4.6514045228678634e-07, + "loss": 0.9176, + "step": 23175 + }, + { + "epoch": 0.5582717179258688, + "grad_norm": 1.6095412969589233, + "learning_rate": 4.650136902951019e-07, + "loss": 0.9024, + "step": 23180 + }, + { + "epoch": 0.5583921389176562, + "grad_norm": 1.5504175424575806, + "learning_rate": 4.648869283034175e-07, + "loss": 0.848, + "step": 23185 + }, + { + "epoch": 0.5585125599094434, + "grad_norm": 1.5489128828048706, + "learning_rate": 4.647601663117331e-07, + "loss": 0.928, + "step": 23190 + }, + { + "epoch": 0.5586329809012307, + "grad_norm": 1.507321834564209, + "learning_rate": 4.6463340432004866e-07, + "loss": 0.8686, + "step": 23195 + }, + { + "epoch": 0.558753401893018, + "grad_norm": 1.7442164421081543, + "learning_rate": 4.6450664232836425e-07, + "loss": 0.9243, + "step": 23200 + }, + { + "epoch": 0.5588738228848052, + "grad_norm": 1.522963047027588, + "learning_rate": 4.6437988033667984e-07, + "loss": 0.9081, + "step": 23205 + }, + { + "epoch": 0.5589942438765926, + "grad_norm": 1.4842394590377808, + "learning_rate": 4.6425311834499543e-07, + "loss": 0.9395, + "step": 23210 + }, + { + "epoch": 0.5591146648683799, + "grad_norm": 1.8214404582977295, + "learning_rate": 4.6412635635331097e-07, + "loss": 0.9317, + "step": 23215 + }, + { + "epoch": 0.5592350858601671, + "grad_norm": 1.510892629623413, + "learning_rate": 4.639995943616266e-07, + "loss": 0.8752, + "step": 23220 + }, + { + "epoch": 0.5593555068519545, + "grad_norm": 1.4507983922958374, + "learning_rate": 4.6387283236994216e-07, + "loss": 0.8842, + "step": 23225 + }, + { + "epoch": 0.5594759278437417, + "grad_norm": 1.5228618383407593, + "learning_rate": 4.6374607037825775e-07, + "loss": 0.8702, + "step": 23230 + }, + { + "epoch": 0.559596348835529, + "grad_norm": 1.431118130683899, + "learning_rate": 4.636193083865734e-07, + "loss": 0.913, + "step": 23235 + }, + { + "epoch": 0.5597167698273163, + "grad_norm": 1.5003418922424316, + "learning_rate": 4.6349254639488893e-07, + "loss": 0.9081, + "step": 23240 + }, + { + "epoch": 0.5598371908191035, + "grad_norm": 1.5814661979675293, + "learning_rate": 4.633657844032045e-07, + "loss": 0.9273, + "step": 23245 + }, + { + "epoch": 0.5599576118108909, + "grad_norm": 1.5471625328063965, + "learning_rate": 4.632390224115201e-07, + "loss": 0.8741, + "step": 23250 + }, + { + "epoch": 0.5600780328026782, + "grad_norm": 1.619927167892456, + "learning_rate": 4.631122604198357e-07, + "loss": 0.893, + "step": 23255 + }, + { + "epoch": 0.5601984537944654, + "grad_norm": 1.6046520471572876, + "learning_rate": 4.6298549842815125e-07, + "loss": 0.9589, + "step": 23260 + }, + { + "epoch": 0.5603188747862528, + "grad_norm": 1.6255428791046143, + "learning_rate": 4.628587364364669e-07, + "loss": 0.8499, + "step": 23265 + }, + { + "epoch": 0.5604392957780401, + "grad_norm": 1.6193327903747559, + "learning_rate": 4.627319744447825e-07, + "loss": 0.8742, + "step": 23270 + }, + { + "epoch": 0.5605597167698273, + "grad_norm": 1.8600654602050781, + "learning_rate": 4.62605212453098e-07, + "loss": 0.8795, + "step": 23275 + }, + { + "epoch": 0.5606801377616146, + "grad_norm": 1.4893161058425903, + "learning_rate": 4.6247845046141367e-07, + "loss": 0.9228, + "step": 23280 + }, + { + "epoch": 0.5608005587534018, + "grad_norm": 1.4823611974716187, + "learning_rate": 4.623516884697292e-07, + "loss": 0.8934, + "step": 23285 + }, + { + "epoch": 0.5609209797451892, + "grad_norm": 1.4936082363128662, + "learning_rate": 4.622249264780448e-07, + "loss": 0.8465, + "step": 23290 + }, + { + "epoch": 0.5610414007369765, + "grad_norm": 1.565303087234497, + "learning_rate": 4.620981644863604e-07, + "loss": 0.8208, + "step": 23295 + }, + { + "epoch": 0.5611618217287637, + "grad_norm": 1.5002118349075317, + "learning_rate": 4.61971402494676e-07, + "loss": 0.8306, + "step": 23300 + }, + { + "epoch": 0.561282242720551, + "grad_norm": 1.7962204217910767, + "learning_rate": 4.618446405029915e-07, + "loss": 0.9261, + "step": 23305 + }, + { + "epoch": 0.5614026637123384, + "grad_norm": 1.486973524093628, + "learning_rate": 4.6171787851130716e-07, + "loss": 0.8782, + "step": 23310 + }, + { + "epoch": 0.5615230847041256, + "grad_norm": 1.7047168016433716, + "learning_rate": 4.6159111651962276e-07, + "loss": 0.937, + "step": 23315 + }, + { + "epoch": 0.5616435056959129, + "grad_norm": 1.7348361015319824, + "learning_rate": 4.614643545279383e-07, + "loss": 0.9046, + "step": 23320 + }, + { + "epoch": 0.5617639266877003, + "grad_norm": 1.6398698091506958, + "learning_rate": 4.6133759253625394e-07, + "loss": 0.9449, + "step": 23325 + }, + { + "epoch": 0.5618843476794875, + "grad_norm": 1.6072243452072144, + "learning_rate": 4.612108305445695e-07, + "loss": 0.8987, + "step": 23330 + }, + { + "epoch": 0.5620047686712748, + "grad_norm": 1.5447580814361572, + "learning_rate": 4.6108406855288507e-07, + "loss": 0.8998, + "step": 23335 + }, + { + "epoch": 0.562125189663062, + "grad_norm": 1.4506652355194092, + "learning_rate": 4.6095730656120066e-07, + "loss": 0.9006, + "step": 23340 + }, + { + "epoch": 0.5622456106548493, + "grad_norm": 1.8912750482559204, + "learning_rate": 4.6083054456951625e-07, + "loss": 0.9356, + "step": 23345 + }, + { + "epoch": 0.5623660316466367, + "grad_norm": 1.5937507152557373, + "learning_rate": 4.6070378257783184e-07, + "loss": 0.8739, + "step": 23350 + }, + { + "epoch": 0.5624864526384239, + "grad_norm": 1.5998436212539673, + "learning_rate": 4.6057702058614744e-07, + "loss": 0.9358, + "step": 23355 + }, + { + "epoch": 0.5626068736302112, + "grad_norm": 1.690090537071228, + "learning_rate": 4.6045025859446303e-07, + "loss": 0.9209, + "step": 23360 + }, + { + "epoch": 0.5627272946219986, + "grad_norm": 1.6583771705627441, + "learning_rate": 4.6032349660277857e-07, + "loss": 0.9291, + "step": 23365 + }, + { + "epoch": 0.5628477156137858, + "grad_norm": 1.3349385261535645, + "learning_rate": 4.601967346110942e-07, + "loss": 0.865, + "step": 23370 + }, + { + "epoch": 0.5629681366055731, + "grad_norm": 1.6301311254501343, + "learning_rate": 4.6006997261940975e-07, + "loss": 0.8853, + "step": 23375 + }, + { + "epoch": 0.5630885575973603, + "grad_norm": 1.294317603111267, + "learning_rate": 4.5994321062772534e-07, + "loss": 0.8765, + "step": 23380 + }, + { + "epoch": 0.5632089785891476, + "grad_norm": 1.6579155921936035, + "learning_rate": 4.59816448636041e-07, + "loss": 0.8699, + "step": 23385 + }, + { + "epoch": 0.563329399580935, + "grad_norm": 1.5684454441070557, + "learning_rate": 4.596896866443565e-07, + "loss": 0.9242, + "step": 23390 + }, + { + "epoch": 0.5634498205727222, + "grad_norm": 1.7211233377456665, + "learning_rate": 4.595629246526721e-07, + "loss": 0.9232, + "step": 23395 + }, + { + "epoch": 0.5635702415645095, + "grad_norm": 1.4012408256530762, + "learning_rate": 4.594361626609877e-07, + "loss": 0.9368, + "step": 23400 + }, + { + "epoch": 0.5636906625562969, + "grad_norm": 1.6189051866531372, + "learning_rate": 4.593094006693033e-07, + "loss": 0.8924, + "step": 23405 + }, + { + "epoch": 0.5638110835480841, + "grad_norm": 1.5496536493301392, + "learning_rate": 4.5918263867761884e-07, + "loss": 0.9362, + "step": 23410 + }, + { + "epoch": 0.5639315045398714, + "grad_norm": 1.6953816413879395, + "learning_rate": 4.590558766859345e-07, + "loss": 0.91, + "step": 23415 + }, + { + "epoch": 0.5640519255316587, + "grad_norm": 1.9266865253448486, + "learning_rate": 4.589291146942501e-07, + "loss": 0.8785, + "step": 23420 + }, + { + "epoch": 0.5641723465234459, + "grad_norm": 1.821556568145752, + "learning_rate": 4.588023527025656e-07, + "loss": 0.889, + "step": 23425 + }, + { + "epoch": 0.5642927675152333, + "grad_norm": 1.4360072612762451, + "learning_rate": 4.5867559071088126e-07, + "loss": 0.9084, + "step": 23430 + }, + { + "epoch": 0.5644131885070205, + "grad_norm": 1.4635649919509888, + "learning_rate": 4.585488287191968e-07, + "loss": 0.9179, + "step": 23435 + }, + { + "epoch": 0.5645336094988078, + "grad_norm": 1.7907929420471191, + "learning_rate": 4.584220667275124e-07, + "loss": 0.8953, + "step": 23440 + }, + { + "epoch": 0.5646540304905951, + "grad_norm": 1.54899263381958, + "learning_rate": 4.58295304735828e-07, + "loss": 0.9073, + "step": 23445 + }, + { + "epoch": 0.5647744514823824, + "grad_norm": 1.6154134273529053, + "learning_rate": 4.581685427441436e-07, + "loss": 0.8705, + "step": 23450 + }, + { + "epoch": 0.5648948724741697, + "grad_norm": 1.8652158975601196, + "learning_rate": 4.580417807524591e-07, + "loss": 0.9056, + "step": 23455 + }, + { + "epoch": 0.565015293465957, + "grad_norm": 1.623321533203125, + "learning_rate": 4.5791501876077476e-07, + "loss": 0.8701, + "step": 23460 + }, + { + "epoch": 0.5651357144577442, + "grad_norm": 1.4679025411605835, + "learning_rate": 4.5778825676909035e-07, + "loss": 0.9068, + "step": 23465 + }, + { + "epoch": 0.5652561354495316, + "grad_norm": 1.803037166595459, + "learning_rate": 4.576614947774059e-07, + "loss": 0.9303, + "step": 23470 + }, + { + "epoch": 0.5653765564413189, + "grad_norm": 1.5411866903305054, + "learning_rate": 4.5753473278572153e-07, + "loss": 0.9731, + "step": 23475 + }, + { + "epoch": 0.5654969774331061, + "grad_norm": 1.4283922910690308, + "learning_rate": 4.5740797079403707e-07, + "loss": 0.8631, + "step": 23480 + }, + { + "epoch": 0.5656173984248934, + "grad_norm": 1.6044667959213257, + "learning_rate": 4.5728120880235266e-07, + "loss": 0.8486, + "step": 23485 + }, + { + "epoch": 0.5657378194166807, + "grad_norm": 1.5341675281524658, + "learning_rate": 4.5715444681066826e-07, + "loss": 0.9033, + "step": 23490 + }, + { + "epoch": 0.565858240408468, + "grad_norm": 1.3251118659973145, + "learning_rate": 4.5702768481898385e-07, + "loss": 0.9178, + "step": 23495 + }, + { + "epoch": 0.5659786614002553, + "grad_norm": 1.699512004852295, + "learning_rate": 4.5690092282729944e-07, + "loss": 0.8354, + "step": 23500 + }, + { + "epoch": 0.5660990823920425, + "grad_norm": 1.61118483543396, + "learning_rate": 4.5677416083561503e-07, + "loss": 0.8689, + "step": 23505 + }, + { + "epoch": 0.5662195033838299, + "grad_norm": 1.5231705904006958, + "learning_rate": 4.566473988439306e-07, + "loss": 0.9298, + "step": 23510 + }, + { + "epoch": 0.5663399243756172, + "grad_norm": 1.595055103302002, + "learning_rate": 4.5652063685224616e-07, + "loss": 0.8918, + "step": 23515 + }, + { + "epoch": 0.5664603453674044, + "grad_norm": 1.7120904922485352, + "learning_rate": 4.563938748605618e-07, + "loss": 0.8982, + "step": 23520 + }, + { + "epoch": 0.5665807663591917, + "grad_norm": 1.5685055255889893, + "learning_rate": 4.5626711286887735e-07, + "loss": 0.9227, + "step": 23525 + }, + { + "epoch": 0.5667011873509791, + "grad_norm": 1.5623687505722046, + "learning_rate": 4.5614035087719294e-07, + "loss": 0.9282, + "step": 23530 + }, + { + "epoch": 0.5668216083427663, + "grad_norm": 1.6257749795913696, + "learning_rate": 4.560135888855086e-07, + "loss": 0.9173, + "step": 23535 + }, + { + "epoch": 0.5669420293345536, + "grad_norm": 1.4779962301254272, + "learning_rate": 4.558868268938241e-07, + "loss": 0.9495, + "step": 23540 + }, + { + "epoch": 0.5670624503263408, + "grad_norm": 1.5573982000350952, + "learning_rate": 4.557600649021397e-07, + "loss": 0.936, + "step": 23545 + }, + { + "epoch": 0.5671828713181282, + "grad_norm": 1.638119101524353, + "learning_rate": 4.556333029104553e-07, + "loss": 0.9203, + "step": 23550 + }, + { + "epoch": 0.5673032923099155, + "grad_norm": 1.526727318763733, + "learning_rate": 4.555065409187709e-07, + "loss": 0.9222, + "step": 23555 + }, + { + "epoch": 0.5674237133017027, + "grad_norm": 1.6723929643630981, + "learning_rate": 4.5537977892708644e-07, + "loss": 0.9457, + "step": 23560 + }, + { + "epoch": 0.56754413429349, + "grad_norm": 1.4846093654632568, + "learning_rate": 4.552530169354021e-07, + "loss": 0.9399, + "step": 23565 + }, + { + "epoch": 0.5676645552852774, + "grad_norm": 1.665597677230835, + "learning_rate": 4.5512625494371767e-07, + "loss": 0.9339, + "step": 23570 + }, + { + "epoch": 0.5677849762770646, + "grad_norm": 1.617017149925232, + "learning_rate": 4.549994929520332e-07, + "loss": 0.9112, + "step": 23575 + }, + { + "epoch": 0.5679053972688519, + "grad_norm": 1.481245756149292, + "learning_rate": 4.5487273096034886e-07, + "loss": 0.9376, + "step": 23580 + }, + { + "epoch": 0.5680258182606392, + "grad_norm": 1.5374301671981812, + "learning_rate": 4.547459689686644e-07, + "loss": 0.9231, + "step": 23585 + }, + { + "epoch": 0.5681462392524265, + "grad_norm": 1.8817466497421265, + "learning_rate": 4.5461920697698e-07, + "loss": 0.9229, + "step": 23590 + }, + { + "epoch": 0.5682666602442138, + "grad_norm": 1.4146652221679688, + "learning_rate": 4.544924449852956e-07, + "loss": 0.8861, + "step": 23595 + }, + { + "epoch": 0.568387081236001, + "grad_norm": 1.700593113899231, + "learning_rate": 4.5436568299361117e-07, + "loss": 0.8761, + "step": 23600 + }, + { + "epoch": 0.5685075022277883, + "grad_norm": 1.4120657444000244, + "learning_rate": 4.542389210019268e-07, + "loss": 0.963, + "step": 23605 + }, + { + "epoch": 0.5686279232195757, + "grad_norm": 1.45216703414917, + "learning_rate": 4.5411215901024235e-07, + "loss": 0.913, + "step": 23610 + }, + { + "epoch": 0.5687483442113629, + "grad_norm": 1.4401744604110718, + "learning_rate": 4.5398539701855795e-07, + "loss": 0.9264, + "step": 23615 + }, + { + "epoch": 0.5688687652031502, + "grad_norm": 1.587065577507019, + "learning_rate": 4.5385863502687354e-07, + "loss": 0.8823, + "step": 23620 + }, + { + "epoch": 0.5689891861949375, + "grad_norm": 1.7187548875808716, + "learning_rate": 4.5373187303518913e-07, + "loss": 0.8663, + "step": 23625 + }, + { + "epoch": 0.5691096071867248, + "grad_norm": 1.6272677183151245, + "learning_rate": 4.5360511104350467e-07, + "loss": 0.8952, + "step": 23630 + }, + { + "epoch": 0.5692300281785121, + "grad_norm": 1.7676702737808228, + "learning_rate": 4.534783490518203e-07, + "loss": 0.94, + "step": 23635 + }, + { + "epoch": 0.5693504491702994, + "grad_norm": 1.674362301826477, + "learning_rate": 4.5335158706013585e-07, + "loss": 0.9102, + "step": 23640 + }, + { + "epoch": 0.5694708701620866, + "grad_norm": 1.3730660676956177, + "learning_rate": 4.5322482506845144e-07, + "loss": 0.808, + "step": 23645 + }, + { + "epoch": 0.569591291153874, + "grad_norm": 1.5507595539093018, + "learning_rate": 4.530980630767671e-07, + "loss": 0.8651, + "step": 23650 + }, + { + "epoch": 0.5697117121456612, + "grad_norm": 1.7860714197158813, + "learning_rate": 4.5297130108508263e-07, + "loss": 0.9472, + "step": 23655 + }, + { + "epoch": 0.5698321331374485, + "grad_norm": 1.460296869277954, + "learning_rate": 4.528445390933982e-07, + "loss": 0.8818, + "step": 23660 + }, + { + "epoch": 0.5699525541292358, + "grad_norm": 1.6900402307510376, + "learning_rate": 4.527177771017138e-07, + "loss": 1.0138, + "step": 23665 + }, + { + "epoch": 0.5700729751210231, + "grad_norm": 1.537196159362793, + "learning_rate": 4.525910151100294e-07, + "loss": 0.8917, + "step": 23670 + }, + { + "epoch": 0.5701933961128104, + "grad_norm": 1.4395272731781006, + "learning_rate": 4.5246425311834494e-07, + "loss": 0.9106, + "step": 23675 + }, + { + "epoch": 0.5703138171045977, + "grad_norm": 1.5833889245986938, + "learning_rate": 4.523374911266606e-07, + "loss": 0.8752, + "step": 23680 + }, + { + "epoch": 0.5704342380963849, + "grad_norm": 1.5276377201080322, + "learning_rate": 4.522107291349762e-07, + "loss": 0.9327, + "step": 23685 + }, + { + "epoch": 0.5705546590881723, + "grad_norm": 1.5240648984909058, + "learning_rate": 4.520839671432917e-07, + "loss": 0.9534, + "step": 23690 + }, + { + "epoch": 0.5706750800799595, + "grad_norm": 1.4590954780578613, + "learning_rate": 4.5195720515160736e-07, + "loss": 0.8412, + "step": 23695 + }, + { + "epoch": 0.5707955010717468, + "grad_norm": 1.6685837507247925, + "learning_rate": 4.518304431599229e-07, + "loss": 0.9154, + "step": 23700 + }, + { + "epoch": 0.5709159220635341, + "grad_norm": 1.4996044635772705, + "learning_rate": 4.517036811682385e-07, + "loss": 0.9211, + "step": 23705 + }, + { + "epoch": 0.5710363430553214, + "grad_norm": 1.4348300695419312, + "learning_rate": 4.515769191765541e-07, + "loss": 0.8831, + "step": 23710 + }, + { + "epoch": 0.5711567640471087, + "grad_norm": 1.3542038202285767, + "learning_rate": 4.514501571848697e-07, + "loss": 0.87, + "step": 23715 + }, + { + "epoch": 0.571277185038896, + "grad_norm": 1.7364726066589355, + "learning_rate": 4.5132339519318527e-07, + "loss": 0.9395, + "step": 23720 + }, + { + "epoch": 0.5713976060306832, + "grad_norm": 1.9224735498428345, + "learning_rate": 4.5119663320150086e-07, + "loss": 0.8701, + "step": 23725 + }, + { + "epoch": 0.5715180270224706, + "grad_norm": 1.6598951816558838, + "learning_rate": 4.5106987120981645e-07, + "loss": 0.8552, + "step": 23730 + }, + { + "epoch": 0.5716384480142579, + "grad_norm": 1.522518515586853, + "learning_rate": 4.50943109218132e-07, + "loss": 0.8725, + "step": 23735 + }, + { + "epoch": 0.5717588690060451, + "grad_norm": 1.536367654800415, + "learning_rate": 4.5081634722644763e-07, + "loss": 0.9576, + "step": 23740 + }, + { + "epoch": 0.5718792899978324, + "grad_norm": 1.6370139122009277, + "learning_rate": 4.5068958523476317e-07, + "loss": 0.8727, + "step": 23745 + }, + { + "epoch": 0.5719997109896197, + "grad_norm": 1.4542649984359741, + "learning_rate": 4.5056282324307876e-07, + "loss": 0.89, + "step": 23750 + }, + { + "epoch": 0.572120131981407, + "grad_norm": 1.434571385383606, + "learning_rate": 4.504360612513944e-07, + "loss": 0.9192, + "step": 23755 + }, + { + "epoch": 0.5722405529731943, + "grad_norm": 1.6002957820892334, + "learning_rate": 4.5030929925970995e-07, + "loss": 0.885, + "step": 23760 + }, + { + "epoch": 0.5723609739649815, + "grad_norm": 1.706491470336914, + "learning_rate": 4.5018253726802554e-07, + "loss": 0.9299, + "step": 23765 + }, + { + "epoch": 0.5724813949567689, + "grad_norm": 1.5384788513183594, + "learning_rate": 4.5005577527634113e-07, + "loss": 0.9399, + "step": 23770 + }, + { + "epoch": 0.5726018159485562, + "grad_norm": 1.4597822427749634, + "learning_rate": 4.499290132846567e-07, + "loss": 0.905, + "step": 23775 + }, + { + "epoch": 0.5727222369403434, + "grad_norm": 1.4537782669067383, + "learning_rate": 4.4980225129297226e-07, + "loss": 0.9231, + "step": 23780 + }, + { + "epoch": 0.5728426579321307, + "grad_norm": 1.4657148122787476, + "learning_rate": 4.496754893012879e-07, + "loss": 0.9407, + "step": 23785 + }, + { + "epoch": 0.5729630789239181, + "grad_norm": 1.6405856609344482, + "learning_rate": 4.4954872730960345e-07, + "loss": 0.8758, + "step": 23790 + }, + { + "epoch": 0.5730834999157053, + "grad_norm": 1.7731573581695557, + "learning_rate": 4.4942196531791904e-07, + "loss": 0.9526, + "step": 23795 + }, + { + "epoch": 0.5732039209074926, + "grad_norm": 1.3865348100662231, + "learning_rate": 4.492952033262347e-07, + "loss": 0.9243, + "step": 23800 + }, + { + "epoch": 0.5733243418992798, + "grad_norm": 1.372140645980835, + "learning_rate": 4.491684413345502e-07, + "loss": 0.8891, + "step": 23805 + }, + { + "epoch": 0.5734447628910672, + "grad_norm": 1.4230656623840332, + "learning_rate": 4.490416793428658e-07, + "loss": 0.8814, + "step": 23810 + }, + { + "epoch": 0.5735651838828545, + "grad_norm": 1.5180834531784058, + "learning_rate": 4.489149173511814e-07, + "loss": 0.9059, + "step": 23815 + }, + { + "epoch": 0.5736856048746417, + "grad_norm": 1.4684674739837646, + "learning_rate": 4.48788155359497e-07, + "loss": 0.9095, + "step": 23820 + }, + { + "epoch": 0.573806025866429, + "grad_norm": 1.4561958312988281, + "learning_rate": 4.4866139336781254e-07, + "loss": 0.9206, + "step": 23825 + }, + { + "epoch": 0.5739264468582164, + "grad_norm": 1.6689949035644531, + "learning_rate": 4.485346313761282e-07, + "loss": 0.9057, + "step": 23830 + }, + { + "epoch": 0.5740468678500036, + "grad_norm": 1.494704246520996, + "learning_rate": 4.4840786938444377e-07, + "loss": 0.8743, + "step": 23835 + }, + { + "epoch": 0.5741672888417909, + "grad_norm": 1.4397083520889282, + "learning_rate": 4.482811073927593e-07, + "loss": 0.9231, + "step": 23840 + }, + { + "epoch": 0.5742877098335782, + "grad_norm": 1.6924148797988892, + "learning_rate": 4.4815434540107496e-07, + "loss": 0.9065, + "step": 23845 + }, + { + "epoch": 0.5744081308253655, + "grad_norm": 1.6249243021011353, + "learning_rate": 4.480275834093905e-07, + "loss": 0.9242, + "step": 23850 + }, + { + "epoch": 0.5745285518171528, + "grad_norm": 1.471760869026184, + "learning_rate": 4.479008214177061e-07, + "loss": 0.865, + "step": 23855 + }, + { + "epoch": 0.57464897280894, + "grad_norm": 1.5433714389801025, + "learning_rate": 4.477740594260217e-07, + "loss": 0.866, + "step": 23860 + }, + { + "epoch": 0.5747693938007273, + "grad_norm": 1.5583155155181885, + "learning_rate": 4.4764729743433727e-07, + "loss": 0.913, + "step": 23865 + }, + { + "epoch": 0.5748898147925147, + "grad_norm": 1.5120084285736084, + "learning_rate": 4.4752053544265286e-07, + "loss": 0.9512, + "step": 23870 + }, + { + "epoch": 0.5750102357843019, + "grad_norm": 1.3898836374282837, + "learning_rate": 4.4739377345096845e-07, + "loss": 0.9161, + "step": 23875 + }, + { + "epoch": 0.5751306567760892, + "grad_norm": 1.754967212677002, + "learning_rate": 4.4726701145928405e-07, + "loss": 0.9644, + "step": 23880 + }, + { + "epoch": 0.5752510777678765, + "grad_norm": 1.5690884590148926, + "learning_rate": 4.471402494675996e-07, + "loss": 0.9398, + "step": 23885 + }, + { + "epoch": 0.5753714987596638, + "grad_norm": 1.4521434307098389, + "learning_rate": 4.4701348747591523e-07, + "loss": 0.9002, + "step": 23890 + }, + { + "epoch": 0.5754919197514511, + "grad_norm": 1.5135856866836548, + "learning_rate": 4.4688672548423077e-07, + "loss": 0.846, + "step": 23895 + }, + { + "epoch": 0.5756123407432384, + "grad_norm": 1.5185608863830566, + "learning_rate": 4.4675996349254636e-07, + "loss": 0.8862, + "step": 23900 + }, + { + "epoch": 0.5757327617350256, + "grad_norm": 1.4188989400863647, + "learning_rate": 4.46633201500862e-07, + "loss": 0.8444, + "step": 23905 + }, + { + "epoch": 0.575853182726813, + "grad_norm": 1.6794841289520264, + "learning_rate": 4.4650643950917754e-07, + "loss": 0.9201, + "step": 23910 + }, + { + "epoch": 0.5759736037186002, + "grad_norm": 1.6270520687103271, + "learning_rate": 4.4637967751749314e-07, + "loss": 0.9318, + "step": 23915 + }, + { + "epoch": 0.5760940247103875, + "grad_norm": 1.71327543258667, + "learning_rate": 4.4625291552580873e-07, + "loss": 0.9029, + "step": 23920 + }, + { + "epoch": 0.5762144457021748, + "grad_norm": 1.8402167558670044, + "learning_rate": 4.461261535341243e-07, + "loss": 0.9023, + "step": 23925 + }, + { + "epoch": 0.5763348666939621, + "grad_norm": 1.5386940240859985, + "learning_rate": 4.4599939154243986e-07, + "loss": 0.8126, + "step": 23930 + }, + { + "epoch": 0.5764552876857494, + "grad_norm": 1.6440932750701904, + "learning_rate": 4.458726295507555e-07, + "loss": 0.8952, + "step": 23935 + }, + { + "epoch": 0.5765757086775367, + "grad_norm": 1.5482745170593262, + "learning_rate": 4.4574586755907104e-07, + "loss": 0.9059, + "step": 23940 + }, + { + "epoch": 0.5766961296693239, + "grad_norm": 2.264068603515625, + "learning_rate": 4.4561910556738663e-07, + "loss": 0.876, + "step": 23945 + }, + { + "epoch": 0.5768165506611113, + "grad_norm": 1.9847251176834106, + "learning_rate": 4.454923435757023e-07, + "loss": 0.933, + "step": 23950 + }, + { + "epoch": 0.5769369716528985, + "grad_norm": 1.469585657119751, + "learning_rate": 4.453655815840178e-07, + "loss": 0.8767, + "step": 23955 + }, + { + "epoch": 0.5770573926446858, + "grad_norm": 1.5730228424072266, + "learning_rate": 4.452388195923334e-07, + "loss": 0.8507, + "step": 23960 + }, + { + "epoch": 0.5771778136364731, + "grad_norm": 1.466878890991211, + "learning_rate": 4.45112057600649e-07, + "loss": 0.8549, + "step": 23965 + }, + { + "epoch": 0.5772982346282604, + "grad_norm": 1.378785252571106, + "learning_rate": 4.449852956089646e-07, + "loss": 0.9119, + "step": 23970 + }, + { + "epoch": 0.5774186556200477, + "grad_norm": 1.5784924030303955, + "learning_rate": 4.4485853361728013e-07, + "loss": 0.912, + "step": 23975 + }, + { + "epoch": 0.577539076611835, + "grad_norm": 1.2947007417678833, + "learning_rate": 4.447317716255958e-07, + "loss": 0.8894, + "step": 23980 + }, + { + "epoch": 0.5776594976036222, + "grad_norm": 1.5826587677001953, + "learning_rate": 4.4460500963391137e-07, + "loss": 0.9001, + "step": 23985 + }, + { + "epoch": 0.5777799185954096, + "grad_norm": 1.6320418119430542, + "learning_rate": 4.444782476422269e-07, + "loss": 0.9695, + "step": 23990 + }, + { + "epoch": 0.5779003395871969, + "grad_norm": 1.3234777450561523, + "learning_rate": 4.4435148565054255e-07, + "loss": 0.931, + "step": 23995 + }, + { + "epoch": 0.5780207605789841, + "grad_norm": 1.4822025299072266, + "learning_rate": 4.442247236588581e-07, + "loss": 0.8501, + "step": 24000 + }, + { + "epoch": 0.5781411815707714, + "grad_norm": 1.6573212146759033, + "learning_rate": 4.440979616671737e-07, + "loss": 0.9256, + "step": 24005 + }, + { + "epoch": 0.5782616025625587, + "grad_norm": 1.4438883066177368, + "learning_rate": 4.4397119967548927e-07, + "loss": 0.8933, + "step": 24010 + }, + { + "epoch": 0.578382023554346, + "grad_norm": 1.6382768154144287, + "learning_rate": 4.4384443768380487e-07, + "loss": 0.91, + "step": 24015 + }, + { + "epoch": 0.5785024445461333, + "grad_norm": 1.5180890560150146, + "learning_rate": 4.437176756921204e-07, + "loss": 0.8616, + "step": 24020 + }, + { + "epoch": 0.5786228655379205, + "grad_norm": 1.5098698139190674, + "learning_rate": 4.4359091370043605e-07, + "loss": 0.8957, + "step": 24025 + }, + { + "epoch": 0.5787432865297079, + "grad_norm": 1.3582013845443726, + "learning_rate": 4.4346415170875164e-07, + "loss": 0.8859, + "step": 24030 + }, + { + "epoch": 0.5788637075214952, + "grad_norm": 1.6167140007019043, + "learning_rate": 4.433373897170672e-07, + "loss": 0.8977, + "step": 24035 + }, + { + "epoch": 0.5789841285132824, + "grad_norm": 1.5289353132247925, + "learning_rate": 4.432106277253828e-07, + "loss": 0.8622, + "step": 24040 + }, + { + "epoch": 0.5791045495050697, + "grad_norm": 1.5419551134109497, + "learning_rate": 4.4308386573369836e-07, + "loss": 0.9116, + "step": 24045 + }, + { + "epoch": 0.5792249704968571, + "grad_norm": 1.5560519695281982, + "learning_rate": 4.4295710374201395e-07, + "loss": 0.9161, + "step": 24050 + }, + { + "epoch": 0.5793453914886443, + "grad_norm": 1.4875389337539673, + "learning_rate": 4.4283034175032955e-07, + "loss": 0.912, + "step": 24055 + }, + { + "epoch": 0.5794658124804316, + "grad_norm": 1.446847915649414, + "learning_rate": 4.4270357975864514e-07, + "loss": 0.8848, + "step": 24060 + }, + { + "epoch": 0.5795862334722188, + "grad_norm": 1.881361484527588, + "learning_rate": 4.4257681776696073e-07, + "loss": 0.9394, + "step": 24065 + }, + { + "epoch": 0.5797066544640062, + "grad_norm": 1.6584711074829102, + "learning_rate": 4.424500557752763e-07, + "loss": 0.9249, + "step": 24070 + }, + { + "epoch": 0.5798270754557935, + "grad_norm": 1.4165360927581787, + "learning_rate": 4.423232937835919e-07, + "loss": 0.9157, + "step": 24075 + }, + { + "epoch": 0.5799474964475807, + "grad_norm": 1.5634981393814087, + "learning_rate": 4.4219653179190745e-07, + "loss": 0.9077, + "step": 24080 + }, + { + "epoch": 0.580067917439368, + "grad_norm": 1.435659408569336, + "learning_rate": 4.420697698002231e-07, + "loss": 0.9554, + "step": 24085 + }, + { + "epoch": 0.5801883384311554, + "grad_norm": 1.7094619274139404, + "learning_rate": 4.4194300780853864e-07, + "loss": 0.8962, + "step": 24090 + }, + { + "epoch": 0.5803087594229426, + "grad_norm": 1.600085973739624, + "learning_rate": 4.4181624581685423e-07, + "loss": 0.9115, + "step": 24095 + }, + { + "epoch": 0.5804291804147299, + "grad_norm": 1.5288809537887573, + "learning_rate": 4.4168948382516987e-07, + "loss": 0.9156, + "step": 24100 + }, + { + "epoch": 0.5805496014065172, + "grad_norm": 1.5837684869766235, + "learning_rate": 4.415627218334854e-07, + "loss": 0.8953, + "step": 24105 + }, + { + "epoch": 0.5806700223983045, + "grad_norm": 1.609930396080017, + "learning_rate": 4.41435959841801e-07, + "loss": 0.9261, + "step": 24110 + }, + { + "epoch": 0.5807904433900918, + "grad_norm": 1.6778355836868286, + "learning_rate": 4.413091978501166e-07, + "loss": 0.9514, + "step": 24115 + }, + { + "epoch": 0.580910864381879, + "grad_norm": 1.585976004600525, + "learning_rate": 4.411824358584322e-07, + "loss": 0.9506, + "step": 24120 + }, + { + "epoch": 0.5810312853736663, + "grad_norm": 1.465681552886963, + "learning_rate": 4.410556738667478e-07, + "loss": 0.8849, + "step": 24125 + }, + { + "epoch": 0.5811517063654537, + "grad_norm": 1.6639018058776855, + "learning_rate": 4.4092891187506337e-07, + "loss": 0.8529, + "step": 24130 + }, + { + "epoch": 0.5812721273572409, + "grad_norm": 1.3908381462097168, + "learning_rate": 4.4080214988337896e-07, + "loss": 0.9167, + "step": 24135 + }, + { + "epoch": 0.5813925483490282, + "grad_norm": 1.605887770652771, + "learning_rate": 4.4067538789169455e-07, + "loss": 0.9068, + "step": 24140 + }, + { + "epoch": 0.5815129693408155, + "grad_norm": 1.6921488046646118, + "learning_rate": 4.4054862590001015e-07, + "loss": 0.9202, + "step": 24145 + }, + { + "epoch": 0.5816333903326028, + "grad_norm": 1.6987926959991455, + "learning_rate": 4.404218639083257e-07, + "loss": 0.8765, + "step": 24150 + }, + { + "epoch": 0.5817538113243901, + "grad_norm": 1.5516892671585083, + "learning_rate": 4.4029510191664133e-07, + "loss": 0.8981, + "step": 24155 + }, + { + "epoch": 0.5818742323161774, + "grad_norm": 1.5336050987243652, + "learning_rate": 4.4016833992495687e-07, + "loss": 0.8836, + "step": 24160 + }, + { + "epoch": 0.5819946533079646, + "grad_norm": 1.3349018096923828, + "learning_rate": 4.4004157793327246e-07, + "loss": 0.8669, + "step": 24165 + }, + { + "epoch": 0.582115074299752, + "grad_norm": 2.0041520595550537, + "learning_rate": 4.399148159415881e-07, + "loss": 0.9013, + "step": 24170 + }, + { + "epoch": 0.5822354952915392, + "grad_norm": 1.5998023748397827, + "learning_rate": 4.3978805394990364e-07, + "loss": 0.9487, + "step": 24175 + }, + { + "epoch": 0.5823559162833265, + "grad_norm": 2.7034430503845215, + "learning_rate": 4.3966129195821924e-07, + "loss": 0.8808, + "step": 24180 + }, + { + "epoch": 0.5824763372751138, + "grad_norm": 1.5282458066940308, + "learning_rate": 4.3953452996653483e-07, + "loss": 0.8657, + "step": 24185 + }, + { + "epoch": 0.582596758266901, + "grad_norm": 1.5360181331634521, + "learning_rate": 4.394077679748504e-07, + "loss": 0.8707, + "step": 24190 + }, + { + "epoch": 0.5827171792586884, + "grad_norm": 1.6115931272506714, + "learning_rate": 4.3928100598316596e-07, + "loss": 0.8551, + "step": 24195 + }, + { + "epoch": 0.5828376002504757, + "grad_norm": 1.478049635887146, + "learning_rate": 4.391542439914816e-07, + "loss": 0.8887, + "step": 24200 + }, + { + "epoch": 0.5829580212422629, + "grad_norm": 1.7876139879226685, + "learning_rate": 4.3902748199979714e-07, + "loss": 0.9005, + "step": 24205 + }, + { + "epoch": 0.5830784422340503, + "grad_norm": 1.5490608215332031, + "learning_rate": 4.3890072000811273e-07, + "loss": 0.9169, + "step": 24210 + }, + { + "epoch": 0.5831988632258376, + "grad_norm": 1.4674124717712402, + "learning_rate": 4.387739580164284e-07, + "loss": 0.9054, + "step": 24215 + }, + { + "epoch": 0.5833192842176248, + "grad_norm": 1.3796958923339844, + "learning_rate": 4.386471960247439e-07, + "loss": 0.8739, + "step": 24220 + }, + { + "epoch": 0.5834397052094121, + "grad_norm": 1.6711292266845703, + "learning_rate": 4.385204340330595e-07, + "loss": 0.8855, + "step": 24225 + }, + { + "epoch": 0.5835601262011993, + "grad_norm": 1.5073076486587524, + "learning_rate": 4.383936720413751e-07, + "loss": 0.9541, + "step": 24230 + }, + { + "epoch": 0.5836805471929867, + "grad_norm": 1.5467579364776611, + "learning_rate": 4.382669100496907e-07, + "loss": 0.9097, + "step": 24235 + }, + { + "epoch": 0.583800968184774, + "grad_norm": 1.4945591688156128, + "learning_rate": 4.3814014805800623e-07, + "loss": 0.8685, + "step": 24240 + }, + { + "epoch": 0.5839213891765612, + "grad_norm": 1.4106559753417969, + "learning_rate": 4.380133860663219e-07, + "loss": 0.8636, + "step": 24245 + }, + { + "epoch": 0.5840418101683486, + "grad_norm": 1.6017537117004395, + "learning_rate": 4.3788662407463747e-07, + "loss": 0.9221, + "step": 24250 + }, + { + "epoch": 0.5841622311601359, + "grad_norm": 1.5603747367858887, + "learning_rate": 4.37759862082953e-07, + "loss": 0.8357, + "step": 24255 + }, + { + "epoch": 0.5842826521519231, + "grad_norm": 1.6029847860336304, + "learning_rate": 4.3763310009126865e-07, + "loss": 0.9294, + "step": 24260 + }, + { + "epoch": 0.5844030731437104, + "grad_norm": 1.52727472782135, + "learning_rate": 4.375063380995842e-07, + "loss": 0.8984, + "step": 24265 + }, + { + "epoch": 0.5845234941354976, + "grad_norm": 1.6402517557144165, + "learning_rate": 4.373795761078998e-07, + "loss": 0.9071, + "step": 24270 + }, + { + "epoch": 0.584643915127285, + "grad_norm": 1.5726128816604614, + "learning_rate": 4.372528141162154e-07, + "loss": 0.8772, + "step": 24275 + }, + { + "epoch": 0.5847643361190723, + "grad_norm": 1.6014689207077026, + "learning_rate": 4.3712605212453097e-07, + "loss": 0.9104, + "step": 24280 + }, + { + "epoch": 0.5848847571108595, + "grad_norm": 1.4787907600402832, + "learning_rate": 4.3699929013284656e-07, + "loss": 0.8989, + "step": 24285 + }, + { + "epoch": 0.5850051781026469, + "grad_norm": 1.6082152128219604, + "learning_rate": 4.3687252814116215e-07, + "loss": 0.9106, + "step": 24290 + }, + { + "epoch": 0.5851255990944342, + "grad_norm": 1.7320561408996582, + "learning_rate": 4.3674576614947774e-07, + "loss": 0.9092, + "step": 24295 + }, + { + "epoch": 0.5852460200862214, + "grad_norm": 1.379388689994812, + "learning_rate": 4.366190041577933e-07, + "loss": 0.9151, + "step": 24300 + }, + { + "epoch": 0.5853664410780087, + "grad_norm": 1.8676650524139404, + "learning_rate": 4.364922421661089e-07, + "loss": 0.9412, + "step": 24305 + }, + { + "epoch": 0.585486862069796, + "grad_norm": 1.5080041885375977, + "learning_rate": 4.3636548017442446e-07, + "loss": 0.9029, + "step": 24310 + }, + { + "epoch": 0.5856072830615833, + "grad_norm": 1.5643041133880615, + "learning_rate": 4.3623871818274006e-07, + "loss": 0.9524, + "step": 24315 + }, + { + "epoch": 0.5857277040533706, + "grad_norm": 1.6688721179962158, + "learning_rate": 4.361119561910557e-07, + "loss": 0.8733, + "step": 24320 + }, + { + "epoch": 0.5858481250451578, + "grad_norm": 1.5123419761657715, + "learning_rate": 4.3598519419937124e-07, + "loss": 0.8769, + "step": 24325 + }, + { + "epoch": 0.5859685460369451, + "grad_norm": 1.4766058921813965, + "learning_rate": 4.3585843220768683e-07, + "loss": 0.9166, + "step": 24330 + }, + { + "epoch": 0.5860889670287325, + "grad_norm": 1.4903916120529175, + "learning_rate": 4.357316702160024e-07, + "loss": 0.8974, + "step": 24335 + }, + { + "epoch": 0.5862093880205197, + "grad_norm": 1.6494252681732178, + "learning_rate": 4.35604908224318e-07, + "loss": 0.9333, + "step": 24340 + }, + { + "epoch": 0.586329809012307, + "grad_norm": 1.5249848365783691, + "learning_rate": 4.3547814623263355e-07, + "loss": 0.9179, + "step": 24345 + }, + { + "epoch": 0.5864502300040944, + "grad_norm": 1.384194016456604, + "learning_rate": 4.353513842409492e-07, + "loss": 0.9111, + "step": 24350 + }, + { + "epoch": 0.5865706509958816, + "grad_norm": 1.5499690771102905, + "learning_rate": 4.3522462224926474e-07, + "loss": 0.9198, + "step": 24355 + }, + { + "epoch": 0.5866910719876689, + "grad_norm": 1.390620470046997, + "learning_rate": 4.3509786025758033e-07, + "loss": 0.8384, + "step": 24360 + }, + { + "epoch": 0.5868114929794562, + "grad_norm": 1.4609301090240479, + "learning_rate": 4.3497109826589597e-07, + "loss": 0.8811, + "step": 24365 + }, + { + "epoch": 0.5869319139712434, + "grad_norm": 1.515616774559021, + "learning_rate": 4.348443362742115e-07, + "loss": 0.8662, + "step": 24370 + }, + { + "epoch": 0.5870523349630308, + "grad_norm": 1.6101609468460083, + "learning_rate": 4.347175742825271e-07, + "loss": 0.9414, + "step": 24375 + }, + { + "epoch": 0.587172755954818, + "grad_norm": 1.5424721240997314, + "learning_rate": 4.345908122908427e-07, + "loss": 0.9024, + "step": 24380 + }, + { + "epoch": 0.5872931769466053, + "grad_norm": 1.7643179893493652, + "learning_rate": 4.344640502991583e-07, + "loss": 0.8595, + "step": 24385 + }, + { + "epoch": 0.5874135979383927, + "grad_norm": 1.6247128248214722, + "learning_rate": 4.343372883074738e-07, + "loss": 0.9067, + "step": 24390 + }, + { + "epoch": 0.5875340189301799, + "grad_norm": 1.526525855064392, + "learning_rate": 4.3421052631578947e-07, + "loss": 0.9579, + "step": 24395 + }, + { + "epoch": 0.5876544399219672, + "grad_norm": 1.9331629276275635, + "learning_rate": 4.3408376432410506e-07, + "loss": 0.8746, + "step": 24400 + }, + { + "epoch": 0.5877748609137545, + "grad_norm": 1.432662844657898, + "learning_rate": 4.339570023324206e-07, + "loss": 0.9269, + "step": 24405 + }, + { + "epoch": 0.5878952819055417, + "grad_norm": 1.6652055978775024, + "learning_rate": 4.3383024034073625e-07, + "loss": 0.922, + "step": 24410 + }, + { + "epoch": 0.5880157028973291, + "grad_norm": 1.549464225769043, + "learning_rate": 4.337034783490518e-07, + "loss": 0.9436, + "step": 24415 + }, + { + "epoch": 0.5881361238891164, + "grad_norm": 1.5567026138305664, + "learning_rate": 4.335767163573674e-07, + "loss": 0.8839, + "step": 24420 + }, + { + "epoch": 0.5882565448809036, + "grad_norm": 1.5347050428390503, + "learning_rate": 4.3344995436568297e-07, + "loss": 0.8827, + "step": 24425 + }, + { + "epoch": 0.588376965872691, + "grad_norm": 1.6135860681533813, + "learning_rate": 4.3332319237399856e-07, + "loss": 0.8946, + "step": 24430 + }, + { + "epoch": 0.5884973868644782, + "grad_norm": 1.5451579093933105, + "learning_rate": 4.3319643038231415e-07, + "loss": 0.9577, + "step": 24435 + }, + { + "epoch": 0.5886178078562655, + "grad_norm": 1.5067342519760132, + "learning_rate": 4.3306966839062974e-07, + "loss": 0.8841, + "step": 24440 + }, + { + "epoch": 0.5887382288480528, + "grad_norm": 1.3985469341278076, + "learning_rate": 4.3294290639894534e-07, + "loss": 0.8999, + "step": 24445 + }, + { + "epoch": 0.58885864983984, + "grad_norm": 1.6078155040740967, + "learning_rate": 4.328161444072609e-07, + "loss": 0.939, + "step": 24450 + }, + { + "epoch": 0.5889790708316274, + "grad_norm": 1.5124714374542236, + "learning_rate": 4.326893824155765e-07, + "loss": 0.9505, + "step": 24455 + }, + { + "epoch": 0.5890994918234147, + "grad_norm": 1.4121074676513672, + "learning_rate": 4.3256262042389206e-07, + "loss": 0.9179, + "step": 24460 + }, + { + "epoch": 0.5892199128152019, + "grad_norm": 1.6301769018173218, + "learning_rate": 4.3243585843220765e-07, + "loss": 0.9344, + "step": 24465 + }, + { + "epoch": 0.5893403338069892, + "grad_norm": 1.4103622436523438, + "learning_rate": 4.323090964405233e-07, + "loss": 0.9356, + "step": 24470 + }, + { + "epoch": 0.5894607547987766, + "grad_norm": 1.471883773803711, + "learning_rate": 4.3218233444883883e-07, + "loss": 0.9248, + "step": 24475 + }, + { + "epoch": 0.5895811757905638, + "grad_norm": 1.5216230154037476, + "learning_rate": 4.320555724571544e-07, + "loss": 0.8915, + "step": 24480 + }, + { + "epoch": 0.5897015967823511, + "grad_norm": 1.7749937772750854, + "learning_rate": 4.3192881046547e-07, + "loss": 0.8772, + "step": 24485 + }, + { + "epoch": 0.5898220177741383, + "grad_norm": 1.7109736204147339, + "learning_rate": 4.318020484737856e-07, + "loss": 0.9155, + "step": 24490 + }, + { + "epoch": 0.5899424387659257, + "grad_norm": 1.5612523555755615, + "learning_rate": 4.3167528648210115e-07, + "loss": 0.9747, + "step": 24495 + }, + { + "epoch": 0.590062859757713, + "grad_norm": 1.8180674314498901, + "learning_rate": 4.315485244904168e-07, + "loss": 0.9173, + "step": 24500 + }, + { + "epoch": 0.5901832807495002, + "grad_norm": 1.5764421224594116, + "learning_rate": 4.3142176249873233e-07, + "loss": 0.9285, + "step": 24505 + }, + { + "epoch": 0.5903037017412875, + "grad_norm": 1.7687420845031738, + "learning_rate": 4.312950005070479e-07, + "loss": 0.9144, + "step": 24510 + }, + { + "epoch": 0.5904241227330749, + "grad_norm": 1.494286298751831, + "learning_rate": 4.3116823851536357e-07, + "loss": 0.8958, + "step": 24515 + }, + { + "epoch": 0.5905445437248621, + "grad_norm": 1.4615665674209595, + "learning_rate": 4.310414765236791e-07, + "loss": 0.889, + "step": 24520 + }, + { + "epoch": 0.5906649647166494, + "grad_norm": 1.6046537160873413, + "learning_rate": 4.309147145319947e-07, + "loss": 0.9077, + "step": 24525 + }, + { + "epoch": 0.5907853857084367, + "grad_norm": 1.524892807006836, + "learning_rate": 4.307879525403103e-07, + "loss": 0.91, + "step": 24530 + }, + { + "epoch": 0.590905806700224, + "grad_norm": 1.6162455081939697, + "learning_rate": 4.306611905486259e-07, + "loss": 0.9017, + "step": 24535 + }, + { + "epoch": 0.5910262276920113, + "grad_norm": 1.5659140348434448, + "learning_rate": 4.305344285569414e-07, + "loss": 0.9378, + "step": 24540 + }, + { + "epoch": 0.5911466486837985, + "grad_norm": 1.538069248199463, + "learning_rate": 4.3040766656525707e-07, + "loss": 0.9146, + "step": 24545 + }, + { + "epoch": 0.5912670696755858, + "grad_norm": 1.7028398513793945, + "learning_rate": 4.3028090457357266e-07, + "loss": 0.9234, + "step": 24550 + }, + { + "epoch": 0.5913874906673732, + "grad_norm": 1.4257739782333374, + "learning_rate": 4.301541425818882e-07, + "loss": 0.9103, + "step": 24555 + }, + { + "epoch": 0.5915079116591604, + "grad_norm": 1.5835907459259033, + "learning_rate": 4.3002738059020384e-07, + "loss": 0.9112, + "step": 24560 + }, + { + "epoch": 0.5916283326509477, + "grad_norm": 1.4983320236206055, + "learning_rate": 4.299006185985194e-07, + "loss": 0.859, + "step": 24565 + }, + { + "epoch": 0.591748753642735, + "grad_norm": 1.4904733896255493, + "learning_rate": 4.2977385660683497e-07, + "loss": 0.9099, + "step": 24570 + }, + { + "epoch": 0.5918691746345223, + "grad_norm": 1.565053105354309, + "learning_rate": 4.2964709461515056e-07, + "loss": 0.9727, + "step": 24575 + }, + { + "epoch": 0.5919895956263096, + "grad_norm": 1.3869820833206177, + "learning_rate": 4.2952033262346616e-07, + "loss": 0.8663, + "step": 24580 + }, + { + "epoch": 0.5921100166180968, + "grad_norm": 1.8419201374053955, + "learning_rate": 4.2939357063178175e-07, + "loss": 0.9035, + "step": 24585 + }, + { + "epoch": 0.5922304376098841, + "grad_norm": 1.458988904953003, + "learning_rate": 4.2926680864009734e-07, + "loss": 0.9035, + "step": 24590 + }, + { + "epoch": 0.5923508586016715, + "grad_norm": 1.638648509979248, + "learning_rate": 4.2914004664841293e-07, + "loss": 0.8944, + "step": 24595 + }, + { + "epoch": 0.5924712795934587, + "grad_norm": 1.8352802991867065, + "learning_rate": 4.2901328465672847e-07, + "loss": 0.8325, + "step": 24600 + }, + { + "epoch": 0.592591700585246, + "grad_norm": 1.7183581590652466, + "learning_rate": 4.288865226650441e-07, + "loss": 0.9469, + "step": 24605 + }, + { + "epoch": 0.5927121215770333, + "grad_norm": 1.4810676574707031, + "learning_rate": 4.2875976067335965e-07, + "loss": 0.8973, + "step": 24610 + }, + { + "epoch": 0.5928325425688206, + "grad_norm": 1.4876949787139893, + "learning_rate": 4.2863299868167524e-07, + "loss": 0.8831, + "step": 24615 + }, + { + "epoch": 0.5929529635606079, + "grad_norm": 1.4875684976577759, + "learning_rate": 4.2850623668999084e-07, + "loss": 0.9086, + "step": 24620 + }, + { + "epoch": 0.5930733845523952, + "grad_norm": 1.4098703861236572, + "learning_rate": 4.2837947469830643e-07, + "loss": 0.9037, + "step": 24625 + }, + { + "epoch": 0.5931938055441824, + "grad_norm": 1.6200228929519653, + "learning_rate": 4.2825271270662207e-07, + "loss": 0.8808, + "step": 24630 + }, + { + "epoch": 0.5933142265359698, + "grad_norm": 1.5252360105514526, + "learning_rate": 4.281259507149376e-07, + "loss": 0.9197, + "step": 24635 + }, + { + "epoch": 0.593434647527757, + "grad_norm": 1.503819465637207, + "learning_rate": 4.279991887232532e-07, + "loss": 0.9064, + "step": 24640 + }, + { + "epoch": 0.5935550685195443, + "grad_norm": 1.649817705154419, + "learning_rate": 4.278724267315688e-07, + "loss": 0.9156, + "step": 24645 + }, + { + "epoch": 0.5936754895113316, + "grad_norm": 1.7362719774246216, + "learning_rate": 4.277456647398844e-07, + "loss": 0.9057, + "step": 24650 + }, + { + "epoch": 0.5937959105031189, + "grad_norm": 1.6223540306091309, + "learning_rate": 4.276189027481999e-07, + "loss": 0.9153, + "step": 24655 + }, + { + "epoch": 0.5939163314949062, + "grad_norm": 1.979633092880249, + "learning_rate": 4.2749214075651557e-07, + "loss": 0.92, + "step": 24660 + }, + { + "epoch": 0.5940367524866935, + "grad_norm": 1.5584977865219116, + "learning_rate": 4.2736537876483116e-07, + "loss": 0.9124, + "step": 24665 + }, + { + "epoch": 0.5941571734784807, + "grad_norm": 1.7202004194259644, + "learning_rate": 4.272386167731467e-07, + "loss": 0.8668, + "step": 24670 + }, + { + "epoch": 0.5942775944702681, + "grad_norm": 1.3186968564987183, + "learning_rate": 4.2711185478146235e-07, + "loss": 0.895, + "step": 24675 + }, + { + "epoch": 0.5943980154620554, + "grad_norm": 1.4132670164108276, + "learning_rate": 4.269850927897779e-07, + "loss": 0.8781, + "step": 24680 + }, + { + "epoch": 0.5945184364538426, + "grad_norm": 1.6592215299606323, + "learning_rate": 4.268583307980935e-07, + "loss": 0.9732, + "step": 24685 + }, + { + "epoch": 0.5946388574456299, + "grad_norm": 1.5422641038894653, + "learning_rate": 4.2673156880640907e-07, + "loss": 0.8949, + "step": 24690 + }, + { + "epoch": 0.5947592784374172, + "grad_norm": 1.4672731161117554, + "learning_rate": 4.2660480681472466e-07, + "loss": 0.8768, + "step": 24695 + }, + { + "epoch": 0.5948796994292045, + "grad_norm": 1.4263854026794434, + "learning_rate": 4.2647804482304025e-07, + "loss": 0.8618, + "step": 24700 + }, + { + "epoch": 0.5950001204209918, + "grad_norm": 1.4597587585449219, + "learning_rate": 4.2635128283135584e-07, + "loss": 0.9404, + "step": 24705 + }, + { + "epoch": 0.595120541412779, + "grad_norm": 1.6408240795135498, + "learning_rate": 4.2622452083967144e-07, + "loss": 0.9067, + "step": 24710 + }, + { + "epoch": 0.5952409624045664, + "grad_norm": 1.608472228050232, + "learning_rate": 4.26097758847987e-07, + "loss": 0.8881, + "step": 24715 + }, + { + "epoch": 0.5953613833963537, + "grad_norm": 1.5572516918182373, + "learning_rate": 4.259709968563026e-07, + "loss": 0.9285, + "step": 24720 + }, + { + "epoch": 0.5954818043881409, + "grad_norm": 1.5052424669265747, + "learning_rate": 4.2584423486461816e-07, + "loss": 0.8771, + "step": 24725 + }, + { + "epoch": 0.5956022253799282, + "grad_norm": 1.4403895139694214, + "learning_rate": 4.2571747287293375e-07, + "loss": 0.9194, + "step": 24730 + }, + { + "epoch": 0.5957226463717156, + "grad_norm": 1.5183978080749512, + "learning_rate": 4.255907108812494e-07, + "loss": 0.8902, + "step": 24735 + }, + { + "epoch": 0.5958430673635028, + "grad_norm": 1.5620477199554443, + "learning_rate": 4.2546394888956493e-07, + "loss": 0.8974, + "step": 24740 + }, + { + "epoch": 0.5959634883552901, + "grad_norm": 1.5473697185516357, + "learning_rate": 4.253371868978805e-07, + "loss": 0.9218, + "step": 24745 + }, + { + "epoch": 0.5960839093470773, + "grad_norm": 1.8315085172653198, + "learning_rate": 4.252104249061961e-07, + "loss": 0.8657, + "step": 24750 + }, + { + "epoch": 0.5962043303388647, + "grad_norm": 1.4640153646469116, + "learning_rate": 4.250836629145117e-07, + "loss": 0.8704, + "step": 24755 + }, + { + "epoch": 0.596324751330652, + "grad_norm": 1.628290057182312, + "learning_rate": 4.2495690092282725e-07, + "loss": 0.9884, + "step": 24760 + }, + { + "epoch": 0.5964451723224392, + "grad_norm": 1.4359389543533325, + "learning_rate": 4.248301389311429e-07, + "loss": 0.8803, + "step": 24765 + }, + { + "epoch": 0.5965655933142265, + "grad_norm": 1.5545084476470947, + "learning_rate": 4.2470337693945843e-07, + "loss": 0.8864, + "step": 24770 + }, + { + "epoch": 0.5966860143060139, + "grad_norm": 1.6062496900558472, + "learning_rate": 4.24576614947774e-07, + "loss": 0.9378, + "step": 24775 + }, + { + "epoch": 0.5968064352978011, + "grad_norm": 1.5714846849441528, + "learning_rate": 4.2444985295608967e-07, + "loss": 0.9166, + "step": 24780 + }, + { + "epoch": 0.5969268562895884, + "grad_norm": 1.3731664419174194, + "learning_rate": 4.243230909644052e-07, + "loss": 0.9121, + "step": 24785 + }, + { + "epoch": 0.5970472772813757, + "grad_norm": 1.2442537546157837, + "learning_rate": 4.241963289727208e-07, + "loss": 0.8615, + "step": 24790 + }, + { + "epoch": 0.597167698273163, + "grad_norm": 1.513586163520813, + "learning_rate": 4.240695669810364e-07, + "loss": 0.8859, + "step": 24795 + }, + { + "epoch": 0.5972881192649503, + "grad_norm": 1.4235986471176147, + "learning_rate": 4.23942804989352e-07, + "loss": 0.8807, + "step": 24800 + }, + { + "epoch": 0.5974085402567375, + "grad_norm": 1.383156657218933, + "learning_rate": 4.238160429976675e-07, + "loss": 0.911, + "step": 24805 + }, + { + "epoch": 0.5975289612485248, + "grad_norm": 1.4993770122528076, + "learning_rate": 4.2368928100598317e-07, + "loss": 0.8858, + "step": 24810 + }, + { + "epoch": 0.5976493822403122, + "grad_norm": 1.6158260107040405, + "learning_rate": 4.2356251901429876e-07, + "loss": 0.9184, + "step": 24815 + }, + { + "epoch": 0.5977698032320994, + "grad_norm": 1.532894253730774, + "learning_rate": 4.234357570226143e-07, + "loss": 0.9393, + "step": 24820 + }, + { + "epoch": 0.5978902242238867, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.2330899503092994e-07, + "loss": 0.9201, + "step": 24825 + }, + { + "epoch": 0.598010645215674, + "grad_norm": 1.6651484966278076, + "learning_rate": 4.231822330392455e-07, + "loss": 0.8663, + "step": 24830 + }, + { + "epoch": 0.5981310662074613, + "grad_norm": 1.5242916345596313, + "learning_rate": 4.2305547104756107e-07, + "loss": 0.9125, + "step": 24835 + }, + { + "epoch": 0.5982514871992486, + "grad_norm": 1.4305624961853027, + "learning_rate": 4.2292870905587666e-07, + "loss": 0.926, + "step": 24840 + }, + { + "epoch": 0.5983719081910358, + "grad_norm": 1.5143855810165405, + "learning_rate": 4.2280194706419226e-07, + "loss": 0.9227, + "step": 24845 + }, + { + "epoch": 0.5984923291828231, + "grad_norm": 1.9623805284500122, + "learning_rate": 4.2267518507250785e-07, + "loss": 0.9232, + "step": 24850 + }, + { + "epoch": 0.5986127501746105, + "grad_norm": 1.474603295326233, + "learning_rate": 4.2254842308082344e-07, + "loss": 0.9077, + "step": 24855 + }, + { + "epoch": 0.5987331711663977, + "grad_norm": 1.7623028755187988, + "learning_rate": 4.2242166108913903e-07, + "loss": 0.8742, + "step": 24860 + }, + { + "epoch": 0.598853592158185, + "grad_norm": 1.4490970373153687, + "learning_rate": 4.2229489909745457e-07, + "loss": 0.9129, + "step": 24865 + }, + { + "epoch": 0.5989740131499723, + "grad_norm": 1.5933712720870972, + "learning_rate": 4.221681371057702e-07, + "loss": 0.892, + "step": 24870 + }, + { + "epoch": 0.5990944341417596, + "grad_norm": 1.4571937322616577, + "learning_rate": 4.2204137511408575e-07, + "loss": 0.898, + "step": 24875 + }, + { + "epoch": 0.5992148551335469, + "grad_norm": 1.9972020387649536, + "learning_rate": 4.2191461312240135e-07, + "loss": 0.8579, + "step": 24880 + }, + { + "epoch": 0.5993352761253342, + "grad_norm": 1.5374258756637573, + "learning_rate": 4.21787851130717e-07, + "loss": 0.8947, + "step": 24885 + }, + { + "epoch": 0.5994556971171214, + "grad_norm": 1.6821030378341675, + "learning_rate": 4.2166108913903253e-07, + "loss": 0.965, + "step": 24890 + }, + { + "epoch": 0.5995761181089088, + "grad_norm": 1.6769107580184937, + "learning_rate": 4.215343271473481e-07, + "loss": 0.8754, + "step": 24895 + }, + { + "epoch": 0.599696539100696, + "grad_norm": 1.5854194164276123, + "learning_rate": 4.214075651556637e-07, + "loss": 0.9242, + "step": 24900 + }, + { + "epoch": 0.5998169600924833, + "grad_norm": 1.4867968559265137, + "learning_rate": 4.212808031639793e-07, + "loss": 0.9153, + "step": 24905 + }, + { + "epoch": 0.5999373810842706, + "grad_norm": 1.6546216011047363, + "learning_rate": 4.2115404117229484e-07, + "loss": 0.8559, + "step": 24910 + }, + { + "epoch": 0.6000578020760579, + "grad_norm": 1.4881573915481567, + "learning_rate": 4.210272791806105e-07, + "loss": 0.9238, + "step": 24915 + }, + { + "epoch": 0.6001782230678452, + "grad_norm": 1.7282466888427734, + "learning_rate": 4.2090051718892603e-07, + "loss": 0.9407, + "step": 24920 + }, + { + "epoch": 0.6002986440596325, + "grad_norm": 1.6105860471725464, + "learning_rate": 4.207737551972416e-07, + "loss": 0.9407, + "step": 24925 + }, + { + "epoch": 0.6004190650514197, + "grad_norm": 1.537200689315796, + "learning_rate": 4.2064699320555726e-07, + "loss": 0.8998, + "step": 24930 + }, + { + "epoch": 0.6005394860432071, + "grad_norm": 1.5517336130142212, + "learning_rate": 4.205202312138728e-07, + "loss": 0.9177, + "step": 24935 + }, + { + "epoch": 0.6006599070349944, + "grad_norm": 1.5445387363433838, + "learning_rate": 4.203934692221884e-07, + "loss": 0.9164, + "step": 24940 + }, + { + "epoch": 0.6007803280267816, + "grad_norm": 1.5406125783920288, + "learning_rate": 4.20266707230504e-07, + "loss": 0.9197, + "step": 24945 + }, + { + "epoch": 0.6009007490185689, + "grad_norm": 1.7499818801879883, + "learning_rate": 4.201399452388196e-07, + "loss": 0.8886, + "step": 24950 + }, + { + "epoch": 0.6010211700103562, + "grad_norm": 1.4955873489379883, + "learning_rate": 4.200131832471351e-07, + "loss": 0.9468, + "step": 24955 + }, + { + "epoch": 0.6011415910021435, + "grad_norm": 1.6411209106445312, + "learning_rate": 4.1988642125545076e-07, + "loss": 0.8876, + "step": 24960 + }, + { + "epoch": 0.6012620119939308, + "grad_norm": 1.583150863647461, + "learning_rate": 4.1975965926376635e-07, + "loss": 0.8628, + "step": 24965 + }, + { + "epoch": 0.601382432985718, + "grad_norm": 1.4690210819244385, + "learning_rate": 4.196328972720819e-07, + "loss": 0.9331, + "step": 24970 + }, + { + "epoch": 0.6015028539775054, + "grad_norm": 1.6037509441375732, + "learning_rate": 4.1950613528039754e-07, + "loss": 0.9168, + "step": 24975 + }, + { + "epoch": 0.6016232749692927, + "grad_norm": 1.454247236251831, + "learning_rate": 4.193793732887131e-07, + "loss": 0.9042, + "step": 24980 + }, + { + "epoch": 0.6017436959610799, + "grad_norm": 1.6040294170379639, + "learning_rate": 4.1925261129702867e-07, + "loss": 0.8736, + "step": 24985 + }, + { + "epoch": 0.6018641169528672, + "grad_norm": 1.4939274787902832, + "learning_rate": 4.1912584930534426e-07, + "loss": 0.8957, + "step": 24990 + }, + { + "epoch": 0.6019845379446546, + "grad_norm": 1.7517776489257812, + "learning_rate": 4.1899908731365985e-07, + "loss": 0.9518, + "step": 24995 + }, + { + "epoch": 0.6021049589364418, + "grad_norm": 1.6388598680496216, + "learning_rate": 4.1887232532197544e-07, + "loss": 0.8468, + "step": 25000 + }, + { + "epoch": 0.6022253799282291, + "grad_norm": 1.7947436571121216, + "learning_rate": 4.1874556333029103e-07, + "loss": 0.9134, + "step": 25005 + }, + { + "epoch": 0.6023458009200163, + "grad_norm": 1.5894259214401245, + "learning_rate": 4.186188013386066e-07, + "loss": 0.8828, + "step": 25010 + }, + { + "epoch": 0.6024662219118037, + "grad_norm": 1.5322158336639404, + "learning_rate": 4.1849203934692216e-07, + "loss": 0.8498, + "step": 25015 + }, + { + "epoch": 0.602586642903591, + "grad_norm": 1.5399775505065918, + "learning_rate": 4.183652773552378e-07, + "loss": 0.8729, + "step": 25020 + }, + { + "epoch": 0.6027070638953782, + "grad_norm": 1.4279295206069946, + "learning_rate": 4.1823851536355335e-07, + "loss": 0.8754, + "step": 25025 + }, + { + "epoch": 0.6028274848871655, + "grad_norm": 1.558327555656433, + "learning_rate": 4.1811175337186894e-07, + "loss": 0.9001, + "step": 25030 + }, + { + "epoch": 0.6029479058789529, + "grad_norm": 1.5234421491622925, + "learning_rate": 4.179849913801846e-07, + "loss": 0.9507, + "step": 25035 + }, + { + "epoch": 0.6030683268707401, + "grad_norm": 1.3733103275299072, + "learning_rate": 4.178582293885001e-07, + "loss": 0.8795, + "step": 25040 + }, + { + "epoch": 0.6031887478625274, + "grad_norm": 1.5034695863723755, + "learning_rate": 4.177314673968157e-07, + "loss": 0.8785, + "step": 25045 + }, + { + "epoch": 0.6033091688543147, + "grad_norm": 1.475675344467163, + "learning_rate": 4.176047054051313e-07, + "loss": 0.8879, + "step": 25050 + }, + { + "epoch": 0.603429589846102, + "grad_norm": 1.5570924282073975, + "learning_rate": 4.174779434134469e-07, + "loss": 0.8826, + "step": 25055 + }, + { + "epoch": 0.6035500108378893, + "grad_norm": 1.369581699371338, + "learning_rate": 4.1735118142176244e-07, + "loss": 0.9142, + "step": 25060 + }, + { + "epoch": 0.6036704318296765, + "grad_norm": 1.528016448020935, + "learning_rate": 4.172244194300781e-07, + "loss": 0.9121, + "step": 25065 + }, + { + "epoch": 0.6037908528214638, + "grad_norm": 1.5586113929748535, + "learning_rate": 4.170976574383936e-07, + "loss": 0.8676, + "step": 25070 + }, + { + "epoch": 0.6039112738132512, + "grad_norm": 1.7165907621383667, + "learning_rate": 4.169708954467092e-07, + "loss": 0.9142, + "step": 25075 + }, + { + "epoch": 0.6040316948050384, + "grad_norm": 1.5208547115325928, + "learning_rate": 4.1684413345502486e-07, + "loss": 0.8691, + "step": 25080 + }, + { + "epoch": 0.6041521157968257, + "grad_norm": 1.4242299795150757, + "learning_rate": 4.167173714633404e-07, + "loss": 0.8897, + "step": 25085 + }, + { + "epoch": 0.604272536788613, + "grad_norm": 1.5583271980285645, + "learning_rate": 4.16590609471656e-07, + "loss": 0.9203, + "step": 25090 + }, + { + "epoch": 0.6043929577804003, + "grad_norm": 1.7204725742340088, + "learning_rate": 4.164638474799716e-07, + "loss": 0.8859, + "step": 25095 + }, + { + "epoch": 0.6045133787721876, + "grad_norm": 1.7835016250610352, + "learning_rate": 4.1633708548828717e-07, + "loss": 0.9605, + "step": 25100 + }, + { + "epoch": 0.6046337997639749, + "grad_norm": 1.520129919052124, + "learning_rate": 4.162103234966027e-07, + "loss": 0.8718, + "step": 25105 + }, + { + "epoch": 0.6047542207557621, + "grad_norm": 1.5081130266189575, + "learning_rate": 4.1608356150491836e-07, + "loss": 0.9175, + "step": 25110 + }, + { + "epoch": 0.6048746417475495, + "grad_norm": 1.61761474609375, + "learning_rate": 4.1595679951323395e-07, + "loss": 0.885, + "step": 25115 + }, + { + "epoch": 0.6049950627393367, + "grad_norm": 1.576309323310852, + "learning_rate": 4.158300375215495e-07, + "loss": 0.8932, + "step": 25120 + }, + { + "epoch": 0.605115483731124, + "grad_norm": 1.5259867906570435, + "learning_rate": 4.1570327552986513e-07, + "loss": 0.8986, + "step": 25125 + }, + { + "epoch": 0.6052359047229113, + "grad_norm": 1.5326902866363525, + "learning_rate": 4.1557651353818067e-07, + "loss": 0.9337, + "step": 25130 + }, + { + "epoch": 0.6053563257146986, + "grad_norm": 1.5314621925354004, + "learning_rate": 4.1544975154649626e-07, + "loss": 0.8312, + "step": 25135 + }, + { + "epoch": 0.6054767467064859, + "grad_norm": 1.5467171669006348, + "learning_rate": 4.1532298955481185e-07, + "loss": 0.9158, + "step": 25140 + }, + { + "epoch": 0.6055971676982732, + "grad_norm": 1.5144315958023071, + "learning_rate": 4.1519622756312745e-07, + "loss": 0.8733, + "step": 25145 + }, + { + "epoch": 0.6057175886900604, + "grad_norm": 1.8202297687530518, + "learning_rate": 4.150694655714431e-07, + "loss": 0.9066, + "step": 25150 + }, + { + "epoch": 0.6058380096818478, + "grad_norm": 1.5166350603103638, + "learning_rate": 4.1494270357975863e-07, + "loss": 0.8701, + "step": 25155 + }, + { + "epoch": 0.605958430673635, + "grad_norm": 1.5550726652145386, + "learning_rate": 4.148159415880742e-07, + "loss": 0.9135, + "step": 25160 + }, + { + "epoch": 0.6060788516654223, + "grad_norm": 1.5523972511291504, + "learning_rate": 4.146891795963898e-07, + "loss": 0.9192, + "step": 25165 + }, + { + "epoch": 0.6061992726572096, + "grad_norm": 2.545870065689087, + "learning_rate": 4.145624176047054e-07, + "loss": 0.9183, + "step": 25170 + }, + { + "epoch": 0.6063196936489968, + "grad_norm": 1.58835768699646, + "learning_rate": 4.1443565561302094e-07, + "loss": 0.9499, + "step": 25175 + }, + { + "epoch": 0.6064401146407842, + "grad_norm": 1.5333999395370483, + "learning_rate": 4.143088936213366e-07, + "loss": 0.9539, + "step": 25180 + }, + { + "epoch": 0.6065605356325715, + "grad_norm": 1.4999562501907349, + "learning_rate": 4.141821316296522e-07, + "loss": 0.9189, + "step": 25185 + }, + { + "epoch": 0.6066809566243587, + "grad_norm": 1.7383499145507812, + "learning_rate": 4.140553696379677e-07, + "loss": 0.9103, + "step": 25190 + }, + { + "epoch": 0.606801377616146, + "grad_norm": 1.4625837802886963, + "learning_rate": 4.1392860764628336e-07, + "loss": 0.9116, + "step": 25195 + }, + { + "epoch": 0.6069217986079334, + "grad_norm": 1.3118094205856323, + "learning_rate": 4.138018456545989e-07, + "loss": 0.8632, + "step": 25200 + }, + { + "epoch": 0.6070422195997206, + "grad_norm": 1.6964633464813232, + "learning_rate": 4.136750836629145e-07, + "loss": 0.9179, + "step": 25205 + }, + { + "epoch": 0.6071626405915079, + "grad_norm": 1.8581135272979736, + "learning_rate": 4.135483216712301e-07, + "loss": 0.922, + "step": 25210 + }, + { + "epoch": 0.6072830615832951, + "grad_norm": 1.6567142009735107, + "learning_rate": 4.134215596795457e-07, + "loss": 0.9666, + "step": 25215 + }, + { + "epoch": 0.6074034825750825, + "grad_norm": 1.8037158250808716, + "learning_rate": 4.132947976878612e-07, + "loss": 0.9274, + "step": 25220 + }, + { + "epoch": 0.6075239035668698, + "grad_norm": 1.6046236753463745, + "learning_rate": 4.1316803569617686e-07, + "loss": 0.8757, + "step": 25225 + }, + { + "epoch": 0.607644324558657, + "grad_norm": 1.5918865203857422, + "learning_rate": 4.1304127370449245e-07, + "loss": 0.8372, + "step": 25230 + }, + { + "epoch": 0.6077647455504444, + "grad_norm": 1.3744317293167114, + "learning_rate": 4.12914511712808e-07, + "loss": 0.914, + "step": 25235 + }, + { + "epoch": 0.6078851665422317, + "grad_norm": 1.3174458742141724, + "learning_rate": 4.1278774972112364e-07, + "loss": 0.8677, + "step": 25240 + }, + { + "epoch": 0.6080055875340189, + "grad_norm": 1.6742388010025024, + "learning_rate": 4.126609877294392e-07, + "loss": 0.9147, + "step": 25245 + }, + { + "epoch": 0.6081260085258062, + "grad_norm": 1.6274393796920776, + "learning_rate": 4.1253422573775477e-07, + "loss": 0.9061, + "step": 25250 + }, + { + "epoch": 0.6082464295175936, + "grad_norm": 1.349190592765808, + "learning_rate": 4.1240746374607036e-07, + "loss": 0.8988, + "step": 25255 + }, + { + "epoch": 0.6083668505093808, + "grad_norm": 1.382152795791626, + "learning_rate": 4.1228070175438595e-07, + "loss": 0.9355, + "step": 25260 + }, + { + "epoch": 0.6084872715011681, + "grad_norm": 1.608886957168579, + "learning_rate": 4.1215393976270154e-07, + "loss": 0.9753, + "step": 25265 + }, + { + "epoch": 0.6086076924929553, + "grad_norm": 1.4702397584915161, + "learning_rate": 4.1202717777101713e-07, + "loss": 0.9265, + "step": 25270 + }, + { + "epoch": 0.6087281134847427, + "grad_norm": 1.6230460405349731, + "learning_rate": 4.119004157793327e-07, + "loss": 0.9075, + "step": 25275 + }, + { + "epoch": 0.60884853447653, + "grad_norm": 1.5647751092910767, + "learning_rate": 4.1177365378764827e-07, + "loss": 0.896, + "step": 25280 + }, + { + "epoch": 0.6089689554683172, + "grad_norm": 1.497682809829712, + "learning_rate": 4.116468917959639e-07, + "loss": 0.8999, + "step": 25285 + }, + { + "epoch": 0.6090893764601045, + "grad_norm": 1.3031469583511353, + "learning_rate": 4.1152012980427945e-07, + "loss": 0.8519, + "step": 25290 + }, + { + "epoch": 0.6092097974518919, + "grad_norm": 1.5770944356918335, + "learning_rate": 4.1139336781259504e-07, + "loss": 0.8997, + "step": 25295 + }, + { + "epoch": 0.6093302184436791, + "grad_norm": 1.457399606704712, + "learning_rate": 4.112666058209107e-07, + "loss": 0.8614, + "step": 25300 + }, + { + "epoch": 0.6094506394354664, + "grad_norm": 1.5790886878967285, + "learning_rate": 4.111398438292262e-07, + "loss": 0.8802, + "step": 25305 + }, + { + "epoch": 0.6095710604272537, + "grad_norm": 1.544673204421997, + "learning_rate": 4.110130818375418e-07, + "loss": 0.8845, + "step": 25310 + }, + { + "epoch": 0.609691481419041, + "grad_norm": 1.6734991073608398, + "learning_rate": 4.108863198458574e-07, + "loss": 0.9178, + "step": 25315 + }, + { + "epoch": 0.6098119024108283, + "grad_norm": 1.4224692583084106, + "learning_rate": 4.10759557854173e-07, + "loss": 0.8882, + "step": 25320 + }, + { + "epoch": 0.6099323234026155, + "grad_norm": 1.4789096117019653, + "learning_rate": 4.1063279586248854e-07, + "loss": 0.9298, + "step": 25325 + }, + { + "epoch": 0.6100527443944028, + "grad_norm": 1.5312858819961548, + "learning_rate": 4.105060338708042e-07, + "loss": 0.8349, + "step": 25330 + }, + { + "epoch": 0.6101731653861902, + "grad_norm": 1.6153223514556885, + "learning_rate": 4.103792718791197e-07, + "loss": 0.948, + "step": 25335 + }, + { + "epoch": 0.6102935863779774, + "grad_norm": 1.6232482194900513, + "learning_rate": 4.102525098874353e-07, + "loss": 0.9349, + "step": 25340 + }, + { + "epoch": 0.6104140073697647, + "grad_norm": 1.5309494733810425, + "learning_rate": 4.1012574789575096e-07, + "loss": 0.8721, + "step": 25345 + }, + { + "epoch": 0.610534428361552, + "grad_norm": 1.5360968112945557, + "learning_rate": 4.099989859040665e-07, + "loss": 0.8604, + "step": 25350 + }, + { + "epoch": 0.6106548493533392, + "grad_norm": 1.5799692869186401, + "learning_rate": 4.098722239123821e-07, + "loss": 0.9158, + "step": 25355 + }, + { + "epoch": 0.6107752703451266, + "grad_norm": 1.5045326948165894, + "learning_rate": 4.097454619206977e-07, + "loss": 0.9081, + "step": 25360 + }, + { + "epoch": 0.6108956913369139, + "grad_norm": 1.4619923830032349, + "learning_rate": 4.0961869992901327e-07, + "loss": 0.9109, + "step": 25365 + }, + { + "epoch": 0.6110161123287011, + "grad_norm": 1.5276981592178345, + "learning_rate": 4.094919379373288e-07, + "loss": 0.9295, + "step": 25370 + }, + { + "epoch": 0.6111365333204885, + "grad_norm": 1.6464943885803223, + "learning_rate": 4.0936517594564446e-07, + "loss": 0.8637, + "step": 25375 + }, + { + "epoch": 0.6112569543122757, + "grad_norm": 1.5115057229995728, + "learning_rate": 4.0923841395396005e-07, + "loss": 0.9234, + "step": 25380 + }, + { + "epoch": 0.611377375304063, + "grad_norm": 1.4718031883239746, + "learning_rate": 4.091116519622756e-07, + "loss": 0.8478, + "step": 25385 + }, + { + "epoch": 0.6114977962958503, + "grad_norm": 1.651680827140808, + "learning_rate": 4.0898488997059123e-07, + "loss": 0.95, + "step": 25390 + }, + { + "epoch": 0.6116182172876375, + "grad_norm": 1.441233515739441, + "learning_rate": 4.0885812797890677e-07, + "loss": 0.8692, + "step": 25395 + }, + { + "epoch": 0.6117386382794249, + "grad_norm": 1.785135269165039, + "learning_rate": 4.0873136598722236e-07, + "loss": 0.9518, + "step": 25400 + }, + { + "epoch": 0.6118590592712122, + "grad_norm": 1.4425971508026123, + "learning_rate": 4.0860460399553795e-07, + "loss": 0.8859, + "step": 25405 + }, + { + "epoch": 0.6119794802629994, + "grad_norm": 1.6750375032424927, + "learning_rate": 4.0847784200385355e-07, + "loss": 0.9056, + "step": 25410 + }, + { + "epoch": 0.6120999012547867, + "grad_norm": 1.4933054447174072, + "learning_rate": 4.0835108001216914e-07, + "loss": 0.8847, + "step": 25415 + }, + { + "epoch": 0.6122203222465741, + "grad_norm": 1.4828130006790161, + "learning_rate": 4.0822431802048473e-07, + "loss": 0.916, + "step": 25420 + }, + { + "epoch": 0.6123407432383613, + "grad_norm": 1.4448007345199585, + "learning_rate": 4.080975560288003e-07, + "loss": 0.8932, + "step": 25425 + }, + { + "epoch": 0.6124611642301486, + "grad_norm": 1.5228817462921143, + "learning_rate": 4.0797079403711586e-07, + "loss": 0.8805, + "step": 25430 + }, + { + "epoch": 0.6125815852219358, + "grad_norm": 1.779428482055664, + "learning_rate": 4.078440320454315e-07, + "loss": 0.9489, + "step": 25435 + }, + { + "epoch": 0.6127020062137232, + "grad_norm": 1.5332070589065552, + "learning_rate": 4.0771727005374704e-07, + "loss": 0.8749, + "step": 25440 + }, + { + "epoch": 0.6128224272055105, + "grad_norm": 1.5100640058517456, + "learning_rate": 4.0759050806206264e-07, + "loss": 0.8826, + "step": 25445 + }, + { + "epoch": 0.6129428481972977, + "grad_norm": 1.4698293209075928, + "learning_rate": 4.074637460703783e-07, + "loss": 0.9082, + "step": 25450 + }, + { + "epoch": 0.613063269189085, + "grad_norm": 1.4528988599777222, + "learning_rate": 4.073369840786938e-07, + "loss": 0.9264, + "step": 25455 + }, + { + "epoch": 0.6131836901808724, + "grad_norm": 1.6008024215698242, + "learning_rate": 4.072102220870094e-07, + "loss": 0.8836, + "step": 25460 + }, + { + "epoch": 0.6133041111726596, + "grad_norm": 1.3941173553466797, + "learning_rate": 4.07083460095325e-07, + "loss": 0.8613, + "step": 25465 + }, + { + "epoch": 0.6134245321644469, + "grad_norm": 1.6055837869644165, + "learning_rate": 4.069566981036406e-07, + "loss": 0.9142, + "step": 25470 + }, + { + "epoch": 0.6135449531562341, + "grad_norm": 1.5556641817092896, + "learning_rate": 4.0682993611195613e-07, + "loss": 0.8544, + "step": 25475 + }, + { + "epoch": 0.6136653741480215, + "grad_norm": 1.5129414796829224, + "learning_rate": 4.067031741202718e-07, + "loss": 0.9043, + "step": 25480 + }, + { + "epoch": 0.6137857951398088, + "grad_norm": 1.6764026880264282, + "learning_rate": 4.065764121285873e-07, + "loss": 0.8123, + "step": 25485 + }, + { + "epoch": 0.613906216131596, + "grad_norm": 1.4478250741958618, + "learning_rate": 4.064496501369029e-07, + "loss": 0.8387, + "step": 25490 + }, + { + "epoch": 0.6140266371233833, + "grad_norm": 1.7998301982879639, + "learning_rate": 4.0632288814521855e-07, + "loss": 0.8784, + "step": 25495 + }, + { + "epoch": 0.6141470581151707, + "grad_norm": 1.9295172691345215, + "learning_rate": 4.061961261535341e-07, + "loss": 0.9652, + "step": 25500 + }, + { + "epoch": 0.6142674791069579, + "grad_norm": 1.382350206375122, + "learning_rate": 4.060693641618497e-07, + "loss": 0.8637, + "step": 25505 + }, + { + "epoch": 0.6143879000987452, + "grad_norm": 1.5238209962844849, + "learning_rate": 4.059426021701653e-07, + "loss": 0.892, + "step": 25510 + }, + { + "epoch": 0.6145083210905325, + "grad_norm": 1.3008952140808105, + "learning_rate": 4.0581584017848087e-07, + "loss": 0.8638, + "step": 25515 + }, + { + "epoch": 0.6146287420823198, + "grad_norm": 1.6505320072174072, + "learning_rate": 4.056890781867964e-07, + "loss": 0.9282, + "step": 25520 + }, + { + "epoch": 0.6147491630741071, + "grad_norm": 1.905221700668335, + "learning_rate": 4.0556231619511205e-07, + "loss": 0.8916, + "step": 25525 + }, + { + "epoch": 0.6148695840658943, + "grad_norm": 1.665234923362732, + "learning_rate": 4.0543555420342764e-07, + "loss": 0.911, + "step": 25530 + }, + { + "epoch": 0.6149900050576816, + "grad_norm": 1.7937983274459839, + "learning_rate": 4.053087922117432e-07, + "loss": 0.9336, + "step": 25535 + }, + { + "epoch": 0.615110426049469, + "grad_norm": 1.494798183441162, + "learning_rate": 4.0518203022005883e-07, + "loss": 0.8983, + "step": 25540 + }, + { + "epoch": 0.6152308470412562, + "grad_norm": 1.6080095767974854, + "learning_rate": 4.0505526822837437e-07, + "loss": 0.8837, + "step": 25545 + }, + { + "epoch": 0.6153512680330435, + "grad_norm": 1.746788740158081, + "learning_rate": 4.0492850623668996e-07, + "loss": 0.8431, + "step": 25550 + }, + { + "epoch": 0.6154716890248308, + "grad_norm": 1.5956007242202759, + "learning_rate": 4.0480174424500555e-07, + "loss": 0.8483, + "step": 25555 + }, + { + "epoch": 0.6155921100166181, + "grad_norm": 1.6954641342163086, + "learning_rate": 4.0467498225332114e-07, + "loss": 0.8781, + "step": 25560 + }, + { + "epoch": 0.6157125310084054, + "grad_norm": 1.6233452558517456, + "learning_rate": 4.0454822026163673e-07, + "loss": 0.9598, + "step": 25565 + }, + { + "epoch": 0.6158329520001927, + "grad_norm": 1.3930495977401733, + "learning_rate": 4.044214582699523e-07, + "loss": 0.8972, + "step": 25570 + }, + { + "epoch": 0.6159533729919799, + "grad_norm": 1.6070493459701538, + "learning_rate": 4.042946962782679e-07, + "loss": 0.896, + "step": 25575 + }, + { + "epoch": 0.6160737939837673, + "grad_norm": 1.5206298828125, + "learning_rate": 4.0416793428658346e-07, + "loss": 0.8612, + "step": 25580 + }, + { + "epoch": 0.6161942149755545, + "grad_norm": 1.558423399925232, + "learning_rate": 4.040411722948991e-07, + "loss": 0.8981, + "step": 25585 + }, + { + "epoch": 0.6163146359673418, + "grad_norm": 1.5652716159820557, + "learning_rate": 4.0391441030321464e-07, + "loss": 0.8786, + "step": 25590 + }, + { + "epoch": 0.6164350569591291, + "grad_norm": 1.5595407485961914, + "learning_rate": 4.0378764831153023e-07, + "loss": 0.934, + "step": 25595 + }, + { + "epoch": 0.6165554779509164, + "grad_norm": 1.6016736030578613, + "learning_rate": 4.036608863198459e-07, + "loss": 0.9026, + "step": 25600 + }, + { + "epoch": 0.6166758989427037, + "grad_norm": 1.8003485202789307, + "learning_rate": 4.035341243281614e-07, + "loss": 0.8784, + "step": 25605 + }, + { + "epoch": 0.616796319934491, + "grad_norm": 1.4704478979110718, + "learning_rate": 4.03407362336477e-07, + "loss": 0.8715, + "step": 25610 + }, + { + "epoch": 0.6169167409262782, + "grad_norm": 1.592137098312378, + "learning_rate": 4.032806003447926e-07, + "loss": 0.8437, + "step": 25615 + }, + { + "epoch": 0.6170371619180656, + "grad_norm": 1.5061300992965698, + "learning_rate": 4.031538383531082e-07, + "loss": 0.9236, + "step": 25620 + }, + { + "epoch": 0.6171575829098529, + "grad_norm": 1.4585505723953247, + "learning_rate": 4.0302707636142373e-07, + "loss": 0.9172, + "step": 25625 + }, + { + "epoch": 0.6172780039016401, + "grad_norm": 1.784454584121704, + "learning_rate": 4.0290031436973937e-07, + "loss": 0.9294, + "step": 25630 + }, + { + "epoch": 0.6173984248934274, + "grad_norm": 1.5327225923538208, + "learning_rate": 4.027735523780549e-07, + "loss": 0.894, + "step": 25635 + }, + { + "epoch": 0.6175188458852147, + "grad_norm": 1.559232473373413, + "learning_rate": 4.026467903863705e-07, + "loss": 0.898, + "step": 25640 + }, + { + "epoch": 0.617639266877002, + "grad_norm": 1.4772984981536865, + "learning_rate": 4.0252002839468615e-07, + "loss": 0.9417, + "step": 25645 + }, + { + "epoch": 0.6177596878687893, + "grad_norm": 1.6661555767059326, + "learning_rate": 4.023932664030017e-07, + "loss": 0.8945, + "step": 25650 + }, + { + "epoch": 0.6178801088605765, + "grad_norm": 1.5550674200057983, + "learning_rate": 4.0226650441131733e-07, + "loss": 0.8786, + "step": 25655 + }, + { + "epoch": 0.6180005298523639, + "grad_norm": 1.5687839984893799, + "learning_rate": 4.0213974241963287e-07, + "loss": 0.9566, + "step": 25660 + }, + { + "epoch": 0.6181209508441512, + "grad_norm": 1.4756946563720703, + "learning_rate": 4.0201298042794846e-07, + "loss": 0.89, + "step": 25665 + }, + { + "epoch": 0.6182413718359384, + "grad_norm": 1.5157841444015503, + "learning_rate": 4.0188621843626405e-07, + "loss": 0.9546, + "step": 25670 + }, + { + "epoch": 0.6183617928277257, + "grad_norm": 1.4619146585464478, + "learning_rate": 4.0175945644457965e-07, + "loss": 0.9052, + "step": 25675 + }, + { + "epoch": 0.6184822138195131, + "grad_norm": 1.655191421508789, + "learning_rate": 4.0163269445289524e-07, + "loss": 0.8417, + "step": 25680 + }, + { + "epoch": 0.6186026348113003, + "grad_norm": 1.5669910907745361, + "learning_rate": 4.0150593246121083e-07, + "loss": 0.8885, + "step": 25685 + }, + { + "epoch": 0.6187230558030876, + "grad_norm": 1.7892998456954956, + "learning_rate": 4.013791704695264e-07, + "loss": 0.8722, + "step": 25690 + }, + { + "epoch": 0.6188434767948748, + "grad_norm": 1.5242003202438354, + "learning_rate": 4.0125240847784196e-07, + "loss": 0.8916, + "step": 25695 + }, + { + "epoch": 0.6189638977866622, + "grad_norm": 1.4352613687515259, + "learning_rate": 4.011256464861576e-07, + "loss": 0.9305, + "step": 25700 + }, + { + "epoch": 0.6190843187784495, + "grad_norm": 1.5476058721542358, + "learning_rate": 4.0099888449447314e-07, + "loss": 0.9093, + "step": 25705 + }, + { + "epoch": 0.6192047397702367, + "grad_norm": 1.5854041576385498, + "learning_rate": 4.0087212250278874e-07, + "loss": 0.8884, + "step": 25710 + }, + { + "epoch": 0.619325160762024, + "grad_norm": 1.4123213291168213, + "learning_rate": 4.007453605111044e-07, + "loss": 0.918, + "step": 25715 + }, + { + "epoch": 0.6194455817538114, + "grad_norm": 1.6956759691238403, + "learning_rate": 4.006185985194199e-07, + "loss": 0.8959, + "step": 25720 + }, + { + "epoch": 0.6195660027455986, + "grad_norm": 1.5501787662506104, + "learning_rate": 4.004918365277355e-07, + "loss": 0.8972, + "step": 25725 + }, + { + "epoch": 0.6196864237373859, + "grad_norm": 1.5617278814315796, + "learning_rate": 4.003650745360511e-07, + "loss": 0.885, + "step": 25730 + }, + { + "epoch": 0.6198068447291731, + "grad_norm": 1.554679274559021, + "learning_rate": 4.002383125443667e-07, + "loss": 0.9297, + "step": 25735 + }, + { + "epoch": 0.6199272657209605, + "grad_norm": 1.5556864738464355, + "learning_rate": 4.0011155055268223e-07, + "loss": 0.9393, + "step": 25740 + }, + { + "epoch": 0.6200476867127478, + "grad_norm": 1.5048941373825073, + "learning_rate": 3.999847885609979e-07, + "loss": 0.882, + "step": 25745 + }, + { + "epoch": 0.620168107704535, + "grad_norm": 1.519623041152954, + "learning_rate": 3.9985802656931347e-07, + "loss": 0.884, + "step": 25750 + }, + { + "epoch": 0.6202885286963223, + "grad_norm": 1.8814226388931274, + "learning_rate": 3.99731264577629e-07, + "loss": 0.907, + "step": 25755 + }, + { + "epoch": 0.6204089496881097, + "grad_norm": 1.719388723373413, + "learning_rate": 3.9960450258594465e-07, + "loss": 0.8555, + "step": 25760 + }, + { + "epoch": 0.6205293706798969, + "grad_norm": 1.635589361190796, + "learning_rate": 3.994777405942602e-07, + "loss": 0.9124, + "step": 25765 + }, + { + "epoch": 0.6206497916716842, + "grad_norm": 1.462107539176941, + "learning_rate": 3.993509786025758e-07, + "loss": 0.8733, + "step": 25770 + }, + { + "epoch": 0.6207702126634715, + "grad_norm": 1.593778371810913, + "learning_rate": 3.992242166108914e-07, + "loss": 0.8644, + "step": 25775 + }, + { + "epoch": 0.6208906336552588, + "grad_norm": 1.617329716682434, + "learning_rate": 3.9909745461920697e-07, + "loss": 0.8413, + "step": 25780 + }, + { + "epoch": 0.6210110546470461, + "grad_norm": 1.8050616979599, + "learning_rate": 3.989706926275225e-07, + "loss": 0.904, + "step": 25785 + }, + { + "epoch": 0.6211314756388333, + "grad_norm": 1.5802415609359741, + "learning_rate": 3.9884393063583815e-07, + "loss": 0.859, + "step": 25790 + }, + { + "epoch": 0.6212518966306206, + "grad_norm": 1.4133882522583008, + "learning_rate": 3.9871716864415374e-07, + "loss": 0.8923, + "step": 25795 + }, + { + "epoch": 0.621372317622408, + "grad_norm": 1.67210054397583, + "learning_rate": 3.985904066524693e-07, + "loss": 0.8528, + "step": 25800 + }, + { + "epoch": 0.6214927386141952, + "grad_norm": 1.5611308813095093, + "learning_rate": 3.9846364466078493e-07, + "loss": 0.8864, + "step": 25805 + }, + { + "epoch": 0.6216131596059825, + "grad_norm": 1.5887681245803833, + "learning_rate": 3.9833688266910047e-07, + "loss": 0.8617, + "step": 25810 + }, + { + "epoch": 0.6217335805977698, + "grad_norm": 1.3824416399002075, + "learning_rate": 3.9821012067741606e-07, + "loss": 0.8854, + "step": 25815 + }, + { + "epoch": 0.6218540015895571, + "grad_norm": 1.5376073122024536, + "learning_rate": 3.9808335868573165e-07, + "loss": 0.9021, + "step": 25820 + }, + { + "epoch": 0.6219744225813444, + "grad_norm": 1.580991506576538, + "learning_rate": 3.9795659669404724e-07, + "loss": 0.9195, + "step": 25825 + }, + { + "epoch": 0.6220948435731317, + "grad_norm": 1.452588677406311, + "learning_rate": 3.9782983470236283e-07, + "loss": 0.9412, + "step": 25830 + }, + { + "epoch": 0.6222152645649189, + "grad_norm": 1.4612324237823486, + "learning_rate": 3.977030727106784e-07, + "loss": 0.8796, + "step": 25835 + }, + { + "epoch": 0.6223356855567063, + "grad_norm": 1.5148411989212036, + "learning_rate": 3.97576310718994e-07, + "loss": 0.8432, + "step": 25840 + }, + { + "epoch": 0.6224561065484935, + "grad_norm": 1.6802312135696411, + "learning_rate": 3.9744954872730956e-07, + "loss": 0.8767, + "step": 25845 + }, + { + "epoch": 0.6225765275402808, + "grad_norm": 1.4846410751342773, + "learning_rate": 3.973227867356252e-07, + "loss": 0.8864, + "step": 25850 + }, + { + "epoch": 0.6226969485320681, + "grad_norm": 1.74970543384552, + "learning_rate": 3.9719602474394074e-07, + "loss": 0.9246, + "step": 25855 + }, + { + "epoch": 0.6228173695238554, + "grad_norm": 1.4260430335998535, + "learning_rate": 3.9706926275225633e-07, + "loss": 0.8989, + "step": 25860 + }, + { + "epoch": 0.6229377905156427, + "grad_norm": 1.3911371231079102, + "learning_rate": 3.96942500760572e-07, + "loss": 0.8305, + "step": 25865 + }, + { + "epoch": 0.62305821150743, + "grad_norm": 1.4906085729599, + "learning_rate": 3.968157387688875e-07, + "loss": 0.9143, + "step": 25870 + }, + { + "epoch": 0.6231786324992172, + "grad_norm": 1.4780354499816895, + "learning_rate": 3.966889767772031e-07, + "loss": 0.8974, + "step": 25875 + }, + { + "epoch": 0.6232990534910046, + "grad_norm": 1.6314855813980103, + "learning_rate": 3.965622147855187e-07, + "loss": 0.9217, + "step": 25880 + }, + { + "epoch": 0.6234194744827919, + "grad_norm": 1.570934772491455, + "learning_rate": 3.964354527938343e-07, + "loss": 0.9263, + "step": 25885 + }, + { + "epoch": 0.6235398954745791, + "grad_norm": 1.472892165184021, + "learning_rate": 3.9630869080214983e-07, + "loss": 0.9212, + "step": 25890 + }, + { + "epoch": 0.6236603164663664, + "grad_norm": 1.8066248893737793, + "learning_rate": 3.9618192881046547e-07, + "loss": 0.8877, + "step": 25895 + }, + { + "epoch": 0.6237807374581537, + "grad_norm": 1.555860161781311, + "learning_rate": 3.9605516681878107e-07, + "loss": 0.8556, + "step": 25900 + }, + { + "epoch": 0.623901158449941, + "grad_norm": 1.4210270643234253, + "learning_rate": 3.959284048270966e-07, + "loss": 0.874, + "step": 25905 + }, + { + "epoch": 0.6240215794417283, + "grad_norm": 1.781644344329834, + "learning_rate": 3.9580164283541225e-07, + "loss": 0.8762, + "step": 25910 + }, + { + "epoch": 0.6241420004335155, + "grad_norm": 1.4464399814605713, + "learning_rate": 3.956748808437278e-07, + "loss": 0.8866, + "step": 25915 + }, + { + "epoch": 0.6242624214253029, + "grad_norm": 1.4830899238586426, + "learning_rate": 3.955481188520434e-07, + "loss": 0.9026, + "step": 25920 + }, + { + "epoch": 0.6243828424170902, + "grad_norm": 1.7051576375961304, + "learning_rate": 3.9542135686035897e-07, + "loss": 0.8593, + "step": 25925 + }, + { + "epoch": 0.6245032634088774, + "grad_norm": 1.7752699851989746, + "learning_rate": 3.9529459486867456e-07, + "loss": 0.8789, + "step": 25930 + }, + { + "epoch": 0.6246236844006647, + "grad_norm": 1.7253508567810059, + "learning_rate": 3.951678328769901e-07, + "loss": 0.897, + "step": 25935 + }, + { + "epoch": 0.6247441053924521, + "grad_norm": 1.6633070707321167, + "learning_rate": 3.9504107088530575e-07, + "loss": 0.8794, + "step": 25940 + }, + { + "epoch": 0.6248645263842393, + "grad_norm": 1.5716090202331543, + "learning_rate": 3.9491430889362134e-07, + "loss": 0.8672, + "step": 25945 + }, + { + "epoch": 0.6249849473760266, + "grad_norm": 1.3929953575134277, + "learning_rate": 3.947875469019369e-07, + "loss": 0.8398, + "step": 25950 + }, + { + "epoch": 0.6251053683678138, + "grad_norm": 1.645712971687317, + "learning_rate": 3.946607849102525e-07, + "loss": 0.9471, + "step": 25955 + }, + { + "epoch": 0.6252257893596012, + "grad_norm": 1.4748599529266357, + "learning_rate": 3.9453402291856806e-07, + "loss": 0.9099, + "step": 25960 + }, + { + "epoch": 0.6253462103513885, + "grad_norm": 1.6290721893310547, + "learning_rate": 3.9440726092688365e-07, + "loss": 0.9081, + "step": 25965 + }, + { + "epoch": 0.6254666313431757, + "grad_norm": 1.4641081094741821, + "learning_rate": 3.9428049893519924e-07, + "loss": 0.884, + "step": 25970 + }, + { + "epoch": 0.625587052334963, + "grad_norm": 1.4148682355880737, + "learning_rate": 3.9415373694351484e-07, + "loss": 0.921, + "step": 25975 + }, + { + "epoch": 0.6257074733267504, + "grad_norm": 1.4865994453430176, + "learning_rate": 3.9402697495183043e-07, + "loss": 0.9002, + "step": 25980 + }, + { + "epoch": 0.6258278943185376, + "grad_norm": 2.53155779838562, + "learning_rate": 3.93900212960146e-07, + "loss": 0.8965, + "step": 25985 + }, + { + "epoch": 0.6259483153103249, + "grad_norm": 1.5487921237945557, + "learning_rate": 3.937734509684616e-07, + "loss": 0.9097, + "step": 25990 + }, + { + "epoch": 0.6260687363021122, + "grad_norm": 1.5251574516296387, + "learning_rate": 3.9364668897677715e-07, + "loss": 0.8939, + "step": 25995 + }, + { + "epoch": 0.6261891572938995, + "grad_norm": 1.7059253454208374, + "learning_rate": 3.935199269850928e-07, + "loss": 0.9429, + "step": 26000 + }, + { + "epoch": 0.6263095782856868, + "grad_norm": 1.45933997631073, + "learning_rate": 3.9339316499340833e-07, + "loss": 0.9006, + "step": 26005 + }, + { + "epoch": 0.626429999277474, + "grad_norm": 1.5664831399917603, + "learning_rate": 3.932664030017239e-07, + "loss": 0.8552, + "step": 26010 + }, + { + "epoch": 0.6265504202692613, + "grad_norm": 1.6096681356430054, + "learning_rate": 3.9313964101003957e-07, + "loss": 0.9251, + "step": 26015 + }, + { + "epoch": 0.6266708412610487, + "grad_norm": 1.4551560878753662, + "learning_rate": 3.930128790183551e-07, + "loss": 0.9036, + "step": 26020 + }, + { + "epoch": 0.6267912622528359, + "grad_norm": 1.4428728818893433, + "learning_rate": 3.928861170266707e-07, + "loss": 0.8899, + "step": 26025 + }, + { + "epoch": 0.6269116832446232, + "grad_norm": 1.5575615167617798, + "learning_rate": 3.927593550349863e-07, + "loss": 0.9135, + "step": 26030 + }, + { + "epoch": 0.6270321042364105, + "grad_norm": 1.4295635223388672, + "learning_rate": 3.926325930433019e-07, + "loss": 0.8989, + "step": 26035 + }, + { + "epoch": 0.6271525252281978, + "grad_norm": 1.672738790512085, + "learning_rate": 3.925058310516174e-07, + "loss": 0.9512, + "step": 26040 + }, + { + "epoch": 0.6272729462199851, + "grad_norm": 1.6317218542099, + "learning_rate": 3.9237906905993307e-07, + "loss": 0.9122, + "step": 26045 + }, + { + "epoch": 0.6273933672117723, + "grad_norm": 1.7149540185928345, + "learning_rate": 3.922523070682486e-07, + "loss": 0.8781, + "step": 26050 + }, + { + "epoch": 0.6275137882035596, + "grad_norm": 1.5915231704711914, + "learning_rate": 3.921255450765642e-07, + "loss": 0.916, + "step": 26055 + }, + { + "epoch": 0.627634209195347, + "grad_norm": 1.4769103527069092, + "learning_rate": 3.9199878308487984e-07, + "loss": 0.8903, + "step": 26060 + }, + { + "epoch": 0.6277546301871342, + "grad_norm": 1.4845454692840576, + "learning_rate": 3.918720210931954e-07, + "loss": 0.8652, + "step": 26065 + }, + { + "epoch": 0.6278750511789215, + "grad_norm": 1.4999992847442627, + "learning_rate": 3.91745259101511e-07, + "loss": 0.9442, + "step": 26070 + }, + { + "epoch": 0.6279954721707088, + "grad_norm": 1.6089973449707031, + "learning_rate": 3.9161849710982657e-07, + "loss": 0.9395, + "step": 26075 + }, + { + "epoch": 0.628115893162496, + "grad_norm": 1.5661829710006714, + "learning_rate": 3.9149173511814216e-07, + "loss": 0.8413, + "step": 26080 + }, + { + "epoch": 0.6282363141542834, + "grad_norm": 1.571110725402832, + "learning_rate": 3.913649731264577e-07, + "loss": 0.8922, + "step": 26085 + }, + { + "epoch": 0.6283567351460707, + "grad_norm": 1.6855576038360596, + "learning_rate": 3.9123821113477334e-07, + "loss": 0.9262, + "step": 26090 + }, + { + "epoch": 0.6284771561378579, + "grad_norm": 1.6715831756591797, + "learning_rate": 3.9111144914308893e-07, + "loss": 0.8839, + "step": 26095 + }, + { + "epoch": 0.6285975771296453, + "grad_norm": 1.7230852842330933, + "learning_rate": 3.9098468715140447e-07, + "loss": 0.8953, + "step": 26100 + }, + { + "epoch": 0.6287179981214325, + "grad_norm": 1.8735923767089844, + "learning_rate": 3.908579251597201e-07, + "loss": 0.8738, + "step": 26105 + }, + { + "epoch": 0.6288384191132198, + "grad_norm": 1.372200608253479, + "learning_rate": 3.9073116316803566e-07, + "loss": 0.9413, + "step": 26110 + }, + { + "epoch": 0.6289588401050071, + "grad_norm": 1.5144858360290527, + "learning_rate": 3.9060440117635125e-07, + "loss": 0.9318, + "step": 26115 + }, + { + "epoch": 0.6290792610967944, + "grad_norm": 1.5318498611450195, + "learning_rate": 3.9047763918466684e-07, + "loss": 0.9534, + "step": 26120 + }, + { + "epoch": 0.6291996820885817, + "grad_norm": 1.483949065208435, + "learning_rate": 3.9035087719298243e-07, + "loss": 0.9023, + "step": 26125 + }, + { + "epoch": 0.629320103080369, + "grad_norm": 1.7388157844543457, + "learning_rate": 3.90224115201298e-07, + "loss": 0.9406, + "step": 26130 + }, + { + "epoch": 0.6294405240721562, + "grad_norm": 1.5275570154190063, + "learning_rate": 3.900973532096136e-07, + "loss": 0.8946, + "step": 26135 + }, + { + "epoch": 0.6295609450639436, + "grad_norm": 1.5564861297607422, + "learning_rate": 3.899705912179292e-07, + "loss": 0.8627, + "step": 26140 + }, + { + "epoch": 0.6296813660557309, + "grad_norm": 1.4270212650299072, + "learning_rate": 3.8984382922624475e-07, + "loss": 0.9366, + "step": 26145 + }, + { + "epoch": 0.6298017870475181, + "grad_norm": 1.545400619506836, + "learning_rate": 3.897170672345604e-07, + "loss": 0.9127, + "step": 26150 + }, + { + "epoch": 0.6299222080393054, + "grad_norm": 1.3926692008972168, + "learning_rate": 3.8959030524287593e-07, + "loss": 0.9545, + "step": 26155 + }, + { + "epoch": 0.6300426290310926, + "grad_norm": 1.4838570356369019, + "learning_rate": 3.894635432511915e-07, + "loss": 0.8884, + "step": 26160 + }, + { + "epoch": 0.63016305002288, + "grad_norm": 1.6471329927444458, + "learning_rate": 3.8933678125950717e-07, + "loss": 0.8492, + "step": 26165 + }, + { + "epoch": 0.6302834710146673, + "grad_norm": 1.645909070968628, + "learning_rate": 3.892100192678227e-07, + "loss": 0.9108, + "step": 26170 + }, + { + "epoch": 0.6304038920064545, + "grad_norm": 1.407267689704895, + "learning_rate": 3.8908325727613835e-07, + "loss": 0.9539, + "step": 26175 + }, + { + "epoch": 0.6305243129982419, + "grad_norm": 1.7201238870620728, + "learning_rate": 3.889564952844539e-07, + "loss": 0.8818, + "step": 26180 + }, + { + "epoch": 0.6306447339900292, + "grad_norm": 1.3268872499465942, + "learning_rate": 3.888297332927695e-07, + "loss": 0.9351, + "step": 26185 + }, + { + "epoch": 0.6307651549818164, + "grad_norm": 1.6392351388931274, + "learning_rate": 3.8870297130108507e-07, + "loss": 0.985, + "step": 26190 + }, + { + "epoch": 0.6308855759736037, + "grad_norm": 1.5395928621292114, + "learning_rate": 3.8857620930940066e-07, + "loss": 0.9171, + "step": 26195 + }, + { + "epoch": 0.6310059969653911, + "grad_norm": 1.3914449214935303, + "learning_rate": 3.884494473177162e-07, + "loss": 0.8717, + "step": 26200 + }, + { + "epoch": 0.6311264179571783, + "grad_norm": 1.6061489582061768, + "learning_rate": 3.8832268532603185e-07, + "loss": 0.8801, + "step": 26205 + }, + { + "epoch": 0.6312468389489656, + "grad_norm": 1.565010666847229, + "learning_rate": 3.8819592333434744e-07, + "loss": 0.8897, + "step": 26210 + }, + { + "epoch": 0.6313672599407528, + "grad_norm": 1.7284237146377563, + "learning_rate": 3.88069161342663e-07, + "loss": 0.9043, + "step": 26215 + }, + { + "epoch": 0.6314876809325402, + "grad_norm": 1.5633896589279175, + "learning_rate": 3.879423993509786e-07, + "loss": 0.889, + "step": 26220 + }, + { + "epoch": 0.6316081019243275, + "grad_norm": 1.578115701675415, + "learning_rate": 3.8781563735929416e-07, + "loss": 0.9369, + "step": 26225 + }, + { + "epoch": 0.6317285229161147, + "grad_norm": 1.5071097612380981, + "learning_rate": 3.8768887536760975e-07, + "loss": 0.9064, + "step": 26230 + }, + { + "epoch": 0.631848943907902, + "grad_norm": 1.3944430351257324, + "learning_rate": 3.8756211337592534e-07, + "loss": 0.9425, + "step": 26235 + }, + { + "epoch": 0.6319693648996894, + "grad_norm": 1.3645925521850586, + "learning_rate": 3.8743535138424094e-07, + "loss": 0.9139, + "step": 26240 + }, + { + "epoch": 0.6320897858914766, + "grad_norm": 1.6590458154678345, + "learning_rate": 3.8730858939255653e-07, + "loss": 0.9541, + "step": 26245 + }, + { + "epoch": 0.6322102068832639, + "grad_norm": 1.7004053592681885, + "learning_rate": 3.871818274008721e-07, + "loss": 0.8755, + "step": 26250 + }, + { + "epoch": 0.6323306278750512, + "grad_norm": 1.7825452089309692, + "learning_rate": 3.870550654091877e-07, + "loss": 0.8939, + "step": 26255 + }, + { + "epoch": 0.6324510488668385, + "grad_norm": 1.6429630517959595, + "learning_rate": 3.8692830341750325e-07, + "loss": 0.8728, + "step": 26260 + }, + { + "epoch": 0.6325714698586258, + "grad_norm": 1.5073269605636597, + "learning_rate": 3.868015414258189e-07, + "loss": 0.8906, + "step": 26265 + }, + { + "epoch": 0.632691890850413, + "grad_norm": 1.5260565280914307, + "learning_rate": 3.8667477943413443e-07, + "loss": 0.8797, + "step": 26270 + }, + { + "epoch": 0.6328123118422003, + "grad_norm": 1.4282317161560059, + "learning_rate": 3.8654801744245e-07, + "loss": 0.8967, + "step": 26275 + }, + { + "epoch": 0.6329327328339877, + "grad_norm": 1.5447337627410889, + "learning_rate": 3.8642125545076567e-07, + "loss": 0.8773, + "step": 26280 + }, + { + "epoch": 0.6330531538257749, + "grad_norm": 1.4414620399475098, + "learning_rate": 3.862944934590812e-07, + "loss": 0.9458, + "step": 26285 + }, + { + "epoch": 0.6331735748175622, + "grad_norm": 1.4325686693191528, + "learning_rate": 3.861677314673968e-07, + "loss": 0.8641, + "step": 26290 + }, + { + "epoch": 0.6332939958093495, + "grad_norm": 1.695372223854065, + "learning_rate": 3.860409694757124e-07, + "loss": 0.9215, + "step": 26295 + }, + { + "epoch": 0.6334144168011367, + "grad_norm": 1.6153851747512817, + "learning_rate": 3.85914207484028e-07, + "loss": 0.9339, + "step": 26300 + }, + { + "epoch": 0.6335348377929241, + "grad_norm": 1.4820550680160522, + "learning_rate": 3.857874454923435e-07, + "loss": 0.927, + "step": 26305 + }, + { + "epoch": 0.6336552587847114, + "grad_norm": 1.3766353130340576, + "learning_rate": 3.8566068350065917e-07, + "loss": 0.8868, + "step": 26310 + }, + { + "epoch": 0.6337756797764986, + "grad_norm": 1.4582586288452148, + "learning_rate": 3.8553392150897476e-07, + "loss": 0.8952, + "step": 26315 + }, + { + "epoch": 0.633896100768286, + "grad_norm": 1.5797805786132812, + "learning_rate": 3.854071595172903e-07, + "loss": 0.9397, + "step": 26320 + }, + { + "epoch": 0.6340165217600732, + "grad_norm": 1.3752812147140503, + "learning_rate": 3.8528039752560594e-07, + "loss": 0.9331, + "step": 26325 + }, + { + "epoch": 0.6341369427518605, + "grad_norm": 1.3963216543197632, + "learning_rate": 3.851536355339215e-07, + "loss": 0.9193, + "step": 26330 + }, + { + "epoch": 0.6342573637436478, + "grad_norm": 1.6616880893707275, + "learning_rate": 3.850268735422371e-07, + "loss": 0.9022, + "step": 26335 + }, + { + "epoch": 0.634377784735435, + "grad_norm": 1.4412713050842285, + "learning_rate": 3.8490011155055267e-07, + "loss": 0.9221, + "step": 26340 + }, + { + "epoch": 0.6344982057272224, + "grad_norm": 1.553244709968567, + "learning_rate": 3.8477334955886826e-07, + "loss": 0.9253, + "step": 26345 + }, + { + "epoch": 0.6346186267190097, + "grad_norm": 1.510672688484192, + "learning_rate": 3.846465875671838e-07, + "loss": 0.9447, + "step": 26350 + }, + { + "epoch": 0.6347390477107969, + "grad_norm": 1.4998446702957153, + "learning_rate": 3.8451982557549944e-07, + "loss": 0.8506, + "step": 26355 + }, + { + "epoch": 0.6348594687025843, + "grad_norm": 1.5620739459991455, + "learning_rate": 3.8439306358381503e-07, + "loss": 0.9012, + "step": 26360 + }, + { + "epoch": 0.6349798896943715, + "grad_norm": 1.568901777267456, + "learning_rate": 3.8426630159213057e-07, + "loss": 0.8616, + "step": 26365 + }, + { + "epoch": 0.6351003106861588, + "grad_norm": 1.428557276725769, + "learning_rate": 3.841395396004462e-07, + "loss": 0.8568, + "step": 26370 + }, + { + "epoch": 0.6352207316779461, + "grad_norm": 1.4728060960769653, + "learning_rate": 3.8401277760876176e-07, + "loss": 0.9468, + "step": 26375 + }, + { + "epoch": 0.6353411526697333, + "grad_norm": 1.4308539628982544, + "learning_rate": 3.8388601561707735e-07, + "loss": 0.9365, + "step": 26380 + }, + { + "epoch": 0.6354615736615207, + "grad_norm": 1.3621821403503418, + "learning_rate": 3.8375925362539294e-07, + "loss": 0.8578, + "step": 26385 + }, + { + "epoch": 0.635581994653308, + "grad_norm": 1.6784234046936035, + "learning_rate": 3.8363249163370853e-07, + "loss": 0.918, + "step": 26390 + }, + { + "epoch": 0.6357024156450952, + "grad_norm": 1.464553952217102, + "learning_rate": 3.835057296420241e-07, + "loss": 0.8666, + "step": 26395 + }, + { + "epoch": 0.6358228366368825, + "grad_norm": 1.692110538482666, + "learning_rate": 3.833789676503397e-07, + "loss": 0.8731, + "step": 26400 + }, + { + "epoch": 0.6359432576286699, + "grad_norm": 1.6025325059890747, + "learning_rate": 3.832522056586553e-07, + "loss": 0.8945, + "step": 26405 + }, + { + "epoch": 0.6360636786204571, + "grad_norm": 1.3949683904647827, + "learning_rate": 3.8312544366697085e-07, + "loss": 0.9353, + "step": 26410 + }, + { + "epoch": 0.6361840996122444, + "grad_norm": 2.3893911838531494, + "learning_rate": 3.829986816752865e-07, + "loss": 0.9459, + "step": 26415 + }, + { + "epoch": 0.6363045206040316, + "grad_norm": 1.7152431011199951, + "learning_rate": 3.8287191968360203e-07, + "loss": 0.8687, + "step": 26420 + }, + { + "epoch": 0.636424941595819, + "grad_norm": 1.7054212093353271, + "learning_rate": 3.827451576919176e-07, + "loss": 0.9631, + "step": 26425 + }, + { + "epoch": 0.6365453625876063, + "grad_norm": 1.4074687957763672, + "learning_rate": 3.8261839570023327e-07, + "loss": 0.9171, + "step": 26430 + }, + { + "epoch": 0.6366657835793935, + "grad_norm": 1.5717144012451172, + "learning_rate": 3.824916337085488e-07, + "loss": 0.8751, + "step": 26435 + }, + { + "epoch": 0.6367862045711808, + "grad_norm": 1.6245671510696411, + "learning_rate": 3.823648717168644e-07, + "loss": 0.8708, + "step": 26440 + }, + { + "epoch": 0.6369066255629682, + "grad_norm": 1.5715320110321045, + "learning_rate": 3.8223810972518e-07, + "loss": 0.9241, + "step": 26445 + }, + { + "epoch": 0.6370270465547554, + "grad_norm": 1.4957575798034668, + "learning_rate": 3.821113477334956e-07, + "loss": 0.8936, + "step": 26450 + }, + { + "epoch": 0.6371474675465427, + "grad_norm": 1.4049856662750244, + "learning_rate": 3.819845857418111e-07, + "loss": 0.8851, + "step": 26455 + }, + { + "epoch": 0.63726788853833, + "grad_norm": 1.5102617740631104, + "learning_rate": 3.8185782375012676e-07, + "loss": 0.9093, + "step": 26460 + }, + { + "epoch": 0.6373883095301173, + "grad_norm": 1.5172317028045654, + "learning_rate": 3.8173106175844236e-07, + "loss": 0.8992, + "step": 26465 + }, + { + "epoch": 0.6375087305219046, + "grad_norm": 1.799931526184082, + "learning_rate": 3.816042997667579e-07, + "loss": 0.8988, + "step": 26470 + }, + { + "epoch": 0.6376291515136918, + "grad_norm": 1.5349489450454712, + "learning_rate": 3.8147753777507354e-07, + "loss": 0.8534, + "step": 26475 + }, + { + "epoch": 0.6377495725054791, + "grad_norm": 1.5610055923461914, + "learning_rate": 3.813507757833891e-07, + "loss": 0.8847, + "step": 26480 + }, + { + "epoch": 0.6378699934972665, + "grad_norm": 1.4507337808609009, + "learning_rate": 3.8122401379170467e-07, + "loss": 0.8905, + "step": 26485 + }, + { + "epoch": 0.6379904144890537, + "grad_norm": 1.7067121267318726, + "learning_rate": 3.8109725180002026e-07, + "loss": 0.8915, + "step": 26490 + }, + { + "epoch": 0.638110835480841, + "grad_norm": 1.573899745941162, + "learning_rate": 3.8097048980833585e-07, + "loss": 0.8917, + "step": 26495 + }, + { + "epoch": 0.6382312564726283, + "grad_norm": 1.5376611948013306, + "learning_rate": 3.808437278166514e-07, + "loss": 0.895, + "step": 26500 + }, + { + "epoch": 0.6383516774644156, + "grad_norm": 1.6120070219039917, + "learning_rate": 3.8071696582496704e-07, + "loss": 0.8908, + "step": 26505 + }, + { + "epoch": 0.6384720984562029, + "grad_norm": 1.5504016876220703, + "learning_rate": 3.8059020383328263e-07, + "loss": 0.9152, + "step": 26510 + }, + { + "epoch": 0.6385925194479902, + "grad_norm": 1.4211766719818115, + "learning_rate": 3.8046344184159817e-07, + "loss": 0.8928, + "step": 26515 + }, + { + "epoch": 0.6387129404397774, + "grad_norm": 1.503017544746399, + "learning_rate": 3.803366798499138e-07, + "loss": 0.9085, + "step": 26520 + }, + { + "epoch": 0.6388333614315648, + "grad_norm": 1.5840296745300293, + "learning_rate": 3.8020991785822935e-07, + "loss": 0.948, + "step": 26525 + }, + { + "epoch": 0.638953782423352, + "grad_norm": 1.7545835971832275, + "learning_rate": 3.8008315586654494e-07, + "loss": 0.8774, + "step": 26530 + }, + { + "epoch": 0.6390742034151393, + "grad_norm": 1.5773112773895264, + "learning_rate": 3.7995639387486053e-07, + "loss": 0.9347, + "step": 26535 + }, + { + "epoch": 0.6391946244069266, + "grad_norm": 1.4779413938522339, + "learning_rate": 3.798296318831761e-07, + "loss": 0.8947, + "step": 26540 + }, + { + "epoch": 0.6393150453987139, + "grad_norm": 1.5355712175369263, + "learning_rate": 3.797028698914917e-07, + "loss": 0.922, + "step": 26545 + }, + { + "epoch": 0.6394354663905012, + "grad_norm": 1.4067556858062744, + "learning_rate": 3.795761078998073e-07, + "loss": 0.9262, + "step": 26550 + }, + { + "epoch": 0.6395558873822885, + "grad_norm": 1.55918550491333, + "learning_rate": 3.794493459081229e-07, + "loss": 0.969, + "step": 26555 + }, + { + "epoch": 0.6396763083740757, + "grad_norm": 1.6701033115386963, + "learning_rate": 3.7932258391643844e-07, + "loss": 0.9076, + "step": 26560 + }, + { + "epoch": 0.6397967293658631, + "grad_norm": 1.5663307905197144, + "learning_rate": 3.791958219247541e-07, + "loss": 0.889, + "step": 26565 + }, + { + "epoch": 0.6399171503576504, + "grad_norm": 1.6836600303649902, + "learning_rate": 3.790690599330696e-07, + "loss": 0.893, + "step": 26570 + }, + { + "epoch": 0.6400375713494376, + "grad_norm": 1.670732021331787, + "learning_rate": 3.789422979413852e-07, + "loss": 0.96, + "step": 26575 + }, + { + "epoch": 0.640157992341225, + "grad_norm": 1.6099560260772705, + "learning_rate": 3.7881553594970086e-07, + "loss": 0.8397, + "step": 26580 + }, + { + "epoch": 0.6402784133330122, + "grad_norm": 1.6978719234466553, + "learning_rate": 3.786887739580164e-07, + "loss": 0.8757, + "step": 26585 + }, + { + "epoch": 0.6403988343247995, + "grad_norm": 1.590781569480896, + "learning_rate": 3.78562011966332e-07, + "loss": 0.887, + "step": 26590 + }, + { + "epoch": 0.6405192553165868, + "grad_norm": 1.5266937017440796, + "learning_rate": 3.784352499746476e-07, + "loss": 0.9099, + "step": 26595 + }, + { + "epoch": 0.640639676308374, + "grad_norm": 1.675328254699707, + "learning_rate": 3.783084879829632e-07, + "loss": 0.8897, + "step": 26600 + }, + { + "epoch": 0.6407600973001614, + "grad_norm": 1.6646935939788818, + "learning_rate": 3.781817259912787e-07, + "loss": 0.8669, + "step": 26605 + }, + { + "epoch": 0.6408805182919487, + "grad_norm": 1.7408345937728882, + "learning_rate": 3.7805496399959436e-07, + "loss": 0.8912, + "step": 26610 + }, + { + "epoch": 0.6410009392837359, + "grad_norm": 1.5620888471603394, + "learning_rate": 3.7792820200790995e-07, + "loss": 0.8979, + "step": 26615 + }, + { + "epoch": 0.6411213602755232, + "grad_norm": 1.50484299659729, + "learning_rate": 3.778014400162255e-07, + "loss": 0.9288, + "step": 26620 + }, + { + "epoch": 0.6412417812673105, + "grad_norm": 1.5604791641235352, + "learning_rate": 3.7767467802454113e-07, + "loss": 0.9343, + "step": 26625 + }, + { + "epoch": 0.6413622022590978, + "grad_norm": 1.6399976015090942, + "learning_rate": 3.7754791603285667e-07, + "loss": 0.9413, + "step": 26630 + }, + { + "epoch": 0.6414826232508851, + "grad_norm": 1.6691919565200806, + "learning_rate": 3.7742115404117226e-07, + "loss": 0.9217, + "step": 26635 + }, + { + "epoch": 0.6416030442426723, + "grad_norm": 1.4400848150253296, + "learning_rate": 3.7729439204948786e-07, + "loss": 0.8995, + "step": 26640 + }, + { + "epoch": 0.6417234652344597, + "grad_norm": 1.523863673210144, + "learning_rate": 3.7716763005780345e-07, + "loss": 0.8833, + "step": 26645 + }, + { + "epoch": 0.641843886226247, + "grad_norm": 1.4621446132659912, + "learning_rate": 3.77040868066119e-07, + "loss": 0.923, + "step": 26650 + }, + { + "epoch": 0.6419643072180342, + "grad_norm": 1.5442419052124023, + "learning_rate": 3.7691410607443463e-07, + "loss": 0.9418, + "step": 26655 + }, + { + "epoch": 0.6420847282098215, + "grad_norm": 1.7486423254013062, + "learning_rate": 3.767873440827502e-07, + "loss": 0.8763, + "step": 26660 + }, + { + "epoch": 0.6422051492016089, + "grad_norm": 1.7192951440811157, + "learning_rate": 3.7666058209106576e-07, + "loss": 0.8825, + "step": 26665 + }, + { + "epoch": 0.6423255701933961, + "grad_norm": 1.8942385911941528, + "learning_rate": 3.765338200993814e-07, + "loss": 0.9291, + "step": 26670 + }, + { + "epoch": 0.6424459911851834, + "grad_norm": 1.748542070388794, + "learning_rate": 3.7640705810769695e-07, + "loss": 0.9082, + "step": 26675 + }, + { + "epoch": 0.6425664121769706, + "grad_norm": 1.3566148281097412, + "learning_rate": 3.762802961160126e-07, + "loss": 0.8808, + "step": 26680 + }, + { + "epoch": 0.642686833168758, + "grad_norm": 1.5772252082824707, + "learning_rate": 3.7615353412432813e-07, + "loss": 0.8409, + "step": 26685 + }, + { + "epoch": 0.6428072541605453, + "grad_norm": 1.758379340171814, + "learning_rate": 3.760267721326437e-07, + "loss": 0.9144, + "step": 26690 + }, + { + "epoch": 0.6429276751523325, + "grad_norm": 1.517451524734497, + "learning_rate": 3.7590001014095937e-07, + "loss": 0.9186, + "step": 26695 + }, + { + "epoch": 0.6430480961441198, + "grad_norm": 1.5986350774765015, + "learning_rate": 3.757732481492749e-07, + "loss": 0.9321, + "step": 26700 + }, + { + "epoch": 0.6431685171359072, + "grad_norm": 1.535396695137024, + "learning_rate": 3.756464861575905e-07, + "loss": 0.9007, + "step": 26705 + }, + { + "epoch": 0.6432889381276944, + "grad_norm": 1.7621718645095825, + "learning_rate": 3.755197241659061e-07, + "loss": 0.9153, + "step": 26710 + }, + { + "epoch": 0.6434093591194817, + "grad_norm": 1.6428961753845215, + "learning_rate": 3.753929621742217e-07, + "loss": 0.9142, + "step": 26715 + }, + { + "epoch": 0.643529780111269, + "grad_norm": 1.5684250593185425, + "learning_rate": 3.752662001825372e-07, + "loss": 0.9805, + "step": 26720 + }, + { + "epoch": 0.6436502011030563, + "grad_norm": 1.5175715684890747, + "learning_rate": 3.7513943819085286e-07, + "loss": 0.868, + "step": 26725 + }, + { + "epoch": 0.6437706220948436, + "grad_norm": 1.5524253845214844, + "learning_rate": 3.7501267619916846e-07, + "loss": 0.869, + "step": 26730 + }, + { + "epoch": 0.6438910430866308, + "grad_norm": 1.3661057949066162, + "learning_rate": 3.74885914207484e-07, + "loss": 0.9048, + "step": 26735 + }, + { + "epoch": 0.6440114640784181, + "grad_norm": 1.640135407447815, + "learning_rate": 3.7475915221579964e-07, + "loss": 0.9211, + "step": 26740 + }, + { + "epoch": 0.6441318850702055, + "grad_norm": 1.5012540817260742, + "learning_rate": 3.746323902241152e-07, + "loss": 0.883, + "step": 26745 + }, + { + "epoch": 0.6442523060619927, + "grad_norm": 2.04201078414917, + "learning_rate": 3.7450562823243077e-07, + "loss": 0.9226, + "step": 26750 + }, + { + "epoch": 0.64437272705378, + "grad_norm": 1.4492322206497192, + "learning_rate": 3.7437886624074636e-07, + "loss": 0.8431, + "step": 26755 + }, + { + "epoch": 0.6444931480455673, + "grad_norm": 1.5902260541915894, + "learning_rate": 3.7425210424906195e-07, + "loss": 0.8626, + "step": 26760 + }, + { + "epoch": 0.6446135690373546, + "grad_norm": 1.510667324066162, + "learning_rate": 3.741253422573775e-07, + "loss": 0.8645, + "step": 26765 + }, + { + "epoch": 0.6447339900291419, + "grad_norm": 1.7094923257827759, + "learning_rate": 3.7399858026569314e-07, + "loss": 0.9253, + "step": 26770 + }, + { + "epoch": 0.6448544110209292, + "grad_norm": 1.5884010791778564, + "learning_rate": 3.7387181827400873e-07, + "loss": 0.9593, + "step": 26775 + }, + { + "epoch": 0.6449748320127164, + "grad_norm": 1.6673965454101562, + "learning_rate": 3.7374505628232427e-07, + "loss": 0.8712, + "step": 26780 + }, + { + "epoch": 0.6450952530045038, + "grad_norm": 1.4968947172164917, + "learning_rate": 3.736182942906399e-07, + "loss": 0.8972, + "step": 26785 + }, + { + "epoch": 0.645215673996291, + "grad_norm": 1.5666877031326294, + "learning_rate": 3.7349153229895545e-07, + "loss": 0.9499, + "step": 26790 + }, + { + "epoch": 0.6453360949880783, + "grad_norm": 1.5352009534835815, + "learning_rate": 3.7336477030727104e-07, + "loss": 0.8903, + "step": 26795 + }, + { + "epoch": 0.6454565159798656, + "grad_norm": 1.6426624059677124, + "learning_rate": 3.7323800831558663e-07, + "loss": 0.8634, + "step": 26800 + }, + { + "epoch": 0.6455769369716529, + "grad_norm": 1.7372010946273804, + "learning_rate": 3.7311124632390223e-07, + "loss": 0.9357, + "step": 26805 + }, + { + "epoch": 0.6456973579634402, + "grad_norm": 1.5004019737243652, + "learning_rate": 3.729844843322178e-07, + "loss": 0.9616, + "step": 26810 + }, + { + "epoch": 0.6458177789552275, + "grad_norm": 1.5163060426712036, + "learning_rate": 3.728577223405334e-07, + "loss": 0.9188, + "step": 26815 + }, + { + "epoch": 0.6459381999470147, + "grad_norm": 1.4289931058883667, + "learning_rate": 3.72730960348849e-07, + "loss": 0.8999, + "step": 26820 + }, + { + "epoch": 0.6460586209388021, + "grad_norm": 1.5710238218307495, + "learning_rate": 3.7260419835716454e-07, + "loss": 0.9245, + "step": 26825 + }, + { + "epoch": 0.6461790419305894, + "grad_norm": 1.4990209341049194, + "learning_rate": 3.724774363654802e-07, + "loss": 0.9087, + "step": 26830 + }, + { + "epoch": 0.6462994629223766, + "grad_norm": 1.4621906280517578, + "learning_rate": 3.723506743737957e-07, + "loss": 0.9007, + "step": 26835 + }, + { + "epoch": 0.6464198839141639, + "grad_norm": 1.655481219291687, + "learning_rate": 3.722239123821113e-07, + "loss": 0.9313, + "step": 26840 + }, + { + "epoch": 0.6465403049059512, + "grad_norm": 1.8018776178359985, + "learning_rate": 3.7209715039042696e-07, + "loss": 0.9167, + "step": 26845 + }, + { + "epoch": 0.6466607258977385, + "grad_norm": 1.529486894607544, + "learning_rate": 3.719703883987425e-07, + "loss": 0.9216, + "step": 26850 + }, + { + "epoch": 0.6467811468895258, + "grad_norm": 1.5701395273208618, + "learning_rate": 3.718436264070581e-07, + "loss": 0.8683, + "step": 26855 + }, + { + "epoch": 0.646901567881313, + "grad_norm": 1.528743863105774, + "learning_rate": 3.717168644153737e-07, + "loss": 0.9513, + "step": 26860 + }, + { + "epoch": 0.6470219888731004, + "grad_norm": 1.7234257459640503, + "learning_rate": 3.715901024236893e-07, + "loss": 0.9194, + "step": 26865 + }, + { + "epoch": 0.6471424098648877, + "grad_norm": 1.337449550628662, + "learning_rate": 3.714633404320048e-07, + "loss": 0.8864, + "step": 26870 + }, + { + "epoch": 0.6472628308566749, + "grad_norm": 1.568869948387146, + "learning_rate": 3.7133657844032046e-07, + "loss": 0.8847, + "step": 26875 + }, + { + "epoch": 0.6473832518484622, + "grad_norm": 1.664362907409668, + "learning_rate": 3.7120981644863605e-07, + "loss": 0.8685, + "step": 26880 + }, + { + "epoch": 0.6475036728402496, + "grad_norm": 1.5255695581436157, + "learning_rate": 3.710830544569516e-07, + "loss": 0.943, + "step": 26885 + }, + { + "epoch": 0.6476240938320368, + "grad_norm": 1.3643174171447754, + "learning_rate": 3.7095629246526723e-07, + "loss": 0.8955, + "step": 26890 + }, + { + "epoch": 0.6477445148238241, + "grad_norm": 1.5007296800613403, + "learning_rate": 3.7082953047358277e-07, + "loss": 0.924, + "step": 26895 + }, + { + "epoch": 0.6478649358156113, + "grad_norm": 1.6384427547454834, + "learning_rate": 3.7070276848189836e-07, + "loss": 0.9658, + "step": 26900 + }, + { + "epoch": 0.6479853568073987, + "grad_norm": 1.6323484182357788, + "learning_rate": 3.7057600649021396e-07, + "loss": 0.9548, + "step": 26905 + }, + { + "epoch": 0.648105777799186, + "grad_norm": 1.6146575212478638, + "learning_rate": 3.7044924449852955e-07, + "loss": 0.8944, + "step": 26910 + }, + { + "epoch": 0.6482261987909732, + "grad_norm": 1.5658189058303833, + "learning_rate": 3.703224825068451e-07, + "loss": 0.9234, + "step": 26915 + }, + { + "epoch": 0.6483466197827605, + "grad_norm": 1.5981191396713257, + "learning_rate": 3.7019572051516073e-07, + "loss": 0.8716, + "step": 26920 + }, + { + "epoch": 0.6484670407745479, + "grad_norm": 1.6653273105621338, + "learning_rate": 3.700689585234763e-07, + "loss": 0.8921, + "step": 26925 + }, + { + "epoch": 0.6485874617663351, + "grad_norm": 1.453226089477539, + "learning_rate": 3.6994219653179186e-07, + "loss": 0.8319, + "step": 26930 + }, + { + "epoch": 0.6487078827581224, + "grad_norm": 1.4281163215637207, + "learning_rate": 3.698154345401075e-07, + "loss": 0.8778, + "step": 26935 + }, + { + "epoch": 0.6488283037499096, + "grad_norm": 1.6727575063705444, + "learning_rate": 3.6968867254842305e-07, + "loss": 0.8621, + "step": 26940 + }, + { + "epoch": 0.648948724741697, + "grad_norm": 1.6255943775177002, + "learning_rate": 3.6956191055673864e-07, + "loss": 0.8967, + "step": 26945 + }, + { + "epoch": 0.6490691457334843, + "grad_norm": 1.9116973876953125, + "learning_rate": 3.6943514856505423e-07, + "loss": 0.886, + "step": 26950 + }, + { + "epoch": 0.6491895667252715, + "grad_norm": 1.4924372434616089, + "learning_rate": 3.693083865733698e-07, + "loss": 0.9099, + "step": 26955 + }, + { + "epoch": 0.6493099877170588, + "grad_norm": 1.5505237579345703, + "learning_rate": 3.691816245816854e-07, + "loss": 0.9415, + "step": 26960 + }, + { + "epoch": 0.6494304087088462, + "grad_norm": 1.570006251335144, + "learning_rate": 3.69054862590001e-07, + "loss": 0.9139, + "step": 26965 + }, + { + "epoch": 0.6495508297006334, + "grad_norm": 1.5226932764053345, + "learning_rate": 3.689281005983166e-07, + "loss": 0.8751, + "step": 26970 + }, + { + "epoch": 0.6496712506924207, + "grad_norm": 1.5911332368850708, + "learning_rate": 3.6880133860663214e-07, + "loss": 0.8287, + "step": 26975 + }, + { + "epoch": 0.649791671684208, + "grad_norm": 1.4197216033935547, + "learning_rate": 3.686745766149478e-07, + "loss": 0.9192, + "step": 26980 + }, + { + "epoch": 0.6499120926759953, + "grad_norm": 1.5569384098052979, + "learning_rate": 3.685478146232633e-07, + "loss": 0.9194, + "step": 26985 + }, + { + "epoch": 0.6500325136677826, + "grad_norm": 1.6305328607559204, + "learning_rate": 3.684210526315789e-07, + "loss": 0.8704, + "step": 26990 + }, + { + "epoch": 0.6501529346595698, + "grad_norm": 1.5222176313400269, + "learning_rate": 3.6829429063989456e-07, + "loss": 0.9204, + "step": 26995 + }, + { + "epoch": 0.6502733556513571, + "grad_norm": 1.4481796026229858, + "learning_rate": 3.681675286482101e-07, + "loss": 0.8853, + "step": 27000 + }, + { + "epoch": 0.6503937766431445, + "grad_norm": 1.665404200553894, + "learning_rate": 3.680407666565257e-07, + "loss": 0.9072, + "step": 27005 + }, + { + "epoch": 0.6505141976349317, + "grad_norm": 1.5380555391311646, + "learning_rate": 3.679140046648413e-07, + "loss": 0.8119, + "step": 27010 + }, + { + "epoch": 0.650634618626719, + "grad_norm": 1.5314487218856812, + "learning_rate": 3.6778724267315687e-07, + "loss": 0.8875, + "step": 27015 + }, + { + "epoch": 0.6507550396185063, + "grad_norm": 1.5317686796188354, + "learning_rate": 3.676604806814724e-07, + "loss": 0.8951, + "step": 27020 + }, + { + "epoch": 0.6508754606102936, + "grad_norm": 1.418816089630127, + "learning_rate": 3.6753371868978805e-07, + "loss": 0.8695, + "step": 27025 + }, + { + "epoch": 0.6509958816020809, + "grad_norm": 1.6045305728912354, + "learning_rate": 3.6740695669810365e-07, + "loss": 0.8818, + "step": 27030 + }, + { + "epoch": 0.6511163025938682, + "grad_norm": 1.4292452335357666, + "learning_rate": 3.672801947064192e-07, + "loss": 0.9099, + "step": 27035 + }, + { + "epoch": 0.6512367235856554, + "grad_norm": 1.8049204349517822, + "learning_rate": 3.6715343271473483e-07, + "loss": 0.9229, + "step": 27040 + }, + { + "epoch": 0.6513571445774428, + "grad_norm": 1.461598515510559, + "learning_rate": 3.6702667072305037e-07, + "loss": 0.912, + "step": 27045 + }, + { + "epoch": 0.65147756556923, + "grad_norm": 1.686318039894104, + "learning_rate": 3.6689990873136596e-07, + "loss": 0.8798, + "step": 27050 + }, + { + "epoch": 0.6515979865610173, + "grad_norm": 1.6203144788742065, + "learning_rate": 3.6677314673968155e-07, + "loss": 0.8777, + "step": 27055 + }, + { + "epoch": 0.6517184075528046, + "grad_norm": 1.5647941827774048, + "learning_rate": 3.6664638474799714e-07, + "loss": 0.9458, + "step": 27060 + }, + { + "epoch": 0.6518388285445919, + "grad_norm": 1.4100810289382935, + "learning_rate": 3.665196227563127e-07, + "loss": 0.869, + "step": 27065 + }, + { + "epoch": 0.6519592495363792, + "grad_norm": 1.5744898319244385, + "learning_rate": 3.6639286076462833e-07, + "loss": 0.8851, + "step": 27070 + }, + { + "epoch": 0.6520796705281665, + "grad_norm": 1.6459345817565918, + "learning_rate": 3.662660987729439e-07, + "loss": 0.8938, + "step": 27075 + }, + { + "epoch": 0.6522000915199537, + "grad_norm": 1.555381178855896, + "learning_rate": 3.6613933678125946e-07, + "loss": 0.9509, + "step": 27080 + }, + { + "epoch": 0.6523205125117411, + "grad_norm": 1.4916878938674927, + "learning_rate": 3.660125747895751e-07, + "loss": 0.8233, + "step": 27085 + }, + { + "epoch": 0.6524409335035284, + "grad_norm": 1.42539381980896, + "learning_rate": 3.6588581279789064e-07, + "loss": 0.9541, + "step": 27090 + }, + { + "epoch": 0.6525613544953156, + "grad_norm": 1.3812246322631836, + "learning_rate": 3.6575905080620623e-07, + "loss": 0.9144, + "step": 27095 + }, + { + "epoch": 0.6526817754871029, + "grad_norm": 1.936347484588623, + "learning_rate": 3.656322888145218e-07, + "loss": 0.8679, + "step": 27100 + }, + { + "epoch": 0.6528021964788902, + "grad_norm": 1.5986363887786865, + "learning_rate": 3.655055268228374e-07, + "loss": 0.8995, + "step": 27105 + }, + { + "epoch": 0.6529226174706775, + "grad_norm": 1.5964977741241455, + "learning_rate": 3.65378764831153e-07, + "loss": 0.8813, + "step": 27110 + }, + { + "epoch": 0.6530430384624648, + "grad_norm": 1.3528989553451538, + "learning_rate": 3.652520028394686e-07, + "loss": 0.8285, + "step": 27115 + }, + { + "epoch": 0.653163459454252, + "grad_norm": 1.4709807634353638, + "learning_rate": 3.651252408477842e-07, + "loss": 0.9106, + "step": 27120 + }, + { + "epoch": 0.6532838804460394, + "grad_norm": 1.7439950704574585, + "learning_rate": 3.6499847885609973e-07, + "loss": 0.9014, + "step": 27125 + }, + { + "epoch": 0.6534043014378267, + "grad_norm": 1.5640136003494263, + "learning_rate": 3.648717168644154e-07, + "loss": 0.8836, + "step": 27130 + }, + { + "epoch": 0.6535247224296139, + "grad_norm": 1.639180064201355, + "learning_rate": 3.647449548727309e-07, + "loss": 0.8945, + "step": 27135 + }, + { + "epoch": 0.6536451434214012, + "grad_norm": 1.587928056716919, + "learning_rate": 3.646181928810465e-07, + "loss": 0.8792, + "step": 27140 + }, + { + "epoch": 0.6537655644131886, + "grad_norm": 1.5554908514022827, + "learning_rate": 3.6449143088936215e-07, + "loss": 0.9404, + "step": 27145 + }, + { + "epoch": 0.6538859854049758, + "grad_norm": 1.4645404815673828, + "learning_rate": 3.643646688976777e-07, + "loss": 0.8887, + "step": 27150 + }, + { + "epoch": 0.6540064063967631, + "grad_norm": 1.6191918849945068, + "learning_rate": 3.642379069059933e-07, + "loss": 0.8678, + "step": 27155 + }, + { + "epoch": 0.6541268273885503, + "grad_norm": 1.373336672782898, + "learning_rate": 3.6411114491430887e-07, + "loss": 0.9033, + "step": 27160 + }, + { + "epoch": 0.6542472483803377, + "grad_norm": 1.5440839529037476, + "learning_rate": 3.6398438292262447e-07, + "loss": 0.8413, + "step": 27165 + }, + { + "epoch": 0.654367669372125, + "grad_norm": 1.6653863191604614, + "learning_rate": 3.6385762093094e-07, + "loss": 0.926, + "step": 27170 + }, + { + "epoch": 0.6544880903639122, + "grad_norm": 1.4619879722595215, + "learning_rate": 3.6373085893925565e-07, + "loss": 0.919, + "step": 27175 + }, + { + "epoch": 0.6546085113556995, + "grad_norm": 1.5410512685775757, + "learning_rate": 3.6360409694757124e-07, + "loss": 0.8741, + "step": 27180 + }, + { + "epoch": 0.6547289323474869, + "grad_norm": 1.6692581176757812, + "learning_rate": 3.634773349558868e-07, + "loss": 0.9221, + "step": 27185 + }, + { + "epoch": 0.6548493533392741, + "grad_norm": 1.6221915483474731, + "learning_rate": 3.633505729642024e-07, + "loss": 0.9062, + "step": 27190 + }, + { + "epoch": 0.6549697743310614, + "grad_norm": 1.5964298248291016, + "learning_rate": 3.6322381097251796e-07, + "loss": 0.8578, + "step": 27195 + }, + { + "epoch": 0.6550901953228487, + "grad_norm": 1.405503511428833, + "learning_rate": 3.630970489808336e-07, + "loss": 0.9153, + "step": 27200 + }, + { + "epoch": 0.655210616314636, + "grad_norm": 1.5333468914031982, + "learning_rate": 3.6297028698914915e-07, + "loss": 0.862, + "step": 27205 + }, + { + "epoch": 0.6553310373064233, + "grad_norm": 1.7078754901885986, + "learning_rate": 3.6284352499746474e-07, + "loss": 0.861, + "step": 27210 + }, + { + "epoch": 0.6554514582982105, + "grad_norm": 1.62969172000885, + "learning_rate": 3.627167630057804e-07, + "loss": 0.8841, + "step": 27215 + }, + { + "epoch": 0.6555718792899978, + "grad_norm": 1.381977915763855, + "learning_rate": 3.625900010140959e-07, + "loss": 0.9321, + "step": 27220 + }, + { + "epoch": 0.6556923002817852, + "grad_norm": 1.4872454404830933, + "learning_rate": 3.624632390224115e-07, + "loss": 0.8616, + "step": 27225 + }, + { + "epoch": 0.6558127212735724, + "grad_norm": 1.4531726837158203, + "learning_rate": 3.623364770307271e-07, + "loss": 0.9679, + "step": 27230 + }, + { + "epoch": 0.6559331422653597, + "grad_norm": 1.531635046005249, + "learning_rate": 3.622097150390427e-07, + "loss": 0.9636, + "step": 27235 + }, + { + "epoch": 0.656053563257147, + "grad_norm": 1.585182785987854, + "learning_rate": 3.6208295304735824e-07, + "loss": 0.867, + "step": 27240 + }, + { + "epoch": 0.6561739842489343, + "grad_norm": 1.6876927614212036, + "learning_rate": 3.619561910556739e-07, + "loss": 0.9311, + "step": 27245 + }, + { + "epoch": 0.6562944052407216, + "grad_norm": 1.6832166910171509, + "learning_rate": 3.618294290639894e-07, + "loss": 0.9061, + "step": 27250 + }, + { + "epoch": 0.6564148262325088, + "grad_norm": 1.585931658744812, + "learning_rate": 3.61702667072305e-07, + "loss": 0.8733, + "step": 27255 + }, + { + "epoch": 0.6565352472242961, + "grad_norm": 1.6085927486419678, + "learning_rate": 3.6157590508062066e-07, + "loss": 0.8653, + "step": 27260 + }, + { + "epoch": 0.6566556682160835, + "grad_norm": 1.4381110668182373, + "learning_rate": 3.614491430889362e-07, + "loss": 0.884, + "step": 27265 + }, + { + "epoch": 0.6567760892078707, + "grad_norm": 1.5061804056167603, + "learning_rate": 3.613223810972518e-07, + "loss": 0.912, + "step": 27270 + }, + { + "epoch": 0.656896510199658, + "grad_norm": 1.5456966161727905, + "learning_rate": 3.611956191055674e-07, + "loss": 0.873, + "step": 27275 + }, + { + "epoch": 0.6570169311914453, + "grad_norm": 1.3770190477371216, + "learning_rate": 3.6106885711388297e-07, + "loss": 0.9152, + "step": 27280 + }, + { + "epoch": 0.6571373521832325, + "grad_norm": 1.554132342338562, + "learning_rate": 3.609420951221985e-07, + "loss": 0.9046, + "step": 27285 + }, + { + "epoch": 0.6572577731750199, + "grad_norm": 1.5005851984024048, + "learning_rate": 3.6081533313051415e-07, + "loss": 0.8832, + "step": 27290 + }, + { + "epoch": 0.6573781941668072, + "grad_norm": 1.5317542552947998, + "learning_rate": 3.6068857113882975e-07, + "loss": 0.8734, + "step": 27295 + }, + { + "epoch": 0.6574986151585944, + "grad_norm": 1.7083379030227661, + "learning_rate": 3.605618091471453e-07, + "loss": 0.9485, + "step": 27300 + }, + { + "epoch": 0.6576190361503818, + "grad_norm": 1.5612186193466187, + "learning_rate": 3.6043504715546093e-07, + "loss": 0.9064, + "step": 27305 + }, + { + "epoch": 0.657739457142169, + "grad_norm": 1.361822247505188, + "learning_rate": 3.6030828516377647e-07, + "loss": 0.9019, + "step": 27310 + }, + { + "epoch": 0.6578598781339563, + "grad_norm": 2.116149663925171, + "learning_rate": 3.6018152317209206e-07, + "loss": 0.869, + "step": 27315 + }, + { + "epoch": 0.6579802991257436, + "grad_norm": 1.8238283395767212, + "learning_rate": 3.6005476118040765e-07, + "loss": 0.8696, + "step": 27320 + }, + { + "epoch": 0.6581007201175308, + "grad_norm": 1.6186177730560303, + "learning_rate": 3.5992799918872324e-07, + "loss": 0.8781, + "step": 27325 + }, + { + "epoch": 0.6582211411093182, + "grad_norm": 1.4667737483978271, + "learning_rate": 3.5980123719703884e-07, + "loss": 0.8702, + "step": 27330 + }, + { + "epoch": 0.6583415621011055, + "grad_norm": 1.6477582454681396, + "learning_rate": 3.5967447520535443e-07, + "loss": 0.8977, + "step": 27335 + }, + { + "epoch": 0.6584619830928927, + "grad_norm": 1.6902414560317993, + "learning_rate": 3.5954771321367e-07, + "loss": 0.9111, + "step": 27340 + }, + { + "epoch": 0.65858240408468, + "grad_norm": 1.6111053228378296, + "learning_rate": 3.5942095122198556e-07, + "loss": 0.9084, + "step": 27345 + }, + { + "epoch": 0.6587028250764674, + "grad_norm": 1.544984221458435, + "learning_rate": 3.592941892303012e-07, + "loss": 0.9529, + "step": 27350 + }, + { + "epoch": 0.6588232460682546, + "grad_norm": 1.3802094459533691, + "learning_rate": 3.5916742723861674e-07, + "loss": 0.8947, + "step": 27355 + }, + { + "epoch": 0.6589436670600419, + "grad_norm": 1.6167811155319214, + "learning_rate": 3.5904066524693233e-07, + "loss": 0.8962, + "step": 27360 + }, + { + "epoch": 0.6590640880518291, + "grad_norm": 1.3995361328125, + "learning_rate": 3.58913903255248e-07, + "loss": 0.8824, + "step": 27365 + }, + { + "epoch": 0.6591845090436165, + "grad_norm": 1.7289831638336182, + "learning_rate": 3.587871412635635e-07, + "loss": 0.9314, + "step": 27370 + }, + { + "epoch": 0.6593049300354038, + "grad_norm": 1.5664397478103638, + "learning_rate": 3.586603792718791e-07, + "loss": 0.8917, + "step": 27375 + }, + { + "epoch": 0.659425351027191, + "grad_norm": 2.464617967605591, + "learning_rate": 3.585336172801947e-07, + "loss": 0.8862, + "step": 27380 + }, + { + "epoch": 0.6595457720189783, + "grad_norm": 1.5785813331604004, + "learning_rate": 3.584068552885103e-07, + "loss": 0.8589, + "step": 27385 + }, + { + "epoch": 0.6596661930107657, + "grad_norm": 1.6598893404006958, + "learning_rate": 3.5828009329682583e-07, + "loss": 0.8989, + "step": 27390 + }, + { + "epoch": 0.6597866140025529, + "grad_norm": 1.4237953424453735, + "learning_rate": 3.581533313051415e-07, + "loss": 0.9035, + "step": 27395 + }, + { + "epoch": 0.6599070349943402, + "grad_norm": 1.4129544496536255, + "learning_rate": 3.58026569313457e-07, + "loss": 0.8987, + "step": 27400 + }, + { + "epoch": 0.6600274559861276, + "grad_norm": 1.421212911605835, + "learning_rate": 3.578998073217726e-07, + "loss": 0.9319, + "step": 27405 + }, + { + "epoch": 0.6601478769779148, + "grad_norm": 1.5244791507720947, + "learning_rate": 3.5777304533008825e-07, + "loss": 0.903, + "step": 27410 + }, + { + "epoch": 0.6602682979697021, + "grad_norm": 1.5054500102996826, + "learning_rate": 3.576462833384038e-07, + "loss": 0.9038, + "step": 27415 + }, + { + "epoch": 0.6603887189614893, + "grad_norm": 1.5401767492294312, + "learning_rate": 3.575195213467194e-07, + "loss": 0.9308, + "step": 27420 + }, + { + "epoch": 0.6605091399532766, + "grad_norm": 1.5578993558883667, + "learning_rate": 3.57392759355035e-07, + "loss": 0.8465, + "step": 27425 + }, + { + "epoch": 0.660629560945064, + "grad_norm": 1.6200608015060425, + "learning_rate": 3.5726599736335057e-07, + "loss": 0.9103, + "step": 27430 + }, + { + "epoch": 0.6607499819368512, + "grad_norm": 1.6002734899520874, + "learning_rate": 3.571392353716661e-07, + "loss": 0.8838, + "step": 27435 + }, + { + "epoch": 0.6608704029286385, + "grad_norm": 1.7070163488388062, + "learning_rate": 3.5701247337998175e-07, + "loss": 0.8847, + "step": 27440 + }, + { + "epoch": 0.6609908239204259, + "grad_norm": 1.6092935800552368, + "learning_rate": 3.5688571138829734e-07, + "loss": 0.8689, + "step": 27445 + }, + { + "epoch": 0.6611112449122131, + "grad_norm": 1.691953420639038, + "learning_rate": 3.567589493966129e-07, + "loss": 0.8966, + "step": 27450 + }, + { + "epoch": 0.6612316659040004, + "grad_norm": 1.625665545463562, + "learning_rate": 3.566321874049285e-07, + "loss": 0.9179, + "step": 27455 + }, + { + "epoch": 0.6613520868957877, + "grad_norm": 1.5149598121643066, + "learning_rate": 3.5650542541324406e-07, + "loss": 0.9011, + "step": 27460 + }, + { + "epoch": 0.661472507887575, + "grad_norm": 1.4930402040481567, + "learning_rate": 3.5637866342155966e-07, + "loss": 0.9046, + "step": 27465 + }, + { + "epoch": 0.6615929288793623, + "grad_norm": 1.608102560043335, + "learning_rate": 3.5625190142987525e-07, + "loss": 0.8862, + "step": 27470 + }, + { + "epoch": 0.6617133498711495, + "grad_norm": 1.6462239027023315, + "learning_rate": 3.5612513943819084e-07, + "loss": 0.9726, + "step": 27475 + }, + { + "epoch": 0.6618337708629368, + "grad_norm": 1.5935025215148926, + "learning_rate": 3.559983774465064e-07, + "loss": 0.9017, + "step": 27480 + }, + { + "epoch": 0.6619541918547241, + "grad_norm": 1.6776366233825684, + "learning_rate": 3.55871615454822e-07, + "loss": 0.8751, + "step": 27485 + }, + { + "epoch": 0.6620746128465114, + "grad_norm": 1.3133662939071655, + "learning_rate": 3.557448534631376e-07, + "loss": 0.8528, + "step": 27490 + }, + { + "epoch": 0.6621950338382987, + "grad_norm": 1.6722146272659302, + "learning_rate": 3.5561809147145315e-07, + "loss": 0.9213, + "step": 27495 + }, + { + "epoch": 0.662315454830086, + "grad_norm": 1.365938663482666, + "learning_rate": 3.554913294797688e-07, + "loss": 0.8883, + "step": 27500 + }, + { + "epoch": 0.6624358758218732, + "grad_norm": 1.6689952611923218, + "learning_rate": 3.5536456748808434e-07, + "loss": 0.8746, + "step": 27505 + }, + { + "epoch": 0.6625562968136606, + "grad_norm": 1.7636252641677856, + "learning_rate": 3.5523780549639993e-07, + "loss": 0.9094, + "step": 27510 + }, + { + "epoch": 0.6626767178054478, + "grad_norm": 1.6054761409759521, + "learning_rate": 3.551110435047155e-07, + "loss": 0.9026, + "step": 27515 + }, + { + "epoch": 0.6627971387972351, + "grad_norm": 1.563241720199585, + "learning_rate": 3.549842815130311e-07, + "loss": 0.9092, + "step": 27520 + }, + { + "epoch": 0.6629175597890224, + "grad_norm": 1.3838763236999512, + "learning_rate": 3.548575195213467e-07, + "loss": 0.9027, + "step": 27525 + }, + { + "epoch": 0.6630379807808097, + "grad_norm": 1.4422403573989868, + "learning_rate": 3.547307575296623e-07, + "loss": 0.8957, + "step": 27530 + }, + { + "epoch": 0.663158401772597, + "grad_norm": 1.3933910131454468, + "learning_rate": 3.546039955379779e-07, + "loss": 0.9401, + "step": 27535 + }, + { + "epoch": 0.6632788227643843, + "grad_norm": 1.6570963859558105, + "learning_rate": 3.544772335462934e-07, + "loss": 0.8739, + "step": 27540 + }, + { + "epoch": 0.6633992437561715, + "grad_norm": 1.5029670000076294, + "learning_rate": 3.5435047155460907e-07, + "loss": 0.8745, + "step": 27545 + }, + { + "epoch": 0.6635196647479589, + "grad_norm": 1.5351386070251465, + "learning_rate": 3.542237095629246e-07, + "loss": 0.8826, + "step": 27550 + }, + { + "epoch": 0.6636400857397462, + "grad_norm": 1.940496563911438, + "learning_rate": 3.540969475712402e-07, + "loss": 0.9065, + "step": 27555 + }, + { + "epoch": 0.6637605067315334, + "grad_norm": 1.5638905763626099, + "learning_rate": 3.5397018557955585e-07, + "loss": 0.902, + "step": 27560 + }, + { + "epoch": 0.6638809277233207, + "grad_norm": 1.4888930320739746, + "learning_rate": 3.538434235878714e-07, + "loss": 0.9199, + "step": 27565 + }, + { + "epoch": 0.664001348715108, + "grad_norm": 1.898266315460205, + "learning_rate": 3.53716661596187e-07, + "loss": 0.9166, + "step": 27570 + }, + { + "epoch": 0.6641217697068953, + "grad_norm": 1.5693520307540894, + "learning_rate": 3.5358989960450257e-07, + "loss": 0.9261, + "step": 27575 + }, + { + "epoch": 0.6642421906986826, + "grad_norm": 1.8452355861663818, + "learning_rate": 3.5346313761281816e-07, + "loss": 0.8723, + "step": 27580 + }, + { + "epoch": 0.6643626116904698, + "grad_norm": 1.486311435699463, + "learning_rate": 3.533363756211337e-07, + "loss": 0.8415, + "step": 27585 + }, + { + "epoch": 0.6644830326822572, + "grad_norm": 1.572955846786499, + "learning_rate": 3.5320961362944934e-07, + "loss": 0.8646, + "step": 27590 + }, + { + "epoch": 0.6646034536740445, + "grad_norm": 1.6016361713409424, + "learning_rate": 3.5308285163776494e-07, + "loss": 0.855, + "step": 27595 + }, + { + "epoch": 0.6647238746658317, + "grad_norm": 1.5524715185165405, + "learning_rate": 3.529560896460805e-07, + "loss": 0.9067, + "step": 27600 + }, + { + "epoch": 0.664844295657619, + "grad_norm": 1.272606372833252, + "learning_rate": 3.528293276543961e-07, + "loss": 0.902, + "step": 27605 + }, + { + "epoch": 0.6649647166494064, + "grad_norm": 1.6640665531158447, + "learning_rate": 3.5270256566271166e-07, + "loss": 0.9027, + "step": 27610 + }, + { + "epoch": 0.6650851376411936, + "grad_norm": 1.495080590248108, + "learning_rate": 3.5257580367102725e-07, + "loss": 0.835, + "step": 27615 + }, + { + "epoch": 0.6652055586329809, + "grad_norm": 1.670669674873352, + "learning_rate": 3.5244904167934284e-07, + "loss": 0.9034, + "step": 27620 + }, + { + "epoch": 0.6653259796247681, + "grad_norm": 1.649122953414917, + "learning_rate": 3.5232227968765843e-07, + "loss": 0.8889, + "step": 27625 + }, + { + "epoch": 0.6654464006165555, + "grad_norm": 1.7025654315948486, + "learning_rate": 3.5219551769597397e-07, + "loss": 0.8757, + "step": 27630 + }, + { + "epoch": 0.6655668216083428, + "grad_norm": 1.565008521080017, + "learning_rate": 3.520687557042896e-07, + "loss": 0.8086, + "step": 27635 + }, + { + "epoch": 0.66568724260013, + "grad_norm": 1.5095890760421753, + "learning_rate": 3.519419937126052e-07, + "loss": 0.9017, + "step": 27640 + }, + { + "epoch": 0.6658076635919173, + "grad_norm": 1.465999722480774, + "learning_rate": 3.5181523172092075e-07, + "loss": 0.8848, + "step": 27645 + }, + { + "epoch": 0.6659280845837047, + "grad_norm": 1.6381109952926636, + "learning_rate": 3.516884697292364e-07, + "loss": 0.9635, + "step": 27650 + }, + { + "epoch": 0.6660485055754919, + "grad_norm": 1.6575664281845093, + "learning_rate": 3.5156170773755193e-07, + "loss": 0.934, + "step": 27655 + }, + { + "epoch": 0.6661689265672792, + "grad_norm": 1.5238529443740845, + "learning_rate": 3.514349457458675e-07, + "loss": 0.9337, + "step": 27660 + }, + { + "epoch": 0.6662893475590665, + "grad_norm": 1.5721980333328247, + "learning_rate": 3.513081837541831e-07, + "loss": 0.919, + "step": 27665 + }, + { + "epoch": 0.6664097685508538, + "grad_norm": 1.770738959312439, + "learning_rate": 3.511814217624987e-07, + "loss": 0.8885, + "step": 27670 + }, + { + "epoch": 0.6665301895426411, + "grad_norm": 1.657386064529419, + "learning_rate": 3.510546597708143e-07, + "loss": 0.9222, + "step": 27675 + }, + { + "epoch": 0.6666506105344283, + "grad_norm": 1.6715120077133179, + "learning_rate": 3.509278977791299e-07, + "loss": 0.9324, + "step": 27680 + }, + { + "epoch": 0.6667710315262156, + "grad_norm": 1.4019960165023804, + "learning_rate": 3.508011357874455e-07, + "loss": 0.9458, + "step": 27685 + }, + { + "epoch": 0.666891452518003, + "grad_norm": 1.474033236503601, + "learning_rate": 3.50674373795761e-07, + "loss": 0.8673, + "step": 27690 + }, + { + "epoch": 0.6670118735097902, + "grad_norm": 1.807705283164978, + "learning_rate": 3.5054761180407667e-07, + "loss": 0.8754, + "step": 27695 + }, + { + "epoch": 0.6671322945015775, + "grad_norm": 1.7381840944290161, + "learning_rate": 3.504208498123922e-07, + "loss": 0.8682, + "step": 27700 + }, + { + "epoch": 0.6672527154933648, + "grad_norm": 1.5708887577056885, + "learning_rate": 3.502940878207078e-07, + "loss": 0.9015, + "step": 27705 + }, + { + "epoch": 0.6673731364851521, + "grad_norm": 1.4795726537704468, + "learning_rate": 3.5016732582902344e-07, + "loss": 0.8973, + "step": 27710 + }, + { + "epoch": 0.6674935574769394, + "grad_norm": 1.4762171506881714, + "learning_rate": 3.50040563837339e-07, + "loss": 0.932, + "step": 27715 + }, + { + "epoch": 0.6676139784687267, + "grad_norm": 1.4258259534835815, + "learning_rate": 3.499138018456546e-07, + "loss": 0.9464, + "step": 27720 + }, + { + "epoch": 0.6677343994605139, + "grad_norm": 1.4672174453735352, + "learning_rate": 3.4978703985397016e-07, + "loss": 0.9, + "step": 27725 + }, + { + "epoch": 0.6678548204523013, + "grad_norm": 1.41704261302948, + "learning_rate": 3.4966027786228576e-07, + "loss": 0.9241, + "step": 27730 + }, + { + "epoch": 0.6679752414440885, + "grad_norm": 1.6913530826568604, + "learning_rate": 3.4953351587060135e-07, + "loss": 0.897, + "step": 27735 + }, + { + "epoch": 0.6680956624358758, + "grad_norm": 1.5633080005645752, + "learning_rate": 3.4940675387891694e-07, + "loss": 0.8569, + "step": 27740 + }, + { + "epoch": 0.6682160834276631, + "grad_norm": 1.7641186714172363, + "learning_rate": 3.4927999188723253e-07, + "loss": 0.8692, + "step": 27745 + }, + { + "epoch": 0.6683365044194504, + "grad_norm": 1.6391301155090332, + "learning_rate": 3.491532298955481e-07, + "loss": 0.9202, + "step": 27750 + }, + { + "epoch": 0.6684569254112377, + "grad_norm": 1.598775863647461, + "learning_rate": 3.490264679038637e-07, + "loss": 0.9319, + "step": 27755 + }, + { + "epoch": 0.668577346403025, + "grad_norm": 1.6690956354141235, + "learning_rate": 3.4889970591217925e-07, + "loss": 0.8681, + "step": 27760 + }, + { + "epoch": 0.6686977673948122, + "grad_norm": 1.440314531326294, + "learning_rate": 3.487729439204949e-07, + "loss": 0.8995, + "step": 27765 + }, + { + "epoch": 0.6688181883865996, + "grad_norm": 1.5245696306228638, + "learning_rate": 3.4864618192881044e-07, + "loss": 0.8909, + "step": 27770 + }, + { + "epoch": 0.6689386093783869, + "grad_norm": 1.6076589822769165, + "learning_rate": 3.4851941993712603e-07, + "loss": 0.8378, + "step": 27775 + }, + { + "epoch": 0.6690590303701741, + "grad_norm": 1.628814935684204, + "learning_rate": 3.4839265794544167e-07, + "loss": 0.8412, + "step": 27780 + }, + { + "epoch": 0.6691794513619614, + "grad_norm": 1.5785472393035889, + "learning_rate": 3.482658959537572e-07, + "loss": 0.913, + "step": 27785 + }, + { + "epoch": 0.6692998723537487, + "grad_norm": 1.8055250644683838, + "learning_rate": 3.481391339620728e-07, + "loss": 0.9016, + "step": 27790 + }, + { + "epoch": 0.669420293345536, + "grad_norm": 1.6282498836517334, + "learning_rate": 3.480123719703884e-07, + "loss": 0.9203, + "step": 27795 + }, + { + "epoch": 0.6695407143373233, + "grad_norm": 1.3604364395141602, + "learning_rate": 3.47885609978704e-07, + "loss": 0.9183, + "step": 27800 + }, + { + "epoch": 0.6696611353291105, + "grad_norm": 1.5119647979736328, + "learning_rate": 3.477588479870195e-07, + "loss": 0.9655, + "step": 27805 + }, + { + "epoch": 0.6697815563208979, + "grad_norm": 1.5084353685379028, + "learning_rate": 3.4763208599533517e-07, + "loss": 0.9002, + "step": 27810 + }, + { + "epoch": 0.6699019773126852, + "grad_norm": 1.4556913375854492, + "learning_rate": 3.475053240036507e-07, + "loss": 0.896, + "step": 27815 + }, + { + "epoch": 0.6700223983044724, + "grad_norm": 1.5710209608078003, + "learning_rate": 3.473785620119663e-07, + "loss": 0.9056, + "step": 27820 + }, + { + "epoch": 0.6701428192962597, + "grad_norm": 2.3126919269561768, + "learning_rate": 3.4725180002028195e-07, + "loss": 0.8748, + "step": 27825 + }, + { + "epoch": 0.670263240288047, + "grad_norm": 1.6674573421478271, + "learning_rate": 3.471250380285975e-07, + "loss": 0.9199, + "step": 27830 + }, + { + "epoch": 0.6703836612798343, + "grad_norm": 1.5254837274551392, + "learning_rate": 3.469982760369131e-07, + "loss": 0.9121, + "step": 27835 + }, + { + "epoch": 0.6705040822716216, + "grad_norm": 1.6415302753448486, + "learning_rate": 3.4687151404522867e-07, + "loss": 0.9034, + "step": 27840 + }, + { + "epoch": 0.6706245032634088, + "grad_norm": 1.5428940057754517, + "learning_rate": 3.4674475205354426e-07, + "loss": 0.906, + "step": 27845 + }, + { + "epoch": 0.6707449242551962, + "grad_norm": 1.6544368267059326, + "learning_rate": 3.466179900618598e-07, + "loss": 0.9137, + "step": 27850 + }, + { + "epoch": 0.6708653452469835, + "grad_norm": 1.7646950483322144, + "learning_rate": 3.4649122807017544e-07, + "loss": 0.8985, + "step": 27855 + }, + { + "epoch": 0.6709857662387707, + "grad_norm": 1.6507251262664795, + "learning_rate": 3.4636446607849104e-07, + "loss": 0.8856, + "step": 27860 + }, + { + "epoch": 0.671106187230558, + "grad_norm": 1.5486855506896973, + "learning_rate": 3.462377040868066e-07, + "loss": 0.8904, + "step": 27865 + }, + { + "epoch": 0.6712266082223454, + "grad_norm": 1.3910833597183228, + "learning_rate": 3.461109420951222e-07, + "loss": 0.9062, + "step": 27870 + }, + { + "epoch": 0.6713470292141326, + "grad_norm": 1.4404289722442627, + "learning_rate": 3.4598418010343776e-07, + "loss": 0.9364, + "step": 27875 + }, + { + "epoch": 0.6714674502059199, + "grad_norm": 1.5207289457321167, + "learning_rate": 3.4585741811175335e-07, + "loss": 0.9202, + "step": 27880 + }, + { + "epoch": 0.6715878711977071, + "grad_norm": 1.6543253660202026, + "learning_rate": 3.4573065612006894e-07, + "loss": 0.9125, + "step": 27885 + }, + { + "epoch": 0.6717082921894945, + "grad_norm": 1.3949698209762573, + "learning_rate": 3.4560389412838453e-07, + "loss": 0.8908, + "step": 27890 + }, + { + "epoch": 0.6718287131812818, + "grad_norm": 1.6345555782318115, + "learning_rate": 3.454771321367001e-07, + "loss": 0.9174, + "step": 27895 + }, + { + "epoch": 0.671949134173069, + "grad_norm": 1.3611493110656738, + "learning_rate": 3.453503701450157e-07, + "loss": 0.9037, + "step": 27900 + }, + { + "epoch": 0.6720695551648563, + "grad_norm": 1.530236005783081, + "learning_rate": 3.452236081533313e-07, + "loss": 0.911, + "step": 27905 + }, + { + "epoch": 0.6721899761566437, + "grad_norm": 1.83746337890625, + "learning_rate": 3.4509684616164685e-07, + "loss": 0.9357, + "step": 27910 + }, + { + "epoch": 0.6723103971484309, + "grad_norm": 1.508793830871582, + "learning_rate": 3.449700841699625e-07, + "loss": 0.8985, + "step": 27915 + }, + { + "epoch": 0.6724308181402182, + "grad_norm": 1.5518765449523926, + "learning_rate": 3.4484332217827803e-07, + "loss": 0.8257, + "step": 27920 + }, + { + "epoch": 0.6725512391320055, + "grad_norm": 1.5395053625106812, + "learning_rate": 3.447165601865936e-07, + "loss": 0.9241, + "step": 27925 + }, + { + "epoch": 0.6726716601237928, + "grad_norm": 1.581311583518982, + "learning_rate": 3.4458979819490927e-07, + "loss": 0.9167, + "step": 27930 + }, + { + "epoch": 0.6727920811155801, + "grad_norm": 1.6826280355453491, + "learning_rate": 3.444630362032248e-07, + "loss": 0.9198, + "step": 27935 + }, + { + "epoch": 0.6729125021073673, + "grad_norm": 1.5144160985946655, + "learning_rate": 3.443362742115404e-07, + "loss": 0.8347, + "step": 27940 + }, + { + "epoch": 0.6730329230991546, + "grad_norm": 1.4464256763458252, + "learning_rate": 3.44209512219856e-07, + "loss": 0.9136, + "step": 27945 + }, + { + "epoch": 0.673153344090942, + "grad_norm": 1.5539331436157227, + "learning_rate": 3.440827502281716e-07, + "loss": 0.8851, + "step": 27950 + }, + { + "epoch": 0.6732737650827292, + "grad_norm": 1.602190613746643, + "learning_rate": 3.439559882364871e-07, + "loss": 0.8836, + "step": 27955 + }, + { + "epoch": 0.6733941860745165, + "grad_norm": 1.5354704856872559, + "learning_rate": 3.4382922624480277e-07, + "loss": 0.886, + "step": 27960 + }, + { + "epoch": 0.6735146070663038, + "grad_norm": 1.4831632375717163, + "learning_rate": 3.437024642531183e-07, + "loss": 0.9186, + "step": 27965 + }, + { + "epoch": 0.6736350280580911, + "grad_norm": 1.542081356048584, + "learning_rate": 3.435757022614339e-07, + "loss": 0.9533, + "step": 27970 + }, + { + "epoch": 0.6737554490498784, + "grad_norm": 1.5842245817184448, + "learning_rate": 3.4344894026974954e-07, + "loss": 0.8567, + "step": 27975 + }, + { + "epoch": 0.6738758700416657, + "grad_norm": 1.6727997064590454, + "learning_rate": 3.433221782780651e-07, + "loss": 0.9581, + "step": 27980 + }, + { + "epoch": 0.6739962910334529, + "grad_norm": 1.609487533569336, + "learning_rate": 3.4319541628638067e-07, + "loss": 0.8881, + "step": 27985 + }, + { + "epoch": 0.6741167120252403, + "grad_norm": 1.6405456066131592, + "learning_rate": 3.4306865429469626e-07, + "loss": 0.9186, + "step": 27990 + }, + { + "epoch": 0.6742371330170275, + "grad_norm": 1.876981496810913, + "learning_rate": 3.4294189230301186e-07, + "loss": 0.8679, + "step": 27995 + }, + { + "epoch": 0.6743575540088148, + "grad_norm": 1.5377568006515503, + "learning_rate": 3.428151303113274e-07, + "loss": 0.9266, + "step": 28000 + }, + { + "epoch": 0.6744779750006021, + "grad_norm": 1.7596919536590576, + "learning_rate": 3.4268836831964304e-07, + "loss": 0.9209, + "step": 28005 + }, + { + "epoch": 0.6745983959923894, + "grad_norm": 1.688023328781128, + "learning_rate": 3.4256160632795863e-07, + "loss": 0.9065, + "step": 28010 + }, + { + "epoch": 0.6747188169841767, + "grad_norm": 1.5075517892837524, + "learning_rate": 3.4243484433627417e-07, + "loss": 0.903, + "step": 28015 + }, + { + "epoch": 0.674839237975964, + "grad_norm": 1.4978880882263184, + "learning_rate": 3.423080823445898e-07, + "loss": 0.8766, + "step": 28020 + }, + { + "epoch": 0.6749596589677512, + "grad_norm": 1.665744662284851, + "learning_rate": 3.4218132035290535e-07, + "loss": 0.8692, + "step": 28025 + }, + { + "epoch": 0.6750800799595386, + "grad_norm": 1.611232042312622, + "learning_rate": 3.4205455836122095e-07, + "loss": 0.8826, + "step": 28030 + }, + { + "epoch": 0.6752005009513259, + "grad_norm": 2.0650761127471924, + "learning_rate": 3.4192779636953654e-07, + "loss": 0.9662, + "step": 28035 + }, + { + "epoch": 0.6753209219431131, + "grad_norm": 1.5076416730880737, + "learning_rate": 3.4180103437785213e-07, + "loss": 0.8655, + "step": 28040 + }, + { + "epoch": 0.6754413429349004, + "grad_norm": 1.4803639650344849, + "learning_rate": 3.416742723861677e-07, + "loss": 0.8003, + "step": 28045 + }, + { + "epoch": 0.6755617639266877, + "grad_norm": 1.8180280923843384, + "learning_rate": 3.415475103944833e-07, + "loss": 0.8784, + "step": 28050 + }, + { + "epoch": 0.675682184918475, + "grad_norm": 1.4070686101913452, + "learning_rate": 3.414207484027989e-07, + "loss": 0.8566, + "step": 28055 + }, + { + "epoch": 0.6758026059102623, + "grad_norm": 1.5639938116073608, + "learning_rate": 3.4129398641111444e-07, + "loss": 0.8849, + "step": 28060 + }, + { + "epoch": 0.6759230269020495, + "grad_norm": 1.6492422819137573, + "learning_rate": 3.411672244194301e-07, + "loss": 0.8641, + "step": 28065 + }, + { + "epoch": 0.6760434478938369, + "grad_norm": 1.6205191612243652, + "learning_rate": 3.4104046242774563e-07, + "loss": 0.8943, + "step": 28070 + }, + { + "epoch": 0.6761638688856242, + "grad_norm": 1.5223482847213745, + "learning_rate": 3.409137004360612e-07, + "loss": 0.8655, + "step": 28075 + }, + { + "epoch": 0.6762842898774114, + "grad_norm": 1.5024832487106323, + "learning_rate": 3.4078693844437686e-07, + "loss": 0.9364, + "step": 28080 + }, + { + "epoch": 0.6764047108691987, + "grad_norm": 1.5253880023956299, + "learning_rate": 3.406601764526924e-07, + "loss": 0.9013, + "step": 28085 + }, + { + "epoch": 0.6765251318609861, + "grad_norm": 1.6916687488555908, + "learning_rate": 3.40533414461008e-07, + "loss": 0.9208, + "step": 28090 + }, + { + "epoch": 0.6766455528527733, + "grad_norm": 1.7585076093673706, + "learning_rate": 3.404066524693236e-07, + "loss": 0.8898, + "step": 28095 + }, + { + "epoch": 0.6767659738445606, + "grad_norm": 1.453704833984375, + "learning_rate": 3.402798904776392e-07, + "loss": 0.9072, + "step": 28100 + }, + { + "epoch": 0.6768863948363478, + "grad_norm": 1.501059651374817, + "learning_rate": 3.401531284859547e-07, + "loss": 0.8604, + "step": 28105 + }, + { + "epoch": 0.6770068158281352, + "grad_norm": 1.4455554485321045, + "learning_rate": 3.4002636649427036e-07, + "loss": 0.9066, + "step": 28110 + }, + { + "epoch": 0.6771272368199225, + "grad_norm": 1.4117891788482666, + "learning_rate": 3.398996045025859e-07, + "loss": 0.8527, + "step": 28115 + }, + { + "epoch": 0.6772476578117097, + "grad_norm": 1.7244776487350464, + "learning_rate": 3.397728425109015e-07, + "loss": 0.8595, + "step": 28120 + }, + { + "epoch": 0.677368078803497, + "grad_norm": 1.7050223350524902, + "learning_rate": 3.3964608051921714e-07, + "loss": 0.941, + "step": 28125 + }, + { + "epoch": 0.6774884997952844, + "grad_norm": 1.3838149309158325, + "learning_rate": 3.395193185275327e-07, + "loss": 0.9236, + "step": 28130 + }, + { + "epoch": 0.6776089207870716, + "grad_norm": 1.6386252641677856, + "learning_rate": 3.3939255653584827e-07, + "loss": 0.8627, + "step": 28135 + }, + { + "epoch": 0.6777293417788589, + "grad_norm": 1.5708414316177368, + "learning_rate": 3.3926579454416386e-07, + "loss": 0.942, + "step": 28140 + }, + { + "epoch": 0.6778497627706461, + "grad_norm": 1.4829083681106567, + "learning_rate": 3.3913903255247945e-07, + "loss": 0.8537, + "step": 28145 + }, + { + "epoch": 0.6779701837624335, + "grad_norm": 1.8113583326339722, + "learning_rate": 3.39012270560795e-07, + "loss": 0.9425, + "step": 28150 + }, + { + "epoch": 0.6780906047542208, + "grad_norm": 1.42169988155365, + "learning_rate": 3.3888550856911063e-07, + "loss": 0.928, + "step": 28155 + }, + { + "epoch": 0.678211025746008, + "grad_norm": 1.514338493347168, + "learning_rate": 3.387587465774262e-07, + "loss": 0.8995, + "step": 28160 + }, + { + "epoch": 0.6783314467377953, + "grad_norm": 1.3799099922180176, + "learning_rate": 3.3863198458574176e-07, + "loss": 0.8719, + "step": 28165 + }, + { + "epoch": 0.6784518677295827, + "grad_norm": 1.5651031732559204, + "learning_rate": 3.385052225940574e-07, + "loss": 0.9245, + "step": 28170 + }, + { + "epoch": 0.6785722887213699, + "grad_norm": 1.80035400390625, + "learning_rate": 3.3837846060237295e-07, + "loss": 0.9217, + "step": 28175 + }, + { + "epoch": 0.6786927097131572, + "grad_norm": 1.7736549377441406, + "learning_rate": 3.3825169861068854e-07, + "loss": 0.918, + "step": 28180 + }, + { + "epoch": 0.6788131307049445, + "grad_norm": 1.5086334943771362, + "learning_rate": 3.3812493661900413e-07, + "loss": 0.9244, + "step": 28185 + }, + { + "epoch": 0.6789335516967318, + "grad_norm": 1.5210703611373901, + "learning_rate": 3.379981746273197e-07, + "loss": 0.8856, + "step": 28190 + }, + { + "epoch": 0.6790539726885191, + "grad_norm": 1.441664218902588, + "learning_rate": 3.3787141263563526e-07, + "loss": 0.8924, + "step": 28195 + }, + { + "epoch": 0.6791743936803063, + "grad_norm": 1.375878095626831, + "learning_rate": 3.377446506439509e-07, + "loss": 0.9211, + "step": 28200 + }, + { + "epoch": 0.6792948146720936, + "grad_norm": 1.7333893775939941, + "learning_rate": 3.376178886522665e-07, + "loss": 0.9314, + "step": 28205 + }, + { + "epoch": 0.679415235663881, + "grad_norm": 1.5676325559616089, + "learning_rate": 3.3749112666058204e-07, + "loss": 0.9135, + "step": 28210 + }, + { + "epoch": 0.6795356566556682, + "grad_norm": 1.579702377319336, + "learning_rate": 3.373643646688977e-07, + "loss": 0.8987, + "step": 28215 + }, + { + "epoch": 0.6796560776474555, + "grad_norm": 2.8164761066436768, + "learning_rate": 3.372376026772132e-07, + "loss": 0.8973, + "step": 28220 + }, + { + "epoch": 0.6797764986392428, + "grad_norm": 1.6582890748977661, + "learning_rate": 3.3711084068552887e-07, + "loss": 0.9338, + "step": 28225 + }, + { + "epoch": 0.67989691963103, + "grad_norm": 1.664105772972107, + "learning_rate": 3.369840786938444e-07, + "loss": 0.9562, + "step": 28230 + }, + { + "epoch": 0.6800173406228174, + "grad_norm": 1.8200933933258057, + "learning_rate": 3.3685731670216e-07, + "loss": 0.8899, + "step": 28235 + }, + { + "epoch": 0.6801377616146047, + "grad_norm": 1.5525609254837036, + "learning_rate": 3.3673055471047564e-07, + "loss": 0.8831, + "step": 28240 + }, + { + "epoch": 0.6802581826063919, + "grad_norm": 1.448174238204956, + "learning_rate": 3.366037927187912e-07, + "loss": 0.901, + "step": 28245 + }, + { + "epoch": 0.6803786035981793, + "grad_norm": 1.6512342691421509, + "learning_rate": 3.3647703072710677e-07, + "loss": 0.9541, + "step": 28250 + }, + { + "epoch": 0.6804990245899665, + "grad_norm": 1.595133662223816, + "learning_rate": 3.3635026873542236e-07, + "loss": 0.9343, + "step": 28255 + }, + { + "epoch": 0.6806194455817538, + "grad_norm": 1.513055443763733, + "learning_rate": 3.3622350674373796e-07, + "loss": 0.9306, + "step": 28260 + }, + { + "epoch": 0.6807398665735411, + "grad_norm": 1.5822969675064087, + "learning_rate": 3.360967447520535e-07, + "loss": 0.8789, + "step": 28265 + }, + { + "epoch": 0.6808602875653283, + "grad_norm": 1.7590924501419067, + "learning_rate": 3.3596998276036914e-07, + "loss": 0.8886, + "step": 28270 + }, + { + "epoch": 0.6809807085571157, + "grad_norm": 1.41705322265625, + "learning_rate": 3.3584322076868473e-07, + "loss": 0.9205, + "step": 28275 + }, + { + "epoch": 0.681101129548903, + "grad_norm": 1.8668830394744873, + "learning_rate": 3.3571645877700027e-07, + "loss": 0.9275, + "step": 28280 + }, + { + "epoch": 0.6812215505406902, + "grad_norm": 1.4133549928665161, + "learning_rate": 3.355896967853159e-07, + "loss": 0.9073, + "step": 28285 + }, + { + "epoch": 0.6813419715324776, + "grad_norm": 1.3410143852233887, + "learning_rate": 3.3546293479363145e-07, + "loss": 0.8635, + "step": 28290 + }, + { + "epoch": 0.6814623925242649, + "grad_norm": 1.4225311279296875, + "learning_rate": 3.3533617280194705e-07, + "loss": 0.9126, + "step": 28295 + }, + { + "epoch": 0.6815828135160521, + "grad_norm": 1.5505460500717163, + "learning_rate": 3.3520941081026264e-07, + "loss": 0.8903, + "step": 28300 + }, + { + "epoch": 0.6817032345078394, + "grad_norm": 1.7633575201034546, + "learning_rate": 3.3508264881857823e-07, + "loss": 0.8563, + "step": 28305 + }, + { + "epoch": 0.6818236554996266, + "grad_norm": 1.2661945819854736, + "learning_rate": 3.349558868268938e-07, + "loss": 0.8984, + "step": 28310 + }, + { + "epoch": 0.681944076491414, + "grad_norm": 1.5636026859283447, + "learning_rate": 3.348291248352094e-07, + "loss": 0.8653, + "step": 28315 + }, + { + "epoch": 0.6820644974832013, + "grad_norm": 1.41295325756073, + "learning_rate": 3.34702362843525e-07, + "loss": 0.8966, + "step": 28320 + }, + { + "epoch": 0.6821849184749885, + "grad_norm": 1.695952296257019, + "learning_rate": 3.3457560085184054e-07, + "loss": 0.9029, + "step": 28325 + }, + { + "epoch": 0.6823053394667759, + "grad_norm": 1.8036835193634033, + "learning_rate": 3.344488388601562e-07, + "loss": 0.933, + "step": 28330 + }, + { + "epoch": 0.6824257604585632, + "grad_norm": 1.3599246740341187, + "learning_rate": 3.3432207686847173e-07, + "loss": 0.8424, + "step": 28335 + }, + { + "epoch": 0.6825461814503504, + "grad_norm": 1.4054001569747925, + "learning_rate": 3.341953148767873e-07, + "loss": 0.8867, + "step": 28340 + }, + { + "epoch": 0.6826666024421377, + "grad_norm": 1.5154298543930054, + "learning_rate": 3.3406855288510296e-07, + "loss": 0.8846, + "step": 28345 + }, + { + "epoch": 0.682787023433925, + "grad_norm": 1.7169283628463745, + "learning_rate": 3.339417908934185e-07, + "loss": 0.9178, + "step": 28350 + }, + { + "epoch": 0.6829074444257123, + "grad_norm": 1.7609354257583618, + "learning_rate": 3.338150289017341e-07, + "loss": 0.8968, + "step": 28355 + }, + { + "epoch": 0.6830278654174996, + "grad_norm": 1.993127703666687, + "learning_rate": 3.336882669100497e-07, + "loss": 0.9229, + "step": 28360 + }, + { + "epoch": 0.6831482864092868, + "grad_norm": 1.6624497175216675, + "learning_rate": 3.335615049183653e-07, + "loss": 0.8973, + "step": 28365 + }, + { + "epoch": 0.6832687074010741, + "grad_norm": 1.5066438913345337, + "learning_rate": 3.334347429266808e-07, + "loss": 0.9035, + "step": 28370 + }, + { + "epoch": 0.6833891283928615, + "grad_norm": 1.6899200677871704, + "learning_rate": 3.3330798093499646e-07, + "loss": 0.9272, + "step": 28375 + }, + { + "epoch": 0.6835095493846487, + "grad_norm": 1.4545860290527344, + "learning_rate": 3.33181218943312e-07, + "loss": 0.8128, + "step": 28380 + }, + { + "epoch": 0.683629970376436, + "grad_norm": 1.4383794069290161, + "learning_rate": 3.330544569516276e-07, + "loss": 0.862, + "step": 28385 + }, + { + "epoch": 0.6837503913682234, + "grad_norm": 1.5542722940444946, + "learning_rate": 3.3292769495994324e-07, + "loss": 0.9214, + "step": 28390 + }, + { + "epoch": 0.6838708123600106, + "grad_norm": 1.58388090133667, + "learning_rate": 3.328009329682588e-07, + "loss": 0.9244, + "step": 28395 + }, + { + "epoch": 0.6839912333517979, + "grad_norm": 1.826556921005249, + "learning_rate": 3.3267417097657437e-07, + "loss": 0.9191, + "step": 28400 + }, + { + "epoch": 0.6841116543435851, + "grad_norm": 1.6162747144699097, + "learning_rate": 3.3254740898488996e-07, + "loss": 0.9699, + "step": 28405 + }, + { + "epoch": 0.6842320753353724, + "grad_norm": 1.5720504522323608, + "learning_rate": 3.3242064699320555e-07, + "loss": 0.8742, + "step": 28410 + }, + { + "epoch": 0.6843524963271598, + "grad_norm": 1.6315302848815918, + "learning_rate": 3.322938850015211e-07, + "loss": 0.9017, + "step": 28415 + }, + { + "epoch": 0.684472917318947, + "grad_norm": 1.5458885431289673, + "learning_rate": 3.3216712300983673e-07, + "loss": 0.9638, + "step": 28420 + }, + { + "epoch": 0.6845933383107343, + "grad_norm": 1.3931212425231934, + "learning_rate": 3.320403610181523e-07, + "loss": 0.8568, + "step": 28425 + }, + { + "epoch": 0.6847137593025217, + "grad_norm": 1.4961457252502441, + "learning_rate": 3.3191359902646787e-07, + "loss": 0.868, + "step": 28430 + }, + { + "epoch": 0.6848341802943089, + "grad_norm": 1.6866486072540283, + "learning_rate": 3.317868370347835e-07, + "loss": 0.8903, + "step": 28435 + }, + { + "epoch": 0.6849546012860962, + "grad_norm": 1.539838433265686, + "learning_rate": 3.3166007504309905e-07, + "loss": 0.8707, + "step": 28440 + }, + { + "epoch": 0.6850750222778835, + "grad_norm": 1.6249197721481323, + "learning_rate": 3.3153331305141464e-07, + "loss": 0.9353, + "step": 28445 + }, + { + "epoch": 0.6851954432696707, + "grad_norm": 1.4114655256271362, + "learning_rate": 3.3140655105973023e-07, + "loss": 0.8629, + "step": 28450 + }, + { + "epoch": 0.6853158642614581, + "grad_norm": 1.4835420846939087, + "learning_rate": 3.312797890680458e-07, + "loss": 0.8831, + "step": 28455 + }, + { + "epoch": 0.6854362852532453, + "grad_norm": 1.542344570159912, + "learning_rate": 3.311530270763614e-07, + "loss": 0.8706, + "step": 28460 + }, + { + "epoch": 0.6855567062450326, + "grad_norm": 1.5637487173080444, + "learning_rate": 3.31026265084677e-07, + "loss": 0.8511, + "step": 28465 + }, + { + "epoch": 0.68567712723682, + "grad_norm": 1.612994909286499, + "learning_rate": 3.308995030929926e-07, + "loss": 0.8995, + "step": 28470 + }, + { + "epoch": 0.6857975482286072, + "grad_norm": 1.5317628383636475, + "learning_rate": 3.3077274110130814e-07, + "loss": 0.9169, + "step": 28475 + }, + { + "epoch": 0.6859179692203945, + "grad_norm": 1.5903844833374023, + "learning_rate": 3.306459791096238e-07, + "loss": 0.9222, + "step": 28480 + }, + { + "epoch": 0.6860383902121818, + "grad_norm": 1.6235851049423218, + "learning_rate": 3.305192171179393e-07, + "loss": 0.9109, + "step": 28485 + }, + { + "epoch": 0.686158811203969, + "grad_norm": 1.5382553339004517, + "learning_rate": 3.303924551262549e-07, + "loss": 0.9307, + "step": 28490 + }, + { + "epoch": 0.6862792321957564, + "grad_norm": 1.549065113067627, + "learning_rate": 3.3026569313457056e-07, + "loss": 0.9015, + "step": 28495 + }, + { + "epoch": 0.6863996531875437, + "grad_norm": 1.5097498893737793, + "learning_rate": 3.301389311428861e-07, + "loss": 0.8801, + "step": 28500 + }, + { + "epoch": 0.6865200741793309, + "grad_norm": 1.4290951490402222, + "learning_rate": 3.300121691512017e-07, + "loss": 0.9129, + "step": 28505 + }, + { + "epoch": 0.6866404951711182, + "grad_norm": 1.4810277223587036, + "learning_rate": 3.298854071595173e-07, + "loss": 0.8924, + "step": 28510 + }, + { + "epoch": 0.6867609161629055, + "grad_norm": 1.5234373807907104, + "learning_rate": 3.2975864516783287e-07, + "loss": 0.8969, + "step": 28515 + }, + { + "epoch": 0.6868813371546928, + "grad_norm": 1.4685182571411133, + "learning_rate": 3.296318831761484e-07, + "loss": 0.8968, + "step": 28520 + }, + { + "epoch": 0.6870017581464801, + "grad_norm": 1.7230477333068848, + "learning_rate": 3.2950512118446406e-07, + "loss": 0.9227, + "step": 28525 + }, + { + "epoch": 0.6871221791382673, + "grad_norm": 1.5892548561096191, + "learning_rate": 3.293783591927796e-07, + "loss": 0.915, + "step": 28530 + }, + { + "epoch": 0.6872426001300547, + "grad_norm": 1.4972862005233765, + "learning_rate": 3.292515972010952e-07, + "loss": 0.9013, + "step": 28535 + }, + { + "epoch": 0.687363021121842, + "grad_norm": 1.53257417678833, + "learning_rate": 3.2912483520941083e-07, + "loss": 0.8379, + "step": 28540 + }, + { + "epoch": 0.6874834421136292, + "grad_norm": 1.5534003973007202, + "learning_rate": 3.2899807321772637e-07, + "loss": 0.8975, + "step": 28545 + }, + { + "epoch": 0.6876038631054165, + "grad_norm": 1.609209418296814, + "learning_rate": 3.2887131122604196e-07, + "loss": 0.8671, + "step": 28550 + }, + { + "epoch": 0.6877242840972039, + "grad_norm": 1.4327870607376099, + "learning_rate": 3.2874454923435755e-07, + "loss": 0.86, + "step": 28555 + }, + { + "epoch": 0.6878447050889911, + "grad_norm": 1.5099050998687744, + "learning_rate": 3.2861778724267315e-07, + "loss": 0.9027, + "step": 28560 + }, + { + "epoch": 0.6879651260807784, + "grad_norm": 1.4186595678329468, + "learning_rate": 3.284910252509887e-07, + "loss": 0.8845, + "step": 28565 + }, + { + "epoch": 0.6880855470725656, + "grad_norm": 1.5847558975219727, + "learning_rate": 3.2836426325930433e-07, + "loss": 0.9053, + "step": 28570 + }, + { + "epoch": 0.688205968064353, + "grad_norm": 1.4546008110046387, + "learning_rate": 3.282375012676199e-07, + "loss": 0.9295, + "step": 28575 + }, + { + "epoch": 0.6883263890561403, + "grad_norm": 1.7883195877075195, + "learning_rate": 3.2811073927593546e-07, + "loss": 0.9353, + "step": 28580 + }, + { + "epoch": 0.6884468100479275, + "grad_norm": 1.451277256011963, + "learning_rate": 3.279839772842511e-07, + "loss": 0.876, + "step": 28585 + }, + { + "epoch": 0.6885672310397148, + "grad_norm": 1.5470596551895142, + "learning_rate": 3.2785721529256664e-07, + "loss": 0.9338, + "step": 28590 + }, + { + "epoch": 0.6886876520315022, + "grad_norm": 1.6704994440078735, + "learning_rate": 3.2773045330088224e-07, + "loss": 0.8935, + "step": 28595 + }, + { + "epoch": 0.6888080730232894, + "grad_norm": 1.406724452972412, + "learning_rate": 3.2760369130919783e-07, + "loss": 0.8856, + "step": 28600 + }, + { + "epoch": 0.6889284940150767, + "grad_norm": 1.718907117843628, + "learning_rate": 3.274769293175134e-07, + "loss": 0.9068, + "step": 28605 + }, + { + "epoch": 0.689048915006864, + "grad_norm": 1.6455481052398682, + "learning_rate": 3.27350167325829e-07, + "loss": 0.8716, + "step": 28610 + }, + { + "epoch": 0.6891693359986513, + "grad_norm": 1.4469377994537354, + "learning_rate": 3.272234053341446e-07, + "loss": 0.887, + "step": 28615 + }, + { + "epoch": 0.6892897569904386, + "grad_norm": 1.5895036458969116, + "learning_rate": 3.270966433424602e-07, + "loss": 0.9508, + "step": 28620 + }, + { + "epoch": 0.6894101779822258, + "grad_norm": 1.68479585647583, + "learning_rate": 3.2696988135077573e-07, + "loss": 0.8584, + "step": 28625 + }, + { + "epoch": 0.6895305989740131, + "grad_norm": 1.654051661491394, + "learning_rate": 3.268431193590914e-07, + "loss": 0.86, + "step": 28630 + }, + { + "epoch": 0.6896510199658005, + "grad_norm": 1.661611795425415, + "learning_rate": 3.267163573674069e-07, + "loss": 0.8986, + "step": 28635 + }, + { + "epoch": 0.6897714409575877, + "grad_norm": 1.7625168561935425, + "learning_rate": 3.265895953757225e-07, + "loss": 0.8978, + "step": 28640 + }, + { + "epoch": 0.689891861949375, + "grad_norm": 1.7721279859542847, + "learning_rate": 3.2646283338403815e-07, + "loss": 0.9409, + "step": 28645 + }, + { + "epoch": 0.6900122829411623, + "grad_norm": 1.6151334047317505, + "learning_rate": 3.263360713923537e-07, + "loss": 0.9559, + "step": 28650 + }, + { + "epoch": 0.6901327039329496, + "grad_norm": 1.4370307922363281, + "learning_rate": 3.262093094006693e-07, + "loss": 0.8596, + "step": 28655 + }, + { + "epoch": 0.6902531249247369, + "grad_norm": 1.4641938209533691, + "learning_rate": 3.260825474089849e-07, + "loss": 0.9553, + "step": 28660 + }, + { + "epoch": 0.6903735459165242, + "grad_norm": 1.4579864740371704, + "learning_rate": 3.2595578541730047e-07, + "loss": 0.8337, + "step": 28665 + }, + { + "epoch": 0.6904939669083114, + "grad_norm": 1.6612746715545654, + "learning_rate": 3.25829023425616e-07, + "loss": 0.8416, + "step": 28670 + }, + { + "epoch": 0.6906143879000988, + "grad_norm": 1.6550942659378052, + "learning_rate": 3.2570226143393165e-07, + "loss": 0.9109, + "step": 28675 + }, + { + "epoch": 0.690734808891886, + "grad_norm": 1.4688982963562012, + "learning_rate": 3.255754994422472e-07, + "loss": 0.9009, + "step": 28680 + }, + { + "epoch": 0.6908552298836733, + "grad_norm": 1.709791898727417, + "learning_rate": 3.254487374505628e-07, + "loss": 0.8808, + "step": 28685 + }, + { + "epoch": 0.6909756508754606, + "grad_norm": 1.4528766870498657, + "learning_rate": 3.2532197545887843e-07, + "loss": 0.9381, + "step": 28690 + }, + { + "epoch": 0.6910960718672479, + "grad_norm": 1.5410945415496826, + "learning_rate": 3.2519521346719397e-07, + "loss": 0.853, + "step": 28695 + }, + { + "epoch": 0.6912164928590352, + "grad_norm": 1.4717358350753784, + "learning_rate": 3.2506845147550956e-07, + "loss": 0.8683, + "step": 28700 + }, + { + "epoch": 0.6913369138508225, + "grad_norm": 1.528515100479126, + "learning_rate": 3.2494168948382515e-07, + "loss": 0.93, + "step": 28705 + }, + { + "epoch": 0.6914573348426097, + "grad_norm": 1.6877847909927368, + "learning_rate": 3.2481492749214074e-07, + "loss": 0.9553, + "step": 28710 + }, + { + "epoch": 0.6915777558343971, + "grad_norm": 1.7174590826034546, + "learning_rate": 3.246881655004563e-07, + "loss": 0.8947, + "step": 28715 + }, + { + "epoch": 0.6916981768261843, + "grad_norm": 1.3972924947738647, + "learning_rate": 3.245614035087719e-07, + "loss": 0.9128, + "step": 28720 + }, + { + "epoch": 0.6918185978179716, + "grad_norm": 1.4659849405288696, + "learning_rate": 3.244346415170875e-07, + "loss": 0.9376, + "step": 28725 + }, + { + "epoch": 0.6919390188097589, + "grad_norm": 1.6079254150390625, + "learning_rate": 3.2430787952540306e-07, + "loss": 0.8994, + "step": 28730 + }, + { + "epoch": 0.6920594398015462, + "grad_norm": 1.5413719415664673, + "learning_rate": 3.241811175337187e-07, + "loss": 0.9299, + "step": 28735 + }, + { + "epoch": 0.6921798607933335, + "grad_norm": 1.6940799951553345, + "learning_rate": 3.2405435554203424e-07, + "loss": 0.85, + "step": 28740 + }, + { + "epoch": 0.6923002817851208, + "grad_norm": 1.489127278327942, + "learning_rate": 3.239275935503499e-07, + "loss": 0.931, + "step": 28745 + }, + { + "epoch": 0.692420702776908, + "grad_norm": 1.5167713165283203, + "learning_rate": 3.238008315586654e-07, + "loss": 0.9065, + "step": 28750 + }, + { + "epoch": 0.6925411237686954, + "grad_norm": 1.5457457304000854, + "learning_rate": 3.23674069566981e-07, + "loss": 0.9039, + "step": 28755 + }, + { + "epoch": 0.6926615447604827, + "grad_norm": 1.664921760559082, + "learning_rate": 3.2354730757529666e-07, + "loss": 0.867, + "step": 28760 + }, + { + "epoch": 0.6927819657522699, + "grad_norm": 1.631435751914978, + "learning_rate": 3.234205455836122e-07, + "loss": 0.8717, + "step": 28765 + }, + { + "epoch": 0.6929023867440572, + "grad_norm": 1.630450963973999, + "learning_rate": 3.232937835919278e-07, + "loss": 0.9026, + "step": 28770 + }, + { + "epoch": 0.6930228077358445, + "grad_norm": 1.4385497570037842, + "learning_rate": 3.231670216002434e-07, + "loss": 0.9005, + "step": 28775 + }, + { + "epoch": 0.6931432287276318, + "grad_norm": 1.5163872241973877, + "learning_rate": 3.2304025960855897e-07, + "loss": 0.9388, + "step": 28780 + }, + { + "epoch": 0.6932636497194191, + "grad_norm": 1.693488597869873, + "learning_rate": 3.229134976168745e-07, + "loss": 0.8862, + "step": 28785 + }, + { + "epoch": 0.6933840707112063, + "grad_norm": 1.6429200172424316, + "learning_rate": 3.2278673562519016e-07, + "loss": 0.8752, + "step": 28790 + }, + { + "epoch": 0.6935044917029937, + "grad_norm": 1.5503240823745728, + "learning_rate": 3.2265997363350575e-07, + "loss": 0.9339, + "step": 28795 + }, + { + "epoch": 0.693624912694781, + "grad_norm": 1.4487390518188477, + "learning_rate": 3.225332116418213e-07, + "loss": 0.911, + "step": 28800 + }, + { + "epoch": 0.6937453336865682, + "grad_norm": 1.561008095741272, + "learning_rate": 3.2240644965013693e-07, + "loss": 0.8992, + "step": 28805 + }, + { + "epoch": 0.6938657546783555, + "grad_norm": 1.6827517747879028, + "learning_rate": 3.2227968765845247e-07, + "loss": 0.9336, + "step": 28810 + }, + { + "epoch": 0.6939861756701429, + "grad_norm": 1.4995176792144775, + "learning_rate": 3.2215292566676806e-07, + "loss": 0.9022, + "step": 28815 + }, + { + "epoch": 0.6941065966619301, + "grad_norm": 1.3722862005233765, + "learning_rate": 3.2202616367508365e-07, + "loss": 0.866, + "step": 28820 + }, + { + "epoch": 0.6942270176537174, + "grad_norm": 1.6332498788833618, + "learning_rate": 3.2189940168339925e-07, + "loss": 0.9745, + "step": 28825 + }, + { + "epoch": 0.6943474386455046, + "grad_norm": 1.4901752471923828, + "learning_rate": 3.217726396917148e-07, + "loss": 0.9024, + "step": 28830 + }, + { + "epoch": 0.694467859637292, + "grad_norm": 1.6924899816513062, + "learning_rate": 3.2164587770003043e-07, + "loss": 0.8887, + "step": 28835 + }, + { + "epoch": 0.6945882806290793, + "grad_norm": 1.6854546070098877, + "learning_rate": 3.21519115708346e-07, + "loss": 0.919, + "step": 28840 + }, + { + "epoch": 0.6947087016208665, + "grad_norm": 1.4148783683776855, + "learning_rate": 3.2139235371666156e-07, + "loss": 0.9169, + "step": 28845 + }, + { + "epoch": 0.6948291226126538, + "grad_norm": 1.6648613214492798, + "learning_rate": 3.212655917249772e-07, + "loss": 0.8992, + "step": 28850 + }, + { + "epoch": 0.6949495436044412, + "grad_norm": 1.4850813150405884, + "learning_rate": 3.2113882973329274e-07, + "loss": 0.908, + "step": 28855 + }, + { + "epoch": 0.6950699645962284, + "grad_norm": 1.6685442924499512, + "learning_rate": 3.2101206774160834e-07, + "loss": 0.8785, + "step": 28860 + }, + { + "epoch": 0.6951903855880157, + "grad_norm": 1.6255873441696167, + "learning_rate": 3.2088530574992393e-07, + "loss": 0.9498, + "step": 28865 + }, + { + "epoch": 0.695310806579803, + "grad_norm": 1.4400304555892944, + "learning_rate": 3.207585437582395e-07, + "loss": 0.9274, + "step": 28870 + }, + { + "epoch": 0.6954312275715903, + "grad_norm": 1.3725250959396362, + "learning_rate": 3.206317817665551e-07, + "loss": 0.9407, + "step": 28875 + }, + { + "epoch": 0.6955516485633776, + "grad_norm": 1.577254295349121, + "learning_rate": 3.205050197748707e-07, + "loss": 0.882, + "step": 28880 + }, + { + "epoch": 0.6956720695551648, + "grad_norm": 1.5213263034820557, + "learning_rate": 3.203782577831863e-07, + "loss": 0.8686, + "step": 28885 + }, + { + "epoch": 0.6957924905469521, + "grad_norm": 1.3886874914169312, + "learning_rate": 3.2025149579150183e-07, + "loss": 0.8521, + "step": 28890 + }, + { + "epoch": 0.6959129115387395, + "grad_norm": 1.5748800039291382, + "learning_rate": 3.201247337998175e-07, + "loss": 0.8903, + "step": 28895 + }, + { + "epoch": 0.6960333325305267, + "grad_norm": 1.5380878448486328, + "learning_rate": 3.19997971808133e-07, + "loss": 0.938, + "step": 28900 + }, + { + "epoch": 0.696153753522314, + "grad_norm": 1.555604100227356, + "learning_rate": 3.198712098164486e-07, + "loss": 0.8907, + "step": 28905 + }, + { + "epoch": 0.6962741745141013, + "grad_norm": 1.5657362937927246, + "learning_rate": 3.1974444782476425e-07, + "loss": 0.8877, + "step": 28910 + }, + { + "epoch": 0.6963945955058886, + "grad_norm": 1.5190544128417969, + "learning_rate": 3.196176858330798e-07, + "loss": 0.887, + "step": 28915 + }, + { + "epoch": 0.6965150164976759, + "grad_norm": 1.5380806922912598, + "learning_rate": 3.194909238413954e-07, + "loss": 0.8979, + "step": 28920 + }, + { + "epoch": 0.6966354374894632, + "grad_norm": 1.5136362314224243, + "learning_rate": 3.19364161849711e-07, + "loss": 0.9221, + "step": 28925 + }, + { + "epoch": 0.6967558584812504, + "grad_norm": 1.7348721027374268, + "learning_rate": 3.1923739985802657e-07, + "loss": 0.862, + "step": 28930 + }, + { + "epoch": 0.6968762794730378, + "grad_norm": 1.4176759719848633, + "learning_rate": 3.191106378663421e-07, + "loss": 0.8833, + "step": 28935 + }, + { + "epoch": 0.696996700464825, + "grad_norm": 1.5287283658981323, + "learning_rate": 3.1898387587465775e-07, + "loss": 0.9482, + "step": 28940 + }, + { + "epoch": 0.6971171214566123, + "grad_norm": 1.7669847011566162, + "learning_rate": 3.188571138829733e-07, + "loss": 0.8706, + "step": 28945 + }, + { + "epoch": 0.6972375424483996, + "grad_norm": 1.6608628034591675, + "learning_rate": 3.187303518912889e-07, + "loss": 0.8909, + "step": 28950 + }, + { + "epoch": 0.6973579634401869, + "grad_norm": 1.3679029941558838, + "learning_rate": 3.1860358989960453e-07, + "loss": 0.8428, + "step": 28955 + }, + { + "epoch": 0.6974783844319742, + "grad_norm": 1.4520623683929443, + "learning_rate": 3.1847682790792007e-07, + "loss": 0.913, + "step": 28960 + }, + { + "epoch": 0.6975988054237615, + "grad_norm": 1.913512945175171, + "learning_rate": 3.1835006591623566e-07, + "loss": 0.8596, + "step": 28965 + }, + { + "epoch": 0.6977192264155487, + "grad_norm": 1.517116665840149, + "learning_rate": 3.1822330392455125e-07, + "loss": 0.9363, + "step": 28970 + }, + { + "epoch": 0.6978396474073361, + "grad_norm": 2.0575098991394043, + "learning_rate": 3.1809654193286684e-07, + "loss": 0.8777, + "step": 28975 + }, + { + "epoch": 0.6979600683991234, + "grad_norm": 1.5659877061843872, + "learning_rate": 3.179697799411824e-07, + "loss": 0.8989, + "step": 28980 + }, + { + "epoch": 0.6980804893909106, + "grad_norm": 1.4551013708114624, + "learning_rate": 3.17843017949498e-07, + "loss": 0.8814, + "step": 28985 + }, + { + "epoch": 0.6982009103826979, + "grad_norm": 1.452511191368103, + "learning_rate": 3.177162559578136e-07, + "loss": 0.9256, + "step": 28990 + }, + { + "epoch": 0.6983213313744852, + "grad_norm": 1.4275633096694946, + "learning_rate": 3.1758949396612916e-07, + "loss": 0.9033, + "step": 28995 + }, + { + "epoch": 0.6984417523662725, + "grad_norm": 1.5840181112289429, + "learning_rate": 3.174627319744448e-07, + "loss": 0.8965, + "step": 29000 + }, + { + "epoch": 0.6985621733580598, + "grad_norm": 1.48139226436615, + "learning_rate": 3.1733596998276034e-07, + "loss": 0.9428, + "step": 29005 + }, + { + "epoch": 0.698682594349847, + "grad_norm": 1.5555086135864258, + "learning_rate": 3.1720920799107593e-07, + "loss": 0.8771, + "step": 29010 + }, + { + "epoch": 0.6988030153416344, + "grad_norm": 1.6042619943618774, + "learning_rate": 3.170824459993915e-07, + "loss": 0.8988, + "step": 29015 + }, + { + "epoch": 0.6989234363334217, + "grad_norm": 1.6112546920776367, + "learning_rate": 3.169556840077071e-07, + "loss": 0.9054, + "step": 29020 + }, + { + "epoch": 0.6990438573252089, + "grad_norm": 1.7235147953033447, + "learning_rate": 3.168289220160227e-07, + "loss": 0.9407, + "step": 29025 + }, + { + "epoch": 0.6991642783169962, + "grad_norm": 1.510565161705017, + "learning_rate": 3.167021600243383e-07, + "loss": 0.8459, + "step": 29030 + }, + { + "epoch": 0.6992846993087835, + "grad_norm": 1.5521827936172485, + "learning_rate": 3.165753980326539e-07, + "loss": 0.9397, + "step": 29035 + }, + { + "epoch": 0.6994051203005708, + "grad_norm": 1.5342634916305542, + "learning_rate": 3.1644863604096943e-07, + "loss": 0.9075, + "step": 29040 + }, + { + "epoch": 0.6995255412923581, + "grad_norm": 1.5240405797958374, + "learning_rate": 3.1632187404928507e-07, + "loss": 0.9667, + "step": 29045 + }, + { + "epoch": 0.6996459622841453, + "grad_norm": 1.4187840223312378, + "learning_rate": 3.161951120576006e-07, + "loss": 0.8983, + "step": 29050 + }, + { + "epoch": 0.6997663832759327, + "grad_norm": 1.578110933303833, + "learning_rate": 3.160683500659162e-07, + "loss": 0.8728, + "step": 29055 + }, + { + "epoch": 0.69988680426772, + "grad_norm": 1.782132625579834, + "learning_rate": 3.1594158807423185e-07, + "loss": 0.9311, + "step": 29060 + }, + { + "epoch": 0.7000072252595072, + "grad_norm": 1.8534215688705444, + "learning_rate": 3.158148260825474e-07, + "loss": 0.8793, + "step": 29065 + }, + { + "epoch": 0.7001276462512945, + "grad_norm": 1.4021070003509521, + "learning_rate": 3.15688064090863e-07, + "loss": 0.889, + "step": 29070 + }, + { + "epoch": 0.7002480672430819, + "grad_norm": 1.5727077722549438, + "learning_rate": 3.1556130209917857e-07, + "loss": 0.9066, + "step": 29075 + }, + { + "epoch": 0.7003684882348691, + "grad_norm": 1.6189826726913452, + "learning_rate": 3.1543454010749416e-07, + "loss": 0.9, + "step": 29080 + }, + { + "epoch": 0.7004889092266564, + "grad_norm": 1.587286353111267, + "learning_rate": 3.153077781158097e-07, + "loss": 0.8862, + "step": 29085 + }, + { + "epoch": 0.7006093302184436, + "grad_norm": 1.7006717920303345, + "learning_rate": 3.1518101612412535e-07, + "loss": 0.9625, + "step": 29090 + }, + { + "epoch": 0.700729751210231, + "grad_norm": 1.6427197456359863, + "learning_rate": 3.150542541324409e-07, + "loss": 0.8765, + "step": 29095 + }, + { + "epoch": 0.7008501722020183, + "grad_norm": 1.4622749090194702, + "learning_rate": 3.149274921407565e-07, + "loss": 0.8812, + "step": 29100 + }, + { + "epoch": 0.7009705931938055, + "grad_norm": 1.509885549545288, + "learning_rate": 3.148007301490721e-07, + "loss": 0.8931, + "step": 29105 + }, + { + "epoch": 0.7010910141855928, + "grad_norm": 1.533454418182373, + "learning_rate": 3.1467396815738766e-07, + "loss": 0.9289, + "step": 29110 + }, + { + "epoch": 0.7012114351773802, + "grad_norm": 1.5319466590881348, + "learning_rate": 3.1454720616570325e-07, + "loss": 0.929, + "step": 29115 + }, + { + "epoch": 0.7013318561691674, + "grad_norm": 1.4898655414581299, + "learning_rate": 3.1442044417401884e-07, + "loss": 0.8779, + "step": 29120 + }, + { + "epoch": 0.7014522771609547, + "grad_norm": 1.5144740343093872, + "learning_rate": 3.1429368218233444e-07, + "loss": 0.8956, + "step": 29125 + }, + { + "epoch": 0.701572698152742, + "grad_norm": 1.7286611795425415, + "learning_rate": 3.1416692019065e-07, + "loss": 0.9521, + "step": 29130 + }, + { + "epoch": 0.7016931191445293, + "grad_norm": 2.0020668506622314, + "learning_rate": 3.140401581989656e-07, + "loss": 0.9005, + "step": 29135 + }, + { + "epoch": 0.7018135401363166, + "grad_norm": 1.621082067489624, + "learning_rate": 3.139133962072812e-07, + "loss": 0.9036, + "step": 29140 + }, + { + "epoch": 0.7019339611281038, + "grad_norm": 1.5306613445281982, + "learning_rate": 3.1378663421559675e-07, + "loss": 0.9365, + "step": 29145 + }, + { + "epoch": 0.7020543821198911, + "grad_norm": 1.7362675666809082, + "learning_rate": 3.136598722239124e-07, + "loss": 0.8838, + "step": 29150 + }, + { + "epoch": 0.7021748031116785, + "grad_norm": 1.5057249069213867, + "learning_rate": 3.1353311023222793e-07, + "loss": 0.8765, + "step": 29155 + }, + { + "epoch": 0.7022952241034657, + "grad_norm": 1.4992295503616333, + "learning_rate": 3.134063482405435e-07, + "loss": 0.9159, + "step": 29160 + }, + { + "epoch": 0.702415645095253, + "grad_norm": 1.5106385946273804, + "learning_rate": 3.132795862488591e-07, + "loss": 0.8865, + "step": 29165 + }, + { + "epoch": 0.7025360660870403, + "grad_norm": 1.6429600715637207, + "learning_rate": 3.131528242571747e-07, + "loss": 0.982, + "step": 29170 + }, + { + "epoch": 0.7026564870788276, + "grad_norm": 1.50474214553833, + "learning_rate": 3.130260622654903e-07, + "loss": 0.823, + "step": 29175 + }, + { + "epoch": 0.7027769080706149, + "grad_norm": 1.6841436624526978, + "learning_rate": 3.128993002738059e-07, + "loss": 0.8769, + "step": 29180 + }, + { + "epoch": 0.7028973290624022, + "grad_norm": 1.8708577156066895, + "learning_rate": 3.127725382821215e-07, + "loss": 0.943, + "step": 29185 + }, + { + "epoch": 0.7030177500541894, + "grad_norm": 1.5782015323638916, + "learning_rate": 3.12645776290437e-07, + "loss": 0.91, + "step": 29190 + }, + { + "epoch": 0.7031381710459768, + "grad_norm": 1.6359310150146484, + "learning_rate": 3.1251901429875267e-07, + "loss": 0.9172, + "step": 29195 + }, + { + "epoch": 0.703258592037764, + "grad_norm": 1.3926293849945068, + "learning_rate": 3.123922523070682e-07, + "loss": 0.9453, + "step": 29200 + }, + { + "epoch": 0.7033790130295513, + "grad_norm": 1.5979011058807373, + "learning_rate": 3.122654903153838e-07, + "loss": 0.9411, + "step": 29205 + }, + { + "epoch": 0.7034994340213386, + "grad_norm": 1.612737774848938, + "learning_rate": 3.1213872832369944e-07, + "loss": 0.8778, + "step": 29210 + }, + { + "epoch": 0.7036198550131259, + "grad_norm": 1.5918177366256714, + "learning_rate": 3.12011966332015e-07, + "loss": 0.9218, + "step": 29215 + }, + { + "epoch": 0.7037402760049132, + "grad_norm": 1.55571448802948, + "learning_rate": 3.118852043403306e-07, + "loss": 0.9203, + "step": 29220 + }, + { + "epoch": 0.7038606969967005, + "grad_norm": 1.6012756824493408, + "learning_rate": 3.1175844234864617e-07, + "loss": 0.9508, + "step": 29225 + }, + { + "epoch": 0.7039811179884877, + "grad_norm": 1.6371347904205322, + "learning_rate": 3.1163168035696176e-07, + "loss": 0.8785, + "step": 29230 + }, + { + "epoch": 0.704101538980275, + "grad_norm": 1.51962411403656, + "learning_rate": 3.115049183652773e-07, + "loss": 0.8845, + "step": 29235 + }, + { + "epoch": 0.7042219599720624, + "grad_norm": 1.8233025074005127, + "learning_rate": 3.1137815637359294e-07, + "loss": 0.9195, + "step": 29240 + }, + { + "epoch": 0.7043423809638496, + "grad_norm": 1.4151511192321777, + "learning_rate": 3.112513943819085e-07, + "loss": 0.8823, + "step": 29245 + }, + { + "epoch": 0.7044628019556369, + "grad_norm": 1.5707738399505615, + "learning_rate": 3.111246323902241e-07, + "loss": 0.918, + "step": 29250 + }, + { + "epoch": 0.7045832229474241, + "grad_norm": 1.680002212524414, + "learning_rate": 3.109978703985397e-07, + "loss": 0.8727, + "step": 29255 + }, + { + "epoch": 0.7047036439392115, + "grad_norm": 1.4788126945495605, + "learning_rate": 3.1087110840685526e-07, + "loss": 0.8589, + "step": 29260 + }, + { + "epoch": 0.7048240649309988, + "grad_norm": 1.565387487411499, + "learning_rate": 3.107443464151709e-07, + "loss": 0.835, + "step": 29265 + }, + { + "epoch": 0.704944485922786, + "grad_norm": 1.8114027976989746, + "learning_rate": 3.1061758442348644e-07, + "loss": 0.8903, + "step": 29270 + }, + { + "epoch": 0.7050649069145734, + "grad_norm": 1.5533870458602905, + "learning_rate": 3.1049082243180203e-07, + "loss": 0.9147, + "step": 29275 + }, + { + "epoch": 0.7051853279063607, + "grad_norm": 1.6295989751815796, + "learning_rate": 3.103640604401176e-07, + "loss": 1.0114, + "step": 29280 + }, + { + "epoch": 0.7053057488981479, + "grad_norm": 1.5241661071777344, + "learning_rate": 3.102372984484332e-07, + "loss": 0.8899, + "step": 29285 + }, + { + "epoch": 0.7054261698899352, + "grad_norm": 1.4702953100204468, + "learning_rate": 3.101105364567488e-07, + "loss": 0.9182, + "step": 29290 + }, + { + "epoch": 0.7055465908817226, + "grad_norm": 1.4871397018432617, + "learning_rate": 3.099837744650644e-07, + "loss": 0.8649, + "step": 29295 + }, + { + "epoch": 0.7056670118735098, + "grad_norm": 1.5919462442398071, + "learning_rate": 3.0985701247338e-07, + "loss": 0.9245, + "step": 29300 + }, + { + "epoch": 0.7057874328652971, + "grad_norm": 1.4001860618591309, + "learning_rate": 3.0973025048169553e-07, + "loss": 0.9326, + "step": 29305 + }, + { + "epoch": 0.7059078538570843, + "grad_norm": 1.7274092435836792, + "learning_rate": 3.096034884900112e-07, + "loss": 0.8802, + "step": 29310 + }, + { + "epoch": 0.7060282748488717, + "grad_norm": 1.5036511421203613, + "learning_rate": 3.094767264983267e-07, + "loss": 0.929, + "step": 29315 + }, + { + "epoch": 0.706148695840659, + "grad_norm": 1.533713459968567, + "learning_rate": 3.093499645066423e-07, + "loss": 0.8605, + "step": 29320 + }, + { + "epoch": 0.7062691168324462, + "grad_norm": 1.4081368446350098, + "learning_rate": 3.0922320251495795e-07, + "loss": 0.9085, + "step": 29325 + }, + { + "epoch": 0.7063895378242335, + "grad_norm": 1.609192967414856, + "learning_rate": 3.090964405232735e-07, + "loss": 0.9238, + "step": 29330 + }, + { + "epoch": 0.7065099588160209, + "grad_norm": 1.4841941595077515, + "learning_rate": 3.089696785315891e-07, + "loss": 0.9246, + "step": 29335 + }, + { + "epoch": 0.7066303798078081, + "grad_norm": 1.4160584211349487, + "learning_rate": 3.0884291653990467e-07, + "loss": 0.8796, + "step": 29340 + }, + { + "epoch": 0.7067508007995954, + "grad_norm": 1.6532182693481445, + "learning_rate": 3.0871615454822026e-07, + "loss": 0.9149, + "step": 29345 + }, + { + "epoch": 0.7068712217913826, + "grad_norm": 1.6987322568893433, + "learning_rate": 3.085893925565358e-07, + "loss": 0.9261, + "step": 29350 + }, + { + "epoch": 0.70699164278317, + "grad_norm": 1.4547004699707031, + "learning_rate": 3.0846263056485145e-07, + "loss": 0.9175, + "step": 29355 + }, + { + "epoch": 0.7071120637749573, + "grad_norm": 1.4887008666992188, + "learning_rate": 3.0833586857316704e-07, + "loss": 0.877, + "step": 29360 + }, + { + "epoch": 0.7072324847667445, + "grad_norm": 1.42011296749115, + "learning_rate": 3.082091065814826e-07, + "loss": 0.946, + "step": 29365 + }, + { + "epoch": 0.7073529057585318, + "grad_norm": 1.5739734172821045, + "learning_rate": 3.080823445897982e-07, + "loss": 0.911, + "step": 29370 + }, + { + "epoch": 0.7074733267503192, + "grad_norm": 1.4277094602584839, + "learning_rate": 3.0795558259811376e-07, + "loss": 0.8827, + "step": 29375 + }, + { + "epoch": 0.7075937477421064, + "grad_norm": 1.556277871131897, + "learning_rate": 3.0782882060642935e-07, + "loss": 0.8724, + "step": 29380 + }, + { + "epoch": 0.7077141687338937, + "grad_norm": 1.618062138557434, + "learning_rate": 3.0770205861474494e-07, + "loss": 0.8724, + "step": 29385 + }, + { + "epoch": 0.707834589725681, + "grad_norm": 1.4980077743530273, + "learning_rate": 3.0757529662306054e-07, + "loss": 0.9093, + "step": 29390 + }, + { + "epoch": 0.7079550107174682, + "grad_norm": 1.7104482650756836, + "learning_rate": 3.074485346313761e-07, + "loss": 0.9211, + "step": 29395 + }, + { + "epoch": 0.7080754317092556, + "grad_norm": 1.6199488639831543, + "learning_rate": 3.073217726396917e-07, + "loss": 0.9073, + "step": 29400 + }, + { + "epoch": 0.7081958527010428, + "grad_norm": 1.8200767040252686, + "learning_rate": 3.071950106480073e-07, + "loss": 0.9386, + "step": 29405 + }, + { + "epoch": 0.7083162736928301, + "grad_norm": 1.5339254140853882, + "learning_rate": 3.0706824865632285e-07, + "loss": 0.8746, + "step": 29410 + }, + { + "epoch": 0.7084366946846175, + "grad_norm": 1.588697910308838, + "learning_rate": 3.069414866646385e-07, + "loss": 0.9552, + "step": 29415 + }, + { + "epoch": 0.7085571156764047, + "grad_norm": 1.454702377319336, + "learning_rate": 3.0681472467295403e-07, + "loss": 0.8876, + "step": 29420 + }, + { + "epoch": 0.708677536668192, + "grad_norm": 1.5141271352767944, + "learning_rate": 3.066879626812696e-07, + "loss": 0.8862, + "step": 29425 + }, + { + "epoch": 0.7087979576599793, + "grad_norm": 1.958306074142456, + "learning_rate": 3.065612006895852e-07, + "loss": 0.9148, + "step": 29430 + }, + { + "epoch": 0.7089183786517665, + "grad_norm": 1.6023107767105103, + "learning_rate": 3.064344386979008e-07, + "loss": 0.8941, + "step": 29435 + }, + { + "epoch": 0.7090387996435539, + "grad_norm": 1.651862382888794, + "learning_rate": 3.063076767062164e-07, + "loss": 0.929, + "step": 29440 + }, + { + "epoch": 0.7091592206353412, + "grad_norm": 1.775821328163147, + "learning_rate": 3.06180914714532e-07, + "loss": 0.9159, + "step": 29445 + }, + { + "epoch": 0.7092796416271284, + "grad_norm": 1.7366076707839966, + "learning_rate": 3.060541527228476e-07, + "loss": 0.9161, + "step": 29450 + }, + { + "epoch": 0.7094000626189157, + "grad_norm": 1.4495103359222412, + "learning_rate": 3.059273907311631e-07, + "loss": 0.9226, + "step": 29455 + }, + { + "epoch": 0.709520483610703, + "grad_norm": 1.4665801525115967, + "learning_rate": 3.0580062873947877e-07, + "loss": 0.8856, + "step": 29460 + }, + { + "epoch": 0.7096409046024903, + "grad_norm": 1.6649484634399414, + "learning_rate": 3.056738667477943e-07, + "loss": 0.8967, + "step": 29465 + }, + { + "epoch": 0.7097613255942776, + "grad_norm": 1.6469159126281738, + "learning_rate": 3.055471047561099e-07, + "loss": 0.8974, + "step": 29470 + }, + { + "epoch": 0.7098817465860648, + "grad_norm": 1.42695152759552, + "learning_rate": 3.0542034276442554e-07, + "loss": 0.9147, + "step": 29475 + }, + { + "epoch": 0.7100021675778522, + "grad_norm": 1.5906567573547363, + "learning_rate": 3.052935807727411e-07, + "loss": 0.8881, + "step": 29480 + }, + { + "epoch": 0.7101225885696395, + "grad_norm": 1.5483026504516602, + "learning_rate": 3.051668187810567e-07, + "loss": 0.934, + "step": 29485 + }, + { + "epoch": 0.7102430095614267, + "grad_norm": 1.3973199129104614, + "learning_rate": 3.0504005678937227e-07, + "loss": 0.8659, + "step": 29490 + }, + { + "epoch": 0.710363430553214, + "grad_norm": 1.7336456775665283, + "learning_rate": 3.0491329479768786e-07, + "loss": 0.9344, + "step": 29495 + }, + { + "epoch": 0.7104838515450014, + "grad_norm": 1.6791404485702515, + "learning_rate": 3.047865328060034e-07, + "loss": 0.905, + "step": 29500 + }, + { + "epoch": 0.7106042725367886, + "grad_norm": 1.6338578462600708, + "learning_rate": 3.0465977081431904e-07, + "loss": 0.8957, + "step": 29505 + }, + { + "epoch": 0.7107246935285759, + "grad_norm": 1.517227292060852, + "learning_rate": 3.0453300882263463e-07, + "loss": 0.8981, + "step": 29510 + }, + { + "epoch": 0.7108451145203631, + "grad_norm": 1.650689959526062, + "learning_rate": 3.0440624683095017e-07, + "loss": 0.9155, + "step": 29515 + }, + { + "epoch": 0.7109655355121505, + "grad_norm": 1.6722382307052612, + "learning_rate": 3.042794848392658e-07, + "loss": 0.8428, + "step": 29520 + }, + { + "epoch": 0.7110859565039378, + "grad_norm": 1.6126636266708374, + "learning_rate": 3.0415272284758136e-07, + "loss": 0.9016, + "step": 29525 + }, + { + "epoch": 0.711206377495725, + "grad_norm": 1.598414659500122, + "learning_rate": 3.0402596085589695e-07, + "loss": 0.9092, + "step": 29530 + }, + { + "epoch": 0.7113267984875123, + "grad_norm": 1.6130987405776978, + "learning_rate": 3.0389919886421254e-07, + "loss": 0.9736, + "step": 29535 + }, + { + "epoch": 0.7114472194792997, + "grad_norm": 1.5357635021209717, + "learning_rate": 3.0377243687252813e-07, + "loss": 0.9343, + "step": 29540 + }, + { + "epoch": 0.7115676404710869, + "grad_norm": 1.6397613286972046, + "learning_rate": 3.0364567488084367e-07, + "loss": 0.9212, + "step": 29545 + }, + { + "epoch": 0.7116880614628742, + "grad_norm": 1.4497694969177246, + "learning_rate": 3.035189128891593e-07, + "loss": 0.9303, + "step": 29550 + }, + { + "epoch": 0.7118084824546616, + "grad_norm": 1.4531131982803345, + "learning_rate": 3.033921508974749e-07, + "loss": 0.9327, + "step": 29555 + }, + { + "epoch": 0.7119289034464488, + "grad_norm": 1.509235143661499, + "learning_rate": 3.0326538890579045e-07, + "loss": 0.9276, + "step": 29560 + }, + { + "epoch": 0.7120493244382361, + "grad_norm": 1.6412030458450317, + "learning_rate": 3.031386269141061e-07, + "loss": 0.9299, + "step": 29565 + }, + { + "epoch": 0.7121697454300233, + "grad_norm": 1.4233083724975586, + "learning_rate": 3.0301186492242163e-07, + "loss": 0.8982, + "step": 29570 + }, + { + "epoch": 0.7122901664218106, + "grad_norm": 1.3822218179702759, + "learning_rate": 3.028851029307372e-07, + "loss": 0.8684, + "step": 29575 + }, + { + "epoch": 0.712410587413598, + "grad_norm": 1.3487155437469482, + "learning_rate": 3.027583409390528e-07, + "loss": 0.8833, + "step": 29580 + }, + { + "epoch": 0.7125310084053852, + "grad_norm": 1.3766354322433472, + "learning_rate": 3.026315789473684e-07, + "loss": 0.7947, + "step": 29585 + }, + { + "epoch": 0.7126514293971725, + "grad_norm": 1.5623881816864014, + "learning_rate": 3.02504816955684e-07, + "loss": 0.8732, + "step": 29590 + }, + { + "epoch": 0.7127718503889598, + "grad_norm": 1.7563700675964355, + "learning_rate": 3.023780549639996e-07, + "loss": 0.9263, + "step": 29595 + }, + { + "epoch": 0.7128922713807471, + "grad_norm": 1.507169485092163, + "learning_rate": 3.022512929723152e-07, + "loss": 0.9059, + "step": 29600 + }, + { + "epoch": 0.7130126923725344, + "grad_norm": 1.4992848634719849, + "learning_rate": 3.021245309806307e-07, + "loss": 0.8569, + "step": 29605 + }, + { + "epoch": 0.7131331133643216, + "grad_norm": 1.4725024700164795, + "learning_rate": 3.0199776898894636e-07, + "loss": 0.9172, + "step": 29610 + }, + { + "epoch": 0.7132535343561089, + "grad_norm": 1.655243158340454, + "learning_rate": 3.018710069972619e-07, + "loss": 0.9222, + "step": 29615 + }, + { + "epoch": 0.7133739553478963, + "grad_norm": 1.535508394241333, + "learning_rate": 3.017442450055775e-07, + "loss": 0.9346, + "step": 29620 + }, + { + "epoch": 0.7134943763396835, + "grad_norm": 1.552764892578125, + "learning_rate": 3.0161748301389314e-07, + "loss": 0.9204, + "step": 29625 + }, + { + "epoch": 0.7136147973314708, + "grad_norm": 1.6510361433029175, + "learning_rate": 3.014907210222087e-07, + "loss": 0.9179, + "step": 29630 + }, + { + "epoch": 0.7137352183232581, + "grad_norm": 1.5170223712921143, + "learning_rate": 3.0136395903052427e-07, + "loss": 0.8754, + "step": 29635 + }, + { + "epoch": 0.7138556393150454, + "grad_norm": 1.6754573583602905, + "learning_rate": 3.0123719703883986e-07, + "loss": 0.9019, + "step": 29640 + }, + { + "epoch": 0.7139760603068327, + "grad_norm": 1.57989501953125, + "learning_rate": 3.0111043504715545e-07, + "loss": 0.9705, + "step": 29645 + }, + { + "epoch": 0.71409648129862, + "grad_norm": 1.874380111694336, + "learning_rate": 3.00983673055471e-07, + "loss": 0.826, + "step": 29650 + }, + { + "epoch": 0.7142169022904072, + "grad_norm": 1.5729446411132812, + "learning_rate": 3.0085691106378664e-07, + "loss": 0.8823, + "step": 29655 + }, + { + "epoch": 0.7143373232821946, + "grad_norm": 1.636989712715149, + "learning_rate": 3.007301490721022e-07, + "loss": 0.8943, + "step": 29660 + }, + { + "epoch": 0.7144577442739818, + "grad_norm": 1.538765549659729, + "learning_rate": 3.0060338708041777e-07, + "loss": 0.9151, + "step": 29665 + }, + { + "epoch": 0.7145781652657691, + "grad_norm": 1.378821611404419, + "learning_rate": 3.004766250887334e-07, + "loss": 0.8793, + "step": 29670 + }, + { + "epoch": 0.7146985862575564, + "grad_norm": 1.720047950744629, + "learning_rate": 3.0034986309704895e-07, + "loss": 0.8541, + "step": 29675 + }, + { + "epoch": 0.7148190072493437, + "grad_norm": 1.70628821849823, + "learning_rate": 3.0022310110536454e-07, + "loss": 0.8768, + "step": 29680 + }, + { + "epoch": 0.714939428241131, + "grad_norm": 1.598002314567566, + "learning_rate": 3.0009633911368013e-07, + "loss": 0.8626, + "step": 29685 + }, + { + "epoch": 0.7150598492329183, + "grad_norm": 1.4448331594467163, + "learning_rate": 2.999695771219957e-07, + "loss": 0.8836, + "step": 29690 + }, + { + "epoch": 0.7151802702247055, + "grad_norm": 1.5112354755401611, + "learning_rate": 2.9984281513031127e-07, + "loss": 0.9152, + "step": 29695 + }, + { + "epoch": 0.7153006912164929, + "grad_norm": 1.4921990633010864, + "learning_rate": 2.997160531386269e-07, + "loss": 0.8707, + "step": 29700 + }, + { + "epoch": 0.7154211122082802, + "grad_norm": 1.6580222845077515, + "learning_rate": 2.995892911469425e-07, + "loss": 0.9313, + "step": 29705 + }, + { + "epoch": 0.7155415332000674, + "grad_norm": 1.7308915853500366, + "learning_rate": 2.9946252915525804e-07, + "loss": 0.925, + "step": 29710 + }, + { + "epoch": 0.7156619541918547, + "grad_norm": 1.6118513345718384, + "learning_rate": 2.993357671635737e-07, + "loss": 0.8931, + "step": 29715 + }, + { + "epoch": 0.715782375183642, + "grad_norm": 1.7087616920471191, + "learning_rate": 2.992090051718892e-07, + "loss": 0.8905, + "step": 29720 + }, + { + "epoch": 0.7159027961754293, + "grad_norm": 1.477902889251709, + "learning_rate": 2.990822431802048e-07, + "loss": 0.8966, + "step": 29725 + }, + { + "epoch": 0.7160232171672166, + "grad_norm": 1.3396321535110474, + "learning_rate": 2.989554811885204e-07, + "loss": 0.9194, + "step": 29730 + }, + { + "epoch": 0.7161436381590038, + "grad_norm": 1.9310853481292725, + "learning_rate": 2.98828719196836e-07, + "loss": 0.877, + "step": 29735 + }, + { + "epoch": 0.7162640591507912, + "grad_norm": 1.5338441133499146, + "learning_rate": 2.987019572051516e-07, + "loss": 0.9028, + "step": 29740 + }, + { + "epoch": 0.7163844801425785, + "grad_norm": 1.3704942464828491, + "learning_rate": 2.985751952134672e-07, + "loss": 0.9116, + "step": 29745 + }, + { + "epoch": 0.7165049011343657, + "grad_norm": 1.5461375713348389, + "learning_rate": 2.984484332217828e-07, + "loss": 0.9145, + "step": 29750 + }, + { + "epoch": 0.716625322126153, + "grad_norm": 1.5435659885406494, + "learning_rate": 2.983216712300983e-07, + "loss": 0.9219, + "step": 29755 + }, + { + "epoch": 0.7167457431179404, + "grad_norm": 1.4328842163085938, + "learning_rate": 2.9819490923841396e-07, + "loss": 0.8917, + "step": 29760 + }, + { + "epoch": 0.7168661641097276, + "grad_norm": 1.4749674797058105, + "learning_rate": 2.980681472467295e-07, + "loss": 0.8919, + "step": 29765 + }, + { + "epoch": 0.7169865851015149, + "grad_norm": 2.0215656757354736, + "learning_rate": 2.9794138525504514e-07, + "loss": 0.8829, + "step": 29770 + }, + { + "epoch": 0.7171070060933021, + "grad_norm": 1.995144248008728, + "learning_rate": 2.9781462326336073e-07, + "loss": 0.9261, + "step": 29775 + }, + { + "epoch": 0.7172274270850895, + "grad_norm": 1.4340596199035645, + "learning_rate": 2.9768786127167627e-07, + "loss": 0.892, + "step": 29780 + }, + { + "epoch": 0.7173478480768768, + "grad_norm": 1.4539889097213745, + "learning_rate": 2.975610992799919e-07, + "loss": 0.8956, + "step": 29785 + }, + { + "epoch": 0.717468269068664, + "grad_norm": 1.598737120628357, + "learning_rate": 2.9743433728830746e-07, + "loss": 0.895, + "step": 29790 + }, + { + "epoch": 0.7175886900604513, + "grad_norm": 1.61039137840271, + "learning_rate": 2.9730757529662305e-07, + "loss": 0.971, + "step": 29795 + }, + { + "epoch": 0.7177091110522387, + "grad_norm": 1.8494083881378174, + "learning_rate": 2.9718081330493864e-07, + "loss": 0.8593, + "step": 29800 + }, + { + "epoch": 0.7178295320440259, + "grad_norm": 1.4721626043319702, + "learning_rate": 2.9705405131325423e-07, + "loss": 0.8918, + "step": 29805 + }, + { + "epoch": 0.7179499530358132, + "grad_norm": 2.004418134689331, + "learning_rate": 2.9692728932156977e-07, + "loss": 0.9148, + "step": 29810 + }, + { + "epoch": 0.7180703740276005, + "grad_norm": 1.5343745946884155, + "learning_rate": 2.968005273298854e-07, + "loss": 0.8677, + "step": 29815 + }, + { + "epoch": 0.7181907950193878, + "grad_norm": 1.5843580961227417, + "learning_rate": 2.96673765338201e-07, + "loss": 0.8925, + "step": 29820 + }, + { + "epoch": 0.7183112160111751, + "grad_norm": 1.5251814126968384, + "learning_rate": 2.9654700334651655e-07, + "loss": 0.8998, + "step": 29825 + }, + { + "epoch": 0.7184316370029623, + "grad_norm": 1.525067687034607, + "learning_rate": 2.964202413548322e-07, + "loss": 0.8747, + "step": 29830 + }, + { + "epoch": 0.7185520579947496, + "grad_norm": 1.5406259298324585, + "learning_rate": 2.9629347936314773e-07, + "loss": 0.9172, + "step": 29835 + }, + { + "epoch": 0.718672478986537, + "grad_norm": 1.8802706003189087, + "learning_rate": 2.961667173714633e-07, + "loss": 0.9207, + "step": 29840 + }, + { + "epoch": 0.7187928999783242, + "grad_norm": 1.6404411792755127, + "learning_rate": 2.960399553797789e-07, + "loss": 0.9423, + "step": 29845 + }, + { + "epoch": 0.7189133209701115, + "grad_norm": 1.5202003717422485, + "learning_rate": 2.959131933880945e-07, + "loss": 0.8866, + "step": 29850 + }, + { + "epoch": 0.7190337419618988, + "grad_norm": 1.4623454809188843, + "learning_rate": 2.957864313964101e-07, + "loss": 0.8896, + "step": 29855 + }, + { + "epoch": 0.7191541629536861, + "grad_norm": 1.5731388330459595, + "learning_rate": 2.956596694047257e-07, + "loss": 0.8521, + "step": 29860 + }, + { + "epoch": 0.7192745839454734, + "grad_norm": 1.6413127183914185, + "learning_rate": 2.955329074130413e-07, + "loss": 0.9062, + "step": 29865 + }, + { + "epoch": 0.7193950049372607, + "grad_norm": 1.5317072868347168, + "learning_rate": 2.954061454213568e-07, + "loss": 0.858, + "step": 29870 + }, + { + "epoch": 0.7195154259290479, + "grad_norm": 1.4505599737167358, + "learning_rate": 2.9527938342967246e-07, + "loss": 0.8647, + "step": 29875 + }, + { + "epoch": 0.7196358469208353, + "grad_norm": 1.4833729267120361, + "learning_rate": 2.95152621437988e-07, + "loss": 0.8414, + "step": 29880 + }, + { + "epoch": 0.7197562679126225, + "grad_norm": 1.6453678607940674, + "learning_rate": 2.950258594463036e-07, + "loss": 0.8929, + "step": 29885 + }, + { + "epoch": 0.7198766889044098, + "grad_norm": 1.612945556640625, + "learning_rate": 2.9489909745461924e-07, + "loss": 0.8469, + "step": 29890 + }, + { + "epoch": 0.7199971098961971, + "grad_norm": 1.5266822576522827, + "learning_rate": 2.947723354629348e-07, + "loss": 0.8734, + "step": 29895 + }, + { + "epoch": 0.7201175308879844, + "grad_norm": 1.4711270332336426, + "learning_rate": 2.9464557347125037e-07, + "loss": 0.9144, + "step": 29900 + }, + { + "epoch": 0.7202379518797717, + "grad_norm": 1.4607304334640503, + "learning_rate": 2.9451881147956596e-07, + "loss": 0.9439, + "step": 29905 + }, + { + "epoch": 0.720358372871559, + "grad_norm": 1.421431541442871, + "learning_rate": 2.9439204948788155e-07, + "loss": 0.8784, + "step": 29910 + }, + { + "epoch": 0.7204787938633462, + "grad_norm": 1.4766961336135864, + "learning_rate": 2.942652874961971e-07, + "loss": 0.8832, + "step": 29915 + }, + { + "epoch": 0.7205992148551336, + "grad_norm": 1.451119303703308, + "learning_rate": 2.9413852550451274e-07, + "loss": 0.9138, + "step": 29920 + }, + { + "epoch": 0.7207196358469208, + "grad_norm": 1.7488954067230225, + "learning_rate": 2.9401176351282833e-07, + "loss": 0.8863, + "step": 29925 + }, + { + "epoch": 0.7208400568387081, + "grad_norm": 1.5258170366287231, + "learning_rate": 2.9388500152114387e-07, + "loss": 0.8871, + "step": 29930 + }, + { + "epoch": 0.7209604778304954, + "grad_norm": 2.0056021213531494, + "learning_rate": 2.937582395294595e-07, + "loss": 0.8868, + "step": 29935 + }, + { + "epoch": 0.7210808988222827, + "grad_norm": 1.5685434341430664, + "learning_rate": 2.9363147753777505e-07, + "loss": 0.8433, + "step": 29940 + }, + { + "epoch": 0.72120131981407, + "grad_norm": 1.5814381837844849, + "learning_rate": 2.9350471554609064e-07, + "loss": 0.906, + "step": 29945 + }, + { + "epoch": 0.7213217408058573, + "grad_norm": 1.5315696001052856, + "learning_rate": 2.9337795355440623e-07, + "loss": 0.8956, + "step": 29950 + }, + { + "epoch": 0.7214421617976445, + "grad_norm": 1.589921236038208, + "learning_rate": 2.9325119156272183e-07, + "loss": 0.9191, + "step": 29955 + }, + { + "epoch": 0.7215625827894319, + "grad_norm": 1.6248891353607178, + "learning_rate": 2.9312442957103737e-07, + "loss": 0.9341, + "step": 29960 + }, + { + "epoch": 0.7216830037812192, + "grad_norm": 1.5531809329986572, + "learning_rate": 2.92997667579353e-07, + "loss": 0.8565, + "step": 29965 + }, + { + "epoch": 0.7218034247730064, + "grad_norm": 1.4952003955841064, + "learning_rate": 2.928709055876686e-07, + "loss": 0.9033, + "step": 29970 + }, + { + "epoch": 0.7219238457647937, + "grad_norm": 1.6778074502944946, + "learning_rate": 2.9274414359598414e-07, + "loss": 0.8708, + "step": 29975 + }, + { + "epoch": 0.722044266756581, + "grad_norm": 1.4443367719650269, + "learning_rate": 2.926173816042998e-07, + "loss": 0.9148, + "step": 29980 + }, + { + "epoch": 0.7221646877483683, + "grad_norm": 1.5499740839004517, + "learning_rate": 2.924906196126153e-07, + "loss": 0.9231, + "step": 29985 + }, + { + "epoch": 0.7222851087401556, + "grad_norm": 1.434739589691162, + "learning_rate": 2.923638576209309e-07, + "loss": 0.8409, + "step": 29990 + }, + { + "epoch": 0.7224055297319428, + "grad_norm": 1.4947270154953003, + "learning_rate": 2.922370956292465e-07, + "loss": 0.8939, + "step": 29995 + }, + { + "epoch": 0.7225259507237302, + "grad_norm": 1.7011665105819702, + "learning_rate": 2.921103336375621e-07, + "loss": 0.8744, + "step": 30000 + } + ], + "logging_steps": 5, + "max_steps": 41521, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.180075135360408e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}