{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.109787367972822, "eval_steps": 500, "global_step": 138000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037027780092014034, "grad_norm": 21.161716771412177, "learning_rate": 3.6657162957751695e-08, "loss": 2.6783, "step": 100 }, { "epoch": 0.007405556018402807, "grad_norm": 17.61802823072404, "learning_rate": 7.368460028881402e-08, "loss": 2.6426, "step": 200 }, { "epoch": 0.01110833402760421, "grad_norm": 8.635640044259958, "learning_rate": 1.1071203761987633e-07, "loss": 2.5725, "step": 300 }, { "epoch": 0.014811112036805614, "grad_norm": 4.290727817061978, "learning_rate": 1.4773947495093866e-07, "loss": 2.4757, "step": 400 }, { "epoch": 0.018513890046007016, "grad_norm": 3.791476034639625, "learning_rate": 1.8476691228200099e-07, "loss": 2.367, "step": 500 }, { "epoch": 0.02221666805520842, "grad_norm": 4.547923325980467, "learning_rate": 2.2179434961306329e-07, "loss": 2.3288, "step": 600 }, { "epoch": 0.025919446064409823, "grad_norm": 4.402404364420564, "learning_rate": 2.5882178694412564e-07, "loss": 2.2759, "step": 700 }, { "epoch": 0.029622224073611227, "grad_norm": 3.6276565743103535, "learning_rate": 2.958492242751879e-07, "loss": 2.2766, "step": 800 }, { "epoch": 0.03332500208281263, "grad_norm": 4.170244852808218, "learning_rate": 3.3287666160625024e-07, "loss": 2.2338, "step": 900 }, { "epoch": 0.03702778009201403, "grad_norm": 4.412766673186702, "learning_rate": 3.699040989373126e-07, "loss": 2.1561, "step": 1000 }, { "epoch": 0.04073055810121544, "grad_norm": 4.628401400135504, "learning_rate": 4.069315362683749e-07, "loss": 2.1544, "step": 1100 }, { "epoch": 0.04443333611041684, "grad_norm": 3.7081302230204867, "learning_rate": 4.439589735994372e-07, "loss": 2.106, "step": 1200 }, { "epoch": 0.04813611411961825, "grad_norm": 4.2137908215712905, "learning_rate": 4.809864109304995e-07, "loss": 2.0794, "step": 1300 }, { "epoch": 0.05183889212881965, "grad_norm": 4.60851346975717, "learning_rate": 5.180138482615619e-07, "loss": 2.0641, "step": 1400 }, { "epoch": 0.05554167013802105, "grad_norm": 4.313023013354129, "learning_rate": 5.550412855926242e-07, "loss": 1.9983, "step": 1500 }, { "epoch": 0.059244448147222455, "grad_norm": 3.966177577383383, "learning_rate": 5.920687229236865e-07, "loss": 2.0366, "step": 1600 }, { "epoch": 0.06294722615642385, "grad_norm": 4.661276659671688, "learning_rate": 6.290961602547487e-07, "loss": 2.0112, "step": 1700 }, { "epoch": 0.06665000416562526, "grad_norm": 3.856161388179088, "learning_rate": 6.661235975858112e-07, "loss": 1.9547, "step": 1800 }, { "epoch": 0.07035278217482667, "grad_norm": 4.5116015322440015, "learning_rate": 7.031510349168734e-07, "loss": 1.9599, "step": 1900 }, { "epoch": 0.07405556018402806, "grad_norm": 3.9605628970368416, "learning_rate": 7.401784722479357e-07, "loss": 1.9673, "step": 2000 }, { "epoch": 0.07775833819322947, "grad_norm": 4.534104810402031, "learning_rate": 7.772059095789982e-07, "loss": 1.8987, "step": 2100 }, { "epoch": 0.08146111620243088, "grad_norm": 3.7768720377686815, "learning_rate": 8.142333469100604e-07, "loss": 1.9039, "step": 2200 }, { "epoch": 0.08516389421163227, "grad_norm": 3.709870476374346, "learning_rate": 8.512607842411227e-07, "loss": 1.8572, "step": 2300 }, { "epoch": 0.08886667222083368, "grad_norm": 3.861469813158571, "learning_rate": 8.882882215721851e-07, "loss": 1.831, "step": 2400 }, { "epoch": 0.09256945023003509, "grad_norm": 4.046784580473015, "learning_rate": 9.253156589032473e-07, "loss": 1.856, "step": 2500 }, { "epoch": 0.0962722282392365, "grad_norm": 4.439862554726597, "learning_rate": 9.623430962343098e-07, "loss": 1.8344, "step": 2600 }, { "epoch": 0.09997500624843789, "grad_norm": 6.222158633649652, "learning_rate": 9.993705335653721e-07, "loss": 1.8408, "step": 2700 }, { "epoch": 0.1036777842576393, "grad_norm": 3.7129912385070503, "learning_rate": 1.0363979708964342e-06, "loss": 1.8195, "step": 2800 }, { "epoch": 0.1073805622668407, "grad_norm": 4.073889809938404, "learning_rate": 1.0734254082274968e-06, "loss": 1.7714, "step": 2900 }, { "epoch": 0.1110833402760421, "grad_norm": 3.8485467079112805, "learning_rate": 1.1104528455585589e-06, "loss": 1.7782, "step": 3000 }, { "epoch": 0.1147861182852435, "grad_norm": 3.548076088775858, "learning_rate": 1.1474802828896212e-06, "loss": 1.7646, "step": 3100 }, { "epoch": 0.11848889629444491, "grad_norm": 3.754340846452376, "learning_rate": 1.1845077202206837e-06, "loss": 1.7733, "step": 3200 }, { "epoch": 0.12219167430364632, "grad_norm": 6.281470572024916, "learning_rate": 1.2215351575517459e-06, "loss": 1.7361, "step": 3300 }, { "epoch": 0.1258944523128477, "grad_norm": 4.065489405313006, "learning_rate": 1.2585625948828082e-06, "loss": 1.7397, "step": 3400 }, { "epoch": 0.12959723032204912, "grad_norm": 4.260390115086277, "learning_rate": 1.2955900322138707e-06, "loss": 1.7571, "step": 3500 }, { "epoch": 0.13330000833125052, "grad_norm": 5.9838104492225215, "learning_rate": 1.3326174695449328e-06, "loss": 1.7205, "step": 3600 }, { "epoch": 0.13700278634045193, "grad_norm": 4.7607112587769915, "learning_rate": 1.3696449068759952e-06, "loss": 1.713, "step": 3700 }, { "epoch": 0.14070556434965334, "grad_norm": 4.047013434162942, "learning_rate": 1.4066723442070577e-06, "loss": 1.7084, "step": 3800 }, { "epoch": 0.14440834235885472, "grad_norm": 6.154993338003737, "learning_rate": 1.4436997815381198e-06, "loss": 1.6784, "step": 3900 }, { "epoch": 0.14811112036805613, "grad_norm": 3.6172501531655614, "learning_rate": 1.4807272188691821e-06, "loss": 1.6514, "step": 4000 }, { "epoch": 0.15181389837725753, "grad_norm": 3.9661920321991757, "learning_rate": 1.5177546562002447e-06, "loss": 1.6642, "step": 4100 }, { "epoch": 0.15551667638645894, "grad_norm": 4.468519494026533, "learning_rate": 1.5547820935313068e-06, "loss": 1.6517, "step": 4200 }, { "epoch": 0.15921945439566035, "grad_norm": 4.809090349712038, "learning_rate": 1.5918095308623691e-06, "loss": 1.6456, "step": 4300 }, { "epoch": 0.16292223240486176, "grad_norm": 3.991862639554731, "learning_rate": 1.6288369681934317e-06, "loss": 1.6474, "step": 4400 }, { "epoch": 0.16662501041406316, "grad_norm": 4.719090103521477, "learning_rate": 1.6658644055244938e-06, "loss": 1.6042, "step": 4500 }, { "epoch": 0.17032778842326454, "grad_norm": 4.927706214636146, "learning_rate": 1.702891842855556e-06, "loss": 1.5918, "step": 4600 }, { "epoch": 0.17403056643246595, "grad_norm": 4.566524993389404, "learning_rate": 1.7399192801866186e-06, "loss": 1.6103, "step": 4700 }, { "epoch": 0.17773334444166736, "grad_norm": 4.908438965819459, "learning_rate": 1.7769467175176807e-06, "loss": 1.6101, "step": 4800 }, { "epoch": 0.18143612245086876, "grad_norm": 3.8011540864311866, "learning_rate": 1.813974154848743e-06, "loss": 1.6162, "step": 4900 }, { "epoch": 0.18513890046007017, "grad_norm": 4.241301056669864, "learning_rate": 1.8510015921798056e-06, "loss": 1.6117, "step": 5000 }, { "epoch": 0.18884167846927158, "grad_norm": 4.305210424709023, "learning_rate": 1.8880290295108677e-06, "loss": 1.5634, "step": 5100 }, { "epoch": 0.192544456478473, "grad_norm": 5.188453275523176, "learning_rate": 1.92505646684193e-06, "loss": 1.5839, "step": 5200 }, { "epoch": 0.19624723448767437, "grad_norm": 4.390695329489216, "learning_rate": 1.9620839041729924e-06, "loss": 1.5747, "step": 5300 }, { "epoch": 0.19995001249687577, "grad_norm": 4.581970539434446, "learning_rate": 1.999111341504055e-06, "loss": 1.5597, "step": 5400 }, { "epoch": 0.20365279050607718, "grad_norm": 4.0470801193804755, "learning_rate": 2.036138778835117e-06, "loss": 1.5275, "step": 5500 }, { "epoch": 0.2073555685152786, "grad_norm": 4.1287928029941385, "learning_rate": 2.073166216166179e-06, "loss": 1.5455, "step": 5600 }, { "epoch": 0.21105834652448, "grad_norm": 5.5739542390705274, "learning_rate": 2.1101936534972417e-06, "loss": 1.526, "step": 5700 }, { "epoch": 0.2147611245336814, "grad_norm": 4.072688346808655, "learning_rate": 2.147221090828304e-06, "loss": 1.5006, "step": 5800 }, { "epoch": 0.2184639025428828, "grad_norm": 5.331904305941977, "learning_rate": 2.1842485281593663e-06, "loss": 1.5417, "step": 5900 }, { "epoch": 0.2221666805520842, "grad_norm": 4.378202415797325, "learning_rate": 2.221275965490429e-06, "loss": 1.4885, "step": 6000 }, { "epoch": 0.2258694585612856, "grad_norm": 4.318453195850846, "learning_rate": 2.258303402821491e-06, "loss": 1.5359, "step": 6100 }, { "epoch": 0.229572236570487, "grad_norm": 4.475047536813248, "learning_rate": 2.295330840152553e-06, "loss": 1.5168, "step": 6200 }, { "epoch": 0.2332750145796884, "grad_norm": 4.665314820774367, "learning_rate": 2.3323582774836156e-06, "loss": 1.4688, "step": 6300 }, { "epoch": 0.23697779258888982, "grad_norm": 5.210970519844203, "learning_rate": 2.3693857148146778e-06, "loss": 1.4775, "step": 6400 }, { "epoch": 0.24068057059809123, "grad_norm": 4.410640014744449, "learning_rate": 2.4064131521457403e-06, "loss": 1.4738, "step": 6500 }, { "epoch": 0.24438334860729263, "grad_norm": 3.944391436658137, "learning_rate": 2.443440589476803e-06, "loss": 1.4469, "step": 6600 }, { "epoch": 0.248086126616494, "grad_norm": 4.96868756013431, "learning_rate": 2.480468026807865e-06, "loss": 1.4821, "step": 6700 }, { "epoch": 0.2517889046256954, "grad_norm": 4.462942085694235, "learning_rate": 2.517495464138927e-06, "loss": 1.4591, "step": 6800 }, { "epoch": 0.25549168263489686, "grad_norm": 4.378031684132469, "learning_rate": 2.5545229014699896e-06, "loss": 1.4578, "step": 6900 }, { "epoch": 0.25919446064409823, "grad_norm": 4.241655047493595, "learning_rate": 2.5915503388010517e-06, "loss": 1.4705, "step": 7000 }, { "epoch": 0.2628972386532996, "grad_norm": 3.981686552106327, "learning_rate": 2.6285777761321142e-06, "loss": 1.4538, "step": 7100 }, { "epoch": 0.26660001666250105, "grad_norm": 4.3607343276496575, "learning_rate": 2.6656052134631768e-06, "loss": 1.411, "step": 7200 }, { "epoch": 0.27030279467170243, "grad_norm": 4.807349625008967, "learning_rate": 2.7026326507942385e-06, "loss": 1.4205, "step": 7300 }, { "epoch": 0.27400557268090386, "grad_norm": 4.289974925492848, "learning_rate": 2.739660088125301e-06, "loss": 1.4374, "step": 7400 }, { "epoch": 0.27770835069010524, "grad_norm": 3.491997941988729, "learning_rate": 2.7766875254563636e-06, "loss": 1.4393, "step": 7500 }, { "epoch": 0.2814111286993067, "grad_norm": 4.317473381602871, "learning_rate": 2.8137149627874257e-06, "loss": 1.3965, "step": 7600 }, { "epoch": 0.28511390670850806, "grad_norm": 4.317169353881641, "learning_rate": 2.850742400118488e-06, "loss": 1.4139, "step": 7700 }, { "epoch": 0.28881668471770944, "grad_norm": 4.342306783527047, "learning_rate": 2.8877698374495503e-06, "loss": 1.3966, "step": 7800 }, { "epoch": 0.2925194627269109, "grad_norm": 4.561613447481574, "learning_rate": 2.9247972747806124e-06, "loss": 1.4207, "step": 7900 }, { "epoch": 0.29622224073611225, "grad_norm": 4.603226423674348, "learning_rate": 2.961824712111675e-06, "loss": 1.4369, "step": 8000 }, { "epoch": 0.2999250187453137, "grad_norm": 4.628660307539308, "learning_rate": 2.9988521494427375e-06, "loss": 1.3871, "step": 8100 }, { "epoch": 0.30362779675451507, "grad_norm": 3.8865443379767415, "learning_rate": 3.0358795867737996e-06, "loss": 1.4182, "step": 8200 }, { "epoch": 0.3073305747637165, "grad_norm": 4.038269978955141, "learning_rate": 3.072907024104862e-06, "loss": 1.3622, "step": 8300 }, { "epoch": 0.3110333527729179, "grad_norm": 5.136502242831012, "learning_rate": 3.1099344614359243e-06, "loss": 1.3889, "step": 8400 }, { "epoch": 0.31473613078211926, "grad_norm": 4.457630892251384, "learning_rate": 3.1469618987669864e-06, "loss": 1.387, "step": 8500 }, { "epoch": 0.3184389087913207, "grad_norm": 4.651901883399858, "learning_rate": 3.183989336098049e-06, "loss": 1.3741, "step": 8600 }, { "epoch": 0.3221416868005221, "grad_norm": 4.800399697463299, "learning_rate": 3.2210167734291115e-06, "loss": 1.3473, "step": 8700 }, { "epoch": 0.3258444648097235, "grad_norm": 5.122987743156367, "learning_rate": 3.2580442107601736e-06, "loss": 1.3733, "step": 8800 }, { "epoch": 0.3295472428189249, "grad_norm": 5.307686552433291, "learning_rate": 3.295071648091236e-06, "loss": 1.341, "step": 8900 }, { "epoch": 0.3332500208281263, "grad_norm": 4.4204932532106085, "learning_rate": 3.3320990854222982e-06, "loss": 1.3615, "step": 9000 }, { "epoch": 0.3369527988373277, "grad_norm": 4.937899325267276, "learning_rate": 3.3691265227533603e-06, "loss": 1.368, "step": 9100 }, { "epoch": 0.3406555768465291, "grad_norm": 3.6435580501639326, "learning_rate": 3.406153960084423e-06, "loss": 1.341, "step": 9200 }, { "epoch": 0.3443583548557305, "grad_norm": 3.884956476912519, "learning_rate": 3.4431813974154854e-06, "loss": 1.3458, "step": 9300 }, { "epoch": 0.3480611328649319, "grad_norm": 4.291808958034935, "learning_rate": 3.4802088347465475e-06, "loss": 1.301, "step": 9400 }, { "epoch": 0.35176391087413333, "grad_norm": 4.55718057133725, "learning_rate": 3.51723627207761e-06, "loss": 1.3203, "step": 9500 }, { "epoch": 0.3554666888833347, "grad_norm": 3.927788263058885, "learning_rate": 3.554263709408672e-06, "loss": 1.3087, "step": 9600 }, { "epoch": 0.35916946689253615, "grad_norm": 4.334142071876793, "learning_rate": 3.5912911467397343e-06, "loss": 1.328, "step": 9700 }, { "epoch": 0.36287224490173753, "grad_norm": 3.9826425837051715, "learning_rate": 3.628318584070797e-06, "loss": 1.3162, "step": 9800 }, { "epoch": 0.3665750229109389, "grad_norm": 3.7373318962103834, "learning_rate": 3.665346021401859e-06, "loss": 1.3427, "step": 9900 }, { "epoch": 0.37027780092014034, "grad_norm": 4.309257631492866, "learning_rate": 3.7023734587329215e-06, "loss": 1.2918, "step": 10000 }, { "epoch": 0.3739805789293417, "grad_norm": 4.000949746879435, "learning_rate": 3.739400896063984e-06, "loss": 1.2864, "step": 10100 }, { "epoch": 0.37768335693854316, "grad_norm": 4.314641431918059, "learning_rate": 3.7764283333950457e-06, "loss": 1.2784, "step": 10200 }, { "epoch": 0.38138613494774454, "grad_norm": 4.136976500403523, "learning_rate": 3.8134557707261083e-06, "loss": 1.2821, "step": 10300 }, { "epoch": 0.385088912956946, "grad_norm": 4.8607889216449385, "learning_rate": 3.85048320805717e-06, "loss": 1.281, "step": 10400 }, { "epoch": 0.38879169096614735, "grad_norm": 4.432173453928133, "learning_rate": 3.8875106453882325e-06, "loss": 1.2959, "step": 10500 }, { "epoch": 0.39249446897534873, "grad_norm": 5.118601631131207, "learning_rate": 3.9245380827192954e-06, "loss": 1.2772, "step": 10600 }, { "epoch": 0.39619724698455017, "grad_norm": 4.189210920096741, "learning_rate": 3.9615655200503576e-06, "loss": 1.311, "step": 10700 }, { "epoch": 0.39990002499375155, "grad_norm": 4.9933785952086875, "learning_rate": 3.99859295738142e-06, "loss": 1.2798, "step": 10800 }, { "epoch": 0.403602803002953, "grad_norm": 3.8535620848124785, "learning_rate": 4.035620394712483e-06, "loss": 1.2532, "step": 10900 }, { "epoch": 0.40730558101215436, "grad_norm": 4.040521438500001, "learning_rate": 4.072647832043545e-06, "loss": 1.2739, "step": 11000 }, { "epoch": 0.4110083590213558, "grad_norm": 4.280568957640515, "learning_rate": 4.109675269374607e-06, "loss": 1.2755, "step": 11100 }, { "epoch": 0.4147111370305572, "grad_norm": 4.230099582418301, "learning_rate": 4.146702706705669e-06, "loss": 1.2676, "step": 11200 }, { "epoch": 0.41841391503975855, "grad_norm": 3.807801983954757, "learning_rate": 4.183730144036732e-06, "loss": 1.2487, "step": 11300 }, { "epoch": 0.42211669304896, "grad_norm": 4.2784766116809365, "learning_rate": 4.220757581367794e-06, "loss": 1.2545, "step": 11400 }, { "epoch": 0.42581947105816137, "grad_norm": 4.855973605257093, "learning_rate": 4.257785018698856e-06, "loss": 1.2436, "step": 11500 }, { "epoch": 0.4295222490673628, "grad_norm": 3.5115767446990995, "learning_rate": 4.294812456029918e-06, "loss": 1.2505, "step": 11600 }, { "epoch": 0.4332250270765642, "grad_norm": 3.7062317227814408, "learning_rate": 4.33183989336098e-06, "loss": 1.2175, "step": 11700 }, { "epoch": 0.4369278050857656, "grad_norm": 3.9387496036649936, "learning_rate": 4.368867330692043e-06, "loss": 1.222, "step": 11800 }, { "epoch": 0.440630583094967, "grad_norm": 5.23595658030001, "learning_rate": 4.4058947680231055e-06, "loss": 1.2388, "step": 11900 }, { "epoch": 0.4443333611041684, "grad_norm": 4.759096268753192, "learning_rate": 4.442922205354168e-06, "loss": 1.2036, "step": 12000 }, { "epoch": 0.4480361391133698, "grad_norm": 4.094075469283295, "learning_rate": 4.4799496426852306e-06, "loss": 1.2543, "step": 12100 }, { "epoch": 0.4517389171225712, "grad_norm": 4.153721175621665, "learning_rate": 4.516977080016293e-06, "loss": 1.227, "step": 12200 }, { "epoch": 0.45544169513177263, "grad_norm": 4.508419619958273, "learning_rate": 4.554004517347355e-06, "loss": 1.188, "step": 12300 }, { "epoch": 0.459144473140974, "grad_norm": 5.431232416017837, "learning_rate": 4.591031954678417e-06, "loss": 1.2254, "step": 12400 }, { "epoch": 0.46284725115017544, "grad_norm": 4.195672755415828, "learning_rate": 4.62805939200948e-06, "loss": 1.2161, "step": 12500 }, { "epoch": 0.4665500291593768, "grad_norm": 4.928316304383083, "learning_rate": 4.665086829340542e-06, "loss": 1.2117, "step": 12600 }, { "epoch": 0.4702528071685782, "grad_norm": 3.591246874782549, "learning_rate": 4.702114266671604e-06, "loss": 1.2114, "step": 12700 }, { "epoch": 0.47395558517777964, "grad_norm": 3.4385074546101473, "learning_rate": 4.739141704002666e-06, "loss": 1.1622, "step": 12800 }, { "epoch": 0.477658363186981, "grad_norm": 4.266312659389707, "learning_rate": 4.776169141333728e-06, "loss": 1.1994, "step": 12900 }, { "epoch": 0.48136114119618245, "grad_norm": 4.318134728161222, "learning_rate": 4.813196578664791e-06, "loss": 1.194, "step": 13000 }, { "epoch": 0.48506391920538383, "grad_norm": 3.743433627557919, "learning_rate": 4.850224015995853e-06, "loss": 1.1862, "step": 13100 }, { "epoch": 0.48876669721458527, "grad_norm": 4.432513953759492, "learning_rate": 4.8872514533269155e-06, "loss": 1.2034, "step": 13200 }, { "epoch": 0.49246947522378665, "grad_norm": 4.932972639382486, "learning_rate": 4.924278890657978e-06, "loss": 1.1829, "step": 13300 }, { "epoch": 0.496172253232988, "grad_norm": 4.552124318708114, "learning_rate": 4.961306327989041e-06, "loss": 1.1877, "step": 13400 }, { "epoch": 0.49987503124218946, "grad_norm": 4.57039085993272, "learning_rate": 4.998333765320103e-06, "loss": 1.1804, "step": 13500 }, { "epoch": 0.5035778092513908, "grad_norm": 4.435102306960172, "learning_rate": 5.035361202651165e-06, "loss": 1.1673, "step": 13600 }, { "epoch": 0.5072805872605922, "grad_norm": 4.186670025498094, "learning_rate": 5.072388639982228e-06, "loss": 1.1933, "step": 13700 }, { "epoch": 0.5109833652697937, "grad_norm": 4.405934801625066, "learning_rate": 5.10941607731329e-06, "loss": 1.1772, "step": 13800 }, { "epoch": 0.5146861432789951, "grad_norm": 4.311924669929619, "learning_rate": 5.146443514644351e-06, "loss": 1.1665, "step": 13900 }, { "epoch": 0.5183889212881965, "grad_norm": 5.625220459288197, "learning_rate": 5.183470951975414e-06, "loss": 1.1735, "step": 14000 }, { "epoch": 0.5220916992973978, "grad_norm": 3.8405703022228606, "learning_rate": 5.220498389306476e-06, "loss": 1.1904, "step": 14100 }, { "epoch": 0.5257944773065992, "grad_norm": 4.059806709247945, "learning_rate": 5.257525826637538e-06, "loss": 1.1588, "step": 14200 }, { "epoch": 0.5294972553158007, "grad_norm": 3.8722695281592485, "learning_rate": 5.294553263968601e-06, "loss": 1.1469, "step": 14300 }, { "epoch": 0.5332000333250021, "grad_norm": 4.66275202781423, "learning_rate": 5.331580701299663e-06, "loss": 1.1442, "step": 14400 }, { "epoch": 0.5369028113342035, "grad_norm": 4.6182096996448845, "learning_rate": 5.3686081386307255e-06, "loss": 1.1543, "step": 14500 }, { "epoch": 0.5406055893434049, "grad_norm": 3.9837337174635103, "learning_rate": 5.4056355759617885e-06, "loss": 1.158, "step": 14600 }, { "epoch": 0.5443083673526063, "grad_norm": 3.8034049725872356, "learning_rate": 5.442663013292851e-06, "loss": 1.1546, "step": 14700 }, { "epoch": 0.5480111453618077, "grad_norm": 4.62083026318626, "learning_rate": 5.479690450623913e-06, "loss": 1.1553, "step": 14800 }, { "epoch": 0.5517139233710091, "grad_norm": 4.462803142186011, "learning_rate": 5.516717887954976e-06, "loss": 1.1506, "step": 14900 }, { "epoch": 0.5554167013802105, "grad_norm": 4.165198766777481, "learning_rate": 5.553745325286038e-06, "loss": 1.147, "step": 15000 }, { "epoch": 0.5591194793894119, "grad_norm": 3.5365327477483297, "learning_rate": 5.590772762617099e-06, "loss": 1.1341, "step": 15100 }, { "epoch": 0.5628222573986134, "grad_norm": 3.8324766687461653, "learning_rate": 5.627800199948162e-06, "loss": 1.1123, "step": 15200 }, { "epoch": 0.5665250354078147, "grad_norm": 4.271061462203587, "learning_rate": 5.664827637279224e-06, "loss": 1.1212, "step": 15300 }, { "epoch": 0.5702278134170161, "grad_norm": 4.5272229115194875, "learning_rate": 5.701855074610286e-06, "loss": 1.1438, "step": 15400 }, { "epoch": 0.5739305914262175, "grad_norm": 4.660071317336288, "learning_rate": 5.738882511941349e-06, "loss": 1.1174, "step": 15500 }, { "epoch": 0.5776333694354189, "grad_norm": 3.8174601055554094, "learning_rate": 5.775909949272411e-06, "loss": 1.095, "step": 15600 }, { "epoch": 0.5813361474446204, "grad_norm": 3.9772037532519784, "learning_rate": 5.8129373866034734e-06, "loss": 1.0901, "step": 15700 }, { "epoch": 0.5850389254538217, "grad_norm": 4.117368390246338, "learning_rate": 5.849964823934536e-06, "loss": 1.1218, "step": 15800 }, { "epoch": 0.5887417034630231, "grad_norm": 3.6969429985121387, "learning_rate": 5.8869922612655985e-06, "loss": 1.1172, "step": 15900 }, { "epoch": 0.5924444814722245, "grad_norm": 4.208647101328797, "learning_rate": 5.924019698596661e-06, "loss": 1.1005, "step": 16000 }, { "epoch": 0.596147259481426, "grad_norm": 4.0291894246178455, "learning_rate": 5.961047135927724e-06, "loss": 1.1129, "step": 16100 }, { "epoch": 0.5998500374906274, "grad_norm": 4.713616701400172, "learning_rate": 5.998074573258785e-06, "loss": 1.1002, "step": 16200 }, { "epoch": 0.6035528154998288, "grad_norm": 4.1930372418205355, "learning_rate": 6.035102010589847e-06, "loss": 1.0886, "step": 16300 }, { "epoch": 0.6072555935090301, "grad_norm": 4.254442138954682, "learning_rate": 6.07212944792091e-06, "loss": 1.0954, "step": 16400 }, { "epoch": 0.6109583715182315, "grad_norm": 3.7190710738003014, "learning_rate": 6.109156885251972e-06, "loss": 1.1288, "step": 16500 }, { "epoch": 0.614661149527433, "grad_norm": 5.286326819627371, "learning_rate": 6.146184322583034e-06, "loss": 1.0861, "step": 16600 }, { "epoch": 0.6183639275366344, "grad_norm": 3.5036265705416, "learning_rate": 6.183211759914097e-06, "loss": 1.1086, "step": 16700 }, { "epoch": 0.6220667055458358, "grad_norm": 3.5111130437153495, "learning_rate": 6.220239197245159e-06, "loss": 1.1022, "step": 16800 }, { "epoch": 0.6257694835550371, "grad_norm": 3.2896238161080946, "learning_rate": 6.257266634576221e-06, "loss": 1.0955, "step": 16900 }, { "epoch": 0.6294722615642385, "grad_norm": 3.6540403667854604, "learning_rate": 6.294294071907284e-06, "loss": 1.058, "step": 17000 }, { "epoch": 0.63317503957344, "grad_norm": 4.4670975803638475, "learning_rate": 6.3313215092383464e-06, "loss": 1.068, "step": 17100 }, { "epoch": 0.6368778175826414, "grad_norm": 4.530195581158383, "learning_rate": 6.3683489465694085e-06, "loss": 1.1105, "step": 17200 }, { "epoch": 0.6405805955918428, "grad_norm": 3.8288830335210995, "learning_rate": 6.4053763839004715e-06, "loss": 1.0332, "step": 17300 }, { "epoch": 0.6442833736010442, "grad_norm": 3.815833342915358, "learning_rate": 6.442403821231533e-06, "loss": 1.0794, "step": 17400 }, { "epoch": 0.6479861516102456, "grad_norm": 4.604021991413655, "learning_rate": 6.479431258562595e-06, "loss": 1.079, "step": 17500 }, { "epoch": 0.651688929619447, "grad_norm": 3.577028223699294, "learning_rate": 6.516458695893658e-06, "loss": 1.0653, "step": 17600 }, { "epoch": 0.6553917076286484, "grad_norm": 4.111818385244818, "learning_rate": 6.55348613322472e-06, "loss": 1.035, "step": 17700 }, { "epoch": 0.6590944856378498, "grad_norm": 3.56811764186164, "learning_rate": 6.590513570555782e-06, "loss": 1.0452, "step": 17800 }, { "epoch": 0.6627972636470512, "grad_norm": 3.87646009188637, "learning_rate": 6.627541007886845e-06, "loss": 1.0493, "step": 17900 }, { "epoch": 0.6665000416562527, "grad_norm": 3.406407594729227, "learning_rate": 6.664568445217907e-06, "loss": 1.0651, "step": 18000 }, { "epoch": 0.670202819665454, "grad_norm": 2.866315730154109, "learning_rate": 6.701595882548969e-06, "loss": 1.0561, "step": 18100 }, { "epoch": 0.6739055976746554, "grad_norm": 4.376427115975032, "learning_rate": 6.738623319880032e-06, "loss": 1.0602, "step": 18200 }, { "epoch": 0.6776083756838568, "grad_norm": 3.2373014543024072, "learning_rate": 6.775650757211094e-06, "loss": 1.0397, "step": 18300 }, { "epoch": 0.6813111536930582, "grad_norm": 3.9666084272474245, "learning_rate": 6.8126781945421565e-06, "loss": 1.0445, "step": 18400 }, { "epoch": 0.6850139317022597, "grad_norm": 3.800460648153449, "learning_rate": 6.8497056318732194e-06, "loss": 1.0494, "step": 18500 }, { "epoch": 0.688716709711461, "grad_norm": 4.274139967670289, "learning_rate": 6.886733069204281e-06, "loss": 1.0756, "step": 18600 }, { "epoch": 0.6924194877206624, "grad_norm": 3.9994897603573665, "learning_rate": 6.923760506535343e-06, "loss": 1.0412, "step": 18700 }, { "epoch": 0.6961222657298638, "grad_norm": 3.8633706482810553, "learning_rate": 6.960787943866406e-06, "loss": 1.0522, "step": 18800 }, { "epoch": 0.6998250437390653, "grad_norm": 4.381038145148409, "learning_rate": 6.997815381197468e-06, "loss": 1.0209, "step": 18900 }, { "epoch": 0.7035278217482667, "grad_norm": 4.1366815727300175, "learning_rate": 7.03484281852853e-06, "loss": 1.0351, "step": 19000 }, { "epoch": 0.707230599757468, "grad_norm": 3.675160954426471, "learning_rate": 7.071870255859593e-06, "loss": 1.0291, "step": 19100 }, { "epoch": 0.7109333777666694, "grad_norm": 3.603001585625626, "learning_rate": 7.108897693190655e-06, "loss": 1.0093, "step": 19200 }, { "epoch": 0.7146361557758708, "grad_norm": 3.264233985905883, "learning_rate": 7.145925130521717e-06, "loss": 1.0394, "step": 19300 }, { "epoch": 0.7183389337850723, "grad_norm": 3.8898393773576614, "learning_rate": 7.182952567852779e-06, "loss": 1.0309, "step": 19400 }, { "epoch": 0.7220417117942737, "grad_norm": 3.2365236970202917, "learning_rate": 7.219980005183842e-06, "loss": 1.0304, "step": 19500 }, { "epoch": 0.7257444898034751, "grad_norm": 3.497985907223146, "learning_rate": 7.257007442514904e-06, "loss": 1.0375, "step": 19600 }, { "epoch": 0.7294472678126764, "grad_norm": 3.6359487776078714, "learning_rate": 7.294034879845966e-06, "loss": 1.0132, "step": 19700 }, { "epoch": 0.7331500458218778, "grad_norm": 3.9379821448814343, "learning_rate": 7.331062317177029e-06, "loss": 1.019, "step": 19800 }, { "epoch": 0.7368528238310793, "grad_norm": 4.750912299244358, "learning_rate": 7.368089754508091e-06, "loss": 1.0172, "step": 19900 }, { "epoch": 0.7405556018402807, "grad_norm": 4.253487251656156, "learning_rate": 7.405117191839153e-06, "loss": 1.0093, "step": 20000 }, { "epoch": 0.7442583798494821, "grad_norm": 3.5926201949515284, "learning_rate": 7.442144629170216e-06, "loss": 1.0163, "step": 20100 }, { "epoch": 0.7479611578586834, "grad_norm": 3.8927981125403073, "learning_rate": 7.479172066501278e-06, "loss": 0.9827, "step": 20200 }, { "epoch": 0.7516639358678849, "grad_norm": 4.573750708649385, "learning_rate": 7.51619950383234e-06, "loss": 1.0087, "step": 20300 }, { "epoch": 0.7553667138770863, "grad_norm": 3.1260071677382952, "learning_rate": 7.553226941163403e-06, "loss": 1.0275, "step": 20400 }, { "epoch": 0.7590694918862877, "grad_norm": 3.965170800530151, "learning_rate": 7.590254378494465e-06, "loss": 1.0132, "step": 20500 }, { "epoch": 0.7627722698954891, "grad_norm": 3.1267324185244876, "learning_rate": 7.627281815825527e-06, "loss": 0.9803, "step": 20600 }, { "epoch": 0.7664750479046905, "grad_norm": 3.4086206206945358, "learning_rate": 7.66430925315659e-06, "loss": 0.9954, "step": 20700 }, { "epoch": 0.770177825913892, "grad_norm": 4.254989394844253, "learning_rate": 7.701336690487652e-06, "loss": 0.9922, "step": 20800 }, { "epoch": 0.7738806039230933, "grad_norm": 3.7295802263455564, "learning_rate": 7.738364127818714e-06, "loss": 1.0004, "step": 20900 }, { "epoch": 0.7775833819322947, "grad_norm": 4.671822378080011, "learning_rate": 7.775391565149777e-06, "loss": 0.9887, "step": 21000 }, { "epoch": 0.7812861599414961, "grad_norm": 3.941447202712077, "learning_rate": 7.81241900248084e-06, "loss": 0.982, "step": 21100 }, { "epoch": 0.7849889379506975, "grad_norm": 3.6530756566069633, "learning_rate": 7.8494464398119e-06, "loss": 0.9668, "step": 21200 }, { "epoch": 0.788691715959899, "grad_norm": 3.9080413448774625, "learning_rate": 7.886473877142964e-06, "loss": 1.0055, "step": 21300 }, { "epoch": 0.7923944939691003, "grad_norm": 3.603292889224154, "learning_rate": 7.923501314474025e-06, "loss": 1.0039, "step": 21400 }, { "epoch": 0.7960972719783017, "grad_norm": 3.677324652882952, "learning_rate": 7.960528751805088e-06, "loss": 1.0226, "step": 21500 }, { "epoch": 0.7998000499875031, "grad_norm": 3.6513050899215056, "learning_rate": 7.997556189136151e-06, "loss": 0.9766, "step": 21600 }, { "epoch": 0.8035028279967046, "grad_norm": 3.357793359434031, "learning_rate": 8.034583626467212e-06, "loss": 0.9813, "step": 21700 }, { "epoch": 0.807205606005906, "grad_norm": 2.760916399152151, "learning_rate": 8.071611063798275e-06, "loss": 0.979, "step": 21800 }, { "epoch": 0.8109083840151073, "grad_norm": 3.6887561411257046, "learning_rate": 8.108638501129338e-06, "loss": 0.9601, "step": 21900 }, { "epoch": 0.8146111620243087, "grad_norm": 3.652784111767623, "learning_rate": 8.1456659384604e-06, "loss": 0.9799, "step": 22000 }, { "epoch": 0.8183139400335101, "grad_norm": 3.479191284518454, "learning_rate": 8.182693375791462e-06, "loss": 0.9785, "step": 22100 }, { "epoch": 0.8220167180427116, "grad_norm": 4.128090009925586, "learning_rate": 8.219720813122525e-06, "loss": 0.9841, "step": 22200 }, { "epoch": 0.825719496051913, "grad_norm": 3.662863541747893, "learning_rate": 8.256748250453587e-06, "loss": 0.947, "step": 22300 }, { "epoch": 0.8294222740611144, "grad_norm": 3.217491905494941, "learning_rate": 8.29377568778465e-06, "loss": 0.9757, "step": 22400 }, { "epoch": 0.8331250520703157, "grad_norm": 3.326399094958347, "learning_rate": 8.330803125115712e-06, "loss": 0.9598, "step": 22500 }, { "epoch": 0.8368278300795171, "grad_norm": 2.909529562627351, "learning_rate": 8.367830562446774e-06, "loss": 0.9587, "step": 22600 }, { "epoch": 0.8405306080887186, "grad_norm": 3.1644941957196346, "learning_rate": 8.404857999777835e-06, "loss": 0.9689, "step": 22700 }, { "epoch": 0.84423338609792, "grad_norm": 3.152721404570826, "learning_rate": 8.441885437108898e-06, "loss": 0.9822, "step": 22800 }, { "epoch": 0.8479361641071214, "grad_norm": 3.879640971417378, "learning_rate": 8.478912874439961e-06, "loss": 0.9379, "step": 22900 }, { "epoch": 0.8516389421163227, "grad_norm": 4.432110940230918, "learning_rate": 8.515940311771022e-06, "loss": 0.9544, "step": 23000 }, { "epoch": 0.8553417201255242, "grad_norm": 3.2420578574260697, "learning_rate": 8.552967749102085e-06, "loss": 0.9292, "step": 23100 }, { "epoch": 0.8590444981347256, "grad_norm": 4.617591314029601, "learning_rate": 8.589995186433148e-06, "loss": 0.9477, "step": 23200 }, { "epoch": 0.862747276143927, "grad_norm": 3.0259937163445194, "learning_rate": 8.62702262376421e-06, "loss": 0.9559, "step": 23300 }, { "epoch": 0.8664500541531284, "grad_norm": 3.278192583185341, "learning_rate": 8.664050061095272e-06, "loss": 0.9583, "step": 23400 }, { "epoch": 0.8701528321623297, "grad_norm": 3.6509553777490424, "learning_rate": 8.701077498426335e-06, "loss": 0.9749, "step": 23500 }, { "epoch": 0.8738556101715312, "grad_norm": 3.4075169195241677, "learning_rate": 8.738104935757397e-06, "loss": 0.9223, "step": 23600 }, { "epoch": 0.8775583881807326, "grad_norm": 2.893435896152694, "learning_rate": 8.77513237308846e-06, "loss": 0.9721, "step": 23700 }, { "epoch": 0.881261166189934, "grad_norm": 4.064038243050667, "learning_rate": 8.81215981041952e-06, "loss": 0.9648, "step": 23800 }, { "epoch": 0.8849639441991354, "grad_norm": 3.28397127001984, "learning_rate": 8.849187247750584e-06, "loss": 0.9413, "step": 23900 }, { "epoch": 0.8886667222083368, "grad_norm": 2.7396572790329183, "learning_rate": 8.886214685081647e-06, "loss": 0.9538, "step": 24000 }, { "epoch": 0.8923695002175382, "grad_norm": 3.831935799094677, "learning_rate": 8.923242122412708e-06, "loss": 0.9488, "step": 24100 }, { "epoch": 0.8960722782267396, "grad_norm": 3.083787982483977, "learning_rate": 8.960269559743771e-06, "loss": 0.9798, "step": 24200 }, { "epoch": 0.899775056235941, "grad_norm": 3.2638182611068784, "learning_rate": 8.997296997074834e-06, "loss": 0.9364, "step": 24300 }, { "epoch": 0.9034778342451424, "grad_norm": 2.8271807284549824, "learning_rate": 9.034324434405895e-06, "loss": 0.9592, "step": 24400 }, { "epoch": 0.9071806122543439, "grad_norm": 3.1878575837021295, "learning_rate": 9.071351871736958e-06, "loss": 0.944, "step": 24500 }, { "epoch": 0.9108833902635453, "grad_norm": 3.344639834530028, "learning_rate": 9.108379309068021e-06, "loss": 0.9305, "step": 24600 }, { "epoch": 0.9145861682727466, "grad_norm": 3.271225972067076, "learning_rate": 9.145406746399082e-06, "loss": 0.9334, "step": 24700 }, { "epoch": 0.918288946281948, "grad_norm": 3.7284762668702314, "learning_rate": 9.182434183730145e-06, "loss": 0.9195, "step": 24800 }, { "epoch": 0.9219917242911494, "grad_norm": 4.098726803359245, "learning_rate": 9.219461621061208e-06, "loss": 0.9398, "step": 24900 }, { "epoch": 0.9256945023003509, "grad_norm": 2.8965683319135795, "learning_rate": 9.25648905839227e-06, "loss": 0.9365, "step": 25000 }, { "epoch": 0.9293972803095523, "grad_norm": 3.472751774536713, "learning_rate": 9.29351649572333e-06, "loss": 0.942, "step": 25100 }, { "epoch": 0.9331000583187536, "grad_norm": 2.741682625283635, "learning_rate": 9.330543933054394e-06, "loss": 0.9463, "step": 25200 }, { "epoch": 0.936802836327955, "grad_norm": 2.9695965336662584, "learning_rate": 9.367571370385457e-06, "loss": 0.9355, "step": 25300 }, { "epoch": 0.9405056143371564, "grad_norm": 3.0507053957289814, "learning_rate": 9.404598807716518e-06, "loss": 0.9123, "step": 25400 }, { "epoch": 0.9442083923463579, "grad_norm": 3.997694772330171, "learning_rate": 9.441626245047581e-06, "loss": 0.9326, "step": 25500 }, { "epoch": 0.9479111703555593, "grad_norm": 3.006394895446821, "learning_rate": 9.478653682378644e-06, "loss": 0.9343, "step": 25600 }, { "epoch": 0.9516139483647607, "grad_norm": 3.030113997176904, "learning_rate": 9.515681119709705e-06, "loss": 0.9263, "step": 25700 }, { "epoch": 0.955316726373962, "grad_norm": 2.9203358557009187, "learning_rate": 9.552708557040768e-06, "loss": 0.9432, "step": 25800 }, { "epoch": 0.9590195043831635, "grad_norm": 3.320595109219397, "learning_rate": 9.589735994371831e-06, "loss": 0.9132, "step": 25900 }, { "epoch": 0.9627222823923649, "grad_norm": 3.2953825381075883, "learning_rate": 9.626763431702892e-06, "loss": 0.945, "step": 26000 }, { "epoch": 0.9664250604015663, "grad_norm": 2.8690624980388097, "learning_rate": 9.663790869033955e-06, "loss": 0.9269, "step": 26100 }, { "epoch": 0.9701278384107677, "grad_norm": 3.238448651703217, "learning_rate": 9.700818306365017e-06, "loss": 0.9223, "step": 26200 }, { "epoch": 0.973830616419969, "grad_norm": 3.8191924390794076, "learning_rate": 9.73784574369608e-06, "loss": 0.9294, "step": 26300 }, { "epoch": 0.9775333944291705, "grad_norm": 2.89593322378588, "learning_rate": 9.774873181027143e-06, "loss": 0.92, "step": 26400 }, { "epoch": 0.9812361724383719, "grad_norm": 3.701573808189793, "learning_rate": 9.811900618358204e-06, "loss": 0.9471, "step": 26500 }, { "epoch": 0.9849389504475733, "grad_norm": 3.334857306171596, "learning_rate": 9.848928055689267e-06, "loss": 0.9205, "step": 26600 }, { "epoch": 0.9886417284567747, "grad_norm": 3.3187413615000705, "learning_rate": 9.88595549302033e-06, "loss": 0.9065, "step": 26700 }, { "epoch": 0.992344506465976, "grad_norm": 3.3451918803521945, "learning_rate": 9.922982930351391e-06, "loss": 0.8933, "step": 26800 }, { "epoch": 0.9960472844751775, "grad_norm": 3.2100420698017147, "learning_rate": 9.960010367682454e-06, "loss": 0.9112, "step": 26900 }, { "epoch": 0.9997500624843789, "grad_norm": 2.837918388436053, "learning_rate": 9.997037805013515e-06, "loss": 0.9567, "step": 27000 }, { "epoch": 1.0034435835485573, "grad_norm": 2.9548645107416087, "learning_rate": 9.99999646509579e-06, "loss": 0.8909, "step": 27100 }, { "epoch": 1.0071463615577587, "grad_norm": 3.1446751198623435, "learning_rate": 9.999984604128388e-06, "loss": 0.8726, "step": 27200 }, { "epoch": 1.01084913956696, "grad_norm": 2.976258203122292, "learning_rate": 9.999964390388652e-06, "loss": 0.893, "step": 27300 }, { "epoch": 1.0145519175761615, "grad_norm": 2.980666927786789, "learning_rate": 9.999935823910352e-06, "loss": 0.8665, "step": 27400 }, { "epoch": 1.0182546955853629, "grad_norm": 3.1487567474786795, "learning_rate": 9.999898904741209e-06, "loss": 0.8649, "step": 27500 }, { "epoch": 1.0219574735945642, "grad_norm": 2.629105326617924, "learning_rate": 9.999853632942897e-06, "loss": 0.8607, "step": 27600 }, { "epoch": 1.0256602516037656, "grad_norm": 3.8572529888050826, "learning_rate": 9.999800008591049e-06, "loss": 0.8761, "step": 27700 }, { "epoch": 1.0293630296129672, "grad_norm": 2.767732632774148, "learning_rate": 9.999738031775246e-06, "loss": 0.8778, "step": 27800 }, { "epoch": 1.0330658076221686, "grad_norm": 3.4506007181787606, "learning_rate": 9.99966770259902e-06, "loss": 0.8842, "step": 27900 }, { "epoch": 1.03676858563137, "grad_norm": 3.3838219329729764, "learning_rate": 9.999589021179867e-06, "loss": 0.8653, "step": 28000 }, { "epoch": 1.0404713636405714, "grad_norm": 2.7380768463734673, "learning_rate": 9.999501987649225e-06, "loss": 0.8715, "step": 28100 }, { "epoch": 1.0441741416497727, "grad_norm": 2.5724850888468818, "learning_rate": 9.999406602152487e-06, "loss": 0.873, "step": 28200 }, { "epoch": 1.0478769196589741, "grad_norm": 2.814547694838622, "learning_rate": 9.999302864849006e-06, "loss": 0.8652, "step": 28300 }, { "epoch": 1.0515796976681755, "grad_norm": 2.8671305113874985, "learning_rate": 9.999190775912075e-06, "loss": 0.8773, "step": 28400 }, { "epoch": 1.0552824756773769, "grad_norm": 3.4364162307062016, "learning_rate": 9.999070335528951e-06, "loss": 0.8722, "step": 28500 }, { "epoch": 1.0589852536865783, "grad_norm": 3.1668831518960747, "learning_rate": 9.99894154390083e-06, "loss": 0.878, "step": 28600 }, { "epoch": 1.0626880316957799, "grad_norm": 2.5661208890092215, "learning_rate": 9.998804401242874e-06, "loss": 0.8642, "step": 28700 }, { "epoch": 1.0663908097049812, "grad_norm": 2.702695496460383, "learning_rate": 9.998658907784183e-06, "loss": 0.8576, "step": 28800 }, { "epoch": 1.0700935877141826, "grad_norm": 2.9176557901328666, "learning_rate": 9.998505063767811e-06, "loss": 0.8705, "step": 28900 }, { "epoch": 1.073796365723384, "grad_norm": 3.7212751721205937, "learning_rate": 9.998342869450767e-06, "loss": 0.8641, "step": 29000 }, { "epoch": 1.0774991437325854, "grad_norm": 3.1124898105603767, "learning_rate": 9.998172325104007e-06, "loss": 0.8679, "step": 29100 }, { "epoch": 1.0812019217417868, "grad_norm": 3.211449706725418, "learning_rate": 9.997993431012433e-06, "loss": 0.8698, "step": 29200 }, { "epoch": 1.0849046997509881, "grad_norm": 3.3735417926416105, "learning_rate": 9.997806187474899e-06, "loss": 0.8593, "step": 29300 }, { "epoch": 1.0886074777601895, "grad_norm": 3.100346897510759, "learning_rate": 9.997610594804206e-06, "loss": 0.8852, "step": 29400 }, { "epoch": 1.092310255769391, "grad_norm": 3.046486604721806, "learning_rate": 9.997406653327103e-06, "loss": 0.8734, "step": 29500 }, { "epoch": 1.0960130337785925, "grad_norm": 3.462276437104938, "learning_rate": 9.99719436338429e-06, "loss": 0.8333, "step": 29600 }, { "epoch": 1.0997158117877939, "grad_norm": 2.8629449765918564, "learning_rate": 9.996973725330405e-06, "loss": 0.8584, "step": 29700 }, { "epoch": 1.1034185897969953, "grad_norm": 2.9165610257153873, "learning_rate": 9.996744739534042e-06, "loss": 0.8665, "step": 29800 }, { "epoch": 1.1071213678061966, "grad_norm": 3.1796750168940315, "learning_rate": 9.996507406377728e-06, "loss": 0.8787, "step": 29900 }, { "epoch": 1.110824145815398, "grad_norm": 3.160592130889014, "learning_rate": 9.99626172625795e-06, "loss": 0.8718, "step": 30000 }, { "epoch": 1.1145269238245994, "grad_norm": 2.779237960939386, "learning_rate": 9.99600769958513e-06, "loss": 0.878, "step": 30100 }, { "epoch": 1.1182297018338008, "grad_norm": 3.108400228708763, "learning_rate": 9.995745326783628e-06, "loss": 0.868, "step": 30200 }, { "epoch": 1.1219324798430022, "grad_norm": 3.0911030432916817, "learning_rate": 9.995474608291761e-06, "loss": 0.8621, "step": 30300 }, { "epoch": 1.1256352578522035, "grad_norm": 2.5787433018725, "learning_rate": 9.995195544561778e-06, "loss": 0.8754, "step": 30400 }, { "epoch": 1.1293380358614051, "grad_norm": 2.8342296943136165, "learning_rate": 9.994908136059868e-06, "loss": 0.8373, "step": 30500 }, { "epoch": 1.1330408138706065, "grad_norm": 2.476461823350524, "learning_rate": 9.994612383266171e-06, "loss": 0.842, "step": 30600 }, { "epoch": 1.136743591879808, "grad_norm": 3.9271884057807296, "learning_rate": 9.994308286674754e-06, "loss": 0.8453, "step": 30700 }, { "epoch": 1.1404463698890093, "grad_norm": 3.1786946065406236, "learning_rate": 9.99399584679363e-06, "loss": 0.8648, "step": 30800 }, { "epoch": 1.1441491478982106, "grad_norm": 2.634901131802063, "learning_rate": 9.99367506414475e-06, "loss": 0.8751, "step": 30900 }, { "epoch": 1.147851925907412, "grad_norm": 3.078376805123231, "learning_rate": 9.993345939264e-06, "loss": 0.8635, "step": 31000 }, { "epoch": 1.1515547039166134, "grad_norm": 3.1528853117678786, "learning_rate": 9.9930084727012e-06, "loss": 0.843, "step": 31100 }, { "epoch": 1.1552574819258148, "grad_norm": 2.828228109125317, "learning_rate": 9.992662665020112e-06, "loss": 0.8624, "step": 31200 }, { "epoch": 1.1589602599350162, "grad_norm": 3.1953124851506707, "learning_rate": 9.992308516798426e-06, "loss": 0.8579, "step": 31300 }, { "epoch": 1.1626630379442178, "grad_norm": 3.0902899613744603, "learning_rate": 9.991946028627768e-06, "loss": 0.8527, "step": 31400 }, { "epoch": 1.1663658159534191, "grad_norm": 2.9463681925783023, "learning_rate": 9.991575201113695e-06, "loss": 0.8268, "step": 31500 }, { "epoch": 1.1700685939626205, "grad_norm": 2.8044002498862057, "learning_rate": 9.991196034875698e-06, "loss": 0.8395, "step": 31600 }, { "epoch": 1.173771371971822, "grad_norm": 2.9461135183049936, "learning_rate": 9.990808530547197e-06, "loss": 0.858, "step": 31700 }, { "epoch": 1.1774741499810233, "grad_norm": 3.213674861669168, "learning_rate": 9.990412688775542e-06, "loss": 0.864, "step": 31800 }, { "epoch": 1.1811769279902247, "grad_norm": 2.71190688635739, "learning_rate": 9.99000851022201e-06, "loss": 0.855, "step": 31900 }, { "epoch": 1.184879705999426, "grad_norm": 3.723777231794139, "learning_rate": 9.9895959955618e-06, "loss": 0.8456, "step": 32000 }, { "epoch": 1.1885824840086274, "grad_norm": 2.4622343303272918, "learning_rate": 9.989175145484049e-06, "loss": 0.8217, "step": 32100 }, { "epoch": 1.1922852620178288, "grad_norm": 2.8875388301298472, "learning_rate": 9.98874596069181e-06, "loss": 0.8591, "step": 32200 }, { "epoch": 1.1959880400270302, "grad_norm": 2.5910572126310716, "learning_rate": 9.988308441902061e-06, "loss": 0.8453, "step": 32300 }, { "epoch": 1.1996908180362316, "grad_norm": 2.4069698963541755, "learning_rate": 9.987862589845703e-06, "loss": 0.8503, "step": 32400 }, { "epoch": 1.2033935960454332, "grad_norm": 2.914526087822122, "learning_rate": 9.987408405267561e-06, "loss": 0.8668, "step": 32500 }, { "epoch": 1.2070963740546345, "grad_norm": 2.8077292984671485, "learning_rate": 9.986945888926374e-06, "loss": 0.8314, "step": 32600 }, { "epoch": 1.210799152063836, "grad_norm": 3.6704712964311437, "learning_rate": 9.986475041594805e-06, "loss": 0.8371, "step": 32700 }, { "epoch": 1.2145019300730373, "grad_norm": 2.6706897230097297, "learning_rate": 9.985995864059433e-06, "loss": 0.876, "step": 32800 }, { "epoch": 1.2182047080822387, "grad_norm": 3.0940143448561037, "learning_rate": 9.98550835712075e-06, "loss": 0.8364, "step": 32900 }, { "epoch": 1.22190748609144, "grad_norm": 3.6081252765429963, "learning_rate": 9.98501252159317e-06, "loss": 0.8378, "step": 33000 }, { "epoch": 1.2256102641006414, "grad_norm": 2.77425534329751, "learning_rate": 9.984508358305012e-06, "loss": 0.8449, "step": 33100 }, { "epoch": 1.2293130421098428, "grad_norm": 2.847973382987711, "learning_rate": 9.98399586809851e-06, "loss": 0.8446, "step": 33200 }, { "epoch": 1.2330158201190442, "grad_norm": 3.1916476231654984, "learning_rate": 9.983475051829814e-06, "loss": 0.8499, "step": 33300 }, { "epoch": 1.2367185981282458, "grad_norm": 3.0278042743633047, "learning_rate": 9.982945910368974e-06, "loss": 0.8427, "step": 33400 }, { "epoch": 1.2404213761374472, "grad_norm": 2.9987823840994685, "learning_rate": 9.982408444599955e-06, "loss": 0.8565, "step": 33500 }, { "epoch": 1.2441241541466486, "grad_norm": 2.956201689254189, "learning_rate": 9.981862655420626e-06, "loss": 0.8303, "step": 33600 }, { "epoch": 1.24782693215585, "grad_norm": 3.306173716101804, "learning_rate": 9.981308543742759e-06, "loss": 0.8351, "step": 33700 }, { "epoch": 1.2515297101650513, "grad_norm": 2.8358161347669624, "learning_rate": 9.98074611049203e-06, "loss": 0.854, "step": 33800 }, { "epoch": 1.2552324881742527, "grad_norm": 3.2862147001432263, "learning_rate": 9.980175356608018e-06, "loss": 0.8176, "step": 33900 }, { "epoch": 1.258935266183454, "grad_norm": 3.839933772493448, "learning_rate": 9.979596283044202e-06, "loss": 0.8353, "step": 34000 }, { "epoch": 1.2626380441926555, "grad_norm": 3.039521277363643, "learning_rate": 9.979008890767958e-06, "loss": 0.8313, "step": 34100 }, { "epoch": 1.2663408222018568, "grad_norm": 2.670418682490729, "learning_rate": 9.97841318076056e-06, "loss": 0.8299, "step": 34200 }, { "epoch": 1.2700436002110584, "grad_norm": 2.6610287960828947, "learning_rate": 9.977809154017177e-06, "loss": 0.8255, "step": 34300 }, { "epoch": 1.2737463782202598, "grad_norm": 4.255372007943821, "learning_rate": 9.977196811546874e-06, "loss": 0.8178, "step": 34400 }, { "epoch": 1.2774491562294612, "grad_norm": 2.486491204040578, "learning_rate": 9.976576154372603e-06, "loss": 0.8131, "step": 34500 }, { "epoch": 1.2811519342386626, "grad_norm": 2.5063224331855967, "learning_rate": 9.975947183531208e-06, "loss": 0.8425, "step": 34600 }, { "epoch": 1.284854712247864, "grad_norm": 2.7512179307220226, "learning_rate": 9.975309900073424e-06, "loss": 0.8593, "step": 34700 }, { "epoch": 1.2885574902570653, "grad_norm": 2.712022237784725, "learning_rate": 9.974664305063872e-06, "loss": 0.8217, "step": 34800 }, { "epoch": 1.2922602682662667, "grad_norm": 2.5781139265649213, "learning_rate": 9.974010399581056e-06, "loss": 0.8009, "step": 34900 }, { "epoch": 1.295963046275468, "grad_norm": 2.3094975445159927, "learning_rate": 9.973348184717362e-06, "loss": 0.8441, "step": 35000 }, { "epoch": 1.2996658242846695, "grad_norm": 3.222306020034265, "learning_rate": 9.972677661579062e-06, "loss": 0.8453, "step": 35100 }, { "epoch": 1.303368602293871, "grad_norm": 2.4004480742086383, "learning_rate": 9.971998831286305e-06, "loss": 0.8352, "step": 35200 }, { "epoch": 1.3070713803030725, "grad_norm": 2.9242567540358193, "learning_rate": 9.971311694973115e-06, "loss": 0.8251, "step": 35300 }, { "epoch": 1.3107741583122738, "grad_norm": 3.3760497497529234, "learning_rate": 9.970616253787394e-06, "loss": 0.8212, "step": 35400 }, { "epoch": 1.3144769363214752, "grad_norm": 2.738484575208949, "learning_rate": 9.969912508890924e-06, "loss": 0.8338, "step": 35500 }, { "epoch": 1.3181797143306766, "grad_norm": 2.403858688871253, "learning_rate": 9.969200461459344e-06, "loss": 0.8051, "step": 35600 }, { "epoch": 1.321882492339878, "grad_norm": 3.317873477816687, "learning_rate": 9.96848011268218e-06, "loss": 0.8275, "step": 35700 }, { "epoch": 1.3255852703490794, "grad_norm": 3.0383409211764465, "learning_rate": 9.967751463762811e-06, "loss": 0.8102, "step": 35800 }, { "epoch": 1.3292880483582807, "grad_norm": 2.716682345656308, "learning_rate": 9.967014515918491e-06, "loss": 0.7922, "step": 35900 }, { "epoch": 1.3329908263674821, "grad_norm": 2.5903044471345407, "learning_rate": 9.966269270380338e-06, "loss": 0.8152, "step": 36000 }, { "epoch": 1.3366936043766837, "grad_norm": 2.6042198988611505, "learning_rate": 9.965515728393324e-06, "loss": 0.816, "step": 36100 }, { "epoch": 1.3403963823858849, "grad_norm": 2.693329966895918, "learning_rate": 9.96475389121629e-06, "loss": 0.8213, "step": 36200 }, { "epoch": 1.3440991603950865, "grad_norm": 2.8636639283082683, "learning_rate": 9.963983760121927e-06, "loss": 0.8028, "step": 36300 }, { "epoch": 1.3478019384042879, "grad_norm": 2.265142729976952, "learning_rate": 9.963205336396789e-06, "loss": 0.8312, "step": 36400 }, { "epoch": 1.3515047164134892, "grad_norm": 2.346991317901365, "learning_rate": 9.962418621341275e-06, "loss": 0.8057, "step": 36500 }, { "epoch": 1.3552074944226906, "grad_norm": 2.9365358115995988, "learning_rate": 9.961623616269642e-06, "loss": 0.811, "step": 36600 }, { "epoch": 1.358910272431892, "grad_norm": 2.765547820893004, "learning_rate": 9.960820322509991e-06, "loss": 0.8176, "step": 36700 }, { "epoch": 1.3626130504410934, "grad_norm": 2.242257331515756, "learning_rate": 9.960008741404278e-06, "loss": 0.8093, "step": 36800 }, { "epoch": 1.3663158284502948, "grad_norm": 2.6584008858920396, "learning_rate": 9.959188874308289e-06, "loss": 0.8128, "step": 36900 }, { "epoch": 1.3700186064594964, "grad_norm": 2.426868840194363, "learning_rate": 9.958360722591666e-06, "loss": 0.8356, "step": 37000 }, { "epoch": 1.3737213844686975, "grad_norm": 2.372175307387934, "learning_rate": 9.957524287637887e-06, "loss": 0.7955, "step": 37100 }, { "epoch": 1.3774241624778991, "grad_norm": 3.223986523587691, "learning_rate": 9.956679570844263e-06, "loss": 0.8446, "step": 37200 }, { "epoch": 1.3811269404871005, "grad_norm": 2.9758674723626495, "learning_rate": 9.955826573621947e-06, "loss": 0.8258, "step": 37300 }, { "epoch": 1.3848297184963019, "grad_norm": 2.4986387404446972, "learning_rate": 9.954965297395917e-06, "loss": 0.838, "step": 37400 }, { "epoch": 1.3885324965055033, "grad_norm": 3.0506103639317383, "learning_rate": 9.954095743604993e-06, "loss": 0.8106, "step": 37500 }, { "epoch": 1.3922352745147046, "grad_norm": 2.596803194782613, "learning_rate": 9.953217913701809e-06, "loss": 0.8101, "step": 37600 }, { "epoch": 1.395938052523906, "grad_norm": 3.0268925428493034, "learning_rate": 9.952331809152837e-06, "loss": 0.7984, "step": 37700 }, { "epoch": 1.3996408305331074, "grad_norm": 2.811749288978374, "learning_rate": 9.951437431438368e-06, "loss": 0.7956, "step": 37800 }, { "epoch": 1.403343608542309, "grad_norm": 2.5791340034648673, "learning_rate": 9.95053478205251e-06, "loss": 0.807, "step": 37900 }, { "epoch": 1.4070463865515102, "grad_norm": 2.73932100840186, "learning_rate": 9.949623862503194e-06, "loss": 0.8045, "step": 38000 }, { "epoch": 1.4107491645607118, "grad_norm": 2.4120952001387836, "learning_rate": 9.948704674312166e-06, "loss": 0.8062, "step": 38100 }, { "epoch": 1.4144519425699131, "grad_norm": 2.876732239954283, "learning_rate": 9.947777219014985e-06, "loss": 0.8153, "step": 38200 }, { "epoch": 1.4181547205791145, "grad_norm": 2.648889760862942, "learning_rate": 9.94684149816102e-06, "loss": 0.7769, "step": 38300 }, { "epoch": 1.421857498588316, "grad_norm": 2.3672035754478253, "learning_rate": 9.945897513313446e-06, "loss": 0.8248, "step": 38400 }, { "epoch": 1.4255602765975173, "grad_norm": 2.5897674265990966, "learning_rate": 9.944945266049249e-06, "loss": 0.8168, "step": 38500 }, { "epoch": 1.4292630546067187, "grad_norm": 2.3627727017427986, "learning_rate": 9.943984757959214e-06, "loss": 0.8061, "step": 38600 }, { "epoch": 1.43296583261592, "grad_norm": 2.75793352812743, "learning_rate": 9.943015990647928e-06, "loss": 0.8406, "step": 38700 }, { "epoch": 1.4366686106251216, "grad_norm": 2.93848471854443, "learning_rate": 9.942038965733772e-06, "loss": 0.8093, "step": 38800 }, { "epoch": 1.4403713886343228, "grad_norm": 2.649269836710229, "learning_rate": 9.941053684848927e-06, "loss": 0.8156, "step": 38900 }, { "epoch": 1.4440741666435244, "grad_norm": 2.8474414546277336, "learning_rate": 9.940060149639362e-06, "loss": 0.803, "step": 39000 }, { "epoch": 1.4477769446527258, "grad_norm": 2.48930011170331, "learning_rate": 9.939058361764835e-06, "loss": 0.8149, "step": 39100 }, { "epoch": 1.4514797226619272, "grad_norm": 3.0541359408620954, "learning_rate": 9.938048322898897e-06, "loss": 0.7905, "step": 39200 }, { "epoch": 1.4551825006711285, "grad_norm": 2.4906684578035634, "learning_rate": 9.937030034728875e-06, "loss": 0.7983, "step": 39300 }, { "epoch": 1.45888527868033, "grad_norm": 2.522647352158736, "learning_rate": 9.93600349895588e-06, "loss": 0.8257, "step": 39400 }, { "epoch": 1.4625880566895313, "grad_norm": 2.3593412219963636, "learning_rate": 9.934968717294801e-06, "loss": 0.8156, "step": 39500 }, { "epoch": 1.4662908346987327, "grad_norm": 2.557490073344118, "learning_rate": 9.933925691474306e-06, "loss": 0.8168, "step": 39600 }, { "epoch": 1.469993612707934, "grad_norm": 2.338179664285831, "learning_rate": 9.932874423236827e-06, "loss": 0.8037, "step": 39700 }, { "epoch": 1.4736963907171354, "grad_norm": 2.5845812397294106, "learning_rate": 9.931814914338574e-06, "loss": 0.8037, "step": 39800 }, { "epoch": 1.477399168726337, "grad_norm": 2.738120618908721, "learning_rate": 9.930747166549517e-06, "loss": 0.8248, "step": 39900 }, { "epoch": 1.4811019467355384, "grad_norm": 3.0001861261521077, "learning_rate": 9.929671181653393e-06, "loss": 0.8158, "step": 40000 }, { "epoch": 1.4848047247447398, "grad_norm": 2.8790789963198864, "learning_rate": 9.9285869614477e-06, "loss": 0.81, "step": 40100 }, { "epoch": 1.4885075027539412, "grad_norm": 2.3530303824789263, "learning_rate": 9.927494507743693e-06, "loss": 0.8065, "step": 40200 }, { "epoch": 1.4922102807631425, "grad_norm": 2.5001939120344563, "learning_rate": 9.926393822366378e-06, "loss": 0.7986, "step": 40300 }, { "epoch": 1.495913058772344, "grad_norm": 3.322122232875802, "learning_rate": 9.925284907154518e-06, "loss": 0.8018, "step": 40400 }, { "epoch": 1.4996158367815453, "grad_norm": 2.632512694974853, "learning_rate": 9.924167763960622e-06, "loss": 0.7926, "step": 40500 }, { "epoch": 1.503318614790747, "grad_norm": 3.298194101210266, "learning_rate": 9.923042394650944e-06, "loss": 0.8012, "step": 40600 }, { "epoch": 1.507021392799948, "grad_norm": 2.5336893450551714, "learning_rate": 9.921908801105478e-06, "loss": 0.7979, "step": 40700 }, { "epoch": 1.5107241708091497, "grad_norm": 2.805390982714785, "learning_rate": 9.920766985217964e-06, "loss": 0.8007, "step": 40800 }, { "epoch": 1.5144269488183508, "grad_norm": 2.6228261966166846, "learning_rate": 9.919616948895869e-06, "loss": 0.7925, "step": 40900 }, { "epoch": 1.5181297268275524, "grad_norm": 2.608871750206852, "learning_rate": 9.918458694060401e-06, "loss": 0.8165, "step": 41000 }, { "epoch": 1.5218325048367538, "grad_norm": 2.3331476004266802, "learning_rate": 9.917292222646494e-06, "loss": 0.812, "step": 41100 }, { "epoch": 1.5255352828459552, "grad_norm": 2.256474693417922, "learning_rate": 9.916117536602805e-06, "loss": 0.8252, "step": 41200 }, { "epoch": 1.5292380608551566, "grad_norm": 2.2698106482233444, "learning_rate": 9.914934637891717e-06, "loss": 0.8049, "step": 41300 }, { "epoch": 1.532940838864358, "grad_norm": 2.505148031562781, "learning_rate": 9.913743528489335e-06, "loss": 0.7945, "step": 41400 }, { "epoch": 1.5366436168735595, "grad_norm": 2.273130621446297, "learning_rate": 9.912544210385478e-06, "loss": 0.7592, "step": 41500 }, { "epoch": 1.5403463948827607, "grad_norm": 2.7980874710174746, "learning_rate": 9.911336685583678e-06, "loss": 0.788, "step": 41600 }, { "epoch": 1.5440491728919623, "grad_norm": 2.518652055633435, "learning_rate": 9.910120956101177e-06, "loss": 0.7985, "step": 41700 }, { "epoch": 1.5477519509011635, "grad_norm": 2.8908580237428727, "learning_rate": 9.908897023968923e-06, "loss": 0.8022, "step": 41800 }, { "epoch": 1.551454728910365, "grad_norm": 2.796915452230328, "learning_rate": 9.907664891231567e-06, "loss": 0.7891, "step": 41900 }, { "epoch": 1.5551575069195664, "grad_norm": 2.58199772952833, "learning_rate": 9.906424559947463e-06, "loss": 0.8127, "step": 42000 }, { "epoch": 1.5588602849287678, "grad_norm": 2.9876242988374795, "learning_rate": 9.905176032188657e-06, "loss": 0.8103, "step": 42100 }, { "epoch": 1.5625630629379692, "grad_norm": 2.5402919151901284, "learning_rate": 9.903919310040888e-06, "loss": 0.8088, "step": 42200 }, { "epoch": 1.5662658409471706, "grad_norm": 2.759850752080656, "learning_rate": 9.902654395603585e-06, "loss": 0.7802, "step": 42300 }, { "epoch": 1.569968618956372, "grad_norm": 2.303462256584801, "learning_rate": 9.901381290989866e-06, "loss": 0.814, "step": 42400 }, { "epoch": 1.5736713969655733, "grad_norm": 2.5606229953487007, "learning_rate": 9.900099998326524e-06, "loss": 0.8199, "step": 42500 }, { "epoch": 1.577374174974775, "grad_norm": 2.429988282841457, "learning_rate": 9.898810519754038e-06, "loss": 0.8119, "step": 42600 }, { "epoch": 1.581076952983976, "grad_norm": 2.5219967389765823, "learning_rate": 9.897512857426559e-06, "loss": 0.8047, "step": 42700 }, { "epoch": 1.5847797309931777, "grad_norm": 1.9311972103887236, "learning_rate": 9.896207013511906e-06, "loss": 0.7986, "step": 42800 }, { "epoch": 1.588482509002379, "grad_norm": 2.234707722695266, "learning_rate": 9.894892990191572e-06, "loss": 0.8208, "step": 42900 }, { "epoch": 1.5921852870115805, "grad_norm": 2.52585572604136, "learning_rate": 9.89357078966071e-06, "loss": 0.8055, "step": 43000 }, { "epoch": 1.5958880650207818, "grad_norm": 2.4607414449083564, "learning_rate": 9.892240414128134e-06, "loss": 0.814, "step": 43100 }, { "epoch": 1.5995908430299832, "grad_norm": 2.397978043007156, "learning_rate": 9.890901865816318e-06, "loss": 0.7858, "step": 43200 }, { "epoch": 1.6032936210391846, "grad_norm": 2.6825010522588464, "learning_rate": 9.889555146961386e-06, "loss": 0.7643, "step": 43300 }, { "epoch": 1.606996399048386, "grad_norm": 2.296924382807737, "learning_rate": 9.888200259813112e-06, "loss": 0.774, "step": 43400 }, { "epoch": 1.6106991770575876, "grad_norm": 3.086030898842717, "learning_rate": 9.886837206634913e-06, "loss": 0.7903, "step": 43500 }, { "epoch": 1.6144019550667887, "grad_norm": 2.1770814434757852, "learning_rate": 9.885465989703855e-06, "loss": 0.7992, "step": 43600 }, { "epoch": 1.6181047330759903, "grad_norm": 2.1173853625857677, "learning_rate": 9.884086611310636e-06, "loss": 0.7562, "step": 43700 }, { "epoch": 1.6218075110851915, "grad_norm": 2.6347540026530383, "learning_rate": 9.88269907375959e-06, "loss": 0.7812, "step": 43800 }, { "epoch": 1.625510289094393, "grad_norm": 2.3677734896044367, "learning_rate": 9.881303379368679e-06, "loss": 0.7949, "step": 43900 }, { "epoch": 1.6292130671035945, "grad_norm": 1.9541227462181452, "learning_rate": 9.879899530469495e-06, "loss": 0.8014, "step": 44000 }, { "epoch": 1.6329158451127959, "grad_norm": 2.250577456663235, "learning_rate": 9.878487529407252e-06, "loss": 0.789, "step": 44100 }, { "epoch": 1.6366186231219972, "grad_norm": 2.332159230411907, "learning_rate": 9.877067378540783e-06, "loss": 0.7583, "step": 44200 }, { "epoch": 1.6403214011311986, "grad_norm": 2.620339688070587, "learning_rate": 9.875639080242532e-06, "loss": 0.7609, "step": 44300 }, { "epoch": 1.6440241791404002, "grad_norm": 2.281843212752168, "learning_rate": 9.874202636898557e-06, "loss": 0.7923, "step": 44400 }, { "epoch": 1.6477269571496014, "grad_norm": 2.7756614740728756, "learning_rate": 9.872758050908525e-06, "loss": 0.8133, "step": 44500 }, { "epoch": 1.651429735158803, "grad_norm": 2.0566706027251933, "learning_rate": 9.871305324685698e-06, "loss": 0.7771, "step": 44600 }, { "epoch": 1.6551325131680041, "grad_norm": 2.7157875508307203, "learning_rate": 9.869844460656946e-06, "loss": 0.7887, "step": 44700 }, { "epoch": 1.6588352911772057, "grad_norm": 2.3909529963011225, "learning_rate": 9.868375461262729e-06, "loss": 0.786, "step": 44800 }, { "epoch": 1.6625380691864071, "grad_norm": 3.0348798043450107, "learning_rate": 9.866898328957097e-06, "loss": 0.7658, "step": 44900 }, { "epoch": 1.6662408471956085, "grad_norm": 2.6335015906277564, "learning_rate": 9.865413066207686e-06, "loss": 0.7995, "step": 45000 }, { "epoch": 1.6699436252048099, "grad_norm": 2.385629891283996, "learning_rate": 9.863919675495718e-06, "loss": 0.7915, "step": 45100 }, { "epoch": 1.6736464032140113, "grad_norm": 2.9226049040665196, "learning_rate": 9.862418159315994e-06, "loss": 0.7846, "step": 45200 }, { "epoch": 1.6773491812232129, "grad_norm": 2.0898569179597546, "learning_rate": 9.860908520176881e-06, "loss": 0.7798, "step": 45300 }, { "epoch": 1.681051959232414, "grad_norm": 2.486543038672127, "learning_rate": 9.859390760600323e-06, "loss": 0.788, "step": 45400 }, { "epoch": 1.6847547372416156, "grad_norm": 2.150826890053404, "learning_rate": 9.857864883121829e-06, "loss": 0.8, "step": 45500 }, { "epoch": 1.6884575152508168, "grad_norm": 2.0064655253486494, "learning_rate": 9.856330890290467e-06, "loss": 0.7893, "step": 45600 }, { "epoch": 1.6921602932600184, "grad_norm": 1.974144415250403, "learning_rate": 9.854788784668862e-06, "loss": 0.8071, "step": 45700 }, { "epoch": 1.6958630712692198, "grad_norm": 2.8759598688034553, "learning_rate": 9.853238568833198e-06, "loss": 0.795, "step": 45800 }, { "epoch": 1.6995658492784211, "grad_norm": 2.4899159117702325, "learning_rate": 9.851680245373201e-06, "loss": 0.7933, "step": 45900 }, { "epoch": 1.7032686272876225, "grad_norm": 2.4900169396878535, "learning_rate": 9.85011381689214e-06, "loss": 0.7734, "step": 46000 }, { "epoch": 1.706971405296824, "grad_norm": 2.422580198248974, "learning_rate": 9.848539286006832e-06, "loss": 0.7951, "step": 46100 }, { "epoch": 1.7106741833060255, "grad_norm": 2.8907724621020634, "learning_rate": 9.846956655347621e-06, "loss": 0.7905, "step": 46200 }, { "epoch": 1.7143769613152267, "grad_norm": 2.013474316995975, "learning_rate": 9.845365927558387e-06, "loss": 0.8006, "step": 46300 }, { "epoch": 1.7180797393244283, "grad_norm": 2.1840681748691444, "learning_rate": 9.843767105296536e-06, "loss": 0.7635, "step": 46400 }, { "epoch": 1.7217825173336294, "grad_norm": 2.5617018354083934, "learning_rate": 9.842160191232996e-06, "loss": 0.7824, "step": 46500 }, { "epoch": 1.725485295342831, "grad_norm": 2.308519604503349, "learning_rate": 9.840545188052214e-06, "loss": 0.774, "step": 46600 }, { "epoch": 1.7291880733520324, "grad_norm": 2.2499525442223853, "learning_rate": 9.838922098452146e-06, "loss": 0.756, "step": 46700 }, { "epoch": 1.7328908513612338, "grad_norm": 2.3820412376888322, "learning_rate": 9.83729092514426e-06, "loss": 0.789, "step": 46800 }, { "epoch": 1.7365936293704352, "grad_norm": 2.5725331335845127, "learning_rate": 9.835651670853532e-06, "loss": 0.7854, "step": 46900 }, { "epoch": 1.7402964073796365, "grad_norm": 2.309343999229651, "learning_rate": 9.83400433831843e-06, "loss": 0.775, "step": 47000 }, { "epoch": 1.7439991853888381, "grad_norm": 2.5850280201791436, "learning_rate": 9.832348930290925e-06, "loss": 0.7802, "step": 47100 }, { "epoch": 1.7477019633980393, "grad_norm": 2.2679853588645105, "learning_rate": 9.830685449536472e-06, "loss": 0.7678, "step": 47200 }, { "epoch": 1.751404741407241, "grad_norm": 2.5086985240224635, "learning_rate": 9.829013898834014e-06, "loss": 0.7577, "step": 47300 }, { "epoch": 1.755107519416442, "grad_norm": 1.9266513380957035, "learning_rate": 9.827334280975978e-06, "loss": 0.7758, "step": 47400 }, { "epoch": 1.7588102974256437, "grad_norm": 2.2928770756948547, "learning_rate": 9.825646598768267e-06, "loss": 0.7637, "step": 47500 }, { "epoch": 1.762513075434845, "grad_norm": 2.5105321008988146, "learning_rate": 9.82395085503025e-06, "loss": 0.7832, "step": 47600 }, { "epoch": 1.7662158534440464, "grad_norm": 2.2393983265475867, "learning_rate": 9.822247052594775e-06, "loss": 0.7957, "step": 47700 }, { "epoch": 1.7699186314532478, "grad_norm": 2.2356517021326447, "learning_rate": 9.82053519430814e-06, "loss": 0.7786, "step": 47800 }, { "epoch": 1.7736214094624492, "grad_norm": 2.1349870329764467, "learning_rate": 9.818815283030107e-06, "loss": 0.7639, "step": 47900 }, { "epoch": 1.7773241874716506, "grad_norm": 2.207362188864924, "learning_rate": 9.817087321633891e-06, "loss": 0.7774, "step": 48000 }, { "epoch": 1.781026965480852, "grad_norm": 2.357569522929328, "learning_rate": 9.815351313006155e-06, "loss": 0.7903, "step": 48100 }, { "epoch": 1.7847297434900535, "grad_norm": 2.5604354649057512, "learning_rate": 9.813607260047007e-06, "loss": 0.7861, "step": 48200 }, { "epoch": 1.7884325214992547, "grad_norm": 2.3900699599516964, "learning_rate": 9.811855165669985e-06, "loss": 0.7883, "step": 48300 }, { "epoch": 1.7921352995084563, "grad_norm": 2.2498623353917093, "learning_rate": 9.810095032802075e-06, "loss": 0.7749, "step": 48400 }, { "epoch": 1.7958380775176577, "grad_norm": 2.523396892959795, "learning_rate": 9.808326864383679e-06, "loss": 0.773, "step": 48500 }, { "epoch": 1.799540855526859, "grad_norm": 2.2014201396256214, "learning_rate": 9.806550663368628e-06, "loss": 0.7784, "step": 48600 }, { "epoch": 1.8032436335360604, "grad_norm": 2.137068120876505, "learning_rate": 9.804766432724172e-06, "loss": 0.781, "step": 48700 }, { "epoch": 1.8069464115452618, "grad_norm": 2.1941678037156036, "learning_rate": 9.802974175430975e-06, "loss": 0.7813, "step": 48800 }, { "epoch": 1.8106491895544632, "grad_norm": 2.3496769370735775, "learning_rate": 9.801173894483111e-06, "loss": 0.7758, "step": 48900 }, { "epoch": 1.8143519675636646, "grad_norm": 2.6951877515683917, "learning_rate": 9.799365592888054e-06, "loss": 0.7753, "step": 49000 }, { "epoch": 1.8180547455728662, "grad_norm": 2.2987294149497504, "learning_rate": 9.797549273666682e-06, "loss": 0.7979, "step": 49100 }, { "epoch": 1.8217575235820673, "grad_norm": 2.1550107736300883, "learning_rate": 9.795724939853265e-06, "loss": 0.7547, "step": 49200 }, { "epoch": 1.825460301591269, "grad_norm": 2.1082078725588724, "learning_rate": 9.793892594495457e-06, "loss": 0.7481, "step": 49300 }, { "epoch": 1.82916307960047, "grad_norm": 2.535213902917527, "learning_rate": 9.792052240654304e-06, "loss": 0.7568, "step": 49400 }, { "epoch": 1.8328658576096717, "grad_norm": 2.3198113159972595, "learning_rate": 9.790203881404228e-06, "loss": 0.7834, "step": 49500 }, { "epoch": 1.836568635618873, "grad_norm": 2.519238802449602, "learning_rate": 9.78834751983302e-06, "loss": 0.7699, "step": 49600 }, { "epoch": 1.8402714136280744, "grad_norm": 2.3333401238694798, "learning_rate": 9.786483159041842e-06, "loss": 0.7834, "step": 49700 }, { "epoch": 1.8439741916372758, "grad_norm": 2.4205774253956385, "learning_rate": 9.784610802145222e-06, "loss": 0.7863, "step": 49800 }, { "epoch": 1.8476769696464772, "grad_norm": 2.178068900898099, "learning_rate": 9.782730452271046e-06, "loss": 0.7674, "step": 49900 }, { "epoch": 1.8513797476556788, "grad_norm": 2.6080625282619714, "learning_rate": 9.780842112560548e-06, "loss": 0.7642, "step": 50000 }, { "epoch": 1.85508252566488, "grad_norm": 2.843984991990864, "learning_rate": 9.778945786168308e-06, "loss": 0.7655, "step": 50100 }, { "epoch": 1.8587853036740816, "grad_norm": 2.2308315520099424, "learning_rate": 9.777041476262259e-06, "loss": 0.7656, "step": 50200 }, { "epoch": 1.8624880816832827, "grad_norm": 2.400873208112685, "learning_rate": 9.775129186023661e-06, "loss": 0.7588, "step": 50300 }, { "epoch": 1.8661908596924843, "grad_norm": 2.9815623334199604, "learning_rate": 9.773208918647111e-06, "loss": 0.7722, "step": 50400 }, { "epoch": 1.8698936377016857, "grad_norm": 2.6488046885793373, "learning_rate": 9.771280677340528e-06, "loss": 0.7813, "step": 50500 }, { "epoch": 1.873596415710887, "grad_norm": 2.521885076282361, "learning_rate": 9.769344465325153e-06, "loss": 0.7846, "step": 50600 }, { "epoch": 1.8772991937200885, "grad_norm": 1.8398874480846792, "learning_rate": 9.767400285835546e-06, "loss": 0.7799, "step": 50700 }, { "epoch": 1.8810019717292898, "grad_norm": 2.6273527775975114, "learning_rate": 9.765448142119575e-06, "loss": 0.7463, "step": 50800 }, { "epoch": 1.8847047497384914, "grad_norm": 2.4232551189720626, "learning_rate": 9.763488037438412e-06, "loss": 0.7763, "step": 50900 }, { "epoch": 1.8884075277476926, "grad_norm": 2.182119510957546, "learning_rate": 9.761519975066524e-06, "loss": 0.76, "step": 51000 }, { "epoch": 1.8921103057568942, "grad_norm": 2.4800365305029106, "learning_rate": 9.759543958291683e-06, "loss": 0.7878, "step": 51100 }, { "epoch": 1.8958130837660954, "grad_norm": 2.2209159681563055, "learning_rate": 9.757559990414941e-06, "loss": 0.7706, "step": 51200 }, { "epoch": 1.899515861775297, "grad_norm": 2.2456207460433175, "learning_rate": 9.755568074750635e-06, "loss": 0.7533, "step": 51300 }, { "epoch": 1.9032186397844983, "grad_norm": 2.216946549826359, "learning_rate": 9.753568214626375e-06, "loss": 0.7651, "step": 51400 }, { "epoch": 1.9069214177936997, "grad_norm": 2.1353696650613556, "learning_rate": 9.751560413383051e-06, "loss": 0.7451, "step": 51500 }, { "epoch": 1.910624195802901, "grad_norm": 2.2188964222997227, "learning_rate": 9.749544674374814e-06, "loss": 0.771, "step": 51600 }, { "epoch": 1.9143269738121025, "grad_norm": 2.6602884956835373, "learning_rate": 9.747521000969074e-06, "loss": 0.7652, "step": 51700 }, { "epoch": 1.918029751821304, "grad_norm": 2.366026652497562, "learning_rate": 9.745489396546499e-06, "loss": 0.7778, "step": 51800 }, { "epoch": 1.9217325298305052, "grad_norm": 2.4178576890485166, "learning_rate": 9.743449864501006e-06, "loss": 0.7682, "step": 51900 }, { "epoch": 1.9254353078397068, "grad_norm": 2.461073225865995, "learning_rate": 9.741402408239753e-06, "loss": 0.7379, "step": 52000 }, { "epoch": 1.929138085848908, "grad_norm": 2.3169585710466443, "learning_rate": 9.739347031183142e-06, "loss": 0.74, "step": 52100 }, { "epoch": 1.9328408638581096, "grad_norm": 2.480606692215648, "learning_rate": 9.737283736764798e-06, "loss": 0.7811, "step": 52200 }, { "epoch": 1.936543641867311, "grad_norm": 2.33302742635216, "learning_rate": 9.73521252843158e-06, "loss": 0.7853, "step": 52300 }, { "epoch": 1.9402464198765124, "grad_norm": 2.178325980295709, "learning_rate": 9.733133409643565e-06, "loss": 0.7678, "step": 52400 }, { "epoch": 1.9439491978857137, "grad_norm": 2.6202132000217, "learning_rate": 9.731046383874044e-06, "loss": 0.7496, "step": 52500 }, { "epoch": 1.9476519758949151, "grad_norm": 3.155040247361292, "learning_rate": 9.728951454609517e-06, "loss": 0.7728, "step": 52600 }, { "epoch": 1.9513547539041167, "grad_norm": 2.115338674943332, "learning_rate": 9.726848625349691e-06, "loss": 0.7625, "step": 52700 }, { "epoch": 1.9550575319133179, "grad_norm": 2.203447982138841, "learning_rate": 9.724737899607466e-06, "loss": 0.7693, "step": 52800 }, { "epoch": 1.9587603099225195, "grad_norm": 2.314534740326119, "learning_rate": 9.722619280908934e-06, "loss": 0.7628, "step": 52900 }, { "epoch": 1.9624630879317206, "grad_norm": 2.1956336825914793, "learning_rate": 9.720492772793375e-06, "loss": 0.7636, "step": 53000 }, { "epoch": 1.9661658659409222, "grad_norm": 2.237936138713292, "learning_rate": 9.718358378813248e-06, "loss": 0.7559, "step": 53100 }, { "epoch": 1.9698686439501236, "grad_norm": 2.166260133162057, "learning_rate": 9.716216102534186e-06, "loss": 0.7619, "step": 53200 }, { "epoch": 1.973571421959325, "grad_norm": 2.4562700667901933, "learning_rate": 9.714065947534987e-06, "loss": 0.7596, "step": 53300 }, { "epoch": 1.9772741999685264, "grad_norm": 2.0820216516365027, "learning_rate": 9.711907917407614e-06, "loss": 0.7526, "step": 53400 }, { "epoch": 1.9809769779777278, "grad_norm": 2.3755910874830657, "learning_rate": 9.709742015757187e-06, "loss": 0.7553, "step": 53500 }, { "epoch": 1.9846797559869291, "grad_norm": 2.369452864927645, "learning_rate": 9.707568246201972e-06, "loss": 0.753, "step": 53600 }, { "epoch": 1.9883825339961305, "grad_norm": 2.3410608706416762, "learning_rate": 9.70538661237338e-06, "loss": 0.7787, "step": 53700 }, { "epoch": 1.9920853120053321, "grad_norm": 2.087568589173381, "learning_rate": 9.70319711791596e-06, "loss": 0.7586, "step": 53800 }, { "epoch": 1.9957880900145333, "grad_norm": 2.6786991173682373, "learning_rate": 9.700999766487395e-06, "loss": 0.7465, "step": 53900 }, { "epoch": 1.9994908680237349, "grad_norm": 3.03880633662284, "learning_rate": 9.698794561758493e-06, "loss": 0.7403, "step": 54000 }, { "epoch": 2.003184389087913, "grad_norm": 2.5868756494206497, "learning_rate": 9.696581507413174e-06, "loss": 0.6992, "step": 54100 }, { "epoch": 2.0068871670971147, "grad_norm": 2.0423484305586994, "learning_rate": 9.694360607148484e-06, "loss": 0.6838, "step": 54200 }, { "epoch": 2.010589945106316, "grad_norm": 2.313387880714559, "learning_rate": 9.692131864674563e-06, "loss": 0.6912, "step": 54300 }, { "epoch": 2.0142927231155174, "grad_norm": 2.274235624804895, "learning_rate": 9.689895283714663e-06, "loss": 0.6854, "step": 54400 }, { "epoch": 2.017995501124719, "grad_norm": 2.119309140091523, "learning_rate": 9.687650868005124e-06, "loss": 0.6786, "step": 54500 }, { "epoch": 2.02169827913392, "grad_norm": 2.3759711399354413, "learning_rate": 9.685398621295377e-06, "loss": 0.6841, "step": 54600 }, { "epoch": 2.025401057143122, "grad_norm": 2.652183252752739, "learning_rate": 9.683138547347933e-06, "loss": 0.6938, "step": 54700 }, { "epoch": 2.029103835152323, "grad_norm": 2.497195487638967, "learning_rate": 9.68087064993838e-06, "loss": 0.6834, "step": 54800 }, { "epoch": 2.0328066131615246, "grad_norm": 2.0788434226359174, "learning_rate": 9.678594932855377e-06, "loss": 0.681, "step": 54900 }, { "epoch": 2.0365093911707257, "grad_norm": 2.557154847367296, "learning_rate": 9.676311399900644e-06, "loss": 0.6956, "step": 55000 }, { "epoch": 2.0402121691799273, "grad_norm": 2.5976913602453595, "learning_rate": 9.674020054888962e-06, "loss": 0.673, "step": 55100 }, { "epoch": 2.0439149471891285, "grad_norm": 2.1397702042236206, "learning_rate": 9.671720901648157e-06, "loss": 0.6939, "step": 55200 }, { "epoch": 2.04761772519833, "grad_norm": 2.384247384660651, "learning_rate": 9.669413944019099e-06, "loss": 0.6757, "step": 55300 }, { "epoch": 2.0513205032075312, "grad_norm": 2.1863152443770786, "learning_rate": 9.667099185855703e-06, "loss": 0.6968, "step": 55400 }, { "epoch": 2.055023281216733, "grad_norm": 4.405956537339234, "learning_rate": 9.664776631024908e-06, "loss": 0.683, "step": 55500 }, { "epoch": 2.0587260592259344, "grad_norm": 2.049043630631007, "learning_rate": 9.662446283406682e-06, "loss": 0.6914, "step": 55600 }, { "epoch": 2.0624288372351356, "grad_norm": 2.5664972711721625, "learning_rate": 9.660108146894007e-06, "loss": 0.6915, "step": 55700 }, { "epoch": 2.066131615244337, "grad_norm": 2.005386472768463, "learning_rate": 9.65776222539288e-06, "loss": 0.6598, "step": 55800 }, { "epoch": 2.0698343932535384, "grad_norm": 2.449106381543406, "learning_rate": 9.655408522822306e-06, "loss": 0.66, "step": 55900 }, { "epoch": 2.07353717126274, "grad_norm": 3.0381314839729177, "learning_rate": 9.653047043114281e-06, "loss": 0.6685, "step": 56000 }, { "epoch": 2.077239949271941, "grad_norm": 2.412440777034649, "learning_rate": 9.650677790213799e-06, "loss": 0.666, "step": 56100 }, { "epoch": 2.0809427272811427, "grad_norm": 2.3635812683682222, "learning_rate": 9.64830076807884e-06, "loss": 0.6719, "step": 56200 }, { "epoch": 2.084645505290344, "grad_norm": 2.337252826890419, "learning_rate": 9.64591598068036e-06, "loss": 0.6761, "step": 56300 }, { "epoch": 2.0883482832995455, "grad_norm": 2.1411084610979856, "learning_rate": 9.643523432002288e-06, "loss": 0.6699, "step": 56400 }, { "epoch": 2.092051061308747, "grad_norm": 2.396579312617091, "learning_rate": 9.64112312604152e-06, "loss": 0.6811, "step": 56500 }, { "epoch": 2.0957538393179482, "grad_norm": 1.991014001057909, "learning_rate": 9.638715066807908e-06, "loss": 0.6921, "step": 56600 }, { "epoch": 2.09945661732715, "grad_norm": 1.8170099574989464, "learning_rate": 9.636299258324263e-06, "loss": 0.6748, "step": 56700 }, { "epoch": 2.103159395336351, "grad_norm": 2.407538918588163, "learning_rate": 9.633875704626332e-06, "loss": 0.6556, "step": 56800 }, { "epoch": 2.1068621733455526, "grad_norm": 2.0009350084583186, "learning_rate": 9.63144440976281e-06, "loss": 0.6804, "step": 56900 }, { "epoch": 2.1105649513547537, "grad_norm": 2.318529373742503, "learning_rate": 9.629005377795318e-06, "loss": 0.6766, "step": 57000 }, { "epoch": 2.1142677293639554, "grad_norm": 2.901357640692631, "learning_rate": 9.626558612798404e-06, "loss": 0.6794, "step": 57100 }, { "epoch": 2.1179705073731565, "grad_norm": 2.154557247934532, "learning_rate": 9.624104118859535e-06, "loss": 0.6691, "step": 57200 }, { "epoch": 2.121673285382358, "grad_norm": 2.3903883257512577, "learning_rate": 9.62164190007909e-06, "loss": 0.6545, "step": 57300 }, { "epoch": 2.1253760633915597, "grad_norm": 2.1742676469308093, "learning_rate": 9.619171960570353e-06, "loss": 0.6894, "step": 57400 }, { "epoch": 2.129078841400761, "grad_norm": 2.654747582200517, "learning_rate": 9.616694304459504e-06, "loss": 0.6784, "step": 57500 }, { "epoch": 2.1327816194099625, "grad_norm": 2.402727194182496, "learning_rate": 9.614208935885615e-06, "loss": 0.6724, "step": 57600 }, { "epoch": 2.1364843974191636, "grad_norm": 2.2360152593662743, "learning_rate": 9.611715859000643e-06, "loss": 0.6622, "step": 57700 }, { "epoch": 2.1401871754283652, "grad_norm": 2.591011720239606, "learning_rate": 9.609215077969422e-06, "loss": 0.6981, "step": 57800 }, { "epoch": 2.1438899534375664, "grad_norm": 2.059137036300649, "learning_rate": 9.606706596969655e-06, "loss": 0.6665, "step": 57900 }, { "epoch": 2.147592731446768, "grad_norm": 2.5471008394463768, "learning_rate": 9.604190420191908e-06, "loss": 0.6725, "step": 58000 }, { "epoch": 2.151295509455969, "grad_norm": 2.3973486591141504, "learning_rate": 9.601666551839606e-06, "loss": 0.6855, "step": 58100 }, { "epoch": 2.1549982874651707, "grad_norm": 2.6375773664347286, "learning_rate": 9.599134996129022e-06, "loss": 0.6826, "step": 58200 }, { "epoch": 2.1587010654743723, "grad_norm": 2.332457102848655, "learning_rate": 9.596595757289268e-06, "loss": 0.6814, "step": 58300 }, { "epoch": 2.1624038434835735, "grad_norm": 2.4870013874361745, "learning_rate": 9.594048839562298e-06, "loss": 0.6792, "step": 58400 }, { "epoch": 2.166106621492775, "grad_norm": 2.3255182488507034, "learning_rate": 9.591494247202886e-06, "loss": 0.6954, "step": 58500 }, { "epoch": 2.1698093995019763, "grad_norm": 2.83057658696704, "learning_rate": 9.588931984478633e-06, "loss": 0.6914, "step": 58600 }, { "epoch": 2.173512177511178, "grad_norm": 2.354736029056531, "learning_rate": 9.58636205566995e-06, "loss": 0.6705, "step": 58700 }, { "epoch": 2.177214955520379, "grad_norm": 2.413081651067487, "learning_rate": 9.583784465070056e-06, "loss": 0.692, "step": 58800 }, { "epoch": 2.1809177335295806, "grad_norm": 1.9994020495850402, "learning_rate": 9.581199216984974e-06, "loss": 0.6789, "step": 58900 }, { "epoch": 2.184620511538782, "grad_norm": 2.336676864459619, "learning_rate": 9.57860631573351e-06, "loss": 0.6746, "step": 59000 }, { "epoch": 2.1883232895479834, "grad_norm": 1.9765951823994232, "learning_rate": 9.576005765647262e-06, "loss": 0.6841, "step": 59100 }, { "epoch": 2.192026067557185, "grad_norm": 2.6704707200624567, "learning_rate": 9.573397571070606e-06, "loss": 0.6606, "step": 59200 }, { "epoch": 2.195728845566386, "grad_norm": 2.7273193459509057, "learning_rate": 9.570781736360682e-06, "loss": 0.694, "step": 59300 }, { "epoch": 2.1994316235755877, "grad_norm": 2.4075381061640475, "learning_rate": 9.568158265887402e-06, "loss": 0.7058, "step": 59400 }, { "epoch": 2.203134401584789, "grad_norm": 2.7561200770361283, "learning_rate": 9.565527164033428e-06, "loss": 0.6635, "step": 59500 }, { "epoch": 2.2068371795939905, "grad_norm": 1.9296516325452246, "learning_rate": 9.562888435194171e-06, "loss": 0.6944, "step": 59600 }, { "epoch": 2.2105399576031917, "grad_norm": 2.3953208065104445, "learning_rate": 9.56024208377779e-06, "loss": 0.6511, "step": 59700 }, { "epoch": 2.2142427356123933, "grad_norm": 2.3935986970014507, "learning_rate": 9.557588114205166e-06, "loss": 0.684, "step": 59800 }, { "epoch": 2.2179455136215944, "grad_norm": 2.474410928209759, "learning_rate": 9.554926530909918e-06, "loss": 0.6944, "step": 59900 }, { "epoch": 2.221648291630796, "grad_norm": 2.300308607407991, "learning_rate": 9.552257338338377e-06, "loss": 0.6619, "step": 60000 }, { "epoch": 2.2253510696399976, "grad_norm": 2.2318245286663347, "learning_rate": 9.549580540949592e-06, "loss": 0.6737, "step": 60100 }, { "epoch": 2.229053847649199, "grad_norm": 3.050417412156981, "learning_rate": 9.546896143215307e-06, "loss": 0.6588, "step": 60200 }, { "epoch": 2.2327566256584004, "grad_norm": 2.096536346012167, "learning_rate": 9.544204149619973e-06, "loss": 0.6529, "step": 60300 }, { "epoch": 2.2364594036676015, "grad_norm": 2.81530679664561, "learning_rate": 9.541504564660726e-06, "loss": 0.6691, "step": 60400 }, { "epoch": 2.240162181676803, "grad_norm": 2.5250157137123606, "learning_rate": 9.53879739284738e-06, "loss": 0.6956, "step": 60500 }, { "epoch": 2.2438649596860043, "grad_norm": 2.1361057888741057, "learning_rate": 9.536082638702428e-06, "loss": 0.6622, "step": 60600 }, { "epoch": 2.247567737695206, "grad_norm": 2.418961423369967, "learning_rate": 9.533360306761032e-06, "loss": 0.6718, "step": 60700 }, { "epoch": 2.251270515704407, "grad_norm": 2.0162128525744984, "learning_rate": 9.530630401571006e-06, "loss": 0.6757, "step": 60800 }, { "epoch": 2.2549732937136087, "grad_norm": 1.7747207441276736, "learning_rate": 9.527892927692819e-06, "loss": 0.6895, "step": 60900 }, { "epoch": 2.2586760717228103, "grad_norm": 2.3868183988130007, "learning_rate": 9.525147889699587e-06, "loss": 0.6982, "step": 61000 }, { "epoch": 2.2623788497320114, "grad_norm": 2.370342809471196, "learning_rate": 9.52239529217706e-06, "loss": 0.6802, "step": 61100 }, { "epoch": 2.266081627741213, "grad_norm": 2.001291795407346, "learning_rate": 9.519635139723613e-06, "loss": 0.6836, "step": 61200 }, { "epoch": 2.269784405750414, "grad_norm": 2.4678585122972367, "learning_rate": 9.516867436950247e-06, "loss": 0.6709, "step": 61300 }, { "epoch": 2.273487183759616, "grad_norm": 2.6005957215451754, "learning_rate": 9.514092188480574e-06, "loss": 0.6818, "step": 61400 }, { "epoch": 2.277189961768817, "grad_norm": 2.920386936383934, "learning_rate": 9.511309398950815e-06, "loss": 0.7052, "step": 61500 }, { "epoch": 2.2808927397780185, "grad_norm": 2.5403436437373865, "learning_rate": 9.50851907300978e-06, "loss": 0.6955, "step": 61600 }, { "epoch": 2.2845955177872197, "grad_norm": 2.2492931914844347, "learning_rate": 9.505721215318879e-06, "loss": 0.6785, "step": 61700 }, { "epoch": 2.2882982957964213, "grad_norm": 2.6880292587251047, "learning_rate": 9.5029158305521e-06, "loss": 0.68, "step": 61800 }, { "epoch": 2.292001073805623, "grad_norm": 2.402997197868183, "learning_rate": 9.500102923396004e-06, "loss": 0.6927, "step": 61900 }, { "epoch": 2.295703851814824, "grad_norm": 1.9864142200066779, "learning_rate": 9.49728249854972e-06, "loss": 0.6919, "step": 62000 }, { "epoch": 2.2994066298240257, "grad_norm": 2.8456743709517163, "learning_rate": 9.494454560724938e-06, "loss": 0.6762, "step": 62100 }, { "epoch": 2.303109407833227, "grad_norm": 2.7748514053291484, "learning_rate": 9.491619114645892e-06, "loss": 0.6777, "step": 62200 }, { "epoch": 2.3068121858424284, "grad_norm": 2.559623553355795, "learning_rate": 9.48877616504937e-06, "loss": 0.6885, "step": 62300 }, { "epoch": 2.3105149638516296, "grad_norm": 2.1469384529226008, "learning_rate": 9.485925716684684e-06, "loss": 0.7014, "step": 62400 }, { "epoch": 2.314217741860831, "grad_norm": 2.6264777887477444, "learning_rate": 9.48306777431368e-06, "loss": 0.6778, "step": 62500 }, { "epoch": 2.3179205198700323, "grad_norm": 2.621989964486446, "learning_rate": 9.48020234271072e-06, "loss": 0.6805, "step": 62600 }, { "epoch": 2.321623297879234, "grad_norm": 2.272202713631239, "learning_rate": 9.47732942666268e-06, "loss": 0.6867, "step": 62700 }, { "epoch": 2.3253260758884355, "grad_norm": 1.9762020868593124, "learning_rate": 9.474449030968937e-06, "loss": 0.6854, "step": 62800 }, { "epoch": 2.3290288538976367, "grad_norm": 2.1226904406736984, "learning_rate": 9.471561160441363e-06, "loss": 0.6688, "step": 62900 }, { "epoch": 2.3327316319068383, "grad_norm": 2.316689916305218, "learning_rate": 9.468665819904317e-06, "loss": 0.6951, "step": 63000 }, { "epoch": 2.3364344099160395, "grad_norm": 2.316136479919069, "learning_rate": 9.465763014194638e-06, "loss": 0.6808, "step": 63100 }, { "epoch": 2.340137187925241, "grad_norm": 2.307220790631874, "learning_rate": 9.46285274816164e-06, "loss": 0.6869, "step": 63200 }, { "epoch": 2.343839965934442, "grad_norm": 2.213269812970463, "learning_rate": 9.459935026667089e-06, "loss": 0.6578, "step": 63300 }, { "epoch": 2.347542743943644, "grad_norm": 2.344279831358738, "learning_rate": 9.457009854585219e-06, "loss": 0.6971, "step": 63400 }, { "epoch": 2.351245521952845, "grad_norm": 2.0096880506357446, "learning_rate": 9.454077236802702e-06, "loss": 0.6828, "step": 63500 }, { "epoch": 2.3549482999620466, "grad_norm": 2.2548311729082253, "learning_rate": 9.45113717821865e-06, "loss": 0.6727, "step": 63600 }, { "epoch": 2.358651077971248, "grad_norm": 2.4341693614642996, "learning_rate": 9.448189683744608e-06, "loss": 0.6809, "step": 63700 }, { "epoch": 2.3623538559804493, "grad_norm": 2.419848393996797, "learning_rate": 9.445234758304537e-06, "loss": 0.6928, "step": 63800 }, { "epoch": 2.3660566339896505, "grad_norm": 2.7840357590734994, "learning_rate": 9.442272406834823e-06, "loss": 0.6698, "step": 63900 }, { "epoch": 2.369759411998852, "grad_norm": 2.3936132840359665, "learning_rate": 9.439302634284244e-06, "loss": 0.6741, "step": 64000 }, { "epoch": 2.3734621900080537, "grad_norm": 2.2628698955348923, "learning_rate": 9.436325445613988e-06, "loss": 0.6982, "step": 64100 }, { "epoch": 2.377164968017255, "grad_norm": 2.5913137677554645, "learning_rate": 9.43334084579762e-06, "loss": 0.6843, "step": 64200 }, { "epoch": 2.3808677460264565, "grad_norm": 2.5306835812838027, "learning_rate": 9.430348839821095e-06, "loss": 0.6931, "step": 64300 }, { "epoch": 2.3845705240356576, "grad_norm": 1.824497906863608, "learning_rate": 9.42734943268274e-06, "loss": 0.6784, "step": 64400 }, { "epoch": 2.388273302044859, "grad_norm": 2.031648470909946, "learning_rate": 9.424342629393238e-06, "loss": 0.6845, "step": 64500 }, { "epoch": 2.3919760800540604, "grad_norm": 2.5806566539882274, "learning_rate": 9.421328434975636e-06, "loss": 0.6893, "step": 64600 }, { "epoch": 2.395678858063262, "grad_norm": 2.1526340438291807, "learning_rate": 9.418306854465327e-06, "loss": 0.6973, "step": 64700 }, { "epoch": 2.399381636072463, "grad_norm": 2.4285535244597702, "learning_rate": 9.41527789291004e-06, "loss": 0.7019, "step": 64800 }, { "epoch": 2.4030844140816647, "grad_norm": 2.621188381463244, "learning_rate": 9.412241555369834e-06, "loss": 0.6653, "step": 64900 }, { "epoch": 2.4067871920908663, "grad_norm": 2.224098798333827, "learning_rate": 9.409197846917093e-06, "loss": 0.6725, "step": 65000 }, { "epoch": 2.4104899701000675, "grad_norm": 1.997533351532834, "learning_rate": 9.406146772636516e-06, "loss": 0.6812, "step": 65100 }, { "epoch": 2.414192748109269, "grad_norm": 2.19098514780732, "learning_rate": 9.403088337625099e-06, "loss": 0.6677, "step": 65200 }, { "epoch": 2.4178955261184703, "grad_norm": 2.1908878890803605, "learning_rate": 9.400022546992148e-06, "loss": 0.6813, "step": 65300 }, { "epoch": 2.421598304127672, "grad_norm": 2.324540384353367, "learning_rate": 9.396949405859239e-06, "loss": 0.6579, "step": 65400 }, { "epoch": 2.425301082136873, "grad_norm": 2.5054016122271374, "learning_rate": 9.393868919360244e-06, "loss": 0.6744, "step": 65500 }, { "epoch": 2.4290038601460746, "grad_norm": 2.582887067658994, "learning_rate": 9.390781092641301e-06, "loss": 0.6913, "step": 65600 }, { "epoch": 2.4327066381552758, "grad_norm": 2.2584713627681428, "learning_rate": 9.387685930860804e-06, "loss": 0.6645, "step": 65700 }, { "epoch": 2.4364094161644774, "grad_norm": 2.202586980967711, "learning_rate": 9.384583439189406e-06, "loss": 0.6599, "step": 65800 }, { "epoch": 2.440112194173679, "grad_norm": 2.0537705242407256, "learning_rate": 9.381473622810005e-06, "loss": 0.6524, "step": 65900 }, { "epoch": 2.44381497218288, "grad_norm": 2.446679586314843, "learning_rate": 9.378356486917736e-06, "loss": 0.6586, "step": 66000 }, { "epoch": 2.4475177501920817, "grad_norm": 2.3254324060908886, "learning_rate": 9.37523203671996e-06, "loss": 0.6716, "step": 66100 }, { "epoch": 2.451220528201283, "grad_norm": 2.402871716965202, "learning_rate": 9.372100277436253e-06, "loss": 0.6771, "step": 66200 }, { "epoch": 2.4549233062104845, "grad_norm": 2.4014908865791402, "learning_rate": 9.368961214298414e-06, "loss": 0.6892, "step": 66300 }, { "epoch": 2.4586260842196856, "grad_norm": 2.309859916718413, "learning_rate": 9.365814852550426e-06, "loss": 0.6725, "step": 66400 }, { "epoch": 2.4623288622288872, "grad_norm": 2.657756967242288, "learning_rate": 9.36266119744848e-06, "loss": 0.6835, "step": 66500 }, { "epoch": 2.4660316402380884, "grad_norm": 2.3060608989482327, "learning_rate": 9.35950025426094e-06, "loss": 0.6694, "step": 66600 }, { "epoch": 2.46973441824729, "grad_norm": 1.9200073351424498, "learning_rate": 9.356332028268356e-06, "loss": 0.6725, "step": 66700 }, { "epoch": 2.4734371962564916, "grad_norm": 1.7930879502348702, "learning_rate": 9.353156524763433e-06, "loss": 0.6674, "step": 66800 }, { "epoch": 2.4771399742656928, "grad_norm": 2.312137593139913, "learning_rate": 9.349973749051042e-06, "loss": 0.665, "step": 66900 }, { "epoch": 2.4808427522748944, "grad_norm": 2.7119648286693536, "learning_rate": 9.346783706448199e-06, "loss": 0.6925, "step": 67000 }, { "epoch": 2.4845455302840955, "grad_norm": 2.356555621714717, "learning_rate": 9.343586402284061e-06, "loss": 0.6774, "step": 67100 }, { "epoch": 2.488248308293297, "grad_norm": 3.4467386366257196, "learning_rate": 9.340381841899913e-06, "loss": 0.6907, "step": 67200 }, { "epoch": 2.4919510863024983, "grad_norm": 2.5874996764431, "learning_rate": 9.337170030649166e-06, "loss": 0.6808, "step": 67300 }, { "epoch": 2.4956538643117, "grad_norm": 2.4262991872836093, "learning_rate": 9.33395097389734e-06, "loss": 0.6714, "step": 67400 }, { "epoch": 2.499356642320901, "grad_norm": 2.2026220733741737, "learning_rate": 9.330724677022063e-06, "loss": 0.6798, "step": 67500 }, { "epoch": 2.5030594203301026, "grad_norm": 2.1788581179144395, "learning_rate": 9.327491145413057e-06, "loss": 0.6811, "step": 67600 }, { "epoch": 2.5067621983393042, "grad_norm": 2.275441203213566, "learning_rate": 9.324250384472127e-06, "loss": 0.6627, "step": 67700 }, { "epoch": 2.5104649763485054, "grad_norm": 2.283344949810879, "learning_rate": 9.32100239961316e-06, "loss": 0.6642, "step": 67800 }, { "epoch": 2.514167754357707, "grad_norm": 2.4267848884723167, "learning_rate": 9.317747196262105e-06, "loss": 0.6787, "step": 67900 }, { "epoch": 2.517870532366908, "grad_norm": 2.5953019278693965, "learning_rate": 9.314484779856977e-06, "loss": 0.6737, "step": 68000 }, { "epoch": 2.5215733103761098, "grad_norm": 2.036941282735882, "learning_rate": 9.311215155847834e-06, "loss": 0.6589, "step": 68100 }, { "epoch": 2.525276088385311, "grad_norm": 2.597845885761239, "learning_rate": 9.30793832969678e-06, "loss": 0.6717, "step": 68200 }, { "epoch": 2.5289788663945125, "grad_norm": 2.4622763848737774, "learning_rate": 9.304654306877946e-06, "loss": 0.6897, "step": 68300 }, { "epoch": 2.5326816444037137, "grad_norm": 2.2606318900396047, "learning_rate": 9.30136309287749e-06, "loss": 0.6811, "step": 68400 }, { "epoch": 2.5363844224129153, "grad_norm": 2.4860591476196423, "learning_rate": 9.298064693193581e-06, "loss": 0.6776, "step": 68500 }, { "epoch": 2.540087200422117, "grad_norm": 2.4829377853240837, "learning_rate": 9.29475911333639e-06, "loss": 0.7002, "step": 68600 }, { "epoch": 2.543789978431318, "grad_norm": 2.367231988606884, "learning_rate": 9.291446358828091e-06, "loss": 0.6675, "step": 68700 }, { "epoch": 2.5474927564405196, "grad_norm": 2.4582997910649484, "learning_rate": 9.288126435202831e-06, "loss": 0.6656, "step": 68800 }, { "epoch": 2.551195534449721, "grad_norm": 2.0143015209204185, "learning_rate": 9.284799348006743e-06, "loss": 0.6811, "step": 68900 }, { "epoch": 2.5548983124589224, "grad_norm": 2.9078889050531473, "learning_rate": 9.281465102797926e-06, "loss": 0.677, "step": 69000 }, { "epoch": 2.5586010904681236, "grad_norm": 2.2408419976033693, "learning_rate": 9.278123705146434e-06, "loss": 0.6884, "step": 69100 }, { "epoch": 2.562303868477325, "grad_norm": 2.626201962148744, "learning_rate": 9.27477516063427e-06, "loss": 0.6612, "step": 69200 }, { "epoch": 2.5660066464865263, "grad_norm": 2.1552325508134134, "learning_rate": 9.271419474855377e-06, "loss": 0.666, "step": 69300 }, { "epoch": 2.569709424495728, "grad_norm": 2.3227195866166768, "learning_rate": 9.268056653415632e-06, "loss": 0.6652, "step": 69400 }, { "epoch": 2.5734122025049295, "grad_norm": 2.753551170952296, "learning_rate": 9.264686701932825e-06, "loss": 0.6791, "step": 69500 }, { "epoch": 2.5771149805141307, "grad_norm": 2.2976640888247415, "learning_rate": 9.261309626036661e-06, "loss": 0.6705, "step": 69600 }, { "epoch": 2.5808177585233323, "grad_norm": 2.339779663329093, "learning_rate": 9.257925431368749e-06, "loss": 0.6669, "step": 69700 }, { "epoch": 2.5845205365325334, "grad_norm": 2.1019636120259695, "learning_rate": 9.254534123582585e-06, "loss": 0.6734, "step": 69800 }, { "epoch": 2.588223314541735, "grad_norm": 2.2977009075813744, "learning_rate": 9.251135708343555e-06, "loss": 0.6724, "step": 69900 }, { "epoch": 2.591926092550936, "grad_norm": 2.1726587191847386, "learning_rate": 9.247730191328908e-06, "loss": 0.686, "step": 70000 }, { "epoch": 2.595628870560138, "grad_norm": 2.2059169127907907, "learning_rate": 9.244317578227769e-06, "loss": 0.6829, "step": 70100 }, { "epoch": 2.599331648569339, "grad_norm": 2.2693764246927843, "learning_rate": 9.240897874741108e-06, "loss": 0.6706, "step": 70200 }, { "epoch": 2.6030344265785406, "grad_norm": 2.3773955458790192, "learning_rate": 9.237471086581744e-06, "loss": 0.6403, "step": 70300 }, { "epoch": 2.606737204587742, "grad_norm": 2.376894341944025, "learning_rate": 9.234037219474332e-06, "loss": 0.6556, "step": 70400 }, { "epoch": 2.6104399825969433, "grad_norm": 2.2164027402226756, "learning_rate": 9.230596279155353e-06, "loss": 0.6677, "step": 70500 }, { "epoch": 2.614142760606145, "grad_norm": 2.6004295191699596, "learning_rate": 9.227148271373102e-06, "loss": 0.6656, "step": 70600 }, { "epoch": 2.617845538615346, "grad_norm": 2.2941992876843145, "learning_rate": 9.223693201887677e-06, "loss": 0.671, "step": 70700 }, { "epoch": 2.6215483166245477, "grad_norm": 2.423457996166192, "learning_rate": 9.220231076470985e-06, "loss": 0.671, "step": 70800 }, { "epoch": 2.625251094633749, "grad_norm": 2.028224896189644, "learning_rate": 9.216761900906707e-06, "loss": 0.6633, "step": 70900 }, { "epoch": 2.6289538726429504, "grad_norm": 2.8043596176994234, "learning_rate": 9.213285680990311e-06, "loss": 0.6733, "step": 71000 }, { "epoch": 2.6326566506521516, "grad_norm": 2.631240157715802, "learning_rate": 9.209802422529028e-06, "loss": 0.6694, "step": 71100 }, { "epoch": 2.636359428661353, "grad_norm": 3.27526490574497, "learning_rate": 9.206312131341848e-06, "loss": 0.6736, "step": 71200 }, { "epoch": 2.640062206670555, "grad_norm": 2.2969788418244734, "learning_rate": 9.202814813259514e-06, "loss": 0.6685, "step": 71300 }, { "epoch": 2.643764984679756, "grad_norm": 2.0448759395992693, "learning_rate": 9.199310474124501e-06, "loss": 0.6734, "step": 71400 }, { "epoch": 2.647467762688957, "grad_norm": 2.194888247981071, "learning_rate": 9.195799119791018e-06, "loss": 0.6853, "step": 71500 }, { "epoch": 2.6511705406981587, "grad_norm": 2.3157294382898037, "learning_rate": 9.19228075612499e-06, "loss": 0.6936, "step": 71600 }, { "epoch": 2.6548733187073603, "grad_norm": 2.2600268640470516, "learning_rate": 9.188755389004056e-06, "loss": 0.6482, "step": 71700 }, { "epoch": 2.6585760967165615, "grad_norm": 2.7867346539584026, "learning_rate": 9.18522302431755e-06, "loss": 0.6736, "step": 71800 }, { "epoch": 2.662278874725763, "grad_norm": 2.4244549380103284, "learning_rate": 9.181683667966497e-06, "loss": 0.6612, "step": 71900 }, { "epoch": 2.6659816527349642, "grad_norm": 2.467628082595294, "learning_rate": 9.178137325863606e-06, "loss": 0.662, "step": 72000 }, { "epoch": 2.669684430744166, "grad_norm": 1.991818914003808, "learning_rate": 9.17458400393325e-06, "loss": 0.6546, "step": 72100 }, { "epoch": 2.6733872087533674, "grad_norm": 2.6084822187687893, "learning_rate": 9.171023708111467e-06, "loss": 0.6707, "step": 72200 }, { "epoch": 2.6770899867625686, "grad_norm": 2.4759759139487674, "learning_rate": 9.16745644434594e-06, "loss": 0.6589, "step": 72300 }, { "epoch": 2.6807927647717698, "grad_norm": 2.417557023636743, "learning_rate": 9.163882218595998e-06, "loss": 0.6692, "step": 72400 }, { "epoch": 2.6844955427809714, "grad_norm": 2.4757717167657303, "learning_rate": 9.160301036832601e-06, "loss": 0.6824, "step": 72500 }, { "epoch": 2.688198320790173, "grad_norm": 1.7864811103086602, "learning_rate": 9.156712905038324e-06, "loss": 0.6549, "step": 72600 }, { "epoch": 2.691901098799374, "grad_norm": 1.931145360031176, "learning_rate": 9.153117829207353e-06, "loss": 0.6707, "step": 72700 }, { "epoch": 2.6956038768085757, "grad_norm": 2.6583751811214515, "learning_rate": 9.149515815345477e-06, "loss": 0.6746, "step": 72800 }, { "epoch": 2.699306654817777, "grad_norm": 2.3434065726826874, "learning_rate": 9.14590686947008e-06, "loss": 0.6746, "step": 72900 }, { "epoch": 2.7030094328269785, "grad_norm": 2.1951946075529003, "learning_rate": 9.142290997610114e-06, "loss": 0.672, "step": 73000 }, { "epoch": 2.70671221083618, "grad_norm": 2.1266717085417715, "learning_rate": 9.138668205806116e-06, "loss": 0.6596, "step": 73100 }, { "epoch": 2.7104149888453812, "grad_norm": 2.052871241822731, "learning_rate": 9.135038500110169e-06, "loss": 0.6562, "step": 73200 }, { "epoch": 2.7141177668545824, "grad_norm": 2.3890278232506144, "learning_rate": 9.131401886585916e-06, "loss": 0.6791, "step": 73300 }, { "epoch": 2.717820544863784, "grad_norm": 2.339795856765528, "learning_rate": 9.127758371308537e-06, "loss": 0.6769, "step": 73400 }, { "epoch": 2.7215233228729856, "grad_norm": 2.0980772669298946, "learning_rate": 9.124107960364738e-06, "loss": 0.687, "step": 73500 }, { "epoch": 2.7252261008821868, "grad_norm": 1.9520671316508236, "learning_rate": 9.120450659852754e-06, "loss": 0.6619, "step": 73600 }, { "epoch": 2.7289288788913884, "grad_norm": 1.9489571927723024, "learning_rate": 9.116786475882318e-06, "loss": 0.6643, "step": 73700 }, { "epoch": 2.7326316569005895, "grad_norm": 2.1143535490363963, "learning_rate": 9.11311541457467e-06, "loss": 0.6647, "step": 73800 }, { "epoch": 2.736334434909791, "grad_norm": 2.4454265529124415, "learning_rate": 9.109437482062538e-06, "loss": 0.6791, "step": 73900 }, { "epoch": 2.7400372129189927, "grad_norm": 2.12417553054465, "learning_rate": 9.105752684490125e-06, "loss": 0.6751, "step": 74000 }, { "epoch": 2.743739990928194, "grad_norm": 3.3698294360651286, "learning_rate": 9.102061028013108e-06, "loss": 0.6805, "step": 74100 }, { "epoch": 2.747442768937395, "grad_norm": 2.6079682276880694, "learning_rate": 9.098362518798615e-06, "loss": 0.6542, "step": 74200 }, { "epoch": 2.7511455469465966, "grad_norm": 2.9285278794017167, "learning_rate": 9.094657163025228e-06, "loss": 0.6798, "step": 74300 }, { "epoch": 2.7548483249557982, "grad_norm": 2.029512121868359, "learning_rate": 9.090944966882968e-06, "loss": 0.6716, "step": 74400 }, { "epoch": 2.7585511029649994, "grad_norm": 2.37703823122831, "learning_rate": 9.087225936573275e-06, "loss": 0.6664, "step": 74500 }, { "epoch": 2.762253880974201, "grad_norm": 2.290740875061313, "learning_rate": 9.083500078309013e-06, "loss": 0.7054, "step": 74600 }, { "epoch": 2.765956658983402, "grad_norm": 1.9826452203518832, "learning_rate": 9.079767398314452e-06, "loss": 0.6574, "step": 74700 }, { "epoch": 2.7696594369926038, "grad_norm": 2.510390912417119, "learning_rate": 9.076027902825252e-06, "loss": 0.6573, "step": 74800 }, { "epoch": 2.7733622150018054, "grad_norm": 3.253767602420802, "learning_rate": 9.072281598088467e-06, "loss": 0.6565, "step": 74900 }, { "epoch": 2.7770649930110065, "grad_norm": 3.1743643654172278, "learning_rate": 9.068528490362524e-06, "loss": 0.6636, "step": 75000 }, { "epoch": 2.7807677710202077, "grad_norm": 2.9292198577340463, "learning_rate": 9.064768585917207e-06, "loss": 0.6763, "step": 75100 }, { "epoch": 2.7844705490294093, "grad_norm": 2.3225242842709766, "learning_rate": 9.061001891033666e-06, "loss": 0.6696, "step": 75200 }, { "epoch": 2.788173327038611, "grad_norm": 2.272648856356267, "learning_rate": 9.057228412004386e-06, "loss": 0.6585, "step": 75300 }, { "epoch": 2.791876105047812, "grad_norm": 2.708064532509065, "learning_rate": 9.053448155133192e-06, "loss": 0.6674, "step": 75400 }, { "epoch": 2.7955788830570136, "grad_norm": 2.0878561841156706, "learning_rate": 9.049661126735223e-06, "loss": 0.6523, "step": 75500 }, { "epoch": 2.799281661066215, "grad_norm": 2.0218162021372637, "learning_rate": 9.045867333136939e-06, "loss": 0.667, "step": 75600 }, { "epoch": 2.8029844390754164, "grad_norm": 1.9615749815202044, "learning_rate": 9.042066780676101e-06, "loss": 0.6644, "step": 75700 }, { "epoch": 2.806687217084618, "grad_norm": 2.458125241194594, "learning_rate": 9.038259475701756e-06, "loss": 0.6592, "step": 75800 }, { "epoch": 2.810389995093819, "grad_norm": 2.5321957606480887, "learning_rate": 9.034445424574232e-06, "loss": 0.6542, "step": 75900 }, { "epoch": 2.8140927731030203, "grad_norm": 2.305578502814208, "learning_rate": 9.030624633665131e-06, "loss": 0.6626, "step": 76000 }, { "epoch": 2.817795551112222, "grad_norm": 2.304093777477429, "learning_rate": 9.026797109357313e-06, "loss": 0.6585, "step": 76100 }, { "epoch": 2.8214983291214235, "grad_norm": 1.9063487829056964, "learning_rate": 9.022962858044881e-06, "loss": 0.6634, "step": 76200 }, { "epoch": 2.8252011071306247, "grad_norm": 2.4605756536089998, "learning_rate": 9.019121886133185e-06, "loss": 0.659, "step": 76300 }, { "epoch": 2.8289038851398263, "grad_norm": 2.908256690477109, "learning_rate": 9.015274200038798e-06, "loss": 0.6873, "step": 76400 }, { "epoch": 2.8326066631490274, "grad_norm": 2.195376131615668, "learning_rate": 9.011419806189503e-06, "loss": 0.6786, "step": 76500 }, { "epoch": 2.836309441158229, "grad_norm": 2.4481520740229588, "learning_rate": 9.0075587110243e-06, "loss": 0.6586, "step": 76600 }, { "epoch": 2.8400122191674306, "grad_norm": 2.5436298766851024, "learning_rate": 9.003690920993378e-06, "loss": 0.6732, "step": 76700 }, { "epoch": 2.843714997176632, "grad_norm": 2.0233903955790664, "learning_rate": 8.999816442558112e-06, "loss": 0.6694, "step": 76800 }, { "epoch": 2.847417775185833, "grad_norm": 1.9592757597831238, "learning_rate": 8.995935282191044e-06, "loss": 0.642, "step": 76900 }, { "epoch": 2.8511205531950345, "grad_norm": 2.4999659621973676, "learning_rate": 8.992047446375887e-06, "loss": 0.6758, "step": 77000 }, { "epoch": 2.854823331204236, "grad_norm": 2.320920562047208, "learning_rate": 8.988152941607505e-06, "loss": 0.6686, "step": 77100 }, { "epoch": 2.8585261092134373, "grad_norm": 2.180371204577853, "learning_rate": 8.984251774391895e-06, "loss": 0.6572, "step": 77200 }, { "epoch": 2.862228887222639, "grad_norm": 2.548377630577026, "learning_rate": 8.980343951246193e-06, "loss": 0.6858, "step": 77300 }, { "epoch": 2.86593166523184, "grad_norm": 2.2916044435835023, "learning_rate": 8.976429478698651e-06, "loss": 0.6612, "step": 77400 }, { "epoch": 2.8696344432410417, "grad_norm": 2.137867387232337, "learning_rate": 8.972508363288627e-06, "loss": 0.656, "step": 77500 }, { "epoch": 2.8733372212502433, "grad_norm": 2.6319833480679713, "learning_rate": 8.968580611566578e-06, "loss": 0.6505, "step": 77600 }, { "epoch": 2.8770399992594444, "grad_norm": 2.1088025728984907, "learning_rate": 8.96464623009405e-06, "loss": 0.6667, "step": 77700 }, { "epoch": 2.8807427772686456, "grad_norm": 1.9521003147155882, "learning_rate": 8.960705225443657e-06, "loss": 0.6596, "step": 77800 }, { "epoch": 2.884445555277847, "grad_norm": 2.5972066347938294, "learning_rate": 8.956757604199085e-06, "loss": 0.6545, "step": 77900 }, { "epoch": 2.888148333287049, "grad_norm": 2.4786047868289964, "learning_rate": 8.952803372955073e-06, "loss": 0.6722, "step": 78000 }, { "epoch": 2.89185111129625, "grad_norm": 2.2514808731629112, "learning_rate": 8.948842538317395e-06, "loss": 0.6556, "step": 78100 }, { "epoch": 2.8955538893054515, "grad_norm": 2.365087481495297, "learning_rate": 8.944875106902864e-06, "loss": 0.6482, "step": 78200 }, { "epoch": 2.8992566673146527, "grad_norm": 2.452402390597274, "learning_rate": 8.94090108533931e-06, "loss": 0.6893, "step": 78300 }, { "epoch": 2.9029594453238543, "grad_norm": 2.1846111061646885, "learning_rate": 8.936920480265576e-06, "loss": 0.6565, "step": 78400 }, { "epoch": 2.9066622233330555, "grad_norm": 2.5440937876149907, "learning_rate": 8.932933298331496e-06, "loss": 0.6731, "step": 78500 }, { "epoch": 2.910365001342257, "grad_norm": 2.1791116047812125, "learning_rate": 8.928939546197897e-06, "loss": 0.6747, "step": 78600 }, { "epoch": 2.914067779351458, "grad_norm": 2.5381792077290934, "learning_rate": 8.92493923053658e-06, "loss": 0.6759, "step": 78700 }, { "epoch": 2.91777055736066, "grad_norm": 2.3236635792732137, "learning_rate": 8.920932358030309e-06, "loss": 0.6675, "step": 78800 }, { "epoch": 2.9214733353698614, "grad_norm": 1.9029464622582775, "learning_rate": 8.916918935372805e-06, "loss": 0.6634, "step": 78900 }, { "epoch": 2.9251761133790626, "grad_norm": 2.224078219093189, "learning_rate": 8.912898969268731e-06, "loss": 0.6546, "step": 79000 }, { "epoch": 2.928878891388264, "grad_norm": 2.9148804782966233, "learning_rate": 8.908872466433677e-06, "loss": 0.6549, "step": 79100 }, { "epoch": 2.9325816693974653, "grad_norm": 2.4717406257998773, "learning_rate": 8.904839433594158e-06, "loss": 0.6522, "step": 79200 }, { "epoch": 2.936284447406667, "grad_norm": 2.6821434461084896, "learning_rate": 8.900799877487595e-06, "loss": 0.669, "step": 79300 }, { "epoch": 2.939987225415868, "grad_norm": 2.5288488175630057, "learning_rate": 8.896753804862308e-06, "loss": 0.6675, "step": 79400 }, { "epoch": 2.9436900034250697, "grad_norm": 2.3118984656483748, "learning_rate": 8.892701222477503e-06, "loss": 0.6428, "step": 79500 }, { "epoch": 2.947392781434271, "grad_norm": 1.7707450134385863, "learning_rate": 8.888642137103258e-06, "loss": 0.6423, "step": 79600 }, { "epoch": 2.9510955594434725, "grad_norm": 2.7951973513737016, "learning_rate": 8.884576555520521e-06, "loss": 0.6666, "step": 79700 }, { "epoch": 2.954798337452674, "grad_norm": 1.9441758598215642, "learning_rate": 8.880504484521084e-06, "loss": 0.6911, "step": 79800 }, { "epoch": 2.958501115461875, "grad_norm": 2.311415822913053, "learning_rate": 8.876425930907587e-06, "loss": 0.69, "step": 79900 }, { "epoch": 2.962203893471077, "grad_norm": 1.962196622233137, "learning_rate": 8.872340901493496e-06, "loss": 0.6991, "step": 80000 }, { "epoch": 2.965906671480278, "grad_norm": 2.2188989804402635, "learning_rate": 8.868249403103098e-06, "loss": 0.6512, "step": 80100 }, { "epoch": 2.9696094494894796, "grad_norm": 2.0738562772495217, "learning_rate": 8.864151442571481e-06, "loss": 0.6673, "step": 80200 }, { "epoch": 2.9733122274986807, "grad_norm": 2.45682348863258, "learning_rate": 8.860047026744535e-06, "loss": 0.6488, "step": 80300 }, { "epoch": 2.9770150055078823, "grad_norm": 2.876210559752475, "learning_rate": 8.855936162478933e-06, "loss": 0.641, "step": 80400 }, { "epoch": 2.9807177835170835, "grad_norm": 2.113010077915775, "learning_rate": 8.851818856642116e-06, "loss": 0.6482, "step": 80500 }, { "epoch": 2.984420561526285, "grad_norm": 2.2593684990909297, "learning_rate": 8.84769511611229e-06, "loss": 0.6596, "step": 80600 }, { "epoch": 2.9881233395354867, "grad_norm": 2.213052710368658, "learning_rate": 8.843564947778408e-06, "loss": 0.6674, "step": 80700 }, { "epoch": 2.991826117544688, "grad_norm": 1.9824851077389378, "learning_rate": 8.839428358540165e-06, "loss": 0.6606, "step": 80800 }, { "epoch": 2.9955288955538895, "grad_norm": 1.8350785430581344, "learning_rate": 8.835285355307979e-06, "loss": 0.6625, "step": 80900 }, { "epoch": 2.9992316735630906, "grad_norm": 2.2196935514359537, "learning_rate": 8.831135945002982e-06, "loss": 0.6483, "step": 81000 }, { "epoch": 3.0029251946272693, "grad_norm": 2.180481700028787, "learning_rate": 8.826980134557012e-06, "loss": 0.5716, "step": 81100 }, { "epoch": 3.0066279726364704, "grad_norm": 2.3154128557009166, "learning_rate": 8.8228179309126e-06, "loss": 0.5747, "step": 81200 }, { "epoch": 3.010330750645672, "grad_norm": 2.5911631549986316, "learning_rate": 8.818649341022954e-06, "loss": 0.5708, "step": 81300 }, { "epoch": 3.014033528654873, "grad_norm": 2.172878251158029, "learning_rate": 8.81447437185195e-06, "loss": 0.586, "step": 81400 }, { "epoch": 3.0177363066640748, "grad_norm": 2.285708121202155, "learning_rate": 8.810293030374126e-06, "loss": 0.5279, "step": 81500 }, { "epoch": 3.021439084673276, "grad_norm": 1.8325067800290862, "learning_rate": 8.80610532357466e-06, "loss": 0.5743, "step": 81600 }, { "epoch": 3.0251418626824775, "grad_norm": 2.4724163520836617, "learning_rate": 8.801911258449367e-06, "loss": 0.5686, "step": 81700 }, { "epoch": 3.028844640691679, "grad_norm": 2.8204386478402657, "learning_rate": 8.797710842004683e-06, "loss": 0.5661, "step": 81800 }, { "epoch": 3.0325474187008803, "grad_norm": 2.1624621580723504, "learning_rate": 8.793504081257653e-06, "loss": 0.5609, "step": 81900 }, { "epoch": 3.036250196710082, "grad_norm": 1.9578194242090217, "learning_rate": 8.789290983235925e-06, "loss": 0.5557, "step": 82000 }, { "epoch": 3.039952974719283, "grad_norm": 2.2570569549943373, "learning_rate": 8.785071554977724e-06, "loss": 0.5834, "step": 82100 }, { "epoch": 3.0436557527284847, "grad_norm": 2.848538981400608, "learning_rate": 8.780845803531861e-06, "loss": 0.5586, "step": 82200 }, { "epoch": 3.047358530737686, "grad_norm": 2.5342124121191936, "learning_rate": 8.776613735957706e-06, "loss": 0.5663, "step": 82300 }, { "epoch": 3.0510613087468874, "grad_norm": 2.414694188326073, "learning_rate": 8.772375359325179e-06, "loss": 0.5816, "step": 82400 }, { "epoch": 3.0547640867560886, "grad_norm": 3.677606480946753, "learning_rate": 8.768130680714739e-06, "loss": 0.5925, "step": 82500 }, { "epoch": 3.05846686476529, "grad_norm": 2.065087660620565, "learning_rate": 8.763879707217376e-06, "loss": 0.5508, "step": 82600 }, { "epoch": 3.0621696427744918, "grad_norm": 2.1775582587253246, "learning_rate": 8.759622445934595e-06, "loss": 0.5616, "step": 82700 }, { "epoch": 3.065872420783693, "grad_norm": 2.367143422543227, "learning_rate": 8.755358903978406e-06, "loss": 0.5546, "step": 82800 }, { "epoch": 3.0695751987928945, "grad_norm": 1.932878560997957, "learning_rate": 8.751089088471309e-06, "loss": 0.5689, "step": 82900 }, { "epoch": 3.0732779768020957, "grad_norm": 2.468459899117457, "learning_rate": 8.746813006546284e-06, "loss": 0.5823, "step": 83000 }, { "epoch": 3.0769807548112973, "grad_norm": 2.388094083983455, "learning_rate": 8.742530665346782e-06, "loss": 0.5823, "step": 83100 }, { "epoch": 3.0806835328204984, "grad_norm": 2.753875995282493, "learning_rate": 8.738242072026708e-06, "loss": 0.5716, "step": 83200 }, { "epoch": 3.0843863108297, "grad_norm": 1.925258395637769, "learning_rate": 8.733947233750415e-06, "loss": 0.578, "step": 83300 }, { "epoch": 3.088089088838901, "grad_norm": 2.2968217168726146, "learning_rate": 8.729646157692686e-06, "loss": 0.5618, "step": 83400 }, { "epoch": 3.091791866848103, "grad_norm": 1.9145250555584297, "learning_rate": 8.725338851038724e-06, "loss": 0.5751, "step": 83500 }, { "epoch": 3.095494644857304, "grad_norm": 2.321631737387505, "learning_rate": 8.72102532098414e-06, "loss": 0.5764, "step": 83600 }, { "epoch": 3.0991974228665056, "grad_norm": 2.629048057167854, "learning_rate": 8.716705574734944e-06, "loss": 0.5592, "step": 83700 }, { "epoch": 3.102900200875707, "grad_norm": 2.1345036952627, "learning_rate": 8.712379619507533e-06, "loss": 0.5869, "step": 83800 }, { "epoch": 3.1066029788849083, "grad_norm": 2.376817542505941, "learning_rate": 8.708047462528669e-06, "loss": 0.5743, "step": 83900 }, { "epoch": 3.11030575689411, "grad_norm": 2.0590557589733858, "learning_rate": 8.703709111035481e-06, "loss": 0.5735, "step": 84000 }, { "epoch": 3.114008534903311, "grad_norm": 2.1517329092180852, "learning_rate": 8.699364572275442e-06, "loss": 0.5848, "step": 84100 }, { "epoch": 3.1177113129125127, "grad_norm": 2.4571874139842036, "learning_rate": 8.695013853506364e-06, "loss": 0.566, "step": 84200 }, { "epoch": 3.121414090921714, "grad_norm": 2.4392712628475057, "learning_rate": 8.69065696199638e-06, "loss": 0.5793, "step": 84300 }, { "epoch": 3.1251168689309154, "grad_norm": 2.376177384679562, "learning_rate": 8.686293905023941e-06, "loss": 0.5645, "step": 84400 }, { "epoch": 3.1288196469401166, "grad_norm": 3.0840276149036723, "learning_rate": 8.68192468987779e-06, "loss": 0.5785, "step": 84500 }, { "epoch": 3.132522424949318, "grad_norm": 1.931592431086348, "learning_rate": 8.677549323856964e-06, "loss": 0.5996, "step": 84600 }, { "epoch": 3.13622520295852, "grad_norm": 2.8359388556879663, "learning_rate": 8.673167814270772e-06, "loss": 0.5752, "step": 84700 }, { "epoch": 3.139927980967721, "grad_norm": 2.1694839975776685, "learning_rate": 8.668780168438784e-06, "loss": 0.5783, "step": 84800 }, { "epoch": 3.1436307589769226, "grad_norm": 2.170377358274312, "learning_rate": 8.664386393690827e-06, "loss": 0.5744, "step": 84900 }, { "epoch": 3.1473335369861237, "grad_norm": 2.7322247425514656, "learning_rate": 8.659986497366964e-06, "loss": 0.5872, "step": 85000 }, { "epoch": 3.1510363149953253, "grad_norm": 2.139429340074681, "learning_rate": 8.655580486817483e-06, "loss": 0.5623, "step": 85100 }, { "epoch": 3.1547390930045265, "grad_norm": 2.270927109600586, "learning_rate": 8.651168369402886e-06, "loss": 0.5556, "step": 85200 }, { "epoch": 3.158441871013728, "grad_norm": 2.158974366309539, "learning_rate": 8.646750152493882e-06, "loss": 0.5766, "step": 85300 }, { "epoch": 3.1621446490229292, "grad_norm": 2.8884198485960533, "learning_rate": 8.642325843471362e-06, "loss": 0.5706, "step": 85400 }, { "epoch": 3.165847427032131, "grad_norm": 2.4603276187169465, "learning_rate": 8.637895449726401e-06, "loss": 0.5604, "step": 85500 }, { "epoch": 3.1695502050413324, "grad_norm": 2.28684630848756, "learning_rate": 8.633458978660232e-06, "loss": 0.5689, "step": 85600 }, { "epoch": 3.1732529830505336, "grad_norm": 2.065588394156344, "learning_rate": 8.629016437684247e-06, "loss": 0.5755, "step": 85700 }, { "epoch": 3.176955761059735, "grad_norm": 2.6622896390379234, "learning_rate": 8.624567834219975e-06, "loss": 0.5773, "step": 85800 }, { "epoch": 3.1806585390689364, "grad_norm": 2.4272121681797896, "learning_rate": 8.620113175699071e-06, "loss": 0.576, "step": 85900 }, { "epoch": 3.184361317078138, "grad_norm": 2.2656366677460866, "learning_rate": 8.615652469563314e-06, "loss": 0.5757, "step": 86000 }, { "epoch": 3.188064095087339, "grad_norm": 3.070358732125583, "learning_rate": 8.611185723264573e-06, "loss": 0.5777, "step": 86100 }, { "epoch": 3.1917668730965407, "grad_norm": 2.2785064720213914, "learning_rate": 8.606712944264816e-06, "loss": 0.5647, "step": 86200 }, { "epoch": 3.195469651105742, "grad_norm": 2.6726609213951247, "learning_rate": 8.60223414003609e-06, "loss": 0.5683, "step": 86300 }, { "epoch": 3.1991724291149435, "grad_norm": 1.9268968765680792, "learning_rate": 8.5977493180605e-06, "loss": 0.5547, "step": 86400 }, { "epoch": 3.202875207124145, "grad_norm": 2.2009014605761283, "learning_rate": 8.593258485830209e-06, "loss": 0.5674, "step": 86500 }, { "epoch": 3.2065779851333462, "grad_norm": 2.5770835343076253, "learning_rate": 8.588761650847423e-06, "loss": 0.566, "step": 86600 }, { "epoch": 3.210280763142548, "grad_norm": 2.1268754856239798, "learning_rate": 8.58425882062437e-06, "loss": 0.5763, "step": 86700 }, { "epoch": 3.213983541151749, "grad_norm": 3.546253377796802, "learning_rate": 8.579750002683297e-06, "loss": 0.5639, "step": 86800 }, { "epoch": 3.2176863191609506, "grad_norm": 2.8436316173690783, "learning_rate": 8.575235204556454e-06, "loss": 0.5726, "step": 86900 }, { "epoch": 3.2213890971701518, "grad_norm": 2.2338849763754585, "learning_rate": 8.57071443378608e-06, "loss": 0.5711, "step": 87000 }, { "epoch": 3.2250918751793534, "grad_norm": 2.6215661551368585, "learning_rate": 8.566187697924392e-06, "loss": 0.5682, "step": 87100 }, { "epoch": 3.2287946531885545, "grad_norm": 2.296684023281763, "learning_rate": 8.561655004533571e-06, "loss": 0.571, "step": 87200 }, { "epoch": 3.232497431197756, "grad_norm": 2.180038678793432, "learning_rate": 8.557116361185754e-06, "loss": 0.5841, "step": 87300 }, { "epoch": 3.2362002092069577, "grad_norm": 2.5889570875979904, "learning_rate": 8.552571775463013e-06, "loss": 0.5707, "step": 87400 }, { "epoch": 3.239902987216159, "grad_norm": 2.5300049872050514, "learning_rate": 8.54802125495735e-06, "loss": 0.5671, "step": 87500 }, { "epoch": 3.2436057652253605, "grad_norm": 2.1589891937328174, "learning_rate": 8.54346480727068e-06, "loss": 0.5635, "step": 87600 }, { "epoch": 3.2473085432345616, "grad_norm": 2.9790311952941844, "learning_rate": 8.538902440014823e-06, "loss": 0.5799, "step": 87700 }, { "epoch": 3.2510113212437632, "grad_norm": 1.9855218128536265, "learning_rate": 8.534334160811484e-06, "loss": 0.5825, "step": 87800 }, { "epoch": 3.2547140992529644, "grad_norm": 2.707021696599486, "learning_rate": 8.529759977292244e-06, "loss": 0.567, "step": 87900 }, { "epoch": 3.258416877262166, "grad_norm": 1.7496553697244026, "learning_rate": 8.525179897098553e-06, "loss": 0.5818, "step": 88000 }, { "epoch": 3.262119655271367, "grad_norm": 2.5493631060051913, "learning_rate": 8.520593927881704e-06, "loss": 0.5555, "step": 88100 }, { "epoch": 3.2658224332805688, "grad_norm": 2.3005234793894753, "learning_rate": 8.516002077302834e-06, "loss": 0.5835, "step": 88200 }, { "epoch": 3.2695252112897704, "grad_norm": 2.4947238296800323, "learning_rate": 8.511404353032904e-06, "loss": 0.5646, "step": 88300 }, { "epoch": 3.2732279892989715, "grad_norm": 2.1383581546817405, "learning_rate": 8.506800762752684e-06, "loss": 0.574, "step": 88400 }, { "epoch": 3.276930767308173, "grad_norm": 2.7090085405397177, "learning_rate": 8.502191314152748e-06, "loss": 0.5761, "step": 88500 }, { "epoch": 3.2806335453173743, "grad_norm": 2.2572994039829, "learning_rate": 8.497576014933453e-06, "loss": 0.5689, "step": 88600 }, { "epoch": 3.284336323326576, "grad_norm": 2.1784236759915316, "learning_rate": 8.492954872804933e-06, "loss": 0.5637, "step": 88700 }, { "epoch": 3.288039101335777, "grad_norm": 2.6598511240028944, "learning_rate": 8.488327895487081e-06, "loss": 0.5869, "step": 88800 }, { "epoch": 3.2917418793449786, "grad_norm": 2.5119951302942414, "learning_rate": 8.483695090709539e-06, "loss": 0.5628, "step": 88900 }, { "epoch": 3.29544465735418, "grad_norm": 2.2577688993420884, "learning_rate": 8.479056466211682e-06, "loss": 0.5773, "step": 89000 }, { "epoch": 3.2991474353633814, "grad_norm": 2.26338651208394, "learning_rate": 8.47441202974261e-06, "loss": 0.5657, "step": 89100 }, { "epoch": 3.302850213372583, "grad_norm": 2.830637239341942, "learning_rate": 8.469761789061131e-06, "loss": 0.5855, "step": 89200 }, { "epoch": 3.306552991381784, "grad_norm": 2.3162388609170796, "learning_rate": 8.465105751935747e-06, "loss": 0.5838, "step": 89300 }, { "epoch": 3.3102557693909858, "grad_norm": 2.199318493930729, "learning_rate": 8.460443926144648e-06, "loss": 0.5683, "step": 89400 }, { "epoch": 3.313958547400187, "grad_norm": 3.222660991369513, "learning_rate": 8.455776319475692e-06, "loss": 0.5892, "step": 89500 }, { "epoch": 3.3176613254093885, "grad_norm": 1.9136020619676297, "learning_rate": 8.451102939726392e-06, "loss": 0.5798, "step": 89600 }, { "epoch": 3.3213641034185897, "grad_norm": 2.077865813605651, "learning_rate": 8.446423794703911e-06, "loss": 0.5838, "step": 89700 }, { "epoch": 3.3250668814277913, "grad_norm": 3.416726422101747, "learning_rate": 8.441738892225035e-06, "loss": 0.5906, "step": 89800 }, { "epoch": 3.3287696594369924, "grad_norm": 2.9006389238777546, "learning_rate": 8.437048240116177e-06, "loss": 0.569, "step": 89900 }, { "epoch": 3.332472437446194, "grad_norm": 2.5447712669705758, "learning_rate": 8.43235184621335e-06, "loss": 0.5752, "step": 90000 }, { "epoch": 3.3361752154553956, "grad_norm": 2.305166664568876, "learning_rate": 8.427649718362157e-06, "loss": 0.5707, "step": 90100 }, { "epoch": 3.339877993464597, "grad_norm": 2.2149261895869445, "learning_rate": 8.422941864417788e-06, "loss": 0.5646, "step": 90200 }, { "epoch": 3.3435807714737984, "grad_norm": 2.503944270712931, "learning_rate": 8.418228292244988e-06, "loss": 0.5879, "step": 90300 }, { "epoch": 3.3472835494829996, "grad_norm": 2.528394023467674, "learning_rate": 8.413509009718066e-06, "loss": 0.5781, "step": 90400 }, { "epoch": 3.350986327492201, "grad_norm": 2.7007719270064134, "learning_rate": 8.40878402472086e-06, "loss": 0.5783, "step": 90500 }, { "epoch": 3.3546891055014023, "grad_norm": 2.528722212381294, "learning_rate": 8.40405334514674e-06, "loss": 0.5732, "step": 90600 }, { "epoch": 3.358391883510604, "grad_norm": 1.9221209925352007, "learning_rate": 8.399316978898592e-06, "loss": 0.5888, "step": 90700 }, { "epoch": 3.362094661519805, "grad_norm": 2.453761443992394, "learning_rate": 8.394574933888795e-06, "loss": 0.586, "step": 90800 }, { "epoch": 3.3657974395290067, "grad_norm": 1.9676213399891758, "learning_rate": 8.389827218039218e-06, "loss": 0.5871, "step": 90900 }, { "epoch": 3.3695002175382083, "grad_norm": 2.6940689815714896, "learning_rate": 8.385073839281203e-06, "loss": 0.5631, "step": 91000 }, { "epoch": 3.3732029955474094, "grad_norm": 1.6436440331266102, "learning_rate": 8.380314805555553e-06, "loss": 0.5832, "step": 91100 }, { "epoch": 3.376905773556611, "grad_norm": 2.989313383357062, "learning_rate": 8.375550124812519e-06, "loss": 0.5596, "step": 91200 }, { "epoch": 3.380608551565812, "grad_norm": 2.189503279736181, "learning_rate": 8.370779805011782e-06, "loss": 0.5701, "step": 91300 }, { "epoch": 3.384311329575014, "grad_norm": 2.2201615383283477, "learning_rate": 8.366003854122446e-06, "loss": 0.5558, "step": 91400 }, { "epoch": 3.388014107584215, "grad_norm": 2.3818071913075767, "learning_rate": 8.361222280123022e-06, "loss": 0.5601, "step": 91500 }, { "epoch": 3.3917168855934166, "grad_norm": 2.776501103734086, "learning_rate": 8.356435091001416e-06, "loss": 0.5843, "step": 91600 }, { "epoch": 3.3954196636026177, "grad_norm": 2.326063237327877, "learning_rate": 8.35164229475491e-06, "loss": 0.5883, "step": 91700 }, { "epoch": 3.3991224416118193, "grad_norm": 2.5129236495165586, "learning_rate": 8.34684389939016e-06, "loss": 0.5884, "step": 91800 }, { "epoch": 3.402825219621021, "grad_norm": 2.01094347184802, "learning_rate": 8.342039912923165e-06, "loss": 0.5659, "step": 91900 }, { "epoch": 3.406527997630222, "grad_norm": 2.468311288253847, "learning_rate": 8.337230343379277e-06, "loss": 0.5745, "step": 92000 }, { "epoch": 3.4102307756394232, "grad_norm": 2.065590664413841, "learning_rate": 8.332415198793164e-06, "loss": 0.5815, "step": 92100 }, { "epoch": 3.413933553648625, "grad_norm": 2.454408323517853, "learning_rate": 8.327594487208816e-06, "loss": 0.5866, "step": 92200 }, { "epoch": 3.4176363316578264, "grad_norm": 2.5784845749755108, "learning_rate": 8.322768216679515e-06, "loss": 0.5741, "step": 92300 }, { "epoch": 3.4213391096670276, "grad_norm": 1.9669935139935812, "learning_rate": 8.317936395267839e-06, "loss": 0.5785, "step": 92400 }, { "epoch": 3.425041887676229, "grad_norm": 2.9530633770577324, "learning_rate": 8.313099031045628e-06, "loss": 0.5827, "step": 92500 }, { "epoch": 3.4287446656854303, "grad_norm": 2.9530374725812796, "learning_rate": 8.30825613209399e-06, "loss": 0.5764, "step": 92600 }, { "epoch": 3.432447443694632, "grad_norm": 3.146837611510521, "learning_rate": 8.303407706503276e-06, "loss": 0.5888, "step": 92700 }, { "epoch": 3.436150221703833, "grad_norm": 2.014624931144349, "learning_rate": 8.298553762373072e-06, "loss": 0.5814, "step": 92800 }, { "epoch": 3.4398529997130347, "grad_norm": 2.271771736171648, "learning_rate": 8.293694307812178e-06, "loss": 0.5813, "step": 92900 }, { "epoch": 3.443555777722236, "grad_norm": 2.2250518236313623, "learning_rate": 8.288829350938603e-06, "loss": 0.6018, "step": 93000 }, { "epoch": 3.4472585557314375, "grad_norm": 2.355307591732802, "learning_rate": 8.283958899879549e-06, "loss": 0.5743, "step": 93100 }, { "epoch": 3.450961333740639, "grad_norm": 2.1577028157294937, "learning_rate": 8.279082962771394e-06, "loss": 0.6141, "step": 93200 }, { "epoch": 3.4546641117498402, "grad_norm": 1.9940467996328672, "learning_rate": 8.274201547759682e-06, "loss": 0.5768, "step": 93300 }, { "epoch": 3.458366889759042, "grad_norm": 1.9035346194812903, "learning_rate": 8.269314662999106e-06, "loss": 0.5701, "step": 93400 }, { "epoch": 3.462069667768243, "grad_norm": 2.567403729149102, "learning_rate": 8.264422316653501e-06, "loss": 0.563, "step": 93500 }, { "epoch": 3.4657724457774446, "grad_norm": 1.992301547877153, "learning_rate": 8.259524516895822e-06, "loss": 0.5716, "step": 93600 }, { "epoch": 3.4694752237866457, "grad_norm": 2.556550765056129, "learning_rate": 8.254621271908135e-06, "loss": 0.5714, "step": 93700 }, { "epoch": 3.4731780017958473, "grad_norm": 2.6585067358733925, "learning_rate": 8.249712589881603e-06, "loss": 0.5757, "step": 93800 }, { "epoch": 3.4768807798050485, "grad_norm": 2.6125392828815985, "learning_rate": 8.244798479016472e-06, "loss": 0.5785, "step": 93900 }, { "epoch": 3.48058355781425, "grad_norm": 3.1249606169391035, "learning_rate": 8.239878947522057e-06, "loss": 0.5786, "step": 94000 }, { "epoch": 3.4842863358234517, "grad_norm": 2.0752341974019415, "learning_rate": 8.23495400361673e-06, "loss": 0.571, "step": 94100 }, { "epoch": 3.487989113832653, "grad_norm": 2.1598119180321325, "learning_rate": 8.2300236555279e-06, "loss": 0.5795, "step": 94200 }, { "epoch": 3.4916918918418545, "grad_norm": 2.5040392881281286, "learning_rate": 8.225087911492014e-06, "loss": 0.5877, "step": 94300 }, { "epoch": 3.4953946698510556, "grad_norm": 2.5620709029183684, "learning_rate": 8.22014677975452e-06, "loss": 0.5696, "step": 94400 }, { "epoch": 3.4990974478602572, "grad_norm": 2.80257168651608, "learning_rate": 8.215200268569876e-06, "loss": 0.5822, "step": 94500 }, { "epoch": 3.502800225869459, "grad_norm": 2.358821440100724, "learning_rate": 8.210248386201522e-06, "loss": 0.5892, "step": 94600 }, { "epoch": 3.50650300387866, "grad_norm": 2.9519264576150563, "learning_rate": 8.205291140921876e-06, "loss": 0.5774, "step": 94700 }, { "epoch": 3.510205781887861, "grad_norm": 3.1574756821832777, "learning_rate": 8.200328541012308e-06, "loss": 0.5767, "step": 94800 }, { "epoch": 3.5139085598970627, "grad_norm": 2.472391047636803, "learning_rate": 8.19536059476314e-06, "loss": 0.569, "step": 94900 }, { "epoch": 3.5176113379062643, "grad_norm": 2.181909173239881, "learning_rate": 8.190387310473621e-06, "loss": 0.5731, "step": 95000 }, { "epoch": 3.5213141159154655, "grad_norm": 2.5151206228973253, "learning_rate": 8.185408696451919e-06, "loss": 0.5702, "step": 95100 }, { "epoch": 3.525016893924667, "grad_norm": 2.2451337142381487, "learning_rate": 8.180424761015104e-06, "loss": 0.5622, "step": 95200 }, { "epoch": 3.5287196719338683, "grad_norm": 2.8086150684166364, "learning_rate": 8.175435512489142e-06, "loss": 0.5787, "step": 95300 }, { "epoch": 3.53242244994307, "grad_norm": 2.1589736105614863, "learning_rate": 8.170440959208863e-06, "loss": 0.5882, "step": 95400 }, { "epoch": 3.5361252279522715, "grad_norm": 2.649100412725189, "learning_rate": 8.16544110951797e-06, "loss": 0.5873, "step": 95500 }, { "epoch": 3.5398280059614726, "grad_norm": 2.2173500566825557, "learning_rate": 8.160435971769011e-06, "loss": 0.5822, "step": 95600 }, { "epoch": 3.543530783970674, "grad_norm": 2.6394459968322574, "learning_rate": 8.155425554323367e-06, "loss": 0.5724, "step": 95700 }, { "epoch": 3.5472335619798754, "grad_norm": 3.138421987579112, "learning_rate": 8.150409865551236e-06, "loss": 0.5678, "step": 95800 }, { "epoch": 3.550936339989077, "grad_norm": 1.9986368807819936, "learning_rate": 8.145388913831628e-06, "loss": 0.5911, "step": 95900 }, { "epoch": 3.554639117998278, "grad_norm": 2.3897511135757807, "learning_rate": 8.140362707552339e-06, "loss": 0.5584, "step": 96000 }, { "epoch": 3.5583418960074797, "grad_norm": 1.9161759975487356, "learning_rate": 8.13533125510995e-06, "loss": 0.5955, "step": 96100 }, { "epoch": 3.562044674016681, "grad_norm": 2.2574382654700504, "learning_rate": 8.130294564909799e-06, "loss": 0.5676, "step": 96200 }, { "epoch": 3.5657474520258825, "grad_norm": 2.113759667605042, "learning_rate": 8.12525264536598e-06, "loss": 0.574, "step": 96300 }, { "epoch": 3.5694502300350837, "grad_norm": 2.287277608850245, "learning_rate": 8.120205504901318e-06, "loss": 0.5786, "step": 96400 }, { "epoch": 3.5731530080442853, "grad_norm": 2.061373244442029, "learning_rate": 8.115153151947361e-06, "loss": 0.5812, "step": 96500 }, { "epoch": 3.5768557860534864, "grad_norm": 2.348188372275138, "learning_rate": 8.11009559494437e-06, "loss": 0.5728, "step": 96600 }, { "epoch": 3.580558564062688, "grad_norm": 2.458190986228973, "learning_rate": 8.105032842341291e-06, "loss": 0.5727, "step": 96700 }, { "epoch": 3.5842613420718896, "grad_norm": 2.411829509518225, "learning_rate": 8.099964902595758e-06, "loss": 0.5797, "step": 96800 }, { "epoch": 3.587964120081091, "grad_norm": 2.1310007120131944, "learning_rate": 8.094891784174064e-06, "loss": 0.577, "step": 96900 }, { "epoch": 3.5916668980902924, "grad_norm": 2.3816865257696147, "learning_rate": 8.089813495551155e-06, "loss": 0.5923, "step": 97000 }, { "epoch": 3.5953696760994935, "grad_norm": 2.480199504132188, "learning_rate": 8.084730045210615e-06, "loss": 0.5707, "step": 97100 }, { "epoch": 3.599072454108695, "grad_norm": 2.5406169712199578, "learning_rate": 8.079641441644654e-06, "loss": 0.5711, "step": 97200 }, { "epoch": 3.6027752321178963, "grad_norm": 1.746358333419465, "learning_rate": 8.074547693354085e-06, "loss": 0.5601, "step": 97300 }, { "epoch": 3.606478010127098, "grad_norm": 2.0071373682669025, "learning_rate": 8.06944880884832e-06, "loss": 0.5653, "step": 97400 }, { "epoch": 3.610180788136299, "grad_norm": 2.25476001672745, "learning_rate": 8.064344796645346e-06, "loss": 0.5514, "step": 97500 }, { "epoch": 3.6138835661455007, "grad_norm": 2.590183540990655, "learning_rate": 8.059235665271723e-06, "loss": 0.5798, "step": 97600 }, { "epoch": 3.6175863441547023, "grad_norm": 2.5365393631024764, "learning_rate": 8.054121423262557e-06, "loss": 0.5726, "step": 97700 }, { "epoch": 3.6212891221639034, "grad_norm": 2.6037968127621656, "learning_rate": 8.049002079161496e-06, "loss": 0.5824, "step": 97800 }, { "epoch": 3.624991900173105, "grad_norm": 2.2504785421911637, "learning_rate": 8.043877641520709e-06, "loss": 0.5756, "step": 97900 }, { "epoch": 3.628694678182306, "grad_norm": 2.425078894587925, "learning_rate": 8.038748118900876e-06, "loss": 0.5879, "step": 98000 }, { "epoch": 3.632397456191508, "grad_norm": 2.1475388332364553, "learning_rate": 8.033613519871167e-06, "loss": 0.5779, "step": 98100 }, { "epoch": 3.636100234200709, "grad_norm": 3.283557024810981, "learning_rate": 8.028473853009238e-06, "loss": 0.5805, "step": 98200 }, { "epoch": 3.6398030122099105, "grad_norm": 2.8553212719509284, "learning_rate": 8.023329126901207e-06, "loss": 0.5726, "step": 98300 }, { "epoch": 3.6435057902191117, "grad_norm": 1.9873871681428459, "learning_rate": 8.018179350141648e-06, "loss": 0.5835, "step": 98400 }, { "epoch": 3.6472085682283133, "grad_norm": 3.04999170737206, "learning_rate": 8.01302453133357e-06, "loss": 0.5777, "step": 98500 }, { "epoch": 3.650911346237515, "grad_norm": 2.2855774214290454, "learning_rate": 8.007864679088404e-06, "loss": 0.5854, "step": 98600 }, { "epoch": 3.654614124246716, "grad_norm": 2.029891712543651, "learning_rate": 8.00269980202599e-06, "loss": 0.5637, "step": 98700 }, { "epoch": 3.6583169022559177, "grad_norm": 1.9365812857506421, "learning_rate": 7.997529908774563e-06, "loss": 0.5815, "step": 98800 }, { "epoch": 3.662019680265119, "grad_norm": 2.3809406520922876, "learning_rate": 7.992355007970743e-06, "loss": 0.5607, "step": 98900 }, { "epoch": 3.6657224582743204, "grad_norm": 2.4472961752647624, "learning_rate": 7.987175108259502e-06, "loss": 0.582, "step": 99000 }, { "epoch": 3.6694252362835216, "grad_norm": 2.297073497529406, "learning_rate": 7.981990218294177e-06, "loss": 0.5853, "step": 99100 }, { "epoch": 3.673128014292723, "grad_norm": 2.193654781760126, "learning_rate": 7.976800346736433e-06, "loss": 0.5567, "step": 99200 }, { "epoch": 3.6768307923019243, "grad_norm": 2.5120527380596744, "learning_rate": 7.971605502256264e-06, "loss": 0.5724, "step": 99300 }, { "epoch": 3.680533570311126, "grad_norm": 2.986360231553051, "learning_rate": 7.96640569353196e-06, "loss": 0.5966, "step": 99400 }, { "epoch": 3.6842363483203275, "grad_norm": 2.5385534409935038, "learning_rate": 7.96120092925012e-06, "loss": 0.56, "step": 99500 }, { "epoch": 3.6879391263295287, "grad_norm": 2.396788300046507, "learning_rate": 7.955991218105605e-06, "loss": 0.5832, "step": 99600 }, { "epoch": 3.6916419043387303, "grad_norm": 1.945591340982587, "learning_rate": 7.950776568801553e-06, "loss": 0.5661, "step": 99700 }, { "epoch": 3.6953446823479315, "grad_norm": 2.4626957403423413, "learning_rate": 7.945556990049346e-06, "loss": 0.5686, "step": 99800 }, { "epoch": 3.699047460357133, "grad_norm": 2.543084552240699, "learning_rate": 7.9403324905686e-06, "loss": 0.5834, "step": 99900 }, { "epoch": 3.702750238366334, "grad_norm": 2.7278130553270477, "learning_rate": 7.935103079087152e-06, "loss": 0.5828, "step": 100000 }, { "epoch": 3.706453016375536, "grad_norm": 1.8553355956612076, "learning_rate": 7.929868764341052e-06, "loss": 0.5661, "step": 100100 }, { "epoch": 3.710155794384737, "grad_norm": 3.2563785291340923, "learning_rate": 7.924629555074524e-06, "loss": 0.5602, "step": 100200 }, { "epoch": 3.7138585723939386, "grad_norm": 2.3695115820488293, "learning_rate": 7.919385460039989e-06, "loss": 0.5832, "step": 100300 }, { "epoch": 3.71756135040314, "grad_norm": 2.404279027873752, "learning_rate": 7.914136487998017e-06, "loss": 0.5804, "step": 100400 }, { "epoch": 3.7212641284123413, "grad_norm": 2.5267151386892217, "learning_rate": 7.908882647717327e-06, "loss": 0.5649, "step": 100500 }, { "epoch": 3.7249669064215425, "grad_norm": 2.2440627793598753, "learning_rate": 7.903623947974773e-06, "loss": 0.567, "step": 100600 }, { "epoch": 3.728669684430744, "grad_norm": 2.061954908732842, "learning_rate": 7.898360397555328e-06, "loss": 0.5715, "step": 100700 }, { "epoch": 3.7323724624399457, "grad_norm": 3.026808498362811, "learning_rate": 7.893092005252066e-06, "loss": 0.5922, "step": 100800 }, { "epoch": 3.736075240449147, "grad_norm": 2.113166407337317, "learning_rate": 7.88781877986615e-06, "loss": 0.5628, "step": 100900 }, { "epoch": 3.7397780184583485, "grad_norm": 2.4885840764165867, "learning_rate": 7.882540730206816e-06, "loss": 0.5643, "step": 101000 }, { "epoch": 3.7434807964675496, "grad_norm": 2.1881705767086195, "learning_rate": 7.877257865091365e-06, "loss": 0.558, "step": 101100 }, { "epoch": 3.747183574476751, "grad_norm": 2.4376674497107067, "learning_rate": 7.871970193345135e-06, "loss": 0.5845, "step": 101200 }, { "epoch": 3.750886352485953, "grad_norm": 1.9827152023090446, "learning_rate": 7.866677723801494e-06, "loss": 0.5746, "step": 101300 }, { "epoch": 3.754589130495154, "grad_norm": 2.1297068413680114, "learning_rate": 7.861380465301835e-06, "loss": 0.5683, "step": 101400 }, { "epoch": 3.758291908504355, "grad_norm": 2.242495816229493, "learning_rate": 7.856078426695538e-06, "loss": 0.5802, "step": 101500 }, { "epoch": 3.7619946865135567, "grad_norm": 2.6382227670969827, "learning_rate": 7.850771616839979e-06, "loss": 0.584, "step": 101600 }, { "epoch": 3.7656974645227583, "grad_norm": 2.0106184503255284, "learning_rate": 7.845460044600495e-06, "loss": 0.5695, "step": 101700 }, { "epoch": 3.7694002425319595, "grad_norm": 2.2094809386507586, "learning_rate": 7.840143718850388e-06, "loss": 0.5665, "step": 101800 }, { "epoch": 3.773103020541161, "grad_norm": 2.169175236796824, "learning_rate": 7.834822648470894e-06, "loss": 0.5833, "step": 101900 }, { "epoch": 3.7768057985503622, "grad_norm": 2.1238304296348987, "learning_rate": 7.829496842351184e-06, "loss": 0.5639, "step": 102000 }, { "epoch": 3.780508576559564, "grad_norm": 2.2584052114903215, "learning_rate": 7.824166309388327e-06, "loss": 0.579, "step": 102100 }, { "epoch": 3.7842113545687655, "grad_norm": 2.7273859685599753, "learning_rate": 7.818831058487303e-06, "loss": 0.5743, "step": 102200 }, { "epoch": 3.7879141325779666, "grad_norm": 2.76040010501173, "learning_rate": 7.813491098560963e-06, "loss": 0.5682, "step": 102300 }, { "epoch": 3.7916169105871678, "grad_norm": 2.65951986570608, "learning_rate": 7.808146438530029e-06, "loss": 0.5664, "step": 102400 }, { "epoch": 3.7953196885963694, "grad_norm": 2.0871614540792347, "learning_rate": 7.802797087323072e-06, "loss": 0.5608, "step": 102500 }, { "epoch": 3.799022466605571, "grad_norm": 2.419589313489518, "learning_rate": 7.797443053876505e-06, "loss": 0.5774, "step": 102600 }, { "epoch": 3.802725244614772, "grad_norm": 2.0066811497036743, "learning_rate": 7.79208434713456e-06, "loss": 0.5682, "step": 102700 }, { "epoch": 3.8064280226239737, "grad_norm": 2.079851678078465, "learning_rate": 7.786720976049276e-06, "loss": 0.5702, "step": 102800 }, { "epoch": 3.810130800633175, "grad_norm": 2.451489740018321, "learning_rate": 7.781352949580484e-06, "loss": 0.5597, "step": 102900 }, { "epoch": 3.8138335786423765, "grad_norm": 3.011933968921174, "learning_rate": 7.775980276695789e-06, "loss": 0.5762, "step": 103000 }, { "epoch": 3.817536356651578, "grad_norm": 2.3641130679614526, "learning_rate": 7.770602966370566e-06, "loss": 0.5651, "step": 103100 }, { "epoch": 3.8212391346607792, "grad_norm": 2.173061672532996, "learning_rate": 7.76522102758793e-06, "loss": 0.5717, "step": 103200 }, { "epoch": 3.8249419126699804, "grad_norm": 2.199985084049493, "learning_rate": 7.75983446933873e-06, "loss": 0.6023, "step": 103300 }, { "epoch": 3.828644690679182, "grad_norm": 2.57020524379545, "learning_rate": 7.754443300621533e-06, "loss": 0.5714, "step": 103400 }, { "epoch": 3.8323474686883836, "grad_norm": 3.072418942579769, "learning_rate": 7.74904753044261e-06, "loss": 0.5935, "step": 103500 }, { "epoch": 3.8360502466975848, "grad_norm": 2.5499661749782208, "learning_rate": 7.743647167815914e-06, "loss": 0.5683, "step": 103600 }, { "epoch": 3.8397530247067864, "grad_norm": 1.8258327506561032, "learning_rate": 7.738242221763073e-06, "loss": 0.5666, "step": 103700 }, { "epoch": 3.8434558027159875, "grad_norm": 2.5632297493493224, "learning_rate": 7.732832701313374e-06, "loss": 0.5848, "step": 103800 }, { "epoch": 3.847158580725189, "grad_norm": 2.2148423084289055, "learning_rate": 7.727418615503741e-06, "loss": 0.573, "step": 103900 }, { "epoch": 3.8508613587343907, "grad_norm": 2.602219638764699, "learning_rate": 7.721999973378727e-06, "loss": 0.5711, "step": 104000 }, { "epoch": 3.854564136743592, "grad_norm": 2.4059453328622387, "learning_rate": 7.716576783990498e-06, "loss": 0.5585, "step": 104100 }, { "epoch": 3.858266914752793, "grad_norm": 2.4798681586394347, "learning_rate": 7.711149056398815e-06, "loss": 0.5648, "step": 104200 }, { "epoch": 3.8619696927619946, "grad_norm": 2.469160236812727, "learning_rate": 7.705716799671019e-06, "loss": 0.5809, "step": 104300 }, { "epoch": 3.8656724707711962, "grad_norm": 2.848952673170172, "learning_rate": 7.700280022882021e-06, "loss": 0.5689, "step": 104400 }, { "epoch": 3.8693752487803974, "grad_norm": 2.346993704098933, "learning_rate": 7.69483873511428e-06, "loss": 0.5673, "step": 104500 }, { "epoch": 3.873078026789599, "grad_norm": 2.3360510936903487, "learning_rate": 7.68939294545779e-06, "loss": 0.5734, "step": 104600 }, { "epoch": 3.8767808047988, "grad_norm": 2.3911751387088658, "learning_rate": 7.683942663010067e-06, "loss": 0.5626, "step": 104700 }, { "epoch": 3.8804835828080018, "grad_norm": 2.1068941554476255, "learning_rate": 7.678487896876136e-06, "loss": 0.5685, "step": 104800 }, { "epoch": 3.8841863608172034, "grad_norm": 2.732073324262785, "learning_rate": 7.673028656168505e-06, "loss": 0.5698, "step": 104900 }, { "epoch": 3.8878891388264045, "grad_norm": 2.6956984639443475, "learning_rate": 7.667564950007165e-06, "loss": 0.5714, "step": 105000 }, { "epoch": 3.8915919168356057, "grad_norm": 2.303792318809918, "learning_rate": 7.66209678751956e-06, "loss": 0.5702, "step": 105100 }, { "epoch": 3.8952946948448073, "grad_norm": 2.0392508601683588, "learning_rate": 7.65662417784058e-06, "loss": 0.5663, "step": 105200 }, { "epoch": 3.898997472854009, "grad_norm": 2.5065113240609938, "learning_rate": 7.651147130112552e-06, "loss": 0.5838, "step": 105300 }, { "epoch": 3.90270025086321, "grad_norm": 2.3018440604181607, "learning_rate": 7.645665653485205e-06, "loss": 0.5709, "step": 105400 }, { "epoch": 3.9064030288724116, "grad_norm": 2.7348370280337626, "learning_rate": 7.640179757115676e-06, "loss": 0.5547, "step": 105500 }, { "epoch": 3.910105806881613, "grad_norm": 1.9716787422903184, "learning_rate": 7.634689450168475e-06, "loss": 0.5653, "step": 105600 }, { "epoch": 3.9138085848908144, "grad_norm": 2.5195323475854714, "learning_rate": 7.629194741815498e-06, "loss": 0.5461, "step": 105700 }, { "epoch": 3.917511362900016, "grad_norm": 3.0429879329203864, "learning_rate": 7.623695641235973e-06, "loss": 0.5591, "step": 105800 }, { "epoch": 3.921214140909217, "grad_norm": 3.1289920864933722, "learning_rate": 7.618192157616481e-06, "loss": 0.5566, "step": 105900 }, { "epoch": 3.9249169189184183, "grad_norm": 2.351111876147374, "learning_rate": 7.612684300150915e-06, "loss": 0.5686, "step": 106000 }, { "epoch": 3.92861969692762, "grad_norm": 2.3371509447626013, "learning_rate": 7.607172078040486e-06, "loss": 0.5654, "step": 106100 }, { "epoch": 3.9323224749368215, "grad_norm": 2.4947252960442414, "learning_rate": 7.601655500493683e-06, "loss": 0.5694, "step": 106200 }, { "epoch": 3.9360252529460227, "grad_norm": 2.235493588527231, "learning_rate": 7.596134576726281e-06, "loss": 0.5701, "step": 106300 }, { "epoch": 3.9397280309552243, "grad_norm": 2.142549043508473, "learning_rate": 7.59060931596131e-06, "loss": 0.585, "step": 106400 }, { "epoch": 3.9434308089644254, "grad_norm": 2.040006863189977, "learning_rate": 7.5850797274290496e-06, "loss": 0.5751, "step": 106500 }, { "epoch": 3.947133586973627, "grad_norm": 2.4360294335571675, "learning_rate": 7.579545820367007e-06, "loss": 0.5725, "step": 106600 }, { "epoch": 3.9508363649828286, "grad_norm": 2.114533906634112, "learning_rate": 7.574007604019904e-06, "loss": 0.5738, "step": 106700 }, { "epoch": 3.95453914299203, "grad_norm": 2.1700037480355245, "learning_rate": 7.568465087639658e-06, "loss": 0.5543, "step": 106800 }, { "epoch": 3.958241921001231, "grad_norm": 2.276195584109169, "learning_rate": 7.562918280485377e-06, "loss": 0.5579, "step": 106900 }, { "epoch": 3.9619446990104326, "grad_norm": 2.640745399198246, "learning_rate": 7.55736719182333e-06, "loss": 0.5746, "step": 107000 }, { "epoch": 3.965647477019634, "grad_norm": 2.390510274706146, "learning_rate": 7.551811830926945e-06, "loss": 0.5682, "step": 107100 }, { "epoch": 3.9693502550288353, "grad_norm": 2.6749570381335945, "learning_rate": 7.546252207076781e-06, "loss": 0.5555, "step": 107200 }, { "epoch": 3.973053033038037, "grad_norm": 2.0639413640436786, "learning_rate": 7.540688329560519e-06, "loss": 0.574, "step": 107300 }, { "epoch": 3.976755811047238, "grad_norm": 2.3147046981656576, "learning_rate": 7.535120207672953e-06, "loss": 0.5737, "step": 107400 }, { "epoch": 3.9804585890564397, "grad_norm": 2.697404766700257, "learning_rate": 7.529547850715959e-06, "loss": 0.5564, "step": 107500 }, { "epoch": 3.984161367065641, "grad_norm": 1.9157798877306975, "learning_rate": 7.523971267998493e-06, "loss": 0.5591, "step": 107600 }, { "epoch": 3.9878641450748424, "grad_norm": 2.2153796793467277, "learning_rate": 7.5183904688365675e-06, "loss": 0.5575, "step": 107700 }, { "epoch": 3.9915669230840436, "grad_norm": 2.448919609671325, "learning_rate": 7.512805462553241e-06, "loss": 0.5837, "step": 107800 }, { "epoch": 3.995269701093245, "grad_norm": 2.104625885076626, "learning_rate": 7.5072162584786e-06, "loss": 0.5806, "step": 107900 }, { "epoch": 3.998972479102447, "grad_norm": 2.183908285051534, "learning_rate": 7.501622865949745e-06, "loss": 0.5701, "step": 108000 }, { "epoch": 4.002666000166625, "grad_norm": 2.395119264367867, "learning_rate": 7.496025294310765e-06, "loss": 0.4925, "step": 108100 }, { "epoch": 4.006368778175826, "grad_norm": 2.6395448326626525, "learning_rate": 7.490423552912744e-06, "loss": 0.4714, "step": 108200 }, { "epoch": 4.010071556185028, "grad_norm": 2.7990565305728716, "learning_rate": 7.484817651113721e-06, "loss": 0.4793, "step": 108300 }, { "epoch": 4.013774334194229, "grad_norm": 2.657780242575247, "learning_rate": 7.47920759827869e-06, "loss": 0.4718, "step": 108400 }, { "epoch": 4.017477112203431, "grad_norm": 2.202042378392529, "learning_rate": 7.47359340377958e-06, "loss": 0.4566, "step": 108500 }, { "epoch": 4.021179890212632, "grad_norm": 2.390996517741498, "learning_rate": 7.467975076995237e-06, "loss": 0.4823, "step": 108600 }, { "epoch": 4.024882668221833, "grad_norm": 2.1233934746784136, "learning_rate": 7.462352627311409e-06, "loss": 0.4801, "step": 108700 }, { "epoch": 4.028585446231035, "grad_norm": 2.5482309137919676, "learning_rate": 7.456726064120736e-06, "loss": 0.4691, "step": 108800 }, { "epoch": 4.0322882242402365, "grad_norm": 2.5219433351008083, "learning_rate": 7.451095396822725e-06, "loss": 0.468, "step": 108900 }, { "epoch": 4.035991002249438, "grad_norm": 3.142738994028247, "learning_rate": 7.445460634823742e-06, "loss": 0.467, "step": 109000 }, { "epoch": 4.039693780258639, "grad_norm": 2.542251560826263, "learning_rate": 7.439821787536994e-06, "loss": 0.4838, "step": 109100 }, { "epoch": 4.04339655826784, "grad_norm": 2.347903021222968, "learning_rate": 7.434178864382511e-06, "loss": 0.4633, "step": 109200 }, { "epoch": 4.047099336277042, "grad_norm": 3.4098999775052232, "learning_rate": 7.428531874787132e-06, "loss": 0.4605, "step": 109300 }, { "epoch": 4.050802114286244, "grad_norm": 2.6445332221671967, "learning_rate": 7.422880828184489e-06, "loss": 0.4613, "step": 109400 }, { "epoch": 4.054504892295444, "grad_norm": 2.1796472899817068, "learning_rate": 7.417225734014994e-06, "loss": 0.4531, "step": 109500 }, { "epoch": 4.058207670304646, "grad_norm": 2.1374931848283776, "learning_rate": 7.411566601725817e-06, "loss": 0.4762, "step": 109600 }, { "epoch": 4.0619104483138475, "grad_norm": 2.0754734993699406, "learning_rate": 7.405903440770878e-06, "loss": 0.4728, "step": 109700 }, { "epoch": 4.065613226323049, "grad_norm": 2.942741298679025, "learning_rate": 7.400236260610824e-06, "loss": 0.4818, "step": 109800 }, { "epoch": 4.069316004332251, "grad_norm": 2.7635971054292376, "learning_rate": 7.3945650707130154e-06, "loss": 0.489, "step": 109900 }, { "epoch": 4.073018782341451, "grad_norm": 2.4020572333282844, "learning_rate": 7.388889880551516e-06, "loss": 0.4617, "step": 110000 }, { "epoch": 4.076721560350653, "grad_norm": 2.4937066619377206, "learning_rate": 7.383210699607068e-06, "loss": 0.4704, "step": 110100 }, { "epoch": 4.080424338359855, "grad_norm": 2.248393819641373, "learning_rate": 7.3775275373670816e-06, "loss": 0.4794, "step": 110200 }, { "epoch": 4.084127116369056, "grad_norm": 2.5751949229412374, "learning_rate": 7.3718404033256185e-06, "loss": 0.4734, "step": 110300 }, { "epoch": 4.087829894378257, "grad_norm": 2.8620226992546947, "learning_rate": 7.366149306983374e-06, "loss": 0.4733, "step": 110400 }, { "epoch": 4.0915326723874585, "grad_norm": 2.796077670071193, "learning_rate": 7.3604542578476645e-06, "loss": 0.4701, "step": 110500 }, { "epoch": 4.09523545039666, "grad_norm": 2.9311946283814185, "learning_rate": 7.3547552654324104e-06, "loss": 0.4842, "step": 110600 }, { "epoch": 4.098938228405862, "grad_norm": 2.852888672259453, "learning_rate": 7.349052339258117e-06, "loss": 0.4669, "step": 110700 }, { "epoch": 4.1026410064150625, "grad_norm": 2.52314905910618, "learning_rate": 7.3433454888518605e-06, "loss": 0.4688, "step": 110800 }, { "epoch": 4.106343784424264, "grad_norm": 3.4498187128967728, "learning_rate": 7.337634723747279e-06, "loss": 0.4788, "step": 110900 }, { "epoch": 4.110046562433466, "grad_norm": 2.5428098266591648, "learning_rate": 7.3319200534845425e-06, "loss": 0.4713, "step": 111000 }, { "epoch": 4.113749340442667, "grad_norm": 2.5363328616978973, "learning_rate": 7.326201487610349e-06, "loss": 0.4811, "step": 111100 }, { "epoch": 4.117452118451869, "grad_norm": 2.7225591788476975, "learning_rate": 7.320479035677904e-06, "loss": 0.4905, "step": 111200 }, { "epoch": 4.12115489646107, "grad_norm": 2.7325604110348753, "learning_rate": 7.314752707246907e-06, "loss": 0.4877, "step": 111300 }, { "epoch": 4.124857674470271, "grad_norm": 2.352613493618641, "learning_rate": 7.309022511883528e-06, "loss": 0.4859, "step": 111400 }, { "epoch": 4.128560452479473, "grad_norm": 2.3856483744938095, "learning_rate": 7.303288459160399e-06, "loss": 0.4837, "step": 111500 }, { "epoch": 4.132263230488674, "grad_norm": 2.6982186905887517, "learning_rate": 7.2975505586565994e-06, "loss": 0.4763, "step": 111600 }, { "epoch": 4.135966008497875, "grad_norm": 3.123665016743717, "learning_rate": 7.291808819957633e-06, "loss": 0.4824, "step": 111700 }, { "epoch": 4.139668786507077, "grad_norm": 2.417352812020203, "learning_rate": 7.286063252655418e-06, "loss": 0.4731, "step": 111800 }, { "epoch": 4.143371564516278, "grad_norm": 2.442966867335525, "learning_rate": 7.280313866348264e-06, "loss": 0.4783, "step": 111900 }, { "epoch": 4.14707434252548, "grad_norm": 2.7158007224063474, "learning_rate": 7.2745606706408664e-06, "loss": 0.4836, "step": 112000 }, { "epoch": 4.1507771205346815, "grad_norm": 2.165933059326296, "learning_rate": 7.268803675144281e-06, "loss": 0.4595, "step": 112100 }, { "epoch": 4.154479898543882, "grad_norm": 2.6736184167249895, "learning_rate": 7.2630428894759095e-06, "loss": 0.4843, "step": 112200 }, { "epoch": 4.158182676553084, "grad_norm": 2.6912674696352736, "learning_rate": 7.257278323259491e-06, "loss": 0.4746, "step": 112300 }, { "epoch": 4.161885454562285, "grad_norm": 2.2719100741463096, "learning_rate": 7.251509986125076e-06, "loss": 0.4652, "step": 112400 }, { "epoch": 4.165588232571487, "grad_norm": 2.829871124357899, "learning_rate": 7.245737887709016e-06, "loss": 0.4892, "step": 112500 }, { "epoch": 4.169291010580688, "grad_norm": 3.1146572045771057, "learning_rate": 7.239962037653944e-06, "loss": 0.4903, "step": 112600 }, { "epoch": 4.172993788589889, "grad_norm": 2.8441029614216706, "learning_rate": 7.234182445608762e-06, "loss": 0.4751, "step": 112700 }, { "epoch": 4.176696566599091, "grad_norm": 2.277393018580735, "learning_rate": 7.228399121228624e-06, "loss": 0.4798, "step": 112800 }, { "epoch": 4.1803993446082925, "grad_norm": 2.2641109513573157, "learning_rate": 7.222612074174919e-06, "loss": 0.4892, "step": 112900 }, { "epoch": 4.184102122617494, "grad_norm": 2.903747599217592, "learning_rate": 7.2168213141152525e-06, "loss": 0.4837, "step": 113000 }, { "epoch": 4.187804900626695, "grad_norm": 2.4190241406262434, "learning_rate": 7.211026850723433e-06, "loss": 0.4743, "step": 113100 }, { "epoch": 4.1915076786358965, "grad_norm": 2.064493835192583, "learning_rate": 7.205228693679462e-06, "loss": 0.4825, "step": 113200 }, { "epoch": 4.195210456645098, "grad_norm": 2.2071643327553683, "learning_rate": 7.199426852669499e-06, "loss": 0.4755, "step": 113300 }, { "epoch": 4.1989132346543, "grad_norm": 2.3541750344044434, "learning_rate": 7.1936213373858725e-06, "loss": 0.4681, "step": 113400 }, { "epoch": 4.2026160126635, "grad_norm": 2.365991928745428, "learning_rate": 7.187812157527037e-06, "loss": 0.49, "step": 113500 }, { "epoch": 4.206318790672702, "grad_norm": 2.7718180163744535, "learning_rate": 7.181999322797577e-06, "loss": 0.4851, "step": 113600 }, { "epoch": 4.210021568681904, "grad_norm": 2.8097335679028834, "learning_rate": 7.176182842908177e-06, "loss": 0.4906, "step": 113700 }, { "epoch": 4.213724346691105, "grad_norm": 3.10265320580523, "learning_rate": 7.170362727575615e-06, "loss": 0.4876, "step": 113800 }, { "epoch": 4.217427124700307, "grad_norm": 4.397469918720515, "learning_rate": 7.16453898652274e-06, "loss": 0.4806, "step": 113900 }, { "epoch": 4.2211299027095075, "grad_norm": 2.573865293429817, "learning_rate": 7.158711629478458e-06, "loss": 0.4756, "step": 114000 }, { "epoch": 4.224832680718709, "grad_norm": 3.0609065438135756, "learning_rate": 7.152880666177717e-06, "loss": 0.4865, "step": 114100 }, { "epoch": 4.228535458727911, "grad_norm": 2.7903782545588434, "learning_rate": 7.147046106361488e-06, "loss": 0.4673, "step": 114200 }, { "epoch": 4.232238236737112, "grad_norm": 3.5068683286409645, "learning_rate": 7.141207959776752e-06, "loss": 0.4883, "step": 114300 }, { "epoch": 4.235941014746313, "grad_norm": 2.306977865812599, "learning_rate": 7.135366236176482e-06, "loss": 0.4788, "step": 114400 }, { "epoch": 4.239643792755515, "grad_norm": 2.1841288423962815, "learning_rate": 7.129520945319622e-06, "loss": 0.4847, "step": 114500 }, { "epoch": 4.243346570764716, "grad_norm": 2.51779510784242, "learning_rate": 7.1236720969710814e-06, "loss": 0.4727, "step": 114600 }, { "epoch": 4.247049348773918, "grad_norm": 2.254764368256725, "learning_rate": 7.11781970090171e-06, "loss": 0.4837, "step": 114700 }, { "epoch": 4.250752126783119, "grad_norm": 2.3339475004598125, "learning_rate": 7.111963766888283e-06, "loss": 0.4771, "step": 114800 }, { "epoch": 4.25445490479232, "grad_norm": 2.0973123847204023, "learning_rate": 7.106104304713488e-06, "loss": 0.4819, "step": 114900 }, { "epoch": 4.258157682801522, "grad_norm": 2.255168443692503, "learning_rate": 7.1002413241659064e-06, "loss": 0.4795, "step": 115000 }, { "epoch": 4.261860460810723, "grad_norm": 2.194734035175973, "learning_rate": 7.0943748350399966e-06, "loss": 0.4919, "step": 115100 }, { "epoch": 4.265563238819925, "grad_norm": 2.598157851828195, "learning_rate": 7.088504847136077e-06, "loss": 0.4906, "step": 115200 }, { "epoch": 4.269266016829126, "grad_norm": 2.0944867539920287, "learning_rate": 7.082631370260313e-06, "loss": 0.4996, "step": 115300 }, { "epoch": 4.272968794838327, "grad_norm": 2.869614618117374, "learning_rate": 7.076754414224699e-06, "loss": 0.4864, "step": 115400 }, { "epoch": 4.276671572847529, "grad_norm": 2.9103037050777316, "learning_rate": 7.070873988847039e-06, "loss": 0.4922, "step": 115500 }, { "epoch": 4.2803743508567305, "grad_norm": 2.349057828429775, "learning_rate": 7.064990103950934e-06, "loss": 0.4966, "step": 115600 }, { "epoch": 4.284077128865932, "grad_norm": 2.299484224604513, "learning_rate": 7.059102769365767e-06, "loss": 0.4944, "step": 115700 }, { "epoch": 4.287779906875133, "grad_norm": 3.0241630100591386, "learning_rate": 7.053211994926677e-06, "loss": 0.4842, "step": 115800 }, { "epoch": 4.291482684884334, "grad_norm": 2.729346255592703, "learning_rate": 7.0473177904745585e-06, "loss": 0.4897, "step": 115900 }, { "epoch": 4.295185462893536, "grad_norm": 3.0009180265569775, "learning_rate": 7.041420165856027e-06, "loss": 0.4753, "step": 116000 }, { "epoch": 4.298888240902738, "grad_norm": 2.163760802374818, "learning_rate": 7.03551913092342e-06, "loss": 0.481, "step": 116100 }, { "epoch": 4.302591018911938, "grad_norm": 2.707307153881491, "learning_rate": 7.029614695534763e-06, "loss": 0.4826, "step": 116200 }, { "epoch": 4.30629379692114, "grad_norm": 2.3828944877491773, "learning_rate": 7.0237068695537745e-06, "loss": 0.4816, "step": 116300 }, { "epoch": 4.3099965749303415, "grad_norm": 2.6742507912562132, "learning_rate": 7.017795662849824e-06, "loss": 0.4849, "step": 116400 }, { "epoch": 4.313699352939543, "grad_norm": 2.3496824589903063, "learning_rate": 7.011881085297938e-06, "loss": 0.4891, "step": 116500 }, { "epoch": 4.317402130948745, "grad_norm": 2.7195836235745885, "learning_rate": 7.005963146778769e-06, "loss": 0.4724, "step": 116600 }, { "epoch": 4.321104908957945, "grad_norm": 2.526361555572989, "learning_rate": 7.0000418571785855e-06, "loss": 0.5009, "step": 116700 }, { "epoch": 4.324807686967147, "grad_norm": 2.445359274585664, "learning_rate": 6.994117226389257e-06, "loss": 0.4912, "step": 116800 }, { "epoch": 4.328510464976349, "grad_norm": 2.207776419802179, "learning_rate": 6.988189264308231e-06, "loss": 0.486, "step": 116900 }, { "epoch": 4.33221324298555, "grad_norm": 2.0860921386053417, "learning_rate": 6.982257980838522e-06, "loss": 0.4831, "step": 117000 }, { "epoch": 4.335916020994751, "grad_norm": 3.530286859450668, "learning_rate": 6.976323385888693e-06, "loss": 0.4956, "step": 117100 }, { "epoch": 4.3396187990039525, "grad_norm": 2.775205999729146, "learning_rate": 6.970385489372836e-06, "loss": 0.4797, "step": 117200 }, { "epoch": 4.343321577013154, "grad_norm": 2.7177539828762374, "learning_rate": 6.964444301210564e-06, "loss": 0.5003, "step": 117300 }, { "epoch": 4.347024355022356, "grad_norm": 2.34328467272709, "learning_rate": 6.9584998313269855e-06, "loss": 0.4749, "step": 117400 }, { "epoch": 4.350727133031557, "grad_norm": 2.775313740741385, "learning_rate": 6.952552089652692e-06, "loss": 0.4793, "step": 117500 }, { "epoch": 4.354429911040758, "grad_norm": 3.0224940111862204, "learning_rate": 6.94660108612374e-06, "loss": 0.4696, "step": 117600 }, { "epoch": 4.35813268904996, "grad_norm": 2.2249952754004356, "learning_rate": 6.940646830681635e-06, "loss": 0.4786, "step": 117700 }, { "epoch": 4.361835467059161, "grad_norm": 2.0731088857063296, "learning_rate": 6.9346893332733155e-06, "loss": 0.4764, "step": 117800 }, { "epoch": 4.365538245068363, "grad_norm": 2.27152895471557, "learning_rate": 6.9287286038511384e-06, "loss": 0.4855, "step": 117900 }, { "epoch": 4.369241023077564, "grad_norm": 3.595046461290223, "learning_rate": 6.922764652372854e-06, "loss": 0.4829, "step": 118000 }, { "epoch": 4.372943801086765, "grad_norm": 2.109141970755987, "learning_rate": 6.916797488801603e-06, "loss": 0.4939, "step": 118100 }, { "epoch": 4.376646579095967, "grad_norm": 3.5600173477932517, "learning_rate": 6.9108271231058836e-06, "loss": 0.4711, "step": 118200 }, { "epoch": 4.380349357105168, "grad_norm": 2.2260449197789587, "learning_rate": 6.904853565259549e-06, "loss": 0.4858, "step": 118300 }, { "epoch": 4.38405213511437, "grad_norm": 2.781930220607348, "learning_rate": 6.898876825241786e-06, "loss": 0.4943, "step": 118400 }, { "epoch": 4.387754913123571, "grad_norm": 2.5821939985590676, "learning_rate": 6.892896913037091e-06, "loss": 0.4854, "step": 118500 }, { "epoch": 4.391457691132772, "grad_norm": 2.6725161574141345, "learning_rate": 6.886913838635269e-06, "loss": 0.4839, "step": 118600 }, { "epoch": 4.395160469141974, "grad_norm": 2.5550505106542025, "learning_rate": 6.880927612031396e-06, "loss": 0.4777, "step": 118700 }, { "epoch": 4.3988632471511755, "grad_norm": 1.8459806917625088, "learning_rate": 6.874938243225828e-06, "loss": 0.4979, "step": 118800 }, { "epoch": 4.402566025160376, "grad_norm": 2.6859874695671326, "learning_rate": 6.868945742224154e-06, "loss": 0.4796, "step": 118900 }, { "epoch": 4.406268803169578, "grad_norm": 2.9259685564314437, "learning_rate": 6.862950119037213e-06, "loss": 0.4888, "step": 119000 }, { "epoch": 4.409971581178779, "grad_norm": 2.8645089090700813, "learning_rate": 6.856951383681043e-06, "loss": 0.48, "step": 119100 }, { "epoch": 4.413674359187981, "grad_norm": 2.460515552982391, "learning_rate": 6.850949546176895e-06, "loss": 0.4926, "step": 119200 }, { "epoch": 4.417377137197182, "grad_norm": 2.46857405112397, "learning_rate": 6.8449446165511935e-06, "loss": 0.4868, "step": 119300 }, { "epoch": 4.421079915206383, "grad_norm": 2.5605238616345147, "learning_rate": 6.838936604835531e-06, "loss": 0.4896, "step": 119400 }, { "epoch": 4.424782693215585, "grad_norm": 2.557683620446568, "learning_rate": 6.8329255210666505e-06, "loss": 0.4864, "step": 119500 }, { "epoch": 4.4284854712247865, "grad_norm": 2.7597256139440973, "learning_rate": 6.826911375286424e-06, "loss": 0.4722, "step": 119600 }, { "epoch": 4.432188249233988, "grad_norm": 2.3140313063999796, "learning_rate": 6.820894177541843e-06, "loss": 0.4836, "step": 119700 }, { "epoch": 4.435891027243189, "grad_norm": 2.839887894077623, "learning_rate": 6.814873937884994e-06, "loss": 0.4818, "step": 119800 }, { "epoch": 4.4395938052523904, "grad_norm": 2.6453728960622294, "learning_rate": 6.808850666373046e-06, "loss": 0.4867, "step": 119900 }, { "epoch": 4.443296583261592, "grad_norm": 2.7029127305705303, "learning_rate": 6.802824373068232e-06, "loss": 0.4738, "step": 120000 }, { "epoch": 4.446999361270794, "grad_norm": 2.3660030930629885, "learning_rate": 6.7967950680378355e-06, "loss": 0.4913, "step": 120100 }, { "epoch": 4.450702139279995, "grad_norm": 2.3742474705309884, "learning_rate": 6.790762761354172e-06, "loss": 0.496, "step": 120200 }, { "epoch": 4.454404917289196, "grad_norm": 3.287355549713654, "learning_rate": 6.784727463094565e-06, "loss": 0.4968, "step": 120300 }, { "epoch": 4.458107695298398, "grad_norm": 3.3136534670261484, "learning_rate": 6.7786891833413435e-06, "loss": 0.4675, "step": 120400 }, { "epoch": 4.461810473307599, "grad_norm": 2.264704600641698, "learning_rate": 6.7726479321818105e-06, "loss": 0.4953, "step": 120500 }, { "epoch": 4.465513251316801, "grad_norm": 3.1512463382364437, "learning_rate": 6.766603719708241e-06, "loss": 0.4756, "step": 120600 }, { "epoch": 4.4692160293260015, "grad_norm": 2.461294636247639, "learning_rate": 6.7605565560178475e-06, "loss": 0.4797, "step": 120700 }, { "epoch": 4.472918807335203, "grad_norm": 2.596205766172642, "learning_rate": 6.7545064512127815e-06, "loss": 0.4977, "step": 120800 }, { "epoch": 4.476621585344405, "grad_norm": 2.665235552678545, "learning_rate": 6.748453415400098e-06, "loss": 0.4933, "step": 120900 }, { "epoch": 4.480324363353606, "grad_norm": 2.555271924225678, "learning_rate": 6.742397458691761e-06, "loss": 0.4728, "step": 121000 }, { "epoch": 4.484027141362807, "grad_norm": 2.721727508686565, "learning_rate": 6.736338591204602e-06, "loss": 0.4877, "step": 121100 }, { "epoch": 4.487729919372009, "grad_norm": 2.416527347314746, "learning_rate": 6.730276823060321e-06, "loss": 0.4827, "step": 121200 }, { "epoch": 4.49143269738121, "grad_norm": 2.226670431352718, "learning_rate": 6.724212164385467e-06, "loss": 0.4777, "step": 121300 }, { "epoch": 4.495135475390412, "grad_norm": 3.1512271155340614, "learning_rate": 6.718144625311409e-06, "loss": 0.4896, "step": 121400 }, { "epoch": 4.498838253399613, "grad_norm": 2.584678532712773, "learning_rate": 6.712074215974337e-06, "loss": 0.4821, "step": 121500 }, { "epoch": 4.502541031408814, "grad_norm": 3.3685805212604976, "learning_rate": 6.706000946515228e-06, "loss": 0.4785, "step": 121600 }, { "epoch": 4.506243809418016, "grad_norm": 2.4277100908526017, "learning_rate": 6.6999248270798426e-06, "loss": 0.5073, "step": 121700 }, { "epoch": 4.509946587427217, "grad_norm": 2.1631619252225334, "learning_rate": 6.6938458678187e-06, "loss": 0.4779, "step": 121800 }, { "epoch": 4.513649365436419, "grad_norm": 2.6345103994604475, "learning_rate": 6.687764078887064e-06, "loss": 0.4821, "step": 121900 }, { "epoch": 4.5173521434456205, "grad_norm": 3.1504096303856173, "learning_rate": 6.681679470444924e-06, "loss": 0.4932, "step": 122000 }, { "epoch": 4.521054921454821, "grad_norm": 2.6219421996762136, "learning_rate": 6.6755920526569855e-06, "loss": 0.4779, "step": 122100 }, { "epoch": 4.524757699464023, "grad_norm": 2.5231665660182814, "learning_rate": 6.669501835692638e-06, "loss": 0.4905, "step": 122200 }, { "epoch": 4.528460477473224, "grad_norm": 2.98575720694564, "learning_rate": 6.663408829725954e-06, "loss": 0.4904, "step": 122300 }, { "epoch": 4.532163255482426, "grad_norm": 2.715436674537074, "learning_rate": 6.657313044935664e-06, "loss": 0.4708, "step": 122400 }, { "epoch": 4.535866033491627, "grad_norm": 2.1934688852034254, "learning_rate": 6.651214491505138e-06, "loss": 0.4853, "step": 122500 }, { "epoch": 4.539568811500828, "grad_norm": 2.114157245702518, "learning_rate": 6.645113179622374e-06, "loss": 0.4872, "step": 122600 }, { "epoch": 4.54327158951003, "grad_norm": 2.4392272758861817, "learning_rate": 6.639009119479979e-06, "loss": 0.4941, "step": 122700 }, { "epoch": 4.546974367519232, "grad_norm": 2.853557558835589, "learning_rate": 6.632902321275147e-06, "loss": 0.4913, "step": 122800 }, { "epoch": 4.550677145528432, "grad_norm": 2.286297564517873, "learning_rate": 6.626792795209649e-06, "loss": 0.4829, "step": 122900 }, { "epoch": 4.554379923537634, "grad_norm": 2.1549045849526665, "learning_rate": 6.620680551489811e-06, "loss": 0.4753, "step": 123000 }, { "epoch": 4.5580827015468355, "grad_norm": 3.0694210117624134, "learning_rate": 6.614565600326503e-06, "loss": 0.4831, "step": 123100 }, { "epoch": 4.561785479556037, "grad_norm": 2.5769155044520624, "learning_rate": 6.608447951935114e-06, "loss": 0.4675, "step": 123200 }, { "epoch": 4.565488257565239, "grad_norm": 2.1501587492632828, "learning_rate": 6.602327616535542e-06, "loss": 0.4887, "step": 123300 }, { "epoch": 4.569191035574439, "grad_norm": 3.1844282751827087, "learning_rate": 6.59620460435217e-06, "loss": 0.4831, "step": 123400 }, { "epoch": 4.572893813583641, "grad_norm": 2.1803661617688594, "learning_rate": 6.590078925613856e-06, "loss": 0.483, "step": 123500 }, { "epoch": 4.576596591592843, "grad_norm": 2.351217285598787, "learning_rate": 6.58395059055391e-06, "loss": 0.4836, "step": 123600 }, { "epoch": 4.580299369602044, "grad_norm": 2.7364406757049586, "learning_rate": 6.5778196094100845e-06, "loss": 0.4899, "step": 123700 }, { "epoch": 4.584002147611246, "grad_norm": 2.249668965679071, "learning_rate": 6.571685992424545e-06, "loss": 0.4944, "step": 123800 }, { "epoch": 4.5877049256204465, "grad_norm": 2.8608500157676646, "learning_rate": 6.565549749843867e-06, "loss": 0.4884, "step": 123900 }, { "epoch": 4.591407703629648, "grad_norm": 2.0704691687691446, "learning_rate": 6.559410891919007e-06, "loss": 0.4869, "step": 124000 }, { "epoch": 4.59511048163885, "grad_norm": 2.513643981814775, "learning_rate": 6.5532694289052945e-06, "loss": 0.4725, "step": 124100 }, { "epoch": 4.598813259648051, "grad_norm": 1.8346756829592497, "learning_rate": 6.547125371062411e-06, "loss": 0.4898, "step": 124200 }, { "epoch": 4.602516037657252, "grad_norm": 1.8373979059420529, "learning_rate": 6.54097872865437e-06, "loss": 0.4923, "step": 124300 }, { "epoch": 4.606218815666454, "grad_norm": 1.8980897311117784, "learning_rate": 6.534829511949505e-06, "loss": 0.4883, "step": 124400 }, { "epoch": 4.609921593675655, "grad_norm": 2.4577788786247763, "learning_rate": 6.528677731220447e-06, "loss": 0.475, "step": 124500 }, { "epoch": 4.613624371684857, "grad_norm": 1.788251972075112, "learning_rate": 6.522523396744114e-06, "loss": 0.483, "step": 124600 }, { "epoch": 4.6173271496940576, "grad_norm": 2.138005157490113, "learning_rate": 6.516366518801687e-06, "loss": 0.4743, "step": 124700 }, { "epoch": 4.621029927703259, "grad_norm": 2.1089924397338335, "learning_rate": 6.5102071076786e-06, "loss": 0.4859, "step": 124800 }, { "epoch": 4.624732705712461, "grad_norm": 2.5345766598102335, "learning_rate": 6.504045173664515e-06, "loss": 0.4774, "step": 124900 }, { "epoch": 4.628435483721662, "grad_norm": 2.2430894467043143, "learning_rate": 6.49788072705331e-06, "loss": 0.4942, "step": 125000 }, { "epoch": 4.632138261730864, "grad_norm": 3.29142205319767, "learning_rate": 6.491713778143062e-06, "loss": 0.487, "step": 125100 }, { "epoch": 4.635841039740065, "grad_norm": 2.4990089888187192, "learning_rate": 6.485544337236024e-06, "loss": 0.4779, "step": 125200 }, { "epoch": 4.639543817749266, "grad_norm": 3.141183594230581, "learning_rate": 6.479372414638617e-06, "loss": 0.4904, "step": 125300 }, { "epoch": 4.643246595758468, "grad_norm": 2.11174274346095, "learning_rate": 6.473198020661407e-06, "loss": 0.4807, "step": 125400 }, { "epoch": 4.6469493737676695, "grad_norm": 3.4290678589472656, "learning_rate": 6.4670211656190825e-06, "loss": 0.4906, "step": 125500 }, { "epoch": 4.650652151776871, "grad_norm": 2.273091288482063, "learning_rate": 6.4608418598304535e-06, "loss": 0.5042, "step": 125600 }, { "epoch": 4.654354929786072, "grad_norm": 3.2279964193849775, "learning_rate": 6.454660113618413e-06, "loss": 0.4926, "step": 125700 }, { "epoch": 4.658057707795273, "grad_norm": 1.8974419657256836, "learning_rate": 6.448475937309942e-06, "loss": 0.4806, "step": 125800 }, { "epoch": 4.661760485804475, "grad_norm": 2.3658113569425954, "learning_rate": 6.442289341236071e-06, "loss": 0.4802, "step": 125900 }, { "epoch": 4.665463263813677, "grad_norm": 2.1788815468630793, "learning_rate": 6.4361003357318815e-06, "loss": 0.4781, "step": 126000 }, { "epoch": 4.669166041822877, "grad_norm": 2.149969109930297, "learning_rate": 6.429908931136473e-06, "loss": 0.4907, "step": 126100 }, { "epoch": 4.672868819832079, "grad_norm": 2.4156226733435284, "learning_rate": 6.4237151377929585e-06, "loss": 0.4916, "step": 126200 }, { "epoch": 4.6765715978412805, "grad_norm": 2.596342053823738, "learning_rate": 6.417518966048438e-06, "loss": 0.5053, "step": 126300 }, { "epoch": 4.680274375850482, "grad_norm": 2.8196661098333244, "learning_rate": 6.4113204262539855e-06, "loss": 0.4844, "step": 126400 }, { "epoch": 4.683977153859683, "grad_norm": 2.1804203104976945, "learning_rate": 6.40511952876463e-06, "loss": 0.495, "step": 126500 }, { "epoch": 4.687679931868884, "grad_norm": 2.6996928756647978, "learning_rate": 6.398916283939342e-06, "loss": 0.4908, "step": 126600 }, { "epoch": 4.691382709878086, "grad_norm": 2.849659095008784, "learning_rate": 6.392710702141011e-06, "loss": 0.4769, "step": 126700 }, { "epoch": 4.695085487887288, "grad_norm": 2.4733462038948804, "learning_rate": 6.386502793736433e-06, "loss": 0.488, "step": 126800 }, { "epoch": 4.698788265896489, "grad_norm": 3.3662895643976274, "learning_rate": 6.380292569096288e-06, "loss": 0.4961, "step": 126900 }, { "epoch": 4.70249104390569, "grad_norm": 2.187765403797937, "learning_rate": 6.374080038595125e-06, "loss": 0.4971, "step": 127000 }, { "epoch": 4.7061938219148916, "grad_norm": 3.0808367827887384, "learning_rate": 6.3678652126113495e-06, "loss": 0.4796, "step": 127100 }, { "epoch": 4.709896599924093, "grad_norm": 2.6310373043248827, "learning_rate": 6.361648101527196e-06, "loss": 0.485, "step": 127200 }, { "epoch": 4.713599377933295, "grad_norm": 2.681834013835313, "learning_rate": 6.35542871572872e-06, "loss": 0.4757, "step": 127300 }, { "epoch": 4.717302155942496, "grad_norm": 2.5872752719690655, "learning_rate": 6.349207065605776e-06, "loss": 0.4916, "step": 127400 }, { "epoch": 4.721004933951697, "grad_norm": 2.1678661367916776, "learning_rate": 6.342983161552003e-06, "loss": 0.4811, "step": 127500 }, { "epoch": 4.724707711960899, "grad_norm": 2.497614288372322, "learning_rate": 6.336757013964802e-06, "loss": 0.49, "step": 127600 }, { "epoch": 4.7284104899701, "grad_norm": 2.717593629293241, "learning_rate": 6.330528633245324e-06, "loss": 0.4896, "step": 127700 }, { "epoch": 4.732113267979301, "grad_norm": 2.1534385778915395, "learning_rate": 6.3242980297984515e-06, "loss": 0.5071, "step": 127800 }, { "epoch": 4.735816045988503, "grad_norm": 2.8558940053947306, "learning_rate": 6.318065214032777e-06, "loss": 0.4772, "step": 127900 }, { "epoch": 4.739518823997704, "grad_norm": 2.228157066344852, "learning_rate": 6.311830196360592e-06, "loss": 0.4907, "step": 128000 }, { "epoch": 4.743221602006906, "grad_norm": 3.224973580753529, "learning_rate": 6.305592987197866e-06, "loss": 0.4921, "step": 128100 }, { "epoch": 4.746924380016107, "grad_norm": 2.8659344751079447, "learning_rate": 6.299353596964226e-06, "loss": 0.4767, "step": 128200 }, { "epoch": 4.750627158025308, "grad_norm": 2.3241013218056574, "learning_rate": 6.293112036082949e-06, "loss": 0.4798, "step": 128300 }, { "epoch": 4.75432993603451, "grad_norm": 2.352197391267397, "learning_rate": 6.286868314980931e-06, "loss": 0.5121, "step": 128400 }, { "epoch": 4.758032714043711, "grad_norm": 2.8863073028515513, "learning_rate": 6.280622444088683e-06, "loss": 0.4876, "step": 128500 }, { "epoch": 4.761735492052913, "grad_norm": 2.943751321679694, "learning_rate": 6.274374433840302e-06, "loss": 0.496, "step": 128600 }, { "epoch": 4.7654382700621145, "grad_norm": 2.8355093742076427, "learning_rate": 6.2681242946734645e-06, "loss": 0.4683, "step": 128700 }, { "epoch": 4.769141048071315, "grad_norm": 2.2208024850557537, "learning_rate": 6.261872037029397e-06, "loss": 0.4817, "step": 128800 }, { "epoch": 4.772843826080517, "grad_norm": 2.1998306274220862, "learning_rate": 6.255617671352874e-06, "loss": 0.4918, "step": 128900 }, { "epoch": 4.776546604089718, "grad_norm": 2.782353293868382, "learning_rate": 6.249361208092181e-06, "loss": 0.4997, "step": 129000 }, { "epoch": 4.78024938209892, "grad_norm": 2.1870099917778334, "learning_rate": 6.243102657699115e-06, "loss": 0.4849, "step": 129100 }, { "epoch": 4.783952160108121, "grad_norm": 2.628997070625464, "learning_rate": 6.236842030628957e-06, "loss": 0.4912, "step": 129200 }, { "epoch": 4.787654938117322, "grad_norm": 2.4167882979598407, "learning_rate": 6.2305793373404564e-06, "loss": 0.467, "step": 129300 }, { "epoch": 4.791357716126524, "grad_norm": 2.5855026278624686, "learning_rate": 6.224314588295819e-06, "loss": 0.4981, "step": 129400 }, { "epoch": 4.7950604941357255, "grad_norm": 1.9671440880274877, "learning_rate": 6.21804779396068e-06, "loss": 0.4878, "step": 129500 }, { "epoch": 4.798763272144926, "grad_norm": 2.437241849810084, "learning_rate": 6.211778964804092e-06, "loss": 0.4767, "step": 129600 }, { "epoch": 4.802466050154128, "grad_norm": 2.992748281680595, "learning_rate": 6.205508111298508e-06, "loss": 0.4861, "step": 129700 }, { "epoch": 4.8061688281633295, "grad_norm": 2.3329934484417314, "learning_rate": 6.199235243919763e-06, "loss": 0.4816, "step": 129800 }, { "epoch": 4.809871606172531, "grad_norm": 2.6689421524177943, "learning_rate": 6.192960373147057e-06, "loss": 0.4739, "step": 129900 }, { "epoch": 4.813574384181733, "grad_norm": 2.6966827058153124, "learning_rate": 6.186683509462934e-06, "loss": 0.5064, "step": 130000 }, { "epoch": 4.817277162190933, "grad_norm": 2.8911069402274747, "learning_rate": 6.1804046633532685e-06, "loss": 0.479, "step": 130100 }, { "epoch": 4.820979940200135, "grad_norm": 3.3230348344033396, "learning_rate": 6.174123845307249e-06, "loss": 0.4845, "step": 130200 }, { "epoch": 4.824682718209337, "grad_norm": 2.3864729430664178, "learning_rate": 6.167841065817357e-06, "loss": 0.4767, "step": 130300 }, { "epoch": 4.828385496218538, "grad_norm": 2.6664843937265275, "learning_rate": 6.161556335379347e-06, "loss": 0.4876, "step": 130400 }, { "epoch": 4.83208827422774, "grad_norm": 2.5332303988002485, "learning_rate": 6.155269664492238e-06, "loss": 0.4915, "step": 130500 }, { "epoch": 4.8357910522369405, "grad_norm": 2.0349829058441764, "learning_rate": 6.148981063658289e-06, "loss": 0.4712, "step": 130600 }, { "epoch": 4.839493830246142, "grad_norm": 2.911400216084972, "learning_rate": 6.142690543382981e-06, "loss": 0.4923, "step": 130700 }, { "epoch": 4.843196608255344, "grad_norm": 2.8220930936439, "learning_rate": 6.1363981141750054e-06, "loss": 0.4969, "step": 130800 }, { "epoch": 4.846899386264545, "grad_norm": 2.4651103228513316, "learning_rate": 6.130103786546236e-06, "loss": 0.4912, "step": 130900 }, { "epoch": 4.850602164273746, "grad_norm": 2.014152164239959, "learning_rate": 6.123807571011727e-06, "loss": 0.4818, "step": 131000 }, { "epoch": 4.854304942282948, "grad_norm": 2.140400878259725, "learning_rate": 6.117509478089678e-06, "loss": 0.4751, "step": 131100 }, { "epoch": 4.858007720292149, "grad_norm": 2.4014938002576947, "learning_rate": 6.111209518301433e-06, "loss": 0.4817, "step": 131200 }, { "epoch": 4.861710498301351, "grad_norm": 2.6012118453696282, "learning_rate": 6.104907702171445e-06, "loss": 0.4911, "step": 131300 }, { "epoch": 4.8654132763105515, "grad_norm": 2.1334272620347488, "learning_rate": 6.09860404022728e-06, "loss": 0.4759, "step": 131400 }, { "epoch": 4.869116054319753, "grad_norm": 2.677998646210992, "learning_rate": 6.092298542999574e-06, "loss": 0.4943, "step": 131500 }, { "epoch": 4.872818832328955, "grad_norm": 3.002335497953678, "learning_rate": 6.085991221022041e-06, "loss": 0.4782, "step": 131600 }, { "epoch": 4.876521610338156, "grad_norm": 2.22599287637259, "learning_rate": 6.079682084831435e-06, "loss": 0.4899, "step": 131700 }, { "epoch": 4.880224388347358, "grad_norm": 2.566699572922749, "learning_rate": 6.073371144967547e-06, "loss": 0.4807, "step": 131800 }, { "epoch": 4.883927166356559, "grad_norm": 2.0929693955218935, "learning_rate": 6.067058411973176e-06, "loss": 0.4891, "step": 131900 }, { "epoch": 4.88762994436576, "grad_norm": 2.582302614689035, "learning_rate": 6.060743896394121e-06, "loss": 0.4822, "step": 132000 }, { "epoch": 4.891332722374962, "grad_norm": 2.251437975281811, "learning_rate": 6.054427608779155e-06, "loss": 0.4994, "step": 132100 }, { "epoch": 4.8950355003841635, "grad_norm": 2.1843777302681135, "learning_rate": 6.048109559680014e-06, "loss": 0.4746, "step": 132200 }, { "epoch": 4.898738278393365, "grad_norm": 2.830086051831928, "learning_rate": 6.041789759651375e-06, "loss": 0.4822, "step": 132300 }, { "epoch": 4.902441056402566, "grad_norm": 3.4137217763599916, "learning_rate": 6.03546821925084e-06, "loss": 0.4685, "step": 132400 }, { "epoch": 4.906143834411767, "grad_norm": 3.2322828499337866, "learning_rate": 6.029144949038921e-06, "loss": 0.5023, "step": 132500 }, { "epoch": 4.909846612420969, "grad_norm": 2.700361036212904, "learning_rate": 6.022819959579016e-06, "loss": 0.4676, "step": 132600 }, { "epoch": 4.913549390430171, "grad_norm": 2.338231743781301, "learning_rate": 6.0164932614373976e-06, "loss": 0.4881, "step": 132700 }, { "epoch": 4.917252168439371, "grad_norm": 2.8717111566551585, "learning_rate": 6.0101648651831925e-06, "loss": 0.4723, "step": 132800 }, { "epoch": 4.920954946448573, "grad_norm": 2.8332272286827527, "learning_rate": 6.003834781388364e-06, "loss": 0.4768, "step": 132900 }, { "epoch": 4.9246577244577745, "grad_norm": 2.6015219330961328, "learning_rate": 5.997503020627693e-06, "loss": 0.4943, "step": 133000 }, { "epoch": 4.928360502466976, "grad_norm": 3.4228171215721606, "learning_rate": 5.991169593478764e-06, "loss": 0.491, "step": 133100 }, { "epoch": 4.932063280476177, "grad_norm": 2.6766087710258213, "learning_rate": 5.984834510521945e-06, "loss": 0.4844, "step": 133200 }, { "epoch": 4.935766058485378, "grad_norm": 2.4736813535572404, "learning_rate": 5.97849778234037e-06, "loss": 0.4783, "step": 133300 }, { "epoch": 4.93946883649458, "grad_norm": 2.8750821064965, "learning_rate": 5.972159419519922e-06, "loss": 0.4835, "step": 133400 }, { "epoch": 4.943171614503782, "grad_norm": 2.3385271098041125, "learning_rate": 5.965819432649212e-06, "loss": 0.4688, "step": 133500 }, { "epoch": 4.946874392512983, "grad_norm": 1.8738655225502558, "learning_rate": 5.959477832319565e-06, "loss": 0.4877, "step": 133600 }, { "epoch": 4.950577170522184, "grad_norm": 2.5770270724629283, "learning_rate": 5.953134629125006e-06, "loss": 0.4924, "step": 133700 }, { "epoch": 4.9542799485313855, "grad_norm": 2.2673011418933484, "learning_rate": 5.946789833662231e-06, "loss": 0.4887, "step": 133800 }, { "epoch": 4.957982726540587, "grad_norm": 2.2110336007597717, "learning_rate": 5.940443456530601e-06, "loss": 0.4873, "step": 133900 }, { "epoch": 4.961685504549789, "grad_norm": 2.7353125694528035, "learning_rate": 5.934095508332115e-06, "loss": 0.4626, "step": 134000 }, { "epoch": 4.96538828255899, "grad_norm": 2.275005618699468, "learning_rate": 5.927745999671403e-06, "loss": 0.479, "step": 134100 }, { "epoch": 4.969091060568191, "grad_norm": 2.2518768325795557, "learning_rate": 5.921394941155692e-06, "loss": 0.4791, "step": 134200 }, { "epoch": 4.972793838577393, "grad_norm": 2.93892149692789, "learning_rate": 5.915042343394809e-06, "loss": 0.4963, "step": 134300 }, { "epoch": 4.976496616586594, "grad_norm": 2.5524947364440487, "learning_rate": 5.908688217001144e-06, "loss": 0.4953, "step": 134400 }, { "epoch": 4.980199394595796, "grad_norm": 1.8626751576119176, "learning_rate": 5.902332572589646e-06, "loss": 0.495, "step": 134500 }, { "epoch": 4.983902172604997, "grad_norm": 2.5929432519241673, "learning_rate": 5.895975420777797e-06, "loss": 0.4838, "step": 134600 }, { "epoch": 4.987604950614198, "grad_norm": 3.233193166886902, "learning_rate": 5.8896167721855965e-06, "loss": 0.4646, "step": 134700 }, { "epoch": 4.9913077286234, "grad_norm": 3.0625225120597297, "learning_rate": 5.883256637435548e-06, "loss": 0.5067, "step": 134800 }, { "epoch": 4.995010506632601, "grad_norm": 2.5131860256621685, "learning_rate": 5.876895027152636e-06, "loss": 0.475, "step": 134900 }, { "epoch": 4.998713284641802, "grad_norm": 2.203657996485025, "learning_rate": 5.8705319519643075e-06, "loss": 0.5024, "step": 135000 }, { "epoch": 5.002406805705981, "grad_norm": 2.2686414258779766, "learning_rate": 5.864167422500462e-06, "loss": 0.422, "step": 135100 }, { "epoch": 5.006109583715182, "grad_norm": 2.7680060768619565, "learning_rate": 5.857801449393423e-06, "loss": 0.3838, "step": 135200 }, { "epoch": 5.009812361724384, "grad_norm": 2.417267587539369, "learning_rate": 5.85143404327793e-06, "loss": 0.4005, "step": 135300 }, { "epoch": 5.0135151397335855, "grad_norm": 2.6314313606518227, "learning_rate": 5.8450652147911105e-06, "loss": 0.3818, "step": 135400 }, { "epoch": 5.017217917742786, "grad_norm": 3.698380376781409, "learning_rate": 5.838694974572475e-06, "loss": 0.3912, "step": 135500 }, { "epoch": 5.020920695751988, "grad_norm": 2.3198123000011797, "learning_rate": 5.8323233332638905e-06, "loss": 0.3844, "step": 135600 }, { "epoch": 5.0246234737611895, "grad_norm": 2.7826943312637487, "learning_rate": 5.825950301509563e-06, "loss": 0.3876, "step": 135700 }, { "epoch": 5.028326251770391, "grad_norm": 2.476348006770625, "learning_rate": 5.819575889956021e-06, "loss": 0.398, "step": 135800 }, { "epoch": 5.032029029779592, "grad_norm": 2.367146494862415, "learning_rate": 5.8132001092521e-06, "loss": 0.3929, "step": 135900 }, { "epoch": 5.035731807788793, "grad_norm": 2.473323850374056, "learning_rate": 5.8068229700489205e-06, "loss": 0.3759, "step": 136000 }, { "epoch": 5.039434585797995, "grad_norm": 2.551884939181615, "learning_rate": 5.800444482999874e-06, "loss": 0.3964, "step": 136100 }, { "epoch": 5.043137363807197, "grad_norm": 2.9405020761064056, "learning_rate": 5.794064658760604e-06, "loss": 0.3921, "step": 136200 }, { "epoch": 5.046840141816398, "grad_norm": 2.6580802049514887, "learning_rate": 5.787683507988986e-06, "loss": 0.4057, "step": 136300 }, { "epoch": 5.050542919825599, "grad_norm": 3.154805131474556, "learning_rate": 5.781301041345115e-06, "loss": 0.3877, "step": 136400 }, { "epoch": 5.0542456978348005, "grad_norm": 2.0715360334035835, "learning_rate": 5.77491726949128e-06, "loss": 0.3943, "step": 136500 }, { "epoch": 5.057948475844002, "grad_norm": 2.6373309493637342, "learning_rate": 5.768532203091955e-06, "loss": 0.3961, "step": 136600 }, { "epoch": 5.061651253853204, "grad_norm": 2.240829739410264, "learning_rate": 5.7621458528137684e-06, "loss": 0.3962, "step": 136700 }, { "epoch": 5.065354031862404, "grad_norm": 3.178372547918326, "learning_rate": 5.7557582293255054e-06, "loss": 0.407, "step": 136800 }, { "epoch": 5.069056809871606, "grad_norm": 1.9947446146459578, "learning_rate": 5.749369343298068e-06, "loss": 0.3884, "step": 136900 }, { "epoch": 5.072759587880808, "grad_norm": 2.7914293161275356, "learning_rate": 5.742979205404473e-06, "loss": 0.3887, "step": 137000 }, { "epoch": 5.076462365890009, "grad_norm": 2.2579226015010847, "learning_rate": 5.736587826319825e-06, "loss": 0.377, "step": 137100 }, { "epoch": 5.080165143899211, "grad_norm": 3.1523076738120914, "learning_rate": 5.730195216721305e-06, "loss": 0.3985, "step": 137200 }, { "epoch": 5.0838679219084115, "grad_norm": 2.8236086880108524, "learning_rate": 5.723801387288148e-06, "loss": 0.3864, "step": 137300 }, { "epoch": 5.087570699917613, "grad_norm": 2.5280235014463868, "learning_rate": 5.717406348701627e-06, "loss": 0.3893, "step": 137400 }, { "epoch": 5.091273477926815, "grad_norm": 2.581363217409193, "learning_rate": 5.711010111645037e-06, "loss": 0.3912, "step": 137500 }, { "epoch": 5.094976255936016, "grad_norm": 3.2573032463957667, "learning_rate": 5.70461268680367e-06, "loss": 0.3876, "step": 137600 }, { "epoch": 5.098679033945217, "grad_norm": 3.0165365478649777, "learning_rate": 5.6982140848648085e-06, "loss": 0.3821, "step": 137700 }, { "epoch": 5.102381811954419, "grad_norm": 2.485460090481209, "learning_rate": 5.6918143165176965e-06, "loss": 0.3805, "step": 137800 }, { "epoch": 5.10608458996362, "grad_norm": 2.0165095025748143, "learning_rate": 5.6854133924535295e-06, "loss": 0.3896, "step": 137900 }, { "epoch": 5.109787367972822, "grad_norm": 2.9372548077558265, "learning_rate": 5.679011323365433e-06, "loss": 0.3937, "step": 138000 } ], "logging_steps": 100, "max_steps": 270070, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3567256998772736.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }