{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.036250196710082, "eval_steps": 500, "global_step": 82000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037027780092014034, "grad_norm": 21.161716771412177, "learning_rate": 3.6657162957751695e-08, "loss": 2.6783, "step": 100 }, { "epoch": 0.007405556018402807, "grad_norm": 17.61802823072404, "learning_rate": 7.368460028881402e-08, "loss": 2.6426, "step": 200 }, { "epoch": 0.01110833402760421, "grad_norm": 8.635640044259958, "learning_rate": 1.1071203761987633e-07, "loss": 2.5725, "step": 300 }, { "epoch": 0.014811112036805614, "grad_norm": 4.290727817061978, "learning_rate": 1.4773947495093866e-07, "loss": 2.4757, "step": 400 }, { "epoch": 0.018513890046007016, "grad_norm": 3.791476034639625, "learning_rate": 1.8476691228200099e-07, "loss": 2.367, "step": 500 }, { "epoch": 0.02221666805520842, "grad_norm": 4.547923325980467, "learning_rate": 2.2179434961306329e-07, "loss": 2.3288, "step": 600 }, { "epoch": 0.025919446064409823, "grad_norm": 4.402404364420564, "learning_rate": 2.5882178694412564e-07, "loss": 2.2759, "step": 700 }, { "epoch": 0.029622224073611227, "grad_norm": 3.6276565743103535, "learning_rate": 2.958492242751879e-07, "loss": 2.2766, "step": 800 }, { "epoch": 0.03332500208281263, "grad_norm": 4.170244852808218, "learning_rate": 3.3287666160625024e-07, "loss": 2.2338, "step": 900 }, { "epoch": 0.03702778009201403, "grad_norm": 4.412766673186702, "learning_rate": 3.699040989373126e-07, "loss": 2.1561, "step": 1000 }, { "epoch": 0.04073055810121544, "grad_norm": 4.628401400135504, "learning_rate": 4.069315362683749e-07, "loss": 2.1544, "step": 1100 }, { "epoch": 0.04443333611041684, "grad_norm": 3.7081302230204867, "learning_rate": 4.439589735994372e-07, "loss": 2.106, "step": 1200 }, { "epoch": 0.04813611411961825, "grad_norm": 4.2137908215712905, "learning_rate": 4.809864109304995e-07, "loss": 2.0794, "step": 1300 }, { "epoch": 0.05183889212881965, "grad_norm": 4.60851346975717, "learning_rate": 5.180138482615619e-07, "loss": 2.0641, "step": 1400 }, { "epoch": 0.05554167013802105, "grad_norm": 4.313023013354129, "learning_rate": 5.550412855926242e-07, "loss": 1.9983, "step": 1500 }, { "epoch": 0.059244448147222455, "grad_norm": 3.966177577383383, "learning_rate": 5.920687229236865e-07, "loss": 2.0366, "step": 1600 }, { "epoch": 0.06294722615642385, "grad_norm": 4.661276659671688, "learning_rate": 6.290961602547487e-07, "loss": 2.0112, "step": 1700 }, { "epoch": 0.06665000416562526, "grad_norm": 3.856161388179088, "learning_rate": 6.661235975858112e-07, "loss": 1.9547, "step": 1800 }, { "epoch": 0.07035278217482667, "grad_norm": 4.5116015322440015, "learning_rate": 7.031510349168734e-07, "loss": 1.9599, "step": 1900 }, { "epoch": 0.07405556018402806, "grad_norm": 3.9605628970368416, "learning_rate": 7.401784722479357e-07, "loss": 1.9673, "step": 2000 }, { "epoch": 0.07775833819322947, "grad_norm": 4.534104810402031, "learning_rate": 7.772059095789982e-07, "loss": 1.8987, "step": 2100 }, { "epoch": 0.08146111620243088, "grad_norm": 3.7768720377686815, "learning_rate": 8.142333469100604e-07, "loss": 1.9039, "step": 2200 }, { "epoch": 0.08516389421163227, "grad_norm": 3.709870476374346, "learning_rate": 8.512607842411227e-07, "loss": 1.8572, "step": 2300 }, { "epoch": 0.08886667222083368, "grad_norm": 3.861469813158571, "learning_rate": 8.882882215721851e-07, "loss": 1.831, "step": 2400 }, { "epoch": 0.09256945023003509, "grad_norm": 4.046784580473015, "learning_rate": 9.253156589032473e-07, "loss": 1.856, "step": 2500 }, { "epoch": 0.0962722282392365, "grad_norm": 4.439862554726597, "learning_rate": 9.623430962343098e-07, "loss": 1.8344, "step": 2600 }, { "epoch": 0.09997500624843789, "grad_norm": 6.222158633649652, "learning_rate": 9.993705335653721e-07, "loss": 1.8408, "step": 2700 }, { "epoch": 0.1036777842576393, "grad_norm": 3.7129912385070503, "learning_rate": 1.0363979708964342e-06, "loss": 1.8195, "step": 2800 }, { "epoch": 0.1073805622668407, "grad_norm": 4.073889809938404, "learning_rate": 1.0734254082274968e-06, "loss": 1.7714, "step": 2900 }, { "epoch": 0.1110833402760421, "grad_norm": 3.8485467079112805, "learning_rate": 1.1104528455585589e-06, "loss": 1.7782, "step": 3000 }, { "epoch": 0.1147861182852435, "grad_norm": 3.548076088775858, "learning_rate": 1.1474802828896212e-06, "loss": 1.7646, "step": 3100 }, { "epoch": 0.11848889629444491, "grad_norm": 3.754340846452376, "learning_rate": 1.1845077202206837e-06, "loss": 1.7733, "step": 3200 }, { "epoch": 0.12219167430364632, "grad_norm": 6.281470572024916, "learning_rate": 1.2215351575517459e-06, "loss": 1.7361, "step": 3300 }, { "epoch": 0.1258944523128477, "grad_norm": 4.065489405313006, "learning_rate": 1.2585625948828082e-06, "loss": 1.7397, "step": 3400 }, { "epoch": 0.12959723032204912, "grad_norm": 4.260390115086277, "learning_rate": 1.2955900322138707e-06, "loss": 1.7571, "step": 3500 }, { "epoch": 0.13330000833125052, "grad_norm": 5.9838104492225215, "learning_rate": 1.3326174695449328e-06, "loss": 1.7205, "step": 3600 }, { "epoch": 0.13700278634045193, "grad_norm": 4.7607112587769915, "learning_rate": 1.3696449068759952e-06, "loss": 1.713, "step": 3700 }, { "epoch": 0.14070556434965334, "grad_norm": 4.047013434162942, "learning_rate": 1.4066723442070577e-06, "loss": 1.7084, "step": 3800 }, { "epoch": 0.14440834235885472, "grad_norm": 6.154993338003737, "learning_rate": 1.4436997815381198e-06, "loss": 1.6784, "step": 3900 }, { "epoch": 0.14811112036805613, "grad_norm": 3.6172501531655614, "learning_rate": 1.4807272188691821e-06, "loss": 1.6514, "step": 4000 }, { "epoch": 0.15181389837725753, "grad_norm": 3.9661920321991757, "learning_rate": 1.5177546562002447e-06, "loss": 1.6642, "step": 4100 }, { "epoch": 0.15551667638645894, "grad_norm": 4.468519494026533, "learning_rate": 1.5547820935313068e-06, "loss": 1.6517, "step": 4200 }, { "epoch": 0.15921945439566035, "grad_norm": 4.809090349712038, "learning_rate": 1.5918095308623691e-06, "loss": 1.6456, "step": 4300 }, { "epoch": 0.16292223240486176, "grad_norm": 3.991862639554731, "learning_rate": 1.6288369681934317e-06, "loss": 1.6474, "step": 4400 }, { "epoch": 0.16662501041406316, "grad_norm": 4.719090103521477, "learning_rate": 1.6658644055244938e-06, "loss": 1.6042, "step": 4500 }, { "epoch": 0.17032778842326454, "grad_norm": 4.927706214636146, "learning_rate": 1.702891842855556e-06, "loss": 1.5918, "step": 4600 }, { "epoch": 0.17403056643246595, "grad_norm": 4.566524993389404, "learning_rate": 1.7399192801866186e-06, "loss": 1.6103, "step": 4700 }, { "epoch": 0.17773334444166736, "grad_norm": 4.908438965819459, "learning_rate": 1.7769467175176807e-06, "loss": 1.6101, "step": 4800 }, { "epoch": 0.18143612245086876, "grad_norm": 3.8011540864311866, "learning_rate": 1.813974154848743e-06, "loss": 1.6162, "step": 4900 }, { "epoch": 0.18513890046007017, "grad_norm": 4.241301056669864, "learning_rate": 1.8510015921798056e-06, "loss": 1.6117, "step": 5000 }, { "epoch": 0.18884167846927158, "grad_norm": 4.305210424709023, "learning_rate": 1.8880290295108677e-06, "loss": 1.5634, "step": 5100 }, { "epoch": 0.192544456478473, "grad_norm": 5.188453275523176, "learning_rate": 1.92505646684193e-06, "loss": 1.5839, "step": 5200 }, { "epoch": 0.19624723448767437, "grad_norm": 4.390695329489216, "learning_rate": 1.9620839041729924e-06, "loss": 1.5747, "step": 5300 }, { "epoch": 0.19995001249687577, "grad_norm": 4.581970539434446, "learning_rate": 1.999111341504055e-06, "loss": 1.5597, "step": 5400 }, { "epoch": 0.20365279050607718, "grad_norm": 4.0470801193804755, "learning_rate": 2.036138778835117e-06, "loss": 1.5275, "step": 5500 }, { "epoch": 0.2073555685152786, "grad_norm": 4.1287928029941385, "learning_rate": 2.073166216166179e-06, "loss": 1.5455, "step": 5600 }, { "epoch": 0.21105834652448, "grad_norm": 5.5739542390705274, "learning_rate": 2.1101936534972417e-06, "loss": 1.526, "step": 5700 }, { "epoch": 0.2147611245336814, "grad_norm": 4.072688346808655, "learning_rate": 2.147221090828304e-06, "loss": 1.5006, "step": 5800 }, { "epoch": 0.2184639025428828, "grad_norm": 5.331904305941977, "learning_rate": 2.1842485281593663e-06, "loss": 1.5417, "step": 5900 }, { "epoch": 0.2221666805520842, "grad_norm": 4.378202415797325, "learning_rate": 2.221275965490429e-06, "loss": 1.4885, "step": 6000 }, { "epoch": 0.2258694585612856, "grad_norm": 4.318453195850846, "learning_rate": 2.258303402821491e-06, "loss": 1.5359, "step": 6100 }, { "epoch": 0.229572236570487, "grad_norm": 4.475047536813248, "learning_rate": 2.295330840152553e-06, "loss": 1.5168, "step": 6200 }, { "epoch": 0.2332750145796884, "grad_norm": 4.665314820774367, "learning_rate": 2.3323582774836156e-06, "loss": 1.4688, "step": 6300 }, { "epoch": 0.23697779258888982, "grad_norm": 5.210970519844203, "learning_rate": 2.3693857148146778e-06, "loss": 1.4775, "step": 6400 }, { "epoch": 0.24068057059809123, "grad_norm": 4.410640014744449, "learning_rate": 2.4064131521457403e-06, "loss": 1.4738, "step": 6500 }, { "epoch": 0.24438334860729263, "grad_norm": 3.944391436658137, "learning_rate": 2.443440589476803e-06, "loss": 1.4469, "step": 6600 }, { "epoch": 0.248086126616494, "grad_norm": 4.96868756013431, "learning_rate": 2.480468026807865e-06, "loss": 1.4821, "step": 6700 }, { "epoch": 0.2517889046256954, "grad_norm": 4.462942085694235, "learning_rate": 2.517495464138927e-06, "loss": 1.4591, "step": 6800 }, { "epoch": 0.25549168263489686, "grad_norm": 4.378031684132469, "learning_rate": 2.5545229014699896e-06, "loss": 1.4578, "step": 6900 }, { "epoch": 0.25919446064409823, "grad_norm": 4.241655047493595, "learning_rate": 2.5915503388010517e-06, "loss": 1.4705, "step": 7000 }, { "epoch": 0.2628972386532996, "grad_norm": 3.981686552106327, "learning_rate": 2.6285777761321142e-06, "loss": 1.4538, "step": 7100 }, { "epoch": 0.26660001666250105, "grad_norm": 4.3607343276496575, "learning_rate": 2.6656052134631768e-06, "loss": 1.411, "step": 7200 }, { "epoch": 0.27030279467170243, "grad_norm": 4.807349625008967, "learning_rate": 2.7026326507942385e-06, "loss": 1.4205, "step": 7300 }, { "epoch": 0.27400557268090386, "grad_norm": 4.289974925492848, "learning_rate": 2.739660088125301e-06, "loss": 1.4374, "step": 7400 }, { "epoch": 0.27770835069010524, "grad_norm": 3.491997941988729, "learning_rate": 2.7766875254563636e-06, "loss": 1.4393, "step": 7500 }, { "epoch": 0.2814111286993067, "grad_norm": 4.317473381602871, "learning_rate": 2.8137149627874257e-06, "loss": 1.3965, "step": 7600 }, { "epoch": 0.28511390670850806, "grad_norm": 4.317169353881641, "learning_rate": 2.850742400118488e-06, "loss": 1.4139, "step": 7700 }, { "epoch": 0.28881668471770944, "grad_norm": 4.342306783527047, "learning_rate": 2.8877698374495503e-06, "loss": 1.3966, "step": 7800 }, { "epoch": 0.2925194627269109, "grad_norm": 4.561613447481574, "learning_rate": 2.9247972747806124e-06, "loss": 1.4207, "step": 7900 }, { "epoch": 0.29622224073611225, "grad_norm": 4.603226423674348, "learning_rate": 2.961824712111675e-06, "loss": 1.4369, "step": 8000 }, { "epoch": 0.2999250187453137, "grad_norm": 4.628660307539308, "learning_rate": 2.9988521494427375e-06, "loss": 1.3871, "step": 8100 }, { "epoch": 0.30362779675451507, "grad_norm": 3.8865443379767415, "learning_rate": 3.0358795867737996e-06, "loss": 1.4182, "step": 8200 }, { "epoch": 0.3073305747637165, "grad_norm": 4.038269978955141, "learning_rate": 3.072907024104862e-06, "loss": 1.3622, "step": 8300 }, { "epoch": 0.3110333527729179, "grad_norm": 5.136502242831012, "learning_rate": 3.1099344614359243e-06, "loss": 1.3889, "step": 8400 }, { "epoch": 0.31473613078211926, "grad_norm": 4.457630892251384, "learning_rate": 3.1469618987669864e-06, "loss": 1.387, "step": 8500 }, { "epoch": 0.3184389087913207, "grad_norm": 4.651901883399858, "learning_rate": 3.183989336098049e-06, "loss": 1.3741, "step": 8600 }, { "epoch": 0.3221416868005221, "grad_norm": 4.800399697463299, "learning_rate": 3.2210167734291115e-06, "loss": 1.3473, "step": 8700 }, { "epoch": 0.3258444648097235, "grad_norm": 5.122987743156367, "learning_rate": 3.2580442107601736e-06, "loss": 1.3733, "step": 8800 }, { "epoch": 0.3295472428189249, "grad_norm": 5.307686552433291, "learning_rate": 3.295071648091236e-06, "loss": 1.341, "step": 8900 }, { "epoch": 0.3332500208281263, "grad_norm": 4.4204932532106085, "learning_rate": 3.3320990854222982e-06, "loss": 1.3615, "step": 9000 }, { "epoch": 0.3369527988373277, "grad_norm": 4.937899325267276, "learning_rate": 3.3691265227533603e-06, "loss": 1.368, "step": 9100 }, { "epoch": 0.3406555768465291, "grad_norm": 3.6435580501639326, "learning_rate": 3.406153960084423e-06, "loss": 1.341, "step": 9200 }, { "epoch": 0.3443583548557305, "grad_norm": 3.884956476912519, "learning_rate": 3.4431813974154854e-06, "loss": 1.3458, "step": 9300 }, { "epoch": 0.3480611328649319, "grad_norm": 4.291808958034935, "learning_rate": 3.4802088347465475e-06, "loss": 1.301, "step": 9400 }, { "epoch": 0.35176391087413333, "grad_norm": 4.55718057133725, "learning_rate": 3.51723627207761e-06, "loss": 1.3203, "step": 9500 }, { "epoch": 0.3554666888833347, "grad_norm": 3.927788263058885, "learning_rate": 3.554263709408672e-06, "loss": 1.3087, "step": 9600 }, { "epoch": 0.35916946689253615, "grad_norm": 4.334142071876793, "learning_rate": 3.5912911467397343e-06, "loss": 1.328, "step": 9700 }, { "epoch": 0.36287224490173753, "grad_norm": 3.9826425837051715, "learning_rate": 3.628318584070797e-06, "loss": 1.3162, "step": 9800 }, { "epoch": 0.3665750229109389, "grad_norm": 3.7373318962103834, "learning_rate": 3.665346021401859e-06, "loss": 1.3427, "step": 9900 }, { "epoch": 0.37027780092014034, "grad_norm": 4.309257631492866, "learning_rate": 3.7023734587329215e-06, "loss": 1.2918, "step": 10000 }, { "epoch": 0.3739805789293417, "grad_norm": 4.000949746879435, "learning_rate": 3.739400896063984e-06, "loss": 1.2864, "step": 10100 }, { "epoch": 0.37768335693854316, "grad_norm": 4.314641431918059, "learning_rate": 3.7764283333950457e-06, "loss": 1.2784, "step": 10200 }, { "epoch": 0.38138613494774454, "grad_norm": 4.136976500403523, "learning_rate": 3.8134557707261083e-06, "loss": 1.2821, "step": 10300 }, { "epoch": 0.385088912956946, "grad_norm": 4.8607889216449385, "learning_rate": 3.85048320805717e-06, "loss": 1.281, "step": 10400 }, { "epoch": 0.38879169096614735, "grad_norm": 4.432173453928133, "learning_rate": 3.8875106453882325e-06, "loss": 1.2959, "step": 10500 }, { "epoch": 0.39249446897534873, "grad_norm": 5.118601631131207, "learning_rate": 3.9245380827192954e-06, "loss": 1.2772, "step": 10600 }, { "epoch": 0.39619724698455017, "grad_norm": 4.189210920096741, "learning_rate": 3.9615655200503576e-06, "loss": 1.311, "step": 10700 }, { "epoch": 0.39990002499375155, "grad_norm": 4.9933785952086875, "learning_rate": 3.99859295738142e-06, "loss": 1.2798, "step": 10800 }, { "epoch": 0.403602803002953, "grad_norm": 3.8535620848124785, "learning_rate": 4.035620394712483e-06, "loss": 1.2532, "step": 10900 }, { "epoch": 0.40730558101215436, "grad_norm": 4.040521438500001, "learning_rate": 4.072647832043545e-06, "loss": 1.2739, "step": 11000 }, { "epoch": 0.4110083590213558, "grad_norm": 4.280568957640515, "learning_rate": 4.109675269374607e-06, "loss": 1.2755, "step": 11100 }, { "epoch": 0.4147111370305572, "grad_norm": 4.230099582418301, "learning_rate": 4.146702706705669e-06, "loss": 1.2676, "step": 11200 }, { "epoch": 0.41841391503975855, "grad_norm": 3.807801983954757, "learning_rate": 4.183730144036732e-06, "loss": 1.2487, "step": 11300 }, { "epoch": 0.42211669304896, "grad_norm": 4.2784766116809365, "learning_rate": 4.220757581367794e-06, "loss": 1.2545, "step": 11400 }, { "epoch": 0.42581947105816137, "grad_norm": 4.855973605257093, "learning_rate": 4.257785018698856e-06, "loss": 1.2436, "step": 11500 }, { "epoch": 0.4295222490673628, "grad_norm": 3.5115767446990995, "learning_rate": 4.294812456029918e-06, "loss": 1.2505, "step": 11600 }, { "epoch": 0.4332250270765642, "grad_norm": 3.7062317227814408, "learning_rate": 4.33183989336098e-06, "loss": 1.2175, "step": 11700 }, { "epoch": 0.4369278050857656, "grad_norm": 3.9387496036649936, "learning_rate": 4.368867330692043e-06, "loss": 1.222, "step": 11800 }, { "epoch": 0.440630583094967, "grad_norm": 5.23595658030001, "learning_rate": 4.4058947680231055e-06, "loss": 1.2388, "step": 11900 }, { "epoch": 0.4443333611041684, "grad_norm": 4.759096268753192, "learning_rate": 4.442922205354168e-06, "loss": 1.2036, "step": 12000 }, { "epoch": 0.4480361391133698, "grad_norm": 4.094075469283295, "learning_rate": 4.4799496426852306e-06, "loss": 1.2543, "step": 12100 }, { "epoch": 0.4517389171225712, "grad_norm": 4.153721175621665, "learning_rate": 4.516977080016293e-06, "loss": 1.227, "step": 12200 }, { "epoch": 0.45544169513177263, "grad_norm": 4.508419619958273, "learning_rate": 4.554004517347355e-06, "loss": 1.188, "step": 12300 }, { "epoch": 0.459144473140974, "grad_norm": 5.431232416017837, "learning_rate": 4.591031954678417e-06, "loss": 1.2254, "step": 12400 }, { "epoch": 0.46284725115017544, "grad_norm": 4.195672755415828, "learning_rate": 4.62805939200948e-06, "loss": 1.2161, "step": 12500 }, { "epoch": 0.4665500291593768, "grad_norm": 4.928316304383083, "learning_rate": 4.665086829340542e-06, "loss": 1.2117, "step": 12600 }, { "epoch": 0.4702528071685782, "grad_norm": 3.591246874782549, "learning_rate": 4.702114266671604e-06, "loss": 1.2114, "step": 12700 }, { "epoch": 0.47395558517777964, "grad_norm": 3.4385074546101473, "learning_rate": 4.739141704002666e-06, "loss": 1.1622, "step": 12800 }, { "epoch": 0.477658363186981, "grad_norm": 4.266312659389707, "learning_rate": 4.776169141333728e-06, "loss": 1.1994, "step": 12900 }, { "epoch": 0.48136114119618245, "grad_norm": 4.318134728161222, "learning_rate": 4.813196578664791e-06, "loss": 1.194, "step": 13000 }, { "epoch": 0.48506391920538383, "grad_norm": 3.743433627557919, "learning_rate": 4.850224015995853e-06, "loss": 1.1862, "step": 13100 }, { "epoch": 0.48876669721458527, "grad_norm": 4.432513953759492, "learning_rate": 4.8872514533269155e-06, "loss": 1.2034, "step": 13200 }, { "epoch": 0.49246947522378665, "grad_norm": 4.932972639382486, "learning_rate": 4.924278890657978e-06, "loss": 1.1829, "step": 13300 }, { "epoch": 0.496172253232988, "grad_norm": 4.552124318708114, "learning_rate": 4.961306327989041e-06, "loss": 1.1877, "step": 13400 }, { "epoch": 0.49987503124218946, "grad_norm": 4.57039085993272, "learning_rate": 4.998333765320103e-06, "loss": 1.1804, "step": 13500 }, { "epoch": 0.5035778092513908, "grad_norm": 4.435102306960172, "learning_rate": 5.035361202651165e-06, "loss": 1.1673, "step": 13600 }, { "epoch": 0.5072805872605922, "grad_norm": 4.186670025498094, "learning_rate": 5.072388639982228e-06, "loss": 1.1933, "step": 13700 }, { "epoch": 0.5109833652697937, "grad_norm": 4.405934801625066, "learning_rate": 5.10941607731329e-06, "loss": 1.1772, "step": 13800 }, { "epoch": 0.5146861432789951, "grad_norm": 4.311924669929619, "learning_rate": 5.146443514644351e-06, "loss": 1.1665, "step": 13900 }, { "epoch": 0.5183889212881965, "grad_norm": 5.625220459288197, "learning_rate": 5.183470951975414e-06, "loss": 1.1735, "step": 14000 }, { "epoch": 0.5220916992973978, "grad_norm": 3.8405703022228606, "learning_rate": 5.220498389306476e-06, "loss": 1.1904, "step": 14100 }, { "epoch": 0.5257944773065992, "grad_norm": 4.059806709247945, "learning_rate": 5.257525826637538e-06, "loss": 1.1588, "step": 14200 }, { "epoch": 0.5294972553158007, "grad_norm": 3.8722695281592485, "learning_rate": 5.294553263968601e-06, "loss": 1.1469, "step": 14300 }, { "epoch": 0.5332000333250021, "grad_norm": 4.66275202781423, "learning_rate": 5.331580701299663e-06, "loss": 1.1442, "step": 14400 }, { "epoch": 0.5369028113342035, "grad_norm": 4.6182096996448845, "learning_rate": 5.3686081386307255e-06, "loss": 1.1543, "step": 14500 }, { "epoch": 0.5406055893434049, "grad_norm": 3.9837337174635103, "learning_rate": 5.4056355759617885e-06, "loss": 1.158, "step": 14600 }, { "epoch": 0.5443083673526063, "grad_norm": 3.8034049725872356, "learning_rate": 5.442663013292851e-06, "loss": 1.1546, "step": 14700 }, { "epoch": 0.5480111453618077, "grad_norm": 4.62083026318626, "learning_rate": 5.479690450623913e-06, "loss": 1.1553, "step": 14800 }, { "epoch": 0.5517139233710091, "grad_norm": 4.462803142186011, "learning_rate": 5.516717887954976e-06, "loss": 1.1506, "step": 14900 }, { "epoch": 0.5554167013802105, "grad_norm": 4.165198766777481, "learning_rate": 5.553745325286038e-06, "loss": 1.147, "step": 15000 }, { "epoch": 0.5591194793894119, "grad_norm": 3.5365327477483297, "learning_rate": 5.590772762617099e-06, "loss": 1.1341, "step": 15100 }, { "epoch": 0.5628222573986134, "grad_norm": 3.8324766687461653, "learning_rate": 5.627800199948162e-06, "loss": 1.1123, "step": 15200 }, { "epoch": 0.5665250354078147, "grad_norm": 4.271061462203587, "learning_rate": 5.664827637279224e-06, "loss": 1.1212, "step": 15300 }, { "epoch": 0.5702278134170161, "grad_norm": 4.5272229115194875, "learning_rate": 5.701855074610286e-06, "loss": 1.1438, "step": 15400 }, { "epoch": 0.5739305914262175, "grad_norm": 4.660071317336288, "learning_rate": 5.738882511941349e-06, "loss": 1.1174, "step": 15500 }, { "epoch": 0.5776333694354189, "grad_norm": 3.8174601055554094, "learning_rate": 5.775909949272411e-06, "loss": 1.095, "step": 15600 }, { "epoch": 0.5813361474446204, "grad_norm": 3.9772037532519784, "learning_rate": 5.8129373866034734e-06, "loss": 1.0901, "step": 15700 }, { "epoch": 0.5850389254538217, "grad_norm": 4.117368390246338, "learning_rate": 5.849964823934536e-06, "loss": 1.1218, "step": 15800 }, { "epoch": 0.5887417034630231, "grad_norm": 3.6969429985121387, "learning_rate": 5.8869922612655985e-06, "loss": 1.1172, "step": 15900 }, { "epoch": 0.5924444814722245, "grad_norm": 4.208647101328797, "learning_rate": 5.924019698596661e-06, "loss": 1.1005, "step": 16000 }, { "epoch": 0.596147259481426, "grad_norm": 4.0291894246178455, "learning_rate": 5.961047135927724e-06, "loss": 1.1129, "step": 16100 }, { "epoch": 0.5998500374906274, "grad_norm": 4.713616701400172, "learning_rate": 5.998074573258785e-06, "loss": 1.1002, "step": 16200 }, { "epoch": 0.6035528154998288, "grad_norm": 4.1930372418205355, "learning_rate": 6.035102010589847e-06, "loss": 1.0886, "step": 16300 }, { "epoch": 0.6072555935090301, "grad_norm": 4.254442138954682, "learning_rate": 6.07212944792091e-06, "loss": 1.0954, "step": 16400 }, { "epoch": 0.6109583715182315, "grad_norm": 3.7190710738003014, "learning_rate": 6.109156885251972e-06, "loss": 1.1288, "step": 16500 }, { "epoch": 0.614661149527433, "grad_norm": 5.286326819627371, "learning_rate": 6.146184322583034e-06, "loss": 1.0861, "step": 16600 }, { "epoch": 0.6183639275366344, "grad_norm": 3.5036265705416, "learning_rate": 6.183211759914097e-06, "loss": 1.1086, "step": 16700 }, { "epoch": 0.6220667055458358, "grad_norm": 3.5111130437153495, "learning_rate": 6.220239197245159e-06, "loss": 1.1022, "step": 16800 }, { "epoch": 0.6257694835550371, "grad_norm": 3.2896238161080946, "learning_rate": 6.257266634576221e-06, "loss": 1.0955, "step": 16900 }, { "epoch": 0.6294722615642385, "grad_norm": 3.6540403667854604, "learning_rate": 6.294294071907284e-06, "loss": 1.058, "step": 17000 }, { "epoch": 0.63317503957344, "grad_norm": 4.4670975803638475, "learning_rate": 6.3313215092383464e-06, "loss": 1.068, "step": 17100 }, { "epoch": 0.6368778175826414, "grad_norm": 4.530195581158383, "learning_rate": 6.3683489465694085e-06, "loss": 1.1105, "step": 17200 }, { "epoch": 0.6405805955918428, "grad_norm": 3.8288830335210995, "learning_rate": 6.4053763839004715e-06, "loss": 1.0332, "step": 17300 }, { "epoch": 0.6442833736010442, "grad_norm": 3.815833342915358, "learning_rate": 6.442403821231533e-06, "loss": 1.0794, "step": 17400 }, { "epoch": 0.6479861516102456, "grad_norm": 4.604021991413655, "learning_rate": 6.479431258562595e-06, "loss": 1.079, "step": 17500 }, { "epoch": 0.651688929619447, "grad_norm": 3.577028223699294, "learning_rate": 6.516458695893658e-06, "loss": 1.0653, "step": 17600 }, { "epoch": 0.6553917076286484, "grad_norm": 4.111818385244818, "learning_rate": 6.55348613322472e-06, "loss": 1.035, "step": 17700 }, { "epoch": 0.6590944856378498, "grad_norm": 3.56811764186164, "learning_rate": 6.590513570555782e-06, "loss": 1.0452, "step": 17800 }, { "epoch": 0.6627972636470512, "grad_norm": 3.87646009188637, "learning_rate": 6.627541007886845e-06, "loss": 1.0493, "step": 17900 }, { "epoch": 0.6665000416562527, "grad_norm": 3.406407594729227, "learning_rate": 6.664568445217907e-06, "loss": 1.0651, "step": 18000 }, { "epoch": 0.670202819665454, "grad_norm": 2.866315730154109, "learning_rate": 6.701595882548969e-06, "loss": 1.0561, "step": 18100 }, { "epoch": 0.6739055976746554, "grad_norm": 4.376427115975032, "learning_rate": 6.738623319880032e-06, "loss": 1.0602, "step": 18200 }, { "epoch": 0.6776083756838568, "grad_norm": 3.2373014543024072, "learning_rate": 6.775650757211094e-06, "loss": 1.0397, "step": 18300 }, { "epoch": 0.6813111536930582, "grad_norm": 3.9666084272474245, "learning_rate": 6.8126781945421565e-06, "loss": 1.0445, "step": 18400 }, { "epoch": 0.6850139317022597, "grad_norm": 3.800460648153449, "learning_rate": 6.8497056318732194e-06, "loss": 1.0494, "step": 18500 }, { "epoch": 0.688716709711461, "grad_norm": 4.274139967670289, "learning_rate": 6.886733069204281e-06, "loss": 1.0756, "step": 18600 }, { "epoch": 0.6924194877206624, "grad_norm": 3.9994897603573665, "learning_rate": 6.923760506535343e-06, "loss": 1.0412, "step": 18700 }, { "epoch": 0.6961222657298638, "grad_norm": 3.8633706482810553, "learning_rate": 6.960787943866406e-06, "loss": 1.0522, "step": 18800 }, { "epoch": 0.6998250437390653, "grad_norm": 4.381038145148409, "learning_rate": 6.997815381197468e-06, "loss": 1.0209, "step": 18900 }, { "epoch": 0.7035278217482667, "grad_norm": 4.1366815727300175, "learning_rate": 7.03484281852853e-06, "loss": 1.0351, "step": 19000 }, { "epoch": 0.707230599757468, "grad_norm": 3.675160954426471, "learning_rate": 7.071870255859593e-06, "loss": 1.0291, "step": 19100 }, { "epoch": 0.7109333777666694, "grad_norm": 3.603001585625626, "learning_rate": 7.108897693190655e-06, "loss": 1.0093, "step": 19200 }, { "epoch": 0.7146361557758708, "grad_norm": 3.264233985905883, "learning_rate": 7.145925130521717e-06, "loss": 1.0394, "step": 19300 }, { "epoch": 0.7183389337850723, "grad_norm": 3.8898393773576614, "learning_rate": 7.182952567852779e-06, "loss": 1.0309, "step": 19400 }, { "epoch": 0.7220417117942737, "grad_norm": 3.2365236970202917, "learning_rate": 7.219980005183842e-06, "loss": 1.0304, "step": 19500 }, { "epoch": 0.7257444898034751, "grad_norm": 3.497985907223146, "learning_rate": 7.257007442514904e-06, "loss": 1.0375, "step": 19600 }, { "epoch": 0.7294472678126764, "grad_norm": 3.6359487776078714, "learning_rate": 7.294034879845966e-06, "loss": 1.0132, "step": 19700 }, { "epoch": 0.7331500458218778, "grad_norm": 3.9379821448814343, "learning_rate": 7.331062317177029e-06, "loss": 1.019, "step": 19800 }, { "epoch": 0.7368528238310793, "grad_norm": 4.750912299244358, "learning_rate": 7.368089754508091e-06, "loss": 1.0172, "step": 19900 }, { "epoch": 0.7405556018402807, "grad_norm": 4.253487251656156, "learning_rate": 7.405117191839153e-06, "loss": 1.0093, "step": 20000 }, { "epoch": 0.7442583798494821, "grad_norm": 3.5926201949515284, "learning_rate": 7.442144629170216e-06, "loss": 1.0163, "step": 20100 }, { "epoch": 0.7479611578586834, "grad_norm": 3.8927981125403073, "learning_rate": 7.479172066501278e-06, "loss": 0.9827, "step": 20200 }, { "epoch": 0.7516639358678849, "grad_norm": 4.573750708649385, "learning_rate": 7.51619950383234e-06, "loss": 1.0087, "step": 20300 }, { "epoch": 0.7553667138770863, "grad_norm": 3.1260071677382952, "learning_rate": 7.553226941163403e-06, "loss": 1.0275, "step": 20400 }, { "epoch": 0.7590694918862877, "grad_norm": 3.965170800530151, "learning_rate": 7.590254378494465e-06, "loss": 1.0132, "step": 20500 }, { "epoch": 0.7627722698954891, "grad_norm": 3.1267324185244876, "learning_rate": 7.627281815825527e-06, "loss": 0.9803, "step": 20600 }, { "epoch": 0.7664750479046905, "grad_norm": 3.4086206206945358, "learning_rate": 7.66430925315659e-06, "loss": 0.9954, "step": 20700 }, { "epoch": 0.770177825913892, "grad_norm": 4.254989394844253, "learning_rate": 7.701336690487652e-06, "loss": 0.9922, "step": 20800 }, { "epoch": 0.7738806039230933, "grad_norm": 3.7295802263455564, "learning_rate": 7.738364127818714e-06, "loss": 1.0004, "step": 20900 }, { "epoch": 0.7775833819322947, "grad_norm": 4.671822378080011, "learning_rate": 7.775391565149777e-06, "loss": 0.9887, "step": 21000 }, { "epoch": 0.7812861599414961, "grad_norm": 3.941447202712077, "learning_rate": 7.81241900248084e-06, "loss": 0.982, "step": 21100 }, { "epoch": 0.7849889379506975, "grad_norm": 3.6530756566069633, "learning_rate": 7.8494464398119e-06, "loss": 0.9668, "step": 21200 }, { "epoch": 0.788691715959899, "grad_norm": 3.9080413448774625, "learning_rate": 7.886473877142964e-06, "loss": 1.0055, "step": 21300 }, { "epoch": 0.7923944939691003, "grad_norm": 3.603292889224154, "learning_rate": 7.923501314474025e-06, "loss": 1.0039, "step": 21400 }, { "epoch": 0.7960972719783017, "grad_norm": 3.677324652882952, "learning_rate": 7.960528751805088e-06, "loss": 1.0226, "step": 21500 }, { "epoch": 0.7998000499875031, "grad_norm": 3.6513050899215056, "learning_rate": 7.997556189136151e-06, "loss": 0.9766, "step": 21600 }, { "epoch": 0.8035028279967046, "grad_norm": 3.357793359434031, "learning_rate": 8.034583626467212e-06, "loss": 0.9813, "step": 21700 }, { "epoch": 0.807205606005906, "grad_norm": 2.760916399152151, "learning_rate": 8.071611063798275e-06, "loss": 0.979, "step": 21800 }, { "epoch": 0.8109083840151073, "grad_norm": 3.6887561411257046, "learning_rate": 8.108638501129338e-06, "loss": 0.9601, "step": 21900 }, { "epoch": 0.8146111620243087, "grad_norm": 3.652784111767623, "learning_rate": 8.1456659384604e-06, "loss": 0.9799, "step": 22000 }, { "epoch": 0.8183139400335101, "grad_norm": 3.479191284518454, "learning_rate": 8.182693375791462e-06, "loss": 0.9785, "step": 22100 }, { "epoch": 0.8220167180427116, "grad_norm": 4.128090009925586, "learning_rate": 8.219720813122525e-06, "loss": 0.9841, "step": 22200 }, { "epoch": 0.825719496051913, "grad_norm": 3.662863541747893, "learning_rate": 8.256748250453587e-06, "loss": 0.947, "step": 22300 }, { "epoch": 0.8294222740611144, "grad_norm": 3.217491905494941, "learning_rate": 8.29377568778465e-06, "loss": 0.9757, "step": 22400 }, { "epoch": 0.8331250520703157, "grad_norm": 3.326399094958347, "learning_rate": 8.330803125115712e-06, "loss": 0.9598, "step": 22500 }, { "epoch": 0.8368278300795171, "grad_norm": 2.909529562627351, "learning_rate": 8.367830562446774e-06, "loss": 0.9587, "step": 22600 }, { "epoch": 0.8405306080887186, "grad_norm": 3.1644941957196346, "learning_rate": 8.404857999777835e-06, "loss": 0.9689, "step": 22700 }, { "epoch": 0.84423338609792, "grad_norm": 3.152721404570826, "learning_rate": 8.441885437108898e-06, "loss": 0.9822, "step": 22800 }, { "epoch": 0.8479361641071214, "grad_norm": 3.879640971417378, "learning_rate": 8.478912874439961e-06, "loss": 0.9379, "step": 22900 }, { "epoch": 0.8516389421163227, "grad_norm": 4.432110940230918, "learning_rate": 8.515940311771022e-06, "loss": 0.9544, "step": 23000 }, { "epoch": 0.8553417201255242, "grad_norm": 3.2420578574260697, "learning_rate": 8.552967749102085e-06, "loss": 0.9292, "step": 23100 }, { "epoch": 0.8590444981347256, "grad_norm": 4.617591314029601, "learning_rate": 8.589995186433148e-06, "loss": 0.9477, "step": 23200 }, { "epoch": 0.862747276143927, "grad_norm": 3.0259937163445194, "learning_rate": 8.62702262376421e-06, "loss": 0.9559, "step": 23300 }, { "epoch": 0.8664500541531284, "grad_norm": 3.278192583185341, "learning_rate": 8.664050061095272e-06, "loss": 0.9583, "step": 23400 }, { "epoch": 0.8701528321623297, "grad_norm": 3.6509553777490424, "learning_rate": 8.701077498426335e-06, "loss": 0.9749, "step": 23500 }, { "epoch": 0.8738556101715312, "grad_norm": 3.4075169195241677, "learning_rate": 8.738104935757397e-06, "loss": 0.9223, "step": 23600 }, { "epoch": 0.8775583881807326, "grad_norm": 2.893435896152694, "learning_rate": 8.77513237308846e-06, "loss": 0.9721, "step": 23700 }, { "epoch": 0.881261166189934, "grad_norm": 4.064038243050667, "learning_rate": 8.81215981041952e-06, "loss": 0.9648, "step": 23800 }, { "epoch": 0.8849639441991354, "grad_norm": 3.28397127001984, "learning_rate": 8.849187247750584e-06, "loss": 0.9413, "step": 23900 }, { "epoch": 0.8886667222083368, "grad_norm": 2.7396572790329183, "learning_rate": 8.886214685081647e-06, "loss": 0.9538, "step": 24000 }, { "epoch": 0.8923695002175382, "grad_norm": 3.831935799094677, "learning_rate": 8.923242122412708e-06, "loss": 0.9488, "step": 24100 }, { "epoch": 0.8960722782267396, "grad_norm": 3.083787982483977, "learning_rate": 8.960269559743771e-06, "loss": 0.9798, "step": 24200 }, { "epoch": 0.899775056235941, "grad_norm": 3.2638182611068784, "learning_rate": 8.997296997074834e-06, "loss": 0.9364, "step": 24300 }, { "epoch": 0.9034778342451424, "grad_norm": 2.8271807284549824, "learning_rate": 9.034324434405895e-06, "loss": 0.9592, "step": 24400 }, { "epoch": 0.9071806122543439, "grad_norm": 3.1878575837021295, "learning_rate": 9.071351871736958e-06, "loss": 0.944, "step": 24500 }, { "epoch": 0.9108833902635453, "grad_norm": 3.344639834530028, "learning_rate": 9.108379309068021e-06, "loss": 0.9305, "step": 24600 }, { "epoch": 0.9145861682727466, "grad_norm": 3.271225972067076, "learning_rate": 9.145406746399082e-06, "loss": 0.9334, "step": 24700 }, { "epoch": 0.918288946281948, "grad_norm": 3.7284762668702314, "learning_rate": 9.182434183730145e-06, "loss": 0.9195, "step": 24800 }, { "epoch": 0.9219917242911494, "grad_norm": 4.098726803359245, "learning_rate": 9.219461621061208e-06, "loss": 0.9398, "step": 24900 }, { "epoch": 0.9256945023003509, "grad_norm": 2.8965683319135795, "learning_rate": 9.25648905839227e-06, "loss": 0.9365, "step": 25000 }, { "epoch": 0.9293972803095523, "grad_norm": 3.472751774536713, "learning_rate": 9.29351649572333e-06, "loss": 0.942, "step": 25100 }, { "epoch": 0.9331000583187536, "grad_norm": 2.741682625283635, "learning_rate": 9.330543933054394e-06, "loss": 0.9463, "step": 25200 }, { "epoch": 0.936802836327955, "grad_norm": 2.9695965336662584, "learning_rate": 9.367571370385457e-06, "loss": 0.9355, "step": 25300 }, { "epoch": 0.9405056143371564, "grad_norm": 3.0507053957289814, "learning_rate": 9.404598807716518e-06, "loss": 0.9123, "step": 25400 }, { "epoch": 0.9442083923463579, "grad_norm": 3.997694772330171, "learning_rate": 9.441626245047581e-06, "loss": 0.9326, "step": 25500 }, { "epoch": 0.9479111703555593, "grad_norm": 3.006394895446821, "learning_rate": 9.478653682378644e-06, "loss": 0.9343, "step": 25600 }, { "epoch": 0.9516139483647607, "grad_norm": 3.030113997176904, "learning_rate": 9.515681119709705e-06, "loss": 0.9263, "step": 25700 }, { "epoch": 0.955316726373962, "grad_norm": 2.9203358557009187, "learning_rate": 9.552708557040768e-06, "loss": 0.9432, "step": 25800 }, { "epoch": 0.9590195043831635, "grad_norm": 3.320595109219397, "learning_rate": 9.589735994371831e-06, "loss": 0.9132, "step": 25900 }, { "epoch": 0.9627222823923649, "grad_norm": 3.2953825381075883, "learning_rate": 9.626763431702892e-06, "loss": 0.945, "step": 26000 }, { "epoch": 0.9664250604015663, "grad_norm": 2.8690624980388097, "learning_rate": 9.663790869033955e-06, "loss": 0.9269, "step": 26100 }, { "epoch": 0.9701278384107677, "grad_norm": 3.238448651703217, "learning_rate": 9.700818306365017e-06, "loss": 0.9223, "step": 26200 }, { "epoch": 0.973830616419969, "grad_norm": 3.8191924390794076, "learning_rate": 9.73784574369608e-06, "loss": 0.9294, "step": 26300 }, { "epoch": 0.9775333944291705, "grad_norm": 2.89593322378588, "learning_rate": 9.774873181027143e-06, "loss": 0.92, "step": 26400 }, { "epoch": 0.9812361724383719, "grad_norm": 3.701573808189793, "learning_rate": 9.811900618358204e-06, "loss": 0.9471, "step": 26500 }, { "epoch": 0.9849389504475733, "grad_norm": 3.334857306171596, "learning_rate": 9.848928055689267e-06, "loss": 0.9205, "step": 26600 }, { "epoch": 0.9886417284567747, "grad_norm": 3.3187413615000705, "learning_rate": 9.88595549302033e-06, "loss": 0.9065, "step": 26700 }, { "epoch": 0.992344506465976, "grad_norm": 3.3451918803521945, "learning_rate": 9.922982930351391e-06, "loss": 0.8933, "step": 26800 }, { "epoch": 0.9960472844751775, "grad_norm": 3.2100420698017147, "learning_rate": 9.960010367682454e-06, "loss": 0.9112, "step": 26900 }, { "epoch": 0.9997500624843789, "grad_norm": 2.837918388436053, "learning_rate": 9.997037805013515e-06, "loss": 0.9567, "step": 27000 }, { "epoch": 1.0034435835485573, "grad_norm": 2.9548645107416087, "learning_rate": 9.99999646509579e-06, "loss": 0.8909, "step": 27100 }, { "epoch": 1.0071463615577587, "grad_norm": 3.1446751198623435, "learning_rate": 9.999984604128388e-06, "loss": 0.8726, "step": 27200 }, { "epoch": 1.01084913956696, "grad_norm": 2.976258203122292, "learning_rate": 9.999964390388652e-06, "loss": 0.893, "step": 27300 }, { "epoch": 1.0145519175761615, "grad_norm": 2.980666927786789, "learning_rate": 9.999935823910352e-06, "loss": 0.8665, "step": 27400 }, { "epoch": 1.0182546955853629, "grad_norm": 3.1487567474786795, "learning_rate": 9.999898904741209e-06, "loss": 0.8649, "step": 27500 }, { "epoch": 1.0219574735945642, "grad_norm": 2.629105326617924, "learning_rate": 9.999853632942897e-06, "loss": 0.8607, "step": 27600 }, { "epoch": 1.0256602516037656, "grad_norm": 3.8572529888050826, "learning_rate": 9.999800008591049e-06, "loss": 0.8761, "step": 27700 }, { "epoch": 1.0293630296129672, "grad_norm": 2.767732632774148, "learning_rate": 9.999738031775246e-06, "loss": 0.8778, "step": 27800 }, { "epoch": 1.0330658076221686, "grad_norm": 3.4506007181787606, "learning_rate": 9.99966770259902e-06, "loss": 0.8842, "step": 27900 }, { "epoch": 1.03676858563137, "grad_norm": 3.3838219329729764, "learning_rate": 9.999589021179867e-06, "loss": 0.8653, "step": 28000 }, { "epoch": 1.0404713636405714, "grad_norm": 2.7380768463734673, "learning_rate": 9.999501987649225e-06, "loss": 0.8715, "step": 28100 }, { "epoch": 1.0441741416497727, "grad_norm": 2.5724850888468818, "learning_rate": 9.999406602152487e-06, "loss": 0.873, "step": 28200 }, { "epoch": 1.0478769196589741, "grad_norm": 2.814547694838622, "learning_rate": 9.999302864849006e-06, "loss": 0.8652, "step": 28300 }, { "epoch": 1.0515796976681755, "grad_norm": 2.8671305113874985, "learning_rate": 9.999190775912075e-06, "loss": 0.8773, "step": 28400 }, { "epoch": 1.0552824756773769, "grad_norm": 3.4364162307062016, "learning_rate": 9.999070335528951e-06, "loss": 0.8722, "step": 28500 }, { "epoch": 1.0589852536865783, "grad_norm": 3.1668831518960747, "learning_rate": 9.99894154390083e-06, "loss": 0.878, "step": 28600 }, { "epoch": 1.0626880316957799, "grad_norm": 2.5661208890092215, "learning_rate": 9.998804401242874e-06, "loss": 0.8642, "step": 28700 }, { "epoch": 1.0663908097049812, "grad_norm": 2.702695496460383, "learning_rate": 9.998658907784183e-06, "loss": 0.8576, "step": 28800 }, { "epoch": 1.0700935877141826, "grad_norm": 2.9176557901328666, "learning_rate": 9.998505063767811e-06, "loss": 0.8705, "step": 28900 }, { "epoch": 1.073796365723384, "grad_norm": 3.7212751721205937, "learning_rate": 9.998342869450767e-06, "loss": 0.8641, "step": 29000 }, { "epoch": 1.0774991437325854, "grad_norm": 3.1124898105603767, "learning_rate": 9.998172325104007e-06, "loss": 0.8679, "step": 29100 }, { "epoch": 1.0812019217417868, "grad_norm": 3.211449706725418, "learning_rate": 9.997993431012433e-06, "loss": 0.8698, "step": 29200 }, { "epoch": 1.0849046997509881, "grad_norm": 3.3735417926416105, "learning_rate": 9.997806187474899e-06, "loss": 0.8593, "step": 29300 }, { "epoch": 1.0886074777601895, "grad_norm": 3.100346897510759, "learning_rate": 9.997610594804206e-06, "loss": 0.8852, "step": 29400 }, { "epoch": 1.092310255769391, "grad_norm": 3.046486604721806, "learning_rate": 9.997406653327103e-06, "loss": 0.8734, "step": 29500 }, { "epoch": 1.0960130337785925, "grad_norm": 3.462276437104938, "learning_rate": 9.99719436338429e-06, "loss": 0.8333, "step": 29600 }, { "epoch": 1.0997158117877939, "grad_norm": 2.8629449765918564, "learning_rate": 9.996973725330405e-06, "loss": 0.8584, "step": 29700 }, { "epoch": 1.1034185897969953, "grad_norm": 2.9165610257153873, "learning_rate": 9.996744739534042e-06, "loss": 0.8665, "step": 29800 }, { "epoch": 1.1071213678061966, "grad_norm": 3.1796750168940315, "learning_rate": 9.996507406377728e-06, "loss": 0.8787, "step": 29900 }, { "epoch": 1.110824145815398, "grad_norm": 3.160592130889014, "learning_rate": 9.99626172625795e-06, "loss": 0.8718, "step": 30000 }, { "epoch": 1.1145269238245994, "grad_norm": 2.779237960939386, "learning_rate": 9.99600769958513e-06, "loss": 0.878, "step": 30100 }, { "epoch": 1.1182297018338008, "grad_norm": 3.108400228708763, "learning_rate": 9.995745326783628e-06, "loss": 0.868, "step": 30200 }, { "epoch": 1.1219324798430022, "grad_norm": 3.0911030432916817, "learning_rate": 9.995474608291761e-06, "loss": 0.8621, "step": 30300 }, { "epoch": 1.1256352578522035, "grad_norm": 2.5787433018725, "learning_rate": 9.995195544561778e-06, "loss": 0.8754, "step": 30400 }, { "epoch": 1.1293380358614051, "grad_norm": 2.8342296943136165, "learning_rate": 9.994908136059868e-06, "loss": 0.8373, "step": 30500 }, { "epoch": 1.1330408138706065, "grad_norm": 2.476461823350524, "learning_rate": 9.994612383266171e-06, "loss": 0.842, "step": 30600 }, { "epoch": 1.136743591879808, "grad_norm": 3.9271884057807296, "learning_rate": 9.994308286674754e-06, "loss": 0.8453, "step": 30700 }, { "epoch": 1.1404463698890093, "grad_norm": 3.1786946065406236, "learning_rate": 9.99399584679363e-06, "loss": 0.8648, "step": 30800 }, { "epoch": 1.1441491478982106, "grad_norm": 2.634901131802063, "learning_rate": 9.99367506414475e-06, "loss": 0.8751, "step": 30900 }, { "epoch": 1.147851925907412, "grad_norm": 3.078376805123231, "learning_rate": 9.993345939264e-06, "loss": 0.8635, "step": 31000 }, { "epoch": 1.1515547039166134, "grad_norm": 3.1528853117678786, "learning_rate": 9.9930084727012e-06, "loss": 0.843, "step": 31100 }, { "epoch": 1.1552574819258148, "grad_norm": 2.828228109125317, "learning_rate": 9.992662665020112e-06, "loss": 0.8624, "step": 31200 }, { "epoch": 1.1589602599350162, "grad_norm": 3.1953124851506707, "learning_rate": 9.992308516798426e-06, "loss": 0.8579, "step": 31300 }, { "epoch": 1.1626630379442178, "grad_norm": 3.0902899613744603, "learning_rate": 9.991946028627768e-06, "loss": 0.8527, "step": 31400 }, { "epoch": 1.1663658159534191, "grad_norm": 2.9463681925783023, "learning_rate": 9.991575201113695e-06, "loss": 0.8268, "step": 31500 }, { "epoch": 1.1700685939626205, "grad_norm": 2.8044002498862057, "learning_rate": 9.991196034875698e-06, "loss": 0.8395, "step": 31600 }, { "epoch": 1.173771371971822, "grad_norm": 2.9461135183049936, "learning_rate": 9.990808530547197e-06, "loss": 0.858, "step": 31700 }, { "epoch": 1.1774741499810233, "grad_norm": 3.213674861669168, "learning_rate": 9.990412688775542e-06, "loss": 0.864, "step": 31800 }, { "epoch": 1.1811769279902247, "grad_norm": 2.71190688635739, "learning_rate": 9.99000851022201e-06, "loss": 0.855, "step": 31900 }, { "epoch": 1.184879705999426, "grad_norm": 3.723777231794139, "learning_rate": 9.9895959955618e-06, "loss": 0.8456, "step": 32000 }, { "epoch": 1.1885824840086274, "grad_norm": 2.4622343303272918, "learning_rate": 9.989175145484049e-06, "loss": 0.8217, "step": 32100 }, { "epoch": 1.1922852620178288, "grad_norm": 2.8875388301298472, "learning_rate": 9.98874596069181e-06, "loss": 0.8591, "step": 32200 }, { "epoch": 1.1959880400270302, "grad_norm": 2.5910572126310716, "learning_rate": 9.988308441902061e-06, "loss": 0.8453, "step": 32300 }, { "epoch": 1.1996908180362316, "grad_norm": 2.4069698963541755, "learning_rate": 9.987862589845703e-06, "loss": 0.8503, "step": 32400 }, { "epoch": 1.2033935960454332, "grad_norm": 2.914526087822122, "learning_rate": 9.987408405267561e-06, "loss": 0.8668, "step": 32500 }, { "epoch": 1.2070963740546345, "grad_norm": 2.8077292984671485, "learning_rate": 9.986945888926374e-06, "loss": 0.8314, "step": 32600 }, { "epoch": 1.210799152063836, "grad_norm": 3.6704712964311437, "learning_rate": 9.986475041594805e-06, "loss": 0.8371, "step": 32700 }, { "epoch": 1.2145019300730373, "grad_norm": 2.6706897230097297, "learning_rate": 9.985995864059433e-06, "loss": 0.876, "step": 32800 }, { "epoch": 1.2182047080822387, "grad_norm": 3.0940143448561037, "learning_rate": 9.98550835712075e-06, "loss": 0.8364, "step": 32900 }, { "epoch": 1.22190748609144, "grad_norm": 3.6081252765429963, "learning_rate": 9.98501252159317e-06, "loss": 0.8378, "step": 33000 }, { "epoch": 1.2256102641006414, "grad_norm": 2.77425534329751, "learning_rate": 9.984508358305012e-06, "loss": 0.8449, "step": 33100 }, { "epoch": 1.2293130421098428, "grad_norm": 2.847973382987711, "learning_rate": 9.98399586809851e-06, "loss": 0.8446, "step": 33200 }, { "epoch": 1.2330158201190442, "grad_norm": 3.1916476231654984, "learning_rate": 9.983475051829814e-06, "loss": 0.8499, "step": 33300 }, { "epoch": 1.2367185981282458, "grad_norm": 3.0278042743633047, "learning_rate": 9.982945910368974e-06, "loss": 0.8427, "step": 33400 }, { "epoch": 1.2404213761374472, "grad_norm": 2.9987823840994685, "learning_rate": 9.982408444599955e-06, "loss": 0.8565, "step": 33500 }, { "epoch": 1.2441241541466486, "grad_norm": 2.956201689254189, "learning_rate": 9.981862655420626e-06, "loss": 0.8303, "step": 33600 }, { "epoch": 1.24782693215585, "grad_norm": 3.306173716101804, "learning_rate": 9.981308543742759e-06, "loss": 0.8351, "step": 33700 }, { "epoch": 1.2515297101650513, "grad_norm": 2.8358161347669624, "learning_rate": 9.98074611049203e-06, "loss": 0.854, "step": 33800 }, { "epoch": 1.2552324881742527, "grad_norm": 3.2862147001432263, "learning_rate": 9.980175356608018e-06, "loss": 0.8176, "step": 33900 }, { "epoch": 1.258935266183454, "grad_norm": 3.839933772493448, "learning_rate": 9.979596283044202e-06, "loss": 0.8353, "step": 34000 }, { "epoch": 1.2626380441926555, "grad_norm": 3.039521277363643, "learning_rate": 9.979008890767958e-06, "loss": 0.8313, "step": 34100 }, { "epoch": 1.2663408222018568, "grad_norm": 2.670418682490729, "learning_rate": 9.97841318076056e-06, "loss": 0.8299, "step": 34200 }, { "epoch": 1.2700436002110584, "grad_norm": 2.6610287960828947, "learning_rate": 9.977809154017177e-06, "loss": 0.8255, "step": 34300 }, { "epoch": 1.2737463782202598, "grad_norm": 4.255372007943821, "learning_rate": 9.977196811546874e-06, "loss": 0.8178, "step": 34400 }, { "epoch": 1.2774491562294612, "grad_norm": 2.486491204040578, "learning_rate": 9.976576154372603e-06, "loss": 0.8131, "step": 34500 }, { "epoch": 1.2811519342386626, "grad_norm": 2.5063224331855967, "learning_rate": 9.975947183531208e-06, "loss": 0.8425, "step": 34600 }, { "epoch": 1.284854712247864, "grad_norm": 2.7512179307220226, "learning_rate": 9.975309900073424e-06, "loss": 0.8593, "step": 34700 }, { "epoch": 1.2885574902570653, "grad_norm": 2.712022237784725, "learning_rate": 9.974664305063872e-06, "loss": 0.8217, "step": 34800 }, { "epoch": 1.2922602682662667, "grad_norm": 2.5781139265649213, "learning_rate": 9.974010399581056e-06, "loss": 0.8009, "step": 34900 }, { "epoch": 1.295963046275468, "grad_norm": 2.3094975445159927, "learning_rate": 9.973348184717362e-06, "loss": 0.8441, "step": 35000 }, { "epoch": 1.2996658242846695, "grad_norm": 3.222306020034265, "learning_rate": 9.972677661579062e-06, "loss": 0.8453, "step": 35100 }, { "epoch": 1.303368602293871, "grad_norm": 2.4004480742086383, "learning_rate": 9.971998831286305e-06, "loss": 0.8352, "step": 35200 }, { "epoch": 1.3070713803030725, "grad_norm": 2.9242567540358193, "learning_rate": 9.971311694973115e-06, "loss": 0.8251, "step": 35300 }, { "epoch": 1.3107741583122738, "grad_norm": 3.3760497497529234, "learning_rate": 9.970616253787394e-06, "loss": 0.8212, "step": 35400 }, { "epoch": 1.3144769363214752, "grad_norm": 2.738484575208949, "learning_rate": 9.969912508890924e-06, "loss": 0.8338, "step": 35500 }, { "epoch": 1.3181797143306766, "grad_norm": 2.403858688871253, "learning_rate": 9.969200461459344e-06, "loss": 0.8051, "step": 35600 }, { "epoch": 1.321882492339878, "grad_norm": 3.317873477816687, "learning_rate": 9.96848011268218e-06, "loss": 0.8275, "step": 35700 }, { "epoch": 1.3255852703490794, "grad_norm": 3.0383409211764465, "learning_rate": 9.967751463762811e-06, "loss": 0.8102, "step": 35800 }, { "epoch": 1.3292880483582807, "grad_norm": 2.716682345656308, "learning_rate": 9.967014515918491e-06, "loss": 0.7922, "step": 35900 }, { "epoch": 1.3329908263674821, "grad_norm": 2.5903044471345407, "learning_rate": 9.966269270380338e-06, "loss": 0.8152, "step": 36000 }, { "epoch": 1.3366936043766837, "grad_norm": 2.6042198988611505, "learning_rate": 9.965515728393324e-06, "loss": 0.816, "step": 36100 }, { "epoch": 1.3403963823858849, "grad_norm": 2.693329966895918, "learning_rate": 9.96475389121629e-06, "loss": 0.8213, "step": 36200 }, { "epoch": 1.3440991603950865, "grad_norm": 2.8636639283082683, "learning_rate": 9.963983760121927e-06, "loss": 0.8028, "step": 36300 }, { "epoch": 1.3478019384042879, "grad_norm": 2.265142729976952, "learning_rate": 9.963205336396789e-06, "loss": 0.8312, "step": 36400 }, { "epoch": 1.3515047164134892, "grad_norm": 2.346991317901365, "learning_rate": 9.962418621341275e-06, "loss": 0.8057, "step": 36500 }, { "epoch": 1.3552074944226906, "grad_norm": 2.9365358115995988, "learning_rate": 9.961623616269642e-06, "loss": 0.811, "step": 36600 }, { "epoch": 1.358910272431892, "grad_norm": 2.765547820893004, "learning_rate": 9.960820322509991e-06, "loss": 0.8176, "step": 36700 }, { "epoch": 1.3626130504410934, "grad_norm": 2.242257331515756, "learning_rate": 9.960008741404278e-06, "loss": 0.8093, "step": 36800 }, { "epoch": 1.3663158284502948, "grad_norm": 2.6584008858920396, "learning_rate": 9.959188874308289e-06, "loss": 0.8128, "step": 36900 }, { "epoch": 1.3700186064594964, "grad_norm": 2.426868840194363, "learning_rate": 9.958360722591666e-06, "loss": 0.8356, "step": 37000 }, { "epoch": 1.3737213844686975, "grad_norm": 2.372175307387934, "learning_rate": 9.957524287637887e-06, "loss": 0.7955, "step": 37100 }, { "epoch": 1.3774241624778991, "grad_norm": 3.223986523587691, "learning_rate": 9.956679570844263e-06, "loss": 0.8446, "step": 37200 }, { "epoch": 1.3811269404871005, "grad_norm": 2.9758674723626495, "learning_rate": 9.955826573621947e-06, "loss": 0.8258, "step": 37300 }, { "epoch": 1.3848297184963019, "grad_norm": 2.4986387404446972, "learning_rate": 9.954965297395917e-06, "loss": 0.838, "step": 37400 }, { "epoch": 1.3885324965055033, "grad_norm": 3.0506103639317383, "learning_rate": 9.954095743604993e-06, "loss": 0.8106, "step": 37500 }, { "epoch": 1.3922352745147046, "grad_norm": 2.596803194782613, "learning_rate": 9.953217913701809e-06, "loss": 0.8101, "step": 37600 }, { "epoch": 1.395938052523906, "grad_norm": 3.0268925428493034, "learning_rate": 9.952331809152837e-06, "loss": 0.7984, "step": 37700 }, { "epoch": 1.3996408305331074, "grad_norm": 2.811749288978374, "learning_rate": 9.951437431438368e-06, "loss": 0.7956, "step": 37800 }, { "epoch": 1.403343608542309, "grad_norm": 2.5791340034648673, "learning_rate": 9.95053478205251e-06, "loss": 0.807, "step": 37900 }, { "epoch": 1.4070463865515102, "grad_norm": 2.73932100840186, "learning_rate": 9.949623862503194e-06, "loss": 0.8045, "step": 38000 }, { "epoch": 1.4107491645607118, "grad_norm": 2.4120952001387836, "learning_rate": 9.948704674312166e-06, "loss": 0.8062, "step": 38100 }, { "epoch": 1.4144519425699131, "grad_norm": 2.876732239954283, "learning_rate": 9.947777219014985e-06, "loss": 0.8153, "step": 38200 }, { "epoch": 1.4181547205791145, "grad_norm": 2.648889760862942, "learning_rate": 9.94684149816102e-06, "loss": 0.7769, "step": 38300 }, { "epoch": 1.421857498588316, "grad_norm": 2.3672035754478253, "learning_rate": 9.945897513313446e-06, "loss": 0.8248, "step": 38400 }, { "epoch": 1.4255602765975173, "grad_norm": 2.5897674265990966, "learning_rate": 9.944945266049249e-06, "loss": 0.8168, "step": 38500 }, { "epoch": 1.4292630546067187, "grad_norm": 2.3627727017427986, "learning_rate": 9.943984757959214e-06, "loss": 0.8061, "step": 38600 }, { "epoch": 1.43296583261592, "grad_norm": 2.75793352812743, "learning_rate": 9.943015990647928e-06, "loss": 0.8406, "step": 38700 }, { "epoch": 1.4366686106251216, "grad_norm": 2.93848471854443, "learning_rate": 9.942038965733772e-06, "loss": 0.8093, "step": 38800 }, { "epoch": 1.4403713886343228, "grad_norm": 2.649269836710229, "learning_rate": 9.941053684848927e-06, "loss": 0.8156, "step": 38900 }, { "epoch": 1.4440741666435244, "grad_norm": 2.8474414546277336, "learning_rate": 9.940060149639362e-06, "loss": 0.803, "step": 39000 }, { "epoch": 1.4477769446527258, "grad_norm": 2.48930011170331, "learning_rate": 9.939058361764835e-06, "loss": 0.8149, "step": 39100 }, { "epoch": 1.4514797226619272, "grad_norm": 3.0541359408620954, "learning_rate": 9.938048322898897e-06, "loss": 0.7905, "step": 39200 }, { "epoch": 1.4551825006711285, "grad_norm": 2.4906684578035634, "learning_rate": 9.937030034728875e-06, "loss": 0.7983, "step": 39300 }, { "epoch": 1.45888527868033, "grad_norm": 2.522647352158736, "learning_rate": 9.93600349895588e-06, "loss": 0.8257, "step": 39400 }, { "epoch": 1.4625880566895313, "grad_norm": 2.3593412219963636, "learning_rate": 9.934968717294801e-06, "loss": 0.8156, "step": 39500 }, { "epoch": 1.4662908346987327, "grad_norm": 2.557490073344118, "learning_rate": 9.933925691474306e-06, "loss": 0.8168, "step": 39600 }, { "epoch": 1.469993612707934, "grad_norm": 2.338179664285831, "learning_rate": 9.932874423236827e-06, "loss": 0.8037, "step": 39700 }, { "epoch": 1.4736963907171354, "grad_norm": 2.5845812397294106, "learning_rate": 9.931814914338574e-06, "loss": 0.8037, "step": 39800 }, { "epoch": 1.477399168726337, "grad_norm": 2.738120618908721, "learning_rate": 9.930747166549517e-06, "loss": 0.8248, "step": 39900 }, { "epoch": 1.4811019467355384, "grad_norm": 3.0001861261521077, "learning_rate": 9.929671181653393e-06, "loss": 0.8158, "step": 40000 }, { "epoch": 1.4848047247447398, "grad_norm": 2.8790789963198864, "learning_rate": 9.9285869614477e-06, "loss": 0.81, "step": 40100 }, { "epoch": 1.4885075027539412, "grad_norm": 2.3530303824789263, "learning_rate": 9.927494507743693e-06, "loss": 0.8065, "step": 40200 }, { "epoch": 1.4922102807631425, "grad_norm": 2.5001939120344563, "learning_rate": 9.926393822366378e-06, "loss": 0.7986, "step": 40300 }, { "epoch": 1.495913058772344, "grad_norm": 3.322122232875802, "learning_rate": 9.925284907154518e-06, "loss": 0.8018, "step": 40400 }, { "epoch": 1.4996158367815453, "grad_norm": 2.632512694974853, "learning_rate": 9.924167763960622e-06, "loss": 0.7926, "step": 40500 }, { "epoch": 1.503318614790747, "grad_norm": 3.298194101210266, "learning_rate": 9.923042394650944e-06, "loss": 0.8012, "step": 40600 }, { "epoch": 1.507021392799948, "grad_norm": 2.5336893450551714, "learning_rate": 9.921908801105478e-06, "loss": 0.7979, "step": 40700 }, { "epoch": 1.5107241708091497, "grad_norm": 2.805390982714785, "learning_rate": 9.920766985217964e-06, "loss": 0.8007, "step": 40800 }, { "epoch": 1.5144269488183508, "grad_norm": 2.6228261966166846, "learning_rate": 9.919616948895869e-06, "loss": 0.7925, "step": 40900 }, { "epoch": 1.5181297268275524, "grad_norm": 2.608871750206852, "learning_rate": 9.918458694060401e-06, "loss": 0.8165, "step": 41000 }, { "epoch": 1.5218325048367538, "grad_norm": 2.3331476004266802, "learning_rate": 9.917292222646494e-06, "loss": 0.812, "step": 41100 }, { "epoch": 1.5255352828459552, "grad_norm": 2.256474693417922, "learning_rate": 9.916117536602805e-06, "loss": 0.8252, "step": 41200 }, { "epoch": 1.5292380608551566, "grad_norm": 2.2698106482233444, "learning_rate": 9.914934637891717e-06, "loss": 0.8049, "step": 41300 }, { "epoch": 1.532940838864358, "grad_norm": 2.505148031562781, "learning_rate": 9.913743528489335e-06, "loss": 0.7945, "step": 41400 }, { "epoch": 1.5366436168735595, "grad_norm": 2.273130621446297, "learning_rate": 9.912544210385478e-06, "loss": 0.7592, "step": 41500 }, { "epoch": 1.5403463948827607, "grad_norm": 2.7980874710174746, "learning_rate": 9.911336685583678e-06, "loss": 0.788, "step": 41600 }, { "epoch": 1.5440491728919623, "grad_norm": 2.518652055633435, "learning_rate": 9.910120956101177e-06, "loss": 0.7985, "step": 41700 }, { "epoch": 1.5477519509011635, "grad_norm": 2.8908580237428727, "learning_rate": 9.908897023968923e-06, "loss": 0.8022, "step": 41800 }, { "epoch": 1.551454728910365, "grad_norm": 2.796915452230328, "learning_rate": 9.907664891231567e-06, "loss": 0.7891, "step": 41900 }, { "epoch": 1.5551575069195664, "grad_norm": 2.58199772952833, "learning_rate": 9.906424559947463e-06, "loss": 0.8127, "step": 42000 }, { "epoch": 1.5588602849287678, "grad_norm": 2.9876242988374795, "learning_rate": 9.905176032188657e-06, "loss": 0.8103, "step": 42100 }, { "epoch": 1.5625630629379692, "grad_norm": 2.5402919151901284, "learning_rate": 9.903919310040888e-06, "loss": 0.8088, "step": 42200 }, { "epoch": 1.5662658409471706, "grad_norm": 2.759850752080656, "learning_rate": 9.902654395603585e-06, "loss": 0.7802, "step": 42300 }, { "epoch": 1.569968618956372, "grad_norm": 2.303462256584801, "learning_rate": 9.901381290989866e-06, "loss": 0.814, "step": 42400 }, { "epoch": 1.5736713969655733, "grad_norm": 2.5606229953487007, "learning_rate": 9.900099998326524e-06, "loss": 0.8199, "step": 42500 }, { "epoch": 1.577374174974775, "grad_norm": 2.429988282841457, "learning_rate": 9.898810519754038e-06, "loss": 0.8119, "step": 42600 }, { "epoch": 1.581076952983976, "grad_norm": 2.5219967389765823, "learning_rate": 9.897512857426559e-06, "loss": 0.8047, "step": 42700 }, { "epoch": 1.5847797309931777, "grad_norm": 1.9311972103887236, "learning_rate": 9.896207013511906e-06, "loss": 0.7986, "step": 42800 }, { "epoch": 1.588482509002379, "grad_norm": 2.234707722695266, "learning_rate": 9.894892990191572e-06, "loss": 0.8208, "step": 42900 }, { "epoch": 1.5921852870115805, "grad_norm": 2.52585572604136, "learning_rate": 9.89357078966071e-06, "loss": 0.8055, "step": 43000 }, { "epoch": 1.5958880650207818, "grad_norm": 2.4607414449083564, "learning_rate": 9.892240414128134e-06, "loss": 0.814, "step": 43100 }, { "epoch": 1.5995908430299832, "grad_norm": 2.397978043007156, "learning_rate": 9.890901865816318e-06, "loss": 0.7858, "step": 43200 }, { "epoch": 1.6032936210391846, "grad_norm": 2.6825010522588464, "learning_rate": 9.889555146961386e-06, "loss": 0.7643, "step": 43300 }, { "epoch": 1.606996399048386, "grad_norm": 2.296924382807737, "learning_rate": 9.888200259813112e-06, "loss": 0.774, "step": 43400 }, { "epoch": 1.6106991770575876, "grad_norm": 3.086030898842717, "learning_rate": 9.886837206634913e-06, "loss": 0.7903, "step": 43500 }, { "epoch": 1.6144019550667887, "grad_norm": 2.1770814434757852, "learning_rate": 9.885465989703855e-06, "loss": 0.7992, "step": 43600 }, { "epoch": 1.6181047330759903, "grad_norm": 2.1173853625857677, "learning_rate": 9.884086611310636e-06, "loss": 0.7562, "step": 43700 }, { "epoch": 1.6218075110851915, "grad_norm": 2.6347540026530383, "learning_rate": 9.88269907375959e-06, "loss": 0.7812, "step": 43800 }, { "epoch": 1.625510289094393, "grad_norm": 2.3677734896044367, "learning_rate": 9.881303379368679e-06, "loss": 0.7949, "step": 43900 }, { "epoch": 1.6292130671035945, "grad_norm": 1.9541227462181452, "learning_rate": 9.879899530469495e-06, "loss": 0.8014, "step": 44000 }, { "epoch": 1.6329158451127959, "grad_norm": 2.250577456663235, "learning_rate": 9.878487529407252e-06, "loss": 0.789, "step": 44100 }, { "epoch": 1.6366186231219972, "grad_norm": 2.332159230411907, "learning_rate": 9.877067378540783e-06, "loss": 0.7583, "step": 44200 }, { "epoch": 1.6403214011311986, "grad_norm": 2.620339688070587, "learning_rate": 9.875639080242532e-06, "loss": 0.7609, "step": 44300 }, { "epoch": 1.6440241791404002, "grad_norm": 2.281843212752168, "learning_rate": 9.874202636898557e-06, "loss": 0.7923, "step": 44400 }, { "epoch": 1.6477269571496014, "grad_norm": 2.7756614740728756, "learning_rate": 9.872758050908525e-06, "loss": 0.8133, "step": 44500 }, { "epoch": 1.651429735158803, "grad_norm": 2.0566706027251933, "learning_rate": 9.871305324685698e-06, "loss": 0.7771, "step": 44600 }, { "epoch": 1.6551325131680041, "grad_norm": 2.7157875508307203, "learning_rate": 9.869844460656946e-06, "loss": 0.7887, "step": 44700 }, { "epoch": 1.6588352911772057, "grad_norm": 2.3909529963011225, "learning_rate": 9.868375461262729e-06, "loss": 0.786, "step": 44800 }, { "epoch": 1.6625380691864071, "grad_norm": 3.0348798043450107, "learning_rate": 9.866898328957097e-06, "loss": 0.7658, "step": 44900 }, { "epoch": 1.6662408471956085, "grad_norm": 2.6335015906277564, "learning_rate": 9.865413066207686e-06, "loss": 0.7995, "step": 45000 }, { "epoch": 1.6699436252048099, "grad_norm": 2.385629891283996, "learning_rate": 9.863919675495718e-06, "loss": 0.7915, "step": 45100 }, { "epoch": 1.6736464032140113, "grad_norm": 2.9226049040665196, "learning_rate": 9.862418159315994e-06, "loss": 0.7846, "step": 45200 }, { "epoch": 1.6773491812232129, "grad_norm": 2.0898569179597546, "learning_rate": 9.860908520176881e-06, "loss": 0.7798, "step": 45300 }, { "epoch": 1.681051959232414, "grad_norm": 2.486543038672127, "learning_rate": 9.859390760600323e-06, "loss": 0.788, "step": 45400 }, { "epoch": 1.6847547372416156, "grad_norm": 2.150826890053404, "learning_rate": 9.857864883121829e-06, "loss": 0.8, "step": 45500 }, { "epoch": 1.6884575152508168, "grad_norm": 2.0064655253486494, "learning_rate": 9.856330890290467e-06, "loss": 0.7893, "step": 45600 }, { "epoch": 1.6921602932600184, "grad_norm": 1.974144415250403, "learning_rate": 9.854788784668862e-06, "loss": 0.8071, "step": 45700 }, { "epoch": 1.6958630712692198, "grad_norm": 2.8759598688034553, "learning_rate": 9.853238568833198e-06, "loss": 0.795, "step": 45800 }, { "epoch": 1.6995658492784211, "grad_norm": 2.4899159117702325, "learning_rate": 9.851680245373201e-06, "loss": 0.7933, "step": 45900 }, { "epoch": 1.7032686272876225, "grad_norm": 2.4900169396878535, "learning_rate": 9.85011381689214e-06, "loss": 0.7734, "step": 46000 }, { "epoch": 1.706971405296824, "grad_norm": 2.422580198248974, "learning_rate": 9.848539286006832e-06, "loss": 0.7951, "step": 46100 }, { "epoch": 1.7106741833060255, "grad_norm": 2.8907724621020634, "learning_rate": 9.846956655347621e-06, "loss": 0.7905, "step": 46200 }, { "epoch": 1.7143769613152267, "grad_norm": 2.013474316995975, "learning_rate": 9.845365927558387e-06, "loss": 0.8006, "step": 46300 }, { "epoch": 1.7180797393244283, "grad_norm": 2.1840681748691444, "learning_rate": 9.843767105296536e-06, "loss": 0.7635, "step": 46400 }, { "epoch": 1.7217825173336294, "grad_norm": 2.5617018354083934, "learning_rate": 9.842160191232996e-06, "loss": 0.7824, "step": 46500 }, { "epoch": 1.725485295342831, "grad_norm": 2.308519604503349, "learning_rate": 9.840545188052214e-06, "loss": 0.774, "step": 46600 }, { "epoch": 1.7291880733520324, "grad_norm": 2.2499525442223853, "learning_rate": 9.838922098452146e-06, "loss": 0.756, "step": 46700 }, { "epoch": 1.7328908513612338, "grad_norm": 2.3820412376888322, "learning_rate": 9.83729092514426e-06, "loss": 0.789, "step": 46800 }, { "epoch": 1.7365936293704352, "grad_norm": 2.5725331335845127, "learning_rate": 9.835651670853532e-06, "loss": 0.7854, "step": 46900 }, { "epoch": 1.7402964073796365, "grad_norm": 2.309343999229651, "learning_rate": 9.83400433831843e-06, "loss": 0.775, "step": 47000 }, { "epoch": 1.7439991853888381, "grad_norm": 2.5850280201791436, "learning_rate": 9.832348930290925e-06, "loss": 0.7802, "step": 47100 }, { "epoch": 1.7477019633980393, "grad_norm": 2.2679853588645105, "learning_rate": 9.830685449536472e-06, "loss": 0.7678, "step": 47200 }, { "epoch": 1.751404741407241, "grad_norm": 2.5086985240224635, "learning_rate": 9.829013898834014e-06, "loss": 0.7577, "step": 47300 }, { "epoch": 1.755107519416442, "grad_norm": 1.9266513380957035, "learning_rate": 9.827334280975978e-06, "loss": 0.7758, "step": 47400 }, { "epoch": 1.7588102974256437, "grad_norm": 2.2928770756948547, "learning_rate": 9.825646598768267e-06, "loss": 0.7637, "step": 47500 }, { "epoch": 1.762513075434845, "grad_norm": 2.5105321008988146, "learning_rate": 9.82395085503025e-06, "loss": 0.7832, "step": 47600 }, { "epoch": 1.7662158534440464, "grad_norm": 2.2393983265475867, "learning_rate": 9.822247052594775e-06, "loss": 0.7957, "step": 47700 }, { "epoch": 1.7699186314532478, "grad_norm": 2.2356517021326447, "learning_rate": 9.82053519430814e-06, "loss": 0.7786, "step": 47800 }, { "epoch": 1.7736214094624492, "grad_norm": 2.1349870329764467, "learning_rate": 9.818815283030107e-06, "loss": 0.7639, "step": 47900 }, { "epoch": 1.7773241874716506, "grad_norm": 2.207362188864924, "learning_rate": 9.817087321633891e-06, "loss": 0.7774, "step": 48000 }, { "epoch": 1.781026965480852, "grad_norm": 2.357569522929328, "learning_rate": 9.815351313006155e-06, "loss": 0.7903, "step": 48100 }, { "epoch": 1.7847297434900535, "grad_norm": 2.5604354649057512, "learning_rate": 9.813607260047007e-06, "loss": 0.7861, "step": 48200 }, { "epoch": 1.7884325214992547, "grad_norm": 2.3900699599516964, "learning_rate": 9.811855165669985e-06, "loss": 0.7883, "step": 48300 }, { "epoch": 1.7921352995084563, "grad_norm": 2.2498623353917093, "learning_rate": 9.810095032802075e-06, "loss": 0.7749, "step": 48400 }, { "epoch": 1.7958380775176577, "grad_norm": 2.523396892959795, "learning_rate": 9.808326864383679e-06, "loss": 0.773, "step": 48500 }, { "epoch": 1.799540855526859, "grad_norm": 2.2014201396256214, "learning_rate": 9.806550663368628e-06, "loss": 0.7784, "step": 48600 }, { "epoch": 1.8032436335360604, "grad_norm": 2.137068120876505, "learning_rate": 9.804766432724172e-06, "loss": 0.781, "step": 48700 }, { "epoch": 1.8069464115452618, "grad_norm": 2.1941678037156036, "learning_rate": 9.802974175430975e-06, "loss": 0.7813, "step": 48800 }, { "epoch": 1.8106491895544632, "grad_norm": 2.3496769370735775, "learning_rate": 9.801173894483111e-06, "loss": 0.7758, "step": 48900 }, { "epoch": 1.8143519675636646, "grad_norm": 2.6951877515683917, "learning_rate": 9.799365592888054e-06, "loss": 0.7753, "step": 49000 }, { "epoch": 1.8180547455728662, "grad_norm": 2.2987294149497504, "learning_rate": 9.797549273666682e-06, "loss": 0.7979, "step": 49100 }, { "epoch": 1.8217575235820673, "grad_norm": 2.1550107736300883, "learning_rate": 9.795724939853265e-06, "loss": 0.7547, "step": 49200 }, { "epoch": 1.825460301591269, "grad_norm": 2.1082078725588724, "learning_rate": 9.793892594495457e-06, "loss": 0.7481, "step": 49300 }, { "epoch": 1.82916307960047, "grad_norm": 2.535213902917527, "learning_rate": 9.792052240654304e-06, "loss": 0.7568, "step": 49400 }, { "epoch": 1.8328658576096717, "grad_norm": 2.3198113159972595, "learning_rate": 9.790203881404228e-06, "loss": 0.7834, "step": 49500 }, { "epoch": 1.836568635618873, "grad_norm": 2.519238802449602, "learning_rate": 9.78834751983302e-06, "loss": 0.7699, "step": 49600 }, { "epoch": 1.8402714136280744, "grad_norm": 2.3333401238694798, "learning_rate": 9.786483159041842e-06, "loss": 0.7834, "step": 49700 }, { "epoch": 1.8439741916372758, "grad_norm": 2.4205774253956385, "learning_rate": 9.784610802145222e-06, "loss": 0.7863, "step": 49800 }, { "epoch": 1.8476769696464772, "grad_norm": 2.178068900898099, "learning_rate": 9.782730452271046e-06, "loss": 0.7674, "step": 49900 }, { "epoch": 1.8513797476556788, "grad_norm": 2.6080625282619714, "learning_rate": 9.780842112560548e-06, "loss": 0.7642, "step": 50000 }, { "epoch": 1.85508252566488, "grad_norm": 2.843984991990864, "learning_rate": 9.778945786168308e-06, "loss": 0.7655, "step": 50100 }, { "epoch": 1.8587853036740816, "grad_norm": 2.2308315520099424, "learning_rate": 9.777041476262259e-06, "loss": 0.7656, "step": 50200 }, { "epoch": 1.8624880816832827, "grad_norm": 2.400873208112685, "learning_rate": 9.775129186023661e-06, "loss": 0.7588, "step": 50300 }, { "epoch": 1.8661908596924843, "grad_norm": 2.9815623334199604, "learning_rate": 9.773208918647111e-06, "loss": 0.7722, "step": 50400 }, { "epoch": 1.8698936377016857, "grad_norm": 2.6488046885793373, "learning_rate": 9.771280677340528e-06, "loss": 0.7813, "step": 50500 }, { "epoch": 1.873596415710887, "grad_norm": 2.521885076282361, "learning_rate": 9.769344465325153e-06, "loss": 0.7846, "step": 50600 }, { "epoch": 1.8772991937200885, "grad_norm": 1.8398874480846792, "learning_rate": 9.767400285835546e-06, "loss": 0.7799, "step": 50700 }, { "epoch": 1.8810019717292898, "grad_norm": 2.6273527775975114, "learning_rate": 9.765448142119575e-06, "loss": 0.7463, "step": 50800 }, { "epoch": 1.8847047497384914, "grad_norm": 2.4232551189720626, "learning_rate": 9.763488037438412e-06, "loss": 0.7763, "step": 50900 }, { "epoch": 1.8884075277476926, "grad_norm": 2.182119510957546, "learning_rate": 9.761519975066524e-06, "loss": 0.76, "step": 51000 }, { "epoch": 1.8921103057568942, "grad_norm": 2.4800365305029106, "learning_rate": 9.759543958291683e-06, "loss": 0.7878, "step": 51100 }, { "epoch": 1.8958130837660954, "grad_norm": 2.2209159681563055, "learning_rate": 9.757559990414941e-06, "loss": 0.7706, "step": 51200 }, { "epoch": 1.899515861775297, "grad_norm": 2.2456207460433175, "learning_rate": 9.755568074750635e-06, "loss": 0.7533, "step": 51300 }, { "epoch": 1.9032186397844983, "grad_norm": 2.216946549826359, "learning_rate": 9.753568214626375e-06, "loss": 0.7651, "step": 51400 }, { "epoch": 1.9069214177936997, "grad_norm": 2.1353696650613556, "learning_rate": 9.751560413383051e-06, "loss": 0.7451, "step": 51500 }, { "epoch": 1.910624195802901, "grad_norm": 2.2188964222997227, "learning_rate": 9.749544674374814e-06, "loss": 0.771, "step": 51600 }, { "epoch": 1.9143269738121025, "grad_norm": 2.6602884956835373, "learning_rate": 9.747521000969074e-06, "loss": 0.7652, "step": 51700 }, { "epoch": 1.918029751821304, "grad_norm": 2.366026652497562, "learning_rate": 9.745489396546499e-06, "loss": 0.7778, "step": 51800 }, { "epoch": 1.9217325298305052, "grad_norm": 2.4178576890485166, "learning_rate": 9.743449864501006e-06, "loss": 0.7682, "step": 51900 }, { "epoch": 1.9254353078397068, "grad_norm": 2.461073225865995, "learning_rate": 9.741402408239753e-06, "loss": 0.7379, "step": 52000 }, { "epoch": 1.929138085848908, "grad_norm": 2.3169585710466443, "learning_rate": 9.739347031183142e-06, "loss": 0.74, "step": 52100 }, { "epoch": 1.9328408638581096, "grad_norm": 2.480606692215648, "learning_rate": 9.737283736764798e-06, "loss": 0.7811, "step": 52200 }, { "epoch": 1.936543641867311, "grad_norm": 2.33302742635216, "learning_rate": 9.73521252843158e-06, "loss": 0.7853, "step": 52300 }, { "epoch": 1.9402464198765124, "grad_norm": 2.178325980295709, "learning_rate": 9.733133409643565e-06, "loss": 0.7678, "step": 52400 }, { "epoch": 1.9439491978857137, "grad_norm": 2.6202132000217, "learning_rate": 9.731046383874044e-06, "loss": 0.7496, "step": 52500 }, { "epoch": 1.9476519758949151, "grad_norm": 3.155040247361292, "learning_rate": 9.728951454609517e-06, "loss": 0.7728, "step": 52600 }, { "epoch": 1.9513547539041167, "grad_norm": 2.115338674943332, "learning_rate": 9.726848625349691e-06, "loss": 0.7625, "step": 52700 }, { "epoch": 1.9550575319133179, "grad_norm": 2.203447982138841, "learning_rate": 9.724737899607466e-06, "loss": 0.7693, "step": 52800 }, { "epoch": 1.9587603099225195, "grad_norm": 2.314534740326119, "learning_rate": 9.722619280908934e-06, "loss": 0.7628, "step": 52900 }, { "epoch": 1.9624630879317206, "grad_norm": 2.1956336825914793, "learning_rate": 9.720492772793375e-06, "loss": 0.7636, "step": 53000 }, { "epoch": 1.9661658659409222, "grad_norm": 2.237936138713292, "learning_rate": 9.718358378813248e-06, "loss": 0.7559, "step": 53100 }, { "epoch": 1.9698686439501236, "grad_norm": 2.166260133162057, "learning_rate": 9.716216102534186e-06, "loss": 0.7619, "step": 53200 }, { "epoch": 1.973571421959325, "grad_norm": 2.4562700667901933, "learning_rate": 9.714065947534987e-06, "loss": 0.7596, "step": 53300 }, { "epoch": 1.9772741999685264, "grad_norm": 2.0820216516365027, "learning_rate": 9.711907917407614e-06, "loss": 0.7526, "step": 53400 }, { "epoch": 1.9809769779777278, "grad_norm": 2.3755910874830657, "learning_rate": 9.709742015757187e-06, "loss": 0.7553, "step": 53500 }, { "epoch": 1.9846797559869291, "grad_norm": 2.369452864927645, "learning_rate": 9.707568246201972e-06, "loss": 0.753, "step": 53600 }, { "epoch": 1.9883825339961305, "grad_norm": 2.3410608706416762, "learning_rate": 9.70538661237338e-06, "loss": 0.7787, "step": 53700 }, { "epoch": 1.9920853120053321, "grad_norm": 2.087568589173381, "learning_rate": 9.70319711791596e-06, "loss": 0.7586, "step": 53800 }, { "epoch": 1.9957880900145333, "grad_norm": 2.6786991173682373, "learning_rate": 9.700999766487395e-06, "loss": 0.7465, "step": 53900 }, { "epoch": 1.9994908680237349, "grad_norm": 3.03880633662284, "learning_rate": 9.698794561758493e-06, "loss": 0.7403, "step": 54000 }, { "epoch": 2.003184389087913, "grad_norm": 2.5868756494206497, "learning_rate": 9.696581507413174e-06, "loss": 0.6992, "step": 54100 }, { "epoch": 2.0068871670971147, "grad_norm": 2.0423484305586994, "learning_rate": 9.694360607148484e-06, "loss": 0.6838, "step": 54200 }, { "epoch": 2.010589945106316, "grad_norm": 2.313387880714559, "learning_rate": 9.692131864674563e-06, "loss": 0.6912, "step": 54300 }, { "epoch": 2.0142927231155174, "grad_norm": 2.274235624804895, "learning_rate": 9.689895283714663e-06, "loss": 0.6854, "step": 54400 }, { "epoch": 2.017995501124719, "grad_norm": 2.119309140091523, "learning_rate": 9.687650868005124e-06, "loss": 0.6786, "step": 54500 }, { "epoch": 2.02169827913392, "grad_norm": 2.3759711399354413, "learning_rate": 9.685398621295377e-06, "loss": 0.6841, "step": 54600 }, { "epoch": 2.025401057143122, "grad_norm": 2.652183252752739, "learning_rate": 9.683138547347933e-06, "loss": 0.6938, "step": 54700 }, { "epoch": 2.029103835152323, "grad_norm": 2.497195487638967, "learning_rate": 9.68087064993838e-06, "loss": 0.6834, "step": 54800 }, { "epoch": 2.0328066131615246, "grad_norm": 2.0788434226359174, "learning_rate": 9.678594932855377e-06, "loss": 0.681, "step": 54900 }, { "epoch": 2.0365093911707257, "grad_norm": 2.557154847367296, "learning_rate": 9.676311399900644e-06, "loss": 0.6956, "step": 55000 }, { "epoch": 2.0402121691799273, "grad_norm": 2.5976913602453595, "learning_rate": 9.674020054888962e-06, "loss": 0.673, "step": 55100 }, { "epoch": 2.0439149471891285, "grad_norm": 2.1397702042236206, "learning_rate": 9.671720901648157e-06, "loss": 0.6939, "step": 55200 }, { "epoch": 2.04761772519833, "grad_norm": 2.384247384660651, "learning_rate": 9.669413944019099e-06, "loss": 0.6757, "step": 55300 }, { "epoch": 2.0513205032075312, "grad_norm": 2.1863152443770786, "learning_rate": 9.667099185855703e-06, "loss": 0.6968, "step": 55400 }, { "epoch": 2.055023281216733, "grad_norm": 4.405956537339234, "learning_rate": 9.664776631024908e-06, "loss": 0.683, "step": 55500 }, { "epoch": 2.0587260592259344, "grad_norm": 2.049043630631007, "learning_rate": 9.662446283406682e-06, "loss": 0.6914, "step": 55600 }, { "epoch": 2.0624288372351356, "grad_norm": 2.5664972711721625, "learning_rate": 9.660108146894007e-06, "loss": 0.6915, "step": 55700 }, { "epoch": 2.066131615244337, "grad_norm": 2.005386472768463, "learning_rate": 9.65776222539288e-06, "loss": 0.6598, "step": 55800 }, { "epoch": 2.0698343932535384, "grad_norm": 2.449106381543406, "learning_rate": 9.655408522822306e-06, "loss": 0.66, "step": 55900 }, { "epoch": 2.07353717126274, "grad_norm": 3.0381314839729177, "learning_rate": 9.653047043114281e-06, "loss": 0.6685, "step": 56000 }, { "epoch": 2.077239949271941, "grad_norm": 2.412440777034649, "learning_rate": 9.650677790213799e-06, "loss": 0.666, "step": 56100 }, { "epoch": 2.0809427272811427, "grad_norm": 2.3635812683682222, "learning_rate": 9.64830076807884e-06, "loss": 0.6719, "step": 56200 }, { "epoch": 2.084645505290344, "grad_norm": 2.337252826890419, "learning_rate": 9.64591598068036e-06, "loss": 0.6761, "step": 56300 }, { "epoch": 2.0883482832995455, "grad_norm": 2.1411084610979856, "learning_rate": 9.643523432002288e-06, "loss": 0.6699, "step": 56400 }, { "epoch": 2.092051061308747, "grad_norm": 2.396579312617091, "learning_rate": 9.64112312604152e-06, "loss": 0.6811, "step": 56500 }, { "epoch": 2.0957538393179482, "grad_norm": 1.991014001057909, "learning_rate": 9.638715066807908e-06, "loss": 0.6921, "step": 56600 }, { "epoch": 2.09945661732715, "grad_norm": 1.8170099574989464, "learning_rate": 9.636299258324263e-06, "loss": 0.6748, "step": 56700 }, { "epoch": 2.103159395336351, "grad_norm": 2.407538918588163, "learning_rate": 9.633875704626332e-06, "loss": 0.6556, "step": 56800 }, { "epoch": 2.1068621733455526, "grad_norm": 2.0009350084583186, "learning_rate": 9.63144440976281e-06, "loss": 0.6804, "step": 56900 }, { "epoch": 2.1105649513547537, "grad_norm": 2.318529373742503, "learning_rate": 9.629005377795318e-06, "loss": 0.6766, "step": 57000 }, { "epoch": 2.1142677293639554, "grad_norm": 2.901357640692631, "learning_rate": 9.626558612798404e-06, "loss": 0.6794, "step": 57100 }, { "epoch": 2.1179705073731565, "grad_norm": 2.154557247934532, "learning_rate": 9.624104118859535e-06, "loss": 0.6691, "step": 57200 }, { "epoch": 2.121673285382358, "grad_norm": 2.3903883257512577, "learning_rate": 9.62164190007909e-06, "loss": 0.6545, "step": 57300 }, { "epoch": 2.1253760633915597, "grad_norm": 2.1742676469308093, "learning_rate": 9.619171960570353e-06, "loss": 0.6894, "step": 57400 }, { "epoch": 2.129078841400761, "grad_norm": 2.654747582200517, "learning_rate": 9.616694304459504e-06, "loss": 0.6784, "step": 57500 }, { "epoch": 2.1327816194099625, "grad_norm": 2.402727194182496, "learning_rate": 9.614208935885615e-06, "loss": 0.6724, "step": 57600 }, { "epoch": 2.1364843974191636, "grad_norm": 2.2360152593662743, "learning_rate": 9.611715859000643e-06, "loss": 0.6622, "step": 57700 }, { "epoch": 2.1401871754283652, "grad_norm": 2.591011720239606, "learning_rate": 9.609215077969422e-06, "loss": 0.6981, "step": 57800 }, { "epoch": 2.1438899534375664, "grad_norm": 2.059137036300649, "learning_rate": 9.606706596969655e-06, "loss": 0.6665, "step": 57900 }, { "epoch": 2.147592731446768, "grad_norm": 2.5471008394463768, "learning_rate": 9.604190420191908e-06, "loss": 0.6725, "step": 58000 }, { "epoch": 2.151295509455969, "grad_norm": 2.3973486591141504, "learning_rate": 9.601666551839606e-06, "loss": 0.6855, "step": 58100 }, { "epoch": 2.1549982874651707, "grad_norm": 2.6375773664347286, "learning_rate": 9.599134996129022e-06, "loss": 0.6826, "step": 58200 }, { "epoch": 2.1587010654743723, "grad_norm": 2.332457102848655, "learning_rate": 9.596595757289268e-06, "loss": 0.6814, "step": 58300 }, { "epoch": 2.1624038434835735, "grad_norm": 2.4870013874361745, "learning_rate": 9.594048839562298e-06, "loss": 0.6792, "step": 58400 }, { "epoch": 2.166106621492775, "grad_norm": 2.3255182488507034, "learning_rate": 9.591494247202886e-06, "loss": 0.6954, "step": 58500 }, { "epoch": 2.1698093995019763, "grad_norm": 2.83057658696704, "learning_rate": 9.588931984478633e-06, "loss": 0.6914, "step": 58600 }, { "epoch": 2.173512177511178, "grad_norm": 2.354736029056531, "learning_rate": 9.58636205566995e-06, "loss": 0.6705, "step": 58700 }, { "epoch": 2.177214955520379, "grad_norm": 2.413081651067487, "learning_rate": 9.583784465070056e-06, "loss": 0.692, "step": 58800 }, { "epoch": 2.1809177335295806, "grad_norm": 1.9994020495850402, "learning_rate": 9.581199216984974e-06, "loss": 0.6789, "step": 58900 }, { "epoch": 2.184620511538782, "grad_norm": 2.336676864459619, "learning_rate": 9.57860631573351e-06, "loss": 0.6746, "step": 59000 }, { "epoch": 2.1883232895479834, "grad_norm": 1.9765951823994232, "learning_rate": 9.576005765647262e-06, "loss": 0.6841, "step": 59100 }, { "epoch": 2.192026067557185, "grad_norm": 2.6704707200624567, "learning_rate": 9.573397571070606e-06, "loss": 0.6606, "step": 59200 }, { "epoch": 2.195728845566386, "grad_norm": 2.7273193459509057, "learning_rate": 9.570781736360682e-06, "loss": 0.694, "step": 59300 }, { "epoch": 2.1994316235755877, "grad_norm": 2.4075381061640475, "learning_rate": 9.568158265887402e-06, "loss": 0.7058, "step": 59400 }, { "epoch": 2.203134401584789, "grad_norm": 2.7561200770361283, "learning_rate": 9.565527164033428e-06, "loss": 0.6635, "step": 59500 }, { "epoch": 2.2068371795939905, "grad_norm": 1.9296516325452246, "learning_rate": 9.562888435194171e-06, "loss": 0.6944, "step": 59600 }, { "epoch": 2.2105399576031917, "grad_norm": 2.3953208065104445, "learning_rate": 9.56024208377779e-06, "loss": 0.6511, "step": 59700 }, { "epoch": 2.2142427356123933, "grad_norm": 2.3935986970014507, "learning_rate": 9.557588114205166e-06, "loss": 0.684, "step": 59800 }, { "epoch": 2.2179455136215944, "grad_norm": 2.474410928209759, "learning_rate": 9.554926530909918e-06, "loss": 0.6944, "step": 59900 }, { "epoch": 2.221648291630796, "grad_norm": 2.300308607407991, "learning_rate": 9.552257338338377e-06, "loss": 0.6619, "step": 60000 }, { "epoch": 2.2253510696399976, "grad_norm": 2.2318245286663347, "learning_rate": 9.549580540949592e-06, "loss": 0.6737, "step": 60100 }, { "epoch": 2.229053847649199, "grad_norm": 3.050417412156981, "learning_rate": 9.546896143215307e-06, "loss": 0.6588, "step": 60200 }, { "epoch": 2.2327566256584004, "grad_norm": 2.096536346012167, "learning_rate": 9.544204149619973e-06, "loss": 0.6529, "step": 60300 }, { "epoch": 2.2364594036676015, "grad_norm": 2.81530679664561, "learning_rate": 9.541504564660726e-06, "loss": 0.6691, "step": 60400 }, { "epoch": 2.240162181676803, "grad_norm": 2.5250157137123606, "learning_rate": 9.53879739284738e-06, "loss": 0.6956, "step": 60500 }, { "epoch": 2.2438649596860043, "grad_norm": 2.1361057888741057, "learning_rate": 9.536082638702428e-06, "loss": 0.6622, "step": 60600 }, { "epoch": 2.247567737695206, "grad_norm": 2.418961423369967, "learning_rate": 9.533360306761032e-06, "loss": 0.6718, "step": 60700 }, { "epoch": 2.251270515704407, "grad_norm": 2.0162128525744984, "learning_rate": 9.530630401571006e-06, "loss": 0.6757, "step": 60800 }, { "epoch": 2.2549732937136087, "grad_norm": 1.7747207441276736, "learning_rate": 9.527892927692819e-06, "loss": 0.6895, "step": 60900 }, { "epoch": 2.2586760717228103, "grad_norm": 2.3868183988130007, "learning_rate": 9.525147889699587e-06, "loss": 0.6982, "step": 61000 }, { "epoch": 2.2623788497320114, "grad_norm": 2.370342809471196, "learning_rate": 9.52239529217706e-06, "loss": 0.6802, "step": 61100 }, { "epoch": 2.266081627741213, "grad_norm": 2.001291795407346, "learning_rate": 9.519635139723613e-06, "loss": 0.6836, "step": 61200 }, { "epoch": 2.269784405750414, "grad_norm": 2.4678585122972367, "learning_rate": 9.516867436950247e-06, "loss": 0.6709, "step": 61300 }, { "epoch": 2.273487183759616, "grad_norm": 2.6005957215451754, "learning_rate": 9.514092188480574e-06, "loss": 0.6818, "step": 61400 }, { "epoch": 2.277189961768817, "grad_norm": 2.920386936383934, "learning_rate": 9.511309398950815e-06, "loss": 0.7052, "step": 61500 }, { "epoch": 2.2808927397780185, "grad_norm": 2.5403436437373865, "learning_rate": 9.50851907300978e-06, "loss": 0.6955, "step": 61600 }, { "epoch": 2.2845955177872197, "grad_norm": 2.2492931914844347, "learning_rate": 9.505721215318879e-06, "loss": 0.6785, "step": 61700 }, { "epoch": 2.2882982957964213, "grad_norm": 2.6880292587251047, "learning_rate": 9.5029158305521e-06, "loss": 0.68, "step": 61800 }, { "epoch": 2.292001073805623, "grad_norm": 2.402997197868183, "learning_rate": 9.500102923396004e-06, "loss": 0.6927, "step": 61900 }, { "epoch": 2.295703851814824, "grad_norm": 1.9864142200066779, "learning_rate": 9.49728249854972e-06, "loss": 0.6919, "step": 62000 }, { "epoch": 2.2994066298240257, "grad_norm": 2.8456743709517163, "learning_rate": 9.494454560724938e-06, "loss": 0.6762, "step": 62100 }, { "epoch": 2.303109407833227, "grad_norm": 2.7748514053291484, "learning_rate": 9.491619114645892e-06, "loss": 0.6777, "step": 62200 }, { "epoch": 2.3068121858424284, "grad_norm": 2.559623553355795, "learning_rate": 9.48877616504937e-06, "loss": 0.6885, "step": 62300 }, { "epoch": 2.3105149638516296, "grad_norm": 2.1469384529226008, "learning_rate": 9.485925716684684e-06, "loss": 0.7014, "step": 62400 }, { "epoch": 2.314217741860831, "grad_norm": 2.6264777887477444, "learning_rate": 9.48306777431368e-06, "loss": 0.6778, "step": 62500 }, { "epoch": 2.3179205198700323, "grad_norm": 2.621989964486446, "learning_rate": 9.48020234271072e-06, "loss": 0.6805, "step": 62600 }, { "epoch": 2.321623297879234, "grad_norm": 2.272202713631239, "learning_rate": 9.47732942666268e-06, "loss": 0.6867, "step": 62700 }, { "epoch": 2.3253260758884355, "grad_norm": 1.9762020868593124, "learning_rate": 9.474449030968937e-06, "loss": 0.6854, "step": 62800 }, { "epoch": 2.3290288538976367, "grad_norm": 2.1226904406736984, "learning_rate": 9.471561160441363e-06, "loss": 0.6688, "step": 62900 }, { "epoch": 2.3327316319068383, "grad_norm": 2.316689916305218, "learning_rate": 9.468665819904317e-06, "loss": 0.6951, "step": 63000 }, { "epoch": 2.3364344099160395, "grad_norm": 2.316136479919069, "learning_rate": 9.465763014194638e-06, "loss": 0.6808, "step": 63100 }, { "epoch": 2.340137187925241, "grad_norm": 2.307220790631874, "learning_rate": 9.46285274816164e-06, "loss": 0.6869, "step": 63200 }, { "epoch": 2.343839965934442, "grad_norm": 2.213269812970463, "learning_rate": 9.459935026667089e-06, "loss": 0.6578, "step": 63300 }, { "epoch": 2.347542743943644, "grad_norm": 2.344279831358738, "learning_rate": 9.457009854585219e-06, "loss": 0.6971, "step": 63400 }, { "epoch": 2.351245521952845, "grad_norm": 2.0096880506357446, "learning_rate": 9.454077236802702e-06, "loss": 0.6828, "step": 63500 }, { "epoch": 2.3549482999620466, "grad_norm": 2.2548311729082253, "learning_rate": 9.45113717821865e-06, "loss": 0.6727, "step": 63600 }, { "epoch": 2.358651077971248, "grad_norm": 2.4341693614642996, "learning_rate": 9.448189683744608e-06, "loss": 0.6809, "step": 63700 }, { "epoch": 2.3623538559804493, "grad_norm": 2.419848393996797, "learning_rate": 9.445234758304537e-06, "loss": 0.6928, "step": 63800 }, { "epoch": 2.3660566339896505, "grad_norm": 2.7840357590734994, "learning_rate": 9.442272406834823e-06, "loss": 0.6698, "step": 63900 }, { "epoch": 2.369759411998852, "grad_norm": 2.3936132840359665, "learning_rate": 9.439302634284244e-06, "loss": 0.6741, "step": 64000 }, { "epoch": 2.3734621900080537, "grad_norm": 2.2628698955348923, "learning_rate": 9.436325445613988e-06, "loss": 0.6982, "step": 64100 }, { "epoch": 2.377164968017255, "grad_norm": 2.5913137677554645, "learning_rate": 9.43334084579762e-06, "loss": 0.6843, "step": 64200 }, { "epoch": 2.3808677460264565, "grad_norm": 2.5306835812838027, "learning_rate": 9.430348839821095e-06, "loss": 0.6931, "step": 64300 }, { "epoch": 2.3845705240356576, "grad_norm": 1.824497906863608, "learning_rate": 9.42734943268274e-06, "loss": 0.6784, "step": 64400 }, { "epoch": 2.388273302044859, "grad_norm": 2.031648470909946, "learning_rate": 9.424342629393238e-06, "loss": 0.6845, "step": 64500 }, { "epoch": 2.3919760800540604, "grad_norm": 2.5806566539882274, "learning_rate": 9.421328434975636e-06, "loss": 0.6893, "step": 64600 }, { "epoch": 2.395678858063262, "grad_norm": 2.1526340438291807, "learning_rate": 9.418306854465327e-06, "loss": 0.6973, "step": 64700 }, { "epoch": 2.399381636072463, "grad_norm": 2.4285535244597702, "learning_rate": 9.41527789291004e-06, "loss": 0.7019, "step": 64800 }, { "epoch": 2.4030844140816647, "grad_norm": 2.621188381463244, "learning_rate": 9.412241555369834e-06, "loss": 0.6653, "step": 64900 }, { "epoch": 2.4067871920908663, "grad_norm": 2.224098798333827, "learning_rate": 9.409197846917093e-06, "loss": 0.6725, "step": 65000 }, { "epoch": 2.4104899701000675, "grad_norm": 1.997533351532834, "learning_rate": 9.406146772636516e-06, "loss": 0.6812, "step": 65100 }, { "epoch": 2.414192748109269, "grad_norm": 2.19098514780732, "learning_rate": 9.403088337625099e-06, "loss": 0.6677, "step": 65200 }, { "epoch": 2.4178955261184703, "grad_norm": 2.1908878890803605, "learning_rate": 9.400022546992148e-06, "loss": 0.6813, "step": 65300 }, { "epoch": 2.421598304127672, "grad_norm": 2.324540384353367, "learning_rate": 9.396949405859239e-06, "loss": 0.6579, "step": 65400 }, { "epoch": 2.425301082136873, "grad_norm": 2.5054016122271374, "learning_rate": 9.393868919360244e-06, "loss": 0.6744, "step": 65500 }, { "epoch": 2.4290038601460746, "grad_norm": 2.582887067658994, "learning_rate": 9.390781092641301e-06, "loss": 0.6913, "step": 65600 }, { "epoch": 2.4327066381552758, "grad_norm": 2.2584713627681428, "learning_rate": 9.387685930860804e-06, "loss": 0.6645, "step": 65700 }, { "epoch": 2.4364094161644774, "grad_norm": 2.202586980967711, "learning_rate": 9.384583439189406e-06, "loss": 0.6599, "step": 65800 }, { "epoch": 2.440112194173679, "grad_norm": 2.0537705242407256, "learning_rate": 9.381473622810005e-06, "loss": 0.6524, "step": 65900 }, { "epoch": 2.44381497218288, "grad_norm": 2.446679586314843, "learning_rate": 9.378356486917736e-06, "loss": 0.6586, "step": 66000 }, { "epoch": 2.4475177501920817, "grad_norm": 2.3254324060908886, "learning_rate": 9.37523203671996e-06, "loss": 0.6716, "step": 66100 }, { "epoch": 2.451220528201283, "grad_norm": 2.402871716965202, "learning_rate": 9.372100277436253e-06, "loss": 0.6771, "step": 66200 }, { "epoch": 2.4549233062104845, "grad_norm": 2.4014908865791402, "learning_rate": 9.368961214298414e-06, "loss": 0.6892, "step": 66300 }, { "epoch": 2.4586260842196856, "grad_norm": 2.309859916718413, "learning_rate": 9.365814852550426e-06, "loss": 0.6725, "step": 66400 }, { "epoch": 2.4623288622288872, "grad_norm": 2.657756967242288, "learning_rate": 9.36266119744848e-06, "loss": 0.6835, "step": 66500 }, { "epoch": 2.4660316402380884, "grad_norm": 2.3060608989482327, "learning_rate": 9.35950025426094e-06, "loss": 0.6694, "step": 66600 }, { "epoch": 2.46973441824729, "grad_norm": 1.9200073351424498, "learning_rate": 9.356332028268356e-06, "loss": 0.6725, "step": 66700 }, { "epoch": 2.4734371962564916, "grad_norm": 1.7930879502348702, "learning_rate": 9.353156524763433e-06, "loss": 0.6674, "step": 66800 }, { "epoch": 2.4771399742656928, "grad_norm": 2.312137593139913, "learning_rate": 9.349973749051042e-06, "loss": 0.665, "step": 66900 }, { "epoch": 2.4808427522748944, "grad_norm": 2.7119648286693536, "learning_rate": 9.346783706448199e-06, "loss": 0.6925, "step": 67000 }, { "epoch": 2.4845455302840955, "grad_norm": 2.356555621714717, "learning_rate": 9.343586402284061e-06, "loss": 0.6774, "step": 67100 }, { "epoch": 2.488248308293297, "grad_norm": 3.4467386366257196, "learning_rate": 9.340381841899913e-06, "loss": 0.6907, "step": 67200 }, { "epoch": 2.4919510863024983, "grad_norm": 2.5874996764431, "learning_rate": 9.337170030649166e-06, "loss": 0.6808, "step": 67300 }, { "epoch": 2.4956538643117, "grad_norm": 2.4262991872836093, "learning_rate": 9.33395097389734e-06, "loss": 0.6714, "step": 67400 }, { "epoch": 2.499356642320901, "grad_norm": 2.2026220733741737, "learning_rate": 9.330724677022063e-06, "loss": 0.6798, "step": 67500 }, { "epoch": 2.5030594203301026, "grad_norm": 2.1788581179144395, "learning_rate": 9.327491145413057e-06, "loss": 0.6811, "step": 67600 }, { "epoch": 2.5067621983393042, "grad_norm": 2.275441203213566, "learning_rate": 9.324250384472127e-06, "loss": 0.6627, "step": 67700 }, { "epoch": 2.5104649763485054, "grad_norm": 2.283344949810879, "learning_rate": 9.32100239961316e-06, "loss": 0.6642, "step": 67800 }, { "epoch": 2.514167754357707, "grad_norm": 2.4267848884723167, "learning_rate": 9.317747196262105e-06, "loss": 0.6787, "step": 67900 }, { "epoch": 2.517870532366908, "grad_norm": 2.5953019278693965, "learning_rate": 9.314484779856977e-06, "loss": 0.6737, "step": 68000 }, { "epoch": 2.5215733103761098, "grad_norm": 2.036941282735882, "learning_rate": 9.311215155847834e-06, "loss": 0.6589, "step": 68100 }, { "epoch": 2.525276088385311, "grad_norm": 2.597845885761239, "learning_rate": 9.30793832969678e-06, "loss": 0.6717, "step": 68200 }, { "epoch": 2.5289788663945125, "grad_norm": 2.4622763848737774, "learning_rate": 9.304654306877946e-06, "loss": 0.6897, "step": 68300 }, { "epoch": 2.5326816444037137, "grad_norm": 2.2606318900396047, "learning_rate": 9.30136309287749e-06, "loss": 0.6811, "step": 68400 }, { "epoch": 2.5363844224129153, "grad_norm": 2.4860591476196423, "learning_rate": 9.298064693193581e-06, "loss": 0.6776, "step": 68500 }, { "epoch": 2.540087200422117, "grad_norm": 2.4829377853240837, "learning_rate": 9.29475911333639e-06, "loss": 0.7002, "step": 68600 }, { "epoch": 2.543789978431318, "grad_norm": 2.367231988606884, "learning_rate": 9.291446358828091e-06, "loss": 0.6675, "step": 68700 }, { "epoch": 2.5474927564405196, "grad_norm": 2.4582997910649484, "learning_rate": 9.288126435202831e-06, "loss": 0.6656, "step": 68800 }, { "epoch": 2.551195534449721, "grad_norm": 2.0143015209204185, "learning_rate": 9.284799348006743e-06, "loss": 0.6811, "step": 68900 }, { "epoch": 2.5548983124589224, "grad_norm": 2.9078889050531473, "learning_rate": 9.281465102797926e-06, "loss": 0.677, "step": 69000 }, { "epoch": 2.5586010904681236, "grad_norm": 2.2408419976033693, "learning_rate": 9.278123705146434e-06, "loss": 0.6884, "step": 69100 }, { "epoch": 2.562303868477325, "grad_norm": 2.626201962148744, "learning_rate": 9.27477516063427e-06, "loss": 0.6612, "step": 69200 }, { "epoch": 2.5660066464865263, "grad_norm": 2.1552325508134134, "learning_rate": 9.271419474855377e-06, "loss": 0.666, "step": 69300 }, { "epoch": 2.569709424495728, "grad_norm": 2.3227195866166768, "learning_rate": 9.268056653415632e-06, "loss": 0.6652, "step": 69400 }, { "epoch": 2.5734122025049295, "grad_norm": 2.753551170952296, "learning_rate": 9.264686701932825e-06, "loss": 0.6791, "step": 69500 }, { "epoch": 2.5771149805141307, "grad_norm": 2.2976640888247415, "learning_rate": 9.261309626036661e-06, "loss": 0.6705, "step": 69600 }, { "epoch": 2.5808177585233323, "grad_norm": 2.339779663329093, "learning_rate": 9.257925431368749e-06, "loss": 0.6669, "step": 69700 }, { "epoch": 2.5845205365325334, "grad_norm": 2.1019636120259695, "learning_rate": 9.254534123582585e-06, "loss": 0.6734, "step": 69800 }, { "epoch": 2.588223314541735, "grad_norm": 2.2977009075813744, "learning_rate": 9.251135708343555e-06, "loss": 0.6724, "step": 69900 }, { "epoch": 2.591926092550936, "grad_norm": 2.1726587191847386, "learning_rate": 9.247730191328908e-06, "loss": 0.686, "step": 70000 }, { "epoch": 2.595628870560138, "grad_norm": 2.2059169127907907, "learning_rate": 9.244317578227769e-06, "loss": 0.6829, "step": 70100 }, { "epoch": 2.599331648569339, "grad_norm": 2.2693764246927843, "learning_rate": 9.240897874741108e-06, "loss": 0.6706, "step": 70200 }, { "epoch": 2.6030344265785406, "grad_norm": 2.3773955458790192, "learning_rate": 9.237471086581744e-06, "loss": 0.6403, "step": 70300 }, { "epoch": 2.606737204587742, "grad_norm": 2.376894341944025, "learning_rate": 9.234037219474332e-06, "loss": 0.6556, "step": 70400 }, { "epoch": 2.6104399825969433, "grad_norm": 2.2164027402226756, "learning_rate": 9.230596279155353e-06, "loss": 0.6677, "step": 70500 }, { "epoch": 2.614142760606145, "grad_norm": 2.6004295191699596, "learning_rate": 9.227148271373102e-06, "loss": 0.6656, "step": 70600 }, { "epoch": 2.617845538615346, "grad_norm": 2.2941992876843145, "learning_rate": 9.223693201887677e-06, "loss": 0.671, "step": 70700 }, { "epoch": 2.6215483166245477, "grad_norm": 2.423457996166192, "learning_rate": 9.220231076470985e-06, "loss": 0.671, "step": 70800 }, { "epoch": 2.625251094633749, "grad_norm": 2.028224896189644, "learning_rate": 9.216761900906707e-06, "loss": 0.6633, "step": 70900 }, { "epoch": 2.6289538726429504, "grad_norm": 2.8043596176994234, "learning_rate": 9.213285680990311e-06, "loss": 0.6733, "step": 71000 }, { "epoch": 2.6326566506521516, "grad_norm": 2.631240157715802, "learning_rate": 9.209802422529028e-06, "loss": 0.6694, "step": 71100 }, { "epoch": 2.636359428661353, "grad_norm": 3.27526490574497, "learning_rate": 9.206312131341848e-06, "loss": 0.6736, "step": 71200 }, { "epoch": 2.640062206670555, "grad_norm": 2.2969788418244734, "learning_rate": 9.202814813259514e-06, "loss": 0.6685, "step": 71300 }, { "epoch": 2.643764984679756, "grad_norm": 2.0448759395992693, "learning_rate": 9.199310474124501e-06, "loss": 0.6734, "step": 71400 }, { "epoch": 2.647467762688957, "grad_norm": 2.194888247981071, "learning_rate": 9.195799119791018e-06, "loss": 0.6853, "step": 71500 }, { "epoch": 2.6511705406981587, "grad_norm": 2.3157294382898037, "learning_rate": 9.19228075612499e-06, "loss": 0.6936, "step": 71600 }, { "epoch": 2.6548733187073603, "grad_norm": 2.2600268640470516, "learning_rate": 9.188755389004056e-06, "loss": 0.6482, "step": 71700 }, { "epoch": 2.6585760967165615, "grad_norm": 2.7867346539584026, "learning_rate": 9.18522302431755e-06, "loss": 0.6736, "step": 71800 }, { "epoch": 2.662278874725763, "grad_norm": 2.4244549380103284, "learning_rate": 9.181683667966497e-06, "loss": 0.6612, "step": 71900 }, { "epoch": 2.6659816527349642, "grad_norm": 2.467628082595294, "learning_rate": 9.178137325863606e-06, "loss": 0.662, "step": 72000 }, { "epoch": 2.669684430744166, "grad_norm": 1.991818914003808, "learning_rate": 9.17458400393325e-06, "loss": 0.6546, "step": 72100 }, { "epoch": 2.6733872087533674, "grad_norm": 2.6084822187687893, "learning_rate": 9.171023708111467e-06, "loss": 0.6707, "step": 72200 }, { "epoch": 2.6770899867625686, "grad_norm": 2.4759759139487674, "learning_rate": 9.16745644434594e-06, "loss": 0.6589, "step": 72300 }, { "epoch": 2.6807927647717698, "grad_norm": 2.417557023636743, "learning_rate": 9.163882218595998e-06, "loss": 0.6692, "step": 72400 }, { "epoch": 2.6844955427809714, "grad_norm": 2.4757717167657303, "learning_rate": 9.160301036832601e-06, "loss": 0.6824, "step": 72500 }, { "epoch": 2.688198320790173, "grad_norm": 1.7864811103086602, "learning_rate": 9.156712905038324e-06, "loss": 0.6549, "step": 72600 }, { "epoch": 2.691901098799374, "grad_norm": 1.931145360031176, "learning_rate": 9.153117829207353e-06, "loss": 0.6707, "step": 72700 }, { "epoch": 2.6956038768085757, "grad_norm": 2.6583751811214515, "learning_rate": 9.149515815345477e-06, "loss": 0.6746, "step": 72800 }, { "epoch": 2.699306654817777, "grad_norm": 2.3434065726826874, "learning_rate": 9.14590686947008e-06, "loss": 0.6746, "step": 72900 }, { "epoch": 2.7030094328269785, "grad_norm": 2.1951946075529003, "learning_rate": 9.142290997610114e-06, "loss": 0.672, "step": 73000 }, { "epoch": 2.70671221083618, "grad_norm": 2.1266717085417715, "learning_rate": 9.138668205806116e-06, "loss": 0.6596, "step": 73100 }, { "epoch": 2.7104149888453812, "grad_norm": 2.052871241822731, "learning_rate": 9.135038500110169e-06, "loss": 0.6562, "step": 73200 }, { "epoch": 2.7141177668545824, "grad_norm": 2.3890278232506144, "learning_rate": 9.131401886585916e-06, "loss": 0.6791, "step": 73300 }, { "epoch": 2.717820544863784, "grad_norm": 2.339795856765528, "learning_rate": 9.127758371308537e-06, "loss": 0.6769, "step": 73400 }, { "epoch": 2.7215233228729856, "grad_norm": 2.0980772669298946, "learning_rate": 9.124107960364738e-06, "loss": 0.687, "step": 73500 }, { "epoch": 2.7252261008821868, "grad_norm": 1.9520671316508236, "learning_rate": 9.120450659852754e-06, "loss": 0.6619, "step": 73600 }, { "epoch": 2.7289288788913884, "grad_norm": 1.9489571927723024, "learning_rate": 9.116786475882318e-06, "loss": 0.6643, "step": 73700 }, { "epoch": 2.7326316569005895, "grad_norm": 2.1143535490363963, "learning_rate": 9.11311541457467e-06, "loss": 0.6647, "step": 73800 }, { "epoch": 2.736334434909791, "grad_norm": 2.4454265529124415, "learning_rate": 9.109437482062538e-06, "loss": 0.6791, "step": 73900 }, { "epoch": 2.7400372129189927, "grad_norm": 2.12417553054465, "learning_rate": 9.105752684490125e-06, "loss": 0.6751, "step": 74000 }, { "epoch": 2.743739990928194, "grad_norm": 3.3698294360651286, "learning_rate": 9.102061028013108e-06, "loss": 0.6805, "step": 74100 }, { "epoch": 2.747442768937395, "grad_norm": 2.6079682276880694, "learning_rate": 9.098362518798615e-06, "loss": 0.6542, "step": 74200 }, { "epoch": 2.7511455469465966, "grad_norm": 2.9285278794017167, "learning_rate": 9.094657163025228e-06, "loss": 0.6798, "step": 74300 }, { "epoch": 2.7548483249557982, "grad_norm": 2.029512121868359, "learning_rate": 9.090944966882968e-06, "loss": 0.6716, "step": 74400 }, { "epoch": 2.7585511029649994, "grad_norm": 2.37703823122831, "learning_rate": 9.087225936573275e-06, "loss": 0.6664, "step": 74500 }, { "epoch": 2.762253880974201, "grad_norm": 2.290740875061313, "learning_rate": 9.083500078309013e-06, "loss": 0.7054, "step": 74600 }, { "epoch": 2.765956658983402, "grad_norm": 1.9826452203518832, "learning_rate": 9.079767398314452e-06, "loss": 0.6574, "step": 74700 }, { "epoch": 2.7696594369926038, "grad_norm": 2.510390912417119, "learning_rate": 9.076027902825252e-06, "loss": 0.6573, "step": 74800 }, { "epoch": 2.7733622150018054, "grad_norm": 3.253767602420802, "learning_rate": 9.072281598088467e-06, "loss": 0.6565, "step": 74900 }, { "epoch": 2.7770649930110065, "grad_norm": 3.1743643654172278, "learning_rate": 9.068528490362524e-06, "loss": 0.6636, "step": 75000 }, { "epoch": 2.7807677710202077, "grad_norm": 2.9292198577340463, "learning_rate": 9.064768585917207e-06, "loss": 0.6763, "step": 75100 }, { "epoch": 2.7844705490294093, "grad_norm": 2.3225242842709766, "learning_rate": 9.061001891033666e-06, "loss": 0.6696, "step": 75200 }, { "epoch": 2.788173327038611, "grad_norm": 2.272648856356267, "learning_rate": 9.057228412004386e-06, "loss": 0.6585, "step": 75300 }, { "epoch": 2.791876105047812, "grad_norm": 2.708064532509065, "learning_rate": 9.053448155133192e-06, "loss": 0.6674, "step": 75400 }, { "epoch": 2.7955788830570136, "grad_norm": 2.0878561841156706, "learning_rate": 9.049661126735223e-06, "loss": 0.6523, "step": 75500 }, { "epoch": 2.799281661066215, "grad_norm": 2.0218162021372637, "learning_rate": 9.045867333136939e-06, "loss": 0.667, "step": 75600 }, { "epoch": 2.8029844390754164, "grad_norm": 1.9615749815202044, "learning_rate": 9.042066780676101e-06, "loss": 0.6644, "step": 75700 }, { "epoch": 2.806687217084618, "grad_norm": 2.458125241194594, "learning_rate": 9.038259475701756e-06, "loss": 0.6592, "step": 75800 }, { "epoch": 2.810389995093819, "grad_norm": 2.5321957606480887, "learning_rate": 9.034445424574232e-06, "loss": 0.6542, "step": 75900 }, { "epoch": 2.8140927731030203, "grad_norm": 2.305578502814208, "learning_rate": 9.030624633665131e-06, "loss": 0.6626, "step": 76000 }, { "epoch": 2.817795551112222, "grad_norm": 2.304093777477429, "learning_rate": 9.026797109357313e-06, "loss": 0.6585, "step": 76100 }, { "epoch": 2.8214983291214235, "grad_norm": 1.9063487829056964, "learning_rate": 9.022962858044881e-06, "loss": 0.6634, "step": 76200 }, { "epoch": 2.8252011071306247, "grad_norm": 2.4605756536089998, "learning_rate": 9.019121886133185e-06, "loss": 0.659, "step": 76300 }, { "epoch": 2.8289038851398263, "grad_norm": 2.908256690477109, "learning_rate": 9.015274200038798e-06, "loss": 0.6873, "step": 76400 }, { "epoch": 2.8326066631490274, "grad_norm": 2.195376131615668, "learning_rate": 9.011419806189503e-06, "loss": 0.6786, "step": 76500 }, { "epoch": 2.836309441158229, "grad_norm": 2.4481520740229588, "learning_rate": 9.0075587110243e-06, "loss": 0.6586, "step": 76600 }, { "epoch": 2.8400122191674306, "grad_norm": 2.5436298766851024, "learning_rate": 9.003690920993378e-06, "loss": 0.6732, "step": 76700 }, { "epoch": 2.843714997176632, "grad_norm": 2.0233903955790664, "learning_rate": 8.999816442558112e-06, "loss": 0.6694, "step": 76800 }, { "epoch": 2.847417775185833, "grad_norm": 1.9592757597831238, "learning_rate": 8.995935282191044e-06, "loss": 0.642, "step": 76900 }, { "epoch": 2.8511205531950345, "grad_norm": 2.4999659621973676, "learning_rate": 8.992047446375887e-06, "loss": 0.6758, "step": 77000 }, { "epoch": 2.854823331204236, "grad_norm": 2.320920562047208, "learning_rate": 8.988152941607505e-06, "loss": 0.6686, "step": 77100 }, { "epoch": 2.8585261092134373, "grad_norm": 2.180371204577853, "learning_rate": 8.984251774391895e-06, "loss": 0.6572, "step": 77200 }, { "epoch": 2.862228887222639, "grad_norm": 2.548377630577026, "learning_rate": 8.980343951246193e-06, "loss": 0.6858, "step": 77300 }, { "epoch": 2.86593166523184, "grad_norm": 2.2916044435835023, "learning_rate": 8.976429478698651e-06, "loss": 0.6612, "step": 77400 }, { "epoch": 2.8696344432410417, "grad_norm": 2.137867387232337, "learning_rate": 8.972508363288627e-06, "loss": 0.656, "step": 77500 }, { "epoch": 2.8733372212502433, "grad_norm": 2.6319833480679713, "learning_rate": 8.968580611566578e-06, "loss": 0.6505, "step": 77600 }, { "epoch": 2.8770399992594444, "grad_norm": 2.1088025728984907, "learning_rate": 8.96464623009405e-06, "loss": 0.6667, "step": 77700 }, { "epoch": 2.8807427772686456, "grad_norm": 1.9521003147155882, "learning_rate": 8.960705225443657e-06, "loss": 0.6596, "step": 77800 }, { "epoch": 2.884445555277847, "grad_norm": 2.5972066347938294, "learning_rate": 8.956757604199085e-06, "loss": 0.6545, "step": 77900 }, { "epoch": 2.888148333287049, "grad_norm": 2.4786047868289964, "learning_rate": 8.952803372955073e-06, "loss": 0.6722, "step": 78000 }, { "epoch": 2.89185111129625, "grad_norm": 2.2514808731629112, "learning_rate": 8.948842538317395e-06, "loss": 0.6556, "step": 78100 }, { "epoch": 2.8955538893054515, "grad_norm": 2.365087481495297, "learning_rate": 8.944875106902864e-06, "loss": 0.6482, "step": 78200 }, { "epoch": 2.8992566673146527, "grad_norm": 2.452402390597274, "learning_rate": 8.94090108533931e-06, "loss": 0.6893, "step": 78300 }, { "epoch": 2.9029594453238543, "grad_norm": 2.1846111061646885, "learning_rate": 8.936920480265576e-06, "loss": 0.6565, "step": 78400 }, { "epoch": 2.9066622233330555, "grad_norm": 2.5440937876149907, "learning_rate": 8.932933298331496e-06, "loss": 0.6731, "step": 78500 }, { "epoch": 2.910365001342257, "grad_norm": 2.1791116047812125, "learning_rate": 8.928939546197897e-06, "loss": 0.6747, "step": 78600 }, { "epoch": 2.914067779351458, "grad_norm": 2.5381792077290934, "learning_rate": 8.92493923053658e-06, "loss": 0.6759, "step": 78700 }, { "epoch": 2.91777055736066, "grad_norm": 2.3236635792732137, "learning_rate": 8.920932358030309e-06, "loss": 0.6675, "step": 78800 }, { "epoch": 2.9214733353698614, "grad_norm": 1.9029464622582775, "learning_rate": 8.916918935372805e-06, "loss": 0.6634, "step": 78900 }, { "epoch": 2.9251761133790626, "grad_norm": 2.224078219093189, "learning_rate": 8.912898969268731e-06, "loss": 0.6546, "step": 79000 }, { "epoch": 2.928878891388264, "grad_norm": 2.9148804782966233, "learning_rate": 8.908872466433677e-06, "loss": 0.6549, "step": 79100 }, { "epoch": 2.9325816693974653, "grad_norm": 2.4717406257998773, "learning_rate": 8.904839433594158e-06, "loss": 0.6522, "step": 79200 }, { "epoch": 2.936284447406667, "grad_norm": 2.6821434461084896, "learning_rate": 8.900799877487595e-06, "loss": 0.669, "step": 79300 }, { "epoch": 2.939987225415868, "grad_norm": 2.5288488175630057, "learning_rate": 8.896753804862308e-06, "loss": 0.6675, "step": 79400 }, { "epoch": 2.9436900034250697, "grad_norm": 2.3118984656483748, "learning_rate": 8.892701222477503e-06, "loss": 0.6428, "step": 79500 }, { "epoch": 2.947392781434271, "grad_norm": 1.7707450134385863, "learning_rate": 8.888642137103258e-06, "loss": 0.6423, "step": 79600 }, { "epoch": 2.9510955594434725, "grad_norm": 2.7951973513737016, "learning_rate": 8.884576555520521e-06, "loss": 0.6666, "step": 79700 }, { "epoch": 2.954798337452674, "grad_norm": 1.9441758598215642, "learning_rate": 8.880504484521084e-06, "loss": 0.6911, "step": 79800 }, { "epoch": 2.958501115461875, "grad_norm": 2.311415822913053, "learning_rate": 8.876425930907587e-06, "loss": 0.69, "step": 79900 }, { "epoch": 2.962203893471077, "grad_norm": 1.962196622233137, "learning_rate": 8.872340901493496e-06, "loss": 0.6991, "step": 80000 }, { "epoch": 2.965906671480278, "grad_norm": 2.2188989804402635, "learning_rate": 8.868249403103098e-06, "loss": 0.6512, "step": 80100 }, { "epoch": 2.9696094494894796, "grad_norm": 2.0738562772495217, "learning_rate": 8.864151442571481e-06, "loss": 0.6673, "step": 80200 }, { "epoch": 2.9733122274986807, "grad_norm": 2.45682348863258, "learning_rate": 8.860047026744535e-06, "loss": 0.6488, "step": 80300 }, { "epoch": 2.9770150055078823, "grad_norm": 2.876210559752475, "learning_rate": 8.855936162478933e-06, "loss": 0.641, "step": 80400 }, { "epoch": 2.9807177835170835, "grad_norm": 2.113010077915775, "learning_rate": 8.851818856642116e-06, "loss": 0.6482, "step": 80500 }, { "epoch": 2.984420561526285, "grad_norm": 2.2593684990909297, "learning_rate": 8.84769511611229e-06, "loss": 0.6596, "step": 80600 }, { "epoch": 2.9881233395354867, "grad_norm": 2.213052710368658, "learning_rate": 8.843564947778408e-06, "loss": 0.6674, "step": 80700 }, { "epoch": 2.991826117544688, "grad_norm": 1.9824851077389378, "learning_rate": 8.839428358540165e-06, "loss": 0.6606, "step": 80800 }, { "epoch": 2.9955288955538895, "grad_norm": 1.8350785430581344, "learning_rate": 8.835285355307979e-06, "loss": 0.6625, "step": 80900 }, { "epoch": 2.9992316735630906, "grad_norm": 2.2196935514359537, "learning_rate": 8.831135945002982e-06, "loss": 0.6483, "step": 81000 }, { "epoch": 3.0029251946272693, "grad_norm": 2.180481700028787, "learning_rate": 8.826980134557012e-06, "loss": 0.5716, "step": 81100 }, { "epoch": 3.0066279726364704, "grad_norm": 2.3154128557009166, "learning_rate": 8.8228179309126e-06, "loss": 0.5747, "step": 81200 }, { "epoch": 3.010330750645672, "grad_norm": 2.5911631549986316, "learning_rate": 8.818649341022954e-06, "loss": 0.5708, "step": 81300 }, { "epoch": 3.014033528654873, "grad_norm": 2.172878251158029, "learning_rate": 8.81447437185195e-06, "loss": 0.586, "step": 81400 }, { "epoch": 3.0177363066640748, "grad_norm": 2.285708121202155, "learning_rate": 8.810293030374126e-06, "loss": 0.5279, "step": 81500 }, { "epoch": 3.021439084673276, "grad_norm": 1.8325067800290862, "learning_rate": 8.80610532357466e-06, "loss": 0.5743, "step": 81600 }, { "epoch": 3.0251418626824775, "grad_norm": 2.4724163520836617, "learning_rate": 8.801911258449367e-06, "loss": 0.5686, "step": 81700 }, { "epoch": 3.028844640691679, "grad_norm": 2.8204386478402657, "learning_rate": 8.797710842004683e-06, "loss": 0.5661, "step": 81800 }, { "epoch": 3.0325474187008803, "grad_norm": 2.1624621580723504, "learning_rate": 8.793504081257653e-06, "loss": 0.5609, "step": 81900 }, { "epoch": 3.036250196710082, "grad_norm": 1.9578194242090217, "learning_rate": 8.789290983235925e-06, "loss": 0.5557, "step": 82000 } ], "logging_steps": 100, "max_steps": 270070, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2119412823359488.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }