{ "best_metric": 6.374266624450684, "best_model_checkpoint": "learning_source_20260316/genome_sequence/bert-output/genome_sequence-small/checkpoint-46600", "epoch": 133.29387302467424, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.22179096201829776, "grad_norm": 0.9666945338249207, "learning_rate": 3e-06, "loss": 8.2198, "step": 100 }, { "epoch": 0.22179096201829776, "eval_loss": 7.910184383392334, "eval_runtime": 100.9607, "eval_samples_per_second": 99.048, "eval_steps_per_second": 99.048, "step": 100 }, { "epoch": 0.4435819240365955, "grad_norm": 0.7436413764953613, "learning_rate": 6e-06, "loss": 7.7448, "step": 200 }, { "epoch": 0.4435819240365955, "eval_loss": 7.522714138031006, "eval_runtime": 101.1239, "eval_samples_per_second": 98.889, "eval_steps_per_second": 98.889, "step": 200 }, { "epoch": 0.6653728860548933, "grad_norm": 0.5597550868988037, "learning_rate": 5.998999666555519e-06, "loss": 7.3644, "step": 300 }, { "epoch": 0.6653728860548933, "eval_loss": 7.118653297424316, "eval_runtime": 101.7208, "eval_samples_per_second": 98.308, "eval_steps_per_second": 98.308, "step": 300 }, { "epoch": 0.887163848073191, "grad_norm": 0.3977542519569397, "learning_rate": 5.997999333111037e-06, "loss": 7.039, "step": 400 }, { "epoch": 0.887163848073191, "eval_loss": 6.858039855957031, "eval_runtime": 103.108, "eval_samples_per_second": 96.986, "eval_steps_per_second": 96.986, "step": 400 }, { "epoch": 1.1089548100914888, "grad_norm": 0.31371042132377625, "learning_rate": 5.9969989996665554e-06, "loss": 6.8537, "step": 500 }, { "epoch": 1.1089548100914888, "eval_loss": 6.725042343139648, "eval_runtime": 100.7633, "eval_samples_per_second": 99.243, "eval_steps_per_second": 99.243, "step": 500 }, { "epoch": 1.3307457721097866, "grad_norm": 0.2910732924938202, "learning_rate": 5.995998666222074e-06, "loss": 6.749, "step": 600 }, { "epoch": 1.3307457721097866, "eval_loss": 6.648338317871094, "eval_runtime": 103.8281, "eval_samples_per_second": 96.313, "eval_steps_per_second": 96.313, "step": 600 }, { "epoch": 1.5525367341280842, "grad_norm": 0.38117602467536926, "learning_rate": 5.994998332777593e-06, "loss": 6.6809, "step": 700 }, { "epoch": 1.5525367341280842, "eval_loss": 6.598635196685791, "eval_runtime": 100.7294, "eval_samples_per_second": 99.276, "eval_steps_per_second": 99.276, "step": 700 }, { "epoch": 1.774327696146382, "grad_norm": 0.23082487285137177, "learning_rate": 5.9939979993331115e-06, "loss": 6.6363, "step": 800 }, { "epoch": 1.774327696146382, "eval_loss": 6.5613298416137695, "eval_runtime": 100.99, "eval_samples_per_second": 99.02, "eval_steps_per_second": 99.02, "step": 800 }, { "epoch": 1.9961186581646797, "grad_norm": 0.3537309169769287, "learning_rate": 5.992997665888629e-06, "loss": 6.6008, "step": 900 }, { "epoch": 1.9961186581646797, "eval_loss": 6.539489269256592, "eval_runtime": 103.6291, "eval_samples_per_second": 96.498, "eval_steps_per_second": 96.498, "step": 900 }, { "epoch": 2.2179096201829775, "grad_norm": 0.22692321240901947, "learning_rate": 5.991997332444148e-06, "loss": 6.5735, "step": 1000 }, { "epoch": 2.2179096201829775, "eval_loss": 6.521015644073486, "eval_runtime": 100.5379, "eval_samples_per_second": 99.465, "eval_steps_per_second": 99.465, "step": 1000 }, { "epoch": 2.4397005822012754, "grad_norm": 0.5465587973594666, "learning_rate": 5.990996998999667e-06, "loss": 6.5555, "step": 1100 }, { "epoch": 2.4397005822012754, "eval_loss": 6.505192279815674, "eval_runtime": 101.8008, "eval_samples_per_second": 98.231, "eval_steps_per_second": 98.231, "step": 1100 }, { "epoch": 2.6614915442195732, "grad_norm": 0.6720498204231262, "learning_rate": 5.989996665555185e-06, "loss": 6.5407, "step": 1200 }, { "epoch": 2.6614915442195732, "eval_loss": 6.497246265411377, "eval_runtime": 103.0853, "eval_samples_per_second": 97.007, "eval_steps_per_second": 97.007, "step": 1200 }, { "epoch": 2.8832825062378706, "grad_norm": 0.3426739275455475, "learning_rate": 5.988996332110703e-06, "loss": 6.529, "step": 1300 }, { "epoch": 2.8832825062378706, "eval_loss": 6.488556861877441, "eval_runtime": 100.6535, "eval_samples_per_second": 99.351, "eval_steps_per_second": 99.351, "step": 1300 }, { "epoch": 3.1050734682561685, "grad_norm": 0.2463805377483368, "learning_rate": 5.987995998666222e-06, "loss": 6.5196, "step": 1400 }, { "epoch": 3.1050734682561685, "eval_loss": 6.484075546264648, "eval_runtime": 104.3708, "eval_samples_per_second": 95.812, "eval_steps_per_second": 95.812, "step": 1400 }, { "epoch": 3.3268644302744663, "grad_norm": 0.1849370300769806, "learning_rate": 5.986995665221741e-06, "loss": 6.5099, "step": 1500 }, { "epoch": 3.3268644302744663, "eval_loss": 6.476208209991455, "eval_runtime": 100.8511, "eval_samples_per_second": 99.156, "eval_steps_per_second": 99.156, "step": 1500 }, { "epoch": 3.548655392292764, "grad_norm": 0.23534879088401794, "learning_rate": 5.9859953317772595e-06, "loss": 6.503, "step": 1600 }, { "epoch": 3.548655392292764, "eval_loss": 6.473758220672607, "eval_runtime": 100.8445, "eval_samples_per_second": 99.163, "eval_steps_per_second": 99.163, "step": 1600 }, { "epoch": 3.770446354311062, "grad_norm": 0.3312935531139374, "learning_rate": 5.984994998332777e-06, "loss": 6.4991, "step": 1700 }, { "epoch": 3.770446354311062, "eval_loss": 6.471902370452881, "eval_runtime": 104.4468, "eval_samples_per_second": 95.743, "eval_steps_per_second": 95.743, "step": 1700 }, { "epoch": 3.9922373163293594, "grad_norm": 0.27324172854423523, "learning_rate": 5.983994664888296e-06, "loss": 6.4936, "step": 1800 }, { "epoch": 3.9922373163293594, "eval_loss": 6.464596271514893, "eval_runtime": 100.6385, "eval_samples_per_second": 99.366, "eval_steps_per_second": 99.366, "step": 1800 }, { "epoch": 4.214028278347658, "grad_norm": 0.29278630018234253, "learning_rate": 5.982994331443815e-06, "loss": 6.4875, "step": 1900 }, { "epoch": 4.214028278347658, "eval_loss": 6.462095260620117, "eval_runtime": 100.6404, "eval_samples_per_second": 99.364, "eval_steps_per_second": 99.364, "step": 1900 }, { "epoch": 4.435819240365955, "grad_norm": 0.26022714376449585, "learning_rate": 5.981993997999333e-06, "loss": 6.4834, "step": 2000 }, { "epoch": 4.435819240365955, "eval_loss": 6.45832633972168, "eval_runtime": 104.5104, "eval_samples_per_second": 95.684, "eval_steps_per_second": 95.684, "step": 2000 }, { "epoch": 4.6576102023842525, "grad_norm": 0.7873703837394714, "learning_rate": 5.980993664554851e-06, "loss": 6.4796, "step": 2100 }, { "epoch": 4.6576102023842525, "eval_loss": 6.456444263458252, "eval_runtime": 100.8687, "eval_samples_per_second": 99.139, "eval_steps_per_second": 99.139, "step": 2100 }, { "epoch": 4.887163848073191, "grad_norm": 0.7525845766067505, "learning_rate": 5.979993331110371e-06, "loss": 6.4755, "step": 2200 }, { "epoch": 4.887163848073191, "eval_loss": 6.453465938568115, "eval_runtime": 66.4579, "eval_samples_per_second": 150.471, "eval_steps_per_second": 18.809, "step": 2200 }, { "epoch": 5.108954810091489, "grad_norm": 0.5191181302070618, "learning_rate": 5.978992997665889e-06, "loss": 6.472, "step": 2300 }, { "epoch": 5.108954810091489, "eval_loss": 6.44980525970459, "eval_runtime": 63.8377, "eval_samples_per_second": 156.647, "eval_steps_per_second": 19.581, "step": 2300 }, { "epoch": 5.330745772109786, "grad_norm": 0.31189826130867004, "learning_rate": 5.9779926642214075e-06, "loss": 6.4681, "step": 2400 }, { "epoch": 5.330745772109786, "eval_loss": 6.448277473449707, "eval_runtime": 63.9509, "eval_samples_per_second": 156.37, "eval_steps_per_second": 19.546, "step": 2400 }, { "epoch": 5.5525367341280845, "grad_norm": 0.4947231113910675, "learning_rate": 5.976992330776926e-06, "loss": 6.4659, "step": 2500 }, { "epoch": 5.5525367341280845, "eval_loss": 6.4454731941223145, "eval_runtime": 66.4235, "eval_samples_per_second": 150.549, "eval_steps_per_second": 18.819, "step": 2500 }, { "epoch": 5.774327696146382, "grad_norm": 0.22547227144241333, "learning_rate": 5.975991997332444e-06, "loss": 6.4619, "step": 2600 }, { "epoch": 5.774327696146382, "eval_loss": 6.444580554962158, "eval_runtime": 63.7522, "eval_samples_per_second": 156.857, "eval_steps_per_second": 19.607, "step": 2600 }, { "epoch": 5.99611865816468, "grad_norm": 0.2726474404335022, "learning_rate": 5.974991663887963e-06, "loss": 6.4594, "step": 2700 }, { "epoch": 5.99611865816468, "eval_loss": 6.44156551361084, "eval_runtime": 66.3901, "eval_samples_per_second": 150.625, "eval_steps_per_second": 18.828, "step": 2700 }, { "epoch": 6.2179096201829775, "grad_norm": 0.17645886540412903, "learning_rate": 5.973991330443481e-06, "loss": 6.4574, "step": 2800 }, { "epoch": 6.2179096201829775, "eval_loss": 6.4393510818481445, "eval_runtime": 63.8118, "eval_samples_per_second": 156.711, "eval_steps_per_second": 19.589, "step": 2800 }, { "epoch": 6.439700582201275, "grad_norm": 0.9444617629051208, "learning_rate": 5.972990996999e-06, "loss": 6.4546, "step": 2900 }, { "epoch": 6.439700582201275, "eval_loss": 6.439332008361816, "eval_runtime": 63.6523, "eval_samples_per_second": 157.103, "eval_steps_per_second": 19.638, "step": 2900 }, { "epoch": 6.661491544219573, "grad_norm": 0.4472251534461975, "learning_rate": 5.971990663554519e-06, "loss": 6.4515, "step": 3000 }, { "epoch": 6.661491544219573, "eval_loss": 6.435446262359619, "eval_runtime": 63.845, "eval_samples_per_second": 156.629, "eval_steps_per_second": 19.579, "step": 3000 }, { "epoch": 6.883282506237871, "grad_norm": 0.29884466528892517, "learning_rate": 5.970990330110037e-06, "loss": 6.4483, "step": 3100 }, { "epoch": 6.883282506237871, "eval_loss": 6.433766841888428, "eval_runtime": 66.4883, "eval_samples_per_second": 150.402, "eval_steps_per_second": 18.8, "step": 3100 }, { "epoch": 7.105073468256169, "grad_norm": 0.4576103687286377, "learning_rate": 5.9699899966655554e-06, "loss": 6.4465, "step": 3200 }, { "epoch": 7.105073468256169, "eval_loss": 6.432063102722168, "eval_runtime": 63.7483, "eval_samples_per_second": 156.867, "eval_steps_per_second": 19.608, "step": 3200 }, { "epoch": 7.326864430274466, "grad_norm": 0.1679336577653885, "learning_rate": 5.968989663221074e-06, "loss": 6.4453, "step": 3300 }, { "epoch": 7.326864430274466, "eval_loss": 6.430073261260986, "eval_runtime": 63.7036, "eval_samples_per_second": 156.977, "eval_steps_per_second": 19.622, "step": 3300 }, { "epoch": 7.548655392292764, "grad_norm": 0.3880283236503601, "learning_rate": 5.967989329776592e-06, "loss": 6.4406, "step": 3400 }, { "epoch": 7.548655392292764, "eval_loss": 6.431549072265625, "eval_runtime": 66.1695, "eval_samples_per_second": 151.127, "eval_steps_per_second": 18.891, "step": 3400 }, { "epoch": 7.770446354311062, "grad_norm": 0.8515690565109253, "learning_rate": 5.966988996332111e-06, "loss": 6.4413, "step": 3500 }, { "epoch": 7.770446354311062, "eval_loss": 6.42842435836792, "eval_runtime": 63.7187, "eval_samples_per_second": 156.94, "eval_steps_per_second": 19.617, "step": 3500 }, { "epoch": 7.992237316329359, "grad_norm": 0.4197738468647003, "learning_rate": 5.965988662887629e-06, "loss": 6.4404, "step": 3600 }, { "epoch": 7.992237316329359, "eval_loss": 6.429299354553223, "eval_runtime": 63.7081, "eval_samples_per_second": 156.966, "eval_steps_per_second": 19.621, "step": 3600 }, { "epoch": 8.214028278347657, "grad_norm": 0.16546382009983063, "learning_rate": 5.964988329443148e-06, "loss": 6.438, "step": 3700 }, { "epoch": 8.214028278347657, "eval_loss": 6.426889896392822, "eval_runtime": 66.066, "eval_samples_per_second": 151.364, "eval_steps_per_second": 18.92, "step": 3700 }, { "epoch": 8.435819240365955, "grad_norm": 0.48783496022224426, "learning_rate": 5.963987995998667e-06, "loss": 6.437, "step": 3800 }, { "epoch": 8.435819240365955, "eval_loss": 6.424874305725098, "eval_runtime": 63.6818, "eval_samples_per_second": 157.031, "eval_steps_per_second": 19.629, "step": 3800 }, { "epoch": 8.657610202384253, "grad_norm": 0.2994876205921173, "learning_rate": 5.962987662554185e-06, "loss": 6.434, "step": 3900 }, { "epoch": 8.657610202384253, "eval_loss": 6.428049087524414, "eval_runtime": 63.6981, "eval_samples_per_second": 156.991, "eval_steps_per_second": 19.624, "step": 3900 }, { "epoch": 8.87940116440255, "grad_norm": 0.26397526264190674, "learning_rate": 5.961987329109703e-06, "loss": 6.4344, "step": 4000 }, { "epoch": 8.87940116440255, "eval_loss": 6.427630424499512, "eval_runtime": 63.7853, "eval_samples_per_second": 156.776, "eval_steps_per_second": 19.597, "step": 4000 }, { "epoch": 9.101192126420848, "grad_norm": 0.6336208581924438, "learning_rate": 5.960986995665222e-06, "loss": 6.4322, "step": 4100 }, { "epoch": 9.101192126420848, "eval_loss": 6.423878192901611, "eval_runtime": 66.3296, "eval_samples_per_second": 150.762, "eval_steps_per_second": 18.845, "step": 4100 }, { "epoch": 9.322983088439146, "grad_norm": 0.5242211818695068, "learning_rate": 5.95998666222074e-06, "loss": 6.4302, "step": 4200 }, { "epoch": 9.322983088439146, "eval_loss": 6.42392110824585, "eval_runtime": 63.7079, "eval_samples_per_second": 156.966, "eval_steps_per_second": 19.621, "step": 4200 }, { "epoch": 9.544774050457445, "grad_norm": 0.49379467964172363, "learning_rate": 5.958986328776259e-06, "loss": 6.4307, "step": 4300 }, { "epoch": 9.544774050457445, "eval_loss": 6.422423839569092, "eval_runtime": 63.6859, "eval_samples_per_second": 157.021, "eval_steps_per_second": 19.628, "step": 4300 }, { "epoch": 9.766565012475741, "grad_norm": 0.305960476398468, "learning_rate": 5.957985995331777e-06, "loss": 6.4285, "step": 4400 }, { "epoch": 9.766565012475741, "eval_loss": 6.421577453613281, "eval_runtime": 66.1928, "eval_samples_per_second": 151.074, "eval_steps_per_second": 18.884, "step": 4400 }, { "epoch": 9.98835597449404, "grad_norm": 0.3036479353904724, "learning_rate": 5.956985661887296e-06, "loss": 6.4249, "step": 4500 }, { "epoch": 9.98835597449404, "eval_loss": 6.41899299621582, "eval_runtime": 63.6775, "eval_samples_per_second": 157.041, "eval_steps_per_second": 19.63, "step": 4500 }, { "epoch": 10.210146936512338, "grad_norm": 1.1105852127075195, "learning_rate": 5.955985328442815e-06, "loss": 6.4262, "step": 4600 }, { "epoch": 10.210146936512338, "eval_loss": 6.420323371887207, "eval_runtime": 63.5916, "eval_samples_per_second": 157.253, "eval_steps_per_second": 19.657, "step": 4600 }, { "epoch": 10.431937898530634, "grad_norm": 0.38992971181869507, "learning_rate": 5.954984994998333e-06, "loss": 6.4259, "step": 4700 }, { "epoch": 10.431937898530634, "eval_loss": 6.415469646453857, "eval_runtime": 63.7968, "eval_samples_per_second": 156.748, "eval_steps_per_second": 19.593, "step": 4700 }, { "epoch": 10.653728860548933, "grad_norm": 0.39246854186058044, "learning_rate": 5.953984661553851e-06, "loss": 6.4258, "step": 4800 }, { "epoch": 10.653728860548933, "eval_loss": 6.414693832397461, "eval_runtime": 66.2863, "eval_samples_per_second": 150.861, "eval_steps_per_second": 18.858, "step": 4800 }, { "epoch": 10.875519822567231, "grad_norm": 0.6589607000350952, "learning_rate": 5.95298432810937e-06, "loss": 6.4226, "step": 4900 }, { "epoch": 10.875519822567231, "eval_loss": 6.417821884155273, "eval_runtime": 63.7381, "eval_samples_per_second": 156.892, "eval_steps_per_second": 19.612, "step": 4900 }, { "epoch": 11.097310784585527, "grad_norm": 0.44160690903663635, "learning_rate": 5.951983994664888e-06, "loss": 6.4223, "step": 5000 }, { "epoch": 11.097310784585527, "eval_loss": 6.417135715484619, "eval_runtime": 63.7803, "eval_samples_per_second": 156.788, "eval_steps_per_second": 19.599, "step": 5000 }, { "epoch": 11.319101746603826, "grad_norm": 0.7182816863059998, "learning_rate": 5.950983661220407e-06, "loss": 6.4221, "step": 5100 }, { "epoch": 11.319101746603826, "eval_loss": 6.417608737945557, "eval_runtime": 66.5138, "eval_samples_per_second": 150.345, "eval_steps_per_second": 18.793, "step": 5100 }, { "epoch": 11.540892708622124, "grad_norm": 0.45741328597068787, "learning_rate": 5.949983327775925e-06, "loss": 6.4211, "step": 5200 }, { "epoch": 11.540892708622124, "eval_loss": 6.411616325378418, "eval_runtime": 63.8646, "eval_samples_per_second": 156.581, "eval_steps_per_second": 19.573, "step": 5200 }, { "epoch": 11.76268367064042, "grad_norm": 0.37045249342918396, "learning_rate": 5.948982994331444e-06, "loss": 6.4203, "step": 5300 }, { "epoch": 11.76268367064042, "eval_loss": 6.415543556213379, "eval_runtime": 63.6959, "eval_samples_per_second": 156.996, "eval_steps_per_second": 19.624, "step": 5300 }, { "epoch": 11.984474632658719, "grad_norm": 0.5875869989395142, "learning_rate": 5.947982660886963e-06, "loss": 6.4189, "step": 5400 }, { "epoch": 11.984474632658719, "eval_loss": 6.417328834533691, "eval_runtime": 63.8682, "eval_samples_per_second": 156.572, "eval_steps_per_second": 19.572, "step": 5400 }, { "epoch": 12.206265594677017, "grad_norm": 0.39769718050956726, "learning_rate": 5.9469823274424815e-06, "loss": 6.4185, "step": 5500 }, { "epoch": 12.206265594677017, "eval_loss": 6.417914390563965, "eval_runtime": 66.821, "eval_samples_per_second": 149.653, "eval_steps_per_second": 18.707, "step": 5500 }, { "epoch": 12.428056556695315, "grad_norm": 0.8144527673721313, "learning_rate": 5.945981993997999e-06, "loss": 6.417, "step": 5600 }, { "epoch": 12.428056556695315, "eval_loss": 6.414742946624756, "eval_runtime": 63.6455, "eval_samples_per_second": 157.12, "eval_steps_per_second": 19.64, "step": 5600 }, { "epoch": 12.649847518713612, "grad_norm": 0.304855078458786, "learning_rate": 5.944981660553518e-06, "loss": 6.4169, "step": 5700 }, { "epoch": 12.649847518713612, "eval_loss": 6.411574363708496, "eval_runtime": 63.6479, "eval_samples_per_second": 157.114, "eval_steps_per_second": 19.639, "step": 5700 }, { "epoch": 12.87163848073191, "grad_norm": 0.5774130821228027, "learning_rate": 5.943981327109036e-06, "loss": 6.4162, "step": 5800 }, { "epoch": 12.87163848073191, "eval_loss": 6.4110517501831055, "eval_runtime": 66.215, "eval_samples_per_second": 151.023, "eval_steps_per_second": 18.878, "step": 5800 }, { "epoch": 13.093429442750208, "grad_norm": 0.6892155408859253, "learning_rate": 5.942980993664555e-06, "loss": 6.414, "step": 5900 }, { "epoch": 13.093429442750208, "eval_loss": 6.413996696472168, "eval_runtime": 63.6174, "eval_samples_per_second": 157.19, "eval_steps_per_second": 19.649, "step": 5900 }, { "epoch": 13.315220404768505, "grad_norm": 0.5487566590309143, "learning_rate": 5.941980660220073e-06, "loss": 6.4153, "step": 6000 }, { "epoch": 13.315220404768505, "eval_loss": 6.414098739624023, "eval_runtime": 63.6464, "eval_samples_per_second": 157.118, "eval_steps_per_second": 19.64, "step": 6000 }, { "epoch": 13.537011366786803, "grad_norm": 0.7147879004478455, "learning_rate": 5.940980326775592e-06, "loss": 6.4132, "step": 6100 }, { "epoch": 13.537011366786803, "eval_loss": 6.411059379577637, "eval_runtime": 66.5345, "eval_samples_per_second": 150.298, "eval_steps_per_second": 18.787, "step": 6100 }, { "epoch": 13.758802328805102, "grad_norm": 0.4990188181400299, "learning_rate": 5.939979993331111e-06, "loss": 6.4127, "step": 6200 }, { "epoch": 13.758802328805102, "eval_loss": 6.411470890045166, "eval_runtime": 63.7718, "eval_samples_per_second": 156.809, "eval_steps_per_second": 19.601, "step": 6200 }, { "epoch": 13.9805932908234, "grad_norm": 0.3841017782688141, "learning_rate": 5.9389796598866294e-06, "loss": 6.4133, "step": 6300 }, { "epoch": 13.9805932908234, "eval_loss": 6.4090681076049805, "eval_runtime": 63.7617, "eval_samples_per_second": 156.834, "eval_steps_per_second": 19.604, "step": 6300 }, { "epoch": 14.202384252841696, "grad_norm": 0.3359989523887634, "learning_rate": 5.937979326442147e-06, "loss": 6.4107, "step": 6400 }, { "epoch": 14.202384252841696, "eval_loss": 6.409322738647461, "eval_runtime": 63.5969, "eval_samples_per_second": 157.24, "eval_steps_per_second": 19.655, "step": 6400 }, { "epoch": 14.424175214859995, "grad_norm": 0.5810059905052185, "learning_rate": 5.936978992997666e-06, "loss": 6.411, "step": 6500 }, { "epoch": 14.424175214859995, "eval_loss": 6.411257743835449, "eval_runtime": 66.5523, "eval_samples_per_second": 150.258, "eval_steps_per_second": 18.782, "step": 6500 }, { "epoch": 14.645966176878293, "grad_norm": 0.45823681354522705, "learning_rate": 5.935978659553185e-06, "loss": 6.4107, "step": 6600 }, { "epoch": 14.645966176878293, "eval_loss": 6.4073872566223145, "eval_runtime": 63.6788, "eval_samples_per_second": 157.038, "eval_steps_per_second": 19.63, "step": 6600 }, { "epoch": 14.86775713889659, "grad_norm": 0.6735783815383911, "learning_rate": 5.9349783261087026e-06, "loss": 6.4112, "step": 6700 }, { "epoch": 14.86775713889659, "eval_loss": 6.411919593811035, "eval_runtime": 63.7297, "eval_samples_per_second": 156.913, "eval_steps_per_second": 19.614, "step": 6700 }, { "epoch": 15.089548100914888, "grad_norm": 0.5670196413993835, "learning_rate": 5.933977992664221e-06, "loss": 6.4099, "step": 6800 }, { "epoch": 15.089548100914888, "eval_loss": 6.407878875732422, "eval_runtime": 66.3771, "eval_samples_per_second": 150.654, "eval_steps_per_second": 18.832, "step": 6800 }, { "epoch": 15.311339062933186, "grad_norm": 0.3068266808986664, "learning_rate": 5.93297765921974e-06, "loss": 6.4089, "step": 6900 }, { "epoch": 15.311339062933186, "eval_loss": 6.4104766845703125, "eval_runtime": 63.6627, "eval_samples_per_second": 157.078, "eval_steps_per_second": 19.635, "step": 6900 }, { "epoch": 15.533130024951483, "grad_norm": 0.8304972052574158, "learning_rate": 5.931977325775259e-06, "loss": 6.409, "step": 7000 }, { "epoch": 15.533130024951483, "eval_loss": 6.414528846740723, "eval_runtime": 63.6701, "eval_samples_per_second": 157.06, "eval_steps_per_second": 19.632, "step": 7000 }, { "epoch": 15.75492098696978, "grad_norm": 0.5522041916847229, "learning_rate": 5.930976992330777e-06, "loss": 6.4089, "step": 7100 }, { "epoch": 15.75492098696978, "eval_loss": 6.407095909118652, "eval_runtime": 66.1999, "eval_samples_per_second": 151.058, "eval_steps_per_second": 18.882, "step": 7100 }, { "epoch": 15.97671194898808, "grad_norm": 0.373626708984375, "learning_rate": 5.929976658886295e-06, "loss": 6.4071, "step": 7200 }, { "epoch": 15.97671194898808, "eval_loss": 6.4060258865356445, "eval_runtime": 63.7049, "eval_samples_per_second": 156.974, "eval_steps_per_second": 19.622, "step": 7200 }, { "epoch": 16.198502911006376, "grad_norm": 0.3747236132621765, "learning_rate": 5.928976325441814e-06, "loss": 6.4072, "step": 7300 }, { "epoch": 16.198502911006376, "eval_loss": 6.403803825378418, "eval_runtime": 63.6478, "eval_samples_per_second": 157.115, "eval_steps_per_second": 19.639, "step": 7300 }, { "epoch": 16.420293873024676, "grad_norm": 0.9381150007247925, "learning_rate": 5.927975991997333e-06, "loss": 6.4068, "step": 7400 }, { "epoch": 16.420293873024676, "eval_loss": 6.406477451324463, "eval_runtime": 66.4296, "eval_samples_per_second": 150.535, "eval_steps_per_second": 18.817, "step": 7400 }, { "epoch": 16.642084835042972, "grad_norm": 0.4905136823654175, "learning_rate": 5.9269756585528505e-06, "loss": 6.4047, "step": 7500 }, { "epoch": 16.642084835042972, "eval_loss": 6.4078850746154785, "eval_runtime": 63.7258, "eval_samples_per_second": 156.922, "eval_steps_per_second": 19.615, "step": 7500 }, { "epoch": 16.86387579706127, "grad_norm": 0.5776643753051758, "learning_rate": 5.92597532510837e-06, "loss": 6.4054, "step": 7600 }, { "epoch": 16.86387579706127, "eval_loss": 6.403768539428711, "eval_runtime": 63.7461, "eval_samples_per_second": 156.872, "eval_steps_per_second": 19.609, "step": 7600 }, { "epoch": 17.08566675907957, "grad_norm": 0.791892945766449, "learning_rate": 5.924974991663888e-06, "loss": 6.4051, "step": 7700 }, { "epoch": 17.08566675907957, "eval_loss": 6.403835773468018, "eval_runtime": 63.9137, "eval_samples_per_second": 156.461, "eval_steps_per_second": 19.558, "step": 7700 }, { "epoch": 17.307457721097865, "grad_norm": 0.485984206199646, "learning_rate": 5.923974658219407e-06, "loss": 6.4058, "step": 7800 }, { "epoch": 17.307457721097865, "eval_loss": 6.405175685882568, "eval_runtime": 66.2871, "eval_samples_per_second": 150.859, "eval_steps_per_second": 18.857, "step": 7800 }, { "epoch": 17.529248683116162, "grad_norm": 1.0781219005584717, "learning_rate": 5.922974324774925e-06, "loss": 6.4037, "step": 7900 }, { "epoch": 17.529248683116162, "eval_loss": 6.408561706542969, "eval_runtime": 66.8857, "eval_samples_per_second": 149.509, "eval_steps_per_second": 18.689, "step": 7900 }, { "epoch": 17.751039645134462, "grad_norm": 0.6358538269996643, "learning_rate": 5.921973991330443e-06, "loss": 6.403, "step": 8000 }, { "epoch": 17.751039645134462, "eval_loss": 6.402519702911377, "eval_runtime": 63.7653, "eval_samples_per_second": 156.825, "eval_steps_per_second": 19.603, "step": 8000 }, { "epoch": 17.97283060715276, "grad_norm": 0.5632463097572327, "learning_rate": 5.920973657885962e-06, "loss": 6.4034, "step": 8100 }, { "epoch": 17.97283060715276, "eval_loss": 6.403571128845215, "eval_runtime": 63.7754, "eval_samples_per_second": 156.8, "eval_steps_per_second": 19.6, "step": 8100 }, { "epoch": 18.194621569171055, "grad_norm": 0.23312948644161224, "learning_rate": 5.919973324441481e-06, "loss": 6.4048, "step": 8200 }, { "epoch": 18.194621569171055, "eval_loss": 6.404890060424805, "eval_runtime": 63.6831, "eval_samples_per_second": 157.028, "eval_steps_per_second": 19.628, "step": 8200 }, { "epoch": 18.416412531189355, "grad_norm": 0.5255222916603088, "learning_rate": 5.918972990996999e-06, "loss": 6.4018, "step": 8300 }, { "epoch": 18.416412531189355, "eval_loss": 6.401614665985107, "eval_runtime": 66.6013, "eval_samples_per_second": 150.147, "eval_steps_per_second": 18.768, "step": 8300 }, { "epoch": 18.63820349320765, "grad_norm": 0.44263362884521484, "learning_rate": 5.917972657552518e-06, "loss": 6.4018, "step": 8400 }, { "epoch": 18.63820349320765, "eval_loss": 6.40390682220459, "eval_runtime": 63.7484, "eval_samples_per_second": 156.867, "eval_steps_per_second": 19.608, "step": 8400 }, { "epoch": 18.859994455225948, "grad_norm": 0.5826687812805176, "learning_rate": 5.916972324108037e-06, "loss": 6.402, "step": 8500 }, { "epoch": 18.859994455225948, "eval_loss": 6.401444911956787, "eval_runtime": 63.73, "eval_samples_per_second": 156.912, "eval_steps_per_second": 19.614, "step": 8500 }, { "epoch": 19.081785417244248, "grad_norm": 0.5808525681495667, "learning_rate": 5.915971990663555e-06, "loss": 6.4031, "step": 8600 }, { "epoch": 19.081785417244248, "eval_loss": 6.398373126983643, "eval_runtime": 66.6574, "eval_samples_per_second": 150.021, "eval_steps_per_second": 18.753, "step": 8600 }, { "epoch": 19.303576379262545, "grad_norm": 0.9179806113243103, "learning_rate": 5.914971657219073e-06, "loss": 6.4019, "step": 8700 }, { "epoch": 19.303576379262545, "eval_loss": 6.399080276489258, "eval_runtime": 63.6271, "eval_samples_per_second": 157.166, "eval_steps_per_second": 19.646, "step": 8700 }, { "epoch": 19.52536734128084, "grad_norm": 0.45992511510849, "learning_rate": 5.913971323774591e-06, "loss": 6.4, "step": 8800 }, { "epoch": 19.52536734128084, "eval_loss": 6.403900623321533, "eval_runtime": 63.7034, "eval_samples_per_second": 156.977, "eval_steps_per_second": 19.622, "step": 8800 }, { "epoch": 19.74715830329914, "grad_norm": 0.702781081199646, "learning_rate": 5.91297099033011e-06, "loss": 6.3993, "step": 8900 }, { "epoch": 19.74715830329914, "eval_loss": 6.401424884796143, "eval_runtime": 66.2276, "eval_samples_per_second": 150.994, "eval_steps_per_second": 18.874, "step": 8900 }, { "epoch": 19.968949265317438, "grad_norm": 0.6189502477645874, "learning_rate": 5.911970656885629e-06, "loss": 6.3999, "step": 9000 }, { "epoch": 19.968949265317438, "eval_loss": 6.400846481323242, "eval_runtime": 63.7467, "eval_samples_per_second": 156.871, "eval_steps_per_second": 19.609, "step": 9000 }, { "epoch": 20.190740227335738, "grad_norm": 0.37635141611099243, "learning_rate": 5.910970323441147e-06, "loss": 6.3994, "step": 9100 }, { "epoch": 20.190740227335738, "eval_loss": 6.402886867523193, "eval_runtime": 63.6159, "eval_samples_per_second": 157.193, "eval_steps_per_second": 19.649, "step": 9100 }, { "epoch": 20.412531189354034, "grad_norm": 0.5809453129768372, "learning_rate": 5.909969989996666e-06, "loss": 6.3996, "step": 9200 }, { "epoch": 20.412531189354034, "eval_loss": 6.399085998535156, "eval_runtime": 66.2096, "eval_samples_per_second": 151.035, "eval_steps_per_second": 18.879, "step": 9200 }, { "epoch": 20.63432215137233, "grad_norm": 0.535410463809967, "learning_rate": 5.908969656552185e-06, "loss": 6.3985, "step": 9300 }, { "epoch": 20.63432215137233, "eval_loss": 6.399356842041016, "eval_runtime": 63.8098, "eval_samples_per_second": 156.716, "eval_steps_per_second": 19.589, "step": 9300 }, { "epoch": 20.85611311339063, "grad_norm": 0.5065354108810425, "learning_rate": 5.907969323107703e-06, "loss": 6.3993, "step": 9400 }, { "epoch": 20.85611311339063, "eval_loss": 6.401696681976318, "eval_runtime": 63.6775, "eval_samples_per_second": 157.041, "eval_steps_per_second": 19.63, "step": 9400 }, { "epoch": 21.077904075408927, "grad_norm": 0.4803392291069031, "learning_rate": 5.906968989663221e-06, "loss": 6.4003, "step": 9500 }, { "epoch": 21.077904075408927, "eval_loss": 6.399422645568848, "eval_runtime": 63.6426, "eval_samples_per_second": 157.127, "eval_steps_per_second": 19.641, "step": 9500 }, { "epoch": 21.299695037427224, "grad_norm": 0.7447142004966736, "learning_rate": 5.90596865621874e-06, "loss": 6.3992, "step": 9600 }, { "epoch": 21.299695037427224, "eval_loss": 6.397017002105713, "eval_runtime": 66.4941, "eval_samples_per_second": 150.389, "eval_steps_per_second": 18.799, "step": 9600 }, { "epoch": 21.521485999445524, "grad_norm": 0.2856753468513489, "learning_rate": 5.904968322774258e-06, "loss": 6.3999, "step": 9700 }, { "epoch": 21.521485999445524, "eval_loss": 6.400000095367432, "eval_runtime": 63.7186, "eval_samples_per_second": 156.94, "eval_steps_per_second": 19.618, "step": 9700 }, { "epoch": 21.74327696146382, "grad_norm": 0.8077158331871033, "learning_rate": 5.9039679893297766e-06, "loss": 6.3981, "step": 9800 }, { "epoch": 21.74327696146382, "eval_loss": 6.398531436920166, "eval_runtime": 63.7668, "eval_samples_per_second": 156.821, "eval_steps_per_second": 19.603, "step": 9800 }, { "epoch": 21.965067923482117, "grad_norm": 0.8744412660598755, "learning_rate": 5.902967655885295e-06, "loss": 6.3988, "step": 9900 }, { "epoch": 21.965067923482117, "eval_loss": 6.396906852722168, "eval_runtime": 66.2535, "eval_samples_per_second": 150.935, "eval_steps_per_second": 18.867, "step": 9900 }, { "epoch": 22.186858885500417, "grad_norm": 0.44601574540138245, "learning_rate": 5.901967322440814e-06, "loss": 6.3969, "step": 10000 }, { "epoch": 22.186858885500417, "eval_loss": 6.395452976226807, "eval_runtime": 63.6969, "eval_samples_per_second": 156.994, "eval_steps_per_second": 19.624, "step": 10000 }, { "epoch": 22.408649847518713, "grad_norm": 0.6895701289176941, "learning_rate": 5.900966988996333e-06, "loss": 6.3967, "step": 10100 }, { "epoch": 22.408649847518713, "eval_loss": 6.40028190612793, "eval_runtime": 63.7023, "eval_samples_per_second": 156.98, "eval_steps_per_second": 19.623, "step": 10100 }, { "epoch": 22.63044080953701, "grad_norm": 0.6166660189628601, "learning_rate": 5.8999666555518505e-06, "loss": 6.3968, "step": 10200 }, { "epoch": 22.63044080953701, "eval_loss": 6.397933483123779, "eval_runtime": 66.8627, "eval_samples_per_second": 149.56, "eval_steps_per_second": 18.695, "step": 10200 }, { "epoch": 22.85223177155531, "grad_norm": 1.0633758306503296, "learning_rate": 5.898966322107369e-06, "loss": 6.3976, "step": 10300 }, { "epoch": 22.85223177155531, "eval_loss": 6.396650791168213, "eval_runtime": 63.7935, "eval_samples_per_second": 156.756, "eval_steps_per_second": 19.594, "step": 10300 }, { "epoch": 23.074022733573607, "grad_norm": 0.4864283502101898, "learning_rate": 5.897965988662888e-06, "loss": 6.3967, "step": 10400 }, { "epoch": 23.074022733573607, "eval_loss": 6.39711332321167, "eval_runtime": 63.6284, "eval_samples_per_second": 157.163, "eval_steps_per_second": 19.645, "step": 10400 }, { "epoch": 23.295813695591903, "grad_norm": 0.65082186460495, "learning_rate": 5.896965655218406e-06, "loss": 6.3973, "step": 10500 }, { "epoch": 23.295813695591903, "eval_loss": 6.395853519439697, "eval_runtime": 66.242, "eval_samples_per_second": 150.962, "eval_steps_per_second": 18.87, "step": 10500 }, { "epoch": 23.517604657610203, "grad_norm": 0.45799535512924194, "learning_rate": 5.8959653217739245e-06, "loss": 6.396, "step": 10600 }, { "epoch": 23.517604657610203, "eval_loss": 6.398243427276611, "eval_runtime": 63.7686, "eval_samples_per_second": 156.817, "eval_steps_per_second": 19.602, "step": 10600 }, { "epoch": 23.7393956196285, "grad_norm": 0.5860775709152222, "learning_rate": 5.894964988329443e-06, "loss": 6.3956, "step": 10700 }, { "epoch": 23.7393956196285, "eval_loss": 6.3961687088012695, "eval_runtime": 67.0182, "eval_samples_per_second": 149.213, "eval_steps_per_second": 18.652, "step": 10700 }, { "epoch": 23.9611865816468, "grad_norm": 0.5584791898727417, "learning_rate": 5.893964654884962e-06, "loss": 6.3957, "step": 10800 }, { "epoch": 23.9611865816468, "eval_loss": 6.396393775939941, "eval_runtime": 63.8981, "eval_samples_per_second": 156.499, "eval_steps_per_second": 19.562, "step": 10800 }, { "epoch": 24.182977543665096, "grad_norm": 0.7845295667648315, "learning_rate": 5.892964321440481e-06, "loss": 6.3956, "step": 10900 }, { "epoch": 24.182977543665096, "eval_loss": 6.397210121154785, "eval_runtime": 64.0302, "eval_samples_per_second": 156.176, "eval_steps_per_second": 19.522, "step": 10900 }, { "epoch": 24.404768505683393, "grad_norm": 0.564857006072998, "learning_rate": 5.8919639879959985e-06, "loss": 6.3955, "step": 11000 }, { "epoch": 24.404768505683393, "eval_loss": 6.395459175109863, "eval_runtime": 67.2462, "eval_samples_per_second": 148.707, "eval_steps_per_second": 18.588, "step": 11000 }, { "epoch": 24.665372886054893, "grad_norm": 0.7520161271095276, "learning_rate": 4.906354515050168e-06, "loss": 6.3944, "step": 11100 }, { "epoch": 24.665372886054893, "eval_loss": 6.389779567718506, "eval_runtime": 87.8112, "eval_samples_per_second": 113.881, "eval_steps_per_second": 14.235, "step": 11100 }, { "epoch": 24.88716384807319, "grad_norm": 0.6003276705741882, "learning_rate": 4.8963210702341136e-06, "loss": 6.394, "step": 11200 }, { "epoch": 24.88716384807319, "eval_loss": 6.394806861877441, "eval_runtime": 75.8812, "eval_samples_per_second": 131.785, "eval_steps_per_second": 16.473, "step": 11200 }, { "epoch": 25.10895481009149, "grad_norm": 0.28259870409965515, "learning_rate": 4.88628762541806e-06, "loss": 6.3945, "step": 11300 }, { "epoch": 25.10895481009149, "eval_loss": 6.398300647735596, "eval_runtime": 88.2774, "eval_samples_per_second": 113.279, "eval_steps_per_second": 14.16, "step": 11300 }, { "epoch": 25.330745772109786, "grad_norm": 0.30802807211875916, "learning_rate": 4.876254180602007e-06, "loss": 6.3941, "step": 11400 }, { "epoch": 25.330745772109786, "eval_loss": 6.394501686096191, "eval_runtime": 66.156, "eval_samples_per_second": 151.158, "eval_steps_per_second": 18.895, "step": 11400 }, { "epoch": 25.552536734128083, "grad_norm": 0.5175557732582092, "learning_rate": 4.866220735785953e-06, "loss": 6.394, "step": 11500 }, { "epoch": 25.552536734128083, "eval_loss": 6.3985795974731445, "eval_runtime": 63.6993, "eval_samples_per_second": 156.988, "eval_steps_per_second": 19.623, "step": 11500 }, { "epoch": 25.774327696146383, "grad_norm": 0.5214359164237976, "learning_rate": 4.8561872909699e-06, "loss": 6.3942, "step": 11600 }, { "epoch": 25.774327696146383, "eval_loss": 6.391521453857422, "eval_runtime": 63.6987, "eval_samples_per_second": 156.989, "eval_steps_per_second": 19.624, "step": 11600 }, { "epoch": 25.99611865816468, "grad_norm": 0.5827904343605042, "learning_rate": 4.8461538461538465e-06, "loss": 6.3953, "step": 11700 }, { "epoch": 25.99611865816468, "eval_loss": 6.393467903137207, "eval_runtime": 66.2727, "eval_samples_per_second": 150.892, "eval_steps_per_second": 18.861, "step": 11700 }, { "epoch": 26.21790962018298, "grad_norm": 0.24229009449481964, "learning_rate": 4.8361204013377925e-06, "loss": 6.3945, "step": 11800 }, { "epoch": 26.21790962018298, "eval_loss": 6.39454460144043, "eval_runtime": 63.6782, "eval_samples_per_second": 157.04, "eval_steps_per_second": 19.63, "step": 11800 }, { "epoch": 26.439700582201276, "grad_norm": 0.6859923005104065, "learning_rate": 4.826086956521739e-06, "loss": 6.3929, "step": 11900 }, { "epoch": 26.439700582201276, "eval_loss": 6.394321918487549, "eval_runtime": 66.2701, "eval_samples_per_second": 150.898, "eval_steps_per_second": 18.862, "step": 11900 }, { "epoch": 26.661491544219572, "grad_norm": 0.4267604947090149, "learning_rate": 4.816053511705686e-06, "loss": 6.3941, "step": 12000 }, { "epoch": 26.661491544219572, "eval_loss": 6.394528865814209, "eval_runtime": 63.7313, "eval_samples_per_second": 156.909, "eval_steps_per_second": 19.614, "step": 12000 }, { "epoch": 26.883282506237872, "grad_norm": 0.43895894289016724, "learning_rate": 4.806020066889633e-06, "loss": 6.3929, "step": 12100 }, { "epoch": 26.883282506237872, "eval_loss": 6.3936076164245605, "eval_runtime": 66.3275, "eval_samples_per_second": 150.767, "eval_steps_per_second": 18.846, "step": 12100 }, { "epoch": 27.10507346825617, "grad_norm": 0.3438960015773773, "learning_rate": 4.795986622073579e-06, "loss": 6.3933, "step": 12200 }, { "epoch": 27.10507346825617, "eval_loss": 6.397474765777588, "eval_runtime": 63.621, "eval_samples_per_second": 157.181, "eval_steps_per_second": 19.648, "step": 12200 }, { "epoch": 27.326864430274465, "grad_norm": 0.5950188636779785, "learning_rate": 4.785953177257525e-06, "loss": 6.394, "step": 12300 }, { "epoch": 27.326864430274465, "eval_loss": 6.393238544464111, "eval_runtime": 63.6999, "eval_samples_per_second": 156.986, "eval_steps_per_second": 19.623, "step": 12300 }, { "epoch": 27.548655392292765, "grad_norm": 0.34001484513282776, "learning_rate": 4.775919732441472e-06, "loss": 6.3947, "step": 12400 }, { "epoch": 27.548655392292765, "eval_loss": 6.394363880157471, "eval_runtime": 66.2457, "eval_samples_per_second": 150.953, "eval_steps_per_second": 18.869, "step": 12400 }, { "epoch": 27.770446354311062, "grad_norm": 0.47045424580574036, "learning_rate": 4.765886287625418e-06, "loss": 6.3929, "step": 12500 }, { "epoch": 27.770446354311062, "eval_loss": 6.393606185913086, "eval_runtime": 63.7187, "eval_samples_per_second": 156.94, "eval_steps_per_second": 19.617, "step": 12500 }, { "epoch": 27.99223731632936, "grad_norm": 0.6604583859443665, "learning_rate": 4.755852842809365e-06, "loss": 6.3931, "step": 12600 }, { "epoch": 27.99223731632936, "eval_loss": 6.39324426651001, "eval_runtime": 63.6887, "eval_samples_per_second": 157.014, "eval_steps_per_second": 19.627, "step": 12600 }, { "epoch": 28.21402827834766, "grad_norm": 0.6491646766662598, "learning_rate": 4.745819397993312e-06, "loss": 6.3912, "step": 12700 }, { "epoch": 28.21402827834766, "eval_loss": 6.394981384277344, "eval_runtime": 66.2742, "eval_samples_per_second": 150.888, "eval_steps_per_second": 18.861, "step": 12700 }, { "epoch": 28.435819240365955, "grad_norm": 0.5381952524185181, "learning_rate": 4.7357859531772575e-06, "loss": 6.3929, "step": 12800 }, { "epoch": 28.435819240365955, "eval_loss": 6.392743110656738, "eval_runtime": 63.6892, "eval_samples_per_second": 157.012, "eval_steps_per_second": 19.627, "step": 12800 }, { "epoch": 28.65761020238425, "grad_norm": 0.7769903540611267, "learning_rate": 4.725752508361204e-06, "loss": 6.3927, "step": 12900 }, { "epoch": 28.65761020238425, "eval_loss": 6.390952110290527, "eval_runtime": 66.3226, "eval_samples_per_second": 150.778, "eval_steps_per_second": 18.847, "step": 12900 }, { "epoch": 28.87940116440255, "grad_norm": 0.4297138452529907, "learning_rate": 4.715719063545151e-06, "loss": 6.393, "step": 13000 }, { "epoch": 28.87940116440255, "eval_loss": 6.390758037567139, "eval_runtime": 63.8216, "eval_samples_per_second": 156.687, "eval_steps_per_second": 19.586, "step": 13000 }, { "epoch": 29.101192126420848, "grad_norm": 0.7731721997261047, "learning_rate": 4.705685618729097e-06, "loss": 6.3923, "step": 13100 }, { "epoch": 29.101192126420848, "eval_loss": 6.392960071563721, "eval_runtime": 63.6867, "eval_samples_per_second": 157.019, "eval_steps_per_second": 19.627, "step": 13100 }, { "epoch": 29.322983088439145, "grad_norm": 0.27714040875434875, "learning_rate": 4.695652173913044e-06, "loss": 6.3934, "step": 13200 }, { "epoch": 29.322983088439145, "eval_loss": 6.395288944244385, "eval_runtime": 66.2909, "eval_samples_per_second": 150.85, "eval_steps_per_second": 18.856, "step": 13200 }, { "epoch": 29.544774050457445, "grad_norm": 0.5391174554824829, "learning_rate": 4.6856187290969905e-06, "loss": 6.3927, "step": 13300 }, { "epoch": 29.544774050457445, "eval_loss": 6.395300388336182, "eval_runtime": 63.6935, "eval_samples_per_second": 157.002, "eval_steps_per_second": 19.625, "step": 13300 }, { "epoch": 29.76656501247574, "grad_norm": 0.9717122912406921, "learning_rate": 4.675585284280936e-06, "loss": 6.391, "step": 13400 }, { "epoch": 29.76656501247574, "eval_loss": 6.3939642906188965, "eval_runtime": 64.4676, "eval_samples_per_second": 155.117, "eval_steps_per_second": 19.39, "step": 13400 }, { "epoch": 29.988355974494038, "grad_norm": 0.3409580588340759, "learning_rate": 4.665551839464883e-06, "loss": 6.3929, "step": 13500 }, { "epoch": 29.988355974494038, "eval_loss": 6.393261909484863, "eval_runtime": 65.5531, "eval_samples_per_second": 152.548, "eval_steps_per_second": 19.069, "step": 13500 }, { "epoch": 30.210146936512338, "grad_norm": 0.7017607092857361, "learning_rate": 4.65551839464883e-06, "loss": 6.3914, "step": 13600 }, { "epoch": 30.210146936512338, "eval_loss": 6.389814853668213, "eval_runtime": 63.5889, "eval_samples_per_second": 157.26, "eval_steps_per_second": 19.658, "step": 13600 }, { "epoch": 30.431937898530634, "grad_norm": 0.494228720664978, "learning_rate": 4.645484949832776e-06, "loss": 6.3913, "step": 13700 }, { "epoch": 30.431937898530634, "eval_loss": 6.389814853668213, "eval_runtime": 63.6983, "eval_samples_per_second": 156.99, "eval_steps_per_second": 19.624, "step": 13700 }, { "epoch": 30.65372886054893, "grad_norm": 0.6848724484443665, "learning_rate": 4.635451505016723e-06, "loss": 6.3909, "step": 13800 }, { "epoch": 30.65372886054893, "eval_loss": 6.391334533691406, "eval_runtime": 66.3245, "eval_samples_per_second": 150.774, "eval_steps_per_second": 18.847, "step": 13800 }, { "epoch": 30.87551982256723, "grad_norm": 0.5187550187110901, "learning_rate": 4.625418060200669e-06, "loss": 6.3905, "step": 13900 }, { "epoch": 30.87551982256723, "eval_loss": 6.393035411834717, "eval_runtime": 63.667, "eval_samples_per_second": 157.067, "eval_steps_per_second": 19.633, "step": 13900 }, { "epoch": 31.097310784585527, "grad_norm": 0.4394451081752777, "learning_rate": 4.615384615384616e-06, "loss": 6.3902, "step": 14000 }, { "epoch": 31.097310784585527, "eval_loss": 6.391651630401611, "eval_runtime": 66.2607, "eval_samples_per_second": 150.919, "eval_steps_per_second": 18.865, "step": 14000 }, { "epoch": 31.319101746603828, "grad_norm": 0.6403105854988098, "learning_rate": 4.605351170568562e-06, "loss": 6.3904, "step": 14100 }, { "epoch": 31.319101746603828, "eval_loss": 6.390075206756592, "eval_runtime": 63.7818, "eval_samples_per_second": 156.785, "eval_steps_per_second": 19.598, "step": 14100 }, { "epoch": 31.540892708622124, "grad_norm": 0.41991308331489563, "learning_rate": 4.595317725752509e-06, "loss": 6.3915, "step": 14200 }, { "epoch": 31.540892708622124, "eval_loss": 6.390388488769531, "eval_runtime": 66.3061, "eval_samples_per_second": 150.816, "eval_steps_per_second": 18.852, "step": 14200 }, { "epoch": 31.76268367064042, "grad_norm": 0.5049502849578857, "learning_rate": 4.585284280936456e-06, "loss": 6.3901, "step": 14300 }, { "epoch": 31.76268367064042, "eval_loss": 6.394845485687256, "eval_runtime": 63.7361, "eval_samples_per_second": 156.897, "eval_steps_per_second": 19.612, "step": 14300 }, { "epoch": 31.98447463265872, "grad_norm": 0.5375522375106812, "learning_rate": 4.5752508361204015e-06, "loss": 6.3901, "step": 14400 }, { "epoch": 31.98447463265872, "eval_loss": 6.3919267654418945, "eval_runtime": 63.6609, "eval_samples_per_second": 157.082, "eval_steps_per_second": 19.635, "step": 14400 }, { "epoch": 32.206265594677014, "grad_norm": 0.6649445295333862, "learning_rate": 4.565217391304348e-06, "loss": 6.3897, "step": 14500 }, { "epoch": 32.206265594677014, "eval_loss": 6.391171932220459, "eval_runtime": 66.188, "eval_samples_per_second": 151.085, "eval_steps_per_second": 18.886, "step": 14500 }, { "epoch": 32.42805655669532, "grad_norm": 0.5367133021354675, "learning_rate": 4.555183946488295e-06, "loss": 6.3903, "step": 14600 }, { "epoch": 32.42805655669532, "eval_loss": 6.390655517578125, "eval_runtime": 63.747, "eval_samples_per_second": 156.87, "eval_steps_per_second": 19.609, "step": 14600 }, { "epoch": 32.649847518713614, "grad_norm": 0.5683135986328125, "learning_rate": 4.545150501672241e-06, "loss": 6.3881, "step": 14700 }, { "epoch": 32.649847518713614, "eval_loss": 6.387674808502197, "eval_runtime": 63.678, "eval_samples_per_second": 157.04, "eval_steps_per_second": 19.63, "step": 14700 }, { "epoch": 32.87163848073191, "grad_norm": 0.697325587272644, "learning_rate": 4.535117056856188e-06, "loss": 6.3908, "step": 14800 }, { "epoch": 32.87163848073191, "eval_loss": 6.393805027008057, "eval_runtime": 63.7212, "eval_samples_per_second": 156.934, "eval_steps_per_second": 19.617, "step": 14800 }, { "epoch": 33.09342944275021, "grad_norm": 0.5757908225059509, "learning_rate": 4.5250836120401345e-06, "loss": 6.3907, "step": 14900 }, { "epoch": 33.09342944275021, "eval_loss": 6.393499851226807, "eval_runtime": 66.2096, "eval_samples_per_second": 151.035, "eval_steps_per_second": 18.879, "step": 14900 }, { "epoch": 33.3152204047685, "grad_norm": 0.3517054319381714, "learning_rate": 4.51505016722408e-06, "loss": 6.3902, "step": 15000 }, { "epoch": 33.3152204047685, "eval_loss": 6.386899471282959, "eval_runtime": 63.7082, "eval_samples_per_second": 156.966, "eval_steps_per_second": 19.621, "step": 15000 }, { "epoch": 33.53701136678681, "grad_norm": 0.7311076521873474, "learning_rate": 4.505016722408027e-06, "loss": 6.3905, "step": 15100 }, { "epoch": 33.53701136678681, "eval_loss": 6.391955375671387, "eval_runtime": 63.6711, "eval_samples_per_second": 157.057, "eval_steps_per_second": 19.632, "step": 15100 }, { "epoch": 33.7588023288051, "grad_norm": 0.4526328444480896, "learning_rate": 4.494983277591973e-06, "loss": 6.3891, "step": 15200 }, { "epoch": 33.7588023288051, "eval_loss": 6.390474796295166, "eval_runtime": 66.2489, "eval_samples_per_second": 150.946, "eval_steps_per_second": 18.868, "step": 15200 }, { "epoch": 33.9805932908234, "grad_norm": 0.5623629093170166, "learning_rate": 4.48494983277592e-06, "loss": 6.3901, "step": 15300 }, { "epoch": 33.9805932908234, "eval_loss": 6.388679027557373, "eval_runtime": 63.6854, "eval_samples_per_second": 157.022, "eval_steps_per_second": 19.628, "step": 15300 }, { "epoch": 34.202384252841696, "grad_norm": 0.49122416973114014, "learning_rate": 4.474916387959866e-06, "loss": 6.389, "step": 15400 }, { "epoch": 34.202384252841696, "eval_loss": 6.39013671875, "eval_runtime": 63.5858, "eval_samples_per_second": 157.268, "eval_steps_per_second": 19.658, "step": 15400 }, { "epoch": 34.42417521485999, "grad_norm": 0.674659013748169, "learning_rate": 4.4648829431438125e-06, "loss": 6.3887, "step": 15500 }, { "epoch": 34.42417521485999, "eval_loss": 6.392813205718994, "eval_runtime": 66.2307, "eval_samples_per_second": 150.987, "eval_steps_per_second": 18.873, "step": 15500 }, { "epoch": 34.64596617687829, "grad_norm": 0.43613201379776, "learning_rate": 4.454849498327759e-06, "loss": 6.3889, "step": 15600 }, { "epoch": 34.64596617687829, "eval_loss": 6.388660907745361, "eval_runtime": 63.6774, "eval_samples_per_second": 157.042, "eval_steps_per_second": 19.63, "step": 15600 }, { "epoch": 34.86775713889659, "grad_norm": 0.737578272819519, "learning_rate": 4.444816053511705e-06, "loss": 6.3894, "step": 15700 }, { "epoch": 34.86775713889659, "eval_loss": 6.389644145965576, "eval_runtime": 63.7079, "eval_samples_per_second": 156.966, "eval_steps_per_second": 19.621, "step": 15700 }, { "epoch": 35.08954810091489, "grad_norm": 0.4716251790523529, "learning_rate": 4.434782608695652e-06, "loss": 6.3885, "step": 15800 }, { "epoch": 35.08954810091489, "eval_loss": 6.392263412475586, "eval_runtime": 66.1971, "eval_samples_per_second": 151.064, "eval_steps_per_second": 18.883, "step": 15800 }, { "epoch": 35.311339062933186, "grad_norm": 0.47875767946243286, "learning_rate": 4.424749163879599e-06, "loss": 6.3886, "step": 15900 }, { "epoch": 35.311339062933186, "eval_loss": 6.389831066131592, "eval_runtime": 63.6821, "eval_samples_per_second": 157.03, "eval_steps_per_second": 19.629, "step": 15900 }, { "epoch": 35.53313002495148, "grad_norm": 0.43402403593063354, "learning_rate": 4.414715719063545e-06, "loss": 6.3909, "step": 16000 }, { "epoch": 35.53313002495148, "eval_loss": 6.389725208282471, "eval_runtime": 63.7124, "eval_samples_per_second": 156.955, "eval_steps_per_second": 19.619, "step": 16000 }, { "epoch": 35.75492098696978, "grad_norm": 0.5011460781097412, "learning_rate": 4.404682274247491e-06, "loss": 6.3891, "step": 16100 }, { "epoch": 35.75492098696978, "eval_loss": 6.388359546661377, "eval_runtime": 66.2636, "eval_samples_per_second": 150.912, "eval_steps_per_second": 18.864, "step": 16100 }, { "epoch": 35.976711948988076, "grad_norm": 0.4029878079891205, "learning_rate": 4.394648829431438e-06, "loss": 6.3875, "step": 16200 }, { "epoch": 35.976711948988076, "eval_loss": 6.387814044952393, "eval_runtime": 63.7085, "eval_samples_per_second": 156.965, "eval_steps_per_second": 19.621, "step": 16200 }, { "epoch": 36.19850291100638, "grad_norm": 0.5763450264930725, "learning_rate": 4.384615384615384e-06, "loss": 6.3889, "step": 16300 }, { "epoch": 36.19850291100638, "eval_loss": 6.389321327209473, "eval_runtime": 65.8717, "eval_samples_per_second": 151.81, "eval_steps_per_second": 18.976, "step": 16300 }, { "epoch": 36.420293873024676, "grad_norm": 0.4742737412452698, "learning_rate": 4.374581939799331e-06, "loss": 6.3886, "step": 16400 }, { "epoch": 36.420293873024676, "eval_loss": 6.388833522796631, "eval_runtime": 63.752, "eval_samples_per_second": 156.858, "eval_steps_per_second": 19.607, "step": 16400 }, { "epoch": 36.64208483504297, "grad_norm": 0.4631459414958954, "learning_rate": 4.364548494983278e-06, "loss": 6.3886, "step": 16500 }, { "epoch": 36.64208483504297, "eval_loss": 6.387075901031494, "eval_runtime": 63.6816, "eval_samples_per_second": 157.031, "eval_steps_per_second": 19.629, "step": 16500 }, { "epoch": 36.86387579706127, "grad_norm": 0.5047929286956787, "learning_rate": 4.354515050167224e-06, "loss": 6.3869, "step": 16600 }, { "epoch": 36.86387579706127, "eval_loss": 6.39074182510376, "eval_runtime": 64.7171, "eval_samples_per_second": 154.519, "eval_steps_per_second": 19.315, "step": 16600 }, { "epoch": 37.085666759079565, "grad_norm": 0.45218634605407715, "learning_rate": 4.34448160535117e-06, "loss": 6.3894, "step": 16700 }, { "epoch": 37.085666759079565, "eval_loss": 6.393436908721924, "eval_runtime": 64.9705, "eval_samples_per_second": 153.916, "eval_steps_per_second": 19.24, "step": 16700 }, { "epoch": 37.30745772109786, "grad_norm": 0.5652719736099243, "learning_rate": 4.334448160535117e-06, "loss": 6.3873, "step": 16800 }, { "epoch": 37.30745772109786, "eval_loss": 6.391731262207031, "eval_runtime": 63.565, "eval_samples_per_second": 157.319, "eval_steps_per_second": 19.665, "step": 16800 }, { "epoch": 37.529248683116165, "grad_norm": 0.28403371572494507, "learning_rate": 4.324414715719064e-06, "loss": 6.3882, "step": 16900 }, { "epoch": 37.529248683116165, "eval_loss": 6.390590190887451, "eval_runtime": 63.6107, "eval_samples_per_second": 157.206, "eval_steps_per_second": 19.651, "step": 16900 }, { "epoch": 37.75103964513446, "grad_norm": 0.477235347032547, "learning_rate": 4.31438127090301e-06, "loss": 6.3872, "step": 17000 }, { "epoch": 37.75103964513446, "eval_loss": 6.390269756317139, "eval_runtime": 66.2763, "eval_samples_per_second": 150.884, "eval_steps_per_second": 18.86, "step": 17000 }, { "epoch": 37.97283060715276, "grad_norm": 0.37472817301750183, "learning_rate": 4.3043478260869565e-06, "loss": 6.3874, "step": 17100 }, { "epoch": 37.97283060715276, "eval_loss": 6.390199184417725, "eval_runtime": 63.6243, "eval_samples_per_second": 157.173, "eval_steps_per_second": 19.647, "step": 17100 }, { "epoch": 38.194621569171055, "grad_norm": 0.3379691243171692, "learning_rate": 4.294314381270903e-06, "loss": 6.387, "step": 17200 }, { "epoch": 38.194621569171055, "eval_loss": 6.386340618133545, "eval_runtime": 63.5571, "eval_samples_per_second": 157.339, "eval_steps_per_second": 19.667, "step": 17200 }, { "epoch": 38.41641253118935, "grad_norm": 0.46496257185935974, "learning_rate": 4.284280936454849e-06, "loss": 6.3856, "step": 17300 }, { "epoch": 38.41641253118935, "eval_loss": 6.3855695724487305, "eval_runtime": 65.9737, "eval_samples_per_second": 151.576, "eval_steps_per_second": 18.947, "step": 17300 }, { "epoch": 38.638203493207655, "grad_norm": 0.37888166308403015, "learning_rate": 4.274247491638796e-06, "loss": 6.3884, "step": 17400 }, { "epoch": 38.638203493207655, "eval_loss": 6.388376235961914, "eval_runtime": 63.6302, "eval_samples_per_second": 157.158, "eval_steps_per_second": 19.645, "step": 17400 }, { "epoch": 38.85999445522595, "grad_norm": 0.25813955068588257, "learning_rate": 4.264214046822743e-06, "loss": 6.3885, "step": 17500 }, { "epoch": 38.85999445522595, "eval_loss": 6.389296054840088, "eval_runtime": 63.7359, "eval_samples_per_second": 156.897, "eval_steps_per_second": 19.612, "step": 17500 }, { "epoch": 39.08178541724425, "grad_norm": 0.4262288510799408, "learning_rate": 4.254180602006689e-06, "loss": 6.3873, "step": 17600 }, { "epoch": 39.08178541724425, "eval_loss": 6.389705657958984, "eval_runtime": 66.0125, "eval_samples_per_second": 151.486, "eval_steps_per_second": 18.936, "step": 17600 }, { "epoch": 39.303576379262545, "grad_norm": 0.5291593074798584, "learning_rate": 4.244147157190635e-06, "loss": 6.3875, "step": 17700 }, { "epoch": 39.303576379262545, "eval_loss": 6.390807628631592, "eval_runtime": 63.619, "eval_samples_per_second": 157.186, "eval_steps_per_second": 19.648, "step": 17700 }, { "epoch": 39.52536734128084, "grad_norm": 0.3667999505996704, "learning_rate": 4.234113712374582e-06, "loss": 6.3887, "step": 17800 }, { "epoch": 39.52536734128084, "eval_loss": 6.3871259689331055, "eval_runtime": 63.6878, "eval_samples_per_second": 157.016, "eval_steps_per_second": 19.627, "step": 17800 }, { "epoch": 39.74715830329914, "grad_norm": 0.40572404861450195, "learning_rate": 4.224080267558528e-06, "loss": 6.3877, "step": 17900 }, { "epoch": 39.74715830329914, "eval_loss": 6.387050628662109, "eval_runtime": 66.0715, "eval_samples_per_second": 151.351, "eval_steps_per_second": 18.919, "step": 17900 }, { "epoch": 39.96894926531744, "grad_norm": 0.5057101845741272, "learning_rate": 4.214046822742475e-06, "loss": 6.385, "step": 18000 }, { "epoch": 39.96894926531744, "eval_loss": 6.388771057128906, "eval_runtime": 63.6312, "eval_samples_per_second": 157.156, "eval_steps_per_second": 19.644, "step": 18000 }, { "epoch": 40.19074022733574, "grad_norm": 0.5846272110939026, "learning_rate": 4.2040133779264216e-06, "loss": 6.3873, "step": 18100 }, { "epoch": 40.19074022733574, "eval_loss": 6.388961315155029, "eval_runtime": 63.6009, "eval_samples_per_second": 157.23, "eval_steps_per_second": 19.654, "step": 18100 }, { "epoch": 40.412531189354034, "grad_norm": 0.40428778529167175, "learning_rate": 4.1939799331103675e-06, "loss": 6.3878, "step": 18200 }, { "epoch": 40.412531189354034, "eval_loss": 6.392088413238525, "eval_runtime": 66.128, "eval_samples_per_second": 151.222, "eval_steps_per_second": 18.903, "step": 18200 }, { "epoch": 40.63432215137233, "grad_norm": 0.46563634276390076, "learning_rate": 4.183946488294314e-06, "loss": 6.386, "step": 18300 }, { "epoch": 40.63432215137233, "eval_loss": 6.389146327972412, "eval_runtime": 63.6612, "eval_samples_per_second": 157.082, "eval_steps_per_second": 19.635, "step": 18300 }, { "epoch": 40.85611311339063, "grad_norm": 0.4533691704273224, "learning_rate": 4.173913043478261e-06, "loss": 6.3874, "step": 18400 }, { "epoch": 40.85611311339063, "eval_loss": 6.386475086212158, "eval_runtime": 63.7394, "eval_samples_per_second": 156.889, "eval_steps_per_second": 19.611, "step": 18400 }, { "epoch": 41.077904075408924, "grad_norm": 0.38121113181114197, "learning_rate": 4.163879598662208e-06, "loss": 6.3862, "step": 18500 }, { "epoch": 41.077904075408924, "eval_loss": 6.384340763092041, "eval_runtime": 65.9841, "eval_samples_per_second": 151.552, "eval_steps_per_second": 18.944, "step": 18500 }, { "epoch": 41.29969503742723, "grad_norm": 0.4599936604499817, "learning_rate": 4.153846153846154e-06, "loss": 6.3871, "step": 18600 }, { "epoch": 41.29969503742723, "eval_loss": 6.38564395904541, "eval_runtime": 63.6008, "eval_samples_per_second": 157.231, "eval_steps_per_second": 19.654, "step": 18600 }, { "epoch": 41.521485999445524, "grad_norm": 0.6862403154373169, "learning_rate": 4.1438127090301005e-06, "loss": 6.3867, "step": 18700 }, { "epoch": 41.521485999445524, "eval_loss": 6.385303020477295, "eval_runtime": 63.6207, "eval_samples_per_second": 157.181, "eval_steps_per_second": 19.648, "step": 18700 }, { "epoch": 41.74327696146382, "grad_norm": 0.26633918285369873, "learning_rate": 4.133779264214047e-06, "loss": 6.3869, "step": 18800 }, { "epoch": 41.74327696146382, "eval_loss": 6.389577388763428, "eval_runtime": 66.0775, "eval_samples_per_second": 151.337, "eval_steps_per_second": 18.917, "step": 18800 }, { "epoch": 41.96506792348212, "grad_norm": 0.30118024349212646, "learning_rate": 4.123745819397993e-06, "loss": 6.3869, "step": 18900 }, { "epoch": 41.96506792348212, "eval_loss": 6.387940406799316, "eval_runtime": 63.6813, "eval_samples_per_second": 157.032, "eval_steps_per_second": 19.629, "step": 18900 }, { "epoch": 42.18685888550041, "grad_norm": 0.6833294630050659, "learning_rate": 4.11371237458194e-06, "loss": 6.3857, "step": 19000 }, { "epoch": 42.18685888550041, "eval_loss": 6.3908514976501465, "eval_runtime": 66.0844, "eval_samples_per_second": 151.322, "eval_steps_per_second": 18.915, "step": 19000 }, { "epoch": 42.40864984751872, "grad_norm": 0.35510268807411194, "learning_rate": 4.103678929765887e-06, "loss": 6.3862, "step": 19100 }, { "epoch": 42.40864984751872, "eval_loss": 6.3866119384765625, "eval_runtime": 63.7625, "eval_samples_per_second": 156.832, "eval_steps_per_second": 19.604, "step": 19100 }, { "epoch": 42.63044080953701, "grad_norm": 0.5903100371360779, "learning_rate": 4.0936454849498326e-06, "loss": 6.3857, "step": 19200 }, { "epoch": 42.63044080953701, "eval_loss": 6.385927677154541, "eval_runtime": 63.6174, "eval_samples_per_second": 157.19, "eval_steps_per_second": 19.649, "step": 19200 }, { "epoch": 42.85223177155531, "grad_norm": 0.4845108091831207, "learning_rate": 4.083612040133779e-06, "loss": 6.387, "step": 19300 }, { "epoch": 42.85223177155531, "eval_loss": 6.38942289352417, "eval_runtime": 66.1264, "eval_samples_per_second": 151.226, "eval_steps_per_second": 18.903, "step": 19300 }, { "epoch": 43.07402273357361, "grad_norm": 0.3592558801174164, "learning_rate": 4.073578595317726e-06, "loss": 6.3862, "step": 19400 }, { "epoch": 43.07402273357361, "eval_loss": 6.389144420623779, "eval_runtime": 63.5655, "eval_samples_per_second": 157.318, "eval_steps_per_second": 19.665, "step": 19400 }, { "epoch": 43.2958136955919, "grad_norm": 0.5529589056968689, "learning_rate": 4.063545150501672e-06, "loss": 6.3842, "step": 19500 }, { "epoch": 43.2958136955919, "eval_loss": 6.386436939239502, "eval_runtime": 66.2264, "eval_samples_per_second": 150.997, "eval_steps_per_second": 18.875, "step": 19500 }, { "epoch": 43.5176046576102, "grad_norm": 0.42238518595695496, "learning_rate": 4.053511705685619e-06, "loss": 6.3866, "step": 19600 }, { "epoch": 43.5176046576102, "eval_loss": 6.385384559631348, "eval_runtime": 63.7683, "eval_samples_per_second": 156.818, "eval_steps_per_second": 19.602, "step": 19600 }, { "epoch": 43.7393956196285, "grad_norm": 0.5223355293273926, "learning_rate": 4.0434782608695655e-06, "loss": 6.3853, "step": 19700 }, { "epoch": 43.7393956196285, "eval_loss": 6.385824203491211, "eval_runtime": 63.6506, "eval_samples_per_second": 157.108, "eval_steps_per_second": 19.638, "step": 19700 }, { "epoch": 43.9611865816468, "grad_norm": 0.46218928694725037, "learning_rate": 4.0334448160535115e-06, "loss": 6.387, "step": 19800 }, { "epoch": 43.9611865816468, "eval_loss": 6.38681697845459, "eval_runtime": 66.1858, "eval_samples_per_second": 151.09, "eval_steps_per_second": 18.886, "step": 19800 }, { "epoch": 44.182977543665096, "grad_norm": 0.3450022041797638, "learning_rate": 4.023411371237458e-06, "loss": 6.3845, "step": 19900 }, { "epoch": 44.182977543665096, "eval_loss": 6.386622428894043, "eval_runtime": 63.5361, "eval_samples_per_second": 157.391, "eval_steps_per_second": 19.674, "step": 19900 }, { "epoch": 44.40476850568339, "grad_norm": 0.39958134293556213, "learning_rate": 4.013377926421405e-06, "loss": 6.3863, "step": 20000 }, { "epoch": 44.40476850568339, "eval_loss": 6.387628555297852, "eval_runtime": 63.6316, "eval_samples_per_second": 157.155, "eval_steps_per_second": 19.644, "step": 20000 }, { "epoch": 44.62655946770169, "grad_norm": 0.28472310304641724, "learning_rate": 4.003344481605351e-06, "loss": 6.3851, "step": 20100 }, { "epoch": 44.62655946770169, "eval_loss": 6.388401031494141, "eval_runtime": 63.6958, "eval_samples_per_second": 156.996, "eval_steps_per_second": 19.625, "step": 20100 }, { "epoch": 44.848350429719986, "grad_norm": 0.39134547114372253, "learning_rate": 3.993311036789298e-06, "loss": 6.3849, "step": 20200 }, { "epoch": 44.848350429719986, "eval_loss": 6.389621734619141, "eval_runtime": 66.137, "eval_samples_per_second": 151.201, "eval_steps_per_second": 18.9, "step": 20200 }, { "epoch": 45.07014139173829, "grad_norm": 0.5134591460227966, "learning_rate": 3.9832775919732444e-06, "loss": 6.3847, "step": 20300 }, { "epoch": 45.07014139173829, "eval_loss": 6.387813568115234, "eval_runtime": 63.5686, "eval_samples_per_second": 157.31, "eval_steps_per_second": 19.664, "step": 20300 }, { "epoch": 45.291932353756586, "grad_norm": 0.2885007858276367, "learning_rate": 3.97324414715719e-06, "loss": 6.3865, "step": 20400 }, { "epoch": 45.291932353756586, "eval_loss": 6.389806270599365, "eval_runtime": 63.5893, "eval_samples_per_second": 157.259, "eval_steps_per_second": 19.657, "step": 20400 }, { "epoch": 45.51372331577488, "grad_norm": 0.37093526124954224, "learning_rate": 3.963210702341137e-06, "loss": 6.3842, "step": 20500 }, { "epoch": 45.51372331577488, "eval_loss": 6.386034965515137, "eval_runtime": 66.1094, "eval_samples_per_second": 151.265, "eval_steps_per_second": 18.908, "step": 20500 }, { "epoch": 45.73551427779318, "grad_norm": 0.4181094169616699, "learning_rate": 3.953177257525084e-06, "loss": 6.3827, "step": 20600 }, { "epoch": 45.73551427779318, "eval_loss": 6.386598587036133, "eval_runtime": 63.6628, "eval_samples_per_second": 157.078, "eval_steps_per_second": 19.635, "step": 20600 }, { "epoch": 45.957305239811475, "grad_norm": 0.6212390661239624, "learning_rate": 3.943143812709031e-06, "loss": 6.3864, "step": 20700 }, { "epoch": 45.957305239811475, "eval_loss": 6.3882646560668945, "eval_runtime": 65.9973, "eval_samples_per_second": 151.521, "eval_steps_per_second": 18.94, "step": 20700 }, { "epoch": 46.17909620182977, "grad_norm": 0.443857878446579, "learning_rate": 3.9331103678929765e-06, "loss": 6.3859, "step": 20800 }, { "epoch": 46.17909620182977, "eval_loss": 6.388275623321533, "eval_runtime": 63.7053, "eval_samples_per_second": 156.973, "eval_steps_per_second": 19.622, "step": 20800 }, { "epoch": 46.400887163848076, "grad_norm": 0.2678993344306946, "learning_rate": 3.923076923076923e-06, "loss": 6.3865, "step": 20900 }, { "epoch": 46.400887163848076, "eval_loss": 6.38779354095459, "eval_runtime": 63.6908, "eval_samples_per_second": 157.009, "eval_steps_per_second": 19.626, "step": 20900 }, { "epoch": 46.62267812586637, "grad_norm": 0.35121896862983704, "learning_rate": 3.91304347826087e-06, "loss": 6.3842, "step": 21000 }, { "epoch": 46.62267812586637, "eval_loss": 6.385668754577637, "eval_runtime": 66.0547, "eval_samples_per_second": 151.39, "eval_steps_per_second": 18.924, "step": 21000 }, { "epoch": 46.84446908788467, "grad_norm": 0.6166325807571411, "learning_rate": 3.903010033444816e-06, "loss": 6.3848, "step": 21100 }, { "epoch": 46.84446908788467, "eval_loss": 6.385282516479492, "eval_runtime": 63.6134, "eval_samples_per_second": 157.2, "eval_steps_per_second": 19.65, "step": 21100 }, { "epoch": 47.066260049902965, "grad_norm": 0.5324620008468628, "learning_rate": 3.892976588628763e-06, "loss": 6.3847, "step": 21200 }, { "epoch": 47.066260049902965, "eval_loss": 6.386166572570801, "eval_runtime": 63.7747, "eval_samples_per_second": 156.802, "eval_steps_per_second": 19.6, "step": 21200 }, { "epoch": 47.28805101192126, "grad_norm": 0.37806278467178345, "learning_rate": 3.8829431438127095e-06, "loss": 6.3847, "step": 21300 }, { "epoch": 47.28805101192126, "eval_loss": 6.387280464172363, "eval_runtime": 66.0795, "eval_samples_per_second": 151.333, "eval_steps_per_second": 18.917, "step": 21300 }, { "epoch": 47.509841973939565, "grad_norm": 0.2344857156276703, "learning_rate": 3.8729096989966554e-06, "loss": 6.3851, "step": 21400 }, { "epoch": 47.509841973939565, "eval_loss": 6.38550329208374, "eval_runtime": 63.7048, "eval_samples_per_second": 156.974, "eval_steps_per_second": 19.622, "step": 21400 }, { "epoch": 47.73163293595786, "grad_norm": 0.47279292345046997, "learning_rate": 3.862876254180602e-06, "loss": 6.3843, "step": 21500 }, { "epoch": 47.73163293595786, "eval_loss": 6.390079021453857, "eval_runtime": 63.7137, "eval_samples_per_second": 156.952, "eval_steps_per_second": 19.619, "step": 21500 }, { "epoch": 47.95342389797616, "grad_norm": 0.5413157343864441, "learning_rate": 3.852842809364549e-06, "loss": 6.3844, "step": 21600 }, { "epoch": 47.95342389797616, "eval_loss": 6.385741233825684, "eval_runtime": 66.101, "eval_samples_per_second": 151.284, "eval_steps_per_second": 18.91, "step": 21600 }, { "epoch": 48.175214859994455, "grad_norm": 0.48085787892341614, "learning_rate": 3.842809364548495e-06, "loss": 6.3851, "step": 21700 }, { "epoch": 48.175214859994455, "eval_loss": 6.385941505432129, "eval_runtime": 63.6718, "eval_samples_per_second": 157.055, "eval_steps_per_second": 19.632, "step": 21700 }, { "epoch": 48.39700582201275, "grad_norm": 0.6270382404327393, "learning_rate": 3.832775919732442e-06, "loss": 6.3845, "step": 21800 }, { "epoch": 48.39700582201275, "eval_loss": 6.387849807739258, "eval_runtime": 66.1314, "eval_samples_per_second": 151.214, "eval_steps_per_second": 18.902, "step": 21800 }, { "epoch": 48.61879678403105, "grad_norm": 0.36722734570503235, "learning_rate": 3.822742474916388e-06, "loss": 6.3848, "step": 21900 }, { "epoch": 48.61879678403105, "eval_loss": 6.387927532196045, "eval_runtime": 63.6715, "eval_samples_per_second": 157.056, "eval_steps_per_second": 19.632, "step": 21900 }, { "epoch": 48.84058774604935, "grad_norm": 0.4715673327445984, "learning_rate": 3.8127090301003347e-06, "loss": 6.3828, "step": 22000 }, { "epoch": 48.84058774604935, "eval_loss": 6.388005256652832, "eval_runtime": 63.7564, "eval_samples_per_second": 156.847, "eval_steps_per_second": 19.606, "step": 22000 }, { "epoch": 49.06237870806765, "grad_norm": 0.46226397156715393, "learning_rate": 3.802675585284281e-06, "loss": 6.3839, "step": 22100 }, { "epoch": 49.06237870806765, "eval_loss": 6.386138439178467, "eval_runtime": 65.9562, "eval_samples_per_second": 151.616, "eval_steps_per_second": 18.952, "step": 22100 }, { "epoch": 49.284169670085944, "grad_norm": 0.48933687806129456, "learning_rate": 3.792642140468228e-06, "loss": 6.3835, "step": 22200 }, { "epoch": 49.284169670085944, "eval_loss": 6.386913776397705, "eval_runtime": 63.5702, "eval_samples_per_second": 157.306, "eval_steps_per_second": 19.663, "step": 22200 }, { "epoch": 49.50596063210424, "grad_norm": 0.4057106375694275, "learning_rate": 3.782608695652174e-06, "loss": 6.3831, "step": 22300 }, { "epoch": 49.50596063210424, "eval_loss": 6.3875555992126465, "eval_runtime": 63.6283, "eval_samples_per_second": 157.163, "eval_steps_per_second": 19.645, "step": 22300 }, { "epoch": 49.72775159412254, "grad_norm": 0.4397966265678406, "learning_rate": 3.7725752508361205e-06, "loss": 6.3847, "step": 22400 }, { "epoch": 49.72775159412254, "eval_loss": 6.386244297027588, "eval_runtime": 63.6792, "eval_samples_per_second": 157.037, "eval_steps_per_second": 19.63, "step": 22400 }, { "epoch": 49.949542556140834, "grad_norm": 0.4629203677177429, "learning_rate": 3.7625418060200673e-06, "loss": 6.384, "step": 22500 }, { "epoch": 49.949542556140834, "eval_loss": 6.386322498321533, "eval_runtime": 66.1359, "eval_samples_per_second": 151.204, "eval_steps_per_second": 18.9, "step": 22500 }, { "epoch": 50.17133351815914, "grad_norm": 0.43559348583221436, "learning_rate": 3.7525083612040136e-06, "loss": 6.3831, "step": 22600 }, { "epoch": 50.17133351815914, "eval_loss": 6.386173248291016, "eval_runtime": 63.6043, "eval_samples_per_second": 157.222, "eval_steps_per_second": 19.653, "step": 22600 }, { "epoch": 50.393124480177434, "grad_norm": 0.3772810399532318, "learning_rate": 3.74247491638796e-06, "loss": 6.3836, "step": 22700 }, { "epoch": 50.393124480177434, "eval_loss": 6.38073205947876, "eval_runtime": 63.7199, "eval_samples_per_second": 156.937, "eval_steps_per_second": 19.617, "step": 22700 }, { "epoch": 50.61491544219573, "grad_norm": 0.36232537031173706, "learning_rate": 3.7324414715719067e-06, "loss": 6.3837, "step": 22800 }, { "epoch": 50.61491544219573, "eval_loss": 6.385157108306885, "eval_runtime": 66.1214, "eval_samples_per_second": 151.237, "eval_steps_per_second": 18.905, "step": 22800 }, { "epoch": 50.83670640421403, "grad_norm": 0.3568231165409088, "learning_rate": 3.722408026755853e-06, "loss": 6.3837, "step": 22900 }, { "epoch": 50.83670640421403, "eval_loss": 6.388894081115723, "eval_runtime": 63.6202, "eval_samples_per_second": 157.183, "eval_steps_per_second": 19.648, "step": 22900 }, { "epoch": 51.058497366232324, "grad_norm": 0.5292544960975647, "learning_rate": 3.7123745819398e-06, "loss": 6.3824, "step": 23000 }, { "epoch": 51.058497366232324, "eval_loss": 6.382253170013428, "eval_runtime": 63.6223, "eval_samples_per_second": 157.178, "eval_steps_per_second": 19.647, "step": 23000 }, { "epoch": 51.28028832825063, "grad_norm": 0.47718894481658936, "learning_rate": 3.702341137123746e-06, "loss": 6.3833, "step": 23100 }, { "epoch": 51.28028832825063, "eval_loss": 6.389714241027832, "eval_runtime": 66.0943, "eval_samples_per_second": 151.299, "eval_steps_per_second": 18.912, "step": 23100 }, { "epoch": 51.502079290268924, "grad_norm": 0.2303953319787979, "learning_rate": 3.6923076923076925e-06, "loss": 6.3822, "step": 23200 }, { "epoch": 51.502079290268924, "eval_loss": 6.384761810302734, "eval_runtime": 63.6768, "eval_samples_per_second": 157.043, "eval_steps_per_second": 19.63, "step": 23200 }, { "epoch": 51.72387025228722, "grad_norm": 0.4536280035972595, "learning_rate": 3.6822742474916393e-06, "loss": 6.3829, "step": 23300 }, { "epoch": 51.72387025228722, "eval_loss": 6.38330078125, "eval_runtime": 63.6407, "eval_samples_per_second": 157.132, "eval_steps_per_second": 19.642, "step": 23300 }, { "epoch": 51.94566121430552, "grad_norm": 0.36595970392227173, "learning_rate": 3.6722408026755856e-06, "loss": 6.3839, "step": 23400 }, { "epoch": 51.94566121430552, "eval_loss": 6.384377956390381, "eval_runtime": 63.6117, "eval_samples_per_second": 157.204, "eval_steps_per_second": 19.65, "step": 23400 }, { "epoch": 52.16745217632381, "grad_norm": 0.4151841104030609, "learning_rate": 3.662207357859532e-06, "loss": 6.3838, "step": 23500 }, { "epoch": 52.16745217632381, "eval_loss": 6.385963439941406, "eval_runtime": 66.0487, "eval_samples_per_second": 151.403, "eval_steps_per_second": 18.925, "step": 23500 }, { "epoch": 52.38924313834211, "grad_norm": 0.3460543155670166, "learning_rate": 3.6521739130434787e-06, "loss": 6.3828, "step": 23600 }, { "epoch": 52.38924313834211, "eval_loss": 6.384364128112793, "eval_runtime": 63.6451, "eval_samples_per_second": 157.121, "eval_steps_per_second": 19.64, "step": 23600 }, { "epoch": 52.61103410036041, "grad_norm": 0.35991814732551575, "learning_rate": 3.642140468227425e-06, "loss": 6.3828, "step": 23700 }, { "epoch": 52.61103410036041, "eval_loss": 6.382322311401367, "eval_runtime": 63.5885, "eval_samples_per_second": 157.261, "eval_steps_per_second": 19.658, "step": 23700 }, { "epoch": 52.83282506237871, "grad_norm": 0.556122899055481, "learning_rate": 3.6321070234113714e-06, "loss": 6.383, "step": 23800 }, { "epoch": 52.83282506237871, "eval_loss": 6.387279987335205, "eval_runtime": 63.668, "eval_samples_per_second": 157.065, "eval_steps_per_second": 19.633, "step": 23800 }, { "epoch": 53.054616024397006, "grad_norm": 0.4246836304664612, "learning_rate": 3.622073578595318e-06, "loss": 6.3842, "step": 23900 }, { "epoch": 53.054616024397006, "eval_loss": 6.382977485656738, "eval_runtime": 65.9495, "eval_samples_per_second": 151.631, "eval_steps_per_second": 18.954, "step": 23900 }, { "epoch": 53.2764069864153, "grad_norm": 0.4062933027744293, "learning_rate": 3.6120401337792645e-06, "loss": 6.3829, "step": 24000 }, { "epoch": 53.2764069864153, "eval_loss": 6.386227130889893, "eval_runtime": 63.6044, "eval_samples_per_second": 157.222, "eval_steps_per_second": 19.653, "step": 24000 }, { "epoch": 53.4981979484336, "grad_norm": 0.36249685287475586, "learning_rate": 3.6020066889632112e-06, "loss": 6.3841, "step": 24100 }, { "epoch": 53.4981979484336, "eval_loss": 6.388720989227295, "eval_runtime": 63.7502, "eval_samples_per_second": 156.862, "eval_steps_per_second": 19.608, "step": 24100 }, { "epoch": 53.719988910451896, "grad_norm": 0.464330792427063, "learning_rate": 3.5919732441471576e-06, "loss": 6.3821, "step": 24200 }, { "epoch": 53.719988910451896, "eval_loss": 6.385589122772217, "eval_runtime": 66.108, "eval_samples_per_second": 151.268, "eval_steps_per_second": 18.908, "step": 24200 }, { "epoch": 53.9417798724702, "grad_norm": 0.36706265807151794, "learning_rate": 3.581939799331104e-06, "loss": 6.3834, "step": 24300 }, { "epoch": 53.9417798724702, "eval_loss": 6.385077476501465, "eval_runtime": 63.7574, "eval_samples_per_second": 156.844, "eval_steps_per_second": 19.606, "step": 24300 }, { "epoch": 54.2217909620183, "grad_norm": 0.5084080100059509, "learning_rate": 3.5719063545150507e-06, "loss": 6.3829, "step": 24400 }, { "epoch": 54.2217909620183, "eval_loss": 6.384501934051514, "eval_runtime": 66.1045, "eval_samples_per_second": 151.276, "eval_steps_per_second": 18.909, "step": 24400 }, { "epoch": 54.44358192403659, "grad_norm": 0.2843925952911377, "learning_rate": 3.561872909698997e-06, "loss": 6.3828, "step": 24500 }, { "epoch": 54.44358192403659, "eval_loss": 6.386019706726074, "eval_runtime": 63.7676, "eval_samples_per_second": 156.819, "eval_steps_per_second": 19.602, "step": 24500 }, { "epoch": 54.6653728860549, "grad_norm": 0.3394639492034912, "learning_rate": 3.5518394648829434e-06, "loss": 6.3839, "step": 24600 }, { "epoch": 54.6653728860549, "eval_loss": 6.385280132293701, "eval_runtime": 64.0386, "eval_samples_per_second": 156.156, "eval_steps_per_second": 19.519, "step": 24600 }, { "epoch": 54.88716384807319, "grad_norm": 0.5277294516563416, "learning_rate": 3.54180602006689e-06, "loss": 6.3827, "step": 24700 }, { "epoch": 54.88716384807319, "eval_loss": 6.382243633270264, "eval_runtime": 66.1687, "eval_samples_per_second": 151.129, "eval_steps_per_second": 18.891, "step": 24700 }, { "epoch": 55.10895481009149, "grad_norm": 0.4542704224586487, "learning_rate": 3.5317725752508365e-06, "loss": 6.3835, "step": 24800 }, { "epoch": 55.10895481009149, "eval_loss": 6.384250640869141, "eval_runtime": 63.6729, "eval_samples_per_second": 157.053, "eval_steps_per_second": 19.632, "step": 24800 }, { "epoch": 55.330745772109786, "grad_norm": 0.4311918318271637, "learning_rate": 3.521739130434783e-06, "loss": 6.3821, "step": 24900 }, { "epoch": 55.330745772109786, "eval_loss": 6.382208824157715, "eval_runtime": 63.7247, "eval_samples_per_second": 156.925, "eval_steps_per_second": 19.616, "step": 24900 }, { "epoch": 55.55253673412808, "grad_norm": 0.5033969283103943, "learning_rate": 3.5117056856187296e-06, "loss": 6.3828, "step": 25000 }, { "epoch": 55.55253673412808, "eval_loss": 6.384891510009766, "eval_runtime": 66.1992, "eval_samples_per_second": 151.059, "eval_steps_per_second": 18.882, "step": 25000 }, { "epoch": 55.77432769614638, "grad_norm": 0.389417827129364, "learning_rate": 3.501672240802676e-06, "loss": 6.3821, "step": 25100 }, { "epoch": 55.77432769614638, "eval_loss": 6.3841633796691895, "eval_runtime": 63.7582, "eval_samples_per_second": 156.843, "eval_steps_per_second": 19.605, "step": 25100 }, { "epoch": 55.99611865816468, "grad_norm": 0.35223087668418884, "learning_rate": 3.491638795986622e-06, "loss": 6.382, "step": 25200 }, { "epoch": 55.99611865816468, "eval_loss": 6.3838019371032715, "eval_runtime": 63.6971, "eval_samples_per_second": 156.993, "eval_steps_per_second": 19.624, "step": 25200 }, { "epoch": 56.21790962018298, "grad_norm": 0.3913029134273529, "learning_rate": 3.481605351170568e-06, "loss": 6.3815, "step": 25300 }, { "epoch": 56.21790962018298, "eval_loss": 6.3869524002075195, "eval_runtime": 66.1208, "eval_samples_per_second": 151.238, "eval_steps_per_second": 18.905, "step": 25300 }, { "epoch": 56.439700582201276, "grad_norm": 0.4827691614627838, "learning_rate": 3.471571906354515e-06, "loss": 6.3827, "step": 25400 }, { "epoch": 56.439700582201276, "eval_loss": 6.384666442871094, "eval_runtime": 63.6765, "eval_samples_per_second": 157.044, "eval_steps_per_second": 19.63, "step": 25400 }, { "epoch": 56.66149154421957, "grad_norm": 0.3427080512046814, "learning_rate": 3.4615384615384613e-06, "loss": 6.3827, "step": 25500 }, { "epoch": 56.66149154421957, "eval_loss": 6.384727478027344, "eval_runtime": 66.2151, "eval_samples_per_second": 151.023, "eval_steps_per_second": 18.878, "step": 25500 }, { "epoch": 56.88328250623787, "grad_norm": 0.43282854557037354, "learning_rate": 3.4515050167224076e-06, "loss": 6.3822, "step": 25600 }, { "epoch": 56.88328250623787, "eval_loss": 6.384084224700928, "eval_runtime": 63.8392, "eval_samples_per_second": 156.643, "eval_steps_per_second": 19.58, "step": 25600 }, { "epoch": 57.105073468256165, "grad_norm": 0.42564040422439575, "learning_rate": 3.4414715719063544e-06, "loss": 6.3814, "step": 25700 }, { "epoch": 57.105073468256165, "eval_loss": 6.383011817932129, "eval_runtime": 63.6955, "eval_samples_per_second": 156.997, "eval_steps_per_second": 19.625, "step": 25700 }, { "epoch": 57.32686443027447, "grad_norm": 0.3655114471912384, "learning_rate": 3.4314381270903007e-06, "loss": 6.3813, "step": 25800 }, { "epoch": 57.32686443027447, "eval_loss": 6.384052753448486, "eval_runtime": 66.0629, "eval_samples_per_second": 151.371, "eval_steps_per_second": 18.921, "step": 25800 }, { "epoch": 57.548655392292765, "grad_norm": 0.4009644389152527, "learning_rate": 3.4214046822742475e-06, "loss": 6.3819, "step": 25900 }, { "epoch": 57.548655392292765, "eval_loss": 6.384483814239502, "eval_runtime": 63.6201, "eval_samples_per_second": 157.183, "eval_steps_per_second": 19.648, "step": 25900 }, { "epoch": 57.77044635431106, "grad_norm": 0.45892468094825745, "learning_rate": 3.411371237458194e-06, "loss": 6.3823, "step": 26000 }, { "epoch": 57.77044635431106, "eval_loss": 6.382046222686768, "eval_runtime": 63.6871, "eval_samples_per_second": 157.018, "eval_steps_per_second": 19.627, "step": 26000 }, { "epoch": 57.99223731632936, "grad_norm": 0.6261206865310669, "learning_rate": 3.40133779264214e-06, "loss": 6.3822, "step": 26100 }, { "epoch": 57.99223731632936, "eval_loss": 6.385235786437988, "eval_runtime": 66.2139, "eval_samples_per_second": 151.026, "eval_steps_per_second": 18.878, "step": 26100 }, { "epoch": 58.214028278347655, "grad_norm": 0.38988542556762695, "learning_rate": 3.391304347826087e-06, "loss": 6.3817, "step": 26200 }, { "epoch": 58.214028278347655, "eval_loss": 6.385043144226074, "eval_runtime": 63.6337, "eval_samples_per_second": 157.149, "eval_steps_per_second": 19.644, "step": 26200 }, { "epoch": 58.43581924036596, "grad_norm": 0.3526028096675873, "learning_rate": 3.3812709030100333e-06, "loss": 6.3819, "step": 26300 }, { "epoch": 58.43581924036596, "eval_loss": 6.385810375213623, "eval_runtime": 63.6001, "eval_samples_per_second": 157.233, "eval_steps_per_second": 19.654, "step": 26300 }, { "epoch": 58.657610202384255, "grad_norm": 0.38116052746772766, "learning_rate": 3.3712374581939796e-06, "loss": 6.3835, "step": 26400 }, { "epoch": 58.657610202384255, "eval_loss": 6.383828639984131, "eval_runtime": 66.0823, "eval_samples_per_second": 151.327, "eval_steps_per_second": 18.916, "step": 26400 }, { "epoch": 58.87940116440255, "grad_norm": 0.5195460319519043, "learning_rate": 3.3612040133779264e-06, "loss": 6.3824, "step": 26500 }, { "epoch": 58.87940116440255, "eval_loss": 6.3872599601745605, "eval_runtime": 63.6944, "eval_samples_per_second": 157.0, "eval_steps_per_second": 19.625, "step": 26500 }, { "epoch": 59.10119212642085, "grad_norm": 0.3986002206802368, "learning_rate": 3.3511705685618727e-06, "loss": 6.3813, "step": 26600 }, { "epoch": 59.10119212642085, "eval_loss": 6.384389877319336, "eval_runtime": 63.651, "eval_samples_per_second": 157.107, "eval_steps_per_second": 19.638, "step": 26600 }, { "epoch": 59.322983088439145, "grad_norm": 0.3788560628890991, "learning_rate": 3.3411371237458195e-06, "loss": 6.3834, "step": 26700 }, { "epoch": 59.322983088439145, "eval_loss": 6.383492946624756, "eval_runtime": 66.1062, "eval_samples_per_second": 151.272, "eval_steps_per_second": 18.909, "step": 26700 }, { "epoch": 59.54477405045744, "grad_norm": 0.3633769750595093, "learning_rate": 3.331103678929766e-06, "loss": 6.3806, "step": 26800 }, { "epoch": 59.54477405045744, "eval_loss": 6.383812427520752, "eval_runtime": 63.6852, "eval_samples_per_second": 157.022, "eval_steps_per_second": 19.628, "step": 26800 }, { "epoch": 59.766565012475745, "grad_norm": 0.5389061570167542, "learning_rate": 3.321070234113712e-06, "loss": 6.3818, "step": 26900 }, { "epoch": 59.766565012475745, "eval_loss": 6.386070251464844, "eval_runtime": 63.6726, "eval_samples_per_second": 157.054, "eval_steps_per_second": 19.632, "step": 26900 }, { "epoch": 59.98835597449404, "grad_norm": 0.5415310263633728, "learning_rate": 3.311036789297659e-06, "loss": 6.3812, "step": 27000 }, { "epoch": 59.98835597449404, "eval_loss": 6.386297702789307, "eval_runtime": 66.0998, "eval_samples_per_second": 151.286, "eval_steps_per_second": 18.911, "step": 27000 }, { "epoch": 60.21014693651234, "grad_norm": 0.25073182582855225, "learning_rate": 3.3010033444816052e-06, "loss": 6.3825, "step": 27100 }, { "epoch": 60.21014693651234, "eval_loss": 6.384896278381348, "eval_runtime": 63.7213, "eval_samples_per_second": 156.933, "eval_steps_per_second": 19.617, "step": 27100 }, { "epoch": 60.431937898530634, "grad_norm": 0.2894960045814514, "learning_rate": 3.2909698996655516e-06, "loss": 6.3806, "step": 27200 }, { "epoch": 60.431937898530634, "eval_loss": 6.383223533630371, "eval_runtime": 66.1431, "eval_samples_per_second": 151.187, "eval_steps_per_second": 18.898, "step": 27200 }, { "epoch": 60.65372886054893, "grad_norm": 0.48593568801879883, "learning_rate": 3.2809364548494983e-06, "loss": 6.3814, "step": 27300 }, { "epoch": 60.65372886054893, "eval_loss": 6.382923603057861, "eval_runtime": 63.8018, "eval_samples_per_second": 156.735, "eval_steps_per_second": 19.592, "step": 27300 }, { "epoch": 60.87551982256723, "grad_norm": 0.3919661343097687, "learning_rate": 3.2709030100334447e-06, "loss": 6.3812, "step": 27400 }, { "epoch": 60.87551982256723, "eval_loss": 6.384592056274414, "eval_runtime": 63.6834, "eval_samples_per_second": 157.027, "eval_steps_per_second": 19.628, "step": 27400 }, { "epoch": 61.09731078458553, "grad_norm": 0.41026151180267334, "learning_rate": 3.260869565217391e-06, "loss": 6.3823, "step": 27500 }, { "epoch": 61.09731078458553, "eval_loss": 6.385217189788818, "eval_runtime": 66.0642, "eval_samples_per_second": 151.368, "eval_steps_per_second": 18.921, "step": 27500 }, { "epoch": 61.31910174660383, "grad_norm": 0.3794995844364166, "learning_rate": 3.2508361204013378e-06, "loss": 6.3811, "step": 27600 }, { "epoch": 61.31910174660383, "eval_loss": 6.383106231689453, "eval_runtime": 63.705, "eval_samples_per_second": 156.973, "eval_steps_per_second": 19.622, "step": 27600 }, { "epoch": 61.540892708622124, "grad_norm": 0.4461415410041809, "learning_rate": 3.240802675585284e-06, "loss": 6.3828, "step": 27700 }, { "epoch": 61.540892708622124, "eval_loss": 6.384341239929199, "eval_runtime": 63.6974, "eval_samples_per_second": 156.992, "eval_steps_per_second": 19.624, "step": 27700 }, { "epoch": 61.76268367064042, "grad_norm": 0.24599848687648773, "learning_rate": 3.230769230769231e-06, "loss": 6.3807, "step": 27800 }, { "epoch": 61.76268367064042, "eval_loss": 6.384603023529053, "eval_runtime": 66.1353, "eval_samples_per_second": 151.205, "eval_steps_per_second": 18.901, "step": 27800 }, { "epoch": 61.98447463265872, "grad_norm": 0.2466522455215454, "learning_rate": 3.2207357859531772e-06, "loss": 6.3823, "step": 27900 }, { "epoch": 61.98447463265872, "eval_loss": 6.383478164672852, "eval_runtime": 63.6948, "eval_samples_per_second": 156.999, "eval_steps_per_second": 19.625, "step": 27900 }, { "epoch": 62.206265594677014, "grad_norm": 0.3806278705596924, "learning_rate": 3.2107023411371236e-06, "loss": 6.3806, "step": 28000 }, { "epoch": 62.206265594677014, "eval_loss": 6.382126331329346, "eval_runtime": 63.5806, "eval_samples_per_second": 157.281, "eval_steps_per_second": 19.66, "step": 28000 }, { "epoch": 62.42805655669532, "grad_norm": 0.5161334872245789, "learning_rate": 3.2006688963210703e-06, "loss": 6.3816, "step": 28100 }, { "epoch": 62.42805655669532, "eval_loss": 6.384099960327148, "eval_runtime": 66.2035, "eval_samples_per_second": 151.049, "eval_steps_per_second": 18.881, "step": 28100 }, { "epoch": 62.649847518713614, "grad_norm": 0.44599130749702454, "learning_rate": 3.1906354515050167e-06, "loss": 6.3799, "step": 28200 }, { "epoch": 62.649847518713614, "eval_loss": 6.385605335235596, "eval_runtime": 63.6738, "eval_samples_per_second": 157.051, "eval_steps_per_second": 19.631, "step": 28200 }, { "epoch": 62.87163848073191, "grad_norm": 0.49202173948287964, "learning_rate": 3.180602006688963e-06, "loss": 6.3817, "step": 28300 }, { "epoch": 62.87163848073191, "eval_loss": 6.3858418464660645, "eval_runtime": 63.6792, "eval_samples_per_second": 157.037, "eval_steps_per_second": 19.63, "step": 28300 }, { "epoch": 63.09342944275021, "grad_norm": 0.4090692400932312, "learning_rate": 3.1705685618729098e-06, "loss": 6.3797, "step": 28400 }, { "epoch": 63.09342944275021, "eval_loss": 6.381466388702393, "eval_runtime": 66.0632, "eval_samples_per_second": 151.37, "eval_steps_per_second": 18.921, "step": 28400 }, { "epoch": 63.3152204047685, "grad_norm": 0.4286213517189026, "learning_rate": 3.160535117056856e-06, "loss": 6.3816, "step": 28500 }, { "epoch": 63.3152204047685, "eval_loss": 6.383074760437012, "eval_runtime": 63.6206, "eval_samples_per_second": 157.182, "eval_steps_per_second": 19.648, "step": 28500 }, { "epoch": 63.53701136678681, "grad_norm": 0.36026620864868164, "learning_rate": 3.1505016722408024e-06, "loss": 6.3811, "step": 28600 }, { "epoch": 63.53701136678681, "eval_loss": 6.383544445037842, "eval_runtime": 63.7194, "eval_samples_per_second": 156.938, "eval_steps_per_second": 19.617, "step": 28600 }, { "epoch": 63.7588023288051, "grad_norm": 0.5875244140625, "learning_rate": 3.140468227424749e-06, "loss": 6.3822, "step": 28700 }, { "epoch": 63.7588023288051, "eval_loss": 6.384294033050537, "eval_runtime": 66.1191, "eval_samples_per_second": 151.242, "eval_steps_per_second": 18.905, "step": 28700 }, { "epoch": 63.9805932908234, "grad_norm": 0.39102068543434143, "learning_rate": 3.1304347826086955e-06, "loss": 6.3823, "step": 28800 }, { "epoch": 63.9805932908234, "eval_loss": 6.381502628326416, "eval_runtime": 63.7504, "eval_samples_per_second": 156.862, "eval_steps_per_second": 19.608, "step": 28800 }, { "epoch": 64.2023842528417, "grad_norm": 0.4450345039367676, "learning_rate": 3.1204013377926423e-06, "loss": 6.3813, "step": 28900 }, { "epoch": 64.2023842528417, "eval_loss": 6.384424209594727, "eval_runtime": 66.2286, "eval_samples_per_second": 150.992, "eval_steps_per_second": 18.874, "step": 28900 }, { "epoch": 64.42417521486, "grad_norm": 0.3168383240699768, "learning_rate": 3.1103678929765886e-06, "loss": 6.383, "step": 29000 }, { "epoch": 64.42417521486, "eval_loss": 6.385626316070557, "eval_runtime": 63.7217, "eval_samples_per_second": 156.932, "eval_steps_per_second": 19.617, "step": 29000 }, { "epoch": 64.64596617687829, "grad_norm": 0.3088781535625458, "learning_rate": 3.100334448160535e-06, "loss": 6.3807, "step": 29100 }, { "epoch": 64.64596617687829, "eval_loss": 6.385305881500244, "eval_runtime": 63.6226, "eval_samples_per_second": 157.177, "eval_steps_per_second": 19.647, "step": 29100 }, { "epoch": 64.8677571388966, "grad_norm": 0.4493953287601471, "learning_rate": 3.0903010033444818e-06, "loss": 6.381, "step": 29200 }, { "epoch": 64.8677571388966, "eval_loss": 6.383870601654053, "eval_runtime": 66.0987, "eval_samples_per_second": 151.289, "eval_steps_per_second": 18.911, "step": 29200 }, { "epoch": 65.08954810091488, "grad_norm": 0.3246123194694519, "learning_rate": 3.080267558528428e-06, "loss": 6.3811, "step": 29300 }, { "epoch": 65.08954810091488, "eval_loss": 6.383446216583252, "eval_runtime": 63.63, "eval_samples_per_second": 157.159, "eval_steps_per_second": 19.645, "step": 29300 }, { "epoch": 65.31133906293319, "grad_norm": 0.2923065721988678, "learning_rate": 3.0702341137123744e-06, "loss": 6.3805, "step": 29400 }, { "epoch": 65.31133906293319, "eval_loss": 6.382349014282227, "eval_runtime": 66.1161, "eval_samples_per_second": 151.249, "eval_steps_per_second": 18.906, "step": 29400 }, { "epoch": 65.53313002495149, "grad_norm": 0.48411309719085693, "learning_rate": 3.060200668896321e-06, "loss": 6.3816, "step": 29500 }, { "epoch": 65.53313002495149, "eval_loss": 6.381749153137207, "eval_runtime": 63.6992, "eval_samples_per_second": 156.988, "eval_steps_per_second": 19.623, "step": 29500 }, { "epoch": 65.75492098696978, "grad_norm": 0.3250056803226471, "learning_rate": 3.0501672240802675e-06, "loss": 6.3806, "step": 29600 }, { "epoch": 65.75492098696978, "eval_loss": 6.383174896240234, "eval_runtime": 63.6191, "eval_samples_per_second": 157.185, "eval_steps_per_second": 19.648, "step": 29600 }, { "epoch": 65.97671194898808, "grad_norm": 0.337882399559021, "learning_rate": 3.0401337792642143e-06, "loss": 6.3793, "step": 29700 }, { "epoch": 65.97671194898808, "eval_loss": 6.383576393127441, "eval_runtime": 66.0393, "eval_samples_per_second": 151.425, "eval_steps_per_second": 18.928, "step": 29700 }, { "epoch": 66.19850291100637, "grad_norm": 0.36923250555992126, "learning_rate": 3.0301003344481606e-06, "loss": 6.3805, "step": 29800 }, { "epoch": 66.19850291100637, "eval_loss": 6.383658409118652, "eval_runtime": 63.5576, "eval_samples_per_second": 157.338, "eval_steps_per_second": 19.667, "step": 29800 }, { "epoch": 66.42029387302468, "grad_norm": 0.3375002443790436, "learning_rate": 3.020066889632107e-06, "loss": 6.3805, "step": 29900 }, { "epoch": 66.42029387302468, "eval_loss": 6.382904529571533, "eval_runtime": 66.0839, "eval_samples_per_second": 151.323, "eval_steps_per_second": 18.915, "step": 29900 }, { "epoch": 66.64208483504297, "grad_norm": 0.44055986404418945, "learning_rate": 3.0100334448160537e-06, "loss": 6.3812, "step": 30000 }, { "epoch": 66.64208483504297, "eval_loss": 6.384601593017578, "eval_runtime": 63.7135, "eval_samples_per_second": 156.953, "eval_steps_per_second": 19.619, "step": 30000 }, { "epoch": 66.86387579706127, "grad_norm": 0.5010361075401306, "learning_rate": 3e-06, "loss": 6.3814, "step": 30100 }, { "epoch": 66.86387579706127, "eval_loss": 6.38201904296875, "eval_runtime": 63.631, "eval_samples_per_second": 157.156, "eval_steps_per_second": 19.645, "step": 30100 }, { "epoch": 67.08566675907957, "grad_norm": 0.36018142104148865, "learning_rate": 2.9899665551839464e-06, "loss": 6.3801, "step": 30200 }, { "epoch": 67.08566675907957, "eval_loss": 6.384942054748535, "eval_runtime": 66.0805, "eval_samples_per_second": 151.331, "eval_steps_per_second": 18.916, "step": 30200 }, { "epoch": 67.30745772109786, "grad_norm": 0.34176790714263916, "learning_rate": 2.979933110367893e-06, "loss": 6.3815, "step": 30300 }, { "epoch": 67.30745772109786, "eval_loss": 6.382652282714844, "eval_runtime": 63.6886, "eval_samples_per_second": 157.014, "eval_steps_per_second": 19.627, "step": 30300 }, { "epoch": 67.52924868311617, "grad_norm": 0.43459710478782654, "learning_rate": 2.9698996655518395e-06, "loss": 6.3811, "step": 30400 }, { "epoch": 67.52924868311617, "eval_loss": 6.386653423309326, "eval_runtime": 63.7002, "eval_samples_per_second": 156.985, "eval_steps_per_second": 19.623, "step": 30400 }, { "epoch": 67.75103964513445, "grad_norm": 0.37743738293647766, "learning_rate": 2.959866220735786e-06, "loss": 6.3804, "step": 30500 }, { "epoch": 67.75103964513445, "eval_loss": 6.383271217346191, "eval_runtime": 66.2218, "eval_samples_per_second": 151.008, "eval_steps_per_second": 18.876, "step": 30500 }, { "epoch": 67.97283060715276, "grad_norm": 0.34814783930778503, "learning_rate": 2.9498327759197326e-06, "loss": 6.3803, "step": 30600 }, { "epoch": 67.97283060715276, "eval_loss": 6.38237190246582, "eval_runtime": 63.6997, "eval_samples_per_second": 156.987, "eval_steps_per_second": 19.623, "step": 30600 }, { "epoch": 68.19462156917106, "grad_norm": 0.344685435295105, "learning_rate": 2.939799331103679e-06, "loss": 6.3795, "step": 30700 }, { "epoch": 68.19462156917106, "eval_loss": 6.384911060333252, "eval_runtime": 63.7005, "eval_samples_per_second": 156.985, "eval_steps_per_second": 19.623, "step": 30700 }, { "epoch": 68.41641253118935, "grad_norm": 0.30350542068481445, "learning_rate": 2.9297658862876257e-06, "loss": 6.3799, "step": 30800 }, { "epoch": 68.41641253118935, "eval_loss": 6.383020401000977, "eval_runtime": 66.1786, "eval_samples_per_second": 151.106, "eval_steps_per_second": 18.888, "step": 30800 }, { "epoch": 68.63820349320766, "grad_norm": 0.33513781428337097, "learning_rate": 2.919732441471572e-06, "loss": 6.3808, "step": 30900 }, { "epoch": 68.63820349320766, "eval_loss": 6.38442325592041, "eval_runtime": 63.6954, "eval_samples_per_second": 156.997, "eval_steps_per_second": 19.625, "step": 30900 }, { "epoch": 68.85999445522594, "grad_norm": 0.38895151019096375, "learning_rate": 2.9096989966555184e-06, "loss": 6.3803, "step": 31000 }, { "epoch": 68.85999445522594, "eval_loss": 6.382268905639648, "eval_runtime": 66.1082, "eval_samples_per_second": 151.267, "eval_steps_per_second": 18.908, "step": 31000 }, { "epoch": 69.08178541724425, "grad_norm": 0.49591463804244995, "learning_rate": 2.899665551839465e-06, "loss": 6.381, "step": 31100 }, { "epoch": 69.08178541724425, "eval_loss": 6.384127140045166, "eval_runtime": 63.6361, "eval_samples_per_second": 157.144, "eval_steps_per_second": 19.643, "step": 31100 }, { "epoch": 69.30357637926255, "grad_norm": 0.47946080565452576, "learning_rate": 2.8896321070234115e-06, "loss": 6.3803, "step": 31200 }, { "epoch": 69.30357637926255, "eval_loss": 6.380748748779297, "eval_runtime": 63.7274, "eval_samples_per_second": 156.918, "eval_steps_per_second": 19.615, "step": 31200 }, { "epoch": 69.52536734128084, "grad_norm": 0.33409592509269714, "learning_rate": 2.879598662207358e-06, "loss": 6.3795, "step": 31300 }, { "epoch": 69.52536734128084, "eval_loss": 6.3822197914123535, "eval_runtime": 66.2573, "eval_samples_per_second": 150.927, "eval_steps_per_second": 18.866, "step": 31300 }, { "epoch": 69.74715830329914, "grad_norm": 0.36530378460884094, "learning_rate": 2.8695652173913046e-06, "loss": 6.3793, "step": 31400 }, { "epoch": 69.74715830329914, "eval_loss": 6.3831787109375, "eval_runtime": 63.6807, "eval_samples_per_second": 157.034, "eval_steps_per_second": 19.629, "step": 31400 }, { "epoch": 69.96894926531743, "grad_norm": 0.4838181436061859, "learning_rate": 2.859531772575251e-06, "loss": 6.3802, "step": 31500 }, { "epoch": 69.96894926531743, "eval_loss": 6.383909225463867, "eval_runtime": 66.1242, "eval_samples_per_second": 151.231, "eval_steps_per_second": 18.904, "step": 31500 }, { "epoch": 70.19074022733574, "grad_norm": 0.3999974727630615, "learning_rate": 2.8494983277591977e-06, "loss": 6.3817, "step": 31600 }, { "epoch": 70.19074022733574, "eval_loss": 6.382571220397949, "eval_runtime": 63.5921, "eval_samples_per_second": 157.252, "eval_steps_per_second": 19.657, "step": 31600 }, { "epoch": 70.41253118935403, "grad_norm": 0.37044674158096313, "learning_rate": 2.839464882943144e-06, "loss": 6.3785, "step": 31700 }, { "epoch": 70.41253118935403, "eval_loss": 6.381692886352539, "eval_runtime": 66.1492, "eval_samples_per_second": 151.173, "eval_steps_per_second": 18.897, "step": 31700 }, { "epoch": 70.63432215137233, "grad_norm": 0.43440505862236023, "learning_rate": 2.8294314381270904e-06, "loss": 6.3811, "step": 31800 }, { "epoch": 70.63432215137233, "eval_loss": 6.384181499481201, "eval_runtime": 63.7156, "eval_samples_per_second": 156.947, "eval_steps_per_second": 19.618, "step": 31800 }, { "epoch": 70.85611311339063, "grad_norm": 0.45394232869148254, "learning_rate": 2.819397993311037e-06, "loss": 6.3803, "step": 31900 }, { "epoch": 70.85611311339063, "eval_loss": 6.382298469543457, "eval_runtime": 66.1426, "eval_samples_per_second": 151.189, "eval_steps_per_second": 18.899, "step": 31900 }, { "epoch": 71.07790407540892, "grad_norm": 0.24088256061077118, "learning_rate": 2.8093645484949835e-06, "loss": 6.3789, "step": 32000 }, { "epoch": 71.07790407540892, "eval_loss": 6.378951072692871, "eval_runtime": 63.7166, "eval_samples_per_second": 156.945, "eval_steps_per_second": 19.618, "step": 32000 }, { "epoch": 71.29969503742723, "grad_norm": 0.3836078643798828, "learning_rate": 2.79933110367893e-06, "loss": 6.3793, "step": 32100 }, { "epoch": 71.29969503742723, "eval_loss": 6.382381916046143, "eval_runtime": 63.7336, "eval_samples_per_second": 156.903, "eval_steps_per_second": 19.613, "step": 32100 }, { "epoch": 71.52148599944552, "grad_norm": 0.3558043837547302, "learning_rate": 2.7892976588628766e-06, "loss": 6.3779, "step": 32200 }, { "epoch": 71.52148599944552, "eval_loss": 6.3820366859436035, "eval_runtime": 66.1055, "eval_samples_per_second": 151.273, "eval_steps_per_second": 18.909, "step": 32200 }, { "epoch": 71.74327696146382, "grad_norm": 0.2369541972875595, "learning_rate": 2.779264214046823e-06, "loss": 6.3808, "step": 32300 }, { "epoch": 71.74327696146382, "eval_loss": 6.37972354888916, "eval_runtime": 63.6602, "eval_samples_per_second": 157.084, "eval_steps_per_second": 19.636, "step": 32300 }, { "epoch": 71.96506792348212, "grad_norm": 0.3357178270816803, "learning_rate": 2.7692307692307693e-06, "loss": 6.3796, "step": 32400 }, { "epoch": 71.96506792348212, "eval_loss": 6.3810296058654785, "eval_runtime": 66.2569, "eval_samples_per_second": 150.928, "eval_steps_per_second": 18.866, "step": 32400 }, { "epoch": 72.18685888550041, "grad_norm": 0.2965914011001587, "learning_rate": 2.759197324414716e-06, "loss": 6.3794, "step": 32500 }, { "epoch": 72.18685888550041, "eval_loss": 6.381561756134033, "eval_runtime": 63.6325, "eval_samples_per_second": 157.152, "eval_steps_per_second": 19.644, "step": 32500 }, { "epoch": 72.40864984751872, "grad_norm": 0.31444767117500305, "learning_rate": 2.749163879598662e-06, "loss": 6.3811, "step": 32600 }, { "epoch": 72.40864984751872, "eval_loss": 6.383826732635498, "eval_runtime": 63.819, "eval_samples_per_second": 156.693, "eval_steps_per_second": 19.587, "step": 32600 }, { "epoch": 72.630440809537, "grad_norm": 0.335440456867218, "learning_rate": 2.7391304347826087e-06, "loss": 6.3787, "step": 32700 }, { "epoch": 72.630440809537, "eval_loss": 6.382222652435303, "eval_runtime": 66.3235, "eval_samples_per_second": 150.776, "eval_steps_per_second": 18.847, "step": 32700 }, { "epoch": 72.85223177155531, "grad_norm": 0.3031088411808014, "learning_rate": 2.729096989966555e-06, "loss": 6.379, "step": 32800 }, { "epoch": 72.85223177155531, "eval_loss": 6.380151748657227, "eval_runtime": 63.7642, "eval_samples_per_second": 156.828, "eval_steps_per_second": 19.603, "step": 32800 }, { "epoch": 73.07402273357361, "grad_norm": 0.2734851539134979, "learning_rate": 2.7190635451505014e-06, "loss": 6.3796, "step": 32900 }, { "epoch": 73.07402273357361, "eval_loss": 6.381131172180176, "eval_runtime": 66.3028, "eval_samples_per_second": 150.823, "eval_steps_per_second": 18.853, "step": 32900 }, { "epoch": 73.2958136955919, "grad_norm": 0.4682227671146393, "learning_rate": 2.709030100334448e-06, "loss": 6.3791, "step": 33000 }, { "epoch": 73.2958136955919, "eval_loss": 6.382552146911621, "eval_runtime": 63.7907, "eval_samples_per_second": 156.763, "eval_steps_per_second": 19.595, "step": 33000 }, { "epoch": 73.5176046576102, "grad_norm": 0.38640567660331726, "learning_rate": 2.6989966555183945e-06, "loss": 6.378, "step": 33100 }, { "epoch": 73.5176046576102, "eval_loss": 6.37783670425415, "eval_runtime": 66.2991, "eval_samples_per_second": 150.832, "eval_steps_per_second": 18.854, "step": 33100 }, { "epoch": 73.7393956196285, "grad_norm": 0.3248431086540222, "learning_rate": 2.6889632107023413e-06, "loss": 6.3798, "step": 33200 }, { "epoch": 73.7393956196285, "eval_loss": 6.382781982421875, "eval_runtime": 63.7636, "eval_samples_per_second": 156.829, "eval_steps_per_second": 19.604, "step": 33200 }, { "epoch": 73.9611865816468, "grad_norm": 0.40707120299339294, "learning_rate": 2.6789297658862876e-06, "loss": 6.3787, "step": 33300 }, { "epoch": 73.9611865816468, "eval_loss": 6.381734371185303, "eval_runtime": 63.7575, "eval_samples_per_second": 156.844, "eval_steps_per_second": 19.606, "step": 33300 }, { "epoch": 74.18297754366509, "grad_norm": 0.3740542232990265, "learning_rate": 2.668896321070234e-06, "loss": 6.3799, "step": 33400 }, { "epoch": 74.18297754366509, "eval_loss": 6.38359260559082, "eval_runtime": 63.7058, "eval_samples_per_second": 156.972, "eval_steps_per_second": 19.621, "step": 33400 }, { "epoch": 74.40476850568339, "grad_norm": 0.3560076057910919, "learning_rate": 2.6588628762541807e-06, "loss": 6.3788, "step": 33500 }, { "epoch": 74.40476850568339, "eval_loss": 6.380216598510742, "eval_runtime": 66.3163, "eval_samples_per_second": 150.792, "eval_steps_per_second": 18.849, "step": 33500 }, { "epoch": 74.6265594677017, "grad_norm": 0.2998209595680237, "learning_rate": 2.648829431438127e-06, "loss": 6.3798, "step": 33600 }, { "epoch": 74.6265594677017, "eval_loss": 6.3799357414245605, "eval_runtime": 63.7525, "eval_samples_per_second": 156.857, "eval_steps_per_second": 19.607, "step": 33600 }, { "epoch": 74.84835042971999, "grad_norm": 0.42181283235549927, "learning_rate": 2.6387959866220734e-06, "loss": 6.3797, "step": 33700 }, { "epoch": 74.84835042971999, "eval_loss": 6.3854804039001465, "eval_runtime": 63.7045, "eval_samples_per_second": 156.975, "eval_steps_per_second": 19.622, "step": 33700 }, { "epoch": 75.07014139173829, "grad_norm": 0.35232749581336975, "learning_rate": 2.62876254180602e-06, "loss": 6.3794, "step": 33800 }, { "epoch": 75.07014139173829, "eval_loss": 6.38426399230957, "eval_runtime": 66.2061, "eval_samples_per_second": 151.043, "eval_steps_per_second": 18.88, "step": 33800 }, { "epoch": 75.29193235375658, "grad_norm": 0.3319035470485687, "learning_rate": 2.6187290969899665e-06, "loss": 6.3801, "step": 33900 }, { "epoch": 75.29193235375658, "eval_loss": 6.382733345031738, "eval_runtime": 63.7241, "eval_samples_per_second": 156.927, "eval_steps_per_second": 19.616, "step": 33900 }, { "epoch": 75.51372331577488, "grad_norm": 0.320116251707077, "learning_rate": 2.6086956521739132e-06, "loss": 6.3796, "step": 34000 }, { "epoch": 75.51372331577488, "eval_loss": 6.383172035217285, "eval_runtime": 66.2886, "eval_samples_per_second": 150.856, "eval_steps_per_second": 18.857, "step": 34000 }, { "epoch": 75.73551427779319, "grad_norm": 0.25732365250587463, "learning_rate": 2.5986622073578596e-06, "loss": 6.3793, "step": 34100 }, { "epoch": 75.73551427779319, "eval_loss": 6.3826189041137695, "eval_runtime": 63.7021, "eval_samples_per_second": 156.981, "eval_steps_per_second": 19.623, "step": 34100 }, { "epoch": 75.95730523981148, "grad_norm": 0.41861915588378906, "learning_rate": 2.588628762541806e-06, "loss": 6.3806, "step": 34200 }, { "epoch": 75.95730523981148, "eval_loss": 6.3810224533081055, "eval_runtime": 63.8703, "eval_samples_per_second": 156.567, "eval_steps_per_second": 19.571, "step": 34200 }, { "epoch": 76.17909620182978, "grad_norm": 0.37039920687675476, "learning_rate": 2.5785953177257527e-06, "loss": 6.3782, "step": 34300 }, { "epoch": 76.17909620182978, "eval_loss": 6.384817600250244, "eval_runtime": 63.7083, "eval_samples_per_second": 156.965, "eval_steps_per_second": 19.621, "step": 34300 }, { "epoch": 76.40088716384807, "grad_norm": 0.29002711176872253, "learning_rate": 2.568561872909699e-06, "loss": 6.3804, "step": 34400 }, { "epoch": 76.40088716384807, "eval_loss": 6.381626605987549, "eval_runtime": 66.318, "eval_samples_per_second": 150.789, "eval_steps_per_second": 18.849, "step": 34400 }, { "epoch": 76.62267812586637, "grad_norm": 0.3963169455528259, "learning_rate": 2.5585284280936454e-06, "loss": 6.3802, "step": 34500 }, { "epoch": 76.62267812586637, "eval_loss": 6.385863304138184, "eval_runtime": 63.683, "eval_samples_per_second": 157.028, "eval_steps_per_second": 19.628, "step": 34500 }, { "epoch": 76.84446908788468, "grad_norm": 0.3641812801361084, "learning_rate": 2.548494983277592e-06, "loss": 6.3794, "step": 34600 }, { "epoch": 76.84446908788468, "eval_loss": 6.379196643829346, "eval_runtime": 63.7355, "eval_samples_per_second": 156.899, "eval_steps_per_second": 19.612, "step": 34600 }, { "epoch": 77.06626004990297, "grad_norm": 0.34516364336013794, "learning_rate": 2.5384615384615385e-06, "loss": 6.3781, "step": 34700 }, { "epoch": 77.06626004990297, "eval_loss": 6.381167411804199, "eval_runtime": 66.2593, "eval_samples_per_second": 150.922, "eval_steps_per_second": 18.865, "step": 34700 }, { "epoch": 77.28805101192127, "grad_norm": 0.37135106325149536, "learning_rate": 2.528428093645485e-06, "loss": 6.3791, "step": 34800 }, { "epoch": 77.28805101192127, "eval_loss": 6.3796210289001465, "eval_runtime": 63.5952, "eval_samples_per_second": 157.245, "eval_steps_per_second": 19.656, "step": 34800 }, { "epoch": 77.50984197393956, "grad_norm": 0.27615365386009216, "learning_rate": 2.5183946488294316e-06, "loss": 6.3788, "step": 34900 }, { "epoch": 77.50984197393956, "eval_loss": 6.38156270980835, "eval_runtime": 63.666, "eval_samples_per_second": 157.07, "eval_steps_per_second": 19.634, "step": 34900 }, { "epoch": 77.73163293595786, "grad_norm": 0.40949293971061707, "learning_rate": 2.508361204013378e-06, "loss": 6.3784, "step": 35000 }, { "epoch": 77.73163293595786, "eval_loss": 6.379955291748047, "eval_runtime": 66.2108, "eval_samples_per_second": 151.033, "eval_steps_per_second": 18.879, "step": 35000 }, { "epoch": 77.95342389797615, "grad_norm": 0.21426652371883392, "learning_rate": 2.4983277591973247e-06, "loss": 6.3792, "step": 35100 }, { "epoch": 77.95342389797615, "eval_loss": 6.38067102432251, "eval_runtime": 63.6933, "eval_samples_per_second": 157.002, "eval_steps_per_second": 19.625, "step": 35100 }, { "epoch": 78.17521485999445, "grad_norm": 0.3121929466724396, "learning_rate": 2.488294314381271e-06, "loss": 6.3799, "step": 35200 }, { "epoch": 78.17521485999445, "eval_loss": 6.383203983306885, "eval_runtime": 63.6759, "eval_samples_per_second": 157.045, "eval_steps_per_second": 19.631, "step": 35200 }, { "epoch": 78.39700582201276, "grad_norm": 0.3007084131240845, "learning_rate": 2.4782608695652173e-06, "loss": 6.3782, "step": 35300 }, { "epoch": 78.39700582201276, "eval_loss": 6.380030632019043, "eval_runtime": 65.4722, "eval_samples_per_second": 152.737, "eval_steps_per_second": 19.092, "step": 35300 }, { "epoch": 78.61879678403105, "grad_norm": 0.4903746247291565, "learning_rate": 2.468227424749164e-06, "loss": 6.3791, "step": 35400 }, { "epoch": 78.61879678403105, "eval_loss": 6.382900714874268, "eval_runtime": 64.4146, "eval_samples_per_second": 155.244, "eval_steps_per_second": 19.406, "step": 35400 }, { "epoch": 78.84058774604935, "grad_norm": 0.41819822788238525, "learning_rate": 2.4581939799331104e-06, "loss": 6.3779, "step": 35500 }, { "epoch": 78.84058774604935, "eval_loss": 6.380439281463623, "eval_runtime": 63.6771, "eval_samples_per_second": 157.042, "eval_steps_per_second": 19.63, "step": 35500 }, { "epoch": 79.06237870806764, "grad_norm": 0.4207383990287781, "learning_rate": 2.4481605351170568e-06, "loss": 6.3794, "step": 35600 }, { "epoch": 79.06237870806764, "eval_loss": 6.381216049194336, "eval_runtime": 63.8668, "eval_samples_per_second": 156.576, "eval_steps_per_second": 19.572, "step": 35600 }, { "epoch": 79.28416967008594, "grad_norm": 0.3821701109409332, "learning_rate": 2.4381270903010035e-06, "loss": 6.3796, "step": 35700 }, { "epoch": 79.28416967008594, "eval_loss": 6.380701541900635, "eval_runtime": 66.2305, "eval_samples_per_second": 150.988, "eval_steps_per_second": 18.873, "step": 35700 }, { "epoch": 79.50596063210425, "grad_norm": 0.3124147653579712, "learning_rate": 2.42809364548495e-06, "loss": 6.3792, "step": 35800 }, { "epoch": 79.50596063210425, "eval_loss": 6.383649826049805, "eval_runtime": 63.6853, "eval_samples_per_second": 157.022, "eval_steps_per_second": 19.628, "step": 35800 }, { "epoch": 79.72775159412254, "grad_norm": 0.37319284677505493, "learning_rate": 2.4180602006688962e-06, "loss": 6.3793, "step": 35900 }, { "epoch": 79.72775159412254, "eval_loss": 6.379690647125244, "eval_runtime": 63.71, "eval_samples_per_second": 156.961, "eval_steps_per_second": 19.62, "step": 35900 }, { "epoch": 79.94954255614084, "grad_norm": 0.3518475890159607, "learning_rate": 2.408026755852843e-06, "loss": 6.3794, "step": 36000 }, { "epoch": 79.94954255614084, "eval_loss": 6.3837385177612305, "eval_runtime": 66.2591, "eval_samples_per_second": 150.923, "eval_steps_per_second": 18.865, "step": 36000 }, { "epoch": 80.17133351815913, "grad_norm": 0.3394939601421356, "learning_rate": 2.3979933110367893e-06, "loss": 6.3779, "step": 36100 }, { "epoch": 80.17133351815913, "eval_loss": 6.383784294128418, "eval_runtime": 63.536, "eval_samples_per_second": 157.391, "eval_steps_per_second": 19.674, "step": 36100 }, { "epoch": 80.39312448017743, "grad_norm": 0.2030980885028839, "learning_rate": 2.387959866220736e-06, "loss": 6.3787, "step": 36200 }, { "epoch": 80.39312448017743, "eval_loss": 6.381889820098877, "eval_runtime": 63.5998, "eval_samples_per_second": 157.233, "eval_steps_per_second": 19.654, "step": 36200 }, { "epoch": 80.61491544219572, "grad_norm": 0.35631629824638367, "learning_rate": 2.3779264214046824e-06, "loss": 6.3778, "step": 36300 }, { "epoch": 80.61491544219572, "eval_loss": 6.382266998291016, "eval_runtime": 66.2682, "eval_samples_per_second": 150.902, "eval_steps_per_second": 18.863, "step": 36300 }, { "epoch": 80.83670640421403, "grad_norm": 0.38831663131713867, "learning_rate": 2.3678929765886288e-06, "loss": 6.3796, "step": 36400 }, { "epoch": 80.83670640421403, "eval_loss": 6.379624843597412, "eval_runtime": 63.7336, "eval_samples_per_second": 156.903, "eval_steps_per_second": 19.613, "step": 36400 }, { "epoch": 81.05849736623233, "grad_norm": 0.29808080196380615, "learning_rate": 2.3578595317725755e-06, "loss": 6.3787, "step": 36500 }, { "epoch": 81.05849736623233, "eval_loss": 6.380765914916992, "eval_runtime": 63.6941, "eval_samples_per_second": 157.0, "eval_steps_per_second": 19.625, "step": 36500 }, { "epoch": 81.28028832825062, "grad_norm": 0.32311221957206726, "learning_rate": 2.347826086956522e-06, "loss": 6.3795, "step": 36600 }, { "epoch": 81.28028832825062, "eval_loss": 6.38113260269165, "eval_runtime": 66.3064, "eval_samples_per_second": 150.815, "eval_steps_per_second": 18.852, "step": 36600 }, { "epoch": 81.50207929026892, "grad_norm": 0.3027205765247345, "learning_rate": 2.337792642140468e-06, "loss": 6.3777, "step": 36700 }, { "epoch": 81.50207929026892, "eval_loss": 6.378735542297363, "eval_runtime": 63.8029, "eval_samples_per_second": 156.733, "eval_steps_per_second": 19.592, "step": 36700 }, { "epoch": 81.72387025228721, "grad_norm": 0.44942182302474976, "learning_rate": 2.327759197324415e-06, "loss": 6.3793, "step": 36800 }, { "epoch": 81.72387025228721, "eval_loss": 6.382872104644775, "eval_runtime": 63.7382, "eval_samples_per_second": 156.892, "eval_steps_per_second": 19.611, "step": 36800 }, { "epoch": 81.94566121430552, "grad_norm": 0.3363696038722992, "learning_rate": 2.3177257525083613e-06, "loss": 6.3786, "step": 36900 }, { "epoch": 81.94566121430552, "eval_loss": 6.3805928230285645, "eval_runtime": 66.2659, "eval_samples_per_second": 150.907, "eval_steps_per_second": 18.863, "step": 36900 }, { "epoch": 82.16745217632382, "grad_norm": 0.3836919367313385, "learning_rate": 2.307692307692308e-06, "loss": 6.378, "step": 37000 }, { "epoch": 82.16745217632382, "eval_loss": 6.381478786468506, "eval_runtime": 63.6472, "eval_samples_per_second": 157.116, "eval_steps_per_second": 19.64, "step": 37000 }, { "epoch": 82.38924313834211, "grad_norm": 0.3322221338748932, "learning_rate": 2.2976588628762544e-06, "loss": 6.3774, "step": 37100 }, { "epoch": 82.38924313834211, "eval_loss": 6.381748199462891, "eval_runtime": 63.7815, "eval_samples_per_second": 156.785, "eval_steps_per_second": 19.598, "step": 37100 }, { "epoch": 82.61103410036041, "grad_norm": 0.33737483620643616, "learning_rate": 2.2876254180602008e-06, "loss": 6.3792, "step": 37200 }, { "epoch": 82.61103410036041, "eval_loss": 6.381521224975586, "eval_runtime": 66.3297, "eval_samples_per_second": 150.762, "eval_steps_per_second": 18.845, "step": 37200 }, { "epoch": 82.8328250623787, "grad_norm": 0.34915590286254883, "learning_rate": 2.2775919732441475e-06, "loss": 6.3792, "step": 37300 }, { "epoch": 82.8328250623787, "eval_loss": 6.382421493530273, "eval_runtime": 63.7523, "eval_samples_per_second": 156.857, "eval_steps_per_second": 19.607, "step": 37300 }, { "epoch": 83.054616024397, "grad_norm": 0.2967890202999115, "learning_rate": 2.267558528428094e-06, "loss": 6.379, "step": 37400 }, { "epoch": 83.054616024397, "eval_loss": 6.379049301147461, "eval_runtime": 63.7339, "eval_samples_per_second": 156.902, "eval_steps_per_second": 19.613, "step": 37400 }, { "epoch": 83.27640698641531, "grad_norm": 0.3198423385620117, "learning_rate": 2.25752508361204e-06, "loss": 6.3784, "step": 37500 }, { "epoch": 83.27640698641531, "eval_loss": 6.380875110626221, "eval_runtime": 66.2715, "eval_samples_per_second": 150.894, "eval_steps_per_second": 18.862, "step": 37500 }, { "epoch": 83.4981979484336, "grad_norm": 0.22756338119506836, "learning_rate": 2.2474916387959865e-06, "loss": 6.3772, "step": 37600 }, { "epoch": 83.4981979484336, "eval_loss": 6.380899429321289, "eval_runtime": 63.6746, "eval_samples_per_second": 157.048, "eval_steps_per_second": 19.631, "step": 37600 }, { "epoch": 83.7199889104519, "grad_norm": 0.375475138425827, "learning_rate": 2.237458193979933e-06, "loss": 6.38, "step": 37700 }, { "epoch": 83.7199889104519, "eval_loss": 6.379432201385498, "eval_runtime": 63.7694, "eval_samples_per_second": 156.815, "eval_steps_per_second": 19.602, "step": 37700 }, { "epoch": 83.99805932908234, "grad_norm": 0.26553675532341003, "learning_rate": 2.2274247491638796e-06, "loss": 6.3791, "step": 37800 }, { "epoch": 83.99805932908234, "eval_loss": 6.386465072631836, "eval_runtime": 66.2313, "eval_samples_per_second": 150.986, "eval_steps_per_second": 18.873, "step": 37800 }, { "epoch": 84.21985029110064, "grad_norm": 0.2572327256202698, "learning_rate": 2.217391304347826e-06, "loss": 6.3779, "step": 37900 }, { "epoch": 84.21985029110064, "eval_loss": 6.381786823272705, "eval_runtime": 63.8252, "eval_samples_per_second": 156.678, "eval_steps_per_second": 19.585, "step": 37900 }, { "epoch": 84.44164125311893, "grad_norm": 0.3603324294090271, "learning_rate": 2.2073578595317723e-06, "loss": 6.3796, "step": 38000 }, { "epoch": 84.44164125311893, "eval_loss": 6.381040573120117, "eval_runtime": 64.0412, "eval_samples_per_second": 156.15, "eval_steps_per_second": 19.519, "step": 38000 }, { "epoch": 84.66343221513723, "grad_norm": 0.3384093642234802, "learning_rate": 2.197324414715719e-06, "loss": 6.3778, "step": 38100 }, { "epoch": 84.66343221513723, "eval_loss": 6.377985000610352, "eval_runtime": 66.1598, "eval_samples_per_second": 151.149, "eval_steps_per_second": 18.894, "step": 38100 }, { "epoch": 84.88522317715552, "grad_norm": 0.3742137551307678, "learning_rate": 2.1872909698996654e-06, "loss": 6.3788, "step": 38200 }, { "epoch": 84.88522317715552, "eval_loss": 6.382181167602539, "eval_runtime": 63.6067, "eval_samples_per_second": 157.216, "eval_steps_per_second": 19.652, "step": 38200 }, { "epoch": 85.10701413917383, "grad_norm": 0.31179383397102356, "learning_rate": 2.177257525083612e-06, "loss": 6.3771, "step": 38300 }, { "epoch": 85.10701413917383, "eval_loss": 6.380379676818848, "eval_runtime": 66.212, "eval_samples_per_second": 151.03, "eval_steps_per_second": 18.879, "step": 38300 }, { "epoch": 85.32880510119213, "grad_norm": 0.36700376868247986, "learning_rate": 2.1672240802675585e-06, "loss": 6.3767, "step": 38400 }, { "epoch": 85.32880510119213, "eval_loss": 6.3812575340271, "eval_runtime": 63.6889, "eval_samples_per_second": 157.013, "eval_steps_per_second": 19.627, "step": 38400 }, { "epoch": 85.55059606321042, "grad_norm": 0.38559991121292114, "learning_rate": 2.157190635451505e-06, "loss": 6.3781, "step": 38500 }, { "epoch": 85.55059606321042, "eval_loss": 6.384213447570801, "eval_runtime": 66.1477, "eval_samples_per_second": 151.177, "eval_steps_per_second": 18.897, "step": 38500 }, { "epoch": 85.77238702522872, "grad_norm": 0.2753937244415283, "learning_rate": 2.1471571906354516e-06, "loss": 6.3795, "step": 38600 }, { "epoch": 85.77238702522872, "eval_loss": 6.37845516204834, "eval_runtime": 63.6513, "eval_samples_per_second": 157.106, "eval_steps_per_second": 19.638, "step": 38600 }, { "epoch": 85.99417798724701, "grad_norm": 0.22831951081752777, "learning_rate": 2.137123745819398e-06, "loss": 6.3789, "step": 38700 }, { "epoch": 85.99417798724701, "eval_loss": 6.381505966186523, "eval_runtime": 66.231, "eval_samples_per_second": 150.987, "eval_steps_per_second": 18.873, "step": 38700 }, { "epoch": 86.21596894926532, "grad_norm": 0.339546799659729, "learning_rate": 2.1270903010033443e-06, "loss": 6.379, "step": 38800 }, { "epoch": 86.21596894926532, "eval_loss": 6.381498336791992, "eval_runtime": 63.6802, "eval_samples_per_second": 157.035, "eval_steps_per_second": 19.629, "step": 38800 }, { "epoch": 86.43775991128362, "grad_norm": 0.2600659728050232, "learning_rate": 2.117056856187291e-06, "loss": 6.3774, "step": 38900 }, { "epoch": 86.43775991128362, "eval_loss": 6.381589889526367, "eval_runtime": 63.6804, "eval_samples_per_second": 157.034, "eval_steps_per_second": 19.629, "step": 38900 }, { "epoch": 86.65955087330191, "grad_norm": 0.32178473472595215, "learning_rate": 2.1070234113712374e-06, "loss": 6.3785, "step": 39000 }, { "epoch": 86.65955087330191, "eval_loss": 6.377468585968018, "eval_runtime": 66.295, "eval_samples_per_second": 150.841, "eval_steps_per_second": 18.855, "step": 39000 }, { "epoch": 86.88134183532021, "grad_norm": 0.28717854619026184, "learning_rate": 2.0969899665551837e-06, "loss": 6.377, "step": 39100 }, { "epoch": 86.88134183532021, "eval_loss": 6.3805928230285645, "eval_runtime": 63.7605, "eval_samples_per_second": 156.837, "eval_steps_per_second": 19.605, "step": 39100 }, { "epoch": 87.1031327973385, "grad_norm": 0.2932318150997162, "learning_rate": 2.0869565217391305e-06, "loss": 6.3791, "step": 39200 }, { "epoch": 87.1031327973385, "eval_loss": 6.380700588226318, "eval_runtime": 63.6583, "eval_samples_per_second": 157.089, "eval_steps_per_second": 19.636, "step": 39200 }, { "epoch": 87.3249237593568, "grad_norm": 0.39832741022109985, "learning_rate": 2.076923076923077e-06, "loss": 6.3784, "step": 39300 }, { "epoch": 87.3249237593568, "eval_loss": 6.37957763671875, "eval_runtime": 66.5746, "eval_samples_per_second": 150.207, "eval_steps_per_second": 18.776, "step": 39300 }, { "epoch": 87.54671472137511, "grad_norm": 0.3088468611240387, "learning_rate": 2.0668896321070236e-06, "loss": 6.3774, "step": 39400 }, { "epoch": 87.54671472137511, "eval_loss": 6.379054069519043, "eval_runtime": 66.2367, "eval_samples_per_second": 150.974, "eval_steps_per_second": 18.872, "step": 39400 }, { "epoch": 87.7685056833934, "grad_norm": 0.284956693649292, "learning_rate": 2.05685618729097e-06, "loss": 6.3777, "step": 39500 }, { "epoch": 87.7685056833934, "eval_loss": 6.381918907165527, "eval_runtime": 66.2296, "eval_samples_per_second": 150.99, "eval_steps_per_second": 18.874, "step": 39500 }, { "epoch": 87.9902966454117, "grad_norm": 0.26180529594421387, "learning_rate": 2.0468227424749163e-06, "loss": 6.3779, "step": 39600 }, { "epoch": 87.9902966454117, "eval_loss": 6.381536483764648, "eval_runtime": 66.1969, "eval_samples_per_second": 151.064, "eval_steps_per_second": 18.883, "step": 39600 }, { "epoch": 88.21208760742999, "grad_norm": 0.39024651050567627, "learning_rate": 2.036789297658863e-06, "loss": 6.377, "step": 39700 }, { "epoch": 88.21208760742999, "eval_loss": 6.3777618408203125, "eval_runtime": 66.1491, "eval_samples_per_second": 151.174, "eval_steps_per_second": 18.897, "step": 39700 }, { "epoch": 88.4338785694483, "grad_norm": 0.2729719579219818, "learning_rate": 2.0267558528428094e-06, "loss": 6.3782, "step": 39800 }, { "epoch": 88.4338785694483, "eval_loss": 6.382574081420898, "eval_runtime": 66.1908, "eval_samples_per_second": 151.078, "eval_steps_per_second": 18.885, "step": 39800 }, { "epoch": 88.65566953146659, "grad_norm": 0.23033183813095093, "learning_rate": 2.0167224080267557e-06, "loss": 6.3776, "step": 39900 }, { "epoch": 88.65566953146659, "eval_loss": 6.378293514251709, "eval_runtime": 66.1466, "eval_samples_per_second": 151.179, "eval_steps_per_second": 18.897, "step": 39900 }, { "epoch": 88.87746049348489, "grad_norm": 0.43995988368988037, "learning_rate": 2.0066889632107025e-06, "loss": 6.3793, "step": 40000 }, { "epoch": 88.87746049348489, "eval_loss": 6.380235195159912, "eval_runtime": 66.2981, "eval_samples_per_second": 150.834, "eval_steps_per_second": 18.854, "step": 40000 }, { "epoch": 89.0992514555032, "grad_norm": 0.2878618836402893, "learning_rate": 1.996655518394649e-06, "loss": 6.3783, "step": 40100 }, { "epoch": 89.0992514555032, "eval_loss": 6.379173755645752, "eval_runtime": 63.6984, "eval_samples_per_second": 156.99, "eval_steps_per_second": 19.624, "step": 40100 }, { "epoch": 89.32104241752148, "grad_norm": 0.22379851341247559, "learning_rate": 1.986622073578595e-06, "loss": 6.3785, "step": 40200 }, { "epoch": 89.32104241752148, "eval_loss": 6.38173770904541, "eval_runtime": 63.792, "eval_samples_per_second": 156.759, "eval_steps_per_second": 19.595, "step": 40200 }, { "epoch": 89.54283337953979, "grad_norm": 0.3321212828159332, "learning_rate": 1.976588628762542e-06, "loss": 6.3775, "step": 40300 }, { "epoch": 89.54283337953979, "eval_loss": 6.377793788909912, "eval_runtime": 66.2425, "eval_samples_per_second": 150.96, "eval_steps_per_second": 18.87, "step": 40300 }, { "epoch": 89.76462434155808, "grad_norm": 0.3513726592063904, "learning_rate": 1.9665551839464883e-06, "loss": 6.3777, "step": 40400 }, { "epoch": 89.76462434155808, "eval_loss": 6.38060998916626, "eval_runtime": 66.2865, "eval_samples_per_second": 150.86, "eval_steps_per_second": 18.858, "step": 40400 }, { "epoch": 89.98641530357638, "grad_norm": 0.3225536048412323, "learning_rate": 1.956521739130435e-06, "loss": 6.3781, "step": 40500 }, { "epoch": 89.98641530357638, "eval_loss": 6.3820648193359375, "eval_runtime": 63.6933, "eval_samples_per_second": 157.002, "eval_steps_per_second": 19.625, "step": 40500 }, { "epoch": 90.20820626559468, "grad_norm": 0.3866877853870392, "learning_rate": 1.9464882943143814e-06, "loss": 6.3772, "step": 40600 }, { "epoch": 90.20820626559468, "eval_loss": 6.382141590118408, "eval_runtime": 63.7832, "eval_samples_per_second": 156.781, "eval_steps_per_second": 19.598, "step": 40600 }, { "epoch": 90.42999722761297, "grad_norm": 0.43070387840270996, "learning_rate": 1.9364548494983277e-06, "loss": 6.3778, "step": 40700 }, { "epoch": 90.42999722761297, "eval_loss": 6.375494480133057, "eval_runtime": 65.724, "eval_samples_per_second": 152.152, "eval_steps_per_second": 19.019, "step": 40700 }, { "epoch": 90.65178818963128, "grad_norm": 0.35665562748908997, "learning_rate": 1.9264214046822745e-06, "loss": 6.3767, "step": 40800 }, { "epoch": 90.65178818963128, "eval_loss": 6.379345417022705, "eval_runtime": 64.4622, "eval_samples_per_second": 155.13, "eval_steps_per_second": 19.391, "step": 40800 }, { "epoch": 90.87357915164957, "grad_norm": 0.35841798782348633, "learning_rate": 1.916387959866221e-06, "loss": 6.3765, "step": 40900 }, { "epoch": 90.87357915164957, "eval_loss": 6.379830360412598, "eval_runtime": 66.3033, "eval_samples_per_second": 150.822, "eval_steps_per_second": 18.853, "step": 40900 }, { "epoch": 91.09537011366787, "grad_norm": 0.29910504817962646, "learning_rate": 1.9063545150501674e-06, "loss": 6.3774, "step": 41000 }, { "epoch": 91.09537011366787, "eval_loss": 6.380716323852539, "eval_runtime": 63.7905, "eval_samples_per_second": 156.763, "eval_steps_per_second": 19.595, "step": 41000 }, { "epoch": 91.31716107568617, "grad_norm": 0.3775427043437958, "learning_rate": 1.896321070234114e-06, "loss": 6.3784, "step": 41100 }, { "epoch": 91.31716107568617, "eval_loss": 6.38125467300415, "eval_runtime": 63.7934, "eval_samples_per_second": 156.756, "eval_steps_per_second": 19.595, "step": 41100 }, { "epoch": 91.53895203770446, "grad_norm": 0.2421695590019226, "learning_rate": 1.8862876254180603e-06, "loss": 6.377, "step": 41200 }, { "epoch": 91.53895203770446, "eval_loss": 6.381397724151611, "eval_runtime": 66.2535, "eval_samples_per_second": 150.935, "eval_steps_per_second": 18.867, "step": 41200 }, { "epoch": 91.76074299972277, "grad_norm": 0.2967372238636017, "learning_rate": 1.8762541806020068e-06, "loss": 6.3783, "step": 41300 }, { "epoch": 91.76074299972277, "eval_loss": 6.380742073059082, "eval_runtime": 63.7433, "eval_samples_per_second": 156.879, "eval_steps_per_second": 19.61, "step": 41300 }, { "epoch": 91.98253396174105, "grad_norm": 0.3849758505821228, "learning_rate": 1.8662207357859534e-06, "loss": 6.3789, "step": 41400 }, { "epoch": 91.98253396174105, "eval_loss": 6.3830342292785645, "eval_runtime": 66.3151, "eval_samples_per_second": 150.795, "eval_steps_per_second": 18.849, "step": 41400 }, { "epoch": 92.20432492375936, "grad_norm": 0.377841055393219, "learning_rate": 1.8561872909699e-06, "loss": 6.3769, "step": 41500 }, { "epoch": 92.20432492375936, "eval_loss": 6.381241798400879, "eval_runtime": 63.6855, "eval_samples_per_second": 157.022, "eval_steps_per_second": 19.628, "step": 41500 }, { "epoch": 92.42611588577765, "grad_norm": 0.4062901437282562, "learning_rate": 1.8461538461538462e-06, "loss": 6.3791, "step": 41600 }, { "epoch": 92.42611588577765, "eval_loss": 6.378665924072266, "eval_runtime": 63.7914, "eval_samples_per_second": 156.761, "eval_steps_per_second": 19.595, "step": 41600 }, { "epoch": 92.64790684779595, "grad_norm": 0.33464646339416504, "learning_rate": 1.8361204013377928e-06, "loss": 6.3782, "step": 41700 }, { "epoch": 92.64790684779595, "eval_loss": 6.379201412200928, "eval_runtime": 66.2071, "eval_samples_per_second": 151.041, "eval_steps_per_second": 18.88, "step": 41700 }, { "epoch": 92.86969780981426, "grad_norm": 0.26191645860671997, "learning_rate": 1.8260869565217394e-06, "loss": 6.3768, "step": 41800 }, { "epoch": 92.86969780981426, "eval_loss": 6.380030632019043, "eval_runtime": 63.7463, "eval_samples_per_second": 156.872, "eval_steps_per_second": 19.609, "step": 41800 }, { "epoch": 93.09148877183254, "grad_norm": 0.4473271667957306, "learning_rate": 1.8160535117056857e-06, "loss": 6.376, "step": 41900 }, { "epoch": 93.09148877183254, "eval_loss": 6.383362293243408, "eval_runtime": 66.2652, "eval_samples_per_second": 150.909, "eval_steps_per_second": 18.864, "step": 41900 }, { "epoch": 93.31327973385085, "grad_norm": 0.30396267771720886, "learning_rate": 1.8060200668896322e-06, "loss": 6.3782, "step": 42000 }, { "epoch": 93.31327973385085, "eval_loss": 6.382277965545654, "eval_runtime": 63.811, "eval_samples_per_second": 156.713, "eval_steps_per_second": 19.589, "step": 42000 }, { "epoch": 93.53507069586914, "grad_norm": 0.2819732129573822, "learning_rate": 1.7959866220735788e-06, "loss": 6.3782, "step": 42100 }, { "epoch": 93.53507069586914, "eval_loss": 6.381258010864258, "eval_runtime": 63.7343, "eval_samples_per_second": 156.901, "eval_steps_per_second": 19.613, "step": 42100 }, { "epoch": 93.75686165788744, "grad_norm": 0.2994706630706787, "learning_rate": 1.7859531772575253e-06, "loss": 6.3786, "step": 42200 }, { "epoch": 93.75686165788744, "eval_loss": 6.381169319152832, "eval_runtime": 66.2919, "eval_samples_per_second": 150.848, "eval_steps_per_second": 18.856, "step": 42200 }, { "epoch": 93.97865261990574, "grad_norm": 0.31294333934783936, "learning_rate": 1.7759197324414717e-06, "loss": 6.3766, "step": 42300 }, { "epoch": 93.97865261990574, "eval_loss": 6.379955291748047, "eval_runtime": 63.7737, "eval_samples_per_second": 156.804, "eval_steps_per_second": 19.601, "step": 42300 }, { "epoch": 94.20044358192403, "grad_norm": 0.291477769613266, "learning_rate": 1.7658862876254182e-06, "loss": 6.3777, "step": 42400 }, { "epoch": 94.20044358192403, "eval_loss": 6.379477500915527, "eval_runtime": 66.2866, "eval_samples_per_second": 150.86, "eval_steps_per_second": 18.857, "step": 42400 }, { "epoch": 94.42223454394234, "grad_norm": 0.23638038337230682, "learning_rate": 1.7558528428093648e-06, "loss": 6.3781, "step": 42500 }, { "epoch": 94.42223454394234, "eval_loss": 6.380892753601074, "eval_runtime": 63.8247, "eval_samples_per_second": 156.679, "eval_steps_per_second": 19.585, "step": 42500 }, { "epoch": 94.64402550596063, "grad_norm": 0.3445935547351837, "learning_rate": 1.745819397993311e-06, "loss": 6.3768, "step": 42600 }, { "epoch": 94.64402550596063, "eval_loss": 6.382579803466797, "eval_runtime": 63.8197, "eval_samples_per_second": 156.691, "eval_steps_per_second": 19.586, "step": 42600 }, { "epoch": 94.86581646797893, "grad_norm": 0.3376341462135315, "learning_rate": 1.7357859531772575e-06, "loss": 6.3768, "step": 42700 }, { "epoch": 94.86581646797893, "eval_loss": 6.381232261657715, "eval_runtime": 66.3632, "eval_samples_per_second": 150.686, "eval_steps_per_second": 18.836, "step": 42700 }, { "epoch": 95.08760742999723, "grad_norm": 0.29045116901397705, "learning_rate": 1.7257525083612038e-06, "loss": 6.3763, "step": 42800 }, { "epoch": 95.08760742999723, "eval_loss": 6.3776373863220215, "eval_runtime": 63.6759, "eval_samples_per_second": 157.045, "eval_steps_per_second": 19.631, "step": 42800 }, { "epoch": 95.30939839201552, "grad_norm": 0.2851983308792114, "learning_rate": 1.7157190635451504e-06, "loss": 6.3778, "step": 42900 }, { "epoch": 95.30939839201552, "eval_loss": 6.380300998687744, "eval_runtime": 63.8175, "eval_samples_per_second": 156.697, "eval_steps_per_second": 19.587, "step": 42900 }, { "epoch": 95.53118935403383, "grad_norm": 0.33936771750450134, "learning_rate": 1.705685618729097e-06, "loss": 6.3787, "step": 43000 }, { "epoch": 95.53118935403383, "eval_loss": 6.37871789932251, "eval_runtime": 63.8614, "eval_samples_per_second": 156.589, "eval_steps_per_second": 19.574, "step": 43000 }, { "epoch": 95.75298031605212, "grad_norm": 0.4443320333957672, "learning_rate": 1.6956521739130435e-06, "loss": 6.3781, "step": 43100 }, { "epoch": 95.75298031605212, "eval_loss": 6.382043838500977, "eval_runtime": 66.3729, "eval_samples_per_second": 150.664, "eval_steps_per_second": 18.833, "step": 43100 }, { "epoch": 95.97477127807042, "grad_norm": 0.33091309666633606, "learning_rate": 1.6856187290969898e-06, "loss": 6.3772, "step": 43200 }, { "epoch": 95.97477127807042, "eval_loss": 6.380916595458984, "eval_runtime": 63.7824, "eval_samples_per_second": 156.783, "eval_steps_per_second": 19.598, "step": 43200 }, { "epoch": 96.19656224008871, "grad_norm": 0.3929876685142517, "learning_rate": 1.6755852842809363e-06, "loss": 6.3785, "step": 43300 }, { "epoch": 96.19656224008871, "eval_loss": 6.377211570739746, "eval_runtime": 66.2793, "eval_samples_per_second": 150.877, "eval_steps_per_second": 18.86, "step": 43300 }, { "epoch": 96.41835320210701, "grad_norm": 0.3379896581172943, "learning_rate": 1.665551839464883e-06, "loss": 6.3772, "step": 43400 }, { "epoch": 96.41835320210701, "eval_loss": 6.380885124206543, "eval_runtime": 63.8749, "eval_samples_per_second": 156.556, "eval_steps_per_second": 19.569, "step": 43400 }, { "epoch": 96.64014416412532, "grad_norm": 0.3330114483833313, "learning_rate": 1.6555183946488294e-06, "loss": 6.378, "step": 43500 }, { "epoch": 96.64014416412532, "eval_loss": 6.381417751312256, "eval_runtime": 66.3248, "eval_samples_per_second": 150.773, "eval_steps_per_second": 18.847, "step": 43500 }, { "epoch": 96.8619351261436, "grad_norm": 0.5002055168151855, "learning_rate": 1.6454849498327758e-06, "loss": 6.3772, "step": 43600 }, { "epoch": 96.8619351261436, "eval_loss": 6.379367351531982, "eval_runtime": 63.7674, "eval_samples_per_second": 156.82, "eval_steps_per_second": 19.603, "step": 43600 }, { "epoch": 97.08372608816191, "grad_norm": 0.4039636552333832, "learning_rate": 1.6354515050167223e-06, "loss": 6.376, "step": 43700 }, { "epoch": 97.08372608816191, "eval_loss": 6.379873275756836, "eval_runtime": 63.6881, "eval_samples_per_second": 157.015, "eval_steps_per_second": 19.627, "step": 43700 }, { "epoch": 97.3055170501802, "grad_norm": 0.3500140309333801, "learning_rate": 1.6254180602006689e-06, "loss": 6.3793, "step": 43800 }, { "epoch": 97.3055170501802, "eval_loss": 6.3825764656066895, "eval_runtime": 66.2863, "eval_samples_per_second": 150.861, "eval_steps_per_second": 18.858, "step": 43800 }, { "epoch": 97.5273080121985, "grad_norm": 0.343735009431839, "learning_rate": 1.6153846153846154e-06, "loss": 6.3779, "step": 43900 }, { "epoch": 97.5273080121985, "eval_loss": 6.378231525421143, "eval_runtime": 63.7143, "eval_samples_per_second": 156.951, "eval_steps_per_second": 19.619, "step": 43900 }, { "epoch": 97.7490989742168, "grad_norm": 0.3836156129837036, "learning_rate": 1.6053511705685618e-06, "loss": 6.3773, "step": 44000 }, { "epoch": 97.7490989742168, "eval_loss": 6.37751579284668, "eval_runtime": 63.729, "eval_samples_per_second": 156.915, "eval_steps_per_second": 19.614, "step": 44000 }, { "epoch": 97.9708899362351, "grad_norm": 0.3120937645435333, "learning_rate": 1.5953177257525083e-06, "loss": 6.3755, "step": 44100 }, { "epoch": 97.9708899362351, "eval_loss": 6.3800272941589355, "eval_runtime": 64.1744, "eval_samples_per_second": 155.825, "eval_steps_per_second": 19.478, "step": 44100 }, { "epoch": 98.1926808982534, "grad_norm": 0.33682048320770264, "learning_rate": 1.5852842809364549e-06, "loss": 6.3765, "step": 44200 }, { "epoch": 98.1926808982534, "eval_loss": 6.378459930419922, "eval_runtime": 65.8486, "eval_samples_per_second": 151.864, "eval_steps_per_second": 18.983, "step": 44200 }, { "epoch": 98.41447186027169, "grad_norm": 0.33430323004722595, "learning_rate": 1.5752508361204012e-06, "loss": 6.3784, "step": 44300 }, { "epoch": 98.41447186027169, "eval_loss": 6.37835693359375, "eval_runtime": 63.7423, "eval_samples_per_second": 156.882, "eval_steps_per_second": 19.61, "step": 44300 }, { "epoch": 98.63626282228999, "grad_norm": 0.3729492425918579, "learning_rate": 1.5652173913043478e-06, "loss": 6.3775, "step": 44400 }, { "epoch": 98.63626282228999, "eval_loss": 6.379312515258789, "eval_runtime": 67.1919, "eval_samples_per_second": 148.827, "eval_steps_per_second": 18.603, "step": 44400 }, { "epoch": 98.8580537843083, "grad_norm": 0.30378684401512146, "learning_rate": 1.5551839464882943e-06, "loss": 6.3773, "step": 44500 }, { "epoch": 98.8580537843083, "eval_loss": 6.380176544189453, "eval_runtime": 66.2505, "eval_samples_per_second": 150.942, "eval_steps_per_second": 18.868, "step": 44500 }, { "epoch": 99.07984474632659, "grad_norm": 0.2708960771560669, "learning_rate": 1.5451505016722409e-06, "loss": 6.3791, "step": 44600 }, { "epoch": 99.07984474632659, "eval_loss": 6.381106853485107, "eval_runtime": 63.6851, "eval_samples_per_second": 157.023, "eval_steps_per_second": 19.628, "step": 44600 }, { "epoch": 99.30163570834489, "grad_norm": 0.28966355323791504, "learning_rate": 1.5351170568561872e-06, "loss": 6.3769, "step": 44700 }, { "epoch": 99.30163570834489, "eval_loss": 6.380806922912598, "eval_runtime": 66.2295, "eval_samples_per_second": 150.99, "eval_steps_per_second": 18.874, "step": 44700 }, { "epoch": 99.52342667036318, "grad_norm": 0.33378317952156067, "learning_rate": 1.5250836120401338e-06, "loss": 6.3764, "step": 44800 }, { "epoch": 99.52342667036318, "eval_loss": 6.378901481628418, "eval_runtime": 63.7387, "eval_samples_per_second": 156.89, "eval_steps_per_second": 19.611, "step": 44800 }, { "epoch": 99.74521763238148, "grad_norm": 0.2659667134284973, "learning_rate": 1.5150501672240803e-06, "loss": 6.3763, "step": 44900 }, { "epoch": 99.74521763238148, "eval_loss": 6.378689289093018, "eval_runtime": 66.291, "eval_samples_per_second": 150.85, "eval_steps_per_second": 18.856, "step": 44900 }, { "epoch": 99.96700859439977, "grad_norm": 0.36868181824684143, "learning_rate": 1.5050167224080269e-06, "loss": 6.3773, "step": 45000 }, { "epoch": 99.96700859439977, "eval_loss": 6.379394054412842, "eval_runtime": 63.8432, "eval_samples_per_second": 156.634, "eval_steps_per_second": 19.579, "step": 45000 }, { "epoch": 100.18879955641808, "grad_norm": 0.2957492768764496, "learning_rate": 1.4949832775919732e-06, "loss": 6.3777, "step": 45100 }, { "epoch": 100.18879955641808, "eval_loss": 6.37989616394043, "eval_runtime": 63.7161, "eval_samples_per_second": 156.946, "eval_steps_per_second": 19.618, "step": 45100 }, { "epoch": 100.41059051843638, "grad_norm": 0.36346226930618286, "learning_rate": 1.4849498327759198e-06, "loss": 6.3771, "step": 45200 }, { "epoch": 100.41059051843638, "eval_loss": 6.382117748260498, "eval_runtime": 66.181, "eval_samples_per_second": 151.101, "eval_steps_per_second": 18.888, "step": 45200 }, { "epoch": 100.63238148045467, "grad_norm": 0.21758611500263214, "learning_rate": 1.4749163879598663e-06, "loss": 6.3768, "step": 45300 }, { "epoch": 100.63238148045467, "eval_loss": 6.378548622131348, "eval_runtime": 63.8643, "eval_samples_per_second": 156.582, "eval_steps_per_second": 19.573, "step": 45300 }, { "epoch": 100.85417244247297, "grad_norm": 0.21891988813877106, "learning_rate": 1.4648829431438129e-06, "loss": 6.3759, "step": 45400 }, { "epoch": 100.85417244247297, "eval_loss": 6.3807806968688965, "eval_runtime": 66.1954, "eval_samples_per_second": 151.068, "eval_steps_per_second": 18.883, "step": 45400 }, { "epoch": 101.07596340449126, "grad_norm": 0.31398728489875793, "learning_rate": 1.4548494983277592e-06, "loss": 6.3783, "step": 45500 }, { "epoch": 101.07596340449126, "eval_loss": 6.3800740242004395, "eval_runtime": 63.74, "eval_samples_per_second": 156.887, "eval_steps_per_second": 19.611, "step": 45500 }, { "epoch": 101.29775436650957, "grad_norm": 0.3506067991256714, "learning_rate": 1.4448160535117058e-06, "loss": 6.3757, "step": 45600 }, { "epoch": 101.29775436650957, "eval_loss": 6.3802642822265625, "eval_runtime": 66.3029, "eval_samples_per_second": 150.823, "eval_steps_per_second": 18.853, "step": 45600 }, { "epoch": 101.51954532852787, "grad_norm": 0.4127357304096222, "learning_rate": 1.4347826086956523e-06, "loss": 6.377, "step": 45700 }, { "epoch": 101.51954532852787, "eval_loss": 6.379199028015137, "eval_runtime": 63.6147, "eval_samples_per_second": 157.196, "eval_steps_per_second": 19.65, "step": 45700 }, { "epoch": 101.74133629054616, "grad_norm": 0.40180787444114685, "learning_rate": 1.4247491638795989e-06, "loss": 6.3774, "step": 45800 }, { "epoch": 101.74133629054616, "eval_loss": 6.378483295440674, "eval_runtime": 63.6205, "eval_samples_per_second": 157.182, "eval_steps_per_second": 19.648, "step": 45800 }, { "epoch": 101.96312725256446, "grad_norm": 0.2862705588340759, "learning_rate": 1.4147157190635452e-06, "loss": 6.3777, "step": 45900 }, { "epoch": 101.96312725256446, "eval_loss": 6.377134323120117, "eval_runtime": 63.9897, "eval_samples_per_second": 156.275, "eval_steps_per_second": 19.534, "step": 45900 }, { "epoch": 102.18491821458275, "grad_norm": 0.2539602816104889, "learning_rate": 1.4046822742474917e-06, "loss": 6.3786, "step": 46000 }, { "epoch": 102.18491821458275, "eval_loss": 6.379866123199463, "eval_runtime": 66.1001, "eval_samples_per_second": 151.286, "eval_steps_per_second": 18.911, "step": 46000 }, { "epoch": 102.40670917660105, "grad_norm": 0.36692872643470764, "learning_rate": 1.3946488294314383e-06, "loss": 6.3771, "step": 46100 }, { "epoch": 102.40670917660105, "eval_loss": 6.379576683044434, "eval_runtime": 63.6911, "eval_samples_per_second": 157.008, "eval_steps_per_second": 19.626, "step": 46100 }, { "epoch": 102.62850013861934, "grad_norm": 0.3044676184654236, "learning_rate": 1.3846153846153846e-06, "loss": 6.3772, "step": 46200 }, { "epoch": 102.62850013861934, "eval_loss": 6.381227493286133, "eval_runtime": 63.7064, "eval_samples_per_second": 156.97, "eval_steps_per_second": 19.621, "step": 46200 }, { "epoch": 102.85029110063765, "grad_norm": 0.3508971035480499, "learning_rate": 1.374581939799331e-06, "loss": 6.3762, "step": 46300 }, { "epoch": 102.85029110063765, "eval_loss": 6.377274513244629, "eval_runtime": 66.2947, "eval_samples_per_second": 150.842, "eval_steps_per_second": 18.855, "step": 46300 }, { "epoch": 103.07208206265595, "grad_norm": 0.31413570046424866, "learning_rate": 1.3645484949832775e-06, "loss": 6.3774, "step": 46400 }, { "epoch": 103.07208206265595, "eval_loss": 6.380115032196045, "eval_runtime": 63.6441, "eval_samples_per_second": 157.124, "eval_steps_per_second": 19.64, "step": 46400 }, { "epoch": 103.29387302467424, "grad_norm": 0.2552104890346527, "learning_rate": 1.354515050167224e-06, "loss": 6.3775, "step": 46500 }, { "epoch": 103.29387302467424, "eval_loss": 6.379015922546387, "eval_runtime": 63.6755, "eval_samples_per_second": 157.046, "eval_steps_per_second": 19.631, "step": 46500 }, { "epoch": 103.51566398669254, "grad_norm": 0.3744960129261017, "learning_rate": 1.3444816053511706e-06, "loss": 6.3763, "step": 46600 }, { "epoch": 103.51566398669254, "eval_loss": 6.374266624450684, "eval_runtime": 66.5834, "eval_samples_per_second": 150.188, "eval_steps_per_second": 18.773, "step": 46600 }, { "epoch": 103.73745494871083, "grad_norm": 0.27893921732902527, "learning_rate": 1.334448160535117e-06, "loss": 6.3775, "step": 46700 }, { "epoch": 103.73745494871083, "eval_loss": 6.380270957946777, "eval_runtime": 66.2442, "eval_samples_per_second": 150.957, "eval_steps_per_second": 18.87, "step": 46700 }, { "epoch": 103.95924591072914, "grad_norm": 0.2601492404937744, "learning_rate": 1.3244147157190635e-06, "loss": 6.3775, "step": 46800 }, { "epoch": 103.95924591072914, "eval_loss": 6.380533218383789, "eval_runtime": 66.2494, "eval_samples_per_second": 150.945, "eval_steps_per_second": 18.868, "step": 46800 }, { "epoch": 104.18103687274744, "grad_norm": 0.28285419940948486, "learning_rate": 1.31438127090301e-06, "loss": 6.3776, "step": 46900 }, { "epoch": 104.18103687274744, "eval_loss": 6.3801751136779785, "eval_runtime": 66.2411, "eval_samples_per_second": 150.964, "eval_steps_per_second": 18.87, "step": 46900 }, { "epoch": 104.40282783476573, "grad_norm": 0.4723234176635742, "learning_rate": 1.3043478260869566e-06, "loss": 6.376, "step": 47000 }, { "epoch": 104.40282783476573, "eval_loss": 6.379186153411865, "eval_runtime": 63.7279, "eval_samples_per_second": 156.917, "eval_steps_per_second": 19.615, "step": 47000 }, { "epoch": 104.62461879678403, "grad_norm": 0.3108322322368622, "learning_rate": 1.294314381270903e-06, "loss": 6.3773, "step": 47100 }, { "epoch": 104.62461879678403, "eval_loss": 6.3764142990112305, "eval_runtime": 63.7167, "eval_samples_per_second": 156.945, "eval_steps_per_second": 19.618, "step": 47100 }, { "epoch": 104.84640975880232, "grad_norm": 0.38544511795043945, "learning_rate": 1.2842809364548495e-06, "loss": 6.3773, "step": 47200 }, { "epoch": 104.84640975880232, "eval_loss": 6.379009246826172, "eval_runtime": 66.2408, "eval_samples_per_second": 150.964, "eval_steps_per_second": 18.871, "step": 47200 }, { "epoch": 105.06820072082063, "grad_norm": 0.2773985266685486, "learning_rate": 1.274247491638796e-06, "loss": 6.3772, "step": 47300 }, { "epoch": 105.06820072082063, "eval_loss": 6.3756842613220215, "eval_runtime": 63.6518, "eval_samples_per_second": 157.105, "eval_steps_per_second": 19.638, "step": 47300 }, { "epoch": 105.28999168283893, "grad_norm": 0.2765492796897888, "learning_rate": 1.2642140468227424e-06, "loss": 6.3764, "step": 47400 }, { "epoch": 105.28999168283893, "eval_loss": 6.377975940704346, "eval_runtime": 66.2572, "eval_samples_per_second": 150.927, "eval_steps_per_second": 18.866, "step": 47400 }, { "epoch": 105.51178264485722, "grad_norm": 0.30239638686180115, "learning_rate": 1.254180602006689e-06, "loss": 6.3761, "step": 47500 }, { "epoch": 105.51178264485722, "eval_loss": 6.379149436950684, "eval_runtime": 63.8068, "eval_samples_per_second": 156.723, "eval_steps_per_second": 19.59, "step": 47500 }, { "epoch": 105.73357360687552, "grad_norm": 0.22471874952316284, "learning_rate": 1.2441471571906355e-06, "loss": 6.3775, "step": 47600 }, { "epoch": 105.73357360687552, "eval_loss": 6.3783087730407715, "eval_runtime": 66.2436, "eval_samples_per_second": 150.958, "eval_steps_per_second": 18.87, "step": 47600 }, { "epoch": 105.95536456889381, "grad_norm": 0.23722052574157715, "learning_rate": 1.234113712374582e-06, "loss": 6.377, "step": 47700 }, { "epoch": 105.95536456889381, "eval_loss": 6.376536846160889, "eval_runtime": 63.6766, "eval_samples_per_second": 157.044, "eval_steps_per_second": 19.63, "step": 47700 }, { "epoch": 106.17715553091212, "grad_norm": 0.26499879360198975, "learning_rate": 1.2240802675585284e-06, "loss": 6.3758, "step": 47800 }, { "epoch": 106.17715553091212, "eval_loss": 6.380406856536865, "eval_runtime": 66.1835, "eval_samples_per_second": 151.095, "eval_steps_per_second": 18.887, "step": 47800 }, { "epoch": 106.3989464929304, "grad_norm": 0.32900717854499817, "learning_rate": 1.214046822742475e-06, "loss": 6.375, "step": 47900 }, { "epoch": 106.3989464929304, "eval_loss": 6.375906467437744, "eval_runtime": 63.8048, "eval_samples_per_second": 156.728, "eval_steps_per_second": 19.591, "step": 47900 }, { "epoch": 106.62073745494871, "grad_norm": 0.3241865932941437, "learning_rate": 1.2040133779264215e-06, "loss": 6.3792, "step": 48000 }, { "epoch": 106.62073745494871, "eval_loss": 6.37775993347168, "eval_runtime": 66.3426, "eval_samples_per_second": 150.733, "eval_steps_per_second": 18.842, "step": 48000 }, { "epoch": 106.84252841696701, "grad_norm": 0.3194703757762909, "learning_rate": 1.193979933110368e-06, "loss": 6.3766, "step": 48100 }, { "epoch": 106.84252841696701, "eval_loss": 6.37912654876709, "eval_runtime": 63.7236, "eval_samples_per_second": 156.928, "eval_steps_per_second": 19.616, "step": 48100 }, { "epoch": 107.0643193789853, "grad_norm": 0.25526002049446106, "learning_rate": 1.1839464882943144e-06, "loss": 6.3776, "step": 48200 }, { "epoch": 107.0643193789853, "eval_loss": 6.38245153427124, "eval_runtime": 66.2659, "eval_samples_per_second": 150.907, "eval_steps_per_second": 18.863, "step": 48200 }, { "epoch": 107.2861103410036, "grad_norm": 0.2747518718242645, "learning_rate": 1.173913043478261e-06, "loss": 6.3768, "step": 48300 }, { "epoch": 107.2861103410036, "eval_loss": 6.380572319030762, "eval_runtime": 63.8901, "eval_samples_per_second": 156.519, "eval_steps_per_second": 19.565, "step": 48300 }, { "epoch": 107.5079013030219, "grad_norm": 0.2569632828235626, "learning_rate": 1.1638795986622075e-06, "loss": 6.3764, "step": 48400 }, { "epoch": 107.5079013030219, "eval_loss": 6.380358695983887, "eval_runtime": 66.316, "eval_samples_per_second": 150.793, "eval_steps_per_second": 18.849, "step": 48400 }, { "epoch": 107.7296922650402, "grad_norm": 0.28270038962364197, "learning_rate": 1.153846153846154e-06, "loss": 6.3772, "step": 48500 }, { "epoch": 107.7296922650402, "eval_loss": 6.3787407875061035, "eval_runtime": 63.7582, "eval_samples_per_second": 156.842, "eval_steps_per_second": 19.605, "step": 48500 }, { "epoch": 107.9514832270585, "grad_norm": 0.35361409187316895, "learning_rate": 1.1438127090301004e-06, "loss": 6.3754, "step": 48600 }, { "epoch": 107.9514832270585, "eval_loss": 6.37959098815918, "eval_runtime": 63.698, "eval_samples_per_second": 156.991, "eval_steps_per_second": 19.624, "step": 48600 }, { "epoch": 108.17327418907679, "grad_norm": 0.2802847921848297, "learning_rate": 1.133779264214047e-06, "loss": 6.375, "step": 48700 }, { "epoch": 108.17327418907679, "eval_loss": 6.376708030700684, "eval_runtime": 66.263, "eval_samples_per_second": 150.914, "eval_steps_per_second": 18.864, "step": 48700 }, { "epoch": 108.3950651510951, "grad_norm": 0.3533788323402405, "learning_rate": 1.1237458193979933e-06, "loss": 6.3757, "step": 48800 }, { "epoch": 108.3950651510951, "eval_loss": 6.380278587341309, "eval_runtime": 67.3818, "eval_samples_per_second": 148.408, "eval_steps_per_second": 18.551, "step": 48800 }, { "epoch": 108.61685611311339, "grad_norm": 0.21207566559314728, "learning_rate": 1.1137123745819398e-06, "loss": 6.3776, "step": 48900 }, { "epoch": 108.61685611311339, "eval_loss": 6.375850200653076, "eval_runtime": 63.7895, "eval_samples_per_second": 156.766, "eval_steps_per_second": 19.596, "step": 48900 }, { "epoch": 108.83864707513169, "grad_norm": 0.33531099557876587, "learning_rate": 1.1036789297658862e-06, "loss": 6.3765, "step": 49000 }, { "epoch": 108.83864707513169, "eval_loss": 6.378798484802246, "eval_runtime": 63.7683, "eval_samples_per_second": 156.818, "eval_steps_per_second": 19.602, "step": 49000 }, { "epoch": 109.06043803714999, "grad_norm": 0.39727288484573364, "learning_rate": 1.0936454849498327e-06, "loss": 6.3774, "step": 49100 }, { "epoch": 109.06043803714999, "eval_loss": 6.379205703735352, "eval_runtime": 66.2384, "eval_samples_per_second": 150.97, "eval_steps_per_second": 18.871, "step": 49100 }, { "epoch": 109.28222899916828, "grad_norm": 0.3876926004886627, "learning_rate": 1.0836120401337793e-06, "loss": 6.3772, "step": 49200 }, { "epoch": 109.28222899916828, "eval_loss": 6.382777214050293, "eval_runtime": 63.7163, "eval_samples_per_second": 156.946, "eval_steps_per_second": 19.618, "step": 49200 }, { "epoch": 109.50401996118659, "grad_norm": 0.3268238604068756, "learning_rate": 1.0735785953177258e-06, "loss": 6.3765, "step": 49300 }, { "epoch": 109.50401996118659, "eval_loss": 6.378788471221924, "eval_runtime": 66.3254, "eval_samples_per_second": 150.772, "eval_steps_per_second": 18.846, "step": 49300 }, { "epoch": 109.72581092320488, "grad_norm": 0.24343077838420868, "learning_rate": 1.0635451505016722e-06, "loss": 6.3766, "step": 49400 }, { "epoch": 109.72581092320488, "eval_loss": 6.379393577575684, "eval_runtime": 63.7485, "eval_samples_per_second": 156.866, "eval_steps_per_second": 19.608, "step": 49400 }, { "epoch": 109.94760188522318, "grad_norm": 0.3532174229621887, "learning_rate": 1.0535117056856187e-06, "loss": 6.3762, "step": 49500 }, { "epoch": 109.94760188522318, "eval_loss": 6.383326530456543, "eval_runtime": 63.7304, "eval_samples_per_second": 156.911, "eval_steps_per_second": 19.614, "step": 49500 }, { "epoch": 110.16939284724147, "grad_norm": 0.28071361780166626, "learning_rate": 1.0434782608695653e-06, "loss": 6.3763, "step": 49600 }, { "epoch": 110.16939284724147, "eval_loss": 6.376327991485596, "eval_runtime": 66.248, "eval_samples_per_second": 150.948, "eval_steps_per_second": 18.868, "step": 49600 }, { "epoch": 110.39118380925977, "grad_norm": 0.3425652086734772, "learning_rate": 1.0334448160535118e-06, "loss": 6.3755, "step": 49700 }, { "epoch": 110.39118380925977, "eval_loss": 6.3802337646484375, "eval_runtime": 63.7015, "eval_samples_per_second": 156.982, "eval_steps_per_second": 19.623, "step": 49700 }, { "epoch": 110.61297477127808, "grad_norm": 0.22676917910575867, "learning_rate": 1.0234113712374581e-06, "loss": 6.3773, "step": 49800 }, { "epoch": 110.61297477127808, "eval_loss": 6.3807525634765625, "eval_runtime": 66.2796, "eval_samples_per_second": 150.876, "eval_steps_per_second": 18.86, "step": 49800 }, { "epoch": 110.83476573329636, "grad_norm": 0.25897106528282166, "learning_rate": 1.0133779264214047e-06, "loss": 6.3768, "step": 49900 }, { "epoch": 110.83476573329636, "eval_loss": 6.381240367889404, "eval_runtime": 63.8656, "eval_samples_per_second": 156.579, "eval_steps_per_second": 19.572, "step": 49900 }, { "epoch": 111.05655669531467, "grad_norm": 0.2521306574344635, "learning_rate": 1.0033444816053512e-06, "loss": 6.3748, "step": 50000 }, { "epoch": 111.05655669531467, "eval_loss": 6.379097938537598, "eval_runtime": 63.7336, "eval_samples_per_second": 156.903, "eval_steps_per_second": 19.613, "step": 50000 }, { "epoch": 111.27834765733296, "grad_norm": 0.32774215936660767, "learning_rate": 9.933110367892976e-07, "loss": 6.3777, "step": 50100 }, { "epoch": 111.27834765733296, "eval_loss": 6.379392147064209, "eval_runtime": 66.4051, "eval_samples_per_second": 150.591, "eval_steps_per_second": 18.824, "step": 50100 }, { "epoch": 111.50013861935126, "grad_norm": 0.23284611105918884, "learning_rate": 9.832775919732441e-07, "loss": 6.3746, "step": 50200 }, { "epoch": 111.50013861935126, "eval_loss": 6.377693176269531, "eval_runtime": 64.6002, "eval_samples_per_second": 154.798, "eval_steps_per_second": 19.35, "step": 50200 }, { "epoch": 111.72192958136957, "grad_norm": 0.2757164537906647, "learning_rate": 9.732441471571907e-07, "loss": 6.3743, "step": 50300 }, { "epoch": 111.72192958136957, "eval_loss": 6.38041877746582, "eval_runtime": 65.5393, "eval_samples_per_second": 152.58, "eval_steps_per_second": 19.073, "step": 50300 }, { "epoch": 111.94372054338785, "grad_norm": 0.326815128326416, "learning_rate": 9.632107023411372e-07, "loss": 6.3765, "step": 50400 }, { "epoch": 111.94372054338785, "eval_loss": 6.37969970703125, "eval_runtime": 63.7883, "eval_samples_per_second": 156.769, "eval_steps_per_second": 19.596, "step": 50400 }, { "epoch": 112.16551150540616, "grad_norm": 0.34073254466056824, "learning_rate": 9.531772575250837e-07, "loss": 6.3758, "step": 50500 }, { "epoch": 112.16551150540616, "eval_loss": 6.380171298980713, "eval_runtime": 66.2335, "eval_samples_per_second": 150.981, "eval_steps_per_second": 18.873, "step": 50500 }, { "epoch": 112.38730246742445, "grad_norm": 0.2289067655801773, "learning_rate": 9.431438127090301e-07, "loss": 6.3766, "step": 50600 }, { "epoch": 112.38730246742445, "eval_loss": 6.379415035247803, "eval_runtime": 63.6851, "eval_samples_per_second": 157.023, "eval_steps_per_second": 19.628, "step": 50600 }, { "epoch": 112.60909342944275, "grad_norm": 0.2386418581008911, "learning_rate": 9.331103678929767e-07, "loss": 6.375, "step": 50700 }, { "epoch": 112.60909342944275, "eval_loss": 6.375070571899414, "eval_runtime": 66.2164, "eval_samples_per_second": 151.02, "eval_steps_per_second": 18.878, "step": 50700 }, { "epoch": 112.83088439146105, "grad_norm": 0.26779764890670776, "learning_rate": 9.230769230769231e-07, "loss": 6.3754, "step": 50800 }, { "epoch": 112.83088439146105, "eval_loss": 6.377529621124268, "eval_runtime": 63.7216, "eval_samples_per_second": 156.933, "eval_steps_per_second": 19.617, "step": 50800 }, { "epoch": 113.05267535347934, "grad_norm": 0.2792610228061676, "learning_rate": 9.130434782608697e-07, "loss": 6.3768, "step": 50900 }, { "epoch": 113.05267535347934, "eval_loss": 6.376430988311768, "eval_runtime": 66.1841, "eval_samples_per_second": 151.094, "eval_steps_per_second": 18.887, "step": 50900 }, { "epoch": 113.27446631549765, "grad_norm": 0.26424017548561096, "learning_rate": 9.030100334448161e-07, "loss": 6.3748, "step": 51000 }, { "epoch": 113.27446631549765, "eval_loss": 6.37862491607666, "eval_runtime": 63.7419, "eval_samples_per_second": 156.883, "eval_steps_per_second": 19.61, "step": 51000 }, { "epoch": 113.49625727751594, "grad_norm": 0.26083120703697205, "learning_rate": 8.929765886287627e-07, "loss": 6.3779, "step": 51100 }, { "epoch": 113.49625727751594, "eval_loss": 6.379500389099121, "eval_runtime": 66.2253, "eval_samples_per_second": 151.0, "eval_steps_per_second": 18.875, "step": 51100 }, { "epoch": 113.77626836706405, "grad_norm": 0.25904449820518494, "learning_rate": 8.829431438127091e-07, "loss": 6.3757, "step": 51200 }, { "epoch": 113.77626836706405, "eval_loss": 6.375171661376953, "eval_runtime": 66.093, "eval_samples_per_second": 151.302, "eval_steps_per_second": 18.913, "step": 51200 }, { "epoch": 113.99805932908234, "grad_norm": 0.2680477499961853, "learning_rate": 8.729096989966555e-07, "loss": 6.3769, "step": 51300 }, { "epoch": 113.99805932908234, "eval_loss": 6.376518726348877, "eval_runtime": 63.5204, "eval_samples_per_second": 157.43, "eval_steps_per_second": 19.679, "step": 51300 }, { "epoch": 114.21985029110064, "grad_norm": 0.30891552567481995, "learning_rate": 8.628762541806019e-07, "loss": 6.3752, "step": 51400 }, { "epoch": 114.21985029110064, "eval_loss": 6.377015590667725, "eval_runtime": 63.4143, "eval_samples_per_second": 157.693, "eval_steps_per_second": 19.712, "step": 51400 }, { "epoch": 114.44164125311893, "grad_norm": 0.32155531644821167, "learning_rate": 8.528428093645485e-07, "loss": 6.3767, "step": 51500 }, { "epoch": 114.44164125311893, "eval_loss": 6.377589702606201, "eval_runtime": 66.1364, "eval_samples_per_second": 151.203, "eval_steps_per_second": 18.9, "step": 51500 }, { "epoch": 114.66343221513723, "grad_norm": 0.28316569328308105, "learning_rate": 8.428093645484949e-07, "loss": 6.3755, "step": 51600 }, { "epoch": 114.66343221513723, "eval_loss": 6.3766303062438965, "eval_runtime": 65.9296, "eval_samples_per_second": 151.677, "eval_steps_per_second": 18.96, "step": 51600 }, { "epoch": 114.88522317715552, "grad_norm": 0.24125680327415466, "learning_rate": 8.327759197324414e-07, "loss": 6.3773, "step": 51700 }, { "epoch": 114.88522317715552, "eval_loss": 6.37697172164917, "eval_runtime": 65.9478, "eval_samples_per_second": 151.635, "eval_steps_per_second": 18.954, "step": 51700 }, { "epoch": 115.10701413917383, "grad_norm": 0.21407043933868408, "learning_rate": 8.227424749163879e-07, "loss": 6.3751, "step": 51800 }, { "epoch": 115.10701413917383, "eval_loss": 6.377639293670654, "eval_runtime": 63.6016, "eval_samples_per_second": 157.229, "eval_steps_per_second": 19.654, "step": 51800 }, { "epoch": 115.32880510119213, "grad_norm": 0.23014885187149048, "learning_rate": 8.127090301003344e-07, "loss": 6.3771, "step": 51900 }, { "epoch": 115.32880510119213, "eval_loss": 6.380842208862305, "eval_runtime": 63.4674, "eval_samples_per_second": 157.561, "eval_steps_per_second": 19.695, "step": 51900 }, { "epoch": 115.55059606321042, "grad_norm": 0.2553617060184479, "learning_rate": 8.026755852842809e-07, "loss": 6.3752, "step": 52000 }, { "epoch": 115.55059606321042, "eval_loss": 6.377804756164551, "eval_runtime": 64.2492, "eval_samples_per_second": 155.644, "eval_steps_per_second": 19.456, "step": 52000 }, { "epoch": 115.77238702522872, "grad_norm": 0.32242822647094727, "learning_rate": 7.926421404682274e-07, "loss": 6.3762, "step": 52100 }, { "epoch": 115.77238702522872, "eval_loss": 6.382247447967529, "eval_runtime": 65.2652, "eval_samples_per_second": 153.221, "eval_steps_per_second": 19.153, "step": 52100 }, { "epoch": 115.99417798724701, "grad_norm": 0.25089436769485474, "learning_rate": 7.826086956521739e-07, "loss": 6.3757, "step": 52200 }, { "epoch": 115.99417798724701, "eval_loss": 6.379915714263916, "eval_runtime": 63.3864, "eval_samples_per_second": 157.763, "eval_steps_per_second": 19.72, "step": 52200 }, { "epoch": 116.21596894926532, "grad_norm": 0.24113717675209045, "learning_rate": 7.725752508361204e-07, "loss": 6.3761, "step": 52300 }, { "epoch": 116.21596894926532, "eval_loss": 6.376662731170654, "eval_runtime": 63.5361, "eval_samples_per_second": 157.391, "eval_steps_per_second": 19.674, "step": 52300 }, { "epoch": 116.43775991128362, "grad_norm": 0.3414776027202606, "learning_rate": 7.625418060200669e-07, "loss": 6.3757, "step": 52400 }, { "epoch": 116.43775991128362, "eval_loss": 6.377313137054443, "eval_runtime": 63.5522, "eval_samples_per_second": 157.351, "eval_steps_per_second": 19.669, "step": 52400 }, { "epoch": 116.65955087330191, "grad_norm": 0.24650247395038605, "learning_rate": 7.525083612040134e-07, "loss": 6.3754, "step": 52500 }, { "epoch": 116.65955087330191, "eval_loss": 6.37901496887207, "eval_runtime": 65.8363, "eval_samples_per_second": 151.892, "eval_steps_per_second": 18.986, "step": 52500 }, { "epoch": 116.88134183532021, "grad_norm": 0.27944493293762207, "learning_rate": 7.424749163879599e-07, "loss": 6.3776, "step": 52600 }, { "epoch": 116.88134183532021, "eval_loss": 6.376550197601318, "eval_runtime": 63.5812, "eval_samples_per_second": 157.279, "eval_steps_per_second": 19.66, "step": 52600 }, { "epoch": 117.1031327973385, "grad_norm": 0.2298879325389862, "learning_rate": 7.324414715719064e-07, "loss": 6.3751, "step": 52700 }, { "epoch": 117.1031327973385, "eval_loss": 6.377909183502197, "eval_runtime": 63.3541, "eval_samples_per_second": 157.843, "eval_steps_per_second": 19.73, "step": 52700 }, { "epoch": 117.3249237593568, "grad_norm": 0.25682932138442993, "learning_rate": 7.224080267558529e-07, "loss": 6.3757, "step": 52800 }, { "epoch": 117.3249237593568, "eval_loss": 6.378458023071289, "eval_runtime": 65.7985, "eval_samples_per_second": 151.979, "eval_steps_per_second": 18.997, "step": 52800 }, { "epoch": 117.54671472137511, "grad_norm": 0.2633031904697418, "learning_rate": 7.123745819397994e-07, "loss": 6.3767, "step": 52900 }, { "epoch": 117.54671472137511, "eval_loss": 6.380926132202148, "eval_runtime": 63.5491, "eval_samples_per_second": 157.359, "eval_steps_per_second": 19.67, "step": 52900 }, { "epoch": 117.7685056833934, "grad_norm": 0.26749059557914734, "learning_rate": 7.023411371237459e-07, "loss": 6.3767, "step": 53000 }, { "epoch": 117.7685056833934, "eval_loss": 6.381775856018066, "eval_runtime": 63.4542, "eval_samples_per_second": 157.594, "eval_steps_per_second": 19.699, "step": 53000 }, { "epoch": 117.9902966454117, "grad_norm": 0.22249187529087067, "learning_rate": 6.923076923076923e-07, "loss": 6.377, "step": 53100 }, { "epoch": 117.9902966454117, "eval_loss": 6.38169002532959, "eval_runtime": 63.5488, "eval_samples_per_second": 157.359, "eval_steps_per_second": 19.67, "step": 53100 }, { "epoch": 118.21208760742999, "grad_norm": 0.22224722802639008, "learning_rate": 6.822742474916388e-07, "loss": 6.3764, "step": 53200 }, { "epoch": 118.21208760742999, "eval_loss": 6.37975549697876, "eval_runtime": 65.9614, "eval_samples_per_second": 151.604, "eval_steps_per_second": 18.95, "step": 53200 }, { "epoch": 118.4338785694483, "grad_norm": 0.2897886037826538, "learning_rate": 6.722408026755853e-07, "loss": 6.3737, "step": 53300 }, { "epoch": 118.4338785694483, "eval_loss": 6.376906394958496, "eval_runtime": 63.536, "eval_samples_per_second": 157.391, "eval_steps_per_second": 19.674, "step": 53300 }, { "epoch": 118.65566953146659, "grad_norm": 0.2731805145740509, "learning_rate": 6.622073578595318e-07, "loss": 6.3774, "step": 53400 }, { "epoch": 118.65566953146659, "eval_loss": 6.377748489379883, "eval_runtime": 63.612, "eval_samples_per_second": 157.203, "eval_steps_per_second": 19.65, "step": 53400 }, { "epoch": 118.87746049348489, "grad_norm": 0.22697260975837708, "learning_rate": 6.521739130434783e-07, "loss": 6.3767, "step": 53500 }, { "epoch": 118.87746049348489, "eval_loss": 6.381230354309082, "eval_runtime": 65.9156, "eval_samples_per_second": 151.709, "eval_steps_per_second": 18.964, "step": 53500 }, { "epoch": 119.0992514555032, "grad_norm": 0.30966779589653015, "learning_rate": 6.421404682274248e-07, "loss": 6.376, "step": 53600 }, { "epoch": 119.0992514555032, "eval_loss": 6.37573766708374, "eval_runtime": 63.3841, "eval_samples_per_second": 157.768, "eval_steps_per_second": 19.721, "step": 53600 }, { "epoch": 119.32104241752148, "grad_norm": 0.2676733136177063, "learning_rate": 6.321070234113712e-07, "loss": 6.3759, "step": 53700 }, { "epoch": 119.32104241752148, "eval_loss": 6.374691963195801, "eval_runtime": 63.4737, "eval_samples_per_second": 157.545, "eval_steps_per_second": 19.693, "step": 53700 }, { "epoch": 119.54283337953979, "grad_norm": 0.2713070213794708, "learning_rate": 6.220735785953178e-07, "loss": 6.3768, "step": 53800 }, { "epoch": 119.54283337953979, "eval_loss": 6.378169059753418, "eval_runtime": 65.7452, "eval_samples_per_second": 152.102, "eval_steps_per_second": 19.013, "step": 53800 }, { "epoch": 119.76462434155808, "grad_norm": 0.2583908140659332, "learning_rate": 6.120401337792642e-07, "loss": 6.3756, "step": 53900 }, { "epoch": 119.76462434155808, "eval_loss": 6.380895137786865, "eval_runtime": 63.7905, "eval_samples_per_second": 156.763, "eval_steps_per_second": 19.595, "step": 53900 }, { "epoch": 119.98641530357638, "grad_norm": 0.2636660933494568, "learning_rate": 6.020066889632107e-07, "loss": 6.376, "step": 54000 }, { "epoch": 119.98641530357638, "eval_loss": 6.378993034362793, "eval_runtime": 63.4987, "eval_samples_per_second": 157.483, "eval_steps_per_second": 19.685, "step": 54000 }, { "epoch": 120.20820626559468, "grad_norm": 0.2040402740240097, "learning_rate": 5.919732441471572e-07, "loss": 6.3742, "step": 54100 }, { "epoch": 120.20820626559468, "eval_loss": 6.379099369049072, "eval_runtime": 63.4641, "eval_samples_per_second": 157.569, "eval_steps_per_second": 19.696, "step": 54100 }, { "epoch": 120.42999722761297, "grad_norm": 0.2771637439727783, "learning_rate": 5.819397993311037e-07, "loss": 6.377, "step": 54200 }, { "epoch": 120.42999722761297, "eval_loss": 6.380918025970459, "eval_runtime": 63.5735, "eval_samples_per_second": 157.298, "eval_steps_per_second": 19.662, "step": 54200 }, { "epoch": 120.65178818963128, "grad_norm": 0.2907504141330719, "learning_rate": 5.719063545150502e-07, "loss": 6.3771, "step": 54300 }, { "epoch": 120.65178818963128, "eval_loss": 6.379312515258789, "eval_runtime": 65.9548, "eval_samples_per_second": 151.619, "eval_steps_per_second": 18.952, "step": 54300 }, { "epoch": 120.87357915164957, "grad_norm": 0.30987074971199036, "learning_rate": 5.618729096989966e-07, "loss": 6.3768, "step": 54400 }, { "epoch": 120.87357915164957, "eval_loss": 6.37892484664917, "eval_runtime": 63.4967, "eval_samples_per_second": 157.488, "eval_steps_per_second": 19.686, "step": 54400 }, { "epoch": 121.09537011366787, "grad_norm": 0.3270675837993622, "learning_rate": 5.518394648829431e-07, "loss": 6.376, "step": 54500 }, { "epoch": 121.09537011366787, "eval_loss": 6.377264976501465, "eval_runtime": 63.4405, "eval_samples_per_second": 157.628, "eval_steps_per_second": 19.704, "step": 54500 }, { "epoch": 121.31716107568617, "grad_norm": 0.23159100115299225, "learning_rate": 5.418060200668896e-07, "loss": 6.3773, "step": 54600 }, { "epoch": 121.31716107568617, "eval_loss": 6.379176616668701, "eval_runtime": 66.0387, "eval_samples_per_second": 151.426, "eval_steps_per_second": 18.928, "step": 54600 }, { "epoch": 121.53895203770446, "grad_norm": 0.231267511844635, "learning_rate": 5.317725752508361e-07, "loss": 6.3773, "step": 54700 }, { "epoch": 121.53895203770446, "eval_loss": 6.376558780670166, "eval_runtime": 63.5938, "eval_samples_per_second": 157.248, "eval_steps_per_second": 19.656, "step": 54700 }, { "epoch": 121.76074299972277, "grad_norm": 0.24276390671730042, "learning_rate": 5.217391304347826e-07, "loss": 6.3754, "step": 54800 }, { "epoch": 121.76074299972277, "eval_loss": 6.378441333770752, "eval_runtime": 63.5257, "eval_samples_per_second": 157.417, "eval_steps_per_second": 19.677, "step": 54800 }, { "epoch": 121.98253396174105, "grad_norm": 0.26921290159225464, "learning_rate": 5.117056856187291e-07, "loss": 6.3751, "step": 54900 }, { "epoch": 121.98253396174105, "eval_loss": 6.378532886505127, "eval_runtime": 63.5535, "eval_samples_per_second": 157.348, "eval_steps_per_second": 19.668, "step": 54900 }, { "epoch": 122.20432492375936, "grad_norm": 0.274029016494751, "learning_rate": 5.016722408026756e-07, "loss": 6.376, "step": 55000 }, { "epoch": 122.20432492375936, "eval_loss": 6.378449440002441, "eval_runtime": 65.8768, "eval_samples_per_second": 151.798, "eval_steps_per_second": 18.975, "step": 55000 }, { "epoch": 122.42611588577765, "grad_norm": 0.27585527300834656, "learning_rate": 4.916387959866221e-07, "loss": 6.376, "step": 55100 }, { "epoch": 122.42611588577765, "eval_loss": 6.37809944152832, "eval_runtime": 63.5221, "eval_samples_per_second": 157.426, "eval_steps_per_second": 19.678, "step": 55100 }, { "epoch": 122.64790684779595, "grad_norm": 0.2652019262313843, "learning_rate": 4.816053511705686e-07, "loss": 6.3753, "step": 55200 }, { "epoch": 122.64790684779595, "eval_loss": 6.38352632522583, "eval_runtime": 63.4719, "eval_samples_per_second": 157.55, "eval_steps_per_second": 19.694, "step": 55200 }, { "epoch": 122.86969780981426, "grad_norm": 0.24283932149410248, "learning_rate": 4.7157190635451506e-07, "loss": 6.3761, "step": 55300 }, { "epoch": 122.86969780981426, "eval_loss": 6.376107215881348, "eval_runtime": 63.547, "eval_samples_per_second": 157.364, "eval_steps_per_second": 19.67, "step": 55300 }, { "epoch": 123.09148877183254, "grad_norm": 0.29150310158729553, "learning_rate": 4.6153846153846156e-07, "loss": 6.3765, "step": 55400 }, { "epoch": 123.09148877183254, "eval_loss": 6.37521505355835, "eval_runtime": 65.9064, "eval_samples_per_second": 151.73, "eval_steps_per_second": 18.966, "step": 55400 }, { "epoch": 123.31327973385085, "grad_norm": 0.28435659408569336, "learning_rate": 4.5150501672240806e-07, "loss": 6.3757, "step": 55500 }, { "epoch": 123.31327973385085, "eval_loss": 6.378593921661377, "eval_runtime": 63.5654, "eval_samples_per_second": 157.318, "eval_steps_per_second": 19.665, "step": 55500 }, { "epoch": 123.53507069586914, "grad_norm": 0.2412547916173935, "learning_rate": 4.4147157190635456e-07, "loss": 6.3757, "step": 55600 }, { "epoch": 123.53507069586914, "eval_loss": 6.377431869506836, "eval_runtime": 66.0043, "eval_samples_per_second": 151.505, "eval_steps_per_second": 18.938, "step": 55600 }, { "epoch": 123.75686165788744, "grad_norm": 0.21835213899612427, "learning_rate": 4.3143812709030095e-07, "loss": 6.3763, "step": 55700 }, { "epoch": 123.75686165788744, "eval_loss": 6.378489971160889, "eval_runtime": 63.7489, "eval_samples_per_second": 156.865, "eval_steps_per_second": 19.608, "step": 55700 }, { "epoch": 123.97865261990574, "grad_norm": 0.18911224603652954, "learning_rate": 4.2140468227424745e-07, "loss": 6.3754, "step": 55800 }, { "epoch": 123.97865261990574, "eval_loss": 6.379303932189941, "eval_runtime": 66.1257, "eval_samples_per_second": 151.227, "eval_steps_per_second": 18.903, "step": 55800 }, { "epoch": 124.20044358192403, "grad_norm": 0.283447265625, "learning_rate": 4.1137123745819395e-07, "loss": 6.3743, "step": 55900 }, { "epoch": 124.20044358192403, "eval_loss": 6.381599426269531, "eval_runtime": 63.605, "eval_samples_per_second": 157.22, "eval_steps_per_second": 19.653, "step": 55900 }, { "epoch": 124.42223454394234, "grad_norm": 0.1898406594991684, "learning_rate": 4.0133779264214045e-07, "loss": 6.3755, "step": 56000 }, { "epoch": 124.42223454394234, "eval_loss": 6.376759052276611, "eval_runtime": 64.3574, "eval_samples_per_second": 155.382, "eval_steps_per_second": 19.423, "step": 56000 }, { "epoch": 124.64402550596063, "grad_norm": 0.2740555703639984, "learning_rate": 3.9130434782608694e-07, "loss": 6.3767, "step": 56100 }, { "epoch": 124.64402550596063, "eval_loss": 6.377686023712158, "eval_runtime": 65.4964, "eval_samples_per_second": 152.68, "eval_steps_per_second": 19.085, "step": 56100 }, { "epoch": 124.86581646797893, "grad_norm": 0.24969562888145447, "learning_rate": 3.8127090301003344e-07, "loss": 6.3749, "step": 56200 }, { "epoch": 124.86581646797893, "eval_loss": 6.3803300857543945, "eval_runtime": 63.6262, "eval_samples_per_second": 157.168, "eval_steps_per_second": 19.646, "step": 56200 }, { "epoch": 125.08760742999723, "grad_norm": 0.271085649728775, "learning_rate": 3.7123745819397994e-07, "loss": 6.3761, "step": 56300 }, { "epoch": 125.08760742999723, "eval_loss": 6.377999782562256, "eval_runtime": 63.5511, "eval_samples_per_second": 157.354, "eval_steps_per_second": 19.669, "step": 56300 }, { "epoch": 125.30939839201552, "grad_norm": 0.2341337651014328, "learning_rate": 3.6120401337792644e-07, "loss": 6.3787, "step": 56400 }, { "epoch": 125.30939839201552, "eval_loss": 6.377155780792236, "eval_runtime": 66.011, "eval_samples_per_second": 151.49, "eval_steps_per_second": 18.936, "step": 56400 }, { "epoch": 125.53118935403383, "grad_norm": 0.2656327784061432, "learning_rate": 3.5117056856187294e-07, "loss": 6.3742, "step": 56500 }, { "epoch": 125.53118935403383, "eval_loss": 6.378920078277588, "eval_runtime": 63.6517, "eval_samples_per_second": 157.105, "eval_steps_per_second": 19.638, "step": 56500 }, { "epoch": 125.75298031605212, "grad_norm": 0.261843204498291, "learning_rate": 3.411371237458194e-07, "loss": 6.3742, "step": 56600 }, { "epoch": 125.75298031605212, "eval_loss": 6.376353740692139, "eval_runtime": 65.896, "eval_samples_per_second": 151.754, "eval_steps_per_second": 18.969, "step": 56600 }, { "epoch": 125.97477127807042, "grad_norm": 0.27163127064704895, "learning_rate": 3.311036789297659e-07, "loss": 6.3765, "step": 56700 }, { "epoch": 125.97477127807042, "eval_loss": 6.3804826736450195, "eval_runtime": 63.514, "eval_samples_per_second": 157.446, "eval_steps_per_second": 19.681, "step": 56700 }, { "epoch": 126.19656224008871, "grad_norm": 0.2797481417655945, "learning_rate": 3.210702341137124e-07, "loss": 6.3764, "step": 56800 }, { "epoch": 126.19656224008871, "eval_loss": 6.378259658813477, "eval_runtime": 63.4475, "eval_samples_per_second": 157.611, "eval_steps_per_second": 19.701, "step": 56800 }, { "epoch": 126.41835320210701, "grad_norm": 0.21093739569187164, "learning_rate": 3.110367892976589e-07, "loss": 6.3764, "step": 56900 }, { "epoch": 126.41835320210701, "eval_loss": 6.378982067108154, "eval_runtime": 66.045, "eval_samples_per_second": 151.412, "eval_steps_per_second": 18.927, "step": 56900 }, { "epoch": 126.64014416412532, "grad_norm": 0.268632173538208, "learning_rate": 3.010033444816054e-07, "loss": 6.3762, "step": 57000 }, { "epoch": 126.64014416412532, "eval_loss": 6.379413604736328, "eval_runtime": 63.641, "eval_samples_per_second": 157.131, "eval_steps_per_second": 19.641, "step": 57000 }, { "epoch": 126.8619351261436, "grad_norm": 0.2878783047199249, "learning_rate": 2.9096989966555187e-07, "loss": 6.376, "step": 57100 }, { "epoch": 126.8619351261436, "eval_loss": 6.378924369812012, "eval_runtime": 66.1831, "eval_samples_per_second": 151.096, "eval_steps_per_second": 18.887, "step": 57100 }, { "epoch": 127.08372608816191, "grad_norm": 0.2618252635002136, "learning_rate": 2.809364548494983e-07, "loss": 6.3768, "step": 57200 }, { "epoch": 127.08372608816191, "eval_loss": 6.37802267074585, "eval_runtime": 63.5424, "eval_samples_per_second": 157.375, "eval_steps_per_second": 19.672, "step": 57200 }, { "epoch": 127.3055170501802, "grad_norm": 0.20790652930736542, "learning_rate": 2.709030100334448e-07, "loss": 6.3763, "step": 57300 }, { "epoch": 127.3055170501802, "eval_loss": 6.377635955810547, "eval_runtime": 66.2394, "eval_samples_per_second": 150.967, "eval_steps_per_second": 18.871, "step": 57300 }, { "epoch": 127.5273080121985, "grad_norm": 0.23446954786777496, "learning_rate": 2.608695652173913e-07, "loss": 6.3758, "step": 57400 }, { "epoch": 127.5273080121985, "eval_loss": 6.378016471862793, "eval_runtime": 63.7187, "eval_samples_per_second": 156.94, "eval_steps_per_second": 19.617, "step": 57400 }, { "epoch": 127.7490989742168, "grad_norm": 0.2730012536048889, "learning_rate": 2.508361204013378e-07, "loss": 6.3771, "step": 57500 }, { "epoch": 127.7490989742168, "eval_loss": 6.378283500671387, "eval_runtime": 66.0326, "eval_samples_per_second": 151.44, "eval_steps_per_second": 18.93, "step": 57500 }, { "epoch": 127.9708899362351, "grad_norm": 0.19740967452526093, "learning_rate": 2.408026755852843e-07, "loss": 6.3754, "step": 57600 }, { "epoch": 127.9708899362351, "eval_loss": 6.377573490142822, "eval_runtime": 68.5433, "eval_samples_per_second": 145.893, "eval_steps_per_second": 18.237, "step": 57600 }, { "epoch": 128.1926808982534, "grad_norm": 0.20099857449531555, "learning_rate": 2.3076923076923078e-07, "loss": 6.3763, "step": 57700 }, { "epoch": 128.1926808982534, "eval_loss": 6.380809783935547, "eval_runtime": 63.6372, "eval_samples_per_second": 157.141, "eval_steps_per_second": 19.643, "step": 57700 }, { "epoch": 128.4144718602717, "grad_norm": 0.26378223299980164, "learning_rate": 2.2073578595317728e-07, "loss": 6.3742, "step": 57800 }, { "epoch": 128.4144718602717, "eval_loss": 6.377455234527588, "eval_runtime": 63.6147, "eval_samples_per_second": 157.196, "eval_steps_per_second": 19.65, "step": 57800 }, { "epoch": 128.63626282228998, "grad_norm": 0.22778332233428955, "learning_rate": 2.1070234113712372e-07, "loss": 6.3757, "step": 57900 }, { "epoch": 128.63626282228998, "eval_loss": 6.376725196838379, "eval_runtime": 63.6324, "eval_samples_per_second": 157.153, "eval_steps_per_second": 19.644, "step": 57900 }, { "epoch": 128.85805378430828, "grad_norm": 0.25024932622909546, "learning_rate": 2.0066889632107022e-07, "loss": 6.3767, "step": 58000 }, { "epoch": 128.85805378430828, "eval_loss": 6.378956317901611, "eval_runtime": 66.0444, "eval_samples_per_second": 151.413, "eval_steps_per_second": 18.927, "step": 58000 }, { "epoch": 129.0798447463266, "grad_norm": 0.22629129886627197, "learning_rate": 1.9063545150501672e-07, "loss": 6.3751, "step": 58100 }, { "epoch": 129.0798447463266, "eval_loss": 6.378350734710693, "eval_runtime": 63.6424, "eval_samples_per_second": 157.128, "eval_steps_per_second": 19.641, "step": 58100 }, { "epoch": 129.3016357083449, "grad_norm": 0.22958730161190033, "learning_rate": 1.8060200668896322e-07, "loss": 6.3754, "step": 58200 }, { "epoch": 129.3016357083449, "eval_loss": 6.379317760467529, "eval_runtime": 66.1349, "eval_samples_per_second": 151.206, "eval_steps_per_second": 18.901, "step": 58200 }, { "epoch": 129.5234266703632, "grad_norm": 0.29147765040397644, "learning_rate": 1.705685618729097e-07, "loss": 6.3766, "step": 58300 }, { "epoch": 129.5234266703632, "eval_loss": 6.379565238952637, "eval_runtime": 63.6308, "eval_samples_per_second": 157.157, "eval_steps_per_second": 19.645, "step": 58300 }, { "epoch": 129.74521763238147, "grad_norm": 0.2274588942527771, "learning_rate": 1.605351170568562e-07, "loss": 6.3766, "step": 58400 }, { "epoch": 129.74521763238147, "eval_loss": 6.378822326660156, "eval_runtime": 63.7248, "eval_samples_per_second": 156.925, "eval_steps_per_second": 19.616, "step": 58400 }, { "epoch": 129.96700859439977, "grad_norm": 0.27082857489585876, "learning_rate": 1.505016722408027e-07, "loss": 6.3762, "step": 58500 }, { "epoch": 129.96700859439977, "eval_loss": 6.376942157745361, "eval_runtime": 66.2694, "eval_samples_per_second": 150.899, "eval_steps_per_second": 18.862, "step": 58500 }, { "epoch": 130.18879955641808, "grad_norm": 0.2117777317762375, "learning_rate": 1.4046822742474916e-07, "loss": 6.3756, "step": 58600 }, { "epoch": 130.18879955641808, "eval_loss": 6.381185054779053, "eval_runtime": 63.6203, "eval_samples_per_second": 157.183, "eval_steps_per_second": 19.648, "step": 58600 }, { "epoch": 130.41059051843638, "grad_norm": 0.244340181350708, "learning_rate": 1.3043478260869566e-07, "loss": 6.3746, "step": 58700 }, { "epoch": 130.41059051843638, "eval_loss": 6.378442764282227, "eval_runtime": 63.6467, "eval_samples_per_second": 157.117, "eval_steps_per_second": 19.64, "step": 58700 }, { "epoch": 130.63238148045468, "grad_norm": 0.23617205023765564, "learning_rate": 1.2040133779264215e-07, "loss": 6.3759, "step": 58800 }, { "epoch": 130.63238148045468, "eval_loss": 6.377311706542969, "eval_runtime": 66.2898, "eval_samples_per_second": 150.853, "eval_steps_per_second": 18.857, "step": 58800 }, { "epoch": 130.85417244247296, "grad_norm": 0.22402510046958923, "learning_rate": 1.1036789297658864e-07, "loss": 6.3766, "step": 58900 }, { "epoch": 130.85417244247296, "eval_loss": 6.378325939178467, "eval_runtime": 63.7783, "eval_samples_per_second": 156.793, "eval_steps_per_second": 19.599, "step": 58900 }, { "epoch": 131.07596340449126, "grad_norm": 0.22382721304893494, "learning_rate": 1.0033444816053511e-07, "loss": 6.377, "step": 59000 }, { "epoch": 131.07596340449126, "eval_loss": 6.375909328460693, "eval_runtime": 63.6862, "eval_samples_per_second": 157.02, "eval_steps_per_second": 19.627, "step": 59000 }, { "epoch": 131.29775436650957, "grad_norm": 0.2319914549589157, "learning_rate": 9.030100334448161e-08, "loss": 6.3759, "step": 59100 }, { "epoch": 131.29775436650957, "eval_loss": 6.380961894989014, "eval_runtime": 63.73, "eval_samples_per_second": 156.912, "eval_steps_per_second": 19.614, "step": 59100 }, { "epoch": 131.51954532852787, "grad_norm": 0.27138957381248474, "learning_rate": 8.02675585284281e-08, "loss": 6.3765, "step": 59200 }, { "epoch": 131.51954532852787, "eval_loss": 6.378270626068115, "eval_runtime": 66.164, "eval_samples_per_second": 151.14, "eval_steps_per_second": 18.892, "step": 59200 }, { "epoch": 131.74133629054617, "grad_norm": 0.24163523316383362, "learning_rate": 7.023411371237458e-08, "loss": 6.3758, "step": 59300 }, { "epoch": 131.74133629054617, "eval_loss": 6.379899024963379, "eval_runtime": 66.2406, "eval_samples_per_second": 150.965, "eval_steps_per_second": 18.871, "step": 59300 }, { "epoch": 131.96312725256445, "grad_norm": 0.20410296320915222, "learning_rate": 6.020066889632108e-08, "loss": 6.3753, "step": 59400 }, { "epoch": 131.96312725256445, "eval_loss": 6.378077983856201, "eval_runtime": 63.7013, "eval_samples_per_second": 156.983, "eval_steps_per_second": 19.623, "step": 59400 }, { "epoch": 132.18491821458275, "grad_norm": 0.15991632640361786, "learning_rate": 5.0167224080267556e-08, "loss": 6.3762, "step": 59500 }, { "epoch": 132.18491821458275, "eval_loss": 6.379003524780273, "eval_runtime": 63.6773, "eval_samples_per_second": 157.042, "eval_steps_per_second": 19.63, "step": 59500 }, { "epoch": 132.40670917660105, "grad_norm": 0.2014060765504837, "learning_rate": 4.013377926421405e-08, "loss": 6.3734, "step": 59600 }, { "epoch": 132.40670917660105, "eval_loss": 6.377279758453369, "eval_runtime": 64.9426, "eval_samples_per_second": 153.982, "eval_steps_per_second": 19.248, "step": 59600 }, { "epoch": 132.62850013861936, "grad_norm": 0.23493210971355438, "learning_rate": 3.010033444816054e-08, "loss": 6.3767, "step": 59700 }, { "epoch": 132.62850013861936, "eval_loss": 6.378801345825195, "eval_runtime": 65.0941, "eval_samples_per_second": 153.624, "eval_steps_per_second": 19.203, "step": 59700 }, { "epoch": 132.85029110063766, "grad_norm": 0.2207670956850052, "learning_rate": 2.0066889632107024e-08, "loss": 6.3764, "step": 59800 }, { "epoch": 132.85029110063766, "eval_loss": 6.377054691314697, "eval_runtime": 63.7133, "eval_samples_per_second": 156.953, "eval_steps_per_second": 19.619, "step": 59800 }, { "epoch": 133.07208206265594, "grad_norm": 0.21483196318149567, "learning_rate": 1.0033444816053512e-08, "loss": 6.3763, "step": 59900 }, { "epoch": 133.07208206265594, "eval_loss": 6.3776984214782715, "eval_runtime": 63.6217, "eval_samples_per_second": 157.179, "eval_steps_per_second": 19.647, "step": 59900 }, { "epoch": 133.29387302467424, "grad_norm": 0.1953832507133484, "learning_rate": 0.0, "loss": 6.3751, "step": 60000 }, { "epoch": 133.29387302467424, "eval_loss": 6.377795219421387, "eval_runtime": 66.2186, "eval_samples_per_second": 151.015, "eval_steps_per_second": 18.877, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 134, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 10 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.020754951164035e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }