| { |
| "best_metric": 6.374266624450684, |
| "best_model_checkpoint": "learning_source_20260316/genome_sequence/bert-output/genome_sequence-small/checkpoint-46600", |
| "epoch": 133.29387302467424, |
| "eval_steps": 100, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.22179096201829776, |
| "grad_norm": 0.9666945338249207, |
| "learning_rate": 3e-06, |
| "loss": 8.2198, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.22179096201829776, |
| "eval_loss": 7.910184383392334, |
| "eval_runtime": 100.9607, |
| "eval_samples_per_second": 99.048, |
| "eval_steps_per_second": 99.048, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4435819240365955, |
| "grad_norm": 0.7436413764953613, |
| "learning_rate": 6e-06, |
| "loss": 7.7448, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4435819240365955, |
| "eval_loss": 7.522714138031006, |
| "eval_runtime": 101.1239, |
| "eval_samples_per_second": 98.889, |
| "eval_steps_per_second": 98.889, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6653728860548933, |
| "grad_norm": 0.5597550868988037, |
| "learning_rate": 5.998999666555519e-06, |
| "loss": 7.3644, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6653728860548933, |
| "eval_loss": 7.118653297424316, |
| "eval_runtime": 101.7208, |
| "eval_samples_per_second": 98.308, |
| "eval_steps_per_second": 98.308, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.887163848073191, |
| "grad_norm": 0.3977542519569397, |
| "learning_rate": 5.997999333111037e-06, |
| "loss": 7.039, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.887163848073191, |
| "eval_loss": 6.858039855957031, |
| "eval_runtime": 103.108, |
| "eval_samples_per_second": 96.986, |
| "eval_steps_per_second": 96.986, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.1089548100914888, |
| "grad_norm": 0.31371042132377625, |
| "learning_rate": 5.9969989996665554e-06, |
| "loss": 6.8537, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.1089548100914888, |
| "eval_loss": 6.725042343139648, |
| "eval_runtime": 100.7633, |
| "eval_samples_per_second": 99.243, |
| "eval_steps_per_second": 99.243, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.3307457721097866, |
| "grad_norm": 0.2910732924938202, |
| "learning_rate": 5.995998666222074e-06, |
| "loss": 6.749, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.3307457721097866, |
| "eval_loss": 6.648338317871094, |
| "eval_runtime": 103.8281, |
| "eval_samples_per_second": 96.313, |
| "eval_steps_per_second": 96.313, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5525367341280842, |
| "grad_norm": 0.38117602467536926, |
| "learning_rate": 5.994998332777593e-06, |
| "loss": 6.6809, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.5525367341280842, |
| "eval_loss": 6.598635196685791, |
| "eval_runtime": 100.7294, |
| "eval_samples_per_second": 99.276, |
| "eval_steps_per_second": 99.276, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.774327696146382, |
| "grad_norm": 0.23082487285137177, |
| "learning_rate": 5.9939979993331115e-06, |
| "loss": 6.6363, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.774327696146382, |
| "eval_loss": 6.5613298416137695, |
| "eval_runtime": 100.99, |
| "eval_samples_per_second": 99.02, |
| "eval_steps_per_second": 99.02, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9961186581646797, |
| "grad_norm": 0.3537309169769287, |
| "learning_rate": 5.992997665888629e-06, |
| "loss": 6.6008, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.9961186581646797, |
| "eval_loss": 6.539489269256592, |
| "eval_runtime": 103.6291, |
| "eval_samples_per_second": 96.498, |
| "eval_steps_per_second": 96.498, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2179096201829775, |
| "grad_norm": 0.22692321240901947, |
| "learning_rate": 5.991997332444148e-06, |
| "loss": 6.5735, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.2179096201829775, |
| "eval_loss": 6.521015644073486, |
| "eval_runtime": 100.5379, |
| "eval_samples_per_second": 99.465, |
| "eval_steps_per_second": 99.465, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.4397005822012754, |
| "grad_norm": 0.5465587973594666, |
| "learning_rate": 5.990996998999667e-06, |
| "loss": 6.5555, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.4397005822012754, |
| "eval_loss": 6.505192279815674, |
| "eval_runtime": 101.8008, |
| "eval_samples_per_second": 98.231, |
| "eval_steps_per_second": 98.231, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.6614915442195732, |
| "grad_norm": 0.6720498204231262, |
| "learning_rate": 5.989996665555185e-06, |
| "loss": 6.5407, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.6614915442195732, |
| "eval_loss": 6.497246265411377, |
| "eval_runtime": 103.0853, |
| "eval_samples_per_second": 97.007, |
| "eval_steps_per_second": 97.007, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.8832825062378706, |
| "grad_norm": 0.3426739275455475, |
| "learning_rate": 5.988996332110703e-06, |
| "loss": 6.529, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.8832825062378706, |
| "eval_loss": 6.488556861877441, |
| "eval_runtime": 100.6535, |
| "eval_samples_per_second": 99.351, |
| "eval_steps_per_second": 99.351, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.1050734682561685, |
| "grad_norm": 0.2463805377483368, |
| "learning_rate": 5.987995998666222e-06, |
| "loss": 6.5196, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.1050734682561685, |
| "eval_loss": 6.484075546264648, |
| "eval_runtime": 104.3708, |
| "eval_samples_per_second": 95.812, |
| "eval_steps_per_second": 95.812, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.3268644302744663, |
| "grad_norm": 0.1849370300769806, |
| "learning_rate": 5.986995665221741e-06, |
| "loss": 6.5099, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.3268644302744663, |
| "eval_loss": 6.476208209991455, |
| "eval_runtime": 100.8511, |
| "eval_samples_per_second": 99.156, |
| "eval_steps_per_second": 99.156, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.548655392292764, |
| "grad_norm": 0.23534879088401794, |
| "learning_rate": 5.9859953317772595e-06, |
| "loss": 6.503, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.548655392292764, |
| "eval_loss": 6.473758220672607, |
| "eval_runtime": 100.8445, |
| "eval_samples_per_second": 99.163, |
| "eval_steps_per_second": 99.163, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.770446354311062, |
| "grad_norm": 0.3312935531139374, |
| "learning_rate": 5.984994998332777e-06, |
| "loss": 6.4991, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.770446354311062, |
| "eval_loss": 6.471902370452881, |
| "eval_runtime": 104.4468, |
| "eval_samples_per_second": 95.743, |
| "eval_steps_per_second": 95.743, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.9922373163293594, |
| "grad_norm": 0.27324172854423523, |
| "learning_rate": 5.983994664888296e-06, |
| "loss": 6.4936, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.9922373163293594, |
| "eval_loss": 6.464596271514893, |
| "eval_runtime": 100.6385, |
| "eval_samples_per_second": 99.366, |
| "eval_steps_per_second": 99.366, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.214028278347658, |
| "grad_norm": 0.29278630018234253, |
| "learning_rate": 5.982994331443815e-06, |
| "loss": 6.4875, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.214028278347658, |
| "eval_loss": 6.462095260620117, |
| "eval_runtime": 100.6404, |
| "eval_samples_per_second": 99.364, |
| "eval_steps_per_second": 99.364, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.435819240365955, |
| "grad_norm": 0.26022714376449585, |
| "learning_rate": 5.981993997999333e-06, |
| "loss": 6.4834, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.435819240365955, |
| "eval_loss": 6.45832633972168, |
| "eval_runtime": 104.5104, |
| "eval_samples_per_second": 95.684, |
| "eval_steps_per_second": 95.684, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.6576102023842525, |
| "grad_norm": 0.7873703837394714, |
| "learning_rate": 5.980993664554851e-06, |
| "loss": 6.4796, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.6576102023842525, |
| "eval_loss": 6.456444263458252, |
| "eval_runtime": 100.8687, |
| "eval_samples_per_second": 99.139, |
| "eval_steps_per_second": 99.139, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.887163848073191, |
| "grad_norm": 0.7525845766067505, |
| "learning_rate": 5.979993331110371e-06, |
| "loss": 6.4755, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.887163848073191, |
| "eval_loss": 6.453465938568115, |
| "eval_runtime": 66.4579, |
| "eval_samples_per_second": 150.471, |
| "eval_steps_per_second": 18.809, |
| "step": 2200 |
| }, |
| { |
| "epoch": 5.108954810091489, |
| "grad_norm": 0.5191181302070618, |
| "learning_rate": 5.978992997665889e-06, |
| "loss": 6.472, |
| "step": 2300 |
| }, |
| { |
| "epoch": 5.108954810091489, |
| "eval_loss": 6.44980525970459, |
| "eval_runtime": 63.8377, |
| "eval_samples_per_second": 156.647, |
| "eval_steps_per_second": 19.581, |
| "step": 2300 |
| }, |
| { |
| "epoch": 5.330745772109786, |
| "grad_norm": 0.31189826130867004, |
| "learning_rate": 5.9779926642214075e-06, |
| "loss": 6.4681, |
| "step": 2400 |
| }, |
| { |
| "epoch": 5.330745772109786, |
| "eval_loss": 6.448277473449707, |
| "eval_runtime": 63.9509, |
| "eval_samples_per_second": 156.37, |
| "eval_steps_per_second": 19.546, |
| "step": 2400 |
| }, |
| { |
| "epoch": 5.5525367341280845, |
| "grad_norm": 0.4947231113910675, |
| "learning_rate": 5.976992330776926e-06, |
| "loss": 6.4659, |
| "step": 2500 |
| }, |
| { |
| "epoch": 5.5525367341280845, |
| "eval_loss": 6.4454731941223145, |
| "eval_runtime": 66.4235, |
| "eval_samples_per_second": 150.549, |
| "eval_steps_per_second": 18.819, |
| "step": 2500 |
| }, |
| { |
| "epoch": 5.774327696146382, |
| "grad_norm": 0.22547227144241333, |
| "learning_rate": 5.975991997332444e-06, |
| "loss": 6.4619, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.774327696146382, |
| "eval_loss": 6.444580554962158, |
| "eval_runtime": 63.7522, |
| "eval_samples_per_second": 156.857, |
| "eval_steps_per_second": 19.607, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.99611865816468, |
| "grad_norm": 0.2726474404335022, |
| "learning_rate": 5.974991663887963e-06, |
| "loss": 6.4594, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.99611865816468, |
| "eval_loss": 6.44156551361084, |
| "eval_runtime": 66.3901, |
| "eval_samples_per_second": 150.625, |
| "eval_steps_per_second": 18.828, |
| "step": 2700 |
| }, |
| { |
| "epoch": 6.2179096201829775, |
| "grad_norm": 0.17645886540412903, |
| "learning_rate": 5.973991330443481e-06, |
| "loss": 6.4574, |
| "step": 2800 |
| }, |
| { |
| "epoch": 6.2179096201829775, |
| "eval_loss": 6.4393510818481445, |
| "eval_runtime": 63.8118, |
| "eval_samples_per_second": 156.711, |
| "eval_steps_per_second": 19.589, |
| "step": 2800 |
| }, |
| { |
| "epoch": 6.439700582201275, |
| "grad_norm": 0.9444617629051208, |
| "learning_rate": 5.972990996999e-06, |
| "loss": 6.4546, |
| "step": 2900 |
| }, |
| { |
| "epoch": 6.439700582201275, |
| "eval_loss": 6.439332008361816, |
| "eval_runtime": 63.6523, |
| "eval_samples_per_second": 157.103, |
| "eval_steps_per_second": 19.638, |
| "step": 2900 |
| }, |
| { |
| "epoch": 6.661491544219573, |
| "grad_norm": 0.4472251534461975, |
| "learning_rate": 5.971990663554519e-06, |
| "loss": 6.4515, |
| "step": 3000 |
| }, |
| { |
| "epoch": 6.661491544219573, |
| "eval_loss": 6.435446262359619, |
| "eval_runtime": 63.845, |
| "eval_samples_per_second": 156.629, |
| "eval_steps_per_second": 19.579, |
| "step": 3000 |
| }, |
| { |
| "epoch": 6.883282506237871, |
| "grad_norm": 0.29884466528892517, |
| "learning_rate": 5.970990330110037e-06, |
| "loss": 6.4483, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.883282506237871, |
| "eval_loss": 6.433766841888428, |
| "eval_runtime": 66.4883, |
| "eval_samples_per_second": 150.402, |
| "eval_steps_per_second": 18.8, |
| "step": 3100 |
| }, |
| { |
| "epoch": 7.105073468256169, |
| "grad_norm": 0.4576103687286377, |
| "learning_rate": 5.9699899966655554e-06, |
| "loss": 6.4465, |
| "step": 3200 |
| }, |
| { |
| "epoch": 7.105073468256169, |
| "eval_loss": 6.432063102722168, |
| "eval_runtime": 63.7483, |
| "eval_samples_per_second": 156.867, |
| "eval_steps_per_second": 19.608, |
| "step": 3200 |
| }, |
| { |
| "epoch": 7.326864430274466, |
| "grad_norm": 0.1679336577653885, |
| "learning_rate": 5.968989663221074e-06, |
| "loss": 6.4453, |
| "step": 3300 |
| }, |
| { |
| "epoch": 7.326864430274466, |
| "eval_loss": 6.430073261260986, |
| "eval_runtime": 63.7036, |
| "eval_samples_per_second": 156.977, |
| "eval_steps_per_second": 19.622, |
| "step": 3300 |
| }, |
| { |
| "epoch": 7.548655392292764, |
| "grad_norm": 0.3880283236503601, |
| "learning_rate": 5.967989329776592e-06, |
| "loss": 6.4406, |
| "step": 3400 |
| }, |
| { |
| "epoch": 7.548655392292764, |
| "eval_loss": 6.431549072265625, |
| "eval_runtime": 66.1695, |
| "eval_samples_per_second": 151.127, |
| "eval_steps_per_second": 18.891, |
| "step": 3400 |
| }, |
| { |
| "epoch": 7.770446354311062, |
| "grad_norm": 0.8515690565109253, |
| "learning_rate": 5.966988996332111e-06, |
| "loss": 6.4413, |
| "step": 3500 |
| }, |
| { |
| "epoch": 7.770446354311062, |
| "eval_loss": 6.42842435836792, |
| "eval_runtime": 63.7187, |
| "eval_samples_per_second": 156.94, |
| "eval_steps_per_second": 19.617, |
| "step": 3500 |
| }, |
| { |
| "epoch": 7.992237316329359, |
| "grad_norm": 0.4197738468647003, |
| "learning_rate": 5.965988662887629e-06, |
| "loss": 6.4404, |
| "step": 3600 |
| }, |
| { |
| "epoch": 7.992237316329359, |
| "eval_loss": 6.429299354553223, |
| "eval_runtime": 63.7081, |
| "eval_samples_per_second": 156.966, |
| "eval_steps_per_second": 19.621, |
| "step": 3600 |
| }, |
| { |
| "epoch": 8.214028278347657, |
| "grad_norm": 0.16546382009983063, |
| "learning_rate": 5.964988329443148e-06, |
| "loss": 6.438, |
| "step": 3700 |
| }, |
| { |
| "epoch": 8.214028278347657, |
| "eval_loss": 6.426889896392822, |
| "eval_runtime": 66.066, |
| "eval_samples_per_second": 151.364, |
| "eval_steps_per_second": 18.92, |
| "step": 3700 |
| }, |
| { |
| "epoch": 8.435819240365955, |
| "grad_norm": 0.48783496022224426, |
| "learning_rate": 5.963987995998667e-06, |
| "loss": 6.437, |
| "step": 3800 |
| }, |
| { |
| "epoch": 8.435819240365955, |
| "eval_loss": 6.424874305725098, |
| "eval_runtime": 63.6818, |
| "eval_samples_per_second": 157.031, |
| "eval_steps_per_second": 19.629, |
| "step": 3800 |
| }, |
| { |
| "epoch": 8.657610202384253, |
| "grad_norm": 0.2994876205921173, |
| "learning_rate": 5.962987662554185e-06, |
| "loss": 6.434, |
| "step": 3900 |
| }, |
| { |
| "epoch": 8.657610202384253, |
| "eval_loss": 6.428049087524414, |
| "eval_runtime": 63.6981, |
| "eval_samples_per_second": 156.991, |
| "eval_steps_per_second": 19.624, |
| "step": 3900 |
| }, |
| { |
| "epoch": 8.87940116440255, |
| "grad_norm": 0.26397526264190674, |
| "learning_rate": 5.961987329109703e-06, |
| "loss": 6.4344, |
| "step": 4000 |
| }, |
| { |
| "epoch": 8.87940116440255, |
| "eval_loss": 6.427630424499512, |
| "eval_runtime": 63.7853, |
| "eval_samples_per_second": 156.776, |
| "eval_steps_per_second": 19.597, |
| "step": 4000 |
| }, |
| { |
| "epoch": 9.101192126420848, |
| "grad_norm": 0.6336208581924438, |
| "learning_rate": 5.960986995665222e-06, |
| "loss": 6.4322, |
| "step": 4100 |
| }, |
| { |
| "epoch": 9.101192126420848, |
| "eval_loss": 6.423878192901611, |
| "eval_runtime": 66.3296, |
| "eval_samples_per_second": 150.762, |
| "eval_steps_per_second": 18.845, |
| "step": 4100 |
| }, |
| { |
| "epoch": 9.322983088439146, |
| "grad_norm": 0.5242211818695068, |
| "learning_rate": 5.95998666222074e-06, |
| "loss": 6.4302, |
| "step": 4200 |
| }, |
| { |
| "epoch": 9.322983088439146, |
| "eval_loss": 6.42392110824585, |
| "eval_runtime": 63.7079, |
| "eval_samples_per_second": 156.966, |
| "eval_steps_per_second": 19.621, |
| "step": 4200 |
| }, |
| { |
| "epoch": 9.544774050457445, |
| "grad_norm": 0.49379467964172363, |
| "learning_rate": 5.958986328776259e-06, |
| "loss": 6.4307, |
| "step": 4300 |
| }, |
| { |
| "epoch": 9.544774050457445, |
| "eval_loss": 6.422423839569092, |
| "eval_runtime": 63.6859, |
| "eval_samples_per_second": 157.021, |
| "eval_steps_per_second": 19.628, |
| "step": 4300 |
| }, |
| { |
| "epoch": 9.766565012475741, |
| "grad_norm": 0.305960476398468, |
| "learning_rate": 5.957985995331777e-06, |
| "loss": 6.4285, |
| "step": 4400 |
| }, |
| { |
| "epoch": 9.766565012475741, |
| "eval_loss": 6.421577453613281, |
| "eval_runtime": 66.1928, |
| "eval_samples_per_second": 151.074, |
| "eval_steps_per_second": 18.884, |
| "step": 4400 |
| }, |
| { |
| "epoch": 9.98835597449404, |
| "grad_norm": 0.3036479353904724, |
| "learning_rate": 5.956985661887296e-06, |
| "loss": 6.4249, |
| "step": 4500 |
| }, |
| { |
| "epoch": 9.98835597449404, |
| "eval_loss": 6.41899299621582, |
| "eval_runtime": 63.6775, |
| "eval_samples_per_second": 157.041, |
| "eval_steps_per_second": 19.63, |
| "step": 4500 |
| }, |
| { |
| "epoch": 10.210146936512338, |
| "grad_norm": 1.1105852127075195, |
| "learning_rate": 5.955985328442815e-06, |
| "loss": 6.4262, |
| "step": 4600 |
| }, |
| { |
| "epoch": 10.210146936512338, |
| "eval_loss": 6.420323371887207, |
| "eval_runtime": 63.5916, |
| "eval_samples_per_second": 157.253, |
| "eval_steps_per_second": 19.657, |
| "step": 4600 |
| }, |
| { |
| "epoch": 10.431937898530634, |
| "grad_norm": 0.38992971181869507, |
| "learning_rate": 5.954984994998333e-06, |
| "loss": 6.4259, |
| "step": 4700 |
| }, |
| { |
| "epoch": 10.431937898530634, |
| "eval_loss": 6.415469646453857, |
| "eval_runtime": 63.7968, |
| "eval_samples_per_second": 156.748, |
| "eval_steps_per_second": 19.593, |
| "step": 4700 |
| }, |
| { |
| "epoch": 10.653728860548933, |
| "grad_norm": 0.39246854186058044, |
| "learning_rate": 5.953984661553851e-06, |
| "loss": 6.4258, |
| "step": 4800 |
| }, |
| { |
| "epoch": 10.653728860548933, |
| "eval_loss": 6.414693832397461, |
| "eval_runtime": 66.2863, |
| "eval_samples_per_second": 150.861, |
| "eval_steps_per_second": 18.858, |
| "step": 4800 |
| }, |
| { |
| "epoch": 10.875519822567231, |
| "grad_norm": 0.6589607000350952, |
| "learning_rate": 5.95298432810937e-06, |
| "loss": 6.4226, |
| "step": 4900 |
| }, |
| { |
| "epoch": 10.875519822567231, |
| "eval_loss": 6.417821884155273, |
| "eval_runtime": 63.7381, |
| "eval_samples_per_second": 156.892, |
| "eval_steps_per_second": 19.612, |
| "step": 4900 |
| }, |
| { |
| "epoch": 11.097310784585527, |
| "grad_norm": 0.44160690903663635, |
| "learning_rate": 5.951983994664888e-06, |
| "loss": 6.4223, |
| "step": 5000 |
| }, |
| { |
| "epoch": 11.097310784585527, |
| "eval_loss": 6.417135715484619, |
| "eval_runtime": 63.7803, |
| "eval_samples_per_second": 156.788, |
| "eval_steps_per_second": 19.599, |
| "step": 5000 |
| }, |
| { |
| "epoch": 11.319101746603826, |
| "grad_norm": 0.7182816863059998, |
| "learning_rate": 5.950983661220407e-06, |
| "loss": 6.4221, |
| "step": 5100 |
| }, |
| { |
| "epoch": 11.319101746603826, |
| "eval_loss": 6.417608737945557, |
| "eval_runtime": 66.5138, |
| "eval_samples_per_second": 150.345, |
| "eval_steps_per_second": 18.793, |
| "step": 5100 |
| }, |
| { |
| "epoch": 11.540892708622124, |
| "grad_norm": 0.45741328597068787, |
| "learning_rate": 5.949983327775925e-06, |
| "loss": 6.4211, |
| "step": 5200 |
| }, |
| { |
| "epoch": 11.540892708622124, |
| "eval_loss": 6.411616325378418, |
| "eval_runtime": 63.8646, |
| "eval_samples_per_second": 156.581, |
| "eval_steps_per_second": 19.573, |
| "step": 5200 |
| }, |
| { |
| "epoch": 11.76268367064042, |
| "grad_norm": 0.37045249342918396, |
| "learning_rate": 5.948982994331444e-06, |
| "loss": 6.4203, |
| "step": 5300 |
| }, |
| { |
| "epoch": 11.76268367064042, |
| "eval_loss": 6.415543556213379, |
| "eval_runtime": 63.6959, |
| "eval_samples_per_second": 156.996, |
| "eval_steps_per_second": 19.624, |
| "step": 5300 |
| }, |
| { |
| "epoch": 11.984474632658719, |
| "grad_norm": 0.5875869989395142, |
| "learning_rate": 5.947982660886963e-06, |
| "loss": 6.4189, |
| "step": 5400 |
| }, |
| { |
| "epoch": 11.984474632658719, |
| "eval_loss": 6.417328834533691, |
| "eval_runtime": 63.8682, |
| "eval_samples_per_second": 156.572, |
| "eval_steps_per_second": 19.572, |
| "step": 5400 |
| }, |
| { |
| "epoch": 12.206265594677017, |
| "grad_norm": 0.39769718050956726, |
| "learning_rate": 5.9469823274424815e-06, |
| "loss": 6.4185, |
| "step": 5500 |
| }, |
| { |
| "epoch": 12.206265594677017, |
| "eval_loss": 6.417914390563965, |
| "eval_runtime": 66.821, |
| "eval_samples_per_second": 149.653, |
| "eval_steps_per_second": 18.707, |
| "step": 5500 |
| }, |
| { |
| "epoch": 12.428056556695315, |
| "grad_norm": 0.8144527673721313, |
| "learning_rate": 5.945981993997999e-06, |
| "loss": 6.417, |
| "step": 5600 |
| }, |
| { |
| "epoch": 12.428056556695315, |
| "eval_loss": 6.414742946624756, |
| "eval_runtime": 63.6455, |
| "eval_samples_per_second": 157.12, |
| "eval_steps_per_second": 19.64, |
| "step": 5600 |
| }, |
| { |
| "epoch": 12.649847518713612, |
| "grad_norm": 0.304855078458786, |
| "learning_rate": 5.944981660553518e-06, |
| "loss": 6.4169, |
| "step": 5700 |
| }, |
| { |
| "epoch": 12.649847518713612, |
| "eval_loss": 6.411574363708496, |
| "eval_runtime": 63.6479, |
| "eval_samples_per_second": 157.114, |
| "eval_steps_per_second": 19.639, |
| "step": 5700 |
| }, |
| { |
| "epoch": 12.87163848073191, |
| "grad_norm": 0.5774130821228027, |
| "learning_rate": 5.943981327109036e-06, |
| "loss": 6.4162, |
| "step": 5800 |
| }, |
| { |
| "epoch": 12.87163848073191, |
| "eval_loss": 6.4110517501831055, |
| "eval_runtime": 66.215, |
| "eval_samples_per_second": 151.023, |
| "eval_steps_per_second": 18.878, |
| "step": 5800 |
| }, |
| { |
| "epoch": 13.093429442750208, |
| "grad_norm": 0.6892155408859253, |
| "learning_rate": 5.942980993664555e-06, |
| "loss": 6.414, |
| "step": 5900 |
| }, |
| { |
| "epoch": 13.093429442750208, |
| "eval_loss": 6.413996696472168, |
| "eval_runtime": 63.6174, |
| "eval_samples_per_second": 157.19, |
| "eval_steps_per_second": 19.649, |
| "step": 5900 |
| }, |
| { |
| "epoch": 13.315220404768505, |
| "grad_norm": 0.5487566590309143, |
| "learning_rate": 5.941980660220073e-06, |
| "loss": 6.4153, |
| "step": 6000 |
| }, |
| { |
| "epoch": 13.315220404768505, |
| "eval_loss": 6.414098739624023, |
| "eval_runtime": 63.6464, |
| "eval_samples_per_second": 157.118, |
| "eval_steps_per_second": 19.64, |
| "step": 6000 |
| }, |
| { |
| "epoch": 13.537011366786803, |
| "grad_norm": 0.7147879004478455, |
| "learning_rate": 5.940980326775592e-06, |
| "loss": 6.4132, |
| "step": 6100 |
| }, |
| { |
| "epoch": 13.537011366786803, |
| "eval_loss": 6.411059379577637, |
| "eval_runtime": 66.5345, |
| "eval_samples_per_second": 150.298, |
| "eval_steps_per_second": 18.787, |
| "step": 6100 |
| }, |
| { |
| "epoch": 13.758802328805102, |
| "grad_norm": 0.4990188181400299, |
| "learning_rate": 5.939979993331111e-06, |
| "loss": 6.4127, |
| "step": 6200 |
| }, |
| { |
| "epoch": 13.758802328805102, |
| "eval_loss": 6.411470890045166, |
| "eval_runtime": 63.7718, |
| "eval_samples_per_second": 156.809, |
| "eval_steps_per_second": 19.601, |
| "step": 6200 |
| }, |
| { |
| "epoch": 13.9805932908234, |
| "grad_norm": 0.3841017782688141, |
| "learning_rate": 5.9389796598866294e-06, |
| "loss": 6.4133, |
| "step": 6300 |
| }, |
| { |
| "epoch": 13.9805932908234, |
| "eval_loss": 6.4090681076049805, |
| "eval_runtime": 63.7617, |
| "eval_samples_per_second": 156.834, |
| "eval_steps_per_second": 19.604, |
| "step": 6300 |
| }, |
| { |
| "epoch": 14.202384252841696, |
| "grad_norm": 0.3359989523887634, |
| "learning_rate": 5.937979326442147e-06, |
| "loss": 6.4107, |
| "step": 6400 |
| }, |
| { |
| "epoch": 14.202384252841696, |
| "eval_loss": 6.409322738647461, |
| "eval_runtime": 63.5969, |
| "eval_samples_per_second": 157.24, |
| "eval_steps_per_second": 19.655, |
| "step": 6400 |
| }, |
| { |
| "epoch": 14.424175214859995, |
| "grad_norm": 0.5810059905052185, |
| "learning_rate": 5.936978992997666e-06, |
| "loss": 6.411, |
| "step": 6500 |
| }, |
| { |
| "epoch": 14.424175214859995, |
| "eval_loss": 6.411257743835449, |
| "eval_runtime": 66.5523, |
| "eval_samples_per_second": 150.258, |
| "eval_steps_per_second": 18.782, |
| "step": 6500 |
| }, |
| { |
| "epoch": 14.645966176878293, |
| "grad_norm": 0.45823681354522705, |
| "learning_rate": 5.935978659553185e-06, |
| "loss": 6.4107, |
| "step": 6600 |
| }, |
| { |
| "epoch": 14.645966176878293, |
| "eval_loss": 6.4073872566223145, |
| "eval_runtime": 63.6788, |
| "eval_samples_per_second": 157.038, |
| "eval_steps_per_second": 19.63, |
| "step": 6600 |
| }, |
| { |
| "epoch": 14.86775713889659, |
| "grad_norm": 0.6735783815383911, |
| "learning_rate": 5.9349783261087026e-06, |
| "loss": 6.4112, |
| "step": 6700 |
| }, |
| { |
| "epoch": 14.86775713889659, |
| "eval_loss": 6.411919593811035, |
| "eval_runtime": 63.7297, |
| "eval_samples_per_second": 156.913, |
| "eval_steps_per_second": 19.614, |
| "step": 6700 |
| }, |
| { |
| "epoch": 15.089548100914888, |
| "grad_norm": 0.5670196413993835, |
| "learning_rate": 5.933977992664221e-06, |
| "loss": 6.4099, |
| "step": 6800 |
| }, |
| { |
| "epoch": 15.089548100914888, |
| "eval_loss": 6.407878875732422, |
| "eval_runtime": 66.3771, |
| "eval_samples_per_second": 150.654, |
| "eval_steps_per_second": 18.832, |
| "step": 6800 |
| }, |
| { |
| "epoch": 15.311339062933186, |
| "grad_norm": 0.3068266808986664, |
| "learning_rate": 5.93297765921974e-06, |
| "loss": 6.4089, |
| "step": 6900 |
| }, |
| { |
| "epoch": 15.311339062933186, |
| "eval_loss": 6.4104766845703125, |
| "eval_runtime": 63.6627, |
| "eval_samples_per_second": 157.078, |
| "eval_steps_per_second": 19.635, |
| "step": 6900 |
| }, |
| { |
| "epoch": 15.533130024951483, |
| "grad_norm": 0.8304972052574158, |
| "learning_rate": 5.931977325775259e-06, |
| "loss": 6.409, |
| "step": 7000 |
| }, |
| { |
| "epoch": 15.533130024951483, |
| "eval_loss": 6.414528846740723, |
| "eval_runtime": 63.6701, |
| "eval_samples_per_second": 157.06, |
| "eval_steps_per_second": 19.632, |
| "step": 7000 |
| }, |
| { |
| "epoch": 15.75492098696978, |
| "grad_norm": 0.5522041916847229, |
| "learning_rate": 5.930976992330777e-06, |
| "loss": 6.4089, |
| "step": 7100 |
| }, |
| { |
| "epoch": 15.75492098696978, |
| "eval_loss": 6.407095909118652, |
| "eval_runtime": 66.1999, |
| "eval_samples_per_second": 151.058, |
| "eval_steps_per_second": 18.882, |
| "step": 7100 |
| }, |
| { |
| "epoch": 15.97671194898808, |
| "grad_norm": 0.373626708984375, |
| "learning_rate": 5.929976658886295e-06, |
| "loss": 6.4071, |
| "step": 7200 |
| }, |
| { |
| "epoch": 15.97671194898808, |
| "eval_loss": 6.4060258865356445, |
| "eval_runtime": 63.7049, |
| "eval_samples_per_second": 156.974, |
| "eval_steps_per_second": 19.622, |
| "step": 7200 |
| }, |
| { |
| "epoch": 16.198502911006376, |
| "grad_norm": 0.3747236132621765, |
| "learning_rate": 5.928976325441814e-06, |
| "loss": 6.4072, |
| "step": 7300 |
| }, |
| { |
| "epoch": 16.198502911006376, |
| "eval_loss": 6.403803825378418, |
| "eval_runtime": 63.6478, |
| "eval_samples_per_second": 157.115, |
| "eval_steps_per_second": 19.639, |
| "step": 7300 |
| }, |
| { |
| "epoch": 16.420293873024676, |
| "grad_norm": 0.9381150007247925, |
| "learning_rate": 5.927975991997333e-06, |
| "loss": 6.4068, |
| "step": 7400 |
| }, |
| { |
| "epoch": 16.420293873024676, |
| "eval_loss": 6.406477451324463, |
| "eval_runtime": 66.4296, |
| "eval_samples_per_second": 150.535, |
| "eval_steps_per_second": 18.817, |
| "step": 7400 |
| }, |
| { |
| "epoch": 16.642084835042972, |
| "grad_norm": 0.4905136823654175, |
| "learning_rate": 5.9269756585528505e-06, |
| "loss": 6.4047, |
| "step": 7500 |
| }, |
| { |
| "epoch": 16.642084835042972, |
| "eval_loss": 6.4078850746154785, |
| "eval_runtime": 63.7258, |
| "eval_samples_per_second": 156.922, |
| "eval_steps_per_second": 19.615, |
| "step": 7500 |
| }, |
| { |
| "epoch": 16.86387579706127, |
| "grad_norm": 0.5776643753051758, |
| "learning_rate": 5.92597532510837e-06, |
| "loss": 6.4054, |
| "step": 7600 |
| }, |
| { |
| "epoch": 16.86387579706127, |
| "eval_loss": 6.403768539428711, |
| "eval_runtime": 63.7461, |
| "eval_samples_per_second": 156.872, |
| "eval_steps_per_second": 19.609, |
| "step": 7600 |
| }, |
| { |
| "epoch": 17.08566675907957, |
| "grad_norm": 0.791892945766449, |
| "learning_rate": 5.924974991663888e-06, |
| "loss": 6.4051, |
| "step": 7700 |
| }, |
| { |
| "epoch": 17.08566675907957, |
| "eval_loss": 6.403835773468018, |
| "eval_runtime": 63.9137, |
| "eval_samples_per_second": 156.461, |
| "eval_steps_per_second": 19.558, |
| "step": 7700 |
| }, |
| { |
| "epoch": 17.307457721097865, |
| "grad_norm": 0.485984206199646, |
| "learning_rate": 5.923974658219407e-06, |
| "loss": 6.4058, |
| "step": 7800 |
| }, |
| { |
| "epoch": 17.307457721097865, |
| "eval_loss": 6.405175685882568, |
| "eval_runtime": 66.2871, |
| "eval_samples_per_second": 150.859, |
| "eval_steps_per_second": 18.857, |
| "step": 7800 |
| }, |
| { |
| "epoch": 17.529248683116162, |
| "grad_norm": 1.0781219005584717, |
| "learning_rate": 5.922974324774925e-06, |
| "loss": 6.4037, |
| "step": 7900 |
| }, |
| { |
| "epoch": 17.529248683116162, |
| "eval_loss": 6.408561706542969, |
| "eval_runtime": 66.8857, |
| "eval_samples_per_second": 149.509, |
| "eval_steps_per_second": 18.689, |
| "step": 7900 |
| }, |
| { |
| "epoch": 17.751039645134462, |
| "grad_norm": 0.6358538269996643, |
| "learning_rate": 5.921973991330443e-06, |
| "loss": 6.403, |
| "step": 8000 |
| }, |
| { |
| "epoch": 17.751039645134462, |
| "eval_loss": 6.402519702911377, |
| "eval_runtime": 63.7653, |
| "eval_samples_per_second": 156.825, |
| "eval_steps_per_second": 19.603, |
| "step": 8000 |
| }, |
| { |
| "epoch": 17.97283060715276, |
| "grad_norm": 0.5632463097572327, |
| "learning_rate": 5.920973657885962e-06, |
| "loss": 6.4034, |
| "step": 8100 |
| }, |
| { |
| "epoch": 17.97283060715276, |
| "eval_loss": 6.403571128845215, |
| "eval_runtime": 63.7754, |
| "eval_samples_per_second": 156.8, |
| "eval_steps_per_second": 19.6, |
| "step": 8100 |
| }, |
| { |
| "epoch": 18.194621569171055, |
| "grad_norm": 0.23312948644161224, |
| "learning_rate": 5.919973324441481e-06, |
| "loss": 6.4048, |
| "step": 8200 |
| }, |
| { |
| "epoch": 18.194621569171055, |
| "eval_loss": 6.404890060424805, |
| "eval_runtime": 63.6831, |
| "eval_samples_per_second": 157.028, |
| "eval_steps_per_second": 19.628, |
| "step": 8200 |
| }, |
| { |
| "epoch": 18.416412531189355, |
| "grad_norm": 0.5255222916603088, |
| "learning_rate": 5.918972990996999e-06, |
| "loss": 6.4018, |
| "step": 8300 |
| }, |
| { |
| "epoch": 18.416412531189355, |
| "eval_loss": 6.401614665985107, |
| "eval_runtime": 66.6013, |
| "eval_samples_per_second": 150.147, |
| "eval_steps_per_second": 18.768, |
| "step": 8300 |
| }, |
| { |
| "epoch": 18.63820349320765, |
| "grad_norm": 0.44263362884521484, |
| "learning_rate": 5.917972657552518e-06, |
| "loss": 6.4018, |
| "step": 8400 |
| }, |
| { |
| "epoch": 18.63820349320765, |
| "eval_loss": 6.40390682220459, |
| "eval_runtime": 63.7484, |
| "eval_samples_per_second": 156.867, |
| "eval_steps_per_second": 19.608, |
| "step": 8400 |
| }, |
| { |
| "epoch": 18.859994455225948, |
| "grad_norm": 0.5826687812805176, |
| "learning_rate": 5.916972324108037e-06, |
| "loss": 6.402, |
| "step": 8500 |
| }, |
| { |
| "epoch": 18.859994455225948, |
| "eval_loss": 6.401444911956787, |
| "eval_runtime": 63.73, |
| "eval_samples_per_second": 156.912, |
| "eval_steps_per_second": 19.614, |
| "step": 8500 |
| }, |
| { |
| "epoch": 19.081785417244248, |
| "grad_norm": 0.5808525681495667, |
| "learning_rate": 5.915971990663555e-06, |
| "loss": 6.4031, |
| "step": 8600 |
| }, |
| { |
| "epoch": 19.081785417244248, |
| "eval_loss": 6.398373126983643, |
| "eval_runtime": 66.6574, |
| "eval_samples_per_second": 150.021, |
| "eval_steps_per_second": 18.753, |
| "step": 8600 |
| }, |
| { |
| "epoch": 19.303576379262545, |
| "grad_norm": 0.9179806113243103, |
| "learning_rate": 5.914971657219073e-06, |
| "loss": 6.4019, |
| "step": 8700 |
| }, |
| { |
| "epoch": 19.303576379262545, |
| "eval_loss": 6.399080276489258, |
| "eval_runtime": 63.6271, |
| "eval_samples_per_second": 157.166, |
| "eval_steps_per_second": 19.646, |
| "step": 8700 |
| }, |
| { |
| "epoch": 19.52536734128084, |
| "grad_norm": 0.45992511510849, |
| "learning_rate": 5.913971323774591e-06, |
| "loss": 6.4, |
| "step": 8800 |
| }, |
| { |
| "epoch": 19.52536734128084, |
| "eval_loss": 6.403900623321533, |
| "eval_runtime": 63.7034, |
| "eval_samples_per_second": 156.977, |
| "eval_steps_per_second": 19.622, |
| "step": 8800 |
| }, |
| { |
| "epoch": 19.74715830329914, |
| "grad_norm": 0.702781081199646, |
| "learning_rate": 5.91297099033011e-06, |
| "loss": 6.3993, |
| "step": 8900 |
| }, |
| { |
| "epoch": 19.74715830329914, |
| "eval_loss": 6.401424884796143, |
| "eval_runtime": 66.2276, |
| "eval_samples_per_second": 150.994, |
| "eval_steps_per_second": 18.874, |
| "step": 8900 |
| }, |
| { |
| "epoch": 19.968949265317438, |
| "grad_norm": 0.6189502477645874, |
| "learning_rate": 5.911970656885629e-06, |
| "loss": 6.3999, |
| "step": 9000 |
| }, |
| { |
| "epoch": 19.968949265317438, |
| "eval_loss": 6.400846481323242, |
| "eval_runtime": 63.7467, |
| "eval_samples_per_second": 156.871, |
| "eval_steps_per_second": 19.609, |
| "step": 9000 |
| }, |
| { |
| "epoch": 20.190740227335738, |
| "grad_norm": 0.37635141611099243, |
| "learning_rate": 5.910970323441147e-06, |
| "loss": 6.3994, |
| "step": 9100 |
| }, |
| { |
| "epoch": 20.190740227335738, |
| "eval_loss": 6.402886867523193, |
| "eval_runtime": 63.6159, |
| "eval_samples_per_second": 157.193, |
| "eval_steps_per_second": 19.649, |
| "step": 9100 |
| }, |
| { |
| "epoch": 20.412531189354034, |
| "grad_norm": 0.5809453129768372, |
| "learning_rate": 5.909969989996666e-06, |
| "loss": 6.3996, |
| "step": 9200 |
| }, |
| { |
| "epoch": 20.412531189354034, |
| "eval_loss": 6.399085998535156, |
| "eval_runtime": 66.2096, |
| "eval_samples_per_second": 151.035, |
| "eval_steps_per_second": 18.879, |
| "step": 9200 |
| }, |
| { |
| "epoch": 20.63432215137233, |
| "grad_norm": 0.535410463809967, |
| "learning_rate": 5.908969656552185e-06, |
| "loss": 6.3985, |
| "step": 9300 |
| }, |
| { |
| "epoch": 20.63432215137233, |
| "eval_loss": 6.399356842041016, |
| "eval_runtime": 63.8098, |
| "eval_samples_per_second": 156.716, |
| "eval_steps_per_second": 19.589, |
| "step": 9300 |
| }, |
| { |
| "epoch": 20.85611311339063, |
| "grad_norm": 0.5065354108810425, |
| "learning_rate": 5.907969323107703e-06, |
| "loss": 6.3993, |
| "step": 9400 |
| }, |
| { |
| "epoch": 20.85611311339063, |
| "eval_loss": 6.401696681976318, |
| "eval_runtime": 63.6775, |
| "eval_samples_per_second": 157.041, |
| "eval_steps_per_second": 19.63, |
| "step": 9400 |
| }, |
| { |
| "epoch": 21.077904075408927, |
| "grad_norm": 0.4803392291069031, |
| "learning_rate": 5.906968989663221e-06, |
| "loss": 6.4003, |
| "step": 9500 |
| }, |
| { |
| "epoch": 21.077904075408927, |
| "eval_loss": 6.399422645568848, |
| "eval_runtime": 63.6426, |
| "eval_samples_per_second": 157.127, |
| "eval_steps_per_second": 19.641, |
| "step": 9500 |
| }, |
| { |
| "epoch": 21.299695037427224, |
| "grad_norm": 0.7447142004966736, |
| "learning_rate": 5.90596865621874e-06, |
| "loss": 6.3992, |
| "step": 9600 |
| }, |
| { |
| "epoch": 21.299695037427224, |
| "eval_loss": 6.397017002105713, |
| "eval_runtime": 66.4941, |
| "eval_samples_per_second": 150.389, |
| "eval_steps_per_second": 18.799, |
| "step": 9600 |
| }, |
| { |
| "epoch": 21.521485999445524, |
| "grad_norm": 0.2856753468513489, |
| "learning_rate": 5.904968322774258e-06, |
| "loss": 6.3999, |
| "step": 9700 |
| }, |
| { |
| "epoch": 21.521485999445524, |
| "eval_loss": 6.400000095367432, |
| "eval_runtime": 63.7186, |
| "eval_samples_per_second": 156.94, |
| "eval_steps_per_second": 19.618, |
| "step": 9700 |
| }, |
| { |
| "epoch": 21.74327696146382, |
| "grad_norm": 0.8077158331871033, |
| "learning_rate": 5.9039679893297766e-06, |
| "loss": 6.3981, |
| "step": 9800 |
| }, |
| { |
| "epoch": 21.74327696146382, |
| "eval_loss": 6.398531436920166, |
| "eval_runtime": 63.7668, |
| "eval_samples_per_second": 156.821, |
| "eval_steps_per_second": 19.603, |
| "step": 9800 |
| }, |
| { |
| "epoch": 21.965067923482117, |
| "grad_norm": 0.8744412660598755, |
| "learning_rate": 5.902967655885295e-06, |
| "loss": 6.3988, |
| "step": 9900 |
| }, |
| { |
| "epoch": 21.965067923482117, |
| "eval_loss": 6.396906852722168, |
| "eval_runtime": 66.2535, |
| "eval_samples_per_second": 150.935, |
| "eval_steps_per_second": 18.867, |
| "step": 9900 |
| }, |
| { |
| "epoch": 22.186858885500417, |
| "grad_norm": 0.44601574540138245, |
| "learning_rate": 5.901967322440814e-06, |
| "loss": 6.3969, |
| "step": 10000 |
| }, |
| { |
| "epoch": 22.186858885500417, |
| "eval_loss": 6.395452976226807, |
| "eval_runtime": 63.6969, |
| "eval_samples_per_second": 156.994, |
| "eval_steps_per_second": 19.624, |
| "step": 10000 |
| }, |
| { |
| "epoch": 22.408649847518713, |
| "grad_norm": 0.6895701289176941, |
| "learning_rate": 5.900966988996333e-06, |
| "loss": 6.3967, |
| "step": 10100 |
| }, |
| { |
| "epoch": 22.408649847518713, |
| "eval_loss": 6.40028190612793, |
| "eval_runtime": 63.7023, |
| "eval_samples_per_second": 156.98, |
| "eval_steps_per_second": 19.623, |
| "step": 10100 |
| }, |
| { |
| "epoch": 22.63044080953701, |
| "grad_norm": 0.6166660189628601, |
| "learning_rate": 5.8999666555518505e-06, |
| "loss": 6.3968, |
| "step": 10200 |
| }, |
| { |
| "epoch": 22.63044080953701, |
| "eval_loss": 6.397933483123779, |
| "eval_runtime": 66.8627, |
| "eval_samples_per_second": 149.56, |
| "eval_steps_per_second": 18.695, |
| "step": 10200 |
| }, |
| { |
| "epoch": 22.85223177155531, |
| "grad_norm": 1.0633758306503296, |
| "learning_rate": 5.898966322107369e-06, |
| "loss": 6.3976, |
| "step": 10300 |
| }, |
| { |
| "epoch": 22.85223177155531, |
| "eval_loss": 6.396650791168213, |
| "eval_runtime": 63.7935, |
| "eval_samples_per_second": 156.756, |
| "eval_steps_per_second": 19.594, |
| "step": 10300 |
| }, |
| { |
| "epoch": 23.074022733573607, |
| "grad_norm": 0.4864283502101898, |
| "learning_rate": 5.897965988662888e-06, |
| "loss": 6.3967, |
| "step": 10400 |
| }, |
| { |
| "epoch": 23.074022733573607, |
| "eval_loss": 6.39711332321167, |
| "eval_runtime": 63.6284, |
| "eval_samples_per_second": 157.163, |
| "eval_steps_per_second": 19.645, |
| "step": 10400 |
| }, |
| { |
| "epoch": 23.295813695591903, |
| "grad_norm": 0.65082186460495, |
| "learning_rate": 5.896965655218406e-06, |
| "loss": 6.3973, |
| "step": 10500 |
| }, |
| { |
| "epoch": 23.295813695591903, |
| "eval_loss": 6.395853519439697, |
| "eval_runtime": 66.242, |
| "eval_samples_per_second": 150.962, |
| "eval_steps_per_second": 18.87, |
| "step": 10500 |
| }, |
| { |
| "epoch": 23.517604657610203, |
| "grad_norm": 0.45799535512924194, |
| "learning_rate": 5.8959653217739245e-06, |
| "loss": 6.396, |
| "step": 10600 |
| }, |
| { |
| "epoch": 23.517604657610203, |
| "eval_loss": 6.398243427276611, |
| "eval_runtime": 63.7686, |
| "eval_samples_per_second": 156.817, |
| "eval_steps_per_second": 19.602, |
| "step": 10600 |
| }, |
| { |
| "epoch": 23.7393956196285, |
| "grad_norm": 0.5860775709152222, |
| "learning_rate": 5.894964988329443e-06, |
| "loss": 6.3956, |
| "step": 10700 |
| }, |
| { |
| "epoch": 23.7393956196285, |
| "eval_loss": 6.3961687088012695, |
| "eval_runtime": 67.0182, |
| "eval_samples_per_second": 149.213, |
| "eval_steps_per_second": 18.652, |
| "step": 10700 |
| }, |
| { |
| "epoch": 23.9611865816468, |
| "grad_norm": 0.5584791898727417, |
| "learning_rate": 5.893964654884962e-06, |
| "loss": 6.3957, |
| "step": 10800 |
| }, |
| { |
| "epoch": 23.9611865816468, |
| "eval_loss": 6.396393775939941, |
| "eval_runtime": 63.8981, |
| "eval_samples_per_second": 156.499, |
| "eval_steps_per_second": 19.562, |
| "step": 10800 |
| }, |
| { |
| "epoch": 24.182977543665096, |
| "grad_norm": 0.7845295667648315, |
| "learning_rate": 5.892964321440481e-06, |
| "loss": 6.3956, |
| "step": 10900 |
| }, |
| { |
| "epoch": 24.182977543665096, |
| "eval_loss": 6.397210121154785, |
| "eval_runtime": 64.0302, |
| "eval_samples_per_second": 156.176, |
| "eval_steps_per_second": 19.522, |
| "step": 10900 |
| }, |
| { |
| "epoch": 24.404768505683393, |
| "grad_norm": 0.564857006072998, |
| "learning_rate": 5.8919639879959985e-06, |
| "loss": 6.3955, |
| "step": 11000 |
| }, |
| { |
| "epoch": 24.404768505683393, |
| "eval_loss": 6.395459175109863, |
| "eval_runtime": 67.2462, |
| "eval_samples_per_second": 148.707, |
| "eval_steps_per_second": 18.588, |
| "step": 11000 |
| }, |
| { |
| "epoch": 24.665372886054893, |
| "grad_norm": 0.7520161271095276, |
| "learning_rate": 4.906354515050168e-06, |
| "loss": 6.3944, |
| "step": 11100 |
| }, |
| { |
| "epoch": 24.665372886054893, |
| "eval_loss": 6.389779567718506, |
| "eval_runtime": 87.8112, |
| "eval_samples_per_second": 113.881, |
| "eval_steps_per_second": 14.235, |
| "step": 11100 |
| }, |
| { |
| "epoch": 24.88716384807319, |
| "grad_norm": 0.6003276705741882, |
| "learning_rate": 4.8963210702341136e-06, |
| "loss": 6.394, |
| "step": 11200 |
| }, |
| { |
| "epoch": 24.88716384807319, |
| "eval_loss": 6.394806861877441, |
| "eval_runtime": 75.8812, |
| "eval_samples_per_second": 131.785, |
| "eval_steps_per_second": 16.473, |
| "step": 11200 |
| }, |
| { |
| "epoch": 25.10895481009149, |
| "grad_norm": 0.28259870409965515, |
| "learning_rate": 4.88628762541806e-06, |
| "loss": 6.3945, |
| "step": 11300 |
| }, |
| { |
| "epoch": 25.10895481009149, |
| "eval_loss": 6.398300647735596, |
| "eval_runtime": 88.2774, |
| "eval_samples_per_second": 113.279, |
| "eval_steps_per_second": 14.16, |
| "step": 11300 |
| }, |
| { |
| "epoch": 25.330745772109786, |
| "grad_norm": 0.30802807211875916, |
| "learning_rate": 4.876254180602007e-06, |
| "loss": 6.3941, |
| "step": 11400 |
| }, |
| { |
| "epoch": 25.330745772109786, |
| "eval_loss": 6.394501686096191, |
| "eval_runtime": 66.156, |
| "eval_samples_per_second": 151.158, |
| "eval_steps_per_second": 18.895, |
| "step": 11400 |
| }, |
| { |
| "epoch": 25.552536734128083, |
| "grad_norm": 0.5175557732582092, |
| "learning_rate": 4.866220735785953e-06, |
| "loss": 6.394, |
| "step": 11500 |
| }, |
| { |
| "epoch": 25.552536734128083, |
| "eval_loss": 6.3985795974731445, |
| "eval_runtime": 63.6993, |
| "eval_samples_per_second": 156.988, |
| "eval_steps_per_second": 19.623, |
| "step": 11500 |
| }, |
| { |
| "epoch": 25.774327696146383, |
| "grad_norm": 0.5214359164237976, |
| "learning_rate": 4.8561872909699e-06, |
| "loss": 6.3942, |
| "step": 11600 |
| }, |
| { |
| "epoch": 25.774327696146383, |
| "eval_loss": 6.391521453857422, |
| "eval_runtime": 63.6987, |
| "eval_samples_per_second": 156.989, |
| "eval_steps_per_second": 19.624, |
| "step": 11600 |
| }, |
| { |
| "epoch": 25.99611865816468, |
| "grad_norm": 0.5827904343605042, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 6.3953, |
| "step": 11700 |
| }, |
| { |
| "epoch": 25.99611865816468, |
| "eval_loss": 6.393467903137207, |
| "eval_runtime": 66.2727, |
| "eval_samples_per_second": 150.892, |
| "eval_steps_per_second": 18.861, |
| "step": 11700 |
| }, |
| { |
| "epoch": 26.21790962018298, |
| "grad_norm": 0.24229009449481964, |
| "learning_rate": 4.8361204013377925e-06, |
| "loss": 6.3945, |
| "step": 11800 |
| }, |
| { |
| "epoch": 26.21790962018298, |
| "eval_loss": 6.39454460144043, |
| "eval_runtime": 63.6782, |
| "eval_samples_per_second": 157.04, |
| "eval_steps_per_second": 19.63, |
| "step": 11800 |
| }, |
| { |
| "epoch": 26.439700582201276, |
| "grad_norm": 0.6859923005104065, |
| "learning_rate": 4.826086956521739e-06, |
| "loss": 6.3929, |
| "step": 11900 |
| }, |
| { |
| "epoch": 26.439700582201276, |
| "eval_loss": 6.394321918487549, |
| "eval_runtime": 66.2701, |
| "eval_samples_per_second": 150.898, |
| "eval_steps_per_second": 18.862, |
| "step": 11900 |
| }, |
| { |
| "epoch": 26.661491544219572, |
| "grad_norm": 0.4267604947090149, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 6.3941, |
| "step": 12000 |
| }, |
| { |
| "epoch": 26.661491544219572, |
| "eval_loss": 6.394528865814209, |
| "eval_runtime": 63.7313, |
| "eval_samples_per_second": 156.909, |
| "eval_steps_per_second": 19.614, |
| "step": 12000 |
| }, |
| { |
| "epoch": 26.883282506237872, |
| "grad_norm": 0.43895894289016724, |
| "learning_rate": 4.806020066889633e-06, |
| "loss": 6.3929, |
| "step": 12100 |
| }, |
| { |
| "epoch": 26.883282506237872, |
| "eval_loss": 6.3936076164245605, |
| "eval_runtime": 66.3275, |
| "eval_samples_per_second": 150.767, |
| "eval_steps_per_second": 18.846, |
| "step": 12100 |
| }, |
| { |
| "epoch": 27.10507346825617, |
| "grad_norm": 0.3438960015773773, |
| "learning_rate": 4.795986622073579e-06, |
| "loss": 6.3933, |
| "step": 12200 |
| }, |
| { |
| "epoch": 27.10507346825617, |
| "eval_loss": 6.397474765777588, |
| "eval_runtime": 63.621, |
| "eval_samples_per_second": 157.181, |
| "eval_steps_per_second": 19.648, |
| "step": 12200 |
| }, |
| { |
| "epoch": 27.326864430274465, |
| "grad_norm": 0.5950188636779785, |
| "learning_rate": 4.785953177257525e-06, |
| "loss": 6.394, |
| "step": 12300 |
| }, |
| { |
| "epoch": 27.326864430274465, |
| "eval_loss": 6.393238544464111, |
| "eval_runtime": 63.6999, |
| "eval_samples_per_second": 156.986, |
| "eval_steps_per_second": 19.623, |
| "step": 12300 |
| }, |
| { |
| "epoch": 27.548655392292765, |
| "grad_norm": 0.34001484513282776, |
| "learning_rate": 4.775919732441472e-06, |
| "loss": 6.3947, |
| "step": 12400 |
| }, |
| { |
| "epoch": 27.548655392292765, |
| "eval_loss": 6.394363880157471, |
| "eval_runtime": 66.2457, |
| "eval_samples_per_second": 150.953, |
| "eval_steps_per_second": 18.869, |
| "step": 12400 |
| }, |
| { |
| "epoch": 27.770446354311062, |
| "grad_norm": 0.47045424580574036, |
| "learning_rate": 4.765886287625418e-06, |
| "loss": 6.3929, |
| "step": 12500 |
| }, |
| { |
| "epoch": 27.770446354311062, |
| "eval_loss": 6.393606185913086, |
| "eval_runtime": 63.7187, |
| "eval_samples_per_second": 156.94, |
| "eval_steps_per_second": 19.617, |
| "step": 12500 |
| }, |
| { |
| "epoch": 27.99223731632936, |
| "grad_norm": 0.6604583859443665, |
| "learning_rate": 4.755852842809365e-06, |
| "loss": 6.3931, |
| "step": 12600 |
| }, |
| { |
| "epoch": 27.99223731632936, |
| "eval_loss": 6.39324426651001, |
| "eval_runtime": 63.6887, |
| "eval_samples_per_second": 157.014, |
| "eval_steps_per_second": 19.627, |
| "step": 12600 |
| }, |
| { |
| "epoch": 28.21402827834766, |
| "grad_norm": 0.6491646766662598, |
| "learning_rate": 4.745819397993312e-06, |
| "loss": 6.3912, |
| "step": 12700 |
| }, |
| { |
| "epoch": 28.21402827834766, |
| "eval_loss": 6.394981384277344, |
| "eval_runtime": 66.2742, |
| "eval_samples_per_second": 150.888, |
| "eval_steps_per_second": 18.861, |
| "step": 12700 |
| }, |
| { |
| "epoch": 28.435819240365955, |
| "grad_norm": 0.5381952524185181, |
| "learning_rate": 4.7357859531772575e-06, |
| "loss": 6.3929, |
| "step": 12800 |
| }, |
| { |
| "epoch": 28.435819240365955, |
| "eval_loss": 6.392743110656738, |
| "eval_runtime": 63.6892, |
| "eval_samples_per_second": 157.012, |
| "eval_steps_per_second": 19.627, |
| "step": 12800 |
| }, |
| { |
| "epoch": 28.65761020238425, |
| "grad_norm": 0.7769903540611267, |
| "learning_rate": 4.725752508361204e-06, |
| "loss": 6.3927, |
| "step": 12900 |
| }, |
| { |
| "epoch": 28.65761020238425, |
| "eval_loss": 6.390952110290527, |
| "eval_runtime": 66.3226, |
| "eval_samples_per_second": 150.778, |
| "eval_steps_per_second": 18.847, |
| "step": 12900 |
| }, |
| { |
| "epoch": 28.87940116440255, |
| "grad_norm": 0.4297138452529907, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 6.393, |
| "step": 13000 |
| }, |
| { |
| "epoch": 28.87940116440255, |
| "eval_loss": 6.390758037567139, |
| "eval_runtime": 63.8216, |
| "eval_samples_per_second": 156.687, |
| "eval_steps_per_second": 19.586, |
| "step": 13000 |
| }, |
| { |
| "epoch": 29.101192126420848, |
| "grad_norm": 0.7731721997261047, |
| "learning_rate": 4.705685618729097e-06, |
| "loss": 6.3923, |
| "step": 13100 |
| }, |
| { |
| "epoch": 29.101192126420848, |
| "eval_loss": 6.392960071563721, |
| "eval_runtime": 63.6867, |
| "eval_samples_per_second": 157.019, |
| "eval_steps_per_second": 19.627, |
| "step": 13100 |
| }, |
| { |
| "epoch": 29.322983088439145, |
| "grad_norm": 0.27714040875434875, |
| "learning_rate": 4.695652173913044e-06, |
| "loss": 6.3934, |
| "step": 13200 |
| }, |
| { |
| "epoch": 29.322983088439145, |
| "eval_loss": 6.395288944244385, |
| "eval_runtime": 66.2909, |
| "eval_samples_per_second": 150.85, |
| "eval_steps_per_second": 18.856, |
| "step": 13200 |
| }, |
| { |
| "epoch": 29.544774050457445, |
| "grad_norm": 0.5391174554824829, |
| "learning_rate": 4.6856187290969905e-06, |
| "loss": 6.3927, |
| "step": 13300 |
| }, |
| { |
| "epoch": 29.544774050457445, |
| "eval_loss": 6.395300388336182, |
| "eval_runtime": 63.6935, |
| "eval_samples_per_second": 157.002, |
| "eval_steps_per_second": 19.625, |
| "step": 13300 |
| }, |
| { |
| "epoch": 29.76656501247574, |
| "grad_norm": 0.9717122912406921, |
| "learning_rate": 4.675585284280936e-06, |
| "loss": 6.391, |
| "step": 13400 |
| }, |
| { |
| "epoch": 29.76656501247574, |
| "eval_loss": 6.3939642906188965, |
| "eval_runtime": 64.4676, |
| "eval_samples_per_second": 155.117, |
| "eval_steps_per_second": 19.39, |
| "step": 13400 |
| }, |
| { |
| "epoch": 29.988355974494038, |
| "grad_norm": 0.3409580588340759, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 6.3929, |
| "step": 13500 |
| }, |
| { |
| "epoch": 29.988355974494038, |
| "eval_loss": 6.393261909484863, |
| "eval_runtime": 65.5531, |
| "eval_samples_per_second": 152.548, |
| "eval_steps_per_second": 19.069, |
| "step": 13500 |
| }, |
| { |
| "epoch": 30.210146936512338, |
| "grad_norm": 0.7017607092857361, |
| "learning_rate": 4.65551839464883e-06, |
| "loss": 6.3914, |
| "step": 13600 |
| }, |
| { |
| "epoch": 30.210146936512338, |
| "eval_loss": 6.389814853668213, |
| "eval_runtime": 63.5889, |
| "eval_samples_per_second": 157.26, |
| "eval_steps_per_second": 19.658, |
| "step": 13600 |
| }, |
| { |
| "epoch": 30.431937898530634, |
| "grad_norm": 0.494228720664978, |
| "learning_rate": 4.645484949832776e-06, |
| "loss": 6.3913, |
| "step": 13700 |
| }, |
| { |
| "epoch": 30.431937898530634, |
| "eval_loss": 6.389814853668213, |
| "eval_runtime": 63.6983, |
| "eval_samples_per_second": 156.99, |
| "eval_steps_per_second": 19.624, |
| "step": 13700 |
| }, |
| { |
| "epoch": 30.65372886054893, |
| "grad_norm": 0.6848724484443665, |
| "learning_rate": 4.635451505016723e-06, |
| "loss": 6.3909, |
| "step": 13800 |
| }, |
| { |
| "epoch": 30.65372886054893, |
| "eval_loss": 6.391334533691406, |
| "eval_runtime": 66.3245, |
| "eval_samples_per_second": 150.774, |
| "eval_steps_per_second": 18.847, |
| "step": 13800 |
| }, |
| { |
| "epoch": 30.87551982256723, |
| "grad_norm": 0.5187550187110901, |
| "learning_rate": 4.625418060200669e-06, |
| "loss": 6.3905, |
| "step": 13900 |
| }, |
| { |
| "epoch": 30.87551982256723, |
| "eval_loss": 6.393035411834717, |
| "eval_runtime": 63.667, |
| "eval_samples_per_second": 157.067, |
| "eval_steps_per_second": 19.633, |
| "step": 13900 |
| }, |
| { |
| "epoch": 31.097310784585527, |
| "grad_norm": 0.4394451081752777, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 6.3902, |
| "step": 14000 |
| }, |
| { |
| "epoch": 31.097310784585527, |
| "eval_loss": 6.391651630401611, |
| "eval_runtime": 66.2607, |
| "eval_samples_per_second": 150.919, |
| "eval_steps_per_second": 18.865, |
| "step": 14000 |
| }, |
| { |
| "epoch": 31.319101746603828, |
| "grad_norm": 0.6403105854988098, |
| "learning_rate": 4.605351170568562e-06, |
| "loss": 6.3904, |
| "step": 14100 |
| }, |
| { |
| "epoch": 31.319101746603828, |
| "eval_loss": 6.390075206756592, |
| "eval_runtime": 63.7818, |
| "eval_samples_per_second": 156.785, |
| "eval_steps_per_second": 19.598, |
| "step": 14100 |
| }, |
| { |
| "epoch": 31.540892708622124, |
| "grad_norm": 0.41991308331489563, |
| "learning_rate": 4.595317725752509e-06, |
| "loss": 6.3915, |
| "step": 14200 |
| }, |
| { |
| "epoch": 31.540892708622124, |
| "eval_loss": 6.390388488769531, |
| "eval_runtime": 66.3061, |
| "eval_samples_per_second": 150.816, |
| "eval_steps_per_second": 18.852, |
| "step": 14200 |
| }, |
| { |
| "epoch": 31.76268367064042, |
| "grad_norm": 0.5049502849578857, |
| "learning_rate": 4.585284280936456e-06, |
| "loss": 6.3901, |
| "step": 14300 |
| }, |
| { |
| "epoch": 31.76268367064042, |
| "eval_loss": 6.394845485687256, |
| "eval_runtime": 63.7361, |
| "eval_samples_per_second": 156.897, |
| "eval_steps_per_second": 19.612, |
| "step": 14300 |
| }, |
| { |
| "epoch": 31.98447463265872, |
| "grad_norm": 0.5375522375106812, |
| "learning_rate": 4.5752508361204015e-06, |
| "loss": 6.3901, |
| "step": 14400 |
| }, |
| { |
| "epoch": 31.98447463265872, |
| "eval_loss": 6.3919267654418945, |
| "eval_runtime": 63.6609, |
| "eval_samples_per_second": 157.082, |
| "eval_steps_per_second": 19.635, |
| "step": 14400 |
| }, |
| { |
| "epoch": 32.206265594677014, |
| "grad_norm": 0.6649445295333862, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 6.3897, |
| "step": 14500 |
| }, |
| { |
| "epoch": 32.206265594677014, |
| "eval_loss": 6.391171932220459, |
| "eval_runtime": 66.188, |
| "eval_samples_per_second": 151.085, |
| "eval_steps_per_second": 18.886, |
| "step": 14500 |
| }, |
| { |
| "epoch": 32.42805655669532, |
| "grad_norm": 0.5367133021354675, |
| "learning_rate": 4.555183946488295e-06, |
| "loss": 6.3903, |
| "step": 14600 |
| }, |
| { |
| "epoch": 32.42805655669532, |
| "eval_loss": 6.390655517578125, |
| "eval_runtime": 63.747, |
| "eval_samples_per_second": 156.87, |
| "eval_steps_per_second": 19.609, |
| "step": 14600 |
| }, |
| { |
| "epoch": 32.649847518713614, |
| "grad_norm": 0.5683135986328125, |
| "learning_rate": 4.545150501672241e-06, |
| "loss": 6.3881, |
| "step": 14700 |
| }, |
| { |
| "epoch": 32.649847518713614, |
| "eval_loss": 6.387674808502197, |
| "eval_runtime": 63.678, |
| "eval_samples_per_second": 157.04, |
| "eval_steps_per_second": 19.63, |
| "step": 14700 |
| }, |
| { |
| "epoch": 32.87163848073191, |
| "grad_norm": 0.697325587272644, |
| "learning_rate": 4.535117056856188e-06, |
| "loss": 6.3908, |
| "step": 14800 |
| }, |
| { |
| "epoch": 32.87163848073191, |
| "eval_loss": 6.393805027008057, |
| "eval_runtime": 63.7212, |
| "eval_samples_per_second": 156.934, |
| "eval_steps_per_second": 19.617, |
| "step": 14800 |
| }, |
| { |
| "epoch": 33.09342944275021, |
| "grad_norm": 0.5757908225059509, |
| "learning_rate": 4.5250836120401345e-06, |
| "loss": 6.3907, |
| "step": 14900 |
| }, |
| { |
| "epoch": 33.09342944275021, |
| "eval_loss": 6.393499851226807, |
| "eval_runtime": 66.2096, |
| "eval_samples_per_second": 151.035, |
| "eval_steps_per_second": 18.879, |
| "step": 14900 |
| }, |
| { |
| "epoch": 33.3152204047685, |
| "grad_norm": 0.3517054319381714, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 6.3902, |
| "step": 15000 |
| }, |
| { |
| "epoch": 33.3152204047685, |
| "eval_loss": 6.386899471282959, |
| "eval_runtime": 63.7082, |
| "eval_samples_per_second": 156.966, |
| "eval_steps_per_second": 19.621, |
| "step": 15000 |
| }, |
| { |
| "epoch": 33.53701136678681, |
| "grad_norm": 0.7311076521873474, |
| "learning_rate": 4.505016722408027e-06, |
| "loss": 6.3905, |
| "step": 15100 |
| }, |
| { |
| "epoch": 33.53701136678681, |
| "eval_loss": 6.391955375671387, |
| "eval_runtime": 63.6711, |
| "eval_samples_per_second": 157.057, |
| "eval_steps_per_second": 19.632, |
| "step": 15100 |
| }, |
| { |
| "epoch": 33.7588023288051, |
| "grad_norm": 0.4526328444480896, |
| "learning_rate": 4.494983277591973e-06, |
| "loss": 6.3891, |
| "step": 15200 |
| }, |
| { |
| "epoch": 33.7588023288051, |
| "eval_loss": 6.390474796295166, |
| "eval_runtime": 66.2489, |
| "eval_samples_per_second": 150.946, |
| "eval_steps_per_second": 18.868, |
| "step": 15200 |
| }, |
| { |
| "epoch": 33.9805932908234, |
| "grad_norm": 0.5623629093170166, |
| "learning_rate": 4.48494983277592e-06, |
| "loss": 6.3901, |
| "step": 15300 |
| }, |
| { |
| "epoch": 33.9805932908234, |
| "eval_loss": 6.388679027557373, |
| "eval_runtime": 63.6854, |
| "eval_samples_per_second": 157.022, |
| "eval_steps_per_second": 19.628, |
| "step": 15300 |
| }, |
| { |
| "epoch": 34.202384252841696, |
| "grad_norm": 0.49122416973114014, |
| "learning_rate": 4.474916387959866e-06, |
| "loss": 6.389, |
| "step": 15400 |
| }, |
| { |
| "epoch": 34.202384252841696, |
| "eval_loss": 6.39013671875, |
| "eval_runtime": 63.5858, |
| "eval_samples_per_second": 157.268, |
| "eval_steps_per_second": 19.658, |
| "step": 15400 |
| }, |
| { |
| "epoch": 34.42417521485999, |
| "grad_norm": 0.674659013748169, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 6.3887, |
| "step": 15500 |
| }, |
| { |
| "epoch": 34.42417521485999, |
| "eval_loss": 6.392813205718994, |
| "eval_runtime": 66.2307, |
| "eval_samples_per_second": 150.987, |
| "eval_steps_per_second": 18.873, |
| "step": 15500 |
| }, |
| { |
| "epoch": 34.64596617687829, |
| "grad_norm": 0.43613201379776, |
| "learning_rate": 4.454849498327759e-06, |
| "loss": 6.3889, |
| "step": 15600 |
| }, |
| { |
| "epoch": 34.64596617687829, |
| "eval_loss": 6.388660907745361, |
| "eval_runtime": 63.6774, |
| "eval_samples_per_second": 157.042, |
| "eval_steps_per_second": 19.63, |
| "step": 15600 |
| }, |
| { |
| "epoch": 34.86775713889659, |
| "grad_norm": 0.737578272819519, |
| "learning_rate": 4.444816053511705e-06, |
| "loss": 6.3894, |
| "step": 15700 |
| }, |
| { |
| "epoch": 34.86775713889659, |
| "eval_loss": 6.389644145965576, |
| "eval_runtime": 63.7079, |
| "eval_samples_per_second": 156.966, |
| "eval_steps_per_second": 19.621, |
| "step": 15700 |
| }, |
| { |
| "epoch": 35.08954810091489, |
| "grad_norm": 0.4716251790523529, |
| "learning_rate": 4.434782608695652e-06, |
| "loss": 6.3885, |
| "step": 15800 |
| }, |
| { |
| "epoch": 35.08954810091489, |
| "eval_loss": 6.392263412475586, |
| "eval_runtime": 66.1971, |
| "eval_samples_per_second": 151.064, |
| "eval_steps_per_second": 18.883, |
| "step": 15800 |
| }, |
| { |
| "epoch": 35.311339062933186, |
| "grad_norm": 0.47875767946243286, |
| "learning_rate": 4.424749163879599e-06, |
| "loss": 6.3886, |
| "step": 15900 |
| }, |
| { |
| "epoch": 35.311339062933186, |
| "eval_loss": 6.389831066131592, |
| "eval_runtime": 63.6821, |
| "eval_samples_per_second": 157.03, |
| "eval_steps_per_second": 19.629, |
| "step": 15900 |
| }, |
| { |
| "epoch": 35.53313002495148, |
| "grad_norm": 0.43402403593063354, |
| "learning_rate": 4.414715719063545e-06, |
| "loss": 6.3909, |
| "step": 16000 |
| }, |
| { |
| "epoch": 35.53313002495148, |
| "eval_loss": 6.389725208282471, |
| "eval_runtime": 63.7124, |
| "eval_samples_per_second": 156.955, |
| "eval_steps_per_second": 19.619, |
| "step": 16000 |
| }, |
| { |
| "epoch": 35.75492098696978, |
| "grad_norm": 0.5011460781097412, |
| "learning_rate": 4.404682274247491e-06, |
| "loss": 6.3891, |
| "step": 16100 |
| }, |
| { |
| "epoch": 35.75492098696978, |
| "eval_loss": 6.388359546661377, |
| "eval_runtime": 66.2636, |
| "eval_samples_per_second": 150.912, |
| "eval_steps_per_second": 18.864, |
| "step": 16100 |
| }, |
| { |
| "epoch": 35.976711948988076, |
| "grad_norm": 0.4029878079891205, |
| "learning_rate": 4.394648829431438e-06, |
| "loss": 6.3875, |
| "step": 16200 |
| }, |
| { |
| "epoch": 35.976711948988076, |
| "eval_loss": 6.387814044952393, |
| "eval_runtime": 63.7085, |
| "eval_samples_per_second": 156.965, |
| "eval_steps_per_second": 19.621, |
| "step": 16200 |
| }, |
| { |
| "epoch": 36.19850291100638, |
| "grad_norm": 0.5763450264930725, |
| "learning_rate": 4.384615384615384e-06, |
| "loss": 6.3889, |
| "step": 16300 |
| }, |
| { |
| "epoch": 36.19850291100638, |
| "eval_loss": 6.389321327209473, |
| "eval_runtime": 65.8717, |
| "eval_samples_per_second": 151.81, |
| "eval_steps_per_second": 18.976, |
| "step": 16300 |
| }, |
| { |
| "epoch": 36.420293873024676, |
| "grad_norm": 0.4742737412452698, |
| "learning_rate": 4.374581939799331e-06, |
| "loss": 6.3886, |
| "step": 16400 |
| }, |
| { |
| "epoch": 36.420293873024676, |
| "eval_loss": 6.388833522796631, |
| "eval_runtime": 63.752, |
| "eval_samples_per_second": 156.858, |
| "eval_steps_per_second": 19.607, |
| "step": 16400 |
| }, |
| { |
| "epoch": 36.64208483504297, |
| "grad_norm": 0.4631459414958954, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 6.3886, |
| "step": 16500 |
| }, |
| { |
| "epoch": 36.64208483504297, |
| "eval_loss": 6.387075901031494, |
| "eval_runtime": 63.6816, |
| "eval_samples_per_second": 157.031, |
| "eval_steps_per_second": 19.629, |
| "step": 16500 |
| }, |
| { |
| "epoch": 36.86387579706127, |
| "grad_norm": 0.5047929286956787, |
| "learning_rate": 4.354515050167224e-06, |
| "loss": 6.3869, |
| "step": 16600 |
| }, |
| { |
| "epoch": 36.86387579706127, |
| "eval_loss": 6.39074182510376, |
| "eval_runtime": 64.7171, |
| "eval_samples_per_second": 154.519, |
| "eval_steps_per_second": 19.315, |
| "step": 16600 |
| }, |
| { |
| "epoch": 37.085666759079565, |
| "grad_norm": 0.45218634605407715, |
| "learning_rate": 4.34448160535117e-06, |
| "loss": 6.3894, |
| "step": 16700 |
| }, |
| { |
| "epoch": 37.085666759079565, |
| "eval_loss": 6.393436908721924, |
| "eval_runtime": 64.9705, |
| "eval_samples_per_second": 153.916, |
| "eval_steps_per_second": 19.24, |
| "step": 16700 |
| }, |
| { |
| "epoch": 37.30745772109786, |
| "grad_norm": 0.5652719736099243, |
| "learning_rate": 4.334448160535117e-06, |
| "loss": 6.3873, |
| "step": 16800 |
| }, |
| { |
| "epoch": 37.30745772109786, |
| "eval_loss": 6.391731262207031, |
| "eval_runtime": 63.565, |
| "eval_samples_per_second": 157.319, |
| "eval_steps_per_second": 19.665, |
| "step": 16800 |
| }, |
| { |
| "epoch": 37.529248683116165, |
| "grad_norm": 0.28403371572494507, |
| "learning_rate": 4.324414715719064e-06, |
| "loss": 6.3882, |
| "step": 16900 |
| }, |
| { |
| "epoch": 37.529248683116165, |
| "eval_loss": 6.390590190887451, |
| "eval_runtime": 63.6107, |
| "eval_samples_per_second": 157.206, |
| "eval_steps_per_second": 19.651, |
| "step": 16900 |
| }, |
| { |
| "epoch": 37.75103964513446, |
| "grad_norm": 0.477235347032547, |
| "learning_rate": 4.31438127090301e-06, |
| "loss": 6.3872, |
| "step": 17000 |
| }, |
| { |
| "epoch": 37.75103964513446, |
| "eval_loss": 6.390269756317139, |
| "eval_runtime": 66.2763, |
| "eval_samples_per_second": 150.884, |
| "eval_steps_per_second": 18.86, |
| "step": 17000 |
| }, |
| { |
| "epoch": 37.97283060715276, |
| "grad_norm": 0.37472817301750183, |
| "learning_rate": 4.3043478260869565e-06, |
| "loss": 6.3874, |
| "step": 17100 |
| }, |
| { |
| "epoch": 37.97283060715276, |
| "eval_loss": 6.390199184417725, |
| "eval_runtime": 63.6243, |
| "eval_samples_per_second": 157.173, |
| "eval_steps_per_second": 19.647, |
| "step": 17100 |
| }, |
| { |
| "epoch": 38.194621569171055, |
| "grad_norm": 0.3379691243171692, |
| "learning_rate": 4.294314381270903e-06, |
| "loss": 6.387, |
| "step": 17200 |
| }, |
| { |
| "epoch": 38.194621569171055, |
| "eval_loss": 6.386340618133545, |
| "eval_runtime": 63.5571, |
| "eval_samples_per_second": 157.339, |
| "eval_steps_per_second": 19.667, |
| "step": 17200 |
| }, |
| { |
| "epoch": 38.41641253118935, |
| "grad_norm": 0.46496257185935974, |
| "learning_rate": 4.284280936454849e-06, |
| "loss": 6.3856, |
| "step": 17300 |
| }, |
| { |
| "epoch": 38.41641253118935, |
| "eval_loss": 6.3855695724487305, |
| "eval_runtime": 65.9737, |
| "eval_samples_per_second": 151.576, |
| "eval_steps_per_second": 18.947, |
| "step": 17300 |
| }, |
| { |
| "epoch": 38.638203493207655, |
| "grad_norm": 0.37888166308403015, |
| "learning_rate": 4.274247491638796e-06, |
| "loss": 6.3884, |
| "step": 17400 |
| }, |
| { |
| "epoch": 38.638203493207655, |
| "eval_loss": 6.388376235961914, |
| "eval_runtime": 63.6302, |
| "eval_samples_per_second": 157.158, |
| "eval_steps_per_second": 19.645, |
| "step": 17400 |
| }, |
| { |
| "epoch": 38.85999445522595, |
| "grad_norm": 0.25813955068588257, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 6.3885, |
| "step": 17500 |
| }, |
| { |
| "epoch": 38.85999445522595, |
| "eval_loss": 6.389296054840088, |
| "eval_runtime": 63.7359, |
| "eval_samples_per_second": 156.897, |
| "eval_steps_per_second": 19.612, |
| "step": 17500 |
| }, |
| { |
| "epoch": 39.08178541724425, |
| "grad_norm": 0.4262288510799408, |
| "learning_rate": 4.254180602006689e-06, |
| "loss": 6.3873, |
| "step": 17600 |
| }, |
| { |
| "epoch": 39.08178541724425, |
| "eval_loss": 6.389705657958984, |
| "eval_runtime": 66.0125, |
| "eval_samples_per_second": 151.486, |
| "eval_steps_per_second": 18.936, |
| "step": 17600 |
| }, |
| { |
| "epoch": 39.303576379262545, |
| "grad_norm": 0.5291593074798584, |
| "learning_rate": 4.244147157190635e-06, |
| "loss": 6.3875, |
| "step": 17700 |
| }, |
| { |
| "epoch": 39.303576379262545, |
| "eval_loss": 6.390807628631592, |
| "eval_runtime": 63.619, |
| "eval_samples_per_second": 157.186, |
| "eval_steps_per_second": 19.648, |
| "step": 17700 |
| }, |
| { |
| "epoch": 39.52536734128084, |
| "grad_norm": 0.3667999505996704, |
| "learning_rate": 4.234113712374582e-06, |
| "loss": 6.3887, |
| "step": 17800 |
| }, |
| { |
| "epoch": 39.52536734128084, |
| "eval_loss": 6.3871259689331055, |
| "eval_runtime": 63.6878, |
| "eval_samples_per_second": 157.016, |
| "eval_steps_per_second": 19.627, |
| "step": 17800 |
| }, |
| { |
| "epoch": 39.74715830329914, |
| "grad_norm": 0.40572404861450195, |
| "learning_rate": 4.224080267558528e-06, |
| "loss": 6.3877, |
| "step": 17900 |
| }, |
| { |
| "epoch": 39.74715830329914, |
| "eval_loss": 6.387050628662109, |
| "eval_runtime": 66.0715, |
| "eval_samples_per_second": 151.351, |
| "eval_steps_per_second": 18.919, |
| "step": 17900 |
| }, |
| { |
| "epoch": 39.96894926531744, |
| "grad_norm": 0.5057101845741272, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 6.385, |
| "step": 18000 |
| }, |
| { |
| "epoch": 39.96894926531744, |
| "eval_loss": 6.388771057128906, |
| "eval_runtime": 63.6312, |
| "eval_samples_per_second": 157.156, |
| "eval_steps_per_second": 19.644, |
| "step": 18000 |
| }, |
| { |
| "epoch": 40.19074022733574, |
| "grad_norm": 0.5846272110939026, |
| "learning_rate": 4.2040133779264216e-06, |
| "loss": 6.3873, |
| "step": 18100 |
| }, |
| { |
| "epoch": 40.19074022733574, |
| "eval_loss": 6.388961315155029, |
| "eval_runtime": 63.6009, |
| "eval_samples_per_second": 157.23, |
| "eval_steps_per_second": 19.654, |
| "step": 18100 |
| }, |
| { |
| "epoch": 40.412531189354034, |
| "grad_norm": 0.40428778529167175, |
| "learning_rate": 4.1939799331103675e-06, |
| "loss": 6.3878, |
| "step": 18200 |
| }, |
| { |
| "epoch": 40.412531189354034, |
| "eval_loss": 6.392088413238525, |
| "eval_runtime": 66.128, |
| "eval_samples_per_second": 151.222, |
| "eval_steps_per_second": 18.903, |
| "step": 18200 |
| }, |
| { |
| "epoch": 40.63432215137233, |
| "grad_norm": 0.46563634276390076, |
| "learning_rate": 4.183946488294314e-06, |
| "loss": 6.386, |
| "step": 18300 |
| }, |
| { |
| "epoch": 40.63432215137233, |
| "eval_loss": 6.389146327972412, |
| "eval_runtime": 63.6612, |
| "eval_samples_per_second": 157.082, |
| "eval_steps_per_second": 19.635, |
| "step": 18300 |
| }, |
| { |
| "epoch": 40.85611311339063, |
| "grad_norm": 0.4533691704273224, |
| "learning_rate": 4.173913043478261e-06, |
| "loss": 6.3874, |
| "step": 18400 |
| }, |
| { |
| "epoch": 40.85611311339063, |
| "eval_loss": 6.386475086212158, |
| "eval_runtime": 63.7394, |
| "eval_samples_per_second": 156.889, |
| "eval_steps_per_second": 19.611, |
| "step": 18400 |
| }, |
| { |
| "epoch": 41.077904075408924, |
| "grad_norm": 0.38121113181114197, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 6.3862, |
| "step": 18500 |
| }, |
| { |
| "epoch": 41.077904075408924, |
| "eval_loss": 6.384340763092041, |
| "eval_runtime": 65.9841, |
| "eval_samples_per_second": 151.552, |
| "eval_steps_per_second": 18.944, |
| "step": 18500 |
| }, |
| { |
| "epoch": 41.29969503742723, |
| "grad_norm": 0.4599936604499817, |
| "learning_rate": 4.153846153846154e-06, |
| "loss": 6.3871, |
| "step": 18600 |
| }, |
| { |
| "epoch": 41.29969503742723, |
| "eval_loss": 6.38564395904541, |
| "eval_runtime": 63.6008, |
| "eval_samples_per_second": 157.231, |
| "eval_steps_per_second": 19.654, |
| "step": 18600 |
| }, |
| { |
| "epoch": 41.521485999445524, |
| "grad_norm": 0.6862403154373169, |
| "learning_rate": 4.1438127090301005e-06, |
| "loss": 6.3867, |
| "step": 18700 |
| }, |
| { |
| "epoch": 41.521485999445524, |
| "eval_loss": 6.385303020477295, |
| "eval_runtime": 63.6207, |
| "eval_samples_per_second": 157.181, |
| "eval_steps_per_second": 19.648, |
| "step": 18700 |
| }, |
| { |
| "epoch": 41.74327696146382, |
| "grad_norm": 0.26633918285369873, |
| "learning_rate": 4.133779264214047e-06, |
| "loss": 6.3869, |
| "step": 18800 |
| }, |
| { |
| "epoch": 41.74327696146382, |
| "eval_loss": 6.389577388763428, |
| "eval_runtime": 66.0775, |
| "eval_samples_per_second": 151.337, |
| "eval_steps_per_second": 18.917, |
| "step": 18800 |
| }, |
| { |
| "epoch": 41.96506792348212, |
| "grad_norm": 0.30118024349212646, |
| "learning_rate": 4.123745819397993e-06, |
| "loss": 6.3869, |
| "step": 18900 |
| }, |
| { |
| "epoch": 41.96506792348212, |
| "eval_loss": 6.387940406799316, |
| "eval_runtime": 63.6813, |
| "eval_samples_per_second": 157.032, |
| "eval_steps_per_second": 19.629, |
| "step": 18900 |
| }, |
| { |
| "epoch": 42.18685888550041, |
| "grad_norm": 0.6833294630050659, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 6.3857, |
| "step": 19000 |
| }, |
| { |
| "epoch": 42.18685888550041, |
| "eval_loss": 6.3908514976501465, |
| "eval_runtime": 66.0844, |
| "eval_samples_per_second": 151.322, |
| "eval_steps_per_second": 18.915, |
| "step": 19000 |
| }, |
| { |
| "epoch": 42.40864984751872, |
| "grad_norm": 0.35510268807411194, |
| "learning_rate": 4.103678929765887e-06, |
| "loss": 6.3862, |
| "step": 19100 |
| }, |
| { |
| "epoch": 42.40864984751872, |
| "eval_loss": 6.3866119384765625, |
| "eval_runtime": 63.7625, |
| "eval_samples_per_second": 156.832, |
| "eval_steps_per_second": 19.604, |
| "step": 19100 |
| }, |
| { |
| "epoch": 42.63044080953701, |
| "grad_norm": 0.5903100371360779, |
| "learning_rate": 4.0936454849498326e-06, |
| "loss": 6.3857, |
| "step": 19200 |
| }, |
| { |
| "epoch": 42.63044080953701, |
| "eval_loss": 6.385927677154541, |
| "eval_runtime": 63.6174, |
| "eval_samples_per_second": 157.19, |
| "eval_steps_per_second": 19.649, |
| "step": 19200 |
| }, |
| { |
| "epoch": 42.85223177155531, |
| "grad_norm": 0.4845108091831207, |
| "learning_rate": 4.083612040133779e-06, |
| "loss": 6.387, |
| "step": 19300 |
| }, |
| { |
| "epoch": 42.85223177155531, |
| "eval_loss": 6.38942289352417, |
| "eval_runtime": 66.1264, |
| "eval_samples_per_second": 151.226, |
| "eval_steps_per_second": 18.903, |
| "step": 19300 |
| }, |
| { |
| "epoch": 43.07402273357361, |
| "grad_norm": 0.3592558801174164, |
| "learning_rate": 4.073578595317726e-06, |
| "loss": 6.3862, |
| "step": 19400 |
| }, |
| { |
| "epoch": 43.07402273357361, |
| "eval_loss": 6.389144420623779, |
| "eval_runtime": 63.5655, |
| "eval_samples_per_second": 157.318, |
| "eval_steps_per_second": 19.665, |
| "step": 19400 |
| }, |
| { |
| "epoch": 43.2958136955919, |
| "grad_norm": 0.5529589056968689, |
| "learning_rate": 4.063545150501672e-06, |
| "loss": 6.3842, |
| "step": 19500 |
| }, |
| { |
| "epoch": 43.2958136955919, |
| "eval_loss": 6.386436939239502, |
| "eval_runtime": 66.2264, |
| "eval_samples_per_second": 150.997, |
| "eval_steps_per_second": 18.875, |
| "step": 19500 |
| }, |
| { |
| "epoch": 43.5176046576102, |
| "grad_norm": 0.42238518595695496, |
| "learning_rate": 4.053511705685619e-06, |
| "loss": 6.3866, |
| "step": 19600 |
| }, |
| { |
| "epoch": 43.5176046576102, |
| "eval_loss": 6.385384559631348, |
| "eval_runtime": 63.7683, |
| "eval_samples_per_second": 156.818, |
| "eval_steps_per_second": 19.602, |
| "step": 19600 |
| }, |
| { |
| "epoch": 43.7393956196285, |
| "grad_norm": 0.5223355293273926, |
| "learning_rate": 4.0434782608695655e-06, |
| "loss": 6.3853, |
| "step": 19700 |
| }, |
| { |
| "epoch": 43.7393956196285, |
| "eval_loss": 6.385824203491211, |
| "eval_runtime": 63.6506, |
| "eval_samples_per_second": 157.108, |
| "eval_steps_per_second": 19.638, |
| "step": 19700 |
| }, |
| { |
| "epoch": 43.9611865816468, |
| "grad_norm": 0.46218928694725037, |
| "learning_rate": 4.0334448160535115e-06, |
| "loss": 6.387, |
| "step": 19800 |
| }, |
| { |
| "epoch": 43.9611865816468, |
| "eval_loss": 6.38681697845459, |
| "eval_runtime": 66.1858, |
| "eval_samples_per_second": 151.09, |
| "eval_steps_per_second": 18.886, |
| "step": 19800 |
| }, |
| { |
| "epoch": 44.182977543665096, |
| "grad_norm": 0.3450022041797638, |
| "learning_rate": 4.023411371237458e-06, |
| "loss": 6.3845, |
| "step": 19900 |
| }, |
| { |
| "epoch": 44.182977543665096, |
| "eval_loss": 6.386622428894043, |
| "eval_runtime": 63.5361, |
| "eval_samples_per_second": 157.391, |
| "eval_steps_per_second": 19.674, |
| "step": 19900 |
| }, |
| { |
| "epoch": 44.40476850568339, |
| "grad_norm": 0.39958134293556213, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 6.3863, |
| "step": 20000 |
| }, |
| { |
| "epoch": 44.40476850568339, |
| "eval_loss": 6.387628555297852, |
| "eval_runtime": 63.6316, |
| "eval_samples_per_second": 157.155, |
| "eval_steps_per_second": 19.644, |
| "step": 20000 |
| }, |
| { |
| "epoch": 44.62655946770169, |
| "grad_norm": 0.28472310304641724, |
| "learning_rate": 4.003344481605351e-06, |
| "loss": 6.3851, |
| "step": 20100 |
| }, |
| { |
| "epoch": 44.62655946770169, |
| "eval_loss": 6.388401031494141, |
| "eval_runtime": 63.6958, |
| "eval_samples_per_second": 156.996, |
| "eval_steps_per_second": 19.625, |
| "step": 20100 |
| }, |
| { |
| "epoch": 44.848350429719986, |
| "grad_norm": 0.39134547114372253, |
| "learning_rate": 3.993311036789298e-06, |
| "loss": 6.3849, |
| "step": 20200 |
| }, |
| { |
| "epoch": 44.848350429719986, |
| "eval_loss": 6.389621734619141, |
| "eval_runtime": 66.137, |
| "eval_samples_per_second": 151.201, |
| "eval_steps_per_second": 18.9, |
| "step": 20200 |
| }, |
| { |
| "epoch": 45.07014139173829, |
| "grad_norm": 0.5134591460227966, |
| "learning_rate": 3.9832775919732444e-06, |
| "loss": 6.3847, |
| "step": 20300 |
| }, |
| { |
| "epoch": 45.07014139173829, |
| "eval_loss": 6.387813568115234, |
| "eval_runtime": 63.5686, |
| "eval_samples_per_second": 157.31, |
| "eval_steps_per_second": 19.664, |
| "step": 20300 |
| }, |
| { |
| "epoch": 45.291932353756586, |
| "grad_norm": 0.2885007858276367, |
| "learning_rate": 3.97324414715719e-06, |
| "loss": 6.3865, |
| "step": 20400 |
| }, |
| { |
| "epoch": 45.291932353756586, |
| "eval_loss": 6.389806270599365, |
| "eval_runtime": 63.5893, |
| "eval_samples_per_second": 157.259, |
| "eval_steps_per_second": 19.657, |
| "step": 20400 |
| }, |
| { |
| "epoch": 45.51372331577488, |
| "grad_norm": 0.37093526124954224, |
| "learning_rate": 3.963210702341137e-06, |
| "loss": 6.3842, |
| "step": 20500 |
| }, |
| { |
| "epoch": 45.51372331577488, |
| "eval_loss": 6.386034965515137, |
| "eval_runtime": 66.1094, |
| "eval_samples_per_second": 151.265, |
| "eval_steps_per_second": 18.908, |
| "step": 20500 |
| }, |
| { |
| "epoch": 45.73551427779318, |
| "grad_norm": 0.4181094169616699, |
| "learning_rate": 3.953177257525084e-06, |
| "loss": 6.3827, |
| "step": 20600 |
| }, |
| { |
| "epoch": 45.73551427779318, |
| "eval_loss": 6.386598587036133, |
| "eval_runtime": 63.6628, |
| "eval_samples_per_second": 157.078, |
| "eval_steps_per_second": 19.635, |
| "step": 20600 |
| }, |
| { |
| "epoch": 45.957305239811475, |
| "grad_norm": 0.6212390661239624, |
| "learning_rate": 3.943143812709031e-06, |
| "loss": 6.3864, |
| "step": 20700 |
| }, |
| { |
| "epoch": 45.957305239811475, |
| "eval_loss": 6.3882646560668945, |
| "eval_runtime": 65.9973, |
| "eval_samples_per_second": 151.521, |
| "eval_steps_per_second": 18.94, |
| "step": 20700 |
| }, |
| { |
| "epoch": 46.17909620182977, |
| "grad_norm": 0.443857878446579, |
| "learning_rate": 3.9331103678929765e-06, |
| "loss": 6.3859, |
| "step": 20800 |
| }, |
| { |
| "epoch": 46.17909620182977, |
| "eval_loss": 6.388275623321533, |
| "eval_runtime": 63.7053, |
| "eval_samples_per_second": 156.973, |
| "eval_steps_per_second": 19.622, |
| "step": 20800 |
| }, |
| { |
| "epoch": 46.400887163848076, |
| "grad_norm": 0.2678993344306946, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 6.3865, |
| "step": 20900 |
| }, |
| { |
| "epoch": 46.400887163848076, |
| "eval_loss": 6.38779354095459, |
| "eval_runtime": 63.6908, |
| "eval_samples_per_second": 157.009, |
| "eval_steps_per_second": 19.626, |
| "step": 20900 |
| }, |
| { |
| "epoch": 46.62267812586637, |
| "grad_norm": 0.35121896862983704, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 6.3842, |
| "step": 21000 |
| }, |
| { |
| "epoch": 46.62267812586637, |
| "eval_loss": 6.385668754577637, |
| "eval_runtime": 66.0547, |
| "eval_samples_per_second": 151.39, |
| "eval_steps_per_second": 18.924, |
| "step": 21000 |
| }, |
| { |
| "epoch": 46.84446908788467, |
| "grad_norm": 0.6166325807571411, |
| "learning_rate": 3.903010033444816e-06, |
| "loss": 6.3848, |
| "step": 21100 |
| }, |
| { |
| "epoch": 46.84446908788467, |
| "eval_loss": 6.385282516479492, |
| "eval_runtime": 63.6134, |
| "eval_samples_per_second": 157.2, |
| "eval_steps_per_second": 19.65, |
| "step": 21100 |
| }, |
| { |
| "epoch": 47.066260049902965, |
| "grad_norm": 0.5324620008468628, |
| "learning_rate": 3.892976588628763e-06, |
| "loss": 6.3847, |
| "step": 21200 |
| }, |
| { |
| "epoch": 47.066260049902965, |
| "eval_loss": 6.386166572570801, |
| "eval_runtime": 63.7747, |
| "eval_samples_per_second": 156.802, |
| "eval_steps_per_second": 19.6, |
| "step": 21200 |
| }, |
| { |
| "epoch": 47.28805101192126, |
| "grad_norm": 0.37806278467178345, |
| "learning_rate": 3.8829431438127095e-06, |
| "loss": 6.3847, |
| "step": 21300 |
| }, |
| { |
| "epoch": 47.28805101192126, |
| "eval_loss": 6.387280464172363, |
| "eval_runtime": 66.0795, |
| "eval_samples_per_second": 151.333, |
| "eval_steps_per_second": 18.917, |
| "step": 21300 |
| }, |
| { |
| "epoch": 47.509841973939565, |
| "grad_norm": 0.2344857156276703, |
| "learning_rate": 3.8729096989966554e-06, |
| "loss": 6.3851, |
| "step": 21400 |
| }, |
| { |
| "epoch": 47.509841973939565, |
| "eval_loss": 6.38550329208374, |
| "eval_runtime": 63.7048, |
| "eval_samples_per_second": 156.974, |
| "eval_steps_per_second": 19.622, |
| "step": 21400 |
| }, |
| { |
| "epoch": 47.73163293595786, |
| "grad_norm": 0.47279292345046997, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 6.3843, |
| "step": 21500 |
| }, |
| { |
| "epoch": 47.73163293595786, |
| "eval_loss": 6.390079021453857, |
| "eval_runtime": 63.7137, |
| "eval_samples_per_second": 156.952, |
| "eval_steps_per_second": 19.619, |
| "step": 21500 |
| }, |
| { |
| "epoch": 47.95342389797616, |
| "grad_norm": 0.5413157343864441, |
| "learning_rate": 3.852842809364549e-06, |
| "loss": 6.3844, |
| "step": 21600 |
| }, |
| { |
| "epoch": 47.95342389797616, |
| "eval_loss": 6.385741233825684, |
| "eval_runtime": 66.101, |
| "eval_samples_per_second": 151.284, |
| "eval_steps_per_second": 18.91, |
| "step": 21600 |
| }, |
| { |
| "epoch": 48.175214859994455, |
| "grad_norm": 0.48085787892341614, |
| "learning_rate": 3.842809364548495e-06, |
| "loss": 6.3851, |
| "step": 21700 |
| }, |
| { |
| "epoch": 48.175214859994455, |
| "eval_loss": 6.385941505432129, |
| "eval_runtime": 63.6718, |
| "eval_samples_per_second": 157.055, |
| "eval_steps_per_second": 19.632, |
| "step": 21700 |
| }, |
| { |
| "epoch": 48.39700582201275, |
| "grad_norm": 0.6270382404327393, |
| "learning_rate": 3.832775919732442e-06, |
| "loss": 6.3845, |
| "step": 21800 |
| }, |
| { |
| "epoch": 48.39700582201275, |
| "eval_loss": 6.387849807739258, |
| "eval_runtime": 66.1314, |
| "eval_samples_per_second": 151.214, |
| "eval_steps_per_second": 18.902, |
| "step": 21800 |
| }, |
| { |
| "epoch": 48.61879678403105, |
| "grad_norm": 0.36722734570503235, |
| "learning_rate": 3.822742474916388e-06, |
| "loss": 6.3848, |
| "step": 21900 |
| }, |
| { |
| "epoch": 48.61879678403105, |
| "eval_loss": 6.387927532196045, |
| "eval_runtime": 63.6715, |
| "eval_samples_per_second": 157.056, |
| "eval_steps_per_second": 19.632, |
| "step": 21900 |
| }, |
| { |
| "epoch": 48.84058774604935, |
| "grad_norm": 0.4715673327445984, |
| "learning_rate": 3.8127090301003347e-06, |
| "loss": 6.3828, |
| "step": 22000 |
| }, |
| { |
| "epoch": 48.84058774604935, |
| "eval_loss": 6.388005256652832, |
| "eval_runtime": 63.7564, |
| "eval_samples_per_second": 156.847, |
| "eval_steps_per_second": 19.606, |
| "step": 22000 |
| }, |
| { |
| "epoch": 49.06237870806765, |
| "grad_norm": 0.46226397156715393, |
| "learning_rate": 3.802675585284281e-06, |
| "loss": 6.3839, |
| "step": 22100 |
| }, |
| { |
| "epoch": 49.06237870806765, |
| "eval_loss": 6.386138439178467, |
| "eval_runtime": 65.9562, |
| "eval_samples_per_second": 151.616, |
| "eval_steps_per_second": 18.952, |
| "step": 22100 |
| }, |
| { |
| "epoch": 49.284169670085944, |
| "grad_norm": 0.48933687806129456, |
| "learning_rate": 3.792642140468228e-06, |
| "loss": 6.3835, |
| "step": 22200 |
| }, |
| { |
| "epoch": 49.284169670085944, |
| "eval_loss": 6.386913776397705, |
| "eval_runtime": 63.5702, |
| "eval_samples_per_second": 157.306, |
| "eval_steps_per_second": 19.663, |
| "step": 22200 |
| }, |
| { |
| "epoch": 49.50596063210424, |
| "grad_norm": 0.4057106375694275, |
| "learning_rate": 3.782608695652174e-06, |
| "loss": 6.3831, |
| "step": 22300 |
| }, |
| { |
| "epoch": 49.50596063210424, |
| "eval_loss": 6.3875555992126465, |
| "eval_runtime": 63.6283, |
| "eval_samples_per_second": 157.163, |
| "eval_steps_per_second": 19.645, |
| "step": 22300 |
| }, |
| { |
| "epoch": 49.72775159412254, |
| "grad_norm": 0.4397966265678406, |
| "learning_rate": 3.7725752508361205e-06, |
| "loss": 6.3847, |
| "step": 22400 |
| }, |
| { |
| "epoch": 49.72775159412254, |
| "eval_loss": 6.386244297027588, |
| "eval_runtime": 63.6792, |
| "eval_samples_per_second": 157.037, |
| "eval_steps_per_second": 19.63, |
| "step": 22400 |
| }, |
| { |
| "epoch": 49.949542556140834, |
| "grad_norm": 0.4629203677177429, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 6.384, |
| "step": 22500 |
| }, |
| { |
| "epoch": 49.949542556140834, |
| "eval_loss": 6.386322498321533, |
| "eval_runtime": 66.1359, |
| "eval_samples_per_second": 151.204, |
| "eval_steps_per_second": 18.9, |
| "step": 22500 |
| }, |
| { |
| "epoch": 50.17133351815914, |
| "grad_norm": 0.43559348583221436, |
| "learning_rate": 3.7525083612040136e-06, |
| "loss": 6.3831, |
| "step": 22600 |
| }, |
| { |
| "epoch": 50.17133351815914, |
| "eval_loss": 6.386173248291016, |
| "eval_runtime": 63.6043, |
| "eval_samples_per_second": 157.222, |
| "eval_steps_per_second": 19.653, |
| "step": 22600 |
| }, |
| { |
| "epoch": 50.393124480177434, |
| "grad_norm": 0.3772810399532318, |
| "learning_rate": 3.74247491638796e-06, |
| "loss": 6.3836, |
| "step": 22700 |
| }, |
| { |
| "epoch": 50.393124480177434, |
| "eval_loss": 6.38073205947876, |
| "eval_runtime": 63.7199, |
| "eval_samples_per_second": 156.937, |
| "eval_steps_per_second": 19.617, |
| "step": 22700 |
| }, |
| { |
| "epoch": 50.61491544219573, |
| "grad_norm": 0.36232537031173706, |
| "learning_rate": 3.7324414715719067e-06, |
| "loss": 6.3837, |
| "step": 22800 |
| }, |
| { |
| "epoch": 50.61491544219573, |
| "eval_loss": 6.385157108306885, |
| "eval_runtime": 66.1214, |
| "eval_samples_per_second": 151.237, |
| "eval_steps_per_second": 18.905, |
| "step": 22800 |
| }, |
| { |
| "epoch": 50.83670640421403, |
| "grad_norm": 0.3568231165409088, |
| "learning_rate": 3.722408026755853e-06, |
| "loss": 6.3837, |
| "step": 22900 |
| }, |
| { |
| "epoch": 50.83670640421403, |
| "eval_loss": 6.388894081115723, |
| "eval_runtime": 63.6202, |
| "eval_samples_per_second": 157.183, |
| "eval_steps_per_second": 19.648, |
| "step": 22900 |
| }, |
| { |
| "epoch": 51.058497366232324, |
| "grad_norm": 0.5292544960975647, |
| "learning_rate": 3.7123745819398e-06, |
| "loss": 6.3824, |
| "step": 23000 |
| }, |
| { |
| "epoch": 51.058497366232324, |
| "eval_loss": 6.382253170013428, |
| "eval_runtime": 63.6223, |
| "eval_samples_per_second": 157.178, |
| "eval_steps_per_second": 19.647, |
| "step": 23000 |
| }, |
| { |
| "epoch": 51.28028832825063, |
| "grad_norm": 0.47718894481658936, |
| "learning_rate": 3.702341137123746e-06, |
| "loss": 6.3833, |
| "step": 23100 |
| }, |
| { |
| "epoch": 51.28028832825063, |
| "eval_loss": 6.389714241027832, |
| "eval_runtime": 66.0943, |
| "eval_samples_per_second": 151.299, |
| "eval_steps_per_second": 18.912, |
| "step": 23100 |
| }, |
| { |
| "epoch": 51.502079290268924, |
| "grad_norm": 0.2303953319787979, |
| "learning_rate": 3.6923076923076925e-06, |
| "loss": 6.3822, |
| "step": 23200 |
| }, |
| { |
| "epoch": 51.502079290268924, |
| "eval_loss": 6.384761810302734, |
| "eval_runtime": 63.6768, |
| "eval_samples_per_second": 157.043, |
| "eval_steps_per_second": 19.63, |
| "step": 23200 |
| }, |
| { |
| "epoch": 51.72387025228722, |
| "grad_norm": 0.4536280035972595, |
| "learning_rate": 3.6822742474916393e-06, |
| "loss": 6.3829, |
| "step": 23300 |
| }, |
| { |
| "epoch": 51.72387025228722, |
| "eval_loss": 6.38330078125, |
| "eval_runtime": 63.6407, |
| "eval_samples_per_second": 157.132, |
| "eval_steps_per_second": 19.642, |
| "step": 23300 |
| }, |
| { |
| "epoch": 51.94566121430552, |
| "grad_norm": 0.36595970392227173, |
| "learning_rate": 3.6722408026755856e-06, |
| "loss": 6.3839, |
| "step": 23400 |
| }, |
| { |
| "epoch": 51.94566121430552, |
| "eval_loss": 6.384377956390381, |
| "eval_runtime": 63.6117, |
| "eval_samples_per_second": 157.204, |
| "eval_steps_per_second": 19.65, |
| "step": 23400 |
| }, |
| { |
| "epoch": 52.16745217632381, |
| "grad_norm": 0.4151841104030609, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 6.3838, |
| "step": 23500 |
| }, |
| { |
| "epoch": 52.16745217632381, |
| "eval_loss": 6.385963439941406, |
| "eval_runtime": 66.0487, |
| "eval_samples_per_second": 151.403, |
| "eval_steps_per_second": 18.925, |
| "step": 23500 |
| }, |
| { |
| "epoch": 52.38924313834211, |
| "grad_norm": 0.3460543155670166, |
| "learning_rate": 3.6521739130434787e-06, |
| "loss": 6.3828, |
| "step": 23600 |
| }, |
| { |
| "epoch": 52.38924313834211, |
| "eval_loss": 6.384364128112793, |
| "eval_runtime": 63.6451, |
| "eval_samples_per_second": 157.121, |
| "eval_steps_per_second": 19.64, |
| "step": 23600 |
| }, |
| { |
| "epoch": 52.61103410036041, |
| "grad_norm": 0.35991814732551575, |
| "learning_rate": 3.642140468227425e-06, |
| "loss": 6.3828, |
| "step": 23700 |
| }, |
| { |
| "epoch": 52.61103410036041, |
| "eval_loss": 6.382322311401367, |
| "eval_runtime": 63.5885, |
| "eval_samples_per_second": 157.261, |
| "eval_steps_per_second": 19.658, |
| "step": 23700 |
| }, |
| { |
| "epoch": 52.83282506237871, |
| "grad_norm": 0.556122899055481, |
| "learning_rate": 3.6321070234113714e-06, |
| "loss": 6.383, |
| "step": 23800 |
| }, |
| { |
| "epoch": 52.83282506237871, |
| "eval_loss": 6.387279987335205, |
| "eval_runtime": 63.668, |
| "eval_samples_per_second": 157.065, |
| "eval_steps_per_second": 19.633, |
| "step": 23800 |
| }, |
| { |
| "epoch": 53.054616024397006, |
| "grad_norm": 0.4246836304664612, |
| "learning_rate": 3.622073578595318e-06, |
| "loss": 6.3842, |
| "step": 23900 |
| }, |
| { |
| "epoch": 53.054616024397006, |
| "eval_loss": 6.382977485656738, |
| "eval_runtime": 65.9495, |
| "eval_samples_per_second": 151.631, |
| "eval_steps_per_second": 18.954, |
| "step": 23900 |
| }, |
| { |
| "epoch": 53.2764069864153, |
| "grad_norm": 0.4062933027744293, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 6.3829, |
| "step": 24000 |
| }, |
| { |
| "epoch": 53.2764069864153, |
| "eval_loss": 6.386227130889893, |
| "eval_runtime": 63.6044, |
| "eval_samples_per_second": 157.222, |
| "eval_steps_per_second": 19.653, |
| "step": 24000 |
| }, |
| { |
| "epoch": 53.4981979484336, |
| "grad_norm": 0.36249685287475586, |
| "learning_rate": 3.6020066889632112e-06, |
| "loss": 6.3841, |
| "step": 24100 |
| }, |
| { |
| "epoch": 53.4981979484336, |
| "eval_loss": 6.388720989227295, |
| "eval_runtime": 63.7502, |
| "eval_samples_per_second": 156.862, |
| "eval_steps_per_second": 19.608, |
| "step": 24100 |
| }, |
| { |
| "epoch": 53.719988910451896, |
| "grad_norm": 0.464330792427063, |
| "learning_rate": 3.5919732441471576e-06, |
| "loss": 6.3821, |
| "step": 24200 |
| }, |
| { |
| "epoch": 53.719988910451896, |
| "eval_loss": 6.385589122772217, |
| "eval_runtime": 66.108, |
| "eval_samples_per_second": 151.268, |
| "eval_steps_per_second": 18.908, |
| "step": 24200 |
| }, |
| { |
| "epoch": 53.9417798724702, |
| "grad_norm": 0.36706265807151794, |
| "learning_rate": 3.581939799331104e-06, |
| "loss": 6.3834, |
| "step": 24300 |
| }, |
| { |
| "epoch": 53.9417798724702, |
| "eval_loss": 6.385077476501465, |
| "eval_runtime": 63.7574, |
| "eval_samples_per_second": 156.844, |
| "eval_steps_per_second": 19.606, |
| "step": 24300 |
| }, |
| { |
| "epoch": 54.2217909620183, |
| "grad_norm": 0.5084080100059509, |
| "learning_rate": 3.5719063545150507e-06, |
| "loss": 6.3829, |
| "step": 24400 |
| }, |
| { |
| "epoch": 54.2217909620183, |
| "eval_loss": 6.384501934051514, |
| "eval_runtime": 66.1045, |
| "eval_samples_per_second": 151.276, |
| "eval_steps_per_second": 18.909, |
| "step": 24400 |
| }, |
| { |
| "epoch": 54.44358192403659, |
| "grad_norm": 0.2843925952911377, |
| "learning_rate": 3.561872909698997e-06, |
| "loss": 6.3828, |
| "step": 24500 |
| }, |
| { |
| "epoch": 54.44358192403659, |
| "eval_loss": 6.386019706726074, |
| "eval_runtime": 63.7676, |
| "eval_samples_per_second": 156.819, |
| "eval_steps_per_second": 19.602, |
| "step": 24500 |
| }, |
| { |
| "epoch": 54.6653728860549, |
| "grad_norm": 0.3394639492034912, |
| "learning_rate": 3.5518394648829434e-06, |
| "loss": 6.3839, |
| "step": 24600 |
| }, |
| { |
| "epoch": 54.6653728860549, |
| "eval_loss": 6.385280132293701, |
| "eval_runtime": 64.0386, |
| "eval_samples_per_second": 156.156, |
| "eval_steps_per_second": 19.519, |
| "step": 24600 |
| }, |
| { |
| "epoch": 54.88716384807319, |
| "grad_norm": 0.5277294516563416, |
| "learning_rate": 3.54180602006689e-06, |
| "loss": 6.3827, |
| "step": 24700 |
| }, |
| { |
| "epoch": 54.88716384807319, |
| "eval_loss": 6.382243633270264, |
| "eval_runtime": 66.1687, |
| "eval_samples_per_second": 151.129, |
| "eval_steps_per_second": 18.891, |
| "step": 24700 |
| }, |
| { |
| "epoch": 55.10895481009149, |
| "grad_norm": 0.4542704224586487, |
| "learning_rate": 3.5317725752508365e-06, |
| "loss": 6.3835, |
| "step": 24800 |
| }, |
| { |
| "epoch": 55.10895481009149, |
| "eval_loss": 6.384250640869141, |
| "eval_runtime": 63.6729, |
| "eval_samples_per_second": 157.053, |
| "eval_steps_per_second": 19.632, |
| "step": 24800 |
| }, |
| { |
| "epoch": 55.330745772109786, |
| "grad_norm": 0.4311918318271637, |
| "learning_rate": 3.521739130434783e-06, |
| "loss": 6.3821, |
| "step": 24900 |
| }, |
| { |
| "epoch": 55.330745772109786, |
| "eval_loss": 6.382208824157715, |
| "eval_runtime": 63.7247, |
| "eval_samples_per_second": 156.925, |
| "eval_steps_per_second": 19.616, |
| "step": 24900 |
| }, |
| { |
| "epoch": 55.55253673412808, |
| "grad_norm": 0.5033969283103943, |
| "learning_rate": 3.5117056856187296e-06, |
| "loss": 6.3828, |
| "step": 25000 |
| }, |
| { |
| "epoch": 55.55253673412808, |
| "eval_loss": 6.384891510009766, |
| "eval_runtime": 66.1992, |
| "eval_samples_per_second": 151.059, |
| "eval_steps_per_second": 18.882, |
| "step": 25000 |
| }, |
| { |
| "epoch": 55.77432769614638, |
| "grad_norm": 0.389417827129364, |
| "learning_rate": 3.501672240802676e-06, |
| "loss": 6.3821, |
| "step": 25100 |
| }, |
| { |
| "epoch": 55.77432769614638, |
| "eval_loss": 6.3841633796691895, |
| "eval_runtime": 63.7582, |
| "eval_samples_per_second": 156.843, |
| "eval_steps_per_second": 19.605, |
| "step": 25100 |
| }, |
| { |
| "epoch": 55.99611865816468, |
| "grad_norm": 0.35223087668418884, |
| "learning_rate": 3.491638795986622e-06, |
| "loss": 6.382, |
| "step": 25200 |
| }, |
| { |
| "epoch": 55.99611865816468, |
| "eval_loss": 6.3838019371032715, |
| "eval_runtime": 63.6971, |
| "eval_samples_per_second": 156.993, |
| "eval_steps_per_second": 19.624, |
| "step": 25200 |
| }, |
| { |
| "epoch": 56.21790962018298, |
| "grad_norm": 0.3913029134273529, |
| "learning_rate": 3.481605351170568e-06, |
| "loss": 6.3815, |
| "step": 25300 |
| }, |
| { |
| "epoch": 56.21790962018298, |
| "eval_loss": 6.3869524002075195, |
| "eval_runtime": 66.1208, |
| "eval_samples_per_second": 151.238, |
| "eval_steps_per_second": 18.905, |
| "step": 25300 |
| }, |
| { |
| "epoch": 56.439700582201276, |
| "grad_norm": 0.4827691614627838, |
| "learning_rate": 3.471571906354515e-06, |
| "loss": 6.3827, |
| "step": 25400 |
| }, |
| { |
| "epoch": 56.439700582201276, |
| "eval_loss": 6.384666442871094, |
| "eval_runtime": 63.6765, |
| "eval_samples_per_second": 157.044, |
| "eval_steps_per_second": 19.63, |
| "step": 25400 |
| }, |
| { |
| "epoch": 56.66149154421957, |
| "grad_norm": 0.3427080512046814, |
| "learning_rate": 3.4615384615384613e-06, |
| "loss": 6.3827, |
| "step": 25500 |
| }, |
| { |
| "epoch": 56.66149154421957, |
| "eval_loss": 6.384727478027344, |
| "eval_runtime": 66.2151, |
| "eval_samples_per_second": 151.023, |
| "eval_steps_per_second": 18.878, |
| "step": 25500 |
| }, |
| { |
| "epoch": 56.88328250623787, |
| "grad_norm": 0.43282854557037354, |
| "learning_rate": 3.4515050167224076e-06, |
| "loss": 6.3822, |
| "step": 25600 |
| }, |
| { |
| "epoch": 56.88328250623787, |
| "eval_loss": 6.384084224700928, |
| "eval_runtime": 63.8392, |
| "eval_samples_per_second": 156.643, |
| "eval_steps_per_second": 19.58, |
| "step": 25600 |
| }, |
| { |
| "epoch": 57.105073468256165, |
| "grad_norm": 0.42564040422439575, |
| "learning_rate": 3.4414715719063544e-06, |
| "loss": 6.3814, |
| "step": 25700 |
| }, |
| { |
| "epoch": 57.105073468256165, |
| "eval_loss": 6.383011817932129, |
| "eval_runtime": 63.6955, |
| "eval_samples_per_second": 156.997, |
| "eval_steps_per_second": 19.625, |
| "step": 25700 |
| }, |
| { |
| "epoch": 57.32686443027447, |
| "grad_norm": 0.3655114471912384, |
| "learning_rate": 3.4314381270903007e-06, |
| "loss": 6.3813, |
| "step": 25800 |
| }, |
| { |
| "epoch": 57.32686443027447, |
| "eval_loss": 6.384052753448486, |
| "eval_runtime": 66.0629, |
| "eval_samples_per_second": 151.371, |
| "eval_steps_per_second": 18.921, |
| "step": 25800 |
| }, |
| { |
| "epoch": 57.548655392292765, |
| "grad_norm": 0.4009644389152527, |
| "learning_rate": 3.4214046822742475e-06, |
| "loss": 6.3819, |
| "step": 25900 |
| }, |
| { |
| "epoch": 57.548655392292765, |
| "eval_loss": 6.384483814239502, |
| "eval_runtime": 63.6201, |
| "eval_samples_per_second": 157.183, |
| "eval_steps_per_second": 19.648, |
| "step": 25900 |
| }, |
| { |
| "epoch": 57.77044635431106, |
| "grad_norm": 0.45892468094825745, |
| "learning_rate": 3.411371237458194e-06, |
| "loss": 6.3823, |
| "step": 26000 |
| }, |
| { |
| "epoch": 57.77044635431106, |
| "eval_loss": 6.382046222686768, |
| "eval_runtime": 63.6871, |
| "eval_samples_per_second": 157.018, |
| "eval_steps_per_second": 19.627, |
| "step": 26000 |
| }, |
| { |
| "epoch": 57.99223731632936, |
| "grad_norm": 0.6261206865310669, |
| "learning_rate": 3.40133779264214e-06, |
| "loss": 6.3822, |
| "step": 26100 |
| }, |
| { |
| "epoch": 57.99223731632936, |
| "eval_loss": 6.385235786437988, |
| "eval_runtime": 66.2139, |
| "eval_samples_per_second": 151.026, |
| "eval_steps_per_second": 18.878, |
| "step": 26100 |
| }, |
| { |
| "epoch": 58.214028278347655, |
| "grad_norm": 0.38988542556762695, |
| "learning_rate": 3.391304347826087e-06, |
| "loss": 6.3817, |
| "step": 26200 |
| }, |
| { |
| "epoch": 58.214028278347655, |
| "eval_loss": 6.385043144226074, |
| "eval_runtime": 63.6337, |
| "eval_samples_per_second": 157.149, |
| "eval_steps_per_second": 19.644, |
| "step": 26200 |
| }, |
| { |
| "epoch": 58.43581924036596, |
| "grad_norm": 0.3526028096675873, |
| "learning_rate": 3.3812709030100333e-06, |
| "loss": 6.3819, |
| "step": 26300 |
| }, |
| { |
| "epoch": 58.43581924036596, |
| "eval_loss": 6.385810375213623, |
| "eval_runtime": 63.6001, |
| "eval_samples_per_second": 157.233, |
| "eval_steps_per_second": 19.654, |
| "step": 26300 |
| }, |
| { |
| "epoch": 58.657610202384255, |
| "grad_norm": 0.38116052746772766, |
| "learning_rate": 3.3712374581939796e-06, |
| "loss": 6.3835, |
| "step": 26400 |
| }, |
| { |
| "epoch": 58.657610202384255, |
| "eval_loss": 6.383828639984131, |
| "eval_runtime": 66.0823, |
| "eval_samples_per_second": 151.327, |
| "eval_steps_per_second": 18.916, |
| "step": 26400 |
| }, |
| { |
| "epoch": 58.87940116440255, |
| "grad_norm": 0.5195460319519043, |
| "learning_rate": 3.3612040133779264e-06, |
| "loss": 6.3824, |
| "step": 26500 |
| }, |
| { |
| "epoch": 58.87940116440255, |
| "eval_loss": 6.3872599601745605, |
| "eval_runtime": 63.6944, |
| "eval_samples_per_second": 157.0, |
| "eval_steps_per_second": 19.625, |
| "step": 26500 |
| }, |
| { |
| "epoch": 59.10119212642085, |
| "grad_norm": 0.3986002206802368, |
| "learning_rate": 3.3511705685618727e-06, |
| "loss": 6.3813, |
| "step": 26600 |
| }, |
| { |
| "epoch": 59.10119212642085, |
| "eval_loss": 6.384389877319336, |
| "eval_runtime": 63.651, |
| "eval_samples_per_second": 157.107, |
| "eval_steps_per_second": 19.638, |
| "step": 26600 |
| }, |
| { |
| "epoch": 59.322983088439145, |
| "grad_norm": 0.3788560628890991, |
| "learning_rate": 3.3411371237458195e-06, |
| "loss": 6.3834, |
| "step": 26700 |
| }, |
| { |
| "epoch": 59.322983088439145, |
| "eval_loss": 6.383492946624756, |
| "eval_runtime": 66.1062, |
| "eval_samples_per_second": 151.272, |
| "eval_steps_per_second": 18.909, |
| "step": 26700 |
| }, |
| { |
| "epoch": 59.54477405045744, |
| "grad_norm": 0.3633769750595093, |
| "learning_rate": 3.331103678929766e-06, |
| "loss": 6.3806, |
| "step": 26800 |
| }, |
| { |
| "epoch": 59.54477405045744, |
| "eval_loss": 6.383812427520752, |
| "eval_runtime": 63.6852, |
| "eval_samples_per_second": 157.022, |
| "eval_steps_per_second": 19.628, |
| "step": 26800 |
| }, |
| { |
| "epoch": 59.766565012475745, |
| "grad_norm": 0.5389061570167542, |
| "learning_rate": 3.321070234113712e-06, |
| "loss": 6.3818, |
| "step": 26900 |
| }, |
| { |
| "epoch": 59.766565012475745, |
| "eval_loss": 6.386070251464844, |
| "eval_runtime": 63.6726, |
| "eval_samples_per_second": 157.054, |
| "eval_steps_per_second": 19.632, |
| "step": 26900 |
| }, |
| { |
| "epoch": 59.98835597449404, |
| "grad_norm": 0.5415310263633728, |
| "learning_rate": 3.311036789297659e-06, |
| "loss": 6.3812, |
| "step": 27000 |
| }, |
| { |
| "epoch": 59.98835597449404, |
| "eval_loss": 6.386297702789307, |
| "eval_runtime": 66.0998, |
| "eval_samples_per_second": 151.286, |
| "eval_steps_per_second": 18.911, |
| "step": 27000 |
| }, |
| { |
| "epoch": 60.21014693651234, |
| "grad_norm": 0.25073182582855225, |
| "learning_rate": 3.3010033444816052e-06, |
| "loss": 6.3825, |
| "step": 27100 |
| }, |
| { |
| "epoch": 60.21014693651234, |
| "eval_loss": 6.384896278381348, |
| "eval_runtime": 63.7213, |
| "eval_samples_per_second": 156.933, |
| "eval_steps_per_second": 19.617, |
| "step": 27100 |
| }, |
| { |
| "epoch": 60.431937898530634, |
| "grad_norm": 0.2894960045814514, |
| "learning_rate": 3.2909698996655516e-06, |
| "loss": 6.3806, |
| "step": 27200 |
| }, |
| { |
| "epoch": 60.431937898530634, |
| "eval_loss": 6.383223533630371, |
| "eval_runtime": 66.1431, |
| "eval_samples_per_second": 151.187, |
| "eval_steps_per_second": 18.898, |
| "step": 27200 |
| }, |
| { |
| "epoch": 60.65372886054893, |
| "grad_norm": 0.48593568801879883, |
| "learning_rate": 3.2809364548494983e-06, |
| "loss": 6.3814, |
| "step": 27300 |
| }, |
| { |
| "epoch": 60.65372886054893, |
| "eval_loss": 6.382923603057861, |
| "eval_runtime": 63.8018, |
| "eval_samples_per_second": 156.735, |
| "eval_steps_per_second": 19.592, |
| "step": 27300 |
| }, |
| { |
| "epoch": 60.87551982256723, |
| "grad_norm": 0.3919661343097687, |
| "learning_rate": 3.2709030100334447e-06, |
| "loss": 6.3812, |
| "step": 27400 |
| }, |
| { |
| "epoch": 60.87551982256723, |
| "eval_loss": 6.384592056274414, |
| "eval_runtime": 63.6834, |
| "eval_samples_per_second": 157.027, |
| "eval_steps_per_second": 19.628, |
| "step": 27400 |
| }, |
| { |
| "epoch": 61.09731078458553, |
| "grad_norm": 0.41026151180267334, |
| "learning_rate": 3.260869565217391e-06, |
| "loss": 6.3823, |
| "step": 27500 |
| }, |
| { |
| "epoch": 61.09731078458553, |
| "eval_loss": 6.385217189788818, |
| "eval_runtime": 66.0642, |
| "eval_samples_per_second": 151.368, |
| "eval_steps_per_second": 18.921, |
| "step": 27500 |
| }, |
| { |
| "epoch": 61.31910174660383, |
| "grad_norm": 0.3794995844364166, |
| "learning_rate": 3.2508361204013378e-06, |
| "loss": 6.3811, |
| "step": 27600 |
| }, |
| { |
| "epoch": 61.31910174660383, |
| "eval_loss": 6.383106231689453, |
| "eval_runtime": 63.705, |
| "eval_samples_per_second": 156.973, |
| "eval_steps_per_second": 19.622, |
| "step": 27600 |
| }, |
| { |
| "epoch": 61.540892708622124, |
| "grad_norm": 0.4461415410041809, |
| "learning_rate": 3.240802675585284e-06, |
| "loss": 6.3828, |
| "step": 27700 |
| }, |
| { |
| "epoch": 61.540892708622124, |
| "eval_loss": 6.384341239929199, |
| "eval_runtime": 63.6974, |
| "eval_samples_per_second": 156.992, |
| "eval_steps_per_second": 19.624, |
| "step": 27700 |
| }, |
| { |
| "epoch": 61.76268367064042, |
| "grad_norm": 0.24599848687648773, |
| "learning_rate": 3.230769230769231e-06, |
| "loss": 6.3807, |
| "step": 27800 |
| }, |
| { |
| "epoch": 61.76268367064042, |
| "eval_loss": 6.384603023529053, |
| "eval_runtime": 66.1353, |
| "eval_samples_per_second": 151.205, |
| "eval_steps_per_second": 18.901, |
| "step": 27800 |
| }, |
| { |
| "epoch": 61.98447463265872, |
| "grad_norm": 0.2466522455215454, |
| "learning_rate": 3.2207357859531772e-06, |
| "loss": 6.3823, |
| "step": 27900 |
| }, |
| { |
| "epoch": 61.98447463265872, |
| "eval_loss": 6.383478164672852, |
| "eval_runtime": 63.6948, |
| "eval_samples_per_second": 156.999, |
| "eval_steps_per_second": 19.625, |
| "step": 27900 |
| }, |
| { |
| "epoch": 62.206265594677014, |
| "grad_norm": 0.3806278705596924, |
| "learning_rate": 3.2107023411371236e-06, |
| "loss": 6.3806, |
| "step": 28000 |
| }, |
| { |
| "epoch": 62.206265594677014, |
| "eval_loss": 6.382126331329346, |
| "eval_runtime": 63.5806, |
| "eval_samples_per_second": 157.281, |
| "eval_steps_per_second": 19.66, |
| "step": 28000 |
| }, |
| { |
| "epoch": 62.42805655669532, |
| "grad_norm": 0.5161334872245789, |
| "learning_rate": 3.2006688963210703e-06, |
| "loss": 6.3816, |
| "step": 28100 |
| }, |
| { |
| "epoch": 62.42805655669532, |
| "eval_loss": 6.384099960327148, |
| "eval_runtime": 66.2035, |
| "eval_samples_per_second": 151.049, |
| "eval_steps_per_second": 18.881, |
| "step": 28100 |
| }, |
| { |
| "epoch": 62.649847518713614, |
| "grad_norm": 0.44599130749702454, |
| "learning_rate": 3.1906354515050167e-06, |
| "loss": 6.3799, |
| "step": 28200 |
| }, |
| { |
| "epoch": 62.649847518713614, |
| "eval_loss": 6.385605335235596, |
| "eval_runtime": 63.6738, |
| "eval_samples_per_second": 157.051, |
| "eval_steps_per_second": 19.631, |
| "step": 28200 |
| }, |
| { |
| "epoch": 62.87163848073191, |
| "grad_norm": 0.49202173948287964, |
| "learning_rate": 3.180602006688963e-06, |
| "loss": 6.3817, |
| "step": 28300 |
| }, |
| { |
| "epoch": 62.87163848073191, |
| "eval_loss": 6.3858418464660645, |
| "eval_runtime": 63.6792, |
| "eval_samples_per_second": 157.037, |
| "eval_steps_per_second": 19.63, |
| "step": 28300 |
| }, |
| { |
| "epoch": 63.09342944275021, |
| "grad_norm": 0.4090692400932312, |
| "learning_rate": 3.1705685618729098e-06, |
| "loss": 6.3797, |
| "step": 28400 |
| }, |
| { |
| "epoch": 63.09342944275021, |
| "eval_loss": 6.381466388702393, |
| "eval_runtime": 66.0632, |
| "eval_samples_per_second": 151.37, |
| "eval_steps_per_second": 18.921, |
| "step": 28400 |
| }, |
| { |
| "epoch": 63.3152204047685, |
| "grad_norm": 0.4286213517189026, |
| "learning_rate": 3.160535117056856e-06, |
| "loss": 6.3816, |
| "step": 28500 |
| }, |
| { |
| "epoch": 63.3152204047685, |
| "eval_loss": 6.383074760437012, |
| "eval_runtime": 63.6206, |
| "eval_samples_per_second": 157.182, |
| "eval_steps_per_second": 19.648, |
| "step": 28500 |
| }, |
| { |
| "epoch": 63.53701136678681, |
| "grad_norm": 0.36026620864868164, |
| "learning_rate": 3.1505016722408024e-06, |
| "loss": 6.3811, |
| "step": 28600 |
| }, |
| { |
| "epoch": 63.53701136678681, |
| "eval_loss": 6.383544445037842, |
| "eval_runtime": 63.7194, |
| "eval_samples_per_second": 156.938, |
| "eval_steps_per_second": 19.617, |
| "step": 28600 |
| }, |
| { |
| "epoch": 63.7588023288051, |
| "grad_norm": 0.5875244140625, |
| "learning_rate": 3.140468227424749e-06, |
| "loss": 6.3822, |
| "step": 28700 |
| }, |
| { |
| "epoch": 63.7588023288051, |
| "eval_loss": 6.384294033050537, |
| "eval_runtime": 66.1191, |
| "eval_samples_per_second": 151.242, |
| "eval_steps_per_second": 18.905, |
| "step": 28700 |
| }, |
| { |
| "epoch": 63.9805932908234, |
| "grad_norm": 0.39102068543434143, |
| "learning_rate": 3.1304347826086955e-06, |
| "loss": 6.3823, |
| "step": 28800 |
| }, |
| { |
| "epoch": 63.9805932908234, |
| "eval_loss": 6.381502628326416, |
| "eval_runtime": 63.7504, |
| "eval_samples_per_second": 156.862, |
| "eval_steps_per_second": 19.608, |
| "step": 28800 |
| }, |
| { |
| "epoch": 64.2023842528417, |
| "grad_norm": 0.4450345039367676, |
| "learning_rate": 3.1204013377926423e-06, |
| "loss": 6.3813, |
| "step": 28900 |
| }, |
| { |
| "epoch": 64.2023842528417, |
| "eval_loss": 6.384424209594727, |
| "eval_runtime": 66.2286, |
| "eval_samples_per_second": 150.992, |
| "eval_steps_per_second": 18.874, |
| "step": 28900 |
| }, |
| { |
| "epoch": 64.42417521486, |
| "grad_norm": 0.3168383240699768, |
| "learning_rate": 3.1103678929765886e-06, |
| "loss": 6.383, |
| "step": 29000 |
| }, |
| { |
| "epoch": 64.42417521486, |
| "eval_loss": 6.385626316070557, |
| "eval_runtime": 63.7217, |
| "eval_samples_per_second": 156.932, |
| "eval_steps_per_second": 19.617, |
| "step": 29000 |
| }, |
| { |
| "epoch": 64.64596617687829, |
| "grad_norm": 0.3088781535625458, |
| "learning_rate": 3.100334448160535e-06, |
| "loss": 6.3807, |
| "step": 29100 |
| }, |
| { |
| "epoch": 64.64596617687829, |
| "eval_loss": 6.385305881500244, |
| "eval_runtime": 63.6226, |
| "eval_samples_per_second": 157.177, |
| "eval_steps_per_second": 19.647, |
| "step": 29100 |
| }, |
| { |
| "epoch": 64.8677571388966, |
| "grad_norm": 0.4493953287601471, |
| "learning_rate": 3.0903010033444818e-06, |
| "loss": 6.381, |
| "step": 29200 |
| }, |
| { |
| "epoch": 64.8677571388966, |
| "eval_loss": 6.383870601654053, |
| "eval_runtime": 66.0987, |
| "eval_samples_per_second": 151.289, |
| "eval_steps_per_second": 18.911, |
| "step": 29200 |
| }, |
| { |
| "epoch": 65.08954810091488, |
| "grad_norm": 0.3246123194694519, |
| "learning_rate": 3.080267558528428e-06, |
| "loss": 6.3811, |
| "step": 29300 |
| }, |
| { |
| "epoch": 65.08954810091488, |
| "eval_loss": 6.383446216583252, |
| "eval_runtime": 63.63, |
| "eval_samples_per_second": 157.159, |
| "eval_steps_per_second": 19.645, |
| "step": 29300 |
| }, |
| { |
| "epoch": 65.31133906293319, |
| "grad_norm": 0.2923065721988678, |
| "learning_rate": 3.0702341137123744e-06, |
| "loss": 6.3805, |
| "step": 29400 |
| }, |
| { |
| "epoch": 65.31133906293319, |
| "eval_loss": 6.382349014282227, |
| "eval_runtime": 66.1161, |
| "eval_samples_per_second": 151.249, |
| "eval_steps_per_second": 18.906, |
| "step": 29400 |
| }, |
| { |
| "epoch": 65.53313002495149, |
| "grad_norm": 0.48411309719085693, |
| "learning_rate": 3.060200668896321e-06, |
| "loss": 6.3816, |
| "step": 29500 |
| }, |
| { |
| "epoch": 65.53313002495149, |
| "eval_loss": 6.381749153137207, |
| "eval_runtime": 63.6992, |
| "eval_samples_per_second": 156.988, |
| "eval_steps_per_second": 19.623, |
| "step": 29500 |
| }, |
| { |
| "epoch": 65.75492098696978, |
| "grad_norm": 0.3250056803226471, |
| "learning_rate": 3.0501672240802675e-06, |
| "loss": 6.3806, |
| "step": 29600 |
| }, |
| { |
| "epoch": 65.75492098696978, |
| "eval_loss": 6.383174896240234, |
| "eval_runtime": 63.6191, |
| "eval_samples_per_second": 157.185, |
| "eval_steps_per_second": 19.648, |
| "step": 29600 |
| }, |
| { |
| "epoch": 65.97671194898808, |
| "grad_norm": 0.337882399559021, |
| "learning_rate": 3.0401337792642143e-06, |
| "loss": 6.3793, |
| "step": 29700 |
| }, |
| { |
| "epoch": 65.97671194898808, |
| "eval_loss": 6.383576393127441, |
| "eval_runtime": 66.0393, |
| "eval_samples_per_second": 151.425, |
| "eval_steps_per_second": 18.928, |
| "step": 29700 |
| }, |
| { |
| "epoch": 66.19850291100637, |
| "grad_norm": 0.36923250555992126, |
| "learning_rate": 3.0301003344481606e-06, |
| "loss": 6.3805, |
| "step": 29800 |
| }, |
| { |
| "epoch": 66.19850291100637, |
| "eval_loss": 6.383658409118652, |
| "eval_runtime": 63.5576, |
| "eval_samples_per_second": 157.338, |
| "eval_steps_per_second": 19.667, |
| "step": 29800 |
| }, |
| { |
| "epoch": 66.42029387302468, |
| "grad_norm": 0.3375002443790436, |
| "learning_rate": 3.020066889632107e-06, |
| "loss": 6.3805, |
| "step": 29900 |
| }, |
| { |
| "epoch": 66.42029387302468, |
| "eval_loss": 6.382904529571533, |
| "eval_runtime": 66.0839, |
| "eval_samples_per_second": 151.323, |
| "eval_steps_per_second": 18.915, |
| "step": 29900 |
| }, |
| { |
| "epoch": 66.64208483504297, |
| "grad_norm": 0.44055986404418945, |
| "learning_rate": 3.0100334448160537e-06, |
| "loss": 6.3812, |
| "step": 30000 |
| }, |
| { |
| "epoch": 66.64208483504297, |
| "eval_loss": 6.384601593017578, |
| "eval_runtime": 63.7135, |
| "eval_samples_per_second": 156.953, |
| "eval_steps_per_second": 19.619, |
| "step": 30000 |
| }, |
| { |
| "epoch": 66.86387579706127, |
| "grad_norm": 0.5010361075401306, |
| "learning_rate": 3e-06, |
| "loss": 6.3814, |
| "step": 30100 |
| }, |
| { |
| "epoch": 66.86387579706127, |
| "eval_loss": 6.38201904296875, |
| "eval_runtime": 63.631, |
| "eval_samples_per_second": 157.156, |
| "eval_steps_per_second": 19.645, |
| "step": 30100 |
| }, |
| { |
| "epoch": 67.08566675907957, |
| "grad_norm": 0.36018142104148865, |
| "learning_rate": 2.9899665551839464e-06, |
| "loss": 6.3801, |
| "step": 30200 |
| }, |
| { |
| "epoch": 67.08566675907957, |
| "eval_loss": 6.384942054748535, |
| "eval_runtime": 66.0805, |
| "eval_samples_per_second": 151.331, |
| "eval_steps_per_second": 18.916, |
| "step": 30200 |
| }, |
| { |
| "epoch": 67.30745772109786, |
| "grad_norm": 0.34176790714263916, |
| "learning_rate": 2.979933110367893e-06, |
| "loss": 6.3815, |
| "step": 30300 |
| }, |
| { |
| "epoch": 67.30745772109786, |
| "eval_loss": 6.382652282714844, |
| "eval_runtime": 63.6886, |
| "eval_samples_per_second": 157.014, |
| "eval_steps_per_second": 19.627, |
| "step": 30300 |
| }, |
| { |
| "epoch": 67.52924868311617, |
| "grad_norm": 0.43459710478782654, |
| "learning_rate": 2.9698996655518395e-06, |
| "loss": 6.3811, |
| "step": 30400 |
| }, |
| { |
| "epoch": 67.52924868311617, |
| "eval_loss": 6.386653423309326, |
| "eval_runtime": 63.7002, |
| "eval_samples_per_second": 156.985, |
| "eval_steps_per_second": 19.623, |
| "step": 30400 |
| }, |
| { |
| "epoch": 67.75103964513445, |
| "grad_norm": 0.37743738293647766, |
| "learning_rate": 2.959866220735786e-06, |
| "loss": 6.3804, |
| "step": 30500 |
| }, |
| { |
| "epoch": 67.75103964513445, |
| "eval_loss": 6.383271217346191, |
| "eval_runtime": 66.2218, |
| "eval_samples_per_second": 151.008, |
| "eval_steps_per_second": 18.876, |
| "step": 30500 |
| }, |
| { |
| "epoch": 67.97283060715276, |
| "grad_norm": 0.34814783930778503, |
| "learning_rate": 2.9498327759197326e-06, |
| "loss": 6.3803, |
| "step": 30600 |
| }, |
| { |
| "epoch": 67.97283060715276, |
| "eval_loss": 6.38237190246582, |
| "eval_runtime": 63.6997, |
| "eval_samples_per_second": 156.987, |
| "eval_steps_per_second": 19.623, |
| "step": 30600 |
| }, |
| { |
| "epoch": 68.19462156917106, |
| "grad_norm": 0.344685435295105, |
| "learning_rate": 2.939799331103679e-06, |
| "loss": 6.3795, |
| "step": 30700 |
| }, |
| { |
| "epoch": 68.19462156917106, |
| "eval_loss": 6.384911060333252, |
| "eval_runtime": 63.7005, |
| "eval_samples_per_second": 156.985, |
| "eval_steps_per_second": 19.623, |
| "step": 30700 |
| }, |
| { |
| "epoch": 68.41641253118935, |
| "grad_norm": 0.30350542068481445, |
| "learning_rate": 2.9297658862876257e-06, |
| "loss": 6.3799, |
| "step": 30800 |
| }, |
| { |
| "epoch": 68.41641253118935, |
| "eval_loss": 6.383020401000977, |
| "eval_runtime": 66.1786, |
| "eval_samples_per_second": 151.106, |
| "eval_steps_per_second": 18.888, |
| "step": 30800 |
| }, |
| { |
| "epoch": 68.63820349320766, |
| "grad_norm": 0.33513781428337097, |
| "learning_rate": 2.919732441471572e-06, |
| "loss": 6.3808, |
| "step": 30900 |
| }, |
| { |
| "epoch": 68.63820349320766, |
| "eval_loss": 6.38442325592041, |
| "eval_runtime": 63.6954, |
| "eval_samples_per_second": 156.997, |
| "eval_steps_per_second": 19.625, |
| "step": 30900 |
| }, |
| { |
| "epoch": 68.85999445522594, |
| "grad_norm": 0.38895151019096375, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 6.3803, |
| "step": 31000 |
| }, |
| { |
| "epoch": 68.85999445522594, |
| "eval_loss": 6.382268905639648, |
| "eval_runtime": 66.1082, |
| "eval_samples_per_second": 151.267, |
| "eval_steps_per_second": 18.908, |
| "step": 31000 |
| }, |
| { |
| "epoch": 69.08178541724425, |
| "grad_norm": 0.49591463804244995, |
| "learning_rate": 2.899665551839465e-06, |
| "loss": 6.381, |
| "step": 31100 |
| }, |
| { |
| "epoch": 69.08178541724425, |
| "eval_loss": 6.384127140045166, |
| "eval_runtime": 63.6361, |
| "eval_samples_per_second": 157.144, |
| "eval_steps_per_second": 19.643, |
| "step": 31100 |
| }, |
| { |
| "epoch": 69.30357637926255, |
| "grad_norm": 0.47946080565452576, |
| "learning_rate": 2.8896321070234115e-06, |
| "loss": 6.3803, |
| "step": 31200 |
| }, |
| { |
| "epoch": 69.30357637926255, |
| "eval_loss": 6.380748748779297, |
| "eval_runtime": 63.7274, |
| "eval_samples_per_second": 156.918, |
| "eval_steps_per_second": 19.615, |
| "step": 31200 |
| }, |
| { |
| "epoch": 69.52536734128084, |
| "grad_norm": 0.33409592509269714, |
| "learning_rate": 2.879598662207358e-06, |
| "loss": 6.3795, |
| "step": 31300 |
| }, |
| { |
| "epoch": 69.52536734128084, |
| "eval_loss": 6.3822197914123535, |
| "eval_runtime": 66.2573, |
| "eval_samples_per_second": 150.927, |
| "eval_steps_per_second": 18.866, |
| "step": 31300 |
| }, |
| { |
| "epoch": 69.74715830329914, |
| "grad_norm": 0.36530378460884094, |
| "learning_rate": 2.8695652173913046e-06, |
| "loss": 6.3793, |
| "step": 31400 |
| }, |
| { |
| "epoch": 69.74715830329914, |
| "eval_loss": 6.3831787109375, |
| "eval_runtime": 63.6807, |
| "eval_samples_per_second": 157.034, |
| "eval_steps_per_second": 19.629, |
| "step": 31400 |
| }, |
| { |
| "epoch": 69.96894926531743, |
| "grad_norm": 0.4838181436061859, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 6.3802, |
| "step": 31500 |
| }, |
| { |
| "epoch": 69.96894926531743, |
| "eval_loss": 6.383909225463867, |
| "eval_runtime": 66.1242, |
| "eval_samples_per_second": 151.231, |
| "eval_steps_per_second": 18.904, |
| "step": 31500 |
| }, |
| { |
| "epoch": 70.19074022733574, |
| "grad_norm": 0.3999974727630615, |
| "learning_rate": 2.8494983277591977e-06, |
| "loss": 6.3817, |
| "step": 31600 |
| }, |
| { |
| "epoch": 70.19074022733574, |
| "eval_loss": 6.382571220397949, |
| "eval_runtime": 63.5921, |
| "eval_samples_per_second": 157.252, |
| "eval_steps_per_second": 19.657, |
| "step": 31600 |
| }, |
| { |
| "epoch": 70.41253118935403, |
| "grad_norm": 0.37044674158096313, |
| "learning_rate": 2.839464882943144e-06, |
| "loss": 6.3785, |
| "step": 31700 |
| }, |
| { |
| "epoch": 70.41253118935403, |
| "eval_loss": 6.381692886352539, |
| "eval_runtime": 66.1492, |
| "eval_samples_per_second": 151.173, |
| "eval_steps_per_second": 18.897, |
| "step": 31700 |
| }, |
| { |
| "epoch": 70.63432215137233, |
| "grad_norm": 0.43440505862236023, |
| "learning_rate": 2.8294314381270904e-06, |
| "loss": 6.3811, |
| "step": 31800 |
| }, |
| { |
| "epoch": 70.63432215137233, |
| "eval_loss": 6.384181499481201, |
| "eval_runtime": 63.7156, |
| "eval_samples_per_second": 156.947, |
| "eval_steps_per_second": 19.618, |
| "step": 31800 |
| }, |
| { |
| "epoch": 70.85611311339063, |
| "grad_norm": 0.45394232869148254, |
| "learning_rate": 2.819397993311037e-06, |
| "loss": 6.3803, |
| "step": 31900 |
| }, |
| { |
| "epoch": 70.85611311339063, |
| "eval_loss": 6.382298469543457, |
| "eval_runtime": 66.1426, |
| "eval_samples_per_second": 151.189, |
| "eval_steps_per_second": 18.899, |
| "step": 31900 |
| }, |
| { |
| "epoch": 71.07790407540892, |
| "grad_norm": 0.24088256061077118, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 6.3789, |
| "step": 32000 |
| }, |
| { |
| "epoch": 71.07790407540892, |
| "eval_loss": 6.378951072692871, |
| "eval_runtime": 63.7166, |
| "eval_samples_per_second": 156.945, |
| "eval_steps_per_second": 19.618, |
| "step": 32000 |
| }, |
| { |
| "epoch": 71.29969503742723, |
| "grad_norm": 0.3836078643798828, |
| "learning_rate": 2.79933110367893e-06, |
| "loss": 6.3793, |
| "step": 32100 |
| }, |
| { |
| "epoch": 71.29969503742723, |
| "eval_loss": 6.382381916046143, |
| "eval_runtime": 63.7336, |
| "eval_samples_per_second": 156.903, |
| "eval_steps_per_second": 19.613, |
| "step": 32100 |
| }, |
| { |
| "epoch": 71.52148599944552, |
| "grad_norm": 0.3558043837547302, |
| "learning_rate": 2.7892976588628766e-06, |
| "loss": 6.3779, |
| "step": 32200 |
| }, |
| { |
| "epoch": 71.52148599944552, |
| "eval_loss": 6.3820366859436035, |
| "eval_runtime": 66.1055, |
| "eval_samples_per_second": 151.273, |
| "eval_steps_per_second": 18.909, |
| "step": 32200 |
| }, |
| { |
| "epoch": 71.74327696146382, |
| "grad_norm": 0.2369541972875595, |
| "learning_rate": 2.779264214046823e-06, |
| "loss": 6.3808, |
| "step": 32300 |
| }, |
| { |
| "epoch": 71.74327696146382, |
| "eval_loss": 6.37972354888916, |
| "eval_runtime": 63.6602, |
| "eval_samples_per_second": 157.084, |
| "eval_steps_per_second": 19.636, |
| "step": 32300 |
| }, |
| { |
| "epoch": 71.96506792348212, |
| "grad_norm": 0.3357178270816803, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 6.3796, |
| "step": 32400 |
| }, |
| { |
| "epoch": 71.96506792348212, |
| "eval_loss": 6.3810296058654785, |
| "eval_runtime": 66.2569, |
| "eval_samples_per_second": 150.928, |
| "eval_steps_per_second": 18.866, |
| "step": 32400 |
| }, |
| { |
| "epoch": 72.18685888550041, |
| "grad_norm": 0.2965914011001587, |
| "learning_rate": 2.759197324414716e-06, |
| "loss": 6.3794, |
| "step": 32500 |
| }, |
| { |
| "epoch": 72.18685888550041, |
| "eval_loss": 6.381561756134033, |
| "eval_runtime": 63.6325, |
| "eval_samples_per_second": 157.152, |
| "eval_steps_per_second": 19.644, |
| "step": 32500 |
| }, |
| { |
| "epoch": 72.40864984751872, |
| "grad_norm": 0.31444767117500305, |
| "learning_rate": 2.749163879598662e-06, |
| "loss": 6.3811, |
| "step": 32600 |
| }, |
| { |
| "epoch": 72.40864984751872, |
| "eval_loss": 6.383826732635498, |
| "eval_runtime": 63.819, |
| "eval_samples_per_second": 156.693, |
| "eval_steps_per_second": 19.587, |
| "step": 32600 |
| }, |
| { |
| "epoch": 72.630440809537, |
| "grad_norm": 0.335440456867218, |
| "learning_rate": 2.7391304347826087e-06, |
| "loss": 6.3787, |
| "step": 32700 |
| }, |
| { |
| "epoch": 72.630440809537, |
| "eval_loss": 6.382222652435303, |
| "eval_runtime": 66.3235, |
| "eval_samples_per_second": 150.776, |
| "eval_steps_per_second": 18.847, |
| "step": 32700 |
| }, |
| { |
| "epoch": 72.85223177155531, |
| "grad_norm": 0.3031088411808014, |
| "learning_rate": 2.729096989966555e-06, |
| "loss": 6.379, |
| "step": 32800 |
| }, |
| { |
| "epoch": 72.85223177155531, |
| "eval_loss": 6.380151748657227, |
| "eval_runtime": 63.7642, |
| "eval_samples_per_second": 156.828, |
| "eval_steps_per_second": 19.603, |
| "step": 32800 |
| }, |
| { |
| "epoch": 73.07402273357361, |
| "grad_norm": 0.2734851539134979, |
| "learning_rate": 2.7190635451505014e-06, |
| "loss": 6.3796, |
| "step": 32900 |
| }, |
| { |
| "epoch": 73.07402273357361, |
| "eval_loss": 6.381131172180176, |
| "eval_runtime": 66.3028, |
| "eval_samples_per_second": 150.823, |
| "eval_steps_per_second": 18.853, |
| "step": 32900 |
| }, |
| { |
| "epoch": 73.2958136955919, |
| "grad_norm": 0.4682227671146393, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 6.3791, |
| "step": 33000 |
| }, |
| { |
| "epoch": 73.2958136955919, |
| "eval_loss": 6.382552146911621, |
| "eval_runtime": 63.7907, |
| "eval_samples_per_second": 156.763, |
| "eval_steps_per_second": 19.595, |
| "step": 33000 |
| }, |
| { |
| "epoch": 73.5176046576102, |
| "grad_norm": 0.38640567660331726, |
| "learning_rate": 2.6989966555183945e-06, |
| "loss": 6.378, |
| "step": 33100 |
| }, |
| { |
| "epoch": 73.5176046576102, |
| "eval_loss": 6.37783670425415, |
| "eval_runtime": 66.2991, |
| "eval_samples_per_second": 150.832, |
| "eval_steps_per_second": 18.854, |
| "step": 33100 |
| }, |
| { |
| "epoch": 73.7393956196285, |
| "grad_norm": 0.3248431086540222, |
| "learning_rate": 2.6889632107023413e-06, |
| "loss": 6.3798, |
| "step": 33200 |
| }, |
| { |
| "epoch": 73.7393956196285, |
| "eval_loss": 6.382781982421875, |
| "eval_runtime": 63.7636, |
| "eval_samples_per_second": 156.829, |
| "eval_steps_per_second": 19.604, |
| "step": 33200 |
| }, |
| { |
| "epoch": 73.9611865816468, |
| "grad_norm": 0.40707120299339294, |
| "learning_rate": 2.6789297658862876e-06, |
| "loss": 6.3787, |
| "step": 33300 |
| }, |
| { |
| "epoch": 73.9611865816468, |
| "eval_loss": 6.381734371185303, |
| "eval_runtime": 63.7575, |
| "eval_samples_per_second": 156.844, |
| "eval_steps_per_second": 19.606, |
| "step": 33300 |
| }, |
| { |
| "epoch": 74.18297754366509, |
| "grad_norm": 0.3740542232990265, |
| "learning_rate": 2.668896321070234e-06, |
| "loss": 6.3799, |
| "step": 33400 |
| }, |
| { |
| "epoch": 74.18297754366509, |
| "eval_loss": 6.38359260559082, |
| "eval_runtime": 63.7058, |
| "eval_samples_per_second": 156.972, |
| "eval_steps_per_second": 19.621, |
| "step": 33400 |
| }, |
| { |
| "epoch": 74.40476850568339, |
| "grad_norm": 0.3560076057910919, |
| "learning_rate": 2.6588628762541807e-06, |
| "loss": 6.3788, |
| "step": 33500 |
| }, |
| { |
| "epoch": 74.40476850568339, |
| "eval_loss": 6.380216598510742, |
| "eval_runtime": 66.3163, |
| "eval_samples_per_second": 150.792, |
| "eval_steps_per_second": 18.849, |
| "step": 33500 |
| }, |
| { |
| "epoch": 74.6265594677017, |
| "grad_norm": 0.2998209595680237, |
| "learning_rate": 2.648829431438127e-06, |
| "loss": 6.3798, |
| "step": 33600 |
| }, |
| { |
| "epoch": 74.6265594677017, |
| "eval_loss": 6.3799357414245605, |
| "eval_runtime": 63.7525, |
| "eval_samples_per_second": 156.857, |
| "eval_steps_per_second": 19.607, |
| "step": 33600 |
| }, |
| { |
| "epoch": 74.84835042971999, |
| "grad_norm": 0.42181283235549927, |
| "learning_rate": 2.6387959866220734e-06, |
| "loss": 6.3797, |
| "step": 33700 |
| }, |
| { |
| "epoch": 74.84835042971999, |
| "eval_loss": 6.3854804039001465, |
| "eval_runtime": 63.7045, |
| "eval_samples_per_second": 156.975, |
| "eval_steps_per_second": 19.622, |
| "step": 33700 |
| }, |
| { |
| "epoch": 75.07014139173829, |
| "grad_norm": 0.35232749581336975, |
| "learning_rate": 2.62876254180602e-06, |
| "loss": 6.3794, |
| "step": 33800 |
| }, |
| { |
| "epoch": 75.07014139173829, |
| "eval_loss": 6.38426399230957, |
| "eval_runtime": 66.2061, |
| "eval_samples_per_second": 151.043, |
| "eval_steps_per_second": 18.88, |
| "step": 33800 |
| }, |
| { |
| "epoch": 75.29193235375658, |
| "grad_norm": 0.3319035470485687, |
| "learning_rate": 2.6187290969899665e-06, |
| "loss": 6.3801, |
| "step": 33900 |
| }, |
| { |
| "epoch": 75.29193235375658, |
| "eval_loss": 6.382733345031738, |
| "eval_runtime": 63.7241, |
| "eval_samples_per_second": 156.927, |
| "eval_steps_per_second": 19.616, |
| "step": 33900 |
| }, |
| { |
| "epoch": 75.51372331577488, |
| "grad_norm": 0.320116251707077, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 6.3796, |
| "step": 34000 |
| }, |
| { |
| "epoch": 75.51372331577488, |
| "eval_loss": 6.383172035217285, |
| "eval_runtime": 66.2886, |
| "eval_samples_per_second": 150.856, |
| "eval_steps_per_second": 18.857, |
| "step": 34000 |
| }, |
| { |
| "epoch": 75.73551427779319, |
| "grad_norm": 0.25732365250587463, |
| "learning_rate": 2.5986622073578596e-06, |
| "loss": 6.3793, |
| "step": 34100 |
| }, |
| { |
| "epoch": 75.73551427779319, |
| "eval_loss": 6.3826189041137695, |
| "eval_runtime": 63.7021, |
| "eval_samples_per_second": 156.981, |
| "eval_steps_per_second": 19.623, |
| "step": 34100 |
| }, |
| { |
| "epoch": 75.95730523981148, |
| "grad_norm": 0.41861915588378906, |
| "learning_rate": 2.588628762541806e-06, |
| "loss": 6.3806, |
| "step": 34200 |
| }, |
| { |
| "epoch": 75.95730523981148, |
| "eval_loss": 6.3810224533081055, |
| "eval_runtime": 63.8703, |
| "eval_samples_per_second": 156.567, |
| "eval_steps_per_second": 19.571, |
| "step": 34200 |
| }, |
| { |
| "epoch": 76.17909620182978, |
| "grad_norm": 0.37039920687675476, |
| "learning_rate": 2.5785953177257527e-06, |
| "loss": 6.3782, |
| "step": 34300 |
| }, |
| { |
| "epoch": 76.17909620182978, |
| "eval_loss": 6.384817600250244, |
| "eval_runtime": 63.7083, |
| "eval_samples_per_second": 156.965, |
| "eval_steps_per_second": 19.621, |
| "step": 34300 |
| }, |
| { |
| "epoch": 76.40088716384807, |
| "grad_norm": 0.29002711176872253, |
| "learning_rate": 2.568561872909699e-06, |
| "loss": 6.3804, |
| "step": 34400 |
| }, |
| { |
| "epoch": 76.40088716384807, |
| "eval_loss": 6.381626605987549, |
| "eval_runtime": 66.318, |
| "eval_samples_per_second": 150.789, |
| "eval_steps_per_second": 18.849, |
| "step": 34400 |
| }, |
| { |
| "epoch": 76.62267812586637, |
| "grad_norm": 0.3963169455528259, |
| "learning_rate": 2.5585284280936454e-06, |
| "loss": 6.3802, |
| "step": 34500 |
| }, |
| { |
| "epoch": 76.62267812586637, |
| "eval_loss": 6.385863304138184, |
| "eval_runtime": 63.683, |
| "eval_samples_per_second": 157.028, |
| "eval_steps_per_second": 19.628, |
| "step": 34500 |
| }, |
| { |
| "epoch": 76.84446908788468, |
| "grad_norm": 0.3641812801361084, |
| "learning_rate": 2.548494983277592e-06, |
| "loss": 6.3794, |
| "step": 34600 |
| }, |
| { |
| "epoch": 76.84446908788468, |
| "eval_loss": 6.379196643829346, |
| "eval_runtime": 63.7355, |
| "eval_samples_per_second": 156.899, |
| "eval_steps_per_second": 19.612, |
| "step": 34600 |
| }, |
| { |
| "epoch": 77.06626004990297, |
| "grad_norm": 0.34516364336013794, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 6.3781, |
| "step": 34700 |
| }, |
| { |
| "epoch": 77.06626004990297, |
| "eval_loss": 6.381167411804199, |
| "eval_runtime": 66.2593, |
| "eval_samples_per_second": 150.922, |
| "eval_steps_per_second": 18.865, |
| "step": 34700 |
| }, |
| { |
| "epoch": 77.28805101192127, |
| "grad_norm": 0.37135106325149536, |
| "learning_rate": 2.528428093645485e-06, |
| "loss": 6.3791, |
| "step": 34800 |
| }, |
| { |
| "epoch": 77.28805101192127, |
| "eval_loss": 6.3796210289001465, |
| "eval_runtime": 63.5952, |
| "eval_samples_per_second": 157.245, |
| "eval_steps_per_second": 19.656, |
| "step": 34800 |
| }, |
| { |
| "epoch": 77.50984197393956, |
| "grad_norm": 0.27615365386009216, |
| "learning_rate": 2.5183946488294316e-06, |
| "loss": 6.3788, |
| "step": 34900 |
| }, |
| { |
| "epoch": 77.50984197393956, |
| "eval_loss": 6.38156270980835, |
| "eval_runtime": 63.666, |
| "eval_samples_per_second": 157.07, |
| "eval_steps_per_second": 19.634, |
| "step": 34900 |
| }, |
| { |
| "epoch": 77.73163293595786, |
| "grad_norm": 0.40949293971061707, |
| "learning_rate": 2.508361204013378e-06, |
| "loss": 6.3784, |
| "step": 35000 |
| }, |
| { |
| "epoch": 77.73163293595786, |
| "eval_loss": 6.379955291748047, |
| "eval_runtime": 66.2108, |
| "eval_samples_per_second": 151.033, |
| "eval_steps_per_second": 18.879, |
| "step": 35000 |
| }, |
| { |
| "epoch": 77.95342389797615, |
| "grad_norm": 0.21426652371883392, |
| "learning_rate": 2.4983277591973247e-06, |
| "loss": 6.3792, |
| "step": 35100 |
| }, |
| { |
| "epoch": 77.95342389797615, |
| "eval_loss": 6.38067102432251, |
| "eval_runtime": 63.6933, |
| "eval_samples_per_second": 157.002, |
| "eval_steps_per_second": 19.625, |
| "step": 35100 |
| }, |
| { |
| "epoch": 78.17521485999445, |
| "grad_norm": 0.3121929466724396, |
| "learning_rate": 2.488294314381271e-06, |
| "loss": 6.3799, |
| "step": 35200 |
| }, |
| { |
| "epoch": 78.17521485999445, |
| "eval_loss": 6.383203983306885, |
| "eval_runtime": 63.6759, |
| "eval_samples_per_second": 157.045, |
| "eval_steps_per_second": 19.631, |
| "step": 35200 |
| }, |
| { |
| "epoch": 78.39700582201276, |
| "grad_norm": 0.3007084131240845, |
| "learning_rate": 2.4782608695652173e-06, |
| "loss": 6.3782, |
| "step": 35300 |
| }, |
| { |
| "epoch": 78.39700582201276, |
| "eval_loss": 6.380030632019043, |
| "eval_runtime": 65.4722, |
| "eval_samples_per_second": 152.737, |
| "eval_steps_per_second": 19.092, |
| "step": 35300 |
| }, |
| { |
| "epoch": 78.61879678403105, |
| "grad_norm": 0.4903746247291565, |
| "learning_rate": 2.468227424749164e-06, |
| "loss": 6.3791, |
| "step": 35400 |
| }, |
| { |
| "epoch": 78.61879678403105, |
| "eval_loss": 6.382900714874268, |
| "eval_runtime": 64.4146, |
| "eval_samples_per_second": 155.244, |
| "eval_steps_per_second": 19.406, |
| "step": 35400 |
| }, |
| { |
| "epoch": 78.84058774604935, |
| "grad_norm": 0.41819822788238525, |
| "learning_rate": 2.4581939799331104e-06, |
| "loss": 6.3779, |
| "step": 35500 |
| }, |
| { |
| "epoch": 78.84058774604935, |
| "eval_loss": 6.380439281463623, |
| "eval_runtime": 63.6771, |
| "eval_samples_per_second": 157.042, |
| "eval_steps_per_second": 19.63, |
| "step": 35500 |
| }, |
| { |
| "epoch": 79.06237870806764, |
| "grad_norm": 0.4207383990287781, |
| "learning_rate": 2.4481605351170568e-06, |
| "loss": 6.3794, |
| "step": 35600 |
| }, |
| { |
| "epoch": 79.06237870806764, |
| "eval_loss": 6.381216049194336, |
| "eval_runtime": 63.8668, |
| "eval_samples_per_second": 156.576, |
| "eval_steps_per_second": 19.572, |
| "step": 35600 |
| }, |
| { |
| "epoch": 79.28416967008594, |
| "grad_norm": 0.3821701109409332, |
| "learning_rate": 2.4381270903010035e-06, |
| "loss": 6.3796, |
| "step": 35700 |
| }, |
| { |
| "epoch": 79.28416967008594, |
| "eval_loss": 6.380701541900635, |
| "eval_runtime": 66.2305, |
| "eval_samples_per_second": 150.988, |
| "eval_steps_per_second": 18.873, |
| "step": 35700 |
| }, |
| { |
| "epoch": 79.50596063210425, |
| "grad_norm": 0.3124147653579712, |
| "learning_rate": 2.42809364548495e-06, |
| "loss": 6.3792, |
| "step": 35800 |
| }, |
| { |
| "epoch": 79.50596063210425, |
| "eval_loss": 6.383649826049805, |
| "eval_runtime": 63.6853, |
| "eval_samples_per_second": 157.022, |
| "eval_steps_per_second": 19.628, |
| "step": 35800 |
| }, |
| { |
| "epoch": 79.72775159412254, |
| "grad_norm": 0.37319284677505493, |
| "learning_rate": 2.4180602006688962e-06, |
| "loss": 6.3793, |
| "step": 35900 |
| }, |
| { |
| "epoch": 79.72775159412254, |
| "eval_loss": 6.379690647125244, |
| "eval_runtime": 63.71, |
| "eval_samples_per_second": 156.961, |
| "eval_steps_per_second": 19.62, |
| "step": 35900 |
| }, |
| { |
| "epoch": 79.94954255614084, |
| "grad_norm": 0.3518475890159607, |
| "learning_rate": 2.408026755852843e-06, |
| "loss": 6.3794, |
| "step": 36000 |
| }, |
| { |
| "epoch": 79.94954255614084, |
| "eval_loss": 6.3837385177612305, |
| "eval_runtime": 66.2591, |
| "eval_samples_per_second": 150.923, |
| "eval_steps_per_second": 18.865, |
| "step": 36000 |
| }, |
| { |
| "epoch": 80.17133351815913, |
| "grad_norm": 0.3394939601421356, |
| "learning_rate": 2.3979933110367893e-06, |
| "loss": 6.3779, |
| "step": 36100 |
| }, |
| { |
| "epoch": 80.17133351815913, |
| "eval_loss": 6.383784294128418, |
| "eval_runtime": 63.536, |
| "eval_samples_per_second": 157.391, |
| "eval_steps_per_second": 19.674, |
| "step": 36100 |
| }, |
| { |
| "epoch": 80.39312448017743, |
| "grad_norm": 0.2030980885028839, |
| "learning_rate": 2.387959866220736e-06, |
| "loss": 6.3787, |
| "step": 36200 |
| }, |
| { |
| "epoch": 80.39312448017743, |
| "eval_loss": 6.381889820098877, |
| "eval_runtime": 63.5998, |
| "eval_samples_per_second": 157.233, |
| "eval_steps_per_second": 19.654, |
| "step": 36200 |
| }, |
| { |
| "epoch": 80.61491544219572, |
| "grad_norm": 0.35631629824638367, |
| "learning_rate": 2.3779264214046824e-06, |
| "loss": 6.3778, |
| "step": 36300 |
| }, |
| { |
| "epoch": 80.61491544219572, |
| "eval_loss": 6.382266998291016, |
| "eval_runtime": 66.2682, |
| "eval_samples_per_second": 150.902, |
| "eval_steps_per_second": 18.863, |
| "step": 36300 |
| }, |
| { |
| "epoch": 80.83670640421403, |
| "grad_norm": 0.38831663131713867, |
| "learning_rate": 2.3678929765886288e-06, |
| "loss": 6.3796, |
| "step": 36400 |
| }, |
| { |
| "epoch": 80.83670640421403, |
| "eval_loss": 6.379624843597412, |
| "eval_runtime": 63.7336, |
| "eval_samples_per_second": 156.903, |
| "eval_steps_per_second": 19.613, |
| "step": 36400 |
| }, |
| { |
| "epoch": 81.05849736623233, |
| "grad_norm": 0.29808080196380615, |
| "learning_rate": 2.3578595317725755e-06, |
| "loss": 6.3787, |
| "step": 36500 |
| }, |
| { |
| "epoch": 81.05849736623233, |
| "eval_loss": 6.380765914916992, |
| "eval_runtime": 63.6941, |
| "eval_samples_per_second": 157.0, |
| "eval_steps_per_second": 19.625, |
| "step": 36500 |
| }, |
| { |
| "epoch": 81.28028832825062, |
| "grad_norm": 0.32311221957206726, |
| "learning_rate": 2.347826086956522e-06, |
| "loss": 6.3795, |
| "step": 36600 |
| }, |
| { |
| "epoch": 81.28028832825062, |
| "eval_loss": 6.38113260269165, |
| "eval_runtime": 66.3064, |
| "eval_samples_per_second": 150.815, |
| "eval_steps_per_second": 18.852, |
| "step": 36600 |
| }, |
| { |
| "epoch": 81.50207929026892, |
| "grad_norm": 0.3027205765247345, |
| "learning_rate": 2.337792642140468e-06, |
| "loss": 6.3777, |
| "step": 36700 |
| }, |
| { |
| "epoch": 81.50207929026892, |
| "eval_loss": 6.378735542297363, |
| "eval_runtime": 63.8029, |
| "eval_samples_per_second": 156.733, |
| "eval_steps_per_second": 19.592, |
| "step": 36700 |
| }, |
| { |
| "epoch": 81.72387025228721, |
| "grad_norm": 0.44942182302474976, |
| "learning_rate": 2.327759197324415e-06, |
| "loss": 6.3793, |
| "step": 36800 |
| }, |
| { |
| "epoch": 81.72387025228721, |
| "eval_loss": 6.382872104644775, |
| "eval_runtime": 63.7382, |
| "eval_samples_per_second": 156.892, |
| "eval_steps_per_second": 19.611, |
| "step": 36800 |
| }, |
| { |
| "epoch": 81.94566121430552, |
| "grad_norm": 0.3363696038722992, |
| "learning_rate": 2.3177257525083613e-06, |
| "loss": 6.3786, |
| "step": 36900 |
| }, |
| { |
| "epoch": 81.94566121430552, |
| "eval_loss": 6.3805928230285645, |
| "eval_runtime": 66.2659, |
| "eval_samples_per_second": 150.907, |
| "eval_steps_per_second": 18.863, |
| "step": 36900 |
| }, |
| { |
| "epoch": 82.16745217632382, |
| "grad_norm": 0.3836919367313385, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 6.378, |
| "step": 37000 |
| }, |
| { |
| "epoch": 82.16745217632382, |
| "eval_loss": 6.381478786468506, |
| "eval_runtime": 63.6472, |
| "eval_samples_per_second": 157.116, |
| "eval_steps_per_second": 19.64, |
| "step": 37000 |
| }, |
| { |
| "epoch": 82.38924313834211, |
| "grad_norm": 0.3322221338748932, |
| "learning_rate": 2.2976588628762544e-06, |
| "loss": 6.3774, |
| "step": 37100 |
| }, |
| { |
| "epoch": 82.38924313834211, |
| "eval_loss": 6.381748199462891, |
| "eval_runtime": 63.7815, |
| "eval_samples_per_second": 156.785, |
| "eval_steps_per_second": 19.598, |
| "step": 37100 |
| }, |
| { |
| "epoch": 82.61103410036041, |
| "grad_norm": 0.33737483620643616, |
| "learning_rate": 2.2876254180602008e-06, |
| "loss": 6.3792, |
| "step": 37200 |
| }, |
| { |
| "epoch": 82.61103410036041, |
| "eval_loss": 6.381521224975586, |
| "eval_runtime": 66.3297, |
| "eval_samples_per_second": 150.762, |
| "eval_steps_per_second": 18.845, |
| "step": 37200 |
| }, |
| { |
| "epoch": 82.8328250623787, |
| "grad_norm": 0.34915590286254883, |
| "learning_rate": 2.2775919732441475e-06, |
| "loss": 6.3792, |
| "step": 37300 |
| }, |
| { |
| "epoch": 82.8328250623787, |
| "eval_loss": 6.382421493530273, |
| "eval_runtime": 63.7523, |
| "eval_samples_per_second": 156.857, |
| "eval_steps_per_second": 19.607, |
| "step": 37300 |
| }, |
| { |
| "epoch": 83.054616024397, |
| "grad_norm": 0.2967890202999115, |
| "learning_rate": 2.267558528428094e-06, |
| "loss": 6.379, |
| "step": 37400 |
| }, |
| { |
| "epoch": 83.054616024397, |
| "eval_loss": 6.379049301147461, |
| "eval_runtime": 63.7339, |
| "eval_samples_per_second": 156.902, |
| "eval_steps_per_second": 19.613, |
| "step": 37400 |
| }, |
| { |
| "epoch": 83.27640698641531, |
| "grad_norm": 0.3198423385620117, |
| "learning_rate": 2.25752508361204e-06, |
| "loss": 6.3784, |
| "step": 37500 |
| }, |
| { |
| "epoch": 83.27640698641531, |
| "eval_loss": 6.380875110626221, |
| "eval_runtime": 66.2715, |
| "eval_samples_per_second": 150.894, |
| "eval_steps_per_second": 18.862, |
| "step": 37500 |
| }, |
| { |
| "epoch": 83.4981979484336, |
| "grad_norm": 0.22756338119506836, |
| "learning_rate": 2.2474916387959865e-06, |
| "loss": 6.3772, |
| "step": 37600 |
| }, |
| { |
| "epoch": 83.4981979484336, |
| "eval_loss": 6.380899429321289, |
| "eval_runtime": 63.6746, |
| "eval_samples_per_second": 157.048, |
| "eval_steps_per_second": 19.631, |
| "step": 37600 |
| }, |
| { |
| "epoch": 83.7199889104519, |
| "grad_norm": 0.375475138425827, |
| "learning_rate": 2.237458193979933e-06, |
| "loss": 6.38, |
| "step": 37700 |
| }, |
| { |
| "epoch": 83.7199889104519, |
| "eval_loss": 6.379432201385498, |
| "eval_runtime": 63.7694, |
| "eval_samples_per_second": 156.815, |
| "eval_steps_per_second": 19.602, |
| "step": 37700 |
| }, |
| { |
| "epoch": 83.99805932908234, |
| "grad_norm": 0.26553675532341003, |
| "learning_rate": 2.2274247491638796e-06, |
| "loss": 6.3791, |
| "step": 37800 |
| }, |
| { |
| "epoch": 83.99805932908234, |
| "eval_loss": 6.386465072631836, |
| "eval_runtime": 66.2313, |
| "eval_samples_per_second": 150.986, |
| "eval_steps_per_second": 18.873, |
| "step": 37800 |
| }, |
| { |
| "epoch": 84.21985029110064, |
| "grad_norm": 0.2572327256202698, |
| "learning_rate": 2.217391304347826e-06, |
| "loss": 6.3779, |
| "step": 37900 |
| }, |
| { |
| "epoch": 84.21985029110064, |
| "eval_loss": 6.381786823272705, |
| "eval_runtime": 63.8252, |
| "eval_samples_per_second": 156.678, |
| "eval_steps_per_second": 19.585, |
| "step": 37900 |
| }, |
| { |
| "epoch": 84.44164125311893, |
| "grad_norm": 0.3603324294090271, |
| "learning_rate": 2.2073578595317723e-06, |
| "loss": 6.3796, |
| "step": 38000 |
| }, |
| { |
| "epoch": 84.44164125311893, |
| "eval_loss": 6.381040573120117, |
| "eval_runtime": 64.0412, |
| "eval_samples_per_second": 156.15, |
| "eval_steps_per_second": 19.519, |
| "step": 38000 |
| }, |
| { |
| "epoch": 84.66343221513723, |
| "grad_norm": 0.3384093642234802, |
| "learning_rate": 2.197324414715719e-06, |
| "loss": 6.3778, |
| "step": 38100 |
| }, |
| { |
| "epoch": 84.66343221513723, |
| "eval_loss": 6.377985000610352, |
| "eval_runtime": 66.1598, |
| "eval_samples_per_second": 151.149, |
| "eval_steps_per_second": 18.894, |
| "step": 38100 |
| }, |
| { |
| "epoch": 84.88522317715552, |
| "grad_norm": 0.3742137551307678, |
| "learning_rate": 2.1872909698996654e-06, |
| "loss": 6.3788, |
| "step": 38200 |
| }, |
| { |
| "epoch": 84.88522317715552, |
| "eval_loss": 6.382181167602539, |
| "eval_runtime": 63.6067, |
| "eval_samples_per_second": 157.216, |
| "eval_steps_per_second": 19.652, |
| "step": 38200 |
| }, |
| { |
| "epoch": 85.10701413917383, |
| "grad_norm": 0.31179383397102356, |
| "learning_rate": 2.177257525083612e-06, |
| "loss": 6.3771, |
| "step": 38300 |
| }, |
| { |
| "epoch": 85.10701413917383, |
| "eval_loss": 6.380379676818848, |
| "eval_runtime": 66.212, |
| "eval_samples_per_second": 151.03, |
| "eval_steps_per_second": 18.879, |
| "step": 38300 |
| }, |
| { |
| "epoch": 85.32880510119213, |
| "grad_norm": 0.36700376868247986, |
| "learning_rate": 2.1672240802675585e-06, |
| "loss": 6.3767, |
| "step": 38400 |
| }, |
| { |
| "epoch": 85.32880510119213, |
| "eval_loss": 6.3812575340271, |
| "eval_runtime": 63.6889, |
| "eval_samples_per_second": 157.013, |
| "eval_steps_per_second": 19.627, |
| "step": 38400 |
| }, |
| { |
| "epoch": 85.55059606321042, |
| "grad_norm": 0.38559991121292114, |
| "learning_rate": 2.157190635451505e-06, |
| "loss": 6.3781, |
| "step": 38500 |
| }, |
| { |
| "epoch": 85.55059606321042, |
| "eval_loss": 6.384213447570801, |
| "eval_runtime": 66.1477, |
| "eval_samples_per_second": 151.177, |
| "eval_steps_per_second": 18.897, |
| "step": 38500 |
| }, |
| { |
| "epoch": 85.77238702522872, |
| "grad_norm": 0.2753937244415283, |
| "learning_rate": 2.1471571906354516e-06, |
| "loss": 6.3795, |
| "step": 38600 |
| }, |
| { |
| "epoch": 85.77238702522872, |
| "eval_loss": 6.37845516204834, |
| "eval_runtime": 63.6513, |
| "eval_samples_per_second": 157.106, |
| "eval_steps_per_second": 19.638, |
| "step": 38600 |
| }, |
| { |
| "epoch": 85.99417798724701, |
| "grad_norm": 0.22831951081752777, |
| "learning_rate": 2.137123745819398e-06, |
| "loss": 6.3789, |
| "step": 38700 |
| }, |
| { |
| "epoch": 85.99417798724701, |
| "eval_loss": 6.381505966186523, |
| "eval_runtime": 66.231, |
| "eval_samples_per_second": 150.987, |
| "eval_steps_per_second": 18.873, |
| "step": 38700 |
| }, |
| { |
| "epoch": 86.21596894926532, |
| "grad_norm": 0.339546799659729, |
| "learning_rate": 2.1270903010033443e-06, |
| "loss": 6.379, |
| "step": 38800 |
| }, |
| { |
| "epoch": 86.21596894926532, |
| "eval_loss": 6.381498336791992, |
| "eval_runtime": 63.6802, |
| "eval_samples_per_second": 157.035, |
| "eval_steps_per_second": 19.629, |
| "step": 38800 |
| }, |
| { |
| "epoch": 86.43775991128362, |
| "grad_norm": 0.2600659728050232, |
| "learning_rate": 2.117056856187291e-06, |
| "loss": 6.3774, |
| "step": 38900 |
| }, |
| { |
| "epoch": 86.43775991128362, |
| "eval_loss": 6.381589889526367, |
| "eval_runtime": 63.6804, |
| "eval_samples_per_second": 157.034, |
| "eval_steps_per_second": 19.629, |
| "step": 38900 |
| }, |
| { |
| "epoch": 86.65955087330191, |
| "grad_norm": 0.32178473472595215, |
| "learning_rate": 2.1070234113712374e-06, |
| "loss": 6.3785, |
| "step": 39000 |
| }, |
| { |
| "epoch": 86.65955087330191, |
| "eval_loss": 6.377468585968018, |
| "eval_runtime": 66.295, |
| "eval_samples_per_second": 150.841, |
| "eval_steps_per_second": 18.855, |
| "step": 39000 |
| }, |
| { |
| "epoch": 86.88134183532021, |
| "grad_norm": 0.28717854619026184, |
| "learning_rate": 2.0969899665551837e-06, |
| "loss": 6.377, |
| "step": 39100 |
| }, |
| { |
| "epoch": 86.88134183532021, |
| "eval_loss": 6.3805928230285645, |
| "eval_runtime": 63.7605, |
| "eval_samples_per_second": 156.837, |
| "eval_steps_per_second": 19.605, |
| "step": 39100 |
| }, |
| { |
| "epoch": 87.1031327973385, |
| "grad_norm": 0.2932318150997162, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 6.3791, |
| "step": 39200 |
| }, |
| { |
| "epoch": 87.1031327973385, |
| "eval_loss": 6.380700588226318, |
| "eval_runtime": 63.6583, |
| "eval_samples_per_second": 157.089, |
| "eval_steps_per_second": 19.636, |
| "step": 39200 |
| }, |
| { |
| "epoch": 87.3249237593568, |
| "grad_norm": 0.39832741022109985, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 6.3784, |
| "step": 39300 |
| }, |
| { |
| "epoch": 87.3249237593568, |
| "eval_loss": 6.37957763671875, |
| "eval_runtime": 66.5746, |
| "eval_samples_per_second": 150.207, |
| "eval_steps_per_second": 18.776, |
| "step": 39300 |
| }, |
| { |
| "epoch": 87.54671472137511, |
| "grad_norm": 0.3088468611240387, |
| "learning_rate": 2.0668896321070236e-06, |
| "loss": 6.3774, |
| "step": 39400 |
| }, |
| { |
| "epoch": 87.54671472137511, |
| "eval_loss": 6.379054069519043, |
| "eval_runtime": 66.2367, |
| "eval_samples_per_second": 150.974, |
| "eval_steps_per_second": 18.872, |
| "step": 39400 |
| }, |
| { |
| "epoch": 87.7685056833934, |
| "grad_norm": 0.284956693649292, |
| "learning_rate": 2.05685618729097e-06, |
| "loss": 6.3777, |
| "step": 39500 |
| }, |
| { |
| "epoch": 87.7685056833934, |
| "eval_loss": 6.381918907165527, |
| "eval_runtime": 66.2296, |
| "eval_samples_per_second": 150.99, |
| "eval_steps_per_second": 18.874, |
| "step": 39500 |
| }, |
| { |
| "epoch": 87.9902966454117, |
| "grad_norm": 0.26180529594421387, |
| "learning_rate": 2.0468227424749163e-06, |
| "loss": 6.3779, |
| "step": 39600 |
| }, |
| { |
| "epoch": 87.9902966454117, |
| "eval_loss": 6.381536483764648, |
| "eval_runtime": 66.1969, |
| "eval_samples_per_second": 151.064, |
| "eval_steps_per_second": 18.883, |
| "step": 39600 |
| }, |
| { |
| "epoch": 88.21208760742999, |
| "grad_norm": 0.39024651050567627, |
| "learning_rate": 2.036789297658863e-06, |
| "loss": 6.377, |
| "step": 39700 |
| }, |
| { |
| "epoch": 88.21208760742999, |
| "eval_loss": 6.3777618408203125, |
| "eval_runtime": 66.1491, |
| "eval_samples_per_second": 151.174, |
| "eval_steps_per_second": 18.897, |
| "step": 39700 |
| }, |
| { |
| "epoch": 88.4338785694483, |
| "grad_norm": 0.2729719579219818, |
| "learning_rate": 2.0267558528428094e-06, |
| "loss": 6.3782, |
| "step": 39800 |
| }, |
| { |
| "epoch": 88.4338785694483, |
| "eval_loss": 6.382574081420898, |
| "eval_runtime": 66.1908, |
| "eval_samples_per_second": 151.078, |
| "eval_steps_per_second": 18.885, |
| "step": 39800 |
| }, |
| { |
| "epoch": 88.65566953146659, |
| "grad_norm": 0.23033183813095093, |
| "learning_rate": 2.0167224080267557e-06, |
| "loss": 6.3776, |
| "step": 39900 |
| }, |
| { |
| "epoch": 88.65566953146659, |
| "eval_loss": 6.378293514251709, |
| "eval_runtime": 66.1466, |
| "eval_samples_per_second": 151.179, |
| "eval_steps_per_second": 18.897, |
| "step": 39900 |
| }, |
| { |
| "epoch": 88.87746049348489, |
| "grad_norm": 0.43995988368988037, |
| "learning_rate": 2.0066889632107025e-06, |
| "loss": 6.3793, |
| "step": 40000 |
| }, |
| { |
| "epoch": 88.87746049348489, |
| "eval_loss": 6.380235195159912, |
| "eval_runtime": 66.2981, |
| "eval_samples_per_second": 150.834, |
| "eval_steps_per_second": 18.854, |
| "step": 40000 |
| }, |
| { |
| "epoch": 89.0992514555032, |
| "grad_norm": 0.2878618836402893, |
| "learning_rate": 1.996655518394649e-06, |
| "loss": 6.3783, |
| "step": 40100 |
| }, |
| { |
| "epoch": 89.0992514555032, |
| "eval_loss": 6.379173755645752, |
| "eval_runtime": 63.6984, |
| "eval_samples_per_second": 156.99, |
| "eval_steps_per_second": 19.624, |
| "step": 40100 |
| }, |
| { |
| "epoch": 89.32104241752148, |
| "grad_norm": 0.22379851341247559, |
| "learning_rate": 1.986622073578595e-06, |
| "loss": 6.3785, |
| "step": 40200 |
| }, |
| { |
| "epoch": 89.32104241752148, |
| "eval_loss": 6.38173770904541, |
| "eval_runtime": 63.792, |
| "eval_samples_per_second": 156.759, |
| "eval_steps_per_second": 19.595, |
| "step": 40200 |
| }, |
| { |
| "epoch": 89.54283337953979, |
| "grad_norm": 0.3321212828159332, |
| "learning_rate": 1.976588628762542e-06, |
| "loss": 6.3775, |
| "step": 40300 |
| }, |
| { |
| "epoch": 89.54283337953979, |
| "eval_loss": 6.377793788909912, |
| "eval_runtime": 66.2425, |
| "eval_samples_per_second": 150.96, |
| "eval_steps_per_second": 18.87, |
| "step": 40300 |
| }, |
| { |
| "epoch": 89.76462434155808, |
| "grad_norm": 0.3513726592063904, |
| "learning_rate": 1.9665551839464883e-06, |
| "loss": 6.3777, |
| "step": 40400 |
| }, |
| { |
| "epoch": 89.76462434155808, |
| "eval_loss": 6.38060998916626, |
| "eval_runtime": 66.2865, |
| "eval_samples_per_second": 150.86, |
| "eval_steps_per_second": 18.858, |
| "step": 40400 |
| }, |
| { |
| "epoch": 89.98641530357638, |
| "grad_norm": 0.3225536048412323, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 6.3781, |
| "step": 40500 |
| }, |
| { |
| "epoch": 89.98641530357638, |
| "eval_loss": 6.3820648193359375, |
| "eval_runtime": 63.6933, |
| "eval_samples_per_second": 157.002, |
| "eval_steps_per_second": 19.625, |
| "step": 40500 |
| }, |
| { |
| "epoch": 90.20820626559468, |
| "grad_norm": 0.3866877853870392, |
| "learning_rate": 1.9464882943143814e-06, |
| "loss": 6.3772, |
| "step": 40600 |
| }, |
| { |
| "epoch": 90.20820626559468, |
| "eval_loss": 6.382141590118408, |
| "eval_runtime": 63.7832, |
| "eval_samples_per_second": 156.781, |
| "eval_steps_per_second": 19.598, |
| "step": 40600 |
| }, |
| { |
| "epoch": 90.42999722761297, |
| "grad_norm": 0.43070387840270996, |
| "learning_rate": 1.9364548494983277e-06, |
| "loss": 6.3778, |
| "step": 40700 |
| }, |
| { |
| "epoch": 90.42999722761297, |
| "eval_loss": 6.375494480133057, |
| "eval_runtime": 65.724, |
| "eval_samples_per_second": 152.152, |
| "eval_steps_per_second": 19.019, |
| "step": 40700 |
| }, |
| { |
| "epoch": 90.65178818963128, |
| "grad_norm": 0.35665562748908997, |
| "learning_rate": 1.9264214046822745e-06, |
| "loss": 6.3767, |
| "step": 40800 |
| }, |
| { |
| "epoch": 90.65178818963128, |
| "eval_loss": 6.379345417022705, |
| "eval_runtime": 64.4622, |
| "eval_samples_per_second": 155.13, |
| "eval_steps_per_second": 19.391, |
| "step": 40800 |
| }, |
| { |
| "epoch": 90.87357915164957, |
| "grad_norm": 0.35841798782348633, |
| "learning_rate": 1.916387959866221e-06, |
| "loss": 6.3765, |
| "step": 40900 |
| }, |
| { |
| "epoch": 90.87357915164957, |
| "eval_loss": 6.379830360412598, |
| "eval_runtime": 66.3033, |
| "eval_samples_per_second": 150.822, |
| "eval_steps_per_second": 18.853, |
| "step": 40900 |
| }, |
| { |
| "epoch": 91.09537011366787, |
| "grad_norm": 0.29910504817962646, |
| "learning_rate": 1.9063545150501674e-06, |
| "loss": 6.3774, |
| "step": 41000 |
| }, |
| { |
| "epoch": 91.09537011366787, |
| "eval_loss": 6.380716323852539, |
| "eval_runtime": 63.7905, |
| "eval_samples_per_second": 156.763, |
| "eval_steps_per_second": 19.595, |
| "step": 41000 |
| }, |
| { |
| "epoch": 91.31716107568617, |
| "grad_norm": 0.3775427043437958, |
| "learning_rate": 1.896321070234114e-06, |
| "loss": 6.3784, |
| "step": 41100 |
| }, |
| { |
| "epoch": 91.31716107568617, |
| "eval_loss": 6.38125467300415, |
| "eval_runtime": 63.7934, |
| "eval_samples_per_second": 156.756, |
| "eval_steps_per_second": 19.595, |
| "step": 41100 |
| }, |
| { |
| "epoch": 91.53895203770446, |
| "grad_norm": 0.2421695590019226, |
| "learning_rate": 1.8862876254180603e-06, |
| "loss": 6.377, |
| "step": 41200 |
| }, |
| { |
| "epoch": 91.53895203770446, |
| "eval_loss": 6.381397724151611, |
| "eval_runtime": 66.2535, |
| "eval_samples_per_second": 150.935, |
| "eval_steps_per_second": 18.867, |
| "step": 41200 |
| }, |
| { |
| "epoch": 91.76074299972277, |
| "grad_norm": 0.2967372238636017, |
| "learning_rate": 1.8762541806020068e-06, |
| "loss": 6.3783, |
| "step": 41300 |
| }, |
| { |
| "epoch": 91.76074299972277, |
| "eval_loss": 6.380742073059082, |
| "eval_runtime": 63.7433, |
| "eval_samples_per_second": 156.879, |
| "eval_steps_per_second": 19.61, |
| "step": 41300 |
| }, |
| { |
| "epoch": 91.98253396174105, |
| "grad_norm": 0.3849758505821228, |
| "learning_rate": 1.8662207357859534e-06, |
| "loss": 6.3789, |
| "step": 41400 |
| }, |
| { |
| "epoch": 91.98253396174105, |
| "eval_loss": 6.3830342292785645, |
| "eval_runtime": 66.3151, |
| "eval_samples_per_second": 150.795, |
| "eval_steps_per_second": 18.849, |
| "step": 41400 |
| }, |
| { |
| "epoch": 92.20432492375936, |
| "grad_norm": 0.377841055393219, |
| "learning_rate": 1.8561872909699e-06, |
| "loss": 6.3769, |
| "step": 41500 |
| }, |
| { |
| "epoch": 92.20432492375936, |
| "eval_loss": 6.381241798400879, |
| "eval_runtime": 63.6855, |
| "eval_samples_per_second": 157.022, |
| "eval_steps_per_second": 19.628, |
| "step": 41500 |
| }, |
| { |
| "epoch": 92.42611588577765, |
| "grad_norm": 0.4062901437282562, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 6.3791, |
| "step": 41600 |
| }, |
| { |
| "epoch": 92.42611588577765, |
| "eval_loss": 6.378665924072266, |
| "eval_runtime": 63.7914, |
| "eval_samples_per_second": 156.761, |
| "eval_steps_per_second": 19.595, |
| "step": 41600 |
| }, |
| { |
| "epoch": 92.64790684779595, |
| "grad_norm": 0.33464646339416504, |
| "learning_rate": 1.8361204013377928e-06, |
| "loss": 6.3782, |
| "step": 41700 |
| }, |
| { |
| "epoch": 92.64790684779595, |
| "eval_loss": 6.379201412200928, |
| "eval_runtime": 66.2071, |
| "eval_samples_per_second": 151.041, |
| "eval_steps_per_second": 18.88, |
| "step": 41700 |
| }, |
| { |
| "epoch": 92.86969780981426, |
| "grad_norm": 0.26191645860671997, |
| "learning_rate": 1.8260869565217394e-06, |
| "loss": 6.3768, |
| "step": 41800 |
| }, |
| { |
| "epoch": 92.86969780981426, |
| "eval_loss": 6.380030632019043, |
| "eval_runtime": 63.7463, |
| "eval_samples_per_second": 156.872, |
| "eval_steps_per_second": 19.609, |
| "step": 41800 |
| }, |
| { |
| "epoch": 93.09148877183254, |
| "grad_norm": 0.4473271667957306, |
| "learning_rate": 1.8160535117056857e-06, |
| "loss": 6.376, |
| "step": 41900 |
| }, |
| { |
| "epoch": 93.09148877183254, |
| "eval_loss": 6.383362293243408, |
| "eval_runtime": 66.2652, |
| "eval_samples_per_second": 150.909, |
| "eval_steps_per_second": 18.864, |
| "step": 41900 |
| }, |
| { |
| "epoch": 93.31327973385085, |
| "grad_norm": 0.30396267771720886, |
| "learning_rate": 1.8060200668896322e-06, |
| "loss": 6.3782, |
| "step": 42000 |
| }, |
| { |
| "epoch": 93.31327973385085, |
| "eval_loss": 6.382277965545654, |
| "eval_runtime": 63.811, |
| "eval_samples_per_second": 156.713, |
| "eval_steps_per_second": 19.589, |
| "step": 42000 |
| }, |
| { |
| "epoch": 93.53507069586914, |
| "grad_norm": 0.2819732129573822, |
| "learning_rate": 1.7959866220735788e-06, |
| "loss": 6.3782, |
| "step": 42100 |
| }, |
| { |
| "epoch": 93.53507069586914, |
| "eval_loss": 6.381258010864258, |
| "eval_runtime": 63.7343, |
| "eval_samples_per_second": 156.901, |
| "eval_steps_per_second": 19.613, |
| "step": 42100 |
| }, |
| { |
| "epoch": 93.75686165788744, |
| "grad_norm": 0.2994706630706787, |
| "learning_rate": 1.7859531772575253e-06, |
| "loss": 6.3786, |
| "step": 42200 |
| }, |
| { |
| "epoch": 93.75686165788744, |
| "eval_loss": 6.381169319152832, |
| "eval_runtime": 66.2919, |
| "eval_samples_per_second": 150.848, |
| "eval_steps_per_second": 18.856, |
| "step": 42200 |
| }, |
| { |
| "epoch": 93.97865261990574, |
| "grad_norm": 0.31294333934783936, |
| "learning_rate": 1.7759197324414717e-06, |
| "loss": 6.3766, |
| "step": 42300 |
| }, |
| { |
| "epoch": 93.97865261990574, |
| "eval_loss": 6.379955291748047, |
| "eval_runtime": 63.7737, |
| "eval_samples_per_second": 156.804, |
| "eval_steps_per_second": 19.601, |
| "step": 42300 |
| }, |
| { |
| "epoch": 94.20044358192403, |
| "grad_norm": 0.291477769613266, |
| "learning_rate": 1.7658862876254182e-06, |
| "loss": 6.3777, |
| "step": 42400 |
| }, |
| { |
| "epoch": 94.20044358192403, |
| "eval_loss": 6.379477500915527, |
| "eval_runtime": 66.2866, |
| "eval_samples_per_second": 150.86, |
| "eval_steps_per_second": 18.857, |
| "step": 42400 |
| }, |
| { |
| "epoch": 94.42223454394234, |
| "grad_norm": 0.23638038337230682, |
| "learning_rate": 1.7558528428093648e-06, |
| "loss": 6.3781, |
| "step": 42500 |
| }, |
| { |
| "epoch": 94.42223454394234, |
| "eval_loss": 6.380892753601074, |
| "eval_runtime": 63.8247, |
| "eval_samples_per_second": 156.679, |
| "eval_steps_per_second": 19.585, |
| "step": 42500 |
| }, |
| { |
| "epoch": 94.64402550596063, |
| "grad_norm": 0.3445935547351837, |
| "learning_rate": 1.745819397993311e-06, |
| "loss": 6.3768, |
| "step": 42600 |
| }, |
| { |
| "epoch": 94.64402550596063, |
| "eval_loss": 6.382579803466797, |
| "eval_runtime": 63.8197, |
| "eval_samples_per_second": 156.691, |
| "eval_steps_per_second": 19.586, |
| "step": 42600 |
| }, |
| { |
| "epoch": 94.86581646797893, |
| "grad_norm": 0.3376341462135315, |
| "learning_rate": 1.7357859531772575e-06, |
| "loss": 6.3768, |
| "step": 42700 |
| }, |
| { |
| "epoch": 94.86581646797893, |
| "eval_loss": 6.381232261657715, |
| "eval_runtime": 66.3632, |
| "eval_samples_per_second": 150.686, |
| "eval_steps_per_second": 18.836, |
| "step": 42700 |
| }, |
| { |
| "epoch": 95.08760742999723, |
| "grad_norm": 0.29045116901397705, |
| "learning_rate": 1.7257525083612038e-06, |
| "loss": 6.3763, |
| "step": 42800 |
| }, |
| { |
| "epoch": 95.08760742999723, |
| "eval_loss": 6.3776373863220215, |
| "eval_runtime": 63.6759, |
| "eval_samples_per_second": 157.045, |
| "eval_steps_per_second": 19.631, |
| "step": 42800 |
| }, |
| { |
| "epoch": 95.30939839201552, |
| "grad_norm": 0.2851983308792114, |
| "learning_rate": 1.7157190635451504e-06, |
| "loss": 6.3778, |
| "step": 42900 |
| }, |
| { |
| "epoch": 95.30939839201552, |
| "eval_loss": 6.380300998687744, |
| "eval_runtime": 63.8175, |
| "eval_samples_per_second": 156.697, |
| "eval_steps_per_second": 19.587, |
| "step": 42900 |
| }, |
| { |
| "epoch": 95.53118935403383, |
| "grad_norm": 0.33936771750450134, |
| "learning_rate": 1.705685618729097e-06, |
| "loss": 6.3787, |
| "step": 43000 |
| }, |
| { |
| "epoch": 95.53118935403383, |
| "eval_loss": 6.37871789932251, |
| "eval_runtime": 63.8614, |
| "eval_samples_per_second": 156.589, |
| "eval_steps_per_second": 19.574, |
| "step": 43000 |
| }, |
| { |
| "epoch": 95.75298031605212, |
| "grad_norm": 0.4443320333957672, |
| "learning_rate": 1.6956521739130435e-06, |
| "loss": 6.3781, |
| "step": 43100 |
| }, |
| { |
| "epoch": 95.75298031605212, |
| "eval_loss": 6.382043838500977, |
| "eval_runtime": 66.3729, |
| "eval_samples_per_second": 150.664, |
| "eval_steps_per_second": 18.833, |
| "step": 43100 |
| }, |
| { |
| "epoch": 95.97477127807042, |
| "grad_norm": 0.33091309666633606, |
| "learning_rate": 1.6856187290969898e-06, |
| "loss": 6.3772, |
| "step": 43200 |
| }, |
| { |
| "epoch": 95.97477127807042, |
| "eval_loss": 6.380916595458984, |
| "eval_runtime": 63.7824, |
| "eval_samples_per_second": 156.783, |
| "eval_steps_per_second": 19.598, |
| "step": 43200 |
| }, |
| { |
| "epoch": 96.19656224008871, |
| "grad_norm": 0.3929876685142517, |
| "learning_rate": 1.6755852842809363e-06, |
| "loss": 6.3785, |
| "step": 43300 |
| }, |
| { |
| "epoch": 96.19656224008871, |
| "eval_loss": 6.377211570739746, |
| "eval_runtime": 66.2793, |
| "eval_samples_per_second": 150.877, |
| "eval_steps_per_second": 18.86, |
| "step": 43300 |
| }, |
| { |
| "epoch": 96.41835320210701, |
| "grad_norm": 0.3379896581172943, |
| "learning_rate": 1.665551839464883e-06, |
| "loss": 6.3772, |
| "step": 43400 |
| }, |
| { |
| "epoch": 96.41835320210701, |
| "eval_loss": 6.380885124206543, |
| "eval_runtime": 63.8749, |
| "eval_samples_per_second": 156.556, |
| "eval_steps_per_second": 19.569, |
| "step": 43400 |
| }, |
| { |
| "epoch": 96.64014416412532, |
| "grad_norm": 0.3330114483833313, |
| "learning_rate": 1.6555183946488294e-06, |
| "loss": 6.378, |
| "step": 43500 |
| }, |
| { |
| "epoch": 96.64014416412532, |
| "eval_loss": 6.381417751312256, |
| "eval_runtime": 66.3248, |
| "eval_samples_per_second": 150.773, |
| "eval_steps_per_second": 18.847, |
| "step": 43500 |
| }, |
| { |
| "epoch": 96.8619351261436, |
| "grad_norm": 0.5002055168151855, |
| "learning_rate": 1.6454849498327758e-06, |
| "loss": 6.3772, |
| "step": 43600 |
| }, |
| { |
| "epoch": 96.8619351261436, |
| "eval_loss": 6.379367351531982, |
| "eval_runtime": 63.7674, |
| "eval_samples_per_second": 156.82, |
| "eval_steps_per_second": 19.603, |
| "step": 43600 |
| }, |
| { |
| "epoch": 97.08372608816191, |
| "grad_norm": 0.4039636552333832, |
| "learning_rate": 1.6354515050167223e-06, |
| "loss": 6.376, |
| "step": 43700 |
| }, |
| { |
| "epoch": 97.08372608816191, |
| "eval_loss": 6.379873275756836, |
| "eval_runtime": 63.6881, |
| "eval_samples_per_second": 157.015, |
| "eval_steps_per_second": 19.627, |
| "step": 43700 |
| }, |
| { |
| "epoch": 97.3055170501802, |
| "grad_norm": 0.3500140309333801, |
| "learning_rate": 1.6254180602006689e-06, |
| "loss": 6.3793, |
| "step": 43800 |
| }, |
| { |
| "epoch": 97.3055170501802, |
| "eval_loss": 6.3825764656066895, |
| "eval_runtime": 66.2863, |
| "eval_samples_per_second": 150.861, |
| "eval_steps_per_second": 18.858, |
| "step": 43800 |
| }, |
| { |
| "epoch": 97.5273080121985, |
| "grad_norm": 0.343735009431839, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 6.3779, |
| "step": 43900 |
| }, |
| { |
| "epoch": 97.5273080121985, |
| "eval_loss": 6.378231525421143, |
| "eval_runtime": 63.7143, |
| "eval_samples_per_second": 156.951, |
| "eval_steps_per_second": 19.619, |
| "step": 43900 |
| }, |
| { |
| "epoch": 97.7490989742168, |
| "grad_norm": 0.3836156129837036, |
| "learning_rate": 1.6053511705685618e-06, |
| "loss": 6.3773, |
| "step": 44000 |
| }, |
| { |
| "epoch": 97.7490989742168, |
| "eval_loss": 6.37751579284668, |
| "eval_runtime": 63.729, |
| "eval_samples_per_second": 156.915, |
| "eval_steps_per_second": 19.614, |
| "step": 44000 |
| }, |
| { |
| "epoch": 97.9708899362351, |
| "grad_norm": 0.3120937645435333, |
| "learning_rate": 1.5953177257525083e-06, |
| "loss": 6.3755, |
| "step": 44100 |
| }, |
| { |
| "epoch": 97.9708899362351, |
| "eval_loss": 6.3800272941589355, |
| "eval_runtime": 64.1744, |
| "eval_samples_per_second": 155.825, |
| "eval_steps_per_second": 19.478, |
| "step": 44100 |
| }, |
| { |
| "epoch": 98.1926808982534, |
| "grad_norm": 0.33682048320770264, |
| "learning_rate": 1.5852842809364549e-06, |
| "loss": 6.3765, |
| "step": 44200 |
| }, |
| { |
| "epoch": 98.1926808982534, |
| "eval_loss": 6.378459930419922, |
| "eval_runtime": 65.8486, |
| "eval_samples_per_second": 151.864, |
| "eval_steps_per_second": 18.983, |
| "step": 44200 |
| }, |
| { |
| "epoch": 98.41447186027169, |
| "grad_norm": 0.33430323004722595, |
| "learning_rate": 1.5752508361204012e-06, |
| "loss": 6.3784, |
| "step": 44300 |
| }, |
| { |
| "epoch": 98.41447186027169, |
| "eval_loss": 6.37835693359375, |
| "eval_runtime": 63.7423, |
| "eval_samples_per_second": 156.882, |
| "eval_steps_per_second": 19.61, |
| "step": 44300 |
| }, |
| { |
| "epoch": 98.63626282228999, |
| "grad_norm": 0.3729492425918579, |
| "learning_rate": 1.5652173913043478e-06, |
| "loss": 6.3775, |
| "step": 44400 |
| }, |
| { |
| "epoch": 98.63626282228999, |
| "eval_loss": 6.379312515258789, |
| "eval_runtime": 67.1919, |
| "eval_samples_per_second": 148.827, |
| "eval_steps_per_second": 18.603, |
| "step": 44400 |
| }, |
| { |
| "epoch": 98.8580537843083, |
| "grad_norm": 0.30378684401512146, |
| "learning_rate": 1.5551839464882943e-06, |
| "loss": 6.3773, |
| "step": 44500 |
| }, |
| { |
| "epoch": 98.8580537843083, |
| "eval_loss": 6.380176544189453, |
| "eval_runtime": 66.2505, |
| "eval_samples_per_second": 150.942, |
| "eval_steps_per_second": 18.868, |
| "step": 44500 |
| }, |
| { |
| "epoch": 99.07984474632659, |
| "grad_norm": 0.2708960771560669, |
| "learning_rate": 1.5451505016722409e-06, |
| "loss": 6.3791, |
| "step": 44600 |
| }, |
| { |
| "epoch": 99.07984474632659, |
| "eval_loss": 6.381106853485107, |
| "eval_runtime": 63.6851, |
| "eval_samples_per_second": 157.023, |
| "eval_steps_per_second": 19.628, |
| "step": 44600 |
| }, |
| { |
| "epoch": 99.30163570834489, |
| "grad_norm": 0.28966355323791504, |
| "learning_rate": 1.5351170568561872e-06, |
| "loss": 6.3769, |
| "step": 44700 |
| }, |
| { |
| "epoch": 99.30163570834489, |
| "eval_loss": 6.380806922912598, |
| "eval_runtime": 66.2295, |
| "eval_samples_per_second": 150.99, |
| "eval_steps_per_second": 18.874, |
| "step": 44700 |
| }, |
| { |
| "epoch": 99.52342667036318, |
| "grad_norm": 0.33378317952156067, |
| "learning_rate": 1.5250836120401338e-06, |
| "loss": 6.3764, |
| "step": 44800 |
| }, |
| { |
| "epoch": 99.52342667036318, |
| "eval_loss": 6.378901481628418, |
| "eval_runtime": 63.7387, |
| "eval_samples_per_second": 156.89, |
| "eval_steps_per_second": 19.611, |
| "step": 44800 |
| }, |
| { |
| "epoch": 99.74521763238148, |
| "grad_norm": 0.2659667134284973, |
| "learning_rate": 1.5150501672240803e-06, |
| "loss": 6.3763, |
| "step": 44900 |
| }, |
| { |
| "epoch": 99.74521763238148, |
| "eval_loss": 6.378689289093018, |
| "eval_runtime": 66.291, |
| "eval_samples_per_second": 150.85, |
| "eval_steps_per_second": 18.856, |
| "step": 44900 |
| }, |
| { |
| "epoch": 99.96700859439977, |
| "grad_norm": 0.36868181824684143, |
| "learning_rate": 1.5050167224080269e-06, |
| "loss": 6.3773, |
| "step": 45000 |
| }, |
| { |
| "epoch": 99.96700859439977, |
| "eval_loss": 6.379394054412842, |
| "eval_runtime": 63.8432, |
| "eval_samples_per_second": 156.634, |
| "eval_steps_per_second": 19.579, |
| "step": 45000 |
| }, |
| { |
| "epoch": 100.18879955641808, |
| "grad_norm": 0.2957492768764496, |
| "learning_rate": 1.4949832775919732e-06, |
| "loss": 6.3777, |
| "step": 45100 |
| }, |
| { |
| "epoch": 100.18879955641808, |
| "eval_loss": 6.37989616394043, |
| "eval_runtime": 63.7161, |
| "eval_samples_per_second": 156.946, |
| "eval_steps_per_second": 19.618, |
| "step": 45100 |
| }, |
| { |
| "epoch": 100.41059051843638, |
| "grad_norm": 0.36346226930618286, |
| "learning_rate": 1.4849498327759198e-06, |
| "loss": 6.3771, |
| "step": 45200 |
| }, |
| { |
| "epoch": 100.41059051843638, |
| "eval_loss": 6.382117748260498, |
| "eval_runtime": 66.181, |
| "eval_samples_per_second": 151.101, |
| "eval_steps_per_second": 18.888, |
| "step": 45200 |
| }, |
| { |
| "epoch": 100.63238148045467, |
| "grad_norm": 0.21758611500263214, |
| "learning_rate": 1.4749163879598663e-06, |
| "loss": 6.3768, |
| "step": 45300 |
| }, |
| { |
| "epoch": 100.63238148045467, |
| "eval_loss": 6.378548622131348, |
| "eval_runtime": 63.8643, |
| "eval_samples_per_second": 156.582, |
| "eval_steps_per_second": 19.573, |
| "step": 45300 |
| }, |
| { |
| "epoch": 100.85417244247297, |
| "grad_norm": 0.21891988813877106, |
| "learning_rate": 1.4648829431438129e-06, |
| "loss": 6.3759, |
| "step": 45400 |
| }, |
| { |
| "epoch": 100.85417244247297, |
| "eval_loss": 6.3807806968688965, |
| "eval_runtime": 66.1954, |
| "eval_samples_per_second": 151.068, |
| "eval_steps_per_second": 18.883, |
| "step": 45400 |
| }, |
| { |
| "epoch": 101.07596340449126, |
| "grad_norm": 0.31398728489875793, |
| "learning_rate": 1.4548494983277592e-06, |
| "loss": 6.3783, |
| "step": 45500 |
| }, |
| { |
| "epoch": 101.07596340449126, |
| "eval_loss": 6.3800740242004395, |
| "eval_runtime": 63.74, |
| "eval_samples_per_second": 156.887, |
| "eval_steps_per_second": 19.611, |
| "step": 45500 |
| }, |
| { |
| "epoch": 101.29775436650957, |
| "grad_norm": 0.3506067991256714, |
| "learning_rate": 1.4448160535117058e-06, |
| "loss": 6.3757, |
| "step": 45600 |
| }, |
| { |
| "epoch": 101.29775436650957, |
| "eval_loss": 6.3802642822265625, |
| "eval_runtime": 66.3029, |
| "eval_samples_per_second": 150.823, |
| "eval_steps_per_second": 18.853, |
| "step": 45600 |
| }, |
| { |
| "epoch": 101.51954532852787, |
| "grad_norm": 0.4127357304096222, |
| "learning_rate": 1.4347826086956523e-06, |
| "loss": 6.377, |
| "step": 45700 |
| }, |
| { |
| "epoch": 101.51954532852787, |
| "eval_loss": 6.379199028015137, |
| "eval_runtime": 63.6147, |
| "eval_samples_per_second": 157.196, |
| "eval_steps_per_second": 19.65, |
| "step": 45700 |
| }, |
| { |
| "epoch": 101.74133629054616, |
| "grad_norm": 0.40180787444114685, |
| "learning_rate": 1.4247491638795989e-06, |
| "loss": 6.3774, |
| "step": 45800 |
| }, |
| { |
| "epoch": 101.74133629054616, |
| "eval_loss": 6.378483295440674, |
| "eval_runtime": 63.6205, |
| "eval_samples_per_second": 157.182, |
| "eval_steps_per_second": 19.648, |
| "step": 45800 |
| }, |
| { |
| "epoch": 101.96312725256446, |
| "grad_norm": 0.2862705588340759, |
| "learning_rate": 1.4147157190635452e-06, |
| "loss": 6.3777, |
| "step": 45900 |
| }, |
| { |
| "epoch": 101.96312725256446, |
| "eval_loss": 6.377134323120117, |
| "eval_runtime": 63.9897, |
| "eval_samples_per_second": 156.275, |
| "eval_steps_per_second": 19.534, |
| "step": 45900 |
| }, |
| { |
| "epoch": 102.18491821458275, |
| "grad_norm": 0.2539602816104889, |
| "learning_rate": 1.4046822742474917e-06, |
| "loss": 6.3786, |
| "step": 46000 |
| }, |
| { |
| "epoch": 102.18491821458275, |
| "eval_loss": 6.379866123199463, |
| "eval_runtime": 66.1001, |
| "eval_samples_per_second": 151.286, |
| "eval_steps_per_second": 18.911, |
| "step": 46000 |
| }, |
| { |
| "epoch": 102.40670917660105, |
| "grad_norm": 0.36692872643470764, |
| "learning_rate": 1.3946488294314383e-06, |
| "loss": 6.3771, |
| "step": 46100 |
| }, |
| { |
| "epoch": 102.40670917660105, |
| "eval_loss": 6.379576683044434, |
| "eval_runtime": 63.6911, |
| "eval_samples_per_second": 157.008, |
| "eval_steps_per_second": 19.626, |
| "step": 46100 |
| }, |
| { |
| "epoch": 102.62850013861934, |
| "grad_norm": 0.3044676184654236, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 6.3772, |
| "step": 46200 |
| }, |
| { |
| "epoch": 102.62850013861934, |
| "eval_loss": 6.381227493286133, |
| "eval_runtime": 63.7064, |
| "eval_samples_per_second": 156.97, |
| "eval_steps_per_second": 19.621, |
| "step": 46200 |
| }, |
| { |
| "epoch": 102.85029110063765, |
| "grad_norm": 0.3508971035480499, |
| "learning_rate": 1.374581939799331e-06, |
| "loss": 6.3762, |
| "step": 46300 |
| }, |
| { |
| "epoch": 102.85029110063765, |
| "eval_loss": 6.377274513244629, |
| "eval_runtime": 66.2947, |
| "eval_samples_per_second": 150.842, |
| "eval_steps_per_second": 18.855, |
| "step": 46300 |
| }, |
| { |
| "epoch": 103.07208206265595, |
| "grad_norm": 0.31413570046424866, |
| "learning_rate": 1.3645484949832775e-06, |
| "loss": 6.3774, |
| "step": 46400 |
| }, |
| { |
| "epoch": 103.07208206265595, |
| "eval_loss": 6.380115032196045, |
| "eval_runtime": 63.6441, |
| "eval_samples_per_second": 157.124, |
| "eval_steps_per_second": 19.64, |
| "step": 46400 |
| }, |
| { |
| "epoch": 103.29387302467424, |
| "grad_norm": 0.2552104890346527, |
| "learning_rate": 1.354515050167224e-06, |
| "loss": 6.3775, |
| "step": 46500 |
| }, |
| { |
| "epoch": 103.29387302467424, |
| "eval_loss": 6.379015922546387, |
| "eval_runtime": 63.6755, |
| "eval_samples_per_second": 157.046, |
| "eval_steps_per_second": 19.631, |
| "step": 46500 |
| }, |
| { |
| "epoch": 103.51566398669254, |
| "grad_norm": 0.3744960129261017, |
| "learning_rate": 1.3444816053511706e-06, |
| "loss": 6.3763, |
| "step": 46600 |
| }, |
| { |
| "epoch": 103.51566398669254, |
| "eval_loss": 6.374266624450684, |
| "eval_runtime": 66.5834, |
| "eval_samples_per_second": 150.188, |
| "eval_steps_per_second": 18.773, |
| "step": 46600 |
| }, |
| { |
| "epoch": 103.73745494871083, |
| "grad_norm": 0.27893921732902527, |
| "learning_rate": 1.334448160535117e-06, |
| "loss": 6.3775, |
| "step": 46700 |
| }, |
| { |
| "epoch": 103.73745494871083, |
| "eval_loss": 6.380270957946777, |
| "eval_runtime": 66.2442, |
| "eval_samples_per_second": 150.957, |
| "eval_steps_per_second": 18.87, |
| "step": 46700 |
| }, |
| { |
| "epoch": 103.95924591072914, |
| "grad_norm": 0.2601492404937744, |
| "learning_rate": 1.3244147157190635e-06, |
| "loss": 6.3775, |
| "step": 46800 |
| }, |
| { |
| "epoch": 103.95924591072914, |
| "eval_loss": 6.380533218383789, |
| "eval_runtime": 66.2494, |
| "eval_samples_per_second": 150.945, |
| "eval_steps_per_second": 18.868, |
| "step": 46800 |
| }, |
| { |
| "epoch": 104.18103687274744, |
| "grad_norm": 0.28285419940948486, |
| "learning_rate": 1.31438127090301e-06, |
| "loss": 6.3776, |
| "step": 46900 |
| }, |
| { |
| "epoch": 104.18103687274744, |
| "eval_loss": 6.3801751136779785, |
| "eval_runtime": 66.2411, |
| "eval_samples_per_second": 150.964, |
| "eval_steps_per_second": 18.87, |
| "step": 46900 |
| }, |
| { |
| "epoch": 104.40282783476573, |
| "grad_norm": 0.4723234176635742, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 6.376, |
| "step": 47000 |
| }, |
| { |
| "epoch": 104.40282783476573, |
| "eval_loss": 6.379186153411865, |
| "eval_runtime": 63.7279, |
| "eval_samples_per_second": 156.917, |
| "eval_steps_per_second": 19.615, |
| "step": 47000 |
| }, |
| { |
| "epoch": 104.62461879678403, |
| "grad_norm": 0.3108322322368622, |
| "learning_rate": 1.294314381270903e-06, |
| "loss": 6.3773, |
| "step": 47100 |
| }, |
| { |
| "epoch": 104.62461879678403, |
| "eval_loss": 6.3764142990112305, |
| "eval_runtime": 63.7167, |
| "eval_samples_per_second": 156.945, |
| "eval_steps_per_second": 19.618, |
| "step": 47100 |
| }, |
| { |
| "epoch": 104.84640975880232, |
| "grad_norm": 0.38544511795043945, |
| "learning_rate": 1.2842809364548495e-06, |
| "loss": 6.3773, |
| "step": 47200 |
| }, |
| { |
| "epoch": 104.84640975880232, |
| "eval_loss": 6.379009246826172, |
| "eval_runtime": 66.2408, |
| "eval_samples_per_second": 150.964, |
| "eval_steps_per_second": 18.871, |
| "step": 47200 |
| }, |
| { |
| "epoch": 105.06820072082063, |
| "grad_norm": 0.2773985266685486, |
| "learning_rate": 1.274247491638796e-06, |
| "loss": 6.3772, |
| "step": 47300 |
| }, |
| { |
| "epoch": 105.06820072082063, |
| "eval_loss": 6.3756842613220215, |
| "eval_runtime": 63.6518, |
| "eval_samples_per_second": 157.105, |
| "eval_steps_per_second": 19.638, |
| "step": 47300 |
| }, |
| { |
| "epoch": 105.28999168283893, |
| "grad_norm": 0.2765492796897888, |
| "learning_rate": 1.2642140468227424e-06, |
| "loss": 6.3764, |
| "step": 47400 |
| }, |
| { |
| "epoch": 105.28999168283893, |
| "eval_loss": 6.377975940704346, |
| "eval_runtime": 66.2572, |
| "eval_samples_per_second": 150.927, |
| "eval_steps_per_second": 18.866, |
| "step": 47400 |
| }, |
| { |
| "epoch": 105.51178264485722, |
| "grad_norm": 0.30239638686180115, |
| "learning_rate": 1.254180602006689e-06, |
| "loss": 6.3761, |
| "step": 47500 |
| }, |
| { |
| "epoch": 105.51178264485722, |
| "eval_loss": 6.379149436950684, |
| "eval_runtime": 63.8068, |
| "eval_samples_per_second": 156.723, |
| "eval_steps_per_second": 19.59, |
| "step": 47500 |
| }, |
| { |
| "epoch": 105.73357360687552, |
| "grad_norm": 0.22471874952316284, |
| "learning_rate": 1.2441471571906355e-06, |
| "loss": 6.3775, |
| "step": 47600 |
| }, |
| { |
| "epoch": 105.73357360687552, |
| "eval_loss": 6.3783087730407715, |
| "eval_runtime": 66.2436, |
| "eval_samples_per_second": 150.958, |
| "eval_steps_per_second": 18.87, |
| "step": 47600 |
| }, |
| { |
| "epoch": 105.95536456889381, |
| "grad_norm": 0.23722052574157715, |
| "learning_rate": 1.234113712374582e-06, |
| "loss": 6.377, |
| "step": 47700 |
| }, |
| { |
| "epoch": 105.95536456889381, |
| "eval_loss": 6.376536846160889, |
| "eval_runtime": 63.6766, |
| "eval_samples_per_second": 157.044, |
| "eval_steps_per_second": 19.63, |
| "step": 47700 |
| }, |
| { |
| "epoch": 106.17715553091212, |
| "grad_norm": 0.26499879360198975, |
| "learning_rate": 1.2240802675585284e-06, |
| "loss": 6.3758, |
| "step": 47800 |
| }, |
| { |
| "epoch": 106.17715553091212, |
| "eval_loss": 6.380406856536865, |
| "eval_runtime": 66.1835, |
| "eval_samples_per_second": 151.095, |
| "eval_steps_per_second": 18.887, |
| "step": 47800 |
| }, |
| { |
| "epoch": 106.3989464929304, |
| "grad_norm": 0.32900717854499817, |
| "learning_rate": 1.214046822742475e-06, |
| "loss": 6.375, |
| "step": 47900 |
| }, |
| { |
| "epoch": 106.3989464929304, |
| "eval_loss": 6.375906467437744, |
| "eval_runtime": 63.8048, |
| "eval_samples_per_second": 156.728, |
| "eval_steps_per_second": 19.591, |
| "step": 47900 |
| }, |
| { |
| "epoch": 106.62073745494871, |
| "grad_norm": 0.3241865932941437, |
| "learning_rate": 1.2040133779264215e-06, |
| "loss": 6.3792, |
| "step": 48000 |
| }, |
| { |
| "epoch": 106.62073745494871, |
| "eval_loss": 6.37775993347168, |
| "eval_runtime": 66.3426, |
| "eval_samples_per_second": 150.733, |
| "eval_steps_per_second": 18.842, |
| "step": 48000 |
| }, |
| { |
| "epoch": 106.84252841696701, |
| "grad_norm": 0.3194703757762909, |
| "learning_rate": 1.193979933110368e-06, |
| "loss": 6.3766, |
| "step": 48100 |
| }, |
| { |
| "epoch": 106.84252841696701, |
| "eval_loss": 6.37912654876709, |
| "eval_runtime": 63.7236, |
| "eval_samples_per_second": 156.928, |
| "eval_steps_per_second": 19.616, |
| "step": 48100 |
| }, |
| { |
| "epoch": 107.0643193789853, |
| "grad_norm": 0.25526002049446106, |
| "learning_rate": 1.1839464882943144e-06, |
| "loss": 6.3776, |
| "step": 48200 |
| }, |
| { |
| "epoch": 107.0643193789853, |
| "eval_loss": 6.38245153427124, |
| "eval_runtime": 66.2659, |
| "eval_samples_per_second": 150.907, |
| "eval_steps_per_second": 18.863, |
| "step": 48200 |
| }, |
| { |
| "epoch": 107.2861103410036, |
| "grad_norm": 0.2747518718242645, |
| "learning_rate": 1.173913043478261e-06, |
| "loss": 6.3768, |
| "step": 48300 |
| }, |
| { |
| "epoch": 107.2861103410036, |
| "eval_loss": 6.380572319030762, |
| "eval_runtime": 63.8901, |
| "eval_samples_per_second": 156.519, |
| "eval_steps_per_second": 19.565, |
| "step": 48300 |
| }, |
| { |
| "epoch": 107.5079013030219, |
| "grad_norm": 0.2569632828235626, |
| "learning_rate": 1.1638795986622075e-06, |
| "loss": 6.3764, |
| "step": 48400 |
| }, |
| { |
| "epoch": 107.5079013030219, |
| "eval_loss": 6.380358695983887, |
| "eval_runtime": 66.316, |
| "eval_samples_per_second": 150.793, |
| "eval_steps_per_second": 18.849, |
| "step": 48400 |
| }, |
| { |
| "epoch": 107.7296922650402, |
| "grad_norm": 0.28270038962364197, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 6.3772, |
| "step": 48500 |
| }, |
| { |
| "epoch": 107.7296922650402, |
| "eval_loss": 6.3787407875061035, |
| "eval_runtime": 63.7582, |
| "eval_samples_per_second": 156.842, |
| "eval_steps_per_second": 19.605, |
| "step": 48500 |
| }, |
| { |
| "epoch": 107.9514832270585, |
| "grad_norm": 0.35361409187316895, |
| "learning_rate": 1.1438127090301004e-06, |
| "loss": 6.3754, |
| "step": 48600 |
| }, |
| { |
| "epoch": 107.9514832270585, |
| "eval_loss": 6.37959098815918, |
| "eval_runtime": 63.698, |
| "eval_samples_per_second": 156.991, |
| "eval_steps_per_second": 19.624, |
| "step": 48600 |
| }, |
| { |
| "epoch": 108.17327418907679, |
| "grad_norm": 0.2802847921848297, |
| "learning_rate": 1.133779264214047e-06, |
| "loss": 6.375, |
| "step": 48700 |
| }, |
| { |
| "epoch": 108.17327418907679, |
| "eval_loss": 6.376708030700684, |
| "eval_runtime": 66.263, |
| "eval_samples_per_second": 150.914, |
| "eval_steps_per_second": 18.864, |
| "step": 48700 |
| }, |
| { |
| "epoch": 108.3950651510951, |
| "grad_norm": 0.3533788323402405, |
| "learning_rate": 1.1237458193979933e-06, |
| "loss": 6.3757, |
| "step": 48800 |
| }, |
| { |
| "epoch": 108.3950651510951, |
| "eval_loss": 6.380278587341309, |
| "eval_runtime": 67.3818, |
| "eval_samples_per_second": 148.408, |
| "eval_steps_per_second": 18.551, |
| "step": 48800 |
| }, |
| { |
| "epoch": 108.61685611311339, |
| "grad_norm": 0.21207566559314728, |
| "learning_rate": 1.1137123745819398e-06, |
| "loss": 6.3776, |
| "step": 48900 |
| }, |
| { |
| "epoch": 108.61685611311339, |
| "eval_loss": 6.375850200653076, |
| "eval_runtime": 63.7895, |
| "eval_samples_per_second": 156.766, |
| "eval_steps_per_second": 19.596, |
| "step": 48900 |
| }, |
| { |
| "epoch": 108.83864707513169, |
| "grad_norm": 0.33531099557876587, |
| "learning_rate": 1.1036789297658862e-06, |
| "loss": 6.3765, |
| "step": 49000 |
| }, |
| { |
| "epoch": 108.83864707513169, |
| "eval_loss": 6.378798484802246, |
| "eval_runtime": 63.7683, |
| "eval_samples_per_second": 156.818, |
| "eval_steps_per_second": 19.602, |
| "step": 49000 |
| }, |
| { |
| "epoch": 109.06043803714999, |
| "grad_norm": 0.39727288484573364, |
| "learning_rate": 1.0936454849498327e-06, |
| "loss": 6.3774, |
| "step": 49100 |
| }, |
| { |
| "epoch": 109.06043803714999, |
| "eval_loss": 6.379205703735352, |
| "eval_runtime": 66.2384, |
| "eval_samples_per_second": 150.97, |
| "eval_steps_per_second": 18.871, |
| "step": 49100 |
| }, |
| { |
| "epoch": 109.28222899916828, |
| "grad_norm": 0.3876926004886627, |
| "learning_rate": 1.0836120401337793e-06, |
| "loss": 6.3772, |
| "step": 49200 |
| }, |
| { |
| "epoch": 109.28222899916828, |
| "eval_loss": 6.382777214050293, |
| "eval_runtime": 63.7163, |
| "eval_samples_per_second": 156.946, |
| "eval_steps_per_second": 19.618, |
| "step": 49200 |
| }, |
| { |
| "epoch": 109.50401996118659, |
| "grad_norm": 0.3268238604068756, |
| "learning_rate": 1.0735785953177258e-06, |
| "loss": 6.3765, |
| "step": 49300 |
| }, |
| { |
| "epoch": 109.50401996118659, |
| "eval_loss": 6.378788471221924, |
| "eval_runtime": 66.3254, |
| "eval_samples_per_second": 150.772, |
| "eval_steps_per_second": 18.846, |
| "step": 49300 |
| }, |
| { |
| "epoch": 109.72581092320488, |
| "grad_norm": 0.24343077838420868, |
| "learning_rate": 1.0635451505016722e-06, |
| "loss": 6.3766, |
| "step": 49400 |
| }, |
| { |
| "epoch": 109.72581092320488, |
| "eval_loss": 6.379393577575684, |
| "eval_runtime": 63.7485, |
| "eval_samples_per_second": 156.866, |
| "eval_steps_per_second": 19.608, |
| "step": 49400 |
| }, |
| { |
| "epoch": 109.94760188522318, |
| "grad_norm": 0.3532174229621887, |
| "learning_rate": 1.0535117056856187e-06, |
| "loss": 6.3762, |
| "step": 49500 |
| }, |
| { |
| "epoch": 109.94760188522318, |
| "eval_loss": 6.383326530456543, |
| "eval_runtime": 63.7304, |
| "eval_samples_per_second": 156.911, |
| "eval_steps_per_second": 19.614, |
| "step": 49500 |
| }, |
| { |
| "epoch": 110.16939284724147, |
| "grad_norm": 0.28071361780166626, |
| "learning_rate": 1.0434782608695653e-06, |
| "loss": 6.3763, |
| "step": 49600 |
| }, |
| { |
| "epoch": 110.16939284724147, |
| "eval_loss": 6.376327991485596, |
| "eval_runtime": 66.248, |
| "eval_samples_per_second": 150.948, |
| "eval_steps_per_second": 18.868, |
| "step": 49600 |
| }, |
| { |
| "epoch": 110.39118380925977, |
| "grad_norm": 0.3425652086734772, |
| "learning_rate": 1.0334448160535118e-06, |
| "loss": 6.3755, |
| "step": 49700 |
| }, |
| { |
| "epoch": 110.39118380925977, |
| "eval_loss": 6.3802337646484375, |
| "eval_runtime": 63.7015, |
| "eval_samples_per_second": 156.982, |
| "eval_steps_per_second": 19.623, |
| "step": 49700 |
| }, |
| { |
| "epoch": 110.61297477127808, |
| "grad_norm": 0.22676917910575867, |
| "learning_rate": 1.0234113712374581e-06, |
| "loss": 6.3773, |
| "step": 49800 |
| }, |
| { |
| "epoch": 110.61297477127808, |
| "eval_loss": 6.3807525634765625, |
| "eval_runtime": 66.2796, |
| "eval_samples_per_second": 150.876, |
| "eval_steps_per_second": 18.86, |
| "step": 49800 |
| }, |
| { |
| "epoch": 110.83476573329636, |
| "grad_norm": 0.25897106528282166, |
| "learning_rate": 1.0133779264214047e-06, |
| "loss": 6.3768, |
| "step": 49900 |
| }, |
| { |
| "epoch": 110.83476573329636, |
| "eval_loss": 6.381240367889404, |
| "eval_runtime": 63.8656, |
| "eval_samples_per_second": 156.579, |
| "eval_steps_per_second": 19.572, |
| "step": 49900 |
| }, |
| { |
| "epoch": 111.05655669531467, |
| "grad_norm": 0.2521306574344635, |
| "learning_rate": 1.0033444816053512e-06, |
| "loss": 6.3748, |
| "step": 50000 |
| }, |
| { |
| "epoch": 111.05655669531467, |
| "eval_loss": 6.379097938537598, |
| "eval_runtime": 63.7336, |
| "eval_samples_per_second": 156.903, |
| "eval_steps_per_second": 19.613, |
| "step": 50000 |
| }, |
| { |
| "epoch": 111.27834765733296, |
| "grad_norm": 0.32774215936660767, |
| "learning_rate": 9.933110367892976e-07, |
| "loss": 6.3777, |
| "step": 50100 |
| }, |
| { |
| "epoch": 111.27834765733296, |
| "eval_loss": 6.379392147064209, |
| "eval_runtime": 66.4051, |
| "eval_samples_per_second": 150.591, |
| "eval_steps_per_second": 18.824, |
| "step": 50100 |
| }, |
| { |
| "epoch": 111.50013861935126, |
| "grad_norm": 0.23284611105918884, |
| "learning_rate": 9.832775919732441e-07, |
| "loss": 6.3746, |
| "step": 50200 |
| }, |
| { |
| "epoch": 111.50013861935126, |
| "eval_loss": 6.377693176269531, |
| "eval_runtime": 64.6002, |
| "eval_samples_per_second": 154.798, |
| "eval_steps_per_second": 19.35, |
| "step": 50200 |
| }, |
| { |
| "epoch": 111.72192958136957, |
| "grad_norm": 0.2757164537906647, |
| "learning_rate": 9.732441471571907e-07, |
| "loss": 6.3743, |
| "step": 50300 |
| }, |
| { |
| "epoch": 111.72192958136957, |
| "eval_loss": 6.38041877746582, |
| "eval_runtime": 65.5393, |
| "eval_samples_per_second": 152.58, |
| "eval_steps_per_second": 19.073, |
| "step": 50300 |
| }, |
| { |
| "epoch": 111.94372054338785, |
| "grad_norm": 0.326815128326416, |
| "learning_rate": 9.632107023411372e-07, |
| "loss": 6.3765, |
| "step": 50400 |
| }, |
| { |
| "epoch": 111.94372054338785, |
| "eval_loss": 6.37969970703125, |
| "eval_runtime": 63.7883, |
| "eval_samples_per_second": 156.769, |
| "eval_steps_per_second": 19.596, |
| "step": 50400 |
| }, |
| { |
| "epoch": 112.16551150540616, |
| "grad_norm": 0.34073254466056824, |
| "learning_rate": 9.531772575250837e-07, |
| "loss": 6.3758, |
| "step": 50500 |
| }, |
| { |
| "epoch": 112.16551150540616, |
| "eval_loss": 6.380171298980713, |
| "eval_runtime": 66.2335, |
| "eval_samples_per_second": 150.981, |
| "eval_steps_per_second": 18.873, |
| "step": 50500 |
| }, |
| { |
| "epoch": 112.38730246742445, |
| "grad_norm": 0.2289067655801773, |
| "learning_rate": 9.431438127090301e-07, |
| "loss": 6.3766, |
| "step": 50600 |
| }, |
| { |
| "epoch": 112.38730246742445, |
| "eval_loss": 6.379415035247803, |
| "eval_runtime": 63.6851, |
| "eval_samples_per_second": 157.023, |
| "eval_steps_per_second": 19.628, |
| "step": 50600 |
| }, |
| { |
| "epoch": 112.60909342944275, |
| "grad_norm": 0.2386418581008911, |
| "learning_rate": 9.331103678929767e-07, |
| "loss": 6.375, |
| "step": 50700 |
| }, |
| { |
| "epoch": 112.60909342944275, |
| "eval_loss": 6.375070571899414, |
| "eval_runtime": 66.2164, |
| "eval_samples_per_second": 151.02, |
| "eval_steps_per_second": 18.878, |
| "step": 50700 |
| }, |
| { |
| "epoch": 112.83088439146105, |
| "grad_norm": 0.26779764890670776, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 6.3754, |
| "step": 50800 |
| }, |
| { |
| "epoch": 112.83088439146105, |
| "eval_loss": 6.377529621124268, |
| "eval_runtime": 63.7216, |
| "eval_samples_per_second": 156.933, |
| "eval_steps_per_second": 19.617, |
| "step": 50800 |
| }, |
| { |
| "epoch": 113.05267535347934, |
| "grad_norm": 0.2792610228061676, |
| "learning_rate": 9.130434782608697e-07, |
| "loss": 6.3768, |
| "step": 50900 |
| }, |
| { |
| "epoch": 113.05267535347934, |
| "eval_loss": 6.376430988311768, |
| "eval_runtime": 66.1841, |
| "eval_samples_per_second": 151.094, |
| "eval_steps_per_second": 18.887, |
| "step": 50900 |
| }, |
| { |
| "epoch": 113.27446631549765, |
| "grad_norm": 0.26424017548561096, |
| "learning_rate": 9.030100334448161e-07, |
| "loss": 6.3748, |
| "step": 51000 |
| }, |
| { |
| "epoch": 113.27446631549765, |
| "eval_loss": 6.37862491607666, |
| "eval_runtime": 63.7419, |
| "eval_samples_per_second": 156.883, |
| "eval_steps_per_second": 19.61, |
| "step": 51000 |
| }, |
| { |
| "epoch": 113.49625727751594, |
| "grad_norm": 0.26083120703697205, |
| "learning_rate": 8.929765886287627e-07, |
| "loss": 6.3779, |
| "step": 51100 |
| }, |
| { |
| "epoch": 113.49625727751594, |
| "eval_loss": 6.379500389099121, |
| "eval_runtime": 66.2253, |
| "eval_samples_per_second": 151.0, |
| "eval_steps_per_second": 18.875, |
| "step": 51100 |
| }, |
| { |
| "epoch": 113.77626836706405, |
| "grad_norm": 0.25904449820518494, |
| "learning_rate": 8.829431438127091e-07, |
| "loss": 6.3757, |
| "step": 51200 |
| }, |
| { |
| "epoch": 113.77626836706405, |
| "eval_loss": 6.375171661376953, |
| "eval_runtime": 66.093, |
| "eval_samples_per_second": 151.302, |
| "eval_steps_per_second": 18.913, |
| "step": 51200 |
| }, |
| { |
| "epoch": 113.99805932908234, |
| "grad_norm": 0.2680477499961853, |
| "learning_rate": 8.729096989966555e-07, |
| "loss": 6.3769, |
| "step": 51300 |
| }, |
| { |
| "epoch": 113.99805932908234, |
| "eval_loss": 6.376518726348877, |
| "eval_runtime": 63.5204, |
| "eval_samples_per_second": 157.43, |
| "eval_steps_per_second": 19.679, |
| "step": 51300 |
| }, |
| { |
| "epoch": 114.21985029110064, |
| "grad_norm": 0.30891552567481995, |
| "learning_rate": 8.628762541806019e-07, |
| "loss": 6.3752, |
| "step": 51400 |
| }, |
| { |
| "epoch": 114.21985029110064, |
| "eval_loss": 6.377015590667725, |
| "eval_runtime": 63.4143, |
| "eval_samples_per_second": 157.693, |
| "eval_steps_per_second": 19.712, |
| "step": 51400 |
| }, |
| { |
| "epoch": 114.44164125311893, |
| "grad_norm": 0.32155531644821167, |
| "learning_rate": 8.528428093645485e-07, |
| "loss": 6.3767, |
| "step": 51500 |
| }, |
| { |
| "epoch": 114.44164125311893, |
| "eval_loss": 6.377589702606201, |
| "eval_runtime": 66.1364, |
| "eval_samples_per_second": 151.203, |
| "eval_steps_per_second": 18.9, |
| "step": 51500 |
| }, |
| { |
| "epoch": 114.66343221513723, |
| "grad_norm": 0.28316569328308105, |
| "learning_rate": 8.428093645484949e-07, |
| "loss": 6.3755, |
| "step": 51600 |
| }, |
| { |
| "epoch": 114.66343221513723, |
| "eval_loss": 6.3766303062438965, |
| "eval_runtime": 65.9296, |
| "eval_samples_per_second": 151.677, |
| "eval_steps_per_second": 18.96, |
| "step": 51600 |
| }, |
| { |
| "epoch": 114.88522317715552, |
| "grad_norm": 0.24125680327415466, |
| "learning_rate": 8.327759197324414e-07, |
| "loss": 6.3773, |
| "step": 51700 |
| }, |
| { |
| "epoch": 114.88522317715552, |
| "eval_loss": 6.37697172164917, |
| "eval_runtime": 65.9478, |
| "eval_samples_per_second": 151.635, |
| "eval_steps_per_second": 18.954, |
| "step": 51700 |
| }, |
| { |
| "epoch": 115.10701413917383, |
| "grad_norm": 0.21407043933868408, |
| "learning_rate": 8.227424749163879e-07, |
| "loss": 6.3751, |
| "step": 51800 |
| }, |
| { |
| "epoch": 115.10701413917383, |
| "eval_loss": 6.377639293670654, |
| "eval_runtime": 63.6016, |
| "eval_samples_per_second": 157.229, |
| "eval_steps_per_second": 19.654, |
| "step": 51800 |
| }, |
| { |
| "epoch": 115.32880510119213, |
| "grad_norm": 0.23014885187149048, |
| "learning_rate": 8.127090301003344e-07, |
| "loss": 6.3771, |
| "step": 51900 |
| }, |
| { |
| "epoch": 115.32880510119213, |
| "eval_loss": 6.380842208862305, |
| "eval_runtime": 63.4674, |
| "eval_samples_per_second": 157.561, |
| "eval_steps_per_second": 19.695, |
| "step": 51900 |
| }, |
| { |
| "epoch": 115.55059606321042, |
| "grad_norm": 0.2553617060184479, |
| "learning_rate": 8.026755852842809e-07, |
| "loss": 6.3752, |
| "step": 52000 |
| }, |
| { |
| "epoch": 115.55059606321042, |
| "eval_loss": 6.377804756164551, |
| "eval_runtime": 64.2492, |
| "eval_samples_per_second": 155.644, |
| "eval_steps_per_second": 19.456, |
| "step": 52000 |
| }, |
| { |
| "epoch": 115.77238702522872, |
| "grad_norm": 0.32242822647094727, |
| "learning_rate": 7.926421404682274e-07, |
| "loss": 6.3762, |
| "step": 52100 |
| }, |
| { |
| "epoch": 115.77238702522872, |
| "eval_loss": 6.382247447967529, |
| "eval_runtime": 65.2652, |
| "eval_samples_per_second": 153.221, |
| "eval_steps_per_second": 19.153, |
| "step": 52100 |
| }, |
| { |
| "epoch": 115.99417798724701, |
| "grad_norm": 0.25089436769485474, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 6.3757, |
| "step": 52200 |
| }, |
| { |
| "epoch": 115.99417798724701, |
| "eval_loss": 6.379915714263916, |
| "eval_runtime": 63.3864, |
| "eval_samples_per_second": 157.763, |
| "eval_steps_per_second": 19.72, |
| "step": 52200 |
| }, |
| { |
| "epoch": 116.21596894926532, |
| "grad_norm": 0.24113717675209045, |
| "learning_rate": 7.725752508361204e-07, |
| "loss": 6.3761, |
| "step": 52300 |
| }, |
| { |
| "epoch": 116.21596894926532, |
| "eval_loss": 6.376662731170654, |
| "eval_runtime": 63.5361, |
| "eval_samples_per_second": 157.391, |
| "eval_steps_per_second": 19.674, |
| "step": 52300 |
| }, |
| { |
| "epoch": 116.43775991128362, |
| "grad_norm": 0.3414776027202606, |
| "learning_rate": 7.625418060200669e-07, |
| "loss": 6.3757, |
| "step": 52400 |
| }, |
| { |
| "epoch": 116.43775991128362, |
| "eval_loss": 6.377313137054443, |
| "eval_runtime": 63.5522, |
| "eval_samples_per_second": 157.351, |
| "eval_steps_per_second": 19.669, |
| "step": 52400 |
| }, |
| { |
| "epoch": 116.65955087330191, |
| "grad_norm": 0.24650247395038605, |
| "learning_rate": 7.525083612040134e-07, |
| "loss": 6.3754, |
| "step": 52500 |
| }, |
| { |
| "epoch": 116.65955087330191, |
| "eval_loss": 6.37901496887207, |
| "eval_runtime": 65.8363, |
| "eval_samples_per_second": 151.892, |
| "eval_steps_per_second": 18.986, |
| "step": 52500 |
| }, |
| { |
| "epoch": 116.88134183532021, |
| "grad_norm": 0.27944493293762207, |
| "learning_rate": 7.424749163879599e-07, |
| "loss": 6.3776, |
| "step": 52600 |
| }, |
| { |
| "epoch": 116.88134183532021, |
| "eval_loss": 6.376550197601318, |
| "eval_runtime": 63.5812, |
| "eval_samples_per_second": 157.279, |
| "eval_steps_per_second": 19.66, |
| "step": 52600 |
| }, |
| { |
| "epoch": 117.1031327973385, |
| "grad_norm": 0.2298879325389862, |
| "learning_rate": 7.324414715719064e-07, |
| "loss": 6.3751, |
| "step": 52700 |
| }, |
| { |
| "epoch": 117.1031327973385, |
| "eval_loss": 6.377909183502197, |
| "eval_runtime": 63.3541, |
| "eval_samples_per_second": 157.843, |
| "eval_steps_per_second": 19.73, |
| "step": 52700 |
| }, |
| { |
| "epoch": 117.3249237593568, |
| "grad_norm": 0.25682932138442993, |
| "learning_rate": 7.224080267558529e-07, |
| "loss": 6.3757, |
| "step": 52800 |
| }, |
| { |
| "epoch": 117.3249237593568, |
| "eval_loss": 6.378458023071289, |
| "eval_runtime": 65.7985, |
| "eval_samples_per_second": 151.979, |
| "eval_steps_per_second": 18.997, |
| "step": 52800 |
| }, |
| { |
| "epoch": 117.54671472137511, |
| "grad_norm": 0.2633031904697418, |
| "learning_rate": 7.123745819397994e-07, |
| "loss": 6.3767, |
| "step": 52900 |
| }, |
| { |
| "epoch": 117.54671472137511, |
| "eval_loss": 6.380926132202148, |
| "eval_runtime": 63.5491, |
| "eval_samples_per_second": 157.359, |
| "eval_steps_per_second": 19.67, |
| "step": 52900 |
| }, |
| { |
| "epoch": 117.7685056833934, |
| "grad_norm": 0.26749059557914734, |
| "learning_rate": 7.023411371237459e-07, |
| "loss": 6.3767, |
| "step": 53000 |
| }, |
| { |
| "epoch": 117.7685056833934, |
| "eval_loss": 6.381775856018066, |
| "eval_runtime": 63.4542, |
| "eval_samples_per_second": 157.594, |
| "eval_steps_per_second": 19.699, |
| "step": 53000 |
| }, |
| { |
| "epoch": 117.9902966454117, |
| "grad_norm": 0.22249187529087067, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 6.377, |
| "step": 53100 |
| }, |
| { |
| "epoch": 117.9902966454117, |
| "eval_loss": 6.38169002532959, |
| "eval_runtime": 63.5488, |
| "eval_samples_per_second": 157.359, |
| "eval_steps_per_second": 19.67, |
| "step": 53100 |
| }, |
| { |
| "epoch": 118.21208760742999, |
| "grad_norm": 0.22224722802639008, |
| "learning_rate": 6.822742474916388e-07, |
| "loss": 6.3764, |
| "step": 53200 |
| }, |
| { |
| "epoch": 118.21208760742999, |
| "eval_loss": 6.37975549697876, |
| "eval_runtime": 65.9614, |
| "eval_samples_per_second": 151.604, |
| "eval_steps_per_second": 18.95, |
| "step": 53200 |
| }, |
| { |
| "epoch": 118.4338785694483, |
| "grad_norm": 0.2897886037826538, |
| "learning_rate": 6.722408026755853e-07, |
| "loss": 6.3737, |
| "step": 53300 |
| }, |
| { |
| "epoch": 118.4338785694483, |
| "eval_loss": 6.376906394958496, |
| "eval_runtime": 63.536, |
| "eval_samples_per_second": 157.391, |
| "eval_steps_per_second": 19.674, |
| "step": 53300 |
| }, |
| { |
| "epoch": 118.65566953146659, |
| "grad_norm": 0.2731805145740509, |
| "learning_rate": 6.622073578595318e-07, |
| "loss": 6.3774, |
| "step": 53400 |
| }, |
| { |
| "epoch": 118.65566953146659, |
| "eval_loss": 6.377748489379883, |
| "eval_runtime": 63.612, |
| "eval_samples_per_second": 157.203, |
| "eval_steps_per_second": 19.65, |
| "step": 53400 |
| }, |
| { |
| "epoch": 118.87746049348489, |
| "grad_norm": 0.22697260975837708, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 6.3767, |
| "step": 53500 |
| }, |
| { |
| "epoch": 118.87746049348489, |
| "eval_loss": 6.381230354309082, |
| "eval_runtime": 65.9156, |
| "eval_samples_per_second": 151.709, |
| "eval_steps_per_second": 18.964, |
| "step": 53500 |
| }, |
| { |
| "epoch": 119.0992514555032, |
| "grad_norm": 0.30966779589653015, |
| "learning_rate": 6.421404682274248e-07, |
| "loss": 6.376, |
| "step": 53600 |
| }, |
| { |
| "epoch": 119.0992514555032, |
| "eval_loss": 6.37573766708374, |
| "eval_runtime": 63.3841, |
| "eval_samples_per_second": 157.768, |
| "eval_steps_per_second": 19.721, |
| "step": 53600 |
| }, |
| { |
| "epoch": 119.32104241752148, |
| "grad_norm": 0.2676733136177063, |
| "learning_rate": 6.321070234113712e-07, |
| "loss": 6.3759, |
| "step": 53700 |
| }, |
| { |
| "epoch": 119.32104241752148, |
| "eval_loss": 6.374691963195801, |
| "eval_runtime": 63.4737, |
| "eval_samples_per_second": 157.545, |
| "eval_steps_per_second": 19.693, |
| "step": 53700 |
| }, |
| { |
| "epoch": 119.54283337953979, |
| "grad_norm": 0.2713070213794708, |
| "learning_rate": 6.220735785953178e-07, |
| "loss": 6.3768, |
| "step": 53800 |
| }, |
| { |
| "epoch": 119.54283337953979, |
| "eval_loss": 6.378169059753418, |
| "eval_runtime": 65.7452, |
| "eval_samples_per_second": 152.102, |
| "eval_steps_per_second": 19.013, |
| "step": 53800 |
| }, |
| { |
| "epoch": 119.76462434155808, |
| "grad_norm": 0.2583908140659332, |
| "learning_rate": 6.120401337792642e-07, |
| "loss": 6.3756, |
| "step": 53900 |
| }, |
| { |
| "epoch": 119.76462434155808, |
| "eval_loss": 6.380895137786865, |
| "eval_runtime": 63.7905, |
| "eval_samples_per_second": 156.763, |
| "eval_steps_per_second": 19.595, |
| "step": 53900 |
| }, |
| { |
| "epoch": 119.98641530357638, |
| "grad_norm": 0.2636660933494568, |
| "learning_rate": 6.020066889632107e-07, |
| "loss": 6.376, |
| "step": 54000 |
| }, |
| { |
| "epoch": 119.98641530357638, |
| "eval_loss": 6.378993034362793, |
| "eval_runtime": 63.4987, |
| "eval_samples_per_second": 157.483, |
| "eval_steps_per_second": 19.685, |
| "step": 54000 |
| }, |
| { |
| "epoch": 120.20820626559468, |
| "grad_norm": 0.2040402740240097, |
| "learning_rate": 5.919732441471572e-07, |
| "loss": 6.3742, |
| "step": 54100 |
| }, |
| { |
| "epoch": 120.20820626559468, |
| "eval_loss": 6.379099369049072, |
| "eval_runtime": 63.4641, |
| "eval_samples_per_second": 157.569, |
| "eval_steps_per_second": 19.696, |
| "step": 54100 |
| }, |
| { |
| "epoch": 120.42999722761297, |
| "grad_norm": 0.2771637439727783, |
| "learning_rate": 5.819397993311037e-07, |
| "loss": 6.377, |
| "step": 54200 |
| }, |
| { |
| "epoch": 120.42999722761297, |
| "eval_loss": 6.380918025970459, |
| "eval_runtime": 63.5735, |
| "eval_samples_per_second": 157.298, |
| "eval_steps_per_second": 19.662, |
| "step": 54200 |
| }, |
| { |
| "epoch": 120.65178818963128, |
| "grad_norm": 0.2907504141330719, |
| "learning_rate": 5.719063545150502e-07, |
| "loss": 6.3771, |
| "step": 54300 |
| }, |
| { |
| "epoch": 120.65178818963128, |
| "eval_loss": 6.379312515258789, |
| "eval_runtime": 65.9548, |
| "eval_samples_per_second": 151.619, |
| "eval_steps_per_second": 18.952, |
| "step": 54300 |
| }, |
| { |
| "epoch": 120.87357915164957, |
| "grad_norm": 0.30987074971199036, |
| "learning_rate": 5.618729096989966e-07, |
| "loss": 6.3768, |
| "step": 54400 |
| }, |
| { |
| "epoch": 120.87357915164957, |
| "eval_loss": 6.37892484664917, |
| "eval_runtime": 63.4967, |
| "eval_samples_per_second": 157.488, |
| "eval_steps_per_second": 19.686, |
| "step": 54400 |
| }, |
| { |
| "epoch": 121.09537011366787, |
| "grad_norm": 0.3270675837993622, |
| "learning_rate": 5.518394648829431e-07, |
| "loss": 6.376, |
| "step": 54500 |
| }, |
| { |
| "epoch": 121.09537011366787, |
| "eval_loss": 6.377264976501465, |
| "eval_runtime": 63.4405, |
| "eval_samples_per_second": 157.628, |
| "eval_steps_per_second": 19.704, |
| "step": 54500 |
| }, |
| { |
| "epoch": 121.31716107568617, |
| "grad_norm": 0.23159100115299225, |
| "learning_rate": 5.418060200668896e-07, |
| "loss": 6.3773, |
| "step": 54600 |
| }, |
| { |
| "epoch": 121.31716107568617, |
| "eval_loss": 6.379176616668701, |
| "eval_runtime": 66.0387, |
| "eval_samples_per_second": 151.426, |
| "eval_steps_per_second": 18.928, |
| "step": 54600 |
| }, |
| { |
| "epoch": 121.53895203770446, |
| "grad_norm": 0.231267511844635, |
| "learning_rate": 5.317725752508361e-07, |
| "loss": 6.3773, |
| "step": 54700 |
| }, |
| { |
| "epoch": 121.53895203770446, |
| "eval_loss": 6.376558780670166, |
| "eval_runtime": 63.5938, |
| "eval_samples_per_second": 157.248, |
| "eval_steps_per_second": 19.656, |
| "step": 54700 |
| }, |
| { |
| "epoch": 121.76074299972277, |
| "grad_norm": 0.24276390671730042, |
| "learning_rate": 5.217391304347826e-07, |
| "loss": 6.3754, |
| "step": 54800 |
| }, |
| { |
| "epoch": 121.76074299972277, |
| "eval_loss": 6.378441333770752, |
| "eval_runtime": 63.5257, |
| "eval_samples_per_second": 157.417, |
| "eval_steps_per_second": 19.677, |
| "step": 54800 |
| }, |
| { |
| "epoch": 121.98253396174105, |
| "grad_norm": 0.26921290159225464, |
| "learning_rate": 5.117056856187291e-07, |
| "loss": 6.3751, |
| "step": 54900 |
| }, |
| { |
| "epoch": 121.98253396174105, |
| "eval_loss": 6.378532886505127, |
| "eval_runtime": 63.5535, |
| "eval_samples_per_second": 157.348, |
| "eval_steps_per_second": 19.668, |
| "step": 54900 |
| }, |
| { |
| "epoch": 122.20432492375936, |
| "grad_norm": 0.274029016494751, |
| "learning_rate": 5.016722408026756e-07, |
| "loss": 6.376, |
| "step": 55000 |
| }, |
| { |
| "epoch": 122.20432492375936, |
| "eval_loss": 6.378449440002441, |
| "eval_runtime": 65.8768, |
| "eval_samples_per_second": 151.798, |
| "eval_steps_per_second": 18.975, |
| "step": 55000 |
| }, |
| { |
| "epoch": 122.42611588577765, |
| "grad_norm": 0.27585527300834656, |
| "learning_rate": 4.916387959866221e-07, |
| "loss": 6.376, |
| "step": 55100 |
| }, |
| { |
| "epoch": 122.42611588577765, |
| "eval_loss": 6.37809944152832, |
| "eval_runtime": 63.5221, |
| "eval_samples_per_second": 157.426, |
| "eval_steps_per_second": 19.678, |
| "step": 55100 |
| }, |
| { |
| "epoch": 122.64790684779595, |
| "grad_norm": 0.2652019262313843, |
| "learning_rate": 4.816053511705686e-07, |
| "loss": 6.3753, |
| "step": 55200 |
| }, |
| { |
| "epoch": 122.64790684779595, |
| "eval_loss": 6.38352632522583, |
| "eval_runtime": 63.4719, |
| "eval_samples_per_second": 157.55, |
| "eval_steps_per_second": 19.694, |
| "step": 55200 |
| }, |
| { |
| "epoch": 122.86969780981426, |
| "grad_norm": 0.24283932149410248, |
| "learning_rate": 4.7157190635451506e-07, |
| "loss": 6.3761, |
| "step": 55300 |
| }, |
| { |
| "epoch": 122.86969780981426, |
| "eval_loss": 6.376107215881348, |
| "eval_runtime": 63.547, |
| "eval_samples_per_second": 157.364, |
| "eval_steps_per_second": 19.67, |
| "step": 55300 |
| }, |
| { |
| "epoch": 123.09148877183254, |
| "grad_norm": 0.29150310158729553, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 6.3765, |
| "step": 55400 |
| }, |
| { |
| "epoch": 123.09148877183254, |
| "eval_loss": 6.37521505355835, |
| "eval_runtime": 65.9064, |
| "eval_samples_per_second": 151.73, |
| "eval_steps_per_second": 18.966, |
| "step": 55400 |
| }, |
| { |
| "epoch": 123.31327973385085, |
| "grad_norm": 0.28435659408569336, |
| "learning_rate": 4.5150501672240806e-07, |
| "loss": 6.3757, |
| "step": 55500 |
| }, |
| { |
| "epoch": 123.31327973385085, |
| "eval_loss": 6.378593921661377, |
| "eval_runtime": 63.5654, |
| "eval_samples_per_second": 157.318, |
| "eval_steps_per_second": 19.665, |
| "step": 55500 |
| }, |
| { |
| "epoch": 123.53507069586914, |
| "grad_norm": 0.2412547916173935, |
| "learning_rate": 4.4147157190635456e-07, |
| "loss": 6.3757, |
| "step": 55600 |
| }, |
| { |
| "epoch": 123.53507069586914, |
| "eval_loss": 6.377431869506836, |
| "eval_runtime": 66.0043, |
| "eval_samples_per_second": 151.505, |
| "eval_steps_per_second": 18.938, |
| "step": 55600 |
| }, |
| { |
| "epoch": 123.75686165788744, |
| "grad_norm": 0.21835213899612427, |
| "learning_rate": 4.3143812709030095e-07, |
| "loss": 6.3763, |
| "step": 55700 |
| }, |
| { |
| "epoch": 123.75686165788744, |
| "eval_loss": 6.378489971160889, |
| "eval_runtime": 63.7489, |
| "eval_samples_per_second": 156.865, |
| "eval_steps_per_second": 19.608, |
| "step": 55700 |
| }, |
| { |
| "epoch": 123.97865261990574, |
| "grad_norm": 0.18911224603652954, |
| "learning_rate": 4.2140468227424745e-07, |
| "loss": 6.3754, |
| "step": 55800 |
| }, |
| { |
| "epoch": 123.97865261990574, |
| "eval_loss": 6.379303932189941, |
| "eval_runtime": 66.1257, |
| "eval_samples_per_second": 151.227, |
| "eval_steps_per_second": 18.903, |
| "step": 55800 |
| }, |
| { |
| "epoch": 124.20044358192403, |
| "grad_norm": 0.283447265625, |
| "learning_rate": 4.1137123745819395e-07, |
| "loss": 6.3743, |
| "step": 55900 |
| }, |
| { |
| "epoch": 124.20044358192403, |
| "eval_loss": 6.381599426269531, |
| "eval_runtime": 63.605, |
| "eval_samples_per_second": 157.22, |
| "eval_steps_per_second": 19.653, |
| "step": 55900 |
| }, |
| { |
| "epoch": 124.42223454394234, |
| "grad_norm": 0.1898406594991684, |
| "learning_rate": 4.0133779264214045e-07, |
| "loss": 6.3755, |
| "step": 56000 |
| }, |
| { |
| "epoch": 124.42223454394234, |
| "eval_loss": 6.376759052276611, |
| "eval_runtime": 64.3574, |
| "eval_samples_per_second": 155.382, |
| "eval_steps_per_second": 19.423, |
| "step": 56000 |
| }, |
| { |
| "epoch": 124.64402550596063, |
| "grad_norm": 0.2740555703639984, |
| "learning_rate": 3.9130434782608694e-07, |
| "loss": 6.3767, |
| "step": 56100 |
| }, |
| { |
| "epoch": 124.64402550596063, |
| "eval_loss": 6.377686023712158, |
| "eval_runtime": 65.4964, |
| "eval_samples_per_second": 152.68, |
| "eval_steps_per_second": 19.085, |
| "step": 56100 |
| }, |
| { |
| "epoch": 124.86581646797893, |
| "grad_norm": 0.24969562888145447, |
| "learning_rate": 3.8127090301003344e-07, |
| "loss": 6.3749, |
| "step": 56200 |
| }, |
| { |
| "epoch": 124.86581646797893, |
| "eval_loss": 6.3803300857543945, |
| "eval_runtime": 63.6262, |
| "eval_samples_per_second": 157.168, |
| "eval_steps_per_second": 19.646, |
| "step": 56200 |
| }, |
| { |
| "epoch": 125.08760742999723, |
| "grad_norm": 0.271085649728775, |
| "learning_rate": 3.7123745819397994e-07, |
| "loss": 6.3761, |
| "step": 56300 |
| }, |
| { |
| "epoch": 125.08760742999723, |
| "eval_loss": 6.377999782562256, |
| "eval_runtime": 63.5511, |
| "eval_samples_per_second": 157.354, |
| "eval_steps_per_second": 19.669, |
| "step": 56300 |
| }, |
| { |
| "epoch": 125.30939839201552, |
| "grad_norm": 0.2341337651014328, |
| "learning_rate": 3.6120401337792644e-07, |
| "loss": 6.3787, |
| "step": 56400 |
| }, |
| { |
| "epoch": 125.30939839201552, |
| "eval_loss": 6.377155780792236, |
| "eval_runtime": 66.011, |
| "eval_samples_per_second": 151.49, |
| "eval_steps_per_second": 18.936, |
| "step": 56400 |
| }, |
| { |
| "epoch": 125.53118935403383, |
| "grad_norm": 0.2656327784061432, |
| "learning_rate": 3.5117056856187294e-07, |
| "loss": 6.3742, |
| "step": 56500 |
| }, |
| { |
| "epoch": 125.53118935403383, |
| "eval_loss": 6.378920078277588, |
| "eval_runtime": 63.6517, |
| "eval_samples_per_second": 157.105, |
| "eval_steps_per_second": 19.638, |
| "step": 56500 |
| }, |
| { |
| "epoch": 125.75298031605212, |
| "grad_norm": 0.261843204498291, |
| "learning_rate": 3.411371237458194e-07, |
| "loss": 6.3742, |
| "step": 56600 |
| }, |
| { |
| "epoch": 125.75298031605212, |
| "eval_loss": 6.376353740692139, |
| "eval_runtime": 65.896, |
| "eval_samples_per_second": 151.754, |
| "eval_steps_per_second": 18.969, |
| "step": 56600 |
| }, |
| { |
| "epoch": 125.97477127807042, |
| "grad_norm": 0.27163127064704895, |
| "learning_rate": 3.311036789297659e-07, |
| "loss": 6.3765, |
| "step": 56700 |
| }, |
| { |
| "epoch": 125.97477127807042, |
| "eval_loss": 6.3804826736450195, |
| "eval_runtime": 63.514, |
| "eval_samples_per_second": 157.446, |
| "eval_steps_per_second": 19.681, |
| "step": 56700 |
| }, |
| { |
| "epoch": 126.19656224008871, |
| "grad_norm": 0.2797481417655945, |
| "learning_rate": 3.210702341137124e-07, |
| "loss": 6.3764, |
| "step": 56800 |
| }, |
| { |
| "epoch": 126.19656224008871, |
| "eval_loss": 6.378259658813477, |
| "eval_runtime": 63.4475, |
| "eval_samples_per_second": 157.611, |
| "eval_steps_per_second": 19.701, |
| "step": 56800 |
| }, |
| { |
| "epoch": 126.41835320210701, |
| "grad_norm": 0.21093739569187164, |
| "learning_rate": 3.110367892976589e-07, |
| "loss": 6.3764, |
| "step": 56900 |
| }, |
| { |
| "epoch": 126.41835320210701, |
| "eval_loss": 6.378982067108154, |
| "eval_runtime": 66.045, |
| "eval_samples_per_second": 151.412, |
| "eval_steps_per_second": 18.927, |
| "step": 56900 |
| }, |
| { |
| "epoch": 126.64014416412532, |
| "grad_norm": 0.268632173538208, |
| "learning_rate": 3.010033444816054e-07, |
| "loss": 6.3762, |
| "step": 57000 |
| }, |
| { |
| "epoch": 126.64014416412532, |
| "eval_loss": 6.379413604736328, |
| "eval_runtime": 63.641, |
| "eval_samples_per_second": 157.131, |
| "eval_steps_per_second": 19.641, |
| "step": 57000 |
| }, |
| { |
| "epoch": 126.8619351261436, |
| "grad_norm": 0.2878783047199249, |
| "learning_rate": 2.9096989966555187e-07, |
| "loss": 6.376, |
| "step": 57100 |
| }, |
| { |
| "epoch": 126.8619351261436, |
| "eval_loss": 6.378924369812012, |
| "eval_runtime": 66.1831, |
| "eval_samples_per_second": 151.096, |
| "eval_steps_per_second": 18.887, |
| "step": 57100 |
| }, |
| { |
| "epoch": 127.08372608816191, |
| "grad_norm": 0.2618252635002136, |
| "learning_rate": 2.809364548494983e-07, |
| "loss": 6.3768, |
| "step": 57200 |
| }, |
| { |
| "epoch": 127.08372608816191, |
| "eval_loss": 6.37802267074585, |
| "eval_runtime": 63.5424, |
| "eval_samples_per_second": 157.375, |
| "eval_steps_per_second": 19.672, |
| "step": 57200 |
| }, |
| { |
| "epoch": 127.3055170501802, |
| "grad_norm": 0.20790652930736542, |
| "learning_rate": 2.709030100334448e-07, |
| "loss": 6.3763, |
| "step": 57300 |
| }, |
| { |
| "epoch": 127.3055170501802, |
| "eval_loss": 6.377635955810547, |
| "eval_runtime": 66.2394, |
| "eval_samples_per_second": 150.967, |
| "eval_steps_per_second": 18.871, |
| "step": 57300 |
| }, |
| { |
| "epoch": 127.5273080121985, |
| "grad_norm": 0.23446954786777496, |
| "learning_rate": 2.608695652173913e-07, |
| "loss": 6.3758, |
| "step": 57400 |
| }, |
| { |
| "epoch": 127.5273080121985, |
| "eval_loss": 6.378016471862793, |
| "eval_runtime": 63.7187, |
| "eval_samples_per_second": 156.94, |
| "eval_steps_per_second": 19.617, |
| "step": 57400 |
| }, |
| { |
| "epoch": 127.7490989742168, |
| "grad_norm": 0.2730012536048889, |
| "learning_rate": 2.508361204013378e-07, |
| "loss": 6.3771, |
| "step": 57500 |
| }, |
| { |
| "epoch": 127.7490989742168, |
| "eval_loss": 6.378283500671387, |
| "eval_runtime": 66.0326, |
| "eval_samples_per_second": 151.44, |
| "eval_steps_per_second": 18.93, |
| "step": 57500 |
| }, |
| { |
| "epoch": 127.9708899362351, |
| "grad_norm": 0.19740967452526093, |
| "learning_rate": 2.408026755852843e-07, |
| "loss": 6.3754, |
| "step": 57600 |
| }, |
| { |
| "epoch": 127.9708899362351, |
| "eval_loss": 6.377573490142822, |
| "eval_runtime": 68.5433, |
| "eval_samples_per_second": 145.893, |
| "eval_steps_per_second": 18.237, |
| "step": 57600 |
| }, |
| { |
| "epoch": 128.1926808982534, |
| "grad_norm": 0.20099857449531555, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 6.3763, |
| "step": 57700 |
| }, |
| { |
| "epoch": 128.1926808982534, |
| "eval_loss": 6.380809783935547, |
| "eval_runtime": 63.6372, |
| "eval_samples_per_second": 157.141, |
| "eval_steps_per_second": 19.643, |
| "step": 57700 |
| }, |
| { |
| "epoch": 128.4144718602717, |
| "grad_norm": 0.26378223299980164, |
| "learning_rate": 2.2073578595317728e-07, |
| "loss": 6.3742, |
| "step": 57800 |
| }, |
| { |
| "epoch": 128.4144718602717, |
| "eval_loss": 6.377455234527588, |
| "eval_runtime": 63.6147, |
| "eval_samples_per_second": 157.196, |
| "eval_steps_per_second": 19.65, |
| "step": 57800 |
| }, |
| { |
| "epoch": 128.63626282228998, |
| "grad_norm": 0.22778332233428955, |
| "learning_rate": 2.1070234113712372e-07, |
| "loss": 6.3757, |
| "step": 57900 |
| }, |
| { |
| "epoch": 128.63626282228998, |
| "eval_loss": 6.376725196838379, |
| "eval_runtime": 63.6324, |
| "eval_samples_per_second": 157.153, |
| "eval_steps_per_second": 19.644, |
| "step": 57900 |
| }, |
| { |
| "epoch": 128.85805378430828, |
| "grad_norm": 0.25024932622909546, |
| "learning_rate": 2.0066889632107022e-07, |
| "loss": 6.3767, |
| "step": 58000 |
| }, |
| { |
| "epoch": 128.85805378430828, |
| "eval_loss": 6.378956317901611, |
| "eval_runtime": 66.0444, |
| "eval_samples_per_second": 151.413, |
| "eval_steps_per_second": 18.927, |
| "step": 58000 |
| }, |
| { |
| "epoch": 129.0798447463266, |
| "grad_norm": 0.22629129886627197, |
| "learning_rate": 1.9063545150501672e-07, |
| "loss": 6.3751, |
| "step": 58100 |
| }, |
| { |
| "epoch": 129.0798447463266, |
| "eval_loss": 6.378350734710693, |
| "eval_runtime": 63.6424, |
| "eval_samples_per_second": 157.128, |
| "eval_steps_per_second": 19.641, |
| "step": 58100 |
| }, |
| { |
| "epoch": 129.3016357083449, |
| "grad_norm": 0.22958730161190033, |
| "learning_rate": 1.8060200668896322e-07, |
| "loss": 6.3754, |
| "step": 58200 |
| }, |
| { |
| "epoch": 129.3016357083449, |
| "eval_loss": 6.379317760467529, |
| "eval_runtime": 66.1349, |
| "eval_samples_per_second": 151.206, |
| "eval_steps_per_second": 18.901, |
| "step": 58200 |
| }, |
| { |
| "epoch": 129.5234266703632, |
| "grad_norm": 0.29147765040397644, |
| "learning_rate": 1.705685618729097e-07, |
| "loss": 6.3766, |
| "step": 58300 |
| }, |
| { |
| "epoch": 129.5234266703632, |
| "eval_loss": 6.379565238952637, |
| "eval_runtime": 63.6308, |
| "eval_samples_per_second": 157.157, |
| "eval_steps_per_second": 19.645, |
| "step": 58300 |
| }, |
| { |
| "epoch": 129.74521763238147, |
| "grad_norm": 0.2274588942527771, |
| "learning_rate": 1.605351170568562e-07, |
| "loss": 6.3766, |
| "step": 58400 |
| }, |
| { |
| "epoch": 129.74521763238147, |
| "eval_loss": 6.378822326660156, |
| "eval_runtime": 63.7248, |
| "eval_samples_per_second": 156.925, |
| "eval_steps_per_second": 19.616, |
| "step": 58400 |
| }, |
| { |
| "epoch": 129.96700859439977, |
| "grad_norm": 0.27082857489585876, |
| "learning_rate": 1.505016722408027e-07, |
| "loss": 6.3762, |
| "step": 58500 |
| }, |
| { |
| "epoch": 129.96700859439977, |
| "eval_loss": 6.376942157745361, |
| "eval_runtime": 66.2694, |
| "eval_samples_per_second": 150.899, |
| "eval_steps_per_second": 18.862, |
| "step": 58500 |
| }, |
| { |
| "epoch": 130.18879955641808, |
| "grad_norm": 0.2117777317762375, |
| "learning_rate": 1.4046822742474916e-07, |
| "loss": 6.3756, |
| "step": 58600 |
| }, |
| { |
| "epoch": 130.18879955641808, |
| "eval_loss": 6.381185054779053, |
| "eval_runtime": 63.6203, |
| "eval_samples_per_second": 157.183, |
| "eval_steps_per_second": 19.648, |
| "step": 58600 |
| }, |
| { |
| "epoch": 130.41059051843638, |
| "grad_norm": 0.244340181350708, |
| "learning_rate": 1.3043478260869566e-07, |
| "loss": 6.3746, |
| "step": 58700 |
| }, |
| { |
| "epoch": 130.41059051843638, |
| "eval_loss": 6.378442764282227, |
| "eval_runtime": 63.6467, |
| "eval_samples_per_second": 157.117, |
| "eval_steps_per_second": 19.64, |
| "step": 58700 |
| }, |
| { |
| "epoch": 130.63238148045468, |
| "grad_norm": 0.23617205023765564, |
| "learning_rate": 1.2040133779264215e-07, |
| "loss": 6.3759, |
| "step": 58800 |
| }, |
| { |
| "epoch": 130.63238148045468, |
| "eval_loss": 6.377311706542969, |
| "eval_runtime": 66.2898, |
| "eval_samples_per_second": 150.853, |
| "eval_steps_per_second": 18.857, |
| "step": 58800 |
| }, |
| { |
| "epoch": 130.85417244247296, |
| "grad_norm": 0.22402510046958923, |
| "learning_rate": 1.1036789297658864e-07, |
| "loss": 6.3766, |
| "step": 58900 |
| }, |
| { |
| "epoch": 130.85417244247296, |
| "eval_loss": 6.378325939178467, |
| "eval_runtime": 63.7783, |
| "eval_samples_per_second": 156.793, |
| "eval_steps_per_second": 19.599, |
| "step": 58900 |
| }, |
| { |
| "epoch": 131.07596340449126, |
| "grad_norm": 0.22382721304893494, |
| "learning_rate": 1.0033444816053511e-07, |
| "loss": 6.377, |
| "step": 59000 |
| }, |
| { |
| "epoch": 131.07596340449126, |
| "eval_loss": 6.375909328460693, |
| "eval_runtime": 63.6862, |
| "eval_samples_per_second": 157.02, |
| "eval_steps_per_second": 19.627, |
| "step": 59000 |
| }, |
| { |
| "epoch": 131.29775436650957, |
| "grad_norm": 0.2319914549589157, |
| "learning_rate": 9.030100334448161e-08, |
| "loss": 6.3759, |
| "step": 59100 |
| }, |
| { |
| "epoch": 131.29775436650957, |
| "eval_loss": 6.380961894989014, |
| "eval_runtime": 63.73, |
| "eval_samples_per_second": 156.912, |
| "eval_steps_per_second": 19.614, |
| "step": 59100 |
| }, |
| { |
| "epoch": 131.51954532852787, |
| "grad_norm": 0.27138957381248474, |
| "learning_rate": 8.02675585284281e-08, |
| "loss": 6.3765, |
| "step": 59200 |
| }, |
| { |
| "epoch": 131.51954532852787, |
| "eval_loss": 6.378270626068115, |
| "eval_runtime": 66.164, |
| "eval_samples_per_second": 151.14, |
| "eval_steps_per_second": 18.892, |
| "step": 59200 |
| }, |
| { |
| "epoch": 131.74133629054617, |
| "grad_norm": 0.24163523316383362, |
| "learning_rate": 7.023411371237458e-08, |
| "loss": 6.3758, |
| "step": 59300 |
| }, |
| { |
| "epoch": 131.74133629054617, |
| "eval_loss": 6.379899024963379, |
| "eval_runtime": 66.2406, |
| "eval_samples_per_second": 150.965, |
| "eval_steps_per_second": 18.871, |
| "step": 59300 |
| }, |
| { |
| "epoch": 131.96312725256445, |
| "grad_norm": 0.20410296320915222, |
| "learning_rate": 6.020066889632108e-08, |
| "loss": 6.3753, |
| "step": 59400 |
| }, |
| { |
| "epoch": 131.96312725256445, |
| "eval_loss": 6.378077983856201, |
| "eval_runtime": 63.7013, |
| "eval_samples_per_second": 156.983, |
| "eval_steps_per_second": 19.623, |
| "step": 59400 |
| }, |
| { |
| "epoch": 132.18491821458275, |
| "grad_norm": 0.15991632640361786, |
| "learning_rate": 5.0167224080267556e-08, |
| "loss": 6.3762, |
| "step": 59500 |
| }, |
| { |
| "epoch": 132.18491821458275, |
| "eval_loss": 6.379003524780273, |
| "eval_runtime": 63.6773, |
| "eval_samples_per_second": 157.042, |
| "eval_steps_per_second": 19.63, |
| "step": 59500 |
| }, |
| { |
| "epoch": 132.40670917660105, |
| "grad_norm": 0.2014060765504837, |
| "learning_rate": 4.013377926421405e-08, |
| "loss": 6.3734, |
| "step": 59600 |
| }, |
| { |
| "epoch": 132.40670917660105, |
| "eval_loss": 6.377279758453369, |
| "eval_runtime": 64.9426, |
| "eval_samples_per_second": 153.982, |
| "eval_steps_per_second": 19.248, |
| "step": 59600 |
| }, |
| { |
| "epoch": 132.62850013861936, |
| "grad_norm": 0.23493210971355438, |
| "learning_rate": 3.010033444816054e-08, |
| "loss": 6.3767, |
| "step": 59700 |
| }, |
| { |
| "epoch": 132.62850013861936, |
| "eval_loss": 6.378801345825195, |
| "eval_runtime": 65.0941, |
| "eval_samples_per_second": 153.624, |
| "eval_steps_per_second": 19.203, |
| "step": 59700 |
| }, |
| { |
| "epoch": 132.85029110063766, |
| "grad_norm": 0.2207670956850052, |
| "learning_rate": 2.0066889632107024e-08, |
| "loss": 6.3764, |
| "step": 59800 |
| }, |
| { |
| "epoch": 132.85029110063766, |
| "eval_loss": 6.377054691314697, |
| "eval_runtime": 63.7133, |
| "eval_samples_per_second": 156.953, |
| "eval_steps_per_second": 19.619, |
| "step": 59800 |
| }, |
| { |
| "epoch": 133.07208206265594, |
| "grad_norm": 0.21483196318149567, |
| "learning_rate": 1.0033444816053512e-08, |
| "loss": 6.3763, |
| "step": 59900 |
| }, |
| { |
| "epoch": 133.07208206265594, |
| "eval_loss": 6.3776984214782715, |
| "eval_runtime": 63.6217, |
| "eval_samples_per_second": 157.179, |
| "eval_steps_per_second": 19.647, |
| "step": 59900 |
| }, |
| { |
| "epoch": 133.29387302467424, |
| "grad_norm": 0.1953832507133484, |
| "learning_rate": 0.0, |
| "loss": 6.3751, |
| "step": 60000 |
| }, |
| { |
| "epoch": 133.29387302467424, |
| "eval_loss": 6.377795219421387, |
| "eval_runtime": 66.2186, |
| "eval_samples_per_second": 151.015, |
| "eval_steps_per_second": 18.877, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 134, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 10, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 10 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.020754951164035e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|