deskull's picture
Upload MolCrawl genome-sequence BERT small model
c53a6fe verified
{
"best_metric": 6.374266624450684,
"best_model_checkpoint": "learning_source_20260316/genome_sequence/bert-output/genome_sequence-small/checkpoint-46600",
"epoch": 133.29387302467424,
"eval_steps": 100,
"global_step": 60000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.22179096201829776,
"grad_norm": 0.9666945338249207,
"learning_rate": 3e-06,
"loss": 8.2198,
"step": 100
},
{
"epoch": 0.22179096201829776,
"eval_loss": 7.910184383392334,
"eval_runtime": 100.9607,
"eval_samples_per_second": 99.048,
"eval_steps_per_second": 99.048,
"step": 100
},
{
"epoch": 0.4435819240365955,
"grad_norm": 0.7436413764953613,
"learning_rate": 6e-06,
"loss": 7.7448,
"step": 200
},
{
"epoch": 0.4435819240365955,
"eval_loss": 7.522714138031006,
"eval_runtime": 101.1239,
"eval_samples_per_second": 98.889,
"eval_steps_per_second": 98.889,
"step": 200
},
{
"epoch": 0.6653728860548933,
"grad_norm": 0.5597550868988037,
"learning_rate": 5.998999666555519e-06,
"loss": 7.3644,
"step": 300
},
{
"epoch": 0.6653728860548933,
"eval_loss": 7.118653297424316,
"eval_runtime": 101.7208,
"eval_samples_per_second": 98.308,
"eval_steps_per_second": 98.308,
"step": 300
},
{
"epoch": 0.887163848073191,
"grad_norm": 0.3977542519569397,
"learning_rate": 5.997999333111037e-06,
"loss": 7.039,
"step": 400
},
{
"epoch": 0.887163848073191,
"eval_loss": 6.858039855957031,
"eval_runtime": 103.108,
"eval_samples_per_second": 96.986,
"eval_steps_per_second": 96.986,
"step": 400
},
{
"epoch": 1.1089548100914888,
"grad_norm": 0.31371042132377625,
"learning_rate": 5.9969989996665554e-06,
"loss": 6.8537,
"step": 500
},
{
"epoch": 1.1089548100914888,
"eval_loss": 6.725042343139648,
"eval_runtime": 100.7633,
"eval_samples_per_second": 99.243,
"eval_steps_per_second": 99.243,
"step": 500
},
{
"epoch": 1.3307457721097866,
"grad_norm": 0.2910732924938202,
"learning_rate": 5.995998666222074e-06,
"loss": 6.749,
"step": 600
},
{
"epoch": 1.3307457721097866,
"eval_loss": 6.648338317871094,
"eval_runtime": 103.8281,
"eval_samples_per_second": 96.313,
"eval_steps_per_second": 96.313,
"step": 600
},
{
"epoch": 1.5525367341280842,
"grad_norm": 0.38117602467536926,
"learning_rate": 5.994998332777593e-06,
"loss": 6.6809,
"step": 700
},
{
"epoch": 1.5525367341280842,
"eval_loss": 6.598635196685791,
"eval_runtime": 100.7294,
"eval_samples_per_second": 99.276,
"eval_steps_per_second": 99.276,
"step": 700
},
{
"epoch": 1.774327696146382,
"grad_norm": 0.23082487285137177,
"learning_rate": 5.9939979993331115e-06,
"loss": 6.6363,
"step": 800
},
{
"epoch": 1.774327696146382,
"eval_loss": 6.5613298416137695,
"eval_runtime": 100.99,
"eval_samples_per_second": 99.02,
"eval_steps_per_second": 99.02,
"step": 800
},
{
"epoch": 1.9961186581646797,
"grad_norm": 0.3537309169769287,
"learning_rate": 5.992997665888629e-06,
"loss": 6.6008,
"step": 900
},
{
"epoch": 1.9961186581646797,
"eval_loss": 6.539489269256592,
"eval_runtime": 103.6291,
"eval_samples_per_second": 96.498,
"eval_steps_per_second": 96.498,
"step": 900
},
{
"epoch": 2.2179096201829775,
"grad_norm": 0.22692321240901947,
"learning_rate": 5.991997332444148e-06,
"loss": 6.5735,
"step": 1000
},
{
"epoch": 2.2179096201829775,
"eval_loss": 6.521015644073486,
"eval_runtime": 100.5379,
"eval_samples_per_second": 99.465,
"eval_steps_per_second": 99.465,
"step": 1000
},
{
"epoch": 2.4397005822012754,
"grad_norm": 0.5465587973594666,
"learning_rate": 5.990996998999667e-06,
"loss": 6.5555,
"step": 1100
},
{
"epoch": 2.4397005822012754,
"eval_loss": 6.505192279815674,
"eval_runtime": 101.8008,
"eval_samples_per_second": 98.231,
"eval_steps_per_second": 98.231,
"step": 1100
},
{
"epoch": 2.6614915442195732,
"grad_norm": 0.6720498204231262,
"learning_rate": 5.989996665555185e-06,
"loss": 6.5407,
"step": 1200
},
{
"epoch": 2.6614915442195732,
"eval_loss": 6.497246265411377,
"eval_runtime": 103.0853,
"eval_samples_per_second": 97.007,
"eval_steps_per_second": 97.007,
"step": 1200
},
{
"epoch": 2.8832825062378706,
"grad_norm": 0.3426739275455475,
"learning_rate": 5.988996332110703e-06,
"loss": 6.529,
"step": 1300
},
{
"epoch": 2.8832825062378706,
"eval_loss": 6.488556861877441,
"eval_runtime": 100.6535,
"eval_samples_per_second": 99.351,
"eval_steps_per_second": 99.351,
"step": 1300
},
{
"epoch": 3.1050734682561685,
"grad_norm": 0.2463805377483368,
"learning_rate": 5.987995998666222e-06,
"loss": 6.5196,
"step": 1400
},
{
"epoch": 3.1050734682561685,
"eval_loss": 6.484075546264648,
"eval_runtime": 104.3708,
"eval_samples_per_second": 95.812,
"eval_steps_per_second": 95.812,
"step": 1400
},
{
"epoch": 3.3268644302744663,
"grad_norm": 0.1849370300769806,
"learning_rate": 5.986995665221741e-06,
"loss": 6.5099,
"step": 1500
},
{
"epoch": 3.3268644302744663,
"eval_loss": 6.476208209991455,
"eval_runtime": 100.8511,
"eval_samples_per_second": 99.156,
"eval_steps_per_second": 99.156,
"step": 1500
},
{
"epoch": 3.548655392292764,
"grad_norm": 0.23534879088401794,
"learning_rate": 5.9859953317772595e-06,
"loss": 6.503,
"step": 1600
},
{
"epoch": 3.548655392292764,
"eval_loss": 6.473758220672607,
"eval_runtime": 100.8445,
"eval_samples_per_second": 99.163,
"eval_steps_per_second": 99.163,
"step": 1600
},
{
"epoch": 3.770446354311062,
"grad_norm": 0.3312935531139374,
"learning_rate": 5.984994998332777e-06,
"loss": 6.4991,
"step": 1700
},
{
"epoch": 3.770446354311062,
"eval_loss": 6.471902370452881,
"eval_runtime": 104.4468,
"eval_samples_per_second": 95.743,
"eval_steps_per_second": 95.743,
"step": 1700
},
{
"epoch": 3.9922373163293594,
"grad_norm": 0.27324172854423523,
"learning_rate": 5.983994664888296e-06,
"loss": 6.4936,
"step": 1800
},
{
"epoch": 3.9922373163293594,
"eval_loss": 6.464596271514893,
"eval_runtime": 100.6385,
"eval_samples_per_second": 99.366,
"eval_steps_per_second": 99.366,
"step": 1800
},
{
"epoch": 4.214028278347658,
"grad_norm": 0.29278630018234253,
"learning_rate": 5.982994331443815e-06,
"loss": 6.4875,
"step": 1900
},
{
"epoch": 4.214028278347658,
"eval_loss": 6.462095260620117,
"eval_runtime": 100.6404,
"eval_samples_per_second": 99.364,
"eval_steps_per_second": 99.364,
"step": 1900
},
{
"epoch": 4.435819240365955,
"grad_norm": 0.26022714376449585,
"learning_rate": 5.981993997999333e-06,
"loss": 6.4834,
"step": 2000
},
{
"epoch": 4.435819240365955,
"eval_loss": 6.45832633972168,
"eval_runtime": 104.5104,
"eval_samples_per_second": 95.684,
"eval_steps_per_second": 95.684,
"step": 2000
},
{
"epoch": 4.6576102023842525,
"grad_norm": 0.7873703837394714,
"learning_rate": 5.980993664554851e-06,
"loss": 6.4796,
"step": 2100
},
{
"epoch": 4.6576102023842525,
"eval_loss": 6.456444263458252,
"eval_runtime": 100.8687,
"eval_samples_per_second": 99.139,
"eval_steps_per_second": 99.139,
"step": 2100
},
{
"epoch": 4.887163848073191,
"grad_norm": 0.7525845766067505,
"learning_rate": 5.979993331110371e-06,
"loss": 6.4755,
"step": 2200
},
{
"epoch": 4.887163848073191,
"eval_loss": 6.453465938568115,
"eval_runtime": 66.4579,
"eval_samples_per_second": 150.471,
"eval_steps_per_second": 18.809,
"step": 2200
},
{
"epoch": 5.108954810091489,
"grad_norm": 0.5191181302070618,
"learning_rate": 5.978992997665889e-06,
"loss": 6.472,
"step": 2300
},
{
"epoch": 5.108954810091489,
"eval_loss": 6.44980525970459,
"eval_runtime": 63.8377,
"eval_samples_per_second": 156.647,
"eval_steps_per_second": 19.581,
"step": 2300
},
{
"epoch": 5.330745772109786,
"grad_norm": 0.31189826130867004,
"learning_rate": 5.9779926642214075e-06,
"loss": 6.4681,
"step": 2400
},
{
"epoch": 5.330745772109786,
"eval_loss": 6.448277473449707,
"eval_runtime": 63.9509,
"eval_samples_per_second": 156.37,
"eval_steps_per_second": 19.546,
"step": 2400
},
{
"epoch": 5.5525367341280845,
"grad_norm": 0.4947231113910675,
"learning_rate": 5.976992330776926e-06,
"loss": 6.4659,
"step": 2500
},
{
"epoch": 5.5525367341280845,
"eval_loss": 6.4454731941223145,
"eval_runtime": 66.4235,
"eval_samples_per_second": 150.549,
"eval_steps_per_second": 18.819,
"step": 2500
},
{
"epoch": 5.774327696146382,
"grad_norm": 0.22547227144241333,
"learning_rate": 5.975991997332444e-06,
"loss": 6.4619,
"step": 2600
},
{
"epoch": 5.774327696146382,
"eval_loss": 6.444580554962158,
"eval_runtime": 63.7522,
"eval_samples_per_second": 156.857,
"eval_steps_per_second": 19.607,
"step": 2600
},
{
"epoch": 5.99611865816468,
"grad_norm": 0.2726474404335022,
"learning_rate": 5.974991663887963e-06,
"loss": 6.4594,
"step": 2700
},
{
"epoch": 5.99611865816468,
"eval_loss": 6.44156551361084,
"eval_runtime": 66.3901,
"eval_samples_per_second": 150.625,
"eval_steps_per_second": 18.828,
"step": 2700
},
{
"epoch": 6.2179096201829775,
"grad_norm": 0.17645886540412903,
"learning_rate": 5.973991330443481e-06,
"loss": 6.4574,
"step": 2800
},
{
"epoch": 6.2179096201829775,
"eval_loss": 6.4393510818481445,
"eval_runtime": 63.8118,
"eval_samples_per_second": 156.711,
"eval_steps_per_second": 19.589,
"step": 2800
},
{
"epoch": 6.439700582201275,
"grad_norm": 0.9444617629051208,
"learning_rate": 5.972990996999e-06,
"loss": 6.4546,
"step": 2900
},
{
"epoch": 6.439700582201275,
"eval_loss": 6.439332008361816,
"eval_runtime": 63.6523,
"eval_samples_per_second": 157.103,
"eval_steps_per_second": 19.638,
"step": 2900
},
{
"epoch": 6.661491544219573,
"grad_norm": 0.4472251534461975,
"learning_rate": 5.971990663554519e-06,
"loss": 6.4515,
"step": 3000
},
{
"epoch": 6.661491544219573,
"eval_loss": 6.435446262359619,
"eval_runtime": 63.845,
"eval_samples_per_second": 156.629,
"eval_steps_per_second": 19.579,
"step": 3000
},
{
"epoch": 6.883282506237871,
"grad_norm": 0.29884466528892517,
"learning_rate": 5.970990330110037e-06,
"loss": 6.4483,
"step": 3100
},
{
"epoch": 6.883282506237871,
"eval_loss": 6.433766841888428,
"eval_runtime": 66.4883,
"eval_samples_per_second": 150.402,
"eval_steps_per_second": 18.8,
"step": 3100
},
{
"epoch": 7.105073468256169,
"grad_norm": 0.4576103687286377,
"learning_rate": 5.9699899966655554e-06,
"loss": 6.4465,
"step": 3200
},
{
"epoch": 7.105073468256169,
"eval_loss": 6.432063102722168,
"eval_runtime": 63.7483,
"eval_samples_per_second": 156.867,
"eval_steps_per_second": 19.608,
"step": 3200
},
{
"epoch": 7.326864430274466,
"grad_norm": 0.1679336577653885,
"learning_rate": 5.968989663221074e-06,
"loss": 6.4453,
"step": 3300
},
{
"epoch": 7.326864430274466,
"eval_loss": 6.430073261260986,
"eval_runtime": 63.7036,
"eval_samples_per_second": 156.977,
"eval_steps_per_second": 19.622,
"step": 3300
},
{
"epoch": 7.548655392292764,
"grad_norm": 0.3880283236503601,
"learning_rate": 5.967989329776592e-06,
"loss": 6.4406,
"step": 3400
},
{
"epoch": 7.548655392292764,
"eval_loss": 6.431549072265625,
"eval_runtime": 66.1695,
"eval_samples_per_second": 151.127,
"eval_steps_per_second": 18.891,
"step": 3400
},
{
"epoch": 7.770446354311062,
"grad_norm": 0.8515690565109253,
"learning_rate": 5.966988996332111e-06,
"loss": 6.4413,
"step": 3500
},
{
"epoch": 7.770446354311062,
"eval_loss": 6.42842435836792,
"eval_runtime": 63.7187,
"eval_samples_per_second": 156.94,
"eval_steps_per_second": 19.617,
"step": 3500
},
{
"epoch": 7.992237316329359,
"grad_norm": 0.4197738468647003,
"learning_rate": 5.965988662887629e-06,
"loss": 6.4404,
"step": 3600
},
{
"epoch": 7.992237316329359,
"eval_loss": 6.429299354553223,
"eval_runtime": 63.7081,
"eval_samples_per_second": 156.966,
"eval_steps_per_second": 19.621,
"step": 3600
},
{
"epoch": 8.214028278347657,
"grad_norm": 0.16546382009983063,
"learning_rate": 5.964988329443148e-06,
"loss": 6.438,
"step": 3700
},
{
"epoch": 8.214028278347657,
"eval_loss": 6.426889896392822,
"eval_runtime": 66.066,
"eval_samples_per_second": 151.364,
"eval_steps_per_second": 18.92,
"step": 3700
},
{
"epoch": 8.435819240365955,
"grad_norm": 0.48783496022224426,
"learning_rate": 5.963987995998667e-06,
"loss": 6.437,
"step": 3800
},
{
"epoch": 8.435819240365955,
"eval_loss": 6.424874305725098,
"eval_runtime": 63.6818,
"eval_samples_per_second": 157.031,
"eval_steps_per_second": 19.629,
"step": 3800
},
{
"epoch": 8.657610202384253,
"grad_norm": 0.2994876205921173,
"learning_rate": 5.962987662554185e-06,
"loss": 6.434,
"step": 3900
},
{
"epoch": 8.657610202384253,
"eval_loss": 6.428049087524414,
"eval_runtime": 63.6981,
"eval_samples_per_second": 156.991,
"eval_steps_per_second": 19.624,
"step": 3900
},
{
"epoch": 8.87940116440255,
"grad_norm": 0.26397526264190674,
"learning_rate": 5.961987329109703e-06,
"loss": 6.4344,
"step": 4000
},
{
"epoch": 8.87940116440255,
"eval_loss": 6.427630424499512,
"eval_runtime": 63.7853,
"eval_samples_per_second": 156.776,
"eval_steps_per_second": 19.597,
"step": 4000
},
{
"epoch": 9.101192126420848,
"grad_norm": 0.6336208581924438,
"learning_rate": 5.960986995665222e-06,
"loss": 6.4322,
"step": 4100
},
{
"epoch": 9.101192126420848,
"eval_loss": 6.423878192901611,
"eval_runtime": 66.3296,
"eval_samples_per_second": 150.762,
"eval_steps_per_second": 18.845,
"step": 4100
},
{
"epoch": 9.322983088439146,
"grad_norm": 0.5242211818695068,
"learning_rate": 5.95998666222074e-06,
"loss": 6.4302,
"step": 4200
},
{
"epoch": 9.322983088439146,
"eval_loss": 6.42392110824585,
"eval_runtime": 63.7079,
"eval_samples_per_second": 156.966,
"eval_steps_per_second": 19.621,
"step": 4200
},
{
"epoch": 9.544774050457445,
"grad_norm": 0.49379467964172363,
"learning_rate": 5.958986328776259e-06,
"loss": 6.4307,
"step": 4300
},
{
"epoch": 9.544774050457445,
"eval_loss": 6.422423839569092,
"eval_runtime": 63.6859,
"eval_samples_per_second": 157.021,
"eval_steps_per_second": 19.628,
"step": 4300
},
{
"epoch": 9.766565012475741,
"grad_norm": 0.305960476398468,
"learning_rate": 5.957985995331777e-06,
"loss": 6.4285,
"step": 4400
},
{
"epoch": 9.766565012475741,
"eval_loss": 6.421577453613281,
"eval_runtime": 66.1928,
"eval_samples_per_second": 151.074,
"eval_steps_per_second": 18.884,
"step": 4400
},
{
"epoch": 9.98835597449404,
"grad_norm": 0.3036479353904724,
"learning_rate": 5.956985661887296e-06,
"loss": 6.4249,
"step": 4500
},
{
"epoch": 9.98835597449404,
"eval_loss": 6.41899299621582,
"eval_runtime": 63.6775,
"eval_samples_per_second": 157.041,
"eval_steps_per_second": 19.63,
"step": 4500
},
{
"epoch": 10.210146936512338,
"grad_norm": 1.1105852127075195,
"learning_rate": 5.955985328442815e-06,
"loss": 6.4262,
"step": 4600
},
{
"epoch": 10.210146936512338,
"eval_loss": 6.420323371887207,
"eval_runtime": 63.5916,
"eval_samples_per_second": 157.253,
"eval_steps_per_second": 19.657,
"step": 4600
},
{
"epoch": 10.431937898530634,
"grad_norm": 0.38992971181869507,
"learning_rate": 5.954984994998333e-06,
"loss": 6.4259,
"step": 4700
},
{
"epoch": 10.431937898530634,
"eval_loss": 6.415469646453857,
"eval_runtime": 63.7968,
"eval_samples_per_second": 156.748,
"eval_steps_per_second": 19.593,
"step": 4700
},
{
"epoch": 10.653728860548933,
"grad_norm": 0.39246854186058044,
"learning_rate": 5.953984661553851e-06,
"loss": 6.4258,
"step": 4800
},
{
"epoch": 10.653728860548933,
"eval_loss": 6.414693832397461,
"eval_runtime": 66.2863,
"eval_samples_per_second": 150.861,
"eval_steps_per_second": 18.858,
"step": 4800
},
{
"epoch": 10.875519822567231,
"grad_norm": 0.6589607000350952,
"learning_rate": 5.95298432810937e-06,
"loss": 6.4226,
"step": 4900
},
{
"epoch": 10.875519822567231,
"eval_loss": 6.417821884155273,
"eval_runtime": 63.7381,
"eval_samples_per_second": 156.892,
"eval_steps_per_second": 19.612,
"step": 4900
},
{
"epoch": 11.097310784585527,
"grad_norm": 0.44160690903663635,
"learning_rate": 5.951983994664888e-06,
"loss": 6.4223,
"step": 5000
},
{
"epoch": 11.097310784585527,
"eval_loss": 6.417135715484619,
"eval_runtime": 63.7803,
"eval_samples_per_second": 156.788,
"eval_steps_per_second": 19.599,
"step": 5000
},
{
"epoch": 11.319101746603826,
"grad_norm": 0.7182816863059998,
"learning_rate": 5.950983661220407e-06,
"loss": 6.4221,
"step": 5100
},
{
"epoch": 11.319101746603826,
"eval_loss": 6.417608737945557,
"eval_runtime": 66.5138,
"eval_samples_per_second": 150.345,
"eval_steps_per_second": 18.793,
"step": 5100
},
{
"epoch": 11.540892708622124,
"grad_norm": 0.45741328597068787,
"learning_rate": 5.949983327775925e-06,
"loss": 6.4211,
"step": 5200
},
{
"epoch": 11.540892708622124,
"eval_loss": 6.411616325378418,
"eval_runtime": 63.8646,
"eval_samples_per_second": 156.581,
"eval_steps_per_second": 19.573,
"step": 5200
},
{
"epoch": 11.76268367064042,
"grad_norm": 0.37045249342918396,
"learning_rate": 5.948982994331444e-06,
"loss": 6.4203,
"step": 5300
},
{
"epoch": 11.76268367064042,
"eval_loss": 6.415543556213379,
"eval_runtime": 63.6959,
"eval_samples_per_second": 156.996,
"eval_steps_per_second": 19.624,
"step": 5300
},
{
"epoch": 11.984474632658719,
"grad_norm": 0.5875869989395142,
"learning_rate": 5.947982660886963e-06,
"loss": 6.4189,
"step": 5400
},
{
"epoch": 11.984474632658719,
"eval_loss": 6.417328834533691,
"eval_runtime": 63.8682,
"eval_samples_per_second": 156.572,
"eval_steps_per_second": 19.572,
"step": 5400
},
{
"epoch": 12.206265594677017,
"grad_norm": 0.39769718050956726,
"learning_rate": 5.9469823274424815e-06,
"loss": 6.4185,
"step": 5500
},
{
"epoch": 12.206265594677017,
"eval_loss": 6.417914390563965,
"eval_runtime": 66.821,
"eval_samples_per_second": 149.653,
"eval_steps_per_second": 18.707,
"step": 5500
},
{
"epoch": 12.428056556695315,
"grad_norm": 0.8144527673721313,
"learning_rate": 5.945981993997999e-06,
"loss": 6.417,
"step": 5600
},
{
"epoch": 12.428056556695315,
"eval_loss": 6.414742946624756,
"eval_runtime": 63.6455,
"eval_samples_per_second": 157.12,
"eval_steps_per_second": 19.64,
"step": 5600
},
{
"epoch": 12.649847518713612,
"grad_norm": 0.304855078458786,
"learning_rate": 5.944981660553518e-06,
"loss": 6.4169,
"step": 5700
},
{
"epoch": 12.649847518713612,
"eval_loss": 6.411574363708496,
"eval_runtime": 63.6479,
"eval_samples_per_second": 157.114,
"eval_steps_per_second": 19.639,
"step": 5700
},
{
"epoch": 12.87163848073191,
"grad_norm": 0.5774130821228027,
"learning_rate": 5.943981327109036e-06,
"loss": 6.4162,
"step": 5800
},
{
"epoch": 12.87163848073191,
"eval_loss": 6.4110517501831055,
"eval_runtime": 66.215,
"eval_samples_per_second": 151.023,
"eval_steps_per_second": 18.878,
"step": 5800
},
{
"epoch": 13.093429442750208,
"grad_norm": 0.6892155408859253,
"learning_rate": 5.942980993664555e-06,
"loss": 6.414,
"step": 5900
},
{
"epoch": 13.093429442750208,
"eval_loss": 6.413996696472168,
"eval_runtime": 63.6174,
"eval_samples_per_second": 157.19,
"eval_steps_per_second": 19.649,
"step": 5900
},
{
"epoch": 13.315220404768505,
"grad_norm": 0.5487566590309143,
"learning_rate": 5.941980660220073e-06,
"loss": 6.4153,
"step": 6000
},
{
"epoch": 13.315220404768505,
"eval_loss": 6.414098739624023,
"eval_runtime": 63.6464,
"eval_samples_per_second": 157.118,
"eval_steps_per_second": 19.64,
"step": 6000
},
{
"epoch": 13.537011366786803,
"grad_norm": 0.7147879004478455,
"learning_rate": 5.940980326775592e-06,
"loss": 6.4132,
"step": 6100
},
{
"epoch": 13.537011366786803,
"eval_loss": 6.411059379577637,
"eval_runtime": 66.5345,
"eval_samples_per_second": 150.298,
"eval_steps_per_second": 18.787,
"step": 6100
},
{
"epoch": 13.758802328805102,
"grad_norm": 0.4990188181400299,
"learning_rate": 5.939979993331111e-06,
"loss": 6.4127,
"step": 6200
},
{
"epoch": 13.758802328805102,
"eval_loss": 6.411470890045166,
"eval_runtime": 63.7718,
"eval_samples_per_second": 156.809,
"eval_steps_per_second": 19.601,
"step": 6200
},
{
"epoch": 13.9805932908234,
"grad_norm": 0.3841017782688141,
"learning_rate": 5.9389796598866294e-06,
"loss": 6.4133,
"step": 6300
},
{
"epoch": 13.9805932908234,
"eval_loss": 6.4090681076049805,
"eval_runtime": 63.7617,
"eval_samples_per_second": 156.834,
"eval_steps_per_second": 19.604,
"step": 6300
},
{
"epoch": 14.202384252841696,
"grad_norm": 0.3359989523887634,
"learning_rate": 5.937979326442147e-06,
"loss": 6.4107,
"step": 6400
},
{
"epoch": 14.202384252841696,
"eval_loss": 6.409322738647461,
"eval_runtime": 63.5969,
"eval_samples_per_second": 157.24,
"eval_steps_per_second": 19.655,
"step": 6400
},
{
"epoch": 14.424175214859995,
"grad_norm": 0.5810059905052185,
"learning_rate": 5.936978992997666e-06,
"loss": 6.411,
"step": 6500
},
{
"epoch": 14.424175214859995,
"eval_loss": 6.411257743835449,
"eval_runtime": 66.5523,
"eval_samples_per_second": 150.258,
"eval_steps_per_second": 18.782,
"step": 6500
},
{
"epoch": 14.645966176878293,
"grad_norm": 0.45823681354522705,
"learning_rate": 5.935978659553185e-06,
"loss": 6.4107,
"step": 6600
},
{
"epoch": 14.645966176878293,
"eval_loss": 6.4073872566223145,
"eval_runtime": 63.6788,
"eval_samples_per_second": 157.038,
"eval_steps_per_second": 19.63,
"step": 6600
},
{
"epoch": 14.86775713889659,
"grad_norm": 0.6735783815383911,
"learning_rate": 5.9349783261087026e-06,
"loss": 6.4112,
"step": 6700
},
{
"epoch": 14.86775713889659,
"eval_loss": 6.411919593811035,
"eval_runtime": 63.7297,
"eval_samples_per_second": 156.913,
"eval_steps_per_second": 19.614,
"step": 6700
},
{
"epoch": 15.089548100914888,
"grad_norm": 0.5670196413993835,
"learning_rate": 5.933977992664221e-06,
"loss": 6.4099,
"step": 6800
},
{
"epoch": 15.089548100914888,
"eval_loss": 6.407878875732422,
"eval_runtime": 66.3771,
"eval_samples_per_second": 150.654,
"eval_steps_per_second": 18.832,
"step": 6800
},
{
"epoch": 15.311339062933186,
"grad_norm": 0.3068266808986664,
"learning_rate": 5.93297765921974e-06,
"loss": 6.4089,
"step": 6900
},
{
"epoch": 15.311339062933186,
"eval_loss": 6.4104766845703125,
"eval_runtime": 63.6627,
"eval_samples_per_second": 157.078,
"eval_steps_per_second": 19.635,
"step": 6900
},
{
"epoch": 15.533130024951483,
"grad_norm": 0.8304972052574158,
"learning_rate": 5.931977325775259e-06,
"loss": 6.409,
"step": 7000
},
{
"epoch": 15.533130024951483,
"eval_loss": 6.414528846740723,
"eval_runtime": 63.6701,
"eval_samples_per_second": 157.06,
"eval_steps_per_second": 19.632,
"step": 7000
},
{
"epoch": 15.75492098696978,
"grad_norm": 0.5522041916847229,
"learning_rate": 5.930976992330777e-06,
"loss": 6.4089,
"step": 7100
},
{
"epoch": 15.75492098696978,
"eval_loss": 6.407095909118652,
"eval_runtime": 66.1999,
"eval_samples_per_second": 151.058,
"eval_steps_per_second": 18.882,
"step": 7100
},
{
"epoch": 15.97671194898808,
"grad_norm": 0.373626708984375,
"learning_rate": 5.929976658886295e-06,
"loss": 6.4071,
"step": 7200
},
{
"epoch": 15.97671194898808,
"eval_loss": 6.4060258865356445,
"eval_runtime": 63.7049,
"eval_samples_per_second": 156.974,
"eval_steps_per_second": 19.622,
"step": 7200
},
{
"epoch": 16.198502911006376,
"grad_norm": 0.3747236132621765,
"learning_rate": 5.928976325441814e-06,
"loss": 6.4072,
"step": 7300
},
{
"epoch": 16.198502911006376,
"eval_loss": 6.403803825378418,
"eval_runtime": 63.6478,
"eval_samples_per_second": 157.115,
"eval_steps_per_second": 19.639,
"step": 7300
},
{
"epoch": 16.420293873024676,
"grad_norm": 0.9381150007247925,
"learning_rate": 5.927975991997333e-06,
"loss": 6.4068,
"step": 7400
},
{
"epoch": 16.420293873024676,
"eval_loss": 6.406477451324463,
"eval_runtime": 66.4296,
"eval_samples_per_second": 150.535,
"eval_steps_per_second": 18.817,
"step": 7400
},
{
"epoch": 16.642084835042972,
"grad_norm": 0.4905136823654175,
"learning_rate": 5.9269756585528505e-06,
"loss": 6.4047,
"step": 7500
},
{
"epoch": 16.642084835042972,
"eval_loss": 6.4078850746154785,
"eval_runtime": 63.7258,
"eval_samples_per_second": 156.922,
"eval_steps_per_second": 19.615,
"step": 7500
},
{
"epoch": 16.86387579706127,
"grad_norm": 0.5776643753051758,
"learning_rate": 5.92597532510837e-06,
"loss": 6.4054,
"step": 7600
},
{
"epoch": 16.86387579706127,
"eval_loss": 6.403768539428711,
"eval_runtime": 63.7461,
"eval_samples_per_second": 156.872,
"eval_steps_per_second": 19.609,
"step": 7600
},
{
"epoch": 17.08566675907957,
"grad_norm": 0.791892945766449,
"learning_rate": 5.924974991663888e-06,
"loss": 6.4051,
"step": 7700
},
{
"epoch": 17.08566675907957,
"eval_loss": 6.403835773468018,
"eval_runtime": 63.9137,
"eval_samples_per_second": 156.461,
"eval_steps_per_second": 19.558,
"step": 7700
},
{
"epoch": 17.307457721097865,
"grad_norm": 0.485984206199646,
"learning_rate": 5.923974658219407e-06,
"loss": 6.4058,
"step": 7800
},
{
"epoch": 17.307457721097865,
"eval_loss": 6.405175685882568,
"eval_runtime": 66.2871,
"eval_samples_per_second": 150.859,
"eval_steps_per_second": 18.857,
"step": 7800
},
{
"epoch": 17.529248683116162,
"grad_norm": 1.0781219005584717,
"learning_rate": 5.922974324774925e-06,
"loss": 6.4037,
"step": 7900
},
{
"epoch": 17.529248683116162,
"eval_loss": 6.408561706542969,
"eval_runtime": 66.8857,
"eval_samples_per_second": 149.509,
"eval_steps_per_second": 18.689,
"step": 7900
},
{
"epoch": 17.751039645134462,
"grad_norm": 0.6358538269996643,
"learning_rate": 5.921973991330443e-06,
"loss": 6.403,
"step": 8000
},
{
"epoch": 17.751039645134462,
"eval_loss": 6.402519702911377,
"eval_runtime": 63.7653,
"eval_samples_per_second": 156.825,
"eval_steps_per_second": 19.603,
"step": 8000
},
{
"epoch": 17.97283060715276,
"grad_norm": 0.5632463097572327,
"learning_rate": 5.920973657885962e-06,
"loss": 6.4034,
"step": 8100
},
{
"epoch": 17.97283060715276,
"eval_loss": 6.403571128845215,
"eval_runtime": 63.7754,
"eval_samples_per_second": 156.8,
"eval_steps_per_second": 19.6,
"step": 8100
},
{
"epoch": 18.194621569171055,
"grad_norm": 0.23312948644161224,
"learning_rate": 5.919973324441481e-06,
"loss": 6.4048,
"step": 8200
},
{
"epoch": 18.194621569171055,
"eval_loss": 6.404890060424805,
"eval_runtime": 63.6831,
"eval_samples_per_second": 157.028,
"eval_steps_per_second": 19.628,
"step": 8200
},
{
"epoch": 18.416412531189355,
"grad_norm": 0.5255222916603088,
"learning_rate": 5.918972990996999e-06,
"loss": 6.4018,
"step": 8300
},
{
"epoch": 18.416412531189355,
"eval_loss": 6.401614665985107,
"eval_runtime": 66.6013,
"eval_samples_per_second": 150.147,
"eval_steps_per_second": 18.768,
"step": 8300
},
{
"epoch": 18.63820349320765,
"grad_norm": 0.44263362884521484,
"learning_rate": 5.917972657552518e-06,
"loss": 6.4018,
"step": 8400
},
{
"epoch": 18.63820349320765,
"eval_loss": 6.40390682220459,
"eval_runtime": 63.7484,
"eval_samples_per_second": 156.867,
"eval_steps_per_second": 19.608,
"step": 8400
},
{
"epoch": 18.859994455225948,
"grad_norm": 0.5826687812805176,
"learning_rate": 5.916972324108037e-06,
"loss": 6.402,
"step": 8500
},
{
"epoch": 18.859994455225948,
"eval_loss": 6.401444911956787,
"eval_runtime": 63.73,
"eval_samples_per_second": 156.912,
"eval_steps_per_second": 19.614,
"step": 8500
},
{
"epoch": 19.081785417244248,
"grad_norm": 0.5808525681495667,
"learning_rate": 5.915971990663555e-06,
"loss": 6.4031,
"step": 8600
},
{
"epoch": 19.081785417244248,
"eval_loss": 6.398373126983643,
"eval_runtime": 66.6574,
"eval_samples_per_second": 150.021,
"eval_steps_per_second": 18.753,
"step": 8600
},
{
"epoch": 19.303576379262545,
"grad_norm": 0.9179806113243103,
"learning_rate": 5.914971657219073e-06,
"loss": 6.4019,
"step": 8700
},
{
"epoch": 19.303576379262545,
"eval_loss": 6.399080276489258,
"eval_runtime": 63.6271,
"eval_samples_per_second": 157.166,
"eval_steps_per_second": 19.646,
"step": 8700
},
{
"epoch": 19.52536734128084,
"grad_norm": 0.45992511510849,
"learning_rate": 5.913971323774591e-06,
"loss": 6.4,
"step": 8800
},
{
"epoch": 19.52536734128084,
"eval_loss": 6.403900623321533,
"eval_runtime": 63.7034,
"eval_samples_per_second": 156.977,
"eval_steps_per_second": 19.622,
"step": 8800
},
{
"epoch": 19.74715830329914,
"grad_norm": 0.702781081199646,
"learning_rate": 5.91297099033011e-06,
"loss": 6.3993,
"step": 8900
},
{
"epoch": 19.74715830329914,
"eval_loss": 6.401424884796143,
"eval_runtime": 66.2276,
"eval_samples_per_second": 150.994,
"eval_steps_per_second": 18.874,
"step": 8900
},
{
"epoch": 19.968949265317438,
"grad_norm": 0.6189502477645874,
"learning_rate": 5.911970656885629e-06,
"loss": 6.3999,
"step": 9000
},
{
"epoch": 19.968949265317438,
"eval_loss": 6.400846481323242,
"eval_runtime": 63.7467,
"eval_samples_per_second": 156.871,
"eval_steps_per_second": 19.609,
"step": 9000
},
{
"epoch": 20.190740227335738,
"grad_norm": 0.37635141611099243,
"learning_rate": 5.910970323441147e-06,
"loss": 6.3994,
"step": 9100
},
{
"epoch": 20.190740227335738,
"eval_loss": 6.402886867523193,
"eval_runtime": 63.6159,
"eval_samples_per_second": 157.193,
"eval_steps_per_second": 19.649,
"step": 9100
},
{
"epoch": 20.412531189354034,
"grad_norm": 0.5809453129768372,
"learning_rate": 5.909969989996666e-06,
"loss": 6.3996,
"step": 9200
},
{
"epoch": 20.412531189354034,
"eval_loss": 6.399085998535156,
"eval_runtime": 66.2096,
"eval_samples_per_second": 151.035,
"eval_steps_per_second": 18.879,
"step": 9200
},
{
"epoch": 20.63432215137233,
"grad_norm": 0.535410463809967,
"learning_rate": 5.908969656552185e-06,
"loss": 6.3985,
"step": 9300
},
{
"epoch": 20.63432215137233,
"eval_loss": 6.399356842041016,
"eval_runtime": 63.8098,
"eval_samples_per_second": 156.716,
"eval_steps_per_second": 19.589,
"step": 9300
},
{
"epoch": 20.85611311339063,
"grad_norm": 0.5065354108810425,
"learning_rate": 5.907969323107703e-06,
"loss": 6.3993,
"step": 9400
},
{
"epoch": 20.85611311339063,
"eval_loss": 6.401696681976318,
"eval_runtime": 63.6775,
"eval_samples_per_second": 157.041,
"eval_steps_per_second": 19.63,
"step": 9400
},
{
"epoch": 21.077904075408927,
"grad_norm": 0.4803392291069031,
"learning_rate": 5.906968989663221e-06,
"loss": 6.4003,
"step": 9500
},
{
"epoch": 21.077904075408927,
"eval_loss": 6.399422645568848,
"eval_runtime": 63.6426,
"eval_samples_per_second": 157.127,
"eval_steps_per_second": 19.641,
"step": 9500
},
{
"epoch": 21.299695037427224,
"grad_norm": 0.7447142004966736,
"learning_rate": 5.90596865621874e-06,
"loss": 6.3992,
"step": 9600
},
{
"epoch": 21.299695037427224,
"eval_loss": 6.397017002105713,
"eval_runtime": 66.4941,
"eval_samples_per_second": 150.389,
"eval_steps_per_second": 18.799,
"step": 9600
},
{
"epoch": 21.521485999445524,
"grad_norm": 0.2856753468513489,
"learning_rate": 5.904968322774258e-06,
"loss": 6.3999,
"step": 9700
},
{
"epoch": 21.521485999445524,
"eval_loss": 6.400000095367432,
"eval_runtime": 63.7186,
"eval_samples_per_second": 156.94,
"eval_steps_per_second": 19.618,
"step": 9700
},
{
"epoch": 21.74327696146382,
"grad_norm": 0.8077158331871033,
"learning_rate": 5.9039679893297766e-06,
"loss": 6.3981,
"step": 9800
},
{
"epoch": 21.74327696146382,
"eval_loss": 6.398531436920166,
"eval_runtime": 63.7668,
"eval_samples_per_second": 156.821,
"eval_steps_per_second": 19.603,
"step": 9800
},
{
"epoch": 21.965067923482117,
"grad_norm": 0.8744412660598755,
"learning_rate": 5.902967655885295e-06,
"loss": 6.3988,
"step": 9900
},
{
"epoch": 21.965067923482117,
"eval_loss": 6.396906852722168,
"eval_runtime": 66.2535,
"eval_samples_per_second": 150.935,
"eval_steps_per_second": 18.867,
"step": 9900
},
{
"epoch": 22.186858885500417,
"grad_norm": 0.44601574540138245,
"learning_rate": 5.901967322440814e-06,
"loss": 6.3969,
"step": 10000
},
{
"epoch": 22.186858885500417,
"eval_loss": 6.395452976226807,
"eval_runtime": 63.6969,
"eval_samples_per_second": 156.994,
"eval_steps_per_second": 19.624,
"step": 10000
},
{
"epoch": 22.408649847518713,
"grad_norm": 0.6895701289176941,
"learning_rate": 5.900966988996333e-06,
"loss": 6.3967,
"step": 10100
},
{
"epoch": 22.408649847518713,
"eval_loss": 6.40028190612793,
"eval_runtime": 63.7023,
"eval_samples_per_second": 156.98,
"eval_steps_per_second": 19.623,
"step": 10100
},
{
"epoch": 22.63044080953701,
"grad_norm": 0.6166660189628601,
"learning_rate": 5.8999666555518505e-06,
"loss": 6.3968,
"step": 10200
},
{
"epoch": 22.63044080953701,
"eval_loss": 6.397933483123779,
"eval_runtime": 66.8627,
"eval_samples_per_second": 149.56,
"eval_steps_per_second": 18.695,
"step": 10200
},
{
"epoch": 22.85223177155531,
"grad_norm": 1.0633758306503296,
"learning_rate": 5.898966322107369e-06,
"loss": 6.3976,
"step": 10300
},
{
"epoch": 22.85223177155531,
"eval_loss": 6.396650791168213,
"eval_runtime": 63.7935,
"eval_samples_per_second": 156.756,
"eval_steps_per_second": 19.594,
"step": 10300
},
{
"epoch": 23.074022733573607,
"grad_norm": 0.4864283502101898,
"learning_rate": 5.897965988662888e-06,
"loss": 6.3967,
"step": 10400
},
{
"epoch": 23.074022733573607,
"eval_loss": 6.39711332321167,
"eval_runtime": 63.6284,
"eval_samples_per_second": 157.163,
"eval_steps_per_second": 19.645,
"step": 10400
},
{
"epoch": 23.295813695591903,
"grad_norm": 0.65082186460495,
"learning_rate": 5.896965655218406e-06,
"loss": 6.3973,
"step": 10500
},
{
"epoch": 23.295813695591903,
"eval_loss": 6.395853519439697,
"eval_runtime": 66.242,
"eval_samples_per_second": 150.962,
"eval_steps_per_second": 18.87,
"step": 10500
},
{
"epoch": 23.517604657610203,
"grad_norm": 0.45799535512924194,
"learning_rate": 5.8959653217739245e-06,
"loss": 6.396,
"step": 10600
},
{
"epoch": 23.517604657610203,
"eval_loss": 6.398243427276611,
"eval_runtime": 63.7686,
"eval_samples_per_second": 156.817,
"eval_steps_per_second": 19.602,
"step": 10600
},
{
"epoch": 23.7393956196285,
"grad_norm": 0.5860775709152222,
"learning_rate": 5.894964988329443e-06,
"loss": 6.3956,
"step": 10700
},
{
"epoch": 23.7393956196285,
"eval_loss": 6.3961687088012695,
"eval_runtime": 67.0182,
"eval_samples_per_second": 149.213,
"eval_steps_per_second": 18.652,
"step": 10700
},
{
"epoch": 23.9611865816468,
"grad_norm": 0.5584791898727417,
"learning_rate": 5.893964654884962e-06,
"loss": 6.3957,
"step": 10800
},
{
"epoch": 23.9611865816468,
"eval_loss": 6.396393775939941,
"eval_runtime": 63.8981,
"eval_samples_per_second": 156.499,
"eval_steps_per_second": 19.562,
"step": 10800
},
{
"epoch": 24.182977543665096,
"grad_norm": 0.7845295667648315,
"learning_rate": 5.892964321440481e-06,
"loss": 6.3956,
"step": 10900
},
{
"epoch": 24.182977543665096,
"eval_loss": 6.397210121154785,
"eval_runtime": 64.0302,
"eval_samples_per_second": 156.176,
"eval_steps_per_second": 19.522,
"step": 10900
},
{
"epoch": 24.404768505683393,
"grad_norm": 0.564857006072998,
"learning_rate": 5.8919639879959985e-06,
"loss": 6.3955,
"step": 11000
},
{
"epoch": 24.404768505683393,
"eval_loss": 6.395459175109863,
"eval_runtime": 67.2462,
"eval_samples_per_second": 148.707,
"eval_steps_per_second": 18.588,
"step": 11000
},
{
"epoch": 24.665372886054893,
"grad_norm": 0.7520161271095276,
"learning_rate": 4.906354515050168e-06,
"loss": 6.3944,
"step": 11100
},
{
"epoch": 24.665372886054893,
"eval_loss": 6.389779567718506,
"eval_runtime": 87.8112,
"eval_samples_per_second": 113.881,
"eval_steps_per_second": 14.235,
"step": 11100
},
{
"epoch": 24.88716384807319,
"grad_norm": 0.6003276705741882,
"learning_rate": 4.8963210702341136e-06,
"loss": 6.394,
"step": 11200
},
{
"epoch": 24.88716384807319,
"eval_loss": 6.394806861877441,
"eval_runtime": 75.8812,
"eval_samples_per_second": 131.785,
"eval_steps_per_second": 16.473,
"step": 11200
},
{
"epoch": 25.10895481009149,
"grad_norm": 0.28259870409965515,
"learning_rate": 4.88628762541806e-06,
"loss": 6.3945,
"step": 11300
},
{
"epoch": 25.10895481009149,
"eval_loss": 6.398300647735596,
"eval_runtime": 88.2774,
"eval_samples_per_second": 113.279,
"eval_steps_per_second": 14.16,
"step": 11300
},
{
"epoch": 25.330745772109786,
"grad_norm": 0.30802807211875916,
"learning_rate": 4.876254180602007e-06,
"loss": 6.3941,
"step": 11400
},
{
"epoch": 25.330745772109786,
"eval_loss": 6.394501686096191,
"eval_runtime": 66.156,
"eval_samples_per_second": 151.158,
"eval_steps_per_second": 18.895,
"step": 11400
},
{
"epoch": 25.552536734128083,
"grad_norm": 0.5175557732582092,
"learning_rate": 4.866220735785953e-06,
"loss": 6.394,
"step": 11500
},
{
"epoch": 25.552536734128083,
"eval_loss": 6.3985795974731445,
"eval_runtime": 63.6993,
"eval_samples_per_second": 156.988,
"eval_steps_per_second": 19.623,
"step": 11500
},
{
"epoch": 25.774327696146383,
"grad_norm": 0.5214359164237976,
"learning_rate": 4.8561872909699e-06,
"loss": 6.3942,
"step": 11600
},
{
"epoch": 25.774327696146383,
"eval_loss": 6.391521453857422,
"eval_runtime": 63.6987,
"eval_samples_per_second": 156.989,
"eval_steps_per_second": 19.624,
"step": 11600
},
{
"epoch": 25.99611865816468,
"grad_norm": 0.5827904343605042,
"learning_rate": 4.8461538461538465e-06,
"loss": 6.3953,
"step": 11700
},
{
"epoch": 25.99611865816468,
"eval_loss": 6.393467903137207,
"eval_runtime": 66.2727,
"eval_samples_per_second": 150.892,
"eval_steps_per_second": 18.861,
"step": 11700
},
{
"epoch": 26.21790962018298,
"grad_norm": 0.24229009449481964,
"learning_rate": 4.8361204013377925e-06,
"loss": 6.3945,
"step": 11800
},
{
"epoch": 26.21790962018298,
"eval_loss": 6.39454460144043,
"eval_runtime": 63.6782,
"eval_samples_per_second": 157.04,
"eval_steps_per_second": 19.63,
"step": 11800
},
{
"epoch": 26.439700582201276,
"grad_norm": 0.6859923005104065,
"learning_rate": 4.826086956521739e-06,
"loss": 6.3929,
"step": 11900
},
{
"epoch": 26.439700582201276,
"eval_loss": 6.394321918487549,
"eval_runtime": 66.2701,
"eval_samples_per_second": 150.898,
"eval_steps_per_second": 18.862,
"step": 11900
},
{
"epoch": 26.661491544219572,
"grad_norm": 0.4267604947090149,
"learning_rate": 4.816053511705686e-06,
"loss": 6.3941,
"step": 12000
},
{
"epoch": 26.661491544219572,
"eval_loss": 6.394528865814209,
"eval_runtime": 63.7313,
"eval_samples_per_second": 156.909,
"eval_steps_per_second": 19.614,
"step": 12000
},
{
"epoch": 26.883282506237872,
"grad_norm": 0.43895894289016724,
"learning_rate": 4.806020066889633e-06,
"loss": 6.3929,
"step": 12100
},
{
"epoch": 26.883282506237872,
"eval_loss": 6.3936076164245605,
"eval_runtime": 66.3275,
"eval_samples_per_second": 150.767,
"eval_steps_per_second": 18.846,
"step": 12100
},
{
"epoch": 27.10507346825617,
"grad_norm": 0.3438960015773773,
"learning_rate": 4.795986622073579e-06,
"loss": 6.3933,
"step": 12200
},
{
"epoch": 27.10507346825617,
"eval_loss": 6.397474765777588,
"eval_runtime": 63.621,
"eval_samples_per_second": 157.181,
"eval_steps_per_second": 19.648,
"step": 12200
},
{
"epoch": 27.326864430274465,
"grad_norm": 0.5950188636779785,
"learning_rate": 4.785953177257525e-06,
"loss": 6.394,
"step": 12300
},
{
"epoch": 27.326864430274465,
"eval_loss": 6.393238544464111,
"eval_runtime": 63.6999,
"eval_samples_per_second": 156.986,
"eval_steps_per_second": 19.623,
"step": 12300
},
{
"epoch": 27.548655392292765,
"grad_norm": 0.34001484513282776,
"learning_rate": 4.775919732441472e-06,
"loss": 6.3947,
"step": 12400
},
{
"epoch": 27.548655392292765,
"eval_loss": 6.394363880157471,
"eval_runtime": 66.2457,
"eval_samples_per_second": 150.953,
"eval_steps_per_second": 18.869,
"step": 12400
},
{
"epoch": 27.770446354311062,
"grad_norm": 0.47045424580574036,
"learning_rate": 4.765886287625418e-06,
"loss": 6.3929,
"step": 12500
},
{
"epoch": 27.770446354311062,
"eval_loss": 6.393606185913086,
"eval_runtime": 63.7187,
"eval_samples_per_second": 156.94,
"eval_steps_per_second": 19.617,
"step": 12500
},
{
"epoch": 27.99223731632936,
"grad_norm": 0.6604583859443665,
"learning_rate": 4.755852842809365e-06,
"loss": 6.3931,
"step": 12600
},
{
"epoch": 27.99223731632936,
"eval_loss": 6.39324426651001,
"eval_runtime": 63.6887,
"eval_samples_per_second": 157.014,
"eval_steps_per_second": 19.627,
"step": 12600
},
{
"epoch": 28.21402827834766,
"grad_norm": 0.6491646766662598,
"learning_rate": 4.745819397993312e-06,
"loss": 6.3912,
"step": 12700
},
{
"epoch": 28.21402827834766,
"eval_loss": 6.394981384277344,
"eval_runtime": 66.2742,
"eval_samples_per_second": 150.888,
"eval_steps_per_second": 18.861,
"step": 12700
},
{
"epoch": 28.435819240365955,
"grad_norm": 0.5381952524185181,
"learning_rate": 4.7357859531772575e-06,
"loss": 6.3929,
"step": 12800
},
{
"epoch": 28.435819240365955,
"eval_loss": 6.392743110656738,
"eval_runtime": 63.6892,
"eval_samples_per_second": 157.012,
"eval_steps_per_second": 19.627,
"step": 12800
},
{
"epoch": 28.65761020238425,
"grad_norm": 0.7769903540611267,
"learning_rate": 4.725752508361204e-06,
"loss": 6.3927,
"step": 12900
},
{
"epoch": 28.65761020238425,
"eval_loss": 6.390952110290527,
"eval_runtime": 66.3226,
"eval_samples_per_second": 150.778,
"eval_steps_per_second": 18.847,
"step": 12900
},
{
"epoch": 28.87940116440255,
"grad_norm": 0.4297138452529907,
"learning_rate": 4.715719063545151e-06,
"loss": 6.393,
"step": 13000
},
{
"epoch": 28.87940116440255,
"eval_loss": 6.390758037567139,
"eval_runtime": 63.8216,
"eval_samples_per_second": 156.687,
"eval_steps_per_second": 19.586,
"step": 13000
},
{
"epoch": 29.101192126420848,
"grad_norm": 0.7731721997261047,
"learning_rate": 4.705685618729097e-06,
"loss": 6.3923,
"step": 13100
},
{
"epoch": 29.101192126420848,
"eval_loss": 6.392960071563721,
"eval_runtime": 63.6867,
"eval_samples_per_second": 157.019,
"eval_steps_per_second": 19.627,
"step": 13100
},
{
"epoch": 29.322983088439145,
"grad_norm": 0.27714040875434875,
"learning_rate": 4.695652173913044e-06,
"loss": 6.3934,
"step": 13200
},
{
"epoch": 29.322983088439145,
"eval_loss": 6.395288944244385,
"eval_runtime": 66.2909,
"eval_samples_per_second": 150.85,
"eval_steps_per_second": 18.856,
"step": 13200
},
{
"epoch": 29.544774050457445,
"grad_norm": 0.5391174554824829,
"learning_rate": 4.6856187290969905e-06,
"loss": 6.3927,
"step": 13300
},
{
"epoch": 29.544774050457445,
"eval_loss": 6.395300388336182,
"eval_runtime": 63.6935,
"eval_samples_per_second": 157.002,
"eval_steps_per_second": 19.625,
"step": 13300
},
{
"epoch": 29.76656501247574,
"grad_norm": 0.9717122912406921,
"learning_rate": 4.675585284280936e-06,
"loss": 6.391,
"step": 13400
},
{
"epoch": 29.76656501247574,
"eval_loss": 6.3939642906188965,
"eval_runtime": 64.4676,
"eval_samples_per_second": 155.117,
"eval_steps_per_second": 19.39,
"step": 13400
},
{
"epoch": 29.988355974494038,
"grad_norm": 0.3409580588340759,
"learning_rate": 4.665551839464883e-06,
"loss": 6.3929,
"step": 13500
},
{
"epoch": 29.988355974494038,
"eval_loss": 6.393261909484863,
"eval_runtime": 65.5531,
"eval_samples_per_second": 152.548,
"eval_steps_per_second": 19.069,
"step": 13500
},
{
"epoch": 30.210146936512338,
"grad_norm": 0.7017607092857361,
"learning_rate": 4.65551839464883e-06,
"loss": 6.3914,
"step": 13600
},
{
"epoch": 30.210146936512338,
"eval_loss": 6.389814853668213,
"eval_runtime": 63.5889,
"eval_samples_per_second": 157.26,
"eval_steps_per_second": 19.658,
"step": 13600
},
{
"epoch": 30.431937898530634,
"grad_norm": 0.494228720664978,
"learning_rate": 4.645484949832776e-06,
"loss": 6.3913,
"step": 13700
},
{
"epoch": 30.431937898530634,
"eval_loss": 6.389814853668213,
"eval_runtime": 63.6983,
"eval_samples_per_second": 156.99,
"eval_steps_per_second": 19.624,
"step": 13700
},
{
"epoch": 30.65372886054893,
"grad_norm": 0.6848724484443665,
"learning_rate": 4.635451505016723e-06,
"loss": 6.3909,
"step": 13800
},
{
"epoch": 30.65372886054893,
"eval_loss": 6.391334533691406,
"eval_runtime": 66.3245,
"eval_samples_per_second": 150.774,
"eval_steps_per_second": 18.847,
"step": 13800
},
{
"epoch": 30.87551982256723,
"grad_norm": 0.5187550187110901,
"learning_rate": 4.625418060200669e-06,
"loss": 6.3905,
"step": 13900
},
{
"epoch": 30.87551982256723,
"eval_loss": 6.393035411834717,
"eval_runtime": 63.667,
"eval_samples_per_second": 157.067,
"eval_steps_per_second": 19.633,
"step": 13900
},
{
"epoch": 31.097310784585527,
"grad_norm": 0.4394451081752777,
"learning_rate": 4.615384615384616e-06,
"loss": 6.3902,
"step": 14000
},
{
"epoch": 31.097310784585527,
"eval_loss": 6.391651630401611,
"eval_runtime": 66.2607,
"eval_samples_per_second": 150.919,
"eval_steps_per_second": 18.865,
"step": 14000
},
{
"epoch": 31.319101746603828,
"grad_norm": 0.6403105854988098,
"learning_rate": 4.605351170568562e-06,
"loss": 6.3904,
"step": 14100
},
{
"epoch": 31.319101746603828,
"eval_loss": 6.390075206756592,
"eval_runtime": 63.7818,
"eval_samples_per_second": 156.785,
"eval_steps_per_second": 19.598,
"step": 14100
},
{
"epoch": 31.540892708622124,
"grad_norm": 0.41991308331489563,
"learning_rate": 4.595317725752509e-06,
"loss": 6.3915,
"step": 14200
},
{
"epoch": 31.540892708622124,
"eval_loss": 6.390388488769531,
"eval_runtime": 66.3061,
"eval_samples_per_second": 150.816,
"eval_steps_per_second": 18.852,
"step": 14200
},
{
"epoch": 31.76268367064042,
"grad_norm": 0.5049502849578857,
"learning_rate": 4.585284280936456e-06,
"loss": 6.3901,
"step": 14300
},
{
"epoch": 31.76268367064042,
"eval_loss": 6.394845485687256,
"eval_runtime": 63.7361,
"eval_samples_per_second": 156.897,
"eval_steps_per_second": 19.612,
"step": 14300
},
{
"epoch": 31.98447463265872,
"grad_norm": 0.5375522375106812,
"learning_rate": 4.5752508361204015e-06,
"loss": 6.3901,
"step": 14400
},
{
"epoch": 31.98447463265872,
"eval_loss": 6.3919267654418945,
"eval_runtime": 63.6609,
"eval_samples_per_second": 157.082,
"eval_steps_per_second": 19.635,
"step": 14400
},
{
"epoch": 32.206265594677014,
"grad_norm": 0.6649445295333862,
"learning_rate": 4.565217391304348e-06,
"loss": 6.3897,
"step": 14500
},
{
"epoch": 32.206265594677014,
"eval_loss": 6.391171932220459,
"eval_runtime": 66.188,
"eval_samples_per_second": 151.085,
"eval_steps_per_second": 18.886,
"step": 14500
},
{
"epoch": 32.42805655669532,
"grad_norm": 0.5367133021354675,
"learning_rate": 4.555183946488295e-06,
"loss": 6.3903,
"step": 14600
},
{
"epoch": 32.42805655669532,
"eval_loss": 6.390655517578125,
"eval_runtime": 63.747,
"eval_samples_per_second": 156.87,
"eval_steps_per_second": 19.609,
"step": 14600
},
{
"epoch": 32.649847518713614,
"grad_norm": 0.5683135986328125,
"learning_rate": 4.545150501672241e-06,
"loss": 6.3881,
"step": 14700
},
{
"epoch": 32.649847518713614,
"eval_loss": 6.387674808502197,
"eval_runtime": 63.678,
"eval_samples_per_second": 157.04,
"eval_steps_per_second": 19.63,
"step": 14700
},
{
"epoch": 32.87163848073191,
"grad_norm": 0.697325587272644,
"learning_rate": 4.535117056856188e-06,
"loss": 6.3908,
"step": 14800
},
{
"epoch": 32.87163848073191,
"eval_loss": 6.393805027008057,
"eval_runtime": 63.7212,
"eval_samples_per_second": 156.934,
"eval_steps_per_second": 19.617,
"step": 14800
},
{
"epoch": 33.09342944275021,
"grad_norm": 0.5757908225059509,
"learning_rate": 4.5250836120401345e-06,
"loss": 6.3907,
"step": 14900
},
{
"epoch": 33.09342944275021,
"eval_loss": 6.393499851226807,
"eval_runtime": 66.2096,
"eval_samples_per_second": 151.035,
"eval_steps_per_second": 18.879,
"step": 14900
},
{
"epoch": 33.3152204047685,
"grad_norm": 0.3517054319381714,
"learning_rate": 4.51505016722408e-06,
"loss": 6.3902,
"step": 15000
},
{
"epoch": 33.3152204047685,
"eval_loss": 6.386899471282959,
"eval_runtime": 63.7082,
"eval_samples_per_second": 156.966,
"eval_steps_per_second": 19.621,
"step": 15000
},
{
"epoch": 33.53701136678681,
"grad_norm": 0.7311076521873474,
"learning_rate": 4.505016722408027e-06,
"loss": 6.3905,
"step": 15100
},
{
"epoch": 33.53701136678681,
"eval_loss": 6.391955375671387,
"eval_runtime": 63.6711,
"eval_samples_per_second": 157.057,
"eval_steps_per_second": 19.632,
"step": 15100
},
{
"epoch": 33.7588023288051,
"grad_norm": 0.4526328444480896,
"learning_rate": 4.494983277591973e-06,
"loss": 6.3891,
"step": 15200
},
{
"epoch": 33.7588023288051,
"eval_loss": 6.390474796295166,
"eval_runtime": 66.2489,
"eval_samples_per_second": 150.946,
"eval_steps_per_second": 18.868,
"step": 15200
},
{
"epoch": 33.9805932908234,
"grad_norm": 0.5623629093170166,
"learning_rate": 4.48494983277592e-06,
"loss": 6.3901,
"step": 15300
},
{
"epoch": 33.9805932908234,
"eval_loss": 6.388679027557373,
"eval_runtime": 63.6854,
"eval_samples_per_second": 157.022,
"eval_steps_per_second": 19.628,
"step": 15300
},
{
"epoch": 34.202384252841696,
"grad_norm": 0.49122416973114014,
"learning_rate": 4.474916387959866e-06,
"loss": 6.389,
"step": 15400
},
{
"epoch": 34.202384252841696,
"eval_loss": 6.39013671875,
"eval_runtime": 63.5858,
"eval_samples_per_second": 157.268,
"eval_steps_per_second": 19.658,
"step": 15400
},
{
"epoch": 34.42417521485999,
"grad_norm": 0.674659013748169,
"learning_rate": 4.4648829431438125e-06,
"loss": 6.3887,
"step": 15500
},
{
"epoch": 34.42417521485999,
"eval_loss": 6.392813205718994,
"eval_runtime": 66.2307,
"eval_samples_per_second": 150.987,
"eval_steps_per_second": 18.873,
"step": 15500
},
{
"epoch": 34.64596617687829,
"grad_norm": 0.43613201379776,
"learning_rate": 4.454849498327759e-06,
"loss": 6.3889,
"step": 15600
},
{
"epoch": 34.64596617687829,
"eval_loss": 6.388660907745361,
"eval_runtime": 63.6774,
"eval_samples_per_second": 157.042,
"eval_steps_per_second": 19.63,
"step": 15600
},
{
"epoch": 34.86775713889659,
"grad_norm": 0.737578272819519,
"learning_rate": 4.444816053511705e-06,
"loss": 6.3894,
"step": 15700
},
{
"epoch": 34.86775713889659,
"eval_loss": 6.389644145965576,
"eval_runtime": 63.7079,
"eval_samples_per_second": 156.966,
"eval_steps_per_second": 19.621,
"step": 15700
},
{
"epoch": 35.08954810091489,
"grad_norm": 0.4716251790523529,
"learning_rate": 4.434782608695652e-06,
"loss": 6.3885,
"step": 15800
},
{
"epoch": 35.08954810091489,
"eval_loss": 6.392263412475586,
"eval_runtime": 66.1971,
"eval_samples_per_second": 151.064,
"eval_steps_per_second": 18.883,
"step": 15800
},
{
"epoch": 35.311339062933186,
"grad_norm": 0.47875767946243286,
"learning_rate": 4.424749163879599e-06,
"loss": 6.3886,
"step": 15900
},
{
"epoch": 35.311339062933186,
"eval_loss": 6.389831066131592,
"eval_runtime": 63.6821,
"eval_samples_per_second": 157.03,
"eval_steps_per_second": 19.629,
"step": 15900
},
{
"epoch": 35.53313002495148,
"grad_norm": 0.43402403593063354,
"learning_rate": 4.414715719063545e-06,
"loss": 6.3909,
"step": 16000
},
{
"epoch": 35.53313002495148,
"eval_loss": 6.389725208282471,
"eval_runtime": 63.7124,
"eval_samples_per_second": 156.955,
"eval_steps_per_second": 19.619,
"step": 16000
},
{
"epoch": 35.75492098696978,
"grad_norm": 0.5011460781097412,
"learning_rate": 4.404682274247491e-06,
"loss": 6.3891,
"step": 16100
},
{
"epoch": 35.75492098696978,
"eval_loss": 6.388359546661377,
"eval_runtime": 66.2636,
"eval_samples_per_second": 150.912,
"eval_steps_per_second": 18.864,
"step": 16100
},
{
"epoch": 35.976711948988076,
"grad_norm": 0.4029878079891205,
"learning_rate": 4.394648829431438e-06,
"loss": 6.3875,
"step": 16200
},
{
"epoch": 35.976711948988076,
"eval_loss": 6.387814044952393,
"eval_runtime": 63.7085,
"eval_samples_per_second": 156.965,
"eval_steps_per_second": 19.621,
"step": 16200
},
{
"epoch": 36.19850291100638,
"grad_norm": 0.5763450264930725,
"learning_rate": 4.384615384615384e-06,
"loss": 6.3889,
"step": 16300
},
{
"epoch": 36.19850291100638,
"eval_loss": 6.389321327209473,
"eval_runtime": 65.8717,
"eval_samples_per_second": 151.81,
"eval_steps_per_second": 18.976,
"step": 16300
},
{
"epoch": 36.420293873024676,
"grad_norm": 0.4742737412452698,
"learning_rate": 4.374581939799331e-06,
"loss": 6.3886,
"step": 16400
},
{
"epoch": 36.420293873024676,
"eval_loss": 6.388833522796631,
"eval_runtime": 63.752,
"eval_samples_per_second": 156.858,
"eval_steps_per_second": 19.607,
"step": 16400
},
{
"epoch": 36.64208483504297,
"grad_norm": 0.4631459414958954,
"learning_rate": 4.364548494983278e-06,
"loss": 6.3886,
"step": 16500
},
{
"epoch": 36.64208483504297,
"eval_loss": 6.387075901031494,
"eval_runtime": 63.6816,
"eval_samples_per_second": 157.031,
"eval_steps_per_second": 19.629,
"step": 16500
},
{
"epoch": 36.86387579706127,
"grad_norm": 0.5047929286956787,
"learning_rate": 4.354515050167224e-06,
"loss": 6.3869,
"step": 16600
},
{
"epoch": 36.86387579706127,
"eval_loss": 6.39074182510376,
"eval_runtime": 64.7171,
"eval_samples_per_second": 154.519,
"eval_steps_per_second": 19.315,
"step": 16600
},
{
"epoch": 37.085666759079565,
"grad_norm": 0.45218634605407715,
"learning_rate": 4.34448160535117e-06,
"loss": 6.3894,
"step": 16700
},
{
"epoch": 37.085666759079565,
"eval_loss": 6.393436908721924,
"eval_runtime": 64.9705,
"eval_samples_per_second": 153.916,
"eval_steps_per_second": 19.24,
"step": 16700
},
{
"epoch": 37.30745772109786,
"grad_norm": 0.5652719736099243,
"learning_rate": 4.334448160535117e-06,
"loss": 6.3873,
"step": 16800
},
{
"epoch": 37.30745772109786,
"eval_loss": 6.391731262207031,
"eval_runtime": 63.565,
"eval_samples_per_second": 157.319,
"eval_steps_per_second": 19.665,
"step": 16800
},
{
"epoch": 37.529248683116165,
"grad_norm": 0.28403371572494507,
"learning_rate": 4.324414715719064e-06,
"loss": 6.3882,
"step": 16900
},
{
"epoch": 37.529248683116165,
"eval_loss": 6.390590190887451,
"eval_runtime": 63.6107,
"eval_samples_per_second": 157.206,
"eval_steps_per_second": 19.651,
"step": 16900
},
{
"epoch": 37.75103964513446,
"grad_norm": 0.477235347032547,
"learning_rate": 4.31438127090301e-06,
"loss": 6.3872,
"step": 17000
},
{
"epoch": 37.75103964513446,
"eval_loss": 6.390269756317139,
"eval_runtime": 66.2763,
"eval_samples_per_second": 150.884,
"eval_steps_per_second": 18.86,
"step": 17000
},
{
"epoch": 37.97283060715276,
"grad_norm": 0.37472817301750183,
"learning_rate": 4.3043478260869565e-06,
"loss": 6.3874,
"step": 17100
},
{
"epoch": 37.97283060715276,
"eval_loss": 6.390199184417725,
"eval_runtime": 63.6243,
"eval_samples_per_second": 157.173,
"eval_steps_per_second": 19.647,
"step": 17100
},
{
"epoch": 38.194621569171055,
"grad_norm": 0.3379691243171692,
"learning_rate": 4.294314381270903e-06,
"loss": 6.387,
"step": 17200
},
{
"epoch": 38.194621569171055,
"eval_loss": 6.386340618133545,
"eval_runtime": 63.5571,
"eval_samples_per_second": 157.339,
"eval_steps_per_second": 19.667,
"step": 17200
},
{
"epoch": 38.41641253118935,
"grad_norm": 0.46496257185935974,
"learning_rate": 4.284280936454849e-06,
"loss": 6.3856,
"step": 17300
},
{
"epoch": 38.41641253118935,
"eval_loss": 6.3855695724487305,
"eval_runtime": 65.9737,
"eval_samples_per_second": 151.576,
"eval_steps_per_second": 18.947,
"step": 17300
},
{
"epoch": 38.638203493207655,
"grad_norm": 0.37888166308403015,
"learning_rate": 4.274247491638796e-06,
"loss": 6.3884,
"step": 17400
},
{
"epoch": 38.638203493207655,
"eval_loss": 6.388376235961914,
"eval_runtime": 63.6302,
"eval_samples_per_second": 157.158,
"eval_steps_per_second": 19.645,
"step": 17400
},
{
"epoch": 38.85999445522595,
"grad_norm": 0.25813955068588257,
"learning_rate": 4.264214046822743e-06,
"loss": 6.3885,
"step": 17500
},
{
"epoch": 38.85999445522595,
"eval_loss": 6.389296054840088,
"eval_runtime": 63.7359,
"eval_samples_per_second": 156.897,
"eval_steps_per_second": 19.612,
"step": 17500
},
{
"epoch": 39.08178541724425,
"grad_norm": 0.4262288510799408,
"learning_rate": 4.254180602006689e-06,
"loss": 6.3873,
"step": 17600
},
{
"epoch": 39.08178541724425,
"eval_loss": 6.389705657958984,
"eval_runtime": 66.0125,
"eval_samples_per_second": 151.486,
"eval_steps_per_second": 18.936,
"step": 17600
},
{
"epoch": 39.303576379262545,
"grad_norm": 0.5291593074798584,
"learning_rate": 4.244147157190635e-06,
"loss": 6.3875,
"step": 17700
},
{
"epoch": 39.303576379262545,
"eval_loss": 6.390807628631592,
"eval_runtime": 63.619,
"eval_samples_per_second": 157.186,
"eval_steps_per_second": 19.648,
"step": 17700
},
{
"epoch": 39.52536734128084,
"grad_norm": 0.3667999505996704,
"learning_rate": 4.234113712374582e-06,
"loss": 6.3887,
"step": 17800
},
{
"epoch": 39.52536734128084,
"eval_loss": 6.3871259689331055,
"eval_runtime": 63.6878,
"eval_samples_per_second": 157.016,
"eval_steps_per_second": 19.627,
"step": 17800
},
{
"epoch": 39.74715830329914,
"grad_norm": 0.40572404861450195,
"learning_rate": 4.224080267558528e-06,
"loss": 6.3877,
"step": 17900
},
{
"epoch": 39.74715830329914,
"eval_loss": 6.387050628662109,
"eval_runtime": 66.0715,
"eval_samples_per_second": 151.351,
"eval_steps_per_second": 18.919,
"step": 17900
},
{
"epoch": 39.96894926531744,
"grad_norm": 0.5057101845741272,
"learning_rate": 4.214046822742475e-06,
"loss": 6.385,
"step": 18000
},
{
"epoch": 39.96894926531744,
"eval_loss": 6.388771057128906,
"eval_runtime": 63.6312,
"eval_samples_per_second": 157.156,
"eval_steps_per_second": 19.644,
"step": 18000
},
{
"epoch": 40.19074022733574,
"grad_norm": 0.5846272110939026,
"learning_rate": 4.2040133779264216e-06,
"loss": 6.3873,
"step": 18100
},
{
"epoch": 40.19074022733574,
"eval_loss": 6.388961315155029,
"eval_runtime": 63.6009,
"eval_samples_per_second": 157.23,
"eval_steps_per_second": 19.654,
"step": 18100
},
{
"epoch": 40.412531189354034,
"grad_norm": 0.40428778529167175,
"learning_rate": 4.1939799331103675e-06,
"loss": 6.3878,
"step": 18200
},
{
"epoch": 40.412531189354034,
"eval_loss": 6.392088413238525,
"eval_runtime": 66.128,
"eval_samples_per_second": 151.222,
"eval_steps_per_second": 18.903,
"step": 18200
},
{
"epoch": 40.63432215137233,
"grad_norm": 0.46563634276390076,
"learning_rate": 4.183946488294314e-06,
"loss": 6.386,
"step": 18300
},
{
"epoch": 40.63432215137233,
"eval_loss": 6.389146327972412,
"eval_runtime": 63.6612,
"eval_samples_per_second": 157.082,
"eval_steps_per_second": 19.635,
"step": 18300
},
{
"epoch": 40.85611311339063,
"grad_norm": 0.4533691704273224,
"learning_rate": 4.173913043478261e-06,
"loss": 6.3874,
"step": 18400
},
{
"epoch": 40.85611311339063,
"eval_loss": 6.386475086212158,
"eval_runtime": 63.7394,
"eval_samples_per_second": 156.889,
"eval_steps_per_second": 19.611,
"step": 18400
},
{
"epoch": 41.077904075408924,
"grad_norm": 0.38121113181114197,
"learning_rate": 4.163879598662208e-06,
"loss": 6.3862,
"step": 18500
},
{
"epoch": 41.077904075408924,
"eval_loss": 6.384340763092041,
"eval_runtime": 65.9841,
"eval_samples_per_second": 151.552,
"eval_steps_per_second": 18.944,
"step": 18500
},
{
"epoch": 41.29969503742723,
"grad_norm": 0.4599936604499817,
"learning_rate": 4.153846153846154e-06,
"loss": 6.3871,
"step": 18600
},
{
"epoch": 41.29969503742723,
"eval_loss": 6.38564395904541,
"eval_runtime": 63.6008,
"eval_samples_per_second": 157.231,
"eval_steps_per_second": 19.654,
"step": 18600
},
{
"epoch": 41.521485999445524,
"grad_norm": 0.6862403154373169,
"learning_rate": 4.1438127090301005e-06,
"loss": 6.3867,
"step": 18700
},
{
"epoch": 41.521485999445524,
"eval_loss": 6.385303020477295,
"eval_runtime": 63.6207,
"eval_samples_per_second": 157.181,
"eval_steps_per_second": 19.648,
"step": 18700
},
{
"epoch": 41.74327696146382,
"grad_norm": 0.26633918285369873,
"learning_rate": 4.133779264214047e-06,
"loss": 6.3869,
"step": 18800
},
{
"epoch": 41.74327696146382,
"eval_loss": 6.389577388763428,
"eval_runtime": 66.0775,
"eval_samples_per_second": 151.337,
"eval_steps_per_second": 18.917,
"step": 18800
},
{
"epoch": 41.96506792348212,
"grad_norm": 0.30118024349212646,
"learning_rate": 4.123745819397993e-06,
"loss": 6.3869,
"step": 18900
},
{
"epoch": 41.96506792348212,
"eval_loss": 6.387940406799316,
"eval_runtime": 63.6813,
"eval_samples_per_second": 157.032,
"eval_steps_per_second": 19.629,
"step": 18900
},
{
"epoch": 42.18685888550041,
"grad_norm": 0.6833294630050659,
"learning_rate": 4.11371237458194e-06,
"loss": 6.3857,
"step": 19000
},
{
"epoch": 42.18685888550041,
"eval_loss": 6.3908514976501465,
"eval_runtime": 66.0844,
"eval_samples_per_second": 151.322,
"eval_steps_per_second": 18.915,
"step": 19000
},
{
"epoch": 42.40864984751872,
"grad_norm": 0.35510268807411194,
"learning_rate": 4.103678929765887e-06,
"loss": 6.3862,
"step": 19100
},
{
"epoch": 42.40864984751872,
"eval_loss": 6.3866119384765625,
"eval_runtime": 63.7625,
"eval_samples_per_second": 156.832,
"eval_steps_per_second": 19.604,
"step": 19100
},
{
"epoch": 42.63044080953701,
"grad_norm": 0.5903100371360779,
"learning_rate": 4.0936454849498326e-06,
"loss": 6.3857,
"step": 19200
},
{
"epoch": 42.63044080953701,
"eval_loss": 6.385927677154541,
"eval_runtime": 63.6174,
"eval_samples_per_second": 157.19,
"eval_steps_per_second": 19.649,
"step": 19200
},
{
"epoch": 42.85223177155531,
"grad_norm": 0.4845108091831207,
"learning_rate": 4.083612040133779e-06,
"loss": 6.387,
"step": 19300
},
{
"epoch": 42.85223177155531,
"eval_loss": 6.38942289352417,
"eval_runtime": 66.1264,
"eval_samples_per_second": 151.226,
"eval_steps_per_second": 18.903,
"step": 19300
},
{
"epoch": 43.07402273357361,
"grad_norm": 0.3592558801174164,
"learning_rate": 4.073578595317726e-06,
"loss": 6.3862,
"step": 19400
},
{
"epoch": 43.07402273357361,
"eval_loss": 6.389144420623779,
"eval_runtime": 63.5655,
"eval_samples_per_second": 157.318,
"eval_steps_per_second": 19.665,
"step": 19400
},
{
"epoch": 43.2958136955919,
"grad_norm": 0.5529589056968689,
"learning_rate": 4.063545150501672e-06,
"loss": 6.3842,
"step": 19500
},
{
"epoch": 43.2958136955919,
"eval_loss": 6.386436939239502,
"eval_runtime": 66.2264,
"eval_samples_per_second": 150.997,
"eval_steps_per_second": 18.875,
"step": 19500
},
{
"epoch": 43.5176046576102,
"grad_norm": 0.42238518595695496,
"learning_rate": 4.053511705685619e-06,
"loss": 6.3866,
"step": 19600
},
{
"epoch": 43.5176046576102,
"eval_loss": 6.385384559631348,
"eval_runtime": 63.7683,
"eval_samples_per_second": 156.818,
"eval_steps_per_second": 19.602,
"step": 19600
},
{
"epoch": 43.7393956196285,
"grad_norm": 0.5223355293273926,
"learning_rate": 4.0434782608695655e-06,
"loss": 6.3853,
"step": 19700
},
{
"epoch": 43.7393956196285,
"eval_loss": 6.385824203491211,
"eval_runtime": 63.6506,
"eval_samples_per_second": 157.108,
"eval_steps_per_second": 19.638,
"step": 19700
},
{
"epoch": 43.9611865816468,
"grad_norm": 0.46218928694725037,
"learning_rate": 4.0334448160535115e-06,
"loss": 6.387,
"step": 19800
},
{
"epoch": 43.9611865816468,
"eval_loss": 6.38681697845459,
"eval_runtime": 66.1858,
"eval_samples_per_second": 151.09,
"eval_steps_per_second": 18.886,
"step": 19800
},
{
"epoch": 44.182977543665096,
"grad_norm": 0.3450022041797638,
"learning_rate": 4.023411371237458e-06,
"loss": 6.3845,
"step": 19900
},
{
"epoch": 44.182977543665096,
"eval_loss": 6.386622428894043,
"eval_runtime": 63.5361,
"eval_samples_per_second": 157.391,
"eval_steps_per_second": 19.674,
"step": 19900
},
{
"epoch": 44.40476850568339,
"grad_norm": 0.39958134293556213,
"learning_rate": 4.013377926421405e-06,
"loss": 6.3863,
"step": 20000
},
{
"epoch": 44.40476850568339,
"eval_loss": 6.387628555297852,
"eval_runtime": 63.6316,
"eval_samples_per_second": 157.155,
"eval_steps_per_second": 19.644,
"step": 20000
},
{
"epoch": 44.62655946770169,
"grad_norm": 0.28472310304641724,
"learning_rate": 4.003344481605351e-06,
"loss": 6.3851,
"step": 20100
},
{
"epoch": 44.62655946770169,
"eval_loss": 6.388401031494141,
"eval_runtime": 63.6958,
"eval_samples_per_second": 156.996,
"eval_steps_per_second": 19.625,
"step": 20100
},
{
"epoch": 44.848350429719986,
"grad_norm": 0.39134547114372253,
"learning_rate": 3.993311036789298e-06,
"loss": 6.3849,
"step": 20200
},
{
"epoch": 44.848350429719986,
"eval_loss": 6.389621734619141,
"eval_runtime": 66.137,
"eval_samples_per_second": 151.201,
"eval_steps_per_second": 18.9,
"step": 20200
},
{
"epoch": 45.07014139173829,
"grad_norm": 0.5134591460227966,
"learning_rate": 3.9832775919732444e-06,
"loss": 6.3847,
"step": 20300
},
{
"epoch": 45.07014139173829,
"eval_loss": 6.387813568115234,
"eval_runtime": 63.5686,
"eval_samples_per_second": 157.31,
"eval_steps_per_second": 19.664,
"step": 20300
},
{
"epoch": 45.291932353756586,
"grad_norm": 0.2885007858276367,
"learning_rate": 3.97324414715719e-06,
"loss": 6.3865,
"step": 20400
},
{
"epoch": 45.291932353756586,
"eval_loss": 6.389806270599365,
"eval_runtime": 63.5893,
"eval_samples_per_second": 157.259,
"eval_steps_per_second": 19.657,
"step": 20400
},
{
"epoch": 45.51372331577488,
"grad_norm": 0.37093526124954224,
"learning_rate": 3.963210702341137e-06,
"loss": 6.3842,
"step": 20500
},
{
"epoch": 45.51372331577488,
"eval_loss": 6.386034965515137,
"eval_runtime": 66.1094,
"eval_samples_per_second": 151.265,
"eval_steps_per_second": 18.908,
"step": 20500
},
{
"epoch": 45.73551427779318,
"grad_norm": 0.4181094169616699,
"learning_rate": 3.953177257525084e-06,
"loss": 6.3827,
"step": 20600
},
{
"epoch": 45.73551427779318,
"eval_loss": 6.386598587036133,
"eval_runtime": 63.6628,
"eval_samples_per_second": 157.078,
"eval_steps_per_second": 19.635,
"step": 20600
},
{
"epoch": 45.957305239811475,
"grad_norm": 0.6212390661239624,
"learning_rate": 3.943143812709031e-06,
"loss": 6.3864,
"step": 20700
},
{
"epoch": 45.957305239811475,
"eval_loss": 6.3882646560668945,
"eval_runtime": 65.9973,
"eval_samples_per_second": 151.521,
"eval_steps_per_second": 18.94,
"step": 20700
},
{
"epoch": 46.17909620182977,
"grad_norm": 0.443857878446579,
"learning_rate": 3.9331103678929765e-06,
"loss": 6.3859,
"step": 20800
},
{
"epoch": 46.17909620182977,
"eval_loss": 6.388275623321533,
"eval_runtime": 63.7053,
"eval_samples_per_second": 156.973,
"eval_steps_per_second": 19.622,
"step": 20800
},
{
"epoch": 46.400887163848076,
"grad_norm": 0.2678993344306946,
"learning_rate": 3.923076923076923e-06,
"loss": 6.3865,
"step": 20900
},
{
"epoch": 46.400887163848076,
"eval_loss": 6.38779354095459,
"eval_runtime": 63.6908,
"eval_samples_per_second": 157.009,
"eval_steps_per_second": 19.626,
"step": 20900
},
{
"epoch": 46.62267812586637,
"grad_norm": 0.35121896862983704,
"learning_rate": 3.91304347826087e-06,
"loss": 6.3842,
"step": 21000
},
{
"epoch": 46.62267812586637,
"eval_loss": 6.385668754577637,
"eval_runtime": 66.0547,
"eval_samples_per_second": 151.39,
"eval_steps_per_second": 18.924,
"step": 21000
},
{
"epoch": 46.84446908788467,
"grad_norm": 0.6166325807571411,
"learning_rate": 3.903010033444816e-06,
"loss": 6.3848,
"step": 21100
},
{
"epoch": 46.84446908788467,
"eval_loss": 6.385282516479492,
"eval_runtime": 63.6134,
"eval_samples_per_second": 157.2,
"eval_steps_per_second": 19.65,
"step": 21100
},
{
"epoch": 47.066260049902965,
"grad_norm": 0.5324620008468628,
"learning_rate": 3.892976588628763e-06,
"loss": 6.3847,
"step": 21200
},
{
"epoch": 47.066260049902965,
"eval_loss": 6.386166572570801,
"eval_runtime": 63.7747,
"eval_samples_per_second": 156.802,
"eval_steps_per_second": 19.6,
"step": 21200
},
{
"epoch": 47.28805101192126,
"grad_norm": 0.37806278467178345,
"learning_rate": 3.8829431438127095e-06,
"loss": 6.3847,
"step": 21300
},
{
"epoch": 47.28805101192126,
"eval_loss": 6.387280464172363,
"eval_runtime": 66.0795,
"eval_samples_per_second": 151.333,
"eval_steps_per_second": 18.917,
"step": 21300
},
{
"epoch": 47.509841973939565,
"grad_norm": 0.2344857156276703,
"learning_rate": 3.8729096989966554e-06,
"loss": 6.3851,
"step": 21400
},
{
"epoch": 47.509841973939565,
"eval_loss": 6.38550329208374,
"eval_runtime": 63.7048,
"eval_samples_per_second": 156.974,
"eval_steps_per_second": 19.622,
"step": 21400
},
{
"epoch": 47.73163293595786,
"grad_norm": 0.47279292345046997,
"learning_rate": 3.862876254180602e-06,
"loss": 6.3843,
"step": 21500
},
{
"epoch": 47.73163293595786,
"eval_loss": 6.390079021453857,
"eval_runtime": 63.7137,
"eval_samples_per_second": 156.952,
"eval_steps_per_second": 19.619,
"step": 21500
},
{
"epoch": 47.95342389797616,
"grad_norm": 0.5413157343864441,
"learning_rate": 3.852842809364549e-06,
"loss": 6.3844,
"step": 21600
},
{
"epoch": 47.95342389797616,
"eval_loss": 6.385741233825684,
"eval_runtime": 66.101,
"eval_samples_per_second": 151.284,
"eval_steps_per_second": 18.91,
"step": 21600
},
{
"epoch": 48.175214859994455,
"grad_norm": 0.48085787892341614,
"learning_rate": 3.842809364548495e-06,
"loss": 6.3851,
"step": 21700
},
{
"epoch": 48.175214859994455,
"eval_loss": 6.385941505432129,
"eval_runtime": 63.6718,
"eval_samples_per_second": 157.055,
"eval_steps_per_second": 19.632,
"step": 21700
},
{
"epoch": 48.39700582201275,
"grad_norm": 0.6270382404327393,
"learning_rate": 3.832775919732442e-06,
"loss": 6.3845,
"step": 21800
},
{
"epoch": 48.39700582201275,
"eval_loss": 6.387849807739258,
"eval_runtime": 66.1314,
"eval_samples_per_second": 151.214,
"eval_steps_per_second": 18.902,
"step": 21800
},
{
"epoch": 48.61879678403105,
"grad_norm": 0.36722734570503235,
"learning_rate": 3.822742474916388e-06,
"loss": 6.3848,
"step": 21900
},
{
"epoch": 48.61879678403105,
"eval_loss": 6.387927532196045,
"eval_runtime": 63.6715,
"eval_samples_per_second": 157.056,
"eval_steps_per_second": 19.632,
"step": 21900
},
{
"epoch": 48.84058774604935,
"grad_norm": 0.4715673327445984,
"learning_rate": 3.8127090301003347e-06,
"loss": 6.3828,
"step": 22000
},
{
"epoch": 48.84058774604935,
"eval_loss": 6.388005256652832,
"eval_runtime": 63.7564,
"eval_samples_per_second": 156.847,
"eval_steps_per_second": 19.606,
"step": 22000
},
{
"epoch": 49.06237870806765,
"grad_norm": 0.46226397156715393,
"learning_rate": 3.802675585284281e-06,
"loss": 6.3839,
"step": 22100
},
{
"epoch": 49.06237870806765,
"eval_loss": 6.386138439178467,
"eval_runtime": 65.9562,
"eval_samples_per_second": 151.616,
"eval_steps_per_second": 18.952,
"step": 22100
},
{
"epoch": 49.284169670085944,
"grad_norm": 0.48933687806129456,
"learning_rate": 3.792642140468228e-06,
"loss": 6.3835,
"step": 22200
},
{
"epoch": 49.284169670085944,
"eval_loss": 6.386913776397705,
"eval_runtime": 63.5702,
"eval_samples_per_second": 157.306,
"eval_steps_per_second": 19.663,
"step": 22200
},
{
"epoch": 49.50596063210424,
"grad_norm": 0.4057106375694275,
"learning_rate": 3.782608695652174e-06,
"loss": 6.3831,
"step": 22300
},
{
"epoch": 49.50596063210424,
"eval_loss": 6.3875555992126465,
"eval_runtime": 63.6283,
"eval_samples_per_second": 157.163,
"eval_steps_per_second": 19.645,
"step": 22300
},
{
"epoch": 49.72775159412254,
"grad_norm": 0.4397966265678406,
"learning_rate": 3.7725752508361205e-06,
"loss": 6.3847,
"step": 22400
},
{
"epoch": 49.72775159412254,
"eval_loss": 6.386244297027588,
"eval_runtime": 63.6792,
"eval_samples_per_second": 157.037,
"eval_steps_per_second": 19.63,
"step": 22400
},
{
"epoch": 49.949542556140834,
"grad_norm": 0.4629203677177429,
"learning_rate": 3.7625418060200673e-06,
"loss": 6.384,
"step": 22500
},
{
"epoch": 49.949542556140834,
"eval_loss": 6.386322498321533,
"eval_runtime": 66.1359,
"eval_samples_per_second": 151.204,
"eval_steps_per_second": 18.9,
"step": 22500
},
{
"epoch": 50.17133351815914,
"grad_norm": 0.43559348583221436,
"learning_rate": 3.7525083612040136e-06,
"loss": 6.3831,
"step": 22600
},
{
"epoch": 50.17133351815914,
"eval_loss": 6.386173248291016,
"eval_runtime": 63.6043,
"eval_samples_per_second": 157.222,
"eval_steps_per_second": 19.653,
"step": 22600
},
{
"epoch": 50.393124480177434,
"grad_norm": 0.3772810399532318,
"learning_rate": 3.74247491638796e-06,
"loss": 6.3836,
"step": 22700
},
{
"epoch": 50.393124480177434,
"eval_loss": 6.38073205947876,
"eval_runtime": 63.7199,
"eval_samples_per_second": 156.937,
"eval_steps_per_second": 19.617,
"step": 22700
},
{
"epoch": 50.61491544219573,
"grad_norm": 0.36232537031173706,
"learning_rate": 3.7324414715719067e-06,
"loss": 6.3837,
"step": 22800
},
{
"epoch": 50.61491544219573,
"eval_loss": 6.385157108306885,
"eval_runtime": 66.1214,
"eval_samples_per_second": 151.237,
"eval_steps_per_second": 18.905,
"step": 22800
},
{
"epoch": 50.83670640421403,
"grad_norm": 0.3568231165409088,
"learning_rate": 3.722408026755853e-06,
"loss": 6.3837,
"step": 22900
},
{
"epoch": 50.83670640421403,
"eval_loss": 6.388894081115723,
"eval_runtime": 63.6202,
"eval_samples_per_second": 157.183,
"eval_steps_per_second": 19.648,
"step": 22900
},
{
"epoch": 51.058497366232324,
"grad_norm": 0.5292544960975647,
"learning_rate": 3.7123745819398e-06,
"loss": 6.3824,
"step": 23000
},
{
"epoch": 51.058497366232324,
"eval_loss": 6.382253170013428,
"eval_runtime": 63.6223,
"eval_samples_per_second": 157.178,
"eval_steps_per_second": 19.647,
"step": 23000
},
{
"epoch": 51.28028832825063,
"grad_norm": 0.47718894481658936,
"learning_rate": 3.702341137123746e-06,
"loss": 6.3833,
"step": 23100
},
{
"epoch": 51.28028832825063,
"eval_loss": 6.389714241027832,
"eval_runtime": 66.0943,
"eval_samples_per_second": 151.299,
"eval_steps_per_second": 18.912,
"step": 23100
},
{
"epoch": 51.502079290268924,
"grad_norm": 0.2303953319787979,
"learning_rate": 3.6923076923076925e-06,
"loss": 6.3822,
"step": 23200
},
{
"epoch": 51.502079290268924,
"eval_loss": 6.384761810302734,
"eval_runtime": 63.6768,
"eval_samples_per_second": 157.043,
"eval_steps_per_second": 19.63,
"step": 23200
},
{
"epoch": 51.72387025228722,
"grad_norm": 0.4536280035972595,
"learning_rate": 3.6822742474916393e-06,
"loss": 6.3829,
"step": 23300
},
{
"epoch": 51.72387025228722,
"eval_loss": 6.38330078125,
"eval_runtime": 63.6407,
"eval_samples_per_second": 157.132,
"eval_steps_per_second": 19.642,
"step": 23300
},
{
"epoch": 51.94566121430552,
"grad_norm": 0.36595970392227173,
"learning_rate": 3.6722408026755856e-06,
"loss": 6.3839,
"step": 23400
},
{
"epoch": 51.94566121430552,
"eval_loss": 6.384377956390381,
"eval_runtime": 63.6117,
"eval_samples_per_second": 157.204,
"eval_steps_per_second": 19.65,
"step": 23400
},
{
"epoch": 52.16745217632381,
"grad_norm": 0.4151841104030609,
"learning_rate": 3.662207357859532e-06,
"loss": 6.3838,
"step": 23500
},
{
"epoch": 52.16745217632381,
"eval_loss": 6.385963439941406,
"eval_runtime": 66.0487,
"eval_samples_per_second": 151.403,
"eval_steps_per_second": 18.925,
"step": 23500
},
{
"epoch": 52.38924313834211,
"grad_norm": 0.3460543155670166,
"learning_rate": 3.6521739130434787e-06,
"loss": 6.3828,
"step": 23600
},
{
"epoch": 52.38924313834211,
"eval_loss": 6.384364128112793,
"eval_runtime": 63.6451,
"eval_samples_per_second": 157.121,
"eval_steps_per_second": 19.64,
"step": 23600
},
{
"epoch": 52.61103410036041,
"grad_norm": 0.35991814732551575,
"learning_rate": 3.642140468227425e-06,
"loss": 6.3828,
"step": 23700
},
{
"epoch": 52.61103410036041,
"eval_loss": 6.382322311401367,
"eval_runtime": 63.5885,
"eval_samples_per_second": 157.261,
"eval_steps_per_second": 19.658,
"step": 23700
},
{
"epoch": 52.83282506237871,
"grad_norm": 0.556122899055481,
"learning_rate": 3.6321070234113714e-06,
"loss": 6.383,
"step": 23800
},
{
"epoch": 52.83282506237871,
"eval_loss": 6.387279987335205,
"eval_runtime": 63.668,
"eval_samples_per_second": 157.065,
"eval_steps_per_second": 19.633,
"step": 23800
},
{
"epoch": 53.054616024397006,
"grad_norm": 0.4246836304664612,
"learning_rate": 3.622073578595318e-06,
"loss": 6.3842,
"step": 23900
},
{
"epoch": 53.054616024397006,
"eval_loss": 6.382977485656738,
"eval_runtime": 65.9495,
"eval_samples_per_second": 151.631,
"eval_steps_per_second": 18.954,
"step": 23900
},
{
"epoch": 53.2764069864153,
"grad_norm": 0.4062933027744293,
"learning_rate": 3.6120401337792645e-06,
"loss": 6.3829,
"step": 24000
},
{
"epoch": 53.2764069864153,
"eval_loss": 6.386227130889893,
"eval_runtime": 63.6044,
"eval_samples_per_second": 157.222,
"eval_steps_per_second": 19.653,
"step": 24000
},
{
"epoch": 53.4981979484336,
"grad_norm": 0.36249685287475586,
"learning_rate": 3.6020066889632112e-06,
"loss": 6.3841,
"step": 24100
},
{
"epoch": 53.4981979484336,
"eval_loss": 6.388720989227295,
"eval_runtime": 63.7502,
"eval_samples_per_second": 156.862,
"eval_steps_per_second": 19.608,
"step": 24100
},
{
"epoch": 53.719988910451896,
"grad_norm": 0.464330792427063,
"learning_rate": 3.5919732441471576e-06,
"loss": 6.3821,
"step": 24200
},
{
"epoch": 53.719988910451896,
"eval_loss": 6.385589122772217,
"eval_runtime": 66.108,
"eval_samples_per_second": 151.268,
"eval_steps_per_second": 18.908,
"step": 24200
},
{
"epoch": 53.9417798724702,
"grad_norm": 0.36706265807151794,
"learning_rate": 3.581939799331104e-06,
"loss": 6.3834,
"step": 24300
},
{
"epoch": 53.9417798724702,
"eval_loss": 6.385077476501465,
"eval_runtime": 63.7574,
"eval_samples_per_second": 156.844,
"eval_steps_per_second": 19.606,
"step": 24300
},
{
"epoch": 54.2217909620183,
"grad_norm": 0.5084080100059509,
"learning_rate": 3.5719063545150507e-06,
"loss": 6.3829,
"step": 24400
},
{
"epoch": 54.2217909620183,
"eval_loss": 6.384501934051514,
"eval_runtime": 66.1045,
"eval_samples_per_second": 151.276,
"eval_steps_per_second": 18.909,
"step": 24400
},
{
"epoch": 54.44358192403659,
"grad_norm": 0.2843925952911377,
"learning_rate": 3.561872909698997e-06,
"loss": 6.3828,
"step": 24500
},
{
"epoch": 54.44358192403659,
"eval_loss": 6.386019706726074,
"eval_runtime": 63.7676,
"eval_samples_per_second": 156.819,
"eval_steps_per_second": 19.602,
"step": 24500
},
{
"epoch": 54.6653728860549,
"grad_norm": 0.3394639492034912,
"learning_rate": 3.5518394648829434e-06,
"loss": 6.3839,
"step": 24600
},
{
"epoch": 54.6653728860549,
"eval_loss": 6.385280132293701,
"eval_runtime": 64.0386,
"eval_samples_per_second": 156.156,
"eval_steps_per_second": 19.519,
"step": 24600
},
{
"epoch": 54.88716384807319,
"grad_norm": 0.5277294516563416,
"learning_rate": 3.54180602006689e-06,
"loss": 6.3827,
"step": 24700
},
{
"epoch": 54.88716384807319,
"eval_loss": 6.382243633270264,
"eval_runtime": 66.1687,
"eval_samples_per_second": 151.129,
"eval_steps_per_second": 18.891,
"step": 24700
},
{
"epoch": 55.10895481009149,
"grad_norm": 0.4542704224586487,
"learning_rate": 3.5317725752508365e-06,
"loss": 6.3835,
"step": 24800
},
{
"epoch": 55.10895481009149,
"eval_loss": 6.384250640869141,
"eval_runtime": 63.6729,
"eval_samples_per_second": 157.053,
"eval_steps_per_second": 19.632,
"step": 24800
},
{
"epoch": 55.330745772109786,
"grad_norm": 0.4311918318271637,
"learning_rate": 3.521739130434783e-06,
"loss": 6.3821,
"step": 24900
},
{
"epoch": 55.330745772109786,
"eval_loss": 6.382208824157715,
"eval_runtime": 63.7247,
"eval_samples_per_second": 156.925,
"eval_steps_per_second": 19.616,
"step": 24900
},
{
"epoch": 55.55253673412808,
"grad_norm": 0.5033969283103943,
"learning_rate": 3.5117056856187296e-06,
"loss": 6.3828,
"step": 25000
},
{
"epoch": 55.55253673412808,
"eval_loss": 6.384891510009766,
"eval_runtime": 66.1992,
"eval_samples_per_second": 151.059,
"eval_steps_per_second": 18.882,
"step": 25000
},
{
"epoch": 55.77432769614638,
"grad_norm": 0.389417827129364,
"learning_rate": 3.501672240802676e-06,
"loss": 6.3821,
"step": 25100
},
{
"epoch": 55.77432769614638,
"eval_loss": 6.3841633796691895,
"eval_runtime": 63.7582,
"eval_samples_per_second": 156.843,
"eval_steps_per_second": 19.605,
"step": 25100
},
{
"epoch": 55.99611865816468,
"grad_norm": 0.35223087668418884,
"learning_rate": 3.491638795986622e-06,
"loss": 6.382,
"step": 25200
},
{
"epoch": 55.99611865816468,
"eval_loss": 6.3838019371032715,
"eval_runtime": 63.6971,
"eval_samples_per_second": 156.993,
"eval_steps_per_second": 19.624,
"step": 25200
},
{
"epoch": 56.21790962018298,
"grad_norm": 0.3913029134273529,
"learning_rate": 3.481605351170568e-06,
"loss": 6.3815,
"step": 25300
},
{
"epoch": 56.21790962018298,
"eval_loss": 6.3869524002075195,
"eval_runtime": 66.1208,
"eval_samples_per_second": 151.238,
"eval_steps_per_second": 18.905,
"step": 25300
},
{
"epoch": 56.439700582201276,
"grad_norm": 0.4827691614627838,
"learning_rate": 3.471571906354515e-06,
"loss": 6.3827,
"step": 25400
},
{
"epoch": 56.439700582201276,
"eval_loss": 6.384666442871094,
"eval_runtime": 63.6765,
"eval_samples_per_second": 157.044,
"eval_steps_per_second": 19.63,
"step": 25400
},
{
"epoch": 56.66149154421957,
"grad_norm": 0.3427080512046814,
"learning_rate": 3.4615384615384613e-06,
"loss": 6.3827,
"step": 25500
},
{
"epoch": 56.66149154421957,
"eval_loss": 6.384727478027344,
"eval_runtime": 66.2151,
"eval_samples_per_second": 151.023,
"eval_steps_per_second": 18.878,
"step": 25500
},
{
"epoch": 56.88328250623787,
"grad_norm": 0.43282854557037354,
"learning_rate": 3.4515050167224076e-06,
"loss": 6.3822,
"step": 25600
},
{
"epoch": 56.88328250623787,
"eval_loss": 6.384084224700928,
"eval_runtime": 63.8392,
"eval_samples_per_second": 156.643,
"eval_steps_per_second": 19.58,
"step": 25600
},
{
"epoch": 57.105073468256165,
"grad_norm": 0.42564040422439575,
"learning_rate": 3.4414715719063544e-06,
"loss": 6.3814,
"step": 25700
},
{
"epoch": 57.105073468256165,
"eval_loss": 6.383011817932129,
"eval_runtime": 63.6955,
"eval_samples_per_second": 156.997,
"eval_steps_per_second": 19.625,
"step": 25700
},
{
"epoch": 57.32686443027447,
"grad_norm": 0.3655114471912384,
"learning_rate": 3.4314381270903007e-06,
"loss": 6.3813,
"step": 25800
},
{
"epoch": 57.32686443027447,
"eval_loss": 6.384052753448486,
"eval_runtime": 66.0629,
"eval_samples_per_second": 151.371,
"eval_steps_per_second": 18.921,
"step": 25800
},
{
"epoch": 57.548655392292765,
"grad_norm": 0.4009644389152527,
"learning_rate": 3.4214046822742475e-06,
"loss": 6.3819,
"step": 25900
},
{
"epoch": 57.548655392292765,
"eval_loss": 6.384483814239502,
"eval_runtime": 63.6201,
"eval_samples_per_second": 157.183,
"eval_steps_per_second": 19.648,
"step": 25900
},
{
"epoch": 57.77044635431106,
"grad_norm": 0.45892468094825745,
"learning_rate": 3.411371237458194e-06,
"loss": 6.3823,
"step": 26000
},
{
"epoch": 57.77044635431106,
"eval_loss": 6.382046222686768,
"eval_runtime": 63.6871,
"eval_samples_per_second": 157.018,
"eval_steps_per_second": 19.627,
"step": 26000
},
{
"epoch": 57.99223731632936,
"grad_norm": 0.6261206865310669,
"learning_rate": 3.40133779264214e-06,
"loss": 6.3822,
"step": 26100
},
{
"epoch": 57.99223731632936,
"eval_loss": 6.385235786437988,
"eval_runtime": 66.2139,
"eval_samples_per_second": 151.026,
"eval_steps_per_second": 18.878,
"step": 26100
},
{
"epoch": 58.214028278347655,
"grad_norm": 0.38988542556762695,
"learning_rate": 3.391304347826087e-06,
"loss": 6.3817,
"step": 26200
},
{
"epoch": 58.214028278347655,
"eval_loss": 6.385043144226074,
"eval_runtime": 63.6337,
"eval_samples_per_second": 157.149,
"eval_steps_per_second": 19.644,
"step": 26200
},
{
"epoch": 58.43581924036596,
"grad_norm": 0.3526028096675873,
"learning_rate": 3.3812709030100333e-06,
"loss": 6.3819,
"step": 26300
},
{
"epoch": 58.43581924036596,
"eval_loss": 6.385810375213623,
"eval_runtime": 63.6001,
"eval_samples_per_second": 157.233,
"eval_steps_per_second": 19.654,
"step": 26300
},
{
"epoch": 58.657610202384255,
"grad_norm": 0.38116052746772766,
"learning_rate": 3.3712374581939796e-06,
"loss": 6.3835,
"step": 26400
},
{
"epoch": 58.657610202384255,
"eval_loss": 6.383828639984131,
"eval_runtime": 66.0823,
"eval_samples_per_second": 151.327,
"eval_steps_per_second": 18.916,
"step": 26400
},
{
"epoch": 58.87940116440255,
"grad_norm": 0.5195460319519043,
"learning_rate": 3.3612040133779264e-06,
"loss": 6.3824,
"step": 26500
},
{
"epoch": 58.87940116440255,
"eval_loss": 6.3872599601745605,
"eval_runtime": 63.6944,
"eval_samples_per_second": 157.0,
"eval_steps_per_second": 19.625,
"step": 26500
},
{
"epoch": 59.10119212642085,
"grad_norm": 0.3986002206802368,
"learning_rate": 3.3511705685618727e-06,
"loss": 6.3813,
"step": 26600
},
{
"epoch": 59.10119212642085,
"eval_loss": 6.384389877319336,
"eval_runtime": 63.651,
"eval_samples_per_second": 157.107,
"eval_steps_per_second": 19.638,
"step": 26600
},
{
"epoch": 59.322983088439145,
"grad_norm": 0.3788560628890991,
"learning_rate": 3.3411371237458195e-06,
"loss": 6.3834,
"step": 26700
},
{
"epoch": 59.322983088439145,
"eval_loss": 6.383492946624756,
"eval_runtime": 66.1062,
"eval_samples_per_second": 151.272,
"eval_steps_per_second": 18.909,
"step": 26700
},
{
"epoch": 59.54477405045744,
"grad_norm": 0.3633769750595093,
"learning_rate": 3.331103678929766e-06,
"loss": 6.3806,
"step": 26800
},
{
"epoch": 59.54477405045744,
"eval_loss": 6.383812427520752,
"eval_runtime": 63.6852,
"eval_samples_per_second": 157.022,
"eval_steps_per_second": 19.628,
"step": 26800
},
{
"epoch": 59.766565012475745,
"grad_norm": 0.5389061570167542,
"learning_rate": 3.321070234113712e-06,
"loss": 6.3818,
"step": 26900
},
{
"epoch": 59.766565012475745,
"eval_loss": 6.386070251464844,
"eval_runtime": 63.6726,
"eval_samples_per_second": 157.054,
"eval_steps_per_second": 19.632,
"step": 26900
},
{
"epoch": 59.98835597449404,
"grad_norm": 0.5415310263633728,
"learning_rate": 3.311036789297659e-06,
"loss": 6.3812,
"step": 27000
},
{
"epoch": 59.98835597449404,
"eval_loss": 6.386297702789307,
"eval_runtime": 66.0998,
"eval_samples_per_second": 151.286,
"eval_steps_per_second": 18.911,
"step": 27000
},
{
"epoch": 60.21014693651234,
"grad_norm": 0.25073182582855225,
"learning_rate": 3.3010033444816052e-06,
"loss": 6.3825,
"step": 27100
},
{
"epoch": 60.21014693651234,
"eval_loss": 6.384896278381348,
"eval_runtime": 63.7213,
"eval_samples_per_second": 156.933,
"eval_steps_per_second": 19.617,
"step": 27100
},
{
"epoch": 60.431937898530634,
"grad_norm": 0.2894960045814514,
"learning_rate": 3.2909698996655516e-06,
"loss": 6.3806,
"step": 27200
},
{
"epoch": 60.431937898530634,
"eval_loss": 6.383223533630371,
"eval_runtime": 66.1431,
"eval_samples_per_second": 151.187,
"eval_steps_per_second": 18.898,
"step": 27200
},
{
"epoch": 60.65372886054893,
"grad_norm": 0.48593568801879883,
"learning_rate": 3.2809364548494983e-06,
"loss": 6.3814,
"step": 27300
},
{
"epoch": 60.65372886054893,
"eval_loss": 6.382923603057861,
"eval_runtime": 63.8018,
"eval_samples_per_second": 156.735,
"eval_steps_per_second": 19.592,
"step": 27300
},
{
"epoch": 60.87551982256723,
"grad_norm": 0.3919661343097687,
"learning_rate": 3.2709030100334447e-06,
"loss": 6.3812,
"step": 27400
},
{
"epoch": 60.87551982256723,
"eval_loss": 6.384592056274414,
"eval_runtime": 63.6834,
"eval_samples_per_second": 157.027,
"eval_steps_per_second": 19.628,
"step": 27400
},
{
"epoch": 61.09731078458553,
"grad_norm": 0.41026151180267334,
"learning_rate": 3.260869565217391e-06,
"loss": 6.3823,
"step": 27500
},
{
"epoch": 61.09731078458553,
"eval_loss": 6.385217189788818,
"eval_runtime": 66.0642,
"eval_samples_per_second": 151.368,
"eval_steps_per_second": 18.921,
"step": 27500
},
{
"epoch": 61.31910174660383,
"grad_norm": 0.3794995844364166,
"learning_rate": 3.2508361204013378e-06,
"loss": 6.3811,
"step": 27600
},
{
"epoch": 61.31910174660383,
"eval_loss": 6.383106231689453,
"eval_runtime": 63.705,
"eval_samples_per_second": 156.973,
"eval_steps_per_second": 19.622,
"step": 27600
},
{
"epoch": 61.540892708622124,
"grad_norm": 0.4461415410041809,
"learning_rate": 3.240802675585284e-06,
"loss": 6.3828,
"step": 27700
},
{
"epoch": 61.540892708622124,
"eval_loss": 6.384341239929199,
"eval_runtime": 63.6974,
"eval_samples_per_second": 156.992,
"eval_steps_per_second": 19.624,
"step": 27700
},
{
"epoch": 61.76268367064042,
"grad_norm": 0.24599848687648773,
"learning_rate": 3.230769230769231e-06,
"loss": 6.3807,
"step": 27800
},
{
"epoch": 61.76268367064042,
"eval_loss": 6.384603023529053,
"eval_runtime": 66.1353,
"eval_samples_per_second": 151.205,
"eval_steps_per_second": 18.901,
"step": 27800
},
{
"epoch": 61.98447463265872,
"grad_norm": 0.2466522455215454,
"learning_rate": 3.2207357859531772e-06,
"loss": 6.3823,
"step": 27900
},
{
"epoch": 61.98447463265872,
"eval_loss": 6.383478164672852,
"eval_runtime": 63.6948,
"eval_samples_per_second": 156.999,
"eval_steps_per_second": 19.625,
"step": 27900
},
{
"epoch": 62.206265594677014,
"grad_norm": 0.3806278705596924,
"learning_rate": 3.2107023411371236e-06,
"loss": 6.3806,
"step": 28000
},
{
"epoch": 62.206265594677014,
"eval_loss": 6.382126331329346,
"eval_runtime": 63.5806,
"eval_samples_per_second": 157.281,
"eval_steps_per_second": 19.66,
"step": 28000
},
{
"epoch": 62.42805655669532,
"grad_norm": 0.5161334872245789,
"learning_rate": 3.2006688963210703e-06,
"loss": 6.3816,
"step": 28100
},
{
"epoch": 62.42805655669532,
"eval_loss": 6.384099960327148,
"eval_runtime": 66.2035,
"eval_samples_per_second": 151.049,
"eval_steps_per_second": 18.881,
"step": 28100
},
{
"epoch": 62.649847518713614,
"grad_norm": 0.44599130749702454,
"learning_rate": 3.1906354515050167e-06,
"loss": 6.3799,
"step": 28200
},
{
"epoch": 62.649847518713614,
"eval_loss": 6.385605335235596,
"eval_runtime": 63.6738,
"eval_samples_per_second": 157.051,
"eval_steps_per_second": 19.631,
"step": 28200
},
{
"epoch": 62.87163848073191,
"grad_norm": 0.49202173948287964,
"learning_rate": 3.180602006688963e-06,
"loss": 6.3817,
"step": 28300
},
{
"epoch": 62.87163848073191,
"eval_loss": 6.3858418464660645,
"eval_runtime": 63.6792,
"eval_samples_per_second": 157.037,
"eval_steps_per_second": 19.63,
"step": 28300
},
{
"epoch": 63.09342944275021,
"grad_norm": 0.4090692400932312,
"learning_rate": 3.1705685618729098e-06,
"loss": 6.3797,
"step": 28400
},
{
"epoch": 63.09342944275021,
"eval_loss": 6.381466388702393,
"eval_runtime": 66.0632,
"eval_samples_per_second": 151.37,
"eval_steps_per_second": 18.921,
"step": 28400
},
{
"epoch": 63.3152204047685,
"grad_norm": 0.4286213517189026,
"learning_rate": 3.160535117056856e-06,
"loss": 6.3816,
"step": 28500
},
{
"epoch": 63.3152204047685,
"eval_loss": 6.383074760437012,
"eval_runtime": 63.6206,
"eval_samples_per_second": 157.182,
"eval_steps_per_second": 19.648,
"step": 28500
},
{
"epoch": 63.53701136678681,
"grad_norm": 0.36026620864868164,
"learning_rate": 3.1505016722408024e-06,
"loss": 6.3811,
"step": 28600
},
{
"epoch": 63.53701136678681,
"eval_loss": 6.383544445037842,
"eval_runtime": 63.7194,
"eval_samples_per_second": 156.938,
"eval_steps_per_second": 19.617,
"step": 28600
},
{
"epoch": 63.7588023288051,
"grad_norm": 0.5875244140625,
"learning_rate": 3.140468227424749e-06,
"loss": 6.3822,
"step": 28700
},
{
"epoch": 63.7588023288051,
"eval_loss": 6.384294033050537,
"eval_runtime": 66.1191,
"eval_samples_per_second": 151.242,
"eval_steps_per_second": 18.905,
"step": 28700
},
{
"epoch": 63.9805932908234,
"grad_norm": 0.39102068543434143,
"learning_rate": 3.1304347826086955e-06,
"loss": 6.3823,
"step": 28800
},
{
"epoch": 63.9805932908234,
"eval_loss": 6.381502628326416,
"eval_runtime": 63.7504,
"eval_samples_per_second": 156.862,
"eval_steps_per_second": 19.608,
"step": 28800
},
{
"epoch": 64.2023842528417,
"grad_norm": 0.4450345039367676,
"learning_rate": 3.1204013377926423e-06,
"loss": 6.3813,
"step": 28900
},
{
"epoch": 64.2023842528417,
"eval_loss": 6.384424209594727,
"eval_runtime": 66.2286,
"eval_samples_per_second": 150.992,
"eval_steps_per_second": 18.874,
"step": 28900
},
{
"epoch": 64.42417521486,
"grad_norm": 0.3168383240699768,
"learning_rate": 3.1103678929765886e-06,
"loss": 6.383,
"step": 29000
},
{
"epoch": 64.42417521486,
"eval_loss": 6.385626316070557,
"eval_runtime": 63.7217,
"eval_samples_per_second": 156.932,
"eval_steps_per_second": 19.617,
"step": 29000
},
{
"epoch": 64.64596617687829,
"grad_norm": 0.3088781535625458,
"learning_rate": 3.100334448160535e-06,
"loss": 6.3807,
"step": 29100
},
{
"epoch": 64.64596617687829,
"eval_loss": 6.385305881500244,
"eval_runtime": 63.6226,
"eval_samples_per_second": 157.177,
"eval_steps_per_second": 19.647,
"step": 29100
},
{
"epoch": 64.8677571388966,
"grad_norm": 0.4493953287601471,
"learning_rate": 3.0903010033444818e-06,
"loss": 6.381,
"step": 29200
},
{
"epoch": 64.8677571388966,
"eval_loss": 6.383870601654053,
"eval_runtime": 66.0987,
"eval_samples_per_second": 151.289,
"eval_steps_per_second": 18.911,
"step": 29200
},
{
"epoch": 65.08954810091488,
"grad_norm": 0.3246123194694519,
"learning_rate": 3.080267558528428e-06,
"loss": 6.3811,
"step": 29300
},
{
"epoch": 65.08954810091488,
"eval_loss": 6.383446216583252,
"eval_runtime": 63.63,
"eval_samples_per_second": 157.159,
"eval_steps_per_second": 19.645,
"step": 29300
},
{
"epoch": 65.31133906293319,
"grad_norm": 0.2923065721988678,
"learning_rate": 3.0702341137123744e-06,
"loss": 6.3805,
"step": 29400
},
{
"epoch": 65.31133906293319,
"eval_loss": 6.382349014282227,
"eval_runtime": 66.1161,
"eval_samples_per_second": 151.249,
"eval_steps_per_second": 18.906,
"step": 29400
},
{
"epoch": 65.53313002495149,
"grad_norm": 0.48411309719085693,
"learning_rate": 3.060200668896321e-06,
"loss": 6.3816,
"step": 29500
},
{
"epoch": 65.53313002495149,
"eval_loss": 6.381749153137207,
"eval_runtime": 63.6992,
"eval_samples_per_second": 156.988,
"eval_steps_per_second": 19.623,
"step": 29500
},
{
"epoch": 65.75492098696978,
"grad_norm": 0.3250056803226471,
"learning_rate": 3.0501672240802675e-06,
"loss": 6.3806,
"step": 29600
},
{
"epoch": 65.75492098696978,
"eval_loss": 6.383174896240234,
"eval_runtime": 63.6191,
"eval_samples_per_second": 157.185,
"eval_steps_per_second": 19.648,
"step": 29600
},
{
"epoch": 65.97671194898808,
"grad_norm": 0.337882399559021,
"learning_rate": 3.0401337792642143e-06,
"loss": 6.3793,
"step": 29700
},
{
"epoch": 65.97671194898808,
"eval_loss": 6.383576393127441,
"eval_runtime": 66.0393,
"eval_samples_per_second": 151.425,
"eval_steps_per_second": 18.928,
"step": 29700
},
{
"epoch": 66.19850291100637,
"grad_norm": 0.36923250555992126,
"learning_rate": 3.0301003344481606e-06,
"loss": 6.3805,
"step": 29800
},
{
"epoch": 66.19850291100637,
"eval_loss": 6.383658409118652,
"eval_runtime": 63.5576,
"eval_samples_per_second": 157.338,
"eval_steps_per_second": 19.667,
"step": 29800
},
{
"epoch": 66.42029387302468,
"grad_norm": 0.3375002443790436,
"learning_rate": 3.020066889632107e-06,
"loss": 6.3805,
"step": 29900
},
{
"epoch": 66.42029387302468,
"eval_loss": 6.382904529571533,
"eval_runtime": 66.0839,
"eval_samples_per_second": 151.323,
"eval_steps_per_second": 18.915,
"step": 29900
},
{
"epoch": 66.64208483504297,
"grad_norm": 0.44055986404418945,
"learning_rate": 3.0100334448160537e-06,
"loss": 6.3812,
"step": 30000
},
{
"epoch": 66.64208483504297,
"eval_loss": 6.384601593017578,
"eval_runtime": 63.7135,
"eval_samples_per_second": 156.953,
"eval_steps_per_second": 19.619,
"step": 30000
},
{
"epoch": 66.86387579706127,
"grad_norm": 0.5010361075401306,
"learning_rate": 3e-06,
"loss": 6.3814,
"step": 30100
},
{
"epoch": 66.86387579706127,
"eval_loss": 6.38201904296875,
"eval_runtime": 63.631,
"eval_samples_per_second": 157.156,
"eval_steps_per_second": 19.645,
"step": 30100
},
{
"epoch": 67.08566675907957,
"grad_norm": 0.36018142104148865,
"learning_rate": 2.9899665551839464e-06,
"loss": 6.3801,
"step": 30200
},
{
"epoch": 67.08566675907957,
"eval_loss": 6.384942054748535,
"eval_runtime": 66.0805,
"eval_samples_per_second": 151.331,
"eval_steps_per_second": 18.916,
"step": 30200
},
{
"epoch": 67.30745772109786,
"grad_norm": 0.34176790714263916,
"learning_rate": 2.979933110367893e-06,
"loss": 6.3815,
"step": 30300
},
{
"epoch": 67.30745772109786,
"eval_loss": 6.382652282714844,
"eval_runtime": 63.6886,
"eval_samples_per_second": 157.014,
"eval_steps_per_second": 19.627,
"step": 30300
},
{
"epoch": 67.52924868311617,
"grad_norm": 0.43459710478782654,
"learning_rate": 2.9698996655518395e-06,
"loss": 6.3811,
"step": 30400
},
{
"epoch": 67.52924868311617,
"eval_loss": 6.386653423309326,
"eval_runtime": 63.7002,
"eval_samples_per_second": 156.985,
"eval_steps_per_second": 19.623,
"step": 30400
},
{
"epoch": 67.75103964513445,
"grad_norm": 0.37743738293647766,
"learning_rate": 2.959866220735786e-06,
"loss": 6.3804,
"step": 30500
},
{
"epoch": 67.75103964513445,
"eval_loss": 6.383271217346191,
"eval_runtime": 66.2218,
"eval_samples_per_second": 151.008,
"eval_steps_per_second": 18.876,
"step": 30500
},
{
"epoch": 67.97283060715276,
"grad_norm": 0.34814783930778503,
"learning_rate": 2.9498327759197326e-06,
"loss": 6.3803,
"step": 30600
},
{
"epoch": 67.97283060715276,
"eval_loss": 6.38237190246582,
"eval_runtime": 63.6997,
"eval_samples_per_second": 156.987,
"eval_steps_per_second": 19.623,
"step": 30600
},
{
"epoch": 68.19462156917106,
"grad_norm": 0.344685435295105,
"learning_rate": 2.939799331103679e-06,
"loss": 6.3795,
"step": 30700
},
{
"epoch": 68.19462156917106,
"eval_loss": 6.384911060333252,
"eval_runtime": 63.7005,
"eval_samples_per_second": 156.985,
"eval_steps_per_second": 19.623,
"step": 30700
},
{
"epoch": 68.41641253118935,
"grad_norm": 0.30350542068481445,
"learning_rate": 2.9297658862876257e-06,
"loss": 6.3799,
"step": 30800
},
{
"epoch": 68.41641253118935,
"eval_loss": 6.383020401000977,
"eval_runtime": 66.1786,
"eval_samples_per_second": 151.106,
"eval_steps_per_second": 18.888,
"step": 30800
},
{
"epoch": 68.63820349320766,
"grad_norm": 0.33513781428337097,
"learning_rate": 2.919732441471572e-06,
"loss": 6.3808,
"step": 30900
},
{
"epoch": 68.63820349320766,
"eval_loss": 6.38442325592041,
"eval_runtime": 63.6954,
"eval_samples_per_second": 156.997,
"eval_steps_per_second": 19.625,
"step": 30900
},
{
"epoch": 68.85999445522594,
"grad_norm": 0.38895151019096375,
"learning_rate": 2.9096989966555184e-06,
"loss": 6.3803,
"step": 31000
},
{
"epoch": 68.85999445522594,
"eval_loss": 6.382268905639648,
"eval_runtime": 66.1082,
"eval_samples_per_second": 151.267,
"eval_steps_per_second": 18.908,
"step": 31000
},
{
"epoch": 69.08178541724425,
"grad_norm": 0.49591463804244995,
"learning_rate": 2.899665551839465e-06,
"loss": 6.381,
"step": 31100
},
{
"epoch": 69.08178541724425,
"eval_loss": 6.384127140045166,
"eval_runtime": 63.6361,
"eval_samples_per_second": 157.144,
"eval_steps_per_second": 19.643,
"step": 31100
},
{
"epoch": 69.30357637926255,
"grad_norm": 0.47946080565452576,
"learning_rate": 2.8896321070234115e-06,
"loss": 6.3803,
"step": 31200
},
{
"epoch": 69.30357637926255,
"eval_loss": 6.380748748779297,
"eval_runtime": 63.7274,
"eval_samples_per_second": 156.918,
"eval_steps_per_second": 19.615,
"step": 31200
},
{
"epoch": 69.52536734128084,
"grad_norm": 0.33409592509269714,
"learning_rate": 2.879598662207358e-06,
"loss": 6.3795,
"step": 31300
},
{
"epoch": 69.52536734128084,
"eval_loss": 6.3822197914123535,
"eval_runtime": 66.2573,
"eval_samples_per_second": 150.927,
"eval_steps_per_second": 18.866,
"step": 31300
},
{
"epoch": 69.74715830329914,
"grad_norm": 0.36530378460884094,
"learning_rate": 2.8695652173913046e-06,
"loss": 6.3793,
"step": 31400
},
{
"epoch": 69.74715830329914,
"eval_loss": 6.3831787109375,
"eval_runtime": 63.6807,
"eval_samples_per_second": 157.034,
"eval_steps_per_second": 19.629,
"step": 31400
},
{
"epoch": 69.96894926531743,
"grad_norm": 0.4838181436061859,
"learning_rate": 2.859531772575251e-06,
"loss": 6.3802,
"step": 31500
},
{
"epoch": 69.96894926531743,
"eval_loss": 6.383909225463867,
"eval_runtime": 66.1242,
"eval_samples_per_second": 151.231,
"eval_steps_per_second": 18.904,
"step": 31500
},
{
"epoch": 70.19074022733574,
"grad_norm": 0.3999974727630615,
"learning_rate": 2.8494983277591977e-06,
"loss": 6.3817,
"step": 31600
},
{
"epoch": 70.19074022733574,
"eval_loss": 6.382571220397949,
"eval_runtime": 63.5921,
"eval_samples_per_second": 157.252,
"eval_steps_per_second": 19.657,
"step": 31600
},
{
"epoch": 70.41253118935403,
"grad_norm": 0.37044674158096313,
"learning_rate": 2.839464882943144e-06,
"loss": 6.3785,
"step": 31700
},
{
"epoch": 70.41253118935403,
"eval_loss": 6.381692886352539,
"eval_runtime": 66.1492,
"eval_samples_per_second": 151.173,
"eval_steps_per_second": 18.897,
"step": 31700
},
{
"epoch": 70.63432215137233,
"grad_norm": 0.43440505862236023,
"learning_rate": 2.8294314381270904e-06,
"loss": 6.3811,
"step": 31800
},
{
"epoch": 70.63432215137233,
"eval_loss": 6.384181499481201,
"eval_runtime": 63.7156,
"eval_samples_per_second": 156.947,
"eval_steps_per_second": 19.618,
"step": 31800
},
{
"epoch": 70.85611311339063,
"grad_norm": 0.45394232869148254,
"learning_rate": 2.819397993311037e-06,
"loss": 6.3803,
"step": 31900
},
{
"epoch": 70.85611311339063,
"eval_loss": 6.382298469543457,
"eval_runtime": 66.1426,
"eval_samples_per_second": 151.189,
"eval_steps_per_second": 18.899,
"step": 31900
},
{
"epoch": 71.07790407540892,
"grad_norm": 0.24088256061077118,
"learning_rate": 2.8093645484949835e-06,
"loss": 6.3789,
"step": 32000
},
{
"epoch": 71.07790407540892,
"eval_loss": 6.378951072692871,
"eval_runtime": 63.7166,
"eval_samples_per_second": 156.945,
"eval_steps_per_second": 19.618,
"step": 32000
},
{
"epoch": 71.29969503742723,
"grad_norm": 0.3836078643798828,
"learning_rate": 2.79933110367893e-06,
"loss": 6.3793,
"step": 32100
},
{
"epoch": 71.29969503742723,
"eval_loss": 6.382381916046143,
"eval_runtime": 63.7336,
"eval_samples_per_second": 156.903,
"eval_steps_per_second": 19.613,
"step": 32100
},
{
"epoch": 71.52148599944552,
"grad_norm": 0.3558043837547302,
"learning_rate": 2.7892976588628766e-06,
"loss": 6.3779,
"step": 32200
},
{
"epoch": 71.52148599944552,
"eval_loss": 6.3820366859436035,
"eval_runtime": 66.1055,
"eval_samples_per_second": 151.273,
"eval_steps_per_second": 18.909,
"step": 32200
},
{
"epoch": 71.74327696146382,
"grad_norm": 0.2369541972875595,
"learning_rate": 2.779264214046823e-06,
"loss": 6.3808,
"step": 32300
},
{
"epoch": 71.74327696146382,
"eval_loss": 6.37972354888916,
"eval_runtime": 63.6602,
"eval_samples_per_second": 157.084,
"eval_steps_per_second": 19.636,
"step": 32300
},
{
"epoch": 71.96506792348212,
"grad_norm": 0.3357178270816803,
"learning_rate": 2.7692307692307693e-06,
"loss": 6.3796,
"step": 32400
},
{
"epoch": 71.96506792348212,
"eval_loss": 6.3810296058654785,
"eval_runtime": 66.2569,
"eval_samples_per_second": 150.928,
"eval_steps_per_second": 18.866,
"step": 32400
},
{
"epoch": 72.18685888550041,
"grad_norm": 0.2965914011001587,
"learning_rate": 2.759197324414716e-06,
"loss": 6.3794,
"step": 32500
},
{
"epoch": 72.18685888550041,
"eval_loss": 6.381561756134033,
"eval_runtime": 63.6325,
"eval_samples_per_second": 157.152,
"eval_steps_per_second": 19.644,
"step": 32500
},
{
"epoch": 72.40864984751872,
"grad_norm": 0.31444767117500305,
"learning_rate": 2.749163879598662e-06,
"loss": 6.3811,
"step": 32600
},
{
"epoch": 72.40864984751872,
"eval_loss": 6.383826732635498,
"eval_runtime": 63.819,
"eval_samples_per_second": 156.693,
"eval_steps_per_second": 19.587,
"step": 32600
},
{
"epoch": 72.630440809537,
"grad_norm": 0.335440456867218,
"learning_rate": 2.7391304347826087e-06,
"loss": 6.3787,
"step": 32700
},
{
"epoch": 72.630440809537,
"eval_loss": 6.382222652435303,
"eval_runtime": 66.3235,
"eval_samples_per_second": 150.776,
"eval_steps_per_second": 18.847,
"step": 32700
},
{
"epoch": 72.85223177155531,
"grad_norm": 0.3031088411808014,
"learning_rate": 2.729096989966555e-06,
"loss": 6.379,
"step": 32800
},
{
"epoch": 72.85223177155531,
"eval_loss": 6.380151748657227,
"eval_runtime": 63.7642,
"eval_samples_per_second": 156.828,
"eval_steps_per_second": 19.603,
"step": 32800
},
{
"epoch": 73.07402273357361,
"grad_norm": 0.2734851539134979,
"learning_rate": 2.7190635451505014e-06,
"loss": 6.3796,
"step": 32900
},
{
"epoch": 73.07402273357361,
"eval_loss": 6.381131172180176,
"eval_runtime": 66.3028,
"eval_samples_per_second": 150.823,
"eval_steps_per_second": 18.853,
"step": 32900
},
{
"epoch": 73.2958136955919,
"grad_norm": 0.4682227671146393,
"learning_rate": 2.709030100334448e-06,
"loss": 6.3791,
"step": 33000
},
{
"epoch": 73.2958136955919,
"eval_loss": 6.382552146911621,
"eval_runtime": 63.7907,
"eval_samples_per_second": 156.763,
"eval_steps_per_second": 19.595,
"step": 33000
},
{
"epoch": 73.5176046576102,
"grad_norm": 0.38640567660331726,
"learning_rate": 2.6989966555183945e-06,
"loss": 6.378,
"step": 33100
},
{
"epoch": 73.5176046576102,
"eval_loss": 6.37783670425415,
"eval_runtime": 66.2991,
"eval_samples_per_second": 150.832,
"eval_steps_per_second": 18.854,
"step": 33100
},
{
"epoch": 73.7393956196285,
"grad_norm": 0.3248431086540222,
"learning_rate": 2.6889632107023413e-06,
"loss": 6.3798,
"step": 33200
},
{
"epoch": 73.7393956196285,
"eval_loss": 6.382781982421875,
"eval_runtime": 63.7636,
"eval_samples_per_second": 156.829,
"eval_steps_per_second": 19.604,
"step": 33200
},
{
"epoch": 73.9611865816468,
"grad_norm": 0.40707120299339294,
"learning_rate": 2.6789297658862876e-06,
"loss": 6.3787,
"step": 33300
},
{
"epoch": 73.9611865816468,
"eval_loss": 6.381734371185303,
"eval_runtime": 63.7575,
"eval_samples_per_second": 156.844,
"eval_steps_per_second": 19.606,
"step": 33300
},
{
"epoch": 74.18297754366509,
"grad_norm": 0.3740542232990265,
"learning_rate": 2.668896321070234e-06,
"loss": 6.3799,
"step": 33400
},
{
"epoch": 74.18297754366509,
"eval_loss": 6.38359260559082,
"eval_runtime": 63.7058,
"eval_samples_per_second": 156.972,
"eval_steps_per_second": 19.621,
"step": 33400
},
{
"epoch": 74.40476850568339,
"grad_norm": 0.3560076057910919,
"learning_rate": 2.6588628762541807e-06,
"loss": 6.3788,
"step": 33500
},
{
"epoch": 74.40476850568339,
"eval_loss": 6.380216598510742,
"eval_runtime": 66.3163,
"eval_samples_per_second": 150.792,
"eval_steps_per_second": 18.849,
"step": 33500
},
{
"epoch": 74.6265594677017,
"grad_norm": 0.2998209595680237,
"learning_rate": 2.648829431438127e-06,
"loss": 6.3798,
"step": 33600
},
{
"epoch": 74.6265594677017,
"eval_loss": 6.3799357414245605,
"eval_runtime": 63.7525,
"eval_samples_per_second": 156.857,
"eval_steps_per_second": 19.607,
"step": 33600
},
{
"epoch": 74.84835042971999,
"grad_norm": 0.42181283235549927,
"learning_rate": 2.6387959866220734e-06,
"loss": 6.3797,
"step": 33700
},
{
"epoch": 74.84835042971999,
"eval_loss": 6.3854804039001465,
"eval_runtime": 63.7045,
"eval_samples_per_second": 156.975,
"eval_steps_per_second": 19.622,
"step": 33700
},
{
"epoch": 75.07014139173829,
"grad_norm": 0.35232749581336975,
"learning_rate": 2.62876254180602e-06,
"loss": 6.3794,
"step": 33800
},
{
"epoch": 75.07014139173829,
"eval_loss": 6.38426399230957,
"eval_runtime": 66.2061,
"eval_samples_per_second": 151.043,
"eval_steps_per_second": 18.88,
"step": 33800
},
{
"epoch": 75.29193235375658,
"grad_norm": 0.3319035470485687,
"learning_rate": 2.6187290969899665e-06,
"loss": 6.3801,
"step": 33900
},
{
"epoch": 75.29193235375658,
"eval_loss": 6.382733345031738,
"eval_runtime": 63.7241,
"eval_samples_per_second": 156.927,
"eval_steps_per_second": 19.616,
"step": 33900
},
{
"epoch": 75.51372331577488,
"grad_norm": 0.320116251707077,
"learning_rate": 2.6086956521739132e-06,
"loss": 6.3796,
"step": 34000
},
{
"epoch": 75.51372331577488,
"eval_loss": 6.383172035217285,
"eval_runtime": 66.2886,
"eval_samples_per_second": 150.856,
"eval_steps_per_second": 18.857,
"step": 34000
},
{
"epoch": 75.73551427779319,
"grad_norm": 0.25732365250587463,
"learning_rate": 2.5986622073578596e-06,
"loss": 6.3793,
"step": 34100
},
{
"epoch": 75.73551427779319,
"eval_loss": 6.3826189041137695,
"eval_runtime": 63.7021,
"eval_samples_per_second": 156.981,
"eval_steps_per_second": 19.623,
"step": 34100
},
{
"epoch": 75.95730523981148,
"grad_norm": 0.41861915588378906,
"learning_rate": 2.588628762541806e-06,
"loss": 6.3806,
"step": 34200
},
{
"epoch": 75.95730523981148,
"eval_loss": 6.3810224533081055,
"eval_runtime": 63.8703,
"eval_samples_per_second": 156.567,
"eval_steps_per_second": 19.571,
"step": 34200
},
{
"epoch": 76.17909620182978,
"grad_norm": 0.37039920687675476,
"learning_rate": 2.5785953177257527e-06,
"loss": 6.3782,
"step": 34300
},
{
"epoch": 76.17909620182978,
"eval_loss": 6.384817600250244,
"eval_runtime": 63.7083,
"eval_samples_per_second": 156.965,
"eval_steps_per_second": 19.621,
"step": 34300
},
{
"epoch": 76.40088716384807,
"grad_norm": 0.29002711176872253,
"learning_rate": 2.568561872909699e-06,
"loss": 6.3804,
"step": 34400
},
{
"epoch": 76.40088716384807,
"eval_loss": 6.381626605987549,
"eval_runtime": 66.318,
"eval_samples_per_second": 150.789,
"eval_steps_per_second": 18.849,
"step": 34400
},
{
"epoch": 76.62267812586637,
"grad_norm": 0.3963169455528259,
"learning_rate": 2.5585284280936454e-06,
"loss": 6.3802,
"step": 34500
},
{
"epoch": 76.62267812586637,
"eval_loss": 6.385863304138184,
"eval_runtime": 63.683,
"eval_samples_per_second": 157.028,
"eval_steps_per_second": 19.628,
"step": 34500
},
{
"epoch": 76.84446908788468,
"grad_norm": 0.3641812801361084,
"learning_rate": 2.548494983277592e-06,
"loss": 6.3794,
"step": 34600
},
{
"epoch": 76.84446908788468,
"eval_loss": 6.379196643829346,
"eval_runtime": 63.7355,
"eval_samples_per_second": 156.899,
"eval_steps_per_second": 19.612,
"step": 34600
},
{
"epoch": 77.06626004990297,
"grad_norm": 0.34516364336013794,
"learning_rate": 2.5384615384615385e-06,
"loss": 6.3781,
"step": 34700
},
{
"epoch": 77.06626004990297,
"eval_loss": 6.381167411804199,
"eval_runtime": 66.2593,
"eval_samples_per_second": 150.922,
"eval_steps_per_second": 18.865,
"step": 34700
},
{
"epoch": 77.28805101192127,
"grad_norm": 0.37135106325149536,
"learning_rate": 2.528428093645485e-06,
"loss": 6.3791,
"step": 34800
},
{
"epoch": 77.28805101192127,
"eval_loss": 6.3796210289001465,
"eval_runtime": 63.5952,
"eval_samples_per_second": 157.245,
"eval_steps_per_second": 19.656,
"step": 34800
},
{
"epoch": 77.50984197393956,
"grad_norm": 0.27615365386009216,
"learning_rate": 2.5183946488294316e-06,
"loss": 6.3788,
"step": 34900
},
{
"epoch": 77.50984197393956,
"eval_loss": 6.38156270980835,
"eval_runtime": 63.666,
"eval_samples_per_second": 157.07,
"eval_steps_per_second": 19.634,
"step": 34900
},
{
"epoch": 77.73163293595786,
"grad_norm": 0.40949293971061707,
"learning_rate": 2.508361204013378e-06,
"loss": 6.3784,
"step": 35000
},
{
"epoch": 77.73163293595786,
"eval_loss": 6.379955291748047,
"eval_runtime": 66.2108,
"eval_samples_per_second": 151.033,
"eval_steps_per_second": 18.879,
"step": 35000
},
{
"epoch": 77.95342389797615,
"grad_norm": 0.21426652371883392,
"learning_rate": 2.4983277591973247e-06,
"loss": 6.3792,
"step": 35100
},
{
"epoch": 77.95342389797615,
"eval_loss": 6.38067102432251,
"eval_runtime": 63.6933,
"eval_samples_per_second": 157.002,
"eval_steps_per_second": 19.625,
"step": 35100
},
{
"epoch": 78.17521485999445,
"grad_norm": 0.3121929466724396,
"learning_rate": 2.488294314381271e-06,
"loss": 6.3799,
"step": 35200
},
{
"epoch": 78.17521485999445,
"eval_loss": 6.383203983306885,
"eval_runtime": 63.6759,
"eval_samples_per_second": 157.045,
"eval_steps_per_second": 19.631,
"step": 35200
},
{
"epoch": 78.39700582201276,
"grad_norm": 0.3007084131240845,
"learning_rate": 2.4782608695652173e-06,
"loss": 6.3782,
"step": 35300
},
{
"epoch": 78.39700582201276,
"eval_loss": 6.380030632019043,
"eval_runtime": 65.4722,
"eval_samples_per_second": 152.737,
"eval_steps_per_second": 19.092,
"step": 35300
},
{
"epoch": 78.61879678403105,
"grad_norm": 0.4903746247291565,
"learning_rate": 2.468227424749164e-06,
"loss": 6.3791,
"step": 35400
},
{
"epoch": 78.61879678403105,
"eval_loss": 6.382900714874268,
"eval_runtime": 64.4146,
"eval_samples_per_second": 155.244,
"eval_steps_per_second": 19.406,
"step": 35400
},
{
"epoch": 78.84058774604935,
"grad_norm": 0.41819822788238525,
"learning_rate": 2.4581939799331104e-06,
"loss": 6.3779,
"step": 35500
},
{
"epoch": 78.84058774604935,
"eval_loss": 6.380439281463623,
"eval_runtime": 63.6771,
"eval_samples_per_second": 157.042,
"eval_steps_per_second": 19.63,
"step": 35500
},
{
"epoch": 79.06237870806764,
"grad_norm": 0.4207383990287781,
"learning_rate": 2.4481605351170568e-06,
"loss": 6.3794,
"step": 35600
},
{
"epoch": 79.06237870806764,
"eval_loss": 6.381216049194336,
"eval_runtime": 63.8668,
"eval_samples_per_second": 156.576,
"eval_steps_per_second": 19.572,
"step": 35600
},
{
"epoch": 79.28416967008594,
"grad_norm": 0.3821701109409332,
"learning_rate": 2.4381270903010035e-06,
"loss": 6.3796,
"step": 35700
},
{
"epoch": 79.28416967008594,
"eval_loss": 6.380701541900635,
"eval_runtime": 66.2305,
"eval_samples_per_second": 150.988,
"eval_steps_per_second": 18.873,
"step": 35700
},
{
"epoch": 79.50596063210425,
"grad_norm": 0.3124147653579712,
"learning_rate": 2.42809364548495e-06,
"loss": 6.3792,
"step": 35800
},
{
"epoch": 79.50596063210425,
"eval_loss": 6.383649826049805,
"eval_runtime": 63.6853,
"eval_samples_per_second": 157.022,
"eval_steps_per_second": 19.628,
"step": 35800
},
{
"epoch": 79.72775159412254,
"grad_norm": 0.37319284677505493,
"learning_rate": 2.4180602006688962e-06,
"loss": 6.3793,
"step": 35900
},
{
"epoch": 79.72775159412254,
"eval_loss": 6.379690647125244,
"eval_runtime": 63.71,
"eval_samples_per_second": 156.961,
"eval_steps_per_second": 19.62,
"step": 35900
},
{
"epoch": 79.94954255614084,
"grad_norm": 0.3518475890159607,
"learning_rate": 2.408026755852843e-06,
"loss": 6.3794,
"step": 36000
},
{
"epoch": 79.94954255614084,
"eval_loss": 6.3837385177612305,
"eval_runtime": 66.2591,
"eval_samples_per_second": 150.923,
"eval_steps_per_second": 18.865,
"step": 36000
},
{
"epoch": 80.17133351815913,
"grad_norm": 0.3394939601421356,
"learning_rate": 2.3979933110367893e-06,
"loss": 6.3779,
"step": 36100
},
{
"epoch": 80.17133351815913,
"eval_loss": 6.383784294128418,
"eval_runtime": 63.536,
"eval_samples_per_second": 157.391,
"eval_steps_per_second": 19.674,
"step": 36100
},
{
"epoch": 80.39312448017743,
"grad_norm": 0.2030980885028839,
"learning_rate": 2.387959866220736e-06,
"loss": 6.3787,
"step": 36200
},
{
"epoch": 80.39312448017743,
"eval_loss": 6.381889820098877,
"eval_runtime": 63.5998,
"eval_samples_per_second": 157.233,
"eval_steps_per_second": 19.654,
"step": 36200
},
{
"epoch": 80.61491544219572,
"grad_norm": 0.35631629824638367,
"learning_rate": 2.3779264214046824e-06,
"loss": 6.3778,
"step": 36300
},
{
"epoch": 80.61491544219572,
"eval_loss": 6.382266998291016,
"eval_runtime": 66.2682,
"eval_samples_per_second": 150.902,
"eval_steps_per_second": 18.863,
"step": 36300
},
{
"epoch": 80.83670640421403,
"grad_norm": 0.38831663131713867,
"learning_rate": 2.3678929765886288e-06,
"loss": 6.3796,
"step": 36400
},
{
"epoch": 80.83670640421403,
"eval_loss": 6.379624843597412,
"eval_runtime": 63.7336,
"eval_samples_per_second": 156.903,
"eval_steps_per_second": 19.613,
"step": 36400
},
{
"epoch": 81.05849736623233,
"grad_norm": 0.29808080196380615,
"learning_rate": 2.3578595317725755e-06,
"loss": 6.3787,
"step": 36500
},
{
"epoch": 81.05849736623233,
"eval_loss": 6.380765914916992,
"eval_runtime": 63.6941,
"eval_samples_per_second": 157.0,
"eval_steps_per_second": 19.625,
"step": 36500
},
{
"epoch": 81.28028832825062,
"grad_norm": 0.32311221957206726,
"learning_rate": 2.347826086956522e-06,
"loss": 6.3795,
"step": 36600
},
{
"epoch": 81.28028832825062,
"eval_loss": 6.38113260269165,
"eval_runtime": 66.3064,
"eval_samples_per_second": 150.815,
"eval_steps_per_second": 18.852,
"step": 36600
},
{
"epoch": 81.50207929026892,
"grad_norm": 0.3027205765247345,
"learning_rate": 2.337792642140468e-06,
"loss": 6.3777,
"step": 36700
},
{
"epoch": 81.50207929026892,
"eval_loss": 6.378735542297363,
"eval_runtime": 63.8029,
"eval_samples_per_second": 156.733,
"eval_steps_per_second": 19.592,
"step": 36700
},
{
"epoch": 81.72387025228721,
"grad_norm": 0.44942182302474976,
"learning_rate": 2.327759197324415e-06,
"loss": 6.3793,
"step": 36800
},
{
"epoch": 81.72387025228721,
"eval_loss": 6.382872104644775,
"eval_runtime": 63.7382,
"eval_samples_per_second": 156.892,
"eval_steps_per_second": 19.611,
"step": 36800
},
{
"epoch": 81.94566121430552,
"grad_norm": 0.3363696038722992,
"learning_rate": 2.3177257525083613e-06,
"loss": 6.3786,
"step": 36900
},
{
"epoch": 81.94566121430552,
"eval_loss": 6.3805928230285645,
"eval_runtime": 66.2659,
"eval_samples_per_second": 150.907,
"eval_steps_per_second": 18.863,
"step": 36900
},
{
"epoch": 82.16745217632382,
"grad_norm": 0.3836919367313385,
"learning_rate": 2.307692307692308e-06,
"loss": 6.378,
"step": 37000
},
{
"epoch": 82.16745217632382,
"eval_loss": 6.381478786468506,
"eval_runtime": 63.6472,
"eval_samples_per_second": 157.116,
"eval_steps_per_second": 19.64,
"step": 37000
},
{
"epoch": 82.38924313834211,
"grad_norm": 0.3322221338748932,
"learning_rate": 2.2976588628762544e-06,
"loss": 6.3774,
"step": 37100
},
{
"epoch": 82.38924313834211,
"eval_loss": 6.381748199462891,
"eval_runtime": 63.7815,
"eval_samples_per_second": 156.785,
"eval_steps_per_second": 19.598,
"step": 37100
},
{
"epoch": 82.61103410036041,
"grad_norm": 0.33737483620643616,
"learning_rate": 2.2876254180602008e-06,
"loss": 6.3792,
"step": 37200
},
{
"epoch": 82.61103410036041,
"eval_loss": 6.381521224975586,
"eval_runtime": 66.3297,
"eval_samples_per_second": 150.762,
"eval_steps_per_second": 18.845,
"step": 37200
},
{
"epoch": 82.8328250623787,
"grad_norm": 0.34915590286254883,
"learning_rate": 2.2775919732441475e-06,
"loss": 6.3792,
"step": 37300
},
{
"epoch": 82.8328250623787,
"eval_loss": 6.382421493530273,
"eval_runtime": 63.7523,
"eval_samples_per_second": 156.857,
"eval_steps_per_second": 19.607,
"step": 37300
},
{
"epoch": 83.054616024397,
"grad_norm": 0.2967890202999115,
"learning_rate": 2.267558528428094e-06,
"loss": 6.379,
"step": 37400
},
{
"epoch": 83.054616024397,
"eval_loss": 6.379049301147461,
"eval_runtime": 63.7339,
"eval_samples_per_second": 156.902,
"eval_steps_per_second": 19.613,
"step": 37400
},
{
"epoch": 83.27640698641531,
"grad_norm": 0.3198423385620117,
"learning_rate": 2.25752508361204e-06,
"loss": 6.3784,
"step": 37500
},
{
"epoch": 83.27640698641531,
"eval_loss": 6.380875110626221,
"eval_runtime": 66.2715,
"eval_samples_per_second": 150.894,
"eval_steps_per_second": 18.862,
"step": 37500
},
{
"epoch": 83.4981979484336,
"grad_norm": 0.22756338119506836,
"learning_rate": 2.2474916387959865e-06,
"loss": 6.3772,
"step": 37600
},
{
"epoch": 83.4981979484336,
"eval_loss": 6.380899429321289,
"eval_runtime": 63.6746,
"eval_samples_per_second": 157.048,
"eval_steps_per_second": 19.631,
"step": 37600
},
{
"epoch": 83.7199889104519,
"grad_norm": 0.375475138425827,
"learning_rate": 2.237458193979933e-06,
"loss": 6.38,
"step": 37700
},
{
"epoch": 83.7199889104519,
"eval_loss": 6.379432201385498,
"eval_runtime": 63.7694,
"eval_samples_per_second": 156.815,
"eval_steps_per_second": 19.602,
"step": 37700
},
{
"epoch": 83.99805932908234,
"grad_norm": 0.26553675532341003,
"learning_rate": 2.2274247491638796e-06,
"loss": 6.3791,
"step": 37800
},
{
"epoch": 83.99805932908234,
"eval_loss": 6.386465072631836,
"eval_runtime": 66.2313,
"eval_samples_per_second": 150.986,
"eval_steps_per_second": 18.873,
"step": 37800
},
{
"epoch": 84.21985029110064,
"grad_norm": 0.2572327256202698,
"learning_rate": 2.217391304347826e-06,
"loss": 6.3779,
"step": 37900
},
{
"epoch": 84.21985029110064,
"eval_loss": 6.381786823272705,
"eval_runtime": 63.8252,
"eval_samples_per_second": 156.678,
"eval_steps_per_second": 19.585,
"step": 37900
},
{
"epoch": 84.44164125311893,
"grad_norm": 0.3603324294090271,
"learning_rate": 2.2073578595317723e-06,
"loss": 6.3796,
"step": 38000
},
{
"epoch": 84.44164125311893,
"eval_loss": 6.381040573120117,
"eval_runtime": 64.0412,
"eval_samples_per_second": 156.15,
"eval_steps_per_second": 19.519,
"step": 38000
},
{
"epoch": 84.66343221513723,
"grad_norm": 0.3384093642234802,
"learning_rate": 2.197324414715719e-06,
"loss": 6.3778,
"step": 38100
},
{
"epoch": 84.66343221513723,
"eval_loss": 6.377985000610352,
"eval_runtime": 66.1598,
"eval_samples_per_second": 151.149,
"eval_steps_per_second": 18.894,
"step": 38100
},
{
"epoch": 84.88522317715552,
"grad_norm": 0.3742137551307678,
"learning_rate": 2.1872909698996654e-06,
"loss": 6.3788,
"step": 38200
},
{
"epoch": 84.88522317715552,
"eval_loss": 6.382181167602539,
"eval_runtime": 63.6067,
"eval_samples_per_second": 157.216,
"eval_steps_per_second": 19.652,
"step": 38200
},
{
"epoch": 85.10701413917383,
"grad_norm": 0.31179383397102356,
"learning_rate": 2.177257525083612e-06,
"loss": 6.3771,
"step": 38300
},
{
"epoch": 85.10701413917383,
"eval_loss": 6.380379676818848,
"eval_runtime": 66.212,
"eval_samples_per_second": 151.03,
"eval_steps_per_second": 18.879,
"step": 38300
},
{
"epoch": 85.32880510119213,
"grad_norm": 0.36700376868247986,
"learning_rate": 2.1672240802675585e-06,
"loss": 6.3767,
"step": 38400
},
{
"epoch": 85.32880510119213,
"eval_loss": 6.3812575340271,
"eval_runtime": 63.6889,
"eval_samples_per_second": 157.013,
"eval_steps_per_second": 19.627,
"step": 38400
},
{
"epoch": 85.55059606321042,
"grad_norm": 0.38559991121292114,
"learning_rate": 2.157190635451505e-06,
"loss": 6.3781,
"step": 38500
},
{
"epoch": 85.55059606321042,
"eval_loss": 6.384213447570801,
"eval_runtime": 66.1477,
"eval_samples_per_second": 151.177,
"eval_steps_per_second": 18.897,
"step": 38500
},
{
"epoch": 85.77238702522872,
"grad_norm": 0.2753937244415283,
"learning_rate": 2.1471571906354516e-06,
"loss": 6.3795,
"step": 38600
},
{
"epoch": 85.77238702522872,
"eval_loss": 6.37845516204834,
"eval_runtime": 63.6513,
"eval_samples_per_second": 157.106,
"eval_steps_per_second": 19.638,
"step": 38600
},
{
"epoch": 85.99417798724701,
"grad_norm": 0.22831951081752777,
"learning_rate": 2.137123745819398e-06,
"loss": 6.3789,
"step": 38700
},
{
"epoch": 85.99417798724701,
"eval_loss": 6.381505966186523,
"eval_runtime": 66.231,
"eval_samples_per_second": 150.987,
"eval_steps_per_second": 18.873,
"step": 38700
},
{
"epoch": 86.21596894926532,
"grad_norm": 0.339546799659729,
"learning_rate": 2.1270903010033443e-06,
"loss": 6.379,
"step": 38800
},
{
"epoch": 86.21596894926532,
"eval_loss": 6.381498336791992,
"eval_runtime": 63.6802,
"eval_samples_per_second": 157.035,
"eval_steps_per_second": 19.629,
"step": 38800
},
{
"epoch": 86.43775991128362,
"grad_norm": 0.2600659728050232,
"learning_rate": 2.117056856187291e-06,
"loss": 6.3774,
"step": 38900
},
{
"epoch": 86.43775991128362,
"eval_loss": 6.381589889526367,
"eval_runtime": 63.6804,
"eval_samples_per_second": 157.034,
"eval_steps_per_second": 19.629,
"step": 38900
},
{
"epoch": 86.65955087330191,
"grad_norm": 0.32178473472595215,
"learning_rate": 2.1070234113712374e-06,
"loss": 6.3785,
"step": 39000
},
{
"epoch": 86.65955087330191,
"eval_loss": 6.377468585968018,
"eval_runtime": 66.295,
"eval_samples_per_second": 150.841,
"eval_steps_per_second": 18.855,
"step": 39000
},
{
"epoch": 86.88134183532021,
"grad_norm": 0.28717854619026184,
"learning_rate": 2.0969899665551837e-06,
"loss": 6.377,
"step": 39100
},
{
"epoch": 86.88134183532021,
"eval_loss": 6.3805928230285645,
"eval_runtime": 63.7605,
"eval_samples_per_second": 156.837,
"eval_steps_per_second": 19.605,
"step": 39100
},
{
"epoch": 87.1031327973385,
"grad_norm": 0.2932318150997162,
"learning_rate": 2.0869565217391305e-06,
"loss": 6.3791,
"step": 39200
},
{
"epoch": 87.1031327973385,
"eval_loss": 6.380700588226318,
"eval_runtime": 63.6583,
"eval_samples_per_second": 157.089,
"eval_steps_per_second": 19.636,
"step": 39200
},
{
"epoch": 87.3249237593568,
"grad_norm": 0.39832741022109985,
"learning_rate": 2.076923076923077e-06,
"loss": 6.3784,
"step": 39300
},
{
"epoch": 87.3249237593568,
"eval_loss": 6.37957763671875,
"eval_runtime": 66.5746,
"eval_samples_per_second": 150.207,
"eval_steps_per_second": 18.776,
"step": 39300
},
{
"epoch": 87.54671472137511,
"grad_norm": 0.3088468611240387,
"learning_rate": 2.0668896321070236e-06,
"loss": 6.3774,
"step": 39400
},
{
"epoch": 87.54671472137511,
"eval_loss": 6.379054069519043,
"eval_runtime": 66.2367,
"eval_samples_per_second": 150.974,
"eval_steps_per_second": 18.872,
"step": 39400
},
{
"epoch": 87.7685056833934,
"grad_norm": 0.284956693649292,
"learning_rate": 2.05685618729097e-06,
"loss": 6.3777,
"step": 39500
},
{
"epoch": 87.7685056833934,
"eval_loss": 6.381918907165527,
"eval_runtime": 66.2296,
"eval_samples_per_second": 150.99,
"eval_steps_per_second": 18.874,
"step": 39500
},
{
"epoch": 87.9902966454117,
"grad_norm": 0.26180529594421387,
"learning_rate": 2.0468227424749163e-06,
"loss": 6.3779,
"step": 39600
},
{
"epoch": 87.9902966454117,
"eval_loss": 6.381536483764648,
"eval_runtime": 66.1969,
"eval_samples_per_second": 151.064,
"eval_steps_per_second": 18.883,
"step": 39600
},
{
"epoch": 88.21208760742999,
"grad_norm": 0.39024651050567627,
"learning_rate": 2.036789297658863e-06,
"loss": 6.377,
"step": 39700
},
{
"epoch": 88.21208760742999,
"eval_loss": 6.3777618408203125,
"eval_runtime": 66.1491,
"eval_samples_per_second": 151.174,
"eval_steps_per_second": 18.897,
"step": 39700
},
{
"epoch": 88.4338785694483,
"grad_norm": 0.2729719579219818,
"learning_rate": 2.0267558528428094e-06,
"loss": 6.3782,
"step": 39800
},
{
"epoch": 88.4338785694483,
"eval_loss": 6.382574081420898,
"eval_runtime": 66.1908,
"eval_samples_per_second": 151.078,
"eval_steps_per_second": 18.885,
"step": 39800
},
{
"epoch": 88.65566953146659,
"grad_norm": 0.23033183813095093,
"learning_rate": 2.0167224080267557e-06,
"loss": 6.3776,
"step": 39900
},
{
"epoch": 88.65566953146659,
"eval_loss": 6.378293514251709,
"eval_runtime": 66.1466,
"eval_samples_per_second": 151.179,
"eval_steps_per_second": 18.897,
"step": 39900
},
{
"epoch": 88.87746049348489,
"grad_norm": 0.43995988368988037,
"learning_rate": 2.0066889632107025e-06,
"loss": 6.3793,
"step": 40000
},
{
"epoch": 88.87746049348489,
"eval_loss": 6.380235195159912,
"eval_runtime": 66.2981,
"eval_samples_per_second": 150.834,
"eval_steps_per_second": 18.854,
"step": 40000
},
{
"epoch": 89.0992514555032,
"grad_norm": 0.2878618836402893,
"learning_rate": 1.996655518394649e-06,
"loss": 6.3783,
"step": 40100
},
{
"epoch": 89.0992514555032,
"eval_loss": 6.379173755645752,
"eval_runtime": 63.6984,
"eval_samples_per_second": 156.99,
"eval_steps_per_second": 19.624,
"step": 40100
},
{
"epoch": 89.32104241752148,
"grad_norm": 0.22379851341247559,
"learning_rate": 1.986622073578595e-06,
"loss": 6.3785,
"step": 40200
},
{
"epoch": 89.32104241752148,
"eval_loss": 6.38173770904541,
"eval_runtime": 63.792,
"eval_samples_per_second": 156.759,
"eval_steps_per_second": 19.595,
"step": 40200
},
{
"epoch": 89.54283337953979,
"grad_norm": 0.3321212828159332,
"learning_rate": 1.976588628762542e-06,
"loss": 6.3775,
"step": 40300
},
{
"epoch": 89.54283337953979,
"eval_loss": 6.377793788909912,
"eval_runtime": 66.2425,
"eval_samples_per_second": 150.96,
"eval_steps_per_second": 18.87,
"step": 40300
},
{
"epoch": 89.76462434155808,
"grad_norm": 0.3513726592063904,
"learning_rate": 1.9665551839464883e-06,
"loss": 6.3777,
"step": 40400
},
{
"epoch": 89.76462434155808,
"eval_loss": 6.38060998916626,
"eval_runtime": 66.2865,
"eval_samples_per_second": 150.86,
"eval_steps_per_second": 18.858,
"step": 40400
},
{
"epoch": 89.98641530357638,
"grad_norm": 0.3225536048412323,
"learning_rate": 1.956521739130435e-06,
"loss": 6.3781,
"step": 40500
},
{
"epoch": 89.98641530357638,
"eval_loss": 6.3820648193359375,
"eval_runtime": 63.6933,
"eval_samples_per_second": 157.002,
"eval_steps_per_second": 19.625,
"step": 40500
},
{
"epoch": 90.20820626559468,
"grad_norm": 0.3866877853870392,
"learning_rate": 1.9464882943143814e-06,
"loss": 6.3772,
"step": 40600
},
{
"epoch": 90.20820626559468,
"eval_loss": 6.382141590118408,
"eval_runtime": 63.7832,
"eval_samples_per_second": 156.781,
"eval_steps_per_second": 19.598,
"step": 40600
},
{
"epoch": 90.42999722761297,
"grad_norm": 0.43070387840270996,
"learning_rate": 1.9364548494983277e-06,
"loss": 6.3778,
"step": 40700
},
{
"epoch": 90.42999722761297,
"eval_loss": 6.375494480133057,
"eval_runtime": 65.724,
"eval_samples_per_second": 152.152,
"eval_steps_per_second": 19.019,
"step": 40700
},
{
"epoch": 90.65178818963128,
"grad_norm": 0.35665562748908997,
"learning_rate": 1.9264214046822745e-06,
"loss": 6.3767,
"step": 40800
},
{
"epoch": 90.65178818963128,
"eval_loss": 6.379345417022705,
"eval_runtime": 64.4622,
"eval_samples_per_second": 155.13,
"eval_steps_per_second": 19.391,
"step": 40800
},
{
"epoch": 90.87357915164957,
"grad_norm": 0.35841798782348633,
"learning_rate": 1.916387959866221e-06,
"loss": 6.3765,
"step": 40900
},
{
"epoch": 90.87357915164957,
"eval_loss": 6.379830360412598,
"eval_runtime": 66.3033,
"eval_samples_per_second": 150.822,
"eval_steps_per_second": 18.853,
"step": 40900
},
{
"epoch": 91.09537011366787,
"grad_norm": 0.29910504817962646,
"learning_rate": 1.9063545150501674e-06,
"loss": 6.3774,
"step": 41000
},
{
"epoch": 91.09537011366787,
"eval_loss": 6.380716323852539,
"eval_runtime": 63.7905,
"eval_samples_per_second": 156.763,
"eval_steps_per_second": 19.595,
"step": 41000
},
{
"epoch": 91.31716107568617,
"grad_norm": 0.3775427043437958,
"learning_rate": 1.896321070234114e-06,
"loss": 6.3784,
"step": 41100
},
{
"epoch": 91.31716107568617,
"eval_loss": 6.38125467300415,
"eval_runtime": 63.7934,
"eval_samples_per_second": 156.756,
"eval_steps_per_second": 19.595,
"step": 41100
},
{
"epoch": 91.53895203770446,
"grad_norm": 0.2421695590019226,
"learning_rate": 1.8862876254180603e-06,
"loss": 6.377,
"step": 41200
},
{
"epoch": 91.53895203770446,
"eval_loss": 6.381397724151611,
"eval_runtime": 66.2535,
"eval_samples_per_second": 150.935,
"eval_steps_per_second": 18.867,
"step": 41200
},
{
"epoch": 91.76074299972277,
"grad_norm": 0.2967372238636017,
"learning_rate": 1.8762541806020068e-06,
"loss": 6.3783,
"step": 41300
},
{
"epoch": 91.76074299972277,
"eval_loss": 6.380742073059082,
"eval_runtime": 63.7433,
"eval_samples_per_second": 156.879,
"eval_steps_per_second": 19.61,
"step": 41300
},
{
"epoch": 91.98253396174105,
"grad_norm": 0.3849758505821228,
"learning_rate": 1.8662207357859534e-06,
"loss": 6.3789,
"step": 41400
},
{
"epoch": 91.98253396174105,
"eval_loss": 6.3830342292785645,
"eval_runtime": 66.3151,
"eval_samples_per_second": 150.795,
"eval_steps_per_second": 18.849,
"step": 41400
},
{
"epoch": 92.20432492375936,
"grad_norm": 0.377841055393219,
"learning_rate": 1.8561872909699e-06,
"loss": 6.3769,
"step": 41500
},
{
"epoch": 92.20432492375936,
"eval_loss": 6.381241798400879,
"eval_runtime": 63.6855,
"eval_samples_per_second": 157.022,
"eval_steps_per_second": 19.628,
"step": 41500
},
{
"epoch": 92.42611588577765,
"grad_norm": 0.4062901437282562,
"learning_rate": 1.8461538461538462e-06,
"loss": 6.3791,
"step": 41600
},
{
"epoch": 92.42611588577765,
"eval_loss": 6.378665924072266,
"eval_runtime": 63.7914,
"eval_samples_per_second": 156.761,
"eval_steps_per_second": 19.595,
"step": 41600
},
{
"epoch": 92.64790684779595,
"grad_norm": 0.33464646339416504,
"learning_rate": 1.8361204013377928e-06,
"loss": 6.3782,
"step": 41700
},
{
"epoch": 92.64790684779595,
"eval_loss": 6.379201412200928,
"eval_runtime": 66.2071,
"eval_samples_per_second": 151.041,
"eval_steps_per_second": 18.88,
"step": 41700
},
{
"epoch": 92.86969780981426,
"grad_norm": 0.26191645860671997,
"learning_rate": 1.8260869565217394e-06,
"loss": 6.3768,
"step": 41800
},
{
"epoch": 92.86969780981426,
"eval_loss": 6.380030632019043,
"eval_runtime": 63.7463,
"eval_samples_per_second": 156.872,
"eval_steps_per_second": 19.609,
"step": 41800
},
{
"epoch": 93.09148877183254,
"grad_norm": 0.4473271667957306,
"learning_rate": 1.8160535117056857e-06,
"loss": 6.376,
"step": 41900
},
{
"epoch": 93.09148877183254,
"eval_loss": 6.383362293243408,
"eval_runtime": 66.2652,
"eval_samples_per_second": 150.909,
"eval_steps_per_second": 18.864,
"step": 41900
},
{
"epoch": 93.31327973385085,
"grad_norm": 0.30396267771720886,
"learning_rate": 1.8060200668896322e-06,
"loss": 6.3782,
"step": 42000
},
{
"epoch": 93.31327973385085,
"eval_loss": 6.382277965545654,
"eval_runtime": 63.811,
"eval_samples_per_second": 156.713,
"eval_steps_per_second": 19.589,
"step": 42000
},
{
"epoch": 93.53507069586914,
"grad_norm": 0.2819732129573822,
"learning_rate": 1.7959866220735788e-06,
"loss": 6.3782,
"step": 42100
},
{
"epoch": 93.53507069586914,
"eval_loss": 6.381258010864258,
"eval_runtime": 63.7343,
"eval_samples_per_second": 156.901,
"eval_steps_per_second": 19.613,
"step": 42100
},
{
"epoch": 93.75686165788744,
"grad_norm": 0.2994706630706787,
"learning_rate": 1.7859531772575253e-06,
"loss": 6.3786,
"step": 42200
},
{
"epoch": 93.75686165788744,
"eval_loss": 6.381169319152832,
"eval_runtime": 66.2919,
"eval_samples_per_second": 150.848,
"eval_steps_per_second": 18.856,
"step": 42200
},
{
"epoch": 93.97865261990574,
"grad_norm": 0.31294333934783936,
"learning_rate": 1.7759197324414717e-06,
"loss": 6.3766,
"step": 42300
},
{
"epoch": 93.97865261990574,
"eval_loss": 6.379955291748047,
"eval_runtime": 63.7737,
"eval_samples_per_second": 156.804,
"eval_steps_per_second": 19.601,
"step": 42300
},
{
"epoch": 94.20044358192403,
"grad_norm": 0.291477769613266,
"learning_rate": 1.7658862876254182e-06,
"loss": 6.3777,
"step": 42400
},
{
"epoch": 94.20044358192403,
"eval_loss": 6.379477500915527,
"eval_runtime": 66.2866,
"eval_samples_per_second": 150.86,
"eval_steps_per_second": 18.857,
"step": 42400
},
{
"epoch": 94.42223454394234,
"grad_norm": 0.23638038337230682,
"learning_rate": 1.7558528428093648e-06,
"loss": 6.3781,
"step": 42500
},
{
"epoch": 94.42223454394234,
"eval_loss": 6.380892753601074,
"eval_runtime": 63.8247,
"eval_samples_per_second": 156.679,
"eval_steps_per_second": 19.585,
"step": 42500
},
{
"epoch": 94.64402550596063,
"grad_norm": 0.3445935547351837,
"learning_rate": 1.745819397993311e-06,
"loss": 6.3768,
"step": 42600
},
{
"epoch": 94.64402550596063,
"eval_loss": 6.382579803466797,
"eval_runtime": 63.8197,
"eval_samples_per_second": 156.691,
"eval_steps_per_second": 19.586,
"step": 42600
},
{
"epoch": 94.86581646797893,
"grad_norm": 0.3376341462135315,
"learning_rate": 1.7357859531772575e-06,
"loss": 6.3768,
"step": 42700
},
{
"epoch": 94.86581646797893,
"eval_loss": 6.381232261657715,
"eval_runtime": 66.3632,
"eval_samples_per_second": 150.686,
"eval_steps_per_second": 18.836,
"step": 42700
},
{
"epoch": 95.08760742999723,
"grad_norm": 0.29045116901397705,
"learning_rate": 1.7257525083612038e-06,
"loss": 6.3763,
"step": 42800
},
{
"epoch": 95.08760742999723,
"eval_loss": 6.3776373863220215,
"eval_runtime": 63.6759,
"eval_samples_per_second": 157.045,
"eval_steps_per_second": 19.631,
"step": 42800
},
{
"epoch": 95.30939839201552,
"grad_norm": 0.2851983308792114,
"learning_rate": 1.7157190635451504e-06,
"loss": 6.3778,
"step": 42900
},
{
"epoch": 95.30939839201552,
"eval_loss": 6.380300998687744,
"eval_runtime": 63.8175,
"eval_samples_per_second": 156.697,
"eval_steps_per_second": 19.587,
"step": 42900
},
{
"epoch": 95.53118935403383,
"grad_norm": 0.33936771750450134,
"learning_rate": 1.705685618729097e-06,
"loss": 6.3787,
"step": 43000
},
{
"epoch": 95.53118935403383,
"eval_loss": 6.37871789932251,
"eval_runtime": 63.8614,
"eval_samples_per_second": 156.589,
"eval_steps_per_second": 19.574,
"step": 43000
},
{
"epoch": 95.75298031605212,
"grad_norm": 0.4443320333957672,
"learning_rate": 1.6956521739130435e-06,
"loss": 6.3781,
"step": 43100
},
{
"epoch": 95.75298031605212,
"eval_loss": 6.382043838500977,
"eval_runtime": 66.3729,
"eval_samples_per_second": 150.664,
"eval_steps_per_second": 18.833,
"step": 43100
},
{
"epoch": 95.97477127807042,
"grad_norm": 0.33091309666633606,
"learning_rate": 1.6856187290969898e-06,
"loss": 6.3772,
"step": 43200
},
{
"epoch": 95.97477127807042,
"eval_loss": 6.380916595458984,
"eval_runtime": 63.7824,
"eval_samples_per_second": 156.783,
"eval_steps_per_second": 19.598,
"step": 43200
},
{
"epoch": 96.19656224008871,
"grad_norm": 0.3929876685142517,
"learning_rate": 1.6755852842809363e-06,
"loss": 6.3785,
"step": 43300
},
{
"epoch": 96.19656224008871,
"eval_loss": 6.377211570739746,
"eval_runtime": 66.2793,
"eval_samples_per_second": 150.877,
"eval_steps_per_second": 18.86,
"step": 43300
},
{
"epoch": 96.41835320210701,
"grad_norm": 0.3379896581172943,
"learning_rate": 1.665551839464883e-06,
"loss": 6.3772,
"step": 43400
},
{
"epoch": 96.41835320210701,
"eval_loss": 6.380885124206543,
"eval_runtime": 63.8749,
"eval_samples_per_second": 156.556,
"eval_steps_per_second": 19.569,
"step": 43400
},
{
"epoch": 96.64014416412532,
"grad_norm": 0.3330114483833313,
"learning_rate": 1.6555183946488294e-06,
"loss": 6.378,
"step": 43500
},
{
"epoch": 96.64014416412532,
"eval_loss": 6.381417751312256,
"eval_runtime": 66.3248,
"eval_samples_per_second": 150.773,
"eval_steps_per_second": 18.847,
"step": 43500
},
{
"epoch": 96.8619351261436,
"grad_norm": 0.5002055168151855,
"learning_rate": 1.6454849498327758e-06,
"loss": 6.3772,
"step": 43600
},
{
"epoch": 96.8619351261436,
"eval_loss": 6.379367351531982,
"eval_runtime": 63.7674,
"eval_samples_per_second": 156.82,
"eval_steps_per_second": 19.603,
"step": 43600
},
{
"epoch": 97.08372608816191,
"grad_norm": 0.4039636552333832,
"learning_rate": 1.6354515050167223e-06,
"loss": 6.376,
"step": 43700
},
{
"epoch": 97.08372608816191,
"eval_loss": 6.379873275756836,
"eval_runtime": 63.6881,
"eval_samples_per_second": 157.015,
"eval_steps_per_second": 19.627,
"step": 43700
},
{
"epoch": 97.3055170501802,
"grad_norm": 0.3500140309333801,
"learning_rate": 1.6254180602006689e-06,
"loss": 6.3793,
"step": 43800
},
{
"epoch": 97.3055170501802,
"eval_loss": 6.3825764656066895,
"eval_runtime": 66.2863,
"eval_samples_per_second": 150.861,
"eval_steps_per_second": 18.858,
"step": 43800
},
{
"epoch": 97.5273080121985,
"grad_norm": 0.343735009431839,
"learning_rate": 1.6153846153846154e-06,
"loss": 6.3779,
"step": 43900
},
{
"epoch": 97.5273080121985,
"eval_loss": 6.378231525421143,
"eval_runtime": 63.7143,
"eval_samples_per_second": 156.951,
"eval_steps_per_second": 19.619,
"step": 43900
},
{
"epoch": 97.7490989742168,
"grad_norm": 0.3836156129837036,
"learning_rate": 1.6053511705685618e-06,
"loss": 6.3773,
"step": 44000
},
{
"epoch": 97.7490989742168,
"eval_loss": 6.37751579284668,
"eval_runtime": 63.729,
"eval_samples_per_second": 156.915,
"eval_steps_per_second": 19.614,
"step": 44000
},
{
"epoch": 97.9708899362351,
"grad_norm": 0.3120937645435333,
"learning_rate": 1.5953177257525083e-06,
"loss": 6.3755,
"step": 44100
},
{
"epoch": 97.9708899362351,
"eval_loss": 6.3800272941589355,
"eval_runtime": 64.1744,
"eval_samples_per_second": 155.825,
"eval_steps_per_second": 19.478,
"step": 44100
},
{
"epoch": 98.1926808982534,
"grad_norm": 0.33682048320770264,
"learning_rate": 1.5852842809364549e-06,
"loss": 6.3765,
"step": 44200
},
{
"epoch": 98.1926808982534,
"eval_loss": 6.378459930419922,
"eval_runtime": 65.8486,
"eval_samples_per_second": 151.864,
"eval_steps_per_second": 18.983,
"step": 44200
},
{
"epoch": 98.41447186027169,
"grad_norm": 0.33430323004722595,
"learning_rate": 1.5752508361204012e-06,
"loss": 6.3784,
"step": 44300
},
{
"epoch": 98.41447186027169,
"eval_loss": 6.37835693359375,
"eval_runtime": 63.7423,
"eval_samples_per_second": 156.882,
"eval_steps_per_second": 19.61,
"step": 44300
},
{
"epoch": 98.63626282228999,
"grad_norm": 0.3729492425918579,
"learning_rate": 1.5652173913043478e-06,
"loss": 6.3775,
"step": 44400
},
{
"epoch": 98.63626282228999,
"eval_loss": 6.379312515258789,
"eval_runtime": 67.1919,
"eval_samples_per_second": 148.827,
"eval_steps_per_second": 18.603,
"step": 44400
},
{
"epoch": 98.8580537843083,
"grad_norm": 0.30378684401512146,
"learning_rate": 1.5551839464882943e-06,
"loss": 6.3773,
"step": 44500
},
{
"epoch": 98.8580537843083,
"eval_loss": 6.380176544189453,
"eval_runtime": 66.2505,
"eval_samples_per_second": 150.942,
"eval_steps_per_second": 18.868,
"step": 44500
},
{
"epoch": 99.07984474632659,
"grad_norm": 0.2708960771560669,
"learning_rate": 1.5451505016722409e-06,
"loss": 6.3791,
"step": 44600
},
{
"epoch": 99.07984474632659,
"eval_loss": 6.381106853485107,
"eval_runtime": 63.6851,
"eval_samples_per_second": 157.023,
"eval_steps_per_second": 19.628,
"step": 44600
},
{
"epoch": 99.30163570834489,
"grad_norm": 0.28966355323791504,
"learning_rate": 1.5351170568561872e-06,
"loss": 6.3769,
"step": 44700
},
{
"epoch": 99.30163570834489,
"eval_loss": 6.380806922912598,
"eval_runtime": 66.2295,
"eval_samples_per_second": 150.99,
"eval_steps_per_second": 18.874,
"step": 44700
},
{
"epoch": 99.52342667036318,
"grad_norm": 0.33378317952156067,
"learning_rate": 1.5250836120401338e-06,
"loss": 6.3764,
"step": 44800
},
{
"epoch": 99.52342667036318,
"eval_loss": 6.378901481628418,
"eval_runtime": 63.7387,
"eval_samples_per_second": 156.89,
"eval_steps_per_second": 19.611,
"step": 44800
},
{
"epoch": 99.74521763238148,
"grad_norm": 0.2659667134284973,
"learning_rate": 1.5150501672240803e-06,
"loss": 6.3763,
"step": 44900
},
{
"epoch": 99.74521763238148,
"eval_loss": 6.378689289093018,
"eval_runtime": 66.291,
"eval_samples_per_second": 150.85,
"eval_steps_per_second": 18.856,
"step": 44900
},
{
"epoch": 99.96700859439977,
"grad_norm": 0.36868181824684143,
"learning_rate": 1.5050167224080269e-06,
"loss": 6.3773,
"step": 45000
},
{
"epoch": 99.96700859439977,
"eval_loss": 6.379394054412842,
"eval_runtime": 63.8432,
"eval_samples_per_second": 156.634,
"eval_steps_per_second": 19.579,
"step": 45000
},
{
"epoch": 100.18879955641808,
"grad_norm": 0.2957492768764496,
"learning_rate": 1.4949832775919732e-06,
"loss": 6.3777,
"step": 45100
},
{
"epoch": 100.18879955641808,
"eval_loss": 6.37989616394043,
"eval_runtime": 63.7161,
"eval_samples_per_second": 156.946,
"eval_steps_per_second": 19.618,
"step": 45100
},
{
"epoch": 100.41059051843638,
"grad_norm": 0.36346226930618286,
"learning_rate": 1.4849498327759198e-06,
"loss": 6.3771,
"step": 45200
},
{
"epoch": 100.41059051843638,
"eval_loss": 6.382117748260498,
"eval_runtime": 66.181,
"eval_samples_per_second": 151.101,
"eval_steps_per_second": 18.888,
"step": 45200
},
{
"epoch": 100.63238148045467,
"grad_norm": 0.21758611500263214,
"learning_rate": 1.4749163879598663e-06,
"loss": 6.3768,
"step": 45300
},
{
"epoch": 100.63238148045467,
"eval_loss": 6.378548622131348,
"eval_runtime": 63.8643,
"eval_samples_per_second": 156.582,
"eval_steps_per_second": 19.573,
"step": 45300
},
{
"epoch": 100.85417244247297,
"grad_norm": 0.21891988813877106,
"learning_rate": 1.4648829431438129e-06,
"loss": 6.3759,
"step": 45400
},
{
"epoch": 100.85417244247297,
"eval_loss": 6.3807806968688965,
"eval_runtime": 66.1954,
"eval_samples_per_second": 151.068,
"eval_steps_per_second": 18.883,
"step": 45400
},
{
"epoch": 101.07596340449126,
"grad_norm": 0.31398728489875793,
"learning_rate": 1.4548494983277592e-06,
"loss": 6.3783,
"step": 45500
},
{
"epoch": 101.07596340449126,
"eval_loss": 6.3800740242004395,
"eval_runtime": 63.74,
"eval_samples_per_second": 156.887,
"eval_steps_per_second": 19.611,
"step": 45500
},
{
"epoch": 101.29775436650957,
"grad_norm": 0.3506067991256714,
"learning_rate": 1.4448160535117058e-06,
"loss": 6.3757,
"step": 45600
},
{
"epoch": 101.29775436650957,
"eval_loss": 6.3802642822265625,
"eval_runtime": 66.3029,
"eval_samples_per_second": 150.823,
"eval_steps_per_second": 18.853,
"step": 45600
},
{
"epoch": 101.51954532852787,
"grad_norm": 0.4127357304096222,
"learning_rate": 1.4347826086956523e-06,
"loss": 6.377,
"step": 45700
},
{
"epoch": 101.51954532852787,
"eval_loss": 6.379199028015137,
"eval_runtime": 63.6147,
"eval_samples_per_second": 157.196,
"eval_steps_per_second": 19.65,
"step": 45700
},
{
"epoch": 101.74133629054616,
"grad_norm": 0.40180787444114685,
"learning_rate": 1.4247491638795989e-06,
"loss": 6.3774,
"step": 45800
},
{
"epoch": 101.74133629054616,
"eval_loss": 6.378483295440674,
"eval_runtime": 63.6205,
"eval_samples_per_second": 157.182,
"eval_steps_per_second": 19.648,
"step": 45800
},
{
"epoch": 101.96312725256446,
"grad_norm": 0.2862705588340759,
"learning_rate": 1.4147157190635452e-06,
"loss": 6.3777,
"step": 45900
},
{
"epoch": 101.96312725256446,
"eval_loss": 6.377134323120117,
"eval_runtime": 63.9897,
"eval_samples_per_second": 156.275,
"eval_steps_per_second": 19.534,
"step": 45900
},
{
"epoch": 102.18491821458275,
"grad_norm": 0.2539602816104889,
"learning_rate": 1.4046822742474917e-06,
"loss": 6.3786,
"step": 46000
},
{
"epoch": 102.18491821458275,
"eval_loss": 6.379866123199463,
"eval_runtime": 66.1001,
"eval_samples_per_second": 151.286,
"eval_steps_per_second": 18.911,
"step": 46000
},
{
"epoch": 102.40670917660105,
"grad_norm": 0.36692872643470764,
"learning_rate": 1.3946488294314383e-06,
"loss": 6.3771,
"step": 46100
},
{
"epoch": 102.40670917660105,
"eval_loss": 6.379576683044434,
"eval_runtime": 63.6911,
"eval_samples_per_second": 157.008,
"eval_steps_per_second": 19.626,
"step": 46100
},
{
"epoch": 102.62850013861934,
"grad_norm": 0.3044676184654236,
"learning_rate": 1.3846153846153846e-06,
"loss": 6.3772,
"step": 46200
},
{
"epoch": 102.62850013861934,
"eval_loss": 6.381227493286133,
"eval_runtime": 63.7064,
"eval_samples_per_second": 156.97,
"eval_steps_per_second": 19.621,
"step": 46200
},
{
"epoch": 102.85029110063765,
"grad_norm": 0.3508971035480499,
"learning_rate": 1.374581939799331e-06,
"loss": 6.3762,
"step": 46300
},
{
"epoch": 102.85029110063765,
"eval_loss": 6.377274513244629,
"eval_runtime": 66.2947,
"eval_samples_per_second": 150.842,
"eval_steps_per_second": 18.855,
"step": 46300
},
{
"epoch": 103.07208206265595,
"grad_norm": 0.31413570046424866,
"learning_rate": 1.3645484949832775e-06,
"loss": 6.3774,
"step": 46400
},
{
"epoch": 103.07208206265595,
"eval_loss": 6.380115032196045,
"eval_runtime": 63.6441,
"eval_samples_per_second": 157.124,
"eval_steps_per_second": 19.64,
"step": 46400
},
{
"epoch": 103.29387302467424,
"grad_norm": 0.2552104890346527,
"learning_rate": 1.354515050167224e-06,
"loss": 6.3775,
"step": 46500
},
{
"epoch": 103.29387302467424,
"eval_loss": 6.379015922546387,
"eval_runtime": 63.6755,
"eval_samples_per_second": 157.046,
"eval_steps_per_second": 19.631,
"step": 46500
},
{
"epoch": 103.51566398669254,
"grad_norm": 0.3744960129261017,
"learning_rate": 1.3444816053511706e-06,
"loss": 6.3763,
"step": 46600
},
{
"epoch": 103.51566398669254,
"eval_loss": 6.374266624450684,
"eval_runtime": 66.5834,
"eval_samples_per_second": 150.188,
"eval_steps_per_second": 18.773,
"step": 46600
},
{
"epoch": 103.73745494871083,
"grad_norm": 0.27893921732902527,
"learning_rate": 1.334448160535117e-06,
"loss": 6.3775,
"step": 46700
},
{
"epoch": 103.73745494871083,
"eval_loss": 6.380270957946777,
"eval_runtime": 66.2442,
"eval_samples_per_second": 150.957,
"eval_steps_per_second": 18.87,
"step": 46700
},
{
"epoch": 103.95924591072914,
"grad_norm": 0.2601492404937744,
"learning_rate": 1.3244147157190635e-06,
"loss": 6.3775,
"step": 46800
},
{
"epoch": 103.95924591072914,
"eval_loss": 6.380533218383789,
"eval_runtime": 66.2494,
"eval_samples_per_second": 150.945,
"eval_steps_per_second": 18.868,
"step": 46800
},
{
"epoch": 104.18103687274744,
"grad_norm": 0.28285419940948486,
"learning_rate": 1.31438127090301e-06,
"loss": 6.3776,
"step": 46900
},
{
"epoch": 104.18103687274744,
"eval_loss": 6.3801751136779785,
"eval_runtime": 66.2411,
"eval_samples_per_second": 150.964,
"eval_steps_per_second": 18.87,
"step": 46900
},
{
"epoch": 104.40282783476573,
"grad_norm": 0.4723234176635742,
"learning_rate": 1.3043478260869566e-06,
"loss": 6.376,
"step": 47000
},
{
"epoch": 104.40282783476573,
"eval_loss": 6.379186153411865,
"eval_runtime": 63.7279,
"eval_samples_per_second": 156.917,
"eval_steps_per_second": 19.615,
"step": 47000
},
{
"epoch": 104.62461879678403,
"grad_norm": 0.3108322322368622,
"learning_rate": 1.294314381270903e-06,
"loss": 6.3773,
"step": 47100
},
{
"epoch": 104.62461879678403,
"eval_loss": 6.3764142990112305,
"eval_runtime": 63.7167,
"eval_samples_per_second": 156.945,
"eval_steps_per_second": 19.618,
"step": 47100
},
{
"epoch": 104.84640975880232,
"grad_norm": 0.38544511795043945,
"learning_rate": 1.2842809364548495e-06,
"loss": 6.3773,
"step": 47200
},
{
"epoch": 104.84640975880232,
"eval_loss": 6.379009246826172,
"eval_runtime": 66.2408,
"eval_samples_per_second": 150.964,
"eval_steps_per_second": 18.871,
"step": 47200
},
{
"epoch": 105.06820072082063,
"grad_norm": 0.2773985266685486,
"learning_rate": 1.274247491638796e-06,
"loss": 6.3772,
"step": 47300
},
{
"epoch": 105.06820072082063,
"eval_loss": 6.3756842613220215,
"eval_runtime": 63.6518,
"eval_samples_per_second": 157.105,
"eval_steps_per_second": 19.638,
"step": 47300
},
{
"epoch": 105.28999168283893,
"grad_norm": 0.2765492796897888,
"learning_rate": 1.2642140468227424e-06,
"loss": 6.3764,
"step": 47400
},
{
"epoch": 105.28999168283893,
"eval_loss": 6.377975940704346,
"eval_runtime": 66.2572,
"eval_samples_per_second": 150.927,
"eval_steps_per_second": 18.866,
"step": 47400
},
{
"epoch": 105.51178264485722,
"grad_norm": 0.30239638686180115,
"learning_rate": 1.254180602006689e-06,
"loss": 6.3761,
"step": 47500
},
{
"epoch": 105.51178264485722,
"eval_loss": 6.379149436950684,
"eval_runtime": 63.8068,
"eval_samples_per_second": 156.723,
"eval_steps_per_second": 19.59,
"step": 47500
},
{
"epoch": 105.73357360687552,
"grad_norm": 0.22471874952316284,
"learning_rate": 1.2441471571906355e-06,
"loss": 6.3775,
"step": 47600
},
{
"epoch": 105.73357360687552,
"eval_loss": 6.3783087730407715,
"eval_runtime": 66.2436,
"eval_samples_per_second": 150.958,
"eval_steps_per_second": 18.87,
"step": 47600
},
{
"epoch": 105.95536456889381,
"grad_norm": 0.23722052574157715,
"learning_rate": 1.234113712374582e-06,
"loss": 6.377,
"step": 47700
},
{
"epoch": 105.95536456889381,
"eval_loss": 6.376536846160889,
"eval_runtime": 63.6766,
"eval_samples_per_second": 157.044,
"eval_steps_per_second": 19.63,
"step": 47700
},
{
"epoch": 106.17715553091212,
"grad_norm": 0.26499879360198975,
"learning_rate": 1.2240802675585284e-06,
"loss": 6.3758,
"step": 47800
},
{
"epoch": 106.17715553091212,
"eval_loss": 6.380406856536865,
"eval_runtime": 66.1835,
"eval_samples_per_second": 151.095,
"eval_steps_per_second": 18.887,
"step": 47800
},
{
"epoch": 106.3989464929304,
"grad_norm": 0.32900717854499817,
"learning_rate": 1.214046822742475e-06,
"loss": 6.375,
"step": 47900
},
{
"epoch": 106.3989464929304,
"eval_loss": 6.375906467437744,
"eval_runtime": 63.8048,
"eval_samples_per_second": 156.728,
"eval_steps_per_second": 19.591,
"step": 47900
},
{
"epoch": 106.62073745494871,
"grad_norm": 0.3241865932941437,
"learning_rate": 1.2040133779264215e-06,
"loss": 6.3792,
"step": 48000
},
{
"epoch": 106.62073745494871,
"eval_loss": 6.37775993347168,
"eval_runtime": 66.3426,
"eval_samples_per_second": 150.733,
"eval_steps_per_second": 18.842,
"step": 48000
},
{
"epoch": 106.84252841696701,
"grad_norm": 0.3194703757762909,
"learning_rate": 1.193979933110368e-06,
"loss": 6.3766,
"step": 48100
},
{
"epoch": 106.84252841696701,
"eval_loss": 6.37912654876709,
"eval_runtime": 63.7236,
"eval_samples_per_second": 156.928,
"eval_steps_per_second": 19.616,
"step": 48100
},
{
"epoch": 107.0643193789853,
"grad_norm": 0.25526002049446106,
"learning_rate": 1.1839464882943144e-06,
"loss": 6.3776,
"step": 48200
},
{
"epoch": 107.0643193789853,
"eval_loss": 6.38245153427124,
"eval_runtime": 66.2659,
"eval_samples_per_second": 150.907,
"eval_steps_per_second": 18.863,
"step": 48200
},
{
"epoch": 107.2861103410036,
"grad_norm": 0.2747518718242645,
"learning_rate": 1.173913043478261e-06,
"loss": 6.3768,
"step": 48300
},
{
"epoch": 107.2861103410036,
"eval_loss": 6.380572319030762,
"eval_runtime": 63.8901,
"eval_samples_per_second": 156.519,
"eval_steps_per_second": 19.565,
"step": 48300
},
{
"epoch": 107.5079013030219,
"grad_norm": 0.2569632828235626,
"learning_rate": 1.1638795986622075e-06,
"loss": 6.3764,
"step": 48400
},
{
"epoch": 107.5079013030219,
"eval_loss": 6.380358695983887,
"eval_runtime": 66.316,
"eval_samples_per_second": 150.793,
"eval_steps_per_second": 18.849,
"step": 48400
},
{
"epoch": 107.7296922650402,
"grad_norm": 0.28270038962364197,
"learning_rate": 1.153846153846154e-06,
"loss": 6.3772,
"step": 48500
},
{
"epoch": 107.7296922650402,
"eval_loss": 6.3787407875061035,
"eval_runtime": 63.7582,
"eval_samples_per_second": 156.842,
"eval_steps_per_second": 19.605,
"step": 48500
},
{
"epoch": 107.9514832270585,
"grad_norm": 0.35361409187316895,
"learning_rate": 1.1438127090301004e-06,
"loss": 6.3754,
"step": 48600
},
{
"epoch": 107.9514832270585,
"eval_loss": 6.37959098815918,
"eval_runtime": 63.698,
"eval_samples_per_second": 156.991,
"eval_steps_per_second": 19.624,
"step": 48600
},
{
"epoch": 108.17327418907679,
"grad_norm": 0.2802847921848297,
"learning_rate": 1.133779264214047e-06,
"loss": 6.375,
"step": 48700
},
{
"epoch": 108.17327418907679,
"eval_loss": 6.376708030700684,
"eval_runtime": 66.263,
"eval_samples_per_second": 150.914,
"eval_steps_per_second": 18.864,
"step": 48700
},
{
"epoch": 108.3950651510951,
"grad_norm": 0.3533788323402405,
"learning_rate": 1.1237458193979933e-06,
"loss": 6.3757,
"step": 48800
},
{
"epoch": 108.3950651510951,
"eval_loss": 6.380278587341309,
"eval_runtime": 67.3818,
"eval_samples_per_second": 148.408,
"eval_steps_per_second": 18.551,
"step": 48800
},
{
"epoch": 108.61685611311339,
"grad_norm": 0.21207566559314728,
"learning_rate": 1.1137123745819398e-06,
"loss": 6.3776,
"step": 48900
},
{
"epoch": 108.61685611311339,
"eval_loss": 6.375850200653076,
"eval_runtime": 63.7895,
"eval_samples_per_second": 156.766,
"eval_steps_per_second": 19.596,
"step": 48900
},
{
"epoch": 108.83864707513169,
"grad_norm": 0.33531099557876587,
"learning_rate": 1.1036789297658862e-06,
"loss": 6.3765,
"step": 49000
},
{
"epoch": 108.83864707513169,
"eval_loss": 6.378798484802246,
"eval_runtime": 63.7683,
"eval_samples_per_second": 156.818,
"eval_steps_per_second": 19.602,
"step": 49000
},
{
"epoch": 109.06043803714999,
"grad_norm": 0.39727288484573364,
"learning_rate": 1.0936454849498327e-06,
"loss": 6.3774,
"step": 49100
},
{
"epoch": 109.06043803714999,
"eval_loss": 6.379205703735352,
"eval_runtime": 66.2384,
"eval_samples_per_second": 150.97,
"eval_steps_per_second": 18.871,
"step": 49100
},
{
"epoch": 109.28222899916828,
"grad_norm": 0.3876926004886627,
"learning_rate": 1.0836120401337793e-06,
"loss": 6.3772,
"step": 49200
},
{
"epoch": 109.28222899916828,
"eval_loss": 6.382777214050293,
"eval_runtime": 63.7163,
"eval_samples_per_second": 156.946,
"eval_steps_per_second": 19.618,
"step": 49200
},
{
"epoch": 109.50401996118659,
"grad_norm": 0.3268238604068756,
"learning_rate": 1.0735785953177258e-06,
"loss": 6.3765,
"step": 49300
},
{
"epoch": 109.50401996118659,
"eval_loss": 6.378788471221924,
"eval_runtime": 66.3254,
"eval_samples_per_second": 150.772,
"eval_steps_per_second": 18.846,
"step": 49300
},
{
"epoch": 109.72581092320488,
"grad_norm": 0.24343077838420868,
"learning_rate": 1.0635451505016722e-06,
"loss": 6.3766,
"step": 49400
},
{
"epoch": 109.72581092320488,
"eval_loss": 6.379393577575684,
"eval_runtime": 63.7485,
"eval_samples_per_second": 156.866,
"eval_steps_per_second": 19.608,
"step": 49400
},
{
"epoch": 109.94760188522318,
"grad_norm": 0.3532174229621887,
"learning_rate": 1.0535117056856187e-06,
"loss": 6.3762,
"step": 49500
},
{
"epoch": 109.94760188522318,
"eval_loss": 6.383326530456543,
"eval_runtime": 63.7304,
"eval_samples_per_second": 156.911,
"eval_steps_per_second": 19.614,
"step": 49500
},
{
"epoch": 110.16939284724147,
"grad_norm": 0.28071361780166626,
"learning_rate": 1.0434782608695653e-06,
"loss": 6.3763,
"step": 49600
},
{
"epoch": 110.16939284724147,
"eval_loss": 6.376327991485596,
"eval_runtime": 66.248,
"eval_samples_per_second": 150.948,
"eval_steps_per_second": 18.868,
"step": 49600
},
{
"epoch": 110.39118380925977,
"grad_norm": 0.3425652086734772,
"learning_rate": 1.0334448160535118e-06,
"loss": 6.3755,
"step": 49700
},
{
"epoch": 110.39118380925977,
"eval_loss": 6.3802337646484375,
"eval_runtime": 63.7015,
"eval_samples_per_second": 156.982,
"eval_steps_per_second": 19.623,
"step": 49700
},
{
"epoch": 110.61297477127808,
"grad_norm": 0.22676917910575867,
"learning_rate": 1.0234113712374581e-06,
"loss": 6.3773,
"step": 49800
},
{
"epoch": 110.61297477127808,
"eval_loss": 6.3807525634765625,
"eval_runtime": 66.2796,
"eval_samples_per_second": 150.876,
"eval_steps_per_second": 18.86,
"step": 49800
},
{
"epoch": 110.83476573329636,
"grad_norm": 0.25897106528282166,
"learning_rate": 1.0133779264214047e-06,
"loss": 6.3768,
"step": 49900
},
{
"epoch": 110.83476573329636,
"eval_loss": 6.381240367889404,
"eval_runtime": 63.8656,
"eval_samples_per_second": 156.579,
"eval_steps_per_second": 19.572,
"step": 49900
},
{
"epoch": 111.05655669531467,
"grad_norm": 0.2521306574344635,
"learning_rate": 1.0033444816053512e-06,
"loss": 6.3748,
"step": 50000
},
{
"epoch": 111.05655669531467,
"eval_loss": 6.379097938537598,
"eval_runtime": 63.7336,
"eval_samples_per_second": 156.903,
"eval_steps_per_second": 19.613,
"step": 50000
},
{
"epoch": 111.27834765733296,
"grad_norm": 0.32774215936660767,
"learning_rate": 9.933110367892976e-07,
"loss": 6.3777,
"step": 50100
},
{
"epoch": 111.27834765733296,
"eval_loss": 6.379392147064209,
"eval_runtime": 66.4051,
"eval_samples_per_second": 150.591,
"eval_steps_per_second": 18.824,
"step": 50100
},
{
"epoch": 111.50013861935126,
"grad_norm": 0.23284611105918884,
"learning_rate": 9.832775919732441e-07,
"loss": 6.3746,
"step": 50200
},
{
"epoch": 111.50013861935126,
"eval_loss": 6.377693176269531,
"eval_runtime": 64.6002,
"eval_samples_per_second": 154.798,
"eval_steps_per_second": 19.35,
"step": 50200
},
{
"epoch": 111.72192958136957,
"grad_norm": 0.2757164537906647,
"learning_rate": 9.732441471571907e-07,
"loss": 6.3743,
"step": 50300
},
{
"epoch": 111.72192958136957,
"eval_loss": 6.38041877746582,
"eval_runtime": 65.5393,
"eval_samples_per_second": 152.58,
"eval_steps_per_second": 19.073,
"step": 50300
},
{
"epoch": 111.94372054338785,
"grad_norm": 0.326815128326416,
"learning_rate": 9.632107023411372e-07,
"loss": 6.3765,
"step": 50400
},
{
"epoch": 111.94372054338785,
"eval_loss": 6.37969970703125,
"eval_runtime": 63.7883,
"eval_samples_per_second": 156.769,
"eval_steps_per_second": 19.596,
"step": 50400
},
{
"epoch": 112.16551150540616,
"grad_norm": 0.34073254466056824,
"learning_rate": 9.531772575250837e-07,
"loss": 6.3758,
"step": 50500
},
{
"epoch": 112.16551150540616,
"eval_loss": 6.380171298980713,
"eval_runtime": 66.2335,
"eval_samples_per_second": 150.981,
"eval_steps_per_second": 18.873,
"step": 50500
},
{
"epoch": 112.38730246742445,
"grad_norm": 0.2289067655801773,
"learning_rate": 9.431438127090301e-07,
"loss": 6.3766,
"step": 50600
},
{
"epoch": 112.38730246742445,
"eval_loss": 6.379415035247803,
"eval_runtime": 63.6851,
"eval_samples_per_second": 157.023,
"eval_steps_per_second": 19.628,
"step": 50600
},
{
"epoch": 112.60909342944275,
"grad_norm": 0.2386418581008911,
"learning_rate": 9.331103678929767e-07,
"loss": 6.375,
"step": 50700
},
{
"epoch": 112.60909342944275,
"eval_loss": 6.375070571899414,
"eval_runtime": 66.2164,
"eval_samples_per_second": 151.02,
"eval_steps_per_second": 18.878,
"step": 50700
},
{
"epoch": 112.83088439146105,
"grad_norm": 0.26779764890670776,
"learning_rate": 9.230769230769231e-07,
"loss": 6.3754,
"step": 50800
},
{
"epoch": 112.83088439146105,
"eval_loss": 6.377529621124268,
"eval_runtime": 63.7216,
"eval_samples_per_second": 156.933,
"eval_steps_per_second": 19.617,
"step": 50800
},
{
"epoch": 113.05267535347934,
"grad_norm": 0.2792610228061676,
"learning_rate": 9.130434782608697e-07,
"loss": 6.3768,
"step": 50900
},
{
"epoch": 113.05267535347934,
"eval_loss": 6.376430988311768,
"eval_runtime": 66.1841,
"eval_samples_per_second": 151.094,
"eval_steps_per_second": 18.887,
"step": 50900
},
{
"epoch": 113.27446631549765,
"grad_norm": 0.26424017548561096,
"learning_rate": 9.030100334448161e-07,
"loss": 6.3748,
"step": 51000
},
{
"epoch": 113.27446631549765,
"eval_loss": 6.37862491607666,
"eval_runtime": 63.7419,
"eval_samples_per_second": 156.883,
"eval_steps_per_second": 19.61,
"step": 51000
},
{
"epoch": 113.49625727751594,
"grad_norm": 0.26083120703697205,
"learning_rate": 8.929765886287627e-07,
"loss": 6.3779,
"step": 51100
},
{
"epoch": 113.49625727751594,
"eval_loss": 6.379500389099121,
"eval_runtime": 66.2253,
"eval_samples_per_second": 151.0,
"eval_steps_per_second": 18.875,
"step": 51100
},
{
"epoch": 113.77626836706405,
"grad_norm": 0.25904449820518494,
"learning_rate": 8.829431438127091e-07,
"loss": 6.3757,
"step": 51200
},
{
"epoch": 113.77626836706405,
"eval_loss": 6.375171661376953,
"eval_runtime": 66.093,
"eval_samples_per_second": 151.302,
"eval_steps_per_second": 18.913,
"step": 51200
},
{
"epoch": 113.99805932908234,
"grad_norm": 0.2680477499961853,
"learning_rate": 8.729096989966555e-07,
"loss": 6.3769,
"step": 51300
},
{
"epoch": 113.99805932908234,
"eval_loss": 6.376518726348877,
"eval_runtime": 63.5204,
"eval_samples_per_second": 157.43,
"eval_steps_per_second": 19.679,
"step": 51300
},
{
"epoch": 114.21985029110064,
"grad_norm": 0.30891552567481995,
"learning_rate": 8.628762541806019e-07,
"loss": 6.3752,
"step": 51400
},
{
"epoch": 114.21985029110064,
"eval_loss": 6.377015590667725,
"eval_runtime": 63.4143,
"eval_samples_per_second": 157.693,
"eval_steps_per_second": 19.712,
"step": 51400
},
{
"epoch": 114.44164125311893,
"grad_norm": 0.32155531644821167,
"learning_rate": 8.528428093645485e-07,
"loss": 6.3767,
"step": 51500
},
{
"epoch": 114.44164125311893,
"eval_loss": 6.377589702606201,
"eval_runtime": 66.1364,
"eval_samples_per_second": 151.203,
"eval_steps_per_second": 18.9,
"step": 51500
},
{
"epoch": 114.66343221513723,
"grad_norm": 0.28316569328308105,
"learning_rate": 8.428093645484949e-07,
"loss": 6.3755,
"step": 51600
},
{
"epoch": 114.66343221513723,
"eval_loss": 6.3766303062438965,
"eval_runtime": 65.9296,
"eval_samples_per_second": 151.677,
"eval_steps_per_second": 18.96,
"step": 51600
},
{
"epoch": 114.88522317715552,
"grad_norm": 0.24125680327415466,
"learning_rate": 8.327759197324414e-07,
"loss": 6.3773,
"step": 51700
},
{
"epoch": 114.88522317715552,
"eval_loss": 6.37697172164917,
"eval_runtime": 65.9478,
"eval_samples_per_second": 151.635,
"eval_steps_per_second": 18.954,
"step": 51700
},
{
"epoch": 115.10701413917383,
"grad_norm": 0.21407043933868408,
"learning_rate": 8.227424749163879e-07,
"loss": 6.3751,
"step": 51800
},
{
"epoch": 115.10701413917383,
"eval_loss": 6.377639293670654,
"eval_runtime": 63.6016,
"eval_samples_per_second": 157.229,
"eval_steps_per_second": 19.654,
"step": 51800
},
{
"epoch": 115.32880510119213,
"grad_norm": 0.23014885187149048,
"learning_rate": 8.127090301003344e-07,
"loss": 6.3771,
"step": 51900
},
{
"epoch": 115.32880510119213,
"eval_loss": 6.380842208862305,
"eval_runtime": 63.4674,
"eval_samples_per_second": 157.561,
"eval_steps_per_second": 19.695,
"step": 51900
},
{
"epoch": 115.55059606321042,
"grad_norm": 0.2553617060184479,
"learning_rate": 8.026755852842809e-07,
"loss": 6.3752,
"step": 52000
},
{
"epoch": 115.55059606321042,
"eval_loss": 6.377804756164551,
"eval_runtime": 64.2492,
"eval_samples_per_second": 155.644,
"eval_steps_per_second": 19.456,
"step": 52000
},
{
"epoch": 115.77238702522872,
"grad_norm": 0.32242822647094727,
"learning_rate": 7.926421404682274e-07,
"loss": 6.3762,
"step": 52100
},
{
"epoch": 115.77238702522872,
"eval_loss": 6.382247447967529,
"eval_runtime": 65.2652,
"eval_samples_per_second": 153.221,
"eval_steps_per_second": 19.153,
"step": 52100
},
{
"epoch": 115.99417798724701,
"grad_norm": 0.25089436769485474,
"learning_rate": 7.826086956521739e-07,
"loss": 6.3757,
"step": 52200
},
{
"epoch": 115.99417798724701,
"eval_loss": 6.379915714263916,
"eval_runtime": 63.3864,
"eval_samples_per_second": 157.763,
"eval_steps_per_second": 19.72,
"step": 52200
},
{
"epoch": 116.21596894926532,
"grad_norm": 0.24113717675209045,
"learning_rate": 7.725752508361204e-07,
"loss": 6.3761,
"step": 52300
},
{
"epoch": 116.21596894926532,
"eval_loss": 6.376662731170654,
"eval_runtime": 63.5361,
"eval_samples_per_second": 157.391,
"eval_steps_per_second": 19.674,
"step": 52300
},
{
"epoch": 116.43775991128362,
"grad_norm": 0.3414776027202606,
"learning_rate": 7.625418060200669e-07,
"loss": 6.3757,
"step": 52400
},
{
"epoch": 116.43775991128362,
"eval_loss": 6.377313137054443,
"eval_runtime": 63.5522,
"eval_samples_per_second": 157.351,
"eval_steps_per_second": 19.669,
"step": 52400
},
{
"epoch": 116.65955087330191,
"grad_norm": 0.24650247395038605,
"learning_rate": 7.525083612040134e-07,
"loss": 6.3754,
"step": 52500
},
{
"epoch": 116.65955087330191,
"eval_loss": 6.37901496887207,
"eval_runtime": 65.8363,
"eval_samples_per_second": 151.892,
"eval_steps_per_second": 18.986,
"step": 52500
},
{
"epoch": 116.88134183532021,
"grad_norm": 0.27944493293762207,
"learning_rate": 7.424749163879599e-07,
"loss": 6.3776,
"step": 52600
},
{
"epoch": 116.88134183532021,
"eval_loss": 6.376550197601318,
"eval_runtime": 63.5812,
"eval_samples_per_second": 157.279,
"eval_steps_per_second": 19.66,
"step": 52600
},
{
"epoch": 117.1031327973385,
"grad_norm": 0.2298879325389862,
"learning_rate": 7.324414715719064e-07,
"loss": 6.3751,
"step": 52700
},
{
"epoch": 117.1031327973385,
"eval_loss": 6.377909183502197,
"eval_runtime": 63.3541,
"eval_samples_per_second": 157.843,
"eval_steps_per_second": 19.73,
"step": 52700
},
{
"epoch": 117.3249237593568,
"grad_norm": 0.25682932138442993,
"learning_rate": 7.224080267558529e-07,
"loss": 6.3757,
"step": 52800
},
{
"epoch": 117.3249237593568,
"eval_loss": 6.378458023071289,
"eval_runtime": 65.7985,
"eval_samples_per_second": 151.979,
"eval_steps_per_second": 18.997,
"step": 52800
},
{
"epoch": 117.54671472137511,
"grad_norm": 0.2633031904697418,
"learning_rate": 7.123745819397994e-07,
"loss": 6.3767,
"step": 52900
},
{
"epoch": 117.54671472137511,
"eval_loss": 6.380926132202148,
"eval_runtime": 63.5491,
"eval_samples_per_second": 157.359,
"eval_steps_per_second": 19.67,
"step": 52900
},
{
"epoch": 117.7685056833934,
"grad_norm": 0.26749059557914734,
"learning_rate": 7.023411371237459e-07,
"loss": 6.3767,
"step": 53000
},
{
"epoch": 117.7685056833934,
"eval_loss": 6.381775856018066,
"eval_runtime": 63.4542,
"eval_samples_per_second": 157.594,
"eval_steps_per_second": 19.699,
"step": 53000
},
{
"epoch": 117.9902966454117,
"grad_norm": 0.22249187529087067,
"learning_rate": 6.923076923076923e-07,
"loss": 6.377,
"step": 53100
},
{
"epoch": 117.9902966454117,
"eval_loss": 6.38169002532959,
"eval_runtime": 63.5488,
"eval_samples_per_second": 157.359,
"eval_steps_per_second": 19.67,
"step": 53100
},
{
"epoch": 118.21208760742999,
"grad_norm": 0.22224722802639008,
"learning_rate": 6.822742474916388e-07,
"loss": 6.3764,
"step": 53200
},
{
"epoch": 118.21208760742999,
"eval_loss": 6.37975549697876,
"eval_runtime": 65.9614,
"eval_samples_per_second": 151.604,
"eval_steps_per_second": 18.95,
"step": 53200
},
{
"epoch": 118.4338785694483,
"grad_norm": 0.2897886037826538,
"learning_rate": 6.722408026755853e-07,
"loss": 6.3737,
"step": 53300
},
{
"epoch": 118.4338785694483,
"eval_loss": 6.376906394958496,
"eval_runtime": 63.536,
"eval_samples_per_second": 157.391,
"eval_steps_per_second": 19.674,
"step": 53300
},
{
"epoch": 118.65566953146659,
"grad_norm": 0.2731805145740509,
"learning_rate": 6.622073578595318e-07,
"loss": 6.3774,
"step": 53400
},
{
"epoch": 118.65566953146659,
"eval_loss": 6.377748489379883,
"eval_runtime": 63.612,
"eval_samples_per_second": 157.203,
"eval_steps_per_second": 19.65,
"step": 53400
},
{
"epoch": 118.87746049348489,
"grad_norm": 0.22697260975837708,
"learning_rate": 6.521739130434783e-07,
"loss": 6.3767,
"step": 53500
},
{
"epoch": 118.87746049348489,
"eval_loss": 6.381230354309082,
"eval_runtime": 65.9156,
"eval_samples_per_second": 151.709,
"eval_steps_per_second": 18.964,
"step": 53500
},
{
"epoch": 119.0992514555032,
"grad_norm": 0.30966779589653015,
"learning_rate": 6.421404682274248e-07,
"loss": 6.376,
"step": 53600
},
{
"epoch": 119.0992514555032,
"eval_loss": 6.37573766708374,
"eval_runtime": 63.3841,
"eval_samples_per_second": 157.768,
"eval_steps_per_second": 19.721,
"step": 53600
},
{
"epoch": 119.32104241752148,
"grad_norm": 0.2676733136177063,
"learning_rate": 6.321070234113712e-07,
"loss": 6.3759,
"step": 53700
},
{
"epoch": 119.32104241752148,
"eval_loss": 6.374691963195801,
"eval_runtime": 63.4737,
"eval_samples_per_second": 157.545,
"eval_steps_per_second": 19.693,
"step": 53700
},
{
"epoch": 119.54283337953979,
"grad_norm": 0.2713070213794708,
"learning_rate": 6.220735785953178e-07,
"loss": 6.3768,
"step": 53800
},
{
"epoch": 119.54283337953979,
"eval_loss": 6.378169059753418,
"eval_runtime": 65.7452,
"eval_samples_per_second": 152.102,
"eval_steps_per_second": 19.013,
"step": 53800
},
{
"epoch": 119.76462434155808,
"grad_norm": 0.2583908140659332,
"learning_rate": 6.120401337792642e-07,
"loss": 6.3756,
"step": 53900
},
{
"epoch": 119.76462434155808,
"eval_loss": 6.380895137786865,
"eval_runtime": 63.7905,
"eval_samples_per_second": 156.763,
"eval_steps_per_second": 19.595,
"step": 53900
},
{
"epoch": 119.98641530357638,
"grad_norm": 0.2636660933494568,
"learning_rate": 6.020066889632107e-07,
"loss": 6.376,
"step": 54000
},
{
"epoch": 119.98641530357638,
"eval_loss": 6.378993034362793,
"eval_runtime": 63.4987,
"eval_samples_per_second": 157.483,
"eval_steps_per_second": 19.685,
"step": 54000
},
{
"epoch": 120.20820626559468,
"grad_norm": 0.2040402740240097,
"learning_rate": 5.919732441471572e-07,
"loss": 6.3742,
"step": 54100
},
{
"epoch": 120.20820626559468,
"eval_loss": 6.379099369049072,
"eval_runtime": 63.4641,
"eval_samples_per_second": 157.569,
"eval_steps_per_second": 19.696,
"step": 54100
},
{
"epoch": 120.42999722761297,
"grad_norm": 0.2771637439727783,
"learning_rate": 5.819397993311037e-07,
"loss": 6.377,
"step": 54200
},
{
"epoch": 120.42999722761297,
"eval_loss": 6.380918025970459,
"eval_runtime": 63.5735,
"eval_samples_per_second": 157.298,
"eval_steps_per_second": 19.662,
"step": 54200
},
{
"epoch": 120.65178818963128,
"grad_norm": 0.2907504141330719,
"learning_rate": 5.719063545150502e-07,
"loss": 6.3771,
"step": 54300
},
{
"epoch": 120.65178818963128,
"eval_loss": 6.379312515258789,
"eval_runtime": 65.9548,
"eval_samples_per_second": 151.619,
"eval_steps_per_second": 18.952,
"step": 54300
},
{
"epoch": 120.87357915164957,
"grad_norm": 0.30987074971199036,
"learning_rate": 5.618729096989966e-07,
"loss": 6.3768,
"step": 54400
},
{
"epoch": 120.87357915164957,
"eval_loss": 6.37892484664917,
"eval_runtime": 63.4967,
"eval_samples_per_second": 157.488,
"eval_steps_per_second": 19.686,
"step": 54400
},
{
"epoch": 121.09537011366787,
"grad_norm": 0.3270675837993622,
"learning_rate": 5.518394648829431e-07,
"loss": 6.376,
"step": 54500
},
{
"epoch": 121.09537011366787,
"eval_loss": 6.377264976501465,
"eval_runtime": 63.4405,
"eval_samples_per_second": 157.628,
"eval_steps_per_second": 19.704,
"step": 54500
},
{
"epoch": 121.31716107568617,
"grad_norm": 0.23159100115299225,
"learning_rate": 5.418060200668896e-07,
"loss": 6.3773,
"step": 54600
},
{
"epoch": 121.31716107568617,
"eval_loss": 6.379176616668701,
"eval_runtime": 66.0387,
"eval_samples_per_second": 151.426,
"eval_steps_per_second": 18.928,
"step": 54600
},
{
"epoch": 121.53895203770446,
"grad_norm": 0.231267511844635,
"learning_rate": 5.317725752508361e-07,
"loss": 6.3773,
"step": 54700
},
{
"epoch": 121.53895203770446,
"eval_loss": 6.376558780670166,
"eval_runtime": 63.5938,
"eval_samples_per_second": 157.248,
"eval_steps_per_second": 19.656,
"step": 54700
},
{
"epoch": 121.76074299972277,
"grad_norm": 0.24276390671730042,
"learning_rate": 5.217391304347826e-07,
"loss": 6.3754,
"step": 54800
},
{
"epoch": 121.76074299972277,
"eval_loss": 6.378441333770752,
"eval_runtime": 63.5257,
"eval_samples_per_second": 157.417,
"eval_steps_per_second": 19.677,
"step": 54800
},
{
"epoch": 121.98253396174105,
"grad_norm": 0.26921290159225464,
"learning_rate": 5.117056856187291e-07,
"loss": 6.3751,
"step": 54900
},
{
"epoch": 121.98253396174105,
"eval_loss": 6.378532886505127,
"eval_runtime": 63.5535,
"eval_samples_per_second": 157.348,
"eval_steps_per_second": 19.668,
"step": 54900
},
{
"epoch": 122.20432492375936,
"grad_norm": 0.274029016494751,
"learning_rate": 5.016722408026756e-07,
"loss": 6.376,
"step": 55000
},
{
"epoch": 122.20432492375936,
"eval_loss": 6.378449440002441,
"eval_runtime": 65.8768,
"eval_samples_per_second": 151.798,
"eval_steps_per_second": 18.975,
"step": 55000
},
{
"epoch": 122.42611588577765,
"grad_norm": 0.27585527300834656,
"learning_rate": 4.916387959866221e-07,
"loss": 6.376,
"step": 55100
},
{
"epoch": 122.42611588577765,
"eval_loss": 6.37809944152832,
"eval_runtime": 63.5221,
"eval_samples_per_second": 157.426,
"eval_steps_per_second": 19.678,
"step": 55100
},
{
"epoch": 122.64790684779595,
"grad_norm": 0.2652019262313843,
"learning_rate": 4.816053511705686e-07,
"loss": 6.3753,
"step": 55200
},
{
"epoch": 122.64790684779595,
"eval_loss": 6.38352632522583,
"eval_runtime": 63.4719,
"eval_samples_per_second": 157.55,
"eval_steps_per_second": 19.694,
"step": 55200
},
{
"epoch": 122.86969780981426,
"grad_norm": 0.24283932149410248,
"learning_rate": 4.7157190635451506e-07,
"loss": 6.3761,
"step": 55300
},
{
"epoch": 122.86969780981426,
"eval_loss": 6.376107215881348,
"eval_runtime": 63.547,
"eval_samples_per_second": 157.364,
"eval_steps_per_second": 19.67,
"step": 55300
},
{
"epoch": 123.09148877183254,
"grad_norm": 0.29150310158729553,
"learning_rate": 4.6153846153846156e-07,
"loss": 6.3765,
"step": 55400
},
{
"epoch": 123.09148877183254,
"eval_loss": 6.37521505355835,
"eval_runtime": 65.9064,
"eval_samples_per_second": 151.73,
"eval_steps_per_second": 18.966,
"step": 55400
},
{
"epoch": 123.31327973385085,
"grad_norm": 0.28435659408569336,
"learning_rate": 4.5150501672240806e-07,
"loss": 6.3757,
"step": 55500
},
{
"epoch": 123.31327973385085,
"eval_loss": 6.378593921661377,
"eval_runtime": 63.5654,
"eval_samples_per_second": 157.318,
"eval_steps_per_second": 19.665,
"step": 55500
},
{
"epoch": 123.53507069586914,
"grad_norm": 0.2412547916173935,
"learning_rate": 4.4147157190635456e-07,
"loss": 6.3757,
"step": 55600
},
{
"epoch": 123.53507069586914,
"eval_loss": 6.377431869506836,
"eval_runtime": 66.0043,
"eval_samples_per_second": 151.505,
"eval_steps_per_second": 18.938,
"step": 55600
},
{
"epoch": 123.75686165788744,
"grad_norm": 0.21835213899612427,
"learning_rate": 4.3143812709030095e-07,
"loss": 6.3763,
"step": 55700
},
{
"epoch": 123.75686165788744,
"eval_loss": 6.378489971160889,
"eval_runtime": 63.7489,
"eval_samples_per_second": 156.865,
"eval_steps_per_second": 19.608,
"step": 55700
},
{
"epoch": 123.97865261990574,
"grad_norm": 0.18911224603652954,
"learning_rate": 4.2140468227424745e-07,
"loss": 6.3754,
"step": 55800
},
{
"epoch": 123.97865261990574,
"eval_loss": 6.379303932189941,
"eval_runtime": 66.1257,
"eval_samples_per_second": 151.227,
"eval_steps_per_second": 18.903,
"step": 55800
},
{
"epoch": 124.20044358192403,
"grad_norm": 0.283447265625,
"learning_rate": 4.1137123745819395e-07,
"loss": 6.3743,
"step": 55900
},
{
"epoch": 124.20044358192403,
"eval_loss": 6.381599426269531,
"eval_runtime": 63.605,
"eval_samples_per_second": 157.22,
"eval_steps_per_second": 19.653,
"step": 55900
},
{
"epoch": 124.42223454394234,
"grad_norm": 0.1898406594991684,
"learning_rate": 4.0133779264214045e-07,
"loss": 6.3755,
"step": 56000
},
{
"epoch": 124.42223454394234,
"eval_loss": 6.376759052276611,
"eval_runtime": 64.3574,
"eval_samples_per_second": 155.382,
"eval_steps_per_second": 19.423,
"step": 56000
},
{
"epoch": 124.64402550596063,
"grad_norm": 0.2740555703639984,
"learning_rate": 3.9130434782608694e-07,
"loss": 6.3767,
"step": 56100
},
{
"epoch": 124.64402550596063,
"eval_loss": 6.377686023712158,
"eval_runtime": 65.4964,
"eval_samples_per_second": 152.68,
"eval_steps_per_second": 19.085,
"step": 56100
},
{
"epoch": 124.86581646797893,
"grad_norm": 0.24969562888145447,
"learning_rate": 3.8127090301003344e-07,
"loss": 6.3749,
"step": 56200
},
{
"epoch": 124.86581646797893,
"eval_loss": 6.3803300857543945,
"eval_runtime": 63.6262,
"eval_samples_per_second": 157.168,
"eval_steps_per_second": 19.646,
"step": 56200
},
{
"epoch": 125.08760742999723,
"grad_norm": 0.271085649728775,
"learning_rate": 3.7123745819397994e-07,
"loss": 6.3761,
"step": 56300
},
{
"epoch": 125.08760742999723,
"eval_loss": 6.377999782562256,
"eval_runtime": 63.5511,
"eval_samples_per_second": 157.354,
"eval_steps_per_second": 19.669,
"step": 56300
},
{
"epoch": 125.30939839201552,
"grad_norm": 0.2341337651014328,
"learning_rate": 3.6120401337792644e-07,
"loss": 6.3787,
"step": 56400
},
{
"epoch": 125.30939839201552,
"eval_loss": 6.377155780792236,
"eval_runtime": 66.011,
"eval_samples_per_second": 151.49,
"eval_steps_per_second": 18.936,
"step": 56400
},
{
"epoch": 125.53118935403383,
"grad_norm": 0.2656327784061432,
"learning_rate": 3.5117056856187294e-07,
"loss": 6.3742,
"step": 56500
},
{
"epoch": 125.53118935403383,
"eval_loss": 6.378920078277588,
"eval_runtime": 63.6517,
"eval_samples_per_second": 157.105,
"eval_steps_per_second": 19.638,
"step": 56500
},
{
"epoch": 125.75298031605212,
"grad_norm": 0.261843204498291,
"learning_rate": 3.411371237458194e-07,
"loss": 6.3742,
"step": 56600
},
{
"epoch": 125.75298031605212,
"eval_loss": 6.376353740692139,
"eval_runtime": 65.896,
"eval_samples_per_second": 151.754,
"eval_steps_per_second": 18.969,
"step": 56600
},
{
"epoch": 125.97477127807042,
"grad_norm": 0.27163127064704895,
"learning_rate": 3.311036789297659e-07,
"loss": 6.3765,
"step": 56700
},
{
"epoch": 125.97477127807042,
"eval_loss": 6.3804826736450195,
"eval_runtime": 63.514,
"eval_samples_per_second": 157.446,
"eval_steps_per_second": 19.681,
"step": 56700
},
{
"epoch": 126.19656224008871,
"grad_norm": 0.2797481417655945,
"learning_rate": 3.210702341137124e-07,
"loss": 6.3764,
"step": 56800
},
{
"epoch": 126.19656224008871,
"eval_loss": 6.378259658813477,
"eval_runtime": 63.4475,
"eval_samples_per_second": 157.611,
"eval_steps_per_second": 19.701,
"step": 56800
},
{
"epoch": 126.41835320210701,
"grad_norm": 0.21093739569187164,
"learning_rate": 3.110367892976589e-07,
"loss": 6.3764,
"step": 56900
},
{
"epoch": 126.41835320210701,
"eval_loss": 6.378982067108154,
"eval_runtime": 66.045,
"eval_samples_per_second": 151.412,
"eval_steps_per_second": 18.927,
"step": 56900
},
{
"epoch": 126.64014416412532,
"grad_norm": 0.268632173538208,
"learning_rate": 3.010033444816054e-07,
"loss": 6.3762,
"step": 57000
},
{
"epoch": 126.64014416412532,
"eval_loss": 6.379413604736328,
"eval_runtime": 63.641,
"eval_samples_per_second": 157.131,
"eval_steps_per_second": 19.641,
"step": 57000
},
{
"epoch": 126.8619351261436,
"grad_norm": 0.2878783047199249,
"learning_rate": 2.9096989966555187e-07,
"loss": 6.376,
"step": 57100
},
{
"epoch": 126.8619351261436,
"eval_loss": 6.378924369812012,
"eval_runtime": 66.1831,
"eval_samples_per_second": 151.096,
"eval_steps_per_second": 18.887,
"step": 57100
},
{
"epoch": 127.08372608816191,
"grad_norm": 0.2618252635002136,
"learning_rate": 2.809364548494983e-07,
"loss": 6.3768,
"step": 57200
},
{
"epoch": 127.08372608816191,
"eval_loss": 6.37802267074585,
"eval_runtime": 63.5424,
"eval_samples_per_second": 157.375,
"eval_steps_per_second": 19.672,
"step": 57200
},
{
"epoch": 127.3055170501802,
"grad_norm": 0.20790652930736542,
"learning_rate": 2.709030100334448e-07,
"loss": 6.3763,
"step": 57300
},
{
"epoch": 127.3055170501802,
"eval_loss": 6.377635955810547,
"eval_runtime": 66.2394,
"eval_samples_per_second": 150.967,
"eval_steps_per_second": 18.871,
"step": 57300
},
{
"epoch": 127.5273080121985,
"grad_norm": 0.23446954786777496,
"learning_rate": 2.608695652173913e-07,
"loss": 6.3758,
"step": 57400
},
{
"epoch": 127.5273080121985,
"eval_loss": 6.378016471862793,
"eval_runtime": 63.7187,
"eval_samples_per_second": 156.94,
"eval_steps_per_second": 19.617,
"step": 57400
},
{
"epoch": 127.7490989742168,
"grad_norm": 0.2730012536048889,
"learning_rate": 2.508361204013378e-07,
"loss": 6.3771,
"step": 57500
},
{
"epoch": 127.7490989742168,
"eval_loss": 6.378283500671387,
"eval_runtime": 66.0326,
"eval_samples_per_second": 151.44,
"eval_steps_per_second": 18.93,
"step": 57500
},
{
"epoch": 127.9708899362351,
"grad_norm": 0.19740967452526093,
"learning_rate": 2.408026755852843e-07,
"loss": 6.3754,
"step": 57600
},
{
"epoch": 127.9708899362351,
"eval_loss": 6.377573490142822,
"eval_runtime": 68.5433,
"eval_samples_per_second": 145.893,
"eval_steps_per_second": 18.237,
"step": 57600
},
{
"epoch": 128.1926808982534,
"grad_norm": 0.20099857449531555,
"learning_rate": 2.3076923076923078e-07,
"loss": 6.3763,
"step": 57700
},
{
"epoch": 128.1926808982534,
"eval_loss": 6.380809783935547,
"eval_runtime": 63.6372,
"eval_samples_per_second": 157.141,
"eval_steps_per_second": 19.643,
"step": 57700
},
{
"epoch": 128.4144718602717,
"grad_norm": 0.26378223299980164,
"learning_rate": 2.2073578595317728e-07,
"loss": 6.3742,
"step": 57800
},
{
"epoch": 128.4144718602717,
"eval_loss": 6.377455234527588,
"eval_runtime": 63.6147,
"eval_samples_per_second": 157.196,
"eval_steps_per_second": 19.65,
"step": 57800
},
{
"epoch": 128.63626282228998,
"grad_norm": 0.22778332233428955,
"learning_rate": 2.1070234113712372e-07,
"loss": 6.3757,
"step": 57900
},
{
"epoch": 128.63626282228998,
"eval_loss": 6.376725196838379,
"eval_runtime": 63.6324,
"eval_samples_per_second": 157.153,
"eval_steps_per_second": 19.644,
"step": 57900
},
{
"epoch": 128.85805378430828,
"grad_norm": 0.25024932622909546,
"learning_rate": 2.0066889632107022e-07,
"loss": 6.3767,
"step": 58000
},
{
"epoch": 128.85805378430828,
"eval_loss": 6.378956317901611,
"eval_runtime": 66.0444,
"eval_samples_per_second": 151.413,
"eval_steps_per_second": 18.927,
"step": 58000
},
{
"epoch": 129.0798447463266,
"grad_norm": 0.22629129886627197,
"learning_rate": 1.9063545150501672e-07,
"loss": 6.3751,
"step": 58100
},
{
"epoch": 129.0798447463266,
"eval_loss": 6.378350734710693,
"eval_runtime": 63.6424,
"eval_samples_per_second": 157.128,
"eval_steps_per_second": 19.641,
"step": 58100
},
{
"epoch": 129.3016357083449,
"grad_norm": 0.22958730161190033,
"learning_rate": 1.8060200668896322e-07,
"loss": 6.3754,
"step": 58200
},
{
"epoch": 129.3016357083449,
"eval_loss": 6.379317760467529,
"eval_runtime": 66.1349,
"eval_samples_per_second": 151.206,
"eval_steps_per_second": 18.901,
"step": 58200
},
{
"epoch": 129.5234266703632,
"grad_norm": 0.29147765040397644,
"learning_rate": 1.705685618729097e-07,
"loss": 6.3766,
"step": 58300
},
{
"epoch": 129.5234266703632,
"eval_loss": 6.379565238952637,
"eval_runtime": 63.6308,
"eval_samples_per_second": 157.157,
"eval_steps_per_second": 19.645,
"step": 58300
},
{
"epoch": 129.74521763238147,
"grad_norm": 0.2274588942527771,
"learning_rate": 1.605351170568562e-07,
"loss": 6.3766,
"step": 58400
},
{
"epoch": 129.74521763238147,
"eval_loss": 6.378822326660156,
"eval_runtime": 63.7248,
"eval_samples_per_second": 156.925,
"eval_steps_per_second": 19.616,
"step": 58400
},
{
"epoch": 129.96700859439977,
"grad_norm": 0.27082857489585876,
"learning_rate": 1.505016722408027e-07,
"loss": 6.3762,
"step": 58500
},
{
"epoch": 129.96700859439977,
"eval_loss": 6.376942157745361,
"eval_runtime": 66.2694,
"eval_samples_per_second": 150.899,
"eval_steps_per_second": 18.862,
"step": 58500
},
{
"epoch": 130.18879955641808,
"grad_norm": 0.2117777317762375,
"learning_rate": 1.4046822742474916e-07,
"loss": 6.3756,
"step": 58600
},
{
"epoch": 130.18879955641808,
"eval_loss": 6.381185054779053,
"eval_runtime": 63.6203,
"eval_samples_per_second": 157.183,
"eval_steps_per_second": 19.648,
"step": 58600
},
{
"epoch": 130.41059051843638,
"grad_norm": 0.244340181350708,
"learning_rate": 1.3043478260869566e-07,
"loss": 6.3746,
"step": 58700
},
{
"epoch": 130.41059051843638,
"eval_loss": 6.378442764282227,
"eval_runtime": 63.6467,
"eval_samples_per_second": 157.117,
"eval_steps_per_second": 19.64,
"step": 58700
},
{
"epoch": 130.63238148045468,
"grad_norm": 0.23617205023765564,
"learning_rate": 1.2040133779264215e-07,
"loss": 6.3759,
"step": 58800
},
{
"epoch": 130.63238148045468,
"eval_loss": 6.377311706542969,
"eval_runtime": 66.2898,
"eval_samples_per_second": 150.853,
"eval_steps_per_second": 18.857,
"step": 58800
},
{
"epoch": 130.85417244247296,
"grad_norm": 0.22402510046958923,
"learning_rate": 1.1036789297658864e-07,
"loss": 6.3766,
"step": 58900
},
{
"epoch": 130.85417244247296,
"eval_loss": 6.378325939178467,
"eval_runtime": 63.7783,
"eval_samples_per_second": 156.793,
"eval_steps_per_second": 19.599,
"step": 58900
},
{
"epoch": 131.07596340449126,
"grad_norm": 0.22382721304893494,
"learning_rate": 1.0033444816053511e-07,
"loss": 6.377,
"step": 59000
},
{
"epoch": 131.07596340449126,
"eval_loss": 6.375909328460693,
"eval_runtime": 63.6862,
"eval_samples_per_second": 157.02,
"eval_steps_per_second": 19.627,
"step": 59000
},
{
"epoch": 131.29775436650957,
"grad_norm": 0.2319914549589157,
"learning_rate": 9.030100334448161e-08,
"loss": 6.3759,
"step": 59100
},
{
"epoch": 131.29775436650957,
"eval_loss": 6.380961894989014,
"eval_runtime": 63.73,
"eval_samples_per_second": 156.912,
"eval_steps_per_second": 19.614,
"step": 59100
},
{
"epoch": 131.51954532852787,
"grad_norm": 0.27138957381248474,
"learning_rate": 8.02675585284281e-08,
"loss": 6.3765,
"step": 59200
},
{
"epoch": 131.51954532852787,
"eval_loss": 6.378270626068115,
"eval_runtime": 66.164,
"eval_samples_per_second": 151.14,
"eval_steps_per_second": 18.892,
"step": 59200
},
{
"epoch": 131.74133629054617,
"grad_norm": 0.24163523316383362,
"learning_rate": 7.023411371237458e-08,
"loss": 6.3758,
"step": 59300
},
{
"epoch": 131.74133629054617,
"eval_loss": 6.379899024963379,
"eval_runtime": 66.2406,
"eval_samples_per_second": 150.965,
"eval_steps_per_second": 18.871,
"step": 59300
},
{
"epoch": 131.96312725256445,
"grad_norm": 0.20410296320915222,
"learning_rate": 6.020066889632108e-08,
"loss": 6.3753,
"step": 59400
},
{
"epoch": 131.96312725256445,
"eval_loss": 6.378077983856201,
"eval_runtime": 63.7013,
"eval_samples_per_second": 156.983,
"eval_steps_per_second": 19.623,
"step": 59400
},
{
"epoch": 132.18491821458275,
"grad_norm": 0.15991632640361786,
"learning_rate": 5.0167224080267556e-08,
"loss": 6.3762,
"step": 59500
},
{
"epoch": 132.18491821458275,
"eval_loss": 6.379003524780273,
"eval_runtime": 63.6773,
"eval_samples_per_second": 157.042,
"eval_steps_per_second": 19.63,
"step": 59500
},
{
"epoch": 132.40670917660105,
"grad_norm": 0.2014060765504837,
"learning_rate": 4.013377926421405e-08,
"loss": 6.3734,
"step": 59600
},
{
"epoch": 132.40670917660105,
"eval_loss": 6.377279758453369,
"eval_runtime": 64.9426,
"eval_samples_per_second": 153.982,
"eval_steps_per_second": 19.248,
"step": 59600
},
{
"epoch": 132.62850013861936,
"grad_norm": 0.23493210971355438,
"learning_rate": 3.010033444816054e-08,
"loss": 6.3767,
"step": 59700
},
{
"epoch": 132.62850013861936,
"eval_loss": 6.378801345825195,
"eval_runtime": 65.0941,
"eval_samples_per_second": 153.624,
"eval_steps_per_second": 19.203,
"step": 59700
},
{
"epoch": 132.85029110063766,
"grad_norm": 0.2207670956850052,
"learning_rate": 2.0066889632107024e-08,
"loss": 6.3764,
"step": 59800
},
{
"epoch": 132.85029110063766,
"eval_loss": 6.377054691314697,
"eval_runtime": 63.7133,
"eval_samples_per_second": 156.953,
"eval_steps_per_second": 19.619,
"step": 59800
},
{
"epoch": 133.07208206265594,
"grad_norm": 0.21483196318149567,
"learning_rate": 1.0033444816053512e-08,
"loss": 6.3763,
"step": 59900
},
{
"epoch": 133.07208206265594,
"eval_loss": 6.3776984214782715,
"eval_runtime": 63.6217,
"eval_samples_per_second": 157.179,
"eval_steps_per_second": 19.647,
"step": 59900
},
{
"epoch": 133.29387302467424,
"grad_norm": 0.1953832507133484,
"learning_rate": 0.0,
"loss": 6.3751,
"step": 60000
},
{
"epoch": 133.29387302467424,
"eval_loss": 6.377795219421387,
"eval_runtime": 66.2186,
"eval_samples_per_second": 151.015,
"eval_steps_per_second": 18.877,
"step": 60000
}
],
"logging_steps": 100,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 134,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 10
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.020754951164035e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}