gpn-msa-model-h1 / trainer_state.json
pl593's picture
upload trained GPN MSA model
a0c756c verified
{
"best_metric": 0.16085075220051892,
"best_model_checkpoint": "checkpoints/checkpoint-6750",
"epoch": 4.6360686138154845,
"eval_steps": 50,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023180343069077423,
"grad_norm": 0.08667636662721634,
"learning_rate": 5e-05,
"loss": 0.6255,
"step": 50
},
{
"epoch": 0.023180343069077423,
"eval_loss": 0.2028235954199523,
"eval_runtime": 58.9481,
"eval_samples_per_second": 705.468,
"eval_steps_per_second": 0.356,
"step": 50
},
{
"epoch": 0.04636068613815485,
"grad_norm": 0.0631365031003952,
"learning_rate": 0.0001,
"loss": 0.1532,
"step": 100
},
{
"epoch": 0.04636068613815485,
"eval_loss": 0.1865187252730081,
"eval_runtime": 59.7995,
"eval_samples_per_second": 695.424,
"eval_steps_per_second": 0.351,
"step": 100
},
{
"epoch": 0.06954102920723226,
"grad_norm": 0.11565029621124268,
"learning_rate": 9.999370638369377e-05,
"loss": 0.1448,
"step": 150
},
{
"epoch": 0.06954102920723226,
"eval_loss": 0.1811586473052961,
"eval_runtime": 59.448,
"eval_samples_per_second": 699.536,
"eval_steps_per_second": 0.353,
"step": 150
},
{
"epoch": 0.0927213722763097,
"grad_norm": 0.12287624180316925,
"learning_rate": 9.997482711915927e-05,
"loss": 0.1431,
"step": 200
},
{
"epoch": 0.0927213722763097,
"eval_loss": 0.17856710216099736,
"eval_runtime": 59.6811,
"eval_samples_per_second": 696.803,
"eval_steps_per_second": 0.352,
"step": 200
},
{
"epoch": 0.11590171534538711,
"grad_norm": 0.1239687129855156,
"learning_rate": 9.99433669591504e-05,
"loss": 0.1423,
"step": 250
},
{
"epoch": 0.11590171534538711,
"eval_loss": 0.17694150013393828,
"eval_runtime": 59.3601,
"eval_samples_per_second": 700.572,
"eval_steps_per_second": 0.354,
"step": 250
},
{
"epoch": 0.13908205841446453,
"grad_norm": 0.08375083655118942,
"learning_rate": 9.989933382359422e-05,
"loss": 0.1413,
"step": 300
},
{
"epoch": 0.13908205841446453,
"eval_loss": 0.17530383900746715,
"eval_runtime": 59.6335,
"eval_samples_per_second": 697.359,
"eval_steps_per_second": 0.352,
"step": 300
},
{
"epoch": 0.16226240148354196,
"grad_norm": 0.07470008730888367,
"learning_rate": 9.984273879759713e-05,
"loss": 0.1391,
"step": 350
},
{
"epoch": 0.16226240148354196,
"eval_loss": 0.17539924518994512,
"eval_runtime": 59.2699,
"eval_samples_per_second": 701.638,
"eval_steps_per_second": 0.354,
"step": 350
},
{
"epoch": 0.1854427445526194,
"grad_norm": 0.11057748645544052,
"learning_rate": 9.977359612865423e-05,
"loss": 0.1398,
"step": 400
},
{
"epoch": 0.1854427445526194,
"eval_loss": 0.17613958094562268,
"eval_runtime": 59.3258,
"eval_samples_per_second": 700.977,
"eval_steps_per_second": 0.354,
"step": 400
},
{
"epoch": 0.2086230876216968,
"grad_norm": 0.10229019820690155,
"learning_rate": 9.969192322306271e-05,
"loss": 0.1398,
"step": 450
},
{
"epoch": 0.2086230876216968,
"eval_loss": 0.17319489228196833,
"eval_runtime": 59.4596,
"eval_samples_per_second": 699.399,
"eval_steps_per_second": 0.353,
"step": 450
},
{
"epoch": 0.23180343069077422,
"grad_norm": 0.10784970223903656,
"learning_rate": 9.959774064153977e-05,
"loss": 0.1384,
"step": 500
},
{
"epoch": 0.23180343069077422,
"eval_loss": 0.17334065558523068,
"eval_runtime": 59.5777,
"eval_samples_per_second": 698.012,
"eval_steps_per_second": 0.352,
"step": 500
},
{
"epoch": 0.25498377375985165,
"grad_norm": 0.08434706926345825,
"learning_rate": 9.949107209404665e-05,
"loss": 0.1386,
"step": 550
},
{
"epoch": 0.25498377375985165,
"eval_loss": 0.17196178719739552,
"eval_runtime": 59.6138,
"eval_samples_per_second": 697.59,
"eval_steps_per_second": 0.352,
"step": 550
},
{
"epoch": 0.27816411682892905,
"grad_norm": 0.08924778550863266,
"learning_rate": 9.937194443381972e-05,
"loss": 0.1377,
"step": 600
},
{
"epoch": 0.27816411682892905,
"eval_loss": 0.1740634102700707,
"eval_runtime": 59.422,
"eval_samples_per_second": 699.842,
"eval_steps_per_second": 0.353,
"step": 600
},
{
"epoch": 0.3013444598980065,
"grad_norm": 0.15332703292369843,
"learning_rate": 9.924038765061042e-05,
"loss": 0.1372,
"step": 650
},
{
"epoch": 0.3013444598980065,
"eval_loss": 0.17391025863974857,
"eval_runtime": 59.5007,
"eval_samples_per_second": 698.917,
"eval_steps_per_second": 0.353,
"step": 650
},
{
"epoch": 0.3245248029670839,
"grad_norm": 0.08973913639783859,
"learning_rate": 9.909643486313533e-05,
"loss": 0.1374,
"step": 700
},
{
"epoch": 0.3245248029670839,
"eval_loss": 0.17245501519579134,
"eval_runtime": 59.2504,
"eval_samples_per_second": 701.868,
"eval_steps_per_second": 0.354,
"step": 700
},
{
"epoch": 0.3477051460361613,
"grad_norm": 0.07252663373947144,
"learning_rate": 9.894012231073894e-05,
"loss": 0.1378,
"step": 750
},
{
"epoch": 0.3477051460361613,
"eval_loss": 0.1731146153162719,
"eval_runtime": 59.6278,
"eval_samples_per_second": 697.426,
"eval_steps_per_second": 0.352,
"step": 750
},
{
"epoch": 0.3708854891052388,
"grad_norm": 0.09351957589387894,
"learning_rate": 9.877148934427037e-05,
"loss": 0.1371,
"step": 800
},
{
"epoch": 0.3708854891052388,
"eval_loss": 0.17056697699015605,
"eval_runtime": 59.4338,
"eval_samples_per_second": 699.703,
"eval_steps_per_second": 0.353,
"step": 800
},
{
"epoch": 0.3940658321743162,
"grad_norm": 0.06937623023986816,
"learning_rate": 9.859057841617709e-05,
"loss": 0.1364,
"step": 850
},
{
"epoch": 0.3940658321743162,
"eval_loss": 0.1730773180756858,
"eval_runtime": 59.2237,
"eval_samples_per_second": 702.185,
"eval_steps_per_second": 0.355,
"step": 850
},
{
"epoch": 0.4172461752433936,
"grad_norm": 0.1241346001625061,
"learning_rate": 9.839743506981782e-05,
"loss": 0.1382,
"step": 900
},
{
"epoch": 0.4172461752433936,
"eval_loss": 0.17300324635270986,
"eval_runtime": 59.1648,
"eval_samples_per_second": 702.884,
"eval_steps_per_second": 0.355,
"step": 900
},
{
"epoch": 0.44042651831247104,
"grad_norm": 0.0649554654955864,
"learning_rate": 9.819210792799712e-05,
"loss": 0.1369,
"step": 950
},
{
"epoch": 0.44042651831247104,
"eval_loss": 0.17298936761230632,
"eval_runtime": 59.4593,
"eval_samples_per_second": 699.402,
"eval_steps_per_second": 0.353,
"step": 950
},
{
"epoch": 0.46360686138154844,
"grad_norm": 0.07767663151025772,
"learning_rate": 9.797464868072488e-05,
"loss": 0.1373,
"step": 1000
},
{
"epoch": 0.46360686138154844,
"eval_loss": 0.1722117168758624,
"eval_runtime": 59.4433,
"eval_samples_per_second": 699.592,
"eval_steps_per_second": 0.353,
"step": 1000
},
{
"epoch": 0.48678720445062584,
"grad_norm": 0.09637939929962158,
"learning_rate": 9.77451120722037e-05,
"loss": 0.1357,
"step": 1050
},
{
"epoch": 0.48678720445062584,
"eval_loss": 0.17295359261954948,
"eval_runtime": 59.0076,
"eval_samples_per_second": 704.757,
"eval_steps_per_second": 0.356,
"step": 1050
},
{
"epoch": 0.5099675475197033,
"grad_norm": 0.0731373056769371,
"learning_rate": 9.750355588704727e-05,
"loss": 0.135,
"step": 1100
},
{
"epoch": 0.5099675475197033,
"eval_loss": 0.1715334055521701,
"eval_runtime": 59.0167,
"eval_samples_per_second": 704.648,
"eval_steps_per_second": 0.356,
"step": 1100
},
{
"epoch": 0.5331478905887808,
"grad_norm": 0.1365990936756134,
"learning_rate": 9.725004093573342e-05,
"loss": 0.1357,
"step": 1150
},
{
"epoch": 0.5331478905887808,
"eval_loss": 0.17017831764477356,
"eval_runtime": 59.0779,
"eval_samples_per_second": 703.918,
"eval_steps_per_second": 0.355,
"step": 1150
},
{
"epoch": 0.5563282336578581,
"grad_norm": 0.07747852057218552,
"learning_rate": 9.698463103929542e-05,
"loss": 0.1366,
"step": 1200
},
{
"epoch": 0.5563282336578581,
"eval_loss": 0.17079754339969364,
"eval_runtime": 59.1474,
"eval_samples_per_second": 703.091,
"eval_steps_per_second": 0.355,
"step": 1200
},
{
"epoch": 0.5795085767269356,
"grad_norm": 0.08369060605764389,
"learning_rate": 9.670739301325534e-05,
"loss": 0.1352,
"step": 1250
},
{
"epoch": 0.5795085767269356,
"eval_loss": 0.17218272966053694,
"eval_runtime": 59.4772,
"eval_samples_per_second": 699.192,
"eval_steps_per_second": 0.353,
"step": 1250
},
{
"epoch": 0.602688919796013,
"grad_norm": 0.15560708940029144,
"learning_rate": 9.641839665080363e-05,
"loss": 0.1366,
"step": 1300
},
{
"epoch": 0.602688919796013,
"eval_loss": 0.1698094484306934,
"eval_runtime": 59.4226,
"eval_samples_per_second": 699.835,
"eval_steps_per_second": 0.353,
"step": 1300
},
{
"epoch": 0.6258692628650904,
"grad_norm": 0.1404338777065277,
"learning_rate": 9.611771470522908e-05,
"loss": 0.1353,
"step": 1350
},
{
"epoch": 0.6258692628650904,
"eval_loss": 0.17023876656477224,
"eval_runtime": 59.3422,
"eval_samples_per_second": 700.783,
"eval_steps_per_second": 0.354,
"step": 1350
},
{
"epoch": 0.6490496059341678,
"grad_norm": 0.07887144386768341,
"learning_rate": 9.580542287160348e-05,
"loss": 0.1363,
"step": 1400
},
{
"epoch": 0.6490496059341678,
"eval_loss": 0.1706377184753332,
"eval_runtime": 59.2598,
"eval_samples_per_second": 701.758,
"eval_steps_per_second": 0.354,
"step": 1400
},
{
"epoch": 0.6722299490032453,
"grad_norm": 0.09286168217658997,
"learning_rate": 9.548159976772592e-05,
"loss": 0.1362,
"step": 1450
},
{
"epoch": 0.6722299490032453,
"eval_loss": 0.16891294843072946,
"eval_runtime": 59.4024,
"eval_samples_per_second": 700.073,
"eval_steps_per_second": 0.354,
"step": 1450
},
{
"epoch": 0.6954102920723226,
"grad_norm": 0.08167006820440292,
"learning_rate": 9.514632691433107e-05,
"loss": 0.1345,
"step": 1500
},
{
"epoch": 0.6954102920723226,
"eval_loss": 0.16790113662592512,
"eval_runtime": 60.0378,
"eval_samples_per_second": 692.664,
"eval_steps_per_second": 0.35,
"step": 1500
},
{
"epoch": 0.7185906351414001,
"grad_norm": 0.09860191494226456,
"learning_rate": 9.479968871456679e-05,
"loss": 0.1355,
"step": 1550
},
{
"epoch": 0.7185906351414001,
"eval_loss": 0.16903206921067584,
"eval_runtime": 59.5789,
"eval_samples_per_second": 697.999,
"eval_steps_per_second": 0.352,
"step": 1550
},
{
"epoch": 0.7417709782104775,
"grad_norm": 0.06466613709926605,
"learning_rate": 9.444177243274618e-05,
"loss": 0.135,
"step": 1600
},
{
"epoch": 0.7417709782104775,
"eval_loss": 0.1680566343999807,
"eval_runtime": 59.5911,
"eval_samples_per_second": 697.856,
"eval_steps_per_second": 0.352,
"step": 1600
},
{
"epoch": 0.7649513212795549,
"grad_norm": 0.07864313572645187,
"learning_rate": 9.407266817237911e-05,
"loss": 0.1348,
"step": 1650
},
{
"epoch": 0.7649513212795549,
"eval_loss": 0.16721375296765553,
"eval_runtime": 59.4289,
"eval_samples_per_second": 699.76,
"eval_steps_per_second": 0.353,
"step": 1650
},
{
"epoch": 0.7881316643486324,
"grad_norm": 0.09288563579320908,
"learning_rate": 9.369246885348926e-05,
"loss": 0.1343,
"step": 1700
},
{
"epoch": 0.7881316643486324,
"eval_loss": 0.16728526898731283,
"eval_runtime": 59.4209,
"eval_samples_per_second": 699.855,
"eval_steps_per_second": 0.353,
"step": 1700
},
{
"epoch": 0.8113120074177098,
"grad_norm": 0.1111670434474945,
"learning_rate": 9.330127018922194e-05,
"loss": 0.1342,
"step": 1750
},
{
"epoch": 0.8113120074177098,
"eval_loss": 0.1692570258991495,
"eval_runtime": 59.3557,
"eval_samples_per_second": 700.624,
"eval_steps_per_second": 0.354,
"step": 1750
},
{
"epoch": 0.8344923504867872,
"grad_norm": 0.06098225340247154,
"learning_rate": 9.289917066174886e-05,
"loss": 0.1334,
"step": 1800
},
{
"epoch": 0.8344923504867872,
"eval_loss": 0.16652011733605857,
"eval_runtime": 59.527,
"eval_samples_per_second": 698.607,
"eval_steps_per_second": 0.353,
"step": 1800
},
{
"epoch": 0.8576726935558646,
"grad_norm": 0.11042412370443344,
"learning_rate": 9.248627149747573e-05,
"loss": 0.136,
"step": 1850
},
{
"epoch": 0.8576726935558646,
"eval_loss": 0.16714222769914375,
"eval_runtime": 59.3645,
"eval_samples_per_second": 700.519,
"eval_steps_per_second": 0.354,
"step": 1850
},
{
"epoch": 0.8808530366249421,
"grad_norm": 0.09495564550161362,
"learning_rate": 9.206267664155907e-05,
"loss": 0.1349,
"step": 1900
},
{
"epoch": 0.8808530366249421,
"eval_loss": 0.1690081783682819,
"eval_runtime": 59.2609,
"eval_samples_per_second": 701.744,
"eval_steps_per_second": 0.354,
"step": 1900
},
{
"epoch": 0.9040333796940194,
"grad_norm": 0.08535555005073547,
"learning_rate": 9.162849273173857e-05,
"loss": 0.1345,
"step": 1950
},
{
"epoch": 0.9040333796940194,
"eval_loss": 0.16719838296653933,
"eval_runtime": 59.4328,
"eval_samples_per_second": 699.714,
"eval_steps_per_second": 0.353,
"step": 1950
},
{
"epoch": 0.9272137227630969,
"grad_norm": 0.08415450155735016,
"learning_rate": 9.118382907149165e-05,
"loss": 0.1332,
"step": 2000
},
{
"epoch": 0.9272137227630969,
"eval_loss": 0.16692495198886095,
"eval_runtime": 59.4174,
"eval_samples_per_second": 699.895,
"eval_steps_per_second": 0.353,
"step": 2000
},
{
"epoch": 0.9503940658321743,
"grad_norm": 0.07792109996080399,
"learning_rate": 9.072879760251679e-05,
"loss": 0.1349,
"step": 2050
},
{
"epoch": 0.9503940658321743,
"eval_loss": 0.16853327133732582,
"eval_runtime": 59.3211,
"eval_samples_per_second": 701.032,
"eval_steps_per_second": 0.354,
"step": 2050
},
{
"epoch": 0.9735744089012517,
"grad_norm": 0.09134557843208313,
"learning_rate": 9.026351287655294e-05,
"loss": 0.1355,
"step": 2100
},
{
"epoch": 0.9735744089012517,
"eval_loss": 0.16782760284485718,
"eval_runtime": 59.1119,
"eval_samples_per_second": 703.513,
"eval_steps_per_second": 0.355,
"step": 2100
},
{
"epoch": 0.9967547519703291,
"grad_norm": 0.11134419590234756,
"learning_rate": 8.978809202654162e-05,
"loss": 0.134,
"step": 2150
},
{
"epoch": 0.9967547519703291,
"eval_loss": 0.1670381695935501,
"eval_runtime": 59.4602,
"eval_samples_per_second": 699.393,
"eval_steps_per_second": 0.353,
"step": 2150
},
{
"epoch": 1.0199350950394066,
"grad_norm": 0.08943980187177658,
"learning_rate": 8.930265473713938e-05,
"loss": 0.1345,
"step": 2200
},
{
"epoch": 1.0199350950394066,
"eval_loss": 0.16720885753257103,
"eval_runtime": 59.8457,
"eval_samples_per_second": 694.887,
"eval_steps_per_second": 0.351,
"step": 2200
},
{
"epoch": 1.043115438108484,
"grad_norm": 0.05172237753868103,
"learning_rate": 8.880732321458784e-05,
"loss": 0.1345,
"step": 2250
},
{
"epoch": 1.043115438108484,
"eval_loss": 0.16808202774068384,
"eval_runtime": 59.7591,
"eval_samples_per_second": 695.894,
"eval_steps_per_second": 0.351,
"step": 2250
},
{
"epoch": 1.0662957811775615,
"grad_norm": 0.08457198739051819,
"learning_rate": 8.83022221559489e-05,
"loss": 0.1339,
"step": 2300
},
{
"epoch": 1.0662957811775615,
"eval_loss": 0.16620651689588106,
"eval_runtime": 59.8615,
"eval_samples_per_second": 694.704,
"eval_steps_per_second": 0.351,
"step": 2300
},
{
"epoch": 1.0894761242466389,
"grad_norm": 0.08191724866628647,
"learning_rate": 8.778747871771292e-05,
"loss": 0.1333,
"step": 2350
},
{
"epoch": 1.0894761242466389,
"eval_loss": 0.16742435976845876,
"eval_runtime": 59.8699,
"eval_samples_per_second": 694.606,
"eval_steps_per_second": 0.351,
"step": 2350
},
{
"epoch": 1.1126564673157162,
"grad_norm": 0.08220981061458588,
"learning_rate": 8.726322248378775e-05,
"loss": 0.1336,
"step": 2400
},
{
"epoch": 1.1126564673157162,
"eval_loss": 0.16507172522149283,
"eval_runtime": 59.8591,
"eval_samples_per_second": 694.731,
"eval_steps_per_second": 0.351,
"step": 2400
},
{
"epoch": 1.1358368103847938,
"grad_norm": 0.11390708386898041,
"learning_rate": 8.672958543287666e-05,
"loss": 0.1335,
"step": 2450
},
{
"epoch": 1.1358368103847938,
"eval_loss": 0.16567397155304947,
"eval_runtime": 59.7629,
"eval_samples_per_second": 695.85,
"eval_steps_per_second": 0.351,
"step": 2450
},
{
"epoch": 1.1590171534538711,
"grad_norm": 0.06390725821256638,
"learning_rate": 8.618670190525352e-05,
"loss": 0.1335,
"step": 2500
},
{
"epoch": 1.1590171534538711,
"eval_loss": 0.1671167116531541,
"eval_runtime": 59.5093,
"eval_samples_per_second": 698.815,
"eval_steps_per_second": 0.353,
"step": 2500
},
{
"epoch": 1.1821974965229485,
"grad_norm": 0.06458276510238647,
"learning_rate": 8.563470856894316e-05,
"loss": 0.1322,
"step": 2550
},
{
"epoch": 1.1821974965229485,
"eval_loss": 0.16552241982155386,
"eval_runtime": 59.3646,
"eval_samples_per_second": 700.519,
"eval_steps_per_second": 0.354,
"step": 2550
},
{
"epoch": 1.205377839592026,
"grad_norm": 0.07258091121912003,
"learning_rate": 8.507374438531607e-05,
"loss": 0.1333,
"step": 2600
},
{
"epoch": 1.205377839592026,
"eval_loss": 0.16643385319936513,
"eval_runtime": 59.7463,
"eval_samples_per_second": 696.043,
"eval_steps_per_second": 0.351,
"step": 2600
},
{
"epoch": 1.2285581826611034,
"grad_norm": 0.08584043383598328,
"learning_rate": 8.450395057410561e-05,
"loss": 0.1325,
"step": 2650
},
{
"epoch": 1.2285581826611034,
"eval_loss": 0.16595956749906993,
"eval_runtime": 59.7537,
"eval_samples_per_second": 695.957,
"eval_steps_per_second": 0.351,
"step": 2650
},
{
"epoch": 1.2517385257301807,
"grad_norm": 0.054344214498996735,
"learning_rate": 8.392547057785661e-05,
"loss": 0.1334,
"step": 2700
},
{
"epoch": 1.2517385257301807,
"eval_loss": 0.165368604673745,
"eval_runtime": 59.4339,
"eval_samples_per_second": 699.701,
"eval_steps_per_second": 0.353,
"step": 2700
},
{
"epoch": 1.2749188687992583,
"grad_norm": 0.07332266122102737,
"learning_rate": 8.333845002581458e-05,
"loss": 0.1326,
"step": 2750
},
{
"epoch": 1.2749188687992583,
"eval_loss": 0.16569167596925843,
"eval_runtime": 59.5544,
"eval_samples_per_second": 698.286,
"eval_steps_per_second": 0.353,
"step": 2750
},
{
"epoch": 1.2980992118683357,
"grad_norm": 0.07198917865753174,
"learning_rate": 8.274303669726426e-05,
"loss": 0.1323,
"step": 2800
},
{
"epoch": 1.2980992118683357,
"eval_loss": 0.16580398198626048,
"eval_runtime": 59.8451,
"eval_samples_per_second": 694.894,
"eval_steps_per_second": 0.351,
"step": 2800
},
{
"epoch": 1.321279554937413,
"grad_norm": 0.09278077632188797,
"learning_rate": 8.213938048432697e-05,
"loss": 0.1324,
"step": 2850
},
{
"epoch": 1.321279554937413,
"eval_loss": 0.16619885882978533,
"eval_runtime": 59.6857,
"eval_samples_per_second": 696.749,
"eval_steps_per_second": 0.352,
"step": 2850
},
{
"epoch": 1.3444598980064906,
"grad_norm": 0.04779389128088951,
"learning_rate": 8.152763335422613e-05,
"loss": 0.1327,
"step": 2900
},
{
"epoch": 1.3444598980064906,
"eval_loss": 0.16639967239163891,
"eval_runtime": 59.5347,
"eval_samples_per_second": 698.517,
"eval_steps_per_second": 0.353,
"step": 2900
},
{
"epoch": 1.367640241075568,
"grad_norm": 0.0650218203663826,
"learning_rate": 8.090794931103026e-05,
"loss": 0.1324,
"step": 2950
},
{
"epoch": 1.367640241075568,
"eval_loss": 0.16698249569806287,
"eval_runtime": 59.4938,
"eval_samples_per_second": 698.997,
"eval_steps_per_second": 0.353,
"step": 2950
},
{
"epoch": 1.3908205841446453,
"grad_norm": 0.07800327241420746,
"learning_rate": 8.028048435688333e-05,
"loss": 0.1325,
"step": 3000
},
{
"epoch": 1.3908205841446453,
"eval_loss": 0.16588903849861533,
"eval_runtime": 59.7308,
"eval_samples_per_second": 696.223,
"eval_steps_per_second": 0.352,
"step": 3000
},
{
"epoch": 1.4140009272137228,
"grad_norm": 0.09477279335260391,
"learning_rate": 7.964539645273204e-05,
"loss": 0.1318,
"step": 3050
},
{
"epoch": 1.4140009272137228,
"eval_loss": 0.16391722600570544,
"eval_runtime": 59.2552,
"eval_samples_per_second": 701.812,
"eval_steps_per_second": 0.354,
"step": 3050
},
{
"epoch": 1.4371812702828002,
"grad_norm": 0.061748892068862915,
"learning_rate": 7.900284547855991e-05,
"loss": 0.1328,
"step": 3100
},
{
"epoch": 1.4371812702828002,
"eval_loss": 0.1664695654356475,
"eval_runtime": 59.7882,
"eval_samples_per_second": 695.556,
"eval_steps_per_second": 0.351,
"step": 3100
},
{
"epoch": 1.4603616133518775,
"grad_norm": 0.07277340441942215,
"learning_rate": 7.835299319313853e-05,
"loss": 0.1332,
"step": 3150
},
{
"epoch": 1.4603616133518775,
"eval_loss": 0.16764915423728274,
"eval_runtime": 59.7168,
"eval_samples_per_second": 696.387,
"eval_steps_per_second": 0.352,
"step": 3150
},
{
"epoch": 1.483541956420955,
"grad_norm": 0.06525903195142746,
"learning_rate": 7.769600319330552e-05,
"loss": 0.1326,
"step": 3200
},
{
"epoch": 1.483541956420955,
"eval_loss": 0.16491104357870506,
"eval_runtime": 59.6126,
"eval_samples_per_second": 697.604,
"eval_steps_per_second": 0.352,
"step": 3200
},
{
"epoch": 1.5067222994900324,
"grad_norm": 0.06889070570468903,
"learning_rate": 7.703204087277988e-05,
"loss": 0.1327,
"step": 3250
},
{
"epoch": 1.5067222994900324,
"eval_loss": 0.16643899540149082,
"eval_runtime": 60.0,
"eval_samples_per_second": 693.1,
"eval_steps_per_second": 0.35,
"step": 3250
},
{
"epoch": 1.5299026425591098,
"grad_norm": 0.09515661001205444,
"learning_rate": 7.636127338052512e-05,
"loss": 0.1332,
"step": 3300
},
{
"epoch": 1.5299026425591098,
"eval_loss": 0.16578109982118125,
"eval_runtime": 60.2083,
"eval_samples_per_second": 690.703,
"eval_steps_per_second": 0.349,
"step": 3300
},
{
"epoch": 1.5530829856281874,
"grad_norm": 0.06826016306877136,
"learning_rate": 7.568386957867033e-05,
"loss": 0.1321,
"step": 3350
},
{
"epoch": 1.5530829856281874,
"eval_loss": 0.16615711001414799,
"eval_runtime": 59.8961,
"eval_samples_per_second": 694.303,
"eval_steps_per_second": 0.351,
"step": 3350
},
{
"epoch": 1.5762633286972647,
"grad_norm": 0.06259354203939438,
"learning_rate": 7.500000000000001e-05,
"loss": 0.1324,
"step": 3400
},
{
"epoch": 1.5762633286972647,
"eval_loss": 0.16420639359901218,
"eval_runtime": 59.8484,
"eval_samples_per_second": 694.856,
"eval_steps_per_second": 0.351,
"step": 3400
},
{
"epoch": 1.599443671766342,
"grad_norm": 0.08373662084341049,
"learning_rate": 7.430983680502344e-05,
"loss": 0.1317,
"step": 3450
},
{
"epoch": 1.599443671766342,
"eval_loss": 0.16580187809904914,
"eval_runtime": 59.5295,
"eval_samples_per_second": 698.578,
"eval_steps_per_second": 0.353,
"step": 3450
},
{
"epoch": 1.6226240148354196,
"grad_norm": 0.052068453282117844,
"learning_rate": 7.361355373863414e-05,
"loss": 0.1326,
"step": 3500
},
{
"epoch": 1.6226240148354196,
"eval_loss": 0.16511726778477553,
"eval_runtime": 59.3774,
"eval_samples_per_second": 700.368,
"eval_steps_per_second": 0.354,
"step": 3500
},
{
"epoch": 1.645804357904497,
"grad_norm": 0.1084132120013237,
"learning_rate": 7.291132608637052e-05,
"loss": 0.1328,
"step": 3550
},
{
"epoch": 1.645804357904497,
"eval_loss": 0.16512942482848092,
"eval_runtime": 59.7073,
"eval_samples_per_second": 696.497,
"eval_steps_per_second": 0.352,
"step": 3550
},
{
"epoch": 1.6689847009735743,
"grad_norm": 0.09590224921703339,
"learning_rate": 7.220333063028872e-05,
"loss": 0.1327,
"step": 3600
},
{
"epoch": 1.6689847009735743,
"eval_loss": 0.1653536906511234,
"eval_runtime": 59.8607,
"eval_samples_per_second": 694.713,
"eval_steps_per_second": 0.351,
"step": 3600
},
{
"epoch": 1.692165044042652,
"grad_norm": 0.09215644001960754,
"learning_rate": 7.148974560445859e-05,
"loss": 0.1314,
"step": 3650
},
{
"epoch": 1.692165044042652,
"eval_loss": 0.16392036224708054,
"eval_runtime": 59.6823,
"eval_samples_per_second": 696.79,
"eval_steps_per_second": 0.352,
"step": 3650
},
{
"epoch": 1.7153453871117292,
"grad_norm": 0.0847523957490921,
"learning_rate": 7.077075065009433e-05,
"loss": 0.1319,
"step": 3700
},
{
"epoch": 1.7153453871117292,
"eval_loss": 0.1658360792512092,
"eval_runtime": 59.6368,
"eval_samples_per_second": 697.322,
"eval_steps_per_second": 0.352,
"step": 3700
},
{
"epoch": 1.7385257301808066,
"grad_norm": 0.06882014125585556,
"learning_rate": 7.004652677033068e-05,
"loss": 0.1308,
"step": 3750
},
{
"epoch": 1.7385257301808066,
"eval_loss": 0.1656867715236748,
"eval_runtime": 59.8626,
"eval_samples_per_second": 694.691,
"eval_steps_per_second": 0.351,
"step": 3750
},
{
"epoch": 1.7617060732498842,
"grad_norm": 0.056948818266391754,
"learning_rate": 6.931725628465643e-05,
"loss": 0.1322,
"step": 3800
},
{
"epoch": 1.7617060732498842,
"eval_loss": 0.16491998551370737,
"eval_runtime": 59.5124,
"eval_samples_per_second": 698.779,
"eval_steps_per_second": 0.353,
"step": 3800
},
{
"epoch": 1.7848864163189615,
"grad_norm": 0.04779543727636337,
"learning_rate": 6.858312278301637e-05,
"loss": 0.1315,
"step": 3850
},
{
"epoch": 1.7848864163189615,
"eval_loss": 0.1649495124686108,
"eval_runtime": 59.9775,
"eval_samples_per_second": 693.36,
"eval_steps_per_second": 0.35,
"step": 3850
},
{
"epoch": 1.8080667593880388,
"grad_norm": 0.05969324707984924,
"learning_rate": 6.784431107959359e-05,
"loss": 0.1316,
"step": 3900
},
{
"epoch": 1.8080667593880388,
"eval_loss": 0.16391757633340012,
"eval_runtime": 60.0346,
"eval_samples_per_second": 692.7,
"eval_steps_per_second": 0.35,
"step": 3900
},
{
"epoch": 1.8312471024571164,
"grad_norm": 0.061390358954668045,
"learning_rate": 6.710100716628344e-05,
"loss": 0.1312,
"step": 3950
},
{
"epoch": 1.8312471024571164,
"eval_loss": 0.1658972028775054,
"eval_runtime": 59.9663,
"eval_samples_per_second": 693.489,
"eval_steps_per_second": 0.35,
"step": 3950
},
{
"epoch": 1.8544274455261938,
"grad_norm": 0.07332038879394531,
"learning_rate": 6.635339816587109e-05,
"loss": 0.1323,
"step": 4000
},
{
"epoch": 1.8544274455261938,
"eval_loss": 0.1647820455194368,
"eval_runtime": 59.5785,
"eval_samples_per_second": 698.004,
"eval_steps_per_second": 0.352,
"step": 4000
},
{
"epoch": 1.877607788595271,
"grad_norm": 0.07641714811325073,
"learning_rate": 6.560167228492436e-05,
"loss": 0.132,
"step": 4050
},
{
"epoch": 1.877607788595271,
"eval_loss": 0.16406535325266738,
"eval_runtime": 60.0931,
"eval_samples_per_second": 692.026,
"eval_steps_per_second": 0.349,
"step": 4050
},
{
"epoch": 1.9007881316643487,
"grad_norm": 0.08891258388757706,
"learning_rate": 6.484601876641375e-05,
"loss": 0.1308,
"step": 4100
},
{
"epoch": 1.9007881316643487,
"eval_loss": 0.164731109091856,
"eval_runtime": 59.8012,
"eval_samples_per_second": 695.405,
"eval_steps_per_second": 0.351,
"step": 4100
},
{
"epoch": 1.923968474733426,
"grad_norm": 0.0818193256855011,
"learning_rate": 6.408662784207149e-05,
"loss": 0.1323,
"step": 4150
},
{
"epoch": 1.923968474733426,
"eval_loss": 0.16444408652573528,
"eval_runtime": 59.6523,
"eval_samples_per_second": 697.14,
"eval_steps_per_second": 0.352,
"step": 4150
},
{
"epoch": 1.9471488178025034,
"grad_norm": 0.05766776204109192,
"learning_rate": 6.332369068450174e-05,
"loss": 0.131,
"step": 4200
},
{
"epoch": 1.9471488178025034,
"eval_loss": 0.1630568549542592,
"eval_runtime": 59.9782,
"eval_samples_per_second": 693.352,
"eval_steps_per_second": 0.35,
"step": 4200
},
{
"epoch": 1.970329160871581,
"grad_norm": 0.07093872129917145,
"learning_rate": 6.255739935905396e-05,
"loss": 0.1313,
"step": 4250
},
{
"epoch": 1.970329160871581,
"eval_loss": 0.16320942743206068,
"eval_runtime": 59.7408,
"eval_samples_per_second": 696.107,
"eval_steps_per_second": 0.352,
"step": 4250
},
{
"epoch": 1.9935095039406583,
"grad_norm": 0.051636241376399994,
"learning_rate": 6.178794677547137e-05,
"loss": 0.1309,
"step": 4300
},
{
"epoch": 1.9935095039406583,
"eval_loss": 0.16439976264264172,
"eval_runtime": 59.7092,
"eval_samples_per_second": 696.476,
"eval_steps_per_second": 0.352,
"step": 4300
},
{
"epoch": 2.0166898470097356,
"grad_norm": 0.05819587782025337,
"learning_rate": 6.1015526639327035e-05,
"loss": 0.1319,
"step": 4350
},
{
"epoch": 2.0166898470097356,
"eval_loss": 0.16432355870633325,
"eval_runtime": 59.2592,
"eval_samples_per_second": 701.765,
"eval_steps_per_second": 0.354,
"step": 4350
},
{
"epoch": 2.039870190078813,
"grad_norm": 0.07939411699771881,
"learning_rate": 6.024033340325954e-05,
"loss": 0.1316,
"step": 4400
},
{
"epoch": 2.039870190078813,
"eval_loss": 0.1641168338494948,
"eval_runtime": 59.9534,
"eval_samples_per_second": 693.639,
"eval_steps_per_second": 0.35,
"step": 4400
},
{
"epoch": 2.0630505331478908,
"grad_norm": 0.07020165026187897,
"learning_rate": 5.946256221802051e-05,
"loss": 0.1312,
"step": 4450
},
{
"epoch": 2.0630505331478908,
"eval_loss": 0.1633037564118911,
"eval_runtime": 60.3433,
"eval_samples_per_second": 689.157,
"eval_steps_per_second": 0.348,
"step": 4450
},
{
"epoch": 2.086230876216968,
"grad_norm": 0.07000721246004105,
"learning_rate": 5.868240888334653e-05,
"loss": 0.1313,
"step": 4500
},
{
"epoch": 2.086230876216968,
"eval_loss": 0.1646367282392535,
"eval_runtime": 60.5726,
"eval_samples_per_second": 686.548,
"eval_steps_per_second": 0.347,
"step": 4500
},
{
"epoch": 2.1094112192860455,
"grad_norm": 0.06988826394081116,
"learning_rate": 5.79000697986675e-05,
"loss": 0.1316,
"step": 4550
},
{
"epoch": 2.1094112192860455,
"eval_loss": 0.16286425765036744,
"eval_runtime": 60.2061,
"eval_samples_per_second": 690.727,
"eval_steps_per_second": 0.349,
"step": 4550
},
{
"epoch": 2.132591562355123,
"grad_norm": 0.0749220922589302,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.1306,
"step": 4600
},
{
"epoch": 2.132591562355123,
"eval_loss": 0.1643572569196068,
"eval_runtime": 59.9586,
"eval_samples_per_second": 693.579,
"eval_steps_per_second": 0.35,
"step": 4600
},
{
"epoch": 2.1557719054242,
"grad_norm": 0.06533892452716827,
"learning_rate": 5.6329622678687463e-05,
"loss": 0.1313,
"step": 4650
},
{
"epoch": 2.1557719054242,
"eval_loss": 0.1635978048832001,
"eval_runtime": 59.6271,
"eval_samples_per_second": 697.435,
"eval_steps_per_second": 0.352,
"step": 4650
},
{
"epoch": 2.1789522484932777,
"grad_norm": 0.07881616055965424,
"learning_rate": 5.5541909995050554e-05,
"loss": 0.131,
"step": 4700
},
{
"epoch": 2.1789522484932777,
"eval_loss": 0.1634715372028324,
"eval_runtime": 59.564,
"eval_samples_per_second": 698.173,
"eval_steps_per_second": 0.353,
"step": 4700
},
{
"epoch": 2.2021325915623553,
"grad_norm": 0.05812694877386093,
"learning_rate": 5.475280216520913e-05,
"loss": 0.1311,
"step": 4750
},
{
"epoch": 2.2021325915623553,
"eval_loss": 0.1636915707335646,
"eval_runtime": 59.9343,
"eval_samples_per_second": 693.86,
"eval_steps_per_second": 0.35,
"step": 4750
},
{
"epoch": 2.2253129346314324,
"grad_norm": 0.09842361509799957,
"learning_rate": 5.396249784283942e-05,
"loss": 0.1315,
"step": 4800
},
{
"epoch": 2.2253129346314324,
"eval_loss": 0.16410182317726912,
"eval_runtime": 60.4431,
"eval_samples_per_second": 688.019,
"eval_steps_per_second": 0.347,
"step": 4800
},
{
"epoch": 2.24849327770051,
"grad_norm": 0.05664157494902611,
"learning_rate": 5.317119598282823e-05,
"loss": 0.1314,
"step": 4850
},
{
"epoch": 2.24849327770051,
"eval_loss": 0.16405877684845893,
"eval_runtime": 60.2757,
"eval_samples_per_second": 689.93,
"eval_steps_per_second": 0.348,
"step": 4850
},
{
"epoch": 2.2716736207695876,
"grad_norm": 0.08323252946138382,
"learning_rate": 5.2379095791187124e-05,
"loss": 0.1306,
"step": 4900
},
{
"epoch": 2.2716736207695876,
"eval_loss": 0.16356865120524391,
"eval_runtime": 60.2036,
"eval_samples_per_second": 690.756,
"eval_steps_per_second": 0.349,
"step": 4900
},
{
"epoch": 2.2948539638386647,
"grad_norm": 0.07163384556770325,
"learning_rate": 5.158639667490339e-05,
"loss": 0.1314,
"step": 4950
},
{
"epoch": 2.2948539638386647,
"eval_loss": 0.16350787082313517,
"eval_runtime": 59.6657,
"eval_samples_per_second": 696.983,
"eval_steps_per_second": 0.352,
"step": 4950
},
{
"epoch": 2.3180343069077423,
"grad_norm": 0.07729226350784302,
"learning_rate": 5.0793298191740404e-05,
"loss": 0.1321,
"step": 5000
},
{
"epoch": 2.3180343069077423,
"eval_loss": 0.16284041257465698,
"eval_runtime": 60.3671,
"eval_samples_per_second": 688.886,
"eval_steps_per_second": 0.348,
"step": 5000
},
{
"epoch": 2.34121464997682,
"grad_norm": 0.07920071482658386,
"learning_rate": 5e-05,
"loss": 0.13,
"step": 5050
},
{
"epoch": 2.34121464997682,
"eval_loss": 0.16350252303966548,
"eval_runtime": 60.0663,
"eval_samples_per_second": 692.335,
"eval_steps_per_second": 0.35,
"step": 5050
},
{
"epoch": 2.364394993045897,
"grad_norm": 0.05213838815689087,
"learning_rate": 4.92067018082596e-05,
"loss": 0.1315,
"step": 5100
},
{
"epoch": 2.364394993045897,
"eval_loss": 0.1640868928554377,
"eval_runtime": 60.1323,
"eval_samples_per_second": 691.575,
"eval_steps_per_second": 0.349,
"step": 5100
},
{
"epoch": 2.3875753361149745,
"grad_norm": 0.06551820039749146,
"learning_rate": 4.841360332509663e-05,
"loss": 0.1311,
"step": 5150
},
{
"epoch": 2.3875753361149745,
"eval_loss": 0.16375304166425866,
"eval_runtime": 60.0889,
"eval_samples_per_second": 692.074,
"eval_steps_per_second": 0.349,
"step": 5150
},
{
"epoch": 2.410755679184052,
"grad_norm": 0.06602519750595093,
"learning_rate": 4.762090420881289e-05,
"loss": 0.1304,
"step": 5200
},
{
"epoch": 2.410755679184052,
"eval_loss": 0.1646718036775546,
"eval_runtime": 60.1839,
"eval_samples_per_second": 690.982,
"eval_steps_per_second": 0.349,
"step": 5200
},
{
"epoch": 2.433936022253129,
"grad_norm": 0.050050172954797745,
"learning_rate": 4.6828804017171776e-05,
"loss": 0.131,
"step": 5250
},
{
"epoch": 2.433936022253129,
"eval_loss": 0.16238808458996815,
"eval_runtime": 60.2346,
"eval_samples_per_second": 690.401,
"eval_steps_per_second": 0.349,
"step": 5250
},
{
"epoch": 2.457116365322207,
"grad_norm": 0.06192226707935333,
"learning_rate": 4.603750215716057e-05,
"loss": 0.131,
"step": 5300
},
{
"epoch": 2.457116365322207,
"eval_loss": 0.1633245686996306,
"eval_runtime": 59.5691,
"eval_samples_per_second": 698.114,
"eval_steps_per_second": 0.353,
"step": 5300
},
{
"epoch": 2.4802967083912844,
"grad_norm": 0.07729701697826385,
"learning_rate": 4.5247197834790876e-05,
"loss": 0.1308,
"step": 5350
},
{
"epoch": 2.4802967083912844,
"eval_loss": 0.16388068444979556,
"eval_runtime": 60.3853,
"eval_samples_per_second": 688.677,
"eval_steps_per_second": 0.348,
"step": 5350
},
{
"epoch": 2.5034770514603615,
"grad_norm": 0.07346878945827484,
"learning_rate": 4.445809000494946e-05,
"loss": 0.1314,
"step": 5400
},
{
"epoch": 2.5034770514603615,
"eval_loss": 0.16427215786452162,
"eval_runtime": 60.0462,
"eval_samples_per_second": 692.567,
"eval_steps_per_second": 0.35,
"step": 5400
},
{
"epoch": 2.526657394529439,
"grad_norm": 0.08765513449907303,
"learning_rate": 4.3670377321312535e-05,
"loss": 0.1307,
"step": 5450
},
{
"epoch": 2.526657394529439,
"eval_loss": 0.16308954695612046,
"eval_runtime": 59.7344,
"eval_samples_per_second": 696.181,
"eval_steps_per_second": 0.352,
"step": 5450
},
{
"epoch": 2.5498377375985166,
"grad_norm": 0.04856225475668907,
"learning_rate": 4.288425808633575e-05,
"loss": 0.1314,
"step": 5500
},
{
"epoch": 2.5498377375985166,
"eval_loss": 0.1634677289958684,
"eval_runtime": 60.5651,
"eval_samples_per_second": 686.633,
"eval_steps_per_second": 0.347,
"step": 5500
},
{
"epoch": 2.5730180806675937,
"grad_norm": 0.07033301144838333,
"learning_rate": 4.20999302013325e-05,
"loss": 0.1303,
"step": 5550
},
{
"epoch": 2.5730180806675937,
"eval_loss": 0.16350203952668135,
"eval_runtime": 59.7363,
"eval_samples_per_second": 696.16,
"eval_steps_per_second": 0.352,
"step": 5550
},
{
"epoch": 2.5961984237366713,
"grad_norm": 0.07352133840322495,
"learning_rate": 4.131759111665349e-05,
"loss": 0.1304,
"step": 5600
},
{
"epoch": 2.5961984237366713,
"eval_loss": 0.16306162076252775,
"eval_runtime": 60.0517,
"eval_samples_per_second": 692.503,
"eval_steps_per_second": 0.35,
"step": 5600
},
{
"epoch": 2.619378766805749,
"grad_norm": 0.05432264879345894,
"learning_rate": 4.0537437781979506e-05,
"loss": 0.1298,
"step": 5650
},
{
"epoch": 2.619378766805749,
"eval_loss": 0.16234816348528708,
"eval_runtime": 60.3645,
"eval_samples_per_second": 688.915,
"eval_steps_per_second": 0.348,
"step": 5650
},
{
"epoch": 2.642559109874826,
"grad_norm": 0.04657018184661865,
"learning_rate": 3.9759666596740476e-05,
"loss": 0.1305,
"step": 5700
},
{
"epoch": 2.642559109874826,
"eval_loss": 0.16270628350418626,
"eval_runtime": 60.0809,
"eval_samples_per_second": 692.167,
"eval_steps_per_second": 0.35,
"step": 5700
},
{
"epoch": 2.6657394529439036,
"grad_norm": 0.04448065161705017,
"learning_rate": 3.898447336067297e-05,
"loss": 0.1308,
"step": 5750
},
{
"epoch": 2.6657394529439036,
"eval_loss": 0.162430409584318,
"eval_runtime": 59.8634,
"eval_samples_per_second": 694.682,
"eval_steps_per_second": 0.351,
"step": 5750
},
{
"epoch": 2.688919796012981,
"grad_norm": 0.047300901263952255,
"learning_rate": 3.821205322452863e-05,
"loss": 0.1306,
"step": 5800
},
{
"epoch": 2.688919796012981,
"eval_loss": 0.163914834923588,
"eval_runtime": 59.9699,
"eval_samples_per_second": 693.447,
"eval_steps_per_second": 0.35,
"step": 5800
},
{
"epoch": 2.7121001390820583,
"grad_norm": 0.09371935576200485,
"learning_rate": 3.744260064094604e-05,
"loss": 0.1303,
"step": 5850
},
{
"epoch": 2.7121001390820583,
"eval_loss": 0.16325797910827158,
"eval_runtime": 60.1596,
"eval_samples_per_second": 691.261,
"eval_steps_per_second": 0.349,
"step": 5850
},
{
"epoch": 2.735280482151136,
"grad_norm": 0.0451604500412941,
"learning_rate": 3.6676309315498256e-05,
"loss": 0.131,
"step": 5900
},
{
"epoch": 2.735280482151136,
"eval_loss": 0.16252548129222377,
"eval_runtime": 60.0104,
"eval_samples_per_second": 692.98,
"eval_steps_per_second": 0.35,
"step": 5900
},
{
"epoch": 2.7584608252202134,
"grad_norm": 0.058029964566230774,
"learning_rate": 3.591337215792852e-05,
"loss": 0.1305,
"step": 5950
},
{
"epoch": 2.7584608252202134,
"eval_loss": 0.16366348885138793,
"eval_runtime": 60.372,
"eval_samples_per_second": 688.83,
"eval_steps_per_second": 0.348,
"step": 5950
},
{
"epoch": 2.7816411682892905,
"grad_norm": 0.09429273754358292,
"learning_rate": 3.515398123358627e-05,
"loss": 0.1307,
"step": 6000
},
{
"epoch": 2.7816411682892905,
"eval_loss": 0.1623218584160889,
"eval_runtime": 59.5435,
"eval_samples_per_second": 698.413,
"eval_steps_per_second": 0.353,
"step": 6000
},
{
"epoch": 2.804821511358368,
"grad_norm": 0.05752315744757652,
"learning_rate": 3.439832771507565e-05,
"loss": 0.1296,
"step": 6050
},
{
"epoch": 2.804821511358368,
"eval_loss": 0.16326439732289802,
"eval_runtime": 59.8306,
"eval_samples_per_second": 695.063,
"eval_steps_per_second": 0.351,
"step": 6050
},
{
"epoch": 2.8280018544274457,
"grad_norm": 0.07225628942251205,
"learning_rate": 3.364660183412892e-05,
"loss": 0.1312,
"step": 6100
},
{
"epoch": 2.8280018544274457,
"eval_loss": 0.16322279137718054,
"eval_runtime": 59.8418,
"eval_samples_per_second": 694.932,
"eval_steps_per_second": 0.351,
"step": 6100
},
{
"epoch": 2.851182197496523,
"grad_norm": 0.06712605059146881,
"learning_rate": 3.289899283371657e-05,
"loss": 0.1305,
"step": 6150
},
{
"epoch": 2.851182197496523,
"eval_loss": 0.16403909400299824,
"eval_runtime": 59.4766,
"eval_samples_per_second": 699.199,
"eval_steps_per_second": 0.353,
"step": 6150
},
{
"epoch": 2.8743625405656004,
"grad_norm": 0.0743350014090538,
"learning_rate": 3.215568892040641e-05,
"loss": 0.1303,
"step": 6200
},
{
"epoch": 2.8743625405656004,
"eval_loss": 0.16315215653435175,
"eval_runtime": 60.0127,
"eval_samples_per_second": 692.953,
"eval_steps_per_second": 0.35,
"step": 6200
},
{
"epoch": 2.897542883634678,
"grad_norm": 0.07467668503522873,
"learning_rate": 3.141687721698363e-05,
"loss": 0.1302,
"step": 6250
},
{
"epoch": 2.897542883634678,
"eval_loss": 0.16213396084813272,
"eval_runtime": 59.8922,
"eval_samples_per_second": 694.348,
"eval_steps_per_second": 0.351,
"step": 6250
},
{
"epoch": 2.920723226703755,
"grad_norm": 0.050527870655059814,
"learning_rate": 3.0682743715343564e-05,
"loss": 0.1298,
"step": 6300
},
{
"epoch": 2.920723226703755,
"eval_loss": 0.16243251733829123,
"eval_runtime": 60.3601,
"eval_samples_per_second": 688.965,
"eval_steps_per_second": 0.348,
"step": 6300
},
{
"epoch": 2.9439035697728326,
"grad_norm": 0.05331522971391678,
"learning_rate": 2.9953473229669328e-05,
"loss": 0.1313,
"step": 6350
},
{
"epoch": 2.9439035697728326,
"eval_loss": 0.16321332234047015,
"eval_runtime": 60.2034,
"eval_samples_per_second": 690.759,
"eval_steps_per_second": 0.349,
"step": 6350
},
{
"epoch": 2.96708391284191,
"grad_norm": 0.0566866509616375,
"learning_rate": 2.9229249349905684e-05,
"loss": 0.1304,
"step": 6400
},
{
"epoch": 2.96708391284191,
"eval_loss": 0.1623781732971581,
"eval_runtime": 60.2575,
"eval_samples_per_second": 690.138,
"eval_steps_per_second": 0.349,
"step": 6400
},
{
"epoch": 2.9902642559109873,
"grad_norm": 0.0674847662448883,
"learning_rate": 2.851025439554142e-05,
"loss": 0.13,
"step": 6450
},
{
"epoch": 2.9902642559109873,
"eval_loss": 0.163704374422533,
"eval_runtime": 60.1942,
"eval_samples_per_second": 690.864,
"eval_steps_per_second": 0.349,
"step": 6450
},
{
"epoch": 3.013444598980065,
"grad_norm": 0.05663591995835304,
"learning_rate": 2.7796669369711294e-05,
"loss": 0.1313,
"step": 6500
},
{
"epoch": 3.013444598980065,
"eval_loss": 0.16296213660440473,
"eval_runtime": 60.9459,
"eval_samples_per_second": 682.343,
"eval_steps_per_second": 0.345,
"step": 6500
},
{
"epoch": 3.0366249420491425,
"grad_norm": 0.06456530839204788,
"learning_rate": 2.708867391362948e-05,
"loss": 0.131,
"step": 6550
},
{
"epoch": 3.0366249420491425,
"eval_loss": 0.16119627636966075,
"eval_runtime": 60.6451,
"eval_samples_per_second": 685.727,
"eval_steps_per_second": 0.346,
"step": 6550
},
{
"epoch": 3.0598052851182196,
"grad_norm": 0.05969541519880295,
"learning_rate": 2.638644626136587e-05,
"loss": 0.1311,
"step": 6600
},
{
"epoch": 3.0598052851182196,
"eval_loss": 0.16205494320222197,
"eval_runtime": 60.4532,
"eval_samples_per_second": 687.904,
"eval_steps_per_second": 0.347,
"step": 6600
},
{
"epoch": 3.082985628187297,
"grad_norm": 0.06604834645986557,
"learning_rate": 2.5690163194976575e-05,
"loss": 0.1301,
"step": 6650
},
{
"epoch": 3.082985628187297,
"eval_loss": 0.16191228875556468,
"eval_runtime": 60.3489,
"eval_samples_per_second": 689.093,
"eval_steps_per_second": 0.348,
"step": 6650
},
{
"epoch": 3.1061659712563747,
"grad_norm": 0.06501331180334091,
"learning_rate": 2.500000000000001e-05,
"loss": 0.1298,
"step": 6700
},
{
"epoch": 3.1061659712563747,
"eval_loss": 0.16219026561577268,
"eval_runtime": 60.2703,
"eval_samples_per_second": 689.992,
"eval_steps_per_second": 0.348,
"step": 6700
},
{
"epoch": 3.129346314325452,
"grad_norm": 0.056004952639341354,
"learning_rate": 2.4316130421329697e-05,
"loss": 0.1302,
"step": 6750
},
{
"epoch": 3.129346314325452,
"eval_loss": 0.16085075220051892,
"eval_runtime": 60.336,
"eval_samples_per_second": 689.24,
"eval_steps_per_second": 0.348,
"step": 6750
},
{
"epoch": 3.1525266573945294,
"grad_norm": 0.06331496685743332,
"learning_rate": 2.363872661947488e-05,
"loss": 0.1311,
"step": 6800
},
{
"epoch": 3.1525266573945294,
"eval_loss": 0.16229801712553438,
"eval_runtime": 59.8727,
"eval_samples_per_second": 694.573,
"eval_steps_per_second": 0.351,
"step": 6800
},
{
"epoch": 3.175707000463607,
"grad_norm": 0.05851437896490097,
"learning_rate": 2.296795912722014e-05,
"loss": 0.1304,
"step": 6850
},
{
"epoch": 3.175707000463607,
"eval_loss": 0.1624837018550472,
"eval_runtime": 60.0768,
"eval_samples_per_second": 692.214,
"eval_steps_per_second": 0.35,
"step": 6850
},
{
"epoch": 3.198887343532684,
"grad_norm": 0.06251411885023117,
"learning_rate": 2.2303996806694488e-05,
"loss": 0.1306,
"step": 6900
},
{
"epoch": 3.198887343532684,
"eval_loss": 0.16152431005864756,
"eval_runtime": 60.0137,
"eval_samples_per_second": 692.942,
"eval_steps_per_second": 0.35,
"step": 6900
},
{
"epoch": 3.2220676866017617,
"grad_norm": 0.055478889495134354,
"learning_rate": 2.164700680686147e-05,
"loss": 0.1302,
"step": 6950
},
{
"epoch": 3.2220676866017617,
"eval_loss": 0.16217289975188992,
"eval_runtime": 59.7201,
"eval_samples_per_second": 696.349,
"eval_steps_per_second": 0.352,
"step": 6950
},
{
"epoch": 3.2452480296708393,
"grad_norm": 0.04695391282439232,
"learning_rate": 2.09971545214401e-05,
"loss": 0.1307,
"step": 7000
},
{
"epoch": 3.2452480296708393,
"eval_loss": 0.16233282789861117,
"eval_runtime": 60.2382,
"eval_samples_per_second": 690.359,
"eval_steps_per_second": 0.349,
"step": 7000
},
{
"epoch": 3.2684283727399164,
"grad_norm": 0.05719252675771713,
"learning_rate": 2.0354603547267985e-05,
"loss": 0.1302,
"step": 7050
},
{
"epoch": 3.2684283727399164,
"eval_loss": 0.16257561894818798,
"eval_runtime": 59.9661,
"eval_samples_per_second": 693.491,
"eval_steps_per_second": 0.35,
"step": 7050
},
{
"epoch": 3.291608715808994,
"grad_norm": 0.05995924398303032,
"learning_rate": 1.9719515643116674e-05,
"loss": 0.1296,
"step": 7100
},
{
"epoch": 3.291608715808994,
"eval_loss": 0.1621910867534911,
"eval_runtime": 59.9872,
"eval_samples_per_second": 693.248,
"eval_steps_per_second": 0.35,
"step": 7100
},
{
"epoch": 3.3147890588780715,
"grad_norm": 0.06421925872564316,
"learning_rate": 1.9092050688969738e-05,
"loss": 0.1321,
"step": 7150
},
{
"epoch": 3.3147890588780715,
"eval_loss": 0.16221412998892937,
"eval_runtime": 59.9186,
"eval_samples_per_second": 694.042,
"eval_steps_per_second": 0.35,
"step": 7150
},
{
"epoch": 3.3379694019471486,
"grad_norm": 0.04900297895073891,
"learning_rate": 1.847236664577389e-05,
"loss": 0.1307,
"step": 7200
},
{
"epoch": 3.3379694019471486,
"eval_loss": 0.16276321033314364,
"eval_runtime": 59.8713,
"eval_samples_per_second": 694.59,
"eval_steps_per_second": 0.351,
"step": 7200
},
{
"epoch": 3.361149745016226,
"grad_norm": 0.06865038722753525,
"learning_rate": 1.7860619515673033e-05,
"loss": 0.1301,
"step": 7250
},
{
"epoch": 3.361149745016226,
"eval_loss": 0.16246198975876286,
"eval_runtime": 60.1366,
"eval_samples_per_second": 691.525,
"eval_steps_per_second": 0.349,
"step": 7250
},
{
"epoch": 3.384330088085304,
"grad_norm": 0.060604266822338104,
"learning_rate": 1.725696330273575e-05,
"loss": 0.1307,
"step": 7300
},
{
"epoch": 3.384330088085304,
"eval_loss": 0.16321014850841353,
"eval_runtime": 60.2328,
"eval_samples_per_second": 690.421,
"eval_steps_per_second": 0.349,
"step": 7300
},
{
"epoch": 3.407510431154381,
"grad_norm": 0.061620261520147324,
"learning_rate": 1.6661549974185424e-05,
"loss": 0.1305,
"step": 7350
},
{
"epoch": 3.407510431154381,
"eval_loss": 0.1627398211288196,
"eval_runtime": 60.2249,
"eval_samples_per_second": 690.512,
"eval_steps_per_second": 0.349,
"step": 7350
},
{
"epoch": 3.4306907742234585,
"grad_norm": 0.046630218625068665,
"learning_rate": 1.60745294221434e-05,
"loss": 0.1303,
"step": 7400
},
{
"epoch": 3.4306907742234585,
"eval_loss": 0.16263843892878527,
"eval_runtime": 59.9369,
"eval_samples_per_second": 693.829,
"eval_steps_per_second": 0.35,
"step": 7400
},
{
"epoch": 3.453871117292536,
"grad_norm": 0.06071937829256058,
"learning_rate": 1.549604942589441e-05,
"loss": 0.13,
"step": 7450
},
{
"epoch": 3.453871117292536,
"eval_loss": 0.1624999877883929,
"eval_runtime": 59.843,
"eval_samples_per_second": 694.919,
"eval_steps_per_second": 0.351,
"step": 7450
},
{
"epoch": 3.477051460361613,
"grad_norm": 0.0633426085114479,
"learning_rate": 1.4926255614683932e-05,
"loss": 0.1288,
"step": 7500
},
{
"epoch": 3.477051460361613,
"eval_loss": 0.1632884555568049,
"eval_runtime": 59.8153,
"eval_samples_per_second": 695.24,
"eval_steps_per_second": 0.351,
"step": 7500
},
{
"epoch": 3.5002318034306907,
"grad_norm": 0.06753742694854736,
"learning_rate": 1.4365291431056871e-05,
"loss": 0.1301,
"step": 7550
},
{
"epoch": 3.5002318034306907,
"eval_loss": 0.16175284084180716,
"eval_runtime": 59.9226,
"eval_samples_per_second": 693.995,
"eval_steps_per_second": 0.35,
"step": 7550
},
{
"epoch": 3.5234121464997683,
"grad_norm": 0.05140328034758568,
"learning_rate": 1.3813298094746491e-05,
"loss": 0.1304,
"step": 7600
},
{
"epoch": 3.5234121464997683,
"eval_loss": 0.16199897513596326,
"eval_runtime": 59.9917,
"eval_samples_per_second": 693.196,
"eval_steps_per_second": 0.35,
"step": 7600
},
{
"epoch": 3.5465924895688454,
"grad_norm": 0.054956089705228806,
"learning_rate": 1.327041456712334e-05,
"loss": 0.1303,
"step": 7650
},
{
"epoch": 3.5465924895688454,
"eval_loss": 0.16214041701821919,
"eval_runtime": 59.9306,
"eval_samples_per_second": 693.903,
"eval_steps_per_second": 0.35,
"step": 7650
},
{
"epoch": 3.569772832637923,
"grad_norm": 0.059684716165065765,
"learning_rate": 1.2736777516212266e-05,
"loss": 0.1308,
"step": 7700
},
{
"epoch": 3.569772832637923,
"eval_loss": 0.16299972612079205,
"eval_runtime": 59.9509,
"eval_samples_per_second": 693.668,
"eval_steps_per_second": 0.35,
"step": 7700
},
{
"epoch": 3.5929531757070006,
"grad_norm": 0.059858404099941254,
"learning_rate": 1.2212521282287092e-05,
"loss": 0.1297,
"step": 7750
},
{
"epoch": 3.5929531757070006,
"eval_loss": 0.1621026389214657,
"eval_runtime": 60.4063,
"eval_samples_per_second": 688.438,
"eval_steps_per_second": 0.348,
"step": 7750
},
{
"epoch": 3.6161335187760777,
"grad_norm": 0.07229738682508469,
"learning_rate": 1.1697777844051105e-05,
"loss": 0.13,
"step": 7800
},
{
"epoch": 3.6161335187760777,
"eval_loss": 0.16179662531772324,
"eval_runtime": 60.0141,
"eval_samples_per_second": 692.937,
"eval_steps_per_second": 0.35,
"step": 7800
},
{
"epoch": 3.6393138618451553,
"grad_norm": 0.058062318712472916,
"learning_rate": 1.1192676785412154e-05,
"loss": 0.1305,
"step": 7850
},
{
"epoch": 3.6393138618451553,
"eval_loss": 0.16283961568372932,
"eval_runtime": 59.7657,
"eval_samples_per_second": 695.817,
"eval_steps_per_second": 0.351,
"step": 7850
},
{
"epoch": 3.662494204914233,
"grad_norm": 0.053812187165021896,
"learning_rate": 1.0697345262860636e-05,
"loss": 0.1314,
"step": 7900
},
{
"epoch": 3.662494204914233,
"eval_loss": 0.16244345922930356,
"eval_runtime": 60.3156,
"eval_samples_per_second": 689.474,
"eval_steps_per_second": 0.348,
"step": 7900
},
{
"epoch": 3.68567454798331,
"grad_norm": 0.05528152361512184,
"learning_rate": 1.021190797345839e-05,
"loss": 0.1299,
"step": 7950
},
{
"epoch": 3.68567454798331,
"eval_loss": 0.1616077000723995,
"eval_runtime": 60.1023,
"eval_samples_per_second": 691.92,
"eval_steps_per_second": 0.349,
"step": 7950
},
{
"epoch": 3.7088548910523875,
"grad_norm": 0.04686369001865387,
"learning_rate": 9.73648712344707e-06,
"loss": 0.1294,
"step": 8000
},
{
"epoch": 3.7088548910523875,
"eval_loss": 0.16104961568942824,
"eval_runtime": 60.1871,
"eval_samples_per_second": 690.946,
"eval_steps_per_second": 0.349,
"step": 8000
},
{
"epoch": 3.732035234121465,
"grad_norm": 0.04791761189699173,
"learning_rate": 9.271202397483215e-06,
"loss": 0.1293,
"step": 8050
},
{
"epoch": 3.732035234121465,
"eval_loss": 0.16180993744676672,
"eval_runtime": 60.3514,
"eval_samples_per_second": 689.064,
"eval_steps_per_second": 0.348,
"step": 8050
},
{
"epoch": 3.755215577190542,
"grad_norm": 0.0580659918487072,
"learning_rate": 8.816170928508365e-06,
"loss": 0.1303,
"step": 8100
},
{
"epoch": 3.755215577190542,
"eval_loss": 0.16161086084498935,
"eval_runtime": 60.0527,
"eval_samples_per_second": 692.491,
"eval_steps_per_second": 0.35,
"step": 8100
},
{
"epoch": 3.77839592025962,
"grad_norm": 0.0652560144662857,
"learning_rate": 8.371507268261437e-06,
"loss": 0.1318,
"step": 8150
},
{
"epoch": 3.77839592025962,
"eval_loss": 0.16206722540467366,
"eval_runtime": 60.2482,
"eval_samples_per_second": 690.244,
"eval_steps_per_second": 0.349,
"step": 8150
},
{
"epoch": 3.8015762633286974,
"grad_norm": 0.07411529868841171,
"learning_rate": 7.937323358440935e-06,
"loss": 0.1295,
"step": 8200
},
{
"epoch": 3.8015762633286974,
"eval_loss": 0.1613134364148254,
"eval_runtime": 60.1488,
"eval_samples_per_second": 691.385,
"eval_steps_per_second": 0.349,
"step": 8200
},
{
"epoch": 3.8247566063977745,
"grad_norm": 0.05504234880208969,
"learning_rate": 7.513728502524286e-06,
"loss": 0.1309,
"step": 8250
},
{
"epoch": 3.8247566063977745,
"eval_loss": 0.16200784640385216,
"eval_runtime": 60.4444,
"eval_samples_per_second": 688.004,
"eval_steps_per_second": 0.347,
"step": 8250
},
{
"epoch": 3.847936949466852,
"grad_norm": 0.053017448633909225,
"learning_rate": 7.100829338251147e-06,
"loss": 0.1288,
"step": 8300
},
{
"epoch": 3.847936949466852,
"eval_loss": 0.1614959925734419,
"eval_runtime": 60.1621,
"eval_samples_per_second": 691.232,
"eval_steps_per_second": 0.349,
"step": 8300
},
{
"epoch": 3.8711172925359296,
"grad_norm": 0.055434294044971466,
"learning_rate": 6.698729810778065e-06,
"loss": 0.1296,
"step": 8350
},
{
"epoch": 3.8711172925359296,
"eval_loss": 0.16227277423563163,
"eval_runtime": 60.4168,
"eval_samples_per_second": 688.318,
"eval_steps_per_second": 0.348,
"step": 8350
},
{
"epoch": 3.8942976356050067,
"grad_norm": 0.06720498204231262,
"learning_rate": 6.3075311465107535e-06,
"loss": 0.1302,
"step": 8400
},
{
"epoch": 3.8942976356050067,
"eval_loss": 0.16212167182684745,
"eval_runtime": 60.4209,
"eval_samples_per_second": 688.271,
"eval_steps_per_second": 0.348,
"step": 8400
},
{
"epoch": 3.9174779786740843,
"grad_norm": 0.061678655445575714,
"learning_rate": 5.927331827620903e-06,
"loss": 0.1303,
"step": 8450
},
{
"epoch": 3.9174779786740843,
"eval_loss": 0.16245438240537732,
"eval_runtime": 60.3802,
"eval_samples_per_second": 688.735,
"eval_steps_per_second": 0.348,
"step": 8450
},
{
"epoch": 3.940658321743162,
"grad_norm": 0.05170401930809021,
"learning_rate": 5.558227567253832e-06,
"loss": 0.1296,
"step": 8500
},
{
"epoch": 3.940658321743162,
"eval_loss": 0.16238415050768779,
"eval_runtime": 59.8171,
"eval_samples_per_second": 695.22,
"eval_steps_per_second": 0.351,
"step": 8500
},
{
"epoch": 3.963838664812239,
"grad_norm": 0.047940943390131,
"learning_rate": 5.200311285433213e-06,
"loss": 0.1302,
"step": 8550
},
{
"epoch": 3.963838664812239,
"eval_loss": 0.1614615212760627,
"eval_runtime": 60.3377,
"eval_samples_per_second": 689.221,
"eval_steps_per_second": 0.348,
"step": 8550
},
{
"epoch": 3.9870190078813166,
"grad_norm": 0.05732366070151329,
"learning_rate": 4.853673085668947e-06,
"loss": 0.1311,
"step": 8600
},
{
"epoch": 3.9870190078813166,
"eval_loss": 0.16182338333614685,
"eval_runtime": 59.83,
"eval_samples_per_second": 695.07,
"eval_steps_per_second": 0.351,
"step": 8600
},
{
"epoch": 4.010199350950394,
"grad_norm": 0.04801890626549721,
"learning_rate": 4.5184002322740785e-06,
"loss": 0.13,
"step": 8650
},
{
"epoch": 4.010199350950394,
"eval_loss": 0.16167779009605182,
"eval_runtime": 60.1344,
"eval_samples_per_second": 691.551,
"eval_steps_per_second": 0.349,
"step": 8650
},
{
"epoch": 4.033379694019471,
"grad_norm": 0.04426449164748192,
"learning_rate": 4.19457712839652e-06,
"loss": 0.1299,
"step": 8700
},
{
"epoch": 4.033379694019471,
"eval_loss": 0.16225696461065126,
"eval_runtime": 60.1286,
"eval_samples_per_second": 691.618,
"eval_steps_per_second": 0.349,
"step": 8700
},
{
"epoch": 4.056560037088549,
"grad_norm": 0.04997009411454201,
"learning_rate": 3.8822852947709375e-06,
"loss": 0.1302,
"step": 8750
},
{
"epoch": 4.056560037088549,
"eval_loss": 0.1626912588154907,
"eval_runtime": 60.4545,
"eval_samples_per_second": 687.889,
"eval_steps_per_second": 0.347,
"step": 8750
},
{
"epoch": 4.079740380157626,
"grad_norm": 0.05177464708685875,
"learning_rate": 3.581603349196372e-06,
"loss": 0.1302,
"step": 8800
},
{
"epoch": 4.079740380157626,
"eval_loss": 0.16124445235835394,
"eval_runtime": 60.5756,
"eval_samples_per_second": 686.514,
"eval_steps_per_second": 0.347,
"step": 8800
},
{
"epoch": 4.1029207232267035,
"grad_norm": 0.050131019204854965,
"learning_rate": 3.2926069867446675e-06,
"loss": 0.1308,
"step": 8850
},
{
"epoch": 4.1029207232267035,
"eval_loss": 0.16266127792106785,
"eval_runtime": 60.3334,
"eval_samples_per_second": 689.27,
"eval_steps_per_second": 0.348,
"step": 8850
},
{
"epoch": 4.1261010662957815,
"grad_norm": 0.05185890197753906,
"learning_rate": 3.0153689607045845e-06,
"loss": 0.1298,
"step": 8900
},
{
"epoch": 4.1261010662957815,
"eval_loss": 0.16332698150424974,
"eval_runtime": 59.9427,
"eval_samples_per_second": 693.763,
"eval_steps_per_second": 0.35,
"step": 8900
},
{
"epoch": 4.149281409364859,
"grad_norm": 0.040892358869314194,
"learning_rate": 2.7499590642665774e-06,
"loss": 0.1297,
"step": 8950
},
{
"epoch": 4.149281409364859,
"eval_loss": 0.16260406271159317,
"eval_runtime": 60.3006,
"eval_samples_per_second": 689.645,
"eval_steps_per_second": 0.348,
"step": 8950
},
{
"epoch": 4.172461752433936,
"grad_norm": 0.05322985723614693,
"learning_rate": 2.496444112952734e-06,
"loss": 0.1298,
"step": 9000
},
{
"epoch": 4.172461752433936,
"eval_loss": 0.16165919748334914,
"eval_runtime": 59.7702,
"eval_samples_per_second": 695.765,
"eval_steps_per_second": 0.351,
"step": 9000
},
{
"epoch": 4.195642095503014,
"grad_norm": 0.04688135161995888,
"learning_rate": 2.2548879277963064e-06,
"loss": 0.1304,
"step": 9050
},
{
"epoch": 4.195642095503014,
"eval_loss": 0.16230118168852276,
"eval_runtime": 59.7759,
"eval_samples_per_second": 695.699,
"eval_steps_per_second": 0.351,
"step": 9050
},
{
"epoch": 4.218822438572091,
"grad_norm": 0.056906215846538544,
"learning_rate": 2.0253513192751373e-06,
"loss": 0.1302,
"step": 9100
},
{
"epoch": 4.218822438572091,
"eval_loss": 0.16160732294835795,
"eval_runtime": 59.9195,
"eval_samples_per_second": 694.031,
"eval_steps_per_second": 0.35,
"step": 9100
},
{
"epoch": 4.242002781641168,
"grad_norm": 0.05124938115477562,
"learning_rate": 1.807892072002898e-06,
"loss": 0.1298,
"step": 9150
},
{
"epoch": 4.242002781641168,
"eval_loss": 0.16257827555791163,
"eval_runtime": 59.809,
"eval_samples_per_second": 695.314,
"eval_steps_per_second": 0.351,
"step": 9150
},
{
"epoch": 4.265183124710246,
"grad_norm": 0.05366729572415352,
"learning_rate": 1.6025649301821876e-06,
"loss": 0.1294,
"step": 9200
},
{
"epoch": 4.265183124710246,
"eval_loss": 0.1625148541687181,
"eval_runtime": 60.1293,
"eval_samples_per_second": 691.61,
"eval_steps_per_second": 0.349,
"step": 9200
},
{
"epoch": 4.288363467779323,
"grad_norm": 0.04244421049952507,
"learning_rate": 1.4094215838229176e-06,
"loss": 0.1308,
"step": 9250
},
{
"epoch": 4.288363467779323,
"eval_loss": 0.16209612657051437,
"eval_runtime": 59.9231,
"eval_samples_per_second": 693.989,
"eval_steps_per_second": 0.35,
"step": 9250
},
{
"epoch": 4.3115438108484,
"grad_norm": 0.048628535121679306,
"learning_rate": 1.2285106557296477e-06,
"loss": 0.1302,
"step": 9300
},
{
"epoch": 4.3115438108484,
"eval_loss": 0.16243464161006987,
"eval_runtime": 59.5999,
"eval_samples_per_second": 697.753,
"eval_steps_per_second": 0.352,
"step": 9300
},
{
"epoch": 4.334724153917478,
"grad_norm": 0.0497569814324379,
"learning_rate": 1.0598776892610685e-06,
"loss": 0.1311,
"step": 9350
},
{
"epoch": 4.334724153917478,
"eval_loss": 0.16128818092832087,
"eval_runtime": 59.8848,
"eval_samples_per_second": 694.433,
"eval_steps_per_second": 0.351,
"step": 9350
},
{
"epoch": 4.3579044969865555,
"grad_norm": 0.07471216470003128,
"learning_rate": 9.035651368646648e-07,
"loss": 0.1304,
"step": 9400
},
{
"epoch": 4.3579044969865555,
"eval_loss": 0.16288733155633187,
"eval_runtime": 59.8227,
"eval_samples_per_second": 695.154,
"eval_steps_per_second": 0.351,
"step": 9400
},
{
"epoch": 4.381084840055633,
"grad_norm": 0.058552809059619904,
"learning_rate": 7.596123493895991e-07,
"loss": 0.13,
"step": 9450
},
{
"epoch": 4.381084840055633,
"eval_loss": 0.1634707775926499,
"eval_runtime": 59.9789,
"eval_samples_per_second": 693.344,
"eval_steps_per_second": 0.35,
"step": 9450
},
{
"epoch": 4.404265183124711,
"grad_norm": 0.05357597768306732,
"learning_rate": 6.280555661802856e-07,
"loss": 0.1295,
"step": 9500
},
{
"epoch": 4.404265183124711,
"eval_loss": 0.1615680252075211,
"eval_runtime": 60.1682,
"eval_samples_per_second": 691.163,
"eval_steps_per_second": 0.349,
"step": 9500
},
{
"epoch": 4.427445526193788,
"grad_norm": 0.05787508189678192,
"learning_rate": 5.089279059533658e-07,
"loss": 0.1305,
"step": 9550
},
{
"epoch": 4.427445526193788,
"eval_loss": 0.16174036094333355,
"eval_runtime": 60.1289,
"eval_samples_per_second": 691.615,
"eval_steps_per_second": 0.349,
"step": 9550
},
{
"epoch": 4.450625869262865,
"grad_norm": 0.049546804279088974,
"learning_rate": 4.02259358460233e-07,
"loss": 0.13,
"step": 9600
},
{
"epoch": 4.450625869262865,
"eval_loss": 0.16296962879417173,
"eval_runtime": 60.1209,
"eval_samples_per_second": 691.706,
"eval_steps_per_second": 0.349,
"step": 9600
},
{
"epoch": 4.473806212331943,
"grad_norm": 0.05137551948428154,
"learning_rate": 3.080767769372939e-07,
"loss": 0.1297,
"step": 9650
},
{
"epoch": 4.473806212331943,
"eval_loss": 0.16134209315513928,
"eval_runtime": 60.0886,
"eval_samples_per_second": 692.078,
"eval_steps_per_second": 0.349,
"step": 9650
},
{
"epoch": 4.49698655540102,
"grad_norm": 0.05584505572915077,
"learning_rate": 2.2640387134577058e-07,
"loss": 0.13,
"step": 9700
},
{
"epoch": 4.49698655540102,
"eval_loss": 0.1621784231504334,
"eval_runtime": 59.7716,
"eval_samples_per_second": 695.749,
"eval_steps_per_second": 0.351,
"step": 9700
},
{
"epoch": 4.520166898470097,
"grad_norm": 0.0450916662812233,
"learning_rate": 1.5726120240288634e-07,
"loss": 0.1302,
"step": 9750
},
{
"epoch": 4.520166898470097,
"eval_loss": 0.16172961751477263,
"eval_runtime": 59.9065,
"eval_samples_per_second": 694.182,
"eval_steps_per_second": 0.351,
"step": 9750
},
{
"epoch": 4.543347241539175,
"grad_norm": 0.0475350059568882,
"learning_rate": 1.0066617640578368e-07,
"loss": 0.1305,
"step": 9800
},
{
"epoch": 4.543347241539175,
"eval_loss": 0.16216248300305847,
"eval_runtime": 60.3498,
"eval_samples_per_second": 689.083,
"eval_steps_per_second": 0.348,
"step": 9800
},
{
"epoch": 4.566527584608252,
"grad_norm": 0.057694341987371445,
"learning_rate": 5.663304084960186e-08,
"loss": 0.1299,
"step": 9850
},
{
"epoch": 4.566527584608252,
"eval_loss": 0.16307967354038033,
"eval_runtime": 59.9352,
"eval_samples_per_second": 693.849,
"eval_steps_per_second": 0.35,
"step": 9850
},
{
"epoch": 4.589707927677329,
"grad_norm": 0.06310451030731201,
"learning_rate": 2.5172880840745873e-08,
"loss": 0.1299,
"step": 9900
},
{
"epoch": 4.589707927677329,
"eval_loss": 0.16178384342894997,
"eval_runtime": 60.0389,
"eval_samples_per_second": 692.651,
"eval_steps_per_second": 0.35,
"step": 9900
},
{
"epoch": 4.612888270746407,
"grad_norm": 0.041533030569553375,
"learning_rate": 6.293616306246586e-09,
"loss": 0.1307,
"step": 9950
},
{
"epoch": 4.612888270746407,
"eval_loss": 0.1629453700829326,
"eval_runtime": 59.9874,
"eval_samples_per_second": 693.246,
"eval_steps_per_second": 0.35,
"step": 9950
},
{
"epoch": 4.6360686138154845,
"grad_norm": 0.051685914397239685,
"learning_rate": 0.0,
"loss": 0.1293,
"step": 10000
},
{
"epoch": 4.6360686138154845,
"eval_loss": 0.161578423628233,
"eval_runtime": 60.1396,
"eval_samples_per_second": 691.491,
"eval_steps_per_second": 0.349,
"step": 10000
},
{
"epoch": 4.6360686138154845,
"step": 10000,
"total_flos": 1.2082504232914125e+17,
"train_loss": 0.134784215593338,
"train_runtime": 38606.1249,
"train_samples_per_second": 530.486,
"train_steps_per_second": 0.259
}
],
"logging_steps": 50,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"total_flos": 1.2082504232914125e+17,
"train_batch_size": 2048,
"trial_name": null,
"trial_params": null
}