molcrawl-rna-bert-medium / trainer_state.json
deskull's picture
Upload MolCrawl RNA BERT medium model (60k steps)
02a0e57 verified
{
"best_metric": 4.588395118713379,
"best_model_checkpoint": "learning_source_20260316/rna/bert-output/rna-medium/checkpoint-59600",
"epoch": 0.9115595435441549,
"eval_steps": 100,
"global_step": 60000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015192659059069249,
"grad_norm": 0.36328116059303284,
"learning_rate": 3e-06,
"loss": 10.2311,
"step": 100
},
{
"epoch": 0.0015192659059069249,
"eval_loss": 10.090826988220215,
"eval_runtime": 193.145,
"eval_samples_per_second": 51.775,
"eval_steps_per_second": 6.472,
"step": 100
},
{
"epoch": 0.0030385318118138498,
"grad_norm": 0.2686520218849182,
"learning_rate": 6e-06,
"loss": 9.9461,
"step": 200
},
{
"epoch": 0.0030385318118138498,
"eval_loss": 9.758237838745117,
"eval_runtime": 193.1826,
"eval_samples_per_second": 51.764,
"eval_steps_per_second": 6.471,
"step": 200
},
{
"epoch": 0.004557797717720775,
"grad_norm": 0.9466120004653931,
"learning_rate": 5.989966555183947e-06,
"loss": 9.6291,
"step": 300
},
{
"epoch": 0.004557797717720775,
"eval_loss": 9.439615249633789,
"eval_runtime": 193.0329,
"eval_samples_per_second": 51.805,
"eval_steps_per_second": 6.476,
"step": 300
},
{
"epoch": 0.0060770636236276996,
"grad_norm": 0.4738225042819977,
"learning_rate": 5.979933110367893e-06,
"loss": 9.3896,
"step": 400
},
{
"epoch": 0.0060770636236276996,
"eval_loss": 9.214252471923828,
"eval_runtime": 193.5666,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 400
},
{
"epoch": 0.007596329529534624,
"grad_norm": 0.7529183626174927,
"learning_rate": 5.96989966555184e-06,
"loss": 9.2425,
"step": 500
},
{
"epoch": 0.007596329529534624,
"eval_loss": 9.07589340209961,
"eval_runtime": 193.2214,
"eval_samples_per_second": 51.754,
"eval_steps_per_second": 6.469,
"step": 500
},
{
"epoch": 0.00911559543544155,
"grad_norm": 0.48392170667648315,
"learning_rate": 5.959866220735786e-06,
"loss": 9.1413,
"step": 600
},
{
"epoch": 0.00911559543544155,
"eval_loss": 8.97410774230957,
"eval_runtime": 192.9985,
"eval_samples_per_second": 51.814,
"eval_steps_per_second": 6.477,
"step": 600
},
{
"epoch": 0.010634861341348474,
"grad_norm": 0.6194415092468262,
"learning_rate": 5.949832775919732e-06,
"loss": 9.0633,
"step": 700
},
{
"epoch": 0.010634861341348474,
"eval_loss": 8.891473770141602,
"eval_runtime": 193.0661,
"eval_samples_per_second": 51.796,
"eval_steps_per_second": 6.474,
"step": 700
},
{
"epoch": 0.012154127247255399,
"grad_norm": 2.9033119678497314,
"learning_rate": 5.939799331103679e-06,
"loss": 9.0036,
"step": 800
},
{
"epoch": 0.012154127247255399,
"eval_loss": 8.827642440795898,
"eval_runtime": 192.8551,
"eval_samples_per_second": 51.852,
"eval_steps_per_second": 6.482,
"step": 800
},
{
"epoch": 0.013673393153162324,
"grad_norm": 2.6778995990753174,
"learning_rate": 5.929765886287626e-06,
"loss": 8.9573,
"step": 900
},
{
"epoch": 0.013673393153162324,
"eval_loss": 8.794682502746582,
"eval_runtime": 193.0047,
"eval_samples_per_second": 51.812,
"eval_steps_per_second": 6.477,
"step": 900
},
{
"epoch": 0.015192659059069248,
"grad_norm": 2.586425304412842,
"learning_rate": 5.919732441471572e-06,
"loss": 8.927,
"step": 1000
},
{
"epoch": 0.015192659059069248,
"eval_loss": 8.765641212463379,
"eval_runtime": 193.0239,
"eval_samples_per_second": 51.807,
"eval_steps_per_second": 6.476,
"step": 1000
},
{
"epoch": 0.016711924964976175,
"grad_norm": 3.287247657775879,
"learning_rate": 5.9096989966555185e-06,
"loss": 8.8995,
"step": 1100
},
{
"epoch": 0.016711924964976175,
"eval_loss": 8.736359596252441,
"eval_runtime": 193.2903,
"eval_samples_per_second": 51.736,
"eval_steps_per_second": 6.467,
"step": 1100
},
{
"epoch": 0.0182311908708831,
"grad_norm": 3.2760348320007324,
"learning_rate": 5.899665551839465e-06,
"loss": 8.8734,
"step": 1200
},
{
"epoch": 0.0182311908708831,
"eval_loss": 8.70866870880127,
"eval_runtime": 193.2723,
"eval_samples_per_second": 51.74,
"eval_steps_per_second": 6.468,
"step": 1200
},
{
"epoch": 0.019750456776790024,
"grad_norm": 3.7369821071624756,
"learning_rate": 5.889632107023412e-06,
"loss": 8.8472,
"step": 1300
},
{
"epoch": 0.019750456776790024,
"eval_loss": 8.686373710632324,
"eval_runtime": 193.3013,
"eval_samples_per_second": 51.733,
"eval_steps_per_second": 6.467,
"step": 1300
},
{
"epoch": 0.02126972268269695,
"grad_norm": 3.6638362407684326,
"learning_rate": 5.879598662207358e-06,
"loss": 8.8219,
"step": 1400
},
{
"epoch": 0.02126972268269695,
"eval_loss": 8.66162109375,
"eval_runtime": 193.4171,
"eval_samples_per_second": 51.702,
"eval_steps_per_second": 6.463,
"step": 1400
},
{
"epoch": 0.022788988588603874,
"grad_norm": 3.1928629875183105,
"learning_rate": 5.869565217391305e-06,
"loss": 8.7973,
"step": 1500
},
{
"epoch": 0.022788988588603874,
"eval_loss": 8.633343696594238,
"eval_runtime": 193.7794,
"eval_samples_per_second": 51.605,
"eval_steps_per_second": 6.451,
"step": 1500
},
{
"epoch": 0.024308254494510798,
"grad_norm": 3.5108275413513184,
"learning_rate": 5.8595317725752514e-06,
"loss": 8.7682,
"step": 1600
},
{
"epoch": 0.024308254494510798,
"eval_loss": 8.609291076660156,
"eval_runtime": 193.2885,
"eval_samples_per_second": 51.736,
"eval_steps_per_second": 6.467,
"step": 1600
},
{
"epoch": 0.025827520400417723,
"grad_norm": 6.164127349853516,
"learning_rate": 5.849498327759197e-06,
"loss": 8.7411,
"step": 1700
},
{
"epoch": 0.025827520400417723,
"eval_loss": 8.578601837158203,
"eval_runtime": 193.259,
"eval_samples_per_second": 51.744,
"eval_steps_per_second": 6.468,
"step": 1700
},
{
"epoch": 0.027346786306324648,
"grad_norm": 2.5621981620788574,
"learning_rate": 5.839464882943144e-06,
"loss": 8.7198,
"step": 1800
},
{
"epoch": 0.027346786306324648,
"eval_loss": 8.555196762084961,
"eval_runtime": 193.4982,
"eval_samples_per_second": 51.68,
"eval_steps_per_second": 6.46,
"step": 1800
},
{
"epoch": 0.028866052212231572,
"grad_norm": 2.957981586456299,
"learning_rate": 5.829431438127091e-06,
"loss": 8.6935,
"step": 1900
},
{
"epoch": 0.028866052212231572,
"eval_loss": 8.530313491821289,
"eval_runtime": 193.7009,
"eval_samples_per_second": 51.626,
"eval_steps_per_second": 6.453,
"step": 1900
},
{
"epoch": 0.030385318118138497,
"grad_norm": 5.7702836990356445,
"learning_rate": 5.819397993311037e-06,
"loss": 8.6684,
"step": 2000
},
{
"epoch": 0.030385318118138497,
"eval_loss": 8.509552001953125,
"eval_runtime": 193.4646,
"eval_samples_per_second": 51.689,
"eval_steps_per_second": 6.461,
"step": 2000
},
{
"epoch": 0.03190458402404542,
"grad_norm": 3.653986930847168,
"learning_rate": 5.8093645484949836e-06,
"loss": 8.6505,
"step": 2100
},
{
"epoch": 0.03190458402404542,
"eval_loss": 8.499613761901855,
"eval_runtime": 193.2316,
"eval_samples_per_second": 51.751,
"eval_steps_per_second": 6.469,
"step": 2100
},
{
"epoch": 0.03342384992995235,
"grad_norm": 4.66618537902832,
"learning_rate": 5.79933110367893e-06,
"loss": 8.6175,
"step": 2200
},
{
"epoch": 0.03342384992995235,
"eval_loss": 8.473803520202637,
"eval_runtime": 193.4189,
"eval_samples_per_second": 51.701,
"eval_steps_per_second": 6.463,
"step": 2200
},
{
"epoch": 0.034943115835859274,
"grad_norm": 0.7005074620246887,
"learning_rate": 5.789297658862876e-06,
"loss": 8.5932,
"step": 2300
},
{
"epoch": 0.034943115835859274,
"eval_loss": 8.452431678771973,
"eval_runtime": 193.2263,
"eval_samples_per_second": 51.753,
"eval_steps_per_second": 6.469,
"step": 2300
},
{
"epoch": 0.0364623817417662,
"grad_norm": 5.592404842376709,
"learning_rate": 5.779264214046823e-06,
"loss": 8.572,
"step": 2400
},
{
"epoch": 0.0364623817417662,
"eval_loss": 8.448452949523926,
"eval_runtime": 193.5678,
"eval_samples_per_second": 51.661,
"eval_steps_per_second": 6.458,
"step": 2400
},
{
"epoch": 0.037981647647673124,
"grad_norm": 4.363527297973633,
"learning_rate": 5.76923076923077e-06,
"loss": 8.5536,
"step": 2500
},
{
"epoch": 0.037981647647673124,
"eval_loss": 8.417658805847168,
"eval_runtime": 193.2714,
"eval_samples_per_second": 51.741,
"eval_steps_per_second": 6.468,
"step": 2500
},
{
"epoch": 0.03950091355358005,
"grad_norm": 4.716485023498535,
"learning_rate": 5.759197324414716e-06,
"loss": 8.5332,
"step": 2600
},
{
"epoch": 0.03950091355358005,
"eval_loss": 8.41653823852539,
"eval_runtime": 193.336,
"eval_samples_per_second": 51.723,
"eval_steps_per_second": 6.465,
"step": 2600
},
{
"epoch": 0.04102017945948697,
"grad_norm": 2.145522117614746,
"learning_rate": 5.7491638795986624e-06,
"loss": 8.5152,
"step": 2700
},
{
"epoch": 0.04102017945948697,
"eval_loss": 8.391885757446289,
"eval_runtime": 193.7068,
"eval_samples_per_second": 51.624,
"eval_steps_per_second": 6.453,
"step": 2700
},
{
"epoch": 0.0425394453653939,
"grad_norm": 3.36438250541687,
"learning_rate": 5.739130434782609e-06,
"loss": 8.4964,
"step": 2800
},
{
"epoch": 0.0425394453653939,
"eval_loss": 8.382240295410156,
"eval_runtime": 193.7119,
"eval_samples_per_second": 51.623,
"eval_steps_per_second": 6.453,
"step": 2800
},
{
"epoch": 0.04405871127130082,
"grad_norm": 3.0056991577148438,
"learning_rate": 5.729096989966555e-06,
"loss": 8.4811,
"step": 2900
},
{
"epoch": 0.04405871127130082,
"eval_loss": 8.374021530151367,
"eval_runtime": 193.9566,
"eval_samples_per_second": 51.558,
"eval_steps_per_second": 6.445,
"step": 2900
},
{
"epoch": 0.04557797717720775,
"grad_norm": 2.388469696044922,
"learning_rate": 5.719063545150502e-06,
"loss": 8.4762,
"step": 3000
},
{
"epoch": 0.04557797717720775,
"eval_loss": 8.371816635131836,
"eval_runtime": 193.5842,
"eval_samples_per_second": 51.657,
"eval_steps_per_second": 6.457,
"step": 3000
},
{
"epoch": 0.04709724308311467,
"grad_norm": 4.248419761657715,
"learning_rate": 5.709030100334449e-06,
"loss": 8.458,
"step": 3100
},
{
"epoch": 0.04709724308311467,
"eval_loss": 8.359615325927734,
"eval_runtime": 193.612,
"eval_samples_per_second": 51.65,
"eval_steps_per_second": 6.456,
"step": 3100
},
{
"epoch": 0.048616508989021597,
"grad_norm": 1.2234629392623901,
"learning_rate": 5.698996655518395e-06,
"loss": 8.442,
"step": 3200
},
{
"epoch": 0.048616508989021597,
"eval_loss": 8.356290817260742,
"eval_runtime": 193.4203,
"eval_samples_per_second": 51.701,
"eval_steps_per_second": 6.463,
"step": 3200
},
{
"epoch": 0.05013577489492852,
"grad_norm": 1.149261236190796,
"learning_rate": 5.688963210702341e-06,
"loss": 8.434,
"step": 3300
},
{
"epoch": 0.05013577489492852,
"eval_loss": 8.348698616027832,
"eval_runtime": 193.5122,
"eval_samples_per_second": 51.676,
"eval_steps_per_second": 6.46,
"step": 3300
},
{
"epoch": 0.051655040800835446,
"grad_norm": 3.746015787124634,
"learning_rate": 5.678929765886288e-06,
"loss": 8.4225,
"step": 3400
},
{
"epoch": 0.051655040800835446,
"eval_loss": 8.341509819030762,
"eval_runtime": 193.694,
"eval_samples_per_second": 51.628,
"eval_steps_per_second": 6.453,
"step": 3400
},
{
"epoch": 0.05317430670674237,
"grad_norm": 3.512450933456421,
"learning_rate": 5.668896321070235e-06,
"loss": 8.4084,
"step": 3500
},
{
"epoch": 0.05317430670674237,
"eval_loss": 8.333552360534668,
"eval_runtime": 193.637,
"eval_samples_per_second": 51.643,
"eval_steps_per_second": 6.455,
"step": 3500
},
{
"epoch": 0.054693572612649295,
"grad_norm": 2.823720693588257,
"learning_rate": 5.658862876254181e-06,
"loss": 8.401,
"step": 3600
},
{
"epoch": 0.054693572612649295,
"eval_loss": 8.334371566772461,
"eval_runtime": 193.5758,
"eval_samples_per_second": 51.659,
"eval_steps_per_second": 6.457,
"step": 3600
},
{
"epoch": 0.05621283851855622,
"grad_norm": 3.2911577224731445,
"learning_rate": 5.6488294314381275e-06,
"loss": 8.3905,
"step": 3700
},
{
"epoch": 0.05621283851855622,
"eval_loss": 8.324334144592285,
"eval_runtime": 193.6614,
"eval_samples_per_second": 51.637,
"eval_steps_per_second": 6.455,
"step": 3700
},
{
"epoch": 0.057732104424463145,
"grad_norm": 2.3814852237701416,
"learning_rate": 5.638795986622074e-06,
"loss": 8.3799,
"step": 3800
},
{
"epoch": 0.057732104424463145,
"eval_loss": 8.320505142211914,
"eval_runtime": 193.6578,
"eval_samples_per_second": 51.637,
"eval_steps_per_second": 6.455,
"step": 3800
},
{
"epoch": 0.05925137033037007,
"grad_norm": 3.9368467330932617,
"learning_rate": 5.62876254180602e-06,
"loss": 8.3716,
"step": 3900
},
{
"epoch": 0.05925137033037007,
"eval_loss": 8.320087432861328,
"eval_runtime": 193.6177,
"eval_samples_per_second": 51.648,
"eval_steps_per_second": 6.456,
"step": 3900
},
{
"epoch": 0.060770636236276994,
"grad_norm": 3.7462780475616455,
"learning_rate": 5.618729096989967e-06,
"loss": 8.366,
"step": 4000
},
{
"epoch": 0.060770636236276994,
"eval_loss": 8.314221382141113,
"eval_runtime": 193.8249,
"eval_samples_per_second": 51.593,
"eval_steps_per_second": 6.449,
"step": 4000
},
{
"epoch": 0.06228990214218392,
"grad_norm": 4.8095598220825195,
"learning_rate": 5.608695652173914e-06,
"loss": 8.3588,
"step": 4100
},
{
"epoch": 0.06228990214218392,
"eval_loss": 8.31184196472168,
"eval_runtime": 193.6735,
"eval_samples_per_second": 51.633,
"eval_steps_per_second": 6.454,
"step": 4100
},
{
"epoch": 0.06380916804809084,
"grad_norm": 1.4702892303466797,
"learning_rate": 5.59866220735786e-06,
"loss": 8.349,
"step": 4200
},
{
"epoch": 0.06380916804809084,
"eval_loss": 8.315442085266113,
"eval_runtime": 193.7492,
"eval_samples_per_second": 51.613,
"eval_steps_per_second": 6.452,
"step": 4200
},
{
"epoch": 0.06532843395399776,
"grad_norm": 1.3424737453460693,
"learning_rate": 5.588628762541806e-06,
"loss": 8.3377,
"step": 4300
},
{
"epoch": 0.06532843395399776,
"eval_loss": 8.299623489379883,
"eval_runtime": 193.8901,
"eval_samples_per_second": 51.576,
"eval_steps_per_second": 6.447,
"step": 4300
},
{
"epoch": 0.0668476998599047,
"grad_norm": 3.2085587978363037,
"learning_rate": 5.578595317725753e-06,
"loss": 8.3276,
"step": 4400
},
{
"epoch": 0.0668476998599047,
"eval_loss": 8.291953086853027,
"eval_runtime": 193.3945,
"eval_samples_per_second": 51.708,
"eval_steps_per_second": 6.463,
"step": 4400
},
{
"epoch": 0.06836696576581162,
"grad_norm": 3.0818777084350586,
"learning_rate": 5.568561872909699e-06,
"loss": 8.3213,
"step": 4500
},
{
"epoch": 0.06836696576581162,
"eval_loss": 8.284076690673828,
"eval_runtime": 193.6893,
"eval_samples_per_second": 51.629,
"eval_steps_per_second": 6.454,
"step": 4500
},
{
"epoch": 0.06988623167171855,
"grad_norm": 2.9899518489837646,
"learning_rate": 5.558528428093646e-06,
"loss": 8.3146,
"step": 4600
},
{
"epoch": 0.06988623167171855,
"eval_loss": 8.288785934448242,
"eval_runtime": 193.4136,
"eval_samples_per_second": 51.703,
"eval_steps_per_second": 6.463,
"step": 4600
},
{
"epoch": 0.07140549757762547,
"grad_norm": 3.5509963035583496,
"learning_rate": 5.548494983277593e-06,
"loss": 8.3073,
"step": 4700
},
{
"epoch": 0.07140549757762547,
"eval_loss": 8.283821105957031,
"eval_runtime": 193.5669,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 4700
},
{
"epoch": 0.0729247634835324,
"grad_norm": 3.2348263263702393,
"learning_rate": 5.5384615384615385e-06,
"loss": 8.3002,
"step": 4800
},
{
"epoch": 0.0729247634835324,
"eval_loss": 8.275022506713867,
"eval_runtime": 193.5313,
"eval_samples_per_second": 51.671,
"eval_steps_per_second": 6.459,
"step": 4800
},
{
"epoch": 0.07444402938943932,
"grad_norm": 3.035083293914795,
"learning_rate": 5.528428093645485e-06,
"loss": 8.2929,
"step": 4900
},
{
"epoch": 0.07444402938943932,
"eval_loss": 8.270652770996094,
"eval_runtime": 193.6056,
"eval_samples_per_second": 51.651,
"eval_steps_per_second": 6.456,
"step": 4900
},
{
"epoch": 0.07596329529534625,
"grad_norm": 2.820732593536377,
"learning_rate": 5.518394648829432e-06,
"loss": 8.2858,
"step": 5000
},
{
"epoch": 0.07596329529534625,
"eval_loss": 8.268174171447754,
"eval_runtime": 193.8467,
"eval_samples_per_second": 51.587,
"eval_steps_per_second": 6.448,
"step": 5000
},
{
"epoch": 0.07748256120125317,
"grad_norm": 3.0247511863708496,
"learning_rate": 5.508361204013378e-06,
"loss": 8.2786,
"step": 5100
},
{
"epoch": 0.07748256120125317,
"eval_loss": 8.26013469696045,
"eval_runtime": 193.8849,
"eval_samples_per_second": 51.577,
"eval_steps_per_second": 6.447,
"step": 5100
},
{
"epoch": 0.0790018271071601,
"grad_norm": 3.4587104320526123,
"learning_rate": 5.498327759197324e-06,
"loss": 8.2727,
"step": 5200
},
{
"epoch": 0.0790018271071601,
"eval_loss": 8.258410453796387,
"eval_runtime": 193.8157,
"eval_samples_per_second": 51.595,
"eval_steps_per_second": 6.449,
"step": 5200
},
{
"epoch": 0.08052109301306702,
"grad_norm": 3.1752476692199707,
"learning_rate": 5.488294314381271e-06,
"loss": 8.269,
"step": 5300
},
{
"epoch": 0.08052109301306702,
"eval_loss": 8.251901626586914,
"eval_runtime": 193.7661,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 5300
},
{
"epoch": 0.08204035891897395,
"grad_norm": 2.231090784072876,
"learning_rate": 5.478260869565217e-06,
"loss": 8.2612,
"step": 5400
},
{
"epoch": 0.08204035891897395,
"eval_loss": 8.248543739318848,
"eval_runtime": 193.7807,
"eval_samples_per_second": 51.605,
"eval_steps_per_second": 6.451,
"step": 5400
},
{
"epoch": 0.08355962482488087,
"grad_norm": 1.8496346473693848,
"learning_rate": 5.468227424749163e-06,
"loss": 8.2594,
"step": 5500
},
{
"epoch": 0.08355962482488087,
"eval_loss": 8.25257396697998,
"eval_runtime": 193.5194,
"eval_samples_per_second": 51.674,
"eval_steps_per_second": 6.459,
"step": 5500
},
{
"epoch": 0.0850788907307878,
"grad_norm": 2.26971435546875,
"learning_rate": 5.45819397993311e-06,
"loss": 8.2519,
"step": 5600
},
{
"epoch": 0.0850788907307878,
"eval_loss": 8.239155769348145,
"eval_runtime": 193.6858,
"eval_samples_per_second": 51.63,
"eval_steps_per_second": 6.454,
"step": 5600
},
{
"epoch": 0.08659815663669472,
"grad_norm": 4.062191963195801,
"learning_rate": 5.448160535117057e-06,
"loss": 8.2494,
"step": 5700
},
{
"epoch": 0.08659815663669472,
"eval_loss": 8.248674392700195,
"eval_runtime": 193.7941,
"eval_samples_per_second": 51.601,
"eval_steps_per_second": 6.45,
"step": 5700
},
{
"epoch": 0.08811742254260164,
"grad_norm": 2.0019612312316895,
"learning_rate": 5.438127090301003e-06,
"loss": 8.246,
"step": 5800
},
{
"epoch": 0.08811742254260164,
"eval_loss": 8.234210968017578,
"eval_runtime": 193.7021,
"eval_samples_per_second": 51.626,
"eval_steps_per_second": 6.453,
"step": 5800
},
{
"epoch": 0.08963668844850857,
"grad_norm": 3.5080573558807373,
"learning_rate": 5.4280936454849495e-06,
"loss": 8.2391,
"step": 5900
},
{
"epoch": 0.08963668844850857,
"eval_loss": 8.240001678466797,
"eval_runtime": 193.5894,
"eval_samples_per_second": 51.656,
"eval_steps_per_second": 6.457,
"step": 5900
},
{
"epoch": 0.0911559543544155,
"grad_norm": 2.578500747680664,
"learning_rate": 5.418060200668896e-06,
"loss": 8.2361,
"step": 6000
},
{
"epoch": 0.0911559543544155,
"eval_loss": 8.238499641418457,
"eval_runtime": 193.7443,
"eval_samples_per_second": 51.614,
"eval_steps_per_second": 6.452,
"step": 6000
},
{
"epoch": 0.09267522026032242,
"grad_norm": 2.7456629276275635,
"learning_rate": 5.408026755852843e-06,
"loss": 8.2331,
"step": 6100
},
{
"epoch": 0.09267522026032242,
"eval_loss": 8.225603103637695,
"eval_runtime": 193.7051,
"eval_samples_per_second": 51.625,
"eval_steps_per_second": 6.453,
"step": 6100
},
{
"epoch": 0.09419448616622934,
"grad_norm": 1.1776982545852661,
"learning_rate": 5.397993311036789e-06,
"loss": 8.2294,
"step": 6200
},
{
"epoch": 0.09419448616622934,
"eval_loss": 8.235060691833496,
"eval_runtime": 193.5474,
"eval_samples_per_second": 51.667,
"eval_steps_per_second": 6.458,
"step": 6200
},
{
"epoch": 0.09571375207213627,
"grad_norm": 3.159752130508423,
"learning_rate": 5.387959866220736e-06,
"loss": 8.2273,
"step": 6300
},
{
"epoch": 0.09571375207213627,
"eval_loss": 8.22216510772705,
"eval_runtime": 193.5026,
"eval_samples_per_second": 51.679,
"eval_steps_per_second": 6.46,
"step": 6300
},
{
"epoch": 0.09723301797804319,
"grad_norm": 2.37727427482605,
"learning_rate": 5.3779264214046825e-06,
"loss": 8.2231,
"step": 6400
},
{
"epoch": 0.09723301797804319,
"eval_loss": 8.222684860229492,
"eval_runtime": 193.5974,
"eval_samples_per_second": 51.654,
"eval_steps_per_second": 6.457,
"step": 6400
},
{
"epoch": 0.09875228388395012,
"grad_norm": 2.0136072635650635,
"learning_rate": 5.367892976588628e-06,
"loss": 8.2203,
"step": 6500
},
{
"epoch": 0.09875228388395012,
"eval_loss": 8.220030784606934,
"eval_runtime": 193.6795,
"eval_samples_per_second": 51.632,
"eval_steps_per_second": 6.454,
"step": 6500
},
{
"epoch": 0.10027154978985704,
"grad_norm": 2.404653787612915,
"learning_rate": 5.357859531772575e-06,
"loss": 8.2154,
"step": 6600
},
{
"epoch": 0.10027154978985704,
"eval_loss": 8.219395637512207,
"eval_runtime": 193.5124,
"eval_samples_per_second": 51.676,
"eval_steps_per_second": 6.46,
"step": 6600
},
{
"epoch": 0.10179081569576397,
"grad_norm": 1.6043188571929932,
"learning_rate": 5.347826086956522e-06,
"loss": 8.2128,
"step": 6700
},
{
"epoch": 0.10179081569576397,
"eval_loss": 8.216435432434082,
"eval_runtime": 193.766,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 6700
},
{
"epoch": 0.10331008160167089,
"grad_norm": 2.3386034965515137,
"learning_rate": 5.337792642140468e-06,
"loss": 8.2079,
"step": 6800
},
{
"epoch": 0.10331008160167089,
"eval_loss": 8.212626457214355,
"eval_runtime": 193.6166,
"eval_samples_per_second": 51.648,
"eval_steps_per_second": 6.456,
"step": 6800
},
{
"epoch": 0.10482934750757782,
"grad_norm": 2.259270668029785,
"learning_rate": 5.327759197324415e-06,
"loss": 8.2067,
"step": 6900
},
{
"epoch": 0.10482934750757782,
"eval_loss": 8.208475112915039,
"eval_runtime": 193.3797,
"eval_samples_per_second": 51.712,
"eval_steps_per_second": 6.464,
"step": 6900
},
{
"epoch": 0.10634861341348474,
"grad_norm": 2.469719409942627,
"learning_rate": 5.317725752508361e-06,
"loss": 8.1994,
"step": 7000
},
{
"epoch": 0.10634861341348474,
"eval_loss": 8.199501037597656,
"eval_runtime": 193.3429,
"eval_samples_per_second": 51.722,
"eval_steps_per_second": 6.465,
"step": 7000
},
{
"epoch": 0.10786787931939167,
"grad_norm": 4.370075702667236,
"learning_rate": 5.307692307692307e-06,
"loss": 8.1678,
"step": 7100
},
{
"epoch": 0.10786787931939167,
"eval_loss": 8.09277629852295,
"eval_runtime": 193.6702,
"eval_samples_per_second": 51.634,
"eval_steps_per_second": 6.454,
"step": 7100
},
{
"epoch": 0.10938714522529859,
"grad_norm": 5.548232555389404,
"learning_rate": 5.297658862876254e-06,
"loss": 8.0856,
"step": 7200
},
{
"epoch": 0.10938714522529859,
"eval_loss": 8.040851593017578,
"eval_runtime": 193.617,
"eval_samples_per_second": 51.648,
"eval_steps_per_second": 6.456,
"step": 7200
},
{
"epoch": 0.11090641113120552,
"grad_norm": 2.152247428894043,
"learning_rate": 5.287625418060201e-06,
"loss": 8.0478,
"step": 7300
},
{
"epoch": 0.11090641113120552,
"eval_loss": 7.996228218078613,
"eval_runtime": 193.613,
"eval_samples_per_second": 51.649,
"eval_steps_per_second": 6.456,
"step": 7300
},
{
"epoch": 0.11242567703711244,
"grad_norm": 5.204161167144775,
"learning_rate": 5.277591973244147e-06,
"loss": 8.0095,
"step": 7400
},
{
"epoch": 0.11242567703711244,
"eval_loss": 7.958820343017578,
"eval_runtime": 193.5944,
"eval_samples_per_second": 51.654,
"eval_steps_per_second": 6.457,
"step": 7400
},
{
"epoch": 0.11394494294301936,
"grad_norm": 7.082394123077393,
"learning_rate": 5.2675585284280935e-06,
"loss": 7.978,
"step": 7500
},
{
"epoch": 0.11394494294301936,
"eval_loss": 7.932178020477295,
"eval_runtime": 193.798,
"eval_samples_per_second": 51.6,
"eval_steps_per_second": 6.45,
"step": 7500
},
{
"epoch": 0.11546420884892629,
"grad_norm": 8.926252365112305,
"learning_rate": 5.25752508361204e-06,
"loss": 7.9505,
"step": 7600
},
{
"epoch": 0.11546420884892629,
"eval_loss": 7.882853031158447,
"eval_runtime": 193.6404,
"eval_samples_per_second": 51.642,
"eval_steps_per_second": 6.455,
"step": 7600
},
{
"epoch": 0.11698347475483321,
"grad_norm": 3.5671885013580322,
"learning_rate": 5.247491638795986e-06,
"loss": 7.9086,
"step": 7700
},
{
"epoch": 0.11698347475483321,
"eval_loss": 7.840451717376709,
"eval_runtime": 193.8911,
"eval_samples_per_second": 51.575,
"eval_steps_per_second": 6.447,
"step": 7700
},
{
"epoch": 0.11850274066074014,
"grad_norm": 5.790298938751221,
"learning_rate": 5.237458193979933e-06,
"loss": 7.861,
"step": 7800
},
{
"epoch": 0.11850274066074014,
"eval_loss": 7.790124416351318,
"eval_runtime": 193.8987,
"eval_samples_per_second": 51.573,
"eval_steps_per_second": 6.447,
"step": 7800
},
{
"epoch": 0.12002200656664706,
"grad_norm": 4.439774513244629,
"learning_rate": 5.22742474916388e-06,
"loss": 7.8082,
"step": 7900
},
{
"epoch": 0.12002200656664706,
"eval_loss": 7.72878885269165,
"eval_runtime": 193.9191,
"eval_samples_per_second": 51.568,
"eval_steps_per_second": 6.446,
"step": 7900
},
{
"epoch": 0.12154127247255399,
"grad_norm": 3.937167167663574,
"learning_rate": 5.2173913043478265e-06,
"loss": 7.757,
"step": 8000
},
{
"epoch": 0.12154127247255399,
"eval_loss": 7.679111003875732,
"eval_runtime": 193.7697,
"eval_samples_per_second": 51.608,
"eval_steps_per_second": 6.451,
"step": 8000
},
{
"epoch": 0.12306053837846091,
"grad_norm": 4.227074146270752,
"learning_rate": 5.207357859531772e-06,
"loss": 7.7088,
"step": 8100
},
{
"epoch": 0.12306053837846091,
"eval_loss": 7.634475231170654,
"eval_runtime": 193.766,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 8100
},
{
"epoch": 0.12457980428436784,
"grad_norm": 3.042202949523926,
"learning_rate": 5.197324414715719e-06,
"loss": 7.6639,
"step": 8200
},
{
"epoch": 0.12457980428436784,
"eval_loss": 7.605250835418701,
"eval_runtime": 193.8104,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 8200
},
{
"epoch": 0.12609907019027475,
"grad_norm": 4.436267375946045,
"learning_rate": 5.187290969899666e-06,
"loss": 7.6256,
"step": 8300
},
{
"epoch": 0.12609907019027475,
"eval_loss": 7.549579620361328,
"eval_runtime": 193.7749,
"eval_samples_per_second": 51.606,
"eval_steps_per_second": 6.451,
"step": 8300
},
{
"epoch": 0.12761833609618167,
"grad_norm": 3.9829390048980713,
"learning_rate": 5.177257525083612e-06,
"loss": 7.5838,
"step": 8400
},
{
"epoch": 0.12761833609618167,
"eval_loss": 7.504631042480469,
"eval_runtime": 193.7766,
"eval_samples_per_second": 51.606,
"eval_steps_per_second": 6.451,
"step": 8400
},
{
"epoch": 0.1291376020020886,
"grad_norm": 3.072918176651001,
"learning_rate": 5.167224080267559e-06,
"loss": 7.5446,
"step": 8500
},
{
"epoch": 0.1291376020020886,
"eval_loss": 7.467813968658447,
"eval_runtime": 194.0927,
"eval_samples_per_second": 51.522,
"eval_steps_per_second": 6.44,
"step": 8500
},
{
"epoch": 0.13065686790799552,
"grad_norm": 3.6358697414398193,
"learning_rate": 5.157190635451505e-06,
"loss": 7.5114,
"step": 8600
},
{
"epoch": 0.13065686790799552,
"eval_loss": 7.4314045906066895,
"eval_runtime": 197.5998,
"eval_samples_per_second": 50.607,
"eval_steps_per_second": 6.326,
"step": 8600
},
{
"epoch": 0.13217613381390245,
"grad_norm": 2.4115982055664062,
"learning_rate": 5.147157190635451e-06,
"loss": 7.4749,
"step": 8700
},
{
"epoch": 0.13217613381390245,
"eval_loss": 7.399141311645508,
"eval_runtime": 193.7763,
"eval_samples_per_second": 51.606,
"eval_steps_per_second": 6.451,
"step": 8700
},
{
"epoch": 0.1336953997198094,
"grad_norm": 3.8994717597961426,
"learning_rate": 5.137123745819398e-06,
"loss": 7.4388,
"step": 8800
},
{
"epoch": 0.1336953997198094,
"eval_loss": 7.3639349937438965,
"eval_runtime": 193.6905,
"eval_samples_per_second": 51.629,
"eval_steps_per_second": 6.454,
"step": 8800
},
{
"epoch": 0.13521466562571632,
"grad_norm": 3.6934337615966797,
"learning_rate": 5.127090301003345e-06,
"loss": 7.4034,
"step": 8900
},
{
"epoch": 0.13521466562571632,
"eval_loss": 7.327520370483398,
"eval_runtime": 193.686,
"eval_samples_per_second": 51.63,
"eval_steps_per_second": 6.454,
"step": 8900
},
{
"epoch": 0.13673393153162325,
"grad_norm": 3.4741897583007812,
"learning_rate": 5.117056856187291e-06,
"loss": 7.3684,
"step": 9000
},
{
"epoch": 0.13673393153162325,
"eval_loss": 7.294392108917236,
"eval_runtime": 193.3544,
"eval_samples_per_second": 51.719,
"eval_steps_per_second": 6.465,
"step": 9000
},
{
"epoch": 0.13825319743753017,
"grad_norm": 4.130598545074463,
"learning_rate": 5.1070234113712375e-06,
"loss": 7.3363,
"step": 9100
},
{
"epoch": 0.13825319743753017,
"eval_loss": 7.256102561950684,
"eval_runtime": 193.8255,
"eval_samples_per_second": 51.593,
"eval_steps_per_second": 6.449,
"step": 9100
},
{
"epoch": 0.1397724633434371,
"grad_norm": 3.8802666664123535,
"learning_rate": 5.096989966555184e-06,
"loss": 7.3054,
"step": 9200
},
{
"epoch": 0.1397724633434371,
"eval_loss": 7.220945358276367,
"eval_runtime": 193.6267,
"eval_samples_per_second": 51.646,
"eval_steps_per_second": 6.456,
"step": 9200
},
{
"epoch": 0.14129172924934402,
"grad_norm": 3.072411298751831,
"learning_rate": 5.08695652173913e-06,
"loss": 7.2674,
"step": 9300
},
{
"epoch": 0.14129172924934402,
"eval_loss": 7.181826591491699,
"eval_runtime": 193.6566,
"eval_samples_per_second": 51.638,
"eval_steps_per_second": 6.455,
"step": 9300
},
{
"epoch": 0.14281099515525095,
"grad_norm": 4.051361560821533,
"learning_rate": 5.076923076923077e-06,
"loss": 7.2294,
"step": 9400
},
{
"epoch": 0.14281099515525095,
"eval_loss": 7.154284477233887,
"eval_runtime": 193.737,
"eval_samples_per_second": 51.616,
"eval_steps_per_second": 6.452,
"step": 9400
},
{
"epoch": 0.14433026106115787,
"grad_norm": 3.4815194606781006,
"learning_rate": 5.066889632107024e-06,
"loss": 7.1993,
"step": 9500
},
{
"epoch": 0.14433026106115787,
"eval_loss": 7.109873294830322,
"eval_runtime": 193.7018,
"eval_samples_per_second": 51.626,
"eval_steps_per_second": 6.453,
"step": 9500
},
{
"epoch": 0.1458495269670648,
"grad_norm": 4.168730735778809,
"learning_rate": 5.05685618729097e-06,
"loss": 7.1617,
"step": 9600
},
{
"epoch": 0.1458495269670648,
"eval_loss": 7.068033695220947,
"eval_runtime": 193.7844,
"eval_samples_per_second": 51.604,
"eval_steps_per_second": 6.45,
"step": 9600
},
{
"epoch": 0.14736879287297172,
"grad_norm": 4.632892608642578,
"learning_rate": 5.046822742474916e-06,
"loss": 7.1265,
"step": 9700
},
{
"epoch": 0.14736879287297172,
"eval_loss": 7.029054641723633,
"eval_runtime": 193.7033,
"eval_samples_per_second": 51.625,
"eval_steps_per_second": 6.453,
"step": 9700
},
{
"epoch": 0.14888805877887865,
"grad_norm": 5.668432235717773,
"learning_rate": 5.036789297658863e-06,
"loss": 7.0973,
"step": 9800
},
{
"epoch": 0.14888805877887865,
"eval_loss": 7.001068115234375,
"eval_runtime": 193.7831,
"eval_samples_per_second": 51.604,
"eval_steps_per_second": 6.451,
"step": 9800
},
{
"epoch": 0.15040732468478557,
"grad_norm": 6.07447624206543,
"learning_rate": 5.02675585284281e-06,
"loss": 7.0693,
"step": 9900
},
{
"epoch": 0.15040732468478557,
"eval_loss": 6.974251747131348,
"eval_runtime": 193.7698,
"eval_samples_per_second": 51.608,
"eval_steps_per_second": 6.451,
"step": 9900
},
{
"epoch": 0.1519265905906925,
"grad_norm": 5.610072135925293,
"learning_rate": 5.016722408026756e-06,
"loss": 7.0395,
"step": 10000
},
{
"epoch": 0.1519265905906925,
"eval_loss": 6.959811687469482,
"eval_runtime": 193.7123,
"eval_samples_per_second": 51.623,
"eval_steps_per_second": 6.453,
"step": 10000
},
{
"epoch": 0.15344585649659942,
"grad_norm": 4.722342491149902,
"learning_rate": 5.0066889632107026e-06,
"loss": 7.0146,
"step": 10100
},
{
"epoch": 0.15344585649659942,
"eval_loss": 6.919780254364014,
"eval_runtime": 193.9627,
"eval_samples_per_second": 51.556,
"eval_steps_per_second": 6.445,
"step": 10100
},
{
"epoch": 0.15496512240250634,
"grad_norm": 2.454202175140381,
"learning_rate": 4.996655518394649e-06,
"loss": 6.979,
"step": 10200
},
{
"epoch": 0.15496512240250634,
"eval_loss": 6.8777360916137695,
"eval_runtime": 193.5746,
"eval_samples_per_second": 51.66,
"eval_steps_per_second": 6.457,
"step": 10200
},
{
"epoch": 0.15648438830841327,
"grad_norm": 6.628566265106201,
"learning_rate": 4.986622073578595e-06,
"loss": 6.9576,
"step": 10300
},
{
"epoch": 0.15648438830841327,
"eval_loss": 6.851335048675537,
"eval_runtime": 193.6238,
"eval_samples_per_second": 51.647,
"eval_steps_per_second": 6.456,
"step": 10300
},
{
"epoch": 0.1580036542143202,
"grad_norm": 4.226571559906006,
"learning_rate": 4.976588628762542e-06,
"loss": 6.9294,
"step": 10400
},
{
"epoch": 0.1580036542143202,
"eval_loss": 6.837319374084473,
"eval_runtime": 193.6321,
"eval_samples_per_second": 51.644,
"eval_steps_per_second": 6.456,
"step": 10400
},
{
"epoch": 0.15952292012022712,
"grad_norm": 3.949143648147583,
"learning_rate": 4.966555183946489e-06,
"loss": 6.906,
"step": 10500
},
{
"epoch": 0.15952292012022712,
"eval_loss": 6.798065662384033,
"eval_runtime": 193.7227,
"eval_samples_per_second": 51.62,
"eval_steps_per_second": 6.453,
"step": 10500
},
{
"epoch": 0.16104218602613404,
"grad_norm": 4.327299118041992,
"learning_rate": 4.956521739130435e-06,
"loss": 6.8789,
"step": 10600
},
{
"epoch": 0.16104218602613404,
"eval_loss": 6.793569564819336,
"eval_runtime": 193.6196,
"eval_samples_per_second": 51.648,
"eval_steps_per_second": 6.456,
"step": 10600
},
{
"epoch": 0.16256145193204097,
"grad_norm": 3.8152856826782227,
"learning_rate": 4.9464882943143815e-06,
"loss": 6.8557,
"step": 10700
},
{
"epoch": 0.16256145193204097,
"eval_loss": 6.754009246826172,
"eval_runtime": 193.7319,
"eval_samples_per_second": 51.618,
"eval_steps_per_second": 6.452,
"step": 10700
},
{
"epoch": 0.1640807178379479,
"grad_norm": 4.621021747589111,
"learning_rate": 4.936454849498328e-06,
"loss": 6.8387,
"step": 10800
},
{
"epoch": 0.1640807178379479,
"eval_loss": 6.748414993286133,
"eval_runtime": 193.8639,
"eval_samples_per_second": 51.583,
"eval_steps_per_second": 6.448,
"step": 10800
},
{
"epoch": 0.16559998374385482,
"grad_norm": 4.906980514526367,
"learning_rate": 4.926421404682274e-06,
"loss": 6.8054,
"step": 10900
},
{
"epoch": 0.16559998374385482,
"eval_loss": 6.722209453582764,
"eval_runtime": 193.5438,
"eval_samples_per_second": 51.668,
"eval_steps_per_second": 6.458,
"step": 10900
},
{
"epoch": 0.16711924964976174,
"grad_norm": 3.9837253093719482,
"learning_rate": 4.916387959866221e-06,
"loss": 6.7829,
"step": 11000
},
{
"epoch": 0.16711924964976174,
"eval_loss": 6.689162731170654,
"eval_runtime": 193.4992,
"eval_samples_per_second": 51.68,
"eval_steps_per_second": 6.46,
"step": 11000
},
{
"epoch": 0.16863851555566867,
"grad_norm": 4.781426906585693,
"learning_rate": 4.906354515050168e-06,
"loss": 6.7597,
"step": 11100
},
{
"epoch": 0.16863851555566867,
"eval_loss": 6.658721923828125,
"eval_runtime": 193.7146,
"eval_samples_per_second": 51.622,
"eval_steps_per_second": 6.453,
"step": 11100
},
{
"epoch": 0.1701577814615756,
"grad_norm": 6.702068328857422,
"learning_rate": 4.8963210702341136e-06,
"loss": 6.7416,
"step": 11200
},
{
"epoch": 0.1701577814615756,
"eval_loss": 6.642455577850342,
"eval_runtime": 193.6222,
"eval_samples_per_second": 51.647,
"eval_steps_per_second": 6.456,
"step": 11200
},
{
"epoch": 0.17167704736748252,
"grad_norm": 3.1839189529418945,
"learning_rate": 4.88628762541806e-06,
"loss": 6.7201,
"step": 11300
},
{
"epoch": 0.17167704736748252,
"eval_loss": 6.614835262298584,
"eval_runtime": 193.735,
"eval_samples_per_second": 51.617,
"eval_steps_per_second": 6.452,
"step": 11300
},
{
"epoch": 0.17319631327338944,
"grad_norm": 5.427370071411133,
"learning_rate": 4.876254180602007e-06,
"loss": 6.6993,
"step": 11400
},
{
"epoch": 0.17319631327338944,
"eval_loss": 6.601010799407959,
"eval_runtime": 193.6047,
"eval_samples_per_second": 51.652,
"eval_steps_per_second": 6.456,
"step": 11400
},
{
"epoch": 0.17471557917929637,
"grad_norm": 4.759448051452637,
"learning_rate": 4.866220735785953e-06,
"loss": 6.6775,
"step": 11500
},
{
"epoch": 0.17471557917929637,
"eval_loss": 6.579466342926025,
"eval_runtime": 193.6792,
"eval_samples_per_second": 51.632,
"eval_steps_per_second": 6.454,
"step": 11500
},
{
"epoch": 0.1762348450852033,
"grad_norm": 3.7401344776153564,
"learning_rate": 4.8561872909699e-06,
"loss": 6.6565,
"step": 11600
},
{
"epoch": 0.1762348450852033,
"eval_loss": 6.576225757598877,
"eval_runtime": 194.0381,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.442,
"step": 11600
},
{
"epoch": 0.17775411099111021,
"grad_norm": 5.589729309082031,
"learning_rate": 4.8461538461538465e-06,
"loss": 6.6419,
"step": 11700
},
{
"epoch": 0.17775411099111021,
"eval_loss": 6.542896270751953,
"eval_runtime": 193.798,
"eval_samples_per_second": 51.6,
"eval_steps_per_second": 6.45,
"step": 11700
},
{
"epoch": 0.17927337689701714,
"grad_norm": 4.623971939086914,
"learning_rate": 4.8361204013377925e-06,
"loss": 6.6199,
"step": 11800
},
{
"epoch": 0.17927337689701714,
"eval_loss": 6.519240856170654,
"eval_runtime": 193.739,
"eval_samples_per_second": 51.616,
"eval_steps_per_second": 6.452,
"step": 11800
},
{
"epoch": 0.18079264280292406,
"grad_norm": 4.685464859008789,
"learning_rate": 4.826086956521739e-06,
"loss": 6.6012,
"step": 11900
},
{
"epoch": 0.18079264280292406,
"eval_loss": 6.489596843719482,
"eval_runtime": 193.7453,
"eval_samples_per_second": 51.614,
"eval_steps_per_second": 6.452,
"step": 11900
},
{
"epoch": 0.182311908708831,
"grad_norm": 3.054800271987915,
"learning_rate": 4.816053511705686e-06,
"loss": 6.5813,
"step": 12000
},
{
"epoch": 0.182311908708831,
"eval_loss": 6.497661113739014,
"eval_runtime": 193.7138,
"eval_samples_per_second": 51.623,
"eval_steps_per_second": 6.453,
"step": 12000
},
{
"epoch": 0.1838311746147379,
"grad_norm": 3.547619342803955,
"learning_rate": 4.806020066889633e-06,
"loss": 6.5623,
"step": 12100
},
{
"epoch": 0.1838311746147379,
"eval_loss": 6.468958377838135,
"eval_runtime": 193.766,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 12100
},
{
"epoch": 0.18535044052064484,
"grad_norm": 4.697444915771484,
"learning_rate": 4.795986622073579e-06,
"loss": 6.5448,
"step": 12200
},
{
"epoch": 0.18535044052064484,
"eval_loss": 6.436464309692383,
"eval_runtime": 193.7105,
"eval_samples_per_second": 51.623,
"eval_steps_per_second": 6.453,
"step": 12200
},
{
"epoch": 0.18686970642655176,
"grad_norm": 4.79019021987915,
"learning_rate": 4.785953177257525e-06,
"loss": 6.525,
"step": 12300
},
{
"epoch": 0.18686970642655176,
"eval_loss": 6.422084331512451,
"eval_runtime": 193.7424,
"eval_samples_per_second": 51.615,
"eval_steps_per_second": 6.452,
"step": 12300
},
{
"epoch": 0.1883889723324587,
"grad_norm": 3.7939579486846924,
"learning_rate": 4.775919732441472e-06,
"loss": 6.5131,
"step": 12400
},
{
"epoch": 0.1883889723324587,
"eval_loss": 6.425159931182861,
"eval_runtime": 193.6634,
"eval_samples_per_second": 51.636,
"eval_steps_per_second": 6.454,
"step": 12400
},
{
"epoch": 0.1899082382383656,
"grad_norm": 3.271348714828491,
"learning_rate": 4.765886287625418e-06,
"loss": 6.4978,
"step": 12500
},
{
"epoch": 0.1899082382383656,
"eval_loss": 6.395818710327148,
"eval_runtime": 193.5299,
"eval_samples_per_second": 51.672,
"eval_steps_per_second": 6.459,
"step": 12500
},
{
"epoch": 0.19142750414427254,
"grad_norm": 4.119296073913574,
"learning_rate": 4.755852842809365e-06,
"loss": 6.483,
"step": 12600
},
{
"epoch": 0.19142750414427254,
"eval_loss": 6.397064208984375,
"eval_runtime": 193.5731,
"eval_samples_per_second": 51.66,
"eval_steps_per_second": 6.458,
"step": 12600
},
{
"epoch": 0.19294677005017946,
"grad_norm": 3.7907373905181885,
"learning_rate": 4.745819397993312e-06,
"loss": 6.4639,
"step": 12700
},
{
"epoch": 0.19294677005017946,
"eval_loss": 6.348308563232422,
"eval_runtime": 193.6279,
"eval_samples_per_second": 51.645,
"eval_steps_per_second": 6.456,
"step": 12700
},
{
"epoch": 0.19446603595608639,
"grad_norm": 3.8455281257629395,
"learning_rate": 4.7357859531772575e-06,
"loss": 6.4471,
"step": 12800
},
{
"epoch": 0.19446603595608639,
"eval_loss": 6.347524642944336,
"eval_runtime": 193.9737,
"eval_samples_per_second": 51.553,
"eval_steps_per_second": 6.444,
"step": 12800
},
{
"epoch": 0.1959853018619933,
"grad_norm": 3.5916056632995605,
"learning_rate": 4.725752508361204e-06,
"loss": 6.4303,
"step": 12900
},
{
"epoch": 0.1959853018619933,
"eval_loss": 6.33302640914917,
"eval_runtime": 193.6018,
"eval_samples_per_second": 51.652,
"eval_steps_per_second": 6.457,
"step": 12900
},
{
"epoch": 0.19750456776790024,
"grad_norm": 3.734985589981079,
"learning_rate": 4.715719063545151e-06,
"loss": 6.4158,
"step": 13000
},
{
"epoch": 0.19750456776790024,
"eval_loss": 6.3155083656311035,
"eval_runtime": 193.6987,
"eval_samples_per_second": 51.627,
"eval_steps_per_second": 6.453,
"step": 13000
},
{
"epoch": 0.19902383367380716,
"grad_norm": 3.579678535461426,
"learning_rate": 4.705685618729097e-06,
"loss": 6.4024,
"step": 13100
},
{
"epoch": 0.19902383367380716,
"eval_loss": 6.29377555847168,
"eval_runtime": 193.6889,
"eval_samples_per_second": 51.629,
"eval_steps_per_second": 6.454,
"step": 13100
},
{
"epoch": 0.20054309957971408,
"grad_norm": 4.257501125335693,
"learning_rate": 4.695652173913044e-06,
"loss": 6.3894,
"step": 13200
},
{
"epoch": 0.20054309957971408,
"eval_loss": 6.288681507110596,
"eval_runtime": 193.6264,
"eval_samples_per_second": 51.646,
"eval_steps_per_second": 6.456,
"step": 13200
},
{
"epoch": 0.202062365485621,
"grad_norm": 3.8430824279785156,
"learning_rate": 4.6856187290969905e-06,
"loss": 6.3715,
"step": 13300
},
{
"epoch": 0.202062365485621,
"eval_loss": 6.247255802154541,
"eval_runtime": 193.7628,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 13300
},
{
"epoch": 0.20358163139152793,
"grad_norm": 3.9459517002105713,
"learning_rate": 4.675585284280936e-06,
"loss": 6.3583,
"step": 13400
},
{
"epoch": 0.20358163139152793,
"eval_loss": 6.250117301940918,
"eval_runtime": 193.7665,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 13400
},
{
"epoch": 0.20510089729743486,
"grad_norm": 3.475034475326538,
"learning_rate": 4.665551839464883e-06,
"loss": 6.3431,
"step": 13500
},
{
"epoch": 0.20510089729743486,
"eval_loss": 6.22703742980957,
"eval_runtime": 193.7742,
"eval_samples_per_second": 51.606,
"eval_steps_per_second": 6.451,
"step": 13500
},
{
"epoch": 0.20662016320334178,
"grad_norm": 4.17089319229126,
"learning_rate": 4.65551839464883e-06,
"loss": 6.3288,
"step": 13600
},
{
"epoch": 0.20662016320334178,
"eval_loss": 6.230484962463379,
"eval_runtime": 193.7687,
"eval_samples_per_second": 51.608,
"eval_steps_per_second": 6.451,
"step": 13600
},
{
"epoch": 0.2081394291092487,
"grad_norm": 2.118986129760742,
"learning_rate": 4.645484949832776e-06,
"loss": 6.3169,
"step": 13700
},
{
"epoch": 0.2081394291092487,
"eval_loss": 6.206001281738281,
"eval_runtime": 193.8247,
"eval_samples_per_second": 51.593,
"eval_steps_per_second": 6.449,
"step": 13700
},
{
"epoch": 0.20965869501515563,
"grad_norm": 4.810153007507324,
"learning_rate": 4.635451505016723e-06,
"loss": 6.3032,
"step": 13800
},
{
"epoch": 0.20965869501515563,
"eval_loss": 6.18707275390625,
"eval_runtime": 193.7442,
"eval_samples_per_second": 51.614,
"eval_steps_per_second": 6.452,
"step": 13800
},
{
"epoch": 0.21117796092106256,
"grad_norm": 3.7797763347625732,
"learning_rate": 4.625418060200669e-06,
"loss": 6.2918,
"step": 13900
},
{
"epoch": 0.21117796092106256,
"eval_loss": 6.1838603019714355,
"eval_runtime": 193.7881,
"eval_samples_per_second": 51.603,
"eval_steps_per_second": 6.45,
"step": 13900
},
{
"epoch": 0.21269722682696948,
"grad_norm": 4.482378959655762,
"learning_rate": 4.615384615384616e-06,
"loss": 6.2757,
"step": 14000
},
{
"epoch": 0.21269722682696948,
"eval_loss": 6.161965847015381,
"eval_runtime": 193.7356,
"eval_samples_per_second": 51.617,
"eval_steps_per_second": 6.452,
"step": 14000
},
{
"epoch": 0.2142164927328764,
"grad_norm": 4.001418590545654,
"learning_rate": 4.605351170568562e-06,
"loss": 6.2647,
"step": 14100
},
{
"epoch": 0.2142164927328764,
"eval_loss": 6.15457820892334,
"eval_runtime": 193.9234,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 14100
},
{
"epoch": 0.21573575863878333,
"grad_norm": 3.8982086181640625,
"learning_rate": 4.595317725752509e-06,
"loss": 6.2545,
"step": 14200
},
{
"epoch": 0.21573575863878333,
"eval_loss": 6.143900394439697,
"eval_runtime": 193.7808,
"eval_samples_per_second": 51.605,
"eval_steps_per_second": 6.451,
"step": 14200
},
{
"epoch": 0.21725502454469026,
"grad_norm": 3.459050416946411,
"learning_rate": 4.585284280936456e-06,
"loss": 6.2398,
"step": 14300
},
{
"epoch": 0.21725502454469026,
"eval_loss": 6.131939888000488,
"eval_runtime": 193.8856,
"eval_samples_per_second": 51.577,
"eval_steps_per_second": 6.447,
"step": 14300
},
{
"epoch": 0.21877429045059718,
"grad_norm": 3.335505962371826,
"learning_rate": 4.5752508361204015e-06,
"loss": 6.2327,
"step": 14400
},
{
"epoch": 0.21877429045059718,
"eval_loss": 6.106751918792725,
"eval_runtime": 193.844,
"eval_samples_per_second": 51.588,
"eval_steps_per_second": 6.448,
"step": 14400
},
{
"epoch": 0.2202935563565041,
"grad_norm": 3.845909357070923,
"learning_rate": 4.565217391304348e-06,
"loss": 6.2144,
"step": 14500
},
{
"epoch": 0.2202935563565041,
"eval_loss": 6.094777584075928,
"eval_runtime": 193.5802,
"eval_samples_per_second": 51.658,
"eval_steps_per_second": 6.457,
"step": 14500
},
{
"epoch": 0.22181282226241103,
"grad_norm": 3.846149206161499,
"learning_rate": 4.555183946488295e-06,
"loss": 6.2022,
"step": 14600
},
{
"epoch": 0.22181282226241103,
"eval_loss": 6.085541248321533,
"eval_runtime": 193.8098,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 14600
},
{
"epoch": 0.22333208816831795,
"grad_norm": 3.50091814994812,
"learning_rate": 4.545150501672241e-06,
"loss": 6.1915,
"step": 14700
},
{
"epoch": 0.22333208816831795,
"eval_loss": 6.058828830718994,
"eval_runtime": 193.4542,
"eval_samples_per_second": 51.692,
"eval_steps_per_second": 6.461,
"step": 14700
},
{
"epoch": 0.22485135407422488,
"grad_norm": 4.312457084655762,
"learning_rate": 4.535117056856188e-06,
"loss": 6.1776,
"step": 14800
},
{
"epoch": 0.22485135407422488,
"eval_loss": 6.067806720733643,
"eval_runtime": 193.5484,
"eval_samples_per_second": 51.667,
"eval_steps_per_second": 6.458,
"step": 14800
},
{
"epoch": 0.2263706199801318,
"grad_norm": 3.6476268768310547,
"learning_rate": 4.5250836120401345e-06,
"loss": 6.1703,
"step": 14900
},
{
"epoch": 0.2263706199801318,
"eval_loss": 6.045175552368164,
"eval_runtime": 193.3937,
"eval_samples_per_second": 51.708,
"eval_steps_per_second": 6.463,
"step": 14900
},
{
"epoch": 0.22788988588603873,
"grad_norm": 4.1377739906311035,
"learning_rate": 4.51505016722408e-06,
"loss": 6.1577,
"step": 15000
},
{
"epoch": 0.22788988588603873,
"eval_loss": 6.038886547088623,
"eval_runtime": 193.5749,
"eval_samples_per_second": 51.66,
"eval_steps_per_second": 6.457,
"step": 15000
},
{
"epoch": 0.22940915179194565,
"grad_norm": 4.192631244659424,
"learning_rate": 4.505016722408027e-06,
"loss": 6.1477,
"step": 15100
},
{
"epoch": 0.22940915179194565,
"eval_loss": 6.030833721160889,
"eval_runtime": 193.5655,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 15100
},
{
"epoch": 0.23092841769785258,
"grad_norm": 3.234416961669922,
"learning_rate": 4.494983277591973e-06,
"loss": 6.1363,
"step": 15200
},
{
"epoch": 0.23092841769785258,
"eval_loss": 6.008749008178711,
"eval_runtime": 193.3663,
"eval_samples_per_second": 51.715,
"eval_steps_per_second": 6.464,
"step": 15200
},
{
"epoch": 0.2324476836037595,
"grad_norm": 4.860428810119629,
"learning_rate": 4.48494983277592e-06,
"loss": 6.1298,
"step": 15300
},
{
"epoch": 0.2324476836037595,
"eval_loss": 5.996873378753662,
"eval_runtime": 193.6007,
"eval_samples_per_second": 51.653,
"eval_steps_per_second": 6.457,
"step": 15300
},
{
"epoch": 0.23396694950966643,
"grad_norm": 4.0561323165893555,
"learning_rate": 4.474916387959866e-06,
"loss": 6.1122,
"step": 15400
},
{
"epoch": 0.23396694950966643,
"eval_loss": 5.984120845794678,
"eval_runtime": 193.5721,
"eval_samples_per_second": 51.66,
"eval_steps_per_second": 6.458,
"step": 15400
},
{
"epoch": 0.23548621541557335,
"grad_norm": 2.9819724559783936,
"learning_rate": 4.4648829431438125e-06,
"loss": 6.1024,
"step": 15500
},
{
"epoch": 0.23548621541557335,
"eval_loss": 5.982254981994629,
"eval_runtime": 193.7425,
"eval_samples_per_second": 51.615,
"eval_steps_per_second": 6.452,
"step": 15500
},
{
"epoch": 0.23700548132148028,
"grad_norm": 3.733194351196289,
"learning_rate": 4.454849498327759e-06,
"loss": 6.0924,
"step": 15600
},
{
"epoch": 0.23700548132148028,
"eval_loss": 5.969741344451904,
"eval_runtime": 193.8217,
"eval_samples_per_second": 51.594,
"eval_steps_per_second": 6.449,
"step": 15600
},
{
"epoch": 0.2385247472273872,
"grad_norm": 5.688018321990967,
"learning_rate": 4.444816053511705e-06,
"loss": 6.0871,
"step": 15700
},
{
"epoch": 0.2385247472273872,
"eval_loss": 5.9461445808410645,
"eval_runtime": 193.8283,
"eval_samples_per_second": 51.592,
"eval_steps_per_second": 6.449,
"step": 15700
},
{
"epoch": 0.24004401313329413,
"grad_norm": 2.9404726028442383,
"learning_rate": 4.434782608695652e-06,
"loss": 6.0706,
"step": 15800
},
{
"epoch": 0.24004401313329413,
"eval_loss": 5.936134338378906,
"eval_runtime": 193.8629,
"eval_samples_per_second": 51.583,
"eval_steps_per_second": 6.448,
"step": 15800
},
{
"epoch": 0.24156327903920105,
"grad_norm": 4.436812877655029,
"learning_rate": 4.424749163879599e-06,
"loss": 6.0652,
"step": 15900
},
{
"epoch": 0.24156327903920105,
"eval_loss": 5.9289655685424805,
"eval_runtime": 193.8246,
"eval_samples_per_second": 51.593,
"eval_steps_per_second": 6.449,
"step": 15900
},
{
"epoch": 0.24308254494510798,
"grad_norm": 4.113779544830322,
"learning_rate": 4.414715719063545e-06,
"loss": 6.0497,
"step": 16000
},
{
"epoch": 0.24308254494510798,
"eval_loss": 5.926904678344727,
"eval_runtime": 194.0466,
"eval_samples_per_second": 51.534,
"eval_steps_per_second": 6.442,
"step": 16000
},
{
"epoch": 0.2446018108510149,
"grad_norm": 3.2827975749969482,
"learning_rate": 4.404682274247491e-06,
"loss": 6.0365,
"step": 16100
},
{
"epoch": 0.2446018108510149,
"eval_loss": 5.9020185470581055,
"eval_runtime": 193.842,
"eval_samples_per_second": 51.588,
"eval_steps_per_second": 6.449,
"step": 16100
},
{
"epoch": 0.24612107675692182,
"grad_norm": 3.8352739810943604,
"learning_rate": 4.394648829431438e-06,
"loss": 6.0305,
"step": 16200
},
{
"epoch": 0.24612107675692182,
"eval_loss": 5.900501251220703,
"eval_runtime": 193.7914,
"eval_samples_per_second": 51.602,
"eval_steps_per_second": 6.45,
"step": 16200
},
{
"epoch": 0.24764034266282875,
"grad_norm": 3.2179617881774902,
"learning_rate": 4.384615384615384e-06,
"loss": 6.0166,
"step": 16300
},
{
"epoch": 0.24764034266282875,
"eval_loss": 5.891448497772217,
"eval_runtime": 193.74,
"eval_samples_per_second": 51.616,
"eval_steps_per_second": 6.452,
"step": 16300
},
{
"epoch": 0.24915960856873567,
"grad_norm": 3.446993112564087,
"learning_rate": 4.374581939799331e-06,
"loss": 6.0121,
"step": 16400
},
{
"epoch": 0.24915960856873567,
"eval_loss": 5.874625205993652,
"eval_runtime": 193.8125,
"eval_samples_per_second": 51.596,
"eval_steps_per_second": 6.45,
"step": 16400
},
{
"epoch": 0.2506788744746426,
"grad_norm": 4.3962578773498535,
"learning_rate": 4.364548494983278e-06,
"loss": 6.0029,
"step": 16500
},
{
"epoch": 0.2506788744746426,
"eval_loss": 5.884474754333496,
"eval_runtime": 193.754,
"eval_samples_per_second": 51.612,
"eval_steps_per_second": 6.451,
"step": 16500
},
{
"epoch": 0.2521981403805495,
"grad_norm": 4.52181339263916,
"learning_rate": 4.354515050167224e-06,
"loss": 5.9925,
"step": 16600
},
{
"epoch": 0.2521981403805495,
"eval_loss": 5.867855548858643,
"eval_runtime": 193.7648,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 16600
},
{
"epoch": 0.25371740628645645,
"grad_norm": 3.847750186920166,
"learning_rate": 4.34448160535117e-06,
"loss": 5.9839,
"step": 16700
},
{
"epoch": 0.25371740628645645,
"eval_loss": 5.851235389709473,
"eval_runtime": 193.6623,
"eval_samples_per_second": 51.636,
"eval_steps_per_second": 6.455,
"step": 16700
},
{
"epoch": 0.25523667219236335,
"grad_norm": 2.9024147987365723,
"learning_rate": 4.334448160535117e-06,
"loss": 5.9744,
"step": 16800
},
{
"epoch": 0.25523667219236335,
"eval_loss": 5.854368686676025,
"eval_runtime": 193.8613,
"eval_samples_per_second": 51.583,
"eval_steps_per_second": 6.448,
"step": 16800
},
{
"epoch": 0.2567559380982703,
"grad_norm": 3.2213125228881836,
"learning_rate": 4.324414715719064e-06,
"loss": 5.9653,
"step": 16900
},
{
"epoch": 0.2567559380982703,
"eval_loss": 5.836233139038086,
"eval_runtime": 193.5642,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 16900
},
{
"epoch": 0.2582752040041772,
"grad_norm": 4.198850631713867,
"learning_rate": 4.31438127090301e-06,
"loss": 5.957,
"step": 17000
},
{
"epoch": 0.2582752040041772,
"eval_loss": 5.818154811859131,
"eval_runtime": 193.5777,
"eval_samples_per_second": 51.659,
"eval_steps_per_second": 6.457,
"step": 17000
},
{
"epoch": 0.25979446991008415,
"grad_norm": 2.9214396476745605,
"learning_rate": 4.3043478260869565e-06,
"loss": 5.9417,
"step": 17100
},
{
"epoch": 0.25979446991008415,
"eval_loss": 5.829405784606934,
"eval_runtime": 193.6055,
"eval_samples_per_second": 51.651,
"eval_steps_per_second": 6.456,
"step": 17100
},
{
"epoch": 0.26131373581599104,
"grad_norm": 3.7691545486450195,
"learning_rate": 4.294314381270903e-06,
"loss": 5.934,
"step": 17200
},
{
"epoch": 0.26131373581599104,
"eval_loss": 5.794999122619629,
"eval_runtime": 193.5657,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 17200
},
{
"epoch": 0.262833001721898,
"grad_norm": 4.013944625854492,
"learning_rate": 4.284280936454849e-06,
"loss": 5.9269,
"step": 17300
},
{
"epoch": 0.262833001721898,
"eval_loss": 5.787894248962402,
"eval_runtime": 193.7935,
"eval_samples_per_second": 51.601,
"eval_steps_per_second": 6.45,
"step": 17300
},
{
"epoch": 0.2643522676278049,
"grad_norm": 3.784191370010376,
"learning_rate": 4.274247491638796e-06,
"loss": 5.9224,
"step": 17400
},
{
"epoch": 0.2643522676278049,
"eval_loss": 5.795870780944824,
"eval_runtime": 193.6051,
"eval_samples_per_second": 51.652,
"eval_steps_per_second": 6.456,
"step": 17400
},
{
"epoch": 0.26587153353371185,
"grad_norm": 4.354425430297852,
"learning_rate": 4.264214046822743e-06,
"loss": 5.909,
"step": 17500
},
{
"epoch": 0.26587153353371185,
"eval_loss": 5.785282611846924,
"eval_runtime": 193.5413,
"eval_samples_per_second": 51.669,
"eval_steps_per_second": 6.459,
"step": 17500
},
{
"epoch": 0.2673907994396188,
"grad_norm": 3.2807064056396484,
"learning_rate": 4.254180602006689e-06,
"loss": 5.9017,
"step": 17600
},
{
"epoch": 0.2673907994396188,
"eval_loss": 5.772453308105469,
"eval_runtime": 193.741,
"eval_samples_per_second": 51.615,
"eval_steps_per_second": 6.452,
"step": 17600
},
{
"epoch": 0.2689100653455257,
"grad_norm": 3.0385000705718994,
"learning_rate": 4.244147157190635e-06,
"loss": 5.8906,
"step": 17700
},
{
"epoch": 0.2689100653455257,
"eval_loss": 5.765667915344238,
"eval_runtime": 193.8777,
"eval_samples_per_second": 51.579,
"eval_steps_per_second": 6.447,
"step": 17700
},
{
"epoch": 0.27042933125143265,
"grad_norm": 2.746528148651123,
"learning_rate": 4.234113712374582e-06,
"loss": 5.8847,
"step": 17800
},
{
"epoch": 0.27042933125143265,
"eval_loss": 5.7541351318359375,
"eval_runtime": 193.9065,
"eval_samples_per_second": 51.571,
"eval_steps_per_second": 6.446,
"step": 17800
},
{
"epoch": 0.27194859715733954,
"grad_norm": 3.3728785514831543,
"learning_rate": 4.224080267558528e-06,
"loss": 5.8769,
"step": 17900
},
{
"epoch": 0.27194859715733954,
"eval_loss": 5.7371392250061035,
"eval_runtime": 193.8325,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 17900
},
{
"epoch": 0.2734678630632465,
"grad_norm": 3.4341955184936523,
"learning_rate": 4.214046822742475e-06,
"loss": 5.8711,
"step": 18000
},
{
"epoch": 0.2734678630632465,
"eval_loss": 5.715305328369141,
"eval_runtime": 193.9066,
"eval_samples_per_second": 51.571,
"eval_steps_per_second": 6.446,
"step": 18000
},
{
"epoch": 0.2749871289691534,
"grad_norm": 4.6379313468933105,
"learning_rate": 4.2040133779264216e-06,
"loss": 5.861,
"step": 18100
},
{
"epoch": 0.2749871289691534,
"eval_loss": 5.71766996383667,
"eval_runtime": 193.7937,
"eval_samples_per_second": 51.601,
"eval_steps_per_second": 6.45,
"step": 18100
},
{
"epoch": 0.27650639487506035,
"grad_norm": 3.901848554611206,
"learning_rate": 4.1939799331103675e-06,
"loss": 5.855,
"step": 18200
},
{
"epoch": 0.27650639487506035,
"eval_loss": 5.7228240966796875,
"eval_runtime": 194.0011,
"eval_samples_per_second": 51.546,
"eval_steps_per_second": 6.443,
"step": 18200
},
{
"epoch": 0.27802566078096724,
"grad_norm": 2.7498176097869873,
"learning_rate": 4.183946488294314e-06,
"loss": 5.8388,
"step": 18300
},
{
"epoch": 0.27802566078096724,
"eval_loss": 5.699355125427246,
"eval_runtime": 193.8501,
"eval_samples_per_second": 51.586,
"eval_steps_per_second": 6.448,
"step": 18300
},
{
"epoch": 0.2795449266868742,
"grad_norm": 3.4318690299987793,
"learning_rate": 4.173913043478261e-06,
"loss": 5.8356,
"step": 18400
},
{
"epoch": 0.2795449266868742,
"eval_loss": 5.697088241577148,
"eval_runtime": 193.7892,
"eval_samples_per_second": 51.602,
"eval_steps_per_second": 6.45,
"step": 18400
},
{
"epoch": 0.2810641925927811,
"grad_norm": 3.5657687187194824,
"learning_rate": 4.163879598662208e-06,
"loss": 5.8233,
"step": 18500
},
{
"epoch": 0.2810641925927811,
"eval_loss": 5.683408260345459,
"eval_runtime": 193.7355,
"eval_samples_per_second": 51.617,
"eval_steps_per_second": 6.452,
"step": 18500
},
{
"epoch": 0.28258345849868804,
"grad_norm": 4.344554424285889,
"learning_rate": 4.153846153846154e-06,
"loss": 5.8187,
"step": 18600
},
{
"epoch": 0.28258345849868804,
"eval_loss": 5.675909042358398,
"eval_runtime": 193.8113,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 18600
},
{
"epoch": 0.28410272440459494,
"grad_norm": 3.3455545902252197,
"learning_rate": 4.1438127090301005e-06,
"loss": 5.8091,
"step": 18700
},
{
"epoch": 0.28410272440459494,
"eval_loss": 5.665746688842773,
"eval_runtime": 194.1351,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 18700
},
{
"epoch": 0.2856219903105019,
"grad_norm": 3.412184476852417,
"learning_rate": 4.133779264214047e-06,
"loss": 5.8026,
"step": 18800
},
{
"epoch": 0.2856219903105019,
"eval_loss": 5.6578497886657715,
"eval_runtime": 193.7841,
"eval_samples_per_second": 51.604,
"eval_steps_per_second": 6.45,
"step": 18800
},
{
"epoch": 0.2871412562164088,
"grad_norm": 3.717855215072632,
"learning_rate": 4.123745819397993e-06,
"loss": 5.7957,
"step": 18900
},
{
"epoch": 0.2871412562164088,
"eval_loss": 5.665693759918213,
"eval_runtime": 193.7661,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 18900
},
{
"epoch": 0.28866052212231574,
"grad_norm": 3.876275062561035,
"learning_rate": 4.11371237458194e-06,
"loss": 5.7846,
"step": 19000
},
{
"epoch": 0.28866052212231574,
"eval_loss": 5.648958206176758,
"eval_runtime": 193.7195,
"eval_samples_per_second": 51.621,
"eval_steps_per_second": 6.453,
"step": 19000
},
{
"epoch": 0.29017978802822264,
"grad_norm": 3.8186490535736084,
"learning_rate": 4.103678929765887e-06,
"loss": 5.7777,
"step": 19100
},
{
"epoch": 0.29017978802822264,
"eval_loss": 5.629169940948486,
"eval_runtime": 193.5999,
"eval_samples_per_second": 51.653,
"eval_steps_per_second": 6.457,
"step": 19100
},
{
"epoch": 0.2916990539341296,
"grad_norm": 5.3280839920043945,
"learning_rate": 4.0936454849498326e-06,
"loss": 5.77,
"step": 19200
},
{
"epoch": 0.2916990539341296,
"eval_loss": 5.620713233947754,
"eval_runtime": 193.6523,
"eval_samples_per_second": 51.639,
"eval_steps_per_second": 6.455,
"step": 19200
},
{
"epoch": 0.2932183198400365,
"grad_norm": 3.260324478149414,
"learning_rate": 4.083612040133779e-06,
"loss": 5.7611,
"step": 19300
},
{
"epoch": 0.2932183198400365,
"eval_loss": 5.629894733428955,
"eval_runtime": 193.685,
"eval_samples_per_second": 51.63,
"eval_steps_per_second": 6.454,
"step": 19300
},
{
"epoch": 0.29473758574594344,
"grad_norm": 4.145829200744629,
"learning_rate": 4.073578595317726e-06,
"loss": 5.7538,
"step": 19400
},
{
"epoch": 0.29473758574594344,
"eval_loss": 5.6320037841796875,
"eval_runtime": 193.4857,
"eval_samples_per_second": 51.683,
"eval_steps_per_second": 6.46,
"step": 19400
},
{
"epoch": 0.29625685165185034,
"grad_norm": 4.071881294250488,
"learning_rate": 4.063545150501672e-06,
"loss": 5.745,
"step": 19500
},
{
"epoch": 0.29625685165185034,
"eval_loss": 5.607526779174805,
"eval_runtime": 193.5349,
"eval_samples_per_second": 51.67,
"eval_steps_per_second": 6.459,
"step": 19500
},
{
"epoch": 0.2977761175577573,
"grad_norm": 3.4075276851654053,
"learning_rate": 4.053511705685619e-06,
"loss": 5.7404,
"step": 19600
},
{
"epoch": 0.2977761175577573,
"eval_loss": 5.603940010070801,
"eval_runtime": 193.5688,
"eval_samples_per_second": 51.661,
"eval_steps_per_second": 6.458,
"step": 19600
},
{
"epoch": 0.2992953834636642,
"grad_norm": 3.371760129928589,
"learning_rate": 4.0434782608695655e-06,
"loss": 5.7343,
"step": 19700
},
{
"epoch": 0.2992953834636642,
"eval_loss": 5.597903728485107,
"eval_runtime": 193.9636,
"eval_samples_per_second": 51.556,
"eval_steps_per_second": 6.445,
"step": 19700
},
{
"epoch": 0.30081464936957114,
"grad_norm": 3.1230831146240234,
"learning_rate": 4.0334448160535115e-06,
"loss": 5.7284,
"step": 19800
},
{
"epoch": 0.30081464936957114,
"eval_loss": 5.580268859863281,
"eval_runtime": 194.0245,
"eval_samples_per_second": 51.54,
"eval_steps_per_second": 6.442,
"step": 19800
},
{
"epoch": 0.30233391527547804,
"grad_norm": 3.339742422103882,
"learning_rate": 4.023411371237458e-06,
"loss": 5.7206,
"step": 19900
},
{
"epoch": 0.30233391527547804,
"eval_loss": 5.571849822998047,
"eval_runtime": 193.8323,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 19900
},
{
"epoch": 0.303853181181385,
"grad_norm": 3.2297468185424805,
"learning_rate": 4.013377926421405e-06,
"loss": 5.7086,
"step": 20000
},
{
"epoch": 0.303853181181385,
"eval_loss": 5.5632548332214355,
"eval_runtime": 193.7563,
"eval_samples_per_second": 51.611,
"eval_steps_per_second": 6.451,
"step": 20000
},
{
"epoch": 0.3053724470872919,
"grad_norm": 2.8698532581329346,
"learning_rate": 4.003344481605351e-06,
"loss": 5.7027,
"step": 20100
},
{
"epoch": 0.3053724470872919,
"eval_loss": 5.559244632720947,
"eval_runtime": 193.8672,
"eval_samples_per_second": 51.582,
"eval_steps_per_second": 6.448,
"step": 20100
},
{
"epoch": 0.30689171299319884,
"grad_norm": 2.990452289581299,
"learning_rate": 3.993311036789298e-06,
"loss": 5.6953,
"step": 20200
},
{
"epoch": 0.30689171299319884,
"eval_loss": 5.560790061950684,
"eval_runtime": 193.8061,
"eval_samples_per_second": 51.598,
"eval_steps_per_second": 6.45,
"step": 20200
},
{
"epoch": 0.30841097889910574,
"grad_norm": 3.821631669998169,
"learning_rate": 3.9832775919732444e-06,
"loss": 5.6881,
"step": 20300
},
{
"epoch": 0.30841097889910574,
"eval_loss": 5.551888465881348,
"eval_runtime": 194.0615,
"eval_samples_per_second": 51.53,
"eval_steps_per_second": 6.441,
"step": 20300
},
{
"epoch": 0.3099302448050127,
"grad_norm": 3.209308624267578,
"learning_rate": 3.97324414715719e-06,
"loss": 5.683,
"step": 20400
},
{
"epoch": 0.3099302448050127,
"eval_loss": 5.5436787605285645,
"eval_runtime": 193.9187,
"eval_samples_per_second": 51.568,
"eval_steps_per_second": 6.446,
"step": 20400
},
{
"epoch": 0.3114495107109196,
"grad_norm": 4.5453901290893555,
"learning_rate": 3.963210702341137e-06,
"loss": 5.6747,
"step": 20500
},
{
"epoch": 0.3114495107109196,
"eval_loss": 5.523691177368164,
"eval_runtime": 193.8312,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 20500
},
{
"epoch": 0.31296877661682654,
"grad_norm": 3.86855411529541,
"learning_rate": 3.953177257525084e-06,
"loss": 5.6753,
"step": 20600
},
{
"epoch": 0.31296877661682654,
"eval_loss": 5.530142784118652,
"eval_runtime": 193.8761,
"eval_samples_per_second": 51.579,
"eval_steps_per_second": 6.447,
"step": 20600
},
{
"epoch": 0.31448804252273344,
"grad_norm": 3.029080390930176,
"learning_rate": 3.943143812709031e-06,
"loss": 5.6683,
"step": 20700
},
{
"epoch": 0.31448804252273344,
"eval_loss": 5.527863025665283,
"eval_runtime": 193.8252,
"eval_samples_per_second": 51.593,
"eval_steps_per_second": 6.449,
"step": 20700
},
{
"epoch": 0.3160073084286404,
"grad_norm": 3.5344836711883545,
"learning_rate": 3.9331103678929765e-06,
"loss": 5.6554,
"step": 20800
},
{
"epoch": 0.3160073084286404,
"eval_loss": 5.510525226593018,
"eval_runtime": 193.8394,
"eval_samples_per_second": 51.589,
"eval_steps_per_second": 6.449,
"step": 20800
},
{
"epoch": 0.3175265743345473,
"grad_norm": 3.153604507446289,
"learning_rate": 3.923076923076923e-06,
"loss": 5.6508,
"step": 20900
},
{
"epoch": 0.3175265743345473,
"eval_loss": 5.508999824523926,
"eval_runtime": 194.0796,
"eval_samples_per_second": 51.525,
"eval_steps_per_second": 6.441,
"step": 20900
},
{
"epoch": 0.31904584024045424,
"grad_norm": 3.87959623336792,
"learning_rate": 3.91304347826087e-06,
"loss": 5.644,
"step": 21000
},
{
"epoch": 0.31904584024045424,
"eval_loss": 5.511682987213135,
"eval_runtime": 193.8814,
"eval_samples_per_second": 51.578,
"eval_steps_per_second": 6.447,
"step": 21000
},
{
"epoch": 0.32056510614636113,
"grad_norm": 3.9517741203308105,
"learning_rate": 3.903010033444816e-06,
"loss": 5.6387,
"step": 21100
},
{
"epoch": 0.32056510614636113,
"eval_loss": 5.499752521514893,
"eval_runtime": 193.8141,
"eval_samples_per_second": 51.596,
"eval_steps_per_second": 6.449,
"step": 21100
},
{
"epoch": 0.3220843720522681,
"grad_norm": 3.191702127456665,
"learning_rate": 3.892976588628763e-06,
"loss": 5.6333,
"step": 21200
},
{
"epoch": 0.3220843720522681,
"eval_loss": 5.476820945739746,
"eval_runtime": 193.9667,
"eval_samples_per_second": 51.555,
"eval_steps_per_second": 6.444,
"step": 21200
},
{
"epoch": 0.323603637958175,
"grad_norm": 3.1419906616210938,
"learning_rate": 3.8829431438127095e-06,
"loss": 5.6243,
"step": 21300
},
{
"epoch": 0.323603637958175,
"eval_loss": 5.486774444580078,
"eval_runtime": 193.7733,
"eval_samples_per_second": 51.607,
"eval_steps_per_second": 6.451,
"step": 21300
},
{
"epoch": 0.32512290386408194,
"grad_norm": 4.059791088104248,
"learning_rate": 3.8729096989966554e-06,
"loss": 5.6163,
"step": 21400
},
{
"epoch": 0.32512290386408194,
"eval_loss": 5.477799415588379,
"eval_runtime": 193.6285,
"eval_samples_per_second": 51.645,
"eval_steps_per_second": 6.456,
"step": 21400
},
{
"epoch": 0.32664216976998883,
"grad_norm": 2.990511417388916,
"learning_rate": 3.862876254180602e-06,
"loss": 5.6133,
"step": 21500
},
{
"epoch": 0.32664216976998883,
"eval_loss": 5.47875452041626,
"eval_runtime": 193.5641,
"eval_samples_per_second": 51.662,
"eval_steps_per_second": 6.458,
"step": 21500
},
{
"epoch": 0.3281614356758958,
"grad_norm": 2.3832523822784424,
"learning_rate": 3.852842809364549e-06,
"loss": 5.6062,
"step": 21600
},
{
"epoch": 0.3281614356758958,
"eval_loss": 5.4584455490112305,
"eval_runtime": 193.6584,
"eval_samples_per_second": 51.637,
"eval_steps_per_second": 6.455,
"step": 21600
},
{
"epoch": 0.3296807015818027,
"grad_norm": 3.010307788848877,
"learning_rate": 3.842809364548495e-06,
"loss": 5.5959,
"step": 21700
},
{
"epoch": 0.3296807015818027,
"eval_loss": 5.451364517211914,
"eval_runtime": 193.6921,
"eval_samples_per_second": 51.628,
"eval_steps_per_second": 6.454,
"step": 21700
},
{
"epoch": 0.33119996748770963,
"grad_norm": 3.718315601348877,
"learning_rate": 3.832775919732442e-06,
"loss": 5.5919,
"step": 21800
},
{
"epoch": 0.33119996748770963,
"eval_loss": 5.446727752685547,
"eval_runtime": 193.8195,
"eval_samples_per_second": 51.594,
"eval_steps_per_second": 6.449,
"step": 21800
},
{
"epoch": 0.33271923339361653,
"grad_norm": 4.131709098815918,
"learning_rate": 3.822742474916388e-06,
"loss": 5.5859,
"step": 21900
},
{
"epoch": 0.33271923339361653,
"eval_loss": 5.43417501449585,
"eval_runtime": 193.7876,
"eval_samples_per_second": 51.603,
"eval_steps_per_second": 6.45,
"step": 21900
},
{
"epoch": 0.3342384992995235,
"grad_norm": 3.7145907878875732,
"learning_rate": 3.8127090301003347e-06,
"loss": 5.5805,
"step": 22000
},
{
"epoch": 0.3342384992995235,
"eval_loss": 5.443439960479736,
"eval_runtime": 193.7659,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 22000
},
{
"epoch": 0.3357577652054304,
"grad_norm": 3.167874574661255,
"learning_rate": 3.802675585284281e-06,
"loss": 5.5724,
"step": 22100
},
{
"epoch": 0.3357577652054304,
"eval_loss": 5.419732093811035,
"eval_runtime": 193.7654,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 22100
},
{
"epoch": 0.33727703111133733,
"grad_norm": 3.820495367050171,
"learning_rate": 3.792642140468228e-06,
"loss": 5.5694,
"step": 22200
},
{
"epoch": 0.33727703111133733,
"eval_loss": 5.4181647300720215,
"eval_runtime": 193.8601,
"eval_samples_per_second": 51.584,
"eval_steps_per_second": 6.448,
"step": 22200
},
{
"epoch": 0.33879629701724423,
"grad_norm": 3.4039466381073,
"learning_rate": 3.782608695652174e-06,
"loss": 5.565,
"step": 22300
},
{
"epoch": 0.33879629701724423,
"eval_loss": 5.419365406036377,
"eval_runtime": 194.0048,
"eval_samples_per_second": 51.545,
"eval_steps_per_second": 6.443,
"step": 22300
},
{
"epoch": 0.3403155629231512,
"grad_norm": 2.805332660675049,
"learning_rate": 3.7725752508361205e-06,
"loss": 5.556,
"step": 22400
},
{
"epoch": 0.3403155629231512,
"eval_loss": 5.415971755981445,
"eval_runtime": 193.9692,
"eval_samples_per_second": 51.555,
"eval_steps_per_second": 6.444,
"step": 22400
},
{
"epoch": 0.3418348288290581,
"grad_norm": 2.43111252784729,
"learning_rate": 3.7625418060200673e-06,
"loss": 5.5536,
"step": 22500
},
{
"epoch": 0.3418348288290581,
"eval_loss": 5.405710220336914,
"eval_runtime": 193.9758,
"eval_samples_per_second": 51.553,
"eval_steps_per_second": 6.444,
"step": 22500
},
{
"epoch": 0.34335409473496503,
"grad_norm": 3.9612550735473633,
"learning_rate": 3.7525083612040136e-06,
"loss": 5.544,
"step": 22600
},
{
"epoch": 0.34335409473496503,
"eval_loss": 5.40699577331543,
"eval_runtime": 193.9672,
"eval_samples_per_second": 51.555,
"eval_steps_per_second": 6.444,
"step": 22600
},
{
"epoch": 0.34487336064087193,
"grad_norm": 2.8571434020996094,
"learning_rate": 3.74247491638796e-06,
"loss": 5.5392,
"step": 22700
},
{
"epoch": 0.34487336064087193,
"eval_loss": 5.397378921508789,
"eval_runtime": 194.1852,
"eval_samples_per_second": 51.497,
"eval_steps_per_second": 6.437,
"step": 22700
},
{
"epoch": 0.3463926265467789,
"grad_norm": 3.1463348865509033,
"learning_rate": 3.7324414715719067e-06,
"loss": 5.5296,
"step": 22800
},
{
"epoch": 0.3463926265467789,
"eval_loss": 5.3807501792907715,
"eval_runtime": 193.8656,
"eval_samples_per_second": 51.582,
"eval_steps_per_second": 6.448,
"step": 22800
},
{
"epoch": 0.3479118924526858,
"grad_norm": 3.696991205215454,
"learning_rate": 3.722408026755853e-06,
"loss": 5.5319,
"step": 22900
},
{
"epoch": 0.3479118924526858,
"eval_loss": 5.376627445220947,
"eval_runtime": 193.8475,
"eval_samples_per_second": 51.587,
"eval_steps_per_second": 6.448,
"step": 22900
},
{
"epoch": 0.34943115835859273,
"grad_norm": 3.691133737564087,
"learning_rate": 3.7123745819398e-06,
"loss": 5.5239,
"step": 23000
},
{
"epoch": 0.34943115835859273,
"eval_loss": 5.379115104675293,
"eval_runtime": 193.935,
"eval_samples_per_second": 51.564,
"eval_steps_per_second": 6.445,
"step": 23000
},
{
"epoch": 0.3509504242644996,
"grad_norm": 2.994180679321289,
"learning_rate": 3.702341137123746e-06,
"loss": 5.5174,
"step": 23100
},
{
"epoch": 0.3509504242644996,
"eval_loss": 5.36928129196167,
"eval_runtime": 193.8929,
"eval_samples_per_second": 51.575,
"eval_steps_per_second": 6.447,
"step": 23100
},
{
"epoch": 0.3524696901704066,
"grad_norm": 3.528660774230957,
"learning_rate": 3.6923076923076925e-06,
"loss": 5.5123,
"step": 23200
},
{
"epoch": 0.3524696901704066,
"eval_loss": 5.366269588470459,
"eval_runtime": 193.8307,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 23200
},
{
"epoch": 0.3539889560763135,
"grad_norm": 2.8609702587127686,
"learning_rate": 3.6822742474916393e-06,
"loss": 5.5024,
"step": 23300
},
{
"epoch": 0.3539889560763135,
"eval_loss": 5.3636884689331055,
"eval_runtime": 193.9144,
"eval_samples_per_second": 51.569,
"eval_steps_per_second": 6.446,
"step": 23300
},
{
"epoch": 0.35550822198222043,
"grad_norm": 3.146467447280884,
"learning_rate": 3.6722408026755856e-06,
"loss": 5.4993,
"step": 23400
},
{
"epoch": 0.35550822198222043,
"eval_loss": 5.3594536781311035,
"eval_runtime": 193.8636,
"eval_samples_per_second": 51.583,
"eval_steps_per_second": 6.448,
"step": 23400
},
{
"epoch": 0.3570274878881273,
"grad_norm": 2.4640018939971924,
"learning_rate": 3.662207357859532e-06,
"loss": 5.4944,
"step": 23500
},
{
"epoch": 0.3570274878881273,
"eval_loss": 5.346569538116455,
"eval_runtime": 193.9228,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 23500
},
{
"epoch": 0.3585467537940343,
"grad_norm": 4.175319671630859,
"learning_rate": 3.6521739130434787e-06,
"loss": 5.4874,
"step": 23600
},
{
"epoch": 0.3585467537940343,
"eval_loss": 5.343349456787109,
"eval_runtime": 193.585,
"eval_samples_per_second": 51.657,
"eval_steps_per_second": 6.457,
"step": 23600
},
{
"epoch": 0.3600660196999412,
"grad_norm": 3.4799277782440186,
"learning_rate": 3.642140468227425e-06,
"loss": 5.4874,
"step": 23700
},
{
"epoch": 0.3600660196999412,
"eval_loss": 5.342945098876953,
"eval_runtime": 193.6611,
"eval_samples_per_second": 51.637,
"eval_steps_per_second": 6.455,
"step": 23700
},
{
"epoch": 0.36158528560584813,
"grad_norm": 1.949639916419983,
"learning_rate": 3.6321070234113714e-06,
"loss": 5.4786,
"step": 23800
},
{
"epoch": 0.36158528560584813,
"eval_loss": 5.325855255126953,
"eval_runtime": 193.5983,
"eval_samples_per_second": 51.653,
"eval_steps_per_second": 6.457,
"step": 23800
},
{
"epoch": 0.363104551511755,
"grad_norm": 2.983280658721924,
"learning_rate": 3.622073578595318e-06,
"loss": 5.4736,
"step": 23900
},
{
"epoch": 0.363104551511755,
"eval_loss": 5.320173263549805,
"eval_runtime": 193.5482,
"eval_samples_per_second": 51.667,
"eval_steps_per_second": 6.458,
"step": 23900
},
{
"epoch": 0.364623817417662,
"grad_norm": 3.912425994873047,
"learning_rate": 3.6120401337792645e-06,
"loss": 5.4694,
"step": 24000
},
{
"epoch": 0.364623817417662,
"eval_loss": 5.3176751136779785,
"eval_runtime": 193.8093,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 24000
},
{
"epoch": 0.3661430833235689,
"grad_norm": 3.230281114578247,
"learning_rate": 3.6020066889632112e-06,
"loss": 5.465,
"step": 24100
},
{
"epoch": 0.3661430833235689,
"eval_loss": 5.319676399230957,
"eval_runtime": 193.9617,
"eval_samples_per_second": 51.557,
"eval_steps_per_second": 6.445,
"step": 24100
},
{
"epoch": 0.3676623492294758,
"grad_norm": 2.6516830921173096,
"learning_rate": 3.5919732441471576e-06,
"loss": 5.4568,
"step": 24200
},
{
"epoch": 0.3676623492294758,
"eval_loss": 5.306182384490967,
"eval_runtime": 193.9303,
"eval_samples_per_second": 51.565,
"eval_steps_per_second": 6.446,
"step": 24200
},
{
"epoch": 0.3691816151353827,
"grad_norm": 3.398289442062378,
"learning_rate": 3.581939799331104e-06,
"loss": 5.4555,
"step": 24300
},
{
"epoch": 0.3691816151353827,
"eval_loss": 5.304970741271973,
"eval_runtime": 193.8994,
"eval_samples_per_second": 51.573,
"eval_steps_per_second": 6.447,
"step": 24300
},
{
"epoch": 0.3707008810412897,
"grad_norm": 2.9263579845428467,
"learning_rate": 3.5719063545150507e-06,
"loss": 5.446,
"step": 24400
},
{
"epoch": 0.3707008810412897,
"eval_loss": 5.304412364959717,
"eval_runtime": 193.8605,
"eval_samples_per_second": 51.583,
"eval_steps_per_second": 6.448,
"step": 24400
},
{
"epoch": 0.3722201469471966,
"grad_norm": 3.2696564197540283,
"learning_rate": 3.561872909698997e-06,
"loss": 5.4418,
"step": 24500
},
{
"epoch": 0.3722201469471966,
"eval_loss": 5.291959285736084,
"eval_runtime": 193.8303,
"eval_samples_per_second": 51.592,
"eval_steps_per_second": 6.449,
"step": 24500
},
{
"epoch": 0.3737394128531035,
"grad_norm": 3.3699710369110107,
"learning_rate": 3.5518394648829434e-06,
"loss": 5.4367,
"step": 24600
},
{
"epoch": 0.3737394128531035,
"eval_loss": 5.292627334594727,
"eval_runtime": 193.9116,
"eval_samples_per_second": 51.57,
"eval_steps_per_second": 6.446,
"step": 24600
},
{
"epoch": 0.3752586787590104,
"grad_norm": 3.646376848220825,
"learning_rate": 3.54180602006689e-06,
"loss": 5.4389,
"step": 24700
},
{
"epoch": 0.3752586787590104,
"eval_loss": 5.2781982421875,
"eval_runtime": 193.9384,
"eval_samples_per_second": 51.563,
"eval_steps_per_second": 6.445,
"step": 24700
},
{
"epoch": 0.3767779446649174,
"grad_norm": 2.873612880706787,
"learning_rate": 3.5317725752508365e-06,
"loss": 5.4344,
"step": 24800
},
{
"epoch": 0.3767779446649174,
"eval_loss": 5.286219596862793,
"eval_runtime": 193.9969,
"eval_samples_per_second": 51.547,
"eval_steps_per_second": 6.443,
"step": 24800
},
{
"epoch": 0.37829721057082427,
"grad_norm": 3.312747001647949,
"learning_rate": 3.521739130434783e-06,
"loss": 5.427,
"step": 24900
},
{
"epoch": 0.37829721057082427,
"eval_loss": 5.281394004821777,
"eval_runtime": 193.9463,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 6.445,
"step": 24900
},
{
"epoch": 0.3798164764767312,
"grad_norm": 3.727271556854248,
"learning_rate": 3.5117056856187296e-06,
"loss": 5.4212,
"step": 25000
},
{
"epoch": 0.3798164764767312,
"eval_loss": 5.266263484954834,
"eval_runtime": 193.9471,
"eval_samples_per_second": 51.56,
"eval_steps_per_second": 6.445,
"step": 25000
},
{
"epoch": 0.3813357423826381,
"grad_norm": 3.3630518913269043,
"learning_rate": 3.501672240802676e-06,
"loss": 5.4173,
"step": 25100
},
{
"epoch": 0.3813357423826381,
"eval_loss": 5.276744365692139,
"eval_runtime": 193.8804,
"eval_samples_per_second": 51.578,
"eval_steps_per_second": 6.447,
"step": 25100
},
{
"epoch": 0.3828550082885451,
"grad_norm": 4.12694787979126,
"learning_rate": 3.491638795986622e-06,
"loss": 5.413,
"step": 25200
},
{
"epoch": 0.3828550082885451,
"eval_loss": 5.260261535644531,
"eval_runtime": 193.919,
"eval_samples_per_second": 51.568,
"eval_steps_per_second": 6.446,
"step": 25200
},
{
"epoch": 0.38437427419445197,
"grad_norm": 3.921342611312866,
"learning_rate": 3.481605351170568e-06,
"loss": 5.4041,
"step": 25300
},
{
"epoch": 0.38437427419445197,
"eval_loss": 5.2696661949157715,
"eval_runtime": 193.8783,
"eval_samples_per_second": 51.579,
"eval_steps_per_second": 6.447,
"step": 25300
},
{
"epoch": 0.3858935401003589,
"grad_norm": 2.464872360229492,
"learning_rate": 3.471571906354515e-06,
"loss": 5.4035,
"step": 25400
},
{
"epoch": 0.3858935401003589,
"eval_loss": 5.251010894775391,
"eval_runtime": 193.8872,
"eval_samples_per_second": 51.576,
"eval_steps_per_second": 6.447,
"step": 25400
},
{
"epoch": 0.3874128060062658,
"grad_norm": 2.675010919570923,
"learning_rate": 3.4615384615384613e-06,
"loss": 5.3946,
"step": 25500
},
{
"epoch": 0.3874128060062658,
"eval_loss": 5.2538347244262695,
"eval_runtime": 193.8933,
"eval_samples_per_second": 51.575,
"eval_steps_per_second": 6.447,
"step": 25500
},
{
"epoch": 0.38893207191217277,
"grad_norm": 2.195725202560425,
"learning_rate": 3.4515050167224076e-06,
"loss": 5.3919,
"step": 25600
},
{
"epoch": 0.38893207191217277,
"eval_loss": 5.230504035949707,
"eval_runtime": 194.2054,
"eval_samples_per_second": 51.492,
"eval_steps_per_second": 6.436,
"step": 25600
},
{
"epoch": 0.39045133781807967,
"grad_norm": 3.359039545059204,
"learning_rate": 3.4414715719063544e-06,
"loss": 5.3855,
"step": 25700
},
{
"epoch": 0.39045133781807967,
"eval_loss": 5.245420455932617,
"eval_runtime": 193.8867,
"eval_samples_per_second": 51.577,
"eval_steps_per_second": 6.447,
"step": 25700
},
{
"epoch": 0.3919706037239866,
"grad_norm": 3.5205583572387695,
"learning_rate": 3.4314381270903007e-06,
"loss": 5.3822,
"step": 25800
},
{
"epoch": 0.3919706037239866,
"eval_loss": 5.227876663208008,
"eval_runtime": 194.0636,
"eval_samples_per_second": 51.529,
"eval_steps_per_second": 6.441,
"step": 25800
},
{
"epoch": 0.3934898696298935,
"grad_norm": 3.5619242191314697,
"learning_rate": 3.4214046822742475e-06,
"loss": 5.3773,
"step": 25900
},
{
"epoch": 0.3934898696298935,
"eval_loss": 5.234467029571533,
"eval_runtime": 193.7401,
"eval_samples_per_second": 51.616,
"eval_steps_per_second": 6.452,
"step": 25900
},
{
"epoch": 0.39500913553580047,
"grad_norm": 3.9920406341552734,
"learning_rate": 3.411371237458194e-06,
"loss": 5.3735,
"step": 26000
},
{
"epoch": 0.39500913553580047,
"eval_loss": 5.22184944152832,
"eval_runtime": 193.7394,
"eval_samples_per_second": 51.616,
"eval_steps_per_second": 6.452,
"step": 26000
},
{
"epoch": 0.39652840144170737,
"grad_norm": 3.559217691421509,
"learning_rate": 3.40133779264214e-06,
"loss": 5.3695,
"step": 26100
},
{
"epoch": 0.39652840144170737,
"eval_loss": 5.22100830078125,
"eval_runtime": 193.849,
"eval_samples_per_second": 51.587,
"eval_steps_per_second": 6.448,
"step": 26100
},
{
"epoch": 0.3980476673476143,
"grad_norm": 4.232235908508301,
"learning_rate": 3.391304347826087e-06,
"loss": 5.3703,
"step": 26200
},
{
"epoch": 0.3980476673476143,
"eval_loss": 5.208474159240723,
"eval_runtime": 193.8974,
"eval_samples_per_second": 51.574,
"eval_steps_per_second": 6.447,
"step": 26200
},
{
"epoch": 0.3995669332535212,
"grad_norm": 1.947100043296814,
"learning_rate": 3.3812709030100333e-06,
"loss": 5.3627,
"step": 26300
},
{
"epoch": 0.3995669332535212,
"eval_loss": 5.217953681945801,
"eval_runtime": 193.928,
"eval_samples_per_second": 51.566,
"eval_steps_per_second": 6.446,
"step": 26300
},
{
"epoch": 0.40108619915942817,
"grad_norm": 3.8494338989257812,
"learning_rate": 3.3712374581939796e-06,
"loss": 5.3584,
"step": 26400
},
{
"epoch": 0.40108619915942817,
"eval_loss": 5.212357044219971,
"eval_runtime": 193.9466,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 6.445,
"step": 26400
},
{
"epoch": 0.40260546506533507,
"grad_norm": 3.837324619293213,
"learning_rate": 3.3612040133779264e-06,
"loss": 5.3555,
"step": 26500
},
{
"epoch": 0.40260546506533507,
"eval_loss": 5.211539268493652,
"eval_runtime": 193.9781,
"eval_samples_per_second": 51.552,
"eval_steps_per_second": 6.444,
"step": 26500
},
{
"epoch": 0.404124730971242,
"grad_norm": 3.7754664421081543,
"learning_rate": 3.3511705685618727e-06,
"loss": 5.3476,
"step": 26600
},
{
"epoch": 0.404124730971242,
"eval_loss": 5.1926679611206055,
"eval_runtime": 194.0189,
"eval_samples_per_second": 51.541,
"eval_steps_per_second": 6.443,
"step": 26600
},
{
"epoch": 0.4056439968771489,
"grad_norm": 2.4836502075195312,
"learning_rate": 3.3411371237458195e-06,
"loss": 5.3471,
"step": 26700
},
{
"epoch": 0.4056439968771489,
"eval_loss": 5.188870429992676,
"eval_runtime": 194.0466,
"eval_samples_per_second": 51.534,
"eval_steps_per_second": 6.442,
"step": 26700
},
{
"epoch": 0.40716326278305587,
"grad_norm": 4.591010093688965,
"learning_rate": 3.331103678929766e-06,
"loss": 5.3431,
"step": 26800
},
{
"epoch": 0.40716326278305587,
"eval_loss": 5.2042717933654785,
"eval_runtime": 193.9239,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 26800
},
{
"epoch": 0.40868252868896277,
"grad_norm": 3.4716506004333496,
"learning_rate": 3.321070234113712e-06,
"loss": 5.3363,
"step": 26900
},
{
"epoch": 0.40868252868896277,
"eval_loss": 5.18259859085083,
"eval_runtime": 193.9467,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 6.445,
"step": 26900
},
{
"epoch": 0.4102017945948697,
"grad_norm": 2.3968818187713623,
"learning_rate": 3.311036789297659e-06,
"loss": 5.335,
"step": 27000
},
{
"epoch": 0.4102017945948697,
"eval_loss": 5.189505577087402,
"eval_runtime": 193.9236,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 27000
},
{
"epoch": 0.4117210605007766,
"grad_norm": 3.8948540687561035,
"learning_rate": 3.3010033444816052e-06,
"loss": 5.3306,
"step": 27100
},
{
"epoch": 0.4117210605007766,
"eval_loss": 5.193852424621582,
"eval_runtime": 193.9813,
"eval_samples_per_second": 51.551,
"eval_steps_per_second": 6.444,
"step": 27100
},
{
"epoch": 0.41324032640668357,
"grad_norm": 2.8864169120788574,
"learning_rate": 3.2909698996655516e-06,
"loss": 5.3292,
"step": 27200
},
{
"epoch": 0.41324032640668357,
"eval_loss": 5.173651695251465,
"eval_runtime": 193.9832,
"eval_samples_per_second": 51.551,
"eval_steps_per_second": 6.444,
"step": 27200
},
{
"epoch": 0.41475959231259046,
"grad_norm": 2.733299970626831,
"learning_rate": 3.2809364548494983e-06,
"loss": 5.33,
"step": 27300
},
{
"epoch": 0.41475959231259046,
"eval_loss": 5.169619083404541,
"eval_runtime": 193.8848,
"eval_samples_per_second": 51.577,
"eval_steps_per_second": 6.447,
"step": 27300
},
{
"epoch": 0.4162788582184974,
"grad_norm": 2.9062700271606445,
"learning_rate": 3.2709030100334447e-06,
"loss": 5.3173,
"step": 27400
},
{
"epoch": 0.4162788582184974,
"eval_loss": 5.1664323806762695,
"eval_runtime": 193.9119,
"eval_samples_per_second": 51.57,
"eval_steps_per_second": 6.446,
"step": 27400
},
{
"epoch": 0.4177981241244043,
"grad_norm": 3.473586320877075,
"learning_rate": 3.260869565217391e-06,
"loss": 5.3132,
"step": 27500
},
{
"epoch": 0.4177981241244043,
"eval_loss": 5.160322666168213,
"eval_runtime": 193.6677,
"eval_samples_per_second": 51.635,
"eval_steps_per_second": 6.454,
"step": 27500
},
{
"epoch": 0.41931739003031127,
"grad_norm": 3.763826847076416,
"learning_rate": 3.2508361204013378e-06,
"loss": 5.3079,
"step": 27600
},
{
"epoch": 0.41931739003031127,
"eval_loss": 5.159815788269043,
"eval_runtime": 193.6881,
"eval_samples_per_second": 51.629,
"eval_steps_per_second": 6.454,
"step": 27600
},
{
"epoch": 0.42083665593621816,
"grad_norm": 3.552210807800293,
"learning_rate": 3.240802675585284e-06,
"loss": 5.3065,
"step": 27700
},
{
"epoch": 0.42083665593621816,
"eval_loss": 5.150642395019531,
"eval_runtime": 193.6169,
"eval_samples_per_second": 51.648,
"eval_steps_per_second": 6.456,
"step": 27700
},
{
"epoch": 0.4223559218421251,
"grad_norm": 4.059215545654297,
"learning_rate": 3.230769230769231e-06,
"loss": 5.2979,
"step": 27800
},
{
"epoch": 0.4223559218421251,
"eval_loss": 5.1397881507873535,
"eval_runtime": 193.6252,
"eval_samples_per_second": 51.646,
"eval_steps_per_second": 6.456,
"step": 27800
},
{
"epoch": 0.423875187748032,
"grad_norm": 3.116863250732422,
"learning_rate": 3.2207357859531772e-06,
"loss": 5.2986,
"step": 27900
},
{
"epoch": 0.423875187748032,
"eval_loss": 5.141936779022217,
"eval_runtime": 193.8766,
"eval_samples_per_second": 51.579,
"eval_steps_per_second": 6.447,
"step": 27900
},
{
"epoch": 0.42539445365393896,
"grad_norm": 3.474275588989258,
"learning_rate": 3.2107023411371236e-06,
"loss": 5.2969,
"step": 28000
},
{
"epoch": 0.42539445365393896,
"eval_loss": 5.130954742431641,
"eval_runtime": 193.6114,
"eval_samples_per_second": 51.65,
"eval_steps_per_second": 6.456,
"step": 28000
},
{
"epoch": 0.42691371955984586,
"grad_norm": 4.147261619567871,
"learning_rate": 3.2006688963210703e-06,
"loss": 5.2919,
"step": 28100
},
{
"epoch": 0.42691371955984586,
"eval_loss": 5.131519794464111,
"eval_runtime": 193.5876,
"eval_samples_per_second": 51.656,
"eval_steps_per_second": 6.457,
"step": 28100
},
{
"epoch": 0.4284329854657528,
"grad_norm": 3.2498297691345215,
"learning_rate": 3.1906354515050167e-06,
"loss": 5.281,
"step": 28200
},
{
"epoch": 0.4284329854657528,
"eval_loss": 5.137979984283447,
"eval_runtime": 193.6213,
"eval_samples_per_second": 51.647,
"eval_steps_per_second": 6.456,
"step": 28200
},
{
"epoch": 0.4299522513716597,
"grad_norm": 2.9977059364318848,
"learning_rate": 3.180602006688963e-06,
"loss": 5.2799,
"step": 28300
},
{
"epoch": 0.4299522513716597,
"eval_loss": 5.123497009277344,
"eval_runtime": 193.6764,
"eval_samples_per_second": 51.633,
"eval_steps_per_second": 6.454,
"step": 28300
},
{
"epoch": 0.43147151727756666,
"grad_norm": 3.6998023986816406,
"learning_rate": 3.1705685618729098e-06,
"loss": 5.2772,
"step": 28400
},
{
"epoch": 0.43147151727756666,
"eval_loss": 5.125461101531982,
"eval_runtime": 194.1824,
"eval_samples_per_second": 51.498,
"eval_steps_per_second": 6.437,
"step": 28400
},
{
"epoch": 0.43299078318347356,
"grad_norm": 2.8865628242492676,
"learning_rate": 3.160535117056856e-06,
"loss": 5.2778,
"step": 28500
},
{
"epoch": 0.43299078318347356,
"eval_loss": 5.130805492401123,
"eval_runtime": 194.0322,
"eval_samples_per_second": 51.538,
"eval_steps_per_second": 6.442,
"step": 28500
},
{
"epoch": 0.4345100490893805,
"grad_norm": 3.853248357772827,
"learning_rate": 3.1505016722408024e-06,
"loss": 5.2722,
"step": 28600
},
{
"epoch": 0.4345100490893805,
"eval_loss": 5.125495910644531,
"eval_runtime": 194.031,
"eval_samples_per_second": 51.538,
"eval_steps_per_second": 6.442,
"step": 28600
},
{
"epoch": 0.4360293149952874,
"grad_norm": 2.8595046997070312,
"learning_rate": 3.140468227424749e-06,
"loss": 5.2644,
"step": 28700
},
{
"epoch": 0.4360293149952874,
"eval_loss": 5.113553524017334,
"eval_runtime": 194.0586,
"eval_samples_per_second": 51.531,
"eval_steps_per_second": 6.441,
"step": 28700
},
{
"epoch": 0.43754858090119436,
"grad_norm": 3.5894057750701904,
"learning_rate": 3.1304347826086955e-06,
"loss": 5.261,
"step": 28800
},
{
"epoch": 0.43754858090119436,
"eval_loss": 5.1062846183776855,
"eval_runtime": 194.0309,
"eval_samples_per_second": 51.538,
"eval_steps_per_second": 6.442,
"step": 28800
},
{
"epoch": 0.43906784680710126,
"grad_norm": 2.79595685005188,
"learning_rate": 3.1204013377926423e-06,
"loss": 5.257,
"step": 28900
},
{
"epoch": 0.43906784680710126,
"eval_loss": 5.108764171600342,
"eval_runtime": 194.0394,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.442,
"step": 28900
},
{
"epoch": 0.4405871127130082,
"grad_norm": 3.3071796894073486,
"learning_rate": 3.1103678929765886e-06,
"loss": 5.2543,
"step": 29000
},
{
"epoch": 0.4405871127130082,
"eval_loss": 5.101233005523682,
"eval_runtime": 194.0058,
"eval_samples_per_second": 51.545,
"eval_steps_per_second": 6.443,
"step": 29000
},
{
"epoch": 0.4421063786189151,
"grad_norm": 2.916874408721924,
"learning_rate": 3.100334448160535e-06,
"loss": 5.2474,
"step": 29100
},
{
"epoch": 0.4421063786189151,
"eval_loss": 5.100154876708984,
"eval_runtime": 194.0356,
"eval_samples_per_second": 51.537,
"eval_steps_per_second": 6.442,
"step": 29100
},
{
"epoch": 0.44362564452482206,
"grad_norm": 2.6649153232574463,
"learning_rate": 3.0903010033444818e-06,
"loss": 5.2504,
"step": 29200
},
{
"epoch": 0.44362564452482206,
"eval_loss": 5.0921311378479,
"eval_runtime": 194.0892,
"eval_samples_per_second": 51.523,
"eval_steps_per_second": 6.44,
"step": 29200
},
{
"epoch": 0.44514491043072896,
"grad_norm": 2.398049831390381,
"learning_rate": 3.080267558528428e-06,
"loss": 5.2441,
"step": 29300
},
{
"epoch": 0.44514491043072896,
"eval_loss": 5.0853142738342285,
"eval_runtime": 193.8364,
"eval_samples_per_second": 51.59,
"eval_steps_per_second": 6.449,
"step": 29300
},
{
"epoch": 0.4466641763366359,
"grad_norm": 2.485322952270508,
"learning_rate": 3.0702341137123744e-06,
"loss": 5.2415,
"step": 29400
},
{
"epoch": 0.4466641763366359,
"eval_loss": 5.091442584991455,
"eval_runtime": 193.9724,
"eval_samples_per_second": 51.554,
"eval_steps_per_second": 6.444,
"step": 29400
},
{
"epoch": 0.4481834422425428,
"grad_norm": 3.5554513931274414,
"learning_rate": 3.060200668896321e-06,
"loss": 5.2374,
"step": 29500
},
{
"epoch": 0.4481834422425428,
"eval_loss": 5.077342510223389,
"eval_runtime": 194.0745,
"eval_samples_per_second": 51.527,
"eval_steps_per_second": 6.441,
"step": 29500
},
{
"epoch": 0.44970270814844976,
"grad_norm": 3.598982810974121,
"learning_rate": 3.0501672240802675e-06,
"loss": 5.2324,
"step": 29600
},
{
"epoch": 0.44970270814844976,
"eval_loss": 5.088211536407471,
"eval_runtime": 193.9862,
"eval_samples_per_second": 51.55,
"eval_steps_per_second": 6.444,
"step": 29600
},
{
"epoch": 0.45122197405435666,
"grad_norm": 3.2339296340942383,
"learning_rate": 3.0401337792642143e-06,
"loss": 5.2295,
"step": 29700
},
{
"epoch": 0.45122197405435666,
"eval_loss": 5.077876567840576,
"eval_runtime": 193.8777,
"eval_samples_per_second": 51.579,
"eval_steps_per_second": 6.447,
"step": 29700
},
{
"epoch": 0.4527412399602636,
"grad_norm": 2.627495765686035,
"learning_rate": 3.0301003344481606e-06,
"loss": 5.2275,
"step": 29800
},
{
"epoch": 0.4527412399602636,
"eval_loss": 5.074822902679443,
"eval_runtime": 193.8311,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 29800
},
{
"epoch": 0.4542605058661705,
"grad_norm": 2.9252991676330566,
"learning_rate": 3.020066889632107e-06,
"loss": 5.2238,
"step": 29900
},
{
"epoch": 0.4542605058661705,
"eval_loss": 5.063547611236572,
"eval_runtime": 193.7302,
"eval_samples_per_second": 51.618,
"eval_steps_per_second": 6.452,
"step": 29900
},
{
"epoch": 0.45577977177207746,
"grad_norm": 3.155406951904297,
"learning_rate": 3.0100334448160537e-06,
"loss": 5.2218,
"step": 30000
},
{
"epoch": 0.45577977177207746,
"eval_loss": 5.066218852996826,
"eval_runtime": 193.6072,
"eval_samples_per_second": 51.651,
"eval_steps_per_second": 6.456,
"step": 30000
},
{
"epoch": 0.45729903767798435,
"grad_norm": 3.476306915283203,
"learning_rate": 3e-06,
"loss": 5.2166,
"step": 30100
},
{
"epoch": 0.45729903767798435,
"eval_loss": 5.068021774291992,
"eval_runtime": 193.6477,
"eval_samples_per_second": 51.64,
"eval_steps_per_second": 6.455,
"step": 30100
},
{
"epoch": 0.4588183035838913,
"grad_norm": 3.618774175643921,
"learning_rate": 2.9899665551839464e-06,
"loss": 5.2154,
"step": 30200
},
{
"epoch": 0.4588183035838913,
"eval_loss": 5.0593461990356445,
"eval_runtime": 193.5821,
"eval_samples_per_second": 51.658,
"eval_steps_per_second": 6.457,
"step": 30200
},
{
"epoch": 0.4603375694897982,
"grad_norm": 2.838336229324341,
"learning_rate": 2.979933110367893e-06,
"loss": 5.2082,
"step": 30300
},
{
"epoch": 0.4603375694897982,
"eval_loss": 5.061206817626953,
"eval_runtime": 193.644,
"eval_samples_per_second": 51.641,
"eval_steps_per_second": 6.455,
"step": 30300
},
{
"epoch": 0.46185683539570516,
"grad_norm": 2.840545654296875,
"learning_rate": 2.9698996655518395e-06,
"loss": 5.2028,
"step": 30400
},
{
"epoch": 0.46185683539570516,
"eval_loss": 5.051755428314209,
"eval_runtime": 193.6726,
"eval_samples_per_second": 51.634,
"eval_steps_per_second": 6.454,
"step": 30400
},
{
"epoch": 0.46337610130161205,
"grad_norm": 2.4346399307250977,
"learning_rate": 2.959866220735786e-06,
"loss": 5.2001,
"step": 30500
},
{
"epoch": 0.46337610130161205,
"eval_loss": 5.050179481506348,
"eval_runtime": 193.6621,
"eval_samples_per_second": 51.636,
"eval_steps_per_second": 6.455,
"step": 30500
},
{
"epoch": 0.464895367207519,
"grad_norm": 2.331064224243164,
"learning_rate": 2.9498327759197326e-06,
"loss": 5.2018,
"step": 30600
},
{
"epoch": 0.464895367207519,
"eval_loss": 5.039993762969971,
"eval_runtime": 193.9744,
"eval_samples_per_second": 51.553,
"eval_steps_per_second": 6.444,
"step": 30600
},
{
"epoch": 0.4664146331134259,
"grad_norm": 3.012594223022461,
"learning_rate": 2.939799331103679e-06,
"loss": 5.1991,
"step": 30700
},
{
"epoch": 0.4664146331134259,
"eval_loss": 5.039401054382324,
"eval_runtime": 194.0713,
"eval_samples_per_second": 51.527,
"eval_steps_per_second": 6.441,
"step": 30700
},
{
"epoch": 0.46793389901933286,
"grad_norm": 3.4017112255096436,
"learning_rate": 2.9297658862876257e-06,
"loss": 5.1937,
"step": 30800
},
{
"epoch": 0.46793389901933286,
"eval_loss": 5.0479512214660645,
"eval_runtime": 194.0394,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.442,
"step": 30800
},
{
"epoch": 0.46945316492523975,
"grad_norm": 2.848475694656372,
"learning_rate": 2.919732441471572e-06,
"loss": 5.1898,
"step": 30900
},
{
"epoch": 0.46945316492523975,
"eval_loss": 5.043004035949707,
"eval_runtime": 194.2051,
"eval_samples_per_second": 51.492,
"eval_steps_per_second": 6.436,
"step": 30900
},
{
"epoch": 0.4709724308311467,
"grad_norm": 2.964001178741455,
"learning_rate": 2.9096989966555184e-06,
"loss": 5.1887,
"step": 31000
},
{
"epoch": 0.4709724308311467,
"eval_loss": 5.029993534088135,
"eval_runtime": 194.1479,
"eval_samples_per_second": 51.507,
"eval_steps_per_second": 6.438,
"step": 31000
},
{
"epoch": 0.4724916967370536,
"grad_norm": 2.698634147644043,
"learning_rate": 2.899665551839465e-06,
"loss": 5.1879,
"step": 31100
},
{
"epoch": 0.4724916967370536,
"eval_loss": 5.028534412384033,
"eval_runtime": 194.0408,
"eval_samples_per_second": 51.536,
"eval_steps_per_second": 6.442,
"step": 31100
},
{
"epoch": 0.47401096264296055,
"grad_norm": 2.757293224334717,
"learning_rate": 2.8896321070234115e-06,
"loss": 5.1818,
"step": 31200
},
{
"epoch": 0.47401096264296055,
"eval_loss": 5.024392127990723,
"eval_runtime": 194.1644,
"eval_samples_per_second": 51.503,
"eval_steps_per_second": 6.438,
"step": 31200
},
{
"epoch": 0.47553022854886745,
"grad_norm": 3.269547700881958,
"learning_rate": 2.879598662207358e-06,
"loss": 5.1784,
"step": 31300
},
{
"epoch": 0.47553022854886745,
"eval_loss": 5.026421546936035,
"eval_runtime": 193.9523,
"eval_samples_per_second": 51.559,
"eval_steps_per_second": 6.445,
"step": 31300
},
{
"epoch": 0.4770494944547744,
"grad_norm": 3.1080405712127686,
"learning_rate": 2.8695652173913046e-06,
"loss": 5.1725,
"step": 31400
},
{
"epoch": 0.4770494944547744,
"eval_loss": 5.025778770446777,
"eval_runtime": 193.9136,
"eval_samples_per_second": 51.569,
"eval_steps_per_second": 6.446,
"step": 31400
},
{
"epoch": 0.4785687603606813,
"grad_norm": 4.382852554321289,
"learning_rate": 2.859531772575251e-06,
"loss": 5.1691,
"step": 31500
},
{
"epoch": 0.4785687603606813,
"eval_loss": 5.0218424797058105,
"eval_runtime": 194.0079,
"eval_samples_per_second": 51.544,
"eval_steps_per_second": 6.443,
"step": 31500
},
{
"epoch": 0.48008802626658825,
"grad_norm": 2.4219489097595215,
"learning_rate": 2.8494983277591977e-06,
"loss": 5.1675,
"step": 31600
},
{
"epoch": 0.48008802626658825,
"eval_loss": 5.009864807128906,
"eval_runtime": 194.1924,
"eval_samples_per_second": 51.495,
"eval_steps_per_second": 6.437,
"step": 31600
},
{
"epoch": 0.48160729217249515,
"grad_norm": 3.9848620891571045,
"learning_rate": 2.839464882943144e-06,
"loss": 5.1607,
"step": 31700
},
{
"epoch": 0.48160729217249515,
"eval_loss": 5.008971214294434,
"eval_runtime": 194.2985,
"eval_samples_per_second": 51.467,
"eval_steps_per_second": 6.433,
"step": 31700
},
{
"epoch": 0.4831265580784021,
"grad_norm": 3.3474831581115723,
"learning_rate": 2.8294314381270904e-06,
"loss": 5.1598,
"step": 31800
},
{
"epoch": 0.4831265580784021,
"eval_loss": 5.004793167114258,
"eval_runtime": 194.03,
"eval_samples_per_second": 51.538,
"eval_steps_per_second": 6.442,
"step": 31800
},
{
"epoch": 0.484645823984309,
"grad_norm": 3.074587821960449,
"learning_rate": 2.819397993311037e-06,
"loss": 5.1588,
"step": 31900
},
{
"epoch": 0.484645823984309,
"eval_loss": 5.007466793060303,
"eval_runtime": 193.9052,
"eval_samples_per_second": 51.572,
"eval_steps_per_second": 6.446,
"step": 31900
},
{
"epoch": 0.48616508989021595,
"grad_norm": 2.631606340408325,
"learning_rate": 2.8093645484949835e-06,
"loss": 5.155,
"step": 32000
},
{
"epoch": 0.48616508989021595,
"eval_loss": 5.00339937210083,
"eval_runtime": 194.0167,
"eval_samples_per_second": 51.542,
"eval_steps_per_second": 6.443,
"step": 32000
},
{
"epoch": 0.48768435579612285,
"grad_norm": 2.5506277084350586,
"learning_rate": 2.79933110367893e-06,
"loss": 5.1544,
"step": 32100
},
{
"epoch": 0.48768435579612285,
"eval_loss": 4.995656967163086,
"eval_runtime": 193.7711,
"eval_samples_per_second": 51.607,
"eval_steps_per_second": 6.451,
"step": 32100
},
{
"epoch": 0.4892036217020298,
"grad_norm": 2.9476144313812256,
"learning_rate": 2.7892976588628766e-06,
"loss": 5.1477,
"step": 32200
},
{
"epoch": 0.4892036217020298,
"eval_loss": 4.994040012359619,
"eval_runtime": 193.6615,
"eval_samples_per_second": 51.636,
"eval_steps_per_second": 6.455,
"step": 32200
},
{
"epoch": 0.4907228876079367,
"grad_norm": 3.5395162105560303,
"learning_rate": 2.779264214046823e-06,
"loss": 5.1424,
"step": 32300
},
{
"epoch": 0.4907228876079367,
"eval_loss": 4.992614269256592,
"eval_runtime": 193.7065,
"eval_samples_per_second": 51.624,
"eval_steps_per_second": 6.453,
"step": 32300
},
{
"epoch": 0.49224215351384365,
"grad_norm": 2.805767297744751,
"learning_rate": 2.7692307692307693e-06,
"loss": 5.1446,
"step": 32400
},
{
"epoch": 0.49224215351384365,
"eval_loss": 4.987194538116455,
"eval_runtime": 193.7203,
"eval_samples_per_second": 51.621,
"eval_steps_per_second": 6.453,
"step": 32400
},
{
"epoch": 0.49376141941975055,
"grad_norm": 3.9371492862701416,
"learning_rate": 2.759197324414716e-06,
"loss": 5.1391,
"step": 32500
},
{
"epoch": 0.49376141941975055,
"eval_loss": 4.9901838302612305,
"eval_runtime": 193.6911,
"eval_samples_per_second": 51.629,
"eval_steps_per_second": 6.454,
"step": 32500
},
{
"epoch": 0.4952806853256575,
"grad_norm": 2.755789041519165,
"learning_rate": 2.749163879598662e-06,
"loss": 5.1393,
"step": 32600
},
{
"epoch": 0.4952806853256575,
"eval_loss": 4.992640018463135,
"eval_runtime": 193.6445,
"eval_samples_per_second": 51.641,
"eval_steps_per_second": 6.455,
"step": 32600
},
{
"epoch": 0.4967999512315644,
"grad_norm": 3.4700164794921875,
"learning_rate": 2.7391304347826087e-06,
"loss": 5.1375,
"step": 32700
},
{
"epoch": 0.4967999512315644,
"eval_loss": 4.975983142852783,
"eval_runtime": 193.7046,
"eval_samples_per_second": 51.625,
"eval_steps_per_second": 6.453,
"step": 32700
},
{
"epoch": 0.49831921713747135,
"grad_norm": 2.9584505558013916,
"learning_rate": 2.729096989966555e-06,
"loss": 5.1305,
"step": 32800
},
{
"epoch": 0.49831921713747135,
"eval_loss": 4.978539943695068,
"eval_runtime": 193.9644,
"eval_samples_per_second": 51.556,
"eval_steps_per_second": 6.444,
"step": 32800
},
{
"epoch": 0.49983848304337825,
"grad_norm": 3.1944355964660645,
"learning_rate": 2.7190635451505014e-06,
"loss": 5.1202,
"step": 32900
},
{
"epoch": 0.49983848304337825,
"eval_loss": 4.972556114196777,
"eval_runtime": 194.0092,
"eval_samples_per_second": 51.544,
"eval_steps_per_second": 6.443,
"step": 32900
},
{
"epoch": 0.5013577489492852,
"grad_norm": 2.980757713317871,
"learning_rate": 2.709030100334448e-06,
"loss": 5.1282,
"step": 33000
},
{
"epoch": 0.5013577489492852,
"eval_loss": 4.974079132080078,
"eval_runtime": 194.0276,
"eval_samples_per_second": 51.539,
"eval_steps_per_second": 6.442,
"step": 33000
},
{
"epoch": 0.5028770148551921,
"grad_norm": 3.100187063217163,
"learning_rate": 2.6989966555183945e-06,
"loss": 5.1259,
"step": 33100
},
{
"epoch": 0.5028770148551921,
"eval_loss": 4.963293552398682,
"eval_runtime": 194.0336,
"eval_samples_per_second": 51.537,
"eval_steps_per_second": 6.442,
"step": 33100
},
{
"epoch": 0.504396280761099,
"grad_norm": 2.542158603668213,
"learning_rate": 2.6889632107023413e-06,
"loss": 5.1217,
"step": 33200
},
{
"epoch": 0.504396280761099,
"eval_loss": 4.9611406326293945,
"eval_runtime": 193.9855,
"eval_samples_per_second": 51.55,
"eval_steps_per_second": 6.444,
"step": 33200
},
{
"epoch": 0.505915546667006,
"grad_norm": 2.545457363128662,
"learning_rate": 2.6789297658862876e-06,
"loss": 5.1158,
"step": 33300
},
{
"epoch": 0.505915546667006,
"eval_loss": 4.967195510864258,
"eval_runtime": 194.147,
"eval_samples_per_second": 51.507,
"eval_steps_per_second": 6.438,
"step": 33300
},
{
"epoch": 0.5074348125729129,
"grad_norm": 2.822507858276367,
"learning_rate": 2.668896321070234e-06,
"loss": 5.1109,
"step": 33400
},
{
"epoch": 0.5074348125729129,
"eval_loss": 4.9572014808654785,
"eval_runtime": 194.2258,
"eval_samples_per_second": 51.486,
"eval_steps_per_second": 6.436,
"step": 33400
},
{
"epoch": 0.5089540784788198,
"grad_norm": 2.361830949783325,
"learning_rate": 2.6588628762541807e-06,
"loss": 5.1154,
"step": 33500
},
{
"epoch": 0.5089540784788198,
"eval_loss": 4.94895076751709,
"eval_runtime": 194.1601,
"eval_samples_per_second": 51.504,
"eval_steps_per_second": 6.438,
"step": 33500
},
{
"epoch": 0.5104733443847267,
"grad_norm": 2.3638288974761963,
"learning_rate": 2.648829431438127e-06,
"loss": 5.1055,
"step": 33600
},
{
"epoch": 0.5104733443847267,
"eval_loss": 4.947831153869629,
"eval_runtime": 194.2013,
"eval_samples_per_second": 51.493,
"eval_steps_per_second": 6.437,
"step": 33600
},
{
"epoch": 0.5119926102906337,
"grad_norm": 2.163120746612549,
"learning_rate": 2.6387959866220734e-06,
"loss": 5.1048,
"step": 33700
},
{
"epoch": 0.5119926102906337,
"eval_loss": 4.943573951721191,
"eval_runtime": 194.2102,
"eval_samples_per_second": 51.491,
"eval_steps_per_second": 6.436,
"step": 33700
},
{
"epoch": 0.5135118761965406,
"grad_norm": 2.234380006790161,
"learning_rate": 2.62876254180602e-06,
"loss": 5.1042,
"step": 33800
},
{
"epoch": 0.5135118761965406,
"eval_loss": 4.945695400238037,
"eval_runtime": 194.1949,
"eval_samples_per_second": 51.495,
"eval_steps_per_second": 6.437,
"step": 33800
},
{
"epoch": 0.5150311421024475,
"grad_norm": 2.8607873916625977,
"learning_rate": 2.6187290969899665e-06,
"loss": 5.0977,
"step": 33900
},
{
"epoch": 0.5150311421024475,
"eval_loss": 4.940700531005859,
"eval_runtime": 194.1567,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.438,
"step": 33900
},
{
"epoch": 0.5165504080083544,
"grad_norm": 2.85111403465271,
"learning_rate": 2.6086956521739132e-06,
"loss": 5.0939,
"step": 34000
},
{
"epoch": 0.5165504080083544,
"eval_loss": 4.934571266174316,
"eval_runtime": 194.0792,
"eval_samples_per_second": 51.525,
"eval_steps_per_second": 6.441,
"step": 34000
},
{
"epoch": 0.5180696739142614,
"grad_norm": 3.2021050453186035,
"learning_rate": 2.5986622073578596e-06,
"loss": 5.0902,
"step": 34100
},
{
"epoch": 0.5180696739142614,
"eval_loss": 4.940134048461914,
"eval_runtime": 194.0612,
"eval_samples_per_second": 51.53,
"eval_steps_per_second": 6.441,
"step": 34100
},
{
"epoch": 0.5195889398201683,
"grad_norm": 2.500246047973633,
"learning_rate": 2.588628762541806e-06,
"loss": 5.0851,
"step": 34200
},
{
"epoch": 0.5195889398201683,
"eval_loss": 4.938769340515137,
"eval_runtime": 194.1164,
"eval_samples_per_second": 51.515,
"eval_steps_per_second": 6.439,
"step": 34200
},
{
"epoch": 0.5211082057260752,
"grad_norm": 2.7174882888793945,
"learning_rate": 2.5785953177257527e-06,
"loss": 5.0917,
"step": 34300
},
{
"epoch": 0.5211082057260752,
"eval_loss": 4.933419704437256,
"eval_runtime": 194.3056,
"eval_samples_per_second": 51.465,
"eval_steps_per_second": 6.433,
"step": 34300
},
{
"epoch": 0.5226274716319821,
"grad_norm": 3.255512237548828,
"learning_rate": 2.568561872909699e-06,
"loss": 5.0836,
"step": 34400
},
{
"epoch": 0.5226274716319821,
"eval_loss": 4.930218696594238,
"eval_runtime": 193.8112,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 34400
},
{
"epoch": 0.5241467375378891,
"grad_norm": 2.2356820106506348,
"learning_rate": 2.5585284280936454e-06,
"loss": 5.0815,
"step": 34500
},
{
"epoch": 0.5241467375378891,
"eval_loss": 4.932159423828125,
"eval_runtime": 193.6954,
"eval_samples_per_second": 51.627,
"eval_steps_per_second": 6.453,
"step": 34500
},
{
"epoch": 0.525666003443796,
"grad_norm": 2.4992058277130127,
"learning_rate": 2.548494983277592e-06,
"loss": 5.0844,
"step": 34600
},
{
"epoch": 0.525666003443796,
"eval_loss": 4.924154758453369,
"eval_runtime": 193.7231,
"eval_samples_per_second": 51.62,
"eval_steps_per_second": 6.453,
"step": 34600
},
{
"epoch": 0.5271852693497029,
"grad_norm": 2.348440647125244,
"learning_rate": 2.5384615384615385e-06,
"loss": 5.0789,
"step": 34700
},
{
"epoch": 0.5271852693497029,
"eval_loss": 4.925171852111816,
"eval_runtime": 193.7816,
"eval_samples_per_second": 51.604,
"eval_steps_per_second": 6.451,
"step": 34700
},
{
"epoch": 0.5287045352556098,
"grad_norm": 2.589172840118408,
"learning_rate": 2.528428093645485e-06,
"loss": 5.0708,
"step": 34800
},
{
"epoch": 0.5287045352556098,
"eval_loss": 4.919689178466797,
"eval_runtime": 193.7778,
"eval_samples_per_second": 51.605,
"eval_steps_per_second": 6.451,
"step": 34800
},
{
"epoch": 0.5302238011615168,
"grad_norm": 2.950510263442993,
"learning_rate": 2.5183946488294316e-06,
"loss": 5.0707,
"step": 34900
},
{
"epoch": 0.5302238011615168,
"eval_loss": 4.9157304763793945,
"eval_runtime": 193.5504,
"eval_samples_per_second": 51.666,
"eval_steps_per_second": 6.458,
"step": 34900
},
{
"epoch": 0.5317430670674237,
"grad_norm": 3.1693990230560303,
"learning_rate": 2.508361204013378e-06,
"loss": 5.0653,
"step": 35000
},
{
"epoch": 0.5317430670674237,
"eval_loss": 4.910171985626221,
"eval_runtime": 194.0296,
"eval_samples_per_second": 51.539,
"eval_steps_per_second": 6.442,
"step": 35000
},
{
"epoch": 0.5332623329733306,
"grad_norm": 2.8548085689544678,
"learning_rate": 2.4983277591973247e-06,
"loss": 5.0706,
"step": 35100
},
{
"epoch": 0.5332623329733306,
"eval_loss": 4.9105353355407715,
"eval_runtime": 193.9374,
"eval_samples_per_second": 51.563,
"eval_steps_per_second": 6.445,
"step": 35100
},
{
"epoch": 0.5347815988792376,
"grad_norm": 1.8217041492462158,
"learning_rate": 2.488294314381271e-06,
"loss": 5.064,
"step": 35200
},
{
"epoch": 0.5347815988792376,
"eval_loss": 4.906797885894775,
"eval_runtime": 194.1314,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 35200
},
{
"epoch": 0.5363008647851445,
"grad_norm": 2.089233875274658,
"learning_rate": 2.4782608695652173e-06,
"loss": 5.0612,
"step": 35300
},
{
"epoch": 0.5363008647851445,
"eval_loss": 4.904172420501709,
"eval_runtime": 194.1107,
"eval_samples_per_second": 51.517,
"eval_steps_per_second": 6.44,
"step": 35300
},
{
"epoch": 0.5378201306910514,
"grad_norm": 2.3225550651550293,
"learning_rate": 2.468227424749164e-06,
"loss": 5.06,
"step": 35400
},
{
"epoch": 0.5378201306910514,
"eval_loss": 4.904652118682861,
"eval_runtime": 194.1265,
"eval_samples_per_second": 51.513,
"eval_steps_per_second": 6.439,
"step": 35400
},
{
"epoch": 0.5393393965969583,
"grad_norm": 3.1568684577941895,
"learning_rate": 2.4581939799331104e-06,
"loss": 5.0579,
"step": 35500
},
{
"epoch": 0.5393393965969583,
"eval_loss": 4.90002965927124,
"eval_runtime": 194.1042,
"eval_samples_per_second": 51.519,
"eval_steps_per_second": 6.44,
"step": 35500
},
{
"epoch": 0.5408586625028653,
"grad_norm": 2.8267829418182373,
"learning_rate": 2.4481605351170568e-06,
"loss": 5.0568,
"step": 35600
},
{
"epoch": 0.5408586625028653,
"eval_loss": 4.89033842086792,
"eval_runtime": 194.0764,
"eval_samples_per_second": 51.526,
"eval_steps_per_second": 6.441,
"step": 35600
},
{
"epoch": 0.5423779284087722,
"grad_norm": 1.987886667251587,
"learning_rate": 2.4381270903010035e-06,
"loss": 5.0541,
"step": 35700
},
{
"epoch": 0.5423779284087722,
"eval_loss": 4.9011454582214355,
"eval_runtime": 194.1549,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.438,
"step": 35700
},
{
"epoch": 0.5438971943146791,
"grad_norm": 3.215435028076172,
"learning_rate": 2.42809364548495e-06,
"loss": 5.0506,
"step": 35800
},
{
"epoch": 0.5438971943146791,
"eval_loss": 4.890650272369385,
"eval_runtime": 194.1843,
"eval_samples_per_second": 51.497,
"eval_steps_per_second": 6.437,
"step": 35800
},
{
"epoch": 0.545416460220586,
"grad_norm": 1.9231488704681396,
"learning_rate": 2.4180602006688962e-06,
"loss": 5.0466,
"step": 35900
},
{
"epoch": 0.545416460220586,
"eval_loss": 4.890570163726807,
"eval_runtime": 194.206,
"eval_samples_per_second": 51.492,
"eval_steps_per_second": 6.436,
"step": 35900
},
{
"epoch": 0.546935726126493,
"grad_norm": 2.3541529178619385,
"learning_rate": 2.408026755852843e-06,
"loss": 5.0444,
"step": 36000
},
{
"epoch": 0.546935726126493,
"eval_loss": 4.887938022613525,
"eval_runtime": 194.1495,
"eval_samples_per_second": 51.507,
"eval_steps_per_second": 6.438,
"step": 36000
},
{
"epoch": 0.5484549920323999,
"grad_norm": 2.646209478378296,
"learning_rate": 2.3979933110367893e-06,
"loss": 5.0381,
"step": 36100
},
{
"epoch": 0.5484549920323999,
"eval_loss": 4.883460998535156,
"eval_runtime": 194.0814,
"eval_samples_per_second": 51.525,
"eval_steps_per_second": 6.441,
"step": 36100
},
{
"epoch": 0.5499742579383068,
"grad_norm": 2.2432219982147217,
"learning_rate": 2.387959866220736e-06,
"loss": 5.0363,
"step": 36200
},
{
"epoch": 0.5499742579383068,
"eval_loss": 4.881083011627197,
"eval_runtime": 194.1247,
"eval_samples_per_second": 51.513,
"eval_steps_per_second": 6.439,
"step": 36200
},
{
"epoch": 0.5514935238442137,
"grad_norm": 2.482103109359741,
"learning_rate": 2.3779264214046824e-06,
"loss": 5.0416,
"step": 36300
},
{
"epoch": 0.5514935238442137,
"eval_loss": 4.881221294403076,
"eval_runtime": 194.1059,
"eval_samples_per_second": 51.518,
"eval_steps_per_second": 6.44,
"step": 36300
},
{
"epoch": 0.5530127897501207,
"grad_norm": 2.0182697772979736,
"learning_rate": 2.3678929765886288e-06,
"loss": 5.0287,
"step": 36400
},
{
"epoch": 0.5530127897501207,
"eval_loss": 4.877261161804199,
"eval_runtime": 194.0469,
"eval_samples_per_second": 51.534,
"eval_steps_per_second": 6.442,
"step": 36400
},
{
"epoch": 0.5545320556560276,
"grad_norm": 3.02773380279541,
"learning_rate": 2.3578595317725755e-06,
"loss": 5.0328,
"step": 36500
},
{
"epoch": 0.5545320556560276,
"eval_loss": 4.869913101196289,
"eval_runtime": 194.0627,
"eval_samples_per_second": 51.53,
"eval_steps_per_second": 6.441,
"step": 36500
},
{
"epoch": 0.5560513215619345,
"grad_norm": 3.1895177364349365,
"learning_rate": 2.347826086956522e-06,
"loss": 5.0272,
"step": 36600
},
{
"epoch": 0.5560513215619345,
"eval_loss": 4.872635364532471,
"eval_runtime": 194.0337,
"eval_samples_per_second": 51.537,
"eval_steps_per_second": 6.442,
"step": 36600
},
{
"epoch": 0.5575705874678414,
"grad_norm": 2.474367141723633,
"learning_rate": 2.337792642140468e-06,
"loss": 5.0285,
"step": 36700
},
{
"epoch": 0.5575705874678414,
"eval_loss": 4.866065502166748,
"eval_runtime": 193.983,
"eval_samples_per_second": 51.551,
"eval_steps_per_second": 6.444,
"step": 36700
},
{
"epoch": 0.5590898533737484,
"grad_norm": 3.0734000205993652,
"learning_rate": 2.327759197324415e-06,
"loss": 5.0238,
"step": 36800
},
{
"epoch": 0.5590898533737484,
"eval_loss": 4.873917102813721,
"eval_runtime": 193.8114,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 36800
},
{
"epoch": 0.5606091192796553,
"grad_norm": 2.379478931427002,
"learning_rate": 2.3177257525083613e-06,
"loss": 5.0225,
"step": 36900
},
{
"epoch": 0.5606091192796553,
"eval_loss": 4.864801406860352,
"eval_runtime": 193.8549,
"eval_samples_per_second": 51.585,
"eval_steps_per_second": 6.448,
"step": 36900
},
{
"epoch": 0.5621283851855622,
"grad_norm": 2.6084952354431152,
"learning_rate": 2.307692307692308e-06,
"loss": 5.0177,
"step": 37000
},
{
"epoch": 0.5621283851855622,
"eval_loss": 4.863184452056885,
"eval_runtime": 193.8212,
"eval_samples_per_second": 51.594,
"eval_steps_per_second": 6.449,
"step": 37000
},
{
"epoch": 0.5636476510914691,
"grad_norm": 2.194261312484741,
"learning_rate": 2.2976588628762544e-06,
"loss": 5.0167,
"step": 37100
},
{
"epoch": 0.5636476510914691,
"eval_loss": 4.855440139770508,
"eval_runtime": 193.8552,
"eval_samples_per_second": 51.585,
"eval_steps_per_second": 6.448,
"step": 37100
},
{
"epoch": 0.5651669169973761,
"grad_norm": 2.195667028427124,
"learning_rate": 2.2876254180602008e-06,
"loss": 5.0148,
"step": 37200
},
{
"epoch": 0.5651669169973761,
"eval_loss": 4.857753753662109,
"eval_runtime": 193.61,
"eval_samples_per_second": 51.65,
"eval_steps_per_second": 6.456,
"step": 37200
},
{
"epoch": 0.566686182903283,
"grad_norm": 2.308091402053833,
"learning_rate": 2.2775919732441475e-06,
"loss": 5.0152,
"step": 37300
},
{
"epoch": 0.566686182903283,
"eval_loss": 4.850945949554443,
"eval_runtime": 193.675,
"eval_samples_per_second": 51.633,
"eval_steps_per_second": 6.454,
"step": 37300
},
{
"epoch": 0.5682054488091899,
"grad_norm": 1.5866217613220215,
"learning_rate": 2.267558528428094e-06,
"loss": 5.0086,
"step": 37400
},
{
"epoch": 0.5682054488091899,
"eval_loss": 4.856834411621094,
"eval_runtime": 194.0621,
"eval_samples_per_second": 51.53,
"eval_steps_per_second": 6.441,
"step": 37400
},
{
"epoch": 0.5697247147150968,
"grad_norm": 2.3778269290924072,
"learning_rate": 2.25752508361204e-06,
"loss": 5.008,
"step": 37500
},
{
"epoch": 0.5697247147150968,
"eval_loss": 4.849526405334473,
"eval_runtime": 193.9807,
"eval_samples_per_second": 51.552,
"eval_steps_per_second": 6.444,
"step": 37500
},
{
"epoch": 0.5712439806210038,
"grad_norm": 2.434232234954834,
"learning_rate": 2.2474916387959865e-06,
"loss": 5.0049,
"step": 37600
},
{
"epoch": 0.5712439806210038,
"eval_loss": 4.849723815917969,
"eval_runtime": 194.1152,
"eval_samples_per_second": 51.516,
"eval_steps_per_second": 6.439,
"step": 37600
},
{
"epoch": 0.5727632465269107,
"grad_norm": 1.9899414777755737,
"learning_rate": 2.237458193979933e-06,
"loss": 5.0034,
"step": 37700
},
{
"epoch": 0.5727632465269107,
"eval_loss": 4.845240592956543,
"eval_runtime": 194.1331,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 37700
},
{
"epoch": 0.5742825124328176,
"grad_norm": 2.168919086456299,
"learning_rate": 2.2274247491638796e-06,
"loss": 4.9989,
"step": 37800
},
{
"epoch": 0.5742825124328176,
"eval_loss": 4.840480327606201,
"eval_runtime": 194.2185,
"eval_samples_per_second": 51.488,
"eval_steps_per_second": 6.436,
"step": 37800
},
{
"epoch": 0.5758017783387245,
"grad_norm": 2.4156546592712402,
"learning_rate": 2.217391304347826e-06,
"loss": 4.9981,
"step": 37900
},
{
"epoch": 0.5758017783387245,
"eval_loss": 4.837850570678711,
"eval_runtime": 194.2958,
"eval_samples_per_second": 51.468,
"eval_steps_per_second": 6.433,
"step": 37900
},
{
"epoch": 0.5773210442446315,
"grad_norm": 2.725648880004883,
"learning_rate": 2.2073578595317723e-06,
"loss": 4.9999,
"step": 38000
},
{
"epoch": 0.5773210442446315,
"eval_loss": 4.840028285980225,
"eval_runtime": 194.1552,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.438,
"step": 38000
},
{
"epoch": 0.5788403101505384,
"grad_norm": 2.447983503341675,
"learning_rate": 2.197324414715719e-06,
"loss": 4.9909,
"step": 38100
},
{
"epoch": 0.5788403101505384,
"eval_loss": 4.840633392333984,
"eval_runtime": 200.6709,
"eval_samples_per_second": 49.833,
"eval_steps_per_second": 6.229,
"step": 38100
},
{
"epoch": 0.5803595760564453,
"grad_norm": 2.5275213718414307,
"learning_rate": 2.1872909698996654e-06,
"loss": 4.9924,
"step": 38200
},
{
"epoch": 0.5803595760564453,
"eval_loss": 4.838108539581299,
"eval_runtime": 194.1742,
"eval_samples_per_second": 51.5,
"eval_steps_per_second": 6.438,
"step": 38200
},
{
"epoch": 0.5818788419623522,
"grad_norm": 3.0799427032470703,
"learning_rate": 2.177257525083612e-06,
"loss": 4.9892,
"step": 38300
},
{
"epoch": 0.5818788419623522,
"eval_loss": 4.830769062042236,
"eval_runtime": 194.1344,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 38300
},
{
"epoch": 0.5833981078682592,
"grad_norm": 2.262266159057617,
"learning_rate": 2.1672240802675585e-06,
"loss": 4.9869,
"step": 38400
},
{
"epoch": 0.5833981078682592,
"eval_loss": 4.82758903503418,
"eval_runtime": 194.1511,
"eval_samples_per_second": 51.506,
"eval_steps_per_second": 6.438,
"step": 38400
},
{
"epoch": 0.5849173737741661,
"grad_norm": 2.2345926761627197,
"learning_rate": 2.157190635451505e-06,
"loss": 4.9879,
"step": 38500
},
{
"epoch": 0.5849173737741661,
"eval_loss": 4.826181888580322,
"eval_runtime": 194.2128,
"eval_samples_per_second": 51.49,
"eval_steps_per_second": 6.436,
"step": 38500
},
{
"epoch": 0.586436639680073,
"grad_norm": 1.8996378183364868,
"learning_rate": 2.1471571906354516e-06,
"loss": 4.9854,
"step": 38600
},
{
"epoch": 0.586436639680073,
"eval_loss": 4.823826789855957,
"eval_runtime": 193.9959,
"eval_samples_per_second": 51.547,
"eval_steps_per_second": 6.443,
"step": 38600
},
{
"epoch": 0.5879559055859799,
"grad_norm": 2.0965209007263184,
"learning_rate": 2.137123745819398e-06,
"loss": 4.9833,
"step": 38700
},
{
"epoch": 0.5879559055859799,
"eval_loss": 4.819667816162109,
"eval_runtime": 194.1201,
"eval_samples_per_second": 51.515,
"eval_steps_per_second": 6.439,
"step": 38700
},
{
"epoch": 0.5894751714918869,
"grad_norm": 2.005686044692993,
"learning_rate": 2.1270903010033443e-06,
"loss": 4.9753,
"step": 38800
},
{
"epoch": 0.5894751714918869,
"eval_loss": 4.818215847015381,
"eval_runtime": 194.3711,
"eval_samples_per_second": 51.448,
"eval_steps_per_second": 6.431,
"step": 38800
},
{
"epoch": 0.5909944373977938,
"grad_norm": 2.056711196899414,
"learning_rate": 2.117056856187291e-06,
"loss": 4.9729,
"step": 38900
},
{
"epoch": 0.5909944373977938,
"eval_loss": 4.815535068511963,
"eval_runtime": 194.0241,
"eval_samples_per_second": 51.54,
"eval_steps_per_second": 6.442,
"step": 38900
},
{
"epoch": 0.5925137033037007,
"grad_norm": 2.186563730239868,
"learning_rate": 2.1070234113712374e-06,
"loss": 4.9738,
"step": 39000
},
{
"epoch": 0.5925137033037007,
"eval_loss": 4.811450958251953,
"eval_runtime": 193.7645,
"eval_samples_per_second": 51.609,
"eval_steps_per_second": 6.451,
"step": 39000
},
{
"epoch": 0.5940329692096076,
"grad_norm": 2.0862069129943848,
"learning_rate": 2.0969899665551837e-06,
"loss": 4.9714,
"step": 39100
},
{
"epoch": 0.5940329692096076,
"eval_loss": 4.812065601348877,
"eval_runtime": 193.9277,
"eval_samples_per_second": 51.566,
"eval_steps_per_second": 6.446,
"step": 39100
},
{
"epoch": 0.5955522351155146,
"grad_norm": 2.3990869522094727,
"learning_rate": 2.0869565217391305e-06,
"loss": 4.9745,
"step": 39200
},
{
"epoch": 0.5955522351155146,
"eval_loss": 4.809053421020508,
"eval_runtime": 194.0213,
"eval_samples_per_second": 51.541,
"eval_steps_per_second": 6.443,
"step": 39200
},
{
"epoch": 0.5970715010214215,
"grad_norm": 2.380688428878784,
"learning_rate": 2.076923076923077e-06,
"loss": 4.9709,
"step": 39300
},
{
"epoch": 0.5970715010214215,
"eval_loss": 4.810598373413086,
"eval_runtime": 193.7901,
"eval_samples_per_second": 51.602,
"eval_steps_per_second": 6.45,
"step": 39300
},
{
"epoch": 0.5985907669273284,
"grad_norm": 2.6398425102233887,
"learning_rate": 2.0668896321070236e-06,
"loss": 4.967,
"step": 39400
},
{
"epoch": 0.5985907669273284,
"eval_loss": 4.807140827178955,
"eval_runtime": 193.6791,
"eval_samples_per_second": 51.632,
"eval_steps_per_second": 6.454,
"step": 39400
},
{
"epoch": 0.6001100328332353,
"grad_norm": 2.365203619003296,
"learning_rate": 2.05685618729097e-06,
"loss": 4.9623,
"step": 39500
},
{
"epoch": 0.6001100328332353,
"eval_loss": 4.804749011993408,
"eval_runtime": 193.728,
"eval_samples_per_second": 51.619,
"eval_steps_per_second": 6.452,
"step": 39500
},
{
"epoch": 0.6016292987391423,
"grad_norm": 2.6509780883789062,
"learning_rate": 2.0468227424749163e-06,
"loss": 4.963,
"step": 39600
},
{
"epoch": 0.6016292987391423,
"eval_loss": 4.8039093017578125,
"eval_runtime": 194.0108,
"eval_samples_per_second": 51.544,
"eval_steps_per_second": 6.443,
"step": 39600
},
{
"epoch": 0.6031485646450492,
"grad_norm": 2.182466506958008,
"learning_rate": 2.036789297658863e-06,
"loss": 4.9585,
"step": 39700
},
{
"epoch": 0.6031485646450492,
"eval_loss": 4.798705577850342,
"eval_runtime": 194.1051,
"eval_samples_per_second": 51.518,
"eval_steps_per_second": 6.44,
"step": 39700
},
{
"epoch": 0.6046678305509561,
"grad_norm": 1.9312145709991455,
"learning_rate": 2.0267558528428094e-06,
"loss": 4.9604,
"step": 39800
},
{
"epoch": 0.6046678305509561,
"eval_loss": 4.799111843109131,
"eval_runtime": 194.0025,
"eval_samples_per_second": 51.546,
"eval_steps_per_second": 6.443,
"step": 39800
},
{
"epoch": 0.606187096456863,
"grad_norm": 2.0514976978302,
"learning_rate": 2.0167224080267557e-06,
"loss": 4.9551,
"step": 39900
},
{
"epoch": 0.606187096456863,
"eval_loss": 4.792530536651611,
"eval_runtime": 194.0231,
"eval_samples_per_second": 51.54,
"eval_steps_per_second": 6.443,
"step": 39900
},
{
"epoch": 0.60770636236277,
"grad_norm": 2.4416747093200684,
"learning_rate": 2.0066889632107025e-06,
"loss": 4.9522,
"step": 40000
},
{
"epoch": 0.60770636236277,
"eval_loss": 4.7944655418396,
"eval_runtime": 194.2247,
"eval_samples_per_second": 51.487,
"eval_steps_per_second": 6.436,
"step": 40000
},
{
"epoch": 0.6092256282686769,
"grad_norm": 2.400484561920166,
"learning_rate": 1.996655518394649e-06,
"loss": 4.9543,
"step": 40100
},
{
"epoch": 0.6092256282686769,
"eval_loss": 4.793302059173584,
"eval_runtime": 194.2542,
"eval_samples_per_second": 51.479,
"eval_steps_per_second": 6.435,
"step": 40100
},
{
"epoch": 0.6107448941745838,
"grad_norm": 1.9967873096466064,
"learning_rate": 1.986622073578595e-06,
"loss": 4.9507,
"step": 40200
},
{
"epoch": 0.6107448941745838,
"eval_loss": 4.793440818786621,
"eval_runtime": 194.3425,
"eval_samples_per_second": 51.456,
"eval_steps_per_second": 6.432,
"step": 40200
},
{
"epoch": 0.6122641600804907,
"grad_norm": 1.917490839958191,
"learning_rate": 1.976588628762542e-06,
"loss": 4.9505,
"step": 40300
},
{
"epoch": 0.6122641600804907,
"eval_loss": 4.786988258361816,
"eval_runtime": 194.1703,
"eval_samples_per_second": 51.501,
"eval_steps_per_second": 6.438,
"step": 40300
},
{
"epoch": 0.6137834259863977,
"grad_norm": 2.4164531230926514,
"learning_rate": 1.9665551839464883e-06,
"loss": 4.9423,
"step": 40400
},
{
"epoch": 0.6137834259863977,
"eval_loss": 4.786272048950195,
"eval_runtime": 194.6058,
"eval_samples_per_second": 51.386,
"eval_steps_per_second": 6.423,
"step": 40400
},
{
"epoch": 0.6153026918923046,
"grad_norm": 2.5412399768829346,
"learning_rate": 1.956521739130435e-06,
"loss": 4.9447,
"step": 40500
},
{
"epoch": 0.6153026918923046,
"eval_loss": 4.785825729370117,
"eval_runtime": 194.2484,
"eval_samples_per_second": 51.48,
"eval_steps_per_second": 6.435,
"step": 40500
},
{
"epoch": 0.6168219577982115,
"grad_norm": 2.2212436199188232,
"learning_rate": 1.9464882943143814e-06,
"loss": 4.9432,
"step": 40600
},
{
"epoch": 0.6168219577982115,
"eval_loss": 4.7824625968933105,
"eval_runtime": 194.1967,
"eval_samples_per_second": 51.494,
"eval_steps_per_second": 6.437,
"step": 40600
},
{
"epoch": 0.6183412237041184,
"grad_norm": 2.1287331581115723,
"learning_rate": 1.9364548494983277e-06,
"loss": 4.9416,
"step": 40700
},
{
"epoch": 0.6183412237041184,
"eval_loss": 4.776528358459473,
"eval_runtime": 194.1119,
"eval_samples_per_second": 51.517,
"eval_steps_per_second": 6.44,
"step": 40700
},
{
"epoch": 0.6198604896100254,
"grad_norm": 1.8793989419937134,
"learning_rate": 1.9264214046822745e-06,
"loss": 4.9357,
"step": 40800
},
{
"epoch": 0.6198604896100254,
"eval_loss": 4.779613494873047,
"eval_runtime": 194.0243,
"eval_samples_per_second": 51.54,
"eval_steps_per_second": 6.442,
"step": 40800
},
{
"epoch": 0.6213797555159323,
"grad_norm": 1.943474531173706,
"learning_rate": 1.916387959866221e-06,
"loss": 4.9389,
"step": 40900
},
{
"epoch": 0.6213797555159323,
"eval_loss": 4.774796009063721,
"eval_runtime": 194.066,
"eval_samples_per_second": 51.529,
"eval_steps_per_second": 6.441,
"step": 40900
},
{
"epoch": 0.6228990214218392,
"grad_norm": 2.138035774230957,
"learning_rate": 1.9063545150501674e-06,
"loss": 4.9344,
"step": 41000
},
{
"epoch": 0.6228990214218392,
"eval_loss": 4.774413108825684,
"eval_runtime": 194.289,
"eval_samples_per_second": 51.47,
"eval_steps_per_second": 6.434,
"step": 41000
},
{
"epoch": 0.6244182873277461,
"grad_norm": 2.1911377906799316,
"learning_rate": 1.896321070234114e-06,
"loss": 4.9307,
"step": 41100
},
{
"epoch": 0.6244182873277461,
"eval_loss": 4.7724833488464355,
"eval_runtime": 194.0879,
"eval_samples_per_second": 51.523,
"eval_steps_per_second": 6.44,
"step": 41100
},
{
"epoch": 0.6259375532336531,
"grad_norm": 2.186774730682373,
"learning_rate": 1.8862876254180603e-06,
"loss": 4.9316,
"step": 41200
},
{
"epoch": 0.6259375532336531,
"eval_loss": 4.7727203369140625,
"eval_runtime": 193.9834,
"eval_samples_per_second": 51.551,
"eval_steps_per_second": 6.444,
"step": 41200
},
{
"epoch": 0.62745681913956,
"grad_norm": 2.706834554672241,
"learning_rate": 1.8762541806020068e-06,
"loss": 4.9244,
"step": 41300
},
{
"epoch": 0.62745681913956,
"eval_loss": 4.769220352172852,
"eval_runtime": 193.833,
"eval_samples_per_second": 51.591,
"eval_steps_per_second": 6.449,
"step": 41300
},
{
"epoch": 0.6289760850454669,
"grad_norm": 2.0782527923583984,
"learning_rate": 1.8662207357859534e-06,
"loss": 4.9308,
"step": 41400
},
{
"epoch": 0.6289760850454669,
"eval_loss": 4.769233703613281,
"eval_runtime": 193.7092,
"eval_samples_per_second": 51.624,
"eval_steps_per_second": 6.453,
"step": 41400
},
{
"epoch": 0.6304953509513738,
"grad_norm": 2.107680559158325,
"learning_rate": 1.8561872909699e-06,
"loss": 4.9286,
"step": 41500
},
{
"epoch": 0.6304953509513738,
"eval_loss": 4.765684604644775,
"eval_runtime": 193.8101,
"eval_samples_per_second": 51.597,
"eval_steps_per_second": 6.45,
"step": 41500
},
{
"epoch": 0.6320146168572808,
"grad_norm": 1.861700177192688,
"learning_rate": 1.8461538461538462e-06,
"loss": 4.925,
"step": 41600
},
{
"epoch": 0.6320146168572808,
"eval_loss": 4.761124134063721,
"eval_runtime": 194.0209,
"eval_samples_per_second": 51.541,
"eval_steps_per_second": 6.443,
"step": 41600
},
{
"epoch": 0.6335338827631877,
"grad_norm": 2.256538152694702,
"learning_rate": 1.8361204013377928e-06,
"loss": 4.9214,
"step": 41700
},
{
"epoch": 0.6335338827631877,
"eval_loss": 4.761186122894287,
"eval_runtime": 193.8553,
"eval_samples_per_second": 51.585,
"eval_steps_per_second": 6.448,
"step": 41700
},
{
"epoch": 0.6350531486690946,
"grad_norm": 1.720786213874817,
"learning_rate": 1.8260869565217394e-06,
"loss": 4.9188,
"step": 41800
},
{
"epoch": 0.6350531486690946,
"eval_loss": 4.75638484954834,
"eval_runtime": 194.0169,
"eval_samples_per_second": 51.542,
"eval_steps_per_second": 6.443,
"step": 41800
},
{
"epoch": 0.6365724145750015,
"grad_norm": 1.9223599433898926,
"learning_rate": 1.8160535117056857e-06,
"loss": 4.9162,
"step": 41900
},
{
"epoch": 0.6365724145750015,
"eval_loss": 4.757732391357422,
"eval_runtime": 194.1596,
"eval_samples_per_second": 51.504,
"eval_steps_per_second": 6.438,
"step": 41900
},
{
"epoch": 0.6380916804809085,
"grad_norm": 1.7804436683654785,
"learning_rate": 1.8060200668896322e-06,
"loss": 4.9158,
"step": 42000
},
{
"epoch": 0.6380916804809085,
"eval_loss": 4.757546424865723,
"eval_runtime": 194.2282,
"eval_samples_per_second": 51.486,
"eval_steps_per_second": 6.436,
"step": 42000
},
{
"epoch": 0.6396109463868154,
"grad_norm": 1.9580631256103516,
"learning_rate": 1.7959866220735788e-06,
"loss": 4.9095,
"step": 42100
},
{
"epoch": 0.6396109463868154,
"eval_loss": 4.752386093139648,
"eval_runtime": 194.1745,
"eval_samples_per_second": 51.5,
"eval_steps_per_second": 6.438,
"step": 42100
},
{
"epoch": 0.6411302122927223,
"grad_norm": 2.1417272090911865,
"learning_rate": 1.7859531772575253e-06,
"loss": 4.9134,
"step": 42200
},
{
"epoch": 0.6411302122927223,
"eval_loss": 4.749510765075684,
"eval_runtime": 194.1938,
"eval_samples_per_second": 51.495,
"eval_steps_per_second": 6.437,
"step": 42200
},
{
"epoch": 0.6426494781986292,
"grad_norm": 2.4839389324188232,
"learning_rate": 1.7759197324414717e-06,
"loss": 4.9116,
"step": 42300
},
{
"epoch": 0.6426494781986292,
"eval_loss": 4.752679824829102,
"eval_runtime": 194.1618,
"eval_samples_per_second": 51.503,
"eval_steps_per_second": 6.438,
"step": 42300
},
{
"epoch": 0.6441687441045362,
"grad_norm": 2.5596067905426025,
"learning_rate": 1.7658862876254182e-06,
"loss": 4.9078,
"step": 42400
},
{
"epoch": 0.6441687441045362,
"eval_loss": 4.742520332336426,
"eval_runtime": 194.1697,
"eval_samples_per_second": 51.501,
"eval_steps_per_second": 6.438,
"step": 42400
},
{
"epoch": 0.6456880100104431,
"grad_norm": 1.7020114660263062,
"learning_rate": 1.7558528428093648e-06,
"loss": 4.9063,
"step": 42500
},
{
"epoch": 0.6456880100104431,
"eval_loss": 4.745018005371094,
"eval_runtime": 194.2677,
"eval_samples_per_second": 51.475,
"eval_steps_per_second": 6.434,
"step": 42500
},
{
"epoch": 0.64720727591635,
"grad_norm": 1.83507239818573,
"learning_rate": 1.745819397993311e-06,
"loss": 4.9067,
"step": 42600
},
{
"epoch": 0.64720727591635,
"eval_loss": 4.749469757080078,
"eval_runtime": 194.1266,
"eval_samples_per_second": 51.513,
"eval_steps_per_second": 6.439,
"step": 42600
},
{
"epoch": 0.6487265418222569,
"grad_norm": 1.7852286100387573,
"learning_rate": 1.7357859531772575e-06,
"loss": 4.909,
"step": 42700
},
{
"epoch": 0.6487265418222569,
"eval_loss": 4.74142599105835,
"eval_runtime": 194.3029,
"eval_samples_per_second": 51.466,
"eval_steps_per_second": 6.433,
"step": 42700
},
{
"epoch": 0.6502458077281639,
"grad_norm": 1.9356688261032104,
"learning_rate": 1.7257525083612038e-06,
"loss": 4.8968,
"step": 42800
},
{
"epoch": 0.6502458077281639,
"eval_loss": 4.742361068725586,
"eval_runtime": 194.1912,
"eval_samples_per_second": 51.496,
"eval_steps_per_second": 6.437,
"step": 42800
},
{
"epoch": 0.6517650736340708,
"grad_norm": 2.4372880458831787,
"learning_rate": 1.7157190635451504e-06,
"loss": 4.9034,
"step": 42900
},
{
"epoch": 0.6517650736340708,
"eval_loss": 4.737247467041016,
"eval_runtime": 194.1333,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 42900
},
{
"epoch": 0.6532843395399777,
"grad_norm": 1.9416236877441406,
"learning_rate": 1.705685618729097e-06,
"loss": 4.8978,
"step": 43000
},
{
"epoch": 0.6532843395399777,
"eval_loss": 4.7349853515625,
"eval_runtime": 194.1706,
"eval_samples_per_second": 51.501,
"eval_steps_per_second": 6.438,
"step": 43000
},
{
"epoch": 0.6548036054458846,
"grad_norm": 2.3514084815979004,
"learning_rate": 1.6956521739130435e-06,
"loss": 4.8963,
"step": 43100
},
{
"epoch": 0.6548036054458846,
"eval_loss": 4.7388434410095215,
"eval_runtime": 194.3316,
"eval_samples_per_second": 51.458,
"eval_steps_per_second": 6.432,
"step": 43100
},
{
"epoch": 0.6563228713517916,
"grad_norm": 2.028310537338257,
"learning_rate": 1.6856187290969898e-06,
"loss": 4.8961,
"step": 43200
},
{
"epoch": 0.6563228713517916,
"eval_loss": 4.735996723175049,
"eval_runtime": 194.0261,
"eval_samples_per_second": 51.539,
"eval_steps_per_second": 6.442,
"step": 43200
},
{
"epoch": 0.6578421372576985,
"grad_norm": 2.360321521759033,
"learning_rate": 1.6755852842809363e-06,
"loss": 4.8892,
"step": 43300
},
{
"epoch": 0.6578421372576985,
"eval_loss": 4.731908798217773,
"eval_runtime": 194.2382,
"eval_samples_per_second": 51.483,
"eval_steps_per_second": 6.435,
"step": 43300
},
{
"epoch": 0.6593614031636054,
"grad_norm": 2.0614426136016846,
"learning_rate": 1.665551839464883e-06,
"loss": 4.8911,
"step": 43400
},
{
"epoch": 0.6593614031636054,
"eval_loss": 4.727632999420166,
"eval_runtime": 194.0495,
"eval_samples_per_second": 51.533,
"eval_steps_per_second": 6.442,
"step": 43400
},
{
"epoch": 0.6608806690695123,
"grad_norm": 2.058509349822998,
"learning_rate": 1.6555183946488294e-06,
"loss": 4.8883,
"step": 43500
},
{
"epoch": 0.6608806690695123,
"eval_loss": 4.72844123840332,
"eval_runtime": 194.0456,
"eval_samples_per_second": 51.534,
"eval_steps_per_second": 6.442,
"step": 43500
},
{
"epoch": 0.6623999349754193,
"grad_norm": 1.7108250856399536,
"learning_rate": 1.6454849498327758e-06,
"loss": 4.8866,
"step": 43600
},
{
"epoch": 0.6623999349754193,
"eval_loss": 4.726889133453369,
"eval_runtime": 193.8998,
"eval_samples_per_second": 51.573,
"eval_steps_per_second": 6.447,
"step": 43600
},
{
"epoch": 0.6639192008813262,
"grad_norm": 1.871711254119873,
"learning_rate": 1.6354515050167223e-06,
"loss": 4.888,
"step": 43700
},
{
"epoch": 0.6639192008813262,
"eval_loss": 4.726442813873291,
"eval_runtime": 193.895,
"eval_samples_per_second": 51.574,
"eval_steps_per_second": 6.447,
"step": 43700
},
{
"epoch": 0.6654384667872331,
"grad_norm": 1.9516098499298096,
"learning_rate": 1.6254180602006689e-06,
"loss": 4.887,
"step": 43800
},
{
"epoch": 0.6654384667872331,
"eval_loss": 4.72707986831665,
"eval_runtime": 193.7412,
"eval_samples_per_second": 51.615,
"eval_steps_per_second": 6.452,
"step": 43800
},
{
"epoch": 0.66695773269314,
"grad_norm": 1.870690107345581,
"learning_rate": 1.6153846153846154e-06,
"loss": 4.8794,
"step": 43900
},
{
"epoch": 0.66695773269314,
"eval_loss": 4.7214789390563965,
"eval_runtime": 193.7498,
"eval_samples_per_second": 51.613,
"eval_steps_per_second": 6.452,
"step": 43900
},
{
"epoch": 0.668476998599047,
"grad_norm": 1.8577009439468384,
"learning_rate": 1.6053511705685618e-06,
"loss": 4.8803,
"step": 44000
},
{
"epoch": 0.668476998599047,
"eval_loss": 4.719671726226807,
"eval_runtime": 193.9858,
"eval_samples_per_second": 51.55,
"eval_steps_per_second": 6.444,
"step": 44000
},
{
"epoch": 0.6699962645049539,
"grad_norm": 2.1134140491485596,
"learning_rate": 1.5953177257525083e-06,
"loss": 4.879,
"step": 44100
},
{
"epoch": 0.6699962645049539,
"eval_loss": 4.717536926269531,
"eval_runtime": 193.7322,
"eval_samples_per_second": 51.618,
"eval_steps_per_second": 6.452,
"step": 44100
},
{
"epoch": 0.6715155304108608,
"grad_norm": 2.10524845123291,
"learning_rate": 1.5852842809364549e-06,
"loss": 4.8782,
"step": 44200
},
{
"epoch": 0.6715155304108608,
"eval_loss": 4.712420463562012,
"eval_runtime": 194.1278,
"eval_samples_per_second": 51.512,
"eval_steps_per_second": 6.439,
"step": 44200
},
{
"epoch": 0.6730347963167677,
"grad_norm": 1.9747872352600098,
"learning_rate": 1.5752508361204012e-06,
"loss": 4.8782,
"step": 44300
},
{
"epoch": 0.6730347963167677,
"eval_loss": 4.716573238372803,
"eval_runtime": 194.2422,
"eval_samples_per_second": 51.482,
"eval_steps_per_second": 6.435,
"step": 44300
},
{
"epoch": 0.6745540622226747,
"grad_norm": 1.9124640226364136,
"learning_rate": 1.5652173913043478e-06,
"loss": 4.8808,
"step": 44400
},
{
"epoch": 0.6745540622226747,
"eval_loss": 4.715909481048584,
"eval_runtime": 199.7142,
"eval_samples_per_second": 50.072,
"eval_steps_per_second": 6.259,
"step": 44400
},
{
"epoch": 0.6760733281285816,
"grad_norm": 1.971144676208496,
"learning_rate": 1.5551839464882943e-06,
"loss": 4.8739,
"step": 44500
},
{
"epoch": 0.6760733281285816,
"eval_loss": 4.714458465576172,
"eval_runtime": 194.1832,
"eval_samples_per_second": 51.498,
"eval_steps_per_second": 6.437,
"step": 44500
},
{
"epoch": 0.6775925940344885,
"grad_norm": 2.0993101596832275,
"learning_rate": 1.5451505016722409e-06,
"loss": 4.8733,
"step": 44600
},
{
"epoch": 0.6775925940344885,
"eval_loss": 4.708896636962891,
"eval_runtime": 194.1323,
"eval_samples_per_second": 51.511,
"eval_steps_per_second": 6.439,
"step": 44600
},
{
"epoch": 0.6791118599403954,
"grad_norm": 1.5517523288726807,
"learning_rate": 1.5351170568561872e-06,
"loss": 4.877,
"step": 44700
},
{
"epoch": 0.6791118599403954,
"eval_loss": 4.706016540527344,
"eval_runtime": 194.2224,
"eval_samples_per_second": 51.487,
"eval_steps_per_second": 6.436,
"step": 44700
},
{
"epoch": 0.6806311258463024,
"grad_norm": 1.6051702499389648,
"learning_rate": 1.5250836120401338e-06,
"loss": 4.873,
"step": 44800
},
{
"epoch": 0.6806311258463024,
"eval_loss": 4.71004581451416,
"eval_runtime": 194.3369,
"eval_samples_per_second": 51.457,
"eval_steps_per_second": 6.432,
"step": 44800
},
{
"epoch": 0.6821503917522093,
"grad_norm": 1.8578929901123047,
"learning_rate": 1.5150501672240803e-06,
"loss": 4.8645,
"step": 44900
},
{
"epoch": 0.6821503917522093,
"eval_loss": 4.7041826248168945,
"eval_runtime": 194.6352,
"eval_samples_per_second": 51.378,
"eval_steps_per_second": 6.422,
"step": 44900
},
{
"epoch": 0.6836696576581162,
"grad_norm": 1.8288882970809937,
"learning_rate": 1.5050167224080269e-06,
"loss": 4.8717,
"step": 45000
},
{
"epoch": 0.6836696576581162,
"eval_loss": 4.704262733459473,
"eval_runtime": 194.5807,
"eval_samples_per_second": 51.393,
"eval_steps_per_second": 6.424,
"step": 45000
},
{
"epoch": 0.6851889235640231,
"grad_norm": 1.766317367553711,
"learning_rate": 1.4949832775919732e-06,
"loss": 4.8658,
"step": 45100
},
{
"epoch": 0.6851889235640231,
"eval_loss": 4.700209140777588,
"eval_runtime": 194.2902,
"eval_samples_per_second": 51.469,
"eval_steps_per_second": 6.434,
"step": 45100
},
{
"epoch": 0.6867081894699301,
"grad_norm": 2.1722605228424072,
"learning_rate": 1.4849498327759198e-06,
"loss": 4.868,
"step": 45200
},
{
"epoch": 0.6867081894699301,
"eval_loss": 4.7045111656188965,
"eval_runtime": 194.3122,
"eval_samples_per_second": 51.464,
"eval_steps_per_second": 6.433,
"step": 45200
},
{
"epoch": 0.688227455375837,
"grad_norm": 2.2012276649475098,
"learning_rate": 1.4749163879598663e-06,
"loss": 4.861,
"step": 45300
},
{
"epoch": 0.688227455375837,
"eval_loss": 4.699077606201172,
"eval_runtime": 194.099,
"eval_samples_per_second": 51.52,
"eval_steps_per_second": 6.44,
"step": 45300
},
{
"epoch": 0.6897467212817439,
"grad_norm": 1.9373100996017456,
"learning_rate": 1.4648829431438129e-06,
"loss": 4.8624,
"step": 45400
},
{
"epoch": 0.6897467212817439,
"eval_loss": 4.699510097503662,
"eval_runtime": 194.2648,
"eval_samples_per_second": 51.476,
"eval_steps_per_second": 6.435,
"step": 45400
},
{
"epoch": 0.6912659871876508,
"grad_norm": 1.5436214208602905,
"learning_rate": 1.4548494983277592e-06,
"loss": 4.8669,
"step": 45500
},
{
"epoch": 0.6912659871876508,
"eval_loss": 4.6950531005859375,
"eval_runtime": 194.1491,
"eval_samples_per_second": 51.507,
"eval_steps_per_second": 6.438,
"step": 45500
},
{
"epoch": 0.6927852530935578,
"grad_norm": 1.868397831916809,
"learning_rate": 1.4448160535117058e-06,
"loss": 4.8588,
"step": 45600
},
{
"epoch": 0.6927852530935578,
"eval_loss": 4.699548244476318,
"eval_runtime": 194.2333,
"eval_samples_per_second": 51.484,
"eval_steps_per_second": 6.436,
"step": 45600
},
{
"epoch": 0.6943045189994647,
"grad_norm": 1.9601666927337646,
"learning_rate": 1.4347826086956523e-06,
"loss": 4.8583,
"step": 45700
},
{
"epoch": 0.6943045189994647,
"eval_loss": 4.697216510772705,
"eval_runtime": 194.113,
"eval_samples_per_second": 51.516,
"eval_steps_per_second": 6.44,
"step": 45700
},
{
"epoch": 0.6958237849053716,
"grad_norm": 2.128359317779541,
"learning_rate": 1.4247491638795989e-06,
"loss": 4.8553,
"step": 45800
},
{
"epoch": 0.6958237849053716,
"eval_loss": 4.695890426635742,
"eval_runtime": 194.2141,
"eval_samples_per_second": 51.49,
"eval_steps_per_second": 6.436,
"step": 45800
},
{
"epoch": 0.6973430508112785,
"grad_norm": 1.7737051248550415,
"learning_rate": 1.4147157190635452e-06,
"loss": 4.8552,
"step": 45900
},
{
"epoch": 0.6973430508112785,
"eval_loss": 4.692898273468018,
"eval_runtime": 194.0574,
"eval_samples_per_second": 51.531,
"eval_steps_per_second": 6.441,
"step": 45900
},
{
"epoch": 0.6988623167171855,
"grad_norm": 1.8772127628326416,
"learning_rate": 1.4046822742474917e-06,
"loss": 4.8528,
"step": 46000
},
{
"epoch": 0.6988623167171855,
"eval_loss": 4.690573215484619,
"eval_runtime": 193.9137,
"eval_samples_per_second": 51.569,
"eval_steps_per_second": 6.446,
"step": 46000
},
{
"epoch": 0.7003815826230924,
"grad_norm": 1.9277006387710571,
"learning_rate": 1.3946488294314383e-06,
"loss": 4.851,
"step": 46100
},
{
"epoch": 0.7003815826230924,
"eval_loss": 4.688443183898926,
"eval_runtime": 193.7729,
"eval_samples_per_second": 51.607,
"eval_steps_per_second": 6.451,
"step": 46100
},
{
"epoch": 0.7019008485289993,
"grad_norm": 1.4775947332382202,
"learning_rate": 1.3846153846153846e-06,
"loss": 4.8477,
"step": 46200
},
{
"epoch": 0.7019008485289993,
"eval_loss": 4.689602375030518,
"eval_runtime": 193.799,
"eval_samples_per_second": 51.6,
"eval_steps_per_second": 6.45,
"step": 46200
},
{
"epoch": 0.7034201144349062,
"grad_norm": 1.9227460622787476,
"learning_rate": 1.374581939799331e-06,
"loss": 4.8447,
"step": 46300
},
{
"epoch": 0.7034201144349062,
"eval_loss": 4.6872076988220215,
"eval_runtime": 193.971,
"eval_samples_per_second": 51.554,
"eval_steps_per_second": 6.444,
"step": 46300
},
{
"epoch": 0.7049393803408132,
"grad_norm": 1.8744120597839355,
"learning_rate": 1.3645484949832775e-06,
"loss": 4.8443,
"step": 46400
},
{
"epoch": 0.7049393803408132,
"eval_loss": 4.684128284454346,
"eval_runtime": 193.7921,
"eval_samples_per_second": 51.602,
"eval_steps_per_second": 6.45,
"step": 46400
},
{
"epoch": 0.7064586462467201,
"grad_norm": 1.858379602432251,
"learning_rate": 1.354515050167224e-06,
"loss": 4.8418,
"step": 46500
},
{
"epoch": 0.7064586462467201,
"eval_loss": 4.681851387023926,
"eval_runtime": 194.0781,
"eval_samples_per_second": 51.526,
"eval_steps_per_second": 6.441,
"step": 46500
},
{
"epoch": 0.707977912152627,
"grad_norm": 1.8594979047775269,
"learning_rate": 1.3444816053511706e-06,
"loss": 4.8433,
"step": 46600
},
{
"epoch": 0.707977912152627,
"eval_loss": 4.6782755851745605,
"eval_runtime": 194.1927,
"eval_samples_per_second": 51.495,
"eval_steps_per_second": 6.437,
"step": 46600
},
{
"epoch": 0.7094971780585339,
"grad_norm": 1.8931249380111694,
"learning_rate": 1.334448160535117e-06,
"loss": 4.8404,
"step": 46700
},
{
"epoch": 0.7094971780585339,
"eval_loss": 4.683481216430664,
"eval_runtime": 194.2623,
"eval_samples_per_second": 51.477,
"eval_steps_per_second": 6.435,
"step": 46700
},
{
"epoch": 0.7110164439644409,
"grad_norm": 1.5091091394424438,
"learning_rate": 1.3244147157190635e-06,
"loss": 4.8423,
"step": 46800
},
{
"epoch": 0.7110164439644409,
"eval_loss": 4.679195880889893,
"eval_runtime": 194.1717,
"eval_samples_per_second": 51.501,
"eval_steps_per_second": 6.438,
"step": 46800
},
{
"epoch": 0.7125357098703478,
"grad_norm": 1.5617057085037231,
"learning_rate": 1.31438127090301e-06,
"loss": 4.8384,
"step": 46900
},
{
"epoch": 0.7125357098703478,
"eval_loss": 4.675555229187012,
"eval_runtime": 195.1352,
"eval_samples_per_second": 51.247,
"eval_steps_per_second": 6.406,
"step": 46900
},
{
"epoch": 0.7140549757762547,
"grad_norm": 1.5074530839920044,
"learning_rate": 1.3043478260869566e-06,
"loss": 4.8389,
"step": 47000
},
{
"epoch": 0.7140549757762547,
"eval_loss": 4.67551851272583,
"eval_runtime": 194.1856,
"eval_samples_per_second": 51.497,
"eval_steps_per_second": 6.437,
"step": 47000
},
{
"epoch": 0.7155742416821615,
"grad_norm": 1.5850820541381836,
"learning_rate": 1.294314381270903e-06,
"loss": 4.8428,
"step": 47100
},
{
"epoch": 0.7155742416821615,
"eval_loss": 4.677995681762695,
"eval_runtime": 194.3756,
"eval_samples_per_second": 51.447,
"eval_steps_per_second": 6.431,
"step": 47100
},
{
"epoch": 0.7170935075880686,
"grad_norm": 1.7918612957000732,
"learning_rate": 1.2842809364548495e-06,
"loss": 4.8399,
"step": 47200
},
{
"epoch": 0.7170935075880686,
"eval_loss": 4.672911167144775,
"eval_runtime": 194.224,
"eval_samples_per_second": 51.487,
"eval_steps_per_second": 6.436,
"step": 47200
},
{
"epoch": 0.7186127734939755,
"grad_norm": 1.955620527267456,
"learning_rate": 1.274247491638796e-06,
"loss": 4.8338,
"step": 47300
},
{
"epoch": 0.7186127734939755,
"eval_loss": 4.67067289352417,
"eval_runtime": 194.1805,
"eval_samples_per_second": 51.498,
"eval_steps_per_second": 6.437,
"step": 47300
},
{
"epoch": 0.7201320393998824,
"grad_norm": 1.994454264640808,
"learning_rate": 1.2642140468227424e-06,
"loss": 4.8314,
"step": 47400
},
{
"epoch": 0.7201320393998824,
"eval_loss": 4.672824859619141,
"eval_runtime": 194.2432,
"eval_samples_per_second": 51.482,
"eval_steps_per_second": 6.435,
"step": 47400
},
{
"epoch": 0.7216513053057892,
"grad_norm": 1.8769866228103638,
"learning_rate": 1.254180602006689e-06,
"loss": 4.8321,
"step": 47500
},
{
"epoch": 0.7216513053057892,
"eval_loss": 4.67031717300415,
"eval_runtime": 194.2044,
"eval_samples_per_second": 51.492,
"eval_steps_per_second": 6.437,
"step": 47500
},
{
"epoch": 0.7231705712116963,
"grad_norm": 1.7346811294555664,
"learning_rate": 1.2441471571906355e-06,
"loss": 4.8351,
"step": 47600
},
{
"epoch": 0.7231705712116963,
"eval_loss": 4.667263031005859,
"eval_runtime": 194.3279,
"eval_samples_per_second": 51.459,
"eval_steps_per_second": 6.432,
"step": 47600
},
{
"epoch": 0.7246898371176032,
"grad_norm": 2.0054638385772705,
"learning_rate": 1.234113712374582e-06,
"loss": 4.8311,
"step": 47700
},
{
"epoch": 0.7246898371176032,
"eval_loss": 4.670699119567871,
"eval_runtime": 194.3103,
"eval_samples_per_second": 51.464,
"eval_steps_per_second": 6.433,
"step": 47700
},
{
"epoch": 0.72620910302351,
"grad_norm": 1.9293532371520996,
"learning_rate": 1.2240802675585284e-06,
"loss": 4.8253,
"step": 47800
},
{
"epoch": 0.72620910302351,
"eval_loss": 4.665504455566406,
"eval_runtime": 194.5301,
"eval_samples_per_second": 51.406,
"eval_steps_per_second": 6.426,
"step": 47800
},
{
"epoch": 0.727728368929417,
"grad_norm": 1.612265944480896,
"learning_rate": 1.214046822742475e-06,
"loss": 4.826,
"step": 47900
},
{
"epoch": 0.727728368929417,
"eval_loss": 4.665849685668945,
"eval_runtime": 194.1122,
"eval_samples_per_second": 51.517,
"eval_steps_per_second": 6.44,
"step": 47900
},
{
"epoch": 0.729247634835324,
"grad_norm": 1.7139407396316528,
"learning_rate": 1.2040133779264215e-06,
"loss": 4.8267,
"step": 48000
},
{
"epoch": 0.729247634835324,
"eval_loss": 4.663412570953369,
"eval_runtime": 194.3177,
"eval_samples_per_second": 51.462,
"eval_steps_per_second": 6.433,
"step": 48000
},
{
"epoch": 0.7307669007412309,
"grad_norm": 1.8362255096435547,
"learning_rate": 1.193979933110368e-06,
"loss": 4.826,
"step": 48100
},
{
"epoch": 0.7307669007412309,
"eval_loss": 4.6637797355651855,
"eval_runtime": 194.0467,
"eval_samples_per_second": 51.534,
"eval_steps_per_second": 6.442,
"step": 48100
},
{
"epoch": 0.7322861666471377,
"grad_norm": 1.3808461427688599,
"learning_rate": 1.1839464882943144e-06,
"loss": 4.8203,
"step": 48200
},
{
"epoch": 0.7322861666471377,
"eval_loss": 4.66359281539917,
"eval_runtime": 194.0835,
"eval_samples_per_second": 51.524,
"eval_steps_per_second": 6.441,
"step": 48200
},
{
"epoch": 0.7338054325530446,
"grad_norm": 2.090758800506592,
"learning_rate": 1.173913043478261e-06,
"loss": 4.8246,
"step": 48300
},
{
"epoch": 0.7338054325530446,
"eval_loss": 4.658617973327637,
"eval_runtime": 193.919,
"eval_samples_per_second": 51.568,
"eval_steps_per_second": 6.446,
"step": 48300
},
{
"epoch": 0.7353246984589517,
"grad_norm": 1.410666584968567,
"learning_rate": 1.1638795986622075e-06,
"loss": 4.8198,
"step": 48400
},
{
"epoch": 0.7353246984589517,
"eval_loss": 4.662432670593262,
"eval_runtime": 193.8752,
"eval_samples_per_second": 51.58,
"eval_steps_per_second": 6.447,
"step": 48400
},
{
"epoch": 0.7368439643648586,
"grad_norm": 1.5587624311447144,
"learning_rate": 1.153846153846154e-06,
"loss": 4.8185,
"step": 48500
},
{
"epoch": 0.7368439643648586,
"eval_loss": 4.656804084777832,
"eval_runtime": 193.773,
"eval_samples_per_second": 51.607,
"eval_steps_per_second": 6.451,
"step": 48500
},
{
"epoch": 0.7383632302707654,
"grad_norm": 1.3816115856170654,
"learning_rate": 1.1438127090301004e-06,
"loss": 4.8168,
"step": 48600
},
{
"epoch": 0.7383632302707654,
"eval_loss": 4.656231880187988,
"eval_runtime": 193.8565,
"eval_samples_per_second": 51.585,
"eval_steps_per_second": 6.448,
"step": 48600
},
{
"epoch": 0.7398824961766723,
"grad_norm": 1.927064299583435,
"learning_rate": 1.133779264214047e-06,
"loss": 4.8182,
"step": 48700
},
{
"epoch": 0.7398824961766723,
"eval_loss": 4.656589031219482,
"eval_runtime": 193.8744,
"eval_samples_per_second": 51.58,
"eval_steps_per_second": 6.447,
"step": 48700
},
{
"epoch": 0.7414017620825794,
"grad_norm": 1.6699544191360474,
"learning_rate": 1.1237458193979933e-06,
"loss": 4.8185,
"step": 48800
},
{
"epoch": 0.7414017620825794,
"eval_loss": 4.655017852783203,
"eval_runtime": 193.8675,
"eval_samples_per_second": 51.582,
"eval_steps_per_second": 6.448,
"step": 48800
},
{
"epoch": 0.7429210279884862,
"grad_norm": 1.3378312587738037,
"learning_rate": 1.1137123745819398e-06,
"loss": 4.815,
"step": 48900
},
{
"epoch": 0.7429210279884862,
"eval_loss": 4.657501220703125,
"eval_runtime": 194.1277,
"eval_samples_per_second": 51.512,
"eval_steps_per_second": 6.439,
"step": 48900
},
{
"epoch": 0.7444402938943931,
"grad_norm": 1.6146018505096436,
"learning_rate": 1.1036789297658862e-06,
"loss": 4.8145,
"step": 49000
},
{
"epoch": 0.7444402938943931,
"eval_loss": 4.6548943519592285,
"eval_runtime": 194.2412,
"eval_samples_per_second": 51.482,
"eval_steps_per_second": 6.435,
"step": 49000
},
{
"epoch": 0.7459595598003,
"grad_norm": 1.5952975749969482,
"learning_rate": 1.0936454849498327e-06,
"loss": 4.813,
"step": 49100
},
{
"epoch": 0.7459595598003,
"eval_loss": 4.651684284210205,
"eval_runtime": 194.2268,
"eval_samples_per_second": 51.486,
"eval_steps_per_second": 6.436,
"step": 49100
},
{
"epoch": 0.747478825706207,
"grad_norm": 1.5946011543273926,
"learning_rate": 1.0836120401337793e-06,
"loss": 4.8148,
"step": 49200
},
{
"epoch": 0.747478825706207,
"eval_loss": 4.651627540588379,
"eval_runtime": 194.2502,
"eval_samples_per_second": 51.48,
"eval_steps_per_second": 6.435,
"step": 49200
},
{
"epoch": 0.748998091612114,
"grad_norm": 1.4675341844558716,
"learning_rate": 1.0735785953177258e-06,
"loss": 4.81,
"step": 49300
},
{
"epoch": 0.748998091612114,
"eval_loss": 4.650761127471924,
"eval_runtime": 194.1574,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.438,
"step": 49300
},
{
"epoch": 0.7505173575180208,
"grad_norm": 1.6807961463928223,
"learning_rate": 1.0635451505016722e-06,
"loss": 4.8115,
"step": 49400
},
{
"epoch": 0.7505173575180208,
"eval_loss": 4.6511101722717285,
"eval_runtime": 194.374,
"eval_samples_per_second": 51.447,
"eval_steps_per_second": 6.431,
"step": 49400
},
{
"epoch": 0.7520366234239277,
"grad_norm": 1.4846396446228027,
"learning_rate": 1.0535117056856187e-06,
"loss": 4.8069,
"step": 49500
},
{
"epoch": 0.7520366234239277,
"eval_loss": 4.647155284881592,
"eval_runtime": 194.3314,
"eval_samples_per_second": 51.458,
"eval_steps_per_second": 6.432,
"step": 49500
},
{
"epoch": 0.7535558893298347,
"grad_norm": 1.5872676372528076,
"learning_rate": 1.0434782608695653e-06,
"loss": 4.8084,
"step": 49600
},
{
"epoch": 0.7535558893298347,
"eval_loss": 4.644804954528809,
"eval_runtime": 194.3764,
"eval_samples_per_second": 51.447,
"eval_steps_per_second": 6.431,
"step": 49600
},
{
"epoch": 0.7550751552357416,
"grad_norm": 1.6138330698013306,
"learning_rate": 1.0334448160535118e-06,
"loss": 4.8086,
"step": 49700
},
{
"epoch": 0.7550751552357416,
"eval_loss": 4.644802093505859,
"eval_runtime": 194.3935,
"eval_samples_per_second": 51.442,
"eval_steps_per_second": 6.43,
"step": 49700
},
{
"epoch": 0.7565944211416485,
"grad_norm": 1.6802724599838257,
"learning_rate": 1.0234113712374581e-06,
"loss": 4.8052,
"step": 49800
},
{
"epoch": 0.7565944211416485,
"eval_loss": 4.646471977233887,
"eval_runtime": 194.4012,
"eval_samples_per_second": 51.44,
"eval_steps_per_second": 6.43,
"step": 49800
},
{
"epoch": 0.7581136870475554,
"grad_norm": 1.7580209970474243,
"learning_rate": 1.0133779264214047e-06,
"loss": 4.805,
"step": 49900
},
{
"epoch": 0.7581136870475554,
"eval_loss": 4.641211032867432,
"eval_runtime": 194.5581,
"eval_samples_per_second": 51.399,
"eval_steps_per_second": 6.425,
"step": 49900
},
{
"epoch": 0.7596329529534624,
"grad_norm": 1.732718586921692,
"learning_rate": 1.0033444816053512e-06,
"loss": 4.803,
"step": 50000
},
{
"epoch": 0.7596329529534624,
"eval_loss": 4.643296241760254,
"eval_runtime": 194.3028,
"eval_samples_per_second": 51.466,
"eval_steps_per_second": 6.433,
"step": 50000
},
{
"epoch": 0.7611522188593693,
"grad_norm": 1.6775901317596436,
"learning_rate": 9.933110367892976e-07,
"loss": 4.8009,
"step": 50100
},
{
"epoch": 0.7611522188593693,
"eval_loss": 4.639660358428955,
"eval_runtime": 194.3411,
"eval_samples_per_second": 51.456,
"eval_steps_per_second": 6.432,
"step": 50100
},
{
"epoch": 0.7626714847652762,
"grad_norm": 1.4055508375167847,
"learning_rate": 9.832775919732441e-07,
"loss": 4.8022,
"step": 50200
},
{
"epoch": 0.7626714847652762,
"eval_loss": 4.637509346008301,
"eval_runtime": 194.3272,
"eval_samples_per_second": 51.46,
"eval_steps_per_second": 6.432,
"step": 50200
},
{
"epoch": 0.7641907506711831,
"grad_norm": 1.6316554546356201,
"learning_rate": 9.732441471571907e-07,
"loss": 4.8019,
"step": 50300
},
{
"epoch": 0.7641907506711831,
"eval_loss": 4.6399359703063965,
"eval_runtime": 194.5106,
"eval_samples_per_second": 51.411,
"eval_steps_per_second": 6.426,
"step": 50300
},
{
"epoch": 0.7657100165770901,
"grad_norm": 1.87636137008667,
"learning_rate": 9.632107023411372e-07,
"loss": 4.8021,
"step": 50400
},
{
"epoch": 0.7657100165770901,
"eval_loss": 4.637732028961182,
"eval_runtime": 194.1555,
"eval_samples_per_second": 51.505,
"eval_steps_per_second": 6.438,
"step": 50400
},
{
"epoch": 0.767229282482997,
"grad_norm": 1.5560215711593628,
"learning_rate": 9.531772575250837e-07,
"loss": 4.797,
"step": 50500
},
{
"epoch": 0.767229282482997,
"eval_loss": 4.636757850646973,
"eval_runtime": 194.187,
"eval_samples_per_second": 51.497,
"eval_steps_per_second": 6.437,
"step": 50500
},
{
"epoch": 0.7687485483889039,
"grad_norm": 1.5681828260421753,
"learning_rate": 9.431438127090301e-07,
"loss": 4.7981,
"step": 50600
},
{
"epoch": 0.7687485483889039,
"eval_loss": 4.63712215423584,
"eval_runtime": 194.0875,
"eval_samples_per_second": 51.523,
"eval_steps_per_second": 6.44,
"step": 50600
},
{
"epoch": 0.7702678142948108,
"grad_norm": 1.725135087966919,
"learning_rate": 9.331103678929767e-07,
"loss": 4.7988,
"step": 50700
},
{
"epoch": 0.7702678142948108,
"eval_loss": 4.633908271789551,
"eval_runtime": 193.9714,
"eval_samples_per_second": 51.554,
"eval_steps_per_second": 6.444,
"step": 50700
},
{
"epoch": 0.7717870802007178,
"grad_norm": 1.5292387008666992,
"learning_rate": 9.230769230769231e-07,
"loss": 4.7942,
"step": 50800
},
{
"epoch": 0.7717870802007178,
"eval_loss": 4.634795188903809,
"eval_runtime": 193.9264,
"eval_samples_per_second": 51.566,
"eval_steps_per_second": 6.446,
"step": 50800
},
{
"epoch": 0.7733063461066247,
"grad_norm": 1.313671350479126,
"learning_rate": 9.130434782608697e-07,
"loss": 4.7971,
"step": 50900
},
{
"epoch": 0.7733063461066247,
"eval_loss": 4.632637977600098,
"eval_runtime": 193.9004,
"eval_samples_per_second": 51.573,
"eval_steps_per_second": 6.447,
"step": 50900
},
{
"epoch": 0.7748256120125316,
"grad_norm": 1.3143532276153564,
"learning_rate": 9.030100334448161e-07,
"loss": 4.7945,
"step": 51000
},
{
"epoch": 0.7748256120125316,
"eval_loss": 4.6306681632995605,
"eval_runtime": 194.3643,
"eval_samples_per_second": 51.45,
"eval_steps_per_second": 6.431,
"step": 51000
},
{
"epoch": 0.7763448779184385,
"grad_norm": 1.3034121990203857,
"learning_rate": 8.929765886287627e-07,
"loss": 4.7888,
"step": 51100
},
{
"epoch": 0.7763448779184385,
"eval_loss": 4.629621982574463,
"eval_runtime": 194.0292,
"eval_samples_per_second": 51.539,
"eval_steps_per_second": 6.442,
"step": 51100
},
{
"epoch": 0.7778641438243455,
"grad_norm": 1.739376425743103,
"learning_rate": 8.829431438127091e-07,
"loss": 4.7934,
"step": 51200
},
{
"epoch": 0.7778641438243455,
"eval_loss": 4.62890625,
"eval_runtime": 194.358,
"eval_samples_per_second": 51.451,
"eval_steps_per_second": 6.431,
"step": 51200
},
{
"epoch": 0.7793834097302524,
"grad_norm": 1.3741992712020874,
"learning_rate": 8.729096989966555e-07,
"loss": 4.7887,
"step": 51300
},
{
"epoch": 0.7793834097302524,
"eval_loss": 4.625428199768066,
"eval_runtime": 194.4886,
"eval_samples_per_second": 51.417,
"eval_steps_per_second": 6.427,
"step": 51300
},
{
"epoch": 0.7809026756361593,
"grad_norm": 1.423168420791626,
"learning_rate": 8.628762541806019e-07,
"loss": 4.7888,
"step": 51400
},
{
"epoch": 0.7809026756361593,
"eval_loss": 4.626926422119141,
"eval_runtime": 194.5901,
"eval_samples_per_second": 51.39,
"eval_steps_per_second": 6.424,
"step": 51400
},
{
"epoch": 0.7824219415420662,
"grad_norm": 1.5038503408432007,
"learning_rate": 8.528428093645485e-07,
"loss": 4.791,
"step": 51500
},
{
"epoch": 0.7824219415420662,
"eval_loss": 4.630486488342285,
"eval_runtime": 194.4102,
"eval_samples_per_second": 51.438,
"eval_steps_per_second": 6.43,
"step": 51500
},
{
"epoch": 0.7839412074479732,
"grad_norm": 1.6092890501022339,
"learning_rate": 8.428093645484949e-07,
"loss": 4.7863,
"step": 51600
},
{
"epoch": 0.7839412074479732,
"eval_loss": 4.626857280731201,
"eval_runtime": 194.4616,
"eval_samples_per_second": 51.424,
"eval_steps_per_second": 6.428,
"step": 51600
},
{
"epoch": 0.7854604733538801,
"grad_norm": 1.6199829578399658,
"learning_rate": 8.327759197324414e-07,
"loss": 4.7875,
"step": 51700
},
{
"epoch": 0.7854604733538801,
"eval_loss": 4.623871326446533,
"eval_runtime": 194.5129,
"eval_samples_per_second": 51.41,
"eval_steps_per_second": 6.426,
"step": 51700
},
{
"epoch": 0.786979739259787,
"grad_norm": 1.33729088306427,
"learning_rate": 8.227424749163879e-07,
"loss": 4.7836,
"step": 51800
},
{
"epoch": 0.786979739259787,
"eval_loss": 4.625426769256592,
"eval_runtime": 194.4935,
"eval_samples_per_second": 51.416,
"eval_steps_per_second": 6.427,
"step": 51800
},
{
"epoch": 0.7884990051656939,
"grad_norm": 1.6848562955856323,
"learning_rate": 8.127090301003344e-07,
"loss": 4.7874,
"step": 51900
},
{
"epoch": 0.7884990051656939,
"eval_loss": 4.626620292663574,
"eval_runtime": 194.6564,
"eval_samples_per_second": 51.373,
"eval_steps_per_second": 6.422,
"step": 51900
},
{
"epoch": 0.7900182710716009,
"grad_norm": 1.2945283651351929,
"learning_rate": 8.026755852842809e-07,
"loss": 4.7892,
"step": 52000
},
{
"epoch": 0.7900182710716009,
"eval_loss": 4.624682903289795,
"eval_runtime": 194.5982,
"eval_samples_per_second": 51.388,
"eval_steps_per_second": 6.423,
"step": 52000
},
{
"epoch": 0.7915375369775078,
"grad_norm": 1.5469530820846558,
"learning_rate": 7.926421404682274e-07,
"loss": 4.7828,
"step": 52100
},
{
"epoch": 0.7915375369775078,
"eval_loss": 4.622786521911621,
"eval_runtime": 194.4896,
"eval_samples_per_second": 51.417,
"eval_steps_per_second": 6.427,
"step": 52100
},
{
"epoch": 0.7930568028834147,
"grad_norm": 1.4468382596969604,
"learning_rate": 7.826086956521739e-07,
"loss": 4.7772,
"step": 52200
},
{
"epoch": 0.7930568028834147,
"eval_loss": 4.625532150268555,
"eval_runtime": 194.4728,
"eval_samples_per_second": 51.421,
"eval_steps_per_second": 6.428,
"step": 52200
},
{
"epoch": 0.7945760687893216,
"grad_norm": 1.244032382965088,
"learning_rate": 7.725752508361204e-07,
"loss": 4.7794,
"step": 52300
},
{
"epoch": 0.7945760687893216,
"eval_loss": 4.621998310089111,
"eval_runtime": 194.4943,
"eval_samples_per_second": 51.415,
"eval_steps_per_second": 6.427,
"step": 52300
},
{
"epoch": 0.7960953346952286,
"grad_norm": 1.416409969329834,
"learning_rate": 7.625418060200669e-07,
"loss": 4.784,
"step": 52400
},
{
"epoch": 0.7960953346952286,
"eval_loss": 4.620311260223389,
"eval_runtime": 194.4398,
"eval_samples_per_second": 51.43,
"eval_steps_per_second": 6.429,
"step": 52400
},
{
"epoch": 0.7976146006011355,
"grad_norm": 1.3747918605804443,
"learning_rate": 7.525083612040134e-07,
"loss": 4.7776,
"step": 52500
},
{
"epoch": 0.7976146006011355,
"eval_loss": 4.619593143463135,
"eval_runtime": 194.5835,
"eval_samples_per_second": 51.392,
"eval_steps_per_second": 6.424,
"step": 52500
},
{
"epoch": 0.7991338665070424,
"grad_norm": 1.4532439708709717,
"learning_rate": 7.424749163879599e-07,
"loss": 4.7805,
"step": 52600
},
{
"epoch": 0.7991338665070424,
"eval_loss": 4.619747161865234,
"eval_runtime": 194.2642,
"eval_samples_per_second": 51.476,
"eval_steps_per_second": 6.435,
"step": 52600
},
{
"epoch": 0.8006531324129494,
"grad_norm": 1.34298574924469,
"learning_rate": 7.324414715719064e-07,
"loss": 4.7778,
"step": 52700
},
{
"epoch": 0.8006531324129494,
"eval_loss": 4.61711311340332,
"eval_runtime": 194.691,
"eval_samples_per_second": 51.363,
"eval_steps_per_second": 6.42,
"step": 52700
},
{
"epoch": 0.8021723983188563,
"grad_norm": 1.4666342735290527,
"learning_rate": 7.224080267558529e-07,
"loss": 4.7792,
"step": 52800
},
{
"epoch": 0.8021723983188563,
"eval_loss": 4.615002155303955,
"eval_runtime": 194.4007,
"eval_samples_per_second": 51.44,
"eval_steps_per_second": 6.43,
"step": 52800
},
{
"epoch": 0.8036916642247632,
"grad_norm": 1.1881191730499268,
"learning_rate": 7.123745819397994e-07,
"loss": 4.7789,
"step": 52900
},
{
"epoch": 0.8036916642247632,
"eval_loss": 4.613386154174805,
"eval_runtime": 194.3959,
"eval_samples_per_second": 51.441,
"eval_steps_per_second": 6.43,
"step": 52900
},
{
"epoch": 0.8052109301306701,
"grad_norm": 1.1752644777297974,
"learning_rate": 7.023411371237459e-07,
"loss": 4.7766,
"step": 53000
},
{
"epoch": 0.8052109301306701,
"eval_loss": 4.616655349731445,
"eval_runtime": 194.0766,
"eval_samples_per_second": 51.526,
"eval_steps_per_second": 6.441,
"step": 53000
},
{
"epoch": 0.8067301960365771,
"grad_norm": 1.3520350456237793,
"learning_rate": 6.923076923076923e-07,
"loss": 4.7748,
"step": 53100
},
{
"epoch": 0.8067301960365771,
"eval_loss": 4.616769313812256,
"eval_runtime": 194.2084,
"eval_samples_per_second": 51.491,
"eval_steps_per_second": 6.436,
"step": 53100
},
{
"epoch": 0.808249461942484,
"grad_norm": 1.5536683797836304,
"learning_rate": 6.822742474916388e-07,
"loss": 4.7798,
"step": 53200
},
{
"epoch": 0.808249461942484,
"eval_loss": 4.615866661071777,
"eval_runtime": 193.8562,
"eval_samples_per_second": 51.585,
"eval_steps_per_second": 6.448,
"step": 53200
},
{
"epoch": 0.8097687278483909,
"grad_norm": 1.2618976831436157,
"learning_rate": 6.722408026755853e-07,
"loss": 4.7762,
"step": 53300
},
{
"epoch": 0.8097687278483909,
"eval_loss": 4.616024017333984,
"eval_runtime": 193.9219,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 53300
},
{
"epoch": 0.8112879937542978,
"grad_norm": 1.8162367343902588,
"learning_rate": 6.622073578595318e-07,
"loss": 4.7761,
"step": 53400
},
{
"epoch": 0.8112879937542978,
"eval_loss": 4.613333702087402,
"eval_runtime": 194.0415,
"eval_samples_per_second": 51.535,
"eval_steps_per_second": 6.442,
"step": 53400
},
{
"epoch": 0.8128072596602048,
"grad_norm": 1.1924686431884766,
"learning_rate": 6.521739130434783e-07,
"loss": 4.7721,
"step": 53500
},
{
"epoch": 0.8128072596602048,
"eval_loss": 4.615184307098389,
"eval_runtime": 193.9446,
"eval_samples_per_second": 51.561,
"eval_steps_per_second": 6.445,
"step": 53500
},
{
"epoch": 0.8143265255661117,
"grad_norm": 1.1603306531906128,
"learning_rate": 6.421404682274248e-07,
"loss": 4.7746,
"step": 53600
},
{
"epoch": 0.8143265255661117,
"eval_loss": 4.611873626708984,
"eval_runtime": 194.2028,
"eval_samples_per_second": 51.493,
"eval_steps_per_second": 6.437,
"step": 53600
},
{
"epoch": 0.8158457914720186,
"grad_norm": 1.202577829360962,
"learning_rate": 6.321070234113712e-07,
"loss": 4.7745,
"step": 53700
},
{
"epoch": 0.8158457914720186,
"eval_loss": 4.610635757446289,
"eval_runtime": 194.3713,
"eval_samples_per_second": 51.448,
"eval_steps_per_second": 6.431,
"step": 53700
},
{
"epoch": 0.8173650573779255,
"grad_norm": 1.3371776342391968,
"learning_rate": 6.220735785953178e-07,
"loss": 4.7755,
"step": 53800
},
{
"epoch": 0.8173650573779255,
"eval_loss": 4.611499786376953,
"eval_runtime": 194.3936,
"eval_samples_per_second": 51.442,
"eval_steps_per_second": 6.43,
"step": 53800
},
{
"epoch": 0.8188843232838325,
"grad_norm": 1.3666436672210693,
"learning_rate": 6.120401337792642e-07,
"loss": 4.7701,
"step": 53900
},
{
"epoch": 0.8188843232838325,
"eval_loss": 4.610349655151367,
"eval_runtime": 194.5735,
"eval_samples_per_second": 51.394,
"eval_steps_per_second": 6.424,
"step": 53900
},
{
"epoch": 0.8204035891897394,
"grad_norm": 1.4433395862579346,
"learning_rate": 6.020066889632107e-07,
"loss": 4.7743,
"step": 54000
},
{
"epoch": 0.8204035891897394,
"eval_loss": 4.610903263092041,
"eval_runtime": 194.367,
"eval_samples_per_second": 51.449,
"eval_steps_per_second": 6.431,
"step": 54000
},
{
"epoch": 0.8219228550956463,
"grad_norm": 1.2440968751907349,
"learning_rate": 5.919732441471572e-07,
"loss": 4.7701,
"step": 54100
},
{
"epoch": 0.8219228550956463,
"eval_loss": 4.611226558685303,
"eval_runtime": 194.4358,
"eval_samples_per_second": 51.431,
"eval_steps_per_second": 6.429,
"step": 54100
},
{
"epoch": 0.8234421210015532,
"grad_norm": 1.311020016670227,
"learning_rate": 5.819397993311037e-07,
"loss": 4.767,
"step": 54200
},
{
"epoch": 0.8234421210015532,
"eval_loss": 4.608744144439697,
"eval_runtime": 194.5925,
"eval_samples_per_second": 51.389,
"eval_steps_per_second": 6.424,
"step": 54200
},
{
"epoch": 0.8249613869074602,
"grad_norm": 1.2300583124160767,
"learning_rate": 5.719063545150502e-07,
"loss": 4.7713,
"step": 54300
},
{
"epoch": 0.8249613869074602,
"eval_loss": 4.607234477996826,
"eval_runtime": 194.4772,
"eval_samples_per_second": 51.42,
"eval_steps_per_second": 6.427,
"step": 54300
},
{
"epoch": 0.8264806528133671,
"grad_norm": 1.3106154203414917,
"learning_rate": 5.618729096989966e-07,
"loss": 4.7698,
"step": 54400
},
{
"epoch": 0.8264806528133671,
"eval_loss": 4.604393005371094,
"eval_runtime": 194.687,
"eval_samples_per_second": 51.364,
"eval_steps_per_second": 6.421,
"step": 54400
},
{
"epoch": 0.827999918719274,
"grad_norm": 1.2660140991210938,
"learning_rate": 5.518394648829431e-07,
"loss": 4.7655,
"step": 54500
},
{
"epoch": 0.827999918719274,
"eval_loss": 4.602825164794922,
"eval_runtime": 194.597,
"eval_samples_per_second": 51.388,
"eval_steps_per_second": 6.424,
"step": 54500
},
{
"epoch": 0.8295191846251809,
"grad_norm": 1.4443926811218262,
"learning_rate": 5.418060200668896e-07,
"loss": 4.7727,
"step": 54600
},
{
"epoch": 0.8295191846251809,
"eval_loss": 4.606249809265137,
"eval_runtime": 194.4722,
"eval_samples_per_second": 51.421,
"eval_steps_per_second": 6.428,
"step": 54600
},
{
"epoch": 0.8310384505310879,
"grad_norm": 1.339629888534546,
"learning_rate": 5.317725752508361e-07,
"loss": 4.7639,
"step": 54700
},
{
"epoch": 0.8310384505310879,
"eval_loss": 4.604907512664795,
"eval_runtime": 194.6528,
"eval_samples_per_second": 51.374,
"eval_steps_per_second": 6.422,
"step": 54700
},
{
"epoch": 0.8325577164369948,
"grad_norm": 1.2703863382339478,
"learning_rate": 5.217391304347826e-07,
"loss": 4.762,
"step": 54800
},
{
"epoch": 0.8325577164369948,
"eval_loss": 4.605154037475586,
"eval_runtime": 194.4518,
"eval_samples_per_second": 51.427,
"eval_steps_per_second": 6.428,
"step": 54800
},
{
"epoch": 0.8340769823429017,
"grad_norm": 1.1100186109542847,
"learning_rate": 5.117056856187291e-07,
"loss": 4.7635,
"step": 54900
},
{
"epoch": 0.8340769823429017,
"eval_loss": 4.603663444519043,
"eval_runtime": 194.4154,
"eval_samples_per_second": 51.436,
"eval_steps_per_second": 6.43,
"step": 54900
},
{
"epoch": 0.8355962482488086,
"grad_norm": 1.6119050979614258,
"learning_rate": 5.016722408026756e-07,
"loss": 4.7627,
"step": 55000
},
{
"epoch": 0.8355962482488086,
"eval_loss": 4.603806495666504,
"eval_runtime": 194.3808,
"eval_samples_per_second": 51.445,
"eval_steps_per_second": 6.431,
"step": 55000
},
{
"epoch": 0.8371155141547156,
"grad_norm": 1.22734534740448,
"learning_rate": 4.916387959866221e-07,
"loss": 4.764,
"step": 55100
},
{
"epoch": 0.8371155141547156,
"eval_loss": 4.604480266571045,
"eval_runtime": 194.4351,
"eval_samples_per_second": 51.431,
"eval_steps_per_second": 6.429,
"step": 55100
},
{
"epoch": 0.8386347800606225,
"grad_norm": 1.1762231588363647,
"learning_rate": 4.816053511705686e-07,
"loss": 4.7674,
"step": 55200
},
{
"epoch": 0.8386347800606225,
"eval_loss": 4.60023307800293,
"eval_runtime": 194.3592,
"eval_samples_per_second": 51.451,
"eval_steps_per_second": 6.431,
"step": 55200
},
{
"epoch": 0.8401540459665294,
"grad_norm": 1.0889923572540283,
"learning_rate": 4.7157190635451506e-07,
"loss": 4.762,
"step": 55300
},
{
"epoch": 0.8401540459665294,
"eval_loss": 4.5993733406066895,
"eval_runtime": 194.2327,
"eval_samples_per_second": 51.485,
"eval_steps_per_second": 6.436,
"step": 55300
},
{
"epoch": 0.8416733118724363,
"grad_norm": 1.2975116968154907,
"learning_rate": 4.6153846153846156e-07,
"loss": 4.7635,
"step": 55400
},
{
"epoch": 0.8416733118724363,
"eval_loss": 4.599579811096191,
"eval_runtime": 194.1059,
"eval_samples_per_second": 51.518,
"eval_steps_per_second": 6.44,
"step": 55400
},
{
"epoch": 0.8431925777783433,
"grad_norm": 1.257307767868042,
"learning_rate": 4.5150501672240806e-07,
"loss": 4.7607,
"step": 55500
},
{
"epoch": 0.8431925777783433,
"eval_loss": 4.602155685424805,
"eval_runtime": 193.9982,
"eval_samples_per_second": 51.547,
"eval_steps_per_second": 6.443,
"step": 55500
},
{
"epoch": 0.8447118436842502,
"grad_norm": 1.2345635890960693,
"learning_rate": 4.4147157190635456e-07,
"loss": 4.7602,
"step": 55600
},
{
"epoch": 0.8447118436842502,
"eval_loss": 4.60153341293335,
"eval_runtime": 193.9915,
"eval_samples_per_second": 51.549,
"eval_steps_per_second": 6.444,
"step": 55600
},
{
"epoch": 0.8462311095901571,
"grad_norm": 1.2262383699417114,
"learning_rate": 4.3143812709030095e-07,
"loss": 4.7619,
"step": 55700
},
{
"epoch": 0.8462311095901571,
"eval_loss": 4.600053310394287,
"eval_runtime": 194.0432,
"eval_samples_per_second": 51.535,
"eval_steps_per_second": 6.442,
"step": 55700
},
{
"epoch": 0.847750375496064,
"grad_norm": 1.3070259094238281,
"learning_rate": 4.2140468227424745e-07,
"loss": 4.7564,
"step": 55800
},
{
"epoch": 0.847750375496064,
"eval_loss": 4.597591876983643,
"eval_runtime": 193.9858,
"eval_samples_per_second": 51.55,
"eval_steps_per_second": 6.444,
"step": 55800
},
{
"epoch": 0.849269641401971,
"grad_norm": 1.2372263669967651,
"learning_rate": 4.1137123745819395e-07,
"loss": 4.7601,
"step": 55900
},
{
"epoch": 0.849269641401971,
"eval_loss": 4.602851867675781,
"eval_runtime": 194.1982,
"eval_samples_per_second": 51.494,
"eval_steps_per_second": 6.437,
"step": 55900
},
{
"epoch": 0.8507889073078779,
"grad_norm": 1.1839525699615479,
"learning_rate": 4.0133779264214045e-07,
"loss": 4.7609,
"step": 56000
},
{
"epoch": 0.8507889073078779,
"eval_loss": 4.595503330230713,
"eval_runtime": 194.4031,
"eval_samples_per_second": 51.44,
"eval_steps_per_second": 6.43,
"step": 56000
},
{
"epoch": 0.8523081732137848,
"grad_norm": 1.4197345972061157,
"learning_rate": 3.9130434782608694e-07,
"loss": 4.7594,
"step": 56100
},
{
"epoch": 0.8523081732137848,
"eval_loss": 4.59796142578125,
"eval_runtime": 194.4678,
"eval_samples_per_second": 51.422,
"eval_steps_per_second": 6.428,
"step": 56100
},
{
"epoch": 0.8538274391196917,
"grad_norm": 1.1221038103103638,
"learning_rate": 3.8127090301003344e-07,
"loss": 4.7568,
"step": 56200
},
{
"epoch": 0.8538274391196917,
"eval_loss": 4.596600532531738,
"eval_runtime": 194.6362,
"eval_samples_per_second": 51.378,
"eval_steps_per_second": 6.422,
"step": 56200
},
{
"epoch": 0.8553467050255987,
"grad_norm": 1.2606701850891113,
"learning_rate": 3.7123745819397994e-07,
"loss": 4.7543,
"step": 56300
},
{
"epoch": 0.8553467050255987,
"eval_loss": 4.598119258880615,
"eval_runtime": 194.3842,
"eval_samples_per_second": 51.445,
"eval_steps_per_second": 6.431,
"step": 56300
},
{
"epoch": 0.8568659709315056,
"grad_norm": 1.3233997821807861,
"learning_rate": 3.6120401337792644e-07,
"loss": 4.7576,
"step": 56400
},
{
"epoch": 0.8568659709315056,
"eval_loss": 4.596184730529785,
"eval_runtime": 194.3736,
"eval_samples_per_second": 51.447,
"eval_steps_per_second": 6.431,
"step": 56400
},
{
"epoch": 0.8583852368374125,
"grad_norm": 1.2004015445709229,
"learning_rate": 3.5117056856187294e-07,
"loss": 4.7616,
"step": 56500
},
{
"epoch": 0.8583852368374125,
"eval_loss": 4.594801425933838,
"eval_runtime": 194.5029,
"eval_samples_per_second": 51.413,
"eval_steps_per_second": 6.427,
"step": 56500
},
{
"epoch": 0.8599045027433194,
"grad_norm": 1.2479798793792725,
"learning_rate": 3.411371237458194e-07,
"loss": 4.7628,
"step": 56600
},
{
"epoch": 0.8599045027433194,
"eval_loss": 4.599001407623291,
"eval_runtime": 194.5013,
"eval_samples_per_second": 51.414,
"eval_steps_per_second": 6.427,
"step": 56600
},
{
"epoch": 0.8614237686492264,
"grad_norm": 1.2455825805664062,
"learning_rate": 3.311036789297659e-07,
"loss": 4.756,
"step": 56700
},
{
"epoch": 0.8614237686492264,
"eval_loss": 4.596933364868164,
"eval_runtime": 194.5096,
"eval_samples_per_second": 51.411,
"eval_steps_per_second": 6.426,
"step": 56700
},
{
"epoch": 0.8629430345551333,
"grad_norm": 1.2096078395843506,
"learning_rate": 3.210702341137124e-07,
"loss": 4.7603,
"step": 56800
},
{
"epoch": 0.8629430345551333,
"eval_loss": 4.5964884757995605,
"eval_runtime": 194.6292,
"eval_samples_per_second": 51.38,
"eval_steps_per_second": 6.422,
"step": 56800
},
{
"epoch": 0.8644623004610402,
"grad_norm": 0.9795971512794495,
"learning_rate": 3.110367892976589e-07,
"loss": 4.7533,
"step": 56900
},
{
"epoch": 0.8644623004610402,
"eval_loss": 4.594615459442139,
"eval_runtime": 194.678,
"eval_samples_per_second": 51.367,
"eval_steps_per_second": 6.421,
"step": 56900
},
{
"epoch": 0.8659815663669471,
"grad_norm": 1.3727303743362427,
"learning_rate": 3.010033444816054e-07,
"loss": 4.7547,
"step": 57000
},
{
"epoch": 0.8659815663669471,
"eval_loss": 4.596096515655518,
"eval_runtime": 194.5316,
"eval_samples_per_second": 51.406,
"eval_steps_per_second": 6.426,
"step": 57000
},
{
"epoch": 0.8675008322728541,
"grad_norm": 1.1338236331939697,
"learning_rate": 2.9096989966555187e-07,
"loss": 4.7542,
"step": 57100
},
{
"epoch": 0.8675008322728541,
"eval_loss": 4.5944132804870605,
"eval_runtime": 194.4861,
"eval_samples_per_second": 51.418,
"eval_steps_per_second": 6.427,
"step": 57100
},
{
"epoch": 0.869020098178761,
"grad_norm": 1.1638000011444092,
"learning_rate": 2.809364548494983e-07,
"loss": 4.7509,
"step": 57200
},
{
"epoch": 0.869020098178761,
"eval_loss": 4.593369483947754,
"eval_runtime": 194.5214,
"eval_samples_per_second": 51.408,
"eval_steps_per_second": 6.426,
"step": 57200
},
{
"epoch": 0.8705393640846679,
"grad_norm": 0.9814125299453735,
"learning_rate": 2.709030100334448e-07,
"loss": 4.7565,
"step": 57300
},
{
"epoch": 0.8705393640846679,
"eval_loss": 4.595996856689453,
"eval_runtime": 194.4485,
"eval_samples_per_second": 51.427,
"eval_steps_per_second": 6.428,
"step": 57300
},
{
"epoch": 0.8720586299905748,
"grad_norm": 1.0250178575515747,
"learning_rate": 2.608695652173913e-07,
"loss": 4.7568,
"step": 57400
},
{
"epoch": 0.8720586299905748,
"eval_loss": 4.59307336807251,
"eval_runtime": 194.4712,
"eval_samples_per_second": 51.421,
"eval_steps_per_second": 6.428,
"step": 57400
},
{
"epoch": 0.8735778958964818,
"grad_norm": 0.9920938014984131,
"learning_rate": 2.508361204013378e-07,
"loss": 4.7567,
"step": 57500
},
{
"epoch": 0.8735778958964818,
"eval_loss": 4.5949625968933105,
"eval_runtime": 194.5139,
"eval_samples_per_second": 51.41,
"eval_steps_per_second": 6.426,
"step": 57500
},
{
"epoch": 0.8750971618023887,
"grad_norm": 1.0698268413543701,
"learning_rate": 2.408026755852843e-07,
"loss": 4.749,
"step": 57600
},
{
"epoch": 0.8750971618023887,
"eval_loss": 4.590822696685791,
"eval_runtime": 194.2365,
"eval_samples_per_second": 51.484,
"eval_steps_per_second": 6.435,
"step": 57600
},
{
"epoch": 0.8766164277082956,
"grad_norm": 1.0088557004928589,
"learning_rate": 2.3076923076923078e-07,
"loss": 4.7556,
"step": 57700
},
{
"epoch": 0.8766164277082956,
"eval_loss": 4.592673301696777,
"eval_runtime": 194.2155,
"eval_samples_per_second": 51.489,
"eval_steps_per_second": 6.436,
"step": 57700
},
{
"epoch": 0.8781356936142025,
"grad_norm": 0.989743173122406,
"learning_rate": 2.2073578595317728e-07,
"loss": 4.755,
"step": 57800
},
{
"epoch": 0.8781356936142025,
"eval_loss": 4.594258785247803,
"eval_runtime": 194.0368,
"eval_samples_per_second": 51.537,
"eval_steps_per_second": 6.442,
"step": 57800
},
{
"epoch": 0.8796549595201095,
"grad_norm": 1.0593464374542236,
"learning_rate": 2.1070234113712372e-07,
"loss": 4.7553,
"step": 57900
},
{
"epoch": 0.8796549595201095,
"eval_loss": 4.591804504394531,
"eval_runtime": 193.9201,
"eval_samples_per_second": 51.568,
"eval_steps_per_second": 6.446,
"step": 57900
},
{
"epoch": 0.8811742254260164,
"grad_norm": 1.0415208339691162,
"learning_rate": 2.0066889632107022e-07,
"loss": 4.7526,
"step": 58000
},
{
"epoch": 0.8811742254260164,
"eval_loss": 4.591397285461426,
"eval_runtime": 194.0639,
"eval_samples_per_second": 51.529,
"eval_steps_per_second": 6.441,
"step": 58000
},
{
"epoch": 0.8826934913319233,
"grad_norm": 1.08748197555542,
"learning_rate": 1.9063545150501672e-07,
"loss": 4.748,
"step": 58100
},
{
"epoch": 0.8826934913319233,
"eval_loss": 4.592258930206299,
"eval_runtime": 193.9701,
"eval_samples_per_second": 51.554,
"eval_steps_per_second": 6.444,
"step": 58100
},
{
"epoch": 0.8842127572378302,
"grad_norm": 0.875297486782074,
"learning_rate": 1.8060200668896322e-07,
"loss": 4.754,
"step": 58200
},
{
"epoch": 0.8842127572378302,
"eval_loss": 4.590017318725586,
"eval_runtime": 193.921,
"eval_samples_per_second": 51.567,
"eval_steps_per_second": 6.446,
"step": 58200
},
{
"epoch": 0.8857320231437372,
"grad_norm": 0.9465267062187195,
"learning_rate": 1.705685618729097e-07,
"loss": 4.7541,
"step": 58300
},
{
"epoch": 0.8857320231437372,
"eval_loss": 4.590794563293457,
"eval_runtime": 193.8053,
"eval_samples_per_second": 51.598,
"eval_steps_per_second": 6.45,
"step": 58300
},
{
"epoch": 0.8872512890496441,
"grad_norm": 1.108864426612854,
"learning_rate": 1.605351170568562e-07,
"loss": 4.7545,
"step": 58400
},
{
"epoch": 0.8872512890496441,
"eval_loss": 4.590878963470459,
"eval_runtime": 194.0226,
"eval_samples_per_second": 51.54,
"eval_steps_per_second": 6.443,
"step": 58400
},
{
"epoch": 0.888770554955551,
"grad_norm": 0.9311940670013428,
"learning_rate": 1.505016722408027e-07,
"loss": 4.7537,
"step": 58500
},
{
"epoch": 0.888770554955551,
"eval_loss": 4.589045524597168,
"eval_runtime": 194.3809,
"eval_samples_per_second": 51.445,
"eval_steps_per_second": 6.431,
"step": 58500
},
{
"epoch": 0.8902898208614579,
"grad_norm": 1.122527003288269,
"learning_rate": 1.4046822742474916e-07,
"loss": 4.7517,
"step": 58600
},
{
"epoch": 0.8902898208614579,
"eval_loss": 4.590823650360107,
"eval_runtime": 194.4167,
"eval_samples_per_second": 51.436,
"eval_steps_per_second": 6.429,
"step": 58600
},
{
"epoch": 0.8918090867673649,
"grad_norm": 1.0384498834609985,
"learning_rate": 1.3043478260869566e-07,
"loss": 4.7491,
"step": 58700
},
{
"epoch": 0.8918090867673649,
"eval_loss": 4.5913920402526855,
"eval_runtime": 194.5657,
"eval_samples_per_second": 51.397,
"eval_steps_per_second": 6.425,
"step": 58700
},
{
"epoch": 0.8933283526732718,
"grad_norm": 0.9574987292289734,
"learning_rate": 1.2040133779264215e-07,
"loss": 4.7512,
"step": 58800
},
{
"epoch": 0.8933283526732718,
"eval_loss": 4.5905609130859375,
"eval_runtime": 194.7064,
"eval_samples_per_second": 51.359,
"eval_steps_per_second": 6.42,
"step": 58800
},
{
"epoch": 0.8948476185791787,
"grad_norm": 0.8835811614990234,
"learning_rate": 1.1036789297658864e-07,
"loss": 4.7493,
"step": 58900
},
{
"epoch": 0.8948476185791787,
"eval_loss": 4.59054708480835,
"eval_runtime": 194.6192,
"eval_samples_per_second": 51.382,
"eval_steps_per_second": 6.423,
"step": 58900
},
{
"epoch": 0.8963668844850856,
"grad_norm": 0.8485853672027588,
"learning_rate": 1.0033444816053511e-07,
"loss": 4.7494,
"step": 59000
},
{
"epoch": 0.8963668844850856,
"eval_loss": 4.590051651000977,
"eval_runtime": 194.7537,
"eval_samples_per_second": 51.347,
"eval_steps_per_second": 6.418,
"step": 59000
},
{
"epoch": 0.8978861503909926,
"grad_norm": 0.9415624737739563,
"learning_rate": 9.030100334448161e-08,
"loss": 4.7461,
"step": 59100
},
{
"epoch": 0.8978861503909926,
"eval_loss": 4.593616962432861,
"eval_runtime": 194.5307,
"eval_samples_per_second": 51.406,
"eval_steps_per_second": 6.426,
"step": 59100
},
{
"epoch": 0.8994054162968995,
"grad_norm": 1.0554380416870117,
"learning_rate": 8.02675585284281e-08,
"loss": 4.7523,
"step": 59200
},
{
"epoch": 0.8994054162968995,
"eval_loss": 4.5899271965026855,
"eval_runtime": 194.6175,
"eval_samples_per_second": 51.383,
"eval_steps_per_second": 6.423,
"step": 59200
},
{
"epoch": 0.9009246822028064,
"grad_norm": 0.8636355400085449,
"learning_rate": 7.023411371237458e-08,
"loss": 4.7511,
"step": 59300
},
{
"epoch": 0.9009246822028064,
"eval_loss": 4.590203285217285,
"eval_runtime": 194.5121,
"eval_samples_per_second": 51.411,
"eval_steps_per_second": 6.426,
"step": 59300
},
{
"epoch": 0.9024439481087133,
"grad_norm": 0.7755019068717957,
"learning_rate": 6.020066889632108e-08,
"loss": 4.7495,
"step": 59400
},
{
"epoch": 0.9024439481087133,
"eval_loss": 4.591348648071289,
"eval_runtime": 194.53,
"eval_samples_per_second": 51.406,
"eval_steps_per_second": 6.426,
"step": 59400
},
{
"epoch": 0.9039632140146203,
"grad_norm": 0.9905518293380737,
"learning_rate": 5.0167224080267556e-08,
"loss": 4.7507,
"step": 59500
},
{
"epoch": 0.9039632140146203,
"eval_loss": 4.590855121612549,
"eval_runtime": 194.486,
"eval_samples_per_second": 51.418,
"eval_steps_per_second": 6.427,
"step": 59500
},
{
"epoch": 0.9054824799205272,
"grad_norm": 0.8361491560935974,
"learning_rate": 4.013377926421405e-08,
"loss": 4.7508,
"step": 59600
},
{
"epoch": 0.9054824799205272,
"eval_loss": 4.588395118713379,
"eval_runtime": 194.4894,
"eval_samples_per_second": 51.417,
"eval_steps_per_second": 6.427,
"step": 59600
},
{
"epoch": 0.9070017458264341,
"grad_norm": 0.8528068661689758,
"learning_rate": 3.010033444816054e-08,
"loss": 4.7485,
"step": 59700
},
{
"epoch": 0.9070017458264341,
"eval_loss": 4.589570999145508,
"eval_runtime": 194.4665,
"eval_samples_per_second": 51.423,
"eval_steps_per_second": 6.428,
"step": 59700
},
{
"epoch": 0.908521011732341,
"grad_norm": 0.9023746252059937,
"learning_rate": 2.0066889632107024e-08,
"loss": 4.7502,
"step": 59800
},
{
"epoch": 0.908521011732341,
"eval_loss": 4.588865280151367,
"eval_runtime": 194.2101,
"eval_samples_per_second": 51.491,
"eval_steps_per_second": 6.436,
"step": 59800
},
{
"epoch": 0.910040277638248,
"grad_norm": 0.866371750831604,
"learning_rate": 1.0033444816053512e-08,
"loss": 4.7503,
"step": 59900
},
{
"epoch": 0.910040277638248,
"eval_loss": 4.589888095855713,
"eval_runtime": 194.6389,
"eval_samples_per_second": 51.377,
"eval_steps_per_second": 6.422,
"step": 59900
},
{
"epoch": 0.9115595435441549,
"grad_norm": 0.7748922109603882,
"learning_rate": 0.0,
"loss": 4.7524,
"step": 60000
},
{
"epoch": 0.9115595435441549,
"eval_loss": 4.588863372802734,
"eval_runtime": 194.1121,
"eval_samples_per_second": 51.517,
"eval_steps_per_second": 6.44,
"step": 60000
}
],
"logging_steps": 100,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.15783283933184e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}