saute / last-checkpoint /trainer_state.json
JustinDuc's picture
Training in progress, step 10000, checkpoint
754e28f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.016784409297891375,
"eval_steps": 50,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 8.392204648945687e-05,
"grad_norm": 3.729597806930542,
"learning_rate": 4.999588781972202e-05,
"loss": 1.6987,
"step": 50
},
{
"epoch": 8.392204648945687e-05,
"eval_loss": 1.8344682455062866,
"eval_masked_accuracy": 0.6726457476615906,
"eval_runtime": 1.754,
"eval_samples_per_second": 5.701,
"eval_steps_per_second": 2.281,
"step": 50
},
{
"epoch": 0.00016784409297891374,
"grad_norm": 6.472073078155518,
"learning_rate": 4.999169171739755e-05,
"loss": 1.7415,
"step": 100
},
{
"epoch": 0.00016784409297891374,
"eval_loss": 1.7104336023330688,
"eval_masked_accuracy": 0.6737288236618042,
"eval_runtime": 1.7376,
"eval_samples_per_second": 5.755,
"eval_steps_per_second": 2.302,
"step": 100
},
{
"epoch": 0.0002517661394683706,
"grad_norm": 5.799453258514404,
"learning_rate": 4.998749561507307e-05,
"loss": 1.736,
"step": 150
},
{
"epoch": 0.0002517661394683706,
"eval_loss": 1.877158761024475,
"eval_masked_accuracy": 0.6936936974525452,
"eval_runtime": 1.7454,
"eval_samples_per_second": 5.729,
"eval_steps_per_second": 2.292,
"step": 150
},
{
"epoch": 0.0003356881859578275,
"grad_norm": 9.896933555603027,
"learning_rate": 4.99832995127486e-05,
"loss": 1.7919,
"step": 200
},
{
"epoch": 0.0003356881859578275,
"eval_loss": 1.424491047859192,
"eval_masked_accuracy": 0.7206477522850037,
"eval_runtime": 1.7315,
"eval_samples_per_second": 5.775,
"eval_steps_per_second": 2.31,
"step": 200
},
{
"epoch": 0.00041961023244728435,
"grad_norm": 4.745198726654053,
"learning_rate": 4.997910341042413e-05,
"loss": 1.7252,
"step": 250
},
{
"epoch": 0.00041961023244728435,
"eval_loss": 1.915906310081482,
"eval_masked_accuracy": 0.6486486196517944,
"eval_runtime": 1.7431,
"eval_samples_per_second": 5.737,
"eval_steps_per_second": 2.295,
"step": 250
},
{
"epoch": 0.0005035322789367412,
"grad_norm": 6.004683971405029,
"learning_rate": 4.9974907308099657e-05,
"loss": 1.7487,
"step": 300
},
{
"epoch": 0.0005035322789367412,
"eval_loss": 1.7426478862762451,
"eval_masked_accuracy": 0.6846473217010498,
"eval_runtime": 1.7474,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 2.289,
"step": 300
},
{
"epoch": 0.0005874543254261981,
"grad_norm": 8.232338905334473,
"learning_rate": 4.9970711205775185e-05,
"loss": 1.6958,
"step": 350
},
{
"epoch": 0.0005874543254261981,
"eval_loss": 1.8806991577148438,
"eval_masked_accuracy": 0.6256157755851746,
"eval_runtime": 1.7452,
"eval_samples_per_second": 5.73,
"eval_steps_per_second": 2.292,
"step": 350
},
{
"epoch": 0.000671376371915655,
"grad_norm": 8.929485321044922,
"learning_rate": 4.996651510345071e-05,
"loss": 1.7165,
"step": 400
},
{
"epoch": 0.000671376371915655,
"eval_loss": 1.6669635772705078,
"eval_masked_accuracy": 0.6816326379776001,
"eval_runtime": 1.7367,
"eval_samples_per_second": 5.758,
"eval_steps_per_second": 2.303,
"step": 400
},
{
"epoch": 0.0007552984184051118,
"grad_norm": 6.171640872955322,
"learning_rate": 4.9962402923172725e-05,
"loss": 1.6222,
"step": 450
},
{
"epoch": 0.0007552984184051118,
"eval_loss": 2.174530506134033,
"eval_masked_accuracy": 0.6891891956329346,
"eval_runtime": 1.7554,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 450
},
{
"epoch": 0.0008392204648945687,
"grad_norm": 4.092519283294678,
"learning_rate": 4.9958206820848254e-05,
"loss": 1.6441,
"step": 500
},
{
"epoch": 0.0008392204648945687,
"eval_loss": 2.060279369354248,
"eval_masked_accuracy": 0.6461538672447205,
"eval_runtime": 1.8075,
"eval_samples_per_second": 5.532,
"eval_steps_per_second": 2.213,
"step": 500
},
{
"epoch": 0.0009231425113840256,
"grad_norm": 5.34571647644043,
"learning_rate": 4.995401071852378e-05,
"loss": 1.7198,
"step": 550
},
{
"epoch": 0.0009231425113840256,
"eval_loss": 1.6280667781829834,
"eval_masked_accuracy": 0.6775510311126709,
"eval_runtime": 1.7487,
"eval_samples_per_second": 5.718,
"eval_steps_per_second": 2.287,
"step": 550
},
{
"epoch": 0.0010070645578734824,
"grad_norm": 4.286564350128174,
"learning_rate": 4.994981461619931e-05,
"loss": 1.6823,
"step": 600
},
{
"epoch": 0.0010070645578734824,
"eval_loss": 1.5270774364471436,
"eval_masked_accuracy": 0.6832579374313354,
"eval_runtime": 1.7459,
"eval_samples_per_second": 5.728,
"eval_steps_per_second": 2.291,
"step": 600
},
{
"epoch": 0.0010909866043629394,
"grad_norm": 3.7731900215148926,
"learning_rate": 4.994561851387484e-05,
"loss": 1.573,
"step": 650
},
{
"epoch": 0.0010909866043629394,
"eval_loss": 1.522475242614746,
"eval_masked_accuracy": 0.7423076629638672,
"eval_runtime": 1.7483,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 650
},
{
"epoch": 0.0011749086508523962,
"grad_norm": 4.305816650390625,
"learning_rate": 4.994142241155036e-05,
"loss": 1.6905,
"step": 700
},
{
"epoch": 0.0011749086508523962,
"eval_loss": 1.503122091293335,
"eval_masked_accuracy": 0.67136150598526,
"eval_runtime": 1.7423,
"eval_samples_per_second": 5.74,
"eval_steps_per_second": 2.296,
"step": 700
},
{
"epoch": 0.0012588306973418532,
"grad_norm": 6.982117176055908,
"learning_rate": 4.993722630922589e-05,
"loss": 1.6444,
"step": 750
},
{
"epoch": 0.0012588306973418532,
"eval_loss": 1.7397890090942383,
"eval_masked_accuracy": 0.6946902871131897,
"eval_runtime": 1.7436,
"eval_samples_per_second": 5.735,
"eval_steps_per_second": 2.294,
"step": 750
},
{
"epoch": 0.00134275274383131,
"grad_norm": 6.332937717437744,
"learning_rate": 4.993303020690142e-05,
"loss": 1.7488,
"step": 800
},
{
"epoch": 0.00134275274383131,
"eval_loss": 1.407382845878601,
"eval_masked_accuracy": 0.7051281929016113,
"eval_runtime": 1.7398,
"eval_samples_per_second": 5.748,
"eval_steps_per_second": 2.299,
"step": 800
},
{
"epoch": 0.001426674790320767,
"grad_norm": 5.491461753845215,
"learning_rate": 4.9928834104576946e-05,
"loss": 1.5959,
"step": 850
},
{
"epoch": 0.001426674790320767,
"eval_loss": 1.8142907619476318,
"eval_masked_accuracy": 0.6625514626502991,
"eval_runtime": 1.7407,
"eval_samples_per_second": 5.745,
"eval_steps_per_second": 2.298,
"step": 850
},
{
"epoch": 0.0015105968368102237,
"grad_norm": 12.12775707244873,
"learning_rate": 4.9924638002252474e-05,
"loss": 1.6085,
"step": 900
},
{
"epoch": 0.0015105968368102237,
"eval_loss": 1.9904667139053345,
"eval_masked_accuracy": 0.6278026700019836,
"eval_runtime": 1.7503,
"eval_samples_per_second": 5.713,
"eval_steps_per_second": 2.285,
"step": 900
},
{
"epoch": 0.0015945188832996806,
"grad_norm": 18.452600479125977,
"learning_rate": 4.9920441899928e-05,
"loss": 1.5793,
"step": 950
},
{
"epoch": 0.0015945188832996806,
"eval_loss": 1.797326683998108,
"eval_masked_accuracy": 0.6784313917160034,
"eval_runtime": 1.7403,
"eval_samples_per_second": 5.746,
"eval_steps_per_second": 2.299,
"step": 950
},
{
"epoch": 0.0016784409297891374,
"grad_norm": 8.000075340270996,
"learning_rate": 4.9916245797603524e-05,
"loss": 1.5353,
"step": 1000
},
{
"epoch": 0.0016784409297891374,
"eval_loss": 1.8558744192123413,
"eval_masked_accuracy": 0.6530612111091614,
"eval_runtime": 1.7574,
"eval_samples_per_second": 5.69,
"eval_steps_per_second": 2.276,
"step": 1000
},
{
"epoch": 0.0017623629762785944,
"grad_norm": 3.907064199447632,
"learning_rate": 4.991204969527905e-05,
"loss": 1.5363,
"step": 1050
},
{
"epoch": 0.0017623629762785944,
"eval_loss": 2.0765745639801025,
"eval_masked_accuracy": 0.6553191542625427,
"eval_runtime": 1.7983,
"eval_samples_per_second": 5.561,
"eval_steps_per_second": 2.224,
"step": 1050
},
{
"epoch": 0.0018462850227680511,
"grad_norm": 4.185476303100586,
"learning_rate": 4.990785359295458e-05,
"loss": 1.6641,
"step": 1100
},
{
"epoch": 0.0018462850227680511,
"eval_loss": 1.5849405527114868,
"eval_masked_accuracy": 0.71074378490448,
"eval_runtime": 1.7601,
"eval_samples_per_second": 5.681,
"eval_steps_per_second": 2.273,
"step": 1100
},
{
"epoch": 0.0019302070692575081,
"grad_norm": 5.447309494018555,
"learning_rate": 4.990365749063011e-05,
"loss": 1.7069,
"step": 1150
},
{
"epoch": 0.0019302070692575081,
"eval_loss": 1.6813358068466187,
"eval_masked_accuracy": 0.7231404781341553,
"eval_runtime": 1.7529,
"eval_samples_per_second": 5.705,
"eval_steps_per_second": 2.282,
"step": 1150
},
{
"epoch": 0.002014129115746965,
"grad_norm": 5.904290199279785,
"learning_rate": 4.989946138830564e-05,
"loss": 1.6996,
"step": 1200
},
{
"epoch": 0.002014129115746965,
"eval_loss": 1.6986854076385498,
"eval_masked_accuracy": 0.6554622054100037,
"eval_runtime": 1.7531,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.282,
"step": 1200
},
{
"epoch": 0.002098051162236422,
"grad_norm": 5.6478986740112305,
"learning_rate": 4.989526528598116e-05,
"loss": 1.5291,
"step": 1250
},
{
"epoch": 0.002098051162236422,
"eval_loss": 1.7059627771377563,
"eval_masked_accuracy": 0.6680498123168945,
"eval_runtime": 1.7416,
"eval_samples_per_second": 5.742,
"eval_steps_per_second": 2.297,
"step": 1250
},
{
"epoch": 0.002181973208725879,
"grad_norm": 6.695890426635742,
"learning_rate": 4.989106918365669e-05,
"loss": 1.8386,
"step": 1300
},
{
"epoch": 0.002181973208725879,
"eval_loss": 1.6500450372695923,
"eval_masked_accuracy": 0.6693877577781677,
"eval_runtime": 1.7414,
"eval_samples_per_second": 5.743,
"eval_steps_per_second": 2.297,
"step": 1300
},
{
"epoch": 0.0022658952552153354,
"grad_norm": 4.831510066986084,
"learning_rate": 4.9886873081332217e-05,
"loss": 1.691,
"step": 1350
},
{
"epoch": 0.0022658952552153354,
"eval_loss": 1.4610856771469116,
"eval_masked_accuracy": 0.7090163826942444,
"eval_runtime": 1.7413,
"eval_samples_per_second": 5.743,
"eval_steps_per_second": 2.297,
"step": 1350
},
{
"epoch": 0.0023498173017047924,
"grad_norm": 4.90496826171875,
"learning_rate": 4.9882676979007745e-05,
"loss": 1.7116,
"step": 1400
},
{
"epoch": 0.0023498173017047924,
"eval_loss": 1.6787996292114258,
"eval_masked_accuracy": 0.6153846383094788,
"eval_runtime": 1.7655,
"eval_samples_per_second": 5.664,
"eval_steps_per_second": 2.266,
"step": 1400
},
{
"epoch": 0.0024337393481942493,
"grad_norm": 5.956592559814453,
"learning_rate": 4.9878480876683273e-05,
"loss": 1.5348,
"step": 1450
},
{
"epoch": 0.0024337393481942493,
"eval_loss": 1.7995752096176147,
"eval_masked_accuracy": 0.6759999990463257,
"eval_runtime": 1.7486,
"eval_samples_per_second": 5.719,
"eval_steps_per_second": 2.288,
"step": 1450
},
{
"epoch": 0.0025176613946837063,
"grad_norm": 5.731600761413574,
"learning_rate": 4.9874284774358795e-05,
"loss": 1.5617,
"step": 1500
},
{
"epoch": 0.0025176613946837063,
"eval_loss": 2.028412342071533,
"eval_masked_accuracy": 0.6007905006408691,
"eval_runtime": 1.797,
"eval_samples_per_second": 5.565,
"eval_steps_per_second": 2.226,
"step": 1500
},
{
"epoch": 0.002601583441173163,
"grad_norm": 9.261569023132324,
"learning_rate": 4.9870088672034324e-05,
"loss": 1.7109,
"step": 1550
},
{
"epoch": 0.002601583441173163,
"eval_loss": 1.8843729496002197,
"eval_masked_accuracy": 0.6594203114509583,
"eval_runtime": 1.7575,
"eval_samples_per_second": 5.69,
"eval_steps_per_second": 2.276,
"step": 1550
},
{
"epoch": 0.00268550548766262,
"grad_norm": 7.181281089782715,
"learning_rate": 4.986589256970985e-05,
"loss": 1.6529,
"step": 1600
},
{
"epoch": 0.00268550548766262,
"eval_loss": 1.5639550685882568,
"eval_masked_accuracy": 0.6905829310417175,
"eval_runtime": 1.7429,
"eval_samples_per_second": 5.738,
"eval_steps_per_second": 2.295,
"step": 1600
},
{
"epoch": 0.002769427534152077,
"grad_norm": 5.245086193084717,
"learning_rate": 4.986169646738538e-05,
"loss": 1.6497,
"step": 1650
},
{
"epoch": 0.002769427534152077,
"eval_loss": 1.4776060581207275,
"eval_masked_accuracy": 0.7312775254249573,
"eval_runtime": 1.7496,
"eval_samples_per_second": 5.715,
"eval_steps_per_second": 2.286,
"step": 1650
},
{
"epoch": 0.002853349580641534,
"grad_norm": 5.593554496765137,
"learning_rate": 4.98575003650609e-05,
"loss": 1.5326,
"step": 1700
},
{
"epoch": 0.002853349580641534,
"eval_loss": 2.0159146785736084,
"eval_masked_accuracy": 0.6205357313156128,
"eval_runtime": 1.7381,
"eval_samples_per_second": 5.753,
"eval_steps_per_second": 2.301,
"step": 1700
},
{
"epoch": 0.0029372716271309908,
"grad_norm": 7.406851768493652,
"learning_rate": 4.985330426273643e-05,
"loss": 1.5081,
"step": 1750
},
{
"epoch": 0.0029372716271309908,
"eval_loss": 1.508250117301941,
"eval_masked_accuracy": 0.7027027010917664,
"eval_runtime": 1.7474,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 2.289,
"step": 1750
},
{
"epoch": 0.0030211936736204473,
"grad_norm": 4.5133514404296875,
"learning_rate": 4.984910816041196e-05,
"loss": 1.6619,
"step": 1800
},
{
"epoch": 0.0030211936736204473,
"eval_loss": 1.7022559642791748,
"eval_masked_accuracy": 0.694779098033905,
"eval_runtime": 1.748,
"eval_samples_per_second": 5.721,
"eval_steps_per_second": 2.288,
"step": 1800
},
{
"epoch": 0.0031051157201099043,
"grad_norm": 7.173299312591553,
"learning_rate": 4.984491205808749e-05,
"loss": 1.7603,
"step": 1850
},
{
"epoch": 0.0031051157201099043,
"eval_loss": 1.6458946466445923,
"eval_masked_accuracy": 0.6636771559715271,
"eval_runtime": 1.8607,
"eval_samples_per_second": 5.374,
"eval_steps_per_second": 2.15,
"step": 1850
},
{
"epoch": 0.0031890377665993613,
"grad_norm": 4.3678975105285645,
"learning_rate": 4.984071595576301e-05,
"loss": 1.6453,
"step": 1900
},
{
"epoch": 0.0031890377665993613,
"eval_loss": 1.8176072835922241,
"eval_masked_accuracy": 0.6724137663841248,
"eval_runtime": 1.7589,
"eval_samples_per_second": 5.685,
"eval_steps_per_second": 2.274,
"step": 1900
},
{
"epoch": 0.0032729598130888183,
"grad_norm": 7.378585338592529,
"learning_rate": 4.983651985343854e-05,
"loss": 1.6409,
"step": 1950
},
{
"epoch": 0.0032729598130888183,
"eval_loss": 2.0491786003112793,
"eval_masked_accuracy": 0.6374502182006836,
"eval_runtime": 1.757,
"eval_samples_per_second": 5.692,
"eval_steps_per_second": 2.277,
"step": 1950
},
{
"epoch": 0.003356881859578275,
"grad_norm": 4.898635387420654,
"learning_rate": 4.9832323751114066e-05,
"loss": 1.6994,
"step": 2000
},
{
"epoch": 0.003356881859578275,
"eval_loss": 1.4773211479187012,
"eval_masked_accuracy": 0.6739130616188049,
"eval_runtime": 1.7439,
"eval_samples_per_second": 5.734,
"eval_steps_per_second": 2.294,
"step": 2000
},
{
"epoch": 0.003440803906067732,
"grad_norm": 7.465532302856445,
"learning_rate": 4.9828127648789594e-05,
"loss": 1.5798,
"step": 2050
},
{
"epoch": 0.003440803906067732,
"eval_loss": 1.6743123531341553,
"eval_masked_accuracy": 0.6770427823066711,
"eval_runtime": 1.7546,
"eval_samples_per_second": 5.699,
"eval_steps_per_second": 2.28,
"step": 2050
},
{
"epoch": 0.0035247259525571888,
"grad_norm": 7.025172233581543,
"learning_rate": 4.982393154646512e-05,
"loss": 1.7312,
"step": 2100
},
{
"epoch": 0.0035247259525571888,
"eval_loss": 1.726737380027771,
"eval_masked_accuracy": 0.6824034452438354,
"eval_runtime": 1.7272,
"eval_samples_per_second": 5.79,
"eval_steps_per_second": 2.316,
"step": 2100
},
{
"epoch": 0.0036086479990466457,
"grad_norm": 8.405756950378418,
"learning_rate": 4.9819735444140644e-05,
"loss": 1.7284,
"step": 2150
},
{
"epoch": 0.0036086479990466457,
"eval_loss": 1.8043725490570068,
"eval_masked_accuracy": 0.6153846383094788,
"eval_runtime": 1.7569,
"eval_samples_per_second": 5.692,
"eval_steps_per_second": 2.277,
"step": 2150
},
{
"epoch": 0.0036925700455361023,
"grad_norm": 6.279454231262207,
"learning_rate": 4.981553934181617e-05,
"loss": 1.4629,
"step": 2200
},
{
"epoch": 0.0036925700455361023,
"eval_loss": 1.8529506921768188,
"eval_masked_accuracy": 0.6823529601097107,
"eval_runtime": 1.7798,
"eval_samples_per_second": 5.619,
"eval_steps_per_second": 2.247,
"step": 2200
},
{
"epoch": 0.0037764920920255593,
"grad_norm": 7.525041580200195,
"learning_rate": 4.98113432394917e-05,
"loss": 1.5309,
"step": 2250
},
{
"epoch": 0.0037764920920255593,
"eval_loss": 1.8144168853759766,
"eval_masked_accuracy": 0.7272727489471436,
"eval_runtime": 1.7418,
"eval_samples_per_second": 5.741,
"eval_steps_per_second": 2.297,
"step": 2250
},
{
"epoch": 0.0038604141385150162,
"grad_norm": 6.561546802520752,
"learning_rate": 4.980714713716723e-05,
"loss": 1.6761,
"step": 2300
},
{
"epoch": 0.0038604141385150162,
"eval_loss": 1.8419244289398193,
"eval_masked_accuracy": 0.6638298034667969,
"eval_runtime": 1.7921,
"eval_samples_per_second": 5.58,
"eval_steps_per_second": 2.232,
"step": 2300
},
{
"epoch": 0.003944336185004473,
"grad_norm": 4.7332987785339355,
"learning_rate": 4.980295103484276e-05,
"loss": 1.6738,
"step": 2350
},
{
"epoch": 0.003944336185004473,
"eval_loss": 1.576249122619629,
"eval_masked_accuracy": 0.7078651785850525,
"eval_runtime": 1.7414,
"eval_samples_per_second": 5.742,
"eval_steps_per_second": 2.297,
"step": 2350
},
{
"epoch": 0.00402825823149393,
"grad_norm": 3.7719192504882812,
"learning_rate": 4.979875493251828e-05,
"loss": 1.6432,
"step": 2400
},
{
"epoch": 0.00402825823149393,
"eval_loss": 1.811785340309143,
"eval_masked_accuracy": 0.6746031641960144,
"eval_runtime": 1.7463,
"eval_samples_per_second": 5.726,
"eval_steps_per_second": 2.291,
"step": 2400
},
{
"epoch": 0.004112180277983387,
"grad_norm": 6.218490123748779,
"learning_rate": 4.979455883019381e-05,
"loss": 1.5416,
"step": 2450
},
{
"epoch": 0.004112180277983387,
"eval_loss": 1.6883758306503296,
"eval_masked_accuracy": 0.6900826692581177,
"eval_runtime": 1.7374,
"eval_samples_per_second": 5.756,
"eval_steps_per_second": 2.302,
"step": 2450
},
{
"epoch": 0.004196102324472844,
"grad_norm": 5.042550086975098,
"learning_rate": 4.9790362727869336e-05,
"loss": 1.6701,
"step": 2500
},
{
"epoch": 0.004196102324472844,
"eval_loss": 1.567375898361206,
"eval_masked_accuracy": 0.6788617968559265,
"eval_runtime": 1.735,
"eval_samples_per_second": 5.764,
"eval_steps_per_second": 2.306,
"step": 2500
},
{
"epoch": 0.0042800243709623,
"grad_norm": 6.665520668029785,
"learning_rate": 4.9786166625544865e-05,
"loss": 1.6006,
"step": 2550
},
{
"epoch": 0.0042800243709623,
"eval_loss": 1.659168004989624,
"eval_masked_accuracy": 0.6385542154312134,
"eval_runtime": 1.7434,
"eval_samples_per_second": 5.736,
"eval_steps_per_second": 2.294,
"step": 2550
},
{
"epoch": 0.004363946417451758,
"grad_norm": 4.378693580627441,
"learning_rate": 4.978197052322039e-05,
"loss": 1.6363,
"step": 2600
},
{
"epoch": 0.004363946417451758,
"eval_loss": 1.6367610692977905,
"eval_masked_accuracy": 0.6679389476776123,
"eval_runtime": 1.7407,
"eval_samples_per_second": 5.745,
"eval_steps_per_second": 2.298,
"step": 2600
},
{
"epoch": 0.004447868463941214,
"grad_norm": 8.087454795837402,
"learning_rate": 4.977777442089592e-05,
"loss": 1.5518,
"step": 2650
},
{
"epoch": 0.004447868463941214,
"eval_loss": 2.035369873046875,
"eval_masked_accuracy": 0.64462810754776,
"eval_runtime": 1.7443,
"eval_samples_per_second": 5.733,
"eval_steps_per_second": 2.293,
"step": 2650
},
{
"epoch": 0.004531790510430671,
"grad_norm": 6.383141040802002,
"learning_rate": 4.977357831857144e-05,
"loss": 1.7726,
"step": 2700
},
{
"epoch": 0.004531790510430671,
"eval_loss": 1.9139858484268188,
"eval_masked_accuracy": 0.7137096524238586,
"eval_runtime": 1.7376,
"eval_samples_per_second": 5.755,
"eval_steps_per_second": 2.302,
"step": 2700
},
{
"epoch": 0.004615712556920128,
"grad_norm": 8.098458290100098,
"learning_rate": 4.976938221624697e-05,
"loss": 1.701,
"step": 2750
},
{
"epoch": 0.004615712556920128,
"eval_loss": 1.8784687519073486,
"eval_masked_accuracy": 0.6525096297264099,
"eval_runtime": 1.8538,
"eval_samples_per_second": 5.394,
"eval_steps_per_second": 2.158,
"step": 2750
},
{
"epoch": 0.004699634603409585,
"grad_norm": 5.3736138343811035,
"learning_rate": 4.97651861139225e-05,
"loss": 1.5577,
"step": 2800
},
{
"epoch": 0.004699634603409585,
"eval_loss": 1.6022107601165771,
"eval_masked_accuracy": 0.6907630562782288,
"eval_runtime": 1.7368,
"eval_samples_per_second": 5.758,
"eval_steps_per_second": 2.303,
"step": 2800
},
{
"epoch": 0.004783556649899042,
"grad_norm": 4.617998123168945,
"learning_rate": 4.976099001159803e-05,
"loss": 1.6194,
"step": 2850
},
{
"epoch": 0.004783556649899042,
"eval_loss": 1.398147702217102,
"eval_masked_accuracy": 0.6696035265922546,
"eval_runtime": 1.7507,
"eval_samples_per_second": 5.712,
"eval_steps_per_second": 2.285,
"step": 2850
},
{
"epoch": 0.004867478696388499,
"grad_norm": 4.976247787475586,
"learning_rate": 4.975679390927356e-05,
"loss": 1.6325,
"step": 2900
},
{
"epoch": 0.004867478696388499,
"eval_loss": 1.7178815603256226,
"eval_masked_accuracy": 0.6653696298599243,
"eval_runtime": 1.7533,
"eval_samples_per_second": 5.703,
"eval_steps_per_second": 2.281,
"step": 2900
},
{
"epoch": 0.004951400742877955,
"grad_norm": 5.229081153869629,
"learning_rate": 4.9752597806949085e-05,
"loss": 1.7057,
"step": 2950
},
{
"epoch": 0.004951400742877955,
"eval_loss": 1.8161494731903076,
"eval_masked_accuracy": 0.6431535482406616,
"eval_runtime": 1.7382,
"eval_samples_per_second": 5.753,
"eval_steps_per_second": 2.301,
"step": 2950
},
{
"epoch": 0.005035322789367413,
"grad_norm": 6.112144947052002,
"learning_rate": 4.974840170462461e-05,
"loss": 1.6189,
"step": 3000
},
{
"epoch": 0.005035322789367413,
"eval_loss": 1.8454160690307617,
"eval_masked_accuracy": 0.6767241358757019,
"eval_runtime": 1.7376,
"eval_samples_per_second": 5.755,
"eval_steps_per_second": 2.302,
"step": 3000
},
{
"epoch": 0.005119244835856869,
"grad_norm": 5.7698445320129395,
"learning_rate": 4.9744205602300135e-05,
"loss": 1.6734,
"step": 3050
},
{
"epoch": 0.005119244835856869,
"eval_loss": 1.6155188083648682,
"eval_masked_accuracy": 0.6991525292396545,
"eval_runtime": 1.7469,
"eval_samples_per_second": 5.724,
"eval_steps_per_second": 2.29,
"step": 3050
},
{
"epoch": 0.005203166882346326,
"grad_norm": 11.4446382522583,
"learning_rate": 4.9740009499975664e-05,
"loss": 1.602,
"step": 3100
},
{
"epoch": 0.005203166882346326,
"eval_loss": 1.7193024158477783,
"eval_masked_accuracy": 0.6454545259475708,
"eval_runtime": 1.8085,
"eval_samples_per_second": 5.529,
"eval_steps_per_second": 2.212,
"step": 3100
},
{
"epoch": 0.005287088928835783,
"grad_norm": 4.331955432891846,
"learning_rate": 4.973581339765119e-05,
"loss": 1.5886,
"step": 3150
},
{
"epoch": 0.005287088928835783,
"eval_loss": 1.7239084243774414,
"eval_masked_accuracy": 0.7025862336158752,
"eval_runtime": 1.7507,
"eval_samples_per_second": 5.712,
"eval_steps_per_second": 2.285,
"step": 3150
},
{
"epoch": 0.00537101097532524,
"grad_norm": 6.857669830322266,
"learning_rate": 4.973161729532672e-05,
"loss": 1.6531,
"step": 3200
},
{
"epoch": 0.00537101097532524,
"eval_loss": 1.7898776531219482,
"eval_masked_accuracy": 0.6463878154754639,
"eval_runtime": 1.807,
"eval_samples_per_second": 5.534,
"eval_steps_per_second": 2.214,
"step": 3200
},
{
"epoch": 0.005454933021814697,
"grad_norm": 6.366724491119385,
"learning_rate": 4.972742119300224e-05,
"loss": 1.5112,
"step": 3250
},
{
"epoch": 0.005454933021814697,
"eval_loss": 1.68304443359375,
"eval_masked_accuracy": 0.6958174705505371,
"eval_runtime": 1.7544,
"eval_samples_per_second": 5.7,
"eval_steps_per_second": 2.28,
"step": 3250
},
{
"epoch": 0.005538855068304154,
"grad_norm": 5.657731056213379,
"learning_rate": 4.972322509067777e-05,
"loss": 1.5622,
"step": 3300
},
{
"epoch": 0.005538855068304154,
"eval_loss": 1.7854249477386475,
"eval_masked_accuracy": 0.6833333373069763,
"eval_runtime": 1.7977,
"eval_samples_per_second": 5.563,
"eval_steps_per_second": 2.225,
"step": 3300
},
{
"epoch": 0.00562277711479361,
"grad_norm": 4.501428127288818,
"learning_rate": 4.97190289883533e-05,
"loss": 1.5736,
"step": 3350
},
{
"epoch": 0.00562277711479361,
"eval_loss": 1.4276224374771118,
"eval_masked_accuracy": 0.7192118167877197,
"eval_runtime": 1.7643,
"eval_samples_per_second": 5.668,
"eval_steps_per_second": 2.267,
"step": 3350
},
{
"epoch": 0.005706699161283068,
"grad_norm": 6.436139106750488,
"learning_rate": 4.971483288602883e-05,
"loss": 1.5653,
"step": 3400
},
{
"epoch": 0.005706699161283068,
"eval_loss": 1.674355149269104,
"eval_masked_accuracy": 0.718367338180542,
"eval_runtime": 1.7482,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 3400
},
{
"epoch": 0.005790621207772524,
"grad_norm": 6.295548439025879,
"learning_rate": 4.9710636783704356e-05,
"loss": 1.5556,
"step": 3450
},
{
"epoch": 0.005790621207772524,
"eval_loss": 1.7501426935195923,
"eval_masked_accuracy": 0.7076271176338196,
"eval_runtime": 1.7554,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 3450
},
{
"epoch": 0.0058745432542619816,
"grad_norm": 5.733904838562012,
"learning_rate": 4.9706440681379884e-05,
"loss": 1.5164,
"step": 3500
},
{
"epoch": 0.0058745432542619816,
"eval_loss": 1.520179033279419,
"eval_masked_accuracy": 0.7203390002250671,
"eval_runtime": 1.7629,
"eval_samples_per_second": 5.672,
"eval_steps_per_second": 2.269,
"step": 3500
},
{
"epoch": 0.005958465300751438,
"grad_norm": 5.285616397857666,
"learning_rate": 4.9702244579055406e-05,
"loss": 1.6254,
"step": 3550
},
{
"epoch": 0.005958465300751438,
"eval_loss": 1.7321217060089111,
"eval_masked_accuracy": 0.6712962985038757,
"eval_runtime": 1.7429,
"eval_samples_per_second": 5.738,
"eval_steps_per_second": 2.295,
"step": 3550
},
{
"epoch": 0.006042387347240895,
"grad_norm": 5.386379241943359,
"learning_rate": 4.9698048476730934e-05,
"loss": 1.505,
"step": 3600
},
{
"epoch": 0.006042387347240895,
"eval_loss": 1.7810560464859009,
"eval_masked_accuracy": 0.654618501663208,
"eval_runtime": 1.7454,
"eval_samples_per_second": 5.729,
"eval_steps_per_second": 2.292,
"step": 3600
},
{
"epoch": 0.006126309393730352,
"grad_norm": 6.726806640625,
"learning_rate": 4.969385237440646e-05,
"loss": 1.5011,
"step": 3650
},
{
"epoch": 0.006126309393730352,
"eval_loss": 1.5794349908828735,
"eval_masked_accuracy": 0.6979591846466064,
"eval_runtime": 1.7721,
"eval_samples_per_second": 5.643,
"eval_steps_per_second": 2.257,
"step": 3650
},
{
"epoch": 0.006210231440219809,
"grad_norm": 7.159238815307617,
"learning_rate": 4.968965627208199e-05,
"loss": 1.6134,
"step": 3700
},
{
"epoch": 0.006210231440219809,
"eval_loss": 1.4294860363006592,
"eval_masked_accuracy": 0.7136752009391785,
"eval_runtime": 1.752,
"eval_samples_per_second": 5.708,
"eval_steps_per_second": 2.283,
"step": 3700
},
{
"epoch": 0.006294153486709265,
"grad_norm": 5.560455799102783,
"learning_rate": 4.968546016975752e-05,
"loss": 1.5097,
"step": 3750
},
{
"epoch": 0.006294153486709265,
"eval_loss": 1.9169464111328125,
"eval_masked_accuracy": 0.6929824352264404,
"eval_runtime": 1.7457,
"eval_samples_per_second": 5.728,
"eval_steps_per_second": 2.291,
"step": 3750
},
{
"epoch": 0.0063780755331987226,
"grad_norm": 5.439815998077393,
"learning_rate": 4.968126406743305e-05,
"loss": 1.6706,
"step": 3800
},
{
"epoch": 0.0063780755331987226,
"eval_loss": 1.622685194015503,
"eval_masked_accuracy": 0.6913580298423767,
"eval_runtime": 1.7518,
"eval_samples_per_second": 5.709,
"eval_steps_per_second": 2.283,
"step": 3800
},
{
"epoch": 0.006461997579688179,
"grad_norm": 4.242193698883057,
"learning_rate": 4.967706796510857e-05,
"loss": 1.5511,
"step": 3850
},
{
"epoch": 0.006461997579688179,
"eval_loss": 1.3621394634246826,
"eval_masked_accuracy": 0.7379912734031677,
"eval_runtime": 1.7356,
"eval_samples_per_second": 5.762,
"eval_steps_per_second": 2.305,
"step": 3850
},
{
"epoch": 0.0065459196261776365,
"grad_norm": 5.056567668914795,
"learning_rate": 4.96728718627841e-05,
"loss": 1.6108,
"step": 3900
},
{
"epoch": 0.0065459196261776365,
"eval_loss": 1.5381476879119873,
"eval_masked_accuracy": 0.7165354490280151,
"eval_runtime": 1.7342,
"eval_samples_per_second": 5.767,
"eval_steps_per_second": 2.307,
"step": 3900
},
{
"epoch": 0.006629841672667093,
"grad_norm": 5.566115856170654,
"learning_rate": 4.966867576045963e-05,
"loss": 1.5858,
"step": 3950
},
{
"epoch": 0.006629841672667093,
"eval_loss": 1.9895532131195068,
"eval_masked_accuracy": 0.6399999856948853,
"eval_runtime": 1.7417,
"eval_samples_per_second": 5.742,
"eval_steps_per_second": 2.297,
"step": 3950
},
{
"epoch": 0.00671376371915655,
"grad_norm": 7.843978404998779,
"learning_rate": 4.9664479658135155e-05,
"loss": 1.5999,
"step": 4000
},
{
"epoch": 0.00671376371915655,
"eval_loss": 1.589036464691162,
"eval_masked_accuracy": 0.6991525292396545,
"eval_runtime": 1.7452,
"eval_samples_per_second": 5.73,
"eval_steps_per_second": 2.292,
"step": 4000
},
{
"epoch": 0.006797685765646007,
"grad_norm": 6.600104331970215,
"learning_rate": 4.9660283555810683e-05,
"loss": 1.6444,
"step": 4050
},
{
"epoch": 0.006797685765646007,
"eval_loss": 1.590256929397583,
"eval_masked_accuracy": 0.6895161271095276,
"eval_runtime": 1.7389,
"eval_samples_per_second": 5.751,
"eval_steps_per_second": 2.3,
"step": 4050
},
{
"epoch": 0.006881607812135464,
"grad_norm": 6.0659589767456055,
"learning_rate": 4.965608745348621e-05,
"loss": 1.5554,
"step": 4100
},
{
"epoch": 0.006881607812135464,
"eval_loss": 1.8275972604751587,
"eval_masked_accuracy": 0.6558139324188232,
"eval_runtime": 1.7513,
"eval_samples_per_second": 5.71,
"eval_steps_per_second": 2.284,
"step": 4100
},
{
"epoch": 0.00696552985862492,
"grad_norm": 6.09676456451416,
"learning_rate": 4.9651891351161734e-05,
"loss": 1.7191,
"step": 4150
},
{
"epoch": 0.00696552985862492,
"eval_loss": 1.8767850399017334,
"eval_masked_accuracy": 0.6508620977401733,
"eval_runtime": 1.7474,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 2.289,
"step": 4150
},
{
"epoch": 0.0070494519051143775,
"grad_norm": 5.208311080932617,
"learning_rate": 4.964769524883726e-05,
"loss": 1.585,
"step": 4200
},
{
"epoch": 0.0070494519051143775,
"eval_loss": 1.3652145862579346,
"eval_masked_accuracy": 0.7037037014961243,
"eval_runtime": 1.7463,
"eval_samples_per_second": 5.726,
"eval_steps_per_second": 2.291,
"step": 4200
},
{
"epoch": 0.007133373951603834,
"grad_norm": 8.517348289489746,
"learning_rate": 4.964349914651279e-05,
"loss": 1.6888,
"step": 4250
},
{
"epoch": 0.007133373951603834,
"eval_loss": 1.347320318222046,
"eval_masked_accuracy": 0.7190082669258118,
"eval_runtime": 1.7446,
"eval_samples_per_second": 5.732,
"eval_steps_per_second": 2.293,
"step": 4250
},
{
"epoch": 0.0072172959980932915,
"grad_norm": 5.57391357421875,
"learning_rate": 4.963930304418832e-05,
"loss": 1.6351,
"step": 4300
},
{
"epoch": 0.0072172959980932915,
"eval_loss": 1.563398003578186,
"eval_masked_accuracy": 0.6952789425849915,
"eval_runtime": 1.7535,
"eval_samples_per_second": 5.703,
"eval_steps_per_second": 2.281,
"step": 4300
},
{
"epoch": 0.007301218044582748,
"grad_norm": 4.073302745819092,
"learning_rate": 4.963510694186385e-05,
"loss": 1.7031,
"step": 4350
},
{
"epoch": 0.007301218044582748,
"eval_loss": 1.7390921115875244,
"eval_masked_accuracy": 0.6963562965393066,
"eval_runtime": 1.8598,
"eval_samples_per_second": 5.377,
"eval_steps_per_second": 2.151,
"step": 4350
},
{
"epoch": 0.007385140091072205,
"grad_norm": 4.129016876220703,
"learning_rate": 4.963091083953937e-05,
"loss": 1.5611,
"step": 4400
},
{
"epoch": 0.007385140091072205,
"eval_loss": 1.7892725467681885,
"eval_masked_accuracy": 0.7015503644943237,
"eval_runtime": 1.7481,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 4400
},
{
"epoch": 0.007469062137561662,
"grad_norm": 8.45355224609375,
"learning_rate": 4.96267147372149e-05,
"loss": 1.679,
"step": 4450
},
{
"epoch": 0.007469062137561662,
"eval_loss": 1.8994945287704468,
"eval_masked_accuracy": 0.6711111068725586,
"eval_runtime": 1.748,
"eval_samples_per_second": 5.721,
"eval_steps_per_second": 2.288,
"step": 4450
},
{
"epoch": 0.0075529841840511185,
"grad_norm": 7.353001594543457,
"learning_rate": 4.9622602556936916e-05,
"loss": 1.5084,
"step": 4500
},
{
"epoch": 0.0075529841840511185,
"eval_loss": 1.6633514165878296,
"eval_masked_accuracy": 0.6792452931404114,
"eval_runtime": 1.7365,
"eval_samples_per_second": 5.759,
"eval_steps_per_second": 2.303,
"step": 4500
},
{
"epoch": 0.007636906230540575,
"grad_norm": 5.420140266418457,
"learning_rate": 4.9618406454612445e-05,
"loss": 1.6768,
"step": 4550
},
{
"epoch": 0.007636906230540575,
"eval_loss": 1.6823314428329468,
"eval_masked_accuracy": 0.700421929359436,
"eval_runtime": 1.7456,
"eval_samples_per_second": 5.729,
"eval_steps_per_second": 2.291,
"step": 4550
},
{
"epoch": 0.0077208282770300325,
"grad_norm": 5.6282572746276855,
"learning_rate": 4.961421035228797e-05,
"loss": 1.5346,
"step": 4600
},
{
"epoch": 0.0077208282770300325,
"eval_loss": 2.210347890853882,
"eval_masked_accuracy": 0.6339285969734192,
"eval_runtime": 1.7553,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 4600
},
{
"epoch": 0.007804750323519489,
"grad_norm": 7.358382701873779,
"learning_rate": 4.96100142499635e-05,
"loss": 1.6792,
"step": 4650
},
{
"epoch": 0.007804750323519489,
"eval_loss": 1.742630958557129,
"eval_masked_accuracy": 0.6728110313415527,
"eval_runtime": 1.7331,
"eval_samples_per_second": 5.77,
"eval_steps_per_second": 2.308,
"step": 4650
},
{
"epoch": 0.007888672370008946,
"grad_norm": 5.980144500732422,
"learning_rate": 4.960581814763902e-05,
"loss": 1.4871,
"step": 4700
},
{
"epoch": 0.007888672370008946,
"eval_loss": 1.4571318626403809,
"eval_masked_accuracy": 0.7166666388511658,
"eval_runtime": 1.7531,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.282,
"step": 4700
},
{
"epoch": 0.007972594416498403,
"grad_norm": 8.18883228302002,
"learning_rate": 4.960162204531455e-05,
"loss": 1.527,
"step": 4750
},
{
"epoch": 0.007972594416498403,
"eval_loss": 2.062413454055786,
"eval_masked_accuracy": 0.6695278882980347,
"eval_runtime": 1.748,
"eval_samples_per_second": 5.721,
"eval_steps_per_second": 2.288,
"step": 4750
},
{
"epoch": 0.00805651646298786,
"grad_norm": 4.835183143615723,
"learning_rate": 4.959742594299008e-05,
"loss": 1.591,
"step": 4800
},
{
"epoch": 0.00805651646298786,
"eval_loss": 1.690118432044983,
"eval_masked_accuracy": 0.7049180269241333,
"eval_runtime": 1.7383,
"eval_samples_per_second": 5.753,
"eval_steps_per_second": 2.301,
"step": 4800
},
{
"epoch": 0.008140438509477316,
"grad_norm": 5.039312362670898,
"learning_rate": 4.959322984066561e-05,
"loss": 1.5386,
"step": 4850
},
{
"epoch": 0.008140438509477316,
"eval_loss": 1.9135382175445557,
"eval_masked_accuracy": 0.6181818246841431,
"eval_runtime": 1.7445,
"eval_samples_per_second": 5.732,
"eval_steps_per_second": 2.293,
"step": 4850
},
{
"epoch": 0.008224360555966774,
"grad_norm": 6.3293890953063965,
"learning_rate": 4.958903373834114e-05,
"loss": 1.4752,
"step": 4900
},
{
"epoch": 0.008224360555966774,
"eval_loss": 1.6353566646575928,
"eval_masked_accuracy": 0.7319999933242798,
"eval_runtime": 1.8458,
"eval_samples_per_second": 5.418,
"eval_steps_per_second": 2.167,
"step": 4900
},
{
"epoch": 0.008308282602456231,
"grad_norm": 7.455787658691406,
"learning_rate": 4.958483763601666e-05,
"loss": 1.5304,
"step": 4950
},
{
"epoch": 0.008308282602456231,
"eval_loss": 1.8691352605819702,
"eval_masked_accuracy": 0.6653386354446411,
"eval_runtime": 1.7533,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.281,
"step": 4950
},
{
"epoch": 0.008392204648945687,
"grad_norm": 5.682205677032471,
"learning_rate": 4.958064153369219e-05,
"loss": 1.5945,
"step": 5000
},
{
"epoch": 0.008392204648945687,
"eval_loss": 1.6161428689956665,
"eval_masked_accuracy": 0.6964285969734192,
"eval_runtime": 1.7814,
"eval_samples_per_second": 5.614,
"eval_steps_per_second": 2.245,
"step": 5000
},
{
"epoch": 0.008476126695435144,
"grad_norm": 6.474329471588135,
"learning_rate": 4.9576445431367715e-05,
"loss": 1.8228,
"step": 5050
},
{
"epoch": 0.008476126695435144,
"eval_loss": 1.4911173582077026,
"eval_masked_accuracy": 0.71875,
"eval_runtime": 1.8052,
"eval_samples_per_second": 5.54,
"eval_steps_per_second": 2.216,
"step": 5050
},
{
"epoch": 0.0085600487419246,
"grad_norm": 4.493051052093506,
"learning_rate": 4.9572249329043244e-05,
"loss": 1.5526,
"step": 5100
},
{
"epoch": 0.0085600487419246,
"eval_loss": 1.4060901403427124,
"eval_masked_accuracy": 0.7131474018096924,
"eval_runtime": 1.8193,
"eval_samples_per_second": 5.497,
"eval_steps_per_second": 2.199,
"step": 5100
},
{
"epoch": 0.008643970788414057,
"grad_norm": 5.657381057739258,
"learning_rate": 4.956805322671877e-05,
"loss": 1.5743,
"step": 5150
},
{
"epoch": 0.008643970788414057,
"eval_loss": 1.7347627878189087,
"eval_masked_accuracy": 0.6392694115638733,
"eval_runtime": 1.7632,
"eval_samples_per_second": 5.671,
"eval_steps_per_second": 2.269,
"step": 5150
},
{
"epoch": 0.008727892834903515,
"grad_norm": 5.059664726257324,
"learning_rate": 4.9563941046440784e-05,
"loss": 1.5923,
"step": 5200
},
{
"epoch": 0.008727892834903515,
"eval_loss": 1.7108001708984375,
"eval_masked_accuracy": 0.6759999990463257,
"eval_runtime": 1.7312,
"eval_samples_per_second": 5.776,
"eval_steps_per_second": 2.311,
"step": 5200
},
{
"epoch": 0.008811814881392972,
"grad_norm": 6.256536483764648,
"learning_rate": 4.955974494411631e-05,
"loss": 1.5454,
"step": 5250
},
{
"epoch": 0.008811814881392972,
"eval_loss": 1.8423763513565063,
"eval_masked_accuracy": 0.6590909361839294,
"eval_runtime": 1.7323,
"eval_samples_per_second": 5.773,
"eval_steps_per_second": 2.309,
"step": 5250
},
{
"epoch": 0.008895736927882428,
"grad_norm": 6.45760440826416,
"learning_rate": 4.955554884179184e-05,
"loss": 1.5381,
"step": 5300
},
{
"epoch": 0.008895736927882428,
"eval_loss": 1.8820030689239502,
"eval_masked_accuracy": 0.6486486196517944,
"eval_runtime": 1.7529,
"eval_samples_per_second": 5.705,
"eval_steps_per_second": 2.282,
"step": 5300
},
{
"epoch": 0.008979658974371885,
"grad_norm": 7.668667793273926,
"learning_rate": 4.955135273946737e-05,
"loss": 1.6363,
"step": 5350
},
{
"epoch": 0.008979658974371885,
"eval_loss": 1.631400465965271,
"eval_masked_accuracy": 0.7160493731498718,
"eval_runtime": 1.7511,
"eval_samples_per_second": 5.711,
"eval_steps_per_second": 2.284,
"step": 5350
},
{
"epoch": 0.009063581020861342,
"grad_norm": 7.2050018310546875,
"learning_rate": 4.954715663714289e-05,
"loss": 1.5738,
"step": 5400
},
{
"epoch": 0.009063581020861342,
"eval_loss": 1.5917881727218628,
"eval_masked_accuracy": 0.7405857443809509,
"eval_runtime": 1.75,
"eval_samples_per_second": 5.714,
"eval_steps_per_second": 2.286,
"step": 5400
},
{
"epoch": 0.0091475030673508,
"grad_norm": 6.094969749450684,
"learning_rate": 4.954296053481842e-05,
"loss": 1.7321,
"step": 5450
},
{
"epoch": 0.0091475030673508,
"eval_loss": 1.5327577590942383,
"eval_masked_accuracy": 0.707317054271698,
"eval_runtime": 1.7423,
"eval_samples_per_second": 5.74,
"eval_steps_per_second": 2.296,
"step": 5450
},
{
"epoch": 0.009231425113840256,
"grad_norm": 8.869881629943848,
"learning_rate": 4.953876443249395e-05,
"loss": 1.5768,
"step": 5500
},
{
"epoch": 0.009231425113840256,
"eval_loss": 1.3501726388931274,
"eval_masked_accuracy": 0.7801724076271057,
"eval_runtime": 1.7732,
"eval_samples_per_second": 5.64,
"eval_steps_per_second": 2.256,
"step": 5500
},
{
"epoch": 0.009315347160329713,
"grad_norm": 4.408574104309082,
"learning_rate": 4.9534568330169476e-05,
"loss": 1.5802,
"step": 5550
},
{
"epoch": 0.009315347160329713,
"eval_loss": 1.7055152654647827,
"eval_masked_accuracy": 0.6707317233085632,
"eval_runtime": 1.7418,
"eval_samples_per_second": 5.741,
"eval_steps_per_second": 2.296,
"step": 5550
},
{
"epoch": 0.00939926920681917,
"grad_norm": 5.3869147300720215,
"learning_rate": 4.9530372227845e-05,
"loss": 1.5547,
"step": 5600
},
{
"epoch": 0.00939926920681917,
"eval_loss": 1.3663699626922607,
"eval_masked_accuracy": 0.6974790096282959,
"eval_runtime": 1.7338,
"eval_samples_per_second": 5.768,
"eval_steps_per_second": 2.307,
"step": 5600
},
{
"epoch": 0.009483191253308626,
"grad_norm": 4.417982578277588,
"learning_rate": 4.9526176125520526e-05,
"loss": 1.5658,
"step": 5650
},
{
"epoch": 0.009483191253308626,
"eval_loss": 1.6572059392929077,
"eval_masked_accuracy": 0.6520000100135803,
"eval_runtime": 1.7583,
"eval_samples_per_second": 5.687,
"eval_steps_per_second": 2.275,
"step": 5650
},
{
"epoch": 0.009567113299798084,
"grad_norm": 5.2137861251831055,
"learning_rate": 4.9521980023196055e-05,
"loss": 1.5929,
"step": 5700
},
{
"epoch": 0.009567113299798084,
"eval_loss": 1.4574190378189087,
"eval_masked_accuracy": 0.6694560647010803,
"eval_runtime": 1.7352,
"eval_samples_per_second": 5.763,
"eval_steps_per_second": 2.305,
"step": 5700
},
{
"epoch": 0.00965103534628754,
"grad_norm": 6.848864555358887,
"learning_rate": 4.951778392087158e-05,
"loss": 1.6008,
"step": 5750
},
{
"epoch": 0.00965103534628754,
"eval_loss": 2.133417844772339,
"eval_masked_accuracy": 0.6540084481239319,
"eval_runtime": 1.8568,
"eval_samples_per_second": 5.386,
"eval_steps_per_second": 2.154,
"step": 5750
},
{
"epoch": 0.009734957392776997,
"grad_norm": 3.9827840328216553,
"learning_rate": 4.9513587818547105e-05,
"loss": 1.5811,
"step": 5800
},
{
"epoch": 0.009734957392776997,
"eval_loss": 1.403198003768921,
"eval_masked_accuracy": 0.7085201740264893,
"eval_runtime": 1.749,
"eval_samples_per_second": 5.717,
"eval_steps_per_second": 2.287,
"step": 5800
},
{
"epoch": 0.009818879439266454,
"grad_norm": 4.541887283325195,
"learning_rate": 4.950939171622263e-05,
"loss": 1.558,
"step": 5850
},
{
"epoch": 0.009818879439266454,
"eval_loss": 1.4281632900238037,
"eval_masked_accuracy": 0.7195122241973877,
"eval_runtime": 1.7523,
"eval_samples_per_second": 5.707,
"eval_steps_per_second": 2.283,
"step": 5850
},
{
"epoch": 0.00990280148575591,
"grad_norm": 8.121429443359375,
"learning_rate": 4.950519561389816e-05,
"loss": 1.5583,
"step": 5900
},
{
"epoch": 0.00990280148575591,
"eval_loss": 1.608547568321228,
"eval_masked_accuracy": 0.6582278609275818,
"eval_runtime": 1.7405,
"eval_samples_per_second": 5.745,
"eval_steps_per_second": 2.298,
"step": 5900
},
{
"epoch": 0.009986723532245369,
"grad_norm": 4.750977039337158,
"learning_rate": 4.950099951157369e-05,
"loss": 1.5378,
"step": 5950
},
{
"epoch": 0.009986723532245369,
"eval_loss": 1.3912121057510376,
"eval_masked_accuracy": 0.701298713684082,
"eval_runtime": 1.7623,
"eval_samples_per_second": 5.674,
"eval_steps_per_second": 2.27,
"step": 5950
},
{
"epoch": 0.010070645578734825,
"grad_norm": 4.445640563964844,
"learning_rate": 4.949680340924922e-05,
"loss": 1.5063,
"step": 6000
},
{
"epoch": 0.010070645578734825,
"eval_loss": 1.6513465642929077,
"eval_masked_accuracy": 0.6796537041664124,
"eval_runtime": 1.7424,
"eval_samples_per_second": 5.739,
"eval_steps_per_second": 2.296,
"step": 6000
},
{
"epoch": 0.010154567625224282,
"grad_norm": 13.394184112548828,
"learning_rate": 4.949260730692475e-05,
"loss": 1.5155,
"step": 6050
},
{
"epoch": 0.010154567625224282,
"eval_loss": 1.5842430591583252,
"eval_masked_accuracy": 0.6853448152542114,
"eval_runtime": 1.7416,
"eval_samples_per_second": 5.742,
"eval_steps_per_second": 2.297,
"step": 6050
},
{
"epoch": 0.010238489671713738,
"grad_norm": 7.441386699676514,
"learning_rate": 4.948841120460027e-05,
"loss": 1.5009,
"step": 6100
},
{
"epoch": 0.010238489671713738,
"eval_loss": 1.512109637260437,
"eval_masked_accuracy": 0.6987447738647461,
"eval_runtime": 1.7546,
"eval_samples_per_second": 5.699,
"eval_steps_per_second": 2.28,
"step": 6100
},
{
"epoch": 0.010322411718203195,
"grad_norm": 6.1988749504089355,
"learning_rate": 4.94842151022758e-05,
"loss": 1.5567,
"step": 6150
},
{
"epoch": 0.010322411718203195,
"eval_loss": 1.5210555791854858,
"eval_masked_accuracy": 0.7109375,
"eval_runtime": 1.7524,
"eval_samples_per_second": 5.707,
"eval_steps_per_second": 2.283,
"step": 6150
},
{
"epoch": 0.010406333764692651,
"grad_norm": 4.782381057739258,
"learning_rate": 4.9480018999951325e-05,
"loss": 1.6125,
"step": 6200
},
{
"epoch": 0.010406333764692651,
"eval_loss": 1.6434142589569092,
"eval_masked_accuracy": 0.6638655662536621,
"eval_runtime": 1.7489,
"eval_samples_per_second": 5.718,
"eval_steps_per_second": 2.287,
"step": 6200
},
{
"epoch": 0.01049025581118211,
"grad_norm": 5.14832878112793,
"learning_rate": 4.9475822897626854e-05,
"loss": 1.6089,
"step": 6250
},
{
"epoch": 0.01049025581118211,
"eval_loss": 1.239379644393921,
"eval_masked_accuracy": 0.7427386045455933,
"eval_runtime": 1.7532,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.282,
"step": 6250
},
{
"epoch": 0.010574177857671566,
"grad_norm": 5.390649795532227,
"learning_rate": 4.947162679530238e-05,
"loss": 1.6357,
"step": 6300
},
{
"epoch": 0.010574177857671566,
"eval_loss": 1.5129663944244385,
"eval_masked_accuracy": 0.692307710647583,
"eval_runtime": 1.7523,
"eval_samples_per_second": 5.707,
"eval_steps_per_second": 2.283,
"step": 6300
},
{
"epoch": 0.010658099904161023,
"grad_norm": 4.3327412605285645,
"learning_rate": 4.9467430692977904e-05,
"loss": 1.5318,
"step": 6350
},
{
"epoch": 0.010658099904161023,
"eval_loss": 1.7716737985610962,
"eval_masked_accuracy": 0.6942148804664612,
"eval_runtime": 1.7284,
"eval_samples_per_second": 5.786,
"eval_steps_per_second": 2.314,
"step": 6350
},
{
"epoch": 0.01074202195065048,
"grad_norm": 5.145776271820068,
"learning_rate": 4.946323459065343e-05,
"loss": 1.6081,
"step": 6400
},
{
"epoch": 0.01074202195065048,
"eval_loss": 1.6661970615386963,
"eval_masked_accuracy": 0.6882591247558594,
"eval_runtime": 1.7486,
"eval_samples_per_second": 5.719,
"eval_steps_per_second": 2.288,
"step": 6400
},
{
"epoch": 0.010825943997139936,
"grad_norm": 5.037006855010986,
"learning_rate": 4.945903848832896e-05,
"loss": 1.5028,
"step": 6450
},
{
"epoch": 0.010825943997139936,
"eval_loss": 1.4679136276245117,
"eval_masked_accuracy": 0.714893639087677,
"eval_runtime": 1.7514,
"eval_samples_per_second": 5.71,
"eval_steps_per_second": 2.284,
"step": 6450
},
{
"epoch": 0.010909866043629394,
"grad_norm": 5.618253707885742,
"learning_rate": 4.945484238600449e-05,
"loss": 1.5477,
"step": 6500
},
{
"epoch": 0.010909866043629394,
"eval_loss": 1.6666347980499268,
"eval_masked_accuracy": 0.7094017267227173,
"eval_runtime": 1.7486,
"eval_samples_per_second": 5.719,
"eval_steps_per_second": 2.288,
"step": 6500
},
{
"epoch": 0.01099378809011885,
"grad_norm": 14.34435749053955,
"learning_rate": 4.945064628368002e-05,
"loss": 1.6291,
"step": 6550
},
{
"epoch": 0.01099378809011885,
"eval_loss": 1.8381481170654297,
"eval_masked_accuracy": 0.6547085046768188,
"eval_runtime": 1.7548,
"eval_samples_per_second": 5.699,
"eval_steps_per_second": 2.279,
"step": 6550
},
{
"epoch": 0.011077710136608307,
"grad_norm": 4.846654891967773,
"learning_rate": 4.9446450181355546e-05,
"loss": 1.6077,
"step": 6600
},
{
"epoch": 0.011077710136608307,
"eval_loss": 1.5568077564239502,
"eval_masked_accuracy": 0.6872428059577942,
"eval_runtime": 1.7324,
"eval_samples_per_second": 5.772,
"eval_steps_per_second": 2.309,
"step": 6600
},
{
"epoch": 0.011161632183097764,
"grad_norm": 5.304859161376953,
"learning_rate": 4.944225407903107e-05,
"loss": 1.5758,
"step": 6650
},
{
"epoch": 0.011161632183097764,
"eval_loss": 1.3110054731369019,
"eval_masked_accuracy": 0.7312775254249573,
"eval_runtime": 1.7439,
"eval_samples_per_second": 5.734,
"eval_steps_per_second": 2.294,
"step": 6650
},
{
"epoch": 0.01124555422958722,
"grad_norm": 6.187143802642822,
"learning_rate": 4.9438057976706596e-05,
"loss": 1.5817,
"step": 6700
},
{
"epoch": 0.01124555422958722,
"eval_loss": 1.7989356517791748,
"eval_masked_accuracy": 0.6666666865348816,
"eval_runtime": 1.754,
"eval_samples_per_second": 5.701,
"eval_steps_per_second": 2.28,
"step": 6700
},
{
"epoch": 0.011329476276076679,
"grad_norm": 5.595826148986816,
"learning_rate": 4.9433861874382124e-05,
"loss": 1.6367,
"step": 6750
},
{
"epoch": 0.011329476276076679,
"eval_loss": 1.7425569295883179,
"eval_masked_accuracy": 0.6583333611488342,
"eval_runtime": 1.7467,
"eval_samples_per_second": 5.725,
"eval_steps_per_second": 2.29,
"step": 6750
},
{
"epoch": 0.011413398322566135,
"grad_norm": 4.125125408172607,
"learning_rate": 4.942966577205765e-05,
"loss": 1.641,
"step": 6800
},
{
"epoch": 0.011413398322566135,
"eval_loss": 1.728715181350708,
"eval_masked_accuracy": 0.6652892827987671,
"eval_runtime": 1.772,
"eval_samples_per_second": 5.643,
"eval_steps_per_second": 2.257,
"step": 6800
},
{
"epoch": 0.011497320369055592,
"grad_norm": 6.3898844718933105,
"learning_rate": 4.942546966973318e-05,
"loss": 1.6574,
"step": 6850
},
{
"epoch": 0.011497320369055592,
"eval_loss": 1.8261781930923462,
"eval_masked_accuracy": 0.6752136945724487,
"eval_runtime": 1.7446,
"eval_samples_per_second": 5.732,
"eval_steps_per_second": 2.293,
"step": 6850
},
{
"epoch": 0.011581242415545048,
"grad_norm": 5.9191155433654785,
"learning_rate": 4.942127356740871e-05,
"loss": 1.5732,
"step": 6900
},
{
"epoch": 0.011581242415545048,
"eval_loss": 1.2290430068969727,
"eval_masked_accuracy": 0.7573221921920776,
"eval_runtime": 1.7438,
"eval_samples_per_second": 5.735,
"eval_steps_per_second": 2.294,
"step": 6900
},
{
"epoch": 0.011665164462034505,
"grad_norm": 5.910600185394287,
"learning_rate": 4.941707746508423e-05,
"loss": 1.5018,
"step": 6950
},
{
"epoch": 0.011665164462034505,
"eval_loss": 1.3011202812194824,
"eval_masked_accuracy": 0.746835470199585,
"eval_runtime": 1.739,
"eval_samples_per_second": 5.751,
"eval_steps_per_second": 2.3,
"step": 6950
},
{
"epoch": 0.011749086508523963,
"grad_norm": 7.273187637329102,
"learning_rate": 4.941288136275976e-05,
"loss": 1.6083,
"step": 7000
},
{
"epoch": 0.011749086508523963,
"eval_loss": 1.7945482730865479,
"eval_masked_accuracy": 0.6719367504119873,
"eval_runtime": 1.7495,
"eval_samples_per_second": 5.716,
"eval_steps_per_second": 2.286,
"step": 7000
},
{
"epoch": 0.01183300855501342,
"grad_norm": 5.980038642883301,
"learning_rate": 4.940868526043529e-05,
"loss": 1.7157,
"step": 7050
},
{
"epoch": 0.01183300855501342,
"eval_loss": 1.6633656024932861,
"eval_masked_accuracy": 0.6859503984451294,
"eval_runtime": 1.7603,
"eval_samples_per_second": 5.681,
"eval_steps_per_second": 2.272,
"step": 7050
},
{
"epoch": 0.011916930601502876,
"grad_norm": 4.222002029418945,
"learning_rate": 4.9404489158110817e-05,
"loss": 1.4124,
"step": 7100
},
{
"epoch": 0.011916930601502876,
"eval_loss": 1.7207615375518799,
"eval_masked_accuracy": 0.6793248653411865,
"eval_runtime": 1.753,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.282,
"step": 7100
},
{
"epoch": 0.012000852647992333,
"grad_norm": 8.79937744140625,
"learning_rate": 4.9400293055786345e-05,
"loss": 1.5698,
"step": 7150
},
{
"epoch": 0.012000852647992333,
"eval_loss": 1.5078874826431274,
"eval_masked_accuracy": 0.7276119589805603,
"eval_runtime": 1.866,
"eval_samples_per_second": 5.359,
"eval_steps_per_second": 2.144,
"step": 7150
},
{
"epoch": 0.01208477469448179,
"grad_norm": 6.331279754638672,
"learning_rate": 4.939609695346187e-05,
"loss": 1.5354,
"step": 7200
},
{
"epoch": 0.01208477469448179,
"eval_loss": 1.3983685970306396,
"eval_masked_accuracy": 0.7590909004211426,
"eval_runtime": 1.7632,
"eval_samples_per_second": 5.672,
"eval_steps_per_second": 2.269,
"step": 7200
},
{
"epoch": 0.012168696740971246,
"grad_norm": 4.12935733795166,
"learning_rate": 4.9391900851137395e-05,
"loss": 1.4778,
"step": 7250
},
{
"epoch": 0.012168696740971246,
"eval_loss": 1.7603422403335571,
"eval_masked_accuracy": 0.686956524848938,
"eval_runtime": 1.7504,
"eval_samples_per_second": 5.713,
"eval_steps_per_second": 2.285,
"step": 7250
},
{
"epoch": 0.012252618787460704,
"grad_norm": 5.025778293609619,
"learning_rate": 4.9387704748812923e-05,
"loss": 1.5175,
"step": 7300
},
{
"epoch": 0.012252618787460704,
"eval_loss": 1.7313247919082642,
"eval_masked_accuracy": 0.6872428059577942,
"eval_runtime": 1.745,
"eval_samples_per_second": 5.731,
"eval_steps_per_second": 2.292,
"step": 7300
},
{
"epoch": 0.01233654083395016,
"grad_norm": 9.704473495483398,
"learning_rate": 4.938350864648845e-05,
"loss": 1.4634,
"step": 7350
},
{
"epoch": 0.01233654083395016,
"eval_loss": 1.271333932876587,
"eval_masked_accuracy": 0.7397260069847107,
"eval_runtime": 1.7484,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 7350
},
{
"epoch": 0.012420462880439617,
"grad_norm": 6.080599308013916,
"learning_rate": 4.937931254416398e-05,
"loss": 1.5937,
"step": 7400
},
{
"epoch": 0.012420462880439617,
"eval_loss": 1.4850938320159912,
"eval_masked_accuracy": 0.7280701994895935,
"eval_runtime": 1.7517,
"eval_samples_per_second": 5.709,
"eval_steps_per_second": 2.284,
"step": 7400
},
{
"epoch": 0.012504384926929074,
"grad_norm": 3.824946880340576,
"learning_rate": 4.937511644183951e-05,
"loss": 1.6026,
"step": 7450
},
{
"epoch": 0.012504384926929074,
"eval_loss": 1.5267841815948486,
"eval_masked_accuracy": 0.7058823704719543,
"eval_runtime": 1.7438,
"eval_samples_per_second": 5.734,
"eval_steps_per_second": 2.294,
"step": 7450
},
{
"epoch": 0.01258830697341853,
"grad_norm": 4.5395989418029785,
"learning_rate": 4.937092033951503e-05,
"loss": 1.4575,
"step": 7500
},
{
"epoch": 0.01258830697341853,
"eval_loss": 1.4801056385040283,
"eval_masked_accuracy": 0.680672287940979,
"eval_runtime": 1.7409,
"eval_samples_per_second": 5.744,
"eval_steps_per_second": 2.298,
"step": 7500
},
{
"epoch": 0.012672229019907989,
"grad_norm": 6.853204250335693,
"learning_rate": 4.936672423719056e-05,
"loss": 1.4224,
"step": 7550
},
{
"epoch": 0.012672229019907989,
"eval_loss": 1.6892282962799072,
"eval_masked_accuracy": 0.6551724076271057,
"eval_runtime": 1.7414,
"eval_samples_per_second": 5.742,
"eval_steps_per_second": 2.297,
"step": 7550
},
{
"epoch": 0.012756151066397445,
"grad_norm": 5.53077507019043,
"learning_rate": 4.936252813486609e-05,
"loss": 1.6706,
"step": 7600
},
{
"epoch": 0.012756151066397445,
"eval_loss": 1.4235472679138184,
"eval_masked_accuracy": 0.7426160573959351,
"eval_runtime": 1.8082,
"eval_samples_per_second": 5.53,
"eval_steps_per_second": 2.212,
"step": 7600
},
{
"epoch": 0.012840073112886902,
"grad_norm": 4.5907087326049805,
"learning_rate": 4.9358332032541616e-05,
"loss": 1.6674,
"step": 7650
},
{
"epoch": 0.012840073112886902,
"eval_loss": 1.4942524433135986,
"eval_masked_accuracy": 0.7172995805740356,
"eval_runtime": 1.7449,
"eval_samples_per_second": 5.731,
"eval_steps_per_second": 2.292,
"step": 7650
},
{
"epoch": 0.012923995159376358,
"grad_norm": 8.004353523254395,
"learning_rate": 4.9354135930217144e-05,
"loss": 1.4294,
"step": 7700
},
{
"epoch": 0.012923995159376358,
"eval_loss": 1.7548024654388428,
"eval_masked_accuracy": 0.6547619104385376,
"eval_runtime": 1.7767,
"eval_samples_per_second": 5.628,
"eval_steps_per_second": 2.251,
"step": 7700
},
{
"epoch": 0.013007917205865815,
"grad_norm": 6.963031768798828,
"learning_rate": 4.934993982789267e-05,
"loss": 1.5078,
"step": 7750
},
{
"epoch": 0.013007917205865815,
"eval_loss": 1.4269187450408936,
"eval_masked_accuracy": 0.7027027010917664,
"eval_runtime": 1.7471,
"eval_samples_per_second": 5.724,
"eval_steps_per_second": 2.29,
"step": 7750
},
{
"epoch": 0.013091839252355273,
"grad_norm": 6.4043288230896,
"learning_rate": 4.9345743725568194e-05,
"loss": 1.604,
"step": 7800
},
{
"epoch": 0.013091839252355273,
"eval_loss": 1.4502145051956177,
"eval_masked_accuracy": 0.7172995805740356,
"eval_runtime": 1.748,
"eval_samples_per_second": 5.721,
"eval_steps_per_second": 2.288,
"step": 7800
},
{
"epoch": 0.01317576129884473,
"grad_norm": 5.293691158294678,
"learning_rate": 4.934154762324372e-05,
"loss": 1.6301,
"step": 7850
},
{
"epoch": 0.01317576129884473,
"eval_loss": 1.3547624349594116,
"eval_masked_accuracy": 0.7759336233139038,
"eval_runtime": 1.7437,
"eval_samples_per_second": 5.735,
"eval_steps_per_second": 2.294,
"step": 7850
},
{
"epoch": 0.013259683345334186,
"grad_norm": 7.364100933074951,
"learning_rate": 4.933735152091925e-05,
"loss": 1.5163,
"step": 7900
},
{
"epoch": 0.013259683345334186,
"eval_loss": 1.6089417934417725,
"eval_masked_accuracy": 0.6610878705978394,
"eval_runtime": 1.753,
"eval_samples_per_second": 5.704,
"eval_steps_per_second": 2.282,
"step": 7900
},
{
"epoch": 0.013343605391823643,
"grad_norm": 7.704033851623535,
"learning_rate": 4.933315541859478e-05,
"loss": 1.6564,
"step": 7950
},
{
"epoch": 0.013343605391823643,
"eval_loss": 1.4759953022003174,
"eval_masked_accuracy": 0.6958333253860474,
"eval_runtime": 1.7614,
"eval_samples_per_second": 5.677,
"eval_steps_per_second": 2.271,
"step": 7950
},
{
"epoch": 0.0134275274383131,
"grad_norm": 5.562460899353027,
"learning_rate": 4.932895931627031e-05,
"loss": 1.5703,
"step": 8000
},
{
"epoch": 0.0134275274383131,
"eval_loss": 1.735896348953247,
"eval_masked_accuracy": 0.6875,
"eval_runtime": 1.7493,
"eval_samples_per_second": 5.717,
"eval_steps_per_second": 2.287,
"step": 8000
},
{
"epoch": 0.013511449484802556,
"grad_norm": 8.801225662231445,
"learning_rate": 4.9324763213945836e-05,
"loss": 1.5328,
"step": 8050
},
{
"epoch": 0.013511449484802556,
"eval_loss": 1.2792503833770752,
"eval_masked_accuracy": 0.7292576432228088,
"eval_runtime": 1.7802,
"eval_samples_per_second": 5.617,
"eval_steps_per_second": 2.247,
"step": 8050
},
{
"epoch": 0.013595371531292014,
"grad_norm": 5.510076999664307,
"learning_rate": 4.932056711162136e-05,
"loss": 1.5086,
"step": 8100
},
{
"epoch": 0.013595371531292014,
"eval_loss": 1.811342477798462,
"eval_masked_accuracy": 0.6508620977401733,
"eval_runtime": 1.7772,
"eval_samples_per_second": 5.627,
"eval_steps_per_second": 2.251,
"step": 8100
},
{
"epoch": 0.01367929357778147,
"grad_norm": 4.370019912719727,
"learning_rate": 4.9316371009296886e-05,
"loss": 1.5992,
"step": 8150
},
{
"epoch": 0.01367929357778147,
"eval_loss": 1.7015224695205688,
"eval_masked_accuracy": 0.6945606470108032,
"eval_runtime": 1.7399,
"eval_samples_per_second": 5.747,
"eval_steps_per_second": 2.299,
"step": 8150
},
{
"epoch": 0.013763215624270927,
"grad_norm": 5.960280895233154,
"learning_rate": 4.9312174906972415e-05,
"loss": 1.6392,
"step": 8200
},
{
"epoch": 0.013763215624270927,
"eval_loss": 1.4644631147384644,
"eval_masked_accuracy": 0.7004830837249756,
"eval_runtime": 1.7493,
"eval_samples_per_second": 5.717,
"eval_steps_per_second": 2.287,
"step": 8200
},
{
"epoch": 0.013847137670760384,
"grad_norm": 5.401033878326416,
"learning_rate": 4.930797880464794e-05,
"loss": 1.6492,
"step": 8250
},
{
"epoch": 0.013847137670760384,
"eval_loss": 1.5244245529174805,
"eval_masked_accuracy": 0.688034176826477,
"eval_runtime": 1.7597,
"eval_samples_per_second": 5.683,
"eval_steps_per_second": 2.273,
"step": 8250
},
{
"epoch": 0.01393105971724984,
"grad_norm": 7.356916427612305,
"learning_rate": 4.930378270232347e-05,
"loss": 1.5673,
"step": 8300
},
{
"epoch": 0.01393105971724984,
"eval_loss": 1.4024368524551392,
"eval_masked_accuracy": 0.7016806602478027,
"eval_runtime": 1.7463,
"eval_samples_per_second": 5.726,
"eval_steps_per_second": 2.291,
"step": 8300
},
{
"epoch": 0.014014981763739299,
"grad_norm": 5.370472431182861,
"learning_rate": 4.929958659999899e-05,
"loss": 1.5267,
"step": 8350
},
{
"epoch": 0.014014981763739299,
"eval_loss": 1.7430174350738525,
"eval_masked_accuracy": 0.6653061509132385,
"eval_runtime": 1.7353,
"eval_samples_per_second": 5.763,
"eval_steps_per_second": 2.305,
"step": 8350
},
{
"epoch": 0.014098903810228755,
"grad_norm": 6.4656500816345215,
"learning_rate": 4.929539049767452e-05,
"loss": 1.5918,
"step": 8400
},
{
"epoch": 0.014098903810228755,
"eval_loss": 1.691054344177246,
"eval_masked_accuracy": 0.6849315166473389,
"eval_runtime": 1.746,
"eval_samples_per_second": 5.727,
"eval_steps_per_second": 2.291,
"step": 8400
},
{
"epoch": 0.014182825856718212,
"grad_norm": 5.481358051300049,
"learning_rate": 4.929119439535005e-05,
"loss": 1.5156,
"step": 8450
},
{
"epoch": 0.014182825856718212,
"eval_loss": 1.6469824314117432,
"eval_masked_accuracy": 0.6516393423080444,
"eval_runtime": 1.7423,
"eval_samples_per_second": 5.74,
"eval_steps_per_second": 2.296,
"step": 8450
},
{
"epoch": 0.014266747903207668,
"grad_norm": 4.755044937133789,
"learning_rate": 4.928699829302558e-05,
"loss": 1.5223,
"step": 8500
},
{
"epoch": 0.014266747903207668,
"eval_loss": 1.667824387550354,
"eval_masked_accuracy": 0.7068965435028076,
"eval_runtime": 1.7979,
"eval_samples_per_second": 5.562,
"eval_steps_per_second": 2.225,
"step": 8500
},
{
"epoch": 0.014350669949697125,
"grad_norm": 6.595943450927734,
"learning_rate": 4.928280219070111e-05,
"loss": 1.4699,
"step": 8550
},
{
"epoch": 0.014350669949697125,
"eval_loss": 1.2367641925811768,
"eval_masked_accuracy": 0.7447698712348938,
"eval_runtime": 1.7387,
"eval_samples_per_second": 5.752,
"eval_steps_per_second": 2.301,
"step": 8550
},
{
"epoch": 0.014434591996186583,
"grad_norm": 3.9210710525512695,
"learning_rate": 4.9278606088376635e-05,
"loss": 1.5695,
"step": 8600
},
{
"epoch": 0.014434591996186583,
"eval_loss": 1.3033006191253662,
"eval_masked_accuracy": 0.693965494632721,
"eval_runtime": 1.7554,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 8600
},
{
"epoch": 0.01451851404267604,
"grad_norm": 4.682461261749268,
"learning_rate": 4.927440998605216e-05,
"loss": 1.5371,
"step": 8650
},
{
"epoch": 0.01451851404267604,
"eval_loss": 1.727216124534607,
"eval_masked_accuracy": 0.6639004349708557,
"eval_runtime": 1.7387,
"eval_samples_per_second": 5.751,
"eval_steps_per_second": 2.301,
"step": 8650
},
{
"epoch": 0.014602436089165496,
"grad_norm": 4.478100776672363,
"learning_rate": 4.9270213883727685e-05,
"loss": 1.5679,
"step": 8700
},
{
"epoch": 0.014602436089165496,
"eval_loss": 1.4694969654083252,
"eval_masked_accuracy": 0.7364016771316528,
"eval_runtime": 1.7474,
"eval_samples_per_second": 5.723,
"eval_steps_per_second": 2.289,
"step": 8700
},
{
"epoch": 0.014686358135654953,
"grad_norm": 8.149710655212402,
"learning_rate": 4.9266017781403214e-05,
"loss": 1.4814,
"step": 8750
},
{
"epoch": 0.014686358135654953,
"eval_loss": 1.9258610010147095,
"eval_masked_accuracy": 0.6228070259094238,
"eval_runtime": 1.7513,
"eval_samples_per_second": 5.71,
"eval_steps_per_second": 2.284,
"step": 8750
},
{
"epoch": 0.01477028018214441,
"grad_norm": 4.727016925811768,
"learning_rate": 4.926182167907874e-05,
"loss": 1.609,
"step": 8800
},
{
"epoch": 0.01477028018214441,
"eval_loss": 1.6111774444580078,
"eval_masked_accuracy": 0.6590038537979126,
"eval_runtime": 1.7579,
"eval_samples_per_second": 5.689,
"eval_steps_per_second": 2.275,
"step": 8800
},
{
"epoch": 0.014854202228633867,
"grad_norm": 5.348945140838623,
"learning_rate": 4.925762557675427e-05,
"loss": 1.5557,
"step": 8850
},
{
"epoch": 0.014854202228633867,
"eval_loss": 1.3535053730010986,
"eval_masked_accuracy": 0.7245762944221497,
"eval_runtime": 1.8639,
"eval_samples_per_second": 5.365,
"eval_steps_per_second": 2.146,
"step": 8850
},
{
"epoch": 0.014938124275123324,
"grad_norm": 6.573589324951172,
"learning_rate": 4.92534294744298e-05,
"loss": 1.6389,
"step": 8900
},
{
"epoch": 0.014938124275123324,
"eval_loss": 1.8509418964385986,
"eval_masked_accuracy": 0.7085201740264893,
"eval_runtime": 1.7536,
"eval_samples_per_second": 5.703,
"eval_steps_per_second": 2.281,
"step": 8900
},
{
"epoch": 0.01502204632161278,
"grad_norm": 7.373574256896973,
"learning_rate": 4.924923337210532e-05,
"loss": 1.4773,
"step": 8950
},
{
"epoch": 0.01502204632161278,
"eval_loss": 1.7772554159164429,
"eval_masked_accuracy": 0.6640625,
"eval_runtime": 1.7655,
"eval_samples_per_second": 5.664,
"eval_steps_per_second": 2.266,
"step": 8950
},
{
"epoch": 0.015105968368102237,
"grad_norm": 5.861003875732422,
"learning_rate": 4.924503726978085e-05,
"loss": 1.3842,
"step": 9000
},
{
"epoch": 0.015105968368102237,
"eval_loss": 1.6182334423065186,
"eval_masked_accuracy": 0.7183098793029785,
"eval_runtime": 1.7386,
"eval_samples_per_second": 5.752,
"eval_steps_per_second": 2.301,
"step": 9000
},
{
"epoch": 0.015189890414591694,
"grad_norm": 5.086306571960449,
"learning_rate": 4.924084116745638e-05,
"loss": 1.6445,
"step": 9050
},
{
"epoch": 0.015189890414591694,
"eval_loss": 1.3457679748535156,
"eval_masked_accuracy": 0.752293586730957,
"eval_runtime": 1.7595,
"eval_samples_per_second": 5.684,
"eval_steps_per_second": 2.273,
"step": 9050
},
{
"epoch": 0.01527381246108115,
"grad_norm": 7.099021911621094,
"learning_rate": 4.9236645065131906e-05,
"loss": 1.5536,
"step": 9100
},
{
"epoch": 0.01527381246108115,
"eval_loss": 1.8317623138427734,
"eval_masked_accuracy": 0.6588628888130188,
"eval_runtime": 1.8424,
"eval_samples_per_second": 5.428,
"eval_steps_per_second": 2.171,
"step": 9100
},
{
"epoch": 0.015357734507570608,
"grad_norm": 6.620283126831055,
"learning_rate": 4.9232448962807434e-05,
"loss": 1.5151,
"step": 9150
},
{
"epoch": 0.015357734507570608,
"eval_loss": 1.4230843782424927,
"eval_masked_accuracy": 0.700421929359436,
"eval_runtime": 1.7611,
"eval_samples_per_second": 5.678,
"eval_steps_per_second": 2.271,
"step": 9150
},
{
"epoch": 0.015441656554060065,
"grad_norm": 7.231357097625732,
"learning_rate": 4.922825286048296e-05,
"loss": 1.6078,
"step": 9200
},
{
"epoch": 0.015441656554060065,
"eval_loss": 1.7547998428344727,
"eval_masked_accuracy": 0.6745283007621765,
"eval_runtime": 1.8328,
"eval_samples_per_second": 5.456,
"eval_steps_per_second": 2.182,
"step": 9200
},
{
"epoch": 0.015525578600549522,
"grad_norm": 4.755532264709473,
"learning_rate": 4.9224140680204975e-05,
"loss": 1.5938,
"step": 9250
},
{
"epoch": 0.015525578600549522,
"eval_loss": 1.3346257209777832,
"eval_masked_accuracy": 0.7244444489479065,
"eval_runtime": 1.7553,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 9250
},
{
"epoch": 0.015609500647038978,
"grad_norm": 5.728196620941162,
"learning_rate": 4.92199445778805e-05,
"loss": 1.5542,
"step": 9300
},
{
"epoch": 0.015609500647038978,
"eval_loss": 1.6833394765853882,
"eval_masked_accuracy": 0.6654929518699646,
"eval_runtime": 1.7516,
"eval_samples_per_second": 5.709,
"eval_steps_per_second": 2.284,
"step": 9300
},
{
"epoch": 0.015693422693528435,
"grad_norm": 5.66224479675293,
"learning_rate": 4.921574847555603e-05,
"loss": 1.6099,
"step": 9350
},
{
"epoch": 0.015693422693528435,
"eval_loss": 1.442452311515808,
"eval_masked_accuracy": 0.6905829310417175,
"eval_runtime": 1.7553,
"eval_samples_per_second": 5.697,
"eval_steps_per_second": 2.279,
"step": 9350
},
{
"epoch": 0.015777344740017893,
"grad_norm": 6.560795307159424,
"learning_rate": 4.921155237323155e-05,
"loss": 1.4188,
"step": 9400
},
{
"epoch": 0.015777344740017893,
"eval_loss": 1.539738416671753,
"eval_masked_accuracy": 0.68359375,
"eval_runtime": 1.7406,
"eval_samples_per_second": 5.745,
"eval_steps_per_second": 2.298,
"step": 9400
},
{
"epoch": 0.015861266786507348,
"grad_norm": 4.9847025871276855,
"learning_rate": 4.920735627090708e-05,
"loss": 1.6344,
"step": 9450
},
{
"epoch": 0.015861266786507348,
"eval_loss": 1.244769811630249,
"eval_masked_accuracy": 0.7078189253807068,
"eval_runtime": 1.77,
"eval_samples_per_second": 5.65,
"eval_steps_per_second": 2.26,
"step": 9450
},
{
"epoch": 0.015945188832996806,
"grad_norm": 6.173788070678711,
"learning_rate": 4.920316016858261e-05,
"loss": 1.6249,
"step": 9500
},
{
"epoch": 0.015945188832996806,
"eval_loss": 2.0483577251434326,
"eval_masked_accuracy": 0.607594907283783,
"eval_runtime": 1.7538,
"eval_samples_per_second": 5.702,
"eval_steps_per_second": 2.281,
"step": 9500
},
{
"epoch": 0.016029110879486264,
"grad_norm": 4.4076828956604,
"learning_rate": 4.919896406625814e-05,
"loss": 1.505,
"step": 9550
},
{
"epoch": 0.016029110879486264,
"eval_loss": 1.7403160333633423,
"eval_masked_accuracy": 0.7048457860946655,
"eval_runtime": 1.7491,
"eval_samples_per_second": 5.717,
"eval_steps_per_second": 2.287,
"step": 9550
},
{
"epoch": 0.01611303292597572,
"grad_norm": 6.358312129974365,
"learning_rate": 4.919476796393366e-05,
"loss": 1.655,
"step": 9600
},
{
"epoch": 0.01611303292597572,
"eval_loss": 1.8444688320159912,
"eval_masked_accuracy": 0.6808510422706604,
"eval_runtime": 1.7573,
"eval_samples_per_second": 5.691,
"eval_steps_per_second": 2.276,
"step": 9600
},
{
"epoch": 0.016196954972465177,
"grad_norm": 6.645698547363281,
"learning_rate": 4.919057186160919e-05,
"loss": 1.5926,
"step": 9650
},
{
"epoch": 0.016196954972465177,
"eval_loss": 1.6228317022323608,
"eval_masked_accuracy": 0.65625,
"eval_runtime": 1.8422,
"eval_samples_per_second": 5.428,
"eval_steps_per_second": 2.171,
"step": 9650
},
{
"epoch": 0.016280877018954632,
"grad_norm": 5.672697067260742,
"learning_rate": 4.918637575928472e-05,
"loss": 1.4762,
"step": 9700
},
{
"epoch": 0.016280877018954632,
"eval_loss": 1.5051512718200684,
"eval_masked_accuracy": 0.6943231225013733,
"eval_runtime": 1.7515,
"eval_samples_per_second": 5.709,
"eval_steps_per_second": 2.284,
"step": 9700
},
{
"epoch": 0.01636479906544409,
"grad_norm": 5.369190216064453,
"learning_rate": 4.9182179656960245e-05,
"loss": 1.5021,
"step": 9750
},
{
"epoch": 0.01636479906544409,
"eval_loss": 1.7301708459854126,
"eval_masked_accuracy": 0.6593886613845825,
"eval_runtime": 1.7374,
"eval_samples_per_second": 5.756,
"eval_steps_per_second": 2.302,
"step": 9750
},
{
"epoch": 0.01644872111193355,
"grad_norm": 4.986740589141846,
"learning_rate": 4.917798355463577e-05,
"loss": 1.5618,
"step": 9800
},
{
"epoch": 0.01644872111193355,
"eval_loss": 1.3315510749816895,
"eval_masked_accuracy": 0.700421929359436,
"eval_runtime": 1.7373,
"eval_samples_per_second": 5.756,
"eval_steps_per_second": 2.302,
"step": 9800
},
{
"epoch": 0.016532643158423004,
"grad_norm": 7.441061973571777,
"learning_rate": 4.9173787452311295e-05,
"loss": 1.5428,
"step": 9850
},
{
"epoch": 0.016532643158423004,
"eval_loss": 1.6381117105484009,
"eval_masked_accuracy": 0.6695652008056641,
"eval_runtime": 1.7386,
"eval_samples_per_second": 5.752,
"eval_steps_per_second": 2.301,
"step": 9850
},
{
"epoch": 0.016616565204912462,
"grad_norm": 6.459640979766846,
"learning_rate": 4.9169591349986824e-05,
"loss": 1.4702,
"step": 9900
},
{
"epoch": 0.016616565204912462,
"eval_loss": 1.537841796875,
"eval_masked_accuracy": 0.6741573214530945,
"eval_runtime": 1.7482,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 9900
},
{
"epoch": 0.016700487251401917,
"grad_norm": 6.058482646942139,
"learning_rate": 4.916539524766235e-05,
"loss": 1.5765,
"step": 9950
},
{
"epoch": 0.016700487251401917,
"eval_loss": 1.688913345336914,
"eval_masked_accuracy": 0.692307710647583,
"eval_runtime": 1.7482,
"eval_samples_per_second": 5.72,
"eval_steps_per_second": 2.288,
"step": 9950
},
{
"epoch": 0.016784409297891375,
"grad_norm": 4.960835933685303,
"learning_rate": 4.916119914533788e-05,
"loss": 1.544,
"step": 10000
},
{
"epoch": 0.016784409297891375,
"eval_loss": 1.7901655435562134,
"eval_masked_accuracy": 0.6443514823913574,
"eval_runtime": 1.7882,
"eval_samples_per_second": 5.592,
"eval_steps_per_second": 2.237,
"step": 10000
}
],
"logging_steps": 50,
"max_steps": 595791,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.617791736784392e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}