{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.016784409297891375, "eval_steps": 50, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.392204648945687e-05, "grad_norm": 3.729597806930542, "learning_rate": 4.999588781972202e-05, "loss": 1.6987, "step": 50 }, { "epoch": 8.392204648945687e-05, "eval_loss": 1.8344682455062866, "eval_masked_accuracy": 0.6726457476615906, "eval_runtime": 1.754, "eval_samples_per_second": 5.701, "eval_steps_per_second": 2.281, "step": 50 }, { "epoch": 0.00016784409297891374, "grad_norm": 6.472073078155518, "learning_rate": 4.999169171739755e-05, "loss": 1.7415, "step": 100 }, { "epoch": 0.00016784409297891374, "eval_loss": 1.7104336023330688, "eval_masked_accuracy": 0.6737288236618042, "eval_runtime": 1.7376, "eval_samples_per_second": 5.755, "eval_steps_per_second": 2.302, "step": 100 }, { "epoch": 0.0002517661394683706, "grad_norm": 5.799453258514404, "learning_rate": 4.998749561507307e-05, "loss": 1.736, "step": 150 }, { "epoch": 0.0002517661394683706, "eval_loss": 1.877158761024475, "eval_masked_accuracy": 0.6936936974525452, "eval_runtime": 1.7454, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.292, "step": 150 }, { "epoch": 0.0003356881859578275, "grad_norm": 9.896933555603027, "learning_rate": 4.99832995127486e-05, "loss": 1.7919, "step": 200 }, { "epoch": 0.0003356881859578275, "eval_loss": 1.424491047859192, "eval_masked_accuracy": 0.7206477522850037, "eval_runtime": 1.7315, "eval_samples_per_second": 5.775, "eval_steps_per_second": 2.31, "step": 200 }, { "epoch": 0.00041961023244728435, "grad_norm": 4.745198726654053, "learning_rate": 4.997910341042413e-05, "loss": 1.7252, "step": 250 }, { "epoch": 0.00041961023244728435, "eval_loss": 1.915906310081482, "eval_masked_accuracy": 0.6486486196517944, "eval_runtime": 1.7431, "eval_samples_per_second": 5.737, "eval_steps_per_second": 2.295, "step": 250 }, { "epoch": 0.0005035322789367412, "grad_norm": 6.004683971405029, "learning_rate": 4.9974907308099657e-05, "loss": 1.7487, "step": 300 }, { "epoch": 0.0005035322789367412, "eval_loss": 1.7426478862762451, "eval_masked_accuracy": 0.6846473217010498, "eval_runtime": 1.7474, "eval_samples_per_second": 5.723, "eval_steps_per_second": 2.289, "step": 300 }, { "epoch": 0.0005874543254261981, "grad_norm": 8.232338905334473, "learning_rate": 4.9970711205775185e-05, "loss": 1.6958, "step": 350 }, { "epoch": 0.0005874543254261981, "eval_loss": 1.8806991577148438, "eval_masked_accuracy": 0.6256157755851746, "eval_runtime": 1.7452, "eval_samples_per_second": 5.73, "eval_steps_per_second": 2.292, "step": 350 }, { "epoch": 0.000671376371915655, "grad_norm": 8.929485321044922, "learning_rate": 4.996651510345071e-05, "loss": 1.7165, "step": 400 }, { "epoch": 0.000671376371915655, "eval_loss": 1.6669635772705078, "eval_masked_accuracy": 0.6816326379776001, "eval_runtime": 1.7367, "eval_samples_per_second": 5.758, "eval_steps_per_second": 2.303, "step": 400 }, { "epoch": 0.0007552984184051118, "grad_norm": 6.171640872955322, "learning_rate": 4.9962402923172725e-05, "loss": 1.6222, "step": 450 }, { "epoch": 0.0007552984184051118, "eval_loss": 2.174530506134033, "eval_masked_accuracy": 0.6891891956329346, "eval_runtime": 1.7554, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 450 }, { "epoch": 0.0008392204648945687, "grad_norm": 4.092519283294678, "learning_rate": 4.9958206820848254e-05, "loss": 1.6441, "step": 500 }, { "epoch": 0.0008392204648945687, "eval_loss": 2.060279369354248, "eval_masked_accuracy": 0.6461538672447205, "eval_runtime": 1.8075, "eval_samples_per_second": 5.532, "eval_steps_per_second": 2.213, "step": 500 }, { "epoch": 0.0009231425113840256, "grad_norm": 5.34571647644043, "learning_rate": 4.995401071852378e-05, "loss": 1.7198, "step": 550 }, { "epoch": 0.0009231425113840256, "eval_loss": 1.6280667781829834, "eval_masked_accuracy": 0.6775510311126709, "eval_runtime": 1.7487, "eval_samples_per_second": 5.718, "eval_steps_per_second": 2.287, "step": 550 }, { "epoch": 0.0010070645578734824, "grad_norm": 4.286564350128174, "learning_rate": 4.994981461619931e-05, "loss": 1.6823, "step": 600 }, { "epoch": 0.0010070645578734824, "eval_loss": 1.5270774364471436, "eval_masked_accuracy": 0.6832579374313354, "eval_runtime": 1.7459, "eval_samples_per_second": 5.728, "eval_steps_per_second": 2.291, "step": 600 }, { "epoch": 0.0010909866043629394, "grad_norm": 3.7731900215148926, "learning_rate": 4.994561851387484e-05, "loss": 1.573, "step": 650 }, { "epoch": 0.0010909866043629394, "eval_loss": 1.522475242614746, "eval_masked_accuracy": 0.7423076629638672, "eval_runtime": 1.7483, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 650 }, { "epoch": 0.0011749086508523962, "grad_norm": 4.305816650390625, "learning_rate": 4.994142241155036e-05, "loss": 1.6905, "step": 700 }, { "epoch": 0.0011749086508523962, "eval_loss": 1.503122091293335, "eval_masked_accuracy": 0.67136150598526, "eval_runtime": 1.7423, "eval_samples_per_second": 5.74, "eval_steps_per_second": 2.296, "step": 700 }, { "epoch": 0.0012588306973418532, "grad_norm": 6.982117176055908, "learning_rate": 4.993722630922589e-05, "loss": 1.6444, "step": 750 }, { "epoch": 0.0012588306973418532, "eval_loss": 1.7397890090942383, "eval_masked_accuracy": 0.6946902871131897, "eval_runtime": 1.7436, "eval_samples_per_second": 5.735, "eval_steps_per_second": 2.294, "step": 750 }, { "epoch": 0.00134275274383131, "grad_norm": 6.332937717437744, "learning_rate": 4.993303020690142e-05, "loss": 1.7488, "step": 800 }, { "epoch": 0.00134275274383131, "eval_loss": 1.407382845878601, "eval_masked_accuracy": 0.7051281929016113, "eval_runtime": 1.7398, "eval_samples_per_second": 5.748, "eval_steps_per_second": 2.299, "step": 800 }, { "epoch": 0.001426674790320767, "grad_norm": 5.491461753845215, "learning_rate": 4.9928834104576946e-05, "loss": 1.5959, "step": 850 }, { "epoch": 0.001426674790320767, "eval_loss": 1.8142907619476318, "eval_masked_accuracy": 0.6625514626502991, "eval_runtime": 1.7407, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.298, "step": 850 }, { "epoch": 0.0015105968368102237, "grad_norm": 12.12775707244873, "learning_rate": 4.9924638002252474e-05, "loss": 1.6085, "step": 900 }, { "epoch": 0.0015105968368102237, "eval_loss": 1.9904667139053345, "eval_masked_accuracy": 0.6278026700019836, "eval_runtime": 1.7503, "eval_samples_per_second": 5.713, "eval_steps_per_second": 2.285, "step": 900 }, { "epoch": 0.0015945188832996806, "grad_norm": 18.452600479125977, "learning_rate": 4.9920441899928e-05, "loss": 1.5793, "step": 950 }, { "epoch": 0.0015945188832996806, "eval_loss": 1.797326683998108, "eval_masked_accuracy": 0.6784313917160034, "eval_runtime": 1.7403, "eval_samples_per_second": 5.746, "eval_steps_per_second": 2.299, "step": 950 }, { "epoch": 0.0016784409297891374, "grad_norm": 8.000075340270996, "learning_rate": 4.9916245797603524e-05, "loss": 1.5353, "step": 1000 }, { "epoch": 0.0016784409297891374, "eval_loss": 1.8558744192123413, "eval_masked_accuracy": 0.6530612111091614, "eval_runtime": 1.7574, "eval_samples_per_second": 5.69, "eval_steps_per_second": 2.276, "step": 1000 }, { "epoch": 0.0017623629762785944, "grad_norm": 3.907064199447632, "learning_rate": 4.991204969527905e-05, "loss": 1.5363, "step": 1050 }, { "epoch": 0.0017623629762785944, "eval_loss": 2.0765745639801025, "eval_masked_accuracy": 0.6553191542625427, "eval_runtime": 1.7983, "eval_samples_per_second": 5.561, "eval_steps_per_second": 2.224, "step": 1050 }, { "epoch": 0.0018462850227680511, "grad_norm": 4.185476303100586, "learning_rate": 4.990785359295458e-05, "loss": 1.6641, "step": 1100 }, { "epoch": 0.0018462850227680511, "eval_loss": 1.5849405527114868, "eval_masked_accuracy": 0.71074378490448, "eval_runtime": 1.7601, "eval_samples_per_second": 5.681, "eval_steps_per_second": 2.273, "step": 1100 }, { "epoch": 0.0019302070692575081, "grad_norm": 5.447309494018555, "learning_rate": 4.990365749063011e-05, "loss": 1.7069, "step": 1150 }, { "epoch": 0.0019302070692575081, "eval_loss": 1.6813358068466187, "eval_masked_accuracy": 0.7231404781341553, "eval_runtime": 1.7529, "eval_samples_per_second": 5.705, "eval_steps_per_second": 2.282, "step": 1150 }, { "epoch": 0.002014129115746965, "grad_norm": 5.904290199279785, "learning_rate": 4.989946138830564e-05, "loss": 1.6996, "step": 1200 }, { "epoch": 0.002014129115746965, "eval_loss": 1.6986854076385498, "eval_masked_accuracy": 0.6554622054100037, "eval_runtime": 1.7531, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.282, "step": 1200 }, { "epoch": 0.002098051162236422, "grad_norm": 5.6478986740112305, "learning_rate": 4.989526528598116e-05, "loss": 1.5291, "step": 1250 }, { "epoch": 0.002098051162236422, "eval_loss": 1.7059627771377563, "eval_masked_accuracy": 0.6680498123168945, "eval_runtime": 1.7416, "eval_samples_per_second": 5.742, "eval_steps_per_second": 2.297, "step": 1250 }, { "epoch": 0.002181973208725879, "grad_norm": 6.695890426635742, "learning_rate": 4.989106918365669e-05, "loss": 1.8386, "step": 1300 }, { "epoch": 0.002181973208725879, "eval_loss": 1.6500450372695923, "eval_masked_accuracy": 0.6693877577781677, "eval_runtime": 1.7414, "eval_samples_per_second": 5.743, "eval_steps_per_second": 2.297, "step": 1300 }, { "epoch": 0.0022658952552153354, "grad_norm": 4.831510066986084, "learning_rate": 4.9886873081332217e-05, "loss": 1.691, "step": 1350 }, { "epoch": 0.0022658952552153354, "eval_loss": 1.4610856771469116, "eval_masked_accuracy": 0.7090163826942444, "eval_runtime": 1.7413, "eval_samples_per_second": 5.743, "eval_steps_per_second": 2.297, "step": 1350 }, { "epoch": 0.0023498173017047924, "grad_norm": 4.90496826171875, "learning_rate": 4.9882676979007745e-05, "loss": 1.7116, "step": 1400 }, { "epoch": 0.0023498173017047924, "eval_loss": 1.6787996292114258, "eval_masked_accuracy": 0.6153846383094788, "eval_runtime": 1.7655, "eval_samples_per_second": 5.664, "eval_steps_per_second": 2.266, "step": 1400 }, { "epoch": 0.0024337393481942493, "grad_norm": 5.956592559814453, "learning_rate": 4.9878480876683273e-05, "loss": 1.5348, "step": 1450 }, { "epoch": 0.0024337393481942493, "eval_loss": 1.7995752096176147, "eval_masked_accuracy": 0.6759999990463257, "eval_runtime": 1.7486, "eval_samples_per_second": 5.719, "eval_steps_per_second": 2.288, "step": 1450 }, { "epoch": 0.0025176613946837063, "grad_norm": 5.731600761413574, "learning_rate": 4.9874284774358795e-05, "loss": 1.5617, "step": 1500 }, { "epoch": 0.0025176613946837063, "eval_loss": 2.028412342071533, "eval_masked_accuracy": 0.6007905006408691, "eval_runtime": 1.797, "eval_samples_per_second": 5.565, "eval_steps_per_second": 2.226, "step": 1500 }, { "epoch": 0.002601583441173163, "grad_norm": 9.261569023132324, "learning_rate": 4.9870088672034324e-05, "loss": 1.7109, "step": 1550 }, { "epoch": 0.002601583441173163, "eval_loss": 1.8843729496002197, "eval_masked_accuracy": 0.6594203114509583, "eval_runtime": 1.7575, "eval_samples_per_second": 5.69, "eval_steps_per_second": 2.276, "step": 1550 }, { "epoch": 0.00268550548766262, "grad_norm": 7.181281089782715, "learning_rate": 4.986589256970985e-05, "loss": 1.6529, "step": 1600 }, { "epoch": 0.00268550548766262, "eval_loss": 1.5639550685882568, "eval_masked_accuracy": 0.6905829310417175, "eval_runtime": 1.7429, "eval_samples_per_second": 5.738, "eval_steps_per_second": 2.295, "step": 1600 }, { "epoch": 0.002769427534152077, "grad_norm": 5.245086193084717, "learning_rate": 4.986169646738538e-05, "loss": 1.6497, "step": 1650 }, { "epoch": 0.002769427534152077, "eval_loss": 1.4776060581207275, "eval_masked_accuracy": 0.7312775254249573, "eval_runtime": 1.7496, "eval_samples_per_second": 5.715, "eval_steps_per_second": 2.286, "step": 1650 }, { "epoch": 0.002853349580641534, "grad_norm": 5.593554496765137, "learning_rate": 4.98575003650609e-05, "loss": 1.5326, "step": 1700 }, { "epoch": 0.002853349580641534, "eval_loss": 2.0159146785736084, "eval_masked_accuracy": 0.6205357313156128, "eval_runtime": 1.7381, "eval_samples_per_second": 5.753, "eval_steps_per_second": 2.301, "step": 1700 }, { "epoch": 0.0029372716271309908, "grad_norm": 7.406851768493652, "learning_rate": 4.985330426273643e-05, "loss": 1.5081, "step": 1750 }, { "epoch": 0.0029372716271309908, "eval_loss": 1.508250117301941, "eval_masked_accuracy": 0.7027027010917664, "eval_runtime": 1.7474, "eval_samples_per_second": 5.723, "eval_steps_per_second": 2.289, "step": 1750 }, { "epoch": 0.0030211936736204473, "grad_norm": 4.5133514404296875, "learning_rate": 4.984910816041196e-05, "loss": 1.6619, "step": 1800 }, { "epoch": 0.0030211936736204473, "eval_loss": 1.7022559642791748, "eval_masked_accuracy": 0.694779098033905, "eval_runtime": 1.748, "eval_samples_per_second": 5.721, "eval_steps_per_second": 2.288, "step": 1800 }, { "epoch": 0.0031051157201099043, "grad_norm": 7.173299312591553, "learning_rate": 4.984491205808749e-05, "loss": 1.7603, "step": 1850 }, { "epoch": 0.0031051157201099043, "eval_loss": 1.6458946466445923, "eval_masked_accuracy": 0.6636771559715271, "eval_runtime": 1.8607, "eval_samples_per_second": 5.374, "eval_steps_per_second": 2.15, "step": 1850 }, { "epoch": 0.0031890377665993613, "grad_norm": 4.3678975105285645, "learning_rate": 4.984071595576301e-05, "loss": 1.6453, "step": 1900 }, { "epoch": 0.0031890377665993613, "eval_loss": 1.8176072835922241, "eval_masked_accuracy": 0.6724137663841248, "eval_runtime": 1.7589, "eval_samples_per_second": 5.685, "eval_steps_per_second": 2.274, "step": 1900 }, { "epoch": 0.0032729598130888183, "grad_norm": 7.378585338592529, "learning_rate": 4.983651985343854e-05, "loss": 1.6409, "step": 1950 }, { "epoch": 0.0032729598130888183, "eval_loss": 2.0491786003112793, "eval_masked_accuracy": 0.6374502182006836, "eval_runtime": 1.757, "eval_samples_per_second": 5.692, "eval_steps_per_second": 2.277, "step": 1950 }, { "epoch": 0.003356881859578275, "grad_norm": 4.898635387420654, "learning_rate": 4.9832323751114066e-05, "loss": 1.6994, "step": 2000 }, { "epoch": 0.003356881859578275, "eval_loss": 1.4773211479187012, "eval_masked_accuracy": 0.6739130616188049, "eval_runtime": 1.7439, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.294, "step": 2000 }, { "epoch": 0.003440803906067732, "grad_norm": 7.465532302856445, "learning_rate": 4.9828127648789594e-05, "loss": 1.5798, "step": 2050 }, { "epoch": 0.003440803906067732, "eval_loss": 1.6743123531341553, "eval_masked_accuracy": 0.6770427823066711, "eval_runtime": 1.7546, "eval_samples_per_second": 5.699, "eval_steps_per_second": 2.28, "step": 2050 }, { "epoch": 0.0035247259525571888, "grad_norm": 7.025172233581543, "learning_rate": 4.982393154646512e-05, "loss": 1.7312, "step": 2100 }, { "epoch": 0.0035247259525571888, "eval_loss": 1.726737380027771, "eval_masked_accuracy": 0.6824034452438354, "eval_runtime": 1.7272, "eval_samples_per_second": 5.79, "eval_steps_per_second": 2.316, "step": 2100 }, { "epoch": 0.0036086479990466457, "grad_norm": 8.405756950378418, "learning_rate": 4.9819735444140644e-05, "loss": 1.7284, "step": 2150 }, { "epoch": 0.0036086479990466457, "eval_loss": 1.8043725490570068, "eval_masked_accuracy": 0.6153846383094788, "eval_runtime": 1.7569, "eval_samples_per_second": 5.692, "eval_steps_per_second": 2.277, "step": 2150 }, { "epoch": 0.0036925700455361023, "grad_norm": 6.279454231262207, "learning_rate": 4.981553934181617e-05, "loss": 1.4629, "step": 2200 }, { "epoch": 0.0036925700455361023, "eval_loss": 1.8529506921768188, "eval_masked_accuracy": 0.6823529601097107, "eval_runtime": 1.7798, "eval_samples_per_second": 5.619, "eval_steps_per_second": 2.247, "step": 2200 }, { "epoch": 0.0037764920920255593, "grad_norm": 7.525041580200195, "learning_rate": 4.98113432394917e-05, "loss": 1.5309, "step": 2250 }, { "epoch": 0.0037764920920255593, "eval_loss": 1.8144168853759766, "eval_masked_accuracy": 0.7272727489471436, "eval_runtime": 1.7418, "eval_samples_per_second": 5.741, "eval_steps_per_second": 2.297, "step": 2250 }, { "epoch": 0.0038604141385150162, "grad_norm": 6.561546802520752, "learning_rate": 4.980714713716723e-05, "loss": 1.6761, "step": 2300 }, { "epoch": 0.0038604141385150162, "eval_loss": 1.8419244289398193, "eval_masked_accuracy": 0.6638298034667969, "eval_runtime": 1.7921, "eval_samples_per_second": 5.58, "eval_steps_per_second": 2.232, "step": 2300 }, { "epoch": 0.003944336185004473, "grad_norm": 4.7332987785339355, "learning_rate": 4.980295103484276e-05, "loss": 1.6738, "step": 2350 }, { "epoch": 0.003944336185004473, "eval_loss": 1.576249122619629, "eval_masked_accuracy": 0.7078651785850525, "eval_runtime": 1.7414, "eval_samples_per_second": 5.742, "eval_steps_per_second": 2.297, "step": 2350 }, { "epoch": 0.00402825823149393, "grad_norm": 3.7719192504882812, "learning_rate": 4.979875493251828e-05, "loss": 1.6432, "step": 2400 }, { "epoch": 0.00402825823149393, "eval_loss": 1.811785340309143, "eval_masked_accuracy": 0.6746031641960144, "eval_runtime": 1.7463, "eval_samples_per_second": 5.726, "eval_steps_per_second": 2.291, "step": 2400 }, { "epoch": 0.004112180277983387, "grad_norm": 6.218490123748779, "learning_rate": 4.979455883019381e-05, "loss": 1.5416, "step": 2450 }, { "epoch": 0.004112180277983387, "eval_loss": 1.6883758306503296, "eval_masked_accuracy": 0.6900826692581177, "eval_runtime": 1.7374, "eval_samples_per_second": 5.756, "eval_steps_per_second": 2.302, "step": 2450 }, { "epoch": 0.004196102324472844, "grad_norm": 5.042550086975098, "learning_rate": 4.9790362727869336e-05, "loss": 1.6701, "step": 2500 }, { "epoch": 0.004196102324472844, "eval_loss": 1.567375898361206, "eval_masked_accuracy": 0.6788617968559265, "eval_runtime": 1.735, "eval_samples_per_second": 5.764, "eval_steps_per_second": 2.306, "step": 2500 }, { "epoch": 0.0042800243709623, "grad_norm": 6.665520668029785, "learning_rate": 4.9786166625544865e-05, "loss": 1.6006, "step": 2550 }, { "epoch": 0.0042800243709623, "eval_loss": 1.659168004989624, "eval_masked_accuracy": 0.6385542154312134, "eval_runtime": 1.7434, "eval_samples_per_second": 5.736, "eval_steps_per_second": 2.294, "step": 2550 }, { "epoch": 0.004363946417451758, "grad_norm": 4.378693580627441, "learning_rate": 4.978197052322039e-05, "loss": 1.6363, "step": 2600 }, { "epoch": 0.004363946417451758, "eval_loss": 1.6367610692977905, "eval_masked_accuracy": 0.6679389476776123, "eval_runtime": 1.7407, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.298, "step": 2600 }, { "epoch": 0.004447868463941214, "grad_norm": 8.087454795837402, "learning_rate": 4.977777442089592e-05, "loss": 1.5518, "step": 2650 }, { "epoch": 0.004447868463941214, "eval_loss": 2.035369873046875, "eval_masked_accuracy": 0.64462810754776, "eval_runtime": 1.7443, "eval_samples_per_second": 5.733, "eval_steps_per_second": 2.293, "step": 2650 }, { "epoch": 0.004531790510430671, "grad_norm": 6.383141040802002, "learning_rate": 4.977357831857144e-05, "loss": 1.7726, "step": 2700 }, { "epoch": 0.004531790510430671, "eval_loss": 1.9139858484268188, "eval_masked_accuracy": 0.7137096524238586, "eval_runtime": 1.7376, "eval_samples_per_second": 5.755, "eval_steps_per_second": 2.302, "step": 2700 }, { "epoch": 0.004615712556920128, "grad_norm": 8.098458290100098, "learning_rate": 4.976938221624697e-05, "loss": 1.701, "step": 2750 }, { "epoch": 0.004615712556920128, "eval_loss": 1.8784687519073486, "eval_masked_accuracy": 0.6525096297264099, "eval_runtime": 1.8538, "eval_samples_per_second": 5.394, "eval_steps_per_second": 2.158, "step": 2750 }, { "epoch": 0.004699634603409585, "grad_norm": 5.3736138343811035, "learning_rate": 4.97651861139225e-05, "loss": 1.5577, "step": 2800 }, { "epoch": 0.004699634603409585, "eval_loss": 1.6022107601165771, "eval_masked_accuracy": 0.6907630562782288, "eval_runtime": 1.7368, "eval_samples_per_second": 5.758, "eval_steps_per_second": 2.303, "step": 2800 }, { "epoch": 0.004783556649899042, "grad_norm": 4.617998123168945, "learning_rate": 4.976099001159803e-05, "loss": 1.6194, "step": 2850 }, { "epoch": 0.004783556649899042, "eval_loss": 1.398147702217102, "eval_masked_accuracy": 0.6696035265922546, "eval_runtime": 1.7507, "eval_samples_per_second": 5.712, "eval_steps_per_second": 2.285, "step": 2850 }, { "epoch": 0.004867478696388499, "grad_norm": 4.976247787475586, "learning_rate": 4.975679390927356e-05, "loss": 1.6325, "step": 2900 }, { "epoch": 0.004867478696388499, "eval_loss": 1.7178815603256226, "eval_masked_accuracy": 0.6653696298599243, "eval_runtime": 1.7533, "eval_samples_per_second": 5.703, "eval_steps_per_second": 2.281, "step": 2900 }, { "epoch": 0.004951400742877955, "grad_norm": 5.229081153869629, "learning_rate": 4.9752597806949085e-05, "loss": 1.7057, "step": 2950 }, { "epoch": 0.004951400742877955, "eval_loss": 1.8161494731903076, "eval_masked_accuracy": 0.6431535482406616, "eval_runtime": 1.7382, "eval_samples_per_second": 5.753, "eval_steps_per_second": 2.301, "step": 2950 }, { "epoch": 0.005035322789367413, "grad_norm": 6.112144947052002, "learning_rate": 4.974840170462461e-05, "loss": 1.6189, "step": 3000 }, { "epoch": 0.005035322789367413, "eval_loss": 1.8454160690307617, "eval_masked_accuracy": 0.6767241358757019, "eval_runtime": 1.7376, "eval_samples_per_second": 5.755, "eval_steps_per_second": 2.302, "step": 3000 }, { "epoch": 0.005119244835856869, "grad_norm": 5.7698445320129395, "learning_rate": 4.9744205602300135e-05, "loss": 1.6734, "step": 3050 }, { "epoch": 0.005119244835856869, "eval_loss": 1.6155188083648682, "eval_masked_accuracy": 0.6991525292396545, "eval_runtime": 1.7469, "eval_samples_per_second": 5.724, "eval_steps_per_second": 2.29, "step": 3050 }, { "epoch": 0.005203166882346326, "grad_norm": 11.4446382522583, "learning_rate": 4.9740009499975664e-05, "loss": 1.602, "step": 3100 }, { "epoch": 0.005203166882346326, "eval_loss": 1.7193024158477783, "eval_masked_accuracy": 0.6454545259475708, "eval_runtime": 1.8085, "eval_samples_per_second": 5.529, "eval_steps_per_second": 2.212, "step": 3100 }, { "epoch": 0.005287088928835783, "grad_norm": 4.331955432891846, "learning_rate": 4.973581339765119e-05, "loss": 1.5886, "step": 3150 }, { "epoch": 0.005287088928835783, "eval_loss": 1.7239084243774414, "eval_masked_accuracy": 0.7025862336158752, "eval_runtime": 1.7507, "eval_samples_per_second": 5.712, "eval_steps_per_second": 2.285, "step": 3150 }, { "epoch": 0.00537101097532524, "grad_norm": 6.857669830322266, "learning_rate": 4.973161729532672e-05, "loss": 1.6531, "step": 3200 }, { "epoch": 0.00537101097532524, "eval_loss": 1.7898776531219482, "eval_masked_accuracy": 0.6463878154754639, "eval_runtime": 1.807, "eval_samples_per_second": 5.534, "eval_steps_per_second": 2.214, "step": 3200 }, { "epoch": 0.005454933021814697, "grad_norm": 6.366724491119385, "learning_rate": 4.972742119300224e-05, "loss": 1.5112, "step": 3250 }, { "epoch": 0.005454933021814697, "eval_loss": 1.68304443359375, "eval_masked_accuracy": 0.6958174705505371, "eval_runtime": 1.7544, "eval_samples_per_second": 5.7, "eval_steps_per_second": 2.28, "step": 3250 }, { "epoch": 0.005538855068304154, "grad_norm": 5.657731056213379, "learning_rate": 4.972322509067777e-05, "loss": 1.5622, "step": 3300 }, { "epoch": 0.005538855068304154, "eval_loss": 1.7854249477386475, "eval_masked_accuracy": 0.6833333373069763, "eval_runtime": 1.7977, "eval_samples_per_second": 5.563, "eval_steps_per_second": 2.225, "step": 3300 }, { "epoch": 0.00562277711479361, "grad_norm": 4.501428127288818, "learning_rate": 4.97190289883533e-05, "loss": 1.5736, "step": 3350 }, { "epoch": 0.00562277711479361, "eval_loss": 1.4276224374771118, "eval_masked_accuracy": 0.7192118167877197, "eval_runtime": 1.7643, "eval_samples_per_second": 5.668, "eval_steps_per_second": 2.267, "step": 3350 }, { "epoch": 0.005706699161283068, "grad_norm": 6.436139106750488, "learning_rate": 4.971483288602883e-05, "loss": 1.5653, "step": 3400 }, { "epoch": 0.005706699161283068, "eval_loss": 1.674355149269104, "eval_masked_accuracy": 0.718367338180542, "eval_runtime": 1.7482, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 3400 }, { "epoch": 0.005790621207772524, "grad_norm": 6.295548439025879, "learning_rate": 4.9710636783704356e-05, "loss": 1.5556, "step": 3450 }, { "epoch": 0.005790621207772524, "eval_loss": 1.7501426935195923, "eval_masked_accuracy": 0.7076271176338196, "eval_runtime": 1.7554, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 3450 }, { "epoch": 0.0058745432542619816, "grad_norm": 5.733904838562012, "learning_rate": 4.9706440681379884e-05, "loss": 1.5164, "step": 3500 }, { "epoch": 0.0058745432542619816, "eval_loss": 1.520179033279419, "eval_masked_accuracy": 0.7203390002250671, "eval_runtime": 1.7629, "eval_samples_per_second": 5.672, "eval_steps_per_second": 2.269, "step": 3500 }, { "epoch": 0.005958465300751438, "grad_norm": 5.285616397857666, "learning_rate": 4.9702244579055406e-05, "loss": 1.6254, "step": 3550 }, { "epoch": 0.005958465300751438, "eval_loss": 1.7321217060089111, "eval_masked_accuracy": 0.6712962985038757, "eval_runtime": 1.7429, "eval_samples_per_second": 5.738, "eval_steps_per_second": 2.295, "step": 3550 }, { "epoch": 0.006042387347240895, "grad_norm": 5.386379241943359, "learning_rate": 4.9698048476730934e-05, "loss": 1.505, "step": 3600 }, { "epoch": 0.006042387347240895, "eval_loss": 1.7810560464859009, "eval_masked_accuracy": 0.654618501663208, "eval_runtime": 1.7454, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.292, "step": 3600 }, { "epoch": 0.006126309393730352, "grad_norm": 6.726806640625, "learning_rate": 4.969385237440646e-05, "loss": 1.5011, "step": 3650 }, { "epoch": 0.006126309393730352, "eval_loss": 1.5794349908828735, "eval_masked_accuracy": 0.6979591846466064, "eval_runtime": 1.7721, "eval_samples_per_second": 5.643, "eval_steps_per_second": 2.257, "step": 3650 }, { "epoch": 0.006210231440219809, "grad_norm": 7.159238815307617, "learning_rate": 4.968965627208199e-05, "loss": 1.6134, "step": 3700 }, { "epoch": 0.006210231440219809, "eval_loss": 1.4294860363006592, "eval_masked_accuracy": 0.7136752009391785, "eval_runtime": 1.752, "eval_samples_per_second": 5.708, "eval_steps_per_second": 2.283, "step": 3700 }, { "epoch": 0.006294153486709265, "grad_norm": 5.560455799102783, "learning_rate": 4.968546016975752e-05, "loss": 1.5097, "step": 3750 }, { "epoch": 0.006294153486709265, "eval_loss": 1.9169464111328125, "eval_masked_accuracy": 0.6929824352264404, "eval_runtime": 1.7457, "eval_samples_per_second": 5.728, "eval_steps_per_second": 2.291, "step": 3750 }, { "epoch": 0.0063780755331987226, "grad_norm": 5.439815998077393, "learning_rate": 4.968126406743305e-05, "loss": 1.6706, "step": 3800 }, { "epoch": 0.0063780755331987226, "eval_loss": 1.622685194015503, "eval_masked_accuracy": 0.6913580298423767, "eval_runtime": 1.7518, "eval_samples_per_second": 5.709, "eval_steps_per_second": 2.283, "step": 3800 }, { "epoch": 0.006461997579688179, "grad_norm": 4.242193698883057, "learning_rate": 4.967706796510857e-05, "loss": 1.5511, "step": 3850 }, { "epoch": 0.006461997579688179, "eval_loss": 1.3621394634246826, "eval_masked_accuracy": 0.7379912734031677, "eval_runtime": 1.7356, "eval_samples_per_second": 5.762, "eval_steps_per_second": 2.305, "step": 3850 }, { "epoch": 0.0065459196261776365, "grad_norm": 5.056567668914795, "learning_rate": 4.96728718627841e-05, "loss": 1.6108, "step": 3900 }, { "epoch": 0.0065459196261776365, "eval_loss": 1.5381476879119873, "eval_masked_accuracy": 0.7165354490280151, "eval_runtime": 1.7342, "eval_samples_per_second": 5.767, "eval_steps_per_second": 2.307, "step": 3900 }, { "epoch": 0.006629841672667093, "grad_norm": 5.566115856170654, "learning_rate": 4.966867576045963e-05, "loss": 1.5858, "step": 3950 }, { "epoch": 0.006629841672667093, "eval_loss": 1.9895532131195068, "eval_masked_accuracy": 0.6399999856948853, "eval_runtime": 1.7417, "eval_samples_per_second": 5.742, "eval_steps_per_second": 2.297, "step": 3950 }, { "epoch": 0.00671376371915655, "grad_norm": 7.843978404998779, "learning_rate": 4.9664479658135155e-05, "loss": 1.5999, "step": 4000 }, { "epoch": 0.00671376371915655, "eval_loss": 1.589036464691162, "eval_masked_accuracy": 0.6991525292396545, "eval_runtime": 1.7452, "eval_samples_per_second": 5.73, "eval_steps_per_second": 2.292, "step": 4000 }, { "epoch": 0.006797685765646007, "grad_norm": 6.600104331970215, "learning_rate": 4.9660283555810683e-05, "loss": 1.6444, "step": 4050 }, { "epoch": 0.006797685765646007, "eval_loss": 1.590256929397583, "eval_masked_accuracy": 0.6895161271095276, "eval_runtime": 1.7389, "eval_samples_per_second": 5.751, "eval_steps_per_second": 2.3, "step": 4050 }, { "epoch": 0.006881607812135464, "grad_norm": 6.0659589767456055, "learning_rate": 4.965608745348621e-05, "loss": 1.5554, "step": 4100 }, { "epoch": 0.006881607812135464, "eval_loss": 1.8275972604751587, "eval_masked_accuracy": 0.6558139324188232, "eval_runtime": 1.7513, "eval_samples_per_second": 5.71, "eval_steps_per_second": 2.284, "step": 4100 }, { "epoch": 0.00696552985862492, "grad_norm": 6.09676456451416, "learning_rate": 4.9651891351161734e-05, "loss": 1.7191, "step": 4150 }, { "epoch": 0.00696552985862492, "eval_loss": 1.8767850399017334, "eval_masked_accuracy": 0.6508620977401733, "eval_runtime": 1.7474, "eval_samples_per_second": 5.723, "eval_steps_per_second": 2.289, "step": 4150 }, { "epoch": 0.0070494519051143775, "grad_norm": 5.208311080932617, "learning_rate": 4.964769524883726e-05, "loss": 1.585, "step": 4200 }, { "epoch": 0.0070494519051143775, "eval_loss": 1.3652145862579346, "eval_masked_accuracy": 0.7037037014961243, "eval_runtime": 1.7463, "eval_samples_per_second": 5.726, "eval_steps_per_second": 2.291, "step": 4200 }, { "epoch": 0.007133373951603834, "grad_norm": 8.517348289489746, "learning_rate": 4.964349914651279e-05, "loss": 1.6888, "step": 4250 }, { "epoch": 0.007133373951603834, "eval_loss": 1.347320318222046, "eval_masked_accuracy": 0.7190082669258118, "eval_runtime": 1.7446, "eval_samples_per_second": 5.732, "eval_steps_per_second": 2.293, "step": 4250 }, { "epoch": 0.0072172959980932915, "grad_norm": 5.57391357421875, "learning_rate": 4.963930304418832e-05, "loss": 1.6351, "step": 4300 }, { "epoch": 0.0072172959980932915, "eval_loss": 1.563398003578186, "eval_masked_accuracy": 0.6952789425849915, "eval_runtime": 1.7535, "eval_samples_per_second": 5.703, "eval_steps_per_second": 2.281, "step": 4300 }, { "epoch": 0.007301218044582748, "grad_norm": 4.073302745819092, "learning_rate": 4.963510694186385e-05, "loss": 1.7031, "step": 4350 }, { "epoch": 0.007301218044582748, "eval_loss": 1.7390921115875244, "eval_masked_accuracy": 0.6963562965393066, "eval_runtime": 1.8598, "eval_samples_per_second": 5.377, "eval_steps_per_second": 2.151, "step": 4350 }, { "epoch": 0.007385140091072205, "grad_norm": 4.129016876220703, "learning_rate": 4.963091083953937e-05, "loss": 1.5611, "step": 4400 }, { "epoch": 0.007385140091072205, "eval_loss": 1.7892725467681885, "eval_masked_accuracy": 0.7015503644943237, "eval_runtime": 1.7481, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 4400 }, { "epoch": 0.007469062137561662, "grad_norm": 8.45355224609375, "learning_rate": 4.96267147372149e-05, "loss": 1.679, "step": 4450 }, { "epoch": 0.007469062137561662, "eval_loss": 1.8994945287704468, "eval_masked_accuracy": 0.6711111068725586, "eval_runtime": 1.748, "eval_samples_per_second": 5.721, "eval_steps_per_second": 2.288, "step": 4450 }, { "epoch": 0.0075529841840511185, "grad_norm": 7.353001594543457, "learning_rate": 4.9622602556936916e-05, "loss": 1.5084, "step": 4500 }, { "epoch": 0.0075529841840511185, "eval_loss": 1.6633514165878296, "eval_masked_accuracy": 0.6792452931404114, "eval_runtime": 1.7365, "eval_samples_per_second": 5.759, "eval_steps_per_second": 2.303, "step": 4500 }, { "epoch": 0.007636906230540575, "grad_norm": 5.420140266418457, "learning_rate": 4.9618406454612445e-05, "loss": 1.6768, "step": 4550 }, { "epoch": 0.007636906230540575, "eval_loss": 1.6823314428329468, "eval_masked_accuracy": 0.700421929359436, "eval_runtime": 1.7456, "eval_samples_per_second": 5.729, "eval_steps_per_second": 2.291, "step": 4550 }, { "epoch": 0.0077208282770300325, "grad_norm": 5.6282572746276855, "learning_rate": 4.961421035228797e-05, "loss": 1.5346, "step": 4600 }, { "epoch": 0.0077208282770300325, "eval_loss": 2.210347890853882, "eval_masked_accuracy": 0.6339285969734192, "eval_runtime": 1.7553, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 4600 }, { "epoch": 0.007804750323519489, "grad_norm": 7.358382701873779, "learning_rate": 4.96100142499635e-05, "loss": 1.6792, "step": 4650 }, { "epoch": 0.007804750323519489, "eval_loss": 1.742630958557129, "eval_masked_accuracy": 0.6728110313415527, "eval_runtime": 1.7331, "eval_samples_per_second": 5.77, "eval_steps_per_second": 2.308, "step": 4650 }, { "epoch": 0.007888672370008946, "grad_norm": 5.980144500732422, "learning_rate": 4.960581814763902e-05, "loss": 1.4871, "step": 4700 }, { "epoch": 0.007888672370008946, "eval_loss": 1.4571318626403809, "eval_masked_accuracy": 0.7166666388511658, "eval_runtime": 1.7531, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.282, "step": 4700 }, { "epoch": 0.007972594416498403, "grad_norm": 8.18883228302002, "learning_rate": 4.960162204531455e-05, "loss": 1.527, "step": 4750 }, { "epoch": 0.007972594416498403, "eval_loss": 2.062413454055786, "eval_masked_accuracy": 0.6695278882980347, "eval_runtime": 1.748, "eval_samples_per_second": 5.721, "eval_steps_per_second": 2.288, "step": 4750 }, { "epoch": 0.00805651646298786, "grad_norm": 4.835183143615723, "learning_rate": 4.959742594299008e-05, "loss": 1.591, "step": 4800 }, { "epoch": 0.00805651646298786, "eval_loss": 1.690118432044983, "eval_masked_accuracy": 0.7049180269241333, "eval_runtime": 1.7383, "eval_samples_per_second": 5.753, "eval_steps_per_second": 2.301, "step": 4800 }, { "epoch": 0.008140438509477316, "grad_norm": 5.039312362670898, "learning_rate": 4.959322984066561e-05, "loss": 1.5386, "step": 4850 }, { "epoch": 0.008140438509477316, "eval_loss": 1.9135382175445557, "eval_masked_accuracy": 0.6181818246841431, "eval_runtime": 1.7445, "eval_samples_per_second": 5.732, "eval_steps_per_second": 2.293, "step": 4850 }, { "epoch": 0.008224360555966774, "grad_norm": 6.3293890953063965, "learning_rate": 4.958903373834114e-05, "loss": 1.4752, "step": 4900 }, { "epoch": 0.008224360555966774, "eval_loss": 1.6353566646575928, "eval_masked_accuracy": 0.7319999933242798, "eval_runtime": 1.8458, "eval_samples_per_second": 5.418, "eval_steps_per_second": 2.167, "step": 4900 }, { "epoch": 0.008308282602456231, "grad_norm": 7.455787658691406, "learning_rate": 4.958483763601666e-05, "loss": 1.5304, "step": 4950 }, { "epoch": 0.008308282602456231, "eval_loss": 1.8691352605819702, "eval_masked_accuracy": 0.6653386354446411, "eval_runtime": 1.7533, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.281, "step": 4950 }, { "epoch": 0.008392204648945687, "grad_norm": 5.682205677032471, "learning_rate": 4.958064153369219e-05, "loss": 1.5945, "step": 5000 }, { "epoch": 0.008392204648945687, "eval_loss": 1.6161428689956665, "eval_masked_accuracy": 0.6964285969734192, "eval_runtime": 1.7814, "eval_samples_per_second": 5.614, "eval_steps_per_second": 2.245, "step": 5000 }, { "epoch": 0.008476126695435144, "grad_norm": 6.474329471588135, "learning_rate": 4.9576445431367715e-05, "loss": 1.8228, "step": 5050 }, { "epoch": 0.008476126695435144, "eval_loss": 1.4911173582077026, "eval_masked_accuracy": 0.71875, "eval_runtime": 1.8052, "eval_samples_per_second": 5.54, "eval_steps_per_second": 2.216, "step": 5050 }, { "epoch": 0.0085600487419246, "grad_norm": 4.493051052093506, "learning_rate": 4.9572249329043244e-05, "loss": 1.5526, "step": 5100 }, { "epoch": 0.0085600487419246, "eval_loss": 1.4060901403427124, "eval_masked_accuracy": 0.7131474018096924, "eval_runtime": 1.8193, "eval_samples_per_second": 5.497, "eval_steps_per_second": 2.199, "step": 5100 }, { "epoch": 0.008643970788414057, "grad_norm": 5.657381057739258, "learning_rate": 4.956805322671877e-05, "loss": 1.5743, "step": 5150 }, { "epoch": 0.008643970788414057, "eval_loss": 1.7347627878189087, "eval_masked_accuracy": 0.6392694115638733, "eval_runtime": 1.7632, "eval_samples_per_second": 5.671, "eval_steps_per_second": 2.269, "step": 5150 }, { "epoch": 0.008727892834903515, "grad_norm": 5.059664726257324, "learning_rate": 4.9563941046440784e-05, "loss": 1.5923, "step": 5200 }, { "epoch": 0.008727892834903515, "eval_loss": 1.7108001708984375, "eval_masked_accuracy": 0.6759999990463257, "eval_runtime": 1.7312, "eval_samples_per_second": 5.776, "eval_steps_per_second": 2.311, "step": 5200 }, { "epoch": 0.008811814881392972, "grad_norm": 6.256536483764648, "learning_rate": 4.955974494411631e-05, "loss": 1.5454, "step": 5250 }, { "epoch": 0.008811814881392972, "eval_loss": 1.8423763513565063, "eval_masked_accuracy": 0.6590909361839294, "eval_runtime": 1.7323, "eval_samples_per_second": 5.773, "eval_steps_per_second": 2.309, "step": 5250 }, { "epoch": 0.008895736927882428, "grad_norm": 6.45760440826416, "learning_rate": 4.955554884179184e-05, "loss": 1.5381, "step": 5300 }, { "epoch": 0.008895736927882428, "eval_loss": 1.8820030689239502, "eval_masked_accuracy": 0.6486486196517944, "eval_runtime": 1.7529, "eval_samples_per_second": 5.705, "eval_steps_per_second": 2.282, "step": 5300 }, { "epoch": 0.008979658974371885, "grad_norm": 7.668667793273926, "learning_rate": 4.955135273946737e-05, "loss": 1.6363, "step": 5350 }, { "epoch": 0.008979658974371885, "eval_loss": 1.631400465965271, "eval_masked_accuracy": 0.7160493731498718, "eval_runtime": 1.7511, "eval_samples_per_second": 5.711, "eval_steps_per_second": 2.284, "step": 5350 }, { "epoch": 0.009063581020861342, "grad_norm": 7.2050018310546875, "learning_rate": 4.954715663714289e-05, "loss": 1.5738, "step": 5400 }, { "epoch": 0.009063581020861342, "eval_loss": 1.5917881727218628, "eval_masked_accuracy": 0.7405857443809509, "eval_runtime": 1.75, "eval_samples_per_second": 5.714, "eval_steps_per_second": 2.286, "step": 5400 }, { "epoch": 0.0091475030673508, "grad_norm": 6.094969749450684, "learning_rate": 4.954296053481842e-05, "loss": 1.7321, "step": 5450 }, { "epoch": 0.0091475030673508, "eval_loss": 1.5327577590942383, "eval_masked_accuracy": 0.707317054271698, "eval_runtime": 1.7423, "eval_samples_per_second": 5.74, "eval_steps_per_second": 2.296, "step": 5450 }, { "epoch": 0.009231425113840256, "grad_norm": 8.869881629943848, "learning_rate": 4.953876443249395e-05, "loss": 1.5768, "step": 5500 }, { "epoch": 0.009231425113840256, "eval_loss": 1.3501726388931274, "eval_masked_accuracy": 0.7801724076271057, "eval_runtime": 1.7732, "eval_samples_per_second": 5.64, "eval_steps_per_second": 2.256, "step": 5500 }, { "epoch": 0.009315347160329713, "grad_norm": 4.408574104309082, "learning_rate": 4.9534568330169476e-05, "loss": 1.5802, "step": 5550 }, { "epoch": 0.009315347160329713, "eval_loss": 1.7055152654647827, "eval_masked_accuracy": 0.6707317233085632, "eval_runtime": 1.7418, "eval_samples_per_second": 5.741, "eval_steps_per_second": 2.296, "step": 5550 }, { "epoch": 0.00939926920681917, "grad_norm": 5.3869147300720215, "learning_rate": 4.9530372227845e-05, "loss": 1.5547, "step": 5600 }, { "epoch": 0.00939926920681917, "eval_loss": 1.3663699626922607, "eval_masked_accuracy": 0.6974790096282959, "eval_runtime": 1.7338, "eval_samples_per_second": 5.768, "eval_steps_per_second": 2.307, "step": 5600 }, { "epoch": 0.009483191253308626, "grad_norm": 4.417982578277588, "learning_rate": 4.9526176125520526e-05, "loss": 1.5658, "step": 5650 }, { "epoch": 0.009483191253308626, "eval_loss": 1.6572059392929077, "eval_masked_accuracy": 0.6520000100135803, "eval_runtime": 1.7583, "eval_samples_per_second": 5.687, "eval_steps_per_second": 2.275, "step": 5650 }, { "epoch": 0.009567113299798084, "grad_norm": 5.2137861251831055, "learning_rate": 4.9521980023196055e-05, "loss": 1.5929, "step": 5700 }, { "epoch": 0.009567113299798084, "eval_loss": 1.4574190378189087, "eval_masked_accuracy": 0.6694560647010803, "eval_runtime": 1.7352, "eval_samples_per_second": 5.763, "eval_steps_per_second": 2.305, "step": 5700 }, { "epoch": 0.00965103534628754, "grad_norm": 6.848864555358887, "learning_rate": 4.951778392087158e-05, "loss": 1.6008, "step": 5750 }, { "epoch": 0.00965103534628754, "eval_loss": 2.133417844772339, "eval_masked_accuracy": 0.6540084481239319, "eval_runtime": 1.8568, "eval_samples_per_second": 5.386, "eval_steps_per_second": 2.154, "step": 5750 }, { "epoch": 0.009734957392776997, "grad_norm": 3.9827840328216553, "learning_rate": 4.9513587818547105e-05, "loss": 1.5811, "step": 5800 }, { "epoch": 0.009734957392776997, "eval_loss": 1.403198003768921, "eval_masked_accuracy": 0.7085201740264893, "eval_runtime": 1.749, "eval_samples_per_second": 5.717, "eval_steps_per_second": 2.287, "step": 5800 }, { "epoch": 0.009818879439266454, "grad_norm": 4.541887283325195, "learning_rate": 4.950939171622263e-05, "loss": 1.558, "step": 5850 }, { "epoch": 0.009818879439266454, "eval_loss": 1.4281632900238037, "eval_masked_accuracy": 0.7195122241973877, "eval_runtime": 1.7523, "eval_samples_per_second": 5.707, "eval_steps_per_second": 2.283, "step": 5850 }, { "epoch": 0.00990280148575591, "grad_norm": 8.121429443359375, "learning_rate": 4.950519561389816e-05, "loss": 1.5583, "step": 5900 }, { "epoch": 0.00990280148575591, "eval_loss": 1.608547568321228, "eval_masked_accuracy": 0.6582278609275818, "eval_runtime": 1.7405, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.298, "step": 5900 }, { "epoch": 0.009986723532245369, "grad_norm": 4.750977039337158, "learning_rate": 4.950099951157369e-05, "loss": 1.5378, "step": 5950 }, { "epoch": 0.009986723532245369, "eval_loss": 1.3912121057510376, "eval_masked_accuracy": 0.701298713684082, "eval_runtime": 1.7623, "eval_samples_per_second": 5.674, "eval_steps_per_second": 2.27, "step": 5950 }, { "epoch": 0.010070645578734825, "grad_norm": 4.445640563964844, "learning_rate": 4.949680340924922e-05, "loss": 1.5063, "step": 6000 }, { "epoch": 0.010070645578734825, "eval_loss": 1.6513465642929077, "eval_masked_accuracy": 0.6796537041664124, "eval_runtime": 1.7424, "eval_samples_per_second": 5.739, "eval_steps_per_second": 2.296, "step": 6000 }, { "epoch": 0.010154567625224282, "grad_norm": 13.394184112548828, "learning_rate": 4.949260730692475e-05, "loss": 1.5155, "step": 6050 }, { "epoch": 0.010154567625224282, "eval_loss": 1.5842430591583252, "eval_masked_accuracy": 0.6853448152542114, "eval_runtime": 1.7416, "eval_samples_per_second": 5.742, "eval_steps_per_second": 2.297, "step": 6050 }, { "epoch": 0.010238489671713738, "grad_norm": 7.441386699676514, "learning_rate": 4.948841120460027e-05, "loss": 1.5009, "step": 6100 }, { "epoch": 0.010238489671713738, "eval_loss": 1.512109637260437, "eval_masked_accuracy": 0.6987447738647461, "eval_runtime": 1.7546, "eval_samples_per_second": 5.699, "eval_steps_per_second": 2.28, "step": 6100 }, { "epoch": 0.010322411718203195, "grad_norm": 6.1988749504089355, "learning_rate": 4.94842151022758e-05, "loss": 1.5567, "step": 6150 }, { "epoch": 0.010322411718203195, "eval_loss": 1.5210555791854858, "eval_masked_accuracy": 0.7109375, "eval_runtime": 1.7524, "eval_samples_per_second": 5.707, "eval_steps_per_second": 2.283, "step": 6150 }, { "epoch": 0.010406333764692651, "grad_norm": 4.782381057739258, "learning_rate": 4.9480018999951325e-05, "loss": 1.6125, "step": 6200 }, { "epoch": 0.010406333764692651, "eval_loss": 1.6434142589569092, "eval_masked_accuracy": 0.6638655662536621, "eval_runtime": 1.7489, "eval_samples_per_second": 5.718, "eval_steps_per_second": 2.287, "step": 6200 }, { "epoch": 0.01049025581118211, "grad_norm": 5.14832878112793, "learning_rate": 4.9475822897626854e-05, "loss": 1.6089, "step": 6250 }, { "epoch": 0.01049025581118211, "eval_loss": 1.239379644393921, "eval_masked_accuracy": 0.7427386045455933, "eval_runtime": 1.7532, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.282, "step": 6250 }, { "epoch": 0.010574177857671566, "grad_norm": 5.390649795532227, "learning_rate": 4.947162679530238e-05, "loss": 1.6357, "step": 6300 }, { "epoch": 0.010574177857671566, "eval_loss": 1.5129663944244385, "eval_masked_accuracy": 0.692307710647583, "eval_runtime": 1.7523, "eval_samples_per_second": 5.707, "eval_steps_per_second": 2.283, "step": 6300 }, { "epoch": 0.010658099904161023, "grad_norm": 4.3327412605285645, "learning_rate": 4.9467430692977904e-05, "loss": 1.5318, "step": 6350 }, { "epoch": 0.010658099904161023, "eval_loss": 1.7716737985610962, "eval_masked_accuracy": 0.6942148804664612, "eval_runtime": 1.7284, "eval_samples_per_second": 5.786, "eval_steps_per_second": 2.314, "step": 6350 }, { "epoch": 0.01074202195065048, "grad_norm": 5.145776271820068, "learning_rate": 4.946323459065343e-05, "loss": 1.6081, "step": 6400 }, { "epoch": 0.01074202195065048, "eval_loss": 1.6661970615386963, "eval_masked_accuracy": 0.6882591247558594, "eval_runtime": 1.7486, "eval_samples_per_second": 5.719, "eval_steps_per_second": 2.288, "step": 6400 }, { "epoch": 0.010825943997139936, "grad_norm": 5.037006855010986, "learning_rate": 4.945903848832896e-05, "loss": 1.5028, "step": 6450 }, { "epoch": 0.010825943997139936, "eval_loss": 1.4679136276245117, "eval_masked_accuracy": 0.714893639087677, "eval_runtime": 1.7514, "eval_samples_per_second": 5.71, "eval_steps_per_second": 2.284, "step": 6450 }, { "epoch": 0.010909866043629394, "grad_norm": 5.618253707885742, "learning_rate": 4.945484238600449e-05, "loss": 1.5477, "step": 6500 }, { "epoch": 0.010909866043629394, "eval_loss": 1.6666347980499268, "eval_masked_accuracy": 0.7094017267227173, "eval_runtime": 1.7486, "eval_samples_per_second": 5.719, "eval_steps_per_second": 2.288, "step": 6500 }, { "epoch": 0.01099378809011885, "grad_norm": 14.34435749053955, "learning_rate": 4.945064628368002e-05, "loss": 1.6291, "step": 6550 }, { "epoch": 0.01099378809011885, "eval_loss": 1.8381481170654297, "eval_masked_accuracy": 0.6547085046768188, "eval_runtime": 1.7548, "eval_samples_per_second": 5.699, "eval_steps_per_second": 2.279, "step": 6550 }, { "epoch": 0.011077710136608307, "grad_norm": 4.846654891967773, "learning_rate": 4.9446450181355546e-05, "loss": 1.6077, "step": 6600 }, { "epoch": 0.011077710136608307, "eval_loss": 1.5568077564239502, "eval_masked_accuracy": 0.6872428059577942, "eval_runtime": 1.7324, "eval_samples_per_second": 5.772, "eval_steps_per_second": 2.309, "step": 6600 }, { "epoch": 0.011161632183097764, "grad_norm": 5.304859161376953, "learning_rate": 4.944225407903107e-05, "loss": 1.5758, "step": 6650 }, { "epoch": 0.011161632183097764, "eval_loss": 1.3110054731369019, "eval_masked_accuracy": 0.7312775254249573, "eval_runtime": 1.7439, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.294, "step": 6650 }, { "epoch": 0.01124555422958722, "grad_norm": 6.187143802642822, "learning_rate": 4.9438057976706596e-05, "loss": 1.5817, "step": 6700 }, { "epoch": 0.01124555422958722, "eval_loss": 1.7989356517791748, "eval_masked_accuracy": 0.6666666865348816, "eval_runtime": 1.754, "eval_samples_per_second": 5.701, "eval_steps_per_second": 2.28, "step": 6700 }, { "epoch": 0.011329476276076679, "grad_norm": 5.595826148986816, "learning_rate": 4.9433861874382124e-05, "loss": 1.6367, "step": 6750 }, { "epoch": 0.011329476276076679, "eval_loss": 1.7425569295883179, "eval_masked_accuracy": 0.6583333611488342, "eval_runtime": 1.7467, "eval_samples_per_second": 5.725, "eval_steps_per_second": 2.29, "step": 6750 }, { "epoch": 0.011413398322566135, "grad_norm": 4.125125408172607, "learning_rate": 4.942966577205765e-05, "loss": 1.641, "step": 6800 }, { "epoch": 0.011413398322566135, "eval_loss": 1.728715181350708, "eval_masked_accuracy": 0.6652892827987671, "eval_runtime": 1.772, "eval_samples_per_second": 5.643, "eval_steps_per_second": 2.257, "step": 6800 }, { "epoch": 0.011497320369055592, "grad_norm": 6.3898844718933105, "learning_rate": 4.942546966973318e-05, "loss": 1.6574, "step": 6850 }, { "epoch": 0.011497320369055592, "eval_loss": 1.8261781930923462, "eval_masked_accuracy": 0.6752136945724487, "eval_runtime": 1.7446, "eval_samples_per_second": 5.732, "eval_steps_per_second": 2.293, "step": 6850 }, { "epoch": 0.011581242415545048, "grad_norm": 5.9191155433654785, "learning_rate": 4.942127356740871e-05, "loss": 1.5732, "step": 6900 }, { "epoch": 0.011581242415545048, "eval_loss": 1.2290430068969727, "eval_masked_accuracy": 0.7573221921920776, "eval_runtime": 1.7438, "eval_samples_per_second": 5.735, "eval_steps_per_second": 2.294, "step": 6900 }, { "epoch": 0.011665164462034505, "grad_norm": 5.910600185394287, "learning_rate": 4.941707746508423e-05, "loss": 1.5018, "step": 6950 }, { "epoch": 0.011665164462034505, "eval_loss": 1.3011202812194824, "eval_masked_accuracy": 0.746835470199585, "eval_runtime": 1.739, "eval_samples_per_second": 5.751, "eval_steps_per_second": 2.3, "step": 6950 }, { "epoch": 0.011749086508523963, "grad_norm": 7.273187637329102, "learning_rate": 4.941288136275976e-05, "loss": 1.6083, "step": 7000 }, { "epoch": 0.011749086508523963, "eval_loss": 1.7945482730865479, "eval_masked_accuracy": 0.6719367504119873, "eval_runtime": 1.7495, "eval_samples_per_second": 5.716, "eval_steps_per_second": 2.286, "step": 7000 }, { "epoch": 0.01183300855501342, "grad_norm": 5.980038642883301, "learning_rate": 4.940868526043529e-05, "loss": 1.7157, "step": 7050 }, { "epoch": 0.01183300855501342, "eval_loss": 1.6633656024932861, "eval_masked_accuracy": 0.6859503984451294, "eval_runtime": 1.7603, "eval_samples_per_second": 5.681, "eval_steps_per_second": 2.272, "step": 7050 }, { "epoch": 0.011916930601502876, "grad_norm": 4.222002029418945, "learning_rate": 4.9404489158110817e-05, "loss": 1.4124, "step": 7100 }, { "epoch": 0.011916930601502876, "eval_loss": 1.7207615375518799, "eval_masked_accuracy": 0.6793248653411865, "eval_runtime": 1.753, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.282, "step": 7100 }, { "epoch": 0.012000852647992333, "grad_norm": 8.79937744140625, "learning_rate": 4.9400293055786345e-05, "loss": 1.5698, "step": 7150 }, { "epoch": 0.012000852647992333, "eval_loss": 1.5078874826431274, "eval_masked_accuracy": 0.7276119589805603, "eval_runtime": 1.866, "eval_samples_per_second": 5.359, "eval_steps_per_second": 2.144, "step": 7150 }, { "epoch": 0.01208477469448179, "grad_norm": 6.331279754638672, "learning_rate": 4.939609695346187e-05, "loss": 1.5354, "step": 7200 }, { "epoch": 0.01208477469448179, "eval_loss": 1.3983685970306396, "eval_masked_accuracy": 0.7590909004211426, "eval_runtime": 1.7632, "eval_samples_per_second": 5.672, "eval_steps_per_second": 2.269, "step": 7200 }, { "epoch": 0.012168696740971246, "grad_norm": 4.12935733795166, "learning_rate": 4.9391900851137395e-05, "loss": 1.4778, "step": 7250 }, { "epoch": 0.012168696740971246, "eval_loss": 1.7603422403335571, "eval_masked_accuracy": 0.686956524848938, "eval_runtime": 1.7504, "eval_samples_per_second": 5.713, "eval_steps_per_second": 2.285, "step": 7250 }, { "epoch": 0.012252618787460704, "grad_norm": 5.025778293609619, "learning_rate": 4.9387704748812923e-05, "loss": 1.5175, "step": 7300 }, { "epoch": 0.012252618787460704, "eval_loss": 1.7313247919082642, "eval_masked_accuracy": 0.6872428059577942, "eval_runtime": 1.745, "eval_samples_per_second": 5.731, "eval_steps_per_second": 2.292, "step": 7300 }, { "epoch": 0.01233654083395016, "grad_norm": 9.704473495483398, "learning_rate": 4.938350864648845e-05, "loss": 1.4634, "step": 7350 }, { "epoch": 0.01233654083395016, "eval_loss": 1.271333932876587, "eval_masked_accuracy": 0.7397260069847107, "eval_runtime": 1.7484, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 7350 }, { "epoch": 0.012420462880439617, "grad_norm": 6.080599308013916, "learning_rate": 4.937931254416398e-05, "loss": 1.5937, "step": 7400 }, { "epoch": 0.012420462880439617, "eval_loss": 1.4850938320159912, "eval_masked_accuracy": 0.7280701994895935, "eval_runtime": 1.7517, "eval_samples_per_second": 5.709, "eval_steps_per_second": 2.284, "step": 7400 }, { "epoch": 0.012504384926929074, "grad_norm": 3.824946880340576, "learning_rate": 4.937511644183951e-05, "loss": 1.6026, "step": 7450 }, { "epoch": 0.012504384926929074, "eval_loss": 1.5267841815948486, "eval_masked_accuracy": 0.7058823704719543, "eval_runtime": 1.7438, "eval_samples_per_second": 5.734, "eval_steps_per_second": 2.294, "step": 7450 }, { "epoch": 0.01258830697341853, "grad_norm": 4.5395989418029785, "learning_rate": 4.937092033951503e-05, "loss": 1.4575, "step": 7500 }, { "epoch": 0.01258830697341853, "eval_loss": 1.4801056385040283, "eval_masked_accuracy": 0.680672287940979, "eval_runtime": 1.7409, "eval_samples_per_second": 5.744, "eval_steps_per_second": 2.298, "step": 7500 }, { "epoch": 0.012672229019907989, "grad_norm": 6.853204250335693, "learning_rate": 4.936672423719056e-05, "loss": 1.4224, "step": 7550 }, { "epoch": 0.012672229019907989, "eval_loss": 1.6892282962799072, "eval_masked_accuracy": 0.6551724076271057, "eval_runtime": 1.7414, "eval_samples_per_second": 5.742, "eval_steps_per_second": 2.297, "step": 7550 }, { "epoch": 0.012756151066397445, "grad_norm": 5.53077507019043, "learning_rate": 4.936252813486609e-05, "loss": 1.6706, "step": 7600 }, { "epoch": 0.012756151066397445, "eval_loss": 1.4235472679138184, "eval_masked_accuracy": 0.7426160573959351, "eval_runtime": 1.8082, "eval_samples_per_second": 5.53, "eval_steps_per_second": 2.212, "step": 7600 }, { "epoch": 0.012840073112886902, "grad_norm": 4.5907087326049805, "learning_rate": 4.9358332032541616e-05, "loss": 1.6674, "step": 7650 }, { "epoch": 0.012840073112886902, "eval_loss": 1.4942524433135986, "eval_masked_accuracy": 0.7172995805740356, "eval_runtime": 1.7449, "eval_samples_per_second": 5.731, "eval_steps_per_second": 2.292, "step": 7650 }, { "epoch": 0.012923995159376358, "grad_norm": 8.004353523254395, "learning_rate": 4.9354135930217144e-05, "loss": 1.4294, "step": 7700 }, { "epoch": 0.012923995159376358, "eval_loss": 1.7548024654388428, "eval_masked_accuracy": 0.6547619104385376, "eval_runtime": 1.7767, "eval_samples_per_second": 5.628, "eval_steps_per_second": 2.251, "step": 7700 }, { "epoch": 0.013007917205865815, "grad_norm": 6.963031768798828, "learning_rate": 4.934993982789267e-05, "loss": 1.5078, "step": 7750 }, { "epoch": 0.013007917205865815, "eval_loss": 1.4269187450408936, "eval_masked_accuracy": 0.7027027010917664, "eval_runtime": 1.7471, "eval_samples_per_second": 5.724, "eval_steps_per_second": 2.29, "step": 7750 }, { "epoch": 0.013091839252355273, "grad_norm": 6.4043288230896, "learning_rate": 4.9345743725568194e-05, "loss": 1.604, "step": 7800 }, { "epoch": 0.013091839252355273, "eval_loss": 1.4502145051956177, "eval_masked_accuracy": 0.7172995805740356, "eval_runtime": 1.748, "eval_samples_per_second": 5.721, "eval_steps_per_second": 2.288, "step": 7800 }, { "epoch": 0.01317576129884473, "grad_norm": 5.293691158294678, "learning_rate": 4.934154762324372e-05, "loss": 1.6301, "step": 7850 }, { "epoch": 0.01317576129884473, "eval_loss": 1.3547624349594116, "eval_masked_accuracy": 0.7759336233139038, "eval_runtime": 1.7437, "eval_samples_per_second": 5.735, "eval_steps_per_second": 2.294, "step": 7850 }, { "epoch": 0.013259683345334186, "grad_norm": 7.364100933074951, "learning_rate": 4.933735152091925e-05, "loss": 1.5163, "step": 7900 }, { "epoch": 0.013259683345334186, "eval_loss": 1.6089417934417725, "eval_masked_accuracy": 0.6610878705978394, "eval_runtime": 1.753, "eval_samples_per_second": 5.704, "eval_steps_per_second": 2.282, "step": 7900 }, { "epoch": 0.013343605391823643, "grad_norm": 7.704033851623535, "learning_rate": 4.933315541859478e-05, "loss": 1.6564, "step": 7950 }, { "epoch": 0.013343605391823643, "eval_loss": 1.4759953022003174, "eval_masked_accuracy": 0.6958333253860474, "eval_runtime": 1.7614, "eval_samples_per_second": 5.677, "eval_steps_per_second": 2.271, "step": 7950 }, { "epoch": 0.0134275274383131, "grad_norm": 5.562460899353027, "learning_rate": 4.932895931627031e-05, "loss": 1.5703, "step": 8000 }, { "epoch": 0.0134275274383131, "eval_loss": 1.735896348953247, "eval_masked_accuracy": 0.6875, "eval_runtime": 1.7493, "eval_samples_per_second": 5.717, "eval_steps_per_second": 2.287, "step": 8000 }, { "epoch": 0.013511449484802556, "grad_norm": 8.801225662231445, "learning_rate": 4.9324763213945836e-05, "loss": 1.5328, "step": 8050 }, { "epoch": 0.013511449484802556, "eval_loss": 1.2792503833770752, "eval_masked_accuracy": 0.7292576432228088, "eval_runtime": 1.7802, "eval_samples_per_second": 5.617, "eval_steps_per_second": 2.247, "step": 8050 }, { "epoch": 0.013595371531292014, "grad_norm": 5.510076999664307, "learning_rate": 4.932056711162136e-05, "loss": 1.5086, "step": 8100 }, { "epoch": 0.013595371531292014, "eval_loss": 1.811342477798462, "eval_masked_accuracy": 0.6508620977401733, "eval_runtime": 1.7772, "eval_samples_per_second": 5.627, "eval_steps_per_second": 2.251, "step": 8100 }, { "epoch": 0.01367929357778147, "grad_norm": 4.370019912719727, "learning_rate": 4.9316371009296886e-05, "loss": 1.5992, "step": 8150 }, { "epoch": 0.01367929357778147, "eval_loss": 1.7015224695205688, "eval_masked_accuracy": 0.6945606470108032, "eval_runtime": 1.7399, "eval_samples_per_second": 5.747, "eval_steps_per_second": 2.299, "step": 8150 }, { "epoch": 0.013763215624270927, "grad_norm": 5.960280895233154, "learning_rate": 4.9312174906972415e-05, "loss": 1.6392, "step": 8200 }, { "epoch": 0.013763215624270927, "eval_loss": 1.4644631147384644, "eval_masked_accuracy": 0.7004830837249756, "eval_runtime": 1.7493, "eval_samples_per_second": 5.717, "eval_steps_per_second": 2.287, "step": 8200 }, { "epoch": 0.013847137670760384, "grad_norm": 5.401033878326416, "learning_rate": 4.930797880464794e-05, "loss": 1.6492, "step": 8250 }, { "epoch": 0.013847137670760384, "eval_loss": 1.5244245529174805, "eval_masked_accuracy": 0.688034176826477, "eval_runtime": 1.7597, "eval_samples_per_second": 5.683, "eval_steps_per_second": 2.273, "step": 8250 }, { "epoch": 0.01393105971724984, "grad_norm": 7.356916427612305, "learning_rate": 4.930378270232347e-05, "loss": 1.5673, "step": 8300 }, { "epoch": 0.01393105971724984, "eval_loss": 1.4024368524551392, "eval_masked_accuracy": 0.7016806602478027, "eval_runtime": 1.7463, "eval_samples_per_second": 5.726, "eval_steps_per_second": 2.291, "step": 8300 }, { "epoch": 0.014014981763739299, "grad_norm": 5.370472431182861, "learning_rate": 4.929958659999899e-05, "loss": 1.5267, "step": 8350 }, { "epoch": 0.014014981763739299, "eval_loss": 1.7430174350738525, "eval_masked_accuracy": 0.6653061509132385, "eval_runtime": 1.7353, "eval_samples_per_second": 5.763, "eval_steps_per_second": 2.305, "step": 8350 }, { "epoch": 0.014098903810228755, "grad_norm": 6.4656500816345215, "learning_rate": 4.929539049767452e-05, "loss": 1.5918, "step": 8400 }, { "epoch": 0.014098903810228755, "eval_loss": 1.691054344177246, "eval_masked_accuracy": 0.6849315166473389, "eval_runtime": 1.746, "eval_samples_per_second": 5.727, "eval_steps_per_second": 2.291, "step": 8400 }, { "epoch": 0.014182825856718212, "grad_norm": 5.481358051300049, "learning_rate": 4.929119439535005e-05, "loss": 1.5156, "step": 8450 }, { "epoch": 0.014182825856718212, "eval_loss": 1.6469824314117432, "eval_masked_accuracy": 0.6516393423080444, "eval_runtime": 1.7423, "eval_samples_per_second": 5.74, "eval_steps_per_second": 2.296, "step": 8450 }, { "epoch": 0.014266747903207668, "grad_norm": 4.755044937133789, "learning_rate": 4.928699829302558e-05, "loss": 1.5223, "step": 8500 }, { "epoch": 0.014266747903207668, "eval_loss": 1.667824387550354, "eval_masked_accuracy": 0.7068965435028076, "eval_runtime": 1.7979, "eval_samples_per_second": 5.562, "eval_steps_per_second": 2.225, "step": 8500 }, { "epoch": 0.014350669949697125, "grad_norm": 6.595943450927734, "learning_rate": 4.928280219070111e-05, "loss": 1.4699, "step": 8550 }, { "epoch": 0.014350669949697125, "eval_loss": 1.2367641925811768, "eval_masked_accuracy": 0.7447698712348938, "eval_runtime": 1.7387, "eval_samples_per_second": 5.752, "eval_steps_per_second": 2.301, "step": 8550 }, { "epoch": 0.014434591996186583, "grad_norm": 3.9210710525512695, "learning_rate": 4.9278606088376635e-05, "loss": 1.5695, "step": 8600 }, { "epoch": 0.014434591996186583, "eval_loss": 1.3033006191253662, "eval_masked_accuracy": 0.693965494632721, "eval_runtime": 1.7554, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 8600 }, { "epoch": 0.01451851404267604, "grad_norm": 4.682461261749268, "learning_rate": 4.927440998605216e-05, "loss": 1.5371, "step": 8650 }, { "epoch": 0.01451851404267604, "eval_loss": 1.727216124534607, "eval_masked_accuracy": 0.6639004349708557, "eval_runtime": 1.7387, "eval_samples_per_second": 5.751, "eval_steps_per_second": 2.301, "step": 8650 }, { "epoch": 0.014602436089165496, "grad_norm": 4.478100776672363, "learning_rate": 4.9270213883727685e-05, "loss": 1.5679, "step": 8700 }, { "epoch": 0.014602436089165496, "eval_loss": 1.4694969654083252, "eval_masked_accuracy": 0.7364016771316528, "eval_runtime": 1.7474, "eval_samples_per_second": 5.723, "eval_steps_per_second": 2.289, "step": 8700 }, { "epoch": 0.014686358135654953, "grad_norm": 8.149710655212402, "learning_rate": 4.9266017781403214e-05, "loss": 1.4814, "step": 8750 }, { "epoch": 0.014686358135654953, "eval_loss": 1.9258610010147095, "eval_masked_accuracy": 0.6228070259094238, "eval_runtime": 1.7513, "eval_samples_per_second": 5.71, "eval_steps_per_second": 2.284, "step": 8750 }, { "epoch": 0.01477028018214441, "grad_norm": 4.727016925811768, "learning_rate": 4.926182167907874e-05, "loss": 1.609, "step": 8800 }, { "epoch": 0.01477028018214441, "eval_loss": 1.6111774444580078, "eval_masked_accuracy": 0.6590038537979126, "eval_runtime": 1.7579, "eval_samples_per_second": 5.689, "eval_steps_per_second": 2.275, "step": 8800 }, { "epoch": 0.014854202228633867, "grad_norm": 5.348945140838623, "learning_rate": 4.925762557675427e-05, "loss": 1.5557, "step": 8850 }, { "epoch": 0.014854202228633867, "eval_loss": 1.3535053730010986, "eval_masked_accuracy": 0.7245762944221497, "eval_runtime": 1.8639, "eval_samples_per_second": 5.365, "eval_steps_per_second": 2.146, "step": 8850 }, { "epoch": 0.014938124275123324, "grad_norm": 6.573589324951172, "learning_rate": 4.92534294744298e-05, "loss": 1.6389, "step": 8900 }, { "epoch": 0.014938124275123324, "eval_loss": 1.8509418964385986, "eval_masked_accuracy": 0.7085201740264893, "eval_runtime": 1.7536, "eval_samples_per_second": 5.703, "eval_steps_per_second": 2.281, "step": 8900 }, { "epoch": 0.01502204632161278, "grad_norm": 7.373574256896973, "learning_rate": 4.924923337210532e-05, "loss": 1.4773, "step": 8950 }, { "epoch": 0.01502204632161278, "eval_loss": 1.7772554159164429, "eval_masked_accuracy": 0.6640625, "eval_runtime": 1.7655, "eval_samples_per_second": 5.664, "eval_steps_per_second": 2.266, "step": 8950 }, { "epoch": 0.015105968368102237, "grad_norm": 5.861003875732422, "learning_rate": 4.924503726978085e-05, "loss": 1.3842, "step": 9000 }, { "epoch": 0.015105968368102237, "eval_loss": 1.6182334423065186, "eval_masked_accuracy": 0.7183098793029785, "eval_runtime": 1.7386, "eval_samples_per_second": 5.752, "eval_steps_per_second": 2.301, "step": 9000 }, { "epoch": 0.015189890414591694, "grad_norm": 5.086306571960449, "learning_rate": 4.924084116745638e-05, "loss": 1.6445, "step": 9050 }, { "epoch": 0.015189890414591694, "eval_loss": 1.3457679748535156, "eval_masked_accuracy": 0.752293586730957, "eval_runtime": 1.7595, "eval_samples_per_second": 5.684, "eval_steps_per_second": 2.273, "step": 9050 }, { "epoch": 0.01527381246108115, "grad_norm": 7.099021911621094, "learning_rate": 4.9236645065131906e-05, "loss": 1.5536, "step": 9100 }, { "epoch": 0.01527381246108115, "eval_loss": 1.8317623138427734, "eval_masked_accuracy": 0.6588628888130188, "eval_runtime": 1.8424, "eval_samples_per_second": 5.428, "eval_steps_per_second": 2.171, "step": 9100 }, { "epoch": 0.015357734507570608, "grad_norm": 6.620283126831055, "learning_rate": 4.9232448962807434e-05, "loss": 1.5151, "step": 9150 }, { "epoch": 0.015357734507570608, "eval_loss": 1.4230843782424927, "eval_masked_accuracy": 0.700421929359436, "eval_runtime": 1.7611, "eval_samples_per_second": 5.678, "eval_steps_per_second": 2.271, "step": 9150 }, { "epoch": 0.015441656554060065, "grad_norm": 7.231357097625732, "learning_rate": 4.922825286048296e-05, "loss": 1.6078, "step": 9200 }, { "epoch": 0.015441656554060065, "eval_loss": 1.7547998428344727, "eval_masked_accuracy": 0.6745283007621765, "eval_runtime": 1.8328, "eval_samples_per_second": 5.456, "eval_steps_per_second": 2.182, "step": 9200 }, { "epoch": 0.015525578600549522, "grad_norm": 4.755532264709473, "learning_rate": 4.9224140680204975e-05, "loss": 1.5938, "step": 9250 }, { "epoch": 0.015525578600549522, "eval_loss": 1.3346257209777832, "eval_masked_accuracy": 0.7244444489479065, "eval_runtime": 1.7553, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 9250 }, { "epoch": 0.015609500647038978, "grad_norm": 5.728196620941162, "learning_rate": 4.92199445778805e-05, "loss": 1.5542, "step": 9300 }, { "epoch": 0.015609500647038978, "eval_loss": 1.6833394765853882, "eval_masked_accuracy": 0.6654929518699646, "eval_runtime": 1.7516, "eval_samples_per_second": 5.709, "eval_steps_per_second": 2.284, "step": 9300 }, { "epoch": 0.015693422693528435, "grad_norm": 5.66224479675293, "learning_rate": 4.921574847555603e-05, "loss": 1.6099, "step": 9350 }, { "epoch": 0.015693422693528435, "eval_loss": 1.442452311515808, "eval_masked_accuracy": 0.6905829310417175, "eval_runtime": 1.7553, "eval_samples_per_second": 5.697, "eval_steps_per_second": 2.279, "step": 9350 }, { "epoch": 0.015777344740017893, "grad_norm": 6.560795307159424, "learning_rate": 4.921155237323155e-05, "loss": 1.4188, "step": 9400 }, { "epoch": 0.015777344740017893, "eval_loss": 1.539738416671753, "eval_masked_accuracy": 0.68359375, "eval_runtime": 1.7406, "eval_samples_per_second": 5.745, "eval_steps_per_second": 2.298, "step": 9400 }, { "epoch": 0.015861266786507348, "grad_norm": 4.9847025871276855, "learning_rate": 4.920735627090708e-05, "loss": 1.6344, "step": 9450 }, { "epoch": 0.015861266786507348, "eval_loss": 1.244769811630249, "eval_masked_accuracy": 0.7078189253807068, "eval_runtime": 1.77, "eval_samples_per_second": 5.65, "eval_steps_per_second": 2.26, "step": 9450 }, { "epoch": 0.015945188832996806, "grad_norm": 6.173788070678711, "learning_rate": 4.920316016858261e-05, "loss": 1.6249, "step": 9500 }, { "epoch": 0.015945188832996806, "eval_loss": 2.0483577251434326, "eval_masked_accuracy": 0.607594907283783, "eval_runtime": 1.7538, "eval_samples_per_second": 5.702, "eval_steps_per_second": 2.281, "step": 9500 }, { "epoch": 0.016029110879486264, "grad_norm": 4.4076828956604, "learning_rate": 4.919896406625814e-05, "loss": 1.505, "step": 9550 }, { "epoch": 0.016029110879486264, "eval_loss": 1.7403160333633423, "eval_masked_accuracy": 0.7048457860946655, "eval_runtime": 1.7491, "eval_samples_per_second": 5.717, "eval_steps_per_second": 2.287, "step": 9550 }, { "epoch": 0.01611303292597572, "grad_norm": 6.358312129974365, "learning_rate": 4.919476796393366e-05, "loss": 1.655, "step": 9600 }, { "epoch": 0.01611303292597572, "eval_loss": 1.8444688320159912, "eval_masked_accuracy": 0.6808510422706604, "eval_runtime": 1.7573, "eval_samples_per_second": 5.691, "eval_steps_per_second": 2.276, "step": 9600 }, { "epoch": 0.016196954972465177, "grad_norm": 6.645698547363281, "learning_rate": 4.919057186160919e-05, "loss": 1.5926, "step": 9650 }, { "epoch": 0.016196954972465177, "eval_loss": 1.6228317022323608, "eval_masked_accuracy": 0.65625, "eval_runtime": 1.8422, "eval_samples_per_second": 5.428, "eval_steps_per_second": 2.171, "step": 9650 }, { "epoch": 0.016280877018954632, "grad_norm": 5.672697067260742, "learning_rate": 4.918637575928472e-05, "loss": 1.4762, "step": 9700 }, { "epoch": 0.016280877018954632, "eval_loss": 1.5051512718200684, "eval_masked_accuracy": 0.6943231225013733, "eval_runtime": 1.7515, "eval_samples_per_second": 5.709, "eval_steps_per_second": 2.284, "step": 9700 }, { "epoch": 0.01636479906544409, "grad_norm": 5.369190216064453, "learning_rate": 4.9182179656960245e-05, "loss": 1.5021, "step": 9750 }, { "epoch": 0.01636479906544409, "eval_loss": 1.7301708459854126, "eval_masked_accuracy": 0.6593886613845825, "eval_runtime": 1.7374, "eval_samples_per_second": 5.756, "eval_steps_per_second": 2.302, "step": 9750 }, { "epoch": 0.01644872111193355, "grad_norm": 4.986740589141846, "learning_rate": 4.917798355463577e-05, "loss": 1.5618, "step": 9800 }, { "epoch": 0.01644872111193355, "eval_loss": 1.3315510749816895, "eval_masked_accuracy": 0.700421929359436, "eval_runtime": 1.7373, "eval_samples_per_second": 5.756, "eval_steps_per_second": 2.302, "step": 9800 }, { "epoch": 0.016532643158423004, "grad_norm": 7.441061973571777, "learning_rate": 4.9173787452311295e-05, "loss": 1.5428, "step": 9850 }, { "epoch": 0.016532643158423004, "eval_loss": 1.6381117105484009, "eval_masked_accuracy": 0.6695652008056641, "eval_runtime": 1.7386, "eval_samples_per_second": 5.752, "eval_steps_per_second": 2.301, "step": 9850 }, { "epoch": 0.016616565204912462, "grad_norm": 6.459640979766846, "learning_rate": 4.9169591349986824e-05, "loss": 1.4702, "step": 9900 }, { "epoch": 0.016616565204912462, "eval_loss": 1.537841796875, "eval_masked_accuracy": 0.6741573214530945, "eval_runtime": 1.7482, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 9900 }, { "epoch": 0.016700487251401917, "grad_norm": 6.058482646942139, "learning_rate": 4.916539524766235e-05, "loss": 1.5765, "step": 9950 }, { "epoch": 0.016700487251401917, "eval_loss": 1.688913345336914, "eval_masked_accuracy": 0.692307710647583, "eval_runtime": 1.7482, "eval_samples_per_second": 5.72, "eval_steps_per_second": 2.288, "step": 9950 }, { "epoch": 0.016784409297891375, "grad_norm": 4.960835933685303, "learning_rate": 4.916119914533788e-05, "loss": 1.544, "step": 10000 }, { "epoch": 0.016784409297891375, "eval_loss": 1.7901655435562134, "eval_masked_accuracy": 0.6443514823913574, "eval_runtime": 1.7882, "eval_samples_per_second": 5.592, "eval_steps_per_second": 2.237, "step": 10000 } ], "logging_steps": 50, "max_steps": 595791, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.617791736784392e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }