| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.016784409297891375, | |
| "eval_steps": 50, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 8.392204648945687e-05, | |
| "grad_norm": 3.729597806930542, | |
| "learning_rate": 4.999588781972202e-05, | |
| "loss": 1.6987, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 8.392204648945687e-05, | |
| "eval_loss": 1.8344682455062866, | |
| "eval_masked_accuracy": 0.6726457476615906, | |
| "eval_runtime": 1.754, | |
| "eval_samples_per_second": 5.701, | |
| "eval_steps_per_second": 2.281, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00016784409297891374, | |
| "grad_norm": 6.472073078155518, | |
| "learning_rate": 4.999169171739755e-05, | |
| "loss": 1.7415, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.00016784409297891374, | |
| "eval_loss": 1.7104336023330688, | |
| "eval_masked_accuracy": 0.6737288236618042, | |
| "eval_runtime": 1.7376, | |
| "eval_samples_per_second": 5.755, | |
| "eval_steps_per_second": 2.302, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0002517661394683706, | |
| "grad_norm": 5.799453258514404, | |
| "learning_rate": 4.998749561507307e-05, | |
| "loss": 1.736, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0002517661394683706, | |
| "eval_loss": 1.877158761024475, | |
| "eval_masked_accuracy": 0.6936936974525452, | |
| "eval_runtime": 1.7454, | |
| "eval_samples_per_second": 5.729, | |
| "eval_steps_per_second": 2.292, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0003356881859578275, | |
| "grad_norm": 9.896933555603027, | |
| "learning_rate": 4.99832995127486e-05, | |
| "loss": 1.7919, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0003356881859578275, | |
| "eval_loss": 1.424491047859192, | |
| "eval_masked_accuracy": 0.7206477522850037, | |
| "eval_runtime": 1.7315, | |
| "eval_samples_per_second": 5.775, | |
| "eval_steps_per_second": 2.31, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.00041961023244728435, | |
| "grad_norm": 4.745198726654053, | |
| "learning_rate": 4.997910341042413e-05, | |
| "loss": 1.7252, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.00041961023244728435, | |
| "eval_loss": 1.915906310081482, | |
| "eval_masked_accuracy": 0.6486486196517944, | |
| "eval_runtime": 1.7431, | |
| "eval_samples_per_second": 5.737, | |
| "eval_steps_per_second": 2.295, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0005035322789367412, | |
| "grad_norm": 6.004683971405029, | |
| "learning_rate": 4.9974907308099657e-05, | |
| "loss": 1.7487, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0005035322789367412, | |
| "eval_loss": 1.7426478862762451, | |
| "eval_masked_accuracy": 0.6846473217010498, | |
| "eval_runtime": 1.7474, | |
| "eval_samples_per_second": 5.723, | |
| "eval_steps_per_second": 2.289, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0005874543254261981, | |
| "grad_norm": 8.232338905334473, | |
| "learning_rate": 4.9970711205775185e-05, | |
| "loss": 1.6958, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0005874543254261981, | |
| "eval_loss": 1.8806991577148438, | |
| "eval_masked_accuracy": 0.6256157755851746, | |
| "eval_runtime": 1.7452, | |
| "eval_samples_per_second": 5.73, | |
| "eval_steps_per_second": 2.292, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.000671376371915655, | |
| "grad_norm": 8.929485321044922, | |
| "learning_rate": 4.996651510345071e-05, | |
| "loss": 1.7165, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.000671376371915655, | |
| "eval_loss": 1.6669635772705078, | |
| "eval_masked_accuracy": 0.6816326379776001, | |
| "eval_runtime": 1.7367, | |
| "eval_samples_per_second": 5.758, | |
| "eval_steps_per_second": 2.303, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0007552984184051118, | |
| "grad_norm": 6.171640872955322, | |
| "learning_rate": 4.9962402923172725e-05, | |
| "loss": 1.6222, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0007552984184051118, | |
| "eval_loss": 2.174530506134033, | |
| "eval_masked_accuracy": 0.6891891956329346, | |
| "eval_runtime": 1.7554, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0008392204648945687, | |
| "grad_norm": 4.092519283294678, | |
| "learning_rate": 4.9958206820848254e-05, | |
| "loss": 1.6441, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0008392204648945687, | |
| "eval_loss": 2.060279369354248, | |
| "eval_masked_accuracy": 0.6461538672447205, | |
| "eval_runtime": 1.8075, | |
| "eval_samples_per_second": 5.532, | |
| "eval_steps_per_second": 2.213, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0009231425113840256, | |
| "grad_norm": 5.34571647644043, | |
| "learning_rate": 4.995401071852378e-05, | |
| "loss": 1.7198, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0009231425113840256, | |
| "eval_loss": 1.6280667781829834, | |
| "eval_masked_accuracy": 0.6775510311126709, | |
| "eval_runtime": 1.7487, | |
| "eval_samples_per_second": 5.718, | |
| "eval_steps_per_second": 2.287, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0010070645578734824, | |
| "grad_norm": 4.286564350128174, | |
| "learning_rate": 4.994981461619931e-05, | |
| "loss": 1.6823, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0010070645578734824, | |
| "eval_loss": 1.5270774364471436, | |
| "eval_masked_accuracy": 0.6832579374313354, | |
| "eval_runtime": 1.7459, | |
| "eval_samples_per_second": 5.728, | |
| "eval_steps_per_second": 2.291, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0010909866043629394, | |
| "grad_norm": 3.7731900215148926, | |
| "learning_rate": 4.994561851387484e-05, | |
| "loss": 1.573, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0010909866043629394, | |
| "eval_loss": 1.522475242614746, | |
| "eval_masked_accuracy": 0.7423076629638672, | |
| "eval_runtime": 1.7483, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0011749086508523962, | |
| "grad_norm": 4.305816650390625, | |
| "learning_rate": 4.994142241155036e-05, | |
| "loss": 1.6905, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0011749086508523962, | |
| "eval_loss": 1.503122091293335, | |
| "eval_masked_accuracy": 0.67136150598526, | |
| "eval_runtime": 1.7423, | |
| "eval_samples_per_second": 5.74, | |
| "eval_steps_per_second": 2.296, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0012588306973418532, | |
| "grad_norm": 6.982117176055908, | |
| "learning_rate": 4.993722630922589e-05, | |
| "loss": 1.6444, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0012588306973418532, | |
| "eval_loss": 1.7397890090942383, | |
| "eval_masked_accuracy": 0.6946902871131897, | |
| "eval_runtime": 1.7436, | |
| "eval_samples_per_second": 5.735, | |
| "eval_steps_per_second": 2.294, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.00134275274383131, | |
| "grad_norm": 6.332937717437744, | |
| "learning_rate": 4.993303020690142e-05, | |
| "loss": 1.7488, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.00134275274383131, | |
| "eval_loss": 1.407382845878601, | |
| "eval_masked_accuracy": 0.7051281929016113, | |
| "eval_runtime": 1.7398, | |
| "eval_samples_per_second": 5.748, | |
| "eval_steps_per_second": 2.299, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.001426674790320767, | |
| "grad_norm": 5.491461753845215, | |
| "learning_rate": 4.9928834104576946e-05, | |
| "loss": 1.5959, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.001426674790320767, | |
| "eval_loss": 1.8142907619476318, | |
| "eval_masked_accuracy": 0.6625514626502991, | |
| "eval_runtime": 1.7407, | |
| "eval_samples_per_second": 5.745, | |
| "eval_steps_per_second": 2.298, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.0015105968368102237, | |
| "grad_norm": 12.12775707244873, | |
| "learning_rate": 4.9924638002252474e-05, | |
| "loss": 1.6085, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0015105968368102237, | |
| "eval_loss": 1.9904667139053345, | |
| "eval_masked_accuracy": 0.6278026700019836, | |
| "eval_runtime": 1.7503, | |
| "eval_samples_per_second": 5.713, | |
| "eval_steps_per_second": 2.285, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.0015945188832996806, | |
| "grad_norm": 18.452600479125977, | |
| "learning_rate": 4.9920441899928e-05, | |
| "loss": 1.5793, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0015945188832996806, | |
| "eval_loss": 1.797326683998108, | |
| "eval_masked_accuracy": 0.6784313917160034, | |
| "eval_runtime": 1.7403, | |
| "eval_samples_per_second": 5.746, | |
| "eval_steps_per_second": 2.299, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.0016784409297891374, | |
| "grad_norm": 8.000075340270996, | |
| "learning_rate": 4.9916245797603524e-05, | |
| "loss": 1.5353, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0016784409297891374, | |
| "eval_loss": 1.8558744192123413, | |
| "eval_masked_accuracy": 0.6530612111091614, | |
| "eval_runtime": 1.7574, | |
| "eval_samples_per_second": 5.69, | |
| "eval_steps_per_second": 2.276, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.0017623629762785944, | |
| "grad_norm": 3.907064199447632, | |
| "learning_rate": 4.991204969527905e-05, | |
| "loss": 1.5363, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0017623629762785944, | |
| "eval_loss": 2.0765745639801025, | |
| "eval_masked_accuracy": 0.6553191542625427, | |
| "eval_runtime": 1.7983, | |
| "eval_samples_per_second": 5.561, | |
| "eval_steps_per_second": 2.224, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.0018462850227680511, | |
| "grad_norm": 4.185476303100586, | |
| "learning_rate": 4.990785359295458e-05, | |
| "loss": 1.6641, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0018462850227680511, | |
| "eval_loss": 1.5849405527114868, | |
| "eval_masked_accuracy": 0.71074378490448, | |
| "eval_runtime": 1.7601, | |
| "eval_samples_per_second": 5.681, | |
| "eval_steps_per_second": 2.273, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.0019302070692575081, | |
| "grad_norm": 5.447309494018555, | |
| "learning_rate": 4.990365749063011e-05, | |
| "loss": 1.7069, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0019302070692575081, | |
| "eval_loss": 1.6813358068466187, | |
| "eval_masked_accuracy": 0.7231404781341553, | |
| "eval_runtime": 1.7529, | |
| "eval_samples_per_second": 5.705, | |
| "eval_steps_per_second": 2.282, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.002014129115746965, | |
| "grad_norm": 5.904290199279785, | |
| "learning_rate": 4.989946138830564e-05, | |
| "loss": 1.6996, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.002014129115746965, | |
| "eval_loss": 1.6986854076385498, | |
| "eval_masked_accuracy": 0.6554622054100037, | |
| "eval_runtime": 1.7531, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.282, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.002098051162236422, | |
| "grad_norm": 5.6478986740112305, | |
| "learning_rate": 4.989526528598116e-05, | |
| "loss": 1.5291, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.002098051162236422, | |
| "eval_loss": 1.7059627771377563, | |
| "eval_masked_accuracy": 0.6680498123168945, | |
| "eval_runtime": 1.7416, | |
| "eval_samples_per_second": 5.742, | |
| "eval_steps_per_second": 2.297, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.002181973208725879, | |
| "grad_norm": 6.695890426635742, | |
| "learning_rate": 4.989106918365669e-05, | |
| "loss": 1.8386, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.002181973208725879, | |
| "eval_loss": 1.6500450372695923, | |
| "eval_masked_accuracy": 0.6693877577781677, | |
| "eval_runtime": 1.7414, | |
| "eval_samples_per_second": 5.743, | |
| "eval_steps_per_second": 2.297, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.0022658952552153354, | |
| "grad_norm": 4.831510066986084, | |
| "learning_rate": 4.9886873081332217e-05, | |
| "loss": 1.691, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0022658952552153354, | |
| "eval_loss": 1.4610856771469116, | |
| "eval_masked_accuracy": 0.7090163826942444, | |
| "eval_runtime": 1.7413, | |
| "eval_samples_per_second": 5.743, | |
| "eval_steps_per_second": 2.297, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.0023498173017047924, | |
| "grad_norm": 4.90496826171875, | |
| "learning_rate": 4.9882676979007745e-05, | |
| "loss": 1.7116, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0023498173017047924, | |
| "eval_loss": 1.6787996292114258, | |
| "eval_masked_accuracy": 0.6153846383094788, | |
| "eval_runtime": 1.7655, | |
| "eval_samples_per_second": 5.664, | |
| "eval_steps_per_second": 2.266, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.0024337393481942493, | |
| "grad_norm": 5.956592559814453, | |
| "learning_rate": 4.9878480876683273e-05, | |
| "loss": 1.5348, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0024337393481942493, | |
| "eval_loss": 1.7995752096176147, | |
| "eval_masked_accuracy": 0.6759999990463257, | |
| "eval_runtime": 1.7486, | |
| "eval_samples_per_second": 5.719, | |
| "eval_steps_per_second": 2.288, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.0025176613946837063, | |
| "grad_norm": 5.731600761413574, | |
| "learning_rate": 4.9874284774358795e-05, | |
| "loss": 1.5617, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.0025176613946837063, | |
| "eval_loss": 2.028412342071533, | |
| "eval_masked_accuracy": 0.6007905006408691, | |
| "eval_runtime": 1.797, | |
| "eval_samples_per_second": 5.565, | |
| "eval_steps_per_second": 2.226, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.002601583441173163, | |
| "grad_norm": 9.261569023132324, | |
| "learning_rate": 4.9870088672034324e-05, | |
| "loss": 1.7109, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.002601583441173163, | |
| "eval_loss": 1.8843729496002197, | |
| "eval_masked_accuracy": 0.6594203114509583, | |
| "eval_runtime": 1.7575, | |
| "eval_samples_per_second": 5.69, | |
| "eval_steps_per_second": 2.276, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.00268550548766262, | |
| "grad_norm": 7.181281089782715, | |
| "learning_rate": 4.986589256970985e-05, | |
| "loss": 1.6529, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.00268550548766262, | |
| "eval_loss": 1.5639550685882568, | |
| "eval_masked_accuracy": 0.6905829310417175, | |
| "eval_runtime": 1.7429, | |
| "eval_samples_per_second": 5.738, | |
| "eval_steps_per_second": 2.295, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.002769427534152077, | |
| "grad_norm": 5.245086193084717, | |
| "learning_rate": 4.986169646738538e-05, | |
| "loss": 1.6497, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.002769427534152077, | |
| "eval_loss": 1.4776060581207275, | |
| "eval_masked_accuracy": 0.7312775254249573, | |
| "eval_runtime": 1.7496, | |
| "eval_samples_per_second": 5.715, | |
| "eval_steps_per_second": 2.286, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.002853349580641534, | |
| "grad_norm": 5.593554496765137, | |
| "learning_rate": 4.98575003650609e-05, | |
| "loss": 1.5326, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.002853349580641534, | |
| "eval_loss": 2.0159146785736084, | |
| "eval_masked_accuracy": 0.6205357313156128, | |
| "eval_runtime": 1.7381, | |
| "eval_samples_per_second": 5.753, | |
| "eval_steps_per_second": 2.301, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.0029372716271309908, | |
| "grad_norm": 7.406851768493652, | |
| "learning_rate": 4.985330426273643e-05, | |
| "loss": 1.5081, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0029372716271309908, | |
| "eval_loss": 1.508250117301941, | |
| "eval_masked_accuracy": 0.7027027010917664, | |
| "eval_runtime": 1.7474, | |
| "eval_samples_per_second": 5.723, | |
| "eval_steps_per_second": 2.289, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.0030211936736204473, | |
| "grad_norm": 4.5133514404296875, | |
| "learning_rate": 4.984910816041196e-05, | |
| "loss": 1.6619, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0030211936736204473, | |
| "eval_loss": 1.7022559642791748, | |
| "eval_masked_accuracy": 0.694779098033905, | |
| "eval_runtime": 1.748, | |
| "eval_samples_per_second": 5.721, | |
| "eval_steps_per_second": 2.288, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.0031051157201099043, | |
| "grad_norm": 7.173299312591553, | |
| "learning_rate": 4.984491205808749e-05, | |
| "loss": 1.7603, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0031051157201099043, | |
| "eval_loss": 1.6458946466445923, | |
| "eval_masked_accuracy": 0.6636771559715271, | |
| "eval_runtime": 1.8607, | |
| "eval_samples_per_second": 5.374, | |
| "eval_steps_per_second": 2.15, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.0031890377665993613, | |
| "grad_norm": 4.3678975105285645, | |
| "learning_rate": 4.984071595576301e-05, | |
| "loss": 1.6453, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0031890377665993613, | |
| "eval_loss": 1.8176072835922241, | |
| "eval_masked_accuracy": 0.6724137663841248, | |
| "eval_runtime": 1.7589, | |
| "eval_samples_per_second": 5.685, | |
| "eval_steps_per_second": 2.274, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0032729598130888183, | |
| "grad_norm": 7.378585338592529, | |
| "learning_rate": 4.983651985343854e-05, | |
| "loss": 1.6409, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.0032729598130888183, | |
| "eval_loss": 2.0491786003112793, | |
| "eval_masked_accuracy": 0.6374502182006836, | |
| "eval_runtime": 1.757, | |
| "eval_samples_per_second": 5.692, | |
| "eval_steps_per_second": 2.277, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.003356881859578275, | |
| "grad_norm": 4.898635387420654, | |
| "learning_rate": 4.9832323751114066e-05, | |
| "loss": 1.6994, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.003356881859578275, | |
| "eval_loss": 1.4773211479187012, | |
| "eval_masked_accuracy": 0.6739130616188049, | |
| "eval_runtime": 1.7439, | |
| "eval_samples_per_second": 5.734, | |
| "eval_steps_per_second": 2.294, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.003440803906067732, | |
| "grad_norm": 7.465532302856445, | |
| "learning_rate": 4.9828127648789594e-05, | |
| "loss": 1.5798, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.003440803906067732, | |
| "eval_loss": 1.6743123531341553, | |
| "eval_masked_accuracy": 0.6770427823066711, | |
| "eval_runtime": 1.7546, | |
| "eval_samples_per_second": 5.699, | |
| "eval_steps_per_second": 2.28, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.0035247259525571888, | |
| "grad_norm": 7.025172233581543, | |
| "learning_rate": 4.982393154646512e-05, | |
| "loss": 1.7312, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0035247259525571888, | |
| "eval_loss": 1.726737380027771, | |
| "eval_masked_accuracy": 0.6824034452438354, | |
| "eval_runtime": 1.7272, | |
| "eval_samples_per_second": 5.79, | |
| "eval_steps_per_second": 2.316, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.0036086479990466457, | |
| "grad_norm": 8.405756950378418, | |
| "learning_rate": 4.9819735444140644e-05, | |
| "loss": 1.7284, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0036086479990466457, | |
| "eval_loss": 1.8043725490570068, | |
| "eval_masked_accuracy": 0.6153846383094788, | |
| "eval_runtime": 1.7569, | |
| "eval_samples_per_second": 5.692, | |
| "eval_steps_per_second": 2.277, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.0036925700455361023, | |
| "grad_norm": 6.279454231262207, | |
| "learning_rate": 4.981553934181617e-05, | |
| "loss": 1.4629, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0036925700455361023, | |
| "eval_loss": 1.8529506921768188, | |
| "eval_masked_accuracy": 0.6823529601097107, | |
| "eval_runtime": 1.7798, | |
| "eval_samples_per_second": 5.619, | |
| "eval_steps_per_second": 2.247, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.0037764920920255593, | |
| "grad_norm": 7.525041580200195, | |
| "learning_rate": 4.98113432394917e-05, | |
| "loss": 1.5309, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0037764920920255593, | |
| "eval_loss": 1.8144168853759766, | |
| "eval_masked_accuracy": 0.7272727489471436, | |
| "eval_runtime": 1.7418, | |
| "eval_samples_per_second": 5.741, | |
| "eval_steps_per_second": 2.297, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0038604141385150162, | |
| "grad_norm": 6.561546802520752, | |
| "learning_rate": 4.980714713716723e-05, | |
| "loss": 1.6761, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.0038604141385150162, | |
| "eval_loss": 1.8419244289398193, | |
| "eval_masked_accuracy": 0.6638298034667969, | |
| "eval_runtime": 1.7921, | |
| "eval_samples_per_second": 5.58, | |
| "eval_steps_per_second": 2.232, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.003944336185004473, | |
| "grad_norm": 4.7332987785339355, | |
| "learning_rate": 4.980295103484276e-05, | |
| "loss": 1.6738, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.003944336185004473, | |
| "eval_loss": 1.576249122619629, | |
| "eval_masked_accuracy": 0.7078651785850525, | |
| "eval_runtime": 1.7414, | |
| "eval_samples_per_second": 5.742, | |
| "eval_steps_per_second": 2.297, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.00402825823149393, | |
| "grad_norm": 3.7719192504882812, | |
| "learning_rate": 4.979875493251828e-05, | |
| "loss": 1.6432, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.00402825823149393, | |
| "eval_loss": 1.811785340309143, | |
| "eval_masked_accuracy": 0.6746031641960144, | |
| "eval_runtime": 1.7463, | |
| "eval_samples_per_second": 5.726, | |
| "eval_steps_per_second": 2.291, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.004112180277983387, | |
| "grad_norm": 6.218490123748779, | |
| "learning_rate": 4.979455883019381e-05, | |
| "loss": 1.5416, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.004112180277983387, | |
| "eval_loss": 1.6883758306503296, | |
| "eval_masked_accuracy": 0.6900826692581177, | |
| "eval_runtime": 1.7374, | |
| "eval_samples_per_second": 5.756, | |
| "eval_steps_per_second": 2.302, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.004196102324472844, | |
| "grad_norm": 5.042550086975098, | |
| "learning_rate": 4.9790362727869336e-05, | |
| "loss": 1.6701, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.004196102324472844, | |
| "eval_loss": 1.567375898361206, | |
| "eval_masked_accuracy": 0.6788617968559265, | |
| "eval_runtime": 1.735, | |
| "eval_samples_per_second": 5.764, | |
| "eval_steps_per_second": 2.306, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.0042800243709623, | |
| "grad_norm": 6.665520668029785, | |
| "learning_rate": 4.9786166625544865e-05, | |
| "loss": 1.6006, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.0042800243709623, | |
| "eval_loss": 1.659168004989624, | |
| "eval_masked_accuracy": 0.6385542154312134, | |
| "eval_runtime": 1.7434, | |
| "eval_samples_per_second": 5.736, | |
| "eval_steps_per_second": 2.294, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.004363946417451758, | |
| "grad_norm": 4.378693580627441, | |
| "learning_rate": 4.978197052322039e-05, | |
| "loss": 1.6363, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.004363946417451758, | |
| "eval_loss": 1.6367610692977905, | |
| "eval_masked_accuracy": 0.6679389476776123, | |
| "eval_runtime": 1.7407, | |
| "eval_samples_per_second": 5.745, | |
| "eval_steps_per_second": 2.298, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.004447868463941214, | |
| "grad_norm": 8.087454795837402, | |
| "learning_rate": 4.977777442089592e-05, | |
| "loss": 1.5518, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.004447868463941214, | |
| "eval_loss": 2.035369873046875, | |
| "eval_masked_accuracy": 0.64462810754776, | |
| "eval_runtime": 1.7443, | |
| "eval_samples_per_second": 5.733, | |
| "eval_steps_per_second": 2.293, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.004531790510430671, | |
| "grad_norm": 6.383141040802002, | |
| "learning_rate": 4.977357831857144e-05, | |
| "loss": 1.7726, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.004531790510430671, | |
| "eval_loss": 1.9139858484268188, | |
| "eval_masked_accuracy": 0.7137096524238586, | |
| "eval_runtime": 1.7376, | |
| "eval_samples_per_second": 5.755, | |
| "eval_steps_per_second": 2.302, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.004615712556920128, | |
| "grad_norm": 8.098458290100098, | |
| "learning_rate": 4.976938221624697e-05, | |
| "loss": 1.701, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.004615712556920128, | |
| "eval_loss": 1.8784687519073486, | |
| "eval_masked_accuracy": 0.6525096297264099, | |
| "eval_runtime": 1.8538, | |
| "eval_samples_per_second": 5.394, | |
| "eval_steps_per_second": 2.158, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.004699634603409585, | |
| "grad_norm": 5.3736138343811035, | |
| "learning_rate": 4.97651861139225e-05, | |
| "loss": 1.5577, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.004699634603409585, | |
| "eval_loss": 1.6022107601165771, | |
| "eval_masked_accuracy": 0.6907630562782288, | |
| "eval_runtime": 1.7368, | |
| "eval_samples_per_second": 5.758, | |
| "eval_steps_per_second": 2.303, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.004783556649899042, | |
| "grad_norm": 4.617998123168945, | |
| "learning_rate": 4.976099001159803e-05, | |
| "loss": 1.6194, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.004783556649899042, | |
| "eval_loss": 1.398147702217102, | |
| "eval_masked_accuracy": 0.6696035265922546, | |
| "eval_runtime": 1.7507, | |
| "eval_samples_per_second": 5.712, | |
| "eval_steps_per_second": 2.285, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.004867478696388499, | |
| "grad_norm": 4.976247787475586, | |
| "learning_rate": 4.975679390927356e-05, | |
| "loss": 1.6325, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.004867478696388499, | |
| "eval_loss": 1.7178815603256226, | |
| "eval_masked_accuracy": 0.6653696298599243, | |
| "eval_runtime": 1.7533, | |
| "eval_samples_per_second": 5.703, | |
| "eval_steps_per_second": 2.281, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.004951400742877955, | |
| "grad_norm": 5.229081153869629, | |
| "learning_rate": 4.9752597806949085e-05, | |
| "loss": 1.7057, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.004951400742877955, | |
| "eval_loss": 1.8161494731903076, | |
| "eval_masked_accuracy": 0.6431535482406616, | |
| "eval_runtime": 1.7382, | |
| "eval_samples_per_second": 5.753, | |
| "eval_steps_per_second": 2.301, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.005035322789367413, | |
| "grad_norm": 6.112144947052002, | |
| "learning_rate": 4.974840170462461e-05, | |
| "loss": 1.6189, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.005035322789367413, | |
| "eval_loss": 1.8454160690307617, | |
| "eval_masked_accuracy": 0.6767241358757019, | |
| "eval_runtime": 1.7376, | |
| "eval_samples_per_second": 5.755, | |
| "eval_steps_per_second": 2.302, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.005119244835856869, | |
| "grad_norm": 5.7698445320129395, | |
| "learning_rate": 4.9744205602300135e-05, | |
| "loss": 1.6734, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.005119244835856869, | |
| "eval_loss": 1.6155188083648682, | |
| "eval_masked_accuracy": 0.6991525292396545, | |
| "eval_runtime": 1.7469, | |
| "eval_samples_per_second": 5.724, | |
| "eval_steps_per_second": 2.29, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.005203166882346326, | |
| "grad_norm": 11.4446382522583, | |
| "learning_rate": 4.9740009499975664e-05, | |
| "loss": 1.602, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.005203166882346326, | |
| "eval_loss": 1.7193024158477783, | |
| "eval_masked_accuracy": 0.6454545259475708, | |
| "eval_runtime": 1.8085, | |
| "eval_samples_per_second": 5.529, | |
| "eval_steps_per_second": 2.212, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.005287088928835783, | |
| "grad_norm": 4.331955432891846, | |
| "learning_rate": 4.973581339765119e-05, | |
| "loss": 1.5886, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.005287088928835783, | |
| "eval_loss": 1.7239084243774414, | |
| "eval_masked_accuracy": 0.7025862336158752, | |
| "eval_runtime": 1.7507, | |
| "eval_samples_per_second": 5.712, | |
| "eval_steps_per_second": 2.285, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.00537101097532524, | |
| "grad_norm": 6.857669830322266, | |
| "learning_rate": 4.973161729532672e-05, | |
| "loss": 1.6531, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.00537101097532524, | |
| "eval_loss": 1.7898776531219482, | |
| "eval_masked_accuracy": 0.6463878154754639, | |
| "eval_runtime": 1.807, | |
| "eval_samples_per_second": 5.534, | |
| "eval_steps_per_second": 2.214, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.005454933021814697, | |
| "grad_norm": 6.366724491119385, | |
| "learning_rate": 4.972742119300224e-05, | |
| "loss": 1.5112, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.005454933021814697, | |
| "eval_loss": 1.68304443359375, | |
| "eval_masked_accuracy": 0.6958174705505371, | |
| "eval_runtime": 1.7544, | |
| "eval_samples_per_second": 5.7, | |
| "eval_steps_per_second": 2.28, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.005538855068304154, | |
| "grad_norm": 5.657731056213379, | |
| "learning_rate": 4.972322509067777e-05, | |
| "loss": 1.5622, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.005538855068304154, | |
| "eval_loss": 1.7854249477386475, | |
| "eval_masked_accuracy": 0.6833333373069763, | |
| "eval_runtime": 1.7977, | |
| "eval_samples_per_second": 5.563, | |
| "eval_steps_per_second": 2.225, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.00562277711479361, | |
| "grad_norm": 4.501428127288818, | |
| "learning_rate": 4.97190289883533e-05, | |
| "loss": 1.5736, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.00562277711479361, | |
| "eval_loss": 1.4276224374771118, | |
| "eval_masked_accuracy": 0.7192118167877197, | |
| "eval_runtime": 1.7643, | |
| "eval_samples_per_second": 5.668, | |
| "eval_steps_per_second": 2.267, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.005706699161283068, | |
| "grad_norm": 6.436139106750488, | |
| "learning_rate": 4.971483288602883e-05, | |
| "loss": 1.5653, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.005706699161283068, | |
| "eval_loss": 1.674355149269104, | |
| "eval_masked_accuracy": 0.718367338180542, | |
| "eval_runtime": 1.7482, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.005790621207772524, | |
| "grad_norm": 6.295548439025879, | |
| "learning_rate": 4.9710636783704356e-05, | |
| "loss": 1.5556, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.005790621207772524, | |
| "eval_loss": 1.7501426935195923, | |
| "eval_masked_accuracy": 0.7076271176338196, | |
| "eval_runtime": 1.7554, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.0058745432542619816, | |
| "grad_norm": 5.733904838562012, | |
| "learning_rate": 4.9706440681379884e-05, | |
| "loss": 1.5164, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.0058745432542619816, | |
| "eval_loss": 1.520179033279419, | |
| "eval_masked_accuracy": 0.7203390002250671, | |
| "eval_runtime": 1.7629, | |
| "eval_samples_per_second": 5.672, | |
| "eval_steps_per_second": 2.269, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.005958465300751438, | |
| "grad_norm": 5.285616397857666, | |
| "learning_rate": 4.9702244579055406e-05, | |
| "loss": 1.6254, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.005958465300751438, | |
| "eval_loss": 1.7321217060089111, | |
| "eval_masked_accuracy": 0.6712962985038757, | |
| "eval_runtime": 1.7429, | |
| "eval_samples_per_second": 5.738, | |
| "eval_steps_per_second": 2.295, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.006042387347240895, | |
| "grad_norm": 5.386379241943359, | |
| "learning_rate": 4.9698048476730934e-05, | |
| "loss": 1.505, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.006042387347240895, | |
| "eval_loss": 1.7810560464859009, | |
| "eval_masked_accuracy": 0.654618501663208, | |
| "eval_runtime": 1.7454, | |
| "eval_samples_per_second": 5.729, | |
| "eval_steps_per_second": 2.292, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.006126309393730352, | |
| "grad_norm": 6.726806640625, | |
| "learning_rate": 4.969385237440646e-05, | |
| "loss": 1.5011, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.006126309393730352, | |
| "eval_loss": 1.5794349908828735, | |
| "eval_masked_accuracy": 0.6979591846466064, | |
| "eval_runtime": 1.7721, | |
| "eval_samples_per_second": 5.643, | |
| "eval_steps_per_second": 2.257, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.006210231440219809, | |
| "grad_norm": 7.159238815307617, | |
| "learning_rate": 4.968965627208199e-05, | |
| "loss": 1.6134, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.006210231440219809, | |
| "eval_loss": 1.4294860363006592, | |
| "eval_masked_accuracy": 0.7136752009391785, | |
| "eval_runtime": 1.752, | |
| "eval_samples_per_second": 5.708, | |
| "eval_steps_per_second": 2.283, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.006294153486709265, | |
| "grad_norm": 5.560455799102783, | |
| "learning_rate": 4.968546016975752e-05, | |
| "loss": 1.5097, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.006294153486709265, | |
| "eval_loss": 1.9169464111328125, | |
| "eval_masked_accuracy": 0.6929824352264404, | |
| "eval_runtime": 1.7457, | |
| "eval_samples_per_second": 5.728, | |
| "eval_steps_per_second": 2.291, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.0063780755331987226, | |
| "grad_norm": 5.439815998077393, | |
| "learning_rate": 4.968126406743305e-05, | |
| "loss": 1.6706, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.0063780755331987226, | |
| "eval_loss": 1.622685194015503, | |
| "eval_masked_accuracy": 0.6913580298423767, | |
| "eval_runtime": 1.7518, | |
| "eval_samples_per_second": 5.709, | |
| "eval_steps_per_second": 2.283, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.006461997579688179, | |
| "grad_norm": 4.242193698883057, | |
| "learning_rate": 4.967706796510857e-05, | |
| "loss": 1.5511, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.006461997579688179, | |
| "eval_loss": 1.3621394634246826, | |
| "eval_masked_accuracy": 0.7379912734031677, | |
| "eval_runtime": 1.7356, | |
| "eval_samples_per_second": 5.762, | |
| "eval_steps_per_second": 2.305, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.0065459196261776365, | |
| "grad_norm": 5.056567668914795, | |
| "learning_rate": 4.96728718627841e-05, | |
| "loss": 1.6108, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.0065459196261776365, | |
| "eval_loss": 1.5381476879119873, | |
| "eval_masked_accuracy": 0.7165354490280151, | |
| "eval_runtime": 1.7342, | |
| "eval_samples_per_second": 5.767, | |
| "eval_steps_per_second": 2.307, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.006629841672667093, | |
| "grad_norm": 5.566115856170654, | |
| "learning_rate": 4.966867576045963e-05, | |
| "loss": 1.5858, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.006629841672667093, | |
| "eval_loss": 1.9895532131195068, | |
| "eval_masked_accuracy": 0.6399999856948853, | |
| "eval_runtime": 1.7417, | |
| "eval_samples_per_second": 5.742, | |
| "eval_steps_per_second": 2.297, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.00671376371915655, | |
| "grad_norm": 7.843978404998779, | |
| "learning_rate": 4.9664479658135155e-05, | |
| "loss": 1.5999, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.00671376371915655, | |
| "eval_loss": 1.589036464691162, | |
| "eval_masked_accuracy": 0.6991525292396545, | |
| "eval_runtime": 1.7452, | |
| "eval_samples_per_second": 5.73, | |
| "eval_steps_per_second": 2.292, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.006797685765646007, | |
| "grad_norm": 6.600104331970215, | |
| "learning_rate": 4.9660283555810683e-05, | |
| "loss": 1.6444, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.006797685765646007, | |
| "eval_loss": 1.590256929397583, | |
| "eval_masked_accuracy": 0.6895161271095276, | |
| "eval_runtime": 1.7389, | |
| "eval_samples_per_second": 5.751, | |
| "eval_steps_per_second": 2.3, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.006881607812135464, | |
| "grad_norm": 6.0659589767456055, | |
| "learning_rate": 4.965608745348621e-05, | |
| "loss": 1.5554, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.006881607812135464, | |
| "eval_loss": 1.8275972604751587, | |
| "eval_masked_accuracy": 0.6558139324188232, | |
| "eval_runtime": 1.7513, | |
| "eval_samples_per_second": 5.71, | |
| "eval_steps_per_second": 2.284, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.00696552985862492, | |
| "grad_norm": 6.09676456451416, | |
| "learning_rate": 4.9651891351161734e-05, | |
| "loss": 1.7191, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.00696552985862492, | |
| "eval_loss": 1.8767850399017334, | |
| "eval_masked_accuracy": 0.6508620977401733, | |
| "eval_runtime": 1.7474, | |
| "eval_samples_per_second": 5.723, | |
| "eval_steps_per_second": 2.289, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.0070494519051143775, | |
| "grad_norm": 5.208311080932617, | |
| "learning_rate": 4.964769524883726e-05, | |
| "loss": 1.585, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.0070494519051143775, | |
| "eval_loss": 1.3652145862579346, | |
| "eval_masked_accuracy": 0.7037037014961243, | |
| "eval_runtime": 1.7463, | |
| "eval_samples_per_second": 5.726, | |
| "eval_steps_per_second": 2.291, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.007133373951603834, | |
| "grad_norm": 8.517348289489746, | |
| "learning_rate": 4.964349914651279e-05, | |
| "loss": 1.6888, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.007133373951603834, | |
| "eval_loss": 1.347320318222046, | |
| "eval_masked_accuracy": 0.7190082669258118, | |
| "eval_runtime": 1.7446, | |
| "eval_samples_per_second": 5.732, | |
| "eval_steps_per_second": 2.293, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.0072172959980932915, | |
| "grad_norm": 5.57391357421875, | |
| "learning_rate": 4.963930304418832e-05, | |
| "loss": 1.6351, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.0072172959980932915, | |
| "eval_loss": 1.563398003578186, | |
| "eval_masked_accuracy": 0.6952789425849915, | |
| "eval_runtime": 1.7535, | |
| "eval_samples_per_second": 5.703, | |
| "eval_steps_per_second": 2.281, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.007301218044582748, | |
| "grad_norm": 4.073302745819092, | |
| "learning_rate": 4.963510694186385e-05, | |
| "loss": 1.7031, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.007301218044582748, | |
| "eval_loss": 1.7390921115875244, | |
| "eval_masked_accuracy": 0.6963562965393066, | |
| "eval_runtime": 1.8598, | |
| "eval_samples_per_second": 5.377, | |
| "eval_steps_per_second": 2.151, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.007385140091072205, | |
| "grad_norm": 4.129016876220703, | |
| "learning_rate": 4.963091083953937e-05, | |
| "loss": 1.5611, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.007385140091072205, | |
| "eval_loss": 1.7892725467681885, | |
| "eval_masked_accuracy": 0.7015503644943237, | |
| "eval_runtime": 1.7481, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.007469062137561662, | |
| "grad_norm": 8.45355224609375, | |
| "learning_rate": 4.96267147372149e-05, | |
| "loss": 1.679, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.007469062137561662, | |
| "eval_loss": 1.8994945287704468, | |
| "eval_masked_accuracy": 0.6711111068725586, | |
| "eval_runtime": 1.748, | |
| "eval_samples_per_second": 5.721, | |
| "eval_steps_per_second": 2.288, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.0075529841840511185, | |
| "grad_norm": 7.353001594543457, | |
| "learning_rate": 4.9622602556936916e-05, | |
| "loss": 1.5084, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.0075529841840511185, | |
| "eval_loss": 1.6633514165878296, | |
| "eval_masked_accuracy": 0.6792452931404114, | |
| "eval_runtime": 1.7365, | |
| "eval_samples_per_second": 5.759, | |
| "eval_steps_per_second": 2.303, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.007636906230540575, | |
| "grad_norm": 5.420140266418457, | |
| "learning_rate": 4.9618406454612445e-05, | |
| "loss": 1.6768, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.007636906230540575, | |
| "eval_loss": 1.6823314428329468, | |
| "eval_masked_accuracy": 0.700421929359436, | |
| "eval_runtime": 1.7456, | |
| "eval_samples_per_second": 5.729, | |
| "eval_steps_per_second": 2.291, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.0077208282770300325, | |
| "grad_norm": 5.6282572746276855, | |
| "learning_rate": 4.961421035228797e-05, | |
| "loss": 1.5346, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.0077208282770300325, | |
| "eval_loss": 2.210347890853882, | |
| "eval_masked_accuracy": 0.6339285969734192, | |
| "eval_runtime": 1.7553, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.007804750323519489, | |
| "grad_norm": 7.358382701873779, | |
| "learning_rate": 4.96100142499635e-05, | |
| "loss": 1.6792, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.007804750323519489, | |
| "eval_loss": 1.742630958557129, | |
| "eval_masked_accuracy": 0.6728110313415527, | |
| "eval_runtime": 1.7331, | |
| "eval_samples_per_second": 5.77, | |
| "eval_steps_per_second": 2.308, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.007888672370008946, | |
| "grad_norm": 5.980144500732422, | |
| "learning_rate": 4.960581814763902e-05, | |
| "loss": 1.4871, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.007888672370008946, | |
| "eval_loss": 1.4571318626403809, | |
| "eval_masked_accuracy": 0.7166666388511658, | |
| "eval_runtime": 1.7531, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.282, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.007972594416498403, | |
| "grad_norm": 8.18883228302002, | |
| "learning_rate": 4.960162204531455e-05, | |
| "loss": 1.527, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.007972594416498403, | |
| "eval_loss": 2.062413454055786, | |
| "eval_masked_accuracy": 0.6695278882980347, | |
| "eval_runtime": 1.748, | |
| "eval_samples_per_second": 5.721, | |
| "eval_steps_per_second": 2.288, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.00805651646298786, | |
| "grad_norm": 4.835183143615723, | |
| "learning_rate": 4.959742594299008e-05, | |
| "loss": 1.591, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.00805651646298786, | |
| "eval_loss": 1.690118432044983, | |
| "eval_masked_accuracy": 0.7049180269241333, | |
| "eval_runtime": 1.7383, | |
| "eval_samples_per_second": 5.753, | |
| "eval_steps_per_second": 2.301, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.008140438509477316, | |
| "grad_norm": 5.039312362670898, | |
| "learning_rate": 4.959322984066561e-05, | |
| "loss": 1.5386, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.008140438509477316, | |
| "eval_loss": 1.9135382175445557, | |
| "eval_masked_accuracy": 0.6181818246841431, | |
| "eval_runtime": 1.7445, | |
| "eval_samples_per_second": 5.732, | |
| "eval_steps_per_second": 2.293, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.008224360555966774, | |
| "grad_norm": 6.3293890953063965, | |
| "learning_rate": 4.958903373834114e-05, | |
| "loss": 1.4752, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.008224360555966774, | |
| "eval_loss": 1.6353566646575928, | |
| "eval_masked_accuracy": 0.7319999933242798, | |
| "eval_runtime": 1.8458, | |
| "eval_samples_per_second": 5.418, | |
| "eval_steps_per_second": 2.167, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.008308282602456231, | |
| "grad_norm": 7.455787658691406, | |
| "learning_rate": 4.958483763601666e-05, | |
| "loss": 1.5304, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.008308282602456231, | |
| "eval_loss": 1.8691352605819702, | |
| "eval_masked_accuracy": 0.6653386354446411, | |
| "eval_runtime": 1.7533, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.281, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.008392204648945687, | |
| "grad_norm": 5.682205677032471, | |
| "learning_rate": 4.958064153369219e-05, | |
| "loss": 1.5945, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.008392204648945687, | |
| "eval_loss": 1.6161428689956665, | |
| "eval_masked_accuracy": 0.6964285969734192, | |
| "eval_runtime": 1.7814, | |
| "eval_samples_per_second": 5.614, | |
| "eval_steps_per_second": 2.245, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.008476126695435144, | |
| "grad_norm": 6.474329471588135, | |
| "learning_rate": 4.9576445431367715e-05, | |
| "loss": 1.8228, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.008476126695435144, | |
| "eval_loss": 1.4911173582077026, | |
| "eval_masked_accuracy": 0.71875, | |
| "eval_runtime": 1.8052, | |
| "eval_samples_per_second": 5.54, | |
| "eval_steps_per_second": 2.216, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.0085600487419246, | |
| "grad_norm": 4.493051052093506, | |
| "learning_rate": 4.9572249329043244e-05, | |
| "loss": 1.5526, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.0085600487419246, | |
| "eval_loss": 1.4060901403427124, | |
| "eval_masked_accuracy": 0.7131474018096924, | |
| "eval_runtime": 1.8193, | |
| "eval_samples_per_second": 5.497, | |
| "eval_steps_per_second": 2.199, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.008643970788414057, | |
| "grad_norm": 5.657381057739258, | |
| "learning_rate": 4.956805322671877e-05, | |
| "loss": 1.5743, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.008643970788414057, | |
| "eval_loss": 1.7347627878189087, | |
| "eval_masked_accuracy": 0.6392694115638733, | |
| "eval_runtime": 1.7632, | |
| "eval_samples_per_second": 5.671, | |
| "eval_steps_per_second": 2.269, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.008727892834903515, | |
| "grad_norm": 5.059664726257324, | |
| "learning_rate": 4.9563941046440784e-05, | |
| "loss": 1.5923, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.008727892834903515, | |
| "eval_loss": 1.7108001708984375, | |
| "eval_masked_accuracy": 0.6759999990463257, | |
| "eval_runtime": 1.7312, | |
| "eval_samples_per_second": 5.776, | |
| "eval_steps_per_second": 2.311, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.008811814881392972, | |
| "grad_norm": 6.256536483764648, | |
| "learning_rate": 4.955974494411631e-05, | |
| "loss": 1.5454, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.008811814881392972, | |
| "eval_loss": 1.8423763513565063, | |
| "eval_masked_accuracy": 0.6590909361839294, | |
| "eval_runtime": 1.7323, | |
| "eval_samples_per_second": 5.773, | |
| "eval_steps_per_second": 2.309, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.008895736927882428, | |
| "grad_norm": 6.45760440826416, | |
| "learning_rate": 4.955554884179184e-05, | |
| "loss": 1.5381, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.008895736927882428, | |
| "eval_loss": 1.8820030689239502, | |
| "eval_masked_accuracy": 0.6486486196517944, | |
| "eval_runtime": 1.7529, | |
| "eval_samples_per_second": 5.705, | |
| "eval_steps_per_second": 2.282, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.008979658974371885, | |
| "grad_norm": 7.668667793273926, | |
| "learning_rate": 4.955135273946737e-05, | |
| "loss": 1.6363, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.008979658974371885, | |
| "eval_loss": 1.631400465965271, | |
| "eval_masked_accuracy": 0.7160493731498718, | |
| "eval_runtime": 1.7511, | |
| "eval_samples_per_second": 5.711, | |
| "eval_steps_per_second": 2.284, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.009063581020861342, | |
| "grad_norm": 7.2050018310546875, | |
| "learning_rate": 4.954715663714289e-05, | |
| "loss": 1.5738, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.009063581020861342, | |
| "eval_loss": 1.5917881727218628, | |
| "eval_masked_accuracy": 0.7405857443809509, | |
| "eval_runtime": 1.75, | |
| "eval_samples_per_second": 5.714, | |
| "eval_steps_per_second": 2.286, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.0091475030673508, | |
| "grad_norm": 6.094969749450684, | |
| "learning_rate": 4.954296053481842e-05, | |
| "loss": 1.7321, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.0091475030673508, | |
| "eval_loss": 1.5327577590942383, | |
| "eval_masked_accuracy": 0.707317054271698, | |
| "eval_runtime": 1.7423, | |
| "eval_samples_per_second": 5.74, | |
| "eval_steps_per_second": 2.296, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.009231425113840256, | |
| "grad_norm": 8.869881629943848, | |
| "learning_rate": 4.953876443249395e-05, | |
| "loss": 1.5768, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.009231425113840256, | |
| "eval_loss": 1.3501726388931274, | |
| "eval_masked_accuracy": 0.7801724076271057, | |
| "eval_runtime": 1.7732, | |
| "eval_samples_per_second": 5.64, | |
| "eval_steps_per_second": 2.256, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.009315347160329713, | |
| "grad_norm": 4.408574104309082, | |
| "learning_rate": 4.9534568330169476e-05, | |
| "loss": 1.5802, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.009315347160329713, | |
| "eval_loss": 1.7055152654647827, | |
| "eval_masked_accuracy": 0.6707317233085632, | |
| "eval_runtime": 1.7418, | |
| "eval_samples_per_second": 5.741, | |
| "eval_steps_per_second": 2.296, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.00939926920681917, | |
| "grad_norm": 5.3869147300720215, | |
| "learning_rate": 4.9530372227845e-05, | |
| "loss": 1.5547, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.00939926920681917, | |
| "eval_loss": 1.3663699626922607, | |
| "eval_masked_accuracy": 0.6974790096282959, | |
| "eval_runtime": 1.7338, | |
| "eval_samples_per_second": 5.768, | |
| "eval_steps_per_second": 2.307, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.009483191253308626, | |
| "grad_norm": 4.417982578277588, | |
| "learning_rate": 4.9526176125520526e-05, | |
| "loss": 1.5658, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.009483191253308626, | |
| "eval_loss": 1.6572059392929077, | |
| "eval_masked_accuracy": 0.6520000100135803, | |
| "eval_runtime": 1.7583, | |
| "eval_samples_per_second": 5.687, | |
| "eval_steps_per_second": 2.275, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.009567113299798084, | |
| "grad_norm": 5.2137861251831055, | |
| "learning_rate": 4.9521980023196055e-05, | |
| "loss": 1.5929, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.009567113299798084, | |
| "eval_loss": 1.4574190378189087, | |
| "eval_masked_accuracy": 0.6694560647010803, | |
| "eval_runtime": 1.7352, | |
| "eval_samples_per_second": 5.763, | |
| "eval_steps_per_second": 2.305, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.00965103534628754, | |
| "grad_norm": 6.848864555358887, | |
| "learning_rate": 4.951778392087158e-05, | |
| "loss": 1.6008, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.00965103534628754, | |
| "eval_loss": 2.133417844772339, | |
| "eval_masked_accuracy": 0.6540084481239319, | |
| "eval_runtime": 1.8568, | |
| "eval_samples_per_second": 5.386, | |
| "eval_steps_per_second": 2.154, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.009734957392776997, | |
| "grad_norm": 3.9827840328216553, | |
| "learning_rate": 4.9513587818547105e-05, | |
| "loss": 1.5811, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.009734957392776997, | |
| "eval_loss": 1.403198003768921, | |
| "eval_masked_accuracy": 0.7085201740264893, | |
| "eval_runtime": 1.749, | |
| "eval_samples_per_second": 5.717, | |
| "eval_steps_per_second": 2.287, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.009818879439266454, | |
| "grad_norm": 4.541887283325195, | |
| "learning_rate": 4.950939171622263e-05, | |
| "loss": 1.558, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.009818879439266454, | |
| "eval_loss": 1.4281632900238037, | |
| "eval_masked_accuracy": 0.7195122241973877, | |
| "eval_runtime": 1.7523, | |
| "eval_samples_per_second": 5.707, | |
| "eval_steps_per_second": 2.283, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.00990280148575591, | |
| "grad_norm": 8.121429443359375, | |
| "learning_rate": 4.950519561389816e-05, | |
| "loss": 1.5583, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.00990280148575591, | |
| "eval_loss": 1.608547568321228, | |
| "eval_masked_accuracy": 0.6582278609275818, | |
| "eval_runtime": 1.7405, | |
| "eval_samples_per_second": 5.745, | |
| "eval_steps_per_second": 2.298, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.009986723532245369, | |
| "grad_norm": 4.750977039337158, | |
| "learning_rate": 4.950099951157369e-05, | |
| "loss": 1.5378, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.009986723532245369, | |
| "eval_loss": 1.3912121057510376, | |
| "eval_masked_accuracy": 0.701298713684082, | |
| "eval_runtime": 1.7623, | |
| "eval_samples_per_second": 5.674, | |
| "eval_steps_per_second": 2.27, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.010070645578734825, | |
| "grad_norm": 4.445640563964844, | |
| "learning_rate": 4.949680340924922e-05, | |
| "loss": 1.5063, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.010070645578734825, | |
| "eval_loss": 1.6513465642929077, | |
| "eval_masked_accuracy": 0.6796537041664124, | |
| "eval_runtime": 1.7424, | |
| "eval_samples_per_second": 5.739, | |
| "eval_steps_per_second": 2.296, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.010154567625224282, | |
| "grad_norm": 13.394184112548828, | |
| "learning_rate": 4.949260730692475e-05, | |
| "loss": 1.5155, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.010154567625224282, | |
| "eval_loss": 1.5842430591583252, | |
| "eval_masked_accuracy": 0.6853448152542114, | |
| "eval_runtime": 1.7416, | |
| "eval_samples_per_second": 5.742, | |
| "eval_steps_per_second": 2.297, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.010238489671713738, | |
| "grad_norm": 7.441386699676514, | |
| "learning_rate": 4.948841120460027e-05, | |
| "loss": 1.5009, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.010238489671713738, | |
| "eval_loss": 1.512109637260437, | |
| "eval_masked_accuracy": 0.6987447738647461, | |
| "eval_runtime": 1.7546, | |
| "eval_samples_per_second": 5.699, | |
| "eval_steps_per_second": 2.28, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.010322411718203195, | |
| "grad_norm": 6.1988749504089355, | |
| "learning_rate": 4.94842151022758e-05, | |
| "loss": 1.5567, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.010322411718203195, | |
| "eval_loss": 1.5210555791854858, | |
| "eval_masked_accuracy": 0.7109375, | |
| "eval_runtime": 1.7524, | |
| "eval_samples_per_second": 5.707, | |
| "eval_steps_per_second": 2.283, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.010406333764692651, | |
| "grad_norm": 4.782381057739258, | |
| "learning_rate": 4.9480018999951325e-05, | |
| "loss": 1.6125, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.010406333764692651, | |
| "eval_loss": 1.6434142589569092, | |
| "eval_masked_accuracy": 0.6638655662536621, | |
| "eval_runtime": 1.7489, | |
| "eval_samples_per_second": 5.718, | |
| "eval_steps_per_second": 2.287, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.01049025581118211, | |
| "grad_norm": 5.14832878112793, | |
| "learning_rate": 4.9475822897626854e-05, | |
| "loss": 1.6089, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.01049025581118211, | |
| "eval_loss": 1.239379644393921, | |
| "eval_masked_accuracy": 0.7427386045455933, | |
| "eval_runtime": 1.7532, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.282, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.010574177857671566, | |
| "grad_norm": 5.390649795532227, | |
| "learning_rate": 4.947162679530238e-05, | |
| "loss": 1.6357, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.010574177857671566, | |
| "eval_loss": 1.5129663944244385, | |
| "eval_masked_accuracy": 0.692307710647583, | |
| "eval_runtime": 1.7523, | |
| "eval_samples_per_second": 5.707, | |
| "eval_steps_per_second": 2.283, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.010658099904161023, | |
| "grad_norm": 4.3327412605285645, | |
| "learning_rate": 4.9467430692977904e-05, | |
| "loss": 1.5318, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.010658099904161023, | |
| "eval_loss": 1.7716737985610962, | |
| "eval_masked_accuracy": 0.6942148804664612, | |
| "eval_runtime": 1.7284, | |
| "eval_samples_per_second": 5.786, | |
| "eval_steps_per_second": 2.314, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.01074202195065048, | |
| "grad_norm": 5.145776271820068, | |
| "learning_rate": 4.946323459065343e-05, | |
| "loss": 1.6081, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.01074202195065048, | |
| "eval_loss": 1.6661970615386963, | |
| "eval_masked_accuracy": 0.6882591247558594, | |
| "eval_runtime": 1.7486, | |
| "eval_samples_per_second": 5.719, | |
| "eval_steps_per_second": 2.288, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.010825943997139936, | |
| "grad_norm": 5.037006855010986, | |
| "learning_rate": 4.945903848832896e-05, | |
| "loss": 1.5028, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.010825943997139936, | |
| "eval_loss": 1.4679136276245117, | |
| "eval_masked_accuracy": 0.714893639087677, | |
| "eval_runtime": 1.7514, | |
| "eval_samples_per_second": 5.71, | |
| "eval_steps_per_second": 2.284, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.010909866043629394, | |
| "grad_norm": 5.618253707885742, | |
| "learning_rate": 4.945484238600449e-05, | |
| "loss": 1.5477, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.010909866043629394, | |
| "eval_loss": 1.6666347980499268, | |
| "eval_masked_accuracy": 0.7094017267227173, | |
| "eval_runtime": 1.7486, | |
| "eval_samples_per_second": 5.719, | |
| "eval_steps_per_second": 2.288, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.01099378809011885, | |
| "grad_norm": 14.34435749053955, | |
| "learning_rate": 4.945064628368002e-05, | |
| "loss": 1.6291, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.01099378809011885, | |
| "eval_loss": 1.8381481170654297, | |
| "eval_masked_accuracy": 0.6547085046768188, | |
| "eval_runtime": 1.7548, | |
| "eval_samples_per_second": 5.699, | |
| "eval_steps_per_second": 2.279, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.011077710136608307, | |
| "grad_norm": 4.846654891967773, | |
| "learning_rate": 4.9446450181355546e-05, | |
| "loss": 1.6077, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.011077710136608307, | |
| "eval_loss": 1.5568077564239502, | |
| "eval_masked_accuracy": 0.6872428059577942, | |
| "eval_runtime": 1.7324, | |
| "eval_samples_per_second": 5.772, | |
| "eval_steps_per_second": 2.309, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.011161632183097764, | |
| "grad_norm": 5.304859161376953, | |
| "learning_rate": 4.944225407903107e-05, | |
| "loss": 1.5758, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.011161632183097764, | |
| "eval_loss": 1.3110054731369019, | |
| "eval_masked_accuracy": 0.7312775254249573, | |
| "eval_runtime": 1.7439, | |
| "eval_samples_per_second": 5.734, | |
| "eval_steps_per_second": 2.294, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.01124555422958722, | |
| "grad_norm": 6.187143802642822, | |
| "learning_rate": 4.9438057976706596e-05, | |
| "loss": 1.5817, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.01124555422958722, | |
| "eval_loss": 1.7989356517791748, | |
| "eval_masked_accuracy": 0.6666666865348816, | |
| "eval_runtime": 1.754, | |
| "eval_samples_per_second": 5.701, | |
| "eval_steps_per_second": 2.28, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.011329476276076679, | |
| "grad_norm": 5.595826148986816, | |
| "learning_rate": 4.9433861874382124e-05, | |
| "loss": 1.6367, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.011329476276076679, | |
| "eval_loss": 1.7425569295883179, | |
| "eval_masked_accuracy": 0.6583333611488342, | |
| "eval_runtime": 1.7467, | |
| "eval_samples_per_second": 5.725, | |
| "eval_steps_per_second": 2.29, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.011413398322566135, | |
| "grad_norm": 4.125125408172607, | |
| "learning_rate": 4.942966577205765e-05, | |
| "loss": 1.641, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.011413398322566135, | |
| "eval_loss": 1.728715181350708, | |
| "eval_masked_accuracy": 0.6652892827987671, | |
| "eval_runtime": 1.772, | |
| "eval_samples_per_second": 5.643, | |
| "eval_steps_per_second": 2.257, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.011497320369055592, | |
| "grad_norm": 6.3898844718933105, | |
| "learning_rate": 4.942546966973318e-05, | |
| "loss": 1.6574, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.011497320369055592, | |
| "eval_loss": 1.8261781930923462, | |
| "eval_masked_accuracy": 0.6752136945724487, | |
| "eval_runtime": 1.7446, | |
| "eval_samples_per_second": 5.732, | |
| "eval_steps_per_second": 2.293, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.011581242415545048, | |
| "grad_norm": 5.9191155433654785, | |
| "learning_rate": 4.942127356740871e-05, | |
| "loss": 1.5732, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.011581242415545048, | |
| "eval_loss": 1.2290430068969727, | |
| "eval_masked_accuracy": 0.7573221921920776, | |
| "eval_runtime": 1.7438, | |
| "eval_samples_per_second": 5.735, | |
| "eval_steps_per_second": 2.294, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.011665164462034505, | |
| "grad_norm": 5.910600185394287, | |
| "learning_rate": 4.941707746508423e-05, | |
| "loss": 1.5018, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.011665164462034505, | |
| "eval_loss": 1.3011202812194824, | |
| "eval_masked_accuracy": 0.746835470199585, | |
| "eval_runtime": 1.739, | |
| "eval_samples_per_second": 5.751, | |
| "eval_steps_per_second": 2.3, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.011749086508523963, | |
| "grad_norm": 7.273187637329102, | |
| "learning_rate": 4.941288136275976e-05, | |
| "loss": 1.6083, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.011749086508523963, | |
| "eval_loss": 1.7945482730865479, | |
| "eval_masked_accuracy": 0.6719367504119873, | |
| "eval_runtime": 1.7495, | |
| "eval_samples_per_second": 5.716, | |
| "eval_steps_per_second": 2.286, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.01183300855501342, | |
| "grad_norm": 5.980038642883301, | |
| "learning_rate": 4.940868526043529e-05, | |
| "loss": 1.7157, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.01183300855501342, | |
| "eval_loss": 1.6633656024932861, | |
| "eval_masked_accuracy": 0.6859503984451294, | |
| "eval_runtime": 1.7603, | |
| "eval_samples_per_second": 5.681, | |
| "eval_steps_per_second": 2.272, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.011916930601502876, | |
| "grad_norm": 4.222002029418945, | |
| "learning_rate": 4.9404489158110817e-05, | |
| "loss": 1.4124, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.011916930601502876, | |
| "eval_loss": 1.7207615375518799, | |
| "eval_masked_accuracy": 0.6793248653411865, | |
| "eval_runtime": 1.753, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.282, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.012000852647992333, | |
| "grad_norm": 8.79937744140625, | |
| "learning_rate": 4.9400293055786345e-05, | |
| "loss": 1.5698, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.012000852647992333, | |
| "eval_loss": 1.5078874826431274, | |
| "eval_masked_accuracy": 0.7276119589805603, | |
| "eval_runtime": 1.866, | |
| "eval_samples_per_second": 5.359, | |
| "eval_steps_per_second": 2.144, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.01208477469448179, | |
| "grad_norm": 6.331279754638672, | |
| "learning_rate": 4.939609695346187e-05, | |
| "loss": 1.5354, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.01208477469448179, | |
| "eval_loss": 1.3983685970306396, | |
| "eval_masked_accuracy": 0.7590909004211426, | |
| "eval_runtime": 1.7632, | |
| "eval_samples_per_second": 5.672, | |
| "eval_steps_per_second": 2.269, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.012168696740971246, | |
| "grad_norm": 4.12935733795166, | |
| "learning_rate": 4.9391900851137395e-05, | |
| "loss": 1.4778, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.012168696740971246, | |
| "eval_loss": 1.7603422403335571, | |
| "eval_masked_accuracy": 0.686956524848938, | |
| "eval_runtime": 1.7504, | |
| "eval_samples_per_second": 5.713, | |
| "eval_steps_per_second": 2.285, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.012252618787460704, | |
| "grad_norm": 5.025778293609619, | |
| "learning_rate": 4.9387704748812923e-05, | |
| "loss": 1.5175, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.012252618787460704, | |
| "eval_loss": 1.7313247919082642, | |
| "eval_masked_accuracy": 0.6872428059577942, | |
| "eval_runtime": 1.745, | |
| "eval_samples_per_second": 5.731, | |
| "eval_steps_per_second": 2.292, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.01233654083395016, | |
| "grad_norm": 9.704473495483398, | |
| "learning_rate": 4.938350864648845e-05, | |
| "loss": 1.4634, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.01233654083395016, | |
| "eval_loss": 1.271333932876587, | |
| "eval_masked_accuracy": 0.7397260069847107, | |
| "eval_runtime": 1.7484, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.012420462880439617, | |
| "grad_norm": 6.080599308013916, | |
| "learning_rate": 4.937931254416398e-05, | |
| "loss": 1.5937, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.012420462880439617, | |
| "eval_loss": 1.4850938320159912, | |
| "eval_masked_accuracy": 0.7280701994895935, | |
| "eval_runtime": 1.7517, | |
| "eval_samples_per_second": 5.709, | |
| "eval_steps_per_second": 2.284, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.012504384926929074, | |
| "grad_norm": 3.824946880340576, | |
| "learning_rate": 4.937511644183951e-05, | |
| "loss": 1.6026, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.012504384926929074, | |
| "eval_loss": 1.5267841815948486, | |
| "eval_masked_accuracy": 0.7058823704719543, | |
| "eval_runtime": 1.7438, | |
| "eval_samples_per_second": 5.734, | |
| "eval_steps_per_second": 2.294, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.01258830697341853, | |
| "grad_norm": 4.5395989418029785, | |
| "learning_rate": 4.937092033951503e-05, | |
| "loss": 1.4575, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.01258830697341853, | |
| "eval_loss": 1.4801056385040283, | |
| "eval_masked_accuracy": 0.680672287940979, | |
| "eval_runtime": 1.7409, | |
| "eval_samples_per_second": 5.744, | |
| "eval_steps_per_second": 2.298, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.012672229019907989, | |
| "grad_norm": 6.853204250335693, | |
| "learning_rate": 4.936672423719056e-05, | |
| "loss": 1.4224, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.012672229019907989, | |
| "eval_loss": 1.6892282962799072, | |
| "eval_masked_accuracy": 0.6551724076271057, | |
| "eval_runtime": 1.7414, | |
| "eval_samples_per_second": 5.742, | |
| "eval_steps_per_second": 2.297, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.012756151066397445, | |
| "grad_norm": 5.53077507019043, | |
| "learning_rate": 4.936252813486609e-05, | |
| "loss": 1.6706, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.012756151066397445, | |
| "eval_loss": 1.4235472679138184, | |
| "eval_masked_accuracy": 0.7426160573959351, | |
| "eval_runtime": 1.8082, | |
| "eval_samples_per_second": 5.53, | |
| "eval_steps_per_second": 2.212, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.012840073112886902, | |
| "grad_norm": 4.5907087326049805, | |
| "learning_rate": 4.9358332032541616e-05, | |
| "loss": 1.6674, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.012840073112886902, | |
| "eval_loss": 1.4942524433135986, | |
| "eval_masked_accuracy": 0.7172995805740356, | |
| "eval_runtime": 1.7449, | |
| "eval_samples_per_second": 5.731, | |
| "eval_steps_per_second": 2.292, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.012923995159376358, | |
| "grad_norm": 8.004353523254395, | |
| "learning_rate": 4.9354135930217144e-05, | |
| "loss": 1.4294, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.012923995159376358, | |
| "eval_loss": 1.7548024654388428, | |
| "eval_masked_accuracy": 0.6547619104385376, | |
| "eval_runtime": 1.7767, | |
| "eval_samples_per_second": 5.628, | |
| "eval_steps_per_second": 2.251, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.013007917205865815, | |
| "grad_norm": 6.963031768798828, | |
| "learning_rate": 4.934993982789267e-05, | |
| "loss": 1.5078, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.013007917205865815, | |
| "eval_loss": 1.4269187450408936, | |
| "eval_masked_accuracy": 0.7027027010917664, | |
| "eval_runtime": 1.7471, | |
| "eval_samples_per_second": 5.724, | |
| "eval_steps_per_second": 2.29, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.013091839252355273, | |
| "grad_norm": 6.4043288230896, | |
| "learning_rate": 4.9345743725568194e-05, | |
| "loss": 1.604, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.013091839252355273, | |
| "eval_loss": 1.4502145051956177, | |
| "eval_masked_accuracy": 0.7172995805740356, | |
| "eval_runtime": 1.748, | |
| "eval_samples_per_second": 5.721, | |
| "eval_steps_per_second": 2.288, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.01317576129884473, | |
| "grad_norm": 5.293691158294678, | |
| "learning_rate": 4.934154762324372e-05, | |
| "loss": 1.6301, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.01317576129884473, | |
| "eval_loss": 1.3547624349594116, | |
| "eval_masked_accuracy": 0.7759336233139038, | |
| "eval_runtime": 1.7437, | |
| "eval_samples_per_second": 5.735, | |
| "eval_steps_per_second": 2.294, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.013259683345334186, | |
| "grad_norm": 7.364100933074951, | |
| "learning_rate": 4.933735152091925e-05, | |
| "loss": 1.5163, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.013259683345334186, | |
| "eval_loss": 1.6089417934417725, | |
| "eval_masked_accuracy": 0.6610878705978394, | |
| "eval_runtime": 1.753, | |
| "eval_samples_per_second": 5.704, | |
| "eval_steps_per_second": 2.282, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.013343605391823643, | |
| "grad_norm": 7.704033851623535, | |
| "learning_rate": 4.933315541859478e-05, | |
| "loss": 1.6564, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.013343605391823643, | |
| "eval_loss": 1.4759953022003174, | |
| "eval_masked_accuracy": 0.6958333253860474, | |
| "eval_runtime": 1.7614, | |
| "eval_samples_per_second": 5.677, | |
| "eval_steps_per_second": 2.271, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.0134275274383131, | |
| "grad_norm": 5.562460899353027, | |
| "learning_rate": 4.932895931627031e-05, | |
| "loss": 1.5703, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.0134275274383131, | |
| "eval_loss": 1.735896348953247, | |
| "eval_masked_accuracy": 0.6875, | |
| "eval_runtime": 1.7493, | |
| "eval_samples_per_second": 5.717, | |
| "eval_steps_per_second": 2.287, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.013511449484802556, | |
| "grad_norm": 8.801225662231445, | |
| "learning_rate": 4.9324763213945836e-05, | |
| "loss": 1.5328, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.013511449484802556, | |
| "eval_loss": 1.2792503833770752, | |
| "eval_masked_accuracy": 0.7292576432228088, | |
| "eval_runtime": 1.7802, | |
| "eval_samples_per_second": 5.617, | |
| "eval_steps_per_second": 2.247, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.013595371531292014, | |
| "grad_norm": 5.510076999664307, | |
| "learning_rate": 4.932056711162136e-05, | |
| "loss": 1.5086, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.013595371531292014, | |
| "eval_loss": 1.811342477798462, | |
| "eval_masked_accuracy": 0.6508620977401733, | |
| "eval_runtime": 1.7772, | |
| "eval_samples_per_second": 5.627, | |
| "eval_steps_per_second": 2.251, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.01367929357778147, | |
| "grad_norm": 4.370019912719727, | |
| "learning_rate": 4.9316371009296886e-05, | |
| "loss": 1.5992, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.01367929357778147, | |
| "eval_loss": 1.7015224695205688, | |
| "eval_masked_accuracy": 0.6945606470108032, | |
| "eval_runtime": 1.7399, | |
| "eval_samples_per_second": 5.747, | |
| "eval_steps_per_second": 2.299, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.013763215624270927, | |
| "grad_norm": 5.960280895233154, | |
| "learning_rate": 4.9312174906972415e-05, | |
| "loss": 1.6392, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.013763215624270927, | |
| "eval_loss": 1.4644631147384644, | |
| "eval_masked_accuracy": 0.7004830837249756, | |
| "eval_runtime": 1.7493, | |
| "eval_samples_per_second": 5.717, | |
| "eval_steps_per_second": 2.287, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.013847137670760384, | |
| "grad_norm": 5.401033878326416, | |
| "learning_rate": 4.930797880464794e-05, | |
| "loss": 1.6492, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.013847137670760384, | |
| "eval_loss": 1.5244245529174805, | |
| "eval_masked_accuracy": 0.688034176826477, | |
| "eval_runtime": 1.7597, | |
| "eval_samples_per_second": 5.683, | |
| "eval_steps_per_second": 2.273, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.01393105971724984, | |
| "grad_norm": 7.356916427612305, | |
| "learning_rate": 4.930378270232347e-05, | |
| "loss": 1.5673, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.01393105971724984, | |
| "eval_loss": 1.4024368524551392, | |
| "eval_masked_accuracy": 0.7016806602478027, | |
| "eval_runtime": 1.7463, | |
| "eval_samples_per_second": 5.726, | |
| "eval_steps_per_second": 2.291, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.014014981763739299, | |
| "grad_norm": 5.370472431182861, | |
| "learning_rate": 4.929958659999899e-05, | |
| "loss": 1.5267, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.014014981763739299, | |
| "eval_loss": 1.7430174350738525, | |
| "eval_masked_accuracy": 0.6653061509132385, | |
| "eval_runtime": 1.7353, | |
| "eval_samples_per_second": 5.763, | |
| "eval_steps_per_second": 2.305, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.014098903810228755, | |
| "grad_norm": 6.4656500816345215, | |
| "learning_rate": 4.929539049767452e-05, | |
| "loss": 1.5918, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.014098903810228755, | |
| "eval_loss": 1.691054344177246, | |
| "eval_masked_accuracy": 0.6849315166473389, | |
| "eval_runtime": 1.746, | |
| "eval_samples_per_second": 5.727, | |
| "eval_steps_per_second": 2.291, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.014182825856718212, | |
| "grad_norm": 5.481358051300049, | |
| "learning_rate": 4.929119439535005e-05, | |
| "loss": 1.5156, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.014182825856718212, | |
| "eval_loss": 1.6469824314117432, | |
| "eval_masked_accuracy": 0.6516393423080444, | |
| "eval_runtime": 1.7423, | |
| "eval_samples_per_second": 5.74, | |
| "eval_steps_per_second": 2.296, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.014266747903207668, | |
| "grad_norm": 4.755044937133789, | |
| "learning_rate": 4.928699829302558e-05, | |
| "loss": 1.5223, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.014266747903207668, | |
| "eval_loss": 1.667824387550354, | |
| "eval_masked_accuracy": 0.7068965435028076, | |
| "eval_runtime": 1.7979, | |
| "eval_samples_per_second": 5.562, | |
| "eval_steps_per_second": 2.225, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.014350669949697125, | |
| "grad_norm": 6.595943450927734, | |
| "learning_rate": 4.928280219070111e-05, | |
| "loss": 1.4699, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.014350669949697125, | |
| "eval_loss": 1.2367641925811768, | |
| "eval_masked_accuracy": 0.7447698712348938, | |
| "eval_runtime": 1.7387, | |
| "eval_samples_per_second": 5.752, | |
| "eval_steps_per_second": 2.301, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.014434591996186583, | |
| "grad_norm": 3.9210710525512695, | |
| "learning_rate": 4.9278606088376635e-05, | |
| "loss": 1.5695, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.014434591996186583, | |
| "eval_loss": 1.3033006191253662, | |
| "eval_masked_accuracy": 0.693965494632721, | |
| "eval_runtime": 1.7554, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.01451851404267604, | |
| "grad_norm": 4.682461261749268, | |
| "learning_rate": 4.927440998605216e-05, | |
| "loss": 1.5371, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.01451851404267604, | |
| "eval_loss": 1.727216124534607, | |
| "eval_masked_accuracy": 0.6639004349708557, | |
| "eval_runtime": 1.7387, | |
| "eval_samples_per_second": 5.751, | |
| "eval_steps_per_second": 2.301, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.014602436089165496, | |
| "grad_norm": 4.478100776672363, | |
| "learning_rate": 4.9270213883727685e-05, | |
| "loss": 1.5679, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.014602436089165496, | |
| "eval_loss": 1.4694969654083252, | |
| "eval_masked_accuracy": 0.7364016771316528, | |
| "eval_runtime": 1.7474, | |
| "eval_samples_per_second": 5.723, | |
| "eval_steps_per_second": 2.289, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.014686358135654953, | |
| "grad_norm": 8.149710655212402, | |
| "learning_rate": 4.9266017781403214e-05, | |
| "loss": 1.4814, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.014686358135654953, | |
| "eval_loss": 1.9258610010147095, | |
| "eval_masked_accuracy": 0.6228070259094238, | |
| "eval_runtime": 1.7513, | |
| "eval_samples_per_second": 5.71, | |
| "eval_steps_per_second": 2.284, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.01477028018214441, | |
| "grad_norm": 4.727016925811768, | |
| "learning_rate": 4.926182167907874e-05, | |
| "loss": 1.609, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.01477028018214441, | |
| "eval_loss": 1.6111774444580078, | |
| "eval_masked_accuracy": 0.6590038537979126, | |
| "eval_runtime": 1.7579, | |
| "eval_samples_per_second": 5.689, | |
| "eval_steps_per_second": 2.275, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.014854202228633867, | |
| "grad_norm": 5.348945140838623, | |
| "learning_rate": 4.925762557675427e-05, | |
| "loss": 1.5557, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.014854202228633867, | |
| "eval_loss": 1.3535053730010986, | |
| "eval_masked_accuracy": 0.7245762944221497, | |
| "eval_runtime": 1.8639, | |
| "eval_samples_per_second": 5.365, | |
| "eval_steps_per_second": 2.146, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.014938124275123324, | |
| "grad_norm": 6.573589324951172, | |
| "learning_rate": 4.92534294744298e-05, | |
| "loss": 1.6389, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.014938124275123324, | |
| "eval_loss": 1.8509418964385986, | |
| "eval_masked_accuracy": 0.7085201740264893, | |
| "eval_runtime": 1.7536, | |
| "eval_samples_per_second": 5.703, | |
| "eval_steps_per_second": 2.281, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.01502204632161278, | |
| "grad_norm": 7.373574256896973, | |
| "learning_rate": 4.924923337210532e-05, | |
| "loss": 1.4773, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.01502204632161278, | |
| "eval_loss": 1.7772554159164429, | |
| "eval_masked_accuracy": 0.6640625, | |
| "eval_runtime": 1.7655, | |
| "eval_samples_per_second": 5.664, | |
| "eval_steps_per_second": 2.266, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.015105968368102237, | |
| "grad_norm": 5.861003875732422, | |
| "learning_rate": 4.924503726978085e-05, | |
| "loss": 1.3842, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.015105968368102237, | |
| "eval_loss": 1.6182334423065186, | |
| "eval_masked_accuracy": 0.7183098793029785, | |
| "eval_runtime": 1.7386, | |
| "eval_samples_per_second": 5.752, | |
| "eval_steps_per_second": 2.301, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.015189890414591694, | |
| "grad_norm": 5.086306571960449, | |
| "learning_rate": 4.924084116745638e-05, | |
| "loss": 1.6445, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.015189890414591694, | |
| "eval_loss": 1.3457679748535156, | |
| "eval_masked_accuracy": 0.752293586730957, | |
| "eval_runtime": 1.7595, | |
| "eval_samples_per_second": 5.684, | |
| "eval_steps_per_second": 2.273, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.01527381246108115, | |
| "grad_norm": 7.099021911621094, | |
| "learning_rate": 4.9236645065131906e-05, | |
| "loss": 1.5536, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.01527381246108115, | |
| "eval_loss": 1.8317623138427734, | |
| "eval_masked_accuracy": 0.6588628888130188, | |
| "eval_runtime": 1.8424, | |
| "eval_samples_per_second": 5.428, | |
| "eval_steps_per_second": 2.171, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.015357734507570608, | |
| "grad_norm": 6.620283126831055, | |
| "learning_rate": 4.9232448962807434e-05, | |
| "loss": 1.5151, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.015357734507570608, | |
| "eval_loss": 1.4230843782424927, | |
| "eval_masked_accuracy": 0.700421929359436, | |
| "eval_runtime": 1.7611, | |
| "eval_samples_per_second": 5.678, | |
| "eval_steps_per_second": 2.271, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.015441656554060065, | |
| "grad_norm": 7.231357097625732, | |
| "learning_rate": 4.922825286048296e-05, | |
| "loss": 1.6078, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.015441656554060065, | |
| "eval_loss": 1.7547998428344727, | |
| "eval_masked_accuracy": 0.6745283007621765, | |
| "eval_runtime": 1.8328, | |
| "eval_samples_per_second": 5.456, | |
| "eval_steps_per_second": 2.182, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.015525578600549522, | |
| "grad_norm": 4.755532264709473, | |
| "learning_rate": 4.9224140680204975e-05, | |
| "loss": 1.5938, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.015525578600549522, | |
| "eval_loss": 1.3346257209777832, | |
| "eval_masked_accuracy": 0.7244444489479065, | |
| "eval_runtime": 1.7553, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.015609500647038978, | |
| "grad_norm": 5.728196620941162, | |
| "learning_rate": 4.92199445778805e-05, | |
| "loss": 1.5542, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.015609500647038978, | |
| "eval_loss": 1.6833394765853882, | |
| "eval_masked_accuracy": 0.6654929518699646, | |
| "eval_runtime": 1.7516, | |
| "eval_samples_per_second": 5.709, | |
| "eval_steps_per_second": 2.284, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.015693422693528435, | |
| "grad_norm": 5.66224479675293, | |
| "learning_rate": 4.921574847555603e-05, | |
| "loss": 1.6099, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.015693422693528435, | |
| "eval_loss": 1.442452311515808, | |
| "eval_masked_accuracy": 0.6905829310417175, | |
| "eval_runtime": 1.7553, | |
| "eval_samples_per_second": 5.697, | |
| "eval_steps_per_second": 2.279, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.015777344740017893, | |
| "grad_norm": 6.560795307159424, | |
| "learning_rate": 4.921155237323155e-05, | |
| "loss": 1.4188, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.015777344740017893, | |
| "eval_loss": 1.539738416671753, | |
| "eval_masked_accuracy": 0.68359375, | |
| "eval_runtime": 1.7406, | |
| "eval_samples_per_second": 5.745, | |
| "eval_steps_per_second": 2.298, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.015861266786507348, | |
| "grad_norm": 4.9847025871276855, | |
| "learning_rate": 4.920735627090708e-05, | |
| "loss": 1.6344, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.015861266786507348, | |
| "eval_loss": 1.244769811630249, | |
| "eval_masked_accuracy": 0.7078189253807068, | |
| "eval_runtime": 1.77, | |
| "eval_samples_per_second": 5.65, | |
| "eval_steps_per_second": 2.26, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.015945188832996806, | |
| "grad_norm": 6.173788070678711, | |
| "learning_rate": 4.920316016858261e-05, | |
| "loss": 1.6249, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.015945188832996806, | |
| "eval_loss": 2.0483577251434326, | |
| "eval_masked_accuracy": 0.607594907283783, | |
| "eval_runtime": 1.7538, | |
| "eval_samples_per_second": 5.702, | |
| "eval_steps_per_second": 2.281, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.016029110879486264, | |
| "grad_norm": 4.4076828956604, | |
| "learning_rate": 4.919896406625814e-05, | |
| "loss": 1.505, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.016029110879486264, | |
| "eval_loss": 1.7403160333633423, | |
| "eval_masked_accuracy": 0.7048457860946655, | |
| "eval_runtime": 1.7491, | |
| "eval_samples_per_second": 5.717, | |
| "eval_steps_per_second": 2.287, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.01611303292597572, | |
| "grad_norm": 6.358312129974365, | |
| "learning_rate": 4.919476796393366e-05, | |
| "loss": 1.655, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.01611303292597572, | |
| "eval_loss": 1.8444688320159912, | |
| "eval_masked_accuracy": 0.6808510422706604, | |
| "eval_runtime": 1.7573, | |
| "eval_samples_per_second": 5.691, | |
| "eval_steps_per_second": 2.276, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.016196954972465177, | |
| "grad_norm": 6.645698547363281, | |
| "learning_rate": 4.919057186160919e-05, | |
| "loss": 1.5926, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.016196954972465177, | |
| "eval_loss": 1.6228317022323608, | |
| "eval_masked_accuracy": 0.65625, | |
| "eval_runtime": 1.8422, | |
| "eval_samples_per_second": 5.428, | |
| "eval_steps_per_second": 2.171, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.016280877018954632, | |
| "grad_norm": 5.672697067260742, | |
| "learning_rate": 4.918637575928472e-05, | |
| "loss": 1.4762, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.016280877018954632, | |
| "eval_loss": 1.5051512718200684, | |
| "eval_masked_accuracy": 0.6943231225013733, | |
| "eval_runtime": 1.7515, | |
| "eval_samples_per_second": 5.709, | |
| "eval_steps_per_second": 2.284, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.01636479906544409, | |
| "grad_norm": 5.369190216064453, | |
| "learning_rate": 4.9182179656960245e-05, | |
| "loss": 1.5021, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.01636479906544409, | |
| "eval_loss": 1.7301708459854126, | |
| "eval_masked_accuracy": 0.6593886613845825, | |
| "eval_runtime": 1.7374, | |
| "eval_samples_per_second": 5.756, | |
| "eval_steps_per_second": 2.302, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.01644872111193355, | |
| "grad_norm": 4.986740589141846, | |
| "learning_rate": 4.917798355463577e-05, | |
| "loss": 1.5618, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.01644872111193355, | |
| "eval_loss": 1.3315510749816895, | |
| "eval_masked_accuracy": 0.700421929359436, | |
| "eval_runtime": 1.7373, | |
| "eval_samples_per_second": 5.756, | |
| "eval_steps_per_second": 2.302, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.016532643158423004, | |
| "grad_norm": 7.441061973571777, | |
| "learning_rate": 4.9173787452311295e-05, | |
| "loss": 1.5428, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.016532643158423004, | |
| "eval_loss": 1.6381117105484009, | |
| "eval_masked_accuracy": 0.6695652008056641, | |
| "eval_runtime": 1.7386, | |
| "eval_samples_per_second": 5.752, | |
| "eval_steps_per_second": 2.301, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.016616565204912462, | |
| "grad_norm": 6.459640979766846, | |
| "learning_rate": 4.9169591349986824e-05, | |
| "loss": 1.4702, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.016616565204912462, | |
| "eval_loss": 1.537841796875, | |
| "eval_masked_accuracy": 0.6741573214530945, | |
| "eval_runtime": 1.7482, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.016700487251401917, | |
| "grad_norm": 6.058482646942139, | |
| "learning_rate": 4.916539524766235e-05, | |
| "loss": 1.5765, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.016700487251401917, | |
| "eval_loss": 1.688913345336914, | |
| "eval_masked_accuracy": 0.692307710647583, | |
| "eval_runtime": 1.7482, | |
| "eval_samples_per_second": 5.72, | |
| "eval_steps_per_second": 2.288, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.016784409297891375, | |
| "grad_norm": 4.960835933685303, | |
| "learning_rate": 4.916119914533788e-05, | |
| "loss": 1.544, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.016784409297891375, | |
| "eval_loss": 1.7901655435562134, | |
| "eval_masked_accuracy": 0.6443514823913574, | |
| "eval_runtime": 1.7882, | |
| "eval_samples_per_second": 5.592, | |
| "eval_steps_per_second": 2.237, | |
| "step": 10000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 595791, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.617791736784392e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |