{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.269938650306749,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03067484662576687,
      "grad_norm": 2.5978403091430664,
      "learning_rate": 1.6326530612244901e-07,
      "loss": 1.1372,
      "step": 5
    },
    {
      "epoch": 0.06134969325153374,
      "grad_norm": 2.5917587280273438,
      "learning_rate": 3.6734693877551025e-07,
      "loss": 1.1384,
      "step": 10
    },
    {
      "epoch": 0.09202453987730061,
      "grad_norm": 2.413616895675659,
      "learning_rate": 5.714285714285715e-07,
      "loss": 1.1299,
      "step": 15
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 2.1455910205841064,
      "learning_rate": 7.755102040816327e-07,
      "loss": 1.1148,
      "step": 20
    },
    {
      "epoch": 0.15337423312883436,
      "grad_norm": 1.4798907041549683,
      "learning_rate": 9.795918367346939e-07,
      "loss": 1.0779,
      "step": 25
    },
    {
      "epoch": 0.18404907975460122,
      "grad_norm": 1.122052550315857,
      "learning_rate": 1.1836734693877552e-06,
      "loss": 1.0483,
      "step": 30
    },
    {
      "epoch": 0.2147239263803681,
      "grad_norm": 0.9710678458213806,
      "learning_rate": 1.3877551020408165e-06,
      "loss": 1.0004,
      "step": 35
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 1.1292918920516968,
      "learning_rate": 1.5918367346938775e-06,
      "loss": 0.9636,
      "step": 40
    },
    {
      "epoch": 0.27607361963190186,
      "grad_norm": 0.7304956912994385,
      "learning_rate": 1.7959183673469388e-06,
      "loss": 0.914,
      "step": 45
    },
    {
      "epoch": 0.3067484662576687,
      "grad_norm": 0.5761268734931946,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.8871,
      "step": 50
    },
    {
      "epoch": 0.3374233128834356,
      "grad_norm": 0.5395930409431458,
      "learning_rate": 2.2040816326530616e-06,
      "loss": 0.8649,
      "step": 55
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 0.4449169635772705,
      "learning_rate": 2.4081632653061225e-06,
      "loss": 0.8375,
      "step": 60
    },
    {
      "epoch": 0.3987730061349693,
      "grad_norm": 0.3775625228881836,
      "learning_rate": 2.6122448979591842e-06,
      "loss": 0.8179,
      "step": 65
    },
    {
      "epoch": 0.4294478527607362,
      "grad_norm": 0.33234792947769165,
      "learning_rate": 2.816326530612245e-06,
      "loss": 0.8027,
      "step": 70
    },
    {
      "epoch": 0.4601226993865031,
      "grad_norm": 0.3218803405761719,
      "learning_rate": 3.0204081632653064e-06,
      "loss": 0.7834,
      "step": 75
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 0.30811169743537903,
      "learning_rate": 3.2244897959183672e-06,
      "loss": 0.7754,
      "step": 80
    },
    {
      "epoch": 0.5214723926380368,
      "grad_norm": 0.3008837401866913,
      "learning_rate": 3.428571428571429e-06,
      "loss": 0.77,
      "step": 85
    },
    {
      "epoch": 0.5521472392638037,
      "grad_norm": 0.30381834506988525,
      "learning_rate": 3.6326530612244903e-06,
      "loss": 0.7576,
      "step": 90
    },
    {
      "epoch": 0.5828220858895705,
      "grad_norm": 0.29334792494773865,
      "learning_rate": 3.836734693877551e-06,
      "loss": 0.7528,
      "step": 95
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 0.3081793785095215,
      "learning_rate": 4.040816326530612e-06,
      "loss": 0.743,
      "step": 100
    },
    {
      "epoch": 0.6441717791411042,
      "grad_norm": 0.2884787917137146,
      "learning_rate": 4.244897959183674e-06,
      "loss": 0.7357,
      "step": 105
    },
    {
      "epoch": 0.6748466257668712,
      "grad_norm": 0.31221410632133484,
      "learning_rate": 4.448979591836735e-06,
      "loss": 0.736,
      "step": 110
    },
    {
      "epoch": 0.7055214723926381,
      "grad_norm": 0.29866281151771545,
      "learning_rate": 4.653061224489796e-06,
      "loss": 0.7242,
      "step": 115
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 0.3165671229362488,
      "learning_rate": 4.857142857142858e-06,
      "loss": 0.7137,
      "step": 120
    },
    {
      "epoch": 0.7668711656441718,
      "grad_norm": 0.30676907300949097,
      "learning_rate": 5.061224489795918e-06,
      "loss": 0.7153,
      "step": 125
    },
    {
      "epoch": 0.7975460122699386,
      "grad_norm": 0.3097768723964691,
      "learning_rate": 5.26530612244898e-06,
      "loss": 0.7099,
      "step": 130
    },
    {
      "epoch": 0.8282208588957055,
      "grad_norm": 0.3433418869972229,
      "learning_rate": 5.4693877551020415e-06,
      "loss": 0.7029,
      "step": 135
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 0.3297426104545593,
      "learning_rate": 5.673469387755103e-06,
      "loss": 0.7023,
      "step": 140
    },
    {
      "epoch": 0.8895705521472392,
      "grad_norm": 0.32176053524017334,
      "learning_rate": 5.877551020408164e-06,
      "loss": 0.6954,
      "step": 145
    },
    {
      "epoch": 0.9202453987730062,
      "grad_norm": 0.35964256525039673,
      "learning_rate": 6.0816326530612245e-06,
      "loss": 0.6938,
      "step": 150
    },
    {
      "epoch": 0.950920245398773,
      "grad_norm": 0.34248167276382446,
      "learning_rate": 6.285714285714286e-06,
      "loss": 0.6882,
      "step": 155
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 0.34893444180488586,
      "learning_rate": 6.489795918367348e-06,
      "loss": 0.6905,
      "step": 160
    },
    {
      "epoch": 1.0122699386503067,
      "grad_norm": 0.3583211302757263,
      "learning_rate": 6.693877551020409e-06,
      "loss": 0.679,
      "step": 165
    },
    {
      "epoch": 1.0429447852760736,
      "grad_norm": 0.33445632457733154,
      "learning_rate": 6.8979591836734705e-06,
      "loss": 0.6711,
      "step": 170
    },
    {
      "epoch": 1.0736196319018405,
      "grad_norm": 0.3518800437450409,
      "learning_rate": 7.102040816326531e-06,
      "loss": 0.6684,
      "step": 175
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 0.37601813673973083,
      "learning_rate": 7.306122448979592e-06,
      "loss": 0.67,
      "step": 180
    },
    {
      "epoch": 1.1349693251533743,
      "grad_norm": 0.35547634959220886,
      "learning_rate": 7.5102040816326536e-06,
      "loss": 0.6614,
      "step": 185
    },
    {
      "epoch": 1.165644171779141,
      "grad_norm": 0.3293817937374115,
      "learning_rate": 7.714285714285716e-06,
      "loss": 0.6615,
      "step": 190
    },
    {
      "epoch": 1.196319018404908,
      "grad_norm": 0.39242789149284363,
      "learning_rate": 7.918367346938776e-06,
      "loss": 0.6563,
      "step": 195
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 0.3492467701435089,
      "learning_rate": 8.122448979591837e-06,
      "loss": 0.6559,
      "step": 200
    },
    {
      "epoch": 1.2576687116564418,
      "grad_norm": 0.38162752985954285,
      "learning_rate": 8.326530612244899e-06,
      "loss": 0.6534,
      "step": 205
    },
    {
      "epoch": 1.2883435582822087,
      "grad_norm": 0.41921180486679077,
      "learning_rate": 8.530612244897961e-06,
      "loss": 0.6537,
      "step": 210
    },
    {
      "epoch": 1.3190184049079754,
      "grad_norm": 0.35677486658096313,
      "learning_rate": 8.734693877551021e-06,
      "loss": 0.6468,
      "step": 215
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 0.4290676414966583,
      "learning_rate": 8.938775510204082e-06,
      "loss": 0.6457,
      "step": 220
    },
    {
      "epoch": 1.3803680981595092,
      "grad_norm": 0.33468857407569885,
      "learning_rate": 9.142857142857144e-06,
      "loss": 0.646,
      "step": 225
    },
    {
      "epoch": 1.4110429447852761,
      "grad_norm": 0.37300559878349304,
      "learning_rate": 9.346938775510204e-06,
      "loss": 0.6436,
      "step": 230
    },
    {
      "epoch": 1.441717791411043,
      "grad_norm": 0.37263476848602295,
      "learning_rate": 9.551020408163266e-06,
      "loss": 0.6364,
      "step": 235
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 0.4007490873336792,
      "learning_rate": 9.755102040816327e-06,
      "loss": 0.6394,
      "step": 240
    },
    {
      "epoch": 1.5030674846625767,
      "grad_norm": 0.3758663833141327,
      "learning_rate": 9.959183673469387e-06,
      "loss": 0.6392,
      "step": 245
    },
    {
      "epoch": 1.5337423312883436,
      "grad_norm": 0.39793241024017334,
      "learning_rate": 9.999918433243253e-06,
      "loss": 0.6389,
      "step": 250
    },
    {
      "epoch": 1.5644171779141103,
      "grad_norm": 0.4773501455783844,
      "learning_rate": 9.999587072854989e-06,
      "loss": 0.6364,
      "step": 255
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 0.43220365047454834,
      "learning_rate": 9.99900083779239e-06,
      "loss": 0.6321,
      "step": 260
    },
    {
      "epoch": 1.6257668711656441,
      "grad_norm": 0.4072875678539276,
      "learning_rate": 9.998159757941219e-06,
      "loss": 0.6328,
      "step": 265
    },
    {
      "epoch": 1.656441717791411,
      "grad_norm": 0.3672327995300293,
      "learning_rate": 9.997063876179007e-06,
      "loss": 0.6314,
      "step": 270
    },
    {
      "epoch": 1.687116564417178,
      "grad_norm": 0.44787004590034485,
      "learning_rate": 9.99571324837287e-06,
      "loss": 0.6265,
      "step": 275
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 0.4095243513584137,
      "learning_rate": 9.994107943376654e-06,
      "loss": 0.6241,
      "step": 280
    },
    {
      "epoch": 1.7484662576687118,
      "grad_norm": 0.48604634404182434,
      "learning_rate": 9.992248043027441e-06,
      "loss": 0.6219,
      "step": 285
    },
    {
      "epoch": 1.7791411042944785,
      "grad_norm": 0.46355390548706055,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.6224,
      "step": 290
    },
    {
      "epoch": 1.8098159509202454,
      "grad_norm": 0.36252495646476746,
      "learning_rate": 9.987764848508756e-06,
      "loss": 0.6295,
      "step": 295
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 0.39227545261383057,
      "learning_rate": 9.985141782888705e-06,
      "loss": 0.6188,
      "step": 300
    },
    {
      "epoch": 1.871165644171779,
      "grad_norm": 0.4363463222980499,
      "learning_rate": 9.982264579002853e-06,
      "loss": 0.6156,
      "step": 305
    },
    {
      "epoch": 1.9018404907975461,
      "grad_norm": 0.36619096994400024,
      "learning_rate": 9.979133383528591e-06,
      "loss": 0.6186,
      "step": 310
    },
    {
      "epoch": 1.9325153374233128,
      "grad_norm": 0.38282138109207153,
      "learning_rate": 9.975748356091589e-06,
      "loss": 0.6179,
      "step": 315
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 0.377532958984375,
      "learning_rate": 9.972109669257645e-06,
      "loss": 0.6103,
      "step": 320
    },
    {
      "epoch": 1.9938650306748467,
      "grad_norm": 0.3768117129802704,
      "learning_rate": 9.968217508523913e-06,
      "loss": 0.6126,
      "step": 325
    },
    {
      "epoch": 2.0245398773006134,
      "grad_norm": 0.3956761360168457,
      "learning_rate": 9.964072072309412e-06,
      "loss": 0.598,
      "step": 330
    },
    {
      "epoch": 2.0552147239263805,
      "grad_norm": 0.4610372483730316,
      "learning_rate": 9.959673571944939e-06,
      "loss": 0.5874,
      "step": 335
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 0.4218359589576721,
      "learning_rate": 9.955022231662282e-06,
      "loss": 0.5883,
      "step": 340
    },
    {
      "epoch": 2.116564417177914,
      "grad_norm": 0.3745018243789673,
      "learning_rate": 9.95011828858279e-06,
      "loss": 0.5911,
      "step": 345
    },
    {
      "epoch": 2.147239263803681,
      "grad_norm": 0.3876558840274811,
      "learning_rate": 9.944961992705288e-06,
      "loss": 0.5871,
      "step": 350
    },
    {
      "epoch": 2.1779141104294477,
      "grad_norm": 0.43544134497642517,
      "learning_rate": 9.939553606893334e-06,
      "loss": 0.5923,
      "step": 355
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 0.42091232538223267,
      "learning_rate": 9.933893406861808e-06,
      "loss": 0.5869,
      "step": 360
    },
    {
      "epoch": 2.2392638036809815,
      "grad_norm": 0.4135364890098572,
      "learning_rate": 9.927981681162873e-06,
      "loss": 0.5893,
      "step": 365
    },
    {
      "epoch": 2.2699386503067487,
      "grad_norm": 0.36126476526260376,
      "learning_rate": 9.921818731171249e-06,
      "loss": 0.5884,
      "step": 370
    },
    {
      "epoch": 2.3006134969325154,
      "grad_norm": 0.36540037393569946,
      "learning_rate": 9.915404871068855e-06,
      "loss": 0.5851,
      "step": 375
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 0.40658003091812134,
      "learning_rate": 9.9087404278288e-06,
      "loss": 0.5864,
      "step": 380
    },
    {
      "epoch": 2.361963190184049,
      "grad_norm": 0.3715730607509613,
      "learning_rate": 9.901825741198697e-06,
      "loss": 0.5869,
      "step": 385
    },
    {
      "epoch": 2.392638036809816,
      "grad_norm": 0.414098858833313,
      "learning_rate": 9.894661163683361e-06,
      "loss": 0.5865,
      "step": 390
    },
    {
      "epoch": 2.4233128834355826,
      "grad_norm": 0.40678825974464417,
      "learning_rate": 9.887247060526827e-06,
      "loss": 0.5866,
      "step": 395
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 0.37045034766197205,
      "learning_rate": 9.879583809693737e-06,
      "loss": 0.5765,
      "step": 400
    },
    {
      "epoch": 2.4846625766871164,
      "grad_norm": 0.36025142669677734,
      "learning_rate": 9.871671801850065e-06,
      "loss": 0.5828,
      "step": 405
    },
    {
      "epoch": 2.5153374233128836,
      "grad_norm": 0.36954501271247864,
      "learning_rate": 9.863511440343206e-06,
      "loss": 0.5799,
      "step": 410
    },
    {
      "epoch": 2.5460122699386503,
      "grad_norm": 0.3953179121017456,
      "learning_rate": 9.855103141181412e-06,
      "loss": 0.5814,
      "step": 415
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 0.39428064227104187,
      "learning_rate": 9.846447333012587e-06,
      "loss": 0.5797,
      "step": 420
    },
    {
      "epoch": 2.607361963190184,
      "grad_norm": 0.3589676022529602,
      "learning_rate": 9.837544457102428e-06,
      "loss": 0.5843,
      "step": 425
    },
    {
      "epoch": 2.638036809815951,
      "grad_norm": 0.3701938986778259,
      "learning_rate": 9.82839496731194e-06,
      "loss": 0.5775,
      "step": 430
    },
    {
      "epoch": 2.668711656441718,
      "grad_norm": 0.4202510118484497,
      "learning_rate": 9.818999330074288e-06,
      "loss": 0.5833,
      "step": 435
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 0.39413678646087646,
      "learning_rate": 9.809358024371025e-06,
      "loss": 0.5748,
      "step": 440
    },
    {
      "epoch": 2.7300613496932513,
      "grad_norm": 0.3659398555755615,
      "learning_rate": 9.799471541707672e-06,
      "loss": 0.5769,
      "step": 445
    },
    {
      "epoch": 2.7607361963190185,
      "grad_norm": 0.3878413140773773,
      "learning_rate": 9.789340386088663e-06,
      "loss": 0.5786,
      "step": 450
    },
    {
      "epoch": 2.791411042944785,
      "grad_norm": 0.38252297043800354,
      "learning_rate": 9.778965073991652e-06,
      "loss": 0.5767,
      "step": 455
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 0.36087849736213684,
      "learning_rate": 9.768346134341174e-06,
      "loss": 0.578,
      "step": 460
    },
    {
      "epoch": 2.852760736196319,
      "grad_norm": 0.376136839389801,
      "learning_rate": 9.757484108481695e-06,
      "loss": 0.574,
      "step": 465
    },
    {
      "epoch": 2.883435582822086,
      "grad_norm": 0.4361047148704529,
      "learning_rate": 9.74637955015001e-06,
      "loss": 0.5728,
      "step": 470
    },
    {
      "epoch": 2.914110429447853,
      "grad_norm": 0.39456483721733093,
      "learning_rate": 9.735033025447e-06,
      "loss": 0.5757,
      "step": 475
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 0.4185245633125305,
      "learning_rate": 9.723445112808802e-06,
      "loss": 0.5727,
      "step": 480
    },
    {
      "epoch": 2.9754601226993866,
      "grad_norm": 0.39723867177963257,
      "learning_rate": 9.71161640297729e-06,
      "loss": 0.5687,
      "step": 485
    },
    {
      "epoch": 3.0061349693251533,
      "grad_norm": 0.416050523519516,
      "learning_rate": 9.699547498969978e-06,
      "loss": 0.5705,
      "step": 490
    },
    {
      "epoch": 3.03680981595092,
      "grad_norm": 0.423635333776474,
      "learning_rate": 9.687239016049275e-06,
      "loss": 0.5491,
      "step": 495
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 0.3904474675655365,
      "learning_rate": 9.674691581691114e-06,
      "loss": 0.5454,
      "step": 500
    },
    {
      "epoch": 3.098159509202454,
      "grad_norm": 0.4019912779331207,
      "learning_rate": 9.661905835552974e-06,
      "loss": 0.5467,
      "step": 505
    },
    {
      "epoch": 3.128834355828221,
      "grad_norm": 0.41804239153862,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.5475,
      "step": 510
    },
    {
      "epoch": 3.1595092024539877,
      "grad_norm": 0.4303177297115326,
      "learning_rate": 9.635622027278076e-06,
      "loss": 0.5455,
      "step": 515
    },
    {
      "epoch": 3.190184049079755,
      "grad_norm": 0.4063819348812103,
      "learning_rate": 9.622125305067394e-06,
      "loss": 0.5473,
      "step": 520
    },
    {
      "epoch": 3.2208588957055215,
      "grad_norm": 0.43505850434303284,
      "learning_rate": 9.608392950860568e-06,
      "loss": 0.5436,
      "step": 525
    },
    {
      "epoch": 3.2515337423312882,
      "grad_norm": 0.3967374563217163,
      "learning_rate": 9.594425664721275e-06,
      "loss": 0.5445,
      "step": 530
    },
    {
      "epoch": 3.2822085889570554,
      "grad_norm": 0.47021791338920593,
      "learning_rate": 9.580224158689821e-06,
      "loss": 0.5504,
      "step": 535
    },
    {
      "epoch": 3.312883435582822,
      "grad_norm": 0.3992370069026947,
      "learning_rate": 9.565789156746843e-06,
      "loss": 0.547,
      "step": 540
    },
    {
      "epoch": 3.3435582822085887,
      "grad_norm": 0.39715853333473206,
      "learning_rate": 9.551121394776395e-06,
      "loss": 0.5412,
      "step": 545
    },
    {
      "epoch": 3.374233128834356,
      "grad_norm": 0.42063596844673157,
      "learning_rate": 9.536221620528442e-06,
      "loss": 0.5488,
      "step": 550
    },
    {
      "epoch": 3.4049079754601226,
      "grad_norm": 0.42583799362182617,
      "learning_rate": 9.521090593580737e-06,
      "loss": 0.5445,
      "step": 555
    },
    {
      "epoch": 3.4355828220858897,
      "grad_norm": 0.43498581647872925,
      "learning_rate": 9.505729085300098e-06,
      "loss": 0.5463,
      "step": 560
    },
    {
      "epoch": 3.4662576687116564,
      "grad_norm": 0.4072875380516052,
      "learning_rate": 9.490137878803078e-06,
      "loss": 0.55,
      "step": 565
    },
    {
      "epoch": 3.4969325153374236,
      "grad_norm": 0.3940719664096832,
      "learning_rate": 9.47431776891606e-06,
      "loss": 0.5473,
      "step": 570
    },
    {
      "epoch": 3.5276073619631902,
      "grad_norm": 0.4124390184879303,
      "learning_rate": 9.458269562134717e-06,
      "loss": 0.5386,
      "step": 575
    },
    {
      "epoch": 3.558282208588957,
      "grad_norm": 0.3916858732700348,
      "learning_rate": 9.441994076582907e-06,
      "loss": 0.5448,
      "step": 580
    },
    {
      "epoch": 3.588957055214724,
      "grad_norm": 0.3731173276901245,
      "learning_rate": 9.425492141970973e-06,
      "loss": 0.5464,
      "step": 585
    },
    {
      "epoch": 3.6196319018404908,
      "grad_norm": 0.3597932457923889,
      "learning_rate": 9.408764599553429e-06,
      "loss": 0.5479,
      "step": 590
    },
    {
      "epoch": 3.6503067484662575,
      "grad_norm": 0.3645245432853699,
      "learning_rate": 9.391812302086088e-06,
      "loss": 0.5439,
      "step": 595
    },
    {
      "epoch": 3.6809815950920246,
      "grad_norm": 0.3898046016693115,
      "learning_rate": 9.374636113782576e-06,
      "loss": 0.5476,
      "step": 600
    },
    {
      "epoch": 3.7116564417177913,
      "grad_norm": 0.43808862566947937,
      "learning_rate": 9.357236910270292e-06,
      "loss": 0.5449,
      "step": 605
    },
    {
      "epoch": 3.7423312883435584,
      "grad_norm": 0.4190289378166199,
      "learning_rate": 9.339615578545753e-06,
      "loss": 0.5437,
      "step": 610
    },
    {
      "epoch": 3.773006134969325,
      "grad_norm": 0.4174676239490509,
      "learning_rate": 9.321773016929382e-06,
      "loss": 0.5397,
      "step": 615
    },
    {
      "epoch": 3.8036809815950923,
      "grad_norm": 0.43420159816741943,
      "learning_rate": 9.30371013501972e-06,
      "loss": 0.5424,
      "step": 620
    },
    {
      "epoch": 3.834355828220859,
      "grad_norm": 0.3853442668914795,
      "learning_rate": 9.285427853647038e-06,
      "loss": 0.5445,
      "step": 625
    },
    {
      "epoch": 3.8650306748466257,
      "grad_norm": 0.3969941735267639,
      "learning_rate": 9.26692710482641e-06,
      "loss": 0.5443,
      "step": 630
    },
    {
      "epoch": 3.895705521472393,
      "grad_norm": 0.3643360137939453,
      "learning_rate": 9.248208831710195e-06,
      "loss": 0.5439,
      "step": 635
    },
    {
      "epoch": 3.9263803680981595,
      "grad_norm": 0.4028114676475525,
      "learning_rate": 9.229273988539951e-06,
      "loss": 0.5413,
      "step": 640
    },
    {
      "epoch": 3.957055214723926,
      "grad_norm": 0.37028396129608154,
      "learning_rate": 9.210123540597792e-06,
      "loss": 0.5454,
      "step": 645
    },
    {
      "epoch": 3.9877300613496933,
      "grad_norm": 0.37116727232933044,
      "learning_rate": 9.190758464157184e-06,
      "loss": 0.5383,
      "step": 650
    },
    {
      "epoch": 4.0184049079754605,
      "grad_norm": 0.5018491744995117,
      "learning_rate": 9.171179746433164e-06,
      "loss": 0.5207,
      "step": 655
    },
    {
      "epoch": 4.049079754601227,
      "grad_norm": 0.4621087610721588,
      "learning_rate": 9.151388385532022e-06,
      "loss": 0.5123,
      "step": 660
    },
    {
      "epoch": 4.079754601226994,
      "grad_norm": 0.42231205105781555,
      "learning_rate": 9.131385390400417e-06,
      "loss": 0.5174,
      "step": 665
    },
    {
      "epoch": 4.110429447852761,
      "grad_norm": 0.3881925046443939,
      "learning_rate": 9.111171780773938e-06,
      "loss": 0.5137,
      "step": 670
    },
    {
      "epoch": 4.141104294478527,
      "grad_norm": 0.4086911380290985,
      "learning_rate": 9.090748587125118e-06,
      "loss": 0.516,
      "step": 675
    },
    {
      "epoch": 4.171779141104294,
      "grad_norm": 0.39884117245674133,
      "learning_rate": 9.070116850610911e-06,
      "loss": 0.5126,
      "step": 680
    },
    {
      "epoch": 4.2024539877300615,
      "grad_norm": 0.44755688309669495,
      "learning_rate": 9.049277623019603e-06,
      "loss": 0.5165,
      "step": 685
    },
    {
      "epoch": 4.233128834355828,
      "grad_norm": 0.39085808396339417,
      "learning_rate": 9.0282319667172e-06,
      "loss": 0.5149,
      "step": 690
    },
    {
      "epoch": 4.263803680981595,
      "grad_norm": 0.43618932366371155,
      "learning_rate": 9.006980954593262e-06,
      "loss": 0.5121,
      "step": 695
    },
    {
      "epoch": 4.294478527607362,
      "grad_norm": 0.3801281452178955,
      "learning_rate": 8.985525670006225e-06,
      "loss": 0.5152,
      "step": 700
    },
    {
      "epoch": 4.325153374233129,
      "grad_norm": 0.4198702871799469,
      "learning_rate": 8.963867206728147e-06,
      "loss": 0.5182,
      "step": 705
    },
    {
      "epoch": 4.355828220858895,
      "grad_norm": 0.39775219559669495,
      "learning_rate": 8.942006668888972e-06,
      "loss": 0.5196,
      "step": 710
    },
    {
      "epoch": 4.386503067484663,
      "grad_norm": 0.39523759484291077,
      "learning_rate": 8.919945170920224e-06,
      "loss": 0.5174,
      "step": 715
    },
    {
      "epoch": 4.41717791411043,
      "grad_norm": 0.39512258768081665,
      "learning_rate": 8.89768383749821e-06,
      "loss": 0.5165,
      "step": 720
    },
    {
      "epoch": 4.447852760736196,
      "grad_norm": 0.4098737835884094,
      "learning_rate": 8.875223803486674e-06,
      "loss": 0.5181,
      "step": 725
    },
    {
      "epoch": 4.478527607361963,
      "grad_norm": 0.39851200580596924,
      "learning_rate": 8.852566213878947e-06,
      "loss": 0.5145,
      "step": 730
    },
    {
      "epoch": 4.50920245398773,
      "grad_norm": 0.41544604301452637,
      "learning_rate": 8.829712223739574e-06,
      "loss": 0.5121,
      "step": 735
    },
    {
      "epoch": 4.539877300613497,
      "grad_norm": 0.42748644948005676,
      "learning_rate": 8.80666299814543e-06,
      "loss": 0.5148,
      "step": 740
    },
    {
      "epoch": 4.570552147239264,
      "grad_norm": 0.42468902468681335,
      "learning_rate": 8.783419712126335e-06,
      "loss": 0.5164,
      "step": 745
    },
    {
      "epoch": 4.601226993865031,
      "grad_norm": 0.41848933696746826,
      "learning_rate": 8.759983550605132e-06,
      "loss": 0.511,
      "step": 750
    },
    {
      "epoch": 4.631901840490798,
      "grad_norm": 0.3945312201976776,
      "learning_rate": 8.736355708337298e-06,
      "loss": 0.5143,
      "step": 755
    },
    {
      "epoch": 4.662576687116564,
      "grad_norm": 0.3941504657268524,
      "learning_rate": 8.71253738985003e-06,
      "loss": 0.5131,
      "step": 760
    },
    {
      "epoch": 4.693251533742331,
      "grad_norm": 0.4362662732601166,
      "learning_rate": 8.688529809380843e-06,
      "loss": 0.517,
      "step": 765
    },
    {
      "epoch": 4.723926380368098,
      "grad_norm": 0.4152030646800995,
      "learning_rate": 8.66433419081566e-06,
      "loss": 0.5199,
      "step": 770
    },
    {
      "epoch": 4.754601226993865,
      "grad_norm": 0.40423014760017395,
      "learning_rate": 8.639951767626429e-06,
      "loss": 0.5178,
      "step": 775
    },
    {
      "epoch": 4.785276073619632,
      "grad_norm": 0.393555223941803,
      "learning_rate": 8.615383782808238e-06,
      "loss": 0.5141,
      "step": 780
    },
    {
      "epoch": 4.815950920245399,
      "grad_norm": 0.3810482323169708,
      "learning_rate": 8.590631488815945e-06,
      "loss": 0.5155,
      "step": 785
    },
    {
      "epoch": 4.846625766871165,
      "grad_norm": 0.3922368884086609,
      "learning_rate": 8.565696147500338e-06,
      "loss": 0.5115,
      "step": 790
    },
    {
      "epoch": 4.877300613496932,
      "grad_norm": 0.36917635798454285,
      "learning_rate": 8.540579030043795e-06,
      "loss": 0.5185,
      "step": 795
    },
    {
      "epoch": 4.9079754601226995,
      "grad_norm": 0.37421196699142456,
      "learning_rate": 8.515281416895489e-06,
      "loss": 0.5143,
      "step": 800
    },
    {
      "epoch": 4.938650306748467,
      "grad_norm": 0.37189459800720215,
      "learning_rate": 8.48980459770611e-06,
      "loss": 0.5145,
      "step": 805
    },
    {
      "epoch": 4.969325153374233,
      "grad_norm": 0.3715671896934509,
      "learning_rate": 8.464149871262118e-06,
      "loss": 0.5116,
      "step": 810
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.4017976224422455,
      "learning_rate": 8.43831854541953e-06,
      "loss": 0.5113,
      "step": 815
    },
    {
      "epoch": 5.030674846625767,
      "grad_norm": 0.43677598237991333,
      "learning_rate": 8.412311937037255e-06,
      "loss": 0.484,
      "step": 820
    },
    {
      "epoch": 5.061349693251533,
      "grad_norm": 0.4145989418029785,
      "learning_rate": 8.386131371909948e-06,
      "loss": 0.4879,
      "step": 825
    },
    {
      "epoch": 5.0920245398773,
      "grad_norm": 0.4305966794490814,
      "learning_rate": 8.35977818470044e-06,
      "loss": 0.4855,
      "step": 830
    },
    {
      "epoch": 5.122699386503068,
      "grad_norm": 0.40984871983528137,
      "learning_rate": 8.33325371887168e-06,
      "loss": 0.4853,
      "step": 835
    },
    {
      "epoch": 5.153374233128835,
      "grad_norm": 0.4154280722141266,
      "learning_rate": 8.30655932661826e-06,
      "loss": 0.4889,
      "step": 840
    },
    {
      "epoch": 5.184049079754601,
      "grad_norm": 0.403626948595047,
      "learning_rate": 8.279696368797471e-06,
      "loss": 0.4946,
      "step": 845
    },
    {
      "epoch": 5.214723926380368,
      "grad_norm": 0.45823124051094055,
      "learning_rate": 8.252666214859936e-06,
      "loss": 0.4855,
      "step": 850
    },
    {
      "epoch": 5.245398773006135,
      "grad_norm": 0.4164683520793915,
      "learning_rate": 8.225470242779791e-06,
      "loss": 0.4873,
      "step": 855
    },
    {
      "epoch": 5.276073619631902,
      "grad_norm": 0.42235732078552246,
      "learning_rate": 8.19810983898444e-06,
      "loss": 0.492,
      "step": 860
    },
    {
      "epoch": 5.306748466257669,
      "grad_norm": 0.39570316672325134,
      "learning_rate": 8.170586398283878e-06,
      "loss": 0.4879,
      "step": 865
    },
    {
      "epoch": 5.337423312883436,
      "grad_norm": 0.4298191964626312,
      "learning_rate": 8.142901323799578e-06,
      "loss": 0.4877,
      "step": 870
    },
    {
      "epoch": 5.368098159509202,
      "grad_norm": 0.4283137321472168,
      "learning_rate": 8.115056026892965e-06,
      "loss": 0.4877,
      "step": 875
    },
    {
      "epoch": 5.398773006134969,
      "grad_norm": 0.4249846935272217,
      "learning_rate": 8.08705192709347e-06,
      "loss": 0.4899,
      "step": 880
    },
    {
      "epoch": 5.429447852760736,
      "grad_norm": 0.40995773673057556,
      "learning_rate": 8.058890452026155e-06,
      "loss": 0.488,
      "step": 885
    },
    {
      "epoch": 5.460122699386503,
      "grad_norm": 0.40157032012939453,
      "learning_rate": 8.030573037338942e-06,
      "loss": 0.4924,
      "step": 890
    },
    {
      "epoch": 5.49079754601227,
      "grad_norm": 0.40010377764701843,
      "learning_rate": 8.002101126629422e-06,
      "loss": 0.4884,
      "step": 895
    },
    {
      "epoch": 5.521472392638037,
      "grad_norm": 0.40110117197036743,
      "learning_rate": 7.973476171371255e-06,
      "loss": 0.4868,
      "step": 900
    },
    {
      "epoch": 5.552147239263804,
      "grad_norm": 0.43940532207489014,
      "learning_rate": 7.94469963084019e-06,
      "loss": 0.486,
      "step": 905
    },
    {
      "epoch": 5.58282208588957,
      "grad_norm": 0.389099657535553,
      "learning_rate": 7.91577297203966e-06,
      "loss": 0.4873,
      "step": 910
    },
    {
      "epoch": 5.613496932515337,
      "grad_norm": 0.45393890142440796,
      "learning_rate": 7.886697669625995e-06,
      "loss": 0.4935,
      "step": 915
    },
    {
      "epoch": 5.644171779141105,
      "grad_norm": 0.44767045974731445,
      "learning_rate": 7.857475205833255e-06,
      "loss": 0.4939,
      "step": 920
    },
    {
      "epoch": 5.674846625766871,
      "grad_norm": 0.37822064757347107,
      "learning_rate": 7.828107070397657e-06,
      "loss": 0.4833,
      "step": 925
    },
    {
      "epoch": 5.705521472392638,
      "grad_norm": 0.47380632162094116,
      "learning_rate": 7.798594760481639e-06,
      "loss": 0.4883,
      "step": 930
    },
    {
      "epoch": 5.736196319018405,
      "grad_norm": 0.41387900710105896,
      "learning_rate": 7.768939780597523e-06,
      "loss": 0.491,
      "step": 935
    },
    {
      "epoch": 5.766871165644172,
      "grad_norm": 0.44347015023231506,
      "learning_rate": 7.739143642530833e-06,
      "loss": 0.4913,
      "step": 940
    },
    {
      "epoch": 5.7975460122699385,
      "grad_norm": 0.43487074971199036,
      "learning_rate": 7.70920786526321e-06,
      "loss": 0.4937,
      "step": 945
    },
    {
      "epoch": 5.828220858895706,
      "grad_norm": 0.39147791266441345,
      "learning_rate": 7.679133974894984e-06,
      "loss": 0.4887,
      "step": 950
    },
    {
      "epoch": 5.858895705521473,
      "grad_norm": 0.4729338586330414,
      "learning_rate": 7.648923504567374e-06,
      "loss": 0.4871,
      "step": 955
    },
    {
      "epoch": 5.889570552147239,
      "grad_norm": 0.38986459374427795,
      "learning_rate": 7.618577994384324e-06,
      "loss": 0.4881,
      "step": 960
    },
    {
      "epoch": 5.920245398773006,
      "grad_norm": 0.421047180891037,
      "learning_rate": 7.588098991334001e-06,
      "loss": 0.4875,
      "step": 965
    },
    {
      "epoch": 5.950920245398773,
      "grad_norm": 0.4043465852737427,
      "learning_rate": 7.557488049209921e-06,
      "loss": 0.4878,
      "step": 970
    },
    {
      "epoch": 5.9815950920245395,
      "grad_norm": 0.39679425954818726,
      "learning_rate": 7.52674672853174e-06,
      "loss": 0.4873,
      "step": 975
    },
    {
      "epoch": 6.012269938650307,
      "grad_norm": 0.4268780052661896,
      "learning_rate": 7.495876596465703e-06,
      "loss": 0.4755,
      "step": 980
    },
    {
      "epoch": 6.042944785276074,
      "grad_norm": 0.49131566286087036,
      "learning_rate": 7.464879226744748e-06,
      "loss": 0.46,
      "step": 985
    },
    {
      "epoch": 6.07361963190184,
      "grad_norm": 0.422818660736084,
      "learning_rate": 7.433756199588282e-06,
      "loss": 0.4606,
      "step": 990
    },
    {
      "epoch": 6.104294478527607,
      "grad_norm": 0.4517934322357178,
      "learning_rate": 7.402509101621618e-06,
      "loss": 0.4616,
      "step": 995
    },
    {
      "epoch": 6.134969325153374,
      "grad_norm": 0.42230966687202454,
      "learning_rate": 7.371139525795094e-06,
      "loss": 0.4625,
      "step": 1000
    },
    {
      "epoch": 6.1656441717791415,
      "grad_norm": 0.4756508767604828,
      "learning_rate": 7.3396490713028674e-06,
      "loss": 0.4631,
      "step": 1005
    },
    {
      "epoch": 6.196319018404908,
      "grad_norm": 0.4194376468658447,
      "learning_rate": 7.308039343501381e-06,
      "loss": 0.4636,
      "step": 1010
    },
    {
      "epoch": 6.226993865030675,
      "grad_norm": 0.4464778006076813,
      "learning_rate": 7.276311953827533e-06,
      "loss": 0.4634,
      "step": 1015
    },
    {
      "epoch": 6.257668711656442,
      "grad_norm": 0.42353737354278564,
      "learning_rate": 7.244468519716521e-06,
      "loss": 0.4618,
      "step": 1020
    },
    {
      "epoch": 6.288343558282208,
      "grad_norm": 0.4474755525588989,
      "learning_rate": 7.212510664519391e-06,
      "loss": 0.4638,
      "step": 1025
    },
    {
      "epoch": 6.319018404907975,
      "grad_norm": 0.41456103324890137,
      "learning_rate": 7.180440017420277e-06,
      "loss": 0.4603,
      "step": 1030
    },
    {
      "epoch": 6.3496932515337425,
      "grad_norm": 0.41820865869522095,
      "learning_rate": 7.148258213353347e-06,
      "loss": 0.4625,
      "step": 1035
    },
    {
      "epoch": 6.38036809815951,
      "grad_norm": 0.42110052704811096,
      "learning_rate": 7.115966892919459e-06,
      "loss": 0.4629,
      "step": 1040
    },
    {
      "epoch": 6.411042944785276,
      "grad_norm": 0.4150598645210266,
      "learning_rate": 7.083567702302517e-06,
      "loss": 0.4604,
      "step": 1045
    },
    {
      "epoch": 6.441717791411043,
      "grad_norm": 0.40612462162971497,
      "learning_rate": 7.05106229318556e-06,
      "loss": 0.4642,
      "step": 1050
    },
    {
      "epoch": 6.47239263803681,
      "grad_norm": 0.4007202386856079,
      "learning_rate": 7.018452322666549e-06,
      "loss": 0.4692,
      "step": 1055
    },
    {
      "epoch": 6.5030674846625764,
      "grad_norm": 0.4103841483592987,
      "learning_rate": 6.985739453173903e-06,
      "loss": 0.4606,
      "step": 1060
    },
    {
      "epoch": 6.533742331288344,
      "grad_norm": 0.41603884100914,
      "learning_rate": 6.9529253523817396e-06,
      "loss": 0.4646,
      "step": 1065
    },
    {
      "epoch": 6.564417177914111,
      "grad_norm": 0.40389156341552734,
      "learning_rate": 6.9200116931248575e-06,
      "loss": 0.4662,
      "step": 1070
    },
    {
      "epoch": 6.595092024539877,
      "grad_norm": 0.4311538338661194,
      "learning_rate": 6.887000153313468e-06,
      "loss": 0.467,
      "step": 1075
    },
    {
      "epoch": 6.625766871165644,
      "grad_norm": 0.4369893968105316,
      "learning_rate": 6.853892415847645e-06,
      "loss": 0.4656,
      "step": 1080
    },
    {
      "epoch": 6.656441717791411,
      "grad_norm": 0.4239542484283447,
      "learning_rate": 6.8206901685315366e-06,
      "loss": 0.4635,
      "step": 1085
    },
    {
      "epoch": 6.6871165644171775,
      "grad_norm": 0.4784742593765259,
      "learning_rate": 6.787395103987323e-06,
      "loss": 0.4647,
      "step": 1090
    },
    {
      "epoch": 6.717791411042945,
      "grad_norm": 0.4250466227531433,
      "learning_rate": 6.754008919568927e-06,
      "loss": 0.4642,
      "step": 1095
    },
    {
      "epoch": 6.748466257668712,
      "grad_norm": 0.40565210580825806,
      "learning_rate": 6.72053331727549e-06,
      "loss": 0.4647,
      "step": 1100
    },
    {
      "epoch": 6.779141104294479,
      "grad_norm": 0.41947269439697266,
      "learning_rate": 6.686970003664588e-06,
      "loss": 0.4678,
      "step": 1105
    },
    {
      "epoch": 6.809815950920245,
      "grad_norm": 0.43468502163887024,
      "learning_rate": 6.653320689765257e-06,
      "loss": 0.466,
      "step": 1110
    },
    {
      "epoch": 6.840490797546012,
      "grad_norm": 0.4085966944694519,
      "learning_rate": 6.619587090990748e-06,
      "loss": 0.4642,
      "step": 1115
    },
    {
      "epoch": 6.871165644171779,
      "grad_norm": 0.40702173113822937,
      "learning_rate": 6.585770927051085e-06,
      "loss": 0.4645,
      "step": 1120
    },
    {
      "epoch": 6.901840490797546,
      "grad_norm": 0.4169243276119232,
      "learning_rate": 6.551873921865393e-06,
      "loss": 0.4661,
      "step": 1125
    },
    {
      "epoch": 6.932515337423313,
      "grad_norm": 0.4328126013278961,
      "learning_rate": 6.517897803474011e-06,
      "loss": 0.4679,
      "step": 1130
    },
    {
      "epoch": 6.96319018404908,
      "grad_norm": 0.40873920917510986,
      "learning_rate": 6.483844303950411e-06,
      "loss": 0.4635,
      "step": 1135
    },
    {
      "epoch": 6.993865030674847,
      "grad_norm": 0.44031643867492676,
      "learning_rate": 6.4497151593128795e-06,
      "loss": 0.4651,
      "step": 1140
    },
    {
      "epoch": 7.024539877300613,
      "grad_norm": 0.6135973334312439,
      "learning_rate": 6.415512109436031e-06,
      "loss": 0.4404,
      "step": 1145
    },
    {
      "epoch": 7.0552147239263805,
      "grad_norm": 0.5111101269721985,
      "learning_rate": 6.381236897962102e-06,
      "loss": 0.4372,
      "step": 1150
    },
    {
      "epoch": 7.085889570552148,
      "grad_norm": 0.4504176080226898,
      "learning_rate": 6.3468912722120715e-06,
      "loss": 0.4397,
      "step": 1155
    },
    {
      "epoch": 7.116564417177914,
      "grad_norm": 0.42820578813552856,
      "learning_rate": 6.312476983096573e-06,
      "loss": 0.44,
      "step": 1160
    },
    {
      "epoch": 7.147239263803681,
      "grad_norm": 0.44418537616729736,
      "learning_rate": 6.277995785026642e-06,
      "loss": 0.4374,
      "step": 1165
    },
    {
      "epoch": 7.177914110429448,
      "grad_norm": 0.43857964873313904,
      "learning_rate": 6.243449435824276e-06,
      "loss": 0.4394,
      "step": 1170
    },
    {
      "epoch": 7.208588957055214,
      "grad_norm": 0.4350876808166504,
      "learning_rate": 6.2088396966328155e-06,
      "loss": 0.4346,
      "step": 1175
    },
    {
      "epoch": 7.2392638036809815,
      "grad_norm": 0.4213852286338806,
      "learning_rate": 6.174168331827179e-06,
      "loss": 0.4381,
      "step": 1180
    },
    {
      "epoch": 7.269938650306749,
      "grad_norm": 0.4279233515262604,
      "learning_rate": 6.139437108923898e-06,
      "loss": 0.4405,
      "step": 1185
    },
    {
      "epoch": 7.300613496932515,
      "grad_norm": 0.4222026467323303,
      "learning_rate": 6.1046477984910215e-06,
      "loss": 0.4383,
      "step": 1190
    },
    {
      "epoch": 7.331288343558282,
      "grad_norm": 0.4608646631240845,
      "learning_rate": 6.069802174057849e-06,
      "loss": 0.4399,
      "step": 1195
    },
    {
      "epoch": 7.361963190184049,
      "grad_norm": 0.43802234530448914,
      "learning_rate": 6.034902012024521e-06,
      "loss": 0.4418,
      "step": 1200
    },
    {
      "epoch": 7.392638036809816,
      "grad_norm": 0.45217955112457275,
      "learning_rate": 5.999949091571462e-06,
      "loss": 0.4432,
      "step": 1205
    },
    {
      "epoch": 7.423312883435583,
      "grad_norm": 0.4765477478504181,
      "learning_rate": 5.964945194568669e-06,
      "loss": 0.4341,
      "step": 1210
    },
    {
      "epoch": 7.45398773006135,
      "grad_norm": 0.4318167269229889,
      "learning_rate": 5.9298921054848826e-06,
      "loss": 0.4355,
      "step": 1215
    },
    {
      "epoch": 7.484662576687117,
      "grad_norm": 0.4203420877456665,
      "learning_rate": 5.894791611296614e-06,
      "loss": 0.4417,
      "step": 1220
    },
    {
      "epoch": 7.515337423312883,
      "grad_norm": 0.4351012408733368,
      "learning_rate": 5.859645501397048e-06,
      "loss": 0.4461,
      "step": 1225
    },
    {
      "epoch": 7.54601226993865,
      "grad_norm": 0.4254397749900818,
      "learning_rate": 5.824455567504817e-06,
      "loss": 0.4399,
      "step": 1230
    },
    {
      "epoch": 7.576687116564417,
      "grad_norm": 0.40902575850486755,
      "learning_rate": 5.789223603572663e-06,
      "loss": 0.4407,
      "step": 1235
    },
    {
      "epoch": 7.6073619631901845,
      "grad_norm": 0.4211549162864685,
      "learning_rate": 5.753951405695981e-06,
      "loss": 0.4388,
      "step": 1240
    },
    {
      "epoch": 7.638036809815951,
      "grad_norm": 0.43186110258102417,
      "learning_rate": 5.7186407720212655e-06,
      "loss": 0.4431,
      "step": 1245
    },
    {
      "epoch": 7.668711656441718,
      "grad_norm": 0.4310693144798279,
      "learning_rate": 5.683293502654429e-06,
      "loss": 0.4419,
      "step": 1250
    },
    {
      "epoch": 7.699386503067485,
      "grad_norm": 0.4637373089790344,
      "learning_rate": 5.647911399569043e-06,
      "loss": 0.4415,
      "step": 1255
    },
    {
      "epoch": 7.730061349693251,
      "grad_norm": 0.4452053904533386,
      "learning_rate": 5.612496266514468e-06,
      "loss": 0.4443,
      "step": 1260
    },
    {
      "epoch": 7.7607361963190185,
      "grad_norm": 0.44300079345703125,
      "learning_rate": 5.577049908923912e-06,
      "loss": 0.4424,
      "step": 1265
    },
    {
      "epoch": 7.791411042944786,
      "grad_norm": 0.4136279225349426,
      "learning_rate": 5.541574133822374e-06,
      "loss": 0.4381,
      "step": 1270
    },
    {
      "epoch": 7.822085889570552,
      "grad_norm": 0.39727237820625305,
      "learning_rate": 5.506070749734539e-06,
      "loss": 0.4449,
      "step": 1275
    },
    {
      "epoch": 7.852760736196319,
      "grad_norm": 0.4188498258590698,
      "learning_rate": 5.470541566592573e-06,
      "loss": 0.4438,
      "step": 1280
    },
    {
      "epoch": 7.883435582822086,
      "grad_norm": 0.43856287002563477,
      "learning_rate": 5.434988395643852e-06,
      "loss": 0.4433,
      "step": 1285
    },
    {
      "epoch": 7.914110429447852,
      "grad_norm": 0.4030952453613281,
      "learning_rate": 5.3994130493586385e-06,
      "loss": 0.4419,
      "step": 1290
    },
    {
      "epoch": 7.9447852760736195,
      "grad_norm": 0.425555557012558,
      "learning_rate": 5.363817341337665e-06,
      "loss": 0.4424,
      "step": 1295
    },
    {
      "epoch": 7.975460122699387,
      "grad_norm": 0.4450756013393402,
      "learning_rate": 5.328203086219693e-06,
      "loss": 0.4433,
      "step": 1300
    },
    {
      "epoch": 8.006134969325153,
      "grad_norm": 0.5804970860481262,
      "learning_rate": 5.292572099588998e-06,
      "loss": 0.4375,
      "step": 1305
    },
    {
      "epoch": 8.036809815950921,
      "grad_norm": 0.5236377716064453,
      "learning_rate": 5.2569261978828155e-06,
      "loss": 0.4167,
      "step": 1310
    },
    {
      "epoch": 8.067484662576687,
      "grad_norm": 0.467786580324173,
      "learning_rate": 5.221267198298738e-06,
      "loss": 0.4159,
      "step": 1315
    },
    {
      "epoch": 8.098159509202453,
      "grad_norm": 0.4583148956298828,
      "learning_rate": 5.185596918702072e-06,
      "loss": 0.4125,
      "step": 1320
    },
    {
      "epoch": 8.128834355828221,
      "grad_norm": 0.443567156791687,
      "learning_rate": 5.1499171775331754e-06,
      "loss": 0.4165,
      "step": 1325
    },
    {
      "epoch": 8.159509202453988,
      "grad_norm": 0.47137385606765747,
      "learning_rate": 5.114229793714749e-06,
      "loss": 0.4157,
      "step": 1330
    },
    {
      "epoch": 8.190184049079754,
      "grad_norm": 0.4504995048046112,
      "learning_rate": 5.078536586559104e-06,
      "loss": 0.4155,
      "step": 1335
    },
    {
      "epoch": 8.220858895705522,
      "grad_norm": 0.4822964668273926,
      "learning_rate": 5.042839375675425e-06,
      "loss": 0.4189,
      "step": 1340
    },
    {
      "epoch": 8.251533742331288,
      "grad_norm": 0.441819429397583,
      "learning_rate": 5.0071399808770015e-06,
      "loss": 0.4165,
      "step": 1345
    },
    {
      "epoch": 8.282208588957054,
      "grad_norm": 0.46287593245506287,
      "learning_rate": 4.971440222088459e-06,
      "loss": 0.4158,
      "step": 1350
    },
    {
      "epoch": 8.312883435582823,
      "grad_norm": 0.46531328558921814,
      "learning_rate": 4.935741919252973e-06,
      "loss": 0.4213,
      "step": 1355
    },
    {
      "epoch": 8.343558282208589,
      "grad_norm": 0.45542043447494507,
      "learning_rate": 4.900046892239507e-06,
      "loss": 0.4122,
      "step": 1360
    },
    {
      "epoch": 8.374233128834355,
      "grad_norm": 0.4518256187438965,
      "learning_rate": 4.864356960750011e-06,
      "loss": 0.4179,
      "step": 1365
    },
    {
      "epoch": 8.404907975460123,
      "grad_norm": 0.4747672975063324,
      "learning_rate": 4.828673944226684e-06,
      "loss": 0.4187,
      "step": 1370
    },
    {
      "epoch": 8.43558282208589,
      "grad_norm": 0.4235823452472687,
      "learning_rate": 4.792999661759196e-06,
      "loss": 0.421,
      "step": 1375
    },
    {
      "epoch": 8.466257668711656,
      "grad_norm": 0.41661733388900757,
      "learning_rate": 4.757335931991965e-06,
      "loss": 0.4189,
      "step": 1380
    },
    {
      "epoch": 8.496932515337424,
      "grad_norm": 0.4476306140422821,
      "learning_rate": 4.721684573031447e-06,
      "loss": 0.4189,
      "step": 1385
    },
    {
      "epoch": 8.52760736196319,
      "grad_norm": 0.45303910970687866,
      "learning_rate": 4.686047402353433e-06,
      "loss": 0.4181,
      "step": 1390
    },
    {
      "epoch": 8.558282208588958,
      "grad_norm": 0.44240671396255493,
      "learning_rate": 4.650426236710421e-06,
      "loss": 0.418,
      "step": 1395
    },
    {
      "epoch": 8.588957055214724,
      "grad_norm": 0.45954430103302,
      "learning_rate": 4.614822892038974e-06,
      "loss": 0.419,
      "step": 1400
    },
    {
      "epoch": 8.61963190184049,
      "grad_norm": 0.4528447985649109,
      "learning_rate": 4.579239183367166e-06,
      "loss": 0.418,
      "step": 1405
    },
    {
      "epoch": 8.650306748466258,
      "grad_norm": 0.4467931389808655,
      "learning_rate": 4.543676924722042e-06,
      "loss": 0.4177,
      "step": 1410
    },
    {
      "epoch": 8.680981595092025,
      "grad_norm": 0.4380550682544708,
      "learning_rate": 4.508137929037138e-06,
      "loss": 0.4206,
      "step": 1415
    },
    {
      "epoch": 8.71165644171779,
      "grad_norm": 0.4435415267944336,
      "learning_rate": 4.472624008060071e-06,
      "loss": 0.42,
      "step": 1420
    },
    {
      "epoch": 8.742331288343559,
      "grad_norm": 0.4348682761192322,
      "learning_rate": 4.437136972260168e-06,
      "loss": 0.4205,
      "step": 1425
    },
    {
      "epoch": 8.773006134969325,
      "grad_norm": 0.42294904589653015,
      "learning_rate": 4.401678630736172e-06,
      "loss": 0.4211,
      "step": 1430
    },
    {
      "epoch": 8.803680981595091,
      "grad_norm": 0.42331093549728394,
      "learning_rate": 4.366250791124017e-06,
      "loss": 0.4185,
      "step": 1435
    },
    {
      "epoch": 8.83435582822086,
      "grad_norm": 0.41695448756217957,
      "learning_rate": 4.330855259504676e-06,
      "loss": 0.4215,
      "step": 1440
    },
    {
      "epoch": 8.865030674846626,
      "grad_norm": 0.4440816342830658,
      "learning_rate": 4.295493840312087e-06,
      "loss": 0.4217,
      "step": 1445
    },
    {
      "epoch": 8.895705521472392,
      "grad_norm": 0.42288738489151,
      "learning_rate": 4.260168336241169e-06,
      "loss": 0.4225,
      "step": 1450
    },
    {
      "epoch": 8.92638036809816,
      "grad_norm": 0.43750008940696716,
      "learning_rate": 4.224880548155913e-06,
      "loss": 0.4209,
      "step": 1455
    },
    {
      "epoch": 8.957055214723926,
      "grad_norm": 0.4323638081550598,
      "learning_rate": 4.1896322749975885e-06,
      "loss": 0.4284,
      "step": 1460
    },
    {
      "epoch": 8.987730061349692,
      "grad_norm": 0.42216283082962036,
      "learning_rate": 4.154425313693018e-06,
      "loss": 0.4209,
      "step": 1465
    },
    {
      "epoch": 9.01840490797546,
      "grad_norm": 0.5621673464775085,
      "learning_rate": 4.119261459062992e-06,
      "loss": 0.4052,
      "step": 1470
    },
    {
      "epoch": 9.049079754601227,
      "grad_norm": 0.48066237568855286,
      "learning_rate": 4.084142503730754e-06,
      "loss": 0.3959,
      "step": 1475
    },
    {
      "epoch": 9.079754601226995,
      "grad_norm": 0.4512442648410797,
      "learning_rate": 4.049070238030618e-06,
      "loss": 0.3949,
      "step": 1480
    },
    {
      "epoch": 9.110429447852761,
      "grad_norm": 0.4584384560585022,
      "learning_rate": 4.014046449916703e-06,
      "loss": 0.394,
      "step": 1485
    },
    {
      "epoch": 9.141104294478527,
      "grad_norm": 0.4640498161315918,
      "learning_rate": 3.979072924871784e-06,
      "loss": 0.3932,
      "step": 1490
    },
    {
      "epoch": 9.171779141104295,
      "grad_norm": 0.44619888067245483,
      "learning_rate": 3.944151445816265e-06,
      "loss": 0.3989,
      "step": 1495
    },
    {
      "epoch": 9.202453987730062,
      "grad_norm": 0.4594890773296356,
      "learning_rate": 3.909283793017289e-06,
      "loss": 0.3939,
      "step": 1500
    },
    {
      "epoch": 9.233128834355828,
      "grad_norm": 0.47108158469200134,
      "learning_rate": 3.874471743997983e-06,
      "loss": 0.3981,
      "step": 1505
    },
    {
      "epoch": 9.263803680981596,
      "grad_norm": 0.4786073565483093,
      "learning_rate": 3.839717073446842e-06,
      "loss": 0.3977,
      "step": 1510
    },
    {
      "epoch": 9.294478527607362,
      "grad_norm": 0.47187334299087524,
      "learning_rate": 3.80502155312726e-06,
      "loss": 0.395,
      "step": 1515
    },
    {
      "epoch": 9.325153374233128,
      "grad_norm": 0.46660172939300537,
      "learning_rate": 3.770386951787193e-06,
      "loss": 0.3971,
      "step": 1520
    },
    {
      "epoch": 9.355828220858896,
      "grad_norm": 0.4632403552532196,
      "learning_rate": 3.735815035069007e-06,
      "loss": 0.3947,
      "step": 1525
    },
    {
      "epoch": 9.386503067484663,
      "grad_norm": 0.47112977504730225,
      "learning_rate": 3.7013075654194586e-06,
      "loss": 0.3999,
      "step": 1530
    },
    {
      "epoch": 9.417177914110429,
      "grad_norm": 0.48859986662864685,
      "learning_rate": 3.666866301999843e-06,
      "loss": 0.3983,
      "step": 1535
    },
    {
      "epoch": 9.447852760736197,
      "grad_norm": 0.44551530480384827,
      "learning_rate": 3.6324930005963256e-06,
      "loss": 0.4012,
      "step": 1540
    },
    {
      "epoch": 9.478527607361963,
      "grad_norm": 0.4551395773887634,
      "learning_rate": 3.5981894135304207e-06,
      "loss": 0.4018,
      "step": 1545
    },
    {
      "epoch": 9.50920245398773,
      "grad_norm": 0.4616907835006714,
      "learning_rate": 3.563957289569669e-06,
      "loss": 0.3995,
      "step": 1550
    },
    {
      "epoch": 9.539877300613497,
      "grad_norm": 0.4593202769756317,
      "learning_rate": 3.5297983738384813e-06,
      "loss": 0.3998,
      "step": 1555
    },
    {
      "epoch": 9.570552147239264,
      "grad_norm": 0.44857659935951233,
      "learning_rate": 3.495714407729174e-06,
      "loss": 0.4041,
      "step": 1560
    },
    {
      "epoch": 9.60122699386503,
      "grad_norm": 0.4826293885707855,
      "learning_rate": 3.461707128813201e-06,
      "loss": 0.3994,
      "step": 1565
    },
    {
      "epoch": 9.631901840490798,
      "grad_norm": 0.4611422121524811,
      "learning_rate": 3.427778270752561e-06,
      "loss": 0.4035,
      "step": 1570
    },
    {
      "epoch": 9.662576687116564,
      "grad_norm": 0.4648556411266327,
      "learning_rate": 3.3939295632114313e-06,
      "loss": 0.4021,
      "step": 1575
    },
    {
      "epoch": 9.69325153374233,
      "grad_norm": 0.4454943835735321,
      "learning_rate": 3.3601627317679832e-06,
      "loss": 0.3982,
      "step": 1580
    },
    {
      "epoch": 9.723926380368098,
      "grad_norm": 0.45751452445983887,
      "learning_rate": 3.326479497826409e-06,
      "loss": 0.4007,
      "step": 1585
    },
    {
      "epoch": 9.754601226993865,
      "grad_norm": 0.4437613785266876,
      "learning_rate": 3.292881578529179e-06,
      "loss": 0.4038,
      "step": 1590
    },
    {
      "epoch": 9.785276073619633,
      "grad_norm": 0.44608426094055176,
      "learning_rate": 3.2593706866694934e-06,
      "loss": 0.399,
      "step": 1595
    },
    {
      "epoch": 9.815950920245399,
      "grad_norm": 0.44054606556892395,
      "learning_rate": 3.225948530603965e-06,
      "loss": 0.4004,
      "step": 1600
    },
    {
      "epoch": 9.846625766871165,
      "grad_norm": 0.44678565859794617,
      "learning_rate": 3.192616814165537e-06,
      "loss": 0.3987,
      "step": 1605
    },
    {
      "epoch": 9.877300613496933,
      "grad_norm": 0.4583883285522461,
      "learning_rate": 3.1593772365766107e-06,
      "loss": 0.4016,
      "step": 1610
    },
    {
      "epoch": 9.9079754601227,
      "grad_norm": 0.44818422198295593,
      "learning_rate": 3.126231492362435e-06,
      "loss": 0.4031,
      "step": 1615
    },
    {
      "epoch": 9.938650306748466,
      "grad_norm": 0.4891637861728668,
      "learning_rate": 3.0931812712647107e-06,
      "loss": 0.4017,
      "step": 1620
    },
    {
      "epoch": 9.969325153374234,
      "grad_norm": 0.47009608149528503,
      "learning_rate": 3.0602282581554498e-06,
      "loss": 0.3981,
      "step": 1625
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.476095050573349,
      "learning_rate": 3.0273741329510852e-06,
      "loss": 0.3981,
      "step": 1630
    },
    {
      "epoch": 10.030674846625766,
      "grad_norm": 0.5378819108009338,
      "learning_rate": 2.9946205705268337e-06,
      "loss": 0.3802,
      "step": 1635
    },
    {
      "epoch": 10.061349693251534,
      "grad_norm": 0.5001988410949707,
      "learning_rate": 2.9619692406312983e-06,
      "loss": 0.3811,
      "step": 1640
    },
    {
      "epoch": 10.0920245398773,
      "grad_norm": 0.49986687302589417,
      "learning_rate": 2.929421807801364e-06,
      "loss": 0.3819,
      "step": 1645
    },
    {
      "epoch": 10.122699386503067,
      "grad_norm": 0.47913286089897156,
      "learning_rate": 2.8969799312773263e-06,
      "loss": 0.3786,
      "step": 1650
    },
    {
      "epoch": 10.153374233128835,
      "grad_norm": 0.46896302700042725,
      "learning_rate": 2.8646452649183132e-06,
      "loss": 0.3775,
      "step": 1655
    },
    {
      "epoch": 10.184049079754601,
      "grad_norm": 0.4729619324207306,
      "learning_rate": 2.8324194571179696e-06,
      "loss": 0.3781,
      "step": 1660
    },
    {
      "epoch": 10.214723926380367,
      "grad_norm": 0.49238312244415283,
      "learning_rate": 2.800304150720424e-06,
      "loss": 0.3841,
      "step": 1665
    },
    {
      "epoch": 10.245398773006135,
      "grad_norm": 0.5394685864448547,
      "learning_rate": 2.7683009829365417e-06,
      "loss": 0.3834,
      "step": 1670
    },
    {
      "epoch": 10.276073619631902,
      "grad_norm": 0.47765809297561646,
      "learning_rate": 2.736411585260445e-06,
      "loss": 0.382,
      "step": 1675
    },
    {
      "epoch": 10.30674846625767,
      "grad_norm": 0.4734129309654236,
      "learning_rate": 2.704637583386369e-06,
      "loss": 0.3828,
      "step": 1680
    },
    {
      "epoch": 10.337423312883436,
      "grad_norm": 0.48051029443740845,
      "learning_rate": 2.672980597125764e-06,
      "loss": 0.3784,
      "step": 1685
    },
    {
      "epoch": 10.368098159509202,
      "grad_norm": 0.4490968585014343,
      "learning_rate": 2.6414422403247174e-06,
      "loss": 0.3786,
      "step": 1690
    },
    {
      "epoch": 10.39877300613497,
      "grad_norm": 0.44560185074806213,
      "learning_rate": 2.610024120781694e-06,
      "loss": 0.3839,
      "step": 1695
    },
    {
      "epoch": 10.429447852760736,
      "grad_norm": 0.4588739573955536,
      "learning_rate": 2.5787278401655714e-06,
      "loss": 0.381,
      "step": 1700
    },
| { | |
| "epoch": 10.460122699386503, | |
| "grad_norm": 0.4592171311378479, | |
| "learning_rate": 2.5475549939339716e-06, | |
| "loss": 0.3828, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 10.49079754601227, | |
| "grad_norm": 0.47413405776023865, | |
| "learning_rate": 2.5165071712519447e-06, | |
| "loss": 0.3812, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 10.521472392638037, | |
| "grad_norm": 0.4702529013156891, | |
| "learning_rate": 2.4855859549109446e-06, | |
| "loss": 0.378, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 10.552147239263803, | |
| "grad_norm": 0.44477346539497375, | |
| "learning_rate": 2.4547929212481436e-06, | |
| "loss": 0.3841, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 10.582822085889571, | |
| "grad_norm": 0.4779837131500244, | |
| "learning_rate": 2.4241296400660696e-06, | |
| "loss": 0.383, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 10.613496932515337, | |
| "grad_norm": 0.46424034237861633, | |
| "learning_rate": 2.39359767455258e-06, | |
| "loss": 0.3832, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 10.644171779141104, | |
| "grad_norm": 0.4713508188724518, | |
| "learning_rate": 2.3631985812011736e-06, | |
| "loss": 0.3843, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 10.674846625766872, | |
| "grad_norm": 0.4567889869213104, | |
| "learning_rate": 2.332933909731635e-06, | |
| "loss": 0.3798, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 10.705521472392638, | |
| "grad_norm": 0.44534802436828613, | |
| "learning_rate": 2.302805203011039e-06, | |
| "loss": 0.38, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 10.736196319018404, | |
| "grad_norm": 0.4395006000995636, | |
| "learning_rate": 2.2728139969751005e-06, | |
| "loss": 0.384, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 10.766871165644172, | |
| "grad_norm": 0.4457756280899048, | |
| "learning_rate": 2.2429618205498543e-06, | |
| "loss": 0.3833, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 10.797546012269938, | |
| "grad_norm": 0.46420347690582275, | |
| "learning_rate": 2.213250195573734e-06, | |
| "loss": 0.3822, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 10.828220858895705, | |
| "grad_norm": 0.4520081877708435, | |
| "learning_rate": 2.1836806367199763e-06, | |
| "loss": 0.3836, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 10.858895705521473, | |
| "grad_norm": 0.4488300681114197, | |
| "learning_rate": 2.1542546514194103e-06, | |
| "loss": 0.3824, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 10.889570552147239, | |
| "grad_norm": 0.4576306939125061, | |
| "learning_rate": 2.124973739783609e-06, | |
| "loss": 0.385, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 10.920245398773005, | |
| "grad_norm": 0.46807441115379333, | |
| "learning_rate": 2.0958393945284074e-06, | |
| "loss": 0.3865, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 10.950920245398773, | |
| "grad_norm": 0.4553256332874298, | |
| "learning_rate": 2.066853100897822e-06, | |
| "loss": 0.3845, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 10.98159509202454, | |
| "grad_norm": 0.43590047955513, | |
| "learning_rate": 2.0380163365883188e-06, | |
| "loss": 0.3853, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 11.012269938650308, | |
| "grad_norm": 0.4983638525009155, | |
| "learning_rate": 2.0093305716734814e-06, | |
| "loss": 0.3732, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 11.042944785276074, | |
| "grad_norm": 0.4627988934516907, | |
| "learning_rate": 1.9807972685290843e-06, | |
| "loss": 0.3674, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 11.07361963190184, | |
| "grad_norm": 0.4763619303703308, | |
| "learning_rate": 1.952417881758526e-06, | |
| "loss": 0.3691, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 11.104294478527608, | |
| "grad_norm": 0.49472418427467346, | |
| "learning_rate": 1.9241938581186766e-06, | |
| "loss": 0.3666, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 11.134969325153374, | |
| "grad_norm": 0.45994389057159424, | |
| "learning_rate": 1.8961266364461306e-06, | |
| "loss": 0.3685, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 11.16564417177914, | |
| "grad_norm": 0.4785662889480591, | |
| "learning_rate": 1.868217647583852e-06, | |
| "loss": 0.3652, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 11.196319018404909, | |
| "grad_norm": 0.4587504267692566, | |
| "learning_rate": 1.8404683143082309e-06, | |
| "loss": 0.3667, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 11.226993865030675, | |
| "grad_norm": 0.45679721236228943, | |
| "learning_rate": 1.8128800512565514e-06, | |
| "loss": 0.3665, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 11.257668711656441, | |
| "grad_norm": 0.4806920289993286, | |
| "learning_rate": 1.7854542648548773e-06, | |
| "loss": 0.3663, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 11.28834355828221, | |
| "grad_norm": 0.45936256647109985, | |
| "learning_rate": 1.7581923532463507e-06, | |
| "loss": 0.3665, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 11.319018404907975, | |
| "grad_norm": 0.4692569673061371, | |
| "learning_rate": 1.731095706219914e-06, | |
| "loss": 0.3655, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 11.349693251533742, | |
| "grad_norm": 0.48757705092430115, | |
| "learning_rate": 1.7041657051394645e-06, | |
| "loss": 0.3696, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 11.38036809815951, | |
| "grad_norm": 0.4615464508533478, | |
| "learning_rate": 1.6774037228734375e-06, | |
| "loss": 0.3689, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 11.411042944785276, | |
| "grad_norm": 0.4684435725212097, | |
| "learning_rate": 1.650811123724802e-06, | |
| "loss": 0.3677, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 11.441717791411042, | |
| "grad_norm": 0.46784135699272156, | |
| "learning_rate": 1.6243892633615272e-06, | |
| "loss": 0.37, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 11.47239263803681, | |
| "grad_norm": 0.44636908173561096, | |
| "learning_rate": 1.598139488747467e-06, | |
| "loss": 0.3653, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 11.503067484662576, | |
| "grad_norm": 0.45427629351615906, | |
| "learning_rate": 1.5720631380736839e-06, | |
| "loss": 0.3678, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 11.533742331288344, | |
| "grad_norm": 0.44010183215141296, | |
| "learning_rate": 1.5461615406902414e-06, | |
| "loss": 0.3698, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 11.56441717791411, | |
| "grad_norm": 0.4783858358860016, | |
| "learning_rate": 1.5204360170384286e-06, | |
| "loss": 0.372, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 11.595092024539877, | |
| "grad_norm": 0.4582843780517578, | |
| "learning_rate": 1.4948878785834453e-06, | |
| "loss": 0.3719, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 11.625766871165645, | |
| "grad_norm": 0.44995734095573425, | |
| "learning_rate": 1.4695184277475482e-06, | |
| "loss": 0.3704, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 11.656441717791411, | |
| "grad_norm": 0.46788886189460754, | |
| "learning_rate": 1.4443289578436459e-06, | |
| "loss": 0.3687, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 11.687116564417177, | |
| "grad_norm": 0.4666103720664978, | |
| "learning_rate": 1.4193207530093806e-06, | |
| "loss": 0.3675, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 11.717791411042946, | |
| "grad_norm": 0.4628080427646637, | |
| "learning_rate": 1.3944950881416541e-06, | |
| "loss": 0.3707, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 11.748466257668712, | |
| "grad_norm": 0.44799864292144775, | |
| "learning_rate": 1.369853228831632e-06, | |
| "loss": 0.3708, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 11.779141104294478, | |
| "grad_norm": 0.47202908992767334, | |
| "learning_rate": 1.3453964313002337e-06, | |
| "loss": 0.3701, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 11.809815950920246, | |
| "grad_norm": 0.4674662947654724, | |
| "learning_rate": 1.3211259423340882e-06, | |
| "loss": 0.3689, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 11.840490797546012, | |
| "grad_norm": 0.4523491859436035, | |
| "learning_rate": 1.2970429992219714e-06, | |
| "loss": 0.3665, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 11.871165644171779, | |
| "grad_norm": 0.4463663101196289, | |
| "learning_rate": 1.2731488296917315e-06, | |
| "loss": 0.3732, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 11.901840490797547, | |
| "grad_norm": 0.4559671878814697, | |
| "learning_rate": 1.2494446518477022e-06, | |
| "loss": 0.3683, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 11.932515337423313, | |
| "grad_norm": 0.4432070851325989, | |
| "learning_rate": 1.2259316741086052e-06, | |
| "loss": 0.3703, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 11.963190184049079, | |
| "grad_norm": 0.4581555128097534, | |
| "learning_rate": 1.2026110951459364e-06, | |
| "loss": 0.3686, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 11.993865030674847, | |
| "grad_norm": 0.4442911744117737, | |
| "learning_rate": 1.1794841038228772e-06, | |
| "loss": 0.3686, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 12.024539877300613, | |
| "grad_norm": 0.4809685945510864, | |
| "learning_rate": 1.156551879133672e-06, | |
| "loss": 0.3568, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 12.05521472392638, | |
| "grad_norm": 0.4492509067058563, | |
| "learning_rate": 1.133815590143525e-06, | |
| "loss": 0.3586, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 12.085889570552148, | |
| "grad_norm": 0.46284252405166626, | |
| "learning_rate": 1.1112763959290102e-06, | |
| "loss": 0.3601, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 12.116564417177914, | |
| "grad_norm": 0.45158612728118896, | |
| "learning_rate": 1.088935445518981e-06, | |
| "loss": 0.3574, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 12.14723926380368, | |
| "grad_norm": 0.4570388197898865, | |
| "learning_rate": 1.0667938778359838e-06, | |
| "loss": 0.3584, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 12.177914110429448, | |
| "grad_norm": 0.4470795691013336, | |
| "learning_rate": 1.0448528216382103e-06, | |
| "loss": 0.3582, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 12.208588957055214, | |
| "grad_norm": 0.4757451117038727, | |
| "learning_rate": 1.0231133954619449e-06, | |
| "loss": 0.3595, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 12.239263803680982, | |
| "grad_norm": 0.44901788234710693, | |
| "learning_rate": 1.0015767075645472e-06, | |
| "loss": 0.3573, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 12.269938650306749, | |
| "grad_norm": 0.457893431186676, | |
| "learning_rate": 9.802438558679529e-07, | |
| "loss": 0.3602, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2445, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 15, | |
| "save_steps": 400, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.747402775285413e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
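
The object above is a Hugging Face Trainer state log: each `log_history` entry records `epoch`, `grad_norm`, `learning_rate`, `loss`, and `step`, and the trailing fields describe the run (2000 of 2445 steps logged, batch size 2, logging every 5 steps). As a minimal sketch of how this data could be inspected, the snippet below loads the file and plots training loss against step; the filename `trainer_state.json` and the use of matplotlib are illustrative assumptions, not part of the original file.

```python
# Minimal sketch: read the Trainer state dump shown above and plot loss vs. step.
# Assumes the JSON is saved locally as "trainer_state.json" (hypothetical path).
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Every log_history entry carries step and loss (plus epoch, grad_norm, learning_rate).
history = state["log_history"]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title(f"Loss over {state['global_step']} steps "
          f"({state['epoch']:.2f} of {state['num_train_epochs']} epochs)")
plt.show()
```

The same pattern extends to the other logged quantities (for example, plotting `learning_rate` shows the warmup ramp at the start of the run and the decay visible in the later entries).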