{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.361963190184049,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03067484662576687,
      "grad_norm": 2.916747570037842,
      "learning_rate": 1.6326530612244901e-07,
      "loss": 1.2014,
      "step": 5
    },
    {
      "epoch": 0.06134969325153374,
      "grad_norm": 2.8873817920684814,
      "learning_rate": 3.6734693877551025e-07,
      "loss": 1.1968,
      "step": 10
    },
    {
      "epoch": 0.09202453987730061,
      "grad_norm": 2.65090012550354,
      "learning_rate": 5.714285714285715e-07,
      "loss": 1.1791,
      "step": 15
    },
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 2.3519155979156494,
      "learning_rate": 7.755102040816327e-07,
      "loss": 1.1753,
      "step": 20
    },
    {
      "epoch": 0.15337423312883436,
      "grad_norm": 1.5954474210739136,
      "learning_rate": 9.795918367346939e-07,
      "loss": 1.1336,
      "step": 25
    },
    {
      "epoch": 0.18404907975460122,
      "grad_norm": 1.1593321561813354,
      "learning_rate": 1.1836734693877552e-06,
      "loss": 1.1015,
      "step": 30
    },
    {
      "epoch": 0.2147239263803681,
      "grad_norm": 1.082676887512207,
      "learning_rate": 1.3877551020408165e-06,
      "loss": 1.054,
      "step": 35
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 1.2616770267486572,
      "learning_rate": 1.5918367346938775e-06,
      "loss": 1.0152,
      "step": 40
    },
    {
      "epoch": 0.27607361963190186,
      "grad_norm": 0.7849059700965881,
      "learning_rate": 1.7959183673469388e-06,
      "loss": 0.9693,
      "step": 45
    },
    {
      "epoch": 0.3067484662576687,
      "grad_norm": 0.5774770379066467,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.9483,
      "step": 50
    },
    {
      "epoch": 0.3374233128834356,
      "grad_norm": 0.5419191718101501,
      "learning_rate": 2.2040816326530616e-06,
      "loss": 0.928,
      "step": 55
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 0.5010412931442261,
      "learning_rate": 2.4081632653061225e-06,
      "loss": 0.8946,
      "step": 60
    },
    {
      "epoch": 0.3987730061349693,
      "grad_norm": 0.45867446064949036,
      "learning_rate": 2.6122448979591842e-06,
      "loss": 0.8818,
      "step": 65
    },
    {
      "epoch": 0.4294478527607362,
      "grad_norm": 0.41711899638175964,
      "learning_rate": 2.816326530612245e-06,
      "loss": 0.8657,
      "step": 70
    },
    {
      "epoch": 0.4601226993865031,
      "grad_norm": 0.3863469362258911,
      "learning_rate": 3.0204081632653064e-06,
      "loss": 0.8518,
      "step": 75
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 0.3639077842235565,
      "learning_rate": 3.2244897959183672e-06,
      "loss": 0.8378,
      "step": 80
    },
    {
      "epoch": 0.5214723926380368,
      "grad_norm": 0.3680429458618164,
      "learning_rate": 3.428571428571429e-06,
      "loss": 0.8328,
      "step": 85
    },
    {
      "epoch": 0.5521472392638037,
      "grad_norm": 0.3493054211139679,
      "learning_rate": 3.6326530612244903e-06,
      "loss": 0.8278,
      "step": 90
    },
    {
      "epoch": 0.5828220858895705,
      "grad_norm": 0.35972270369529724,
      "learning_rate": 3.836734693877551e-06,
      "loss": 0.8177,
      "step": 95
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 0.3597475588321686,
      "learning_rate": 4.040816326530612e-06,
      "loss": 0.8045,
      "step": 100
    },
    {
      "epoch": 0.6441717791411042,
      "grad_norm": 0.3763820230960846,
      "learning_rate": 4.244897959183674e-06,
      "loss": 0.8003,
      "step": 105
    },
    {
      "epoch": 0.6748466257668712,
      "grad_norm": 0.4080606997013092,
      "learning_rate": 4.448979591836735e-06,
      "loss": 0.7961,
      "step": 110
    },
    {
      "epoch": 0.7055214723926381,
      "grad_norm": 0.3602585196495056,
      "learning_rate": 4.653061224489796e-06,
      "loss": 0.7918,
      "step": 115
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 0.3696184456348419,
      "learning_rate": 4.857142857142858e-06,
      "loss": 0.7875,
      "step": 120
    },
    {
      "epoch": 0.7668711656441718,
      "grad_norm": 0.37600934505462646,
      "learning_rate": 5.061224489795918e-06,
      "loss": 0.7875,
      "step": 125
    },
    {
      "epoch": 0.7975460122699386,
      "grad_norm": 0.38016051054000854,
      "learning_rate": 5.26530612244898e-06,
      "loss": 0.7748,
      "step": 130
    },
    {
      "epoch": 0.8282208588957055,
      "grad_norm": 0.3912647068500519,
      "learning_rate": 5.4693877551020415e-06,
      "loss": 0.772,
      "step": 135
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 0.3811889886856079,
      "learning_rate": 5.673469387755103e-06,
      "loss": 0.7678,
      "step": 140
    },
    {
      "epoch": 0.8895705521472392,
      "grad_norm": 0.38242849707603455,
      "learning_rate": 5.877551020408164e-06,
      "loss": 0.761,
      "step": 145
    },
    {
      "epoch": 0.9202453987730062,
      "grad_norm": 0.4102751910686493,
      "learning_rate": 6.0816326530612245e-06,
      "loss": 0.763,
      "step": 150
    },
    {
      "epoch": 0.950920245398773,
      "grad_norm": 0.38672712445259094,
      "learning_rate": 6.285714285714286e-06,
      "loss": 0.7576,
      "step": 155
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 0.3906507194042206,
      "learning_rate": 6.489795918367348e-06,
      "loss": 0.751,
      "step": 160
    },
    {
      "epoch": 1.0122699386503067,
      "grad_norm": 0.4010821580886841,
      "learning_rate": 6.693877551020409e-06,
      "loss": 0.7451,
      "step": 165
    },
    {
      "epoch": 1.0429447852760736,
      "grad_norm": 0.454095721244812,
      "learning_rate": 6.8979591836734705e-06,
      "loss": 0.7373,
      "step": 170
    },
    {
      "epoch": 1.0736196319018405,
      "grad_norm": 0.44405630230903625,
      "learning_rate": 7.102040816326531e-06,
      "loss": 0.7361,
      "step": 175
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 0.42133402824401855,
      "learning_rate": 7.306122448979592e-06,
      "loss": 0.7309,
      "step": 180
    },
    {
      "epoch": 1.1349693251533743,
      "grad_norm": 0.4115142226219177,
      "learning_rate": 7.5102040816326536e-06,
      "loss": 0.7213,
      "step": 185
    },
    {
      "epoch": 1.165644171779141,
      "grad_norm": 0.48450756072998047,
      "learning_rate": 7.714285714285716e-06,
      "loss": 0.7311,
      "step": 190
    },
    {
      "epoch": 1.196319018404908,
      "grad_norm": 0.44604143500328064,
      "learning_rate": 7.918367346938776e-06,
      "loss": 0.7265,
      "step": 195
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 0.4400920271873474,
      "learning_rate": 8.122448979591837e-06,
      "loss": 0.7231,
      "step": 200
    },
    {
      "epoch": 1.2576687116564418,
      "grad_norm": 0.4529883861541748,
      "learning_rate": 8.326530612244899e-06,
      "loss": 0.7177,
      "step": 205
    },
    {
      "epoch": 1.2883435582822087,
      "grad_norm": 0.4646182954311371,
      "learning_rate": 8.530612244897961e-06,
      "loss": 0.7119,
      "step": 210
    },
    {
      "epoch": 1.3190184049079754,
      "grad_norm": 0.45873209834098816,
      "learning_rate": 8.734693877551021e-06,
      "loss": 0.7131,
      "step": 215
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 0.45401182770729065,
      "learning_rate": 8.938775510204082e-06,
      "loss": 0.7136,
      "step": 220
    },
    {
      "epoch": 1.3803680981595092,
      "grad_norm": 0.4406537711620331,
      "learning_rate": 9.142857142857144e-06,
      "loss": 0.7148,
      "step": 225
    },
    {
      "epoch": 1.4110429447852761,
      "grad_norm": 0.5117276310920715,
      "learning_rate": 9.346938775510204e-06,
      "loss": 0.7112,
      "step": 230
    },
    {
      "epoch": 1.441717791411043,
      "grad_norm": 0.4318268895149231,
      "learning_rate": 9.551020408163266e-06,
      "loss": 0.7065,
      "step": 235
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 0.4682416617870331,
      "learning_rate": 9.755102040816327e-06,
      "loss": 0.7019,
      "step": 240
    },
    {
      "epoch": 1.5030674846625767,
      "grad_norm": 0.48544424772262573,
      "learning_rate": 9.959183673469387e-06,
      "loss": 0.7084,
      "step": 245
    },
    {
      "epoch": 1.5337423312883436,
      "grad_norm": 0.45956820249557495,
      "learning_rate": 9.999918433243253e-06,
      "loss": 0.7045,
      "step": 250
    },
    {
      "epoch": 1.5644171779141103,
      "grad_norm": 0.4829379618167877,
      "learning_rate": 9.999587072854989e-06,
      "loss": 0.6992,
      "step": 255
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 0.4817286431789398,
      "learning_rate": 9.99900083779239e-06,
      "loss": 0.6949,
      "step": 260
    },
    {
      "epoch": 1.6257668711656441,
      "grad_norm": 0.41464245319366455,
      "learning_rate": 9.998159757941219e-06,
      "loss": 0.6943,
      "step": 265
    },
    {
      "epoch": 1.656441717791411,
      "grad_norm": 0.45459651947021484,
      "learning_rate": 9.997063876179007e-06,
      "loss": 0.6978,
      "step": 270
    },
    {
      "epoch": 1.687116564417178,
      "grad_norm": 0.4576292335987091,
      "learning_rate": 9.99571324837287e-06,
      "loss": 0.6944,
      "step": 275
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 0.4427788555622101,
      "learning_rate": 9.994107943376654e-06,
      "loss": 0.6914,
      "step": 280
    },
    {
      "epoch": 1.7484662576687118,
      "grad_norm": 0.4589153528213501,
      "learning_rate": 9.992248043027441e-06,
      "loss": 0.6818,
      "step": 285
    },
    {
      "epoch": 1.7791411042944785,
      "grad_norm": 0.4295940399169922,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.6902,
      "step": 290
    },
    {
      "epoch": 1.8098159509202454,
      "grad_norm": 0.4209536910057068,
      "learning_rate": 9.987764848508756e-06,
      "loss": 0.6856,
      "step": 295
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 0.4128643870353699,
      "learning_rate": 9.985141782888705e-06,
      "loss": 0.683,
      "step": 300
    },
    {
      "epoch": 1.871165644171779,
      "grad_norm": 0.44537627696990967,
      "learning_rate": 9.982264579002853e-06,
      "loss": 0.6837,
      "step": 305
    },
    {
      "epoch": 1.9018404907975461,
      "grad_norm": 0.40354472398757935,
      "learning_rate": 9.979133383528591e-06,
      "loss": 0.686,
      "step": 310
    },
    {
      "epoch": 1.9325153374233128,
      "grad_norm": 0.43714097142219543,
      "learning_rate": 9.975748356091589e-06,
      "loss": 0.6809,
      "step": 315
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 0.4338328540325165,
      "learning_rate": 9.972109669257645e-06,
      "loss": 0.6787,
      "step": 320
    },
    {
      "epoch": 1.9938650306748467,
      "grad_norm": 0.4852101802825928,
      "learning_rate": 9.968217508523913e-06,
      "loss": 0.6696,
      "step": 325
    },
    {
      "epoch": 2.0245398773006134,
      "grad_norm": 0.49633219838142395,
      "learning_rate": 9.964072072309412e-06,
      "loss": 0.652,
      "step": 330
    },
    {
      "epoch": 2.0552147239263805,
      "grad_norm": 0.48235881328582764,
      "learning_rate": 9.959673571944939e-06,
      "loss": 0.6525,
      "step": 335
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 0.47604846954345703,
      "learning_rate": 9.955022231662282e-06,
      "loss": 0.6445,
      "step": 340
    },
    {
      "epoch": 2.116564417177914,
      "grad_norm": 0.4336640238761902,
      "learning_rate": 9.95011828858279e-06,
      "loss": 0.6557,
      "step": 345
    },
    {
      "epoch": 2.147239263803681,
      "grad_norm": 0.4802437722682953,
      "learning_rate": 9.944961992705288e-06,
      "loss": 0.6467,
      "step": 350
    },
    {
      "epoch": 2.1779141104294477,
      "grad_norm": 0.476906955242157,
      "learning_rate": 9.939553606893334e-06,
      "loss": 0.6463,
      "step": 355
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 0.49822670221328735,
      "learning_rate": 9.933893406861808e-06,
      "loss": 0.6483,
      "step": 360
    },
    {
      "epoch": 2.2392638036809815,
      "grad_norm": 0.49377647042274475,
      "learning_rate": 9.927981681162873e-06,
      "loss": 0.6413,
      "step": 365
    },
    {
      "epoch": 2.2699386503067487,
      "grad_norm": 0.4360464811325073,
      "learning_rate": 9.921818731171249e-06,
      "loss": 0.6478,
      "step": 370
    },
    {
      "epoch": 2.3006134969325154,
      "grad_norm": 0.5180469155311584,
      "learning_rate": 9.915404871068855e-06,
      "loss": 0.6458,
      "step": 375
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 0.5449960231781006,
      "learning_rate": 9.9087404278288e-06,
      "loss": 0.6445,
      "step": 380
    },
    {
      "epoch": 2.361963190184049,
      "grad_norm": 0.4497530460357666,
      "learning_rate": 9.901825741198697e-06,
      "loss": 0.6403,
      "step": 385
    },
    {
      "epoch": 2.392638036809816,
      "grad_norm": 0.4317333698272705,
      "learning_rate": 9.894661163683361e-06,
      "loss": 0.6434,
      "step": 390
    },
    {
      "epoch": 2.4233128834355826,
      "grad_norm": 0.47986018657684326,
      "learning_rate": 9.887247060526827e-06,
      "loss": 0.6422,
      "step": 395
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 0.4332720637321472,
      "learning_rate": 9.879583809693737e-06,
      "loss": 0.6438,
      "step": 400
    },
    {
      "epoch": 2.4846625766871164,
      "grad_norm": 0.4776013493537903,
      "learning_rate": 9.871671801850065e-06,
      "loss": 0.6393,
      "step": 405
    },
    {
      "epoch": 2.5153374233128836,
      "grad_norm": 0.48002681136131287,
      "learning_rate": 9.863511440343206e-06,
      "loss": 0.6385,
      "step": 410
    },
    {
      "epoch": 2.5460122699386503,
      "grad_norm": 0.4508747458457947,
      "learning_rate": 9.855103141181412e-06,
      "loss": 0.6362,
      "step": 415
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 0.5055172443389893,
      "learning_rate": 9.846447333012587e-06,
      "loss": 0.6388,
      "step": 420
    },
    {
      "epoch": 2.607361963190184,
      "grad_norm": 0.4450007975101471,
      "learning_rate": 9.837544457102428e-06,
      "loss": 0.6377,
      "step": 425
    },
    {
      "epoch": 2.638036809815951,
      "grad_norm": 0.4479919970035553,
      "learning_rate": 9.82839496731194e-06,
      "loss": 0.6372,
      "step": 430
    },
    {
      "epoch": 2.668711656441718,
      "grad_norm": 0.45310866832733154,
      "learning_rate": 9.818999330074288e-06,
      "loss": 0.6336,
      "step": 435
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 0.4643765091896057,
      "learning_rate": 9.809358024371025e-06,
      "loss": 0.6363,
      "step": 440
    },
    {
      "epoch": 2.7300613496932513,
      "grad_norm": 0.48741263151168823,
      "learning_rate": 9.799471541707672e-06,
      "loss": 0.6382,
      "step": 445
    },
    {
      "epoch": 2.7607361963190185,
      "grad_norm": 0.45080405473709106,
      "learning_rate": 9.789340386088663e-06,
      "loss": 0.6357,
      "step": 450
    },
    {
      "epoch": 2.791411042944785,
      "grad_norm": 0.5053460597991943,
      "learning_rate": 9.778965073991652e-06,
      "loss": 0.6315,
      "step": 455
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 0.523303747177124,
      "learning_rate": 9.768346134341174e-06,
      "loss": 0.6336,
      "step": 460
    },
    {
      "epoch": 2.852760736196319,
      "grad_norm": 0.4880037307739258,
      "learning_rate": 9.757484108481695e-06,
      "loss": 0.63,
      "step": 465
    },
    {
      "epoch": 2.883435582822086,
      "grad_norm": 0.4551119804382324,
      "learning_rate": 9.74637955015001e-06,
      "loss": 0.6315,
      "step": 470
    },
    {
      "epoch": 2.914110429447853,
      "grad_norm": 0.529259204864502,
      "learning_rate": 9.735033025447e-06,
      "loss": 0.6346,
      "step": 475
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 0.44139307737350464,
      "learning_rate": 9.723445112808802e-06,
      "loss": 0.6284,
      "step": 480
    },
    {
      "epoch": 2.9754601226993866,
      "grad_norm": 0.44775474071502686,
      "learning_rate": 9.71161640297729e-06,
      "loss": 0.6316,
      "step": 485
    },
    {
      "epoch": 3.0061349693251533,
      "grad_norm": 0.5552355051040649,
      "learning_rate": 9.699547498969978e-06,
      "loss": 0.6217,
      "step": 490
    },
    {
      "epoch": 3.03680981595092,
      "grad_norm": 0.5540191531181335,
      "learning_rate": 9.687239016049275e-06,
      "loss": 0.5991,
      "step": 495
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 0.5161706209182739,
      "learning_rate": 9.674691581691114e-06,
      "loss": 0.5983,
      "step": 500
    },
    {
      "epoch": 3.098159509202454,
      "grad_norm": 0.564895510673523,
      "learning_rate": 9.661905835552974e-06,
      "loss": 0.599,
      "step": 505
    },
    {
      "epoch": 3.128834355828221,
      "grad_norm": 0.5886032581329346,
      "learning_rate": 9.648882429441258e-06,
      "loss": 0.5958,
      "step": 510
    },
    {
      "epoch": 3.1595092024539877,
      "grad_norm": 0.5232917666435242,
      "learning_rate": 9.635622027278076e-06,
      "loss": 0.5981,
      "step": 515
    },
    {
      "epoch": 3.190184049079755,
      "grad_norm": 0.5367550849914551,
      "learning_rate": 9.622125305067394e-06,
      "loss": 0.6009,
      "step": 520
    },
    {
      "epoch": 3.2208588957055215,
      "grad_norm": 0.5145596861839294,
      "learning_rate": 9.608392950860568e-06,
      "loss": 0.5964,
      "step": 525
    },
    {
      "epoch": 3.2515337423312882,
      "grad_norm": 0.48504817485809326,
      "learning_rate": 9.594425664721275e-06,
      "loss": 0.5954,
      "step": 530
    },
    {
      "epoch": 3.2822085889570554,
      "grad_norm": 0.44789251685142517,
      "learning_rate": 9.580224158689821e-06,
      "loss": 0.5963,
      "step": 535
    },
    {
      "epoch": 3.312883435582822,
      "grad_norm": 0.44385582208633423,
      "learning_rate": 9.565789156746843e-06,
      "loss": 0.5953,
      "step": 540
    },
    {
      "epoch": 3.3435582822085887,
      "grad_norm": 0.46638399362564087,
      "learning_rate": 9.551121394776395e-06,
      "loss": 0.5953,
      "step": 545
    },
    {
      "epoch": 3.374233128834356,
      "grad_norm": 0.47886648774147034,
      "learning_rate": 9.536221620528442e-06,
      "loss": 0.5984,
      "step": 550
    },
    {
      "epoch": 3.4049079754601226,
      "grad_norm": 0.43750545382499695,
      "learning_rate": 9.521090593580737e-06,
      "loss": 0.5966,
      "step": 555
    },
    {
      "epoch": 3.4355828220858897,
      "grad_norm": 0.4883286654949188,
      "learning_rate": 9.505729085300098e-06,
      "loss": 0.5976,
      "step": 560
    },
    {
      "epoch": 3.4662576687116564,
      "grad_norm": 0.5246576070785522,
      "learning_rate": 9.490137878803078e-06,
      "loss": 0.596,
      "step": 565
    },
    {
      "epoch": 3.4969325153374236,
      "grad_norm": 0.47524797916412354,
      "learning_rate": 9.47431776891606e-06,
      "loss": 0.5991,
      "step": 570
    },
    {
      "epoch": 3.5276073619631902,
      "grad_norm": 0.4653734266757965,
      "learning_rate": 9.458269562134717e-06,
      "loss": 0.594,
      "step": 575
    },
    {
      "epoch": 3.558282208588957,
      "grad_norm": 0.5341756343841553,
      "learning_rate": 9.441994076582907e-06,
      "loss": 0.5985,
      "step": 580
    },
    {
      "epoch": 3.588957055214724,
      "grad_norm": 0.5514233708381653,
      "learning_rate": 9.425492141970973e-06,
      "loss": 0.5992,
      "step": 585
    },
    {
      "epoch": 3.6196319018404908,
      "grad_norm": 0.4911518096923828,
      "learning_rate": 9.408764599553429e-06,
      "loss": 0.5935,
      "step": 590
    },
    {
      "epoch": 3.6503067484662575,
      "grad_norm": 0.4620342552661896,
      "learning_rate": 9.391812302086088e-06,
      "loss": 0.5977,
      "step": 595
    },
    {
      "epoch": 3.6809815950920246,
      "grad_norm": 0.44591575860977173,
      "learning_rate": 9.374636113782576e-06,
      "loss": 0.5928,
      "step": 600
    },
    {
      "epoch": 3.7116564417177913,
      "grad_norm": 0.40037840604782104,
      "learning_rate": 9.357236910270292e-06,
      "loss": 0.5883,
      "step": 605
    },
    {
      "epoch": 3.7423312883435584,
      "grad_norm": 0.4316963255405426,
      "learning_rate": 9.339615578545753e-06,
      "loss": 0.5916,
      "step": 610
    },
    {
      "epoch": 3.773006134969325,
      "grad_norm": 0.47416016459465027,
      "learning_rate": 9.321773016929382e-06,
      "loss": 0.5935,
      "step": 615
    },
    {
      "epoch": 3.8036809815950923,
      "grad_norm": 0.486749529838562,
      "learning_rate": 9.30371013501972e-06,
      "loss": 0.5976,
      "step": 620
    },
    {
      "epoch": 3.834355828220859,
      "grad_norm": 0.46578896045684814,
      "learning_rate": 9.285427853647038e-06,
      "loss": 0.5964,
      "step": 625
    },
    {
      "epoch": 3.8650306748466257,
      "grad_norm": 0.4229313135147095,
      "learning_rate": 9.26692710482641e-06,
      "loss": 0.5941,
      "step": 630
    },
    {
      "epoch": 3.895705521472393,
      "grad_norm": 0.45090341567993164,
      "learning_rate": 9.248208831710195e-06,
      "loss": 0.5922,
      "step": 635
    },
    {
      "epoch": 3.9263803680981595,
      "grad_norm": 0.4940469264984131,
      "learning_rate": 9.229273988539951e-06,
      "loss": 0.5957,
      "step": 640
    },
    {
      "epoch": 3.957055214723926,
      "grad_norm": 0.45583370327949524,
      "learning_rate": 9.210123540597792e-06,
      "loss": 0.592,
      "step": 645
    },
    {
      "epoch": 3.9877300613496933,
      "grad_norm": 0.43879297375679016,
      "learning_rate": 9.190758464157184e-06,
      "loss": 0.5936,
      "step": 650
    },
    {
      "epoch": 4.0184049079754605,
      "grad_norm": 0.5458897948265076,
      "learning_rate": 9.171179746433164e-06,
      "loss": 0.5735,
      "step": 655
    },
    {
      "epoch": 4.049079754601227,
      "grad_norm": 0.5021305084228516,
      "learning_rate": 9.151388385532022e-06,
      "loss": 0.5571,
      "step": 660
    },
    {
      "epoch": 4.079754601226994,
      "grad_norm": 0.47301429510116577,
      "learning_rate": 9.131385390400417e-06,
      "loss": 0.5594,
      "step": 665
    },
    {
      "epoch": 4.110429447852761,
      "grad_norm": 0.46195101737976074,
      "learning_rate": 9.111171780773938e-06,
      "loss": 0.556,
      "step": 670
    },
    {
      "epoch": 4.141104294478527,
      "grad_norm": 0.4663168489933014,
      "learning_rate": 9.090748587125118e-06,
      "loss": 0.5576,
      "step": 675
    },
    {
      "epoch": 4.171779141104294,
      "grad_norm": 0.5041781067848206,
      "learning_rate": 9.070116850610911e-06,
      "loss": 0.5546,
      "step": 680
    },
    {
      "epoch": 4.2024539877300615,
      "grad_norm": 0.571372389793396,
      "learning_rate": 9.049277623019603e-06,
      "loss": 0.5609,
      "step": 685
    },
    {
      "epoch": 4.233128834355828,
      "grad_norm": 0.4924178421497345,
      "learning_rate": 9.0282319667172e-06,
      "loss": 0.5569,
      "step": 690
    },
    {
      "epoch": 4.263803680981595,
      "grad_norm": 0.4700438678264618,
      "learning_rate": 9.006980954593262e-06,
      "loss": 0.5622,
      "step": 695
    },
    {
      "epoch": 4.294478527607362,
      "grad_norm": 0.46645787358283997,
      "learning_rate": 8.985525670006225e-06,
      "loss": 0.5561,
      "step": 700
    },
    {
      "epoch": 4.325153374233129,
      "grad_norm": 0.47866857051849365,
      "learning_rate": 8.963867206728147e-06,
      "loss": 0.5594,
      "step": 705
    },
    {
      "epoch": 4.355828220858895,
      "grad_norm": 0.4613681733608246,
      "learning_rate": 8.942006668888972e-06,
      "loss": 0.5556,
      "step": 710
    },
    {
      "epoch": 4.386503067484663,
      "grad_norm": 0.451261043548584,
      "learning_rate": 8.919945170920224e-06,
      "loss": 0.558,
      "step": 715
    },
    {
      "epoch": 4.41717791411043,
      "grad_norm": 0.5014335513114929,
      "learning_rate": 8.89768383749821e-06,
      "loss": 0.5622,
      "step": 720
    },
    {
      "epoch": 4.447852760736196,
      "grad_norm": 0.5099634528160095,
      "learning_rate": 8.875223803486674e-06,
      "loss": 0.566,
      "step": 725
    },
    {
      "epoch": 4.478527607361963,
      "grad_norm": 0.5409094095230103,
      "learning_rate": 8.852566213878947e-06,
      "loss": 0.5562,
      "step": 730
    },
    {
      "epoch": 4.50920245398773,
      "grad_norm": 0.5168077945709229,
      "learning_rate": 8.829712223739574e-06,
      "loss": 0.5589,
      "step": 735
    },
    {
      "epoch": 4.539877300613497,
      "grad_norm": 0.5725398063659668,
      "learning_rate": 8.80666299814543e-06,
      "loss": 0.5567,
      "step": 740
    },
    {
      "epoch": 4.570552147239264,
      "grad_norm": 0.45501619577407837,
      "learning_rate": 8.783419712126335e-06,
      "loss": 0.5615,
      "step": 745
    },
    {
      "epoch": 4.601226993865031,
      "grad_norm": 0.5540989637374878,
      "learning_rate": 8.759983550605132e-06,
      "loss": 0.5578,
      "step": 750
    },
    {
      "epoch": 4.631901840490798,
      "grad_norm": 0.5188400745391846,
      "learning_rate": 8.736355708337298e-06,
      "loss": 0.5605,
      "step": 755
    },
    {
      "epoch": 4.662576687116564,
      "grad_norm": 0.48551928997039795,
      "learning_rate": 8.71253738985003e-06,
      "loss": 0.5579,
      "step": 760
    },
    {
      "epoch": 4.693251533742331,
      "grad_norm": 0.44432246685028076,
      "learning_rate": 8.688529809380843e-06,
      "loss": 0.5595,
      "step": 765
    },
    {
      "epoch": 4.723926380368098,
      "grad_norm": 0.45983144640922546,
      "learning_rate": 8.66433419081566e-06,
      "loss": 0.5611,
      "step": 770
    },
    {
      "epoch": 4.754601226993865,
      "grad_norm": 0.48174920678138733,
      "learning_rate": 8.639951767626429e-06,
      "loss": 0.5637,
      "step": 775
    },
    {
      "epoch": 4.785276073619632,
      "grad_norm": 0.4685279130935669,
      "learning_rate": 8.615383782808238e-06,
      "loss": 0.5608,
      "step": 780
    },
    {
      "epoch": 4.815950920245399,
      "grad_norm": 0.5617627501487732,
      "learning_rate": 8.590631488815945e-06,
      "loss": 0.5596,
      "step": 785
    },
    {
      "epoch": 4.846625766871165,
      "grad_norm": 0.5751484036445618,
      "learning_rate": 8.565696147500338e-06,
      "loss": 0.5596,
      "step": 790
    },
    {
      "epoch": 4.877300613496932,
      "grad_norm": 0.6005717515945435,
      "learning_rate": 8.540579030043795e-06,
      "loss": 0.5614,
      "step": 795
    },
    {
      "epoch": 4.9079754601226995,
      "grad_norm": 0.5048331022262573,
      "learning_rate": 8.515281416895489e-06,
      "loss": 0.5585,
      "step": 800
    },
    {
      "epoch": 4.938650306748467,
      "grad_norm": 0.4952603578567505,
      "learning_rate": 8.48980459770611e-06,
      "loss": 0.5599,
      "step": 805
    },
    {
      "epoch": 4.969325153374233,
      "grad_norm": 0.49712324142456055,
      "learning_rate": 8.464149871262118e-06,
      "loss": 0.5614,
      "step": 810
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.5496305227279663,
      "learning_rate": 8.43831854541953e-06,
      "loss": 0.559,
      "step": 815
    },
    {
      "epoch": 5.030674846625767,
      "grad_norm": 0.565785825252533,
      "learning_rate": 8.412311937037255e-06,
      "loss": 0.5256,
      "step": 820
    },
    {
      "epoch": 5.061349693251533,
      "grad_norm": 0.5519700050354004,
      "learning_rate": 8.386131371909948e-06,
      "loss": 0.5198,
      "step": 825
    },
    {
      "epoch": 5.0920245398773005,
      "grad_norm": 0.5226123332977295,
      "learning_rate": 8.35977818470044e-06,
      "loss": 0.5235,
      "step": 830
    },
    {
      "epoch": 5.122699386503068,
      "grad_norm": 0.5327089428901672,
      "learning_rate": 8.33325371887168e-06,
      "loss": 0.5267,
      "step": 835
    },
    {
      "epoch": 5.153374233128835,
      "grad_norm": 0.5217044949531555,
      "learning_rate": 8.30655932661826e-06,
      "loss": 0.5215,
      "step": 840
    },
    {
      "epoch": 5.184049079754601,
      "grad_norm": 0.5113396048545837,
      "learning_rate": 8.279696368797471e-06,
      "loss": 0.5227,
      "step": 845
    },
    {
      "epoch": 5.214723926380368,
      "grad_norm": 0.5224789977073669,
      "learning_rate": 8.252666214859936e-06,
      "loss": 0.5211,
      "step": 850
    },
    {
      "epoch": 5.245398773006135,
      "grad_norm": 0.47648775577545166,
      "learning_rate": 8.225470242779791e-06,
      "loss": 0.5262,
      "step": 855
    },
    {
      "epoch": 5.276073619631902,
      "grad_norm": 0.5127813816070557,
      "learning_rate": 8.19810983898444e-06,
      "loss": 0.5256,
      "step": 860
    },
    {
      "epoch": 5.306748466257669,
      "grad_norm": 0.5499957799911499,
      "learning_rate": 8.170586398283878e-06,
      "loss": 0.5267,
      "step": 865
    },
    {
      "epoch": 5.337423312883436,
      "grad_norm": 0.5156161189079285,
      "learning_rate": 8.142901323799578e-06,
      "loss": 0.5252,
      "step": 870
    },
    {
      "epoch": 5.368098159509202,
      "grad_norm": 0.4825647175312042,
      "learning_rate": 8.115056026892965e-06,
      "loss": 0.5266,
      "step": 875
    },
    {
      "epoch": 5.398773006134969,
      "grad_norm": 0.5043969750404358,
      "learning_rate": 8.08705192709347e-06,
      "loss": 0.5241,
      "step": 880
    },
    {
      "epoch": 5.429447852760736,
      "grad_norm": 0.47043198347091675,
      "learning_rate": 8.058890452026155e-06,
      "loss": 0.5272,
      "step": 885
    },
    {
      "epoch": 5.460122699386503,
      "grad_norm": 0.471513956785202,
      "learning_rate": 8.030573037338942e-06,
      "loss": 0.5321,
      "step": 890
    },
    {
      "epoch": 5.49079754601227,
      "grad_norm": 0.48654860258102417,
      "learning_rate": 8.002101126629422e-06,
      "loss": 0.5267,
      "step": 895
    },
    {
      "epoch": 5.521472392638037,
      "grad_norm": 0.4844699501991272,
      "learning_rate": 7.973476171371255e-06,
      "loss": 0.5301,
      "step": 900
    },
    {
      "epoch": 5.552147239263804,
      "grad_norm": 0.4706033170223236,
      "learning_rate": 7.94469963084019e-06,
      "loss": 0.5276,
      "step": 905
    },
    {
      "epoch": 5.58282208588957,
      "grad_norm": 0.47882503271102905,
      "learning_rate": 7.91577297203966e-06,
      "loss": 0.5267,
      "step": 910
    },
    {
      "epoch": 5.613496932515337,
      "grad_norm": 0.5027369856834412,
      "learning_rate": 7.886697669625995e-06,
      "loss": 0.5313,
      "step": 915
    },
    {
      "epoch": 5.644171779141105,
      "grad_norm": 0.4971833825111389,
      "learning_rate": 7.857475205833255e-06,
      "loss": 0.5219,
      "step": 920
    },
    {
      "epoch": 5.674846625766871,
      "grad_norm": 0.488737016916275,
      "learning_rate": 7.828107070397657e-06,
      "loss": 0.5233,
      "step": 925
    },
    {
      "epoch": 5.705521472392638,
      "grad_norm": 0.49035337567329407,
      "learning_rate": 7.798594760481639e-06,
      "loss": 0.5254,
      "step": 930
    },
    {
      "epoch": 5.736196319018405,
      "grad_norm": 0.5278029441833496,
      "learning_rate": 7.768939780597523e-06,
      "loss": 0.5246,
      "step": 935
    },
    {
      "epoch": 5.766871165644172,
      "grad_norm": 0.4724002480506897,
      "learning_rate": 7.739143642530833e-06,
      "loss": 0.5218,
      "step": 940
    },
    {
      "epoch": 5.7975460122699385,
      "grad_norm": 0.5203721523284912,
      "learning_rate": 7.70920786526321e-06,
      "loss": 0.5256,
      "step": 945
    },
    {
      "epoch": 5.828220858895706,
      "grad_norm": 0.5066553950309753,
      "learning_rate": 7.679133974894984e-06,
      "loss": 0.5272,
      "step": 950
    },
    {
      "epoch": 5.858895705521473,
      "grad_norm": 0.5213795304298401,
      "learning_rate": 7.648923504567374e-06,
      "loss": 0.5293,
      "step": 955
    },
    {
      "epoch": 5.889570552147239,
      "grad_norm": 0.5189357399940491,
      "learning_rate": 7.618577994384324e-06,
      "loss": 0.525,
      "step": 960
    },
    {
      "epoch": 5.920245398773006,
      "grad_norm": 0.45614001154899597,
      "learning_rate": 7.588098991334001e-06,
      "loss": 0.5254,
      "step": 965
    },
    {
      "epoch": 5.950920245398773,
      "grad_norm": 0.5392826199531555,
      "learning_rate": 7.557488049209921e-06,
      "loss": 0.5215,
      "step": 970
    },
    {
      "epoch": 5.9815950920245395,
      "grad_norm": 0.5028013586997986,
      "learning_rate": 7.52674672853174e-06,
      "loss": 0.5249,
      "step": 975
    },
    {
      "epoch": 6.012269938650307,
      "grad_norm": 0.5520371198654175,
      "learning_rate": 7.495876596465703e-06,
      "loss": 0.5087,
      "step": 980
    },
    {
      "epoch": 6.042944785276074,
      "grad_norm": 0.6230819225311279,
      "learning_rate": 7.464879226744748e-06,
      "loss": 0.4881,
      "step": 985
    },
    {
      "epoch": 6.07361963190184,
      "grad_norm": 0.581471860408783,
      "learning_rate": 7.433756199588282e-06,
      "loss": 0.4891,
      "step": 990
    },
    {
      "epoch": 6.104294478527607,
      "grad_norm": 0.6113958358764648,
      "learning_rate": 7.402509101621618e-06,
      "loss": 0.4848,
      "step": 995
    },
    {
      "epoch": 6.134969325153374,
      "grad_norm": 0.5332604050636292,
      "learning_rate": 7.371139525795094e-06,
      "loss": 0.488,
      "step": 1000
    },
    {
      "epoch": 6.1656441717791415,
      "grad_norm": 0.538058340549469,
      "learning_rate": 7.3396490713028674e-06,
      "loss": 0.4904,
      "step": 1005
    },
    {
      "epoch": 6.196319018404908,
      "grad_norm": 0.5761954188346863,
      "learning_rate": 7.308039343501381e-06,
      "loss": 0.4907,
      "step": 1010
    },
    {
      "epoch": 6.226993865030675,
      "grad_norm": 0.5636390447616577,
      "learning_rate": 7.276311953827533e-06,
      "loss": 0.4872,
      "step": 1015
    },
    {
      "epoch": 6.257668711656442,
      "grad_norm": 0.5386704802513123,
      "learning_rate": 7.244468519716521e-06,
      "loss": 0.4946,
      "step": 1020
    },
    {
      "epoch": 6.288343558282208,
      "grad_norm": 0.585161566734314,
      "learning_rate": 7.212510664519391e-06,
      "loss": 0.4934,
      "step": 1025
    },
    {
      "epoch": 6.319018404907975,
      "grad_norm": 0.5781230330467224,
      "learning_rate": 7.180440017420277e-06,
      "loss": 0.494,
      "step": 1030
    },
    {
      "epoch": 6.3496932515337425,
      "grad_norm": 0.5272154211997986,
      "learning_rate": 7.148258213353347e-06,
      "loss": 0.4944,
      "step": 1035
    },
    {
      "epoch": 6.38036809815951,
      "grad_norm": 0.5416070222854614,
      "learning_rate": 7.115966892919459e-06,
      "loss": 0.4936,
      "step": 1040
    },
    {
      "epoch": 6.411042944785276,
      "grad_norm": 0.5524059534072876,
      "learning_rate": 7.083567702302517e-06,
      "loss": 0.4901,
      "step": 1045
    },
    {
      "epoch": 6.441717791411043,
      "grad_norm": 0.5226433277130127,
      "learning_rate": 7.05106229318556e-06,
      "loss": 0.4931,
      "step": 1050
    },
    {
      "epoch": 6.47239263803681,
      "grad_norm": 0.5324561595916748,
      "learning_rate": 7.018452322666549e-06,
      "loss": 0.4906,
      "step": 1055
    },
    {
      "epoch": 6.5030674846625764,
      "grad_norm": 0.5099997520446777,
      "learning_rate": 6.985739453173903e-06,
      "loss": 0.4909,
      "step": 1060
    },
    {
      "epoch": 6.533742331288344,
      "grad_norm": 0.5172037482261658,
      "learning_rate": 6.9529253523817396e-06,
      "loss": 0.4936,
      "step": 1065
    },
    {
      "epoch": 6.564417177914111,
      "grad_norm": 0.5281527638435364,
      "learning_rate": 6.9200116931248575e-06,
      "loss": 0.4949,
      "step": 1070
    },
    {
      "epoch": 6.595092024539877,
      "grad_norm": 0.5232874751091003,
      "learning_rate": 6.887000153313468e-06,
      "loss": 0.493,
      "step": 1075
    },
    {
      "epoch": 6.625766871165644,
      "grad_norm": 0.5422880053520203,
      "learning_rate": 6.853892415847645e-06,
      "loss": 0.4944,
      "step": 1080
    },
    {
      "epoch": 6.656441717791411,
      "grad_norm": 0.5587918758392334,
      "learning_rate": 6.8206901685315366e-06,
      "loss": 0.4974,
      "step": 1085
    },
    {
      "epoch": 6.6871165644171775,
      "grad_norm": 0.5716516971588135,
      "learning_rate": 6.787395103987323e-06,
      "loss": 0.4959,
      "step": 1090
    },
    {
      "epoch": 6.717791411042945,
      "grad_norm": 0.5290241837501526,
      "learning_rate": 6.754008919568927e-06,
      "loss": 0.49,
      "step": 1095
    },
    {
      "epoch": 6.748466257668712,
      "grad_norm": 0.545229971408844,
      "learning_rate": 6.72053331727549e-06,
      "loss": 0.4913,
      "step": 1100
    },
    {
      "epoch": 6.779141104294479,
      "grad_norm": 0.5339862108230591,
      "learning_rate": 6.686970003664588e-06,
      "loss": 0.4933,
      "step": 1105
    },
    {
      "epoch": 6.809815950920245,
      "grad_norm": 0.5517235398292542,
      "learning_rate": 6.653320689765257e-06,
      "loss": 0.4957,
      "step": 1110
    },
    {
      "epoch": 6.840490797546012,
      "grad_norm": 0.5221419334411621,
      "learning_rate": 6.619587090990748e-06,
      "loss": 0.4938,
      "step": 1115
    },
    {
      "epoch": 6.871165644171779,
      "grad_norm": 0.5578407049179077,
      "learning_rate": 6.585770927051085e-06,
      "loss": 0.4992,
      "step": 1120
    },
    {
      "epoch": 6.901840490797546,
      "grad_norm": 0.5061231255531311,
      "learning_rate": 6.551873921865393e-06,
      "loss": 0.4968,
      "step": 1125
    },
    {
      "epoch": 6.932515337423313,
      "grad_norm": 0.5198781490325928,
      "learning_rate": 6.517897803474011e-06,
      "loss": 0.4982,
      "step": 1130
    },
    {
      "epoch": 6.96319018404908,
      "grad_norm": 0.5066606998443604,
      "learning_rate": 6.483844303950411e-06,
      "loss": 0.4975,
      "step": 1135
    },
    {
      "epoch": 6.993865030674847,
      "grad_norm": 0.5105178952217102,
      "learning_rate": 6.4497151593128795e-06,
      "loss": 0.5015,
      "step": 1140
    },
    {
      "epoch": 7.024539877300613,
      "grad_norm": 0.6838037967681885,
      "learning_rate": 6.415512109436031e-06,
      "loss": 0.4634,
      "step": 1145
    },
    {
      "epoch": 7.0552147239263805,
      "grad_norm": 0.6098262667655945,
      "learning_rate": 6.381236897962102e-06,
      "loss": 0.4616,
      "step": 1150
    },
    {
      "epoch": 7.085889570552148,
      "grad_norm": 0.5857096314430237,
      "learning_rate": 6.3468912722120715e-06,
      "loss": 0.4557,
      "step": 1155
    },
    {
      "epoch": 7.116564417177914,
      "grad_norm": 0.5579211115837097,
      "learning_rate": 6.312476983096573e-06,
      "loss": 0.454,
      "step": 1160
    },
    {
      "epoch": 7.147239263803681,
      "grad_norm": 0.5579200387001038,
      "learning_rate": 6.277995785026642e-06,
      "loss": 0.4606,
      "step": 1165
    },
    {
      "epoch": 7.177914110429448,
      "grad_norm": 0.5602801442146301,
      "learning_rate": 6.243449435824276e-06,
      "loss": 0.4557,
      "step": 1170
    },
    {
      "epoch": 7.208588957055214,
      "grad_norm": 0.5627221465110779,
      "learning_rate": 6.2088396966328155e-06,
      "loss": 0.4564,
      "step": 1175
    },
    {
      "epoch": 7.2392638036809815,
      "grad_norm": 0.5709859728813171,
      "learning_rate": 6.174168331827179e-06,
      "loss": 0.4596,
      "step": 1180
    },
    {
      "epoch": 7.269938650306749,
      "grad_norm": 0.554961621761322,
      "learning_rate": 6.139437108923898e-06,
      "loss": 0.4583,
      "step": 1185
    },
    {
      "epoch": 7.300613496932515,
      "grad_norm": 0.5725740790367126,
      "learning_rate": 6.1046477984910215e-06,
      "loss": 0.458,
      "step": 1190
    },
    {
      "epoch": 7.331288343558282,
      "grad_norm": 0.5590230226516724,
      "learning_rate": 6.069802174057849e-06,
      "loss": 0.4604,
      "step": 1195
    },
    {
      "epoch": 7.361963190184049,
      "grad_norm": 0.5823590159416199,
      "learning_rate": 6.034902012024521e-06,
      "loss": 0.4575,
      "step": 1200
    }
  ],
  "logging_steps": 5,
  "max_steps": 2445,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.6846057011425575e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}