{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 733, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0068212824010914054, "grad_norm": 6.1454085107347405, "learning_rate": 6.7567567567567575e-06, "loss": 1.1085, "step": 5 }, { "epoch": 0.013642564802182811, "grad_norm": 2.39833034924335, "learning_rate": 1.3513513513513515e-05, "loss": 0.9616, "step": 10 }, { "epoch": 0.020463847203274217, "grad_norm": 1.4670809925130541, "learning_rate": 2.0270270270270273e-05, "loss": 0.7925, "step": 15 }, { "epoch": 0.027285129604365622, "grad_norm": 0.7952305107776637, "learning_rate": 2.702702702702703e-05, "loss": 0.733, "step": 20 }, { "epoch": 0.034106412005457026, "grad_norm": 0.7221356192862481, "learning_rate": 3.3783783783783784e-05, "loss": 0.7143, "step": 25 }, { "epoch": 0.040927694406548434, "grad_norm": 0.6588446209529583, "learning_rate": 4.0540540540540545e-05, "loss": 0.6942, "step": 30 }, { "epoch": 0.047748976807639835, "grad_norm": 0.6997289473400741, "learning_rate": 4.72972972972973e-05, "loss": 0.6864, "step": 35 }, { "epoch": 0.054570259208731244, "grad_norm": 0.6270561374043836, "learning_rate": 4.999793714044176e-05, "loss": 0.6694, "step": 40 }, { "epoch": 0.061391541609822645, "grad_norm": 0.7728158038416499, "learning_rate": 4.9985332146267735e-05, "loss": 0.6677, "step": 45 }, { "epoch": 0.06821282401091405, "grad_norm": 0.6815807295103395, "learning_rate": 4.996127460337901e-05, "loss": 0.6692, "step": 50 }, { "epoch": 0.07503410641200546, "grad_norm": 0.6683515488743137, "learning_rate": 4.992577676510502e-05, "loss": 0.6797, "step": 55 }, { "epoch": 0.08185538881309687, "grad_norm": 0.6677440130204387, "learning_rate": 4.987885671170889e-05, "loss": 0.6534, "step": 60 }, { "epoch": 0.08867667121418826, "grad_norm": 0.6924034414550392, "learning_rate": 4.9820538341178595e-05, "loss": 0.662, "step": 65 }, { "epoch": 0.09549795361527967, "grad_norm": 0.6088924375067402, "learning_rate": 4.97508513570549e-05, "loss": 0.6524, "step": 70 }, { "epoch": 0.10231923601637108, "grad_norm": 0.6543644961547558, "learning_rate": 4.966983125330225e-05, "loss": 0.6585, "step": 75 }, { "epoch": 0.10914051841746249, "grad_norm": 0.7375667828979487, "learning_rate": 4.957751929623059e-05, "loss": 0.6392, "step": 80 }, { "epoch": 0.11596180081855388, "grad_norm": 0.6497105162296632, "learning_rate": 4.947396250347695e-05, "loss": 0.6243, "step": 85 }, { "epoch": 0.12278308321964529, "grad_norm": 0.59440185826942, "learning_rate": 4.9359213620057766e-05, "loss": 0.6437, "step": 90 }, { "epoch": 0.1296043656207367, "grad_norm": 0.5929742801757779, "learning_rate": 4.9233331091504034e-05, "loss": 0.6412, "step": 95 }, { "epoch": 0.1364256480218281, "grad_norm": 0.5453418936918566, "learning_rate": 4.909637903409306e-05, "loss": 0.6372, "step": 100 }, { "epoch": 0.1432469304229195, "grad_norm": 0.6103486251810093, "learning_rate": 4.8948427202191766e-05, "loss": 0.6358, "step": 105 }, { "epoch": 0.15006821282401092, "grad_norm": 0.552127051653267, "learning_rate": 4.878955095272844e-05, "loss": 0.6365, "step": 110 }, { "epoch": 0.15688949522510232, "grad_norm": 0.5781918279256535, "learning_rate": 4.861983120681089e-05, "loss": 0.6354, "step": 115 }, { "epoch": 0.16371077762619374, "grad_norm": 0.5965443692805242, "learning_rate": 4.8439354408510536e-05, "loss": 0.6276, "step": 120 }, { "epoch": 0.17053206002728513, "grad_norm": 0.5333632544147189, "learning_rate": 4.82482124808335e-05, "loss": 0.62, "step": 125 }, { "epoch": 0.17735334242837653, "grad_norm": 0.5387742075687054, "learning_rate": 4.804650277890105e-05, "loss": 0.6307, "step": 130 }, { "epoch": 0.18417462482946795, "grad_norm": 0.5636611618052851, "learning_rate": 4.783432804036335e-05, "loss": 0.6258, "step": 135 }, { "epoch": 0.19099590723055934, "grad_norm": 0.583094121226939, "learning_rate": 4.761179633307163e-05, "loss": 0.6378, "step": 140 }, { "epoch": 0.19781718963165076, "grad_norm": 0.5778161827296184, "learning_rate": 4.737902100003552e-05, "loss": 0.6204, "step": 145 }, { "epoch": 0.20463847203274216, "grad_norm": 0.514316052197073, "learning_rate": 4.713612060169362e-05, "loss": 0.6245, "step": 150 }, { "epoch": 0.21145975443383355, "grad_norm": 0.5252543534103011, "learning_rate": 4.688321885552659e-05, "loss": 0.6152, "step": 155 }, { "epoch": 0.21828103683492497, "grad_norm": 0.5493165240335753, "learning_rate": 4.662044457304359e-05, "loss": 0.6314, "step": 160 }, { "epoch": 0.22510231923601637, "grad_norm": 0.5856552277186949, "learning_rate": 4.634793159417421e-05, "loss": 0.6202, "step": 165 }, { "epoch": 0.23192360163710776, "grad_norm": 0.5423682228591846, "learning_rate": 4.606581871909919e-05, "loss": 0.6129, "step": 170 }, { "epoch": 0.23874488403819918, "grad_norm": 0.5347440738496132, "learning_rate": 4.577424963755475e-05, "loss": 0.6273, "step": 175 }, { "epoch": 0.24556616643929058, "grad_norm": 0.5232051897086609, "learning_rate": 4.547337285564649e-05, "loss": 0.6191, "step": 180 }, { "epoch": 0.252387448840382, "grad_norm": 0.5498344358428939, "learning_rate": 4.516334162021013e-05, "loss": 0.6171, "step": 185 }, { "epoch": 0.2592087312414734, "grad_norm": 0.5357101369486424, "learning_rate": 4.484431384075771e-05, "loss": 0.6247, "step": 190 }, { "epoch": 0.2660300136425648, "grad_norm": 0.6086904184284695, "learning_rate": 4.4516452009048814e-05, "loss": 0.6132, "step": 195 }, { "epoch": 0.2728512960436562, "grad_norm": 0.507515886549876, "learning_rate": 4.4179923116328005e-05, "loss": 0.6146, "step": 200 }, { "epoch": 0.27967257844474763, "grad_norm": 0.4863854335263699, "learning_rate": 4.3834898568270444e-05, "loss": 0.6196, "step": 205 }, { "epoch": 0.286493860845839, "grad_norm": 0.5429472321131933, "learning_rate": 4.348155409767913e-05, "loss": 0.608, "step": 210 }, { "epoch": 0.2933151432469304, "grad_norm": 0.5004823143600294, "learning_rate": 4.3120069674978156e-05, "loss": 0.6166, "step": 215 }, { "epoch": 0.30013642564802184, "grad_norm": 0.5861095991824197, "learning_rate": 4.275062941654767e-05, "loss": 0.6075, "step": 220 }, { "epoch": 0.3069577080491132, "grad_norm": 0.48865014985382416, "learning_rate": 4.237342149094701e-05, "loss": 0.6083, "step": 225 }, { "epoch": 0.31377899045020463, "grad_norm": 0.5421972030587298, "learning_rate": 4.1988638023074116e-05, "loss": 0.6016, "step": 230 }, { "epoch": 0.32060027285129605, "grad_norm": 0.5129090640390279, "learning_rate": 4.159647499630971e-05, "loss": 0.5941, "step": 235 }, { "epoch": 0.3274215552523875, "grad_norm": 0.5152854095629694, "learning_rate": 4.1197132152696215e-05, "loss": 0.6027, "step": 240 }, { "epoch": 0.33424283765347884, "grad_norm": 0.5024101740456829, "learning_rate": 4.07908128912024e-05, "loss": 0.6139, "step": 245 }, { "epoch": 0.34106412005457026, "grad_norm": 0.5221755877853638, "learning_rate": 4.037772416412524e-05, "loss": 0.6021, "step": 250 }, { "epoch": 0.3478854024556617, "grad_norm": 0.5073607350902808, "learning_rate": 3.995807637168205e-05, "loss": 0.5947, "step": 255 }, { "epoch": 0.35470668485675305, "grad_norm": 0.5437155651864307, "learning_rate": 3.9532083254846505e-05, "loss": 0.5835, "step": 260 }, { "epoch": 0.3615279672578445, "grad_norm": 0.5167530841109798, "learning_rate": 3.909996178648299e-05, "loss": 0.603, "step": 265 }, { "epoch": 0.3683492496589359, "grad_norm": 0.4920056856552527, "learning_rate": 3.866193206083494e-05, "loss": 0.5961, "step": 270 }, { "epoch": 0.37517053206002726, "grad_norm": 0.4961555706995555, "learning_rate": 3.821821718142332e-05, "loss": 0.601, "step": 275 }, { "epoch": 0.3819918144611187, "grad_norm": 0.5487961030086439, "learning_rate": 3.77690431474123e-05, "loss": 0.5998, "step": 280 }, { "epoch": 0.3888130968622101, "grad_norm": 0.4992358431940564, "learning_rate": 3.7314638738500265e-05, "loss": 0.6052, "step": 285 }, { "epoch": 0.3956343792633015, "grad_norm": 0.47944580478456583, "learning_rate": 3.685523539839439e-05, "loss": 0.6017, "step": 290 }, { "epoch": 0.4024556616643929, "grad_norm": 0.5179240011002079, "learning_rate": 3.63910671169285e-05, "loss": 0.5925, "step": 295 }, { "epoch": 0.4092769440654843, "grad_norm": 0.4975170780548866, "learning_rate": 3.5922370310884014e-05, "loss": 0.5855, "step": 300 }, { "epoch": 0.41609822646657574, "grad_norm": 0.47615944524761544, "learning_rate": 3.5449383703574806e-05, "loss": 0.5977, "step": 305 }, { "epoch": 0.4229195088676671, "grad_norm": 0.4937468785781773, "learning_rate": 3.4972348203257274e-05, "loss": 0.6045, "step": 310 }, { "epoch": 0.4297407912687585, "grad_norm": 0.5080917863084714, "learning_rate": 3.449150678042748e-05, "loss": 0.6023, "step": 315 }, { "epoch": 0.43656207366984995, "grad_norm": 0.426217329519065, "learning_rate": 3.400710434406803e-05, "loss": 0.5838, "step": 320 }, { "epoch": 0.4433833560709413, "grad_norm": 0.4652871672601851, "learning_rate": 3.351938761690748e-05, "loss": 0.5968, "step": 325 }, { "epoch": 0.45020463847203274, "grad_norm": 0.4854772461454675, "learning_rate": 3.302860500975605e-05, "loss": 0.5891, "step": 330 }, { "epoch": 0.45702592087312416, "grad_norm": 0.46643925398275227, "learning_rate": 3.253500649498153e-05, "loss": 0.6021, "step": 335 }, { "epoch": 0.4638472032742155, "grad_norm": 0.4998198413420019, "learning_rate": 3.203884347918975e-05, "loss": 0.5983, "step": 340 }, { "epoch": 0.47066848567530695, "grad_norm": 0.4688650362571065, "learning_rate": 3.154036867517462e-05, "loss": 0.5957, "step": 345 }, { "epoch": 0.47748976807639837, "grad_norm": 0.4643721515147853, "learning_rate": 3.1039835973202865e-05, "loss": 0.5959, "step": 350 }, { "epoch": 0.4843110504774898, "grad_norm": 0.471561868907386, "learning_rate": 3.053750031169903e-05, "loss": 0.5991, "step": 355 }, { "epoch": 0.49113233287858116, "grad_norm": 0.4888124381385658, "learning_rate": 3.0033617547396614e-05, "loss": 0.5857, "step": 360 }, { "epoch": 0.4979536152796726, "grad_norm": 0.47521724347614236, "learning_rate": 2.9528444325021477e-05, "loss": 0.5859, "step": 365 }, { "epoch": 0.504774897680764, "grad_norm": 0.47459489416728035, "learning_rate": 2.902223794657391e-05, "loss": 0.5815, "step": 370 }, { "epoch": 0.5115961800818554, "grad_norm": 0.45758379959975604, "learning_rate": 2.8515256240275946e-05, "loss": 0.5887, "step": 375 }, { "epoch": 0.5184174624829468, "grad_norm": 0.4549993384212536, "learning_rate": 2.8007757429250597e-05, "loss": 0.5784, "step": 380 }, { "epoch": 0.5252387448840382, "grad_norm": 0.46975416839444245, "learning_rate": 2.7500000000000004e-05, "loss": 0.5839, "step": 385 }, { "epoch": 0.5320600272851296, "grad_norm": 0.5210561617558792, "learning_rate": 2.699224257074941e-05, "loss": 0.5927, "step": 390 }, { "epoch": 0.538881309686221, "grad_norm": 0.49345802812406225, "learning_rate": 2.6484743759724062e-05, "loss": 0.58, "step": 395 }, { "epoch": 0.5457025920873124, "grad_norm": 0.4493716482313927, "learning_rate": 2.5977762053426098e-05, "loss": 0.5884, "step": 400 }, { "epoch": 0.5525238744884038, "grad_norm": 0.46381303791009604, "learning_rate": 2.547155567497854e-05, "loss": 0.5884, "step": 405 }, { "epoch": 0.5593451568894953, "grad_norm": 0.44059241665107607, "learning_rate": 2.496638245260339e-05, "loss": 0.5818, "step": 410 }, { "epoch": 0.5661664392905866, "grad_norm": 0.4723412306031858, "learning_rate": 2.446249968830097e-05, "loss": 0.5717, "step": 415 }, { "epoch": 0.572987721691678, "grad_norm": 0.486458782299585, "learning_rate": 2.3960164026797137e-05, "loss": 0.5752, "step": 420 }, { "epoch": 0.5798090040927695, "grad_norm": 0.43429727676625984, "learning_rate": 2.3459631324825388e-05, "loss": 0.5823, "step": 425 }, { "epoch": 0.5866302864938608, "grad_norm": 0.4891386016432944, "learning_rate": 2.2961156520810255e-05, "loss": 0.5681, "step": 430 }, { "epoch": 0.5934515688949522, "grad_norm": 0.4865714375440033, "learning_rate": 2.246499350501848e-05, "loss": 0.5746, "step": 435 }, { "epoch": 0.6002728512960437, "grad_norm": 0.4431956837418487, "learning_rate": 2.197139499024396e-05, "loss": 0.5673, "step": 440 }, { "epoch": 0.607094133697135, "grad_norm": 0.4765080896581954, "learning_rate": 2.1480612383092536e-05, "loss": 0.5679, "step": 445 }, { "epoch": 0.6139154160982264, "grad_norm": 0.46366735904716083, "learning_rate": 2.0992895655931984e-05, "loss": 0.5788, "step": 450 }, { "epoch": 0.6207366984993179, "grad_norm": 0.4350049519288381, "learning_rate": 2.0508493219572522e-05, "loss": 0.5819, "step": 455 }, { "epoch": 0.6275579809004093, "grad_norm": 0.4286514802588011, "learning_rate": 2.0027651796742735e-05, "loss": 0.5702, "step": 460 }, { "epoch": 0.6343792633015006, "grad_norm": 0.45786236026543076, "learning_rate": 1.95506162964252e-05, "loss": 0.5633, "step": 465 }, { "epoch": 0.6412005457025921, "grad_norm": 0.4657739270838876, "learning_rate": 1.9077629689115995e-05, "loss": 0.5824, "step": 470 }, { "epoch": 0.6480218281036835, "grad_norm": 0.4466235343446829, "learning_rate": 1.8608932883071507e-05, "loss": 0.5681, "step": 475 }, { "epoch": 0.654843110504775, "grad_norm": 0.46375008358895903, "learning_rate": 1.8144764601605613e-05, "loss": 0.5617, "step": 480 }, { "epoch": 0.6616643929058663, "grad_norm": 0.4391041671626772, "learning_rate": 1.7685361261499733e-05, "loss": 0.5744, "step": 485 }, { "epoch": 0.6684856753069577, "grad_norm": 0.46579833218034034, "learning_rate": 1.72309568525877e-05, "loss": 0.5746, "step": 490 }, { "epoch": 0.6753069577080492, "grad_norm": 0.45492406514149997, "learning_rate": 1.6781782818576686e-05, "loss": 0.5654, "step": 495 }, { "epoch": 0.6821282401091405, "grad_norm": 0.427984837549718, "learning_rate": 1.6338067939165058e-05, "loss": 0.5656, "step": 500 }, { "epoch": 0.6889495225102319, "grad_norm": 0.43618818210889354, "learning_rate": 1.590003821351701e-05, "loss": 0.5699, "step": 505 }, { "epoch": 0.6957708049113234, "grad_norm": 0.44605546197769363, "learning_rate": 1.54679167451535e-05, "loss": 0.5678, "step": 510 }, { "epoch": 0.7025920873124147, "grad_norm": 0.4352482969573091, "learning_rate": 1.5041923628317948e-05, "loss": 0.5681, "step": 515 }, { "epoch": 0.7094133697135061, "grad_norm": 0.4449647817433157, "learning_rate": 1.4622275835874766e-05, "loss": 0.5804, "step": 520 }, { "epoch": 0.7162346521145976, "grad_norm": 0.45853870173355127, "learning_rate": 1.4209187108797607e-05, "loss": 0.5683, "step": 525 }, { "epoch": 0.723055934515689, "grad_norm": 0.45196828201062145, "learning_rate": 1.3802867847303785e-05, "loss": 0.5666, "step": 530 }, { "epoch": 0.7298772169167803, "grad_norm": 0.443145839773193, "learning_rate": 1.3403525003690304e-05, "loss": 0.5653, "step": 535 }, { "epoch": 0.7366984993178718, "grad_norm": 0.4389866077035684, "learning_rate": 1.3011361976925884e-05, "loss": 0.5638, "step": 540 }, { "epoch": 0.7435197817189632, "grad_norm": 0.45763004903362015, "learning_rate": 1.2626578509052997e-05, "loss": 0.5753, "step": 545 }, { "epoch": 0.7503410641200545, "grad_norm": 0.4458577804463173, "learning_rate": 1.2249370583452342e-05, "loss": 0.5666, "step": 550 }, { "epoch": 0.757162346521146, "grad_norm": 0.4273191608544512, "learning_rate": 1.1879930325021841e-05, "loss": 0.5509, "step": 555 }, { "epoch": 0.7639836289222374, "grad_norm": 0.4321968347494333, "learning_rate": 1.1518445902320878e-05, "loss": 0.561, "step": 560 }, { "epoch": 0.7708049113233287, "grad_norm": 0.40146636682824943, "learning_rate": 1.1165101431729561e-05, "loss": 0.5513, "step": 565 }, { "epoch": 0.7776261937244202, "grad_norm": 0.44158119129301215, "learning_rate": 1.0820076883671999e-05, "loss": 0.5639, "step": 570 }, { "epoch": 0.7844474761255116, "grad_norm": 0.4324647822785342, "learning_rate": 1.0483547990951195e-05, "loss": 0.5606, "step": 575 }, { "epoch": 0.791268758526603, "grad_norm": 0.42146490667069647, "learning_rate": 1.0155686159242297e-05, "loss": 0.5592, "step": 580 }, { "epoch": 0.7980900409276944, "grad_norm": 0.42206025971080063, "learning_rate": 9.836658379789875e-06, "loss": 0.5672, "step": 585 }, { "epoch": 0.8049113233287858, "grad_norm": 0.43989515008873703, "learning_rate": 9.52662714435352e-06, "loss": 0.5712, "step": 590 }, { "epoch": 0.8117326057298773, "grad_norm": 0.413076985358387, "learning_rate": 9.225750362445255e-06, "loss": 0.5612, "step": 595 }, { "epoch": 0.8185538881309686, "grad_norm": 0.45181977159984366, "learning_rate": 8.93418128090081e-06, "loss": 0.553, "step": 600 }, { "epoch": 0.82537517053206, "grad_norm": 0.42516984261163593, "learning_rate": 8.652068405825798e-06, "loss": 0.5599, "step": 605 }, { "epoch": 0.8321964529331515, "grad_norm": 0.44202050757097777, "learning_rate": 8.379555426956415e-06, "loss": 0.5613, "step": 610 }, { "epoch": 0.8390177353342428, "grad_norm": 0.4222812758989466, "learning_rate": 8.11678114447342e-06, "loss": 0.559, "step": 615 }, { "epoch": 0.8458390177353342, "grad_norm": 0.4142488219922559, "learning_rate": 7.863879398306385e-06, "loss": 0.556, "step": 620 }, { "epoch": 0.8526603001364257, "grad_norm": 0.4196305373032907, "learning_rate": 7.620978999964487e-06, "loss": 0.5567, "step": 625 }, { "epoch": 0.859481582537517, "grad_norm": 0.43830466066352947, "learning_rate": 7.3882036669283754e-06, "loss": 0.5642, "step": 630 }, { "epoch": 0.8663028649386084, "grad_norm": 0.44230634956976506, "learning_rate": 7.16567195963665e-06, "loss": 0.5584, "step": 635 }, { "epoch": 0.8731241473396999, "grad_norm": 0.40907280623831926, "learning_rate": 6.953497221098949e-06, "loss": 0.552, "step": 640 }, { "epoch": 0.8799454297407913, "grad_norm": 0.42299806448024535, "learning_rate": 6.751787519166505e-06, "loss": 0.5519, "step": 645 }, { "epoch": 0.8867667121418826, "grad_norm": 0.42184825271829834, "learning_rate": 6.560645591489468e-06, "loss": 0.5522, "step": 650 }, { "epoch": 0.8935879945429741, "grad_norm": 0.41416983905343346, "learning_rate": 6.380168793189115e-06, "loss": 0.553, "step": 655 }, { "epoch": 0.9004092769440655, "grad_norm": 0.41897995581897757, "learning_rate": 6.210449047271566e-06, "loss": 0.5603, "step": 660 }, { "epoch": 0.9072305593451568, "grad_norm": 0.419192708432206, "learning_rate": 6.0515727978082415e-06, "loss": 0.5422, "step": 665 }, { "epoch": 0.9140518417462483, "grad_norm": 0.41895036086302273, "learning_rate": 5.9036209659069404e-06, "loss": 0.5598, "step": 670 }, { "epoch": 0.9208731241473397, "grad_norm": 0.42577206145519453, "learning_rate": 5.766668908495966e-06, "loss": 0.5504, "step": 675 }, { "epoch": 0.927694406548431, "grad_norm": 0.4145726978622104, "learning_rate": 5.64078637994224e-06, "loss": 0.5593, "step": 680 }, { "epoch": 0.9345156889495225, "grad_norm": 0.42111567199984934, "learning_rate": 5.526037496523051e-06, "loss": 0.5539, "step": 685 }, { "epoch": 0.9413369713506139, "grad_norm": 0.4166688060168719, "learning_rate": 5.422480703769408e-06, "loss": 0.5558, "step": 690 }, { "epoch": 0.9481582537517054, "grad_norm": 0.411549627490275, "learning_rate": 5.330168746697747e-06, "loss": 0.561, "step": 695 }, { "epoch": 0.9549795361527967, "grad_norm": 0.42683438445133004, "learning_rate": 5.249148642945106e-06, "loss": 0.56, "step": 700 }, { "epoch": 0.9618008185538881, "grad_norm": 0.4154382781777128, "learning_rate": 5.179461658821403e-06, "loss": 0.547, "step": 705 }, { "epoch": 0.9686221009549796, "grad_norm": 0.4152930390889386, "learning_rate": 5.121143288291119e-06, "loss": 0.553, "step": 710 }, { "epoch": 0.975443383356071, "grad_norm": 0.41768488555308536, "learning_rate": 5.07422323489499e-06, "loss": 0.5605, "step": 715 }, { "epoch": 0.9822646657571623, "grad_norm": 0.4185926642966096, "learning_rate": 5.03872539662099e-06, "loss": 0.557, "step": 720 }, { "epoch": 0.9890859481582538, "grad_norm": 0.42121298592890755, "learning_rate": 5.014667853732269e-06, "loss": 0.5535, "step": 725 }, { "epoch": 0.9959072305593452, "grad_norm": 0.4069511980616117, "learning_rate": 5.00206285955824e-06, "loss": 0.5545, "step": 730 }, { "epoch": 1.0, "step": 733, "total_flos": 83525973835776.0, "train_loss": 0.600835688280051, "train_runtime": 738.4345, "train_samples_per_second": 126.935, "train_steps_per_second": 0.993 } ], "logging_steps": 5, "max_steps": 733, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 83525973835776.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }