{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 740,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013531799729364006,
      "grad_norm": 1.4412583112716675,
      "learning_rate": 1.2903225806451614e-06,
      "loss": 1.3064,
      "step": 5
    },
    {
      "epoch": 0.02706359945872801,
      "grad_norm": 0.9369994401931763,
      "learning_rate": 2.9032258064516128e-06,
      "loss": 1.3176,
      "step": 10
    },
    {
      "epoch": 0.04059539918809202,
      "grad_norm": 0.6337246298789978,
      "learning_rate": 4.516129032258065e-06,
      "loss": 1.3047,
      "step": 15
    },
    {
      "epoch": 0.05412719891745602,
      "grad_norm": 0.7102669477462769,
      "learning_rate": 6.129032258064517e-06,
      "loss": 1.2226,
      "step": 20
    },
    {
      "epoch": 0.06765899864682003,
      "grad_norm": 0.5792216062545776,
      "learning_rate": 7.741935483870968e-06,
      "loss": 1.2671,
      "step": 25
    },
    {
      "epoch": 0.08119079837618404,
      "grad_norm": 0.5476118326187134,
      "learning_rate": 9.35483870967742e-06,
      "loss": 1.2585,
      "step": 30
    },
    {
      "epoch": 0.09472259810554803,
      "grad_norm": 0.6799878478050232,
      "learning_rate": 1.0967741935483872e-05,
      "loss": 1.2151,
      "step": 35
    },
    {
      "epoch": 0.10825439783491204,
      "grad_norm": 0.5542110800743103,
      "learning_rate": 1.2580645161290324e-05,
      "loss": 1.185,
      "step": 40
    },
    {
      "epoch": 0.12178619756427606,
      "grad_norm": 0.4575681686401367,
      "learning_rate": 1.4193548387096776e-05,
      "loss": 1.1688,
      "step": 45
    },
    {
      "epoch": 0.13531799729364005,
      "grad_norm": 0.5137224197387695,
      "learning_rate": 1.5806451612903226e-05,
      "loss": 1.1865,
      "step": 50
    },
    {
      "epoch": 0.14884979702300405,
      "grad_norm": 0.4554082453250885,
      "learning_rate": 1.741935483870968e-05,
      "loss": 1.163,
      "step": 55
    },
    {
      "epoch": 0.16238159675236807,
      "grad_norm": 0.5306389331817627,
      "learning_rate": 1.903225806451613e-05,
      "loss": 1.1513,
      "step": 60
    },
    {
      "epoch": 0.17591339648173207,
      "grad_norm": 0.5879004597663879,
      "learning_rate": 2.0645161290322582e-05,
      "loss": 1.1948,
      "step": 65
    },
    {
      "epoch": 0.18944519621109607,
      "grad_norm": 0.4350433647632599,
      "learning_rate": 2.2258064516129034e-05,
      "loss": 1.0976,
      "step": 70
    },
    {
      "epoch": 0.2029769959404601,
      "grad_norm": 0.49727171659469604,
      "learning_rate": 2.3870967741935483e-05,
      "loss": 1.1021,
      "step": 75
    },
    {
      "epoch": 0.2165087956698241,
      "grad_norm": 0.551313042640686,
      "learning_rate": 2.548387096774194e-05,
      "loss": 1.1962,
      "step": 80
    },
    {
      "epoch": 0.23004059539918809,
      "grad_norm": 0.4979248046875,
      "learning_rate": 2.7096774193548387e-05,
      "loss": 1.1318,
      "step": 85
    },
    {
      "epoch": 0.2435723951285521,
      "grad_norm": 0.5184155702590942,
      "learning_rate": 2.870967741935484e-05,
      "loss": 1.1268,
      "step": 90
    },
    {
      "epoch": 0.2571041948579161,
      "grad_norm": 0.6341415643692017,
      "learning_rate": 2.9999976021756284e-05,
      "loss": 1.0454,
      "step": 95
    },
    {
      "epoch": 0.2706359945872801,
      "grad_norm": 0.4915357530117035,
      "learning_rate": 2.9999136791275564e-05,
      "loss": 1.078,
      "step": 100
    },
    {
      "epoch": 0.28416779431664413,
      "grad_norm": 0.541904091835022,
      "learning_rate": 2.999709872526874e-05,
      "loss": 1.0623,
      "step": 105
    },
    {
      "epoch": 0.2976995940460081,
      "grad_norm": 0.5630597472190857,
      "learning_rate": 2.999386198663225e-05,
      "loss": 1.0517,
      "step": 110
    },
    {
      "epoch": 0.3112313937753721,
      "grad_norm": 0.7628896832466125,
      "learning_rate": 2.9989426834068792e-05,
      "loss": 1.084,
      "step": 115
    },
    {
      "epoch": 0.32476319350473615,
      "grad_norm": 0.6299301981925964,
      "learning_rate": 2.9983793622066668e-05,
      "loss": 1.0186,
      "step": 120
    },
    {
      "epoch": 0.3382949932341001,
      "grad_norm": 0.6008714437484741,
      "learning_rate": 2.9976962800871434e-05,
      "loss": 1.0228,
      "step": 125
    },
    {
      "epoch": 0.35182679296346414,
      "grad_norm": 0.6700873374938965,
      "learning_rate": 2.9968934916449923e-05,
      "loss": 0.9923,
      "step": 130
    },
    {
      "epoch": 0.36535859269282817,
      "grad_norm": 0.5688751339912415,
      "learning_rate": 2.9959710610446577e-05,
      "loss": 0.998,
      "step": 135
    },
    {
      "epoch": 0.37889039242219213,
      "grad_norm": 0.7533664703369141,
      "learning_rate": 2.9949290620132225e-05,
      "loss": 1.0353,
      "step": 140
    },
    {
      "epoch": 0.39242219215155616,
      "grad_norm": 0.7014450430870056,
      "learning_rate": 2.99376757783451e-05,
      "loss": 0.9277,
      "step": 145
    },
    {
      "epoch": 0.4059539918809202,
      "grad_norm": 0.6594902873039246,
      "learning_rate": 2.992486701342427e-05,
      "loss": 0.9636,
      "step": 150
    },
    {
      "epoch": 0.41948579161028415,
      "grad_norm": 0.7603819966316223,
      "learning_rate": 2.9910865349135498e-05,
      "loss": 0.9665,
      "step": 155
    },
    {
      "epoch": 0.4330175913396482,
      "grad_norm": 0.642242968082428,
      "learning_rate": 2.989567190458935e-05,
      "loss": 0.9616,
      "step": 160
    },
    {
      "epoch": 0.4465493910690122,
      "grad_norm": 0.6365484595298767,
      "learning_rate": 2.9879287894151786e-05,
      "loss": 0.979,
      "step": 165
    },
    {
      "epoch": 0.46008119079837617,
      "grad_norm": 0.6748781800270081,
      "learning_rate": 2.9861714627347076e-05,
      "loss": 0.9437,
      "step": 170
    },
    {
      "epoch": 0.4736129905277402,
      "grad_norm": 0.6489437222480774,
      "learning_rate": 2.984295350875316e-05,
      "loss": 0.9036,
      "step": 175
    },
    {
      "epoch": 0.4871447902571042,
      "grad_norm": 0.6802551746368408,
      "learning_rate": 2.9823006037889358e-05,
      "loss": 0.8769,
      "step": 180
    },
    {
      "epoch": 0.5006765899864682,
      "grad_norm": 0.7979145050048828,
      "learning_rate": 2.9801873809096543e-05,
      "loss": 0.9136,
      "step": 185
    },
    {
      "epoch": 0.5142083897158322,
      "grad_norm": 0.774574875831604,
      "learning_rate": 2.9779558511409678e-05,
      "loss": 0.8767,
      "step": 190
    },
    {
      "epoch": 0.5277401894451962,
      "grad_norm": 0.724077045917511,
      "learning_rate": 2.9756061928422857e-05,
      "loss": 0.913,
      "step": 195
    },
    {
      "epoch": 0.5412719891745602,
      "grad_norm": 0.7980031967163086,
      "learning_rate": 2.973138593814671e-05,
      "loss": 0.9224,
      "step": 200
    },
    {
      "epoch": 0.5548037889039242,
      "grad_norm": 0.900132417678833,
      "learning_rate": 2.9705532512858324e-05,
      "loss": 0.8389,
      "step": 205
    },
    {
      "epoch": 0.5683355886332883,
      "grad_norm": 0.8545295596122742,
      "learning_rate": 2.9678503718943594e-05,
      "loss": 0.8391,
      "step": 210
    },
    {
      "epoch": 0.5818673883626523,
      "grad_norm": 0.78533935546875,
      "learning_rate": 2.965030171673207e-05,
      "loss": 0.8716,
      "step": 215
    },
    {
      "epoch": 0.5953991880920162,
      "grad_norm": 0.9080139994621277,
      "learning_rate": 2.962092876032427e-05,
      "loss": 0.8158,
      "step": 220
    },
    {
      "epoch": 0.6089309878213802,
      "grad_norm": 0.9167734384536743,
      "learning_rate": 2.9590387197411547e-05,
      "loss": 0.8416,
      "step": 225
    },
    {
      "epoch": 0.6224627875507442,
      "grad_norm": 1.0270551443099976,
      "learning_rate": 2.9558679469088423e-05,
      "loss": 0.8628,
      "step": 230
    },
    {
      "epoch": 0.6359945872801083,
      "grad_norm": 0.8939360976219177,
      "learning_rate": 2.9525808109657485e-05,
      "loss": 0.8487,
      "step": 235
    },
    {
      "epoch": 0.6495263870094723,
      "grad_norm": 0.8669422268867493,
      "learning_rate": 2.949177574642682e-05,
      "loss": 0.8317,
      "step": 240
    },
    {
      "epoch": 0.6630581867388363,
      "grad_norm": 0.7396143674850464,
      "learning_rate": 2.9456585099500036e-05,
      "loss": 0.784,
      "step": 245
    },
    {
      "epoch": 0.6765899864682002,
      "grad_norm": 0.9191597700119019,
      "learning_rate": 2.942023898155885e-05,
      "loss": 0.8148,
      "step": 250
    },
    {
      "epoch": 0.6901217861975643,
      "grad_norm": 1.05917227268219,
      "learning_rate": 2.938274029763826e-05,
      "loss": 0.7824,
      "step": 255
    },
    {
      "epoch": 0.7036535859269283,
      "grad_norm": 0.8561118245124817,
      "learning_rate": 2.934409204489438e-05,
      "loss": 0.8054,
      "step": 260
    },
    {
      "epoch": 0.7171853856562923,
      "grad_norm": 0.7686528563499451,
      "learning_rate": 2.9304297312364865e-05,
      "loss": 0.7765,
      "step": 265
    },
    {
      "epoch": 0.7307171853856563,
      "grad_norm": 0.9166774749755859,
      "learning_rate": 2.926335928072203e-05,
      "loss": 0.7964,
      "step": 270
    },
    {
      "epoch": 0.7442489851150202,
      "grad_norm": 0.9427902102470398,
      "learning_rate": 2.922128122201862e-05,
      "loss": 0.7521,
      "step": 275
    },
    {
      "epoch": 0.7577807848443843,
      "grad_norm": 0.8347809314727783,
      "learning_rate": 2.9178066499426284e-05,
      "loss": 0.7492,
      "step": 280
    },
    {
      "epoch": 0.7713125845737483,
      "grad_norm": 0.9658071994781494,
      "learning_rate": 2.9133718566966773e-05,
      "loss": 0.6946,
      "step": 285
    },
    {
      "epoch": 0.7848443843031123,
      "grad_norm": 0.8596900105476379,
      "learning_rate": 2.9088240969235864e-05,
      "loss": 0.7395,
      "step": 290
    },
    {
      "epoch": 0.7983761840324763,
      "grad_norm": 0.9865032434463501,
      "learning_rate": 2.9041637341120054e-05,
      "loss": 0.7497,
      "step": 295
    },
    {
      "epoch": 0.8119079837618404,
      "grad_norm": 0.8877797722816467,
      "learning_rate": 2.8993911407506037e-05,
      "loss": 0.7162,
      "step": 300
    },
    {
      "epoch": 0.8254397834912043,
      "grad_norm": 0.9064735174179077,
      "learning_rate": 2.8945066982982984e-05,
      "loss": 0.6864,
      "step": 305
    },
    {
      "epoch": 0.8389715832205683,
      "grad_norm": 0.9386357665061951,
      "learning_rate": 2.889510797153764e-05,
      "loss": 0.6957,
      "step": 310
    },
    {
      "epoch": 0.8525033829499323,
      "grad_norm": 1.0564672946929932,
      "learning_rate": 2.8844038366242326e-05,
      "loss": 0.698,
      "step": 315
    },
    {
      "epoch": 0.8660351826792964,
      "grad_norm": 0.9778911471366882,
      "learning_rate": 2.879186224893574e-05,
      "loss": 0.6916,
      "step": 320
    },
    {
      "epoch": 0.8795669824086604,
      "grad_norm": 0.9041974544525146,
      "learning_rate": 2.8738583789896743e-05,
      "loss": 0.6482,
      "step": 325
    },
    {
      "epoch": 0.8930987821380244,
      "grad_norm": 1.026167392730713,
      "learning_rate": 2.8684207247511025e-05,
      "loss": 0.7138,
      "step": 330
    },
    {
      "epoch": 0.9066305818673883,
      "grad_norm": 0.8488349318504333,
      "learning_rate": 2.8628736967930747e-05,
      "loss": 0.6957,
      "step": 335
    },
    {
      "epoch": 0.9201623815967523,
      "grad_norm": 0.9476014375686646,
      "learning_rate": 2.8572177384727167e-05,
      "loss": 0.6485,
      "step": 340
    },
    {
      "epoch": 0.9336941813261164,
      "grad_norm": 0.9190114140510559,
      "learning_rate": 2.8514533018536286e-05,
      "loss": 0.6747,
      "step": 345
    },
    {
      "epoch": 0.9472259810554804,
      "grad_norm": 0.9507735371589661,
      "learning_rate": 2.8455808476697513e-05,
      "loss": 0.6732,
      "step": 350
    },
    {
      "epoch": 0.9607577807848444,
      "grad_norm": 1.0128976106643677,
      "learning_rate": 2.8396008452885426e-05,
      "loss": 0.6633,
      "step": 355
    },
    {
      "epoch": 0.9742895805142084,
      "grad_norm": 1.2091327905654907,
      "learning_rate": 2.8335137726734608e-05,
      "loss": 0.6888,
      "step": 360
    },
    {
      "epoch": 0.9878213802435724,
      "grad_norm": 0.998440682888031,
      "learning_rate": 2.827320116345764e-05,
      "loss": 0.6605,
      "step": 365
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.5365850925445557,
      "learning_rate": 2.821020371345624e-05,
      "loss": 0.6687,
      "step": 370
    },
    {
      "epoch": 1.013531799729364,
      "grad_norm": 1.3685508966445923,
      "learning_rate": 2.8146150411925568e-05,
      "loss": 0.541,
      "step": 375
    },
    {
      "epoch": 1.027063599458728,
      "grad_norm": 1.0007332563400269,
      "learning_rate": 2.8081046378451807e-05,
      "loss": 0.5494,
      "step": 380
    },
    {
      "epoch": 1.040595399188092,
      "grad_norm": 1.3241759538650513,
      "learning_rate": 2.801489681660296e-05,
      "loss": 0.5614,
      "step": 385
    },
    {
      "epoch": 1.054127198917456,
      "grad_norm": 1.0600066184997559,
      "learning_rate": 2.7947707013512936e-05,
      "loss": 0.5606,
      "step": 390
    },
    {
      "epoch": 1.0676589986468201,
      "grad_norm": 0.9737664461135864,
      "learning_rate": 2.7879482339458974e-05,
      "loss": 0.5386,
      "step": 395
    },
    {
      "epoch": 1.0811907983761841,
      "grad_norm": 0.9769577980041504,
      "learning_rate": 2.7810228247432415e-05,
      "loss": 0.5397,
      "step": 400
    },
    {
      "epoch": 1.094722598105548,
      "grad_norm": 0.8770543336868286,
      "learning_rate": 2.7739950272702856e-05,
      "loss": 0.5319,
      "step": 405
    },
    {
      "epoch": 1.108254397834912,
      "grad_norm": 0.9559663534164429,
      "learning_rate": 2.7668654032375733e-05,
      "loss": 0.5952,
      "step": 410
    },
    {
      "epoch": 1.121786197564276,
      "grad_norm": 0.9816983938217163,
      "learning_rate": 2.7596345224943357e-05,
      "loss": 0.5308,
      "step": 415
    },
    {
      "epoch": 1.13531799729364,
      "grad_norm": 1.0468382835388184,
      "learning_rate": 2.7523029629829478e-05,
      "loss": 0.5199,
      "step": 420
    },
    {
      "epoch": 1.148849797023004,
      "grad_norm": 1.1081461906433105,
      "learning_rate": 2.744871310692731e-05,
      "loss": 0.5094,
      "step": 425
    },
    {
      "epoch": 1.162381596752368,
      "grad_norm": 1.0275734663009644,
      "learning_rate": 2.73734015961312e-05,
      "loss": 0.5744,
      "step": 430
    },
    {
      "epoch": 1.175913396481732,
      "grad_norm": 0.9587875604629517,
      "learning_rate": 2.7297101116861862e-05,
      "loss": 0.5095,
      "step": 435
    },
    {
      "epoch": 1.1894451962110961,
      "grad_norm": 1.1627024412155151,
      "learning_rate": 2.721981776758526e-05,
      "loss": 0.4957,
      "step": 440
    },
    {
      "epoch": 1.2029769959404601,
      "grad_norm": 0.9614389538764954,
      "learning_rate": 2.714155772532518e-05,
      "loss": 0.506,
      "step": 445
    },
    {
      "epoch": 1.2165087956698242,
      "grad_norm": 1.0214952230453491,
      "learning_rate": 2.7062327245169506e-05,
      "loss": 0.5174,
      "step": 450
    },
    {
      "epoch": 1.230040595399188,
      "grad_norm": 1.1152983903884888,
      "learning_rate": 2.6982132659770298e-05,
      "loss": 0.5119,
      "step": 455
    },
    {
      "epoch": 1.243572395128552,
      "grad_norm": 0.9631413817405701,
      "learning_rate": 2.6900980378837614e-05,
      "loss": 0.5094,
      "step": 460
    },
    {
      "epoch": 1.257104194857916,
      "grad_norm": 1.333390712738037,
      "learning_rate": 2.6818876888627204e-05,
      "loss": 0.5279,
      "step": 465
    },
    {
      "epoch": 1.27063599458728,
      "grad_norm": 1.1421831846237183,
      "learning_rate": 2.6735828751422117e-05,
      "loss": 0.5038,
      "step": 470
    },
    {
      "epoch": 1.284167794316644,
      "grad_norm": 1.093878149986267,
      "learning_rate": 2.6651842605008142e-05,
      "loss": 0.4847,
      "step": 475
    },
    {
      "epoch": 1.297699594046008,
      "grad_norm": 1.0584564208984375,
      "learning_rate": 2.6566925162143322e-05,
      "loss": 0.463,
      "step": 480
    },
    {
      "epoch": 1.3112313937753721,
      "grad_norm": 1.1284931898117065,
      "learning_rate": 2.6481083210021396e-05,
      "loss": 0.5409,
      "step": 485
    },
    {
      "epoch": 1.3247631935047361,
      "grad_norm": 1.6687848567962646,
      "learning_rate": 2.6394323609729317e-05,
      "loss": 0.5144,
      "step": 490
    },
    {
      "epoch": 1.3382949932341002,
      "grad_norm": 1.1010960340499878,
      "learning_rate": 2.6306653295698885e-05,
      "loss": 0.526,
      "step": 495
    },
    {
      "epoch": 1.3518267929634642,
      "grad_norm": 1.0190247297286987,
      "learning_rate": 2.6218079275152485e-05,
      "loss": 0.4194,
      "step": 500
    },
    {
      "epoch": 1.3653585926928282,
      "grad_norm": 1.2479708194732666,
      "learning_rate": 2.6128608627543012e-05,
      "loss": 0.4678,
      "step": 505
    },
    {
      "epoch": 1.3788903924221922,
      "grad_norm": 1.0126714706420898,
      "learning_rate": 2.6038248503988058e-05,
      "loss": 0.4241,
      "step": 510
    },
    {
      "epoch": 1.3924221921515563,
      "grad_norm": 1.1457223892211914,
      "learning_rate": 2.5947006126698325e-05,
      "loss": 0.4553,
      "step": 515
    },
    {
      "epoch": 1.4059539918809203,
      "grad_norm": 0.9630009531974792,
      "learning_rate": 2.5854888788400384e-05,
      "loss": 0.4664,
      "step": 520
    },
    {
      "epoch": 1.419485791610284,
      "grad_norm": 1.067337989807129,
      "learning_rate": 2.5761903851753783e-05,
      "loss": 0.4379,
      "step": 525
    },
    {
      "epoch": 1.4330175913396481,
      "grad_norm": 1.0295122861862183,
      "learning_rate": 2.5668058748762574e-05,
      "loss": 0.4428,
      "step": 530
    },
    {
      "epoch": 1.4465493910690121,
      "grad_norm": 0.9696286916732788,
      "learning_rate": 2.5573360980181297e-05,
      "loss": 0.4245,
      "step": 535
    },
    {
      "epoch": 1.4600811907983762,
      "grad_norm": 1.2453484535217285,
      "learning_rate": 2.5477818114915477e-05,
      "loss": 0.462,
      "step": 540
    },
    {
      "epoch": 1.4736129905277402,
      "grad_norm": 1.038318395614624,
      "learning_rate": 2.5381437789416643e-05,
      "loss": 0.4367,
      "step": 545
    },
    {
      "epoch": 1.4871447902571042,
      "grad_norm": 1.1128343343734741,
      "learning_rate": 2.5284227707071986e-05,
      "loss": 0.4184,
      "step": 550
    },
    {
      "epoch": 1.5006765899864682,
      "grad_norm": 1.2020708322525024,
      "learning_rate": 2.518619563758864e-05,
      "loss": 0.4334,
      "step": 555
    },
    {
      "epoch": 1.514208389715832,
      "grad_norm": 1.0298750400543213,
      "learning_rate": 2.5087349416372696e-05,
      "loss": 0.4125,
      "step": 560
    },
    {
      "epoch": 1.527740189445196,
      "grad_norm": 1.0778883695602417,
      "learning_rate": 2.49876969439029e-05,
      "loss": 0.4133,
      "step": 565
    },
    {
      "epoch": 1.54127198917456,
      "grad_norm": 1.1158068180084229,
      "learning_rate": 2.4887246185099237e-05,
      "loss": 0.4366,
      "step": 570
    },
    {
      "epoch": 1.5548037889039241,
      "grad_norm": 1.1676713228225708,
      "learning_rate": 2.4786005168686286e-05,
      "loss": 0.436,
      "step": 575
    },
    {
      "epoch": 1.5683355886332881,
      "grad_norm": 1.2565547227859497,
      "learning_rate": 2.4683981986551526e-05,
      "loss": 0.4557,
      "step": 580
    },
    {
      "epoch": 1.5818673883626522,
      "grad_norm": 1.133944034576416,
      "learning_rate": 2.458118479309857e-05,
      "loss": 0.4026,
      "step": 585
    },
    {
      "epoch": 1.5953991880920162,
      "grad_norm": 1.0183407068252563,
      "learning_rate": 2.4477621804595402e-05,
      "loss": 0.4533,
      "step": 590
    },
    {
      "epoch": 1.6089309878213802,
      "grad_norm": 1.26309335231781,
      "learning_rate": 2.4373301298517696e-05,
      "loss": 0.4314,
      "step": 595
    },
    {
      "epoch": 1.6224627875507442,
      "grad_norm": 1.1744800806045532,
      "learning_rate": 2.42682316128872e-05,
      "loss": 0.3831,
      "step": 600
    },
    {
      "epoch": 1.6359945872801083,
      "grad_norm": 1.1486276388168335,
      "learning_rate": 2.4162421145605308e-05,
      "loss": 0.4609,
      "step": 605
    },
    {
      "epoch": 1.6495263870094723,
      "grad_norm": 1.1308343410491943,
      "learning_rate": 2.4055878353781858e-05,
      "loss": 0.3715,
      "step": 610
    },
    {
      "epoch": 1.6630581867388363,
      "grad_norm": 1.048828125,
      "learning_rate": 2.3948611753059155e-05,
      "loss": 0.3978,
      "step": 615
    },
    {
      "epoch": 1.6765899864682003,
      "grad_norm": 1.2727230787277222,
      "learning_rate": 2.3840629916931362e-05,
      "loss": 0.3986,
      "step": 620
    },
    {
      "epoch": 1.6901217861975644,
      "grad_norm": 1.1679140329360962,
      "learning_rate": 2.3731941476059243e-05,
      "loss": 0.3896,
      "step": 625
    },
    {
      "epoch": 1.7036535859269284,
      "grad_norm": 1.1558784246444702,
      "learning_rate": 2.362255511758033e-05,
      "loss": 0.3888,
      "step": 630
    },
    {
      "epoch": 1.7171853856562924,
      "grad_norm": 1.337999939918518,
      "learning_rate": 2.351247958441459e-05,
      "loss": 0.3811,
      "step": 635
    },
    {
      "epoch": 1.7307171853856564,
      "grad_norm": 1.1977120637893677,
      "learning_rate": 2.340172367456564e-05,
      "loss": 0.3987,
      "step": 640
    },
    {
      "epoch": 1.7442489851150202,
      "grad_norm": 1.2168259620666504,
      "learning_rate": 2.3290296240417544e-05,
      "loss": 0.3497,
      "step": 645
    },
    {
      "epoch": 1.7577807848443843,
      "grad_norm": 1.1579980850219727,
      "learning_rate": 2.3178206188027265e-05,
      "loss": 0.3342,
      "step": 650
    },
    {
      "epoch": 1.7713125845737483,
      "grad_norm": 0.9696447253227234,
      "learning_rate": 2.3065462476412825e-05,
      "loss": 0.3763,
      "step": 655
    },
    {
      "epoch": 1.7848443843031123,
      "grad_norm": 1.072059154510498,
      "learning_rate": 2.295207411683725e-05,
      "loss": 0.3668,
      "step": 660
    },
    {
      "epoch": 1.7983761840324763,
      "grad_norm": 1.0907052755355835,
      "learning_rate": 2.283805017208834e-05,
      "loss": 0.369,
      "step": 665
    },
    {
      "epoch": 1.8119079837618404,
      "grad_norm": 1.1626112461090088,
      "learning_rate": 2.2723399755754262e-05,
      "loss": 0.349,
      "step": 670
    },
    {
      "epoch": 1.8254397834912042,
      "grad_norm": 1.0268486738204956,
      "learning_rate": 2.2608132031495184e-05,
      "loss": 0.3214,
      "step": 675
    },
    {
      "epoch": 1.8389715832205682,
      "grad_norm": 1.1764811277389526,
      "learning_rate": 2.2492256212310805e-05,
      "loss": 0.3133,
      "step": 680
    },
    {
      "epoch": 1.8525033829499322,
      "grad_norm": 1.0286613702774048,
      "learning_rate": 2.2375781559804012e-05,
      "loss": 0.3362,
      "step": 685
    },
    {
      "epoch": 1.8660351826792962,
      "grad_norm": 1.2203805446624756,
      "learning_rate": 2.2258717383440632e-05,
      "loss": 0.3344,
      "step": 690
    },
    {
      "epoch": 1.8795669824086603,
      "grad_norm": 1.0343270301818848,
      "learning_rate": 2.2141073039805344e-05,
      "loss": 0.3352,
      "step": 695
    },
    {
      "epoch": 1.8930987821380243,
      "grad_norm": 1.082728385925293,
      "learning_rate": 2.202285793185383e-05,
      "loss": 0.3419,
      "step": 700
    },
    {
      "epoch": 1.9066305818673883,
      "grad_norm": 1.1293714046478271,
      "learning_rate": 2.1904081508161236e-05,
      "loss": 0.3589,
      "step": 705
    },
    {
      "epoch": 1.9201623815967523,
      "grad_norm": 1.090536117553711,
      "learning_rate": 2.1784753262166984e-05,
      "loss": 0.3487,
      "step": 710
    },
    {
      "epoch": 1.9336941813261164,
      "grad_norm": 1.2215139865875244,
      "learning_rate": 2.166488273141597e-05,
      "loss": 0.3917,
      "step": 715
    },
    {
      "epoch": 1.9472259810554804,
      "grad_norm": 1.2397185564041138,
      "learning_rate": 2.1544479496796258e-05,
      "loss": 0.3715,
      "step": 720
    },
    {
      "epoch": 1.9607577807848444,
      "grad_norm": 1.0956600904464722,
      "learning_rate": 2.1423553181773336e-05,
      "loss": 0.3448,
      "step": 725
    },
    {
      "epoch": 1.9742895805142084,
      "grad_norm": 1.1384000778198242,
      "learning_rate": 2.130211345162091e-05,
      "loss": 0.3248,
      "step": 730
    },
    {
      "epoch": 1.9878213802435725,
      "grad_norm": 1.2017817497253418,
      "learning_rate": 2.1180170012648406e-05,
      "loss": 0.3016,
      "step": 735
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.444764256477356,
      "learning_rate": 2.105773261142516e-05,
      "loss": 0.3309,
      "step": 740
    }
  ],
  "logging_steps": 5,
  "max_steps": 1850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0801623899381432e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}