{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9932279909706545, "eval_steps": 500, "global_step": 996, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015048908954100828, "grad_norm": 1.2988319396972656, "learning_rate": 4.9996890990217804e-05, "loss": 2.4707, "num_input_tokens_seen": 5864, "step": 5 }, { "epoch": 0.030097817908201655, "grad_norm": 1.8058427572250366, "learning_rate": 4.9987564734146566e-05, "loss": 2.2509, "num_input_tokens_seen": 11432, "step": 10 }, { "epoch": 0.045146726862302484, "grad_norm": 0.8231738209724426, "learning_rate": 4.997202355141999e-05, "loss": 1.6895, "num_input_tokens_seen": 17000, "step": 15 }, { "epoch": 0.06019563581640331, "grad_norm": 0.7266705632209778, "learning_rate": 4.995027130745321e-05, "loss": 1.4876, "num_input_tokens_seen": 22840, "step": 20 }, { "epoch": 0.07524454477050414, "grad_norm": 1.1722582578659058, "learning_rate": 4.992231341248137e-05, "loss": 1.4812, "num_input_tokens_seen": 28984, "step": 25 }, { "epoch": 0.09029345372460497, "grad_norm": 0.9262341260910034, "learning_rate": 4.9888156820213974e-05, "loss": 1.3642, "num_input_tokens_seen": 34856, "step": 30 }, { "epoch": 0.1053423626787058, "grad_norm": 0.8832902908325195, "learning_rate": 4.9847810026105394e-05, "loss": 1.3651, "num_input_tokens_seen": 41216, "step": 35 }, { "epoch": 0.12039127163280662, "grad_norm": 0.8503655791282654, "learning_rate": 4.980128306524183e-05, "loss": 1.1321, "num_input_tokens_seen": 47304, "step": 40 }, { "epoch": 0.13544018058690746, "grad_norm": 1.348948359489441, "learning_rate": 4.97485875098454e-05, "loss": 1.3012, "num_input_tokens_seen": 53184, "step": 45 }, { "epoch": 0.1504890895410083, "grad_norm": 0.7177269458770752, "learning_rate": 4.968973646639589e-05, "loss": 0.9827, "num_input_tokens_seen": 59024, "step": 50 }, { "epoch": 0.1655379984951091, "grad_norm": 0.6005258560180664, "learning_rate": 4.9624744572370865e-05, "loss": 1.2313, "num_input_tokens_seen": 64816, "step": 55 }, { "epoch": 0.18058690744920994, "grad_norm": 0.6153081059455872, "learning_rate": 4.9553627992605066e-05, "loss": 1.0347, "num_input_tokens_seen": 70848, "step": 60 }, { "epoch": 0.19563581640331076, "grad_norm": 0.7796200513839722, "learning_rate": 4.947640441526989e-05, "loss": 1.0422, "num_input_tokens_seen": 76888, "step": 65 }, { "epoch": 0.2106847253574116, "grad_norm": 0.7273033857345581, "learning_rate": 4.939309304747391e-05, "loss": 0.9996, "num_input_tokens_seen": 82840, "step": 70 }, { "epoch": 0.22573363431151242, "grad_norm": 0.7943289875984192, "learning_rate": 4.930371461048571e-05, "loss": 1.0755, "num_input_tokens_seen": 88824, "step": 75 }, { "epoch": 0.24078254326561324, "grad_norm": 0.6128024458885193, "learning_rate": 4.9208291334580104e-05, "loss": 1.026, "num_input_tokens_seen": 94264, "step": 80 }, { "epoch": 0.2558314522197141, "grad_norm": 0.7087495923042297, "learning_rate": 4.910684695350895e-05, "loss": 1.1307, "num_input_tokens_seen": 99896, "step": 85 }, { "epoch": 0.2708803611738149, "grad_norm": 0.711476743221283, "learning_rate": 4.8999406698598074e-05, "loss": 1.0221, "num_input_tokens_seen": 105640, "step": 90 }, { "epoch": 0.28592927012791575, "grad_norm": 0.5772566795349121, "learning_rate": 4.8885997292471774e-05, "loss": 1.012, "num_input_tokens_seen": 111280, "step": 95 }, { "epoch": 0.3009781790820166, "grad_norm": 0.6769325137138367, "learning_rate": 4.87666469424063e-05, "loss": 1.0151, "num_input_tokens_seen": 116640, "step": 100 }, { "epoch": 0.3160270880361174, "grad_norm": 0.679373025894165, "learning_rate": 4.86413853333141e-05, "loss": 1.0028, "num_input_tokens_seen": 121864, "step": 105 }, { "epoch": 0.3310759969902182, "grad_norm": 0.9181504845619202, "learning_rate": 4.851024362036064e-05, "loss": 1.143, "num_input_tokens_seen": 127384, "step": 110 }, { "epoch": 0.34612490594431905, "grad_norm": 0.7842696905136108, "learning_rate": 4.837325442121538e-05, "loss": 0.9695, "num_input_tokens_seen": 133008, "step": 115 }, { "epoch": 0.3611738148984199, "grad_norm": 0.6459535360336304, "learning_rate": 4.8230451807939135e-05, "loss": 0.9017, "num_input_tokens_seen": 139144, "step": 120 }, { "epoch": 0.3762227238525207, "grad_norm": 0.6695935726165771, "learning_rate": 4.808187129850963e-05, "loss": 1.035, "num_input_tokens_seen": 144848, "step": 125 }, { "epoch": 0.3912716328066215, "grad_norm": 0.9289236664772034, "learning_rate": 4.792754984798745e-05, "loss": 1.0128, "num_input_tokens_seen": 150480, "step": 130 }, { "epoch": 0.40632054176072235, "grad_norm": 0.6192979216575623, "learning_rate": 4.776752583932454e-05, "loss": 0.9432, "num_input_tokens_seen": 156336, "step": 135 }, { "epoch": 0.4213694507148232, "grad_norm": 0.7946303486824036, "learning_rate": 4.760183907381757e-05, "loss": 1.0344, "num_input_tokens_seen": 162440, "step": 140 }, { "epoch": 0.436418359668924, "grad_norm": 0.6548484563827515, "learning_rate": 4.7430530761208494e-05, "loss": 0.9452, "num_input_tokens_seen": 168304, "step": 145 }, { "epoch": 0.45146726862302483, "grad_norm": 0.9075986742973328, "learning_rate": 4.725364350943492e-05, "loss": 0.9559, "num_input_tokens_seen": 173984, "step": 150 }, { "epoch": 0.46651617757712566, "grad_norm": 0.8047800660133362, "learning_rate": 4.707122131403251e-05, "loss": 0.9726, "num_input_tokens_seen": 179896, "step": 155 }, { "epoch": 0.4815650865312265, "grad_norm": 0.6954847574234009, "learning_rate": 4.6883309547192476e-05, "loss": 0.9344, "num_input_tokens_seen": 185296, "step": 160 }, { "epoch": 0.4966139954853273, "grad_norm": 0.7912609577178955, "learning_rate": 4.668995494647653e-05, "loss": 0.9497, "num_input_tokens_seen": 190928, "step": 165 }, { "epoch": 0.5116629044394282, "grad_norm": 0.7360678315162659, "learning_rate": 4.649120560319225e-05, "loss": 1.057, "num_input_tokens_seen": 197352, "step": 170 }, { "epoch": 0.526711813393529, "grad_norm": 0.7325194478034973, "learning_rate": 4.6287110950431865e-05, "loss": 0.9847, "num_input_tokens_seen": 203216, "step": 175 }, { "epoch": 0.5417607223476298, "grad_norm": 0.7140082120895386, "learning_rate": 4.607772175077711e-05, "loss": 1.001, "num_input_tokens_seen": 208624, "step": 180 }, { "epoch": 0.5568096313017307, "grad_norm": 0.9454194903373718, "learning_rate": 4.586309008367359e-05, "loss": 0.9384, "num_input_tokens_seen": 214552, "step": 185 }, { "epoch": 0.5718585402558315, "grad_norm": 0.9370235800743103, "learning_rate": 4.564326933247752e-05, "loss": 1.0312, "num_input_tokens_seen": 220704, "step": 190 }, { "epoch": 0.5869074492099323, "grad_norm": 0.7274216413497925, "learning_rate": 4.541831417117815e-05, "loss": 0.9112, "num_input_tokens_seen": 226480, "step": 195 }, { "epoch": 0.6019563581640331, "grad_norm": 0.9026529788970947, "learning_rate": 4.518828055079925e-05, "loss": 0.9967, "num_input_tokens_seen": 232136, "step": 200 }, { "epoch": 0.617005267118134, "grad_norm": 0.9668667316436768, "learning_rate": 4.4953225685482904e-05, "loss": 1.0905, "num_input_tokens_seen": 238072, "step": 205 }, { "epoch": 0.6320541760722348, "grad_norm": 0.7728851437568665, "learning_rate": 4.471320803825915e-05, "loss": 0.9487, "num_input_tokens_seen": 243680, "step": 210 }, { "epoch": 0.6471030850263356, "grad_norm": 0.7141396999359131, "learning_rate": 4.4468287306505045e-05, "loss": 0.8675, "num_input_tokens_seen": 249376, "step": 215 }, { "epoch": 0.6621519939804364, "grad_norm": 0.7524191737174988, "learning_rate": 4.421852440709666e-05, "loss": 0.8624, "num_input_tokens_seen": 255288, "step": 220 }, { "epoch": 0.6772009029345373, "grad_norm": 1.1502355337142944, "learning_rate": 4.39639814612578e-05, "loss": 1.0489, "num_input_tokens_seen": 261592, "step": 225 }, { "epoch": 0.6922498118886381, "grad_norm": 0.7467320561408997, "learning_rate": 4.370472177910914e-05, "loss": 0.9139, "num_input_tokens_seen": 267192, "step": 230 }, { "epoch": 0.7072987208427389, "grad_norm": 0.6400129795074463, "learning_rate": 4.3440809843921725e-05, "loss": 0.9905, "num_input_tokens_seen": 272712, "step": 235 }, { "epoch": 0.7223476297968398, "grad_norm": 0.6654481291770935, "learning_rate": 4.3172311296078595e-05, "loss": 0.8974, "num_input_tokens_seen": 278720, "step": 240 }, { "epoch": 0.7373965387509406, "grad_norm": 0.7487585544586182, "learning_rate": 4.28992929167487e-05, "loss": 0.999, "num_input_tokens_seen": 284584, "step": 245 }, { "epoch": 0.7524454477050414, "grad_norm": 0.6885581612586975, "learning_rate": 4.2621822611277e-05, "loss": 0.9916, "num_input_tokens_seen": 290408, "step": 250 }, { "epoch": 0.7674943566591422, "grad_norm": 0.774027407169342, "learning_rate": 4.233996939229502e-05, "loss": 0.9242, "num_input_tokens_seen": 295776, "step": 255 }, { "epoch": 0.782543265613243, "grad_norm": 0.8608073592185974, "learning_rate": 4.205380336255594e-05, "loss": 1.0426, "num_input_tokens_seen": 301736, "step": 260 }, { "epoch": 0.7975921745673439, "grad_norm": 0.6539498567581177, "learning_rate": 4.176339569749865e-05, "loss": 0.8625, "num_input_tokens_seen": 307224, "step": 265 }, { "epoch": 0.8126410835214447, "grad_norm": 0.8432996273040771, "learning_rate": 4.1468818627544845e-05, "loss": 0.9959, "num_input_tokens_seen": 313040, "step": 270 }, { "epoch": 0.8276899924755455, "grad_norm": 0.877001166343689, "learning_rate": 4.11701454201339e-05, "loss": 0.939, "num_input_tokens_seen": 319112, "step": 275 }, { "epoch": 0.8427389014296464, "grad_norm": 0.9003238081932068, "learning_rate": 4.08674503614997e-05, "loss": 0.9741, "num_input_tokens_seen": 325040, "step": 280 }, { "epoch": 0.8577878103837472, "grad_norm": 0.8585950136184692, "learning_rate": 4.0560808738194114e-05, "loss": 0.98, "num_input_tokens_seen": 330904, "step": 285 }, { "epoch": 0.872836719337848, "grad_norm": 0.8015385270118713, "learning_rate": 4.0250296818361647e-05, "loss": 0.8898, "num_input_tokens_seen": 336392, "step": 290 }, { "epoch": 0.8878856282919488, "grad_norm": 0.8380082845687866, "learning_rate": 3.993599183277001e-05, "loss": 0.953, "num_input_tokens_seen": 342832, "step": 295 }, { "epoch": 0.9029345372460497, "grad_norm": 0.8890098929405212, "learning_rate": 3.961797195560118e-05, "loss": 0.9311, "num_input_tokens_seen": 348944, "step": 300 }, { "epoch": 0.9179834462001505, "grad_norm": 0.9356483221054077, "learning_rate": 3.9296316285007887e-05, "loss": 0.9114, "num_input_tokens_seen": 354680, "step": 305 }, { "epoch": 0.9330323551542513, "grad_norm": 0.8241044878959656, "learning_rate": 3.897110482344024e-05, "loss": 0.9674, "num_input_tokens_seen": 361008, "step": 310 }, { "epoch": 0.9480812641083521, "grad_norm": 0.7882922887802124, "learning_rate": 3.864241845774746e-05, "loss": 0.9582, "num_input_tokens_seen": 366760, "step": 315 }, { "epoch": 0.963130173062453, "grad_norm": 0.7503064274787903, "learning_rate": 3.8310338939059644e-05, "loss": 0.9863, "num_input_tokens_seen": 372448, "step": 320 }, { "epoch": 0.9781790820165538, "grad_norm": 0.6487952470779419, "learning_rate": 3.797494886245456e-05, "loss": 0.906, "num_input_tokens_seen": 378520, "step": 325 }, { "epoch": 0.9932279909706546, "grad_norm": 0.8584316968917847, "learning_rate": 3.7636331646414524e-05, "loss": 0.8958, "num_input_tokens_seen": 384272, "step": 330 }, { "epoch": 1.0060195635816402, "grad_norm": 0.8825767040252686, "learning_rate": 3.7294571512078506e-05, "loss": 0.8349, "num_input_tokens_seen": 389280, "step": 335 }, { "epoch": 1.021068472535741, "grad_norm": 0.8422874808311462, "learning_rate": 3.694975346229458e-05, "loss": 0.8507, "num_input_tokens_seen": 394944, "step": 340 }, { "epoch": 1.036117381489842, "grad_norm": 0.8337146639823914, "learning_rate": 3.6601963260477924e-05, "loss": 0.9287, "num_input_tokens_seen": 400800, "step": 345 }, { "epoch": 1.0511662904439427, "grad_norm": 0.936469316482544, "learning_rate": 3.625128740927971e-05, "loss": 0.9107, "num_input_tokens_seen": 406728, "step": 350 }, { "epoch": 1.0662151993980435, "grad_norm": 0.8475446105003357, "learning_rate": 3.589781312907207e-05, "loss": 0.952, "num_input_tokens_seen": 412656, "step": 355 }, { "epoch": 1.0812641083521444, "grad_norm": 0.7245047092437744, "learning_rate": 3.55416283362546e-05, "loss": 0.9526, "num_input_tokens_seen": 418488, "step": 360 }, { "epoch": 1.0963130173062452, "grad_norm": 1.0173735618591309, "learning_rate": 3.518282162138772e-05, "loss": 0.8775, "num_input_tokens_seen": 424192, "step": 365 }, { "epoch": 1.111361926260346, "grad_norm": 0.9992531538009644, "learning_rate": 3.482148222715835e-05, "loss": 0.883, "num_input_tokens_seen": 430312, "step": 370 }, { "epoch": 1.1264108352144468, "grad_norm": 1.0938397645950317, "learning_rate": 3.4457700026183374e-05, "loss": 1.0032, "num_input_tokens_seen": 436128, "step": 375 }, { "epoch": 1.141459744168548, "grad_norm": 0.8988808989524841, "learning_rate": 3.409156549865654e-05, "loss": 0.943, "num_input_tokens_seen": 441928, "step": 380 }, { "epoch": 1.1565086531226485, "grad_norm": 0.9952559471130371, "learning_rate": 3.3723169709844026e-05, "loss": 0.801, "num_input_tokens_seen": 447560, "step": 385 }, { "epoch": 1.1715575620767495, "grad_norm": 0.7556662559509277, "learning_rate": 3.335260428743475e-05, "loss": 0.9294, "num_input_tokens_seen": 453296, "step": 390 }, { "epoch": 1.1866064710308502, "grad_norm": 0.8362197279930115, "learning_rate": 3.297996139875055e-05, "loss": 0.9528, "num_input_tokens_seen": 459336, "step": 395 }, { "epoch": 1.2016553799849512, "grad_norm": 0.9389665722846985, "learning_rate": 3.260533372782234e-05, "loss": 0.8981, "num_input_tokens_seen": 464944, "step": 400 }, { "epoch": 1.2167042889390518, "grad_norm": 1.1821860074996948, "learning_rate": 3.222881445233759e-05, "loss": 0.9823, "num_input_tokens_seen": 470992, "step": 405 }, { "epoch": 1.2317531978931529, "grad_norm": 1.0015898942947388, "learning_rate": 3.185049722046516e-05, "loss": 0.9047, "num_input_tokens_seen": 476216, "step": 410 }, { "epoch": 1.2468021068472535, "grad_norm": 0.8765709400177002, "learning_rate": 3.147047612756302e-05, "loss": 0.8582, "num_input_tokens_seen": 481824, "step": 415 }, { "epoch": 1.2618510158013545, "grad_norm": 0.9712916612625122, "learning_rate": 3.10888456927748e-05, "loss": 0.8787, "num_input_tokens_seen": 487576, "step": 420 }, { "epoch": 1.276899924755455, "grad_norm": 1.1555066108703613, "learning_rate": 3.0705700835520895e-05, "loss": 0.8729, "num_input_tokens_seen": 493336, "step": 425 }, { "epoch": 1.2919488337095562, "grad_norm": 1.1198400259017944, "learning_rate": 3.0321136851890036e-05, "loss": 0.8772, "num_input_tokens_seen": 499760, "step": 430 }, { "epoch": 1.3069977426636568, "grad_norm": 1.1468943357467651, "learning_rate": 2.9935249390937183e-05, "loss": 0.9451, "num_input_tokens_seen": 505400, "step": 435 }, { "epoch": 1.3220466516177578, "grad_norm": 0.8468641042709351, "learning_rate": 2.9548134430893604e-05, "loss": 0.8202, "num_input_tokens_seen": 511760, "step": 440 }, { "epoch": 1.3370955605718584, "grad_norm": 1.3206151723861694, "learning_rate": 2.9159888255295116e-05, "loss": 0.9773, "num_input_tokens_seen": 517616, "step": 445 }, { "epoch": 1.3521444695259595, "grad_norm": 1.1996040344238281, "learning_rate": 2.8770607429034352e-05, "loss": 0.9101, "num_input_tokens_seen": 522744, "step": 450 }, { "epoch": 1.36719337848006, "grad_norm": 1.1539313793182373, "learning_rate": 2.8380388774343047e-05, "loss": 0.9633, "num_input_tokens_seen": 528648, "step": 455 }, { "epoch": 1.382242287434161, "grad_norm": 1.021848440170288, "learning_rate": 2.7989329346710375e-05, "loss": 0.8886, "num_input_tokens_seen": 534000, "step": 460 }, { "epoch": 1.3972911963882617, "grad_norm": 0.8612179160118103, "learning_rate": 2.759752641074322e-05, "loss": 0.9258, "num_input_tokens_seen": 539688, "step": 465 }, { "epoch": 1.4123401053423628, "grad_norm": 1.0109293460845947, "learning_rate": 2.7205077415974416e-05, "loss": 0.9039, "num_input_tokens_seen": 545112, "step": 470 }, { "epoch": 1.4273890142964636, "grad_norm": 1.1920832395553589, "learning_rate": 2.6812079972625077e-05, "loss": 1.0116, "num_input_tokens_seen": 551328, "step": 475 }, { "epoch": 1.4424379232505644, "grad_norm": 1.0512142181396484, "learning_rate": 2.6418631827326857e-05, "loss": 0.8218, "num_input_tokens_seen": 556816, "step": 480 }, { "epoch": 1.4574868322046652, "grad_norm": 1.146946907043457, "learning_rate": 2.602483083881035e-05, "loss": 0.8604, "num_input_tokens_seen": 562552, "step": 485 }, { "epoch": 1.472535741158766, "grad_norm": 1.1064790487289429, "learning_rate": 2.563077495356561e-05, "loss": 0.8044, "num_input_tokens_seen": 568480, "step": 490 }, { "epoch": 1.487584650112867, "grad_norm": 0.9678347110748291, "learning_rate": 2.5236562181480794e-05, "loss": 0.9198, "num_input_tokens_seen": 574072, "step": 495 }, { "epoch": 1.5026335590669677, "grad_norm": 0.9460956454277039, "learning_rate": 2.484229057146507e-05, "loss": 0.9181, "num_input_tokens_seen": 580040, "step": 500 }, { "epoch": 1.5176824680210683, "grad_norm": 1.175920844078064, "learning_rate": 2.4448058187061835e-05, "loss": 0.8644, "num_input_tokens_seen": 586128, "step": 505 }, { "epoch": 1.5327313769751694, "grad_norm": 1.2150397300720215, "learning_rate": 2.4053963082058244e-05, "loss": 1.0127, "num_input_tokens_seen": 592256, "step": 510 }, { "epoch": 1.54778028592927, "grad_norm": 0.9520708918571472, "learning_rate": 2.3660103276097232e-05, "loss": 0.7937, "num_input_tokens_seen": 597704, "step": 515 }, { "epoch": 1.562829194883371, "grad_norm": 1.0742231607437134, "learning_rate": 2.3266576730297956e-05, "loss": 0.9806, "num_input_tokens_seen": 603240, "step": 520 }, { "epoch": 1.5778781038374716, "grad_norm": 1.0484352111816406, "learning_rate": 2.2873481322890862e-05, "loss": 0.934, "num_input_tokens_seen": 609616, "step": 525 }, { "epoch": 1.5929270127915727, "grad_norm": 0.8829598426818848, "learning_rate": 2.2480914824873297e-05, "loss": 0.9288, "num_input_tokens_seen": 615520, "step": 530 }, { "epoch": 1.6079759217456733, "grad_norm": 0.9222884178161621, "learning_rate": 2.2088974875691863e-05, "loss": 0.8597, "num_input_tokens_seen": 621208, "step": 535 }, { "epoch": 1.6230248306997743, "grad_norm": 0.894801914691925, "learning_rate": 2.1697758958957448e-05, "loss": 0.8817, "num_input_tokens_seen": 627176, "step": 540 }, { "epoch": 1.6380737396538751, "grad_norm": 1.1703195571899414, "learning_rate": 2.1307364378199005e-05, "loss": 0.777, "num_input_tokens_seen": 633248, "step": 545 }, { "epoch": 1.653122648607976, "grad_norm": 1.0596733093261719, "learning_rate": 2.0917888232662196e-05, "loss": 0.798, "num_input_tokens_seen": 639000, "step": 550 }, { "epoch": 1.6681715575620768, "grad_norm": 1.0426228046417236, "learning_rate": 2.0529427393158705e-05, "loss": 0.9104, "num_input_tokens_seen": 645280, "step": 555 }, { "epoch": 1.6832204665161776, "grad_norm": 1.3300392627716064, "learning_rate": 2.014207847797256e-05, "loss": 0.8293, "num_input_tokens_seen": 651760, "step": 560 }, { "epoch": 1.6982693754702785, "grad_norm": 1.2664028406143188, "learning_rate": 1.9755937828829067e-05, "loss": 0.8821, "num_input_tokens_seen": 657272, "step": 565 }, { "epoch": 1.7133182844243793, "grad_norm": 0.9889734983444214, "learning_rate": 1.937110148693265e-05, "loss": 0.8253, "num_input_tokens_seen": 663336, "step": 570 }, { "epoch": 1.72836719337848, "grad_norm": 1.0789241790771484, "learning_rate": 1.8987665169079454e-05, "loss": 0.9391, "num_input_tokens_seen": 668936, "step": 575 }, { "epoch": 1.743416102332581, "grad_norm": 1.2337504625320435, "learning_rate": 1.8605724243850502e-05, "loss": 0.8711, "num_input_tokens_seen": 675000, "step": 580 }, { "epoch": 1.7584650112866818, "grad_norm": 0.905838668346405, "learning_rate": 1.822537370789163e-05, "loss": 0.8346, "num_input_tokens_seen": 680584, "step": 585 }, { "epoch": 1.7735139202407826, "grad_norm": 1.1633321046829224, "learning_rate": 1.7846708162285785e-05, "loss": 0.8275, "num_input_tokens_seen": 686416, "step": 590 }, { "epoch": 1.7885628291948834, "grad_norm": 0.9946597814559937, "learning_rate": 1.7469821789023815e-05, "loss": 0.9435, "num_input_tokens_seen": 692016, "step": 595 }, { "epoch": 1.8036117381489842, "grad_norm": 1.0259568691253662, "learning_rate": 1.70948083275794e-05, "loss": 0.8584, "num_input_tokens_seen": 697984, "step": 600 }, { "epoch": 1.818660647103085, "grad_norm": 1.0644334554672241, "learning_rate": 1.672176105159417e-05, "loss": 0.88, "num_input_tokens_seen": 704056, "step": 605 }, { "epoch": 1.8337095560571859, "grad_norm": 1.0443474054336548, "learning_rate": 1.635077274567854e-05, "loss": 0.8825, "num_input_tokens_seen": 709760, "step": 610 }, { "epoch": 1.8487584650112867, "grad_norm": 1.0267105102539062, "learning_rate": 1.5981935682334264e-05, "loss": 0.9978, "num_input_tokens_seen": 715872, "step": 615 }, { "epoch": 1.8638073739653875, "grad_norm": 1.3127869367599487, "learning_rate": 1.561534159900441e-05, "loss": 0.9626, "num_input_tokens_seen": 722184, "step": 620 }, { "epoch": 1.8788562829194884, "grad_norm": 1.2093840837478638, "learning_rate": 1.525108167525624e-05, "loss": 0.9308, "num_input_tokens_seen": 727776, "step": 625 }, { "epoch": 1.8939051918735892, "grad_norm": 0.982764482498169, "learning_rate": 1.4889246510103077e-05, "loss": 0.9757, "num_input_tokens_seen": 733760, "step": 630 }, { "epoch": 1.90895410082769, "grad_norm": 1.111680507659912, "learning_rate": 1.4529926099470348e-05, "loss": 0.767, "num_input_tokens_seen": 740024, "step": 635 }, { "epoch": 1.9240030097817908, "grad_norm": 1.218017578125, "learning_rate": 1.4173209813811788e-05, "loss": 0.9272, "num_input_tokens_seen": 745480, "step": 640 }, { "epoch": 1.9390519187358917, "grad_norm": 1.3443623781204224, "learning_rate": 1.381918637588112e-05, "loss": 0.7941, "num_input_tokens_seen": 751384, "step": 645 }, { "epoch": 1.9541008276899925, "grad_norm": 0.9702039361000061, "learning_rate": 1.3467943838664863e-05, "loss": 0.8408, "num_input_tokens_seen": 756920, "step": 650 }, { "epoch": 1.9691497366440933, "grad_norm": 1.1215064525604248, "learning_rate": 1.311956956348177e-05, "loss": 0.8459, "num_input_tokens_seen": 762424, "step": 655 }, { "epoch": 1.9841986455981941, "grad_norm": 1.3830626010894775, "learning_rate": 1.277415019825417e-05, "loss": 1.0117, "num_input_tokens_seen": 768224, "step": 660 }, { "epoch": 1.999247554552295, "grad_norm": 1.028895616531372, "learning_rate": 1.2431771655956925e-05, "loss": 0.9665, "num_input_tokens_seen": 773568, "step": 665 }, { "epoch": 2.0120391271632805, "grad_norm": 1.1555911302566528, "learning_rate": 1.2092519093248988e-05, "loss": 0.7625, "num_input_tokens_seen": 778672, "step": 670 }, { "epoch": 2.0270880361173815, "grad_norm": 1.037429690361023, "learning_rate": 1.1756476889293269e-05, "loss": 0.8667, "num_input_tokens_seen": 784488, "step": 675 }, { "epoch": 2.042136945071482, "grad_norm": 1.053051471710205, "learning_rate": 1.1423728624769695e-05, "loss": 0.8297, "num_input_tokens_seen": 790304, "step": 680 }, { "epoch": 2.057185854025583, "grad_norm": 1.0523649454116821, "learning_rate": 1.1094357061087033e-05, "loss": 0.8774, "num_input_tokens_seen": 796192, "step": 685 }, { "epoch": 2.072234762979684, "grad_norm": 1.0367976427078247, "learning_rate": 1.0768444119798357e-05, "loss": 0.8476, "num_input_tokens_seen": 802144, "step": 690 }, { "epoch": 2.087283671933785, "grad_norm": 1.4130756855010986, "learning_rate": 1.0446070862225463e-05, "loss": 0.8641, "num_input_tokens_seen": 807768, "step": 695 }, { "epoch": 2.1023325808878854, "grad_norm": 1.1584120988845825, "learning_rate": 1.0127317469297277e-05, "loss": 0.8383, "num_input_tokens_seen": 813712, "step": 700 }, { "epoch": 2.1173814898419865, "grad_norm": 1.2318339347839355, "learning_rate": 9.812263221607112e-06, "loss": 0.9123, "num_input_tokens_seen": 819360, "step": 705 }, { "epoch": 2.132430398796087, "grad_norm": 1.6237512826919556, "learning_rate": 9.500986479694036e-06, "loss": 0.9635, "num_input_tokens_seen": 824584, "step": 710 }, { "epoch": 2.147479307750188, "grad_norm": 1.106604814529419, "learning_rate": 9.19356466455287e-06, "loss": 0.9221, "num_input_tokens_seen": 830600, "step": 715 }, { "epoch": 2.1625282167042887, "grad_norm": 0.8615310788154602, "learning_rate": 8.890074238378074e-06, "loss": 0.8757, "num_input_tokens_seen": 836856, "step": 720 }, { "epoch": 2.17757712565839, "grad_norm": 0.8537486791610718, "learning_rate": 8.590590685545946e-06, "loss": 0.7958, "num_input_tokens_seen": 842872, "step": 725 }, { "epoch": 2.1926260346124904, "grad_norm": 0.8556107878684998, "learning_rate": 8.295188493840104e-06, "loss": 0.7993, "num_input_tokens_seen": 848664, "step": 730 }, { "epoch": 2.2076749435665914, "grad_norm": 1.093944787979126, "learning_rate": 8.003941135924858e-06, "loss": 0.8436, "num_input_tokens_seen": 854712, "step": 735 }, { "epoch": 2.222723852520692, "grad_norm": 1.2639975547790527, "learning_rate": 7.71692105107098e-06, "loss": 0.896, "num_input_tokens_seen": 860648, "step": 740 }, { "epoch": 2.237772761474793, "grad_norm": 1.177778720855713, "learning_rate": 7.434199627138602e-06, "loss": 0.8948, "num_input_tokens_seen": 866080, "step": 745 }, { "epoch": 2.2528216704288937, "grad_norm": 0.9701932668685913, "learning_rate": 7.155847182821523e-06, "loss": 0.8546, "num_input_tokens_seen": 871560, "step": 750 }, { "epoch": 2.2678705793829947, "grad_norm": 1.0232161283493042, "learning_rate": 6.881932950157538e-06, "loss": 0.8494, "num_input_tokens_seen": 877568, "step": 755 }, { "epoch": 2.282919488337096, "grad_norm": 1.119441270828247, "learning_rate": 6.612525057308949e-06, "loss": 0.7723, "num_input_tokens_seen": 883808, "step": 760 }, { "epoch": 2.2979683972911964, "grad_norm": 1.5488731861114502, "learning_rate": 6.347690511617693e-06, "loss": 0.9168, "num_input_tokens_seen": 889296, "step": 765 }, { "epoch": 2.313017306245297, "grad_norm": 1.2143895626068115, "learning_rate": 6.0874951829392234e-06, "loss": 0.8831, "num_input_tokens_seen": 895120, "step": 770 }, { "epoch": 2.328066215199398, "grad_norm": 1.157663106918335, "learning_rate": 5.832003787259327e-06, "loss": 0.854, "num_input_tokens_seen": 900320, "step": 775 }, { "epoch": 2.343115124153499, "grad_norm": 1.4496403932571411, "learning_rate": 5.581279870597867e-06, "loss": 0.8843, "num_input_tokens_seen": 905928, "step": 780 }, { "epoch": 2.3581640331075997, "grad_norm": 0.8820686936378479, "learning_rate": 5.335385793203604e-06, "loss": 0.862, "num_input_tokens_seen": 911976, "step": 785 }, { "epoch": 2.3732129420617003, "grad_norm": 1.622916579246521, "learning_rate": 5.094382714043907e-06, "loss": 0.985, "num_input_tokens_seen": 917840, "step": 790 }, { "epoch": 2.3882618510158014, "grad_norm": 1.0603710412979126, "learning_rate": 4.85833057559322e-06, "loss": 0.7679, "num_input_tokens_seen": 923168, "step": 795 }, { "epoch": 2.4033107599699024, "grad_norm": 1.0989526510238647, "learning_rate": 4.627288088924156e-06, "loss": 0.8198, "num_input_tokens_seen": 928720, "step": 800 }, { "epoch": 2.418359668924003, "grad_norm": 0.9745952486991882, "learning_rate": 4.401312719104802e-06, "loss": 0.7773, "num_input_tokens_seen": 934568, "step": 805 }, { "epoch": 2.4334085778781036, "grad_norm": 1.529707670211792, "learning_rate": 4.180460670905978e-06, "loss": 0.9312, "num_input_tokens_seen": 940264, "step": 810 }, { "epoch": 2.4484574868322047, "grad_norm": 1.2537649869918823, "learning_rate": 3.964786874821955e-06, "loss": 0.8497, "num_input_tokens_seen": 946128, "step": 815 }, { "epoch": 2.4635063957863057, "grad_norm": 1.0871232748031616, "learning_rate": 3.754344973408064e-06, "loss": 0.782, "num_input_tokens_seen": 952032, "step": 820 }, { "epoch": 2.4785553047404063, "grad_norm": 1.2940268516540527, "learning_rate": 3.5491873079387256e-06, "loss": 0.8937, "num_input_tokens_seen": 957960, "step": 825 }, { "epoch": 2.493604213694507, "grad_norm": 1.2327598333358765, "learning_rate": 3.3493649053890326e-06, "loss": 0.7039, "num_input_tokens_seen": 964336, "step": 830 }, { "epoch": 2.508653122648608, "grad_norm": 1.516093373298645, "learning_rate": 3.1549274657433375e-06, "loss": 0.9265, "num_input_tokens_seen": 970168, "step": 835 }, { "epoch": 2.523702031602709, "grad_norm": 1.1418204307556152, "learning_rate": 2.9659233496337786e-06, "loss": 0.8669, "num_input_tokens_seen": 975752, "step": 840 }, { "epoch": 2.5387509405568096, "grad_norm": 1.3584462404251099, "learning_rate": 2.7823995663120327e-06, "loss": 0.9174, "num_input_tokens_seen": 981672, "step": 845 }, { "epoch": 2.55379984951091, "grad_norm": 1.1911269426345825, "learning_rate": 2.6044017619571065e-06, "loss": 0.8718, "num_input_tokens_seen": 987560, "step": 850 }, { "epoch": 2.5688487584650113, "grad_norm": 1.3048710823059082, "learning_rate": 2.431974208322191e-06, "loss": 0.8634, "num_input_tokens_seen": 993200, "step": 855 }, { "epoch": 2.5838976674191123, "grad_norm": 1.1356749534606934, "learning_rate": 2.265159791723373e-06, "loss": 0.845, "num_input_tokens_seen": 999192, "step": 860 }, { "epoch": 2.598946576373213, "grad_norm": 1.2655149698257446, "learning_rate": 2.104000002372886e-06, "loss": 0.8008, "num_input_tokens_seen": 1004576, "step": 865 }, { "epoch": 2.6139954853273135, "grad_norm": 1.354706048965454, "learning_rate": 1.9485349240596613e-06, "loss": 0.8797, "num_input_tokens_seen": 1010352, "step": 870 }, { "epoch": 2.6290443942814146, "grad_norm": 1.0957777500152588, "learning_rate": 1.7988032241796376e-06, "loss": 0.946, "num_input_tokens_seen": 1016272, "step": 875 }, { "epoch": 2.6440933032355156, "grad_norm": 1.3322904109954834, "learning_rate": 1.6548421441183875e-06, "loss": 0.8032, "num_input_tokens_seen": 1021896, "step": 880 }, { "epoch": 2.659142212189616, "grad_norm": 1.1363080739974976, "learning_rate": 1.5166874899884053e-06, "loss": 0.8892, "num_input_tokens_seen": 1027704, "step": 885 }, { "epoch": 2.674191121143717, "grad_norm": 1.2706754207611084, "learning_rate": 1.3843736237233784e-06, "loss": 0.856, "num_input_tokens_seen": 1033800, "step": 890 }, { "epoch": 2.689240030097818, "grad_norm": 1.1934438943862915, "learning_rate": 1.2579334545316733e-06, "loss": 0.8617, "num_input_tokens_seen": 1040008, "step": 895 }, { "epoch": 2.704288939051919, "grad_norm": 1.4581674337387085, "learning_rate": 1.137398430711123e-06, "loss": 0.9117, "num_input_tokens_seen": 1046272, "step": 900 }, { "epoch": 2.7193378480060195, "grad_norm": 1.080992579460144, "learning_rate": 1.0227985318271682e-06, "loss": 0.7855, "num_input_tokens_seen": 1052032, "step": 905 }, { "epoch": 2.73438675696012, "grad_norm": 1.0012861490249634, "learning_rate": 9.141622612563571e-07, "loss": 0.8212, "num_input_tokens_seen": 1057584, "step": 910 }, { "epoch": 2.749435665914221, "grad_norm": 1.1472314596176147, "learning_rate": 8.115166390969125e-07, "loss": 0.8404, "num_input_tokens_seen": 1063760, "step": 915 }, { "epoch": 2.764484574868322, "grad_norm": 1.2558523416519165, "learning_rate": 7.148871954483105e-07, "loss": 0.7782, "num_input_tokens_seen": 1069544, "step": 920 }, { "epoch": 2.779533483822423, "grad_norm": 1.1380338668823242, "learning_rate": 6.242979640613933e-07, "loss": 0.7847, "num_input_tokens_seen": 1075472, "step": 925 }, { "epoch": 2.7945823927765234, "grad_norm": 0.972878098487854, "learning_rate": 5.397714763606843e-07, "loss": 0.8857, "num_input_tokens_seen": 1081464, "step": 930 }, { "epoch": 2.8096313017306245, "grad_norm": 1.2546579837799072, "learning_rate": 4.613287558403512e-07, "loss": 0.8029, "num_input_tokens_seen": 1087464, "step": 935 }, { "epoch": 2.8246802106847255, "grad_norm": 1.1165034770965576, "learning_rate": 3.8898931283523344e-07, "loss": 0.8154, "num_input_tokens_seen": 1092888, "step": 940 }, { "epoch": 2.839729119638826, "grad_norm": 1.3924362659454346, "learning_rate": 3.227711396682015e-07, "loss": 0.8791, "num_input_tokens_seen": 1098808, "step": 945 }, { "epoch": 2.854778028592927, "grad_norm": 1.021448016166687, "learning_rate": 2.626907061751116e-07, "loss": 0.787, "num_input_tokens_seen": 1104688, "step": 950 }, { "epoch": 2.869826937547028, "grad_norm": 1.3344382047653198, "learning_rate": 2.0876295560839364e-07, "loss": 0.8831, "num_input_tokens_seen": 1110960, "step": 955 }, { "epoch": 2.884875846501129, "grad_norm": 1.3956490755081177, "learning_rate": 1.6100130092037703e-07, "loss": 0.7677, "num_input_tokens_seen": 1116800, "step": 960 }, { "epoch": 2.8999247554552294, "grad_norm": 1.1644206047058105, "learning_rate": 1.194176214271897e-07, "loss": 0.7567, "num_input_tokens_seen": 1122248, "step": 965 }, { "epoch": 2.9149736644093305, "grad_norm": 1.2540746927261353, "learning_rate": 8.402225985413848e-08, "loss": 0.8944, "num_input_tokens_seen": 1127928, "step": 970 }, { "epoch": 2.930022573363431, "grad_norm": 1.1684881448745728, "learning_rate": 5.4824019763252685e-08, "loss": 0.9737, "num_input_tokens_seen": 1133336, "step": 975 }, { "epoch": 2.945071482317532, "grad_norm": 1.072198510169983, "learning_rate": 3.1830163363655296e-08, "loss": 0.8965, "num_input_tokens_seen": 1139048, "step": 980 }, { "epoch": 2.9601203912716327, "grad_norm": 1.7171086072921753, "learning_rate": 1.504640970531046e-08, "loss": 0.837, "num_input_tokens_seen": 1144456, "step": 985 }, { "epoch": 2.975169300225734, "grad_norm": 1.4984806776046753, "learning_rate": 4.4769332565558485e-09, "loss": 0.7812, "num_input_tokens_seen": 1150160, "step": 990 }, { "epoch": 2.9902182091798344, "grad_norm": 1.2322272062301636, "learning_rate": 1.2436286584982527e-10, "loss": 0.8613, "num_input_tokens_seen": 1156704, "step": 995 }, { "epoch": 2.9932279909706545, "num_input_tokens_seen": 1157808, "step": 996, "total_flos": 1.3788411572404224e+16, "train_loss": 0.939127180590687, "train_runtime": 10484.6402, "train_samples_per_second": 0.761, "train_steps_per_second": 0.095 } ], "logging_steps": 5, "max_steps": 996, "num_input_tokens_seen": 1157808, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3788411572404224e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }