{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9955911823647297, "eval_steps": 500, "global_step": 1869, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008016032064128256, "grad_norm": 1.7998353242874146, "learning_rate": 1.4035087719298246e-06, "loss": 2.8148, "step": 5 }, { "epoch": 0.01603206412825651, "grad_norm": 1.8694814443588257, "learning_rate": 3.157894736842105e-06, "loss": 2.7787, "step": 10 }, { "epoch": 0.02404809619238477, "grad_norm": 2.0035526752471924, "learning_rate": 4.912280701754386e-06, "loss": 2.8341, "step": 15 }, { "epoch": 0.03206412825651302, "grad_norm": 2.0097479820251465, "learning_rate": 6.666666666666667e-06, "loss": 2.7125, "step": 20 }, { "epoch": 0.04008016032064128, "grad_norm": 1.6507200002670288, "learning_rate": 8.421052631578948e-06, "loss": 2.4992, "step": 25 }, { "epoch": 0.04809619238476954, "grad_norm": 2.043926477432251, "learning_rate": 1.017543859649123e-05, "loss": 2.551, "step": 30 }, { "epoch": 0.056112224448897796, "grad_norm": 1.697209119796753, "learning_rate": 1.192982456140351e-05, "loss": 2.525, "step": 35 }, { "epoch": 0.06412825651302605, "grad_norm": 1.7798157930374146, "learning_rate": 1.3684210526315791e-05, "loss": 2.4075, "step": 40 }, { "epoch": 0.07214428857715431, "grad_norm": 1.4598116874694824, "learning_rate": 1.543859649122807e-05, "loss": 2.093, "step": 45 }, { "epoch": 0.08016032064128256, "grad_norm": 1.4842052459716797, "learning_rate": 1.719298245614035e-05, "loss": 1.9538, "step": 50 }, { "epoch": 0.08817635270541083, "grad_norm": 1.1754084825515747, "learning_rate": 1.894736842105263e-05, "loss": 1.7687, "step": 55 }, { "epoch": 0.09619238476953908, "grad_norm": 0.93027663230896, "learning_rate": 1.999993988083788e-05, "loss": 1.6448, "step": 60 }, { "epoch": 0.10420841683366733, "grad_norm": 1.0409395694732666, "learning_rate": 1.999926354856561e-05, "loss": 1.4744, "step": 65 }, { "epoch": 0.11222444889779559, "grad_norm": 0.9661508202552795, "learning_rate": 1.999783578606323e-05, "loss": 1.3396, "step": 70 }, { "epoch": 0.12024048096192384, "grad_norm": 0.6014273166656494, "learning_rate": 1.999565670062504e-05, "loss": 1.2886, "step": 75 }, { "epoch": 0.1282565130260521, "grad_norm": 0.5582892894744873, "learning_rate": 1.9992726456006157e-05, "loss": 1.1745, "step": 80 }, { "epoch": 0.13627254509018036, "grad_norm": 0.4300733804702759, "learning_rate": 1.9989045272410242e-05, "loss": 1.163, "step": 85 }, { "epoch": 0.14428857715430862, "grad_norm": 0.6262882351875305, "learning_rate": 1.9984613426472934e-05, "loss": 1.2483, "step": 90 }, { "epoch": 0.1523046092184369, "grad_norm": 0.3891115188598633, "learning_rate": 1.9979431251241057e-05, "loss": 1.1076, "step": 95 }, { "epoch": 0.16032064128256512, "grad_norm": 3.9172680377960205, "learning_rate": 1.997349913614761e-05, "loss": 1.1621, "step": 100 }, { "epoch": 0.1683366733466934, "grad_norm": 0.3938579857349396, "learning_rate": 1.9966817526982473e-05, "loss": 1.1969, "step": 105 }, { "epoch": 0.17635270541082165, "grad_norm": 0.4831806719303131, "learning_rate": 1.9959386925858944e-05, "loss": 1.1086, "step": 110 }, { "epoch": 0.1843687374749499, "grad_norm": 0.3328356444835663, "learning_rate": 1.9951207891175973e-05, "loss": 1.0541, "step": 115 }, { "epoch": 0.19238476953907815, "grad_norm": 0.5235080122947693, "learning_rate": 1.9942281037576223e-05, "loss": 1.0936, "step": 120 }, { "epoch": 0.20040080160320642, "grad_norm": 0.3490229547023773, "learning_rate": 1.993260703589986e-05, "loss": 1.0498, "step": 125 }, { "epoch": 0.20841683366733466, "grad_norm": 1.1637717485427856, "learning_rate": 1.9922186613134152e-05, "loss": 1.0424, "step": 130 }, { "epoch": 0.21643286573146292, "grad_norm": 0.3831629157066345, "learning_rate": 1.991102055235884e-05, "loss": 1.105, "step": 135 }, { "epoch": 0.22444889779559118, "grad_norm": 0.5610520243644714, "learning_rate": 1.989910969268728e-05, "loss": 1.1341, "step": 140 }, { "epoch": 0.23246492985971945, "grad_norm": 0.5133630037307739, "learning_rate": 1.9886454929203394e-05, "loss": 1.0598, "step": 145 }, { "epoch": 0.24048096192384769, "grad_norm": 0.3511698246002197, "learning_rate": 1.98730572128944e-05, "loss": 0.9851, "step": 150 }, { "epoch": 0.24849699398797595, "grad_norm": 0.4384966790676117, "learning_rate": 1.985891755057935e-05, "loss": 1.0148, "step": 155 }, { "epoch": 0.2565130260521042, "grad_norm": 0.5588643550872803, "learning_rate": 1.984403700483347e-05, "loss": 1.0541, "step": 160 }, { "epoch": 0.26452905811623245, "grad_norm": 0.3638758063316345, "learning_rate": 1.9828416693908305e-05, "loss": 0.8998, "step": 165 }, { "epoch": 0.2725450901803607, "grad_norm": 0.3875952959060669, "learning_rate": 1.9812057791647687e-05, "loss": 0.9024, "step": 170 }, { "epoch": 0.280561122244489, "grad_norm": 0.3295404016971588, "learning_rate": 1.9794961527399518e-05, "loss": 0.9846, "step": 175 }, { "epoch": 0.28857715430861725, "grad_norm": 0.3046537935733795, "learning_rate": 1.9777129185923397e-05, "loss": 0.9746, "step": 180 }, { "epoch": 0.2965931863727455, "grad_norm": 0.41136643290519714, "learning_rate": 1.975856210729405e-05, "loss": 1.0259, "step": 185 }, { "epoch": 0.3046092184368738, "grad_norm": 0.4319979250431061, "learning_rate": 1.9739261686800662e-05, "loss": 1.0129, "step": 190 }, { "epoch": 0.312625250501002, "grad_norm": 0.45283690094947815, "learning_rate": 1.9719229374841978e-05, "loss": 0.9649, "step": 195 }, { "epoch": 0.32064128256513025, "grad_norm": 0.2747628092765808, "learning_rate": 1.9698466676817348e-05, "loss": 0.9863, "step": 200 }, { "epoch": 0.3286573146292585, "grad_norm": 1.5212770700454712, "learning_rate": 1.9676975153013574e-05, "loss": 0.9803, "step": 205 }, { "epoch": 0.3366733466933868, "grad_norm": 0.48161494731903076, "learning_rate": 1.965475641848767e-05, "loss": 1.031, "step": 210 }, { "epoch": 0.34468937875751504, "grad_norm": 0.31916359066963196, "learning_rate": 1.9631812142945473e-05, "loss": 1.0912, "step": 215 }, { "epoch": 0.3527054108216433, "grad_norm": 0.418302059173584, "learning_rate": 1.9608144050616192e-05, "loss": 0.9857, "step": 220 }, { "epoch": 0.36072144288577157, "grad_norm": 0.38761621713638306, "learning_rate": 1.9583753920122822e-05, "loss": 1.0441, "step": 225 }, { "epoch": 0.3687374749498998, "grad_norm": 0.33479398488998413, "learning_rate": 1.9558643584348478e-05, "loss": 0.9453, "step": 230 }, { "epoch": 0.37675350701402804, "grad_norm": 0.36575448513031006, "learning_rate": 1.9532814930298673e-05, "loss": 0.9196, "step": 235 }, { "epoch": 0.3847695390781563, "grad_norm": 0.33325284719467163, "learning_rate": 1.95062698989595e-05, "loss": 1.0024, "step": 240 }, { "epoch": 0.3927855711422846, "grad_norm": 0.3346043825149536, "learning_rate": 1.947901048515176e-05, "loss": 0.9017, "step": 245 }, { "epoch": 0.40080160320641284, "grad_norm": 0.4760594666004181, "learning_rate": 1.9451038737381078e-05, "loss": 0.941, "step": 250 }, { "epoch": 0.4088176352705411, "grad_norm": 0.3105100989341736, "learning_rate": 1.9422356757683946e-05, "loss": 0.9357, "step": 255 }, { "epoch": 0.4168336673346693, "grad_norm": 0.35822534561157227, "learning_rate": 1.939296670146976e-05, "loss": 1.002, "step": 260 }, { "epoch": 0.4248496993987976, "grad_norm": 0.4366324245929718, "learning_rate": 1.936287077735884e-05, "loss": 0.9836, "step": 265 }, { "epoch": 0.43286573146292584, "grad_norm": 0.339522123336792, "learning_rate": 1.9332071247016476e-05, "loss": 0.8653, "step": 270 }, { "epoch": 0.4408817635270541, "grad_norm": 0.31696006655693054, "learning_rate": 1.930057042498293e-05, "loss": 0.9843, "step": 275 }, { "epoch": 0.44889779559118237, "grad_norm": 0.30729272961616516, "learning_rate": 1.926837067849953e-05, "loss": 1.0606, "step": 280 }, { "epoch": 0.45691382765531063, "grad_norm": 0.4165307879447937, "learning_rate": 1.9235474427330783e-05, "loss": 0.9156, "step": 285 }, { "epoch": 0.4649298597194389, "grad_norm": 0.43856576085090637, "learning_rate": 1.9201884143582496e-05, "loss": 0.9984, "step": 290 }, { "epoch": 0.4729458917835671, "grad_norm": 0.333844393491745, "learning_rate": 1.916760235151604e-05, "loss": 0.9388, "step": 295 }, { "epoch": 0.48096192384769537, "grad_norm": 0.34350189566612244, "learning_rate": 1.913263162735862e-05, "loss": 0.9525, "step": 300 }, { "epoch": 0.48897795591182364, "grad_norm": 0.37638700008392334, "learning_rate": 1.909697459910972e-05, "loss": 0.9491, "step": 305 }, { "epoch": 0.4969939879759519, "grad_norm": 0.37804660201072693, "learning_rate": 1.906063394634356e-05, "loss": 0.9613, "step": 310 }, { "epoch": 0.5050100200400801, "grad_norm": 0.39323610067367554, "learning_rate": 1.902361240000778e-05, "loss": 0.9028, "step": 315 }, { "epoch": 0.5130260521042084, "grad_norm": 0.3674936294555664, "learning_rate": 1.8985912742218167e-05, "loss": 1.0493, "step": 320 }, { "epoch": 0.5210420841683366, "grad_norm": 0.432797908782959, "learning_rate": 1.894753780604962e-05, "loss": 1.0113, "step": 325 }, { "epoch": 0.5290581162324649, "grad_norm": 0.33035048842430115, "learning_rate": 1.8908490475323234e-05, "loss": 0.9485, "step": 330 }, { "epoch": 0.5370741482965932, "grad_norm": 0.4390115439891815, "learning_rate": 1.886877368438957e-05, "loss": 0.8867, "step": 335 }, { "epoch": 0.5450901803607214, "grad_norm": 0.3174438178539276, "learning_rate": 1.882839041790818e-05, "loss": 0.8476, "step": 340 }, { "epoch": 0.5531062124248497, "grad_norm": 0.35620829463005066, "learning_rate": 1.8787343710623278e-05, "loss": 0.9164, "step": 345 }, { "epoch": 0.561122244488978, "grad_norm": 0.36334139108657837, "learning_rate": 1.8745636647135693e-05, "loss": 1.0097, "step": 350 }, { "epoch": 0.5691382765531062, "grad_norm": 0.2904795706272125, "learning_rate": 1.870327236167109e-05, "loss": 0.849, "step": 355 }, { "epoch": 0.5771543086172345, "grad_norm": 0.287715882062912, "learning_rate": 1.866025403784439e-05, "loss": 0.9578, "step": 360 }, { "epoch": 0.5851703406813628, "grad_norm": 0.3586503565311432, "learning_rate": 1.8616584908420573e-05, "loss": 0.9323, "step": 365 }, { "epoch": 0.593186372745491, "grad_norm": 0.8753472566604614, "learning_rate": 1.8572268255071718e-05, "loss": 0.903, "step": 370 }, { "epoch": 0.6012024048096193, "grad_norm": 0.5796067118644714, "learning_rate": 1.8527307408130388e-05, "loss": 0.9665, "step": 375 }, { "epoch": 0.6092184368737475, "grad_norm": 0.3357750177383423, "learning_rate": 1.848170574633937e-05, "loss": 0.8183, "step": 380 }, { "epoch": 0.6172344689378757, "grad_norm": 0.3579539954662323, "learning_rate": 1.8435466696597758e-05, "loss": 0.9593, "step": 385 }, { "epoch": 0.625250501002004, "grad_norm": 0.30244457721710205, "learning_rate": 1.8388593733703428e-05, "loss": 0.9102, "step": 390 }, { "epoch": 0.6332665330661322, "grad_norm": 0.37754353880882263, "learning_rate": 1.8341090380091926e-05, "loss": 0.8921, "step": 395 }, { "epoch": 0.6412825651302605, "grad_norm": 0.316244512796402, "learning_rate": 1.8292960205571742e-05, "loss": 0.8915, "step": 400 }, { "epoch": 0.6492985971943888, "grad_norm": 0.3463846445083618, "learning_rate": 1.824420682705606e-05, "loss": 0.921, "step": 405 }, { "epoch": 0.657314629258517, "grad_norm": 0.3335030674934387, "learning_rate": 1.8194833908290933e-05, "loss": 0.8892, "step": 410 }, { "epoch": 0.6653306613226453, "grad_norm": 0.34191393852233887, "learning_rate": 1.814484515957998e-05, "loss": 0.8977, "step": 415 }, { "epoch": 0.6733466933867736, "grad_norm": 0.3235023319721222, "learning_rate": 1.809424433750555e-05, "loss": 0.9092, "step": 420 }, { "epoch": 0.6813627254509018, "grad_norm": 0.36189472675323486, "learning_rate": 1.804303524464643e-05, "loss": 0.8349, "step": 425 }, { "epoch": 0.6893787575150301, "grad_norm": 0.3975241482257843, "learning_rate": 1.799122172929206e-05, "loss": 0.8988, "step": 430 }, { "epoch": 0.6973947895791583, "grad_norm": 0.3317061960697174, "learning_rate": 1.793880768515337e-05, "loss": 1.0006, "step": 435 }, { "epoch": 0.7054108216432866, "grad_norm": 0.380412220954895, "learning_rate": 1.788579705107017e-05, "loss": 0.8576, "step": 440 }, { "epoch": 0.7134268537074149, "grad_norm": 0.3645264804363251, "learning_rate": 1.7832193810715125e-05, "loss": 1.0026, "step": 445 }, { "epoch": 0.7214428857715431, "grad_norm": 0.3171364665031433, "learning_rate": 1.7778001992294426e-05, "loss": 0.8791, "step": 450 }, { "epoch": 0.7294589178356713, "grad_norm": 0.3164263069629669, "learning_rate": 1.772322566824504e-05, "loss": 0.8095, "step": 455 }, { "epoch": 0.7374749498997996, "grad_norm": 0.43192753195762634, "learning_rate": 1.7667868954928695e-05, "loss": 0.962, "step": 460 }, { "epoch": 0.7454909819639278, "grad_norm": 0.3209589421749115, "learning_rate": 1.761193601232254e-05, "loss": 0.858, "step": 465 }, { "epoch": 0.7535070140280561, "grad_norm": 0.3736160397529602, "learning_rate": 1.7555431043706517e-05, "loss": 1.0115, "step": 470 }, { "epoch": 0.7615230460921844, "grad_norm": 0.36832720041275024, "learning_rate": 1.74983582953475e-05, "loss": 0.8106, "step": 475 }, { "epoch": 0.7695390781563126, "grad_norm": 0.3430776596069336, "learning_rate": 1.744072205618019e-05, "loss": 0.9519, "step": 480 }, { "epoch": 0.7775551102204409, "grad_norm": 0.3065486252307892, "learning_rate": 1.7382526657484815e-05, "loss": 0.8079, "step": 485 }, { "epoch": 0.7855711422845691, "grad_norm": 0.33461305499076843, "learning_rate": 1.7323776472561625e-05, "loss": 0.8935, "step": 490 }, { "epoch": 0.7935871743486974, "grad_norm": 0.4995952546596527, "learning_rate": 1.7264475916402264e-05, "loss": 0.885, "step": 495 }, { "epoch": 0.8016032064128257, "grad_norm": 0.29805514216423035, "learning_rate": 1.7204629445357978e-05, "loss": 0.8196, "step": 500 }, { "epoch": 0.8096192384769539, "grad_norm": 0.2961992621421814, "learning_rate": 1.7144241556804724e-05, "loss": 0.8429, "step": 505 }, { "epoch": 0.8176352705410822, "grad_norm": 0.3210907280445099, "learning_rate": 1.7083316788805212e-05, "loss": 1.0272, "step": 510 }, { "epoch": 0.8256513026052105, "grad_norm": 0.34204360842704773, "learning_rate": 1.7021859719767855e-05, "loss": 0.9828, "step": 515 }, { "epoch": 0.8336673346693386, "grad_norm": 0.367639422416687, "learning_rate": 1.6959874968102736e-05, "loss": 1.0168, "step": 520 }, { "epoch": 0.8416833667334669, "grad_norm": 0.393257200717926, "learning_rate": 1.689736719187452e-05, "loss": 0.8997, "step": 525 }, { "epoch": 0.8496993987975952, "grad_norm": 0.35654187202453613, "learning_rate": 1.683434108845241e-05, "loss": 0.9833, "step": 530 }, { "epoch": 0.8577154308617234, "grad_norm": 0.3392307460308075, "learning_rate": 1.677080139415715e-05, "loss": 0.8618, "step": 535 }, { "epoch": 0.8657314629258517, "grad_norm": 0.4089815020561218, "learning_rate": 1.6706752883905107e-05, "loss": 0.8715, "step": 540 }, { "epoch": 0.87374749498998, "grad_norm": 0.340652734041214, "learning_rate": 1.6642200370849427e-05, "loss": 0.9329, "step": 545 }, { "epoch": 0.8817635270541082, "grad_norm": 0.3595132529735565, "learning_rate": 1.657714870601833e-05, "loss": 0.8755, "step": 550 }, { "epoch": 0.8897795591182365, "grad_norm": 0.3543437421321869, "learning_rate": 1.6511602777950585e-05, "loss": 0.9098, "step": 555 }, { "epoch": 0.8977955911823647, "grad_norm": 0.3310873508453369, "learning_rate": 1.6445567512328122e-05, "loss": 0.8433, "step": 560 }, { "epoch": 0.905811623246493, "grad_norm": 0.3629891574382782, "learning_rate": 1.6379047871605897e-05, "loss": 0.9434, "step": 565 }, { "epoch": 0.9138276553106213, "grad_norm": 0.3259185552597046, "learning_rate": 1.6312048854638927e-05, "loss": 0.8551, "step": 570 }, { "epoch": 0.9218436873747495, "grad_norm": 0.4061657786369324, "learning_rate": 1.6244575496306696e-05, "loss": 0.8558, "step": 575 }, { "epoch": 0.9298597194388778, "grad_norm": 0.28805726766586304, "learning_rate": 1.6176632867134738e-05, "loss": 0.8577, "step": 580 }, { "epoch": 0.9378757515030061, "grad_norm": 1.7116715908050537, "learning_rate": 1.610822607291361e-05, "loss": 0.8773, "step": 585 }, { "epoch": 0.9458917835671342, "grad_norm": 0.36534276604652405, "learning_rate": 1.6039360254315213e-05, "loss": 0.9772, "step": 590 }, { "epoch": 0.9539078156312625, "grad_norm": 0.33474084734916687, "learning_rate": 1.597004058650647e-05, "loss": 0.8995, "step": 595 }, { "epoch": 0.9619238476953907, "grad_norm": 0.3544546067714691, "learning_rate": 1.5900272278760407e-05, "loss": 0.8963, "step": 600 }, { "epoch": 0.969939879759519, "grad_norm": 0.8063123822212219, "learning_rate": 1.5830060574064698e-05, "loss": 0.9184, "step": 605 }, { "epoch": 0.9779559118236473, "grad_norm": 0.35473623871803284, "learning_rate": 1.5759410748727663e-05, "loss": 0.8378, "step": 610 }, { "epoch": 0.9859719438877755, "grad_norm": 0.4922749996185303, "learning_rate": 1.5688328111981747e-05, "loss": 0.8228, "step": 615 }, { "epoch": 0.9939879759519038, "grad_norm": 0.3475538492202759, "learning_rate": 1.5616818005584554e-05, "loss": 0.8499, "step": 620 }, { "epoch": 1.0016032064128257, "grad_norm": 0.40670621395111084, "learning_rate": 1.554488580341742e-05, "loss": 0.9474, "step": 625 }, { "epoch": 1.009619238476954, "grad_norm": 0.35096031427383423, "learning_rate": 1.547253691108156e-05, "loss": 0.9408, "step": 630 }, { "epoch": 1.0176352705410823, "grad_norm": 0.477050244808197, "learning_rate": 1.539977676549186e-05, "loss": 0.8426, "step": 635 }, { "epoch": 1.0256513026052103, "grad_norm": 0.3859562575817108, "learning_rate": 1.532661083446829e-05, "loss": 0.8802, "step": 640 }, { "epoch": 1.0336673346693386, "grad_norm": 0.920563280582428, "learning_rate": 1.5253044616325015e-05, "loss": 0.8237, "step": 645 }, { "epoch": 1.0416833667334668, "grad_norm": 0.3554023504257202, "learning_rate": 1.5179083639457193e-05, "loss": 0.8263, "step": 650 }, { "epoch": 1.049699398797595, "grad_norm": 0.3896832764148712, "learning_rate": 1.510473346192554e-05, "loss": 0.9112, "step": 655 }, { "epoch": 1.0577154308617234, "grad_norm": 0.39052242040634155, "learning_rate": 1.5029999671038636e-05, "loss": 0.7501, "step": 660 }, { "epoch": 1.0657314629258516, "grad_norm": 0.9520488977432251, "learning_rate": 1.4954887882933045e-05, "loss": 0.8634, "step": 665 }, { "epoch": 1.07374749498998, "grad_norm": 0.4311390817165375, "learning_rate": 1.4879403742151283e-05, "loss": 1.0027, "step": 670 }, { "epoch": 1.0817635270541082, "grad_norm": 0.29344749450683594, "learning_rate": 1.4803552921217636e-05, "loss": 0.7963, "step": 675 }, { "epoch": 1.0897795591182364, "grad_norm": 0.4940725266933441, "learning_rate": 1.4727341120211869e-05, "loss": 0.864, "step": 680 }, { "epoch": 1.0977955911823647, "grad_norm": 0.36703604459762573, "learning_rate": 1.4650774066340877e-05, "loss": 0.8763, "step": 685 }, { "epoch": 1.105811623246493, "grad_norm": 0.485721617937088, "learning_rate": 1.4573857513508297e-05, "loss": 0.8671, "step": 690 }, { "epoch": 1.1138276553106212, "grad_norm": 0.3963873088359833, "learning_rate": 1.4496597241882113e-05, "loss": 0.8517, "step": 695 }, { "epoch": 1.1218436873747495, "grad_norm": 0.329689085483551, "learning_rate": 1.4418999057460277e-05, "loss": 0.832, "step": 700 }, { "epoch": 1.1298597194388778, "grad_norm": 0.4781339466571808, "learning_rate": 1.4341068791634399e-05, "loss": 0.9939, "step": 705 }, { "epoch": 1.137875751503006, "grad_norm": 0.33180761337280273, "learning_rate": 1.4262812300751528e-05, "loss": 0.8397, "step": 710 }, { "epoch": 1.1458917835671343, "grad_norm": 0.3313557505607605, "learning_rate": 1.4184235465674055e-05, "loss": 0.8507, "step": 715 }, { "epoch": 1.1539078156312625, "grad_norm": 0.4401721954345703, "learning_rate": 1.4105344191337783e-05, "loss": 0.7881, "step": 720 }, { "epoch": 1.1619238476953908, "grad_norm": 0.29335707426071167, "learning_rate": 1.4026144406308155e-05, "loss": 0.8678, "step": 725 }, { "epoch": 1.169939879759519, "grad_norm": 0.2999337911605835, "learning_rate": 1.3946642062334765e-05, "loss": 0.8667, "step": 730 }, { "epoch": 1.1779559118236473, "grad_norm": 0.2895663380622864, "learning_rate": 1.3866843133904064e-05, "loss": 0.8994, "step": 735 }, { "epoch": 1.1859719438877756, "grad_norm": 0.4325989782810211, "learning_rate": 1.3786753617790405e-05, "loss": 0.8374, "step": 740 }, { "epoch": 1.1939879759519039, "grad_norm": 0.4179665148258209, "learning_rate": 1.3706379532605377e-05, "loss": 0.9817, "step": 745 }, { "epoch": 1.2020040080160321, "grad_norm": 0.3351852297782898, "learning_rate": 1.362572691834553e-05, "loss": 0.8651, "step": 750 }, { "epoch": 1.2100200400801604, "grad_norm": 0.3361961245536804, "learning_rate": 1.3544801835938466e-05, "loss": 0.8969, "step": 755 }, { "epoch": 1.2180360721442887, "grad_norm": 0.5419386029243469, "learning_rate": 1.3463610366787392e-05, "loss": 0.8676, "step": 760 }, { "epoch": 1.226052104208417, "grad_norm": 4.783193111419678, "learning_rate": 1.3382158612314075e-05, "loss": 0.8724, "step": 765 }, { "epoch": 1.234068136272545, "grad_norm": 0.3341674208641052, "learning_rate": 1.3300452693500358e-05, "loss": 0.8773, "step": 770 }, { "epoch": 1.2420841683366732, "grad_norm": 0.3183631896972656, "learning_rate": 1.3218498750428164e-05, "loss": 0.9829, "step": 775 }, { "epoch": 1.2501002004008015, "grad_norm": 0.30661240220069885, "learning_rate": 1.3136302941818084e-05, "loss": 0.8996, "step": 780 }, { "epoch": 1.2581162324649298, "grad_norm": 0.31990957260131836, "learning_rate": 1.3053871444566555e-05, "loss": 0.8845, "step": 785 }, { "epoch": 1.266132264529058, "grad_norm": 0.34186115860939026, "learning_rate": 1.2971210453281675e-05, "loss": 0.9203, "step": 790 }, { "epoch": 1.2741482965931863, "grad_norm": 0.3045668601989746, "learning_rate": 1.2888326179817686e-05, "loss": 0.827, "step": 795 }, { "epoch": 1.2821643286573146, "grad_norm": 0.34101662039756775, "learning_rate": 1.2805224852808165e-05, "loss": 0.8177, "step": 800 }, { "epoch": 1.2901803607214428, "grad_norm": 0.3752049505710602, "learning_rate": 1.2721912717197949e-05, "loss": 0.797, "step": 805 }, { "epoch": 1.298196392785571, "grad_norm": 0.35838568210601807, "learning_rate": 1.2638396033773836e-05, "loss": 0.8504, "step": 810 }, { "epoch": 1.3062124248496993, "grad_norm": 0.4186570346355438, "learning_rate": 1.2554681078694104e-05, "loss": 0.9181, "step": 815 }, { "epoch": 1.3142284569138276, "grad_norm": 0.3518775701522827, "learning_rate": 1.2470774143016854e-05, "loss": 0.8941, "step": 820 }, { "epoch": 1.3222444889779559, "grad_norm": 0.37892553210258484, "learning_rate": 1.238668153222725e-05, "loss": 0.8469, "step": 825 }, { "epoch": 1.3302605210420841, "grad_norm": 0.3626463711261749, "learning_rate": 1.230240956576367e-05, "loss": 0.9468, "step": 830 }, { "epoch": 1.3382765531062124, "grad_norm": 0.3972296118736267, "learning_rate": 1.2217964576542829e-05, "loss": 0.9451, "step": 835 }, { "epoch": 1.3462925851703407, "grad_norm": 0.3794783055782318, "learning_rate": 1.2133352910483838e-05, "loss": 0.8176, "step": 840 }, { "epoch": 1.354308617234469, "grad_norm": 0.3861818313598633, "learning_rate": 1.204858092603133e-05, "loss": 0.7841, "step": 845 }, { "epoch": 1.3623246492985972, "grad_norm": 0.30657151341438293, "learning_rate": 1.1963654993677645e-05, "loss": 0.7061, "step": 850 }, { "epoch": 1.3703406813627255, "grad_norm": 0.338156521320343, "learning_rate": 1.1878581495484074e-05, "loss": 0.7851, "step": 855 }, { "epoch": 1.3783567134268537, "grad_norm": 0.36862286925315857, "learning_rate": 1.179336682460128e-05, "loss": 1.0114, "step": 860 }, { "epoch": 1.386372745490982, "grad_norm": 0.3428620398044586, "learning_rate": 1.1708017384788842e-05, "loss": 0.9138, "step": 865 }, { "epoch": 1.3943887775551103, "grad_norm": 0.3690953254699707, "learning_rate": 1.1622539589934027e-05, "loss": 0.9095, "step": 870 }, { "epoch": 1.4024048096192385, "grad_norm": 0.36460062861442566, "learning_rate": 1.153693986356981e-05, "loss": 0.8913, "step": 875 }, { "epoch": 1.4104208416833668, "grad_norm": 0.33620065450668335, "learning_rate": 1.145122463839213e-05, "loss": 0.882, "step": 880 }, { "epoch": 1.418436873747495, "grad_norm": 0.3633309304714203, "learning_rate": 1.1365400355776504e-05, "loss": 0.8295, "step": 885 }, { "epoch": 1.4264529058116233, "grad_norm": 0.3613939881324768, "learning_rate": 1.1279473465293953e-05, "loss": 0.8322, "step": 890 }, { "epoch": 1.4344689378757516, "grad_norm": 0.3712775409221649, "learning_rate": 1.1193450424226333e-05, "loss": 0.9109, "step": 895 }, { "epoch": 1.4424849699398798, "grad_norm": 0.4387439787387848, "learning_rate": 1.1107337697081079e-05, "loss": 0.7593, "step": 900 }, { "epoch": 1.450501002004008, "grad_norm": 0.3387526869773865, "learning_rate": 1.1021141755105408e-05, "loss": 0.834, "step": 905 }, { "epoch": 1.4585170340681364, "grad_norm": 0.37399402260780334, "learning_rate": 1.09348690758e-05, "loss": 0.9176, "step": 910 }, { "epoch": 1.4665330661322646, "grad_norm": 0.4290678799152374, "learning_rate": 1.0848526142432252e-05, "loss": 0.8756, "step": 915 }, { "epoch": 1.474549098196393, "grad_norm": 0.3614942133426666, "learning_rate": 1.0762119443549035e-05, "loss": 0.8709, "step": 920 }, { "epoch": 1.4825651302605212, "grad_norm": 0.3980346620082855, "learning_rate": 1.0675655472489117e-05, "loss": 0.9928, "step": 925 }, { "epoch": 1.4905811623246494, "grad_norm": 4.1907243728637695, "learning_rate": 1.0589140726895179e-05, "loss": 0.7494, "step": 930 }, { "epoch": 1.4985971943887775, "grad_norm": 0.4177466332912445, "learning_rate": 1.0502581708225555e-05, "loss": 0.8649, "step": 935 }, { "epoch": 1.506613226452906, "grad_norm": 0.4919191300868988, "learning_rate": 1.041598492126561e-05, "loss": 0.8504, "step": 940 }, { "epoch": 1.5146292585170342, "grad_norm": 0.33262577652931213, "learning_rate": 1.0329356873638958e-05, "loss": 0.8721, "step": 945 }, { "epoch": 1.5226452905811623, "grad_norm": 0.3962733745574951, "learning_rate": 1.0242704075318402e-05, "loss": 0.9185, "step": 950 }, { "epoch": 1.5306613226452905, "grad_norm": 0.4132327735424042, "learning_rate": 1.0156033038136728e-05, "loss": 0.8198, "step": 955 }, { "epoch": 1.5386773547094188, "grad_norm": 0.36191555857658386, "learning_rate": 1.0069350275297338e-05, "loss": 0.8699, "step": 960 }, { "epoch": 1.546693386773547, "grad_norm": 0.434069961309433, "learning_rate": 9.982662300884813e-06, "loss": 0.8678, "step": 965 }, { "epoch": 1.5547094188376753, "grad_norm": 0.4045700430870056, "learning_rate": 9.89597562937536e-06, "loss": 0.8307, "step": 970 }, { "epoch": 1.5627254509018036, "grad_norm": 0.31574735045433044, "learning_rate": 9.809296775147287e-06, "loss": 0.8058, "step": 975 }, { "epoch": 1.5707414829659319, "grad_norm": 0.38601160049438477, "learning_rate": 9.722632251991445e-06, "loss": 0.8026, "step": 980 }, { "epoch": 1.5787575150300601, "grad_norm": 0.3911404013633728, "learning_rate": 9.635988572621716e-06, "loss": 0.9165, "step": 985 }, { "epoch": 1.5867735470941884, "grad_norm": 0.4252789318561554, "learning_rate": 9.54937224818561e-06, "loss": 0.9078, "step": 990 }, { "epoch": 1.5947895791583167, "grad_norm": 0.32735535502433777, "learning_rate": 9.462789787774944e-06, "loss": 0.8508, "step": 995 }, { "epoch": 1.602805611222445, "grad_norm": 0.3567563593387604, "learning_rate": 9.376247697936719e-06, "loss": 0.8884, "step": 1000 }, { "epoch": 1.6108216432865732, "grad_norm": 0.3730451166629791, "learning_rate": 9.289752482184128e-06, "loss": 0.8712, "step": 1005 }, { "epoch": 1.6188376753507014, "grad_norm": 0.34543997049331665, "learning_rate": 9.20331064050785e-06, "loss": 0.8948, "step": 1010 }, { "epoch": 1.6268537074148297, "grad_norm": 0.3326113522052765, "learning_rate": 9.116928668887587e-06, "loss": 0.8879, "step": 1015 }, { "epoch": 1.6348697394789578, "grad_norm": 0.33069467544555664, "learning_rate": 9.03061305880388e-06, "loss": 0.7994, "step": 1020 }, { "epoch": 1.642885771543086, "grad_norm": 0.34401947259902954, "learning_rate": 8.94437029675031e-06, "loss": 0.9361, "step": 1025 }, { "epoch": 1.6509018036072143, "grad_norm": 0.3548894226551056, "learning_rate": 8.858206863746018e-06, "loss": 0.8401, "step": 1030 }, { "epoch": 1.6589178356713425, "grad_norm": 0.3403220474720001, "learning_rate": 8.772129234848692e-06, "loss": 0.7461, "step": 1035 }, { "epoch": 1.6669338677354708, "grad_norm": 0.4974400997161865, "learning_rate": 8.686143878667965e-06, "loss": 0.8533, "step": 1040 }, { "epoch": 1.674949899799599, "grad_norm": 0.4107082188129425, "learning_rate": 8.600257256879306e-06, "loss": 0.8885, "step": 1045 }, { "epoch": 1.6829659318637273, "grad_norm": 0.3612527847290039, "learning_rate": 8.514475823738431e-06, "loss": 0.835, "step": 1050 }, { "epoch": 1.6909819639278556, "grad_norm": 0.38753876090049744, "learning_rate": 8.428806025596295e-06, "loss": 0.9262, "step": 1055 }, { "epoch": 1.6989979959919839, "grad_norm": 1.5978801250457764, "learning_rate": 8.343254300414629e-06, "loss": 0.8689, "step": 1060 }, { "epoch": 1.7070140280561121, "grad_norm": 0.44398370385169983, "learning_rate": 8.257827077282164e-06, "loss": 0.9272, "step": 1065 }, { "epoch": 1.7150300601202404, "grad_norm": 0.3783067464828491, "learning_rate": 8.172530775931476e-06, "loss": 0.8476, "step": 1070 }, { "epoch": 1.7230460921843687, "grad_norm": 0.33193129301071167, "learning_rate": 8.087371806256548e-06, "loss": 0.8736, "step": 1075 }, { "epoch": 1.731062124248497, "grad_norm": 0.4492955207824707, "learning_rate": 8.002356567831104e-06, "loss": 0.8475, "step": 1080 }, { "epoch": 1.7390781563126252, "grad_norm": 0.3451695740222931, "learning_rate": 7.917491449427664e-06, "loss": 0.8555, "step": 1085 }, { "epoch": 1.7470941883767535, "grad_norm": 0.34094515442848206, "learning_rate": 7.832782828537437e-06, "loss": 0.7514, "step": 1090 }, { "epoch": 1.7551102204408817, "grad_norm": 0.4276813864707947, "learning_rate": 7.748237070891085e-06, "loss": 0.8078, "step": 1095 }, { "epoch": 1.76312625250501, "grad_norm": 0.31638622283935547, "learning_rate": 7.663860529980318e-06, "loss": 0.8424, "step": 1100 }, { "epoch": 1.7711422845691382, "grad_norm": 0.3871704339981079, "learning_rate": 7.5796595465804616e-06, "loss": 0.7848, "step": 1105 }, { "epoch": 1.7791583166332665, "grad_norm": 0.4998142123222351, "learning_rate": 7.495640448273947e-06, "loss": 0.8611, "step": 1110 }, { "epoch": 1.7871743486973948, "grad_norm": 0.35793057084083557, "learning_rate": 7.411809548974792e-06, "loss": 0.9746, "step": 1115 }, { "epoch": 1.795190380761523, "grad_norm": 0.4235399067401886, "learning_rate": 7.328173148454151e-06, "loss": 0.9494, "step": 1120 }, { "epoch": 1.8032064128256513, "grad_norm": 0.35276517271995544, "learning_rate": 7.2447375318668545e-06, "loss": 0.7732, "step": 1125 }, { "epoch": 1.8112224448897796, "grad_norm": 0.2976204752922058, "learning_rate": 7.1615089692791225e-06, "loss": 0.8344, "step": 1130 }, { "epoch": 1.8192384769539078, "grad_norm": 0.3903176784515381, "learning_rate": 7.0784937151973666e-06, "loss": 0.8048, "step": 1135 }, { "epoch": 1.827254509018036, "grad_norm": 0.37830016016960144, "learning_rate": 6.99569800809816e-06, "loss": 0.8156, "step": 1140 }, { "epoch": 1.8352705410821644, "grad_norm": 0.35692963004112244, "learning_rate": 6.9131280699594545e-06, "loss": 0.8472, "step": 1145 }, { "epoch": 1.8432865731462926, "grad_norm": 0.35208597779273987, "learning_rate": 6.8307901057929735e-06, "loss": 0.792, "step": 1150 }, { "epoch": 1.851302605210421, "grad_norm": 0.38317593932151794, "learning_rate": 6.748690303177941e-06, "loss": 0.8337, "step": 1155 }, { "epoch": 1.8593186372745492, "grad_norm": 0.416199654340744, "learning_rate": 6.66683483179608e-06, "loss": 0.9125, "step": 1160 }, { "epoch": 1.8673346693386774, "grad_norm": 0.4500935673713684, "learning_rate": 6.585229842967977e-06, "loss": 0.8482, "step": 1165 }, { "epoch": 1.8753507014028057, "grad_norm": 0.4686901271343231, "learning_rate": 6.5038814691908095e-06, "loss": 0.8502, "step": 1170 }, { "epoch": 1.883366733466934, "grad_norm": 0.41259345412254333, "learning_rate": 6.422795823677515e-06, "loss": 0.8229, "step": 1175 }, { "epoch": 1.8913827655310622, "grad_norm": 0.3402479887008667, "learning_rate": 6.3419789998973655e-06, "loss": 0.787, "step": 1180 }, { "epoch": 1.8993987975951905, "grad_norm": 0.34355294704437256, "learning_rate": 6.261437071118086e-06, "loss": 0.7658, "step": 1185 }, { "epoch": 1.9074148296593187, "grad_norm": 0.3370957374572754, "learning_rate": 6.1811760899494276e-06, "loss": 0.8361, "step": 1190 }, { "epoch": 1.915430861723447, "grad_norm": 0.4238271713256836, "learning_rate": 6.101202087888329e-06, "loss": 0.8249, "step": 1195 }, { "epoch": 1.9234468937875753, "grad_norm": 0.41702085733413696, "learning_rate": 6.0215210748656785e-06, "loss": 0.8571, "step": 1200 }, { "epoch": 1.9314629258517035, "grad_norm": 0.3196205794811249, "learning_rate": 5.942139038794645e-06, "loss": 0.803, "step": 1205 }, { "epoch": 1.9394789579158318, "grad_norm": 0.3203709125518799, "learning_rate": 5.863061945120719e-06, "loss": 0.8299, "step": 1210 }, { "epoch": 1.94749498997996, "grad_norm": 0.4405556917190552, "learning_rate": 5.784295736373413e-06, "loss": 0.7888, "step": 1215 }, { "epoch": 1.9555110220440883, "grad_norm": 0.42714062333106995, "learning_rate": 5.705846331719676e-06, "loss": 0.9376, "step": 1220 }, { "epoch": 1.9635270541082166, "grad_norm": 0.4327429234981537, "learning_rate": 5.627719626519096e-06, "loss": 0.7978, "step": 1225 }, { "epoch": 1.9715430861723446, "grad_norm": 0.37963250279426575, "learning_rate": 5.549921491880856e-06, "loss": 0.8513, "step": 1230 }, { "epoch": 1.979559118236473, "grad_norm": 0.4149099588394165, "learning_rate": 5.472457774222535e-06, "loss": 0.8542, "step": 1235 }, { "epoch": 1.9875751503006012, "grad_norm": 0.34531357884407043, "learning_rate": 5.395334294830766e-06, "loss": 0.858, "step": 1240 }, { "epoch": 1.9955911823647294, "grad_norm": 0.3555367887020111, "learning_rate": 5.318556849423757e-06, "loss": 0.7818, "step": 1245 }, { "epoch": 2.0032064128256515, "grad_norm": 0.47876620292663574, "learning_rate": 5.242131207715768e-06, "loss": 0.9044, "step": 1250 }, { "epoch": 2.0112224448897797, "grad_norm": 0.3445394039154053, "learning_rate": 5.166063112983522e-06, "loss": 0.8851, "step": 1255 }, { "epoch": 2.019238476953908, "grad_norm": 0.34241387248039246, "learning_rate": 5.090358281634594e-06, "loss": 0.7266, "step": 1260 }, { "epoch": 2.0272545090180363, "grad_norm": 0.3980085253715515, "learning_rate": 5.015022402777838e-06, "loss": 0.8228, "step": 1265 }, { "epoch": 2.0352705410821645, "grad_norm": 0.3420424163341522, "learning_rate": 4.940061137795876e-06, "loss": 0.7685, "step": 1270 }, { "epoch": 2.0432865731462924, "grad_norm": 0.41949328780174255, "learning_rate": 4.8654801199196176e-06, "loss": 0.9227, "step": 1275 }, { "epoch": 2.0513026052104206, "grad_norm": 0.4226691722869873, "learning_rate": 4.791284953804969e-06, "loss": 0.7992, "step": 1280 }, { "epoch": 2.059318637274549, "grad_norm": 0.39590245485305786, "learning_rate": 4.717481215111622e-06, "loss": 0.8595, "step": 1285 }, { "epoch": 2.067334669338677, "grad_norm": 0.4448534846305847, "learning_rate": 4.644074450084061e-06, "loss": 0.9019, "step": 1290 }, { "epoch": 2.0753507014028054, "grad_norm": 0.3919767737388611, "learning_rate": 4.571070175134781e-06, "loss": 0.8359, "step": 1295 }, { "epoch": 2.0833667334669337, "grad_norm": 0.4835250675678253, "learning_rate": 4.498473876429727e-06, "loss": 0.8491, "step": 1300 }, { "epoch": 2.091382765531062, "grad_norm": 0.45917949080467224, "learning_rate": 4.426291009476007e-06, "loss": 0.7978, "step": 1305 }, { "epoch": 2.09939879759519, "grad_norm": 0.30657947063446045, "learning_rate": 4.354526998711945e-06, "loss": 0.8031, "step": 1310 }, { "epoch": 2.1074148296593185, "grad_norm": 0.4451172649860382, "learning_rate": 4.283187237099412e-06, "loss": 0.8898, "step": 1315 }, { "epoch": 2.1154308617234467, "grad_norm": 0.4376426637172699, "learning_rate": 4.2122770857185805e-06, "loss": 0.8588, "step": 1320 }, { "epoch": 2.123446893787575, "grad_norm": 0.3958570957183838, "learning_rate": 4.141801873365023e-06, "loss": 0.7816, "step": 1325 }, { "epoch": 2.1314629258517033, "grad_norm": 0.42820340394973755, "learning_rate": 4.0717668961492725e-06, "loss": 0.845, "step": 1330 }, { "epoch": 2.1394789579158315, "grad_norm": 0.4091578423976898, "learning_rate": 4.0021774170988395e-06, "loss": 0.8996, "step": 1335 }, { "epoch": 2.14749498997996, "grad_norm": 0.44767269492149353, "learning_rate": 3.9330386657626696e-06, "loss": 0.7196, "step": 1340 }, { "epoch": 2.155511022044088, "grad_norm": 0.47936999797821045, "learning_rate": 3.864355837818188e-06, "loss": 0.7673, "step": 1345 }, { "epoch": 2.1635270541082163, "grad_norm": 0.3325825035572052, "learning_rate": 3.79613409468083e-06, "loss": 0.7788, "step": 1350 }, { "epoch": 2.1715430861723446, "grad_norm": 0.36496493220329285, "learning_rate": 3.7283785631161663e-06, "loss": 0.8764, "step": 1355 }, { "epoch": 2.179559118236473, "grad_norm": 0.37848904728889465, "learning_rate": 3.6610943348546524e-06, "loss": 0.8016, "step": 1360 }, { "epoch": 2.187575150300601, "grad_norm": 0.3615955412387848, "learning_rate": 3.5942864662089684e-06, "loss": 0.9018, "step": 1365 }, { "epoch": 2.1955911823647294, "grad_norm": 0.45091915130615234, "learning_rate": 3.527959977694061e-06, "loss": 0.8454, "step": 1370 }, { "epoch": 2.2036072144288577, "grad_norm": 0.4201883375644684, "learning_rate": 3.462119853649859e-06, "loss": 0.9691, "step": 1375 }, { "epoch": 2.211623246492986, "grad_norm": 0.3573856055736542, "learning_rate": 3.3967710418666986e-06, "loss": 0.8465, "step": 1380 }, { "epoch": 2.219639278557114, "grad_norm": 0.447793185710907, "learning_rate": 3.331918453213505e-06, "loss": 0.9212, "step": 1385 }, { "epoch": 2.2276553106212424, "grad_norm": 0.4908188581466675, "learning_rate": 3.2675669612687565e-06, "loss": 0.8528, "step": 1390 }, { "epoch": 2.2356713426853707, "grad_norm": 0.44964873790740967, "learning_rate": 3.203721401954242e-06, "loss": 0.9147, "step": 1395 }, { "epoch": 2.243687374749499, "grad_norm": 0.4697072207927704, "learning_rate": 3.1403865731716266e-06, "loss": 0.8588, "step": 1400 }, { "epoch": 2.2517034068136272, "grad_norm": 0.36253607273101807, "learning_rate": 3.0775672344419305e-06, "loss": 0.8024, "step": 1405 }, { "epoch": 2.2597194388777555, "grad_norm": 0.5492545366287231, "learning_rate": 3.0152681065478252e-06, "loss": 0.8588, "step": 1410 }, { "epoch": 2.2677354709418838, "grad_norm": 0.35868972539901733, "learning_rate": 2.953493871178902e-06, "loss": 0.8138, "step": 1415 }, { "epoch": 2.275751503006012, "grad_norm": 0.38911786675453186, "learning_rate": 2.892249170579826e-06, "loss": 0.7942, "step": 1420 }, { "epoch": 2.2837675350701403, "grad_norm": 0.5678386092185974, "learning_rate": 2.8315386072014883e-06, "loss": 0.7764, "step": 1425 }, { "epoch": 2.2917835671342686, "grad_norm": 0.36749324202537537, "learning_rate": 2.7713667433551495e-06, "loss": 0.8247, "step": 1430 }, { "epoch": 2.299799599198397, "grad_norm": 0.4348406493663788, "learning_rate": 2.711738100869563e-06, "loss": 0.9319, "step": 1435 }, { "epoch": 2.307815631262525, "grad_norm": 0.4162181615829468, "learning_rate": 2.652657160751193e-06, "loss": 0.9147, "step": 1440 }, { "epoch": 2.3158316633266534, "grad_norm": 0.36371198296546936, "learning_rate": 2.59412836284745e-06, "loss": 0.8078, "step": 1445 }, { "epoch": 2.3238476953907816, "grad_norm": 0.39601725339889526, "learning_rate": 2.5361561055130625e-06, "loss": 0.8824, "step": 1450 }, { "epoch": 2.33186372745491, "grad_norm": 0.34811967611312866, "learning_rate": 2.4787447452795366e-06, "loss": 0.8163, "step": 1455 }, { "epoch": 2.339879759519038, "grad_norm": 0.405057430267334, "learning_rate": 2.4218985965277676e-06, "loss": 0.8474, "step": 1460 }, { "epoch": 2.3478957915831664, "grad_norm": 0.3901906907558441, "learning_rate": 2.3656219311638194e-06, "loss": 0.771, "step": 1465 }, { "epoch": 2.3559118236472947, "grad_norm": 0.3617786467075348, "learning_rate": 2.3099189782979126e-06, "loss": 0.8022, "step": 1470 }, { "epoch": 2.363927855711423, "grad_norm": 0.3886442184448242, "learning_rate": 2.2547939239265893e-06, "loss": 0.8938, "step": 1475 }, { "epoch": 2.371943887775551, "grad_norm": 0.4365661144256592, "learning_rate": 2.2002509106181625e-06, "loss": 0.8959, "step": 1480 }, { "epoch": 2.3799599198396795, "grad_norm": 0.3170413374900818, "learning_rate": 2.146294037201394e-06, "loss": 0.7777, "step": 1485 }, { "epoch": 2.3879759519038077, "grad_norm": 0.43696296215057373, "learning_rate": 2.092927358457476e-06, "loss": 0.8724, "step": 1490 }, { "epoch": 2.395991983967936, "grad_norm": 0.38862401247024536, "learning_rate": 2.0401548848153296e-06, "loss": 0.8958, "step": 1495 }, { "epoch": 2.4040080160320643, "grad_norm": 0.4901878535747528, "learning_rate": 1.9879805820502176e-06, "loss": 0.903, "step": 1500 }, { "epoch": 2.4120240480961925, "grad_norm": 0.37639907002449036, "learning_rate": 1.9364083709857184e-06, "loss": 0.7387, "step": 1505 }, { "epoch": 2.420040080160321, "grad_norm": 0.34562456607818604, "learning_rate": 1.8854421271990964e-06, "loss": 0.7325, "step": 1510 }, { "epoch": 2.428056112224449, "grad_norm": 0.44138360023498535, "learning_rate": 1.835085680730041e-06, "loss": 0.8788, "step": 1515 }, { "epoch": 2.4360721442885773, "grad_norm": 0.3983897864818573, "learning_rate": 1.785342815792862e-06, "loss": 0.8522, "step": 1520 }, { "epoch": 2.4440881763527056, "grad_norm": 0.40477579832077026, "learning_rate": 1.7362172704920933e-06, "loss": 0.7911, "step": 1525 }, { "epoch": 2.452104208416834, "grad_norm": 0.4484027624130249, "learning_rate": 1.6877127365415924e-06, "loss": 0.8903, "step": 1530 }, { "epoch": 2.460120240480962, "grad_norm": 0.3873910903930664, "learning_rate": 1.6398328589871126e-06, "loss": 0.8499, "step": 1535 }, { "epoch": 2.46813627254509, "grad_norm": 0.4115862548351288, "learning_rate": 1.5925812359323745e-06, "loss": 0.9026, "step": 1540 }, { "epoch": 2.4761523046092186, "grad_norm": 0.3982996344566345, "learning_rate": 1.5459614182686866e-06, "loss": 0.8284, "step": 1545 }, { "epoch": 2.4841683366733465, "grad_norm": 0.3346659541130066, "learning_rate": 1.4999769094080853e-06, "loss": 0.8033, "step": 1550 }, { "epoch": 2.492184368737475, "grad_norm": 0.3832646310329437, "learning_rate": 1.454631165020075e-06, "loss": 0.8919, "step": 1555 }, { "epoch": 2.500200400801603, "grad_norm": 0.7681372761726379, "learning_rate": 1.4099275927719235e-06, "loss": 0.8321, "step": 1560 }, { "epoch": 2.5082164328657317, "grad_norm": 0.415099173784256, "learning_rate": 1.3658695520725984e-06, "loss": 0.8178, "step": 1565 }, { "epoch": 2.5162324649298595, "grad_norm": 0.3748820722103119, "learning_rate": 1.3224603538202929e-06, "loss": 0.8834, "step": 1570 }, { "epoch": 2.5242484969939882, "grad_norm": 0.37983429431915283, "learning_rate": 1.2797032601536342e-06, "loss": 0.8233, "step": 1575 }, { "epoch": 2.532264529058116, "grad_norm": 0.40843164920806885, "learning_rate": 1.2376014842065264e-06, "loss": 0.9251, "step": 1580 }, { "epoch": 2.5402805611222448, "grad_norm": 0.34796783328056335, "learning_rate": 1.1961581898666895e-06, "loss": 0.8777, "step": 1585 }, { "epoch": 2.5482965931863726, "grad_norm": 0.3763981759548187, "learning_rate": 1.1553764915379095e-06, "loss": 0.8709, "step": 1590 }, { "epoch": 2.556312625250501, "grad_norm": 0.7970382571220398, "learning_rate": 1.115259453905978e-06, "loss": 0.8451, "step": 1595 }, { "epoch": 2.564328657314629, "grad_norm": 0.3026469051837921, "learning_rate": 1.075810091708399e-06, "loss": 0.8302, "step": 1600 }, { "epoch": 2.5723446893787574, "grad_norm": 0.3812445104122162, "learning_rate": 1.0370313695078316e-06, "loss": 0.8251, "step": 1605 }, { "epoch": 2.5803607214428856, "grad_norm": 0.36716336011886597, "learning_rate": 9.989262014693013e-07, "loss": 0.8475, "step": 1610 }, { "epoch": 2.588376753507014, "grad_norm": 0.4053398072719574, "learning_rate": 9.614974511412156e-07, "loss": 0.8174, "step": 1615 }, { "epoch": 2.596392785571142, "grad_norm": 0.4070420563220978, "learning_rate": 9.247479312401642e-07, "loss": 0.8769, "step": 1620 }, { "epoch": 2.6044088176352704, "grad_norm": 0.7767839431762695, "learning_rate": 8.88680403439548e-07, "loss": 0.8343, "step": 1625 }, { "epoch": 2.6124248496993987, "grad_norm": 0.411374568939209, "learning_rate": 8.532975781620511e-07, "loss": 0.8021, "step": 1630 }, { "epoch": 2.620440881763527, "grad_norm": 0.32186996936798096, "learning_rate": 8.18602114375947e-07, "loss": 0.8267, "step": 1635 }, { "epoch": 2.6284569138276552, "grad_norm": 0.34994935989379883, "learning_rate": 7.845966193952825e-07, "loss": 0.8122, "step": 1640 }, { "epoch": 2.6364729458917835, "grad_norm": 0.399689644575119, "learning_rate": 7.512836486839492e-07, "loss": 0.7816, "step": 1645 }, { "epoch": 2.6444889779559118, "grad_norm": 0.3521658778190613, "learning_rate": 7.18665705663637e-07, "loss": 0.8229, "step": 1650 }, { "epoch": 2.65250501002004, "grad_norm": 0.46236297488212585, "learning_rate": 6.867452415257081e-07, "loss": 0.7963, "step": 1655 }, { "epoch": 2.6605210420841683, "grad_norm": 0.47568464279174805, "learning_rate": 6.555246550469907e-07, "loss": 0.7728, "step": 1660 }, { "epoch": 2.6685370741482966, "grad_norm": 0.3366374373435974, "learning_rate": 6.250062924095158e-07, "loss": 0.8544, "step": 1665 }, { "epoch": 2.676553106212425, "grad_norm": 0.3431518077850342, "learning_rate": 5.951924470242121e-07, "loss": 0.6777, "step": 1670 }, { "epoch": 2.684569138276553, "grad_norm": 0.3027094602584839, "learning_rate": 5.660853593585458e-07, "loss": 0.7997, "step": 1675 }, { "epoch": 2.6925851703406813, "grad_norm": 0.38628339767456055, "learning_rate": 5.376872167681634e-07, "loss": 0.8907, "step": 1680 }, { "epoch": 2.7006012024048096, "grad_norm": 0.38656118512153625, "learning_rate": 5.10000153332515e-07, "loss": 0.8128, "step": 1685 }, { "epoch": 2.708617234468938, "grad_norm": 0.3081871569156647, "learning_rate": 4.830262496944693e-07, "loss": 0.8044, "step": 1690 }, { "epoch": 2.716633266533066, "grad_norm": 0.3895088732242584, "learning_rate": 4.5676753290397445e-07, "loss": 0.8063, "step": 1695 }, { "epoch": 2.7246492985971944, "grad_norm": 0.3495742082595825, "learning_rate": 4.312259762657145e-07, "loss": 0.8404, "step": 1700 }, { "epoch": 2.7326653306613227, "grad_norm": 0.3835850954055786, "learning_rate": 4.0640349919082056e-07, "loss": 0.9816, "step": 1705 }, { "epoch": 2.740681362725451, "grad_norm": 0.4039825201034546, "learning_rate": 3.8230196705263734e-07, "loss": 0.936, "step": 1710 }, { "epoch": 2.748697394789579, "grad_norm": 0.5234516263008118, "learning_rate": 3.5892319104653294e-07, "loss": 0.831, "step": 1715 }, { "epoch": 2.7567134268537075, "grad_norm": 0.3361075520515442, "learning_rate": 3.3626892805379565e-07, "loss": 0.8202, "step": 1720 }, { "epoch": 2.7647294589178357, "grad_norm": 0.3541311025619507, "learning_rate": 3.1434088050960934e-07, "loss": 0.8253, "step": 1725 }, { "epoch": 2.772745490981964, "grad_norm": 0.36018094420433044, "learning_rate": 2.9314069627511045e-07, "loss": 0.8884, "step": 1730 }, { "epoch": 2.7807615230460923, "grad_norm": 0.47873905301094055, "learning_rate": 2.726699685135603e-07, "loss": 0.8518, "step": 1735 }, { "epoch": 2.7887775551102205, "grad_norm": 0.4113430380821228, "learning_rate": 2.529302355706165e-07, "loss": 0.859, "step": 1740 }, { "epoch": 2.796793587174349, "grad_norm": 0.43893691897392273, "learning_rate": 2.3392298085873288e-07, "loss": 0.8878, "step": 1745 }, { "epoch": 2.804809619238477, "grad_norm": 0.30622348189353943, "learning_rate": 2.1564963274568028e-07, "loss": 0.8205, "step": 1750 }, { "epoch": 2.8128256513026053, "grad_norm": 1.027692198753357, "learning_rate": 1.9811156444720648e-07, "loss": 0.8271, "step": 1755 }, { "epoch": 2.8208416833667336, "grad_norm": 0.3843301832675934, "learning_rate": 1.8131009392384324e-07, "loss": 0.9512, "step": 1760 }, { "epoch": 2.828857715430862, "grad_norm": 0.4416702091693878, "learning_rate": 1.6524648378186125e-07, "loss": 0.8444, "step": 1765 }, { "epoch": 2.83687374749499, "grad_norm": 0.34150800108909607, "learning_rate": 1.49921941178387e-07, "loss": 0.8285, "step": 1770 }, { "epoch": 2.8448897795591184, "grad_norm": 0.4354546368122101, "learning_rate": 1.35337617730692e-07, "loss": 0.8678, "step": 1775 }, { "epoch": 2.8529058116232466, "grad_norm": 0.5867286920547485, "learning_rate": 1.2149460942964097e-07, "loss": 0.9001, "step": 1780 }, { "epoch": 2.860921843687375, "grad_norm": 0.40360739827156067, "learning_rate": 1.0839395655733664e-07, "loss": 0.7915, "step": 1785 }, { "epoch": 2.868937875751503, "grad_norm": 0.4062676727771759, "learning_rate": 9.603664360894327e-08, "loss": 0.9189, "step": 1790 }, { "epoch": 2.8769539078156314, "grad_norm": 0.36837005615234375, "learning_rate": 8.442359921870148e-08, "loss": 0.82, "step": 1795 }, { "epoch": 2.8849699398797597, "grad_norm": 0.34926676750183105, "learning_rate": 7.35556960901429e-08, "loss": 0.7414, "step": 1800 }, { "epoch": 2.8929859719438875, "grad_norm": 0.36959943175315857, "learning_rate": 6.343375093050941e-08, "loss": 0.8122, "step": 1805 }, { "epoch": 2.901002004008016, "grad_norm": 0.3383861184120178, "learning_rate": 5.405852438937764e-08, "loss": 0.9356, "step": 1810 }, { "epoch": 2.909018036072144, "grad_norm": 0.3275286853313446, "learning_rate": 4.543072100149704e-08, "loss": 0.9095, "step": 1815 }, { "epoch": 2.9170340681362728, "grad_norm": 0.35631418228149414, "learning_rate": 3.755098913384325e-08, "loss": 0.8541, "step": 1820 }, { "epoch": 2.9250501002004006, "grad_norm": 0.33556675910949707, "learning_rate": 3.0419920936900494e-08, "loss": 0.7787, "step": 1825 }, { "epoch": 2.9330661322645293, "grad_norm": 0.3943828046321869, "learning_rate": 2.403805230015488e-08, "loss": 0.937, "step": 1830 }, { "epoch": 2.941082164328657, "grad_norm": 0.37884265184402466, "learning_rate": 1.840586281182888e-08, "loss": 0.7455, "step": 1835 }, { "epoch": 2.949098196392786, "grad_norm": 0.4555797576904297, "learning_rate": 1.3523775722834586e-08, "loss": 0.8243, "step": 1840 }, { "epoch": 2.9571142284569136, "grad_norm": 0.392288476228714, "learning_rate": 9.39215791497583e-09, "loss": 0.8212, "step": 1845 }, { "epoch": 2.9651302605210423, "grad_norm": 0.4135635495185852, "learning_rate": 6.011319873370225e-09, "loss": 0.9677, "step": 1850 }, { "epoch": 2.97314629258517, "grad_norm": 0.3438234329223633, "learning_rate": 3.3815156631178404e-09, "loss": 0.8026, "step": 1855 }, { "epoch": 2.981162324649299, "grad_norm": 0.3370349109172821, "learning_rate": 1.502942910212024e-09, "loss": 0.812, "step": 1860 }, { "epoch": 2.9891783567134267, "grad_norm": 0.3954819142818451, "learning_rate": 3.757427866846186e-10, "loss": 0.8795, "step": 1865 } ], "logging_steps": 5, "max_steps": 1869, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.163374755613706e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }