{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.276707530647986, "eval_steps": 100, "global_step": 1300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017513134851138354, "grad_norm": 0.4135197103023529, "learning_rate": 3.6e-05, "loss": 0.8109177589416504, "step": 10 }, { "epoch": 0.03502626970227671, "grad_norm": 0.5954136252403259, "learning_rate": 7.6e-05, "loss": 0.6212304115295411, "step": 20 }, { "epoch": 0.05253940455341506, "grad_norm": 0.4027167856693268, "learning_rate": 0.000116, "loss": 0.44783411026000974, "step": 30 }, { "epoch": 0.07005253940455342, "grad_norm": 0.47371360659599304, "learning_rate": 0.00015600000000000002, "loss": 0.3630207538604736, "step": 40 }, { "epoch": 0.08756567425569177, "grad_norm": 0.48840901255607605, "learning_rate": 0.000196, "loss": 0.32424685955047605, "step": 50 }, { "epoch": 0.10507880910683012, "grad_norm": 0.5532234311103821, "learning_rate": 0.0001989176187612748, "loss": 0.2953991413116455, "step": 60 }, { "epoch": 0.12259194395796848, "grad_norm": 0.5430059432983398, "learning_rate": 0.00019771497294046903, "loss": 0.26429708003997804, "step": 70 }, { "epoch": 0.14010507880910683, "grad_norm": 0.5477070212364197, "learning_rate": 0.00019651232711966328, "loss": 0.2550451040267944, "step": 80 }, { "epoch": 0.15761821366024517, "grad_norm": 0.37017086148262024, "learning_rate": 0.00019530968129885748, "loss": 0.23371753692626954, "step": 90 }, { "epoch": 0.17513134851138354, "grad_norm": 0.38276150822639465, "learning_rate": 0.0001941070354780517, "loss": 0.2195589542388916, "step": 100 }, { "epoch": 0.17513134851138354, "eval_loss": 0.23231205344200134, "eval_runtime": 169.8531, "eval_samples_per_second": 2.991, "eval_steps_per_second": 0.748, "step": 100 }, { "epoch": 0.19264448336252188, "grad_norm": 0.406323105096817, "learning_rate": 0.00019290438965724596, "loss": 0.2108442783355713, "step": 110 }, { "epoch": 0.21015761821366025, "grad_norm": 0.47465822100639343, "learning_rate": 0.00019170174383644018, "loss": 0.2249575138092041, "step": 120 }, { "epoch": 0.2276707530647986, "grad_norm": 0.35268914699554443, "learning_rate": 0.0001904990980156344, "loss": 0.16998076438903809, "step": 130 }, { "epoch": 0.24518388791593695, "grad_norm": 0.31479501724243164, "learning_rate": 0.00018929645219482863, "loss": 0.1623205780982971, "step": 140 }, { "epoch": 0.2626970227670753, "grad_norm": 0.3868594467639923, "learning_rate": 0.00018809380637402286, "loss": 0.16868008375167848, "step": 150 }, { "epoch": 0.28021015761821366, "grad_norm": 0.4887761175632477, "learning_rate": 0.00018689116055321708, "loss": 0.1882340431213379, "step": 160 }, { "epoch": 0.29772329246935203, "grad_norm": 0.39412927627563477, "learning_rate": 0.0001856885147324113, "loss": 0.15920686721801758, "step": 170 }, { "epoch": 0.31523642732049034, "grad_norm": 0.41622865200042725, "learning_rate": 0.00018448586891160553, "loss": 0.16607775688171386, "step": 180 }, { "epoch": 0.3327495621716287, "grad_norm": 0.4045696258544922, "learning_rate": 0.00018328322309079978, "loss": 0.158127498626709, "step": 190 }, { "epoch": 0.3502626970227671, "grad_norm": 0.3789847493171692, "learning_rate": 0.00018208057726999398, "loss": 0.14632443189620972, "step": 200 }, { "epoch": 0.3502626970227671, "eval_loss": 0.1353635936975479, "eval_runtime": 171.8534, "eval_samples_per_second": 2.956, "eval_steps_per_second": 0.739, "step": 200 }, { "epoch": 0.36777583187390545, "grad_norm": 0.41194388270378113, "learning_rate": 0.00018087793144918823, "loss": 0.1293831706047058, "step": 210 }, { "epoch": 0.38528896672504376, "grad_norm": 0.35434651374816895, "learning_rate": 0.00017967528562838245, "loss": 0.13147668838500975, "step": 220 }, { "epoch": 0.4028021015761821, "grad_norm": 0.3050230145454407, "learning_rate": 0.00017847263980757668, "loss": 0.12810969352722168, "step": 230 }, { "epoch": 0.4203152364273205, "grad_norm": 0.29852065443992615, "learning_rate": 0.0001772699939867709, "loss": 0.13389307260513306, "step": 240 }, { "epoch": 0.43782837127845886, "grad_norm": 0.3992239832878113, "learning_rate": 0.00017606734816596513, "loss": 0.11474900245666504, "step": 250 }, { "epoch": 0.4553415061295972, "grad_norm": 0.323345422744751, "learning_rate": 0.00017486470234515935, "loss": 0.11180757284164429, "step": 260 }, { "epoch": 0.47285464098073554, "grad_norm": 0.3820851147174835, "learning_rate": 0.00017366205652435358, "loss": 0.10637552738189697, "step": 270 }, { "epoch": 0.4903677758318739, "grad_norm": 0.3785695433616638, "learning_rate": 0.0001724594107035478, "loss": 0.11243565082550049, "step": 280 }, { "epoch": 0.5078809106830122, "grad_norm": 0.34767481684684753, "learning_rate": 0.00017125676488274205, "loss": 0.11057982444763184, "step": 290 }, { "epoch": 0.5253940455341506, "grad_norm": 0.32242536544799805, "learning_rate": 0.00017005411906193628, "loss": 0.09878214001655579, "step": 300 }, { "epoch": 0.5253940455341506, "eval_loss": 0.10276732593774796, "eval_runtime": 170.0789, "eval_samples_per_second": 2.987, "eval_steps_per_second": 0.747, "step": 300 }, { "epoch": 0.542907180385289, "grad_norm": 0.3188435435295105, "learning_rate": 0.00016885147324113047, "loss": 0.08771577477455139, "step": 310 }, { "epoch": 0.5604203152364273, "grad_norm": 0.2941615879535675, "learning_rate": 0.00016764882742032473, "loss": 0.08557047247886658, "step": 320 }, { "epoch": 0.5779334500875657, "grad_norm": 0.2936120927333832, "learning_rate": 0.00016644618159951895, "loss": 0.08636216521263122, "step": 330 }, { "epoch": 0.5954465849387041, "grad_norm": 0.21349965035915375, "learning_rate": 0.0001652435357787132, "loss": 0.08149101734161376, "step": 340 }, { "epoch": 0.6129597197898424, "grad_norm": 0.2442740797996521, "learning_rate": 0.0001640408899579074, "loss": 0.08436259627342224, "step": 350 }, { "epoch": 0.6304728546409807, "grad_norm": 0.3144635856151581, "learning_rate": 0.00016283824413710162, "loss": 0.0912843644618988, "step": 360 }, { "epoch": 0.647985989492119, "grad_norm": 0.18774041533470154, "learning_rate": 0.00016163559831629587, "loss": 0.08484984040260315, "step": 370 }, { "epoch": 0.6654991243432574, "grad_norm": 0.3200187385082245, "learning_rate": 0.0001604329524954901, "loss": 0.08420997262001037, "step": 380 }, { "epoch": 0.6830122591943958, "grad_norm": 0.20744681358337402, "learning_rate": 0.0001592303066746843, "loss": 0.07883568406105042, "step": 390 }, { "epoch": 0.7005253940455342, "grad_norm": 0.49990326166152954, "learning_rate": 0.00015802766085387855, "loss": 0.07491461634635925, "step": 400 }, { "epoch": 0.7005253940455342, "eval_loss": 0.08543122559785843, "eval_runtime": 169.5964, "eval_samples_per_second": 2.995, "eval_steps_per_second": 0.749, "step": 400 }, { "epoch": 0.7180385288966725, "grad_norm": 0.21963991224765778, "learning_rate": 0.00015682501503307277, "loss": 0.07940490245819092, "step": 410 }, { "epoch": 0.7355516637478109, "grad_norm": 0.282270610332489, "learning_rate": 0.000155622369212267, "loss": 0.08389427065849304, "step": 420 }, { "epoch": 0.7530647985989493, "grad_norm": 0.19522342085838318, "learning_rate": 0.00015441972339146122, "loss": 0.07796943187713623, "step": 430 }, { "epoch": 0.7705779334500875, "grad_norm": 0.20144295692443848, "learning_rate": 0.00015321707757065545, "loss": 0.08569519519805908, "step": 440 }, { "epoch": 0.7880910683012259, "grad_norm": 0.31299343705177307, "learning_rate": 0.0001520144317498497, "loss": 0.07234247326850891, "step": 450 }, { "epoch": 0.8056042031523643, "grad_norm": 0.22233198583126068, "learning_rate": 0.0001508117859290439, "loss": 0.06918607354164123, "step": 460 }, { "epoch": 0.8231173380035026, "grad_norm": 0.3281087577342987, "learning_rate": 0.00014960914010823812, "loss": 0.06424351334571839, "step": 470 }, { "epoch": 0.840630472854641, "grad_norm": 0.23634330928325653, "learning_rate": 0.00014840649428743237, "loss": 0.07089964151382447, "step": 480 }, { "epoch": 0.8581436077057794, "grad_norm": 0.24085308611392975, "learning_rate": 0.0001472038484666266, "loss": 0.07725317478179931, "step": 490 }, { "epoch": 0.8756567425569177, "grad_norm": 0.2506239712238312, "learning_rate": 0.00014600120264582082, "loss": 0.07955536246299744, "step": 500 }, { "epoch": 0.8756567425569177, "eval_loss": 0.07601634413003922, "eval_runtime": 170.4186, "eval_samples_per_second": 2.981, "eval_steps_per_second": 0.745, "step": 500 }, { "epoch": 0.8931698774080561, "grad_norm": 0.30001509189605713, "learning_rate": 0.00014479855682501504, "loss": 0.06071768999099732, "step": 510 }, { "epoch": 0.9106830122591943, "grad_norm": 0.1644354909658432, "learning_rate": 0.00014359591100420927, "loss": 0.07156956791877747, "step": 520 }, { "epoch": 0.9281961471103327, "grad_norm": 0.2289579063653946, "learning_rate": 0.0001423932651834035, "loss": 0.07050368785858155, "step": 530 }, { "epoch": 0.9457092819614711, "grad_norm": 0.3195700943470001, "learning_rate": 0.00014119061936259772, "loss": 0.06230233311653137, "step": 540 }, { "epoch": 0.9632224168126094, "grad_norm": 0.15884605050086975, "learning_rate": 0.00013998797354179194, "loss": 0.06492781639099121, "step": 550 }, { "epoch": 0.9807355516637478, "grad_norm": 0.17338015139102936, "learning_rate": 0.0001387853277209862, "loss": 0.07274928689002991, "step": 560 }, { "epoch": 0.9982486865148862, "grad_norm": 0.18797871470451355, "learning_rate": 0.0001375826819001804, "loss": 0.07553291320800781, "step": 570 }, { "epoch": 1.0157618213660244, "grad_norm": 0.14001163840293884, "learning_rate": 0.00013638003607937464, "loss": 0.04513072073459625, "step": 580 }, { "epoch": 1.0332749562171628, "grad_norm": 0.25820890069007874, "learning_rate": 0.00013517739025856887, "loss": 0.05151134729385376, "step": 590 }, { "epoch": 1.0507880910683012, "grad_norm": 0.2387373149394989, "learning_rate": 0.0001339747444377631, "loss": 0.05233837962150574, "step": 600 }, { "epoch": 1.0507880910683012, "eval_loss": 0.07496609538793564, "eval_runtime": 169.9129, "eval_samples_per_second": 2.99, "eval_steps_per_second": 0.747, "step": 600 }, { "epoch": 1.0683012259194395, "grad_norm": 0.21280422806739807, "learning_rate": 0.00013277209861695731, "loss": 0.04595586657524109, "step": 610 }, { "epoch": 1.085814360770578, "grad_norm": 0.2865266799926758, "learning_rate": 0.00013156945279615154, "loss": 0.04963254630565643, "step": 620 }, { "epoch": 1.1033274956217163, "grad_norm": 0.19880151748657227, "learning_rate": 0.00013036680697534576, "loss": 0.05288234353065491, "step": 630 }, { "epoch": 1.1208406304728546, "grad_norm": 0.25318190455436707, "learning_rate": 0.00012916416115454, "loss": 0.04070430099964142, "step": 640 }, { "epoch": 1.138353765323993, "grad_norm": 0.2229541689157486, "learning_rate": 0.0001279615153337342, "loss": 0.04462625682353973, "step": 650 }, { "epoch": 1.1558669001751314, "grad_norm": 0.15195652842521667, "learning_rate": 0.00012675886951292846, "loss": 0.04568430483341217, "step": 660 }, { "epoch": 1.1733800350262698, "grad_norm": 0.2872307300567627, "learning_rate": 0.0001255562236921227, "loss": 0.04056203365325928, "step": 670 }, { "epoch": 1.1908931698774081, "grad_norm": 0.30495700240135193, "learning_rate": 0.00012435357787131689, "loss": 0.047316303849220274, "step": 680 }, { "epoch": 1.2084063047285465, "grad_norm": 0.1586247980594635, "learning_rate": 0.00012315093205051114, "loss": 0.044099316000938416, "step": 690 }, { "epoch": 1.2259194395796849, "grad_norm": 0.19665417075157166, "learning_rate": 0.00012194828622970536, "loss": 0.04525145888328552, "step": 700 }, { "epoch": 1.2259194395796849, "eval_loss": 0.07472622394561768, "eval_runtime": 169.568, "eval_samples_per_second": 2.996, "eval_steps_per_second": 0.749, "step": 700 }, { "epoch": 1.2434325744308232, "grad_norm": 0.21693575382232666, "learning_rate": 0.00012074564040889957, "loss": 0.04104744493961334, "step": 710 }, { "epoch": 1.2609457092819616, "grad_norm": 0.24825339019298553, "learning_rate": 0.00011954299458809381, "loss": 0.0438425600528717, "step": 720 }, { "epoch": 1.2784588441331, "grad_norm": 0.18047627806663513, "learning_rate": 0.00011834034876728803, "loss": 0.047738096117973326, "step": 730 }, { "epoch": 1.295971978984238, "grad_norm": 0.19772164523601532, "learning_rate": 0.00011713770294648227, "loss": 0.04714350998401642, "step": 740 }, { "epoch": 1.3134851138353765, "grad_norm": 0.22316114604473114, "learning_rate": 0.0001159350571256765, "loss": 0.04388459920883179, "step": 750 }, { "epoch": 1.3309982486865148, "grad_norm": 0.1677238643169403, "learning_rate": 0.00011473241130487071, "loss": 0.04296576082706451, "step": 760 }, { "epoch": 1.3485113835376532, "grad_norm": 0.2544882595539093, "learning_rate": 0.00011352976548406496, "loss": 0.037767985463142396, "step": 770 }, { "epoch": 1.3660245183887916, "grad_norm": 0.17373642325401306, "learning_rate": 0.00011232711966325917, "loss": 0.04673008918762207, "step": 780 }, { "epoch": 1.38353765323993, "grad_norm": 0.23099961876869202, "learning_rate": 0.00011112447384245341, "loss": 0.04906592071056366, "step": 790 }, { "epoch": 1.4010507880910683, "grad_norm": 0.2572455406188965, "learning_rate": 0.00010992182802164763, "loss": 0.04228177070617676, "step": 800 }, { "epoch": 1.4010507880910683, "eval_loss": 0.07377293705940247, "eval_runtime": 169.6978, "eval_samples_per_second": 2.994, "eval_steps_per_second": 0.748, "step": 800 }, { "epoch": 1.4185639229422067, "grad_norm": 0.1933060735464096, "learning_rate": 0.00010871918220084186, "loss": 0.039757218956947324, "step": 810 }, { "epoch": 1.436077057793345, "grad_norm": 0.21861182153224945, "learning_rate": 0.0001075165363800361, "loss": 0.04450837075710297, "step": 820 }, { "epoch": 1.4535901926444834, "grad_norm": 0.27015894651412964, "learning_rate": 0.0001063138905592303, "loss": 0.04501202404499054, "step": 830 }, { "epoch": 1.4711033274956218, "grad_norm": 0.15882235765457153, "learning_rate": 0.00010511124473842453, "loss": 0.040595722198486325, "step": 840 }, { "epoch": 1.4886164623467601, "grad_norm": 0.22079160809516907, "learning_rate": 0.00010390859891761877, "loss": 0.04613872766494751, "step": 850 }, { "epoch": 1.5061295971978983, "grad_norm": 0.26043882966041565, "learning_rate": 0.00010270595309681299, "loss": 0.052975207567214966, "step": 860 }, { "epoch": 1.5236427320490367, "grad_norm": 0.1896980255842209, "learning_rate": 0.00010150330727600723, "loss": 0.04145742654800415, "step": 870 }, { "epoch": 1.541155866900175, "grad_norm": 0.17354312539100647, "learning_rate": 0.00010030066145520146, "loss": 0.04943464994430542, "step": 880 }, { "epoch": 1.5586690017513134, "grad_norm": 0.14007078111171722, "learning_rate": 9.909801563439568e-05, "loss": 0.04217578768730164, "step": 890 }, { "epoch": 1.5761821366024518, "grad_norm": 0.20131802558898926, "learning_rate": 9.78953698135899e-05, "loss": 0.041672542691230774, "step": 900 }, { "epoch": 1.5761821366024518, "eval_loss": 0.07051914185285568, "eval_runtime": 169.9735, "eval_samples_per_second": 2.989, "eval_steps_per_second": 0.747, "step": 900 }, { "epoch": 1.5936952714535901, "grad_norm": 0.22193501889705658, "learning_rate": 9.669272399278413e-05, "loss": 0.04524196684360504, "step": 910 }, { "epoch": 1.6112084063047285, "grad_norm": 0.23595920205116272, "learning_rate": 9.549007817197835e-05, "loss": 0.04126276075839996, "step": 920 }, { "epoch": 1.6287215411558669, "grad_norm": 0.2922545373439789, "learning_rate": 9.428743235117259e-05, "loss": 0.04022812843322754, "step": 930 }, { "epoch": 1.6462346760070052, "grad_norm": 0.23278813064098358, "learning_rate": 9.30847865303668e-05, "loss": 0.04213928878307342, "step": 940 }, { "epoch": 1.6637478108581436, "grad_norm": 0.14974910020828247, "learning_rate": 9.188214070956104e-05, "loss": 0.0363939642906189, "step": 950 }, { "epoch": 1.681260945709282, "grad_norm": 0.1183304563164711, "learning_rate": 9.067949488875526e-05, "loss": 0.04207303524017334, "step": 960 }, { "epoch": 1.6987740805604203, "grad_norm": 0.23170360922813416, "learning_rate": 8.94768490679495e-05, "loss": 0.042323988676071164, "step": 970 }, { "epoch": 1.7162872154115587, "grad_norm": 0.14556758105754852, "learning_rate": 8.827420324714371e-05, "loss": 0.042339283227920535, "step": 980 }, { "epoch": 1.733800350262697, "grad_norm": 0.1421191394329071, "learning_rate": 8.707155742633795e-05, "loss": 0.04450683891773224, "step": 990 }, { "epoch": 1.7513134851138354, "grad_norm": 0.31845614314079285, "learning_rate": 8.586891160553218e-05, "loss": 0.042928069829940796, "step": 1000 }, { "epoch": 1.7513134851138354, "eval_loss": 0.0688522532582283, "eval_runtime": 169.5678, "eval_samples_per_second": 2.996, "eval_steps_per_second": 0.749, "step": 1000 }, { "epoch": 1.7688266199649738, "grad_norm": 0.1398610770702362, "learning_rate": 8.46662657847264e-05, "loss": 0.042378559708595276, "step": 1010 }, { "epoch": 1.7863397548161122, "grad_norm": 0.18888983130455017, "learning_rate": 8.346361996392062e-05, "loss": 0.044092172384262086, "step": 1020 }, { "epoch": 1.8038528896672505, "grad_norm": 0.192138671875, "learning_rate": 8.226097414311485e-05, "loss": 0.03955377042293549, "step": 1030 }, { "epoch": 1.821366024518389, "grad_norm": 0.2001374512910843, "learning_rate": 8.105832832230909e-05, "loss": 0.04774285852909088, "step": 1040 }, { "epoch": 1.8388791593695273, "grad_norm": 0.24916240572929382, "learning_rate": 7.985568250150331e-05, "loss": 0.044192954897880554, "step": 1050 }, { "epoch": 1.8563922942206657, "grad_norm": 0.21104031801223755, "learning_rate": 7.865303668069754e-05, "loss": 0.0387516975402832, "step": 1060 }, { "epoch": 1.873905429071804, "grad_norm": 0.27948206663131714, "learning_rate": 7.745039085989176e-05, "loss": 0.042763397097587585, "step": 1070 }, { "epoch": 1.8914185639229422, "grad_norm": 0.21115849912166595, "learning_rate": 7.6247745039086e-05, "loss": 0.03943166434764862, "step": 1080 }, { "epoch": 1.9089316987740805, "grad_norm": 0.24164821207523346, "learning_rate": 7.504509921828022e-05, "loss": 0.04395500421524048, "step": 1090 }, { "epoch": 1.926444833625219, "grad_norm": 0.14232757687568665, "learning_rate": 7.384245339747445e-05, "loss": 0.03802197575569153, "step": 1100 }, { "epoch": 1.926444833625219, "eval_loss": 0.0663708746433258, "eval_runtime": 170.0427, "eval_samples_per_second": 2.987, "eval_steps_per_second": 0.747, "step": 1100 }, { "epoch": 1.9439579684763573, "grad_norm": 0.20456406474113464, "learning_rate": 7.263980757666867e-05, "loss": 0.04351660311222076, "step": 1110 }, { "epoch": 1.9614711033274956, "grad_norm": 0.28461146354675293, "learning_rate": 7.14371617558629e-05, "loss": 0.04411421418190002, "step": 1120 }, { "epoch": 1.978984238178634, "grad_norm": 0.33428093791007996, "learning_rate": 7.023451593505713e-05, "loss": 0.04533115029335022, "step": 1130 }, { "epoch": 1.9964973730297724, "grad_norm": 0.2965065538883209, "learning_rate": 6.903187011425134e-05, "loss": 0.04683744609355926, "step": 1140 }, { "epoch": 2.0140105078809105, "grad_norm": 0.13189074397087097, "learning_rate": 6.782922429344558e-05, "loss": 0.024469637870788576, "step": 1150 }, { "epoch": 2.031523642732049, "grad_norm": 0.26192790269851685, "learning_rate": 6.662657847263981e-05, "loss": 0.020343032479286195, "step": 1160 }, { "epoch": 2.0490367775831873, "grad_norm": 0.17017051577568054, "learning_rate": 6.542393265183405e-05, "loss": 0.023167347908020018, "step": 1170 }, { "epoch": 2.0665499124343256, "grad_norm": 0.23270311951637268, "learning_rate": 6.422128683102826e-05, "loss": 0.019265547394752502, "step": 1180 }, { "epoch": 2.084063047285464, "grad_norm": 0.17566721141338348, "learning_rate": 6.30186410102225e-05, "loss": 0.020077353715896605, "step": 1190 }, { "epoch": 2.1015761821366024, "grad_norm": 0.21460862457752228, "learning_rate": 6.181599518941672e-05, "loss": 0.020433691143989564, "step": 1200 }, { "epoch": 2.1015761821366024, "eval_loss": 0.0755230188369751, "eval_runtime": 169.6234, "eval_samples_per_second": 2.995, "eval_steps_per_second": 0.749, "step": 1200 }, { "epoch": 2.1190893169877407, "grad_norm": 0.19966909289360046, "learning_rate": 6.061334936861095e-05, "loss": 0.019319312274456026, "step": 1210 }, { "epoch": 2.136602451838879, "grad_norm": 0.19373339414596558, "learning_rate": 5.941070354780517e-05, "loss": 0.022010722756385805, "step": 1220 }, { "epoch": 2.1541155866900175, "grad_norm": 0.19323857128620148, "learning_rate": 5.82080577269994e-05, "loss": 0.021162202954292296, "step": 1230 }, { "epoch": 2.171628721541156, "grad_norm": 0.16135787963867188, "learning_rate": 5.700541190619363e-05, "loss": 0.02209024876356125, "step": 1240 }, { "epoch": 2.189141856392294, "grad_norm": 0.1409604251384735, "learning_rate": 5.580276608538786e-05, "loss": 0.020828820765018463, "step": 1250 }, { "epoch": 2.2066549912434326, "grad_norm": 0.15199248492717743, "learning_rate": 5.460012026458209e-05, "loss": 0.019746646285057068, "step": 1260 }, { "epoch": 2.224168126094571, "grad_norm": 0.1164596751332283, "learning_rate": 5.339747444377631e-05, "loss": 0.02107318639755249, "step": 1270 }, { "epoch": 2.2416812609457093, "grad_norm": 0.14257144927978516, "learning_rate": 5.219482862297054e-05, "loss": 0.018259820342063905, "step": 1280 }, { "epoch": 2.2591943957968477, "grad_norm": 0.1540592759847641, "learning_rate": 5.0992182802164765e-05, "loss": 0.0190964937210083, "step": 1290 }, { "epoch": 2.276707530647986, "grad_norm": 0.2179027795791626, "learning_rate": 4.978953698135899e-05, "loss": 0.020862923562526704, "step": 1300 }, { "epoch": 2.276707530647986, "eval_loss": 0.0765165463089943, "eval_runtime": 170.3828, "eval_samples_per_second": 2.982, "eval_steps_per_second": 0.745, "step": 1300 } ], "logging_steps": 10, "max_steps": 1713, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.0067417630582374e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }