{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999162712810494, "eval_steps": 500, "global_step": 895, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011163829193413341, "grad_norm": 0.3974737008844776, "learning_rate": 2.2222222222222225e-06, "loss": 1.607, "step": 1 }, { "epoch": 0.0055819145967066705, "grad_norm": 0.4252789938746273, "learning_rate": 1.1111111111111112e-05, "loss": 1.5942, "step": 5 }, { "epoch": 0.011163829193413341, "grad_norm": 0.4658525758416883, "learning_rate": 2.2222222222222223e-05, "loss": 1.5877, "step": 10 }, { "epoch": 0.01674574379012001, "grad_norm": 0.27282017063503095, "learning_rate": 3.3333333333333335e-05, "loss": 1.5695, "step": 15 }, { "epoch": 0.022327658386826682, "grad_norm": 0.24165395076839943, "learning_rate": 4.4444444444444447e-05, "loss": 1.558, "step": 20 }, { "epoch": 0.027909572983533353, "grad_norm": 0.1767403193777301, "learning_rate": 5.555555555555556e-05, "loss": 1.4678, "step": 25 }, { "epoch": 0.03349148758024002, "grad_norm": 0.16356442786177314, "learning_rate": 6.666666666666667e-05, "loss": 1.467, "step": 30 }, { "epoch": 0.039073402176946694, "grad_norm": 0.15556520577978836, "learning_rate": 7.777777777777778e-05, "loss": 1.429, "step": 35 }, { "epoch": 0.044655316773653364, "grad_norm": 0.1263609432879071, "learning_rate": 8.888888888888889e-05, "loss": 1.4253, "step": 40 }, { "epoch": 0.050237231370360035, "grad_norm": 0.1696978939183065, "learning_rate": 0.0001, "loss": 1.3895, "step": 45 }, { "epoch": 0.055819145967066705, "grad_norm": 0.10830406775154863, "learning_rate": 0.00011111111111111112, "loss": 1.3645, "step": 50 }, { "epoch": 0.061401060563773376, "grad_norm": 0.08414898733986972, "learning_rate": 0.00012222222222222224, "loss": 1.3082, "step": 55 }, { "epoch": 0.06698297516048005, "grad_norm": 0.07973185533121883, "learning_rate": 0.00013333333333333334, "loss": 1.2962, "step": 60 }, { "epoch": 0.07256488975718671, "grad_norm": 0.09811845100733502, "learning_rate": 0.00014444444444444444, "loss": 1.3061, "step": 65 }, { "epoch": 0.07814680435389339, "grad_norm": 0.08298371354138047, "learning_rate": 0.00015555555555555556, "loss": 1.3017, "step": 70 }, { "epoch": 0.08372871895060005, "grad_norm": 0.07510078793315819, "learning_rate": 0.0001666666666666667, "loss": 1.2989, "step": 75 }, { "epoch": 0.08931063354730673, "grad_norm": 0.07085309149624731, "learning_rate": 0.00017777777777777779, "loss": 1.2787, "step": 80 }, { "epoch": 0.09489254814401339, "grad_norm": 0.09400917029194135, "learning_rate": 0.00018888888888888888, "loss": 1.2843, "step": 85 }, { "epoch": 0.10047446274072007, "grad_norm": 0.09230059652672952, "learning_rate": 0.0002, "loss": 1.262, "step": 90 }, { "epoch": 0.10605637733742673, "grad_norm": 0.10009657676945562, "learning_rate": 0.00019998096274980728, "loss": 1.2821, "step": 95 }, { "epoch": 0.11163829193413341, "grad_norm": 0.12201167887174731, "learning_rate": 0.000199923858247567, "loss": 1.2668, "step": 100 }, { "epoch": 0.11722020653084007, "grad_norm": 0.09628889966493127, "learning_rate": 0.00019982870823553308, "loss": 1.2503, "step": 105 }, { "epoch": 0.12280212112754675, "grad_norm": 0.10028621820088561, "learning_rate": 0.00019969554894159723, "loss": 1.2632, "step": 110 }, { "epoch": 0.12838403572425341, "grad_norm": 0.08593461106683208, "learning_rate": 0.00019952443106549533, "loss": 1.2396, "step": 115 }, { "epoch": 0.1339659503209601, "grad_norm": 0.08827739693201113, "learning_rate": 0.00019931541975950378, "loss": 1.2784, "step": 120 }, { "epoch": 0.13954786491766677, "grad_norm": 0.0911508607290428, "learning_rate": 0.00019906859460363307, "loss": 1.2689, "step": 125 }, { "epoch": 0.14512977951437342, "grad_norm": 0.12157025851983183, "learning_rate": 0.00019878404957532814, "loss": 1.2563, "step": 130 }, { "epoch": 0.1507116941110801, "grad_norm": 0.10772740664174668, "learning_rate": 0.0001984618930136869, "loss": 1.2853, "step": 135 }, { "epoch": 0.15629360870778677, "grad_norm": 0.09940063564218579, "learning_rate": 0.00019810224757821064, "loss": 1.241, "step": 140 }, { "epoch": 0.16187552330449345, "grad_norm": 0.09118466185918958, "learning_rate": 0.00019770525020210204, "loss": 1.2746, "step": 145 }, { "epoch": 0.1674574379012001, "grad_norm": 0.09674538853934604, "learning_rate": 0.0001972710520401287, "loss": 1.2561, "step": 150 }, { "epoch": 0.17303935249790678, "grad_norm": 0.1126652956332537, "learning_rate": 0.0001967998184110713, "loss": 1.257, "step": 155 }, { "epoch": 0.17862126709461346, "grad_norm": 0.0869341846350413, "learning_rate": 0.00019629172873477995, "loss": 1.2529, "step": 160 }, { "epoch": 0.18420318169132013, "grad_norm": 0.09888626799953022, "learning_rate": 0.00019574697646386027, "loss": 1.244, "step": 165 }, { "epoch": 0.18978509628802678, "grad_norm": 0.09785278620381999, "learning_rate": 0.0001951657690100178, "loss": 1.2334, "step": 170 }, { "epoch": 0.19536701088473346, "grad_norm": 0.07378537831469305, "learning_rate": 0.0001945483276650868, "loss": 1.2415, "step": 175 }, { "epoch": 0.20094892548144014, "grad_norm": 0.08814263560160436, "learning_rate": 0.0001938948875167745, "loss": 1.2512, "step": 180 }, { "epoch": 0.20653084007814682, "grad_norm": 0.09775538276417937, "learning_rate": 0.00019320569735915271, "loss": 1.2213, "step": 185 }, { "epoch": 0.21211275467485347, "grad_norm": 0.09538626874304115, "learning_rate": 0.00019248101959793066, "loss": 1.2354, "step": 190 }, { "epoch": 0.21769466927156014, "grad_norm": 0.08332625788355251, "learning_rate": 0.00019172113015054532, "loss": 1.2444, "step": 195 }, { "epoch": 0.22327658386826682, "grad_norm": 0.08309090570657847, "learning_rate": 0.00019092631834110723, "loss": 1.2316, "step": 200 }, { "epoch": 0.2288584984649735, "grad_norm": 0.09054323693110126, "learning_rate": 0.0001900968867902419, "loss": 1.27, "step": 205 }, { "epoch": 0.23444041306168015, "grad_norm": 0.08549436898181585, "learning_rate": 0.00018923315129986835, "loss": 1.2348, "step": 210 }, { "epoch": 0.24002232765838682, "grad_norm": 0.086610993256363, "learning_rate": 0.00018833544073295917, "loss": 1.2461, "step": 215 }, { "epoch": 0.2456042422550935, "grad_norm": 0.08146109722648563, "learning_rate": 0.00018740409688832764, "loss": 1.2431, "step": 220 }, { "epoch": 0.2511861568518002, "grad_norm": 0.08232534290451142, "learning_rate": 0.00018643947437048944, "loss": 1.2408, "step": 225 }, { "epoch": 0.25676807144850683, "grad_norm": 0.08507739560575232, "learning_rate": 0.00018544194045464886, "loss": 1.243, "step": 230 }, { "epoch": 0.26234998604521353, "grad_norm": 0.09782665661618925, "learning_rate": 0.00018441187494686053, "loss": 1.2426, "step": 235 }, { "epoch": 0.2679319006419202, "grad_norm": 0.0809973818897895, "learning_rate": 0.0001833496700394202, "loss": 1.2345, "step": 240 }, { "epoch": 0.27351381523862683, "grad_norm": 0.09269081567542259, "learning_rate": 0.00018225573016153945, "loss": 1.2343, "step": 245 }, { "epoch": 0.27909572983533354, "grad_norm": 0.09671785308848269, "learning_rate": 0.00018113047182536127, "loss": 1.2327, "step": 250 }, { "epoch": 0.2846776444320402, "grad_norm": 0.0906432644454991, "learning_rate": 0.00017997432346737524, "loss": 1.2532, "step": 255 }, { "epoch": 0.29025955902874684, "grad_norm": 0.08371586611488784, "learning_rate": 0.00017878772528529232, "loss": 1.2384, "step": 260 }, { "epoch": 0.29584147362545354, "grad_norm": 0.08640773776491195, "learning_rate": 0.000177571129070442, "loss": 1.2193, "step": 265 }, { "epoch": 0.3014233882221602, "grad_norm": 0.08164649256677078, "learning_rate": 0.00017632499803575474, "loss": 1.2327, "step": 270 }, { "epoch": 0.3070053028188669, "grad_norm": 0.09156690890905773, "learning_rate": 0.00017504980663939613, "loss": 1.2534, "step": 275 }, { "epoch": 0.31258721741557355, "grad_norm": 0.08393163680296412, "learning_rate": 0.00017374604040411935, "loss": 1.2411, "step": 280 }, { "epoch": 0.3181691320122802, "grad_norm": 0.08340859881557235, "learning_rate": 0.00017241419573240462, "loss": 1.2398, "step": 285 }, { "epoch": 0.3237510466089869, "grad_norm": 0.08622506272483123, "learning_rate": 0.00017105477971745666, "loss": 1.2321, "step": 290 }, { "epoch": 0.32933296120569355, "grad_norm": 0.08338497396964428, "learning_rate": 0.00016966830995013133, "loss": 1.2453, "step": 295 }, { "epoch": 0.3349148758024002, "grad_norm": 0.08718794446584939, "learning_rate": 0.00016825531432186543, "loss": 1.2134, "step": 300 }, { "epoch": 0.3404967903991069, "grad_norm": 0.09158015865602193, "learning_rate": 0.00016681633082368498, "loss": 1.223, "step": 305 }, { "epoch": 0.34607870499581356, "grad_norm": 0.08768121171152027, "learning_rate": 0.0001653519073413675, "loss": 1.235, "step": 310 }, { "epoch": 0.3516606195925202, "grad_norm": 0.08907125432704804, "learning_rate": 0.00016386260144683745, "loss": 1.2169, "step": 315 }, { "epoch": 0.3572425341892269, "grad_norm": 0.08767993008424768, "learning_rate": 0.00016234898018587337, "loss": 1.2435, "step": 320 }, { "epoch": 0.36282444878593356, "grad_norm": 0.08991663909567185, "learning_rate": 0.00016081161986220807, "loss": 1.2371, "step": 325 }, { "epoch": 0.36840636338264027, "grad_norm": 0.07876061570647706, "learning_rate": 0.00015925110581810394, "loss": 1.2118, "step": 330 }, { "epoch": 0.3739882779793469, "grad_norm": 0.09088539514665886, "learning_rate": 0.00015766803221148673, "loss": 1.2333, "step": 335 }, { "epoch": 0.37957019257605357, "grad_norm": 0.09371191064756335, "learning_rate": 0.00015606300178972287, "loss": 1.2192, "step": 340 }, { "epoch": 0.38515210717276027, "grad_norm": 0.0988524027231739, "learning_rate": 0.00015443662566012645, "loss": 1.2201, "step": 345 }, { "epoch": 0.3907340217694669, "grad_norm": 0.08068655015289312, "learning_rate": 0.00015278952305728324, "loss": 1.2312, "step": 350 }, { "epoch": 0.39631593636617357, "grad_norm": 0.08530580419429784, "learning_rate": 0.00015112232110728015, "loss": 1.2103, "step": 355 }, { "epoch": 0.4018978509628803, "grad_norm": 0.0832856621155852, "learning_rate": 0.00014943565458893, "loss": 1.2049, "step": 360 }, { "epoch": 0.4074797655595869, "grad_norm": 0.10112900442930213, "learning_rate": 0.00014773016569208283, "loss": 1.2381, "step": 365 }, { "epoch": 0.41306168015629363, "grad_norm": 0.08250019530921109, "learning_rate": 0.00014600650377311522, "loss": 1.2185, "step": 370 }, { "epoch": 0.4186435947530003, "grad_norm": 0.0987578329954232, "learning_rate": 0.0001442653251076912, "loss": 1.2222, "step": 375 }, { "epoch": 0.42422550934970693, "grad_norm": 0.08530899013880136, "learning_rate": 0.00014250729264088843, "loss": 1.2556, "step": 380 }, { "epoch": 0.42980742394641364, "grad_norm": 0.10267562745822716, "learning_rate": 0.00014073307573478526, "loss": 1.2146, "step": 385 }, { "epoch": 0.4353893385431203, "grad_norm": 0.09189285950155643, "learning_rate": 0.00013894334991360448, "loss": 1.2206, "step": 390 }, { "epoch": 0.44097125313982694, "grad_norm": 0.08370196846674145, "learning_rate": 0.00013713879660651068, "loss": 1.2076, "step": 395 }, { "epoch": 0.44655316773653364, "grad_norm": 0.08423557906306067, "learning_rate": 0.0001353201028881598, "loss": 1.2223, "step": 400 }, { "epoch": 0.4521350823332403, "grad_norm": 0.08292081122541138, "learning_rate": 0.00013348796121709862, "loss": 1.2294, "step": 405 }, { "epoch": 0.457716996929947, "grad_norm": 0.08767079524531268, "learning_rate": 0.00013164306917211476, "loss": 1.2229, "step": 410 }, { "epoch": 0.46329891152665365, "grad_norm": 0.0865942463810843, "learning_rate": 0.000129786129186637, "loss": 1.2163, "step": 415 }, { "epoch": 0.4688808261233603, "grad_norm": 0.08101515714055764, "learning_rate": 0.00012791784828128724, "loss": 1.2337, "step": 420 }, { "epoch": 0.474462740720067, "grad_norm": 0.09009147490161429, "learning_rate": 0.00012603893779468604, "loss": 1.2148, "step": 425 }, { "epoch": 0.48004465531677365, "grad_norm": 0.08757351279515291, "learning_rate": 0.0001241501131126138, "loss": 1.2056, "step": 430 }, { "epoch": 0.4856265699134803, "grad_norm": 0.08418609867162384, "learning_rate": 0.00012225209339563145, "loss": 1.2419, "step": 435 }, { "epoch": 0.491208484510187, "grad_norm": 0.08790367723325618, "learning_rate": 0.0001203456013052634, "loss": 1.2115, "step": 440 }, { "epoch": 0.49679039910689365, "grad_norm": 0.08071789319204539, "learning_rate": 0.00011843136272884794, "loss": 1.2072, "step": 445 }, { "epoch": 0.5023723137036004, "grad_norm": 0.0879278395825441, "learning_rate": 0.00011651010650315923, "loss": 1.2194, "step": 450 }, { "epoch": 0.507954228300307, "grad_norm": 0.08506166782358492, "learning_rate": 0.00011458256413690633, "loss": 1.2077, "step": 455 }, { "epoch": 0.5135361428970137, "grad_norm": 0.08984730610411729, "learning_rate": 0.00011264946953221496, "loss": 1.2484, "step": 460 }, { "epoch": 0.5191180574937203, "grad_norm": 0.2978083078661545, "learning_rate": 0.00011071155870519777, "loss": 1.2491, "step": 465 }, { "epoch": 0.5246999720904271, "grad_norm": 0.08504227931172395, "learning_rate": 0.00010876956950572006, "loss": 1.2268, "step": 470 }, { "epoch": 0.5302818866871337, "grad_norm": 0.08620167875904892, "learning_rate": 0.0001068242413364671, "loss": 1.2252, "step": 475 }, { "epoch": 0.5358638012838404, "grad_norm": 0.08669957736640198, "learning_rate": 0.00010487631487142017, "loss": 1.217, "step": 480 }, { "epoch": 0.541445715880547, "grad_norm": 0.08577871896034497, "learning_rate": 0.00010292653177384876, "loss": 1.2169, "step": 485 }, { "epoch": 0.5470276304772537, "grad_norm": 0.08417260057895289, "learning_rate": 0.00010097563441392581, "loss": 1.2354, "step": 490 }, { "epoch": 0.5526095450739603, "grad_norm": 0.08676422431924583, "learning_rate": 9.90243655860742e-05, "loss": 1.2039, "step": 495 }, { "epoch": 0.5581914596706671, "grad_norm": 0.09103906295111437, "learning_rate": 9.707346822615128e-05, "loss": 1.2194, "step": 500 }, { "epoch": 0.5637733742673737, "grad_norm": 0.08594537537719427, "learning_rate": 9.512368512857984e-05, "loss": 1.1949, "step": 505 }, { "epoch": 0.5693552888640804, "grad_norm": 0.08392759057088481, "learning_rate": 9.317575866353292e-05, "loss": 1.2196, "step": 510 }, { "epoch": 0.574937203460787, "grad_norm": 0.08201912454761111, "learning_rate": 9.123043049427995e-05, "loss": 1.2131, "step": 515 }, { "epoch": 0.5805191180574937, "grad_norm": 0.08925291750313868, "learning_rate": 8.928844129480227e-05, "loss": 1.2369, "step": 520 }, { "epoch": 0.5861010326542004, "grad_norm": 0.08954980070951671, "learning_rate": 8.735053046778506e-05, "loss": 1.2175, "step": 525 }, { "epoch": 0.5916829472509071, "grad_norm": 0.08574100993825345, "learning_rate": 8.541743586309365e-05, "loss": 1.2166, "step": 530 }, { "epoch": 0.5972648618476137, "grad_norm": 0.08840883290578404, "learning_rate": 8.348989349684076e-05, "loss": 1.2271, "step": 535 }, { "epoch": 0.6028467764443204, "grad_norm": 0.08443946017557556, "learning_rate": 8.156863727115211e-05, "loss": 1.2329, "step": 540 }, { "epoch": 0.608428691041027, "grad_norm": 0.0902640782545258, "learning_rate": 7.965439869473664e-05, "loss": 1.2253, "step": 545 }, { "epoch": 0.6140106056377338, "grad_norm": 0.08988630625422679, "learning_rate": 7.774790660436858e-05, "loss": 1.1785, "step": 550 }, { "epoch": 0.6195925202344404, "grad_norm": 0.08134808753957644, "learning_rate": 7.584988688738622e-05, "loss": 1.2261, "step": 555 }, { "epoch": 0.6251744348311471, "grad_norm": 0.08768193779762151, "learning_rate": 7.396106220531398e-05, "loss": 1.2463, "step": 560 }, { "epoch": 0.6307563494278537, "grad_norm": 0.0885816930556393, "learning_rate": 7.208215171871277e-05, "loss": 1.2141, "step": 565 }, { "epoch": 0.6363382640245604, "grad_norm": 0.08553683878588977, "learning_rate": 7.021387081336301e-05, "loss": 1.2026, "step": 570 }, { "epoch": 0.641920178621267, "grad_norm": 0.09505838067263224, "learning_rate": 6.835693082788525e-05, "loss": 1.2168, "step": 575 }, { "epoch": 0.6475020932179738, "grad_norm": 0.08769224685329463, "learning_rate": 6.651203878290139e-05, "loss": 1.2493, "step": 580 }, { "epoch": 0.6530840078146805, "grad_norm": 0.07990213288377576, "learning_rate": 6.46798971118402e-05, "loss": 1.2308, "step": 585 }, { "epoch": 0.6586659224113871, "grad_norm": 0.08133261350163556, "learning_rate": 6.286120339348935e-05, "loss": 1.2014, "step": 590 }, { "epoch": 0.6642478370080938, "grad_norm": 0.09363089434544866, "learning_rate": 6.105665008639557e-05, "loss": 1.2238, "step": 595 }, { "epoch": 0.6698297516048004, "grad_norm": 0.07910287951552411, "learning_rate": 5.926692426521474e-05, "loss": 1.2473, "step": 600 }, { "epoch": 0.6754116662015072, "grad_norm": 0.0801209902764544, "learning_rate": 5.749270735911158e-05, "loss": 1.1975, "step": 605 }, { "epoch": 0.6809935807982138, "grad_norm": 0.08087293360533905, "learning_rate": 5.573467489230879e-05, "loss": 1.1966, "step": 610 }, { "epoch": 0.6865754953949205, "grad_norm": 0.08220997258417966, "learning_rate": 5.399349622688479e-05, "loss": 1.2345, "step": 615 }, { "epoch": 0.6921574099916271, "grad_norm": 0.0825575277760057, "learning_rate": 5.226983430791722e-05, "loss": 1.2289, "step": 620 }, { "epoch": 0.6977393245883338, "grad_norm": 0.08305460425818378, "learning_rate": 5.0564345411070025e-05, "loss": 1.204, "step": 625 }, { "epoch": 0.7033212391850404, "grad_norm": 0.08011105262542664, "learning_rate": 4.8877678892719866e-05, "loss": 1.1946, "step": 630 }, { "epoch": 0.7089031537817472, "grad_norm": 0.08686069747720479, "learning_rate": 4.721047694271676e-05, "loss": 1.2, "step": 635 }, { "epoch": 0.7144850683784538, "grad_norm": 0.08537977661965272, "learning_rate": 4.556337433987359e-05, "loss": 1.2054, "step": 640 }, { "epoch": 0.7200669829751605, "grad_norm": 0.08857193949478791, "learning_rate": 4.393699821027716e-05, "loss": 1.1988, "step": 645 }, { "epoch": 0.7256488975718671, "grad_norm": 0.09608004999262602, "learning_rate": 4.2331967788513295e-05, "loss": 1.2226, "step": 650 }, { "epoch": 0.7312308121685738, "grad_norm": 0.08235757922811432, "learning_rate": 4.074889418189608e-05, "loss": 1.2202, "step": 655 }, { "epoch": 0.7368127267652805, "grad_norm": 0.08660069823512372, "learning_rate": 3.9188380137791936e-05, "loss": 1.215, "step": 660 }, { "epoch": 0.7423946413619872, "grad_norm": 0.08090639704744831, "learning_rate": 3.7651019814126654e-05, "loss": 1.2255, "step": 665 }, { "epoch": 0.7479765559586938, "grad_norm": 0.08082821477995833, "learning_rate": 3.613739855316257e-05, "loss": 1.2176, "step": 670 }, { "epoch": 0.7535584705554005, "grad_norm": 0.08469395080984878, "learning_rate": 3.46480926586325e-05, "loss": 1.2275, "step": 675 }, { "epoch": 0.7591403851521071, "grad_norm": 0.0871555466504494, "learning_rate": 3.3183669176315045e-05, "loss": 1.2351, "step": 680 }, { "epoch": 0.7647222997488139, "grad_norm": 0.08170223557553191, "learning_rate": 3.174468567813461e-05, "loss": 1.2074, "step": 685 }, { "epoch": 0.7703042143455205, "grad_norm": 0.0838318843856818, "learning_rate": 3.033169004986873e-05, "loss": 1.2396, "step": 690 }, { "epoch": 0.7758861289422272, "grad_norm": 0.08831381148889993, "learning_rate": 2.894522028254334e-05, "loss": 1.1947, "step": 695 }, { "epoch": 0.7814680435389338, "grad_norm": 0.08158536981215994, "learning_rate": 2.7585804267595384e-05, "loss": 1.208, "step": 700 }, { "epoch": 0.7870499581356405, "grad_norm": 0.08116519613000232, "learning_rate": 2.6253959595880673e-05, "loss": 1.2191, "step": 705 }, { "epoch": 0.7926318727323471, "grad_norm": 0.08294169676184929, "learning_rate": 2.495019336060387e-05, "loss": 1.195, "step": 710 }, { "epoch": 0.7982137873290539, "grad_norm": 0.08406756837278591, "learning_rate": 2.367500196424529e-05, "loss": 1.2203, "step": 715 }, { "epoch": 0.8037957019257606, "grad_norm": 0.08211403607563178, "learning_rate": 2.242887092955801e-05, "loss": 1.2041, "step": 720 }, { "epoch": 0.8093776165224672, "grad_norm": 0.07980978787138238, "learning_rate": 2.121227471470768e-05, "loss": 1.2394, "step": 725 }, { "epoch": 0.8149595311191739, "grad_norm": 0.08416184610807921, "learning_rate": 2.002567653262479e-05, "loss": 1.2228, "step": 730 }, { "epoch": 0.8205414457158805, "grad_norm": 0.08256062792318115, "learning_rate": 1.8869528174638752e-05, "loss": 1.203, "step": 735 }, { "epoch": 0.8261233603125873, "grad_norm": 0.09043351264554417, "learning_rate": 1.774426983846058e-05, "loss": 1.2275, "step": 740 }, { "epoch": 0.8317052749092939, "grad_norm": 0.08486147964302236, "learning_rate": 1.6650329960579792e-05, "loss": 1.2208, "step": 745 }, { "epoch": 0.8372871895060006, "grad_norm": 0.0935945466460169, "learning_rate": 1.5588125053139468e-05, "loss": 1.2131, "step": 750 }, { "epoch": 0.8428691041027072, "grad_norm": 0.08282716353976063, "learning_rate": 1.4558059545351143e-05, "loss": 1.2284, "step": 755 }, { "epoch": 0.8484510186994139, "grad_norm": 0.08286515378820142, "learning_rate": 1.3560525629510568e-05, "loss": 1.2086, "step": 760 }, { "epoch": 0.8540329332961206, "grad_norm": 0.08295259360853054, "learning_rate": 1.259590311167238e-05, "loss": 1.2061, "step": 765 }, { "epoch": 0.8596148478928273, "grad_norm": 0.08358389042910293, "learning_rate": 1.166455926704082e-05, "loss": 1.222, "step": 770 }, { "epoch": 0.8651967624895339, "grad_norm": 0.08388863476839661, "learning_rate": 1.0766848700131648e-05, "loss": 1.2143, "step": 775 }, { "epoch": 0.8707786770862406, "grad_norm": 0.08277339984932784, "learning_rate": 9.903113209758096e-06, "loss": 1.2192, "step": 780 }, { "epoch": 0.8763605916829472, "grad_norm": 0.08938310164317657, "learning_rate": 9.073681658892775e-06, "loss": 1.2191, "step": 785 }, { "epoch": 0.8819425062796539, "grad_norm": 0.07910593096708422, "learning_rate": 8.278869849454718e-06, "loss": 1.2269, "step": 790 }, { "epoch": 0.8875244208763606, "grad_norm": 0.08295037453317607, "learning_rate": 7.5189804020693536e-06, "loss": 1.2021, "step": 795 }, { "epoch": 0.8931063354730673, "grad_norm": 0.08199446080472911, "learning_rate": 6.794302640847294e-06, "loss": 1.1961, "step": 800 }, { "epoch": 0.8986882500697739, "grad_norm": 0.08481342663212112, "learning_rate": 6.1051124832254944e-06, "loss": 1.2069, "step": 805 }, { "epoch": 0.9042701646664806, "grad_norm": 0.08217551850800063, "learning_rate": 5.451672334913216e-06, "loss": 1.2055, "step": 810 }, { "epoch": 0.9098520792631872, "grad_norm": 0.08322503504827561, "learning_rate": 4.834230989982213e-06, "loss": 1.2156, "step": 815 }, { "epoch": 0.915433993859894, "grad_norm": 0.08125961805104615, "learning_rate": 4.253023536139733e-06, "loss": 1.2005, "step": 820 }, { "epoch": 0.9210159084566006, "grad_norm": 0.09037682759604541, "learning_rate": 3.7082712652200867e-06, "loss": 1.2079, "step": 825 }, { "epoch": 0.9265978230533073, "grad_norm": 0.08711894287392291, "learning_rate": 3.2001815889286856e-06, "loss": 1.232, "step": 830 }, { "epoch": 0.9321797376500139, "grad_norm": 0.08367132801462379, "learning_rate": 2.728947959871353e-06, "loss": 1.1858, "step": 835 }, { "epoch": 0.9377616522467206, "grad_norm": 0.0809801248589102, "learning_rate": 2.294749797897955e-06, "loss": 1.1871, "step": 840 }, { "epoch": 0.9433435668434274, "grad_norm": 0.08412969109149288, "learning_rate": 1.8977524217893783e-06, "loss": 1.2248, "step": 845 }, { "epoch": 0.948925481440134, "grad_norm": 0.08014128153610968, "learning_rate": 1.5381069863131037e-06, "loss": 1.2312, "step": 850 }, { "epoch": 0.9545073960368406, "grad_norm": 0.08040835492341503, "learning_rate": 1.2159504246718522e-06, "loss": 1.2213, "step": 855 }, { "epoch": 0.9600893106335473, "grad_norm": 0.08170226749481643, "learning_rate": 9.314053963669245e-07, "loss": 1.2114, "step": 860 }, { "epoch": 0.965671225230254, "grad_norm": 0.08123838559159317, "learning_rate": 6.845802404962243e-07, "loss": 1.2455, "step": 865 }, { "epoch": 0.9712531398269606, "grad_norm": 0.08532355248950987, "learning_rate": 4.7556893450466653e-07, "loss": 1.2017, "step": 870 }, { "epoch": 0.9768350544236674, "grad_norm": 0.07935413274906811, "learning_rate": 3.044510584027771e-07, "loss": 1.203, "step": 875 }, { "epoch": 0.982416969020374, "grad_norm": 0.07922680701516337, "learning_rate": 1.7129176446692984e-07, "loss": 1.1993, "step": 880 }, { "epoch": 0.9879988836170807, "grad_norm": 0.08007277288266887, "learning_rate": 7.614175243301213e-08, "loss": 1.221, "step": 885 }, { "epoch": 0.9935807982137873, "grad_norm": 0.08190648675567455, "learning_rate": 1.9037250192732726e-08, "loss": 1.2245, "step": 890 }, { "epoch": 0.999162712810494, "grad_norm": 0.07884795604109555, "learning_rate": 0.0, "loss": 1.2359, "step": 895 }, { "epoch": 0.999162712810494, "eval_loss": 1.1748292446136475, "eval_runtime": 1569.4225, "eval_samples_per_second": 8.524, "eval_steps_per_second": 0.533, "step": 895 }, { "epoch": 0.999162712810494, "step": 895, "total_flos": 1.1254972268150784e+16, "train_loss": 1.2433469767011078, "train_runtime": 20318.3129, "train_samples_per_second": 2.821, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 895, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1254972268150784e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }