{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997268505872713, "eval_steps": 500, "global_step": 915, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010925976509150504, "grad_norm": 22.730928776382456, "learning_rate": 1.0869565217391305e-07, "loss": 1.3223, "step": 1 }, { "epoch": 0.005462988254575253, "grad_norm": 20.891256691927374, "learning_rate": 5.434782608695653e-07, "loss": 1.3066, "step": 5 }, { "epoch": 0.010925976509150505, "grad_norm": 8.917157931216027, "learning_rate": 1.0869565217391306e-06, "loss": 1.1943, "step": 10 }, { "epoch": 0.01638896476372576, "grad_norm": 7.290952602788098, "learning_rate": 1.6304347826086957e-06, "loss": 1.0279, "step": 15 }, { "epoch": 0.02185195301830101, "grad_norm": 3.006256144496407, "learning_rate": 2.173913043478261e-06, "loss": 0.9049, "step": 20 }, { "epoch": 0.027314941272876262, "grad_norm": 2.371134339375521, "learning_rate": 2.7173913043478263e-06, "loss": 0.8647, "step": 25 }, { "epoch": 0.03277792952745152, "grad_norm": 2.1504800702490865, "learning_rate": 3.2608695652173914e-06, "loss": 0.833, "step": 30 }, { "epoch": 0.03824091778202677, "grad_norm": 2.2317280108941, "learning_rate": 3.804347826086957e-06, "loss": 0.807, "step": 35 }, { "epoch": 0.04370390603660202, "grad_norm": 2.2488850758737255, "learning_rate": 4.347826086956522e-06, "loss": 0.7919, "step": 40 }, { "epoch": 0.04916689429117727, "grad_norm": 2.3580705798318484, "learning_rate": 4.891304347826087e-06, "loss": 0.7761, "step": 45 }, { "epoch": 0.054629882545752524, "grad_norm": 2.4373495899440414, "learning_rate": 5.4347826086956525e-06, "loss": 0.7614, "step": 50 }, { "epoch": 0.060092870800327776, "grad_norm": 2.4135433324014763, "learning_rate": 5.978260869565218e-06, "loss": 0.7458, "step": 55 }, { "epoch": 0.06555585905490303, "grad_norm": 2.81358704818424, "learning_rate": 6.521739130434783e-06, "loss": 0.7291, "step": 60 }, { "epoch": 0.07101884730947829, "grad_norm": 2.438627231166693, "learning_rate": 7.065217391304349e-06, "loss": 0.72, "step": 65 }, { "epoch": 0.07648183556405354, "grad_norm": 2.1869040708927177, "learning_rate": 7.608695652173914e-06, "loss": 0.7141, "step": 70 }, { "epoch": 0.08194482381862879, "grad_norm": 2.184483160699592, "learning_rate": 8.15217391304348e-06, "loss": 0.6994, "step": 75 }, { "epoch": 0.08740781207320404, "grad_norm": 2.3203787824757507, "learning_rate": 8.695652173913044e-06, "loss": 0.7019, "step": 80 }, { "epoch": 0.0928708003277793, "grad_norm": 2.3068506646214915, "learning_rate": 9.23913043478261e-06, "loss": 0.6958, "step": 85 }, { "epoch": 0.09833378858235454, "grad_norm": 2.257062695334498, "learning_rate": 9.782608695652175e-06, "loss": 0.6842, "step": 90 }, { "epoch": 0.1037967768369298, "grad_norm": 2.400983485932064, "learning_rate": 9.999672147992806e-06, "loss": 0.6914, "step": 95 }, { "epoch": 0.10925976509150505, "grad_norm": 2.324571446077097, "learning_rate": 9.997668763647963e-06, "loss": 0.6867, "step": 100 }, { "epoch": 0.1147227533460803, "grad_norm": 2.137443230070172, "learning_rate": 9.993844863856281e-06, "loss": 0.689, "step": 105 }, { "epoch": 0.12018574160065555, "grad_norm": 2.0564600115869585, "learning_rate": 9.988201841560945e-06, "loss": 0.6688, "step": 110 }, { "epoch": 0.12564872985523082, "grad_norm": 2.139005852684594, "learning_rate": 9.980741752362222e-06, "loss": 0.6711, "step": 115 }, { "epoch": 0.13111171810980607, "grad_norm": 2.530899194975221, "learning_rate": 9.971467313768668e-06, "loss": 0.6681, "step": 120 }, { "epoch": 0.13657470636438132, "grad_norm": 2.1541912419217466, "learning_rate": 9.96038190420721e-06, "loss": 0.6624, "step": 125 }, { "epoch": 0.14203769461895657, "grad_norm": 2.2266155964762646, "learning_rate": 9.947489561792475e-06, "loss": 0.6678, "step": 130 }, { "epoch": 0.14750068287353182, "grad_norm": 2.0151113143541215, "learning_rate": 9.932794982855818e-06, "loss": 0.6576, "step": 135 }, { "epoch": 0.15296367112810708, "grad_norm": 2.122951721690571, "learning_rate": 9.916303520234573e-06, "loss": 0.6518, "step": 140 }, { "epoch": 0.15842665938268233, "grad_norm": 2.073005839186561, "learning_rate": 9.898021181322158e-06, "loss": 0.6467, "step": 145 }, { "epoch": 0.16388964763725758, "grad_norm": 2.122513038916697, "learning_rate": 9.877954625879746e-06, "loss": 0.6422, "step": 150 }, { "epoch": 0.16935263589183283, "grad_norm": 1.938211750086781, "learning_rate": 9.8561111636103e-06, "loss": 0.6313, "step": 155 }, { "epoch": 0.17481562414640808, "grad_norm": 2.1510416920205997, "learning_rate": 9.832498751495832e-06, "loss": 0.637, "step": 160 }, { "epoch": 0.18027861240098333, "grad_norm": 2.368980517738678, "learning_rate": 9.807125990898905e-06, "loss": 0.636, "step": 165 }, { "epoch": 0.1857416006555586, "grad_norm": 2.2294247264281553, "learning_rate": 9.780002124429377e-06, "loss": 0.6378, "step": 170 }, { "epoch": 0.19120458891013384, "grad_norm": 2.044434482857163, "learning_rate": 9.75113703257758e-06, "loss": 0.6299, "step": 175 }, { "epoch": 0.1966675771647091, "grad_norm": 2.273754929604351, "learning_rate": 9.720541230115113e-06, "loss": 0.6278, "step": 180 }, { "epoch": 0.20213056541928434, "grad_norm": 1.9049721088945737, "learning_rate": 9.688225862264604e-06, "loss": 0.6215, "step": 185 }, { "epoch": 0.2075935536738596, "grad_norm": 2.2956911916513936, "learning_rate": 9.654202700639805e-06, "loss": 0.6114, "step": 190 }, { "epoch": 0.21305654192843484, "grad_norm": 2.047625547879147, "learning_rate": 9.61848413895751e-06, "loss": 0.6272, "step": 195 }, { "epoch": 0.2185195301830101, "grad_norm": 2.144882455415688, "learning_rate": 9.581083188522862e-06, "loss": 0.6146, "step": 200 }, { "epoch": 0.22398251843758535, "grad_norm": 1.882314204150727, "learning_rate": 9.542013473489683e-06, "loss": 0.6115, "step": 205 }, { "epoch": 0.2294455066921606, "grad_norm": 2.0782768044314746, "learning_rate": 9.501289225897566e-06, "loss": 0.5965, "step": 210 }, { "epoch": 0.23490849494673585, "grad_norm": 1.9514474259034185, "learning_rate": 9.458925280487532e-06, "loss": 0.6176, "step": 215 }, { "epoch": 0.2403714832013111, "grad_norm": 1.8865949054260884, "learning_rate": 9.414937069298125e-06, "loss": 0.6018, "step": 220 }, { "epoch": 0.24583447145588638, "grad_norm": 2.0683179915172962, "learning_rate": 9.369340616043948e-06, "loss": 0.6105, "step": 225 }, { "epoch": 0.25129745971046163, "grad_norm": 2.004631261000376, "learning_rate": 9.322152530278658e-06, "loss": 0.5931, "step": 230 }, { "epoch": 0.2567604479650369, "grad_norm": 1.9653795566834225, "learning_rate": 9.273390001344544e-06, "loss": 0.5878, "step": 235 }, { "epoch": 0.26222343621961214, "grad_norm": 2.0291055623314715, "learning_rate": 9.223070792110927e-06, "loss": 0.5827, "step": 240 }, { "epoch": 0.2676864244741874, "grad_norm": 1.9570477475533896, "learning_rate": 9.17121323250362e-06, "loss": 0.5943, "step": 245 }, { "epoch": 0.27314941272876264, "grad_norm": 2.1357120267635463, "learning_rate": 9.117836212827839e-06, "loss": 0.571, "step": 250 }, { "epoch": 0.2786124009833379, "grad_norm": 1.9807416332273953, "learning_rate": 9.062959176886967e-06, "loss": 0.5895, "step": 255 }, { "epoch": 0.28407538923791315, "grad_norm": 1.8679007980966873, "learning_rate": 9.006602114899711e-06, "loss": 0.5837, "step": 260 }, { "epoch": 0.2895383774924884, "grad_norm": 2.159508129093504, "learning_rate": 8.948785556218202e-06, "loss": 0.5872, "step": 265 }, { "epoch": 0.29500136574706365, "grad_norm": 2.011039338549636, "learning_rate": 8.88953056184971e-06, "loss": 0.5656, "step": 270 }, { "epoch": 0.3004643540016389, "grad_norm": 1.8497602272423903, "learning_rate": 8.828858716784692e-06, "loss": 0.5607, "step": 275 }, { "epoch": 0.30592734225621415, "grad_norm": 1.8697548121419354, "learning_rate": 8.766792122133949e-06, "loss": 0.569, "step": 280 }, { "epoch": 0.3113903305107894, "grad_norm": 2.023364352460863, "learning_rate": 8.703353387077813e-06, "loss": 0.5694, "step": 285 }, { "epoch": 0.31685331876536466, "grad_norm": 1.9966954532417194, "learning_rate": 8.638565620630219e-06, "loss": 0.5485, "step": 290 }, { "epoch": 0.3223163070199399, "grad_norm": 2.118048147240604, "learning_rate": 8.572452423220717e-06, "loss": 0.5544, "step": 295 }, { "epoch": 0.32777929527451516, "grad_norm": 1.9646992858208574, "learning_rate": 8.505037878097481e-06, "loss": 0.5585, "step": 300 }, { "epoch": 0.3332422835290904, "grad_norm": 1.842699727967433, "learning_rate": 8.436346542554432e-06, "loss": 0.5525, "step": 305 }, { "epoch": 0.33870527178366566, "grad_norm": 2.1516397434897, "learning_rate": 8.366403438985675e-06, "loss": 0.5598, "step": 310 }, { "epoch": 0.3441682600382409, "grad_norm": 1.935829862534685, "learning_rate": 8.295234045770524e-06, "loss": 0.5474, "step": 315 }, { "epoch": 0.34963124829281617, "grad_norm": 2.0947580703115, "learning_rate": 8.222864287992419e-06, "loss": 0.5444, "step": 320 }, { "epoch": 0.3550942365473914, "grad_norm": 2.0230909691945462, "learning_rate": 8.149320527995111e-06, "loss": 0.5367, "step": 325 }, { "epoch": 0.36055722480196667, "grad_norm": 1.9422498738110598, "learning_rate": 8.074629555779586e-06, "loss": 0.5374, "step": 330 }, { "epoch": 0.3660202130565419, "grad_norm": 2.017157014853384, "learning_rate": 7.998818579245183e-06, "loss": 0.538, "step": 335 }, { "epoch": 0.3714832013111172, "grad_norm": 2.0200129213489952, "learning_rate": 7.9219152142785e-06, "loss": 0.5261, "step": 340 }, { "epoch": 0.3769461895656924, "grad_norm": 1.9025863914724077, "learning_rate": 7.843947474693665e-06, "loss": 0.5272, "step": 345 }, { "epoch": 0.3824091778202677, "grad_norm": 1.98859186872293, "learning_rate": 7.764943762027675e-06, "loss": 0.5271, "step": 350 }, { "epoch": 0.38787216607484293, "grad_norm": 1.9641160396193444, "learning_rate": 7.68493285519447e-06, "loss": 0.5156, "step": 355 }, { "epoch": 0.3933351543294182, "grad_norm": 1.9337347711635438, "learning_rate": 7.603943900001567e-06, "loss": 0.5152, "step": 360 }, { "epoch": 0.39879814258399343, "grad_norm": 1.9734924840604742, "learning_rate": 7.522006398533022e-06, "loss": 0.5157, "step": 365 }, { "epoch": 0.4042611308385687, "grad_norm": 2.3339176407449904, "learning_rate": 7.439150198402619e-06, "loss": 0.5162, "step": 370 }, { "epoch": 0.40972411909314393, "grad_norm": 2.2975439038971732, "learning_rate": 7.355405481881207e-06, "loss": 0.5157, "step": 375 }, { "epoch": 0.4151871073477192, "grad_norm": 1.971313224728527, "learning_rate": 7.270802754902092e-06, "loss": 0.5014, "step": 380 }, { "epoch": 0.42065009560229444, "grad_norm": 1.8926014206142687, "learning_rate": 7.185372835948574e-06, "loss": 0.5082, "step": 385 }, { "epoch": 0.4261130838568697, "grad_norm": 2.0424297512212677, "learning_rate": 7.0991468448275914e-06, "loss": 0.4969, "step": 390 }, { "epoch": 0.43157607211144494, "grad_norm": 1.9075652902136018, "learning_rate": 7.0121561913336255e-06, "loss": 0.5025, "step": 395 }, { "epoch": 0.4370390603660202, "grad_norm": 1.972187929750843, "learning_rate": 6.924432563806962e-06, "loss": 0.4805, "step": 400 }, { "epoch": 0.44250204862059545, "grad_norm": 2.242916059633561, "learning_rate": 6.836007917590487e-06, "loss": 0.503, "step": 405 }, { "epoch": 0.4479650368751707, "grad_norm": 2.0649875803912274, "learning_rate": 6.746914463389216e-06, "loss": 0.4901, "step": 410 }, { "epoch": 0.45342802512974595, "grad_norm": 1.8063638082708948, "learning_rate": 6.65718465553681e-06, "loss": 0.4755, "step": 415 }, { "epoch": 0.4588910133843212, "grad_norm": 1.969494410176558, "learning_rate": 6.566851180173344e-06, "loss": 0.4874, "step": 420 }, { "epoch": 0.46435400163889645, "grad_norm": 1.8970828426082733, "learning_rate": 6.475946943338616e-06, "loss": 0.4766, "step": 425 }, { "epoch": 0.4698169898934717, "grad_norm": 1.9623505460334127, "learning_rate": 6.384505058985388e-06, "loss": 0.487, "step": 430 }, { "epoch": 0.47527997814804696, "grad_norm": 2.016298870850614, "learning_rate": 6.292558836916856e-06, "loss": 0.4684, "step": 435 }, { "epoch": 0.4807429664026222, "grad_norm": 1.9782532484829365, "learning_rate": 6.200141770652791e-06, "loss": 0.4655, "step": 440 }, { "epoch": 0.4862059546571975, "grad_norm": 1.9875880346692325, "learning_rate": 6.107287525228764e-06, "loss": 0.4829, "step": 445 }, { "epoch": 0.49166894291177277, "grad_norm": 2.0320364233161836, "learning_rate": 6.014029924932874e-06, "loss": 0.4753, "step": 450 }, { "epoch": 0.497131931166348, "grad_norm": 2.025675106544592, "learning_rate": 5.920402940984483e-06, "loss": 0.4673, "step": 455 }, { "epoch": 0.5025949194209233, "grad_norm": 1.921954531735398, "learning_rate": 5.826440679159424e-06, "loss": 0.4532, "step": 460 }, { "epoch": 0.5080579076754985, "grad_norm": 2.01066093050374, "learning_rate": 5.732177367366176e-06, "loss": 0.4658, "step": 465 }, { "epoch": 0.5135208959300738, "grad_norm": 1.8422653149296644, "learning_rate": 5.63764734317758e-06, "loss": 0.4529, "step": 470 }, { "epoch": 0.518983884184649, "grad_norm": 1.9255817468742156, "learning_rate": 5.542885041322577e-06, "loss": 0.4561, "step": 475 }, { "epoch": 0.5244468724392243, "grad_norm": 1.951977869616497, "learning_rate": 5.447924981142578e-06, "loss": 0.4565, "step": 480 }, { "epoch": 0.5299098606937995, "grad_norm": 1.9554258723769773, "learning_rate": 5.3528017540169974e-06, "loss": 0.4475, "step": 485 }, { "epoch": 0.5353728489483748, "grad_norm": 1.9102850022081725, "learning_rate": 5.25755001076255e-06, "loss": 0.4497, "step": 490 }, { "epoch": 0.54083583720295, "grad_norm": 2.239195466962009, "learning_rate": 5.1622044490108994e-06, "loss": 0.4454, "step": 495 }, { "epoch": 0.5462988254575253, "grad_norm": 1.8827718246138567, "learning_rate": 5.066799800569247e-06, "loss": 0.4404, "step": 500 }, { "epoch": 0.5517618137121005, "grad_norm": 1.9069058786965245, "learning_rate": 4.971370818768475e-06, "loss": 0.4236, "step": 505 }, { "epoch": 0.5572248019666758, "grad_norm": 1.855279389571921, "learning_rate": 4.875952265803452e-06, "loss": 0.4403, "step": 510 }, { "epoch": 0.562687790221251, "grad_norm": 1.8972542181259846, "learning_rate": 4.780578900070104e-06, "loss": 0.4372, "step": 515 }, { "epoch": 0.5681507784758263, "grad_norm": 1.844147614606819, "learning_rate": 4.685285463503867e-06, "loss": 0.4331, "step": 520 }, { "epoch": 0.5736137667304015, "grad_norm": 1.9625504078338127, "learning_rate": 4.5901066689241514e-06, "loss": 0.4292, "step": 525 }, { "epoch": 0.5790767549849768, "grad_norm": 2.2025005649073433, "learning_rate": 4.49507718738939e-06, "loss": 0.4293, "step": 530 }, { "epoch": 0.584539743239552, "grad_norm": 1.9450789672739341, "learning_rate": 4.400231635567319e-06, "loss": 0.4192, "step": 535 }, { "epoch": 0.5900027314941273, "grad_norm": 1.9764711012724976, "learning_rate": 4.305604563125061e-06, "loss": 0.4182, "step": 540 }, { "epoch": 0.5954657197487025, "grad_norm": 2.1198793543903194, "learning_rate": 4.211230440143604e-06, "loss": 0.4114, "step": 545 }, { "epoch": 0.6009287080032778, "grad_norm": 3.1276187198521463, "learning_rate": 4.117143644561306e-06, "loss": 0.4166, "step": 550 }, { "epoch": 0.606391696257853, "grad_norm": 1.8794840574959752, "learning_rate": 4.023378449650928e-06, "loss": 0.4076, "step": 555 }, { "epoch": 0.6118546845124283, "grad_norm": 1.9139438651443652, "learning_rate": 3.929969011534811e-06, "loss": 0.4218, "step": 560 }, { "epoch": 0.6173176727670036, "grad_norm": 1.861741261098825, "learning_rate": 3.83694935674272e-06, "loss": 0.4169, "step": 565 }, { "epoch": 0.6227806610215788, "grad_norm": 1.8247668127301933, "learning_rate": 3.7443533698169186e-06, "loss": 0.4132, "step": 570 }, { "epoch": 0.6282436492761541, "grad_norm": 1.9796553518989775, "learning_rate": 3.652214780968926e-06, "loss": 0.3996, "step": 575 }, { "epoch": 0.6337066375307293, "grad_norm": 1.823241119094308, "learning_rate": 3.5605671537925264e-06, "loss": 0.4088, "step": 580 }, { "epoch": 0.6391696257853046, "grad_norm": 1.881819370184891, "learning_rate": 3.469443873037457e-06, "loss": 0.4073, "step": 585 }, { "epoch": 0.6446326140398798, "grad_norm": 1.947962800229108, "learning_rate": 3.3788781324482446e-06, "loss": 0.4067, "step": 590 }, { "epoch": 0.6500956022944551, "grad_norm": 1.9214089256240032, "learning_rate": 3.2889029226726286e-06, "loss": 0.3996, "step": 595 }, { "epoch": 0.6555585905490303, "grad_norm": 1.9543022070758702, "learning_rate": 3.1995510192439587e-06, "loss": 0.3982, "step": 600 }, { "epoch": 0.6610215788036056, "grad_norm": 1.9173316158851936, "learning_rate": 3.1108549706419553e-06, "loss": 0.4, "step": 605 }, { "epoch": 0.6664845670581808, "grad_norm": 1.8159793606682484, "learning_rate": 3.022847086436176e-06, "loss": 0.3988, "step": 610 }, { "epoch": 0.6719475553127561, "grad_norm": 1.9089073637989584, "learning_rate": 2.9355594255165186e-06, "loss": 0.3871, "step": 615 }, { "epoch": 0.6774105435673313, "grad_norm": 1.8914866485656172, "learning_rate": 2.8490237844150335e-06, "loss": 0.3885, "step": 620 }, { "epoch": 0.6828735318219066, "grad_norm": 1.9221955744591699, "learning_rate": 2.763271685723311e-06, "loss": 0.3818, "step": 625 }, { "epoch": 0.6883365200764818, "grad_norm": 1.8733166396673193, "learning_rate": 2.6783343666096445e-06, "loss": 0.3874, "step": 630 }, { "epoch": 0.6937995083310571, "grad_norm": 2.134783287581667, "learning_rate": 2.5942427674401936e-06, "loss": 0.3899, "step": 635 }, { "epoch": 0.6992624965856323, "grad_norm": 1.8202198608461309, "learning_rate": 2.5110275205082224e-06, "loss": 0.3839, "step": 640 }, { "epoch": 0.7047254848402076, "grad_norm": 1.9005906531792638, "learning_rate": 2.428718938875607e-06, "loss": 0.3812, "step": 645 }, { "epoch": 0.7101884730947828, "grad_norm": 1.8656177124830933, "learning_rate": 2.3473470053305954e-06, "loss": 0.3805, "step": 650 }, { "epoch": 0.7156514613493581, "grad_norm": 1.9350473980336609, "learning_rate": 2.266941361465886e-06, "loss": 0.381, "step": 655 }, { "epoch": 0.7211144496039333, "grad_norm": 1.8146852411399312, "learning_rate": 2.187531296881017e-06, "loss": 0.3751, "step": 660 }, { "epoch": 0.7265774378585086, "grad_norm": 1.8246829733561232, "learning_rate": 2.1091457385129267e-06, "loss": 0.3752, "step": 665 }, { "epoch": 0.7320404261130838, "grad_norm": 1.8550222595976351, "learning_rate": 2.031813240098686e-06, "loss": 0.3721, "step": 670 }, { "epoch": 0.7375034143676591, "grad_norm": 1.8354415599143141, "learning_rate": 1.9555619717741248e-06, "loss": 0.3733, "step": 675 }, { "epoch": 0.7429664026222343, "grad_norm": 1.9427997534753285, "learning_rate": 1.8804197098122168e-06, "loss": 0.3745, "step": 680 }, { "epoch": 0.7484293908768096, "grad_norm": 1.9154466908271706, "learning_rate": 1.8064138265049457e-06, "loss": 0.3743, "step": 685 }, { "epoch": 0.7538923791313848, "grad_norm": 1.8117953280714802, "learning_rate": 1.7335712801923015e-06, "loss": 0.3707, "step": 690 }, { "epoch": 0.7593553673859601, "grad_norm": 1.894844068057027, "learning_rate": 1.6619186054421088e-06, "loss": 0.3677, "step": 695 }, { "epoch": 0.7648183556405354, "grad_norm": 1.8949515530814003, "learning_rate": 1.5914819033841843e-06, "loss": 0.3639, "step": 700 }, { "epoch": 0.7702813438951106, "grad_norm": 1.7568307263853036, "learning_rate": 1.5222868322024092e-06, "loss": 0.3586, "step": 705 }, { "epoch": 0.7757443321496859, "grad_norm": 1.759178792643852, "learning_rate": 1.4543585977881513e-06, "loss": 0.35, "step": 710 }, { "epoch": 0.7812073204042611, "grad_norm": 1.8973466785062572, "learning_rate": 1.3877219445584283e-06, "loss": 0.3593, "step": 715 }, { "epoch": 0.7866703086588364, "grad_norm": 1.9605293056514521, "learning_rate": 1.3224011464422032e-06, "loss": 0.3621, "step": 720 }, { "epoch": 0.7921332969134116, "grad_norm": 1.812660380797101, "learning_rate": 1.2584199980380358e-06, "loss": 0.3599, "step": 725 }, { "epoch": 0.7975962851679869, "grad_norm": 1.8356228496720945, "learning_rate": 1.1958018059463577e-06, "loss": 0.3605, "step": 730 }, { "epoch": 0.8030592734225621, "grad_norm": 1.691309969533953, "learning_rate": 1.1345693802795177e-06, "loss": 0.3583, "step": 735 }, { "epoch": 0.8085222616771374, "grad_norm": 1.8242600919765277, "learning_rate": 1.0747450263526576e-06, "loss": 0.3598, "step": 740 }, { "epoch": 0.8139852499317126, "grad_norm": 1.7765876082507979, "learning_rate": 1.0163505365585086e-06, "loss": 0.3493, "step": 745 }, { "epoch": 0.8194482381862879, "grad_norm": 1.8095739075310635, "learning_rate": 9.594071824289986e-07, "loss": 0.3571, "step": 750 }, { "epoch": 0.8249112264408631, "grad_norm": 1.7811537133839481, "learning_rate": 9.039357068866178e-07, "loss": 0.3518, "step": 755 }, { "epoch": 0.8303742146954384, "grad_norm": 1.7625847558197685, "learning_rate": 8.49956316688329e-07, "loss": 0.3518, "step": 760 }, { "epoch": 0.8358372029500136, "grad_norm": 1.8189872840713601, "learning_rate": 7.974886750647887e-07, "loss": 0.3381, "step": 765 }, { "epoch": 0.8413001912045889, "grad_norm": 1.8024026803759359, "learning_rate": 7.465518945575789e-07, "loss": 0.345, "step": 770 }, { "epoch": 0.8467631794591641, "grad_norm": 1.895342687896103, "learning_rate": 6.971645300570084e-07, "loss": 0.3464, "step": 775 }, { "epoch": 0.8522261677137394, "grad_norm": 1.796319045444646, "learning_rate": 6.49344572043083e-07, "loss": 0.3477, "step": 780 }, { "epoch": 0.8576891559683146, "grad_norm": 1.8845046027721593, "learning_rate": 6.031094400320498e-07, "loss": 0.3443, "step": 785 }, { "epoch": 0.8631521442228899, "grad_norm": 1.724637234062916, "learning_rate": 5.584759762309422e-07, "loss": 0.3367, "step": 790 }, { "epoch": 0.8686151324774651, "grad_norm": 1.8964510379949302, "learning_rate": 5.154604394024254e-07, "loss": 0.3396, "step": 795 }, { "epoch": 0.8740781207320404, "grad_norm": 1.6763296815328177, "learning_rate": 4.7407849894216097e-07, "loss": 0.3348, "step": 800 }, { "epoch": 0.8795411089866156, "grad_norm": 1.7128909727748394, "learning_rate": 4.343452291708783e-07, "loss": 0.343, "step": 805 }, { "epoch": 0.8850040972411909, "grad_norm": 1.8153223119786095, "learning_rate": 3.962751038431961e-07, "loss": 0.3405, "step": 810 }, { "epoch": 0.8904670854957661, "grad_norm": 1.7625733448640701, "learning_rate": 3.5988199087523987e-07, "loss": 0.3396, "step": 815 }, { "epoch": 0.8959300737503414, "grad_norm": 1.8570474171994868, "learning_rate": 3.2517914729292446e-07, "loss": 0.3391, "step": 820 }, { "epoch": 0.9013930620049166, "grad_norm": 1.8466215461741358, "learning_rate": 2.921792144027813e-07, "loss": 0.3407, "step": 825 }, { "epoch": 0.9068560502594919, "grad_norm": 1.7424828950437026, "learning_rate": 2.6089421318707785e-07, "loss": 0.3365, "step": 830 }, { "epoch": 0.9123190385140671, "grad_norm": 1.8348358804511151, "learning_rate": 2.3133553992488845e-07, "loss": 0.3453, "step": 835 }, { "epoch": 0.9177820267686424, "grad_norm": 1.8267323618432934, "learning_rate": 2.0351396204074946e-07, "loss": 0.337, "step": 840 }, { "epoch": 0.9232450150232177, "grad_norm": 1.6632279811187767, "learning_rate": 1.7743961418237144e-07, "loss": 0.3341, "step": 845 }, { "epoch": 0.9287080032777929, "grad_norm": 1.8291284979726596, "learning_rate": 1.5312199452886145e-07, "loss": 0.332, "step": 850 }, { "epoch": 0.9341709915323682, "grad_norm": 1.7047344118755032, "learning_rate": 1.3056996133079924e-07, "loss": 0.3375, "step": 855 }, { "epoch": 0.9396339797869434, "grad_norm": 1.745785425026002, "learning_rate": 1.0979172968340667e-07, "loss": 0.3321, "step": 860 }, { "epoch": 0.9450969680415187, "grad_norm": 1.8776860033466447, "learning_rate": 9.079486853402097e-08, "loss": 0.34, "step": 865 }, { "epoch": 0.9505599562960939, "grad_norm": 1.7269678812008018, "learning_rate": 7.358629792492522e-08, "loss": 0.342, "step": 870 }, { "epoch": 0.9560229445506692, "grad_norm": 1.7727161663929172, "learning_rate": 5.8172286472566456e-08, "loss": 0.3391, "step": 875 }, { "epoch": 0.9614859328052444, "grad_norm": 1.7673439074550763, "learning_rate": 4.455844908407059e-08, "loss": 0.3416, "step": 880 }, { "epoch": 0.9669489210598197, "grad_norm": 1.7746641186916818, "learning_rate": 3.27497449118791e-08, "loss": 0.3268, "step": 885 }, { "epoch": 0.972411909314395, "grad_norm": 1.858186055296109, "learning_rate": 2.275047554726795e-08, "loss": 0.3371, "step": 890 }, { "epoch": 0.9778748975689703, "grad_norm": 1.6930712261364957, "learning_rate": 1.4564283453392669e-08, "loss": 0.3425, "step": 895 }, { "epoch": 0.9833378858235455, "grad_norm": 1.778838215839912, "learning_rate": 8.194150638438093e-09, "loss": 0.3315, "step": 900 }, { "epoch": 0.9888008740781208, "grad_norm": 1.8180837953030582, "learning_rate": 3.6423975693533618e-09, "loss": 0.3404, "step": 905 }, { "epoch": 0.994263862332696, "grad_norm": 1.7628508134053706, "learning_rate": 9.106823265681019e-10, "loss": 0.335, "step": 910 }, { "epoch": 0.9997268505872713, "grad_norm": 1.7810702627412893, "learning_rate": 0.0, "loss": 0.3383, "step": 915 }, { "epoch": 0.9997268505872713, "eval_loss": 0.2970966100692749, "eval_runtime": 1.1854, "eval_samples_per_second": 1.687, "eval_steps_per_second": 0.844, "step": 915 }, { "epoch": 0.9997268505872713, "step": 915, "total_flos": 191530040033280.0, "train_loss": 0.4996221627042593, "train_runtime": 20787.7504, "train_samples_per_second": 1.409, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 915, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 191530040033280.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }