{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9997049277072882, "eval_steps": 500, "global_step": 10166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009835743090390479, "grad_norm": 0.05815335735678673, "learning_rate": 0.00019999032134813635, "loss": 0.9008, "step": 50 }, { "epoch": 0.019671486180780958, "grad_norm": 0.06804105639457703, "learning_rate": 0.00019995686668528316, "loss": 0.7784, "step": 100 }, { "epoch": 0.029507229271171435, "grad_norm": 0.07179820537567139, "learning_rate": 0.0001998995245149411, "loss": 0.765, "step": 150 }, { "epoch": 0.039342972361561916, "grad_norm": 0.07887151092290878, "learning_rate": 0.00019981830854063617, "loss": 0.7539, "step": 200 }, { "epoch": 0.0491787154519524, "grad_norm": 0.06816300749778748, "learning_rate": 0.00019971323817121194, "loss": 0.7355, "step": 250 }, { "epoch": 0.05901445854234287, "grad_norm": 0.0781150683760643, "learning_rate": 0.00019958433851619142, "loss": 0.7295, "step": 300 }, { "epoch": 0.06885020163273335, "grad_norm": 0.07763133943080902, "learning_rate": 0.00019943164037977625, "loss": 0.7326, "step": 350 }, { "epoch": 0.07868594472312383, "grad_norm": 0.07704794406890869, "learning_rate": 0.00019925518025348511, "loss": 0.7267, "step": 400 }, { "epoch": 0.08852168781351431, "grad_norm": 0.06781066209077835, "learning_rate": 0.00019905500030743332, "loss": 0.7203, "step": 450 }, { "epoch": 0.0983574309039048, "grad_norm": 0.06790652126073837, "learning_rate": 0.00019883114838025484, "loss": 0.7243, "step": 500 }, { "epoch": 0.10819317399429527, "grad_norm": 0.07213232666254044, "learning_rate": 0.00019858367796767002, "loss": 0.7159, "step": 550 }, { "epoch": 0.11802891708468574, "grad_norm": 0.06770846992731094, "learning_rate": 0.00019831264820970127, "loss": 0.7115, "step": 600 }, { "epoch": 0.12786466017507622, "grad_norm": 0.07667895406484604, "learning_rate": 0.00019801812387653983, "loss": 0.7135, "step": 650 }, { "epoch": 0.1377004032654667, "grad_norm": 0.06398730725049973, "learning_rate": 0.00019770017535306717, "loss": 0.707, "step": 700 }, { "epoch": 0.14753614635585718, "grad_norm": 0.0711282268166542, "learning_rate": 0.00019735887862203457, "loss": 0.7045, "step": 750 }, { "epoch": 0.15737188944624766, "grad_norm": 0.06350903958082199, "learning_rate": 0.00019699431524590477, "loss": 0.6977, "step": 800 }, { "epoch": 0.16720763253663815, "grad_norm": 0.06291402131319046, "learning_rate": 0.00019660657234736045, "loss": 0.7017, "step": 850 }, { "epoch": 0.17704337562702863, "grad_norm": 0.07249592989683151, "learning_rate": 0.00019619574258848376, "loss": 0.6997, "step": 900 }, { "epoch": 0.1868791187174191, "grad_norm": 0.07148096710443497, "learning_rate": 0.00019576192414861215, "loss": 0.6931, "step": 950 }, { "epoch": 0.1967148618078096, "grad_norm": 0.080138199031353, "learning_rate": 0.00019530522070087554, "loss": 0.6956, "step": 1000 }, { "epoch": 0.20655060489820007, "grad_norm": 0.07587867230176926, "learning_rate": 0.00019482574138742086, "loss": 0.6972, "step": 1050 }, { "epoch": 0.21638634798859055, "grad_norm": 0.0822625681757927, "learning_rate": 0.0001943236007933294, "loss": 0.6887, "step": 1100 }, { "epoch": 0.22622209107898103, "grad_norm": 0.06909151375293732, "learning_rate": 0.0001937989189192334, "loss": 0.6872, "step": 1150 }, { "epoch": 0.23605783416937148, "grad_norm": 0.06804945319890976, "learning_rate": 0.00019325182115263859, "loss": 0.6875, "step": 1200 }, { "epoch": 0.24589357725976196, "grad_norm": 0.07097342610359192, "learning_rate": 0.00019268243823795936, "loss": 0.6794, "step": 1250 }, { "epoch": 0.25572932035015244, "grad_norm": 0.06648524850606918, "learning_rate": 0.0001920909062452736, "loss": 0.6844, "step": 1300 }, { "epoch": 0.2655650634405429, "grad_norm": 0.06843659281730652, "learning_rate": 0.00019147736653780508, "loss": 0.684, "step": 1350 }, { "epoch": 0.2754008065309334, "grad_norm": 0.06251444667577744, "learning_rate": 0.00019084196573814047, "loss": 0.6856, "step": 1400 }, { "epoch": 0.2852365496213239, "grad_norm": 0.07899219542741776, "learning_rate": 0.00019018485569319003, "loss": 0.6753, "step": 1450 }, { "epoch": 0.29507229271171437, "grad_norm": 0.07931499183177948, "learning_rate": 0.00018950619343789934, "loss": 0.6808, "step": 1500 }, { "epoch": 0.30490803580210485, "grad_norm": 0.061664845794439316, "learning_rate": 0.00018880614115772138, "loss": 0.6797, "step": 1550 }, { "epoch": 0.31474377889249533, "grad_norm": 0.0717281773686409, "learning_rate": 0.00018808486614985782, "loss": 0.6774, "step": 1600 }, { "epoch": 0.3245795219828858, "grad_norm": 0.07137037068605423, "learning_rate": 0.00018734254078327868, "loss": 0.6787, "step": 1650 }, { "epoch": 0.3344152650732763, "grad_norm": 0.07462574541568756, "learning_rate": 0.0001865793424575298, "loss": 0.6836, "step": 1700 }, { "epoch": 0.34425100816366677, "grad_norm": 0.06719885766506195, "learning_rate": 0.00018579545356033823, "loss": 0.6737, "step": 1750 }, { "epoch": 0.35408675125405725, "grad_norm": 0.06752898544073105, "learning_rate": 0.00018499106142402563, "loss": 0.6767, "step": 1800 }, { "epoch": 0.36392249434444773, "grad_norm": 0.072923943400383, "learning_rate": 0.00018416635828073994, "loss": 0.6756, "step": 1850 }, { "epoch": 0.3737582374348382, "grad_norm": 0.07471544295549393, "learning_rate": 0.00018332154121651598, "loss": 0.675, "step": 1900 }, { "epoch": 0.3835939805252287, "grad_norm": 0.0654042437672615, "learning_rate": 0.00018245681212417631, "loss": 0.6662, "step": 1950 }, { "epoch": 0.3934297236156192, "grad_norm": 0.07551202178001404, "learning_rate": 0.00018157237765508325, "loss": 0.6635, "step": 2000 }, { "epoch": 0.40326546670600966, "grad_norm": 0.07835708558559418, "learning_rate": 0.00018066844916975354, "loss": 0.6701, "step": 2050 }, { "epoch": 0.41310120979640014, "grad_norm": 0.06583021581172943, "learning_rate": 0.0001797452426873481, "loss": 0.6644, "step": 2100 }, { "epoch": 0.4229369528867906, "grad_norm": 0.07056088745594025, "learning_rate": 0.0001788029788340479, "loss": 0.6696, "step": 2150 }, { "epoch": 0.4327726959771811, "grad_norm": 0.061976175755262375, "learning_rate": 0.00017784188279032932, "loss": 0.6641, "step": 2200 }, { "epoch": 0.4426084390675716, "grad_norm": 0.06996449083089828, "learning_rate": 0.00017686218423715072, "loss": 0.6666, "step": 2250 }, { "epoch": 0.45244418215796206, "grad_norm": 0.07750783860683441, "learning_rate": 0.00017586411730106399, "loss": 0.6608, "step": 2300 }, { "epoch": 0.4622799252483525, "grad_norm": 0.06929858028888702, "learning_rate": 0.00017484792049826306, "loss": 0.6693, "step": 2350 }, { "epoch": 0.47211566833874297, "grad_norm": 0.07935669273138046, "learning_rate": 0.00017381383667758416, "loss": 0.6709, "step": 2400 }, { "epoch": 0.48195141142913345, "grad_norm": 0.06911829113960266, "learning_rate": 0.00017276211296246986, "loss": 0.6592, "step": 2450 }, { "epoch": 0.49178715451952393, "grad_norm": 0.06784563511610031, "learning_rate": 0.00017169300069191224, "loss": 0.6631, "step": 2500 }, { "epoch": 0.5016228976099144, "grad_norm": 0.06838862597942352, "learning_rate": 0.00017060675536038818, "loss": 0.6685, "step": 2550 }, { "epoch": 0.5114586407003049, "grad_norm": 0.07473283261060715, "learning_rate": 0.000169503636556802, "loss": 0.6579, "step": 2600 }, { "epoch": 0.5212943837906954, "grad_norm": 0.07981903851032257, "learning_rate": 0.000168383907902449, "loss": 0.6531, "step": 2650 }, { "epoch": 0.5311301268810859, "grad_norm": 0.07383566349744797, "learning_rate": 0.00016724783698801615, "loss": 0.6552, "step": 2700 }, { "epoch": 0.5409658699714763, "grad_norm": 0.075548455119133, "learning_rate": 0.00016609569530963352, "loss": 0.665, "step": 2750 }, { "epoch": 0.5508016130618668, "grad_norm": 0.0663590133190155, "learning_rate": 0.00016492775820399258, "loss": 0.663, "step": 2800 }, { "epoch": 0.5606373561522573, "grad_norm": 0.07384879887104034, "learning_rate": 0.00016374430478254697, "loss": 0.656, "step": 2850 }, { "epoch": 0.5704730992426478, "grad_norm": 0.06946936994791031, "learning_rate": 0.00016254561786481077, "loss": 0.6585, "step": 2900 }, { "epoch": 0.5803088423330383, "grad_norm": 0.06723761558532715, "learning_rate": 0.00016133198391077096, "loss": 0.6563, "step": 2950 }, { "epoch": 0.5901445854234287, "grad_norm": 0.06926653534173965, "learning_rate": 0.00016010369295242955, "loss": 0.6542, "step": 3000 }, { "epoch": 0.5999803285138192, "grad_norm": 0.07461749017238617, "learning_rate": 0.00015886103852449254, "loss": 0.6618, "step": 3050 }, { "epoch": 0.6098160716042097, "grad_norm": 0.07314767688512802, "learning_rate": 0.00015760431759422118, "loss": 0.6509, "step": 3100 }, { "epoch": 0.6196518146946002, "grad_norm": 0.07121975719928741, "learning_rate": 0.00015633383049046365, "loss": 0.6574, "step": 3150 }, { "epoch": 0.6294875577849907, "grad_norm": 0.08515851944684982, "learning_rate": 0.00015504988083188281, "loss": 0.6576, "step": 3200 }, { "epoch": 0.6393233008753811, "grad_norm": 0.07423117756843567, "learning_rate": 0.000153752775454398, "loss": 0.6512, "step": 3250 }, { "epoch": 0.6491590439657716, "grad_norm": 0.06631764024496078, "learning_rate": 0.00015244282433785796, "loss": 0.6552, "step": 3300 }, { "epoch": 0.6589947870561621, "grad_norm": 0.06597639620304108, "learning_rate": 0.00015112034053196247, "loss": 0.6572, "step": 3350 }, { "epoch": 0.6688305301465526, "grad_norm": 0.07060743868350983, "learning_rate": 0.00014978564008145032, "loss": 0.6468, "step": 3400 }, { "epoch": 0.6786662732369431, "grad_norm": 0.07211815565824509, "learning_rate": 0.00014843904195057137, "loss": 0.6476, "step": 3450 }, { "epoch": 0.6885020163273335, "grad_norm": 0.0679902508854866, "learning_rate": 0.00014708086794686128, "loss": 0.6516, "step": 3500 }, { "epoch": 0.698337759417724, "grad_norm": 0.06677327305078506, "learning_rate": 0.00014571144264423642, "loss": 0.654, "step": 3550 }, { "epoch": 0.7081735025081145, "grad_norm": 0.07432771474123001, "learning_rate": 0.00014433109330542769, "loss": 0.6484, "step": 3600 }, { "epoch": 0.718009245598505, "grad_norm": 0.07363554835319519, "learning_rate": 0.00014294014980377213, "loss": 0.6477, "step": 3650 }, { "epoch": 0.7278449886888955, "grad_norm": 0.07398252189159393, "learning_rate": 0.00014153894454438018, "loss": 0.6494, "step": 3700 }, { "epoch": 0.737680731779286, "grad_norm": 0.06380564719438553, "learning_rate": 0.00014012781238469823, "loss": 0.6511, "step": 3750 }, { "epoch": 0.7475164748696764, "grad_norm": 0.06673400104045868, "learning_rate": 0.00013870709055448504, "loss": 0.649, "step": 3800 }, { "epoch": 0.7573522179600669, "grad_norm": 0.07433997839689255, "learning_rate": 0.00013727711857522095, "loss": 0.642, "step": 3850 }, { "epoch": 0.7671879610504574, "grad_norm": 0.07073818892240524, "learning_rate": 0.00013583823817897006, "loss": 0.6371, "step": 3900 }, { "epoch": 0.7770237041408479, "grad_norm": 0.06574368476867676, "learning_rate": 0.0001343907932267134, "loss": 0.6493, "step": 3950 }, { "epoch": 0.7868594472312384, "grad_norm": 0.067158043384552, "learning_rate": 0.00013293512962617377, "loss": 0.6433, "step": 4000 }, { "epoch": 0.7966951903216288, "grad_norm": 0.07653222978115082, "learning_rate": 0.0001314715952491514, "loss": 0.6457, "step": 4050 }, { "epoch": 0.8065309334120193, "grad_norm": 0.07320449501276016, "learning_rate": 0.0001300005398483902, "loss": 0.642, "step": 4100 }, { "epoch": 0.8163666765024098, "grad_norm": 0.07599062472581863, "learning_rate": 0.0001285223149739944, "loss": 0.6444, "step": 4150 }, { "epoch": 0.8262024195928003, "grad_norm": 0.07315944135189056, "learning_rate": 0.00012703727388941577, "loss": 0.646, "step": 4200 }, { "epoch": 0.8360381626831908, "grad_norm": 0.0766439437866211, "learning_rate": 0.00012554577148703148, "loss": 0.6391, "step": 4250 }, { "epoch": 0.8458739057735812, "grad_norm": 0.07688874751329422, "learning_rate": 0.00012404816420333247, "loss": 0.6423, "step": 4300 }, { "epoch": 0.8557096488639717, "grad_norm": 0.07058276981115341, "learning_rate": 0.0001225448099337429, "loss": 0.6473, "step": 4350 }, { "epoch": 0.8655453919543622, "grad_norm": 0.07005015760660172, "learning_rate": 0.00012103606794709112, "loss": 0.638, "step": 4400 }, { "epoch": 0.8753811350447527, "grad_norm": 0.06887346506118774, "learning_rate": 0.00011952229879975207, "loss": 0.6428, "step": 4450 }, { "epoch": 0.8852168781351432, "grad_norm": 0.06948423385620117, "learning_rate": 0.00011800386424948227, "loss": 0.6413, "step": 4500 }, { "epoch": 0.8950526212255336, "grad_norm": 0.08220444619655609, "learning_rate": 0.00011648112716896771, "loss": 0.6413, "step": 4550 }, { "epoch": 0.9048883643159241, "grad_norm": 0.07942084223031998, "learning_rate": 0.000114954451459105, "loss": 0.642, "step": 4600 }, { "epoch": 0.9147241074063146, "grad_norm": 0.074773870408535, "learning_rate": 0.00011342420196203719, "loss": 0.64, "step": 4650 }, { "epoch": 0.924559850496705, "grad_norm": 0.06527584791183472, "learning_rate": 0.00011189074437396438, "loss": 0.6448, "step": 4700 }, { "epoch": 0.9343955935870955, "grad_norm": 0.0640060305595398, "learning_rate": 0.00011035444515775035, "loss": 0.6381, "step": 4750 }, { "epoch": 0.9442313366774859, "grad_norm": 0.06932298839092255, "learning_rate": 0.00010881567145534591, "loss": 0.6424, "step": 4800 }, { "epoch": 0.9540670797678764, "grad_norm": 0.0729447677731514, "learning_rate": 0.00010727479100005005, "loss": 0.6398, "step": 4850 }, { "epoch": 0.9639028228582669, "grad_norm": 0.06698109209537506, "learning_rate": 0.00010573217202862959, "loss": 0.64, "step": 4900 }, { "epoch": 0.9737385659486574, "grad_norm": 0.0725114643573761, "learning_rate": 0.0001041881831933188, "loss": 0.6389, "step": 4950 }, { "epoch": 0.9835743090390479, "grad_norm": 0.06850885599851608, "learning_rate": 0.00010264319347371951, "loss": 0.6341, "step": 5000 }, { "epoch": 0.9934100521294383, "grad_norm": 0.07343582063913345, "learning_rate": 0.00010109757208862299, "loss": 0.6315, "step": 5050 }, { "epoch": 1.003147437788925, "grad_norm": 0.07858431339263916, "learning_rate": 9.955168840777474e-05, "loss": 0.6336, "step": 5100 }, { "epoch": 1.0129831808793155, "grad_norm": 0.07285405695438385, "learning_rate": 9.800591186360323e-05, "loss": 0.6202, "step": 5150 }, { "epoch": 1.0228189239697059, "grad_norm": 0.07421938329935074, "learning_rate": 9.646061186293367e-05, "loss": 0.6256, "step": 5200 }, { "epoch": 1.0326546670600965, "grad_norm": 0.06922327727079391, "learning_rate": 9.491615769870769e-05, "loss": 0.6214, "step": 5250 }, { "epoch": 1.0424904101504868, "grad_norm": 0.08220986276865005, "learning_rate": 9.337291846173059e-05, "loss": 0.6272, "step": 5300 }, { "epoch": 1.0523261532408774, "grad_norm": 0.06896129250526428, "learning_rate": 9.183126295246645e-05, "loss": 0.6219, "step": 5350 }, { "epoch": 1.0621618963312678, "grad_norm": 0.07372142374515533, "learning_rate": 9.029155959290319e-05, "loss": 0.6171, "step": 5400 }, { "epoch": 1.0719976394216584, "grad_norm": 0.07114165276288986, "learning_rate": 8.875417633850746e-05, "loss": 0.6189, "step": 5450 }, { "epoch": 1.0818333825120487, "grad_norm": 0.0711623951792717, "learning_rate": 8.721948059029161e-05, "loss": 0.6167, "step": 5500 }, { "epoch": 1.0916691256024393, "grad_norm": 0.07456561177968979, "learning_rate": 8.568783910701252e-05, "loss": 0.6141, "step": 5550 }, { "epoch": 1.1015048686928297, "grad_norm": 0.0760912150144577, "learning_rate": 8.415961791752472e-05, "loss": 0.6167, "step": 5600 }, { "epoch": 1.1113406117832203, "grad_norm": 0.07034651935100555, "learning_rate": 8.263518223330697e-05, "loss": 0.6125, "step": 5650 }, { "epoch": 1.1211763548736107, "grad_norm": 0.07252858579158783, "learning_rate": 8.111489636118522e-05, "loss": 0.6181, "step": 5700 }, { "epoch": 1.1310120979640013, "grad_norm": 0.07909699529409409, "learning_rate": 7.959912361627082e-05, "loss": 0.6214, "step": 5750 }, { "epoch": 1.1408478410543916, "grad_norm": 0.07374490797519684, "learning_rate": 7.808822623513643e-05, "loss": 0.625, "step": 5800 }, { "epoch": 1.1506835841447822, "grad_norm": 0.07597927004098892, "learning_rate": 7.658256528924909e-05, "loss": 0.6199, "step": 5850 }, { "epoch": 1.1605193272351726, "grad_norm": 0.07946628332138062, "learning_rate": 7.508250059868249e-05, "loss": 0.615, "step": 5900 }, { "epoch": 1.1703550703255632, "grad_norm": 0.07706974446773529, "learning_rate": 7.358839064612726e-05, "loss": 0.6126, "step": 5950 }, { "epoch": 1.1801908134159536, "grad_norm": 0.0789932906627655, "learning_rate": 7.210059249122193e-05, "loss": 0.6192, "step": 6000 }, { "epoch": 1.1900265565063441, "grad_norm": 0.08002398908138275, "learning_rate": 7.061946168522318e-05, "loss": 0.6167, "step": 6050 }, { "epoch": 1.1998622995967345, "grad_norm": 0.07844787836074829, "learning_rate": 6.914535218603708e-05, "loss": 0.6208, "step": 6100 }, { "epoch": 1.209698042687125, "grad_norm": 0.08679769188165665, "learning_rate": 6.767861627363054e-05, "loss": 0.6157, "step": 6150 }, { "epoch": 1.2195337857775155, "grad_norm": 0.07716654241085052, "learning_rate": 6.621960446584452e-05, "loss": 0.61, "step": 6200 }, { "epoch": 1.229369528867906, "grad_norm": 0.08282492309808731, "learning_rate": 6.476866543462761e-05, "loss": 0.6145, "step": 6250 }, { "epoch": 1.2392052719582964, "grad_norm": 0.09066256135702133, "learning_rate": 6.332614592271122e-05, "loss": 0.6237, "step": 6300 }, { "epoch": 1.2490410150486868, "grad_norm": 0.07888253778219223, "learning_rate": 6.18923906607455e-05, "loss": 0.6149, "step": 6350 }, { "epoch": 1.2588767581390774, "grad_norm": 0.08149804919958115, "learning_rate": 6.046774228491643e-05, "loss": 0.6166, "step": 6400 }, { "epoch": 1.268712501229468, "grad_norm": 0.07619079202413559, "learning_rate": 5.905254125506301e-05, "loss": 0.6111, "step": 6450 }, { "epoch": 1.2785482443198584, "grad_norm": 0.08567807078361511, "learning_rate": 5.76471257733151e-05, "loss": 0.6153, "step": 6500 }, { "epoch": 1.2883839874102487, "grad_norm": 0.0711495652794838, "learning_rate": 5.625183170327017e-05, "loss": 0.6126, "step": 6550 }, { "epoch": 1.2982197305006393, "grad_norm": 0.08142837882041931, "learning_rate": 5.4866992489729554e-05, "loss": 0.6212, "step": 6600 }, { "epoch": 1.30805547359103, "grad_norm": 0.07909776270389557, "learning_rate": 5.3492939079012206e-05, "loss": 0.6139, "step": 6650 }, { "epoch": 1.3178912166814203, "grad_norm": 0.08278420567512512, "learning_rate": 5.2129999839865796e-05, "loss": 0.6115, "step": 6700 }, { "epoch": 1.3277269597718107, "grad_norm": 0.08086064457893372, "learning_rate": 5.077850048499388e-05, "loss": 0.6111, "step": 6750 }, { "epoch": 1.3375627028622012, "grad_norm": 0.07711977511644363, "learning_rate": 4.9438763993217495e-05, "loss": 0.6101, "step": 6800 }, { "epoch": 1.3473984459525918, "grad_norm": 0.08161487430334091, "learning_rate": 4.811111053229043e-05, "loss": 0.6167, "step": 6850 }, { "epoch": 1.3572341890429822, "grad_norm": 0.0841764435172081, "learning_rate": 4.6795857382386044e-05, "loss": 0.6121, "step": 6900 }, { "epoch": 1.3670699321333726, "grad_norm": 0.0816132128238678, "learning_rate": 4.549331886027429e-05, "loss": 0.6078, "step": 6950 }, { "epoch": 1.3769056752237632, "grad_norm": 0.08330381661653519, "learning_rate": 4.4203806244206756e-05, "loss": 0.6195, "step": 7000 }, { "epoch": 1.3867414183141538, "grad_norm": 0.0845024362206459, "learning_rate": 4.292762769952816e-05, "loss": 0.6133, "step": 7050 }, { "epoch": 1.3965771614045441, "grad_norm": 0.07662446796894073, "learning_rate": 4.1665088205031334e-05, "loss": 0.6129, "step": 7100 }, { "epoch": 1.4064129044949345, "grad_norm": 0.08240839838981628, "learning_rate": 4.041648948007416e-05, "loss": 0.6156, "step": 7150 }, { "epoch": 1.416248647585325, "grad_norm": 0.07303918898105621, "learning_rate": 3.918212991247514e-05, "loss": 0.6156, "step": 7200 }, { "epoch": 1.4260843906757157, "grad_norm": 0.07239814847707748, "learning_rate": 3.796230448720526e-05, "loss": 0.6131, "step": 7250 }, { "epoch": 1.435920133766106, "grad_norm": 0.07538265734910965, "learning_rate": 3.675730471589286e-05, "loss": 0.6116, "step": 7300 }, { "epoch": 1.4457558768564964, "grad_norm": 0.08947084844112396, "learning_rate": 3.556741856715907e-05, "loss": 0.6147, "step": 7350 }, { "epoch": 1.455591619946887, "grad_norm": 0.08240395039319992, "learning_rate": 3.4392930397799194e-05, "loss": 0.6125, "step": 7400 }, { "epoch": 1.4654273630372774, "grad_norm": 0.07328338176012039, "learning_rate": 3.3234120884828e-05, "loss": 0.6041, "step": 7450 }, { "epoch": 1.475263106127668, "grad_norm": 0.07761271297931671, "learning_rate": 3.209126695840382e-05, "loss": 0.6116, "step": 7500 }, { "epoch": 1.4850988492180583, "grad_norm": 0.084846630692482, "learning_rate": 3.0964641735648423e-05, "loss": 0.6141, "step": 7550 }, { "epoch": 1.494934592308449, "grad_norm": 0.08141667395830154, "learning_rate": 2.9854514455377837e-05, "loss": 0.6073, "step": 7600 }, { "epoch": 1.5047703353988395, "grad_norm": 0.07505550980567932, "learning_rate": 2.876115041376034e-05, "loss": 0.6091, "step": 7650 }, { "epoch": 1.51460607848923, "grad_norm": 0.07819739729166031, "learning_rate": 2.7684810900916315e-05, "loss": 0.6141, "step": 7700 }, { "epoch": 1.5244418215796203, "grad_norm": 0.07576938718557358, "learning_rate": 2.6625753138475718e-05, "loss": 0.6046, "step": 7750 }, { "epoch": 1.5342775646700109, "grad_norm": 0.07528570294380188, "learning_rate": 2.5584230218107718e-05, "loss": 0.6114, "step": 7800 }, { "epoch": 1.5441133077604015, "grad_norm": 0.08330899477005005, "learning_rate": 2.456049104103726e-05, "loss": 0.609, "step": 7850 }, { "epoch": 1.5539490508507918, "grad_norm": 0.0729718878865242, "learning_rate": 2.3554780258563125e-05, "loss": 0.6138, "step": 7900 }, { "epoch": 1.5637847939411822, "grad_norm": 0.07709172368049622, "learning_rate": 2.256733821359168e-05, "loss": 0.6038, "step": 7950 }, { "epoch": 1.5736205370315728, "grad_norm": 0.07105692476034164, "learning_rate": 2.1598400883200065e-05, "loss": 0.6078, "step": 8000 }, { "epoch": 1.5834562801219634, "grad_norm": 0.0719430074095726, "learning_rate": 2.0648199822242953e-05, "loss": 0.6121, "step": 8050 }, { "epoch": 1.5932920232123537, "grad_norm": 0.08640766143798828, "learning_rate": 1.971696210801589e-05, "loss": 0.5973, "step": 8100 }, { "epoch": 1.6031277663027441, "grad_norm": 0.07854746282100677, "learning_rate": 1.8804910285988885e-05, "loss": 0.6176, "step": 8150 }, { "epoch": 1.6129635093931345, "grad_norm": 0.09165063500404358, "learning_rate": 1.7912262316622753e-05, "loss": 0.6112, "step": 8200 }, { "epoch": 1.622799252483525, "grad_norm": 0.07630006968975067, "learning_rate": 1.703923152328145e-05, "loss": 0.6053, "step": 8250 }, { "epoch": 1.6326349955739157, "grad_norm": 0.0875934585928917, "learning_rate": 1.6186026541252452e-05, "loss": 0.6082, "step": 8300 }, { "epoch": 1.642470738664306, "grad_norm": 0.08449984341859818, "learning_rate": 1.5352851267887423e-05, "loss": 0.6139, "step": 8350 }, { "epoch": 1.6523064817546964, "grad_norm": 0.07857895642518997, "learning_rate": 1.453990481387526e-05, "loss": 0.6134, "step": 8400 }, { "epoch": 1.662142224845087, "grad_norm": 0.0947275385260582, "learning_rate": 1.3747381455658848e-05, "loss": 0.6018, "step": 8450 }, { "epoch": 1.6719779679354776, "grad_norm": 0.08134233206510544, "learning_rate": 1.2975470589007454e-05, "loss": 0.6127, "step": 8500 }, { "epoch": 1.681813711025868, "grad_norm": 0.07631494104862213, "learning_rate": 1.2224356683755089e-05, "loss": 0.6105, "step": 8550 }, { "epoch": 1.6916494541162583, "grad_norm": 0.07870069146156311, "learning_rate": 1.1494219239716353e-05, "loss": 0.6076, "step": 8600 }, { "epoch": 1.701485197206649, "grad_norm": 0.08626607805490494, "learning_rate": 1.0785232743789808e-05, "loss": 0.6102, "step": 8650 }, { "epoch": 1.7113209402970395, "grad_norm": 0.08474079519510269, "learning_rate": 1.0097566628259614e-05, "loss": 0.6018, "step": 8700 }, { "epoch": 1.7211566833874299, "grad_norm": 0.08202967047691345, "learning_rate": 9.431385230304613e-06, "loss": 0.6016, "step": 8750 }, { "epoch": 1.7309924264778203, "grad_norm": 0.07875273376703262, "learning_rate": 8.786847752725614e-06, "loss": 0.616, "step": 8800 }, { "epoch": 1.7408281695682108, "grad_norm": 0.09360364079475403, "learning_rate": 8.164108225899214e-06, "loss": 0.6046, "step": 8850 }, { "epoch": 1.7506639126586014, "grad_norm": 0.07391002029180527, "learning_rate": 7.5633154709680575e-06, "loss": 0.6117, "step": 8900 }, { "epoch": 1.7604996557489918, "grad_norm": 0.06737073510885239, "learning_rate": 6.984613064275658e-06, "loss": 0.6034, "step": 8950 }, { "epoch": 1.7703353988393822, "grad_norm": 0.0800662711262703, "learning_rate": 6.428139303054981e-06, "loss": 0.6054, "step": 9000 }, { "epoch": 1.7801711419297728, "grad_norm": 0.07634767144918442, "learning_rate": 5.894027172378247e-06, "loss": 0.6041, "step": 9050 }, { "epoch": 1.7900068850201634, "grad_norm": 0.0848456546664238, "learning_rate": 5.382404313376555e-06, "loss": 0.6013, "step": 9100 }, { "epoch": 1.7998426281105537, "grad_norm": 0.08109795302152634, "learning_rate": 4.893392992736301e-06, "loss": 0.6037, "step": 9150 }, { "epoch": 1.809678371200944, "grad_norm": 0.08365663886070251, "learning_rate": 4.427110073480245e-06, "loss": 0.604, "step": 9200 }, { "epoch": 1.8195141142913347, "grad_norm": 0.07841967046260834, "learning_rate": 3.983666987039691e-06, "loss": 0.602, "step": 9250 }, { "epoch": 1.8293498573817253, "grad_norm": 0.07770159840583801, "learning_rate": 3.5631697066249024e-06, "loss": 0.6051, "step": 9300 }, { "epoch": 1.8391856004721157, "grad_norm": 0.07518593221902847, "learning_rate": 3.1657187218997464e-06, "loss": 0.6065, "step": 9350 }, { "epoch": 1.849021343562506, "grad_norm": 0.08144789189100266, "learning_rate": 2.791409014966895e-06, "loss": 0.6096, "step": 9400 }, { "epoch": 1.8588570866528966, "grad_norm": 0.08269818127155304, "learning_rate": 2.4403300376691473e-06, "loss": 0.61, "step": 9450 }, { "epoch": 1.8686928297432872, "grad_norm": 0.07822602987289429, "learning_rate": 2.112565690212465e-06, "loss": 0.6063, "step": 9500 }, { "epoch": 1.8785285728336776, "grad_norm": 0.09586796164512634, "learning_rate": 1.8081943011155623e-06, "loss": 0.6025, "step": 9550 }, { "epoch": 1.888364315924068, "grad_norm": 0.0846213847398758, "learning_rate": 1.527288608491173e-06, "loss": 0.6098, "step": 9600 }, { "epoch": 1.8982000590144585, "grad_norm": 0.08927745372056961, "learning_rate": 1.2699157426631858e-06, "loss": 0.6077, "step": 9650 }, { "epoch": 1.9080358021048491, "grad_norm": 0.07482849806547165, "learning_rate": 1.0361372101239997e-06, "loss": 0.6045, "step": 9700 }, { "epoch": 1.9178715451952395, "grad_norm": 0.07779071480035782, "learning_rate": 8.260088788357711e-07, "loss": 0.6068, "step": 9750 }, { "epoch": 1.9277072882856299, "grad_norm": 0.079287588596344, "learning_rate": 6.395809648792384e-07, "loss": 0.6053, "step": 9800 }, { "epoch": 1.9375430313760205, "grad_norm": 0.08390713483095169, "learning_rate": 4.768980204531426e-07, "loss": 0.6083, "step": 9850 }, { "epoch": 1.947378774466411, "grad_norm": 0.07710904628038406, "learning_rate": 3.3799892322726735e-07, "loss": 0.6058, "step": 9900 }, { "epoch": 1.9572145175568014, "grad_norm": 0.08125531673431396, "learning_rate": 2.229168670514481e-07, "loss": 0.6087, "step": 9950 }, { "epoch": 1.9670502606471918, "grad_norm": 0.07178232818841934, "learning_rate": 1.3167935402305098e-07, "loss": 0.6064, "step": 10000 }, { "epoch": 1.9768860037375824, "grad_norm": 0.08764079213142395, "learning_rate": 6.430818791451909e-08, "loss": 0.6119, "step": 10050 }, { "epoch": 1.986721746827973, "grad_norm": 0.07766247540712357, "learning_rate": 2.0819468962773868e-08, "loss": 0.6038, "step": 10100 }, { "epoch": 1.9965574899183633, "grad_norm": 0.08261710405349731, "learning_rate": 1.2235900215817885e-09, "loss": 0.6115, "step": 10150 } ], "logging_steps": 50, "max_steps": 10166, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.464591551750786e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }