{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8972542072630647, "eval_steps": 2000, "global_step": 22000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00354295837023915, "grad_norm": 10.283039659237906, "learning_rate": 3.5423308537017364e-08, "loss": 1.5344, "step": 20 }, { "epoch": 0.0070859167404783, "grad_norm": 10.29248009558218, "learning_rate": 7.084661707403473e-08, "loss": 1.4742, "step": 40 }, { "epoch": 0.010628875110717449, "grad_norm": 9.936150778118938, "learning_rate": 1.0626992561105209e-07, "loss": 1.4303, "step": 60 }, { "epoch": 0.0141718334809566, "grad_norm": 10.764517644587348, "learning_rate": 1.4169323414806946e-07, "loss": 1.41, "step": 80 }, { "epoch": 0.01771479185119575, "grad_norm": 7.66807491548178, "learning_rate": 1.7711654268508678e-07, "loss": 1.3917, "step": 100 }, { "epoch": 0.021257750221434897, "grad_norm": 4.50717632446115, "learning_rate": 2.1253985122210417e-07, "loss": 1.4334, "step": 120 }, { "epoch": 0.024800708591674048, "grad_norm": 5.252802352230719, "learning_rate": 2.479631597591215e-07, "loss": 1.3535, "step": 140 }, { "epoch": 0.0283436669619132, "grad_norm": 6.854461866498012, "learning_rate": 2.833864682961389e-07, "loss": 1.327, "step": 160 }, { "epoch": 0.03188662533215235, "grad_norm": 5.151839746898103, "learning_rate": 3.188097768331563e-07, "loss": 1.3742, "step": 180 }, { "epoch": 0.0354295837023915, "grad_norm": 5.045147162247824, "learning_rate": 3.5423308537017355e-07, "loss": 1.3332, "step": 200 }, { "epoch": 0.03897254207263064, "grad_norm": 4.683603653905373, "learning_rate": 3.89656393907191e-07, "loss": 1.2738, "step": 220 }, { "epoch": 0.042515500442869794, "grad_norm": 3.3123783143845853, "learning_rate": 4.2507970244420835e-07, "loss": 1.3093, "step": 240 }, { "epoch": 0.046058458813108945, "grad_norm": 7.022433229508388, "learning_rate": 4.605030109812257e-07, "loss": 1.3152, "step": 260 }, { "epoch": 0.049601417183348095, "grad_norm": 4.657117511243846, "learning_rate": 4.95926319518243e-07, "loss": 1.2072, "step": 280 }, { "epoch": 0.053144375553587246, "grad_norm": 3.8212763356692094, "learning_rate": 5.313496280552604e-07, "loss": 1.2571, "step": 300 }, { "epoch": 0.0566873339238264, "grad_norm": 3.510893973188609, "learning_rate": 5.667729365922778e-07, "loss": 1.2607, "step": 320 }, { "epoch": 0.06023029229406555, "grad_norm": 5.0086319180776195, "learning_rate": 6.021962451292952e-07, "loss": 1.197, "step": 340 }, { "epoch": 0.0637732506643047, "grad_norm": 3.431634184610715, "learning_rate": 6.376195536663126e-07, "loss": 1.2377, "step": 360 }, { "epoch": 0.06731620903454384, "grad_norm": 6.75935217841033, "learning_rate": 6.730428622033298e-07, "loss": 1.1773, "step": 380 }, { "epoch": 0.070859167404783, "grad_norm": 4.209102695925099, "learning_rate": 7.084661707403471e-07, "loss": 1.219, "step": 400 }, { "epoch": 0.07440212577502214, "grad_norm": 3.49834326636571, "learning_rate": 7.438894792773646e-07, "loss": 1.1681, "step": 420 }, { "epoch": 0.07794508414526129, "grad_norm": 6.9169037458817515, "learning_rate": 7.79312787814382e-07, "loss": 1.159, "step": 440 }, { "epoch": 0.08148804251550044, "grad_norm": 4.117418143738441, "learning_rate": 8.147360963513992e-07, "loss": 1.1361, "step": 460 }, { "epoch": 0.08503100088573959, "grad_norm": 3.878686980846809, "learning_rate": 8.501594048884167e-07, "loss": 1.1408, "step": 480 }, { "epoch": 0.08857395925597875, "grad_norm": 2.4629212636233513, "learning_rate": 8.85582713425434e-07, "loss": 1.1273, "step": 500 }, { "epoch": 0.09211691762621789, "grad_norm": 3.5335784704034263, "learning_rate": 9.210060219624514e-07, "loss": 1.1072, "step": 520 }, { "epoch": 0.09565987599645705, "grad_norm": 2.993397755869922, "learning_rate": 9.564293304994688e-07, "loss": 1.1374, "step": 540 }, { "epoch": 0.09920283436669619, "grad_norm": 3.3645219648004074, "learning_rate": 9.91852639036486e-07, "loss": 1.0846, "step": 560 }, { "epoch": 0.10274579273693533, "grad_norm": 3.5598287380775657, "learning_rate": 1.0272759475735035e-06, "loss": 1.1543, "step": 580 }, { "epoch": 0.10628875110717449, "grad_norm": 5.756341944728728, "learning_rate": 1.0626992561105207e-06, "loss": 1.1218, "step": 600 }, { "epoch": 0.10983170947741364, "grad_norm": 4.622099244697303, "learning_rate": 1.098122564647538e-06, "loss": 1.1217, "step": 620 }, { "epoch": 0.1133746678476528, "grad_norm": 4.036105893234525, "learning_rate": 1.1335458731845557e-06, "loss": 1.1389, "step": 640 }, { "epoch": 0.11691762621789194, "grad_norm": 4.152917559749059, "learning_rate": 1.1689691817215728e-06, "loss": 1.1605, "step": 660 }, { "epoch": 0.1204605845881311, "grad_norm": 2.353928725309983, "learning_rate": 1.2043924902585904e-06, "loss": 1.1468, "step": 680 }, { "epoch": 0.12400354295837024, "grad_norm": 3.628879073452012, "learning_rate": 1.2398157987956076e-06, "loss": 1.1481, "step": 700 }, { "epoch": 0.1275465013286094, "grad_norm": 4.198170873976199, "learning_rate": 1.2752391073326251e-06, "loss": 1.1093, "step": 720 }, { "epoch": 0.13108945969884853, "grad_norm": 3.269113557624318, "learning_rate": 1.3106624158696423e-06, "loss": 1.102, "step": 740 }, { "epoch": 0.13463241806908768, "grad_norm": 4.528204224506456, "learning_rate": 1.3460857244066597e-06, "loss": 1.0894, "step": 760 }, { "epoch": 0.13817537643932684, "grad_norm": 3.27137530368028, "learning_rate": 1.381509032943677e-06, "loss": 1.0747, "step": 780 }, { "epoch": 0.141718334809566, "grad_norm": 7.233836600777667, "learning_rate": 1.4169323414806942e-06, "loss": 1.1337, "step": 800 }, { "epoch": 0.14526129317980513, "grad_norm": 2.8025481056888815, "learning_rate": 1.4523556500177118e-06, "loss": 1.0662, "step": 820 }, { "epoch": 0.1488042515500443, "grad_norm": 3.388696427420553, "learning_rate": 1.4877789585547292e-06, "loss": 1.0438, "step": 840 }, { "epoch": 0.15234720992028344, "grad_norm": 4.710208067024261, "learning_rate": 1.5232022670917465e-06, "loss": 1.1523, "step": 860 }, { "epoch": 0.15589016829052257, "grad_norm": 3.564554055568693, "learning_rate": 1.558625575628764e-06, "loss": 1.1362, "step": 880 }, { "epoch": 0.15943312666076173, "grad_norm": 4.195782034527705, "learning_rate": 1.594048884165781e-06, "loss": 1.1197, "step": 900 }, { "epoch": 0.1629760850310009, "grad_norm": 5.136529290856518, "learning_rate": 1.6294721927027984e-06, "loss": 1.0747, "step": 920 }, { "epoch": 0.16651904340124005, "grad_norm": 3.0425557174200875, "learning_rate": 1.664895501239816e-06, "loss": 1.0078, "step": 940 }, { "epoch": 0.17006200177147918, "grad_norm": 4.70753429709887, "learning_rate": 1.7003188097768334e-06, "loss": 1.0692, "step": 960 }, { "epoch": 0.17360496014171833, "grad_norm": 2.352046892407463, "learning_rate": 1.7357421183138505e-06, "loss": 1.1146, "step": 980 }, { "epoch": 0.1771479185119575, "grad_norm": 4.251351248901323, "learning_rate": 1.771165426850868e-06, "loss": 1.1281, "step": 1000 }, { "epoch": 0.18069087688219662, "grad_norm": 3.1244602580086203, "learning_rate": 1.8065887353878853e-06, "loss": 1.0573, "step": 1020 }, { "epoch": 0.18423383525243578, "grad_norm": 3.7998767293490014, "learning_rate": 1.8420120439249029e-06, "loss": 1.0572, "step": 1040 }, { "epoch": 0.18777679362267494, "grad_norm": 4.226400750966905, "learning_rate": 1.8774353524619202e-06, "loss": 1.0907, "step": 1060 }, { "epoch": 0.1913197519929141, "grad_norm": 8.680035093004253, "learning_rate": 1.9128586609989376e-06, "loss": 1.1316, "step": 1080 }, { "epoch": 0.19486271036315322, "grad_norm": 5.6932973340030175, "learning_rate": 1.9482819695359548e-06, "loss": 1.0741, "step": 1100 }, { "epoch": 0.19840566873339238, "grad_norm": 4.008840499431025, "learning_rate": 1.983705278072972e-06, "loss": 1.0599, "step": 1120 }, { "epoch": 0.20194862710363154, "grad_norm": 4.7089915409873555, "learning_rate": 2.0191285866099895e-06, "loss": 1.1045, "step": 1140 }, { "epoch": 0.20549158547387067, "grad_norm": 3.8841603187249665, "learning_rate": 2.054551895147007e-06, "loss": 1.1013, "step": 1160 }, { "epoch": 0.20903454384410983, "grad_norm": 2.5908607447164256, "learning_rate": 2.0899752036840243e-06, "loss": 1.0688, "step": 1180 }, { "epoch": 0.21257750221434898, "grad_norm": 4.285794532561674, "learning_rate": 2.1253985122210414e-06, "loss": 1.044, "step": 1200 }, { "epoch": 0.21612046058458814, "grad_norm": 5.061151481127176, "learning_rate": 2.160821820758059e-06, "loss": 1.0987, "step": 1220 }, { "epoch": 0.21966341895482727, "grad_norm": 2.4272321583338945, "learning_rate": 2.196245129295076e-06, "loss": 1.0697, "step": 1240 }, { "epoch": 0.22320637732506643, "grad_norm": 3.516050346917228, "learning_rate": 2.2316684378320937e-06, "loss": 1.0373, "step": 1260 }, { "epoch": 0.2267493356953056, "grad_norm": 5.174609559420662, "learning_rate": 2.2670917463691113e-06, "loss": 1.0212, "step": 1280 }, { "epoch": 0.23029229406554472, "grad_norm": 5.096030335997553, "learning_rate": 2.3025150549061285e-06, "loss": 1.1042, "step": 1300 }, { "epoch": 0.23383525243578387, "grad_norm": 3.21646646241324, "learning_rate": 2.3379383634431456e-06, "loss": 1.074, "step": 1320 }, { "epoch": 0.23737821080602303, "grad_norm": 4.102524624460841, "learning_rate": 2.3733616719801632e-06, "loss": 1.1245, "step": 1340 }, { "epoch": 0.2409211691762622, "grad_norm": 3.537479639297508, "learning_rate": 2.408784980517181e-06, "loss": 1.1179, "step": 1360 }, { "epoch": 0.24446412754650132, "grad_norm": 3.886486819810854, "learning_rate": 2.444208289054198e-06, "loss": 1.0651, "step": 1380 }, { "epoch": 0.24800708591674048, "grad_norm": 4.034282862676682, "learning_rate": 2.479631597591215e-06, "loss": 1.02, "step": 1400 }, { "epoch": 0.25155004428697963, "grad_norm": 3.0945919872830663, "learning_rate": 2.5150549061282327e-06, "loss": 1.0985, "step": 1420 }, { "epoch": 0.2550930026572188, "grad_norm": 3.634569400423284, "learning_rate": 2.5504782146652503e-06, "loss": 1.0805, "step": 1440 }, { "epoch": 0.25863596102745795, "grad_norm": 3.920774204411743, "learning_rate": 2.585901523202267e-06, "loss": 1.0352, "step": 1460 }, { "epoch": 0.26217891939769705, "grad_norm": 3.650545041007239, "learning_rate": 2.6213248317392846e-06, "loss": 1.0575, "step": 1480 }, { "epoch": 0.2657218777679362, "grad_norm": 3.8026274989044793, "learning_rate": 2.6567481402763018e-06, "loss": 1.0806, "step": 1500 }, { "epoch": 0.26926483613817537, "grad_norm": 4.1381541786166895, "learning_rate": 2.6921714488133194e-06, "loss": 1.0501, "step": 1520 }, { "epoch": 0.2728077945084145, "grad_norm": 5.519250816332529, "learning_rate": 2.727594757350337e-06, "loss": 1.0583, "step": 1540 }, { "epoch": 0.2763507528786537, "grad_norm": 4.595209023098072, "learning_rate": 2.763018065887354e-06, "loss": 1.0662, "step": 1560 }, { "epoch": 0.27989371124889284, "grad_norm": 3.540003351520752, "learning_rate": 2.7984413744243717e-06, "loss": 1.0286, "step": 1580 }, { "epoch": 0.283436669619132, "grad_norm": 3.4373968392712184, "learning_rate": 2.8338646829613884e-06, "loss": 1.0434, "step": 1600 }, { "epoch": 0.2869796279893711, "grad_norm": 17.453104932319967, "learning_rate": 2.869287991498406e-06, "loss": 1.0223, "step": 1620 }, { "epoch": 0.29052258635961026, "grad_norm": 2.6061492522441863, "learning_rate": 2.9047113000354236e-06, "loss": 1.0935, "step": 1640 }, { "epoch": 0.2940655447298494, "grad_norm": 4.139323586910726, "learning_rate": 2.9401346085724407e-06, "loss": 1.0754, "step": 1660 }, { "epoch": 0.2976085031000886, "grad_norm": 5.300424892826558, "learning_rate": 2.9755579171094583e-06, "loss": 1.0681, "step": 1680 }, { "epoch": 0.30115146147032773, "grad_norm": 3.649885398648624, "learning_rate": 3.0109812256464755e-06, "loss": 1.0922, "step": 1700 }, { "epoch": 0.3046944198405669, "grad_norm": 4.140426538668616, "learning_rate": 3.046404534183493e-06, "loss": 1.0644, "step": 1720 }, { "epoch": 0.30823737821080605, "grad_norm": 2.175231115055194, "learning_rate": 3.0818278427205106e-06, "loss": 1.0494, "step": 1740 }, { "epoch": 0.31178033658104515, "grad_norm": 3.028695259816387, "learning_rate": 3.117251151257528e-06, "loss": 1.0932, "step": 1760 }, { "epoch": 0.3153232949512843, "grad_norm": 2.7850683084236394, "learning_rate": 3.1526744597945454e-06, "loss": 1.0287, "step": 1780 }, { "epoch": 0.31886625332152346, "grad_norm": 2.933892885639913, "learning_rate": 3.188097768331562e-06, "loss": 1.0669, "step": 1800 }, { "epoch": 0.3224092116917626, "grad_norm": 4.948366022661806, "learning_rate": 3.2235210768685797e-06, "loss": 1.0984, "step": 1820 }, { "epoch": 0.3259521700620018, "grad_norm": 3.086993856127569, "learning_rate": 3.258944385405597e-06, "loss": 1.0421, "step": 1840 }, { "epoch": 0.32949512843224094, "grad_norm": 4.135810740344135, "learning_rate": 3.2943676939426144e-06, "loss": 0.9316, "step": 1860 }, { "epoch": 0.3330380868024801, "grad_norm": 2.7787248572400673, "learning_rate": 3.329791002479632e-06, "loss": 1.0354, "step": 1880 }, { "epoch": 0.3365810451727192, "grad_norm": 6.6200330325040815, "learning_rate": 3.365214311016649e-06, "loss": 1.0825, "step": 1900 }, { "epoch": 0.34012400354295835, "grad_norm": 3.9618002923502607, "learning_rate": 3.4006376195536668e-06, "loss": 1.0643, "step": 1920 }, { "epoch": 0.3436669619131975, "grad_norm": 2.9478604282057987, "learning_rate": 3.4360609280906835e-06, "loss": 1.0942, "step": 1940 }, { "epoch": 0.34720992028343667, "grad_norm": 3.1696939381732596, "learning_rate": 3.471484236627701e-06, "loss": 1.0848, "step": 1960 }, { "epoch": 0.3507528786536758, "grad_norm": 2.610545027614052, "learning_rate": 3.5069075451647187e-06, "loss": 1.0424, "step": 1980 }, { "epoch": 0.354295837023915, "grad_norm": 2.8653023342432844, "learning_rate": 3.542330853701736e-06, "loss": 1.0376, "step": 2000 }, { "epoch": 0.354295837023915, "eval_loss": 0.9136635661125183, "eval_runtime": 366.8623, "eval_samples_per_second": 25.914, "eval_steps_per_second": 3.241, "step": 2000 }, { "epoch": 0.35783879539415414, "grad_norm": 5.501631748880912, "learning_rate": 3.5777541622387534e-06, "loss": 1.0794, "step": 2020 }, { "epoch": 0.36138175376439324, "grad_norm": 3.9781584018724216, "learning_rate": 3.6131774707757706e-06, "loss": 1.0918, "step": 2040 }, { "epoch": 0.3649247121346324, "grad_norm": 5.9653615606161035, "learning_rate": 3.648600779312788e-06, "loss": 1.0281, "step": 2060 }, { "epoch": 0.36846767050487156, "grad_norm": 4.911079902501515, "learning_rate": 3.6840240878498057e-06, "loss": 1.0565, "step": 2080 }, { "epoch": 0.3720106288751107, "grad_norm": 6.677202780526525, "learning_rate": 3.719447396386823e-06, "loss": 1.0622, "step": 2100 }, { "epoch": 0.3755535872453499, "grad_norm": 2.9957559478511513, "learning_rate": 3.7548707049238405e-06, "loss": 1.014, "step": 2120 }, { "epoch": 0.37909654561558903, "grad_norm": 6.136487010459827, "learning_rate": 3.7902940134608572e-06, "loss": 1.0463, "step": 2140 }, { "epoch": 0.3826395039858282, "grad_norm": 2.6989289543608987, "learning_rate": 3.825717321997875e-06, "loss": 1.0334, "step": 2160 }, { "epoch": 0.3861824623560673, "grad_norm": 2.8559280148544715, "learning_rate": 3.861140630534892e-06, "loss": 0.9961, "step": 2180 }, { "epoch": 0.38972542072630645, "grad_norm": 3.9195236355689618, "learning_rate": 3.8965639390719095e-06, "loss": 1.0501, "step": 2200 }, { "epoch": 0.3932683790965456, "grad_norm": 4.745715075717865, "learning_rate": 3.9319872476089276e-06, "loss": 1.0532, "step": 2220 }, { "epoch": 0.39681133746678476, "grad_norm": 3.1678711880303365, "learning_rate": 3.967410556145944e-06, "loss": 1.0464, "step": 2240 }, { "epoch": 0.4003542958370239, "grad_norm": 4.318491084289353, "learning_rate": 4.002833864682962e-06, "loss": 1.01, "step": 2260 }, { "epoch": 0.4038972542072631, "grad_norm": 3.877214772420464, "learning_rate": 4.038257173219979e-06, "loss": 1.04, "step": 2280 }, { "epoch": 0.40744021257750224, "grad_norm": 4.408726611386237, "learning_rate": 4.073680481756996e-06, "loss": 0.9864, "step": 2300 }, { "epoch": 0.41098317094774134, "grad_norm": 3.1476639776400264, "learning_rate": 4.109103790294014e-06, "loss": 1.0471, "step": 2320 }, { "epoch": 0.4145261293179805, "grad_norm": 4.057188755394368, "learning_rate": 4.144527098831031e-06, "loss": 1.0187, "step": 2340 }, { "epoch": 0.41806908768821965, "grad_norm": 3.7443003760493547, "learning_rate": 4.1799504073680485e-06, "loss": 1.0207, "step": 2360 }, { "epoch": 0.4216120460584588, "grad_norm": 3.4133153204439375, "learning_rate": 4.215373715905066e-06, "loss": 1.0412, "step": 2380 }, { "epoch": 0.42515500442869797, "grad_norm": 5.271529700638458, "learning_rate": 4.250797024442083e-06, "loss": 1.0446, "step": 2400 }, { "epoch": 0.4286979627989371, "grad_norm": 3.690737939017104, "learning_rate": 4.286220332979101e-06, "loss": 1.0893, "step": 2420 }, { "epoch": 0.4322409211691763, "grad_norm": 4.6971388539053445, "learning_rate": 4.321643641516118e-06, "loss": 1.0552, "step": 2440 }, { "epoch": 0.4357838795394154, "grad_norm": 3.833304687965468, "learning_rate": 4.357066950053135e-06, "loss": 1.018, "step": 2460 }, { "epoch": 0.43932683790965454, "grad_norm": 3.876707930916304, "learning_rate": 4.392490258590152e-06, "loss": 1.0593, "step": 2480 }, { "epoch": 0.4428697962798937, "grad_norm": 4.485093155652708, "learning_rate": 4.42791356712717e-06, "loss": 1.0315, "step": 2500 }, { "epoch": 0.44641275465013286, "grad_norm": 3.240309715459973, "learning_rate": 4.4633368756641875e-06, "loss": 1.0228, "step": 2520 }, { "epoch": 0.449955713020372, "grad_norm": 2.8577216948048085, "learning_rate": 4.498760184201205e-06, "loss": 0.9622, "step": 2540 }, { "epoch": 0.4534986713906112, "grad_norm": 2.3204510234528004, "learning_rate": 4.534183492738223e-06, "loss": 1.0417, "step": 2560 }, { "epoch": 0.45704162976085033, "grad_norm": 4.8495156054088655, "learning_rate": 4.569606801275239e-06, "loss": 1.0108, "step": 2580 }, { "epoch": 0.46058458813108943, "grad_norm": 5.060714565551563, "learning_rate": 4.605030109812257e-06, "loss": 1.0303, "step": 2600 }, { "epoch": 0.4641275465013286, "grad_norm": 6.095607076544207, "learning_rate": 4.640453418349274e-06, "loss": 1.0116, "step": 2620 }, { "epoch": 0.46767050487156775, "grad_norm": 2.9232128503389183, "learning_rate": 4.675876726886291e-06, "loss": 0.983, "step": 2640 }, { "epoch": 0.4712134632418069, "grad_norm": 2.36685887518906, "learning_rate": 4.711300035423309e-06, "loss": 1.0277, "step": 2660 }, { "epoch": 0.47475642161204606, "grad_norm": 4.128356071985117, "learning_rate": 4.7467233439603264e-06, "loss": 1.0411, "step": 2680 }, { "epoch": 0.4782993799822852, "grad_norm": 2.7297669503368804, "learning_rate": 4.782146652497344e-06, "loss": 1.0553, "step": 2700 }, { "epoch": 0.4818423383525244, "grad_norm": 2.7607226135533103, "learning_rate": 4.817569961034362e-06, "loss": 1.0547, "step": 2720 }, { "epoch": 0.4853852967227635, "grad_norm": 4.187021213318743, "learning_rate": 4.852993269571378e-06, "loss": 1.0437, "step": 2740 }, { "epoch": 0.48892825509300264, "grad_norm": 3.7584391728566695, "learning_rate": 4.888416578108396e-06, "loss": 1.035, "step": 2760 }, { "epoch": 0.4924712134632418, "grad_norm": 4.830912639451228, "learning_rate": 4.923839886645413e-06, "loss": 1.0675, "step": 2780 }, { "epoch": 0.49601417183348095, "grad_norm": 4.685752123568836, "learning_rate": 4.95926319518243e-06, "loss": 1.0493, "step": 2800 }, { "epoch": 0.4995571302037201, "grad_norm": 3.375680932358239, "learning_rate": 4.994686503719448e-06, "loss": 1.0686, "step": 2820 }, { "epoch": 0.5031000885739593, "grad_norm": 2.4729024999298534, "learning_rate": 4.999994474499561e-06, "loss": 1.0283, "step": 2840 }, { "epoch": 0.5066430469441984, "grad_norm": 4.50380809454446, "learning_rate": 4.999973825606614e-06, "loss": 1.0188, "step": 2860 }, { "epoch": 0.5101860053144376, "grad_norm": 3.418425163716614, "learning_rate": 4.999937881373025e-06, "loss": 1.0617, "step": 2880 }, { "epoch": 0.5137289636846767, "grad_norm": 3.6088000942292164, "learning_rate": 4.999886642018707e-06, "loss": 1.0723, "step": 2900 }, { "epoch": 0.5172719220549159, "grad_norm": 6.244089618662361, "learning_rate": 4.999820107857154e-06, "loss": 1.0662, "step": 2920 }, { "epoch": 0.520814880425155, "grad_norm": 2.609964349424898, "learning_rate": 4.999738279295433e-06, "loss": 1.0324, "step": 2940 }, { "epoch": 0.5243578387953941, "grad_norm": 2.679897624387073, "learning_rate": 4.9996411568341896e-06, "loss": 1.0207, "step": 2960 }, { "epoch": 0.5279007971656333, "grad_norm": 3.27573058708962, "learning_rate": 4.999528741067638e-06, "loss": 1.0939, "step": 2980 }, { "epoch": 0.5314437555358724, "grad_norm": 3.3784052333905756, "learning_rate": 4.99940103268356e-06, "loss": 1.0193, "step": 3000 }, { "epoch": 0.5349867139061116, "grad_norm": 4.924298481099745, "learning_rate": 4.999258032463301e-06, "loss": 1.1053, "step": 3020 }, { "epoch": 0.5385296722763507, "grad_norm": 4.1853626057858895, "learning_rate": 4.999099741281766e-06, "loss": 1.0337, "step": 3040 }, { "epoch": 0.54207263064659, "grad_norm": 3.5502444781095104, "learning_rate": 4.998926160107411e-06, "loss": 1.0786, "step": 3060 }, { "epoch": 0.545615589016829, "grad_norm": 3.335300103709776, "learning_rate": 4.998737290002241e-06, "loss": 1.0507, "step": 3080 }, { "epoch": 0.5491585473870682, "grad_norm": 3.114815124169259, "learning_rate": 4.9985331321218e-06, "loss": 1.0352, "step": 3100 }, { "epoch": 0.5527015057573074, "grad_norm": 4.900399906908037, "learning_rate": 4.998313687715169e-06, "loss": 1.0244, "step": 3120 }, { "epoch": 0.5562444641275465, "grad_norm": 2.853630801128127, "learning_rate": 4.9980789581249515e-06, "loss": 1.0552, "step": 3140 }, { "epoch": 0.5597874224977857, "grad_norm": 3.824968946809653, "learning_rate": 4.9978289447872695e-06, "loss": 1.0109, "step": 3160 }, { "epoch": 0.5633303808680248, "grad_norm": 3.6496584120718314, "learning_rate": 4.997563649231755e-06, "loss": 1.0097, "step": 3180 }, { "epoch": 0.566873339238264, "grad_norm": 4.089881200621581, "learning_rate": 4.997283073081541e-06, "loss": 1.0687, "step": 3200 }, { "epoch": 0.5704162976085031, "grad_norm": 4.25767273401204, "learning_rate": 4.996987218053247e-06, "loss": 1.0032, "step": 3220 }, { "epoch": 0.5739592559787422, "grad_norm": 4.054242038282677, "learning_rate": 4.996676085956973e-06, "loss": 1.0109, "step": 3240 }, { "epoch": 0.5775022143489814, "grad_norm": 2.2212311640306934, "learning_rate": 4.996349678696288e-06, "loss": 0.9873, "step": 3260 }, { "epoch": 0.5810451727192205, "grad_norm": 2.910691796089737, "learning_rate": 4.996007998268219e-06, "loss": 1.0389, "step": 3280 }, { "epoch": 0.5845881310894597, "grad_norm": 1.804064223013201, "learning_rate": 4.995651046763232e-06, "loss": 1.0065, "step": 3300 }, { "epoch": 0.5881310894596988, "grad_norm": 1.787168345455913, "learning_rate": 4.99527882636523e-06, "loss": 0.9964, "step": 3320 }, { "epoch": 0.591674047829938, "grad_norm": 2.6825449022104584, "learning_rate": 4.99489133935153e-06, "loss": 1.0113, "step": 3340 }, { "epoch": 0.5952170062001771, "grad_norm": 3.3430457632929986, "learning_rate": 4.9944885880928576e-06, "loss": 1.0159, "step": 3360 }, { "epoch": 0.5987599645704162, "grad_norm": 3.4188233454866777, "learning_rate": 4.994070575053324e-06, "loss": 1.0332, "step": 3380 }, { "epoch": 0.6023029229406555, "grad_norm": 5.035300065424226, "learning_rate": 4.993637302790417e-06, "loss": 1.0072, "step": 3400 }, { "epoch": 0.6058458813108946, "grad_norm": 2.4629577630265067, "learning_rate": 4.9931887739549845e-06, "loss": 1.0246, "step": 3420 }, { "epoch": 0.6093888396811338, "grad_norm": 3.2222908571387605, "learning_rate": 4.9927249912912135e-06, "loss": 1.1202, "step": 3440 }, { "epoch": 0.6129317980513729, "grad_norm": 3.329571486141443, "learning_rate": 4.99224595763662e-06, "loss": 1.0052, "step": 3460 }, { "epoch": 0.6164747564216121, "grad_norm": 3.2262886392708023, "learning_rate": 4.991751675922029e-06, "loss": 1.0563, "step": 3480 }, { "epoch": 0.6200177147918512, "grad_norm": 2.430384347609413, "learning_rate": 4.991242149171554e-06, "loss": 1.0084, "step": 3500 }, { "epoch": 0.6235606731620903, "grad_norm": 2.2313346912151437, "learning_rate": 4.990717380502581e-06, "loss": 1.1098, "step": 3520 }, { "epoch": 0.6271036315323295, "grad_norm": 3.269110632691307, "learning_rate": 4.990177373125752e-06, "loss": 0.9681, "step": 3540 }, { "epoch": 0.6306465899025686, "grad_norm": 2.8574079693862213, "learning_rate": 4.989622130344939e-06, "loss": 1.0192, "step": 3560 }, { "epoch": 0.6341895482728078, "grad_norm": 3.663783109216899, "learning_rate": 4.989051655557228e-06, "loss": 0.9997, "step": 3580 }, { "epoch": 0.6377325066430469, "grad_norm": 3.3537695839211916, "learning_rate": 4.9884659522528985e-06, "loss": 0.9669, "step": 3600 }, { "epoch": 0.6412754650132861, "grad_norm": 2.809218376275673, "learning_rate": 4.987865024015401e-06, "loss": 1.0155, "step": 3620 }, { "epoch": 0.6448184233835252, "grad_norm": 5.035643249923235, "learning_rate": 4.9872488745213356e-06, "loss": 1.0125, "step": 3640 }, { "epoch": 0.6483613817537643, "grad_norm": 3.6981075116685798, "learning_rate": 4.986617507540426e-06, "loss": 0.9861, "step": 3660 }, { "epoch": 0.6519043401240036, "grad_norm": 2.5959218064380725, "learning_rate": 4.985970926935504e-06, "loss": 1.0936, "step": 3680 }, { "epoch": 0.6554472984942427, "grad_norm": 2.72099024631383, "learning_rate": 4.985309136662478e-06, "loss": 1.0458, "step": 3700 }, { "epoch": 0.6589902568644819, "grad_norm": 2.3741120569873937, "learning_rate": 4.984632140770314e-06, "loss": 0.9733, "step": 3720 }, { "epoch": 0.662533215234721, "grad_norm": 2.9843690685487316, "learning_rate": 4.983939943401009e-06, "loss": 0.9865, "step": 3740 }, { "epoch": 0.6660761736049602, "grad_norm": 3.753859508812206, "learning_rate": 4.9832325487895625e-06, "loss": 1.0373, "step": 3760 }, { "epoch": 0.6696191319751993, "grad_norm": 2.36323312181139, "learning_rate": 4.98250996126396e-06, "loss": 1.0007, "step": 3780 }, { "epoch": 0.6731620903454384, "grad_norm": 2.4128489083499916, "learning_rate": 4.981772185245135e-06, "loss": 1.0155, "step": 3800 }, { "epoch": 0.6767050487156776, "grad_norm": 4.471122710526832, "learning_rate": 4.98101922524695e-06, "loss": 1.0625, "step": 3820 }, { "epoch": 0.6802480070859167, "grad_norm": 3.9450022308941035, "learning_rate": 4.980251085876163e-06, "loss": 1.0608, "step": 3840 }, { "epoch": 0.6837909654561559, "grad_norm": 3.750505881130023, "learning_rate": 4.979467771832407e-06, "loss": 1.0401, "step": 3860 }, { "epoch": 0.687333923826395, "grad_norm": 3.0818679668346918, "learning_rate": 4.978669287908152e-06, "loss": 0.9782, "step": 3880 }, { "epoch": 0.6908768821966342, "grad_norm": 3.585328199676442, "learning_rate": 4.9778556389886836e-06, "loss": 1.0293, "step": 3900 }, { "epoch": 0.6944198405668733, "grad_norm": 3.352267909342498, "learning_rate": 4.97702683005207e-06, "loss": 1.0443, "step": 3920 }, { "epoch": 0.6979627989371124, "grad_norm": 2.740609338046614, "learning_rate": 4.976182866169128e-06, "loss": 0.983, "step": 3940 }, { "epoch": 0.7015057573073517, "grad_norm": 3.815853619798988, "learning_rate": 4.9753237525033995e-06, "loss": 1.0241, "step": 3960 }, { "epoch": 0.7050487156775908, "grad_norm": 2.056928416192008, "learning_rate": 4.974449494311113e-06, "loss": 0.935, "step": 3980 }, { "epoch": 0.70859167404783, "grad_norm": 4.501534397772864, "learning_rate": 4.973560096941157e-06, "loss": 1.0417, "step": 4000 }, { "epoch": 0.70859167404783, "eval_loss": 0.8831750154495239, "eval_runtime": 377.54, "eval_samples_per_second": 25.181, "eval_steps_per_second": 3.149, "step": 4000 }, { "epoch": 0.7121346324180691, "grad_norm": 4.163953326960491, "learning_rate": 4.97265556583504e-06, "loss": 0.9787, "step": 4020 }, { "epoch": 0.7156775907883083, "grad_norm": 2.1612827561442143, "learning_rate": 4.971735906526867e-06, "loss": 1.0187, "step": 4040 }, { "epoch": 0.7192205491585474, "grad_norm": 4.132524372144837, "learning_rate": 4.9708011246432954e-06, "loss": 1.0447, "step": 4060 }, { "epoch": 0.7227635075287865, "grad_norm": 4.902391594790166, "learning_rate": 4.969851225903511e-06, "loss": 1.0849, "step": 4080 }, { "epoch": 0.7263064658990257, "grad_norm": 4.040812745701054, "learning_rate": 4.968886216119181e-06, "loss": 0.9977, "step": 4100 }, { "epoch": 0.7298494242692648, "grad_norm": 4.403191939391695, "learning_rate": 4.967906101194432e-06, "loss": 1.0151, "step": 4120 }, { "epoch": 0.733392382639504, "grad_norm": 4.888547155083341, "learning_rate": 4.9669108871258005e-06, "loss": 1.0488, "step": 4140 }, { "epoch": 0.7369353410097431, "grad_norm": 2.5511865400666247, "learning_rate": 4.965900580002208e-06, "loss": 0.9839, "step": 4160 }, { "epoch": 0.7404782993799823, "grad_norm": 3.7363422093828267, "learning_rate": 4.9648751860049146e-06, "loss": 0.9671, "step": 4180 }, { "epoch": 0.7440212577502214, "grad_norm": 4.5882225405684105, "learning_rate": 4.963834711407487e-06, "loss": 1.0153, "step": 4200 }, { "epoch": 0.7475642161204605, "grad_norm": 3.3728967510985175, "learning_rate": 4.962779162575757e-06, "loss": 0.9866, "step": 4220 }, { "epoch": 0.7511071744906997, "grad_norm": 3.3582439513903695, "learning_rate": 4.961708545967782e-06, "loss": 1.0012, "step": 4240 }, { "epoch": 0.7546501328609388, "grad_norm": 4.229044845753943, "learning_rate": 4.960622868133811e-06, "loss": 1.0264, "step": 4260 }, { "epoch": 0.7581930912311781, "grad_norm": 3.604122128833424, "learning_rate": 4.959522135716238e-06, "loss": 1.0334, "step": 4280 }, { "epoch": 0.7617360496014172, "grad_norm": 4.330367291558989, "learning_rate": 4.958406355449564e-06, "loss": 1.0528, "step": 4300 }, { "epoch": 0.7652790079716564, "grad_norm": 3.96229330754017, "learning_rate": 4.957275534160356e-06, "loss": 1.0142, "step": 4320 }, { "epoch": 0.7688219663418955, "grad_norm": 1.880123067964198, "learning_rate": 4.956129678767206e-06, "loss": 0.9585, "step": 4340 }, { "epoch": 0.7723649247121346, "grad_norm": 2.9357741019622026, "learning_rate": 4.954968796280685e-06, "loss": 1.0118, "step": 4360 }, { "epoch": 0.7759078830823738, "grad_norm": 5.337878559961154, "learning_rate": 4.953792893803308e-06, "loss": 0.96, "step": 4380 }, { "epoch": 0.7794508414526129, "grad_norm": 2.7737448076628146, "learning_rate": 4.952601978529479e-06, "loss": 1.0095, "step": 4400 }, { "epoch": 0.7829937998228521, "grad_norm": 2.81198860896816, "learning_rate": 4.951396057745457e-06, "loss": 1.0025, "step": 4420 }, { "epoch": 0.7865367581930912, "grad_norm": 3.2768364907663843, "learning_rate": 4.950175138829306e-06, "loss": 1.0062, "step": 4440 }, { "epoch": 0.7900797165633304, "grad_norm": 4.503697644314717, "learning_rate": 4.948939229250855e-06, "loss": 0.9866, "step": 4460 }, { "epoch": 0.7936226749335695, "grad_norm": 3.709697666688961, "learning_rate": 4.947688336571644e-06, "loss": 1.0234, "step": 4480 }, { "epoch": 0.7971656333038086, "grad_norm": 2.9177293521142804, "learning_rate": 4.946422468444886e-06, "loss": 0.9501, "step": 4500 }, { "epoch": 0.8007085916740478, "grad_norm": 3.3045345091796197, "learning_rate": 4.945141632615416e-06, "loss": 1.0335, "step": 4520 }, { "epoch": 0.804251550044287, "grad_norm": 4.0701401436058795, "learning_rate": 4.943845836919642e-06, "loss": 1.0438, "step": 4540 }, { "epoch": 0.8077945084145262, "grad_norm": 3.9759562983887213, "learning_rate": 4.942535089285505e-06, "loss": 1.0283, "step": 4560 }, { "epoch": 0.8113374667847653, "grad_norm": 5.075198666819325, "learning_rate": 4.9412093977324196e-06, "loss": 1.0087, "step": 4580 }, { "epoch": 0.8148804251550045, "grad_norm": 4.64113304477617, "learning_rate": 4.9398687703712324e-06, "loss": 1.0335, "step": 4600 }, { "epoch": 0.8184233835252436, "grad_norm": 3.6121906044220835, "learning_rate": 4.938513215404171e-06, "loss": 1.036, "step": 4620 }, { "epoch": 0.8219663418954827, "grad_norm": 2.8641112530538297, "learning_rate": 4.9371427411247905e-06, "loss": 0.9476, "step": 4640 }, { "epoch": 0.8255093002657219, "grad_norm": 2.2471640500283194, "learning_rate": 4.935757355917929e-06, "loss": 1.0, "step": 4660 }, { "epoch": 0.829052258635961, "grad_norm": 2.6668789188777553, "learning_rate": 4.93435706825965e-06, "loss": 1.042, "step": 4680 }, { "epoch": 0.8325952170062002, "grad_norm": 3.382540393461139, "learning_rate": 4.932941886717193e-06, "loss": 0.9925, "step": 4700 }, { "epoch": 0.8361381753764393, "grad_norm": 2.9612515882054224, "learning_rate": 4.931511819948924e-06, "loss": 1.0038, "step": 4720 }, { "epoch": 0.8396811337466785, "grad_norm": 5.805032752593361, "learning_rate": 4.930066876704276e-06, "loss": 1.0752, "step": 4740 }, { "epoch": 0.8432240921169176, "grad_norm": 2.906268834013272, "learning_rate": 4.9286070658237025e-06, "loss": 0.9574, "step": 4760 }, { "epoch": 0.8467670504871567, "grad_norm": 3.908291806399892, "learning_rate": 4.9271323962386185e-06, "loss": 0.9355, "step": 4780 }, { "epoch": 0.8503100088573959, "grad_norm": 2.862223610977262, "learning_rate": 4.925642876971347e-06, "loss": 0.9913, "step": 4800 }, { "epoch": 0.853852967227635, "grad_norm": 3.885251950370837, "learning_rate": 4.924138517135068e-06, "loss": 0.9437, "step": 4820 }, { "epoch": 0.8573959255978743, "grad_norm": 2.687403470850269, "learning_rate": 4.922619325933753e-06, "loss": 1.0183, "step": 4840 }, { "epoch": 0.8609388839681134, "grad_norm": 5.1744645832504945, "learning_rate": 4.921085312662119e-06, "loss": 0.9639, "step": 4860 }, { "epoch": 0.8644818423383526, "grad_norm": 4.73053239403457, "learning_rate": 4.919536486705569e-06, "loss": 1.0124, "step": 4880 }, { "epoch": 0.8680248007085917, "grad_norm": 4.4563037107783865, "learning_rate": 4.917972857540126e-06, "loss": 0.99, "step": 4900 }, { "epoch": 0.8715677590788308, "grad_norm": 4.614255774835929, "learning_rate": 4.916394434732391e-06, "loss": 1.0037, "step": 4920 }, { "epoch": 0.87511071744907, "grad_norm": 3.533058939111727, "learning_rate": 4.914801227939467e-06, "loss": 1.0177, "step": 4940 }, { "epoch": 0.8786536758193091, "grad_norm": 2.6727964096166583, "learning_rate": 4.913193246908916e-06, "loss": 0.9957, "step": 4960 }, { "epoch": 0.8821966341895483, "grad_norm": 6.368757949715121, "learning_rate": 4.911570501478686e-06, "loss": 1.0324, "step": 4980 }, { "epoch": 0.8857395925597874, "grad_norm": 3.238302734032586, "learning_rate": 4.909933001577057e-06, "loss": 0.9778, "step": 5000 }, { "epoch": 0.8892825509300266, "grad_norm": 2.9414571649379173, "learning_rate": 4.908280757222585e-06, "loss": 1.0183, "step": 5020 }, { "epoch": 0.8928255093002657, "grad_norm": 3.8675143162693417, "learning_rate": 4.906613778524029e-06, "loss": 1.0417, "step": 5040 }, { "epoch": 0.8963684676705048, "grad_norm": 4.062896818204324, "learning_rate": 4.9049320756803e-06, "loss": 0.9951, "step": 5060 }, { "epoch": 0.899911426040744, "grad_norm": 3.965608738547987, "learning_rate": 4.9032356589803935e-06, "loss": 1.0096, "step": 5080 }, { "epoch": 0.9034543844109831, "grad_norm": 2.4470182814478845, "learning_rate": 4.901524538803325e-06, "loss": 0.9706, "step": 5100 }, { "epoch": 0.9069973427812223, "grad_norm": 3.3652865356433788, "learning_rate": 4.899798725618071e-06, "loss": 1.0189, "step": 5120 }, { "epoch": 0.9105403011514615, "grad_norm": 2.1015419863160316, "learning_rate": 4.898058229983502e-06, "loss": 0.9427, "step": 5140 }, { "epoch": 0.9140832595217007, "grad_norm": 2.494263988797181, "learning_rate": 4.896303062548321e-06, "loss": 0.9542, "step": 5160 }, { "epoch": 0.9176262178919398, "grad_norm": 5.0132756287008355, "learning_rate": 4.894533234050992e-06, "loss": 1.0177, "step": 5180 }, { "epoch": 0.9211691762621789, "grad_norm": 3.848341234829432, "learning_rate": 4.892748755319679e-06, "loss": 0.9785, "step": 5200 }, { "epoch": 0.9247121346324181, "grad_norm": 3.9783991127336824, "learning_rate": 4.890949637272184e-06, "loss": 0.9964, "step": 5220 }, { "epoch": 0.9282550930026572, "grad_norm": 3.7994392920413333, "learning_rate": 4.8891358909158695e-06, "loss": 1.0164, "step": 5240 }, { "epoch": 0.9317980513728964, "grad_norm": 3.646774315012477, "learning_rate": 4.887307527347598e-06, "loss": 1.008, "step": 5260 }, { "epoch": 0.9353410097431355, "grad_norm": 6.203065849865684, "learning_rate": 4.885464557753666e-06, "loss": 1.0426, "step": 5280 }, { "epoch": 0.9388839681133747, "grad_norm": 2.9353445219470444, "learning_rate": 4.88360699340973e-06, "loss": 1.0052, "step": 5300 }, { "epoch": 0.9424269264836138, "grad_norm": 3.9291014774722033, "learning_rate": 4.88173484568074e-06, "loss": 0.9606, "step": 5320 }, { "epoch": 0.9459698848538529, "grad_norm": 3.9955198747460634, "learning_rate": 4.8798481260208715e-06, "loss": 0.9862, "step": 5340 }, { "epoch": 0.9495128432240921, "grad_norm": 4.103474267744412, "learning_rate": 4.877946845973453e-06, "loss": 1.008, "step": 5360 }, { "epoch": 0.9530558015943312, "grad_norm": 3.817035998173409, "learning_rate": 4.876031017170898e-06, "loss": 0.9696, "step": 5380 }, { "epoch": 0.9565987599645704, "grad_norm": 5.198464280181255, "learning_rate": 4.874100651334629e-06, "loss": 1.0248, "step": 5400 }, { "epoch": 0.9601417183348095, "grad_norm": 3.289343037136709, "learning_rate": 4.872155760275012e-06, "loss": 0.9793, "step": 5420 }, { "epoch": 0.9636846767050488, "grad_norm": 2.2435930827644746, "learning_rate": 4.87019635589128e-06, "loss": 1.0101, "step": 5440 }, { "epoch": 0.9672276350752879, "grad_norm": 4.310791558682954, "learning_rate": 4.86822245017146e-06, "loss": 0.995, "step": 5460 }, { "epoch": 0.970770593445527, "grad_norm": 3.352727892906536, "learning_rate": 4.866234055192306e-06, "loss": 0.9751, "step": 5480 }, { "epoch": 0.9743135518157662, "grad_norm": 5.068267192871768, "learning_rate": 4.864231183119212e-06, "loss": 0.9629, "step": 5500 }, { "epoch": 0.9778565101860053, "grad_norm": 3.5363020465890704, "learning_rate": 4.862213846206155e-06, "loss": 0.9977, "step": 5520 }, { "epoch": 0.9813994685562445, "grad_norm": 3.3894885205413834, "learning_rate": 4.860182056795604e-06, "loss": 0.9575, "step": 5540 }, { "epoch": 0.9849424269264836, "grad_norm": 2.6287369097602578, "learning_rate": 4.8581358273184545e-06, "loss": 0.989, "step": 5560 }, { "epoch": 0.9884853852967228, "grad_norm": 2.9015866832665185, "learning_rate": 4.856075170293948e-06, "loss": 1.0018, "step": 5580 }, { "epoch": 0.9920283436669619, "grad_norm": 3.0658535183131423, "learning_rate": 4.854000098329596e-06, "loss": 1.0078, "step": 5600 }, { "epoch": 0.995571302037201, "grad_norm": 2.5404639262387025, "learning_rate": 4.851910624121106e-06, "loss": 0.9407, "step": 5620 }, { "epoch": 0.9991142604074402, "grad_norm": 3.158604742158805, "learning_rate": 4.849806760452299e-06, "loss": 0.98, "step": 5640 }, { "epoch": 1.0026572187776794, "grad_norm": 3.3616076027225965, "learning_rate": 4.8476885201950345e-06, "loss": 0.9476, "step": 5660 }, { "epoch": 1.0062001771479185, "grad_norm": 3.690305513366549, "learning_rate": 4.84555591630913e-06, "loss": 0.9102, "step": 5680 }, { "epoch": 1.0097431355181576, "grad_norm": 5.359946307013767, "learning_rate": 4.843408961842285e-06, "loss": 0.9232, "step": 5700 }, { "epoch": 1.0132860938883967, "grad_norm": 3.569793560947344, "learning_rate": 4.841247669929995e-06, "loss": 0.935, "step": 5720 }, { "epoch": 1.016829052258636, "grad_norm": 3.874750269772003, "learning_rate": 4.839072053795479e-06, "loss": 0.9331, "step": 5740 }, { "epoch": 1.0203720106288752, "grad_norm": 2.7385873822832023, "learning_rate": 4.83688212674959e-06, "loss": 0.9371, "step": 5760 }, { "epoch": 1.0239149689991143, "grad_norm": 3.665069410011044, "learning_rate": 4.834677902190742e-06, "loss": 0.9085, "step": 5780 }, { "epoch": 1.0274579273693534, "grad_norm": 2.3598498586364416, "learning_rate": 4.832459393604822e-06, "loss": 0.8526, "step": 5800 }, { "epoch": 1.0310008857395925, "grad_norm": 2.7411380905564613, "learning_rate": 4.830226614565109e-06, "loss": 0.9451, "step": 5820 }, { "epoch": 1.0345438441098318, "grad_norm": 4.338981698988937, "learning_rate": 4.8279795787321935e-06, "loss": 0.9065, "step": 5840 }, { "epoch": 1.038086802480071, "grad_norm": 5.731424678260782, "learning_rate": 4.8257182998538895e-06, "loss": 0.8988, "step": 5860 }, { "epoch": 1.04162976085031, "grad_norm": 4.552868648680658, "learning_rate": 4.823442791765157e-06, "loss": 0.9059, "step": 5880 }, { "epoch": 1.045172719220549, "grad_norm": 2.50768692217334, "learning_rate": 4.821153068388007e-06, "loss": 0.9601, "step": 5900 }, { "epoch": 1.0487156775907882, "grad_norm": 5.095351842225188, "learning_rate": 4.818849143731428e-06, "loss": 0.9152, "step": 5920 }, { "epoch": 1.0522586359610275, "grad_norm": 2.8481200255546493, "learning_rate": 4.816531031891292e-06, "loss": 0.8828, "step": 5940 }, { "epoch": 1.0558015943312666, "grad_norm": 4.464320906945187, "learning_rate": 4.814198747050271e-06, "loss": 0.9552, "step": 5960 }, { "epoch": 1.0593445527015057, "grad_norm": 5.546584102940785, "learning_rate": 4.811852303477751e-06, "loss": 0.8654, "step": 5980 }, { "epoch": 1.0628875110717448, "grad_norm": 4.466057137043603, "learning_rate": 4.809491715529744e-06, "loss": 0.8941, "step": 6000 }, { "epoch": 1.0628875110717448, "eval_loss": 0.8596345782279968, "eval_runtime": 368.3497, "eval_samples_per_second": 25.81, "eval_steps_per_second": 3.228, "step": 6000 }, { "epoch": 1.066430469441984, "grad_norm": 4.240898370302153, "learning_rate": 4.8071169976488e-06, "loss": 0.9238, "step": 6020 }, { "epoch": 1.0699734278122233, "grad_norm": 3.2580988514752343, "learning_rate": 4.804728164363918e-06, "loss": 0.9158, "step": 6040 }, { "epoch": 1.0735163861824624, "grad_norm": 3.870662909162037, "learning_rate": 4.80232523029046e-06, "loss": 0.9688, "step": 6060 }, { "epoch": 1.0770593445527015, "grad_norm": 1.7822539984397539, "learning_rate": 4.799908210130058e-06, "loss": 0.9053, "step": 6080 }, { "epoch": 1.0806023029229406, "grad_norm": 3.7732124960041276, "learning_rate": 4.797477118670524e-06, "loss": 0.9815, "step": 6100 }, { "epoch": 1.08414526129318, "grad_norm": 6.115099007176549, "learning_rate": 4.7950319707857615e-06, "loss": 0.9064, "step": 6120 }, { "epoch": 1.087688219663419, "grad_norm": 2.577014162447891, "learning_rate": 4.792572781435678e-06, "loss": 0.8382, "step": 6140 }, { "epoch": 1.091231178033658, "grad_norm": 3.220759744400382, "learning_rate": 4.790099565666086e-06, "loss": 0.8572, "step": 6160 }, { "epoch": 1.0947741364038972, "grad_norm": 2.7603504810469097, "learning_rate": 4.787612338608614e-06, "loss": 0.9017, "step": 6180 }, { "epoch": 1.0983170947741363, "grad_norm": 3.4076917515040686, "learning_rate": 4.785111115480615e-06, "loss": 0.9043, "step": 6200 }, { "epoch": 1.1018600531443756, "grad_norm": 2.714069163764211, "learning_rate": 4.782595911585074e-06, "loss": 0.9445, "step": 6220 }, { "epoch": 1.1054030115146147, "grad_norm": 2.579769298355691, "learning_rate": 4.780066742310512e-06, "loss": 0.8789, "step": 6240 }, { "epoch": 1.1089459698848538, "grad_norm": 4.326106933810614, "learning_rate": 4.777523623130894e-06, "loss": 0.9087, "step": 6260 }, { "epoch": 1.112488928255093, "grad_norm": 4.203656763039969, "learning_rate": 4.774966569605531e-06, "loss": 0.9168, "step": 6280 }, { "epoch": 1.1160318866253323, "grad_norm": 4.154066900716089, "learning_rate": 4.772395597378991e-06, "loss": 0.8687, "step": 6300 }, { "epoch": 1.1195748449955714, "grad_norm": 2.08540747712982, "learning_rate": 4.769810722180994e-06, "loss": 0.871, "step": 6320 }, { "epoch": 1.1231178033658105, "grad_norm": 3.1465129817119677, "learning_rate": 4.767211959826326e-06, "loss": 0.9231, "step": 6340 }, { "epoch": 1.1266607617360496, "grad_norm": 2.3015425077734233, "learning_rate": 4.764599326214736e-06, "loss": 0.91, "step": 6360 }, { "epoch": 1.1302037201062887, "grad_norm": 3.2081119900478083, "learning_rate": 4.761972837330839e-06, "loss": 0.9247, "step": 6380 }, { "epoch": 1.133746678476528, "grad_norm": 3.093019376379044, "learning_rate": 4.7593325092440204e-06, "loss": 0.8783, "step": 6400 }, { "epoch": 1.137289636846767, "grad_norm": 4.476714117074904, "learning_rate": 4.756678358108337e-06, "loss": 0.9356, "step": 6420 }, { "epoch": 1.1408325952170062, "grad_norm": 4.415252399299131, "learning_rate": 4.754010400162416e-06, "loss": 0.8873, "step": 6440 }, { "epoch": 1.1443755535872453, "grad_norm": 4.618410771551369, "learning_rate": 4.7513286517293585e-06, "loss": 0.9271, "step": 6460 }, { "epoch": 1.1479185119574846, "grad_norm": 3.1205309158472465, "learning_rate": 4.74863312921664e-06, "loss": 0.8835, "step": 6480 }, { "epoch": 1.1514614703277237, "grad_norm": 2.4282160366985255, "learning_rate": 4.7459238491160056e-06, "loss": 0.9308, "step": 6500 }, { "epoch": 1.1550044286979628, "grad_norm": 2.865738470619868, "learning_rate": 4.743200828003374e-06, "loss": 0.9414, "step": 6520 }, { "epoch": 1.158547387068202, "grad_norm": 3.6374100587906835, "learning_rate": 4.740464082538735e-06, "loss": 0.9106, "step": 6540 }, { "epoch": 1.162090345438441, "grad_norm": 3.0695217920809053, "learning_rate": 4.737713629466045e-06, "loss": 0.8616, "step": 6560 }, { "epoch": 1.1656333038086801, "grad_norm": 3.9353520892249363, "learning_rate": 4.734949485613126e-06, "loss": 0.8914, "step": 6580 }, { "epoch": 1.1691762621789195, "grad_norm": 3.484414702314974, "learning_rate": 4.732171667891564e-06, "loss": 0.92, "step": 6600 }, { "epoch": 1.1727192205491586, "grad_norm": 3.504870240653996, "learning_rate": 4.729380193296605e-06, "loss": 0.9396, "step": 6620 }, { "epoch": 1.1762621789193977, "grad_norm": 5.929401699342908, "learning_rate": 4.726575078907049e-06, "loss": 0.9188, "step": 6640 }, { "epoch": 1.1798051372896368, "grad_norm": 6.554517511673939, "learning_rate": 4.723756341885148e-06, "loss": 0.9534, "step": 6660 }, { "epoch": 1.183348095659876, "grad_norm": 2.434654285685298, "learning_rate": 4.7209239994765e-06, "loss": 0.8497, "step": 6680 }, { "epoch": 1.1868910540301152, "grad_norm": 2.9006017781540483, "learning_rate": 4.718078069009944e-06, "loss": 0.9326, "step": 6700 }, { "epoch": 1.1904340124003543, "grad_norm": 6.939455220904752, "learning_rate": 4.71521856789745e-06, "loss": 0.9234, "step": 6720 }, { "epoch": 1.1939769707705934, "grad_norm": 3.3273297930889814, "learning_rate": 4.712345513634021e-06, "loss": 0.9146, "step": 6740 }, { "epoch": 1.1975199291408325, "grad_norm": 3.0714433546937774, "learning_rate": 4.709458923797579e-06, "loss": 0.9112, "step": 6760 }, { "epoch": 1.2010628875110718, "grad_norm": 3.686574518500066, "learning_rate": 4.7065588160488565e-06, "loss": 0.9353, "step": 6780 }, { "epoch": 1.204605845881311, "grad_norm": 3.1307294434556256, "learning_rate": 4.703645208131294e-06, "loss": 0.8906, "step": 6800 }, { "epoch": 1.20814880425155, "grad_norm": 3.9540023879464616, "learning_rate": 4.70071811787093e-06, "loss": 0.9389, "step": 6820 }, { "epoch": 1.2116917626217891, "grad_norm": 4.99757735756388, "learning_rate": 4.697777563176288e-06, "loss": 0.8728, "step": 6840 }, { "epoch": 1.2152347209920284, "grad_norm": 1.8831614270222023, "learning_rate": 4.694823562038271e-06, "loss": 0.8971, "step": 6860 }, { "epoch": 1.2187776793622676, "grad_norm": 3.3841095933963747, "learning_rate": 4.69185613253005e-06, "loss": 0.9404, "step": 6880 }, { "epoch": 1.2223206377325067, "grad_norm": 2.5639299660820543, "learning_rate": 4.688875292806952e-06, "loss": 0.8651, "step": 6900 }, { "epoch": 1.2258635961027458, "grad_norm": 2.5209787475843024, "learning_rate": 4.685881061106352e-06, "loss": 0.8783, "step": 6920 }, { "epoch": 1.2294065544729849, "grad_norm": 4.174003458453298, "learning_rate": 4.68287345574756e-06, "loss": 0.939, "step": 6940 }, { "epoch": 1.2329495128432242, "grad_norm": 3.7213954731195944, "learning_rate": 4.679852495131708e-06, "loss": 0.9698, "step": 6960 }, { "epoch": 1.2364924712134633, "grad_norm": 3.1050229374536826, "learning_rate": 4.676818197741637e-06, "loss": 0.901, "step": 6980 }, { "epoch": 1.2400354295837024, "grad_norm": 4.537242489543826, "learning_rate": 4.673770582141788e-06, "loss": 0.8826, "step": 7000 }, { "epoch": 1.2435783879539415, "grad_norm": 3.157545512432448, "learning_rate": 4.670709666978081e-06, "loss": 0.9426, "step": 7020 }, { "epoch": 1.2471213463241808, "grad_norm": 3.6885040878766975, "learning_rate": 4.667635470977811e-06, "loss": 0.9253, "step": 7040 }, { "epoch": 1.25066430469442, "grad_norm": 3.8470192299130495, "learning_rate": 4.664548012949523e-06, "loss": 0.9516, "step": 7060 }, { "epoch": 1.254207263064659, "grad_norm": 2.7610537115436897, "learning_rate": 4.661447311782905e-06, "loss": 0.9632, "step": 7080 }, { "epoch": 1.2577502214348981, "grad_norm": 2.9076857370419726, "learning_rate": 4.658333386448668e-06, "loss": 0.8516, "step": 7100 }, { "epoch": 1.2612931798051372, "grad_norm": 2.8712653113719178, "learning_rate": 4.655206255998429e-06, "loss": 0.8681, "step": 7120 }, { "epoch": 1.2648361381753763, "grad_norm": 3.5281659347458443, "learning_rate": 4.652065939564601e-06, "loss": 0.8612, "step": 7140 }, { "epoch": 1.2683790965456156, "grad_norm": 3.0200930161561836, "learning_rate": 4.648912456360266e-06, "loss": 0.9232, "step": 7160 }, { "epoch": 1.2719220549158547, "grad_norm": 2.9141762965804068, "learning_rate": 4.645745825679069e-06, "loss": 0.8704, "step": 7180 }, { "epoch": 1.2754650132860939, "grad_norm": 3.466517729779936, "learning_rate": 4.642566066895089e-06, "loss": 0.9167, "step": 7200 }, { "epoch": 1.2790079716563332, "grad_norm": 4.016886275498443, "learning_rate": 4.639373199462728e-06, "loss": 0.8753, "step": 7220 }, { "epoch": 1.2825509300265723, "grad_norm": 2.6960913937478064, "learning_rate": 4.636167242916588e-06, "loss": 0.9387, "step": 7240 }, { "epoch": 1.2860938883968114, "grad_norm": 4.156865933729297, "learning_rate": 4.6329482168713535e-06, "loss": 0.8807, "step": 7260 }, { "epoch": 1.2896368467670505, "grad_norm": 2.3052353180349194, "learning_rate": 4.62971614102167e-06, "loss": 0.9344, "step": 7280 }, { "epoch": 1.2931798051372896, "grad_norm": 4.351236836672874, "learning_rate": 4.626471035142027e-06, "loss": 0.9368, "step": 7300 }, { "epoch": 1.2967227635075287, "grad_norm": 2.96988427417053, "learning_rate": 4.62321291908663e-06, "loss": 0.9225, "step": 7320 }, { "epoch": 1.300265721877768, "grad_norm": 2.803880488767538, "learning_rate": 4.619941812789287e-06, "loss": 0.9065, "step": 7340 }, { "epoch": 1.3038086802480071, "grad_norm": 2.2501268712600115, "learning_rate": 4.616657736263282e-06, "loss": 0.9095, "step": 7360 }, { "epoch": 1.3073516386182462, "grad_norm": 3.530457146463456, "learning_rate": 4.613360709601251e-06, "loss": 0.8956, "step": 7380 }, { "epoch": 1.3108945969884853, "grad_norm": 2.9920752839459848, "learning_rate": 4.6100507529750656e-06, "loss": 0.8907, "step": 7400 }, { "epoch": 1.3144375553587246, "grad_norm": 2.3035469610025614, "learning_rate": 4.6067278866357025e-06, "loss": 0.9135, "step": 7420 }, { "epoch": 1.3179805137289637, "grad_norm": 3.742157836005142, "learning_rate": 4.603392130913123e-06, "loss": 0.9146, "step": 7440 }, { "epoch": 1.3215234720992028, "grad_norm": 2.228186526375547, "learning_rate": 4.600043506216151e-06, "loss": 0.9103, "step": 7460 }, { "epoch": 1.325066430469442, "grad_norm": 2.4528087238581504, "learning_rate": 4.5966820330323405e-06, "loss": 0.9298, "step": 7480 }, { "epoch": 1.328609388839681, "grad_norm": 2.8414827920653125, "learning_rate": 4.59330773192786e-06, "loss": 0.8779, "step": 7500 }, { "epoch": 1.3321523472099202, "grad_norm": 3.492728703129015, "learning_rate": 4.5899206235473585e-06, "loss": 0.9399, "step": 7520 }, { "epoch": 1.3356953055801595, "grad_norm": 4.648463092813606, "learning_rate": 4.586520728613842e-06, "loss": 0.9026, "step": 7540 }, { "epoch": 1.3392382639503986, "grad_norm": 3.9103385940119297, "learning_rate": 4.583108067928552e-06, "loss": 0.8996, "step": 7560 }, { "epoch": 1.3427812223206377, "grad_norm": 4.1525594833780035, "learning_rate": 4.579682662370829e-06, "loss": 0.911, "step": 7580 }, { "epoch": 1.346324180690877, "grad_norm": 4.692836743880663, "learning_rate": 4.576244532897988e-06, "loss": 0.8638, "step": 7600 }, { "epoch": 1.349867139061116, "grad_norm": 2.805532914309198, "learning_rate": 4.572793700545197e-06, "loss": 0.9105, "step": 7620 }, { "epoch": 1.3534100974313552, "grad_norm": 3.179829503013336, "learning_rate": 4.569330186425339e-06, "loss": 0.9251, "step": 7640 }, { "epoch": 1.3569530558015943, "grad_norm": 2.862631874344196, "learning_rate": 4.565854011728885e-06, "loss": 0.9681, "step": 7660 }, { "epoch": 1.3604960141718334, "grad_norm": 3.7867164985506876, "learning_rate": 4.562365197723771e-06, "loss": 0.9298, "step": 7680 }, { "epoch": 1.3640389725420725, "grad_norm": 4.25563448626548, "learning_rate": 4.558863765755257e-06, "loss": 0.8872, "step": 7700 }, { "epoch": 1.3675819309123118, "grad_norm": 2.742439626228035, "learning_rate": 4.555349737245808e-06, "loss": 0.8776, "step": 7720 }, { "epoch": 1.371124889282551, "grad_norm": 3.4373516712170504, "learning_rate": 4.5518231336949526e-06, "loss": 0.8886, "step": 7740 }, { "epoch": 1.37466784765279, "grad_norm": 3.098577500060214, "learning_rate": 4.548283976679158e-06, "loss": 0.8762, "step": 7760 }, { "epoch": 1.3782108060230294, "grad_norm": 4.088083429018507, "learning_rate": 4.5447322878516965e-06, "loss": 0.8655, "step": 7780 }, { "epoch": 1.3817537643932685, "grad_norm": 4.5726701868460475, "learning_rate": 4.541168088942511e-06, "loss": 0.9061, "step": 7800 }, { "epoch": 1.3852967227635076, "grad_norm": 3.093081976098804, "learning_rate": 4.537591401758084e-06, "loss": 0.934, "step": 7820 }, { "epoch": 1.3888396811337467, "grad_norm": 5.171297134853851, "learning_rate": 4.5340022481813055e-06, "loss": 0.9712, "step": 7840 }, { "epoch": 1.3923826395039858, "grad_norm": 2.889572353723012, "learning_rate": 4.530400650171335e-06, "loss": 0.8755, "step": 7860 }, { "epoch": 1.3959255978742249, "grad_norm": 5.753273400624572, "learning_rate": 4.526786629763471e-06, "loss": 0.8735, "step": 7880 }, { "epoch": 1.3994685562444642, "grad_norm": 5.098809443054654, "learning_rate": 4.523160209069014e-06, "loss": 0.8922, "step": 7900 }, { "epoch": 1.4030115146147033, "grad_norm": 2.357594211768667, "learning_rate": 4.5195214102751324e-06, "loss": 0.9088, "step": 7920 }, { "epoch": 1.4065544729849424, "grad_norm": 3.9367579628501184, "learning_rate": 4.515870255644727e-06, "loss": 0.9186, "step": 7940 }, { "epoch": 1.4100974313551815, "grad_norm": 5.456286963862879, "learning_rate": 4.512206767516291e-06, "loss": 0.9111, "step": 7960 }, { "epoch": 1.4136403897254208, "grad_norm": 3.5921469100201944, "learning_rate": 4.508530968303781e-06, "loss": 0.9028, "step": 7980 }, { "epoch": 1.41718334809566, "grad_norm": 2.9603120534047056, "learning_rate": 4.504842880496472e-06, "loss": 0.8972, "step": 8000 }, { "epoch": 1.41718334809566, "eval_loss": 0.837660551071167, "eval_runtime": 368.7294, "eval_samples_per_second": 25.783, "eval_steps_per_second": 3.225, "step": 8000 }, { "epoch": 1.420726306465899, "grad_norm": 4.883873708709509, "learning_rate": 4.5011425266588225e-06, "loss": 0.9461, "step": 8020 }, { "epoch": 1.4242692648361381, "grad_norm": 4.135582187777777, "learning_rate": 4.497429929430341e-06, "loss": 0.9508, "step": 8040 }, { "epoch": 1.4278122232063772, "grad_norm": 4.734069578966871, "learning_rate": 4.493705111525439e-06, "loss": 0.9336, "step": 8060 }, { "epoch": 1.4313551815766163, "grad_norm": 3.218508259496232, "learning_rate": 4.4899680957333e-06, "loss": 0.8421, "step": 8080 }, { "epoch": 1.4348981399468557, "grad_norm": 2.1197851091950355, "learning_rate": 4.486218904917735e-06, "loss": 0.8656, "step": 8100 }, { "epoch": 1.4384410983170948, "grad_norm": 3.0696326095251694, "learning_rate": 4.482457562017043e-06, "loss": 0.8596, "step": 8120 }, { "epoch": 1.4419840566873339, "grad_norm": 3.0008100293231617, "learning_rate": 4.478684090043875e-06, "loss": 0.92, "step": 8140 }, { "epoch": 1.4455270150575732, "grad_norm": 2.9367769313434207, "learning_rate": 4.474898512085088e-06, "loss": 0.8598, "step": 8160 }, { "epoch": 1.4490699734278123, "grad_norm": 2.3972324015727473, "learning_rate": 4.471100851301605e-06, "loss": 0.8952, "step": 8180 }, { "epoch": 1.4526129317980514, "grad_norm": 2.2321286554170476, "learning_rate": 4.467291130928277e-06, "loss": 0.9081, "step": 8200 }, { "epoch": 1.4561558901682905, "grad_norm": 2.5187305501003725, "learning_rate": 4.463469374273737e-06, "loss": 0.9273, "step": 8220 }, { "epoch": 1.4596988485385296, "grad_norm": 5.185336322080168, "learning_rate": 4.459635604720255e-06, "loss": 0.8962, "step": 8240 }, { "epoch": 1.4632418069087687, "grad_norm": 2.6043475650440895, "learning_rate": 4.4557898457236025e-06, "loss": 0.9125, "step": 8260 }, { "epoch": 1.466784765279008, "grad_norm": 3.0322932631838233, "learning_rate": 4.4519321208129044e-06, "loss": 0.8977, "step": 8280 }, { "epoch": 1.4703277236492471, "grad_norm": 4.742695243075337, "learning_rate": 4.448062453590493e-06, "loss": 0.9128, "step": 8300 }, { "epoch": 1.4738706820194862, "grad_norm": 2.709718326834174, "learning_rate": 4.444180867731769e-06, "loss": 0.8838, "step": 8320 }, { "epoch": 1.4774136403897256, "grad_norm": 4.119470722998265, "learning_rate": 4.44028738698505e-06, "loss": 0.8819, "step": 8340 }, { "epoch": 1.4809565987599647, "grad_norm": 3.1816783974486382, "learning_rate": 4.436382035171432e-06, "loss": 0.8797, "step": 8360 }, { "epoch": 1.4844995571302038, "grad_norm": 3.1914690524711844, "learning_rate": 4.4324648361846424e-06, "loss": 0.8278, "step": 8380 }, { "epoch": 1.4880425155004429, "grad_norm": 4.009571063093233, "learning_rate": 4.428535813990885e-06, "loss": 0.9445, "step": 8400 }, { "epoch": 1.491585473870682, "grad_norm": 2.9833892404793736, "learning_rate": 4.424594992628708e-06, "loss": 0.8951, "step": 8420 }, { "epoch": 1.495128432240921, "grad_norm": 2.478036010569002, "learning_rate": 4.420642396208844e-06, "loss": 0.8963, "step": 8440 }, { "epoch": 1.4986713906111604, "grad_norm": 3.24442393322022, "learning_rate": 4.416678048914069e-06, "loss": 0.8875, "step": 8460 }, { "epoch": 1.5022143489813995, "grad_norm": 5.0690401321371334, "learning_rate": 4.412701974999057e-06, "loss": 0.9041, "step": 8480 }, { "epoch": 1.5057573073516386, "grad_norm": 3.5032838009682172, "learning_rate": 4.4087141987902215e-06, "loss": 0.9024, "step": 8500 }, { "epoch": 1.509300265721878, "grad_norm": 3.4593082915022384, "learning_rate": 4.404714744685578e-06, "loss": 0.9299, "step": 8520 }, { "epoch": 1.512843224092117, "grad_norm": 2.906522716091139, "learning_rate": 4.4007036371545865e-06, "loss": 0.8399, "step": 8540 }, { "epoch": 1.5163861824623561, "grad_norm": 2.556244604059082, "learning_rate": 4.396680900738007e-06, "loss": 0.8959, "step": 8560 }, { "epoch": 1.5199291408325952, "grad_norm": 2.872414224448131, "learning_rate": 4.392646560047746e-06, "loss": 0.8837, "step": 8580 }, { "epoch": 1.5234720992028343, "grad_norm": 4.371819876343486, "learning_rate": 4.388600639766711e-06, "loss": 0.9246, "step": 8600 }, { "epoch": 1.5270150575730734, "grad_norm": 3.5523806587587514, "learning_rate": 4.384543164648649e-06, "loss": 0.931, "step": 8620 }, { "epoch": 1.5305580159433125, "grad_norm": 2.929477900211409, "learning_rate": 4.380474159518007e-06, "loss": 0.8985, "step": 8640 }, { "epoch": 1.5341009743135519, "grad_norm": 4.211828377945081, "learning_rate": 4.3763936492697735e-06, "loss": 0.8785, "step": 8660 }, { "epoch": 1.537643932683791, "grad_norm": 2.4701458590021956, "learning_rate": 4.372301658869327e-06, "loss": 0.9385, "step": 8680 }, { "epoch": 1.54118689105403, "grad_norm": 3.142058476836683, "learning_rate": 4.368198213352286e-06, "loss": 0.902, "step": 8700 }, { "epoch": 1.5447298494242694, "grad_norm": 2.771138222528823, "learning_rate": 4.3640833378243505e-06, "loss": 0.8804, "step": 8720 }, { "epoch": 1.5482728077945085, "grad_norm": 3.9280654601321006, "learning_rate": 4.3599570574611545e-06, "loss": 0.8938, "step": 8740 }, { "epoch": 1.5518157661647476, "grad_norm": 2.2418472839675463, "learning_rate": 4.355819397508106e-06, "loss": 0.8968, "step": 8760 }, { "epoch": 1.5553587245349867, "grad_norm": 3.399601556100489, "learning_rate": 4.35167038328024e-06, "loss": 0.8546, "step": 8780 }, { "epoch": 1.5589016829052258, "grad_norm": 3.455386669336896, "learning_rate": 4.3475100401620555e-06, "loss": 0.8987, "step": 8800 }, { "epoch": 1.562444641275465, "grad_norm": 4.528613569077828, "learning_rate": 4.3433383936073635e-06, "loss": 0.9096, "step": 8820 }, { "epoch": 1.565987599645704, "grad_norm": 2.885188625705028, "learning_rate": 4.3391554691391345e-06, "loss": 0.8747, "step": 8840 }, { "epoch": 1.5695305580159433, "grad_norm": 4.53397939933421, "learning_rate": 4.334961292349339e-06, "loss": 0.9238, "step": 8860 }, { "epoch": 1.5730735163861824, "grad_norm": 4.595528706412981, "learning_rate": 4.33075588889879e-06, "loss": 0.9103, "step": 8880 }, { "epoch": 1.5766164747564217, "grad_norm": 4.301564262537775, "learning_rate": 4.326539284516989e-06, "loss": 0.8638, "step": 8900 }, { "epoch": 1.5801594331266609, "grad_norm": 2.0941698417183883, "learning_rate": 4.322311505001964e-06, "loss": 0.9186, "step": 8920 }, { "epoch": 1.5837023914969, "grad_norm": 2.2302494709941416, "learning_rate": 4.318072576220119e-06, "loss": 0.9041, "step": 8940 }, { "epoch": 1.587245349867139, "grad_norm": 3.6126726071073914, "learning_rate": 4.31382252410607e-06, "loss": 0.9306, "step": 8960 }, { "epoch": 1.5907883082373782, "grad_norm": 3.4874125803596265, "learning_rate": 4.309561374662486e-06, "loss": 0.9067, "step": 8980 }, { "epoch": 1.5943312666076173, "grad_norm": 3.2939583547971076, "learning_rate": 4.3052891539599315e-06, "loss": 0.9511, "step": 9000 }, { "epoch": 1.5978742249778564, "grad_norm": 4.614562529533774, "learning_rate": 4.301005888136711e-06, "loss": 0.8829, "step": 9020 }, { "epoch": 1.6014171833480957, "grad_norm": 3.801219562406557, "learning_rate": 4.2967116033987015e-06, "loss": 0.912, "step": 9040 }, { "epoch": 1.6049601417183348, "grad_norm": 3.753568798707887, "learning_rate": 4.292406326019198e-06, "loss": 0.8699, "step": 9060 }, { "epoch": 1.6085031000885741, "grad_norm": 4.73422889733671, "learning_rate": 4.288090082338749e-06, "loss": 0.8836, "step": 9080 }, { "epoch": 1.6120460584588132, "grad_norm": 2.9548744217680873, "learning_rate": 4.283762898764998e-06, "loss": 0.8952, "step": 9100 }, { "epoch": 1.6155890168290523, "grad_norm": 3.206506941283786, "learning_rate": 4.2794248017725226e-06, "loss": 0.8603, "step": 9120 }, { "epoch": 1.6191319751992914, "grad_norm": 3.530944230336076, "learning_rate": 4.275075817902667e-06, "loss": 0.9217, "step": 9140 }, { "epoch": 1.6226749335695305, "grad_norm": 3.203969300098245, "learning_rate": 4.270715973763387e-06, "loss": 0.8971, "step": 9160 }, { "epoch": 1.6262178919397696, "grad_norm": 3.0239142090145052, "learning_rate": 4.2663452960290805e-06, "loss": 0.9334, "step": 9180 }, { "epoch": 1.6297608503100087, "grad_norm": 2.5777090574009, "learning_rate": 4.261963811440432e-06, "loss": 0.8392, "step": 9200 }, { "epoch": 1.633303808680248, "grad_norm": 4.058353343726418, "learning_rate": 4.25757154680424e-06, "loss": 0.8933, "step": 9220 }, { "epoch": 1.6368467670504872, "grad_norm": 3.507645687067127, "learning_rate": 4.253168528993261e-06, "loss": 0.8899, "step": 9240 }, { "epoch": 1.6403897254207263, "grad_norm": 3.513884615072957, "learning_rate": 4.248754784946038e-06, "loss": 0.9113, "step": 9260 }, { "epoch": 1.6439326837909656, "grad_norm": 2.773723856571234, "learning_rate": 4.244330341666743e-06, "loss": 0.9056, "step": 9280 }, { "epoch": 1.6474756421612047, "grad_norm": 3.4180543280648243, "learning_rate": 4.239895226225005e-06, "loss": 0.8966, "step": 9300 }, { "epoch": 1.6510186005314438, "grad_norm": 3.918770035240026, "learning_rate": 4.2354494657557485e-06, "loss": 0.8769, "step": 9320 }, { "epoch": 1.6545615589016829, "grad_norm": 3.372636086492444, "learning_rate": 4.230993087459028e-06, "loss": 0.8915, "step": 9340 }, { "epoch": 1.658104517271922, "grad_norm": 3.510054489740414, "learning_rate": 4.226526118599858e-06, "loss": 0.9184, "step": 9360 }, { "epoch": 1.661647475642161, "grad_norm": 2.0157440996114526, "learning_rate": 4.222048586508048e-06, "loss": 0.9172, "step": 9380 }, { "epoch": 1.6651904340124002, "grad_norm": 3.6068994268498074, "learning_rate": 4.2175605185780375e-06, "loss": 0.8873, "step": 9400 }, { "epoch": 1.6687333923826395, "grad_norm": 4.420449955455039, "learning_rate": 4.213061942268724e-06, "loss": 0.8436, "step": 9420 }, { "epoch": 1.6722763507528786, "grad_norm": 3.1825503459026168, "learning_rate": 4.208552885103299e-06, "loss": 0.8543, "step": 9440 }, { "epoch": 1.675819309123118, "grad_norm": 2.8992183977213535, "learning_rate": 4.204033374669077e-06, "loss": 0.8824, "step": 9460 }, { "epoch": 1.679362267493357, "grad_norm": 2.799570684359126, "learning_rate": 4.19950343861733e-06, "loss": 0.8671, "step": 9480 }, { "epoch": 1.6829052258635961, "grad_norm": 3.641582622193844, "learning_rate": 4.194963104663112e-06, "loss": 0.8628, "step": 9500 }, { "epoch": 1.6864481842338352, "grad_norm": 4.398919078323408, "learning_rate": 4.1904124005850954e-06, "loss": 0.9005, "step": 9520 }, { "epoch": 1.6899911426040743, "grad_norm": 2.9512298802973143, "learning_rate": 4.185851354225401e-06, "loss": 0.9078, "step": 9540 }, { "epoch": 1.6935341009743134, "grad_norm": 3.8044700286481246, "learning_rate": 4.181279993489423e-06, "loss": 0.9168, "step": 9560 }, { "epoch": 1.6970770593445526, "grad_norm": 3.308352478584529, "learning_rate": 4.176698346345663e-06, "loss": 0.8434, "step": 9580 }, { "epoch": 1.7006200177147919, "grad_norm": 3.904869182178536, "learning_rate": 4.1721064408255555e-06, "loss": 0.9005, "step": 9600 }, { "epoch": 1.704162976085031, "grad_norm": 2.6424522860538615, "learning_rate": 4.167504305023298e-06, "loss": 0.9278, "step": 9620 }, { "epoch": 1.7077059344552703, "grad_norm": 5.57177952217328, "learning_rate": 4.162891967095679e-06, "loss": 0.8677, "step": 9640 }, { "epoch": 1.7112488928255094, "grad_norm": 2.7234322141678686, "learning_rate": 4.158269455261906e-06, "loss": 0.8629, "step": 9660 }, { "epoch": 1.7147918511957485, "grad_norm": 4.837352939716468, "learning_rate": 4.1536367978034335e-06, "loss": 0.9231, "step": 9680 }, { "epoch": 1.7183348095659876, "grad_norm": 2.5650760898970653, "learning_rate": 4.148994023063787e-06, "loss": 0.91, "step": 9700 }, { "epoch": 1.7218777679362267, "grad_norm": 2.1623432794551216, "learning_rate": 4.1443411594483915e-06, "loss": 0.8876, "step": 9720 }, { "epoch": 1.7254207263064658, "grad_norm": 5.619389806185287, "learning_rate": 4.139678235424399e-06, "loss": 0.8599, "step": 9740 }, { "epoch": 1.728963684676705, "grad_norm": 4.021341243552535, "learning_rate": 4.135005279520514e-06, "loss": 0.9074, "step": 9760 }, { "epoch": 1.7325066430469442, "grad_norm": 3.5635632300979707, "learning_rate": 4.130322320326816e-06, "loss": 0.8933, "step": 9780 }, { "epoch": 1.7360496014171833, "grad_norm": 2.540277205876661, "learning_rate": 4.125629386494587e-06, "loss": 0.9291, "step": 9800 }, { "epoch": 1.7395925597874224, "grad_norm": 3.7695170204780957, "learning_rate": 4.120926506736137e-06, "loss": 0.903, "step": 9820 }, { "epoch": 1.7431355181576618, "grad_norm": 1.914697967954277, "learning_rate": 4.116213709824625e-06, "loss": 0.8321, "step": 9840 }, { "epoch": 1.7466784765279009, "grad_norm": 3.4586226366917194, "learning_rate": 4.111491024593889e-06, "loss": 0.8858, "step": 9860 }, { "epoch": 1.75022143489814, "grad_norm": 2.2832262533351195, "learning_rate": 4.10675847993826e-06, "loss": 0.9102, "step": 9880 }, { "epoch": 1.753764393268379, "grad_norm": 4.02421843778651, "learning_rate": 4.102016104812396e-06, "loss": 0.8392, "step": 9900 }, { "epoch": 1.7573073516386182, "grad_norm": 3.890461917932318, "learning_rate": 4.0972639282311e-06, "loss": 0.8785, "step": 9920 }, { "epoch": 1.7608503100088573, "grad_norm": 2.297623079323031, "learning_rate": 4.092501979269137e-06, "loss": 0.8855, "step": 9940 }, { "epoch": 1.7643932683790964, "grad_norm": 3.6677681482613216, "learning_rate": 4.087730287061065e-06, "loss": 0.8625, "step": 9960 }, { "epoch": 1.7679362267493357, "grad_norm": 3.9358016909534395, "learning_rate": 4.082948880801054e-06, "loss": 0.833, "step": 9980 }, { "epoch": 1.7714791851195748, "grad_norm": 4.267043354085718, "learning_rate": 4.078157789742706e-06, "loss": 0.9039, "step": 10000 }, { "epoch": 1.7714791851195748, "eval_loss": 0.8149307370185852, "eval_runtime": 369.1142, "eval_samples_per_second": 25.756, "eval_steps_per_second": 3.221, "step": 10000 }, { "epoch": 1.7750221434898141, "grad_norm": 3.7339229818305384, "learning_rate": 4.073357043198874e-06, "loss": 0.8925, "step": 10020 }, { "epoch": 1.7785651018600532, "grad_norm": 3.428649696183049, "learning_rate": 4.068546670541487e-06, "loss": 0.8256, "step": 10040 }, { "epoch": 1.7821080602302923, "grad_norm": 2.561924691977688, "learning_rate": 4.06372670120137e-06, "loss": 0.8685, "step": 10060 }, { "epoch": 1.7856510186005314, "grad_norm": 2.2436966577943855, "learning_rate": 4.05889716466806e-06, "loss": 0.8773, "step": 10080 }, { "epoch": 1.7891939769707705, "grad_norm": 2.5739823412815395, "learning_rate": 4.054058090489628e-06, "loss": 0.9268, "step": 10100 }, { "epoch": 1.7927369353410096, "grad_norm": 3.639815355685089, "learning_rate": 4.049209508272501e-06, "loss": 0.901, "step": 10120 }, { "epoch": 1.7962798937112487, "grad_norm": 4.803567565137283, "learning_rate": 4.044351447681276e-06, "loss": 0.8509, "step": 10140 }, { "epoch": 1.799822852081488, "grad_norm": 4.961777896889485, "learning_rate": 4.0394839384385395e-06, "loss": 0.9093, "step": 10160 }, { "epoch": 1.8033658104517272, "grad_norm": 3.0658650580570823, "learning_rate": 4.034607010324689e-06, "loss": 0.8937, "step": 10180 }, { "epoch": 1.8069087688219665, "grad_norm": 3.236704486524868, "learning_rate": 4.029720693177747e-06, "loss": 0.8738, "step": 10200 }, { "epoch": 1.8104517271922056, "grad_norm": 4.564091129067981, "learning_rate": 4.024825016893182e-06, "loss": 0.8737, "step": 10220 }, { "epoch": 1.8139946855624447, "grad_norm": 2.347358666581137, "learning_rate": 4.01992001142372e-06, "loss": 0.8632, "step": 10240 }, { "epoch": 1.8175376439326838, "grad_norm": 3.5395322977603927, "learning_rate": 4.015005706779169e-06, "loss": 0.8579, "step": 10260 }, { "epoch": 1.821080602302923, "grad_norm": 2.3411115987360827, "learning_rate": 4.010082133026229e-06, "loss": 0.9116, "step": 10280 }, { "epoch": 1.824623560673162, "grad_norm": 5.420278546986004, "learning_rate": 4.005149320288308e-06, "loss": 0.9216, "step": 10300 }, { "epoch": 1.828166519043401, "grad_norm": 3.177246662363069, "learning_rate": 4.000207298745347e-06, "loss": 0.8348, "step": 10320 }, { "epoch": 1.8317094774136404, "grad_norm": 3.260983474589408, "learning_rate": 3.995256098633618e-06, "loss": 0.8853, "step": 10340 }, { "epoch": 1.8352524357838795, "grad_norm": 4.262465852736409, "learning_rate": 3.9902957502455605e-06, "loss": 0.8776, "step": 10360 }, { "epoch": 1.8387953941541186, "grad_norm": 4.227525664714203, "learning_rate": 3.985326283929577e-06, "loss": 0.8614, "step": 10380 }, { "epoch": 1.842338352524358, "grad_norm": 3.1605952131114625, "learning_rate": 3.9803477300898574e-06, "loss": 0.863, "step": 10400 }, { "epoch": 1.845881310894597, "grad_norm": 2.751402878519728, "learning_rate": 3.975360119186192e-06, "loss": 0.8683, "step": 10420 }, { "epoch": 1.8494242692648362, "grad_norm": 3.1105411540099714, "learning_rate": 3.970363481733784e-06, "loss": 0.9019, "step": 10440 }, { "epoch": 1.8529672276350753, "grad_norm": 2.090165548223681, "learning_rate": 3.965357848303061e-06, "loss": 0.9317, "step": 10460 }, { "epoch": 1.8565101860053144, "grad_norm": 2.551059143441233, "learning_rate": 3.960343249519493e-06, "loss": 0.8711, "step": 10480 }, { "epoch": 1.8600531443755535, "grad_norm": 4.2607971452767766, "learning_rate": 3.955319716063397e-06, "loss": 0.8526, "step": 10500 }, { "epoch": 1.8635961027457926, "grad_norm": 1.8942460590897414, "learning_rate": 3.950287278669759e-06, "loss": 0.8988, "step": 10520 }, { "epoch": 1.867139061116032, "grad_norm": 3.6086912268779483, "learning_rate": 3.945245968128039e-06, "loss": 0.828, "step": 10540 }, { "epoch": 1.870682019486271, "grad_norm": 2.2171636599062294, "learning_rate": 3.940195815281984e-06, "loss": 0.8195, "step": 10560 }, { "epoch": 1.8742249778565103, "grad_norm": 3.3159052435056333, "learning_rate": 3.935136851029441e-06, "loss": 0.9019, "step": 10580 }, { "epoch": 1.8777679362267494, "grad_norm": 2.4712113724302998, "learning_rate": 3.930069106322167e-06, "loss": 0.867, "step": 10600 }, { "epoch": 1.8813108945969885, "grad_norm": 3.5773386774255473, "learning_rate": 3.924992612165638e-06, "loss": 0.9161, "step": 10620 }, { "epoch": 1.8848538529672276, "grad_norm": 3.63341154286338, "learning_rate": 3.919907399618864e-06, "loss": 0.9039, "step": 10640 }, { "epoch": 1.8883968113374667, "grad_norm": 2.5738022632421806, "learning_rate": 3.914813499794193e-06, "loss": 0.9, "step": 10660 }, { "epoch": 1.8919397697077058, "grad_norm": 3.282128941108443, "learning_rate": 3.909710943857125e-06, "loss": 0.8783, "step": 10680 }, { "epoch": 1.895482728077945, "grad_norm": 3.079231075434731, "learning_rate": 3.904599763026117e-06, "loss": 0.8829, "step": 10700 }, { "epoch": 1.8990256864481843, "grad_norm": 4.0065620278734055, "learning_rate": 3.899479988572401e-06, "loss": 0.9157, "step": 10720 }, { "epoch": 1.9025686448184234, "grad_norm": 4.857718153080153, "learning_rate": 3.89435165181978e-06, "loss": 0.8971, "step": 10740 }, { "epoch": 1.9061116031886627, "grad_norm": 5.451024881061867, "learning_rate": 3.8892147841444465e-06, "loss": 0.9133, "step": 10760 }, { "epoch": 1.9096545615589018, "grad_norm": 2.9783216918153688, "learning_rate": 3.884069416974785e-06, "loss": 0.8671, "step": 10780 }, { "epoch": 1.9131975199291409, "grad_norm": 4.130442740763265, "learning_rate": 3.878915581791184e-06, "loss": 0.8812, "step": 10800 }, { "epoch": 1.91674047829938, "grad_norm": 3.087103014469501, "learning_rate": 3.873753310125838e-06, "loss": 0.8596, "step": 10820 }, { "epoch": 1.920283436669619, "grad_norm": 3.4980061190534832, "learning_rate": 3.868582633562561e-06, "loss": 0.9347, "step": 10840 }, { "epoch": 1.9238263950398582, "grad_norm": 2.7736175156145353, "learning_rate": 3.863403583736586e-06, "loss": 0.8216, "step": 10860 }, { "epoch": 1.9273693534100973, "grad_norm": 3.2203528253737184, "learning_rate": 3.858216192334377e-06, "loss": 0.9563, "step": 10880 }, { "epoch": 1.9309123117803366, "grad_norm": 3.423660636662842, "learning_rate": 3.853020491093436e-06, "loss": 0.9045, "step": 10900 }, { "epoch": 1.9344552701505757, "grad_norm": 4.576810058530241, "learning_rate": 3.847816511802104e-06, "loss": 0.8914, "step": 10920 }, { "epoch": 1.9379982285208148, "grad_norm": 4.986380910931554, "learning_rate": 3.842604286299366e-06, "loss": 0.9222, "step": 10940 }, { "epoch": 1.9415411868910541, "grad_norm": 5.049582396983484, "learning_rate": 3.837383846474663e-06, "loss": 0.8764, "step": 10960 }, { "epoch": 1.9450841452612933, "grad_norm": 3.943843485476312, "learning_rate": 3.832155224267693e-06, "loss": 0.8614, "step": 10980 }, { "epoch": 1.9486271036315324, "grad_norm": 4.818067048868772, "learning_rate": 3.8269184516682114e-06, "loss": 0.8844, "step": 11000 }, { "epoch": 1.9521700620017715, "grad_norm": 2.6435129006124356, "learning_rate": 3.821673560715844e-06, "loss": 0.8859, "step": 11020 }, { "epoch": 1.9557130203720106, "grad_norm": 4.218938965276267, "learning_rate": 3.816420583499883e-06, "loss": 0.8694, "step": 11040 }, { "epoch": 1.9592559787422497, "grad_norm": 3.1248706808657993, "learning_rate": 3.811159552159097e-06, "loss": 0.8484, "step": 11060 }, { "epoch": 1.9627989371124888, "grad_norm": 3.806523511219131, "learning_rate": 3.8058904988815274e-06, "loss": 0.8471, "step": 11080 }, { "epoch": 1.966341895482728, "grad_norm": 3.51824367536095, "learning_rate": 3.800613455904299e-06, "loss": 0.9077, "step": 11100 }, { "epoch": 1.9698848538529672, "grad_norm": 3.3138759907728677, "learning_rate": 3.795328455513418e-06, "loss": 0.8493, "step": 11120 }, { "epoch": 1.9734278122232065, "grad_norm": 2.581058466584183, "learning_rate": 3.7900355300435744e-06, "loss": 0.8834, "step": 11140 }, { "epoch": 1.9769707705934456, "grad_norm": 2.712205874433446, "learning_rate": 3.7847347118779464e-06, "loss": 0.8902, "step": 11160 }, { "epoch": 1.9805137289636847, "grad_norm": 1.9869529582535677, "learning_rate": 3.7794260334480026e-06, "loss": 0.849, "step": 11180 }, { "epoch": 1.9840566873339238, "grad_norm": 5.413750266589672, "learning_rate": 3.7741095272333008e-06, "loss": 0.8644, "step": 11200 }, { "epoch": 1.987599645704163, "grad_norm": 2.941600930177304, "learning_rate": 3.76878522576129e-06, "loss": 0.8478, "step": 11220 }, { "epoch": 1.991142604074402, "grad_norm": 4.120948714739335, "learning_rate": 3.7634531616071137e-06, "loss": 0.9119, "step": 11240 }, { "epoch": 1.9946855624446411, "grad_norm": 3.1166160914538636, "learning_rate": 3.758113367393409e-06, "loss": 0.8953, "step": 11260 }, { "epoch": 1.9982285208148804, "grad_norm": 3.148186400991171, "learning_rate": 3.7527658757901046e-06, "loss": 0.8957, "step": 11280 }, { "epoch": 2.0017714791851198, "grad_norm": 4.237349805163588, "learning_rate": 3.7474107195142273e-06, "loss": 0.8242, "step": 11300 }, { "epoch": 2.005314437555359, "grad_norm": 3.964803901086038, "learning_rate": 3.7420479313296964e-06, "loss": 0.733, "step": 11320 }, { "epoch": 2.008857395925598, "grad_norm": 2.851459294376265, "learning_rate": 3.7366775440471213e-06, "loss": 0.8224, "step": 11340 }, { "epoch": 2.012400354295837, "grad_norm": 3.6339236231049847, "learning_rate": 3.7312995905236105e-06, "loss": 0.8078, "step": 11360 }, { "epoch": 2.015943312666076, "grad_norm": 2.394977571241479, "learning_rate": 3.725914103662559e-06, "loss": 0.7777, "step": 11380 }, { "epoch": 2.0194862710363153, "grad_norm": 3.5149972337594475, "learning_rate": 3.7205211164134547e-06, "loss": 0.7742, "step": 11400 }, { "epoch": 2.0230292294065544, "grad_norm": 4.113453512038928, "learning_rate": 3.7151206617716734e-06, "loss": 0.7539, "step": 11420 }, { "epoch": 2.0265721877767935, "grad_norm": 3.2750961089816353, "learning_rate": 3.709712772778279e-06, "loss": 0.7788, "step": 11440 }, { "epoch": 2.0301151461470326, "grad_norm": 4.035570508399602, "learning_rate": 3.70429748251982e-06, "loss": 0.7829, "step": 11460 }, { "epoch": 2.033658104517272, "grad_norm": 4.436303471963281, "learning_rate": 3.698874824128126e-06, "loss": 0.7311, "step": 11480 }, { "epoch": 2.0372010628875112, "grad_norm": 5.818707534945472, "learning_rate": 3.693444830780107e-06, "loss": 0.773, "step": 11500 }, { "epoch": 2.0407440212577503, "grad_norm": 3.540550130014629, "learning_rate": 3.6880075356975515e-06, "loss": 0.7814, "step": 11520 }, { "epoch": 2.0442869796279894, "grad_norm": 3.6346205822234943, "learning_rate": 3.6825629721469188e-06, "loss": 0.8135, "step": 11540 }, { "epoch": 2.0478299379982285, "grad_norm": 2.4682825796443475, "learning_rate": 3.6771111734391397e-06, "loss": 0.745, "step": 11560 }, { "epoch": 2.0513728963684676, "grad_norm": 2.7295024854988195, "learning_rate": 3.6716521729294104e-06, "loss": 0.7792, "step": 11580 }, { "epoch": 2.0549158547387067, "grad_norm": 3.3999116860452525, "learning_rate": 3.66618600401699e-06, "loss": 0.7841, "step": 11600 }, { "epoch": 2.058458813108946, "grad_norm": 1.979950381057992, "learning_rate": 3.660712700144995e-06, "loss": 0.7577, "step": 11620 }, { "epoch": 2.062001771479185, "grad_norm": 2.961231731307014, "learning_rate": 3.655232294800194e-06, "loss": 0.8112, "step": 11640 }, { "epoch": 2.065544729849424, "grad_norm": 2.9168214431871546, "learning_rate": 3.6497448215128054e-06, "loss": 0.7407, "step": 11660 }, { "epoch": 2.0690876882196636, "grad_norm": 3.3537918642406814, "learning_rate": 3.6442503138562902e-06, "loss": 0.7432, "step": 11680 }, { "epoch": 2.0726306465899027, "grad_norm": 2.885605458154861, "learning_rate": 3.638748805447146e-06, "loss": 0.7657, "step": 11700 }, { "epoch": 2.076173604960142, "grad_norm": 4.064439601513717, "learning_rate": 3.6332403299447046e-06, "loss": 0.7374, "step": 11720 }, { "epoch": 2.079716563330381, "grad_norm": 4.214372884804176, "learning_rate": 3.6277249210509208e-06, "loss": 0.758, "step": 11740 }, { "epoch": 2.08325952170062, "grad_norm": 3.8146529692577937, "learning_rate": 3.6222026125101717e-06, "loss": 0.7635, "step": 11760 }, { "epoch": 2.086802480070859, "grad_norm": 4.249159301555663, "learning_rate": 3.6166734381090483e-06, "loss": 0.7928, "step": 11780 }, { "epoch": 2.090345438441098, "grad_norm": 5.7403075464380375, "learning_rate": 3.611137431676146e-06, "loss": 0.7451, "step": 11800 }, { "epoch": 2.0938883968113373, "grad_norm": 3.6236953170629667, "learning_rate": 3.605594627081861e-06, "loss": 0.7332, "step": 11820 }, { "epoch": 2.0974313551815764, "grad_norm": 3.240921081992807, "learning_rate": 3.6000450582381823e-06, "loss": 0.75, "step": 11840 }, { "epoch": 2.100974313551816, "grad_norm": 4.877179694529326, "learning_rate": 3.5944887590984846e-06, "loss": 0.7824, "step": 11860 }, { "epoch": 2.104517271922055, "grad_norm": 2.2176442012470576, "learning_rate": 3.5889257636573183e-06, "loss": 0.7936, "step": 11880 }, { "epoch": 2.108060230292294, "grad_norm": 4.221294517545139, "learning_rate": 3.583356105950203e-06, "loss": 0.7548, "step": 11900 }, { "epoch": 2.1116031886625333, "grad_norm": 3.8503143338096786, "learning_rate": 3.5777798200534214e-06, "loss": 0.7967, "step": 11920 }, { "epoch": 2.1151461470327724, "grad_norm": 3.1910071043427326, "learning_rate": 3.5721969400838073e-06, "loss": 0.7786, "step": 11940 }, { "epoch": 2.1186891054030115, "grad_norm": 2.860111918382593, "learning_rate": 3.5666075001985386e-06, "loss": 0.7517, "step": 11960 }, { "epoch": 2.1222320637732506, "grad_norm": 2.0905030982179134, "learning_rate": 3.561011534594928e-06, "loss": 0.7558, "step": 11980 }, { "epoch": 2.1257750221434897, "grad_norm": 5.060645708453129, "learning_rate": 3.555409077510215e-06, "loss": 0.7414, "step": 12000 }, { "epoch": 2.1257750221434897, "eval_loss": 0.8194052577018738, "eval_runtime": 368.7991, "eval_samples_per_second": 25.778, "eval_steps_per_second": 3.224, "step": 12000 }, { "epoch": 2.129317980513729, "grad_norm": 3.6709097863394655, "learning_rate": 3.549800163221353e-06, "loss": 0.7369, "step": 12020 }, { "epoch": 2.132860938883968, "grad_norm": 4.6404940480464125, "learning_rate": 3.5441848260448035e-06, "loss": 0.7919, "step": 12040 }, { "epoch": 2.1364038972542074, "grad_norm": 3.4473691964125353, "learning_rate": 3.5385631003363245e-06, "loss": 0.7841, "step": 12060 }, { "epoch": 2.1399468556244465, "grad_norm": 4.36997053485404, "learning_rate": 3.532935020490761e-06, "loss": 0.7681, "step": 12080 }, { "epoch": 2.1434898139946856, "grad_norm": 3.642775603252494, "learning_rate": 3.5273006209418297e-06, "loss": 0.7377, "step": 12100 }, { "epoch": 2.1470327723649247, "grad_norm": 2.9088288240530806, "learning_rate": 3.5216599361619193e-06, "loss": 0.7356, "step": 12120 }, { "epoch": 2.150575730735164, "grad_norm": 4.94624315541774, "learning_rate": 3.5160130006618665e-06, "loss": 0.7688, "step": 12140 }, { "epoch": 2.154118689105403, "grad_norm": 3.431747470285621, "learning_rate": 3.5103598489907553e-06, "loss": 0.7322, "step": 12160 }, { "epoch": 2.157661647475642, "grad_norm": 3.5652670345700876, "learning_rate": 3.5047005157357e-06, "loss": 0.7752, "step": 12180 }, { "epoch": 2.161204605845881, "grad_norm": 4.315860117945172, "learning_rate": 3.4990350355216347e-06, "loss": 0.7443, "step": 12200 }, { "epoch": 2.1647475642161202, "grad_norm": 3.963300128138939, "learning_rate": 3.493363443011102e-06, "loss": 0.7595, "step": 12220 }, { "epoch": 2.16829052258636, "grad_norm": 3.3278580876056623, "learning_rate": 3.487685772904041e-06, "loss": 0.7573, "step": 12240 }, { "epoch": 2.171833480956599, "grad_norm": 3.6502088387557516, "learning_rate": 3.4820020599375755e-06, "loss": 0.7675, "step": 12260 }, { "epoch": 2.175376439326838, "grad_norm": 2.110435980731087, "learning_rate": 3.476312338885799e-06, "loss": 0.7659, "step": 12280 }, { "epoch": 2.178919397697077, "grad_norm": 3.132585579346833, "learning_rate": 3.4706166445595657e-06, "loss": 0.7691, "step": 12300 }, { "epoch": 2.182462356067316, "grad_norm": 5.677900838287172, "learning_rate": 3.4649150118062737e-06, "loss": 0.7543, "step": 12320 }, { "epoch": 2.1860053144375553, "grad_norm": 3.5849859798668047, "learning_rate": 3.4592074755096533e-06, "loss": 0.7485, "step": 12340 }, { "epoch": 2.1895482728077944, "grad_norm": 2.897975653877466, "learning_rate": 3.453494070589556e-06, "loss": 0.741, "step": 12360 }, { "epoch": 2.1930912311780335, "grad_norm": 2.300880578954949, "learning_rate": 3.4477748320017386e-06, "loss": 0.7245, "step": 12380 }, { "epoch": 2.1966341895482726, "grad_norm": 3.4934403676076213, "learning_rate": 3.442049794737647e-06, "loss": 0.7645, "step": 12400 }, { "epoch": 2.200177147918512, "grad_norm": 3.0309935521375695, "learning_rate": 3.436318993824206e-06, "loss": 0.7822, "step": 12420 }, { "epoch": 2.2037201062887513, "grad_norm": 5.162246999416868, "learning_rate": 3.430582464323603e-06, "loss": 0.7638, "step": 12440 }, { "epoch": 2.2072630646589904, "grad_norm": 4.3206905241817175, "learning_rate": 3.4248402413330766e-06, "loss": 0.7872, "step": 12460 }, { "epoch": 2.2108060230292295, "grad_norm": 4.236342233174995, "learning_rate": 3.419092359984695e-06, "loss": 0.7546, "step": 12480 }, { "epoch": 2.2143489813994686, "grad_norm": 2.8151366817841756, "learning_rate": 3.41333885544515e-06, "loss": 0.7635, "step": 12500 }, { "epoch": 2.2178919397697077, "grad_norm": 3.2839030456741978, "learning_rate": 3.4075797629155336e-06, "loss": 0.7588, "step": 12520 }, { "epoch": 2.2214348981399468, "grad_norm": 3.0947148397280997, "learning_rate": 3.4018151176311267e-06, "loss": 0.7277, "step": 12540 }, { "epoch": 2.224977856510186, "grad_norm": 3.5428621372363063, "learning_rate": 3.396044954861185e-06, "loss": 0.7679, "step": 12560 }, { "epoch": 2.228520814880425, "grad_norm": 2.02419126865859, "learning_rate": 3.39026930990872e-06, "loss": 0.7446, "step": 12580 }, { "epoch": 2.2320637732506645, "grad_norm": 4.0306736526937765, "learning_rate": 3.384488218110285e-06, "loss": 0.7599, "step": 12600 }, { "epoch": 2.2356067316209036, "grad_norm": 3.3079541839461606, "learning_rate": 3.378701714835756e-06, "loss": 0.7325, "step": 12620 }, { "epoch": 2.2391496899911427, "grad_norm": 3.6500019106828754, "learning_rate": 3.3729098354881207e-06, "loss": 0.7834, "step": 12640 }, { "epoch": 2.242692648361382, "grad_norm": 2.9776776199055073, "learning_rate": 3.367112615503256e-06, "loss": 0.7479, "step": 12660 }, { "epoch": 2.246235606731621, "grad_norm": 2.8136251511132, "learning_rate": 3.3613100903497165e-06, "loss": 0.7972, "step": 12680 }, { "epoch": 2.24977856510186, "grad_norm": 2.6979166100675633, "learning_rate": 3.355502295528512e-06, "loss": 0.785, "step": 12700 }, { "epoch": 2.253321523472099, "grad_norm": 2.555788234192105, "learning_rate": 3.349689266572896e-06, "loss": 0.7337, "step": 12720 }, { "epoch": 2.2568644818423382, "grad_norm": 4.085833264183358, "learning_rate": 3.3438710390481423e-06, "loss": 0.7795, "step": 12740 }, { "epoch": 2.2604074402125773, "grad_norm": 4.190825689674867, "learning_rate": 3.338047648551333e-06, "loss": 0.7946, "step": 12760 }, { "epoch": 2.263950398582817, "grad_norm": 3.1997981806226496, "learning_rate": 3.3322191307111386e-06, "loss": 0.7573, "step": 12780 }, { "epoch": 2.267493356953056, "grad_norm": 3.0496961823219237, "learning_rate": 3.326385521187598e-06, "loss": 0.7191, "step": 12800 }, { "epoch": 2.271036315323295, "grad_norm": 2.3669933953301996, "learning_rate": 3.320546855671903e-06, "loss": 0.7787, "step": 12820 }, { "epoch": 2.274579273693534, "grad_norm": 4.128804064988176, "learning_rate": 3.3147031698861783e-06, "loss": 0.8122, "step": 12840 }, { "epoch": 2.2781222320637733, "grad_norm": 3.6541280879376288, "learning_rate": 3.308854499583265e-06, "loss": 0.8089, "step": 12860 }, { "epoch": 2.2816651904340124, "grad_norm": 3.9405511693030513, "learning_rate": 3.3030008805464987e-06, "loss": 0.7806, "step": 12880 }, { "epoch": 2.2852081488042515, "grad_norm": 2.7534555896912183, "learning_rate": 3.297142348589493e-06, "loss": 0.7826, "step": 12900 }, { "epoch": 2.2887511071744906, "grad_norm": 3.3797355039239956, "learning_rate": 3.2912789395559226e-06, "loss": 0.8049, "step": 12920 }, { "epoch": 2.2922940655447297, "grad_norm": 3.9407118834982677, "learning_rate": 3.285410689319295e-06, "loss": 0.7897, "step": 12940 }, { "epoch": 2.2958370239149692, "grad_norm": 2.4540627878279713, "learning_rate": 3.2795376337827416e-06, "loss": 0.7869, "step": 12960 }, { "epoch": 2.299379982285208, "grad_norm": 5.328758496807459, "learning_rate": 3.273659808878794e-06, "loss": 0.7567, "step": 12980 }, { "epoch": 2.3029229406554474, "grad_norm": 2.939528537204539, "learning_rate": 3.2677772505691614e-06, "loss": 0.7337, "step": 13000 }, { "epoch": 2.3064658990256866, "grad_norm": 2.1870838009976206, "learning_rate": 3.2618899948445143e-06, "loss": 0.8036, "step": 13020 }, { "epoch": 2.3100088573959257, "grad_norm": 2.3749989783363037, "learning_rate": 3.255998077724261e-06, "loss": 0.7477, "step": 13040 }, { "epoch": 2.3135518157661648, "grad_norm": 4.691180192385719, "learning_rate": 3.250101535256333e-06, "loss": 0.7982, "step": 13060 }, { "epoch": 2.317094774136404, "grad_norm": 3.160712448911439, "learning_rate": 3.2442004035169566e-06, "loss": 0.7429, "step": 13080 }, { "epoch": 2.320637732506643, "grad_norm": 3.397209885685089, "learning_rate": 3.2382947186104385e-06, "loss": 0.7749, "step": 13100 }, { "epoch": 2.324180690876882, "grad_norm": 3.15455456395674, "learning_rate": 3.232384516668943e-06, "loss": 0.7693, "step": 13120 }, { "epoch": 2.327723649247121, "grad_norm": 5.565654365935215, "learning_rate": 3.2264698338522664e-06, "loss": 0.772, "step": 13140 }, { "epoch": 2.3312666076173603, "grad_norm": 4.698072365225265, "learning_rate": 3.2205507063476255e-06, "loss": 0.7808, "step": 13160 }, { "epoch": 2.3348095659876, "grad_norm": 3.59501906125545, "learning_rate": 3.2146271703694277e-06, "loss": 0.7505, "step": 13180 }, { "epoch": 2.338352524357839, "grad_norm": 3.679313411486785, "learning_rate": 3.208699262159052e-06, "loss": 0.7336, "step": 13200 }, { "epoch": 2.341895482728078, "grad_norm": 2.8791170352318276, "learning_rate": 3.2027670179846294e-06, "loss": 0.7307, "step": 13220 }, { "epoch": 2.345438441098317, "grad_norm": 4.059760879698832, "learning_rate": 3.196830474140816e-06, "loss": 0.753, "step": 13240 }, { "epoch": 2.348981399468556, "grad_norm": 2.996057141115257, "learning_rate": 3.190889666948579e-06, "loss": 0.7399, "step": 13260 }, { "epoch": 2.3525243578387953, "grad_norm": 4.234482920426088, "learning_rate": 3.184944632754964e-06, "loss": 0.7904, "step": 13280 }, { "epoch": 2.3560673162090344, "grad_norm": 2.3964341925650463, "learning_rate": 3.1789954079328835e-06, "loss": 0.7534, "step": 13300 }, { "epoch": 2.3596102745792735, "grad_norm": 2.9159272754142918, "learning_rate": 3.1730420288808862e-06, "loss": 0.7834, "step": 13320 }, { "epoch": 2.3631532329495126, "grad_norm": 3.781684460395673, "learning_rate": 3.1670845320229355e-06, "loss": 0.771, "step": 13340 }, { "epoch": 2.366696191319752, "grad_norm": 2.826416108672929, "learning_rate": 3.161122953808192e-06, "loss": 0.7354, "step": 13360 }, { "epoch": 2.3702391496899913, "grad_norm": 5.104935252190013, "learning_rate": 3.1551573307107867e-06, "loss": 0.7503, "step": 13380 }, { "epoch": 2.3737821080602304, "grad_norm": 4.124050748848566, "learning_rate": 3.149187699229595e-06, "loss": 0.7191, "step": 13400 }, { "epoch": 2.3773250664304695, "grad_norm": 4.433917574735756, "learning_rate": 3.1432140958880186e-06, "loss": 0.7036, "step": 13420 }, { "epoch": 2.3808680248007086, "grad_norm": 6.0333723611681975, "learning_rate": 3.1372365572337592e-06, "loss": 0.6947, "step": 13440 }, { "epoch": 2.3844109831709477, "grad_norm": 3.496479107809089, "learning_rate": 3.1312551198385964e-06, "loss": 0.8186, "step": 13460 }, { "epoch": 2.387953941541187, "grad_norm": 2.9985043132815608, "learning_rate": 3.1252698202981613e-06, "loss": 0.762, "step": 13480 }, { "epoch": 2.391496899911426, "grad_norm": 3.5778965231230733, "learning_rate": 3.1192806952317155e-06, "loss": 0.7475, "step": 13500 }, { "epoch": 2.395039858281665, "grad_norm": 2.4853017301906046, "learning_rate": 3.113287781281927e-06, "loss": 0.7673, "step": 13520 }, { "epoch": 2.3985828166519045, "grad_norm": 4.385979644321999, "learning_rate": 3.107291115114643e-06, "loss": 0.7664, "step": 13540 }, { "epoch": 2.4021257750221436, "grad_norm": 3.519870816810653, "learning_rate": 3.1012907334186676e-06, "loss": 0.7708, "step": 13560 }, { "epoch": 2.4056687333923827, "grad_norm": 3.6501676511740913, "learning_rate": 3.09528667290554e-06, "loss": 0.7354, "step": 13580 }, { "epoch": 2.409211691762622, "grad_norm": 5.282285428849241, "learning_rate": 3.0892789703093025e-06, "loss": 0.7679, "step": 13600 }, { "epoch": 2.412754650132861, "grad_norm": 4.970772234231322, "learning_rate": 3.0832676623862847e-06, "loss": 0.7753, "step": 13620 }, { "epoch": 2.4162976085031, "grad_norm": 5.300930589688842, "learning_rate": 3.0772527859148726e-06, "loss": 0.7309, "step": 13640 }, { "epoch": 2.419840566873339, "grad_norm": 3.195715973824964, "learning_rate": 3.0712343776952845e-06, "loss": 0.8118, "step": 13660 }, { "epoch": 2.4233835252435783, "grad_norm": 3.8485686008854025, "learning_rate": 3.0652124745493483e-06, "loss": 0.7677, "step": 13680 }, { "epoch": 2.4269264836138174, "grad_norm": 3.6307249326313844, "learning_rate": 3.0591871133202733e-06, "loss": 0.7562, "step": 13700 }, { "epoch": 2.430469441984057, "grad_norm": 3.316064861185543, "learning_rate": 3.0531583308724267e-06, "loss": 0.7626, "step": 13720 }, { "epoch": 2.434012400354296, "grad_norm": 6.634549317490503, "learning_rate": 3.0471261640911065e-06, "loss": 0.758, "step": 13740 }, { "epoch": 2.437555358724535, "grad_norm": 3.175037432084709, "learning_rate": 3.0410906498823176e-06, "loss": 0.747, "step": 13760 }, { "epoch": 2.441098317094774, "grad_norm": 4.104765159107976, "learning_rate": 3.0350518251725466e-06, "loss": 0.7529, "step": 13780 }, { "epoch": 2.4446412754650133, "grad_norm": 4.108216250616023, "learning_rate": 3.02900972690853e-06, "loss": 0.7329, "step": 13800 }, { "epoch": 2.4481842338352524, "grad_norm": 5.179386294221434, "learning_rate": 3.0229643920570368e-06, "loss": 0.7756, "step": 13820 }, { "epoch": 2.4517271922054915, "grad_norm": 3.532213415891956, "learning_rate": 3.0169158576046364e-06, "loss": 0.7857, "step": 13840 }, { "epoch": 2.4552701505757306, "grad_norm": 3.1668720104134898, "learning_rate": 3.0108641605574746e-06, "loss": 0.7689, "step": 13860 }, { "epoch": 2.4588131089459697, "grad_norm": 5.323518128860221, "learning_rate": 3.0048093379410455e-06, "loss": 0.7193, "step": 13880 }, { "epoch": 2.4623560673162093, "grad_norm": 2.8074163772217346, "learning_rate": 2.998751426799967e-06, "loss": 0.7663, "step": 13900 }, { "epoch": 2.4658990256864484, "grad_norm": 3.3433870569143402, "learning_rate": 2.9926904641977524e-06, "loss": 0.7351, "step": 13920 }, { "epoch": 2.4694419840566875, "grad_norm": 4.317693104328955, "learning_rate": 2.986626487216586e-06, "loss": 0.7303, "step": 13940 }, { "epoch": 2.4729849424269266, "grad_norm": 2.9507964917970093, "learning_rate": 2.9805595329570926e-06, "loss": 0.7355, "step": 13960 }, { "epoch": 2.4765279007971657, "grad_norm": 5.5399797509230035, "learning_rate": 2.974489638538115e-06, "loss": 0.7673, "step": 13980 }, { "epoch": 2.4800708591674048, "grad_norm": 6.341760350289788, "learning_rate": 2.9684168410964815e-06, "loss": 0.7332, "step": 14000 }, { "epoch": 2.4800708591674048, "eval_loss": 0.8049691915512085, "eval_runtime": 366.4293, "eval_samples_per_second": 25.945, "eval_steps_per_second": 3.245, "step": 14000 }, { "epoch": 2.483613817537644, "grad_norm": 3.362490165226843, "learning_rate": 2.9623411777867845e-06, "loss": 0.8132, "step": 14020 }, { "epoch": 2.487156775907883, "grad_norm": 3.067701488951309, "learning_rate": 2.9562626857811486e-06, "loss": 0.73, "step": 14040 }, { "epoch": 2.490699734278122, "grad_norm": 5.481445388434678, "learning_rate": 2.950181402269007e-06, "loss": 0.7727, "step": 14060 }, { "epoch": 2.4942426926483616, "grad_norm": 3.704158248514089, "learning_rate": 2.944097364456867e-06, "loss": 0.7594, "step": 14080 }, { "epoch": 2.4977856510186003, "grad_norm": 3.703332272719033, "learning_rate": 2.9380106095680943e-06, "loss": 0.7816, "step": 14100 }, { "epoch": 2.50132860938884, "grad_norm": 5.075701987314385, "learning_rate": 2.931921174842672e-06, "loss": 0.8127, "step": 14120 }, { "epoch": 2.504871567759079, "grad_norm": 3.3757570715718126, "learning_rate": 2.925829097536983e-06, "loss": 0.7594, "step": 14140 }, { "epoch": 2.508414526129318, "grad_norm": 5.003414946307021, "learning_rate": 2.9197344149235762e-06, "loss": 0.802, "step": 14160 }, { "epoch": 2.511957484499557, "grad_norm": 3.883907931049023, "learning_rate": 2.9136371642909406e-06, "loss": 0.7292, "step": 14180 }, { "epoch": 2.5155004428697962, "grad_norm": 4.7644188631188555, "learning_rate": 2.9075373829432766e-06, "loss": 0.7899, "step": 14200 }, { "epoch": 2.5190434012400353, "grad_norm": 3.1005288937544795, "learning_rate": 2.901435108200269e-06, "loss": 0.7501, "step": 14220 }, { "epoch": 2.5225863596102744, "grad_norm": 4.202314740845049, "learning_rate": 2.8953303773968566e-06, "loss": 0.733, "step": 14240 }, { "epoch": 2.526129317980514, "grad_norm": 3.0865315047496256, "learning_rate": 2.889223227883006e-06, "loss": 0.7218, "step": 14260 }, { "epoch": 2.5296722763507526, "grad_norm": 3.687380874866973, "learning_rate": 2.8831136970234798e-06, "loss": 0.7539, "step": 14280 }, { "epoch": 2.533215234720992, "grad_norm": 4.787195673053088, "learning_rate": 2.8770018221976126e-06, "loss": 0.7733, "step": 14300 }, { "epoch": 2.5367581930912313, "grad_norm": 3.131210531061707, "learning_rate": 2.8708876407990794e-06, "loss": 0.8023, "step": 14320 }, { "epoch": 2.5403011514614704, "grad_norm": 6.575229105312035, "learning_rate": 2.8647711902356653e-06, "loss": 0.7857, "step": 14340 }, { "epoch": 2.5438441098317095, "grad_norm": 3.132660461857889, "learning_rate": 2.858652507929042e-06, "loss": 0.6994, "step": 14360 }, { "epoch": 2.5473870682019486, "grad_norm": 4.422023062404442, "learning_rate": 2.852531631314531e-06, "loss": 0.7629, "step": 14380 }, { "epoch": 2.5509300265721877, "grad_norm": 6.144792250633855, "learning_rate": 2.846408597840884e-06, "loss": 0.8015, "step": 14400 }, { "epoch": 2.554472984942427, "grad_norm": 4.254153197416055, "learning_rate": 2.8402834449700444e-06, "loss": 0.8166, "step": 14420 }, { "epoch": 2.5580159433126664, "grad_norm": 4.276352369102927, "learning_rate": 2.8341562101769258e-06, "loss": 0.7488, "step": 14440 }, { "epoch": 2.561558901682905, "grad_norm": 2.7388450962012008, "learning_rate": 2.8280269309491783e-06, "loss": 0.731, "step": 14460 }, { "epoch": 2.5651018600531446, "grad_norm": 3.155033747973735, "learning_rate": 2.821895644786958e-06, "loss": 0.7601, "step": 14480 }, { "epoch": 2.5686448184233837, "grad_norm": 3.8509609959663242, "learning_rate": 2.815762389202703e-06, "loss": 0.7773, "step": 14500 }, { "epoch": 2.5721877767936228, "grad_norm": 2.7079281092995617, "learning_rate": 2.8096272017208996e-06, "loss": 0.6832, "step": 14520 }, { "epoch": 2.575730735163862, "grad_norm": 4.486116320592051, "learning_rate": 2.8034901198778537e-06, "loss": 0.7792, "step": 14540 }, { "epoch": 2.579273693534101, "grad_norm": 3.703073425582054, "learning_rate": 2.7973511812214614e-06, "loss": 0.7297, "step": 14560 }, { "epoch": 2.58281665190434, "grad_norm": 4.327931475336945, "learning_rate": 2.79121042331098e-06, "loss": 0.7332, "step": 14580 }, { "epoch": 2.586359610274579, "grad_norm": 4.730270311129735, "learning_rate": 2.7850678837167943e-06, "loss": 0.7537, "step": 14600 }, { "epoch": 2.5899025686448183, "grad_norm": 4.061197200463917, "learning_rate": 2.778923600020193e-06, "loss": 0.7364, "step": 14620 }, { "epoch": 2.5934455270150574, "grad_norm": 5.875022772190702, "learning_rate": 2.7727776098131355e-06, "loss": 0.763, "step": 14640 }, { "epoch": 2.596988485385297, "grad_norm": 4.2525211564332155, "learning_rate": 2.76662995069802e-06, "loss": 0.7106, "step": 14660 }, { "epoch": 2.600531443755536, "grad_norm": 2.812095781009687, "learning_rate": 2.760480660287457e-06, "loss": 0.7964, "step": 14680 }, { "epoch": 2.604074402125775, "grad_norm": 2.982535796988278, "learning_rate": 2.7543297762040367e-06, "loss": 0.7471, "step": 14700 }, { "epoch": 2.6076173604960142, "grad_norm": 3.3609650163002285, "learning_rate": 2.748177336080099e-06, "loss": 0.7404, "step": 14720 }, { "epoch": 2.6111603188662533, "grad_norm": 3.3659754899747183, "learning_rate": 2.7420233775575062e-06, "loss": 0.7088, "step": 14740 }, { "epoch": 2.6147032772364924, "grad_norm": 4.8930238508897865, "learning_rate": 2.73586793828741e-06, "loss": 0.8097, "step": 14760 }, { "epoch": 2.6182462356067315, "grad_norm": 3.7139806486443523, "learning_rate": 2.7297110559300196e-06, "loss": 0.726, "step": 14780 }, { "epoch": 2.6217891939769706, "grad_norm": 5.786625818280847, "learning_rate": 2.7235527681543745e-06, "loss": 0.7663, "step": 14800 }, { "epoch": 2.6253321523472097, "grad_norm": 2.4715288407192917, "learning_rate": 2.717393112638113e-06, "loss": 0.8067, "step": 14820 }, { "epoch": 2.6288751107174493, "grad_norm": 2.48294124433425, "learning_rate": 2.7112321270672427e-06, "loss": 0.7436, "step": 14840 }, { "epoch": 2.6324180690876884, "grad_norm": 4.533346896730641, "learning_rate": 2.705069849135905e-06, "loss": 0.7542, "step": 14860 }, { "epoch": 2.6359610274579275, "grad_norm": 2.595943549639139, "learning_rate": 2.698906316546154e-06, "loss": 0.7206, "step": 14880 }, { "epoch": 2.6395039858281666, "grad_norm": 4.011814531987077, "learning_rate": 2.6927415670077133e-06, "loss": 0.7981, "step": 14900 }, { "epoch": 2.6430469441984057, "grad_norm": 3.060330539530146, "learning_rate": 2.6865756382377577e-06, "loss": 0.7805, "step": 14920 }, { "epoch": 2.646589902568645, "grad_norm": 2.9995394478553803, "learning_rate": 2.6804085679606735e-06, "loss": 0.7601, "step": 14940 }, { "epoch": 2.650132860938884, "grad_norm": 5.280778921498777, "learning_rate": 2.674240393907832e-06, "loss": 0.7646, "step": 14960 }, { "epoch": 2.653675819309123, "grad_norm": 3.798446112260494, "learning_rate": 2.6680711538173595e-06, "loss": 0.7871, "step": 14980 }, { "epoch": 2.657218777679362, "grad_norm": 5.01912287250632, "learning_rate": 2.661900885433899e-06, "loss": 0.745, "step": 15000 }, { "epoch": 2.6607617360496016, "grad_norm": 2.960773034701044, "learning_rate": 2.6557296265083917e-06, "loss": 0.7822, "step": 15020 }, { "epoch": 2.6643046944198403, "grad_norm": 3.6183436876712287, "learning_rate": 2.649557414797834e-06, "loss": 0.7811, "step": 15040 }, { "epoch": 2.66784765279008, "grad_norm": 2.9204062861952242, "learning_rate": 2.6433842880650552e-06, "loss": 0.7684, "step": 15060 }, { "epoch": 2.671390611160319, "grad_norm": 3.8802496727785587, "learning_rate": 2.63721028407848e-06, "loss": 0.6913, "step": 15080 }, { "epoch": 2.674933569530558, "grad_norm": 2.5788899500499554, "learning_rate": 2.6310354406119022e-06, "loss": 0.7309, "step": 15100 }, { "epoch": 2.678476527900797, "grad_norm": 4.277286678336026, "learning_rate": 2.6248597954442493e-06, "loss": 0.7644, "step": 15120 }, { "epoch": 2.6820194862710363, "grad_norm": 3.188282097160713, "learning_rate": 2.6186833863593576e-06, "loss": 0.7619, "step": 15140 }, { "epoch": 2.6855624446412754, "grad_norm": 2.200272309526565, "learning_rate": 2.6125062511457344e-06, "loss": 0.7518, "step": 15160 }, { "epoch": 2.6891054030115145, "grad_norm": 4.286800017508891, "learning_rate": 2.6063284275963296e-06, "loss": 0.7551, "step": 15180 }, { "epoch": 2.692648361381754, "grad_norm": 3.9692599529255337, "learning_rate": 2.6001499535083067e-06, "loss": 0.7885, "step": 15200 }, { "epoch": 2.6961913197519927, "grad_norm": 3.3386892906925203, "learning_rate": 2.593970866682806e-06, "loss": 0.7603, "step": 15220 }, { "epoch": 2.699734278122232, "grad_norm": 3.5522070573455298, "learning_rate": 2.5877912049247206e-06, "loss": 0.7833, "step": 15240 }, { "epoch": 2.7032772364924713, "grad_norm": 3.6763130005746327, "learning_rate": 2.5816110060424566e-06, "loss": 0.7451, "step": 15260 }, { "epoch": 2.7068201948627104, "grad_norm": 4.065296942712161, "learning_rate": 2.57543030784771e-06, "loss": 0.7856, "step": 15280 }, { "epoch": 2.7103631532329495, "grad_norm": 3.925534297841757, "learning_rate": 2.5692491481552314e-06, "loss": 0.7869, "step": 15300 }, { "epoch": 2.7139061116031886, "grad_norm": 5.0603926851389245, "learning_rate": 2.5630675647825913e-06, "loss": 0.7616, "step": 15320 }, { "epoch": 2.7174490699734277, "grad_norm": 4.055554853144999, "learning_rate": 2.5568855955499573e-06, "loss": 0.7882, "step": 15340 }, { "epoch": 2.720992028343667, "grad_norm": 3.478661550213012, "learning_rate": 2.5507032782798553e-06, "loss": 0.7852, "step": 15360 }, { "epoch": 2.7245349867139064, "grad_norm": 2.171548931614822, "learning_rate": 2.5445206507969395e-06, "loss": 0.77, "step": 15380 }, { "epoch": 2.728077945084145, "grad_norm": 3.0583258036393604, "learning_rate": 2.5383377509277648e-06, "loss": 0.7404, "step": 15400 }, { "epoch": 2.7316209034543846, "grad_norm": 3.1484952352447273, "learning_rate": 2.5321546165005497e-06, "loss": 0.7266, "step": 15420 }, { "epoch": 2.7351638618246237, "grad_norm": 2.942970058363834, "learning_rate": 2.5259712853449503e-06, "loss": 0.7527, "step": 15440 }, { "epoch": 2.738706820194863, "grad_norm": 2.645281388123703, "learning_rate": 2.5197877952918243e-06, "loss": 0.7662, "step": 15460 }, { "epoch": 2.742249778565102, "grad_norm": 3.4746684651389548, "learning_rate": 2.5136041841730026e-06, "loss": 0.7628, "step": 15480 }, { "epoch": 2.745792736935341, "grad_norm": 3.7116192950002795, "learning_rate": 2.5074204898210587e-06, "loss": 0.7428, "step": 15500 }, { "epoch": 2.74933569530558, "grad_norm": 5.909705442043078, "learning_rate": 2.50123675006907e-06, "loss": 0.7538, "step": 15520 }, { "epoch": 2.752878653675819, "grad_norm": 3.8189355746524045, "learning_rate": 2.4950530027503963e-06, "loss": 0.7647, "step": 15540 }, { "epoch": 2.7564216120460587, "grad_norm": 3.5256750418834515, "learning_rate": 2.4888692856984446e-06, "loss": 0.7332, "step": 15560 }, { "epoch": 2.7599645704162974, "grad_norm": 5.2988278310924715, "learning_rate": 2.482685636746432e-06, "loss": 0.7446, "step": 15580 }, { "epoch": 2.763507528786537, "grad_norm": 2.9280069670539146, "learning_rate": 2.4765020937271615e-06, "loss": 0.7999, "step": 15600 }, { "epoch": 2.767050487156776, "grad_norm": 3.3339855601700243, "learning_rate": 2.4703186944727885e-06, "loss": 0.7421, "step": 15620 }, { "epoch": 2.770593445527015, "grad_norm": 3.1483076854706993, "learning_rate": 2.464135476814589e-06, "loss": 0.7523, "step": 15640 }, { "epoch": 2.7741364038972542, "grad_norm": 3.1964072857533687, "learning_rate": 2.4579524785827254e-06, "loss": 0.7793, "step": 15660 }, { "epoch": 2.7776793622674933, "grad_norm": 5.327494932320985, "learning_rate": 2.451769737606021e-06, "loss": 0.7604, "step": 15680 }, { "epoch": 2.7812223206377324, "grad_norm": 2.0488240092746413, "learning_rate": 2.4455872917117233e-06, "loss": 0.7198, "step": 15700 }, { "epoch": 2.7847652790079716, "grad_norm": 3.6254982549079506, "learning_rate": 2.439405178725274e-06, "loss": 0.7811, "step": 15720 }, { "epoch": 2.7883082373782107, "grad_norm": 3.813833819224438, "learning_rate": 2.4332234364700793e-06, "loss": 0.7857, "step": 15740 }, { "epoch": 2.7918511957484498, "grad_norm": 2.332438247153177, "learning_rate": 2.427042102767278e-06, "loss": 0.7741, "step": 15760 }, { "epoch": 2.7953941541186893, "grad_norm": 5.637052434142593, "learning_rate": 2.4208612154355054e-06, "loss": 0.7873, "step": 15780 }, { "epoch": 2.7989371124889284, "grad_norm": 3.61371927386971, "learning_rate": 2.4146808122906685e-06, "loss": 0.7667, "step": 15800 }, { "epoch": 2.8024800708591675, "grad_norm": 3.081712372431115, "learning_rate": 2.408500931145713e-06, "loss": 0.7637, "step": 15820 }, { "epoch": 2.8060230292294066, "grad_norm": 2.8637593806841064, "learning_rate": 2.4023216098103892e-06, "loss": 0.7406, "step": 15840 }, { "epoch": 2.8095659875996457, "grad_norm": 4.855719945200848, "learning_rate": 2.396142886091023e-06, "loss": 0.762, "step": 15860 }, { "epoch": 2.813108945969885, "grad_norm": 2.3565232263934592, "learning_rate": 2.389964797790283e-06, "loss": 0.7539, "step": 15880 }, { "epoch": 2.816651904340124, "grad_norm": 3.4605355857001325, "learning_rate": 2.383787382706953e-06, "loss": 0.7435, "step": 15900 }, { "epoch": 2.820194862710363, "grad_norm": 2.2057409877337992, "learning_rate": 2.377610678635693e-06, "loss": 0.7737, "step": 15920 }, { "epoch": 2.823737821080602, "grad_norm": 2.136429659045698, "learning_rate": 2.371434723366818e-06, "loss": 0.7759, "step": 15940 }, { "epoch": 2.8272807794508417, "grad_norm": 2.9028483092829815, "learning_rate": 2.3652595546860595e-06, "loss": 0.7826, "step": 15960 }, { "epoch": 2.8308237378210808, "grad_norm": 3.7595789211806383, "learning_rate": 2.359085210374335e-06, "loss": 0.7565, "step": 15980 }, { "epoch": 2.83436669619132, "grad_norm": 3.402275688537832, "learning_rate": 2.3529117282075207e-06, "loss": 0.7222, "step": 16000 }, { "epoch": 2.83436669619132, "eval_loss": 0.7888814210891724, "eval_runtime": 368.71, "eval_samples_per_second": 25.784, "eval_steps_per_second": 3.225, "step": 16000 }, { "epoch": 2.837909654561559, "grad_norm": 4.2698147796391455, "learning_rate": 2.3467391459562163e-06, "loss": 0.772, "step": 16020 }, { "epoch": 2.841452612931798, "grad_norm": 5.560388355211716, "learning_rate": 2.340567501385518e-06, "loss": 0.7719, "step": 16040 }, { "epoch": 2.844995571302037, "grad_norm": 2.294411499429463, "learning_rate": 2.3343968322547816e-06, "loss": 0.7737, "step": 16060 }, { "epoch": 2.8485385296722763, "grad_norm": 1.995090359134686, "learning_rate": 2.3282271763173984e-06, "loss": 0.7808, "step": 16080 }, { "epoch": 2.8520814880425154, "grad_norm": 2.8221763040945422, "learning_rate": 2.322058571320559e-06, "loss": 0.7943, "step": 16100 }, { "epoch": 2.8556244464127545, "grad_norm": 2.8411701267751264, "learning_rate": 2.315891055005024e-06, "loss": 0.7458, "step": 16120 }, { "epoch": 2.859167404782994, "grad_norm": 3.841302334883737, "learning_rate": 2.3097246651048937e-06, "loss": 0.77, "step": 16140 }, { "epoch": 2.8627103631532327, "grad_norm": 3.890480722348098, "learning_rate": 2.3035594393473777e-06, "loss": 0.7384, "step": 16160 }, { "epoch": 2.8662533215234722, "grad_norm": 5.679565106603388, "learning_rate": 2.297395415452562e-06, "loss": 0.803, "step": 16180 }, { "epoch": 2.8697962798937113, "grad_norm": 3.337771681609959, "learning_rate": 2.2912326311331774e-06, "loss": 0.7028, "step": 16200 }, { "epoch": 2.8733392382639504, "grad_norm": 3.9468562372490443, "learning_rate": 2.285071124094375e-06, "loss": 0.778, "step": 16220 }, { "epoch": 2.8768821966341895, "grad_norm": 3.7924647740545203, "learning_rate": 2.2789109320334885e-06, "loss": 0.7559, "step": 16240 }, { "epoch": 2.8804251550044286, "grad_norm": 3.3047453094089505, "learning_rate": 2.2727520926398067e-06, "loss": 0.7563, "step": 16260 }, { "epoch": 2.8839681133746677, "grad_norm": 4.691213784478345, "learning_rate": 2.2665946435943425e-06, "loss": 0.7708, "step": 16280 }, { "epoch": 2.887511071744907, "grad_norm": 5.050030297267503, "learning_rate": 2.2604386225696035e-06, "loss": 0.7855, "step": 16300 }, { "epoch": 2.8910540301151464, "grad_norm": 3.297233505976855, "learning_rate": 2.254284067229359e-06, "loss": 0.7273, "step": 16320 }, { "epoch": 2.894596988485385, "grad_norm": 3.0043017323996106, "learning_rate": 2.24813101522841e-06, "loss": 0.7538, "step": 16340 }, { "epoch": 2.8981399468556246, "grad_norm": 3.7341413882543857, "learning_rate": 2.2419795042123644e-06, "loss": 0.7414, "step": 16360 }, { "epoch": 2.9016829052258637, "grad_norm": 4.878090641375587, "learning_rate": 2.2358295718173966e-06, "loss": 0.7679, "step": 16380 }, { "epoch": 2.905225863596103, "grad_norm": 4.429314718401584, "learning_rate": 2.2296812556700245e-06, "loss": 0.7517, "step": 16400 }, { "epoch": 2.908768821966342, "grad_norm": 2.9862439233870943, "learning_rate": 2.2235345933868785e-06, "loss": 0.7818, "step": 16420 }, { "epoch": 2.912311780336581, "grad_norm": 6.698492865099133, "learning_rate": 2.2173896225744704e-06, "loss": 0.7695, "step": 16440 }, { "epoch": 2.91585473870682, "grad_norm": 4.153101032253308, "learning_rate": 2.2112463808289613e-06, "loss": 0.7296, "step": 16460 }, { "epoch": 2.919397697077059, "grad_norm": 2.779471876140217, "learning_rate": 2.2051049057359354e-06, "loss": 0.7283, "step": 16480 }, { "epoch": 2.9229406554472988, "grad_norm": 3.2836392616484984, "learning_rate": 2.1989652348701683e-06, "loss": 0.7383, "step": 16500 }, { "epoch": 2.9264836138175374, "grad_norm": 1.895818806571468, "learning_rate": 2.192827405795395e-06, "loss": 0.7345, "step": 16520 }, { "epoch": 2.930026572187777, "grad_norm": 5.6943248214395545, "learning_rate": 2.1866914560640832e-06, "loss": 0.7717, "step": 16540 }, { "epoch": 2.933569530558016, "grad_norm": 4.513933666928205, "learning_rate": 2.1805574232172044e-06, "loss": 0.7773, "step": 16560 }, { "epoch": 2.937112488928255, "grad_norm": 4.620092058556062, "learning_rate": 2.1744253447839988e-06, "loss": 0.7592, "step": 16580 }, { "epoch": 2.9406554472984943, "grad_norm": 3.858292964944665, "learning_rate": 2.16829525828175e-06, "loss": 0.7854, "step": 16600 }, { "epoch": 2.9441984056687334, "grad_norm": 3.8882587756941174, "learning_rate": 2.1621672012155552e-06, "loss": 0.7434, "step": 16620 }, { "epoch": 2.9477413640389725, "grad_norm": 4.499656195479335, "learning_rate": 2.1560412110780967e-06, "loss": 0.7695, "step": 16640 }, { "epoch": 2.9512843224092116, "grad_norm": 3.7965733733040494, "learning_rate": 2.149917325349408e-06, "loss": 0.7197, "step": 16660 }, { "epoch": 2.954827280779451, "grad_norm": 3.860376338376774, "learning_rate": 2.143795581496648e-06, "loss": 0.7403, "step": 16680 }, { "epoch": 2.9583702391496898, "grad_norm": 3.323974522469437, "learning_rate": 2.1376760169738746e-06, "loss": 0.7497, "step": 16700 }, { "epoch": 2.9619131975199293, "grad_norm": 2.3281130863968382, "learning_rate": 2.131558669221806e-06, "loss": 0.7319, "step": 16720 }, { "epoch": 2.9654561558901684, "grad_norm": 3.4381149445643517, "learning_rate": 2.125443575667603e-06, "loss": 0.7817, "step": 16740 }, { "epoch": 2.9689991142604075, "grad_norm": 3.9685288684047815, "learning_rate": 2.1193307737246336e-06, "loss": 0.7764, "step": 16760 }, { "epoch": 2.9725420726306466, "grad_norm": 3.396199970398508, "learning_rate": 2.113220300792243e-06, "loss": 0.7661, "step": 16780 }, { "epoch": 2.9760850310008857, "grad_norm": 3.6420647109134943, "learning_rate": 2.10711219425553e-06, "loss": 0.7035, "step": 16800 }, { "epoch": 2.979627989371125, "grad_norm": 4.626273852138959, "learning_rate": 2.101006491485112e-06, "loss": 0.753, "step": 16820 }, { "epoch": 2.983170947741364, "grad_norm": 2.4460574774339654, "learning_rate": 2.0949032298369035e-06, "loss": 0.7692, "step": 16840 }, { "epoch": 2.986713906111603, "grad_norm": 3.41069245958657, "learning_rate": 2.0888024466518804e-06, "loss": 0.6976, "step": 16860 }, { "epoch": 2.990256864481842, "grad_norm": 3.403936749564734, "learning_rate": 2.082704179255857e-06, "loss": 0.7946, "step": 16880 }, { "epoch": 2.9937998228520817, "grad_norm": 4.194199735821774, "learning_rate": 2.076608464959255e-06, "loss": 0.7235, "step": 16900 }, { "epoch": 2.997342781222321, "grad_norm": 4.995370798020937, "learning_rate": 2.0705153410568753e-06, "loss": 0.7518, "step": 16920 }, { "epoch": 3.00088573959256, "grad_norm": 4.498354371200463, "learning_rate": 2.0644248448276698e-06, "loss": 0.6865, "step": 16940 }, { "epoch": 3.004428697962799, "grad_norm": 5.9848761265551484, "learning_rate": 2.0583370135345157e-06, "loss": 0.6598, "step": 16960 }, { "epoch": 3.007971656333038, "grad_norm": 3.1703728963133844, "learning_rate": 2.0522518844239834e-06, "loss": 0.634, "step": 16980 }, { "epoch": 3.011514614703277, "grad_norm": 4.221579231662142, "learning_rate": 2.0461694947261127e-06, "loss": 0.6631, "step": 17000 }, { "epoch": 3.0150575730735163, "grad_norm": 4.154018707851057, "learning_rate": 2.0400898816541807e-06, "loss": 0.6633, "step": 17020 }, { "epoch": 3.0186005314437554, "grad_norm": 4.631953700832906, "learning_rate": 2.034013082404479e-06, "loss": 0.6674, "step": 17040 }, { "epoch": 3.0221434898139945, "grad_norm": 3.085154700215037, "learning_rate": 2.0279391341560823e-06, "loss": 0.6241, "step": 17060 }, { "epoch": 3.025686448184234, "grad_norm": 4.415029344873564, "learning_rate": 2.0218680740706227e-06, "loss": 0.6436, "step": 17080 }, { "epoch": 3.029229406554473, "grad_norm": 4.050012850016261, "learning_rate": 2.0157999392920626e-06, "loss": 0.6809, "step": 17100 }, { "epoch": 3.0327723649247122, "grad_norm": 3.474053983443441, "learning_rate": 2.009734766946465e-06, "loss": 0.6748, "step": 17120 }, { "epoch": 3.0363153232949514, "grad_norm": 5.502084090817595, "learning_rate": 2.0036725941417695e-06, "loss": 0.7077, "step": 17140 }, { "epoch": 3.0398582816651905, "grad_norm": 4.7328265250493375, "learning_rate": 1.997613457967565e-06, "loss": 0.6685, "step": 17160 }, { "epoch": 3.0434012400354296, "grad_norm": 4.990841825704372, "learning_rate": 1.991557395494858e-06, "loss": 0.6576, "step": 17180 }, { "epoch": 3.0469441984056687, "grad_norm": 4.446445857081803, "learning_rate": 1.9855044437758542e-06, "loss": 0.6291, "step": 17200 }, { "epoch": 3.0504871567759078, "grad_norm": 4.010559033356023, "learning_rate": 1.9794546398437233e-06, "loss": 0.6821, "step": 17220 }, { "epoch": 3.054030115146147, "grad_norm": 5.668917044427614, "learning_rate": 1.973408020712378e-06, "loss": 0.6501, "step": 17240 }, { "epoch": 3.057573073516386, "grad_norm": 4.507165538436801, "learning_rate": 1.967364623376245e-06, "loss": 0.6634, "step": 17260 }, { "epoch": 3.0611160318866255, "grad_norm": 6.594268797496839, "learning_rate": 1.9613244848100393e-06, "loss": 0.6777, "step": 17280 }, { "epoch": 3.0646589902568646, "grad_norm": 5.86179333081565, "learning_rate": 1.9552876419685404e-06, "loss": 0.6966, "step": 17300 }, { "epoch": 3.0682019486271037, "grad_norm": 6.897830395606888, "learning_rate": 1.94925413178636e-06, "loss": 0.6358, "step": 17320 }, { "epoch": 3.071744906997343, "grad_norm": 3.105114998536212, "learning_rate": 1.9432239911777234e-06, "loss": 0.6144, "step": 17340 }, { "epoch": 3.075287865367582, "grad_norm": 4.226605746771805, "learning_rate": 1.9371972570362386e-06, "loss": 0.6445, "step": 17360 }, { "epoch": 3.078830823737821, "grad_norm": 3.2064699379152946, "learning_rate": 1.9311739662346714e-06, "loss": 0.6295, "step": 17380 }, { "epoch": 3.08237378210806, "grad_norm": 4.509110054894344, "learning_rate": 1.925154155624723e-06, "loss": 0.6584, "step": 17400 }, { "epoch": 3.0859167404782992, "grad_norm": 2.7613419445656877, "learning_rate": 1.9191378620367992e-06, "loss": 0.6872, "step": 17420 }, { "epoch": 3.0894596988485383, "grad_norm": 5.249702772830893, "learning_rate": 1.91312512227979e-06, "loss": 0.659, "step": 17440 }, { "epoch": 3.093002657218778, "grad_norm": 4.594127042773178, "learning_rate": 1.907115973140841e-06, "loss": 0.6445, "step": 17460 }, { "epoch": 3.096545615589017, "grad_norm": 3.5475423306782, "learning_rate": 1.9011104513851306e-06, "loss": 0.6446, "step": 17480 }, { "epoch": 3.100088573959256, "grad_norm": 6.443218004353607, "learning_rate": 1.8951085937556447e-06, "loss": 0.6642, "step": 17500 }, { "epoch": 3.103631532329495, "grad_norm": 4.817902509140939, "learning_rate": 1.889110436972949e-06, "loss": 0.6675, "step": 17520 }, { "epoch": 3.1071744906997343, "grad_norm": 3.6588947567775576, "learning_rate": 1.8831160177349694e-06, "loss": 0.6011, "step": 17540 }, { "epoch": 3.1107174490699734, "grad_norm": 3.28321156533759, "learning_rate": 1.8771253727167639e-06, "loss": 0.6553, "step": 17560 }, { "epoch": 3.1142604074402125, "grad_norm": 5.587131615275665, "learning_rate": 1.8711385385702973e-06, "loss": 0.6896, "step": 17580 }, { "epoch": 3.1178033658104516, "grad_norm": 4.969644996204083, "learning_rate": 1.8651555519242215e-06, "loss": 0.648, "step": 17600 }, { "epoch": 3.1213463241806907, "grad_norm": 4.564430542899775, "learning_rate": 1.8591764493836468e-06, "loss": 0.6673, "step": 17620 }, { "epoch": 3.1248892825509302, "grad_norm": 4.821775727343219, "learning_rate": 1.8532012675299198e-06, "loss": 0.6368, "step": 17640 }, { "epoch": 3.1284322409211693, "grad_norm": 3.550472754727026, "learning_rate": 1.8472300429203998e-06, "loss": 0.6763, "step": 17660 }, { "epoch": 3.1319751992914084, "grad_norm": 5.387384363073119, "learning_rate": 1.8412628120882359e-06, "loss": 0.6228, "step": 17680 }, { "epoch": 3.1355181576616475, "grad_norm": 2.0363557538364527, "learning_rate": 1.8352996115421417e-06, "loss": 0.6165, "step": 17700 }, { "epoch": 3.1390611160318866, "grad_norm": 3.302139015386827, "learning_rate": 1.829340477766172e-06, "loss": 0.6668, "step": 17720 }, { "epoch": 3.1426040744021257, "grad_norm": 4.799357203523113, "learning_rate": 1.8233854472195014e-06, "loss": 0.6657, "step": 17740 }, { "epoch": 3.146147032772365, "grad_norm": 4.447535055541927, "learning_rate": 1.8174345563361992e-06, "loss": 0.6814, "step": 17760 }, { "epoch": 3.149689991142604, "grad_norm": 2.19457736885214, "learning_rate": 1.8114878415250082e-06, "loss": 0.6682, "step": 17780 }, { "epoch": 3.153232949512843, "grad_norm": 3.0879621968638755, "learning_rate": 1.8055453391691209e-06, "loss": 0.6, "step": 17800 }, { "epoch": 3.156775907883082, "grad_norm": 3.9283834956111705, "learning_rate": 1.7996070856259568e-06, "loss": 0.6664, "step": 17820 }, { "epoch": 3.1603188662533217, "grad_norm": 5.062334002651309, "learning_rate": 1.7936731172269414e-06, "loss": 0.6691, "step": 17840 }, { "epoch": 3.163861824623561, "grad_norm": 3.592859470647672, "learning_rate": 1.7877434702772807e-06, "loss": 0.6632, "step": 17860 }, { "epoch": 3.1674047829938, "grad_norm": 3.5217893259822306, "learning_rate": 1.7818181810557428e-06, "loss": 0.6588, "step": 17880 }, { "epoch": 3.170947741364039, "grad_norm": 5.792763255978902, "learning_rate": 1.7758972858144351e-06, "loss": 0.6843, "step": 17900 }, { "epoch": 3.174490699734278, "grad_norm": 5.619010064865972, "learning_rate": 1.7699808207785796e-06, "loss": 0.6304, "step": 17920 }, { "epoch": 3.178033658104517, "grad_norm": 3.233388216079928, "learning_rate": 1.7640688221462955e-06, "loss": 0.6481, "step": 17940 }, { "epoch": 3.1815766164747563, "grad_norm": 7.189211258809608, "learning_rate": 1.7581613260883733e-06, "loss": 0.6516, "step": 17960 }, { "epoch": 3.1851195748449954, "grad_norm": 4.794500314138224, "learning_rate": 1.7522583687480587e-06, "loss": 0.6276, "step": 17980 }, { "epoch": 3.1886625332152345, "grad_norm": 5.414264886735773, "learning_rate": 1.7463599862408265e-06, "loss": 0.6461, "step": 18000 }, { "epoch": 3.1886625332152345, "eval_loss": 0.814194917678833, "eval_runtime": 367.8278, "eval_samples_per_second": 25.846, "eval_steps_per_second": 3.232, "step": 18000 }, { "epoch": 3.192205491585474, "grad_norm": 2.678076603574593, "learning_rate": 1.7404662146541622e-06, "loss": 0.6586, "step": 18020 }, { "epoch": 3.195748449955713, "grad_norm": 5.435434057895623, "learning_rate": 1.7345770900473424e-06, "loss": 0.6378, "step": 18040 }, { "epoch": 3.1992914083259523, "grad_norm": 4.182207392231193, "learning_rate": 1.7286926484512088e-06, "loss": 0.6429, "step": 18060 }, { "epoch": 3.2028343666961914, "grad_norm": 6.163291864345331, "learning_rate": 1.722812925867955e-06, "loss": 0.6215, "step": 18080 }, { "epoch": 3.2063773250664305, "grad_norm": 4.221207066086723, "learning_rate": 1.7169379582709018e-06, "loss": 0.6734, "step": 18100 }, { "epoch": 3.2099202834366696, "grad_norm": 3.6319759767031345, "learning_rate": 1.711067781604277e-06, "loss": 0.6688, "step": 18120 }, { "epoch": 3.2134632418069087, "grad_norm": 2.4361609804138986, "learning_rate": 1.7052024317829986e-06, "loss": 0.6779, "step": 18140 }, { "epoch": 3.217006200177148, "grad_norm": 3.2229485582444646, "learning_rate": 1.69934194469245e-06, "loss": 0.5963, "step": 18160 }, { "epoch": 3.220549158547387, "grad_norm": 4.04542335006295, "learning_rate": 1.6934863561882664e-06, "loss": 0.6149, "step": 18180 }, { "epoch": 3.2240921169176264, "grad_norm": 4.224812708749921, "learning_rate": 1.687635702096111e-06, "loss": 0.6544, "step": 18200 }, { "epoch": 3.2276350752878655, "grad_norm": 2.5768170370991133, "learning_rate": 1.681790018211457e-06, "loss": 0.6455, "step": 18220 }, { "epoch": 3.2311780336581046, "grad_norm": 8.446037248628688, "learning_rate": 1.6759493402993713e-06, "loss": 0.6399, "step": 18240 }, { "epoch": 3.2347209920283437, "grad_norm": 4.421664554190382, "learning_rate": 1.6701137040942884e-06, "loss": 0.6605, "step": 18260 }, { "epoch": 3.238263950398583, "grad_norm": 3.867362303030101, "learning_rate": 1.664283145299801e-06, "loss": 0.6197, "step": 18280 }, { "epoch": 3.241806908768822, "grad_norm": 2.9062273272206975, "learning_rate": 1.658457699588436e-06, "loss": 0.6415, "step": 18300 }, { "epoch": 3.245349867139061, "grad_norm": 5.355980117276988, "learning_rate": 1.6526374026014366e-06, "loss": 0.6154, "step": 18320 }, { "epoch": 3.2488928255093, "grad_norm": 3.447853280839097, "learning_rate": 1.6468222899485464e-06, "loss": 0.6004, "step": 18340 }, { "epoch": 3.2524357838795392, "grad_norm": 3.7093160550377813, "learning_rate": 1.6410123972077884e-06, "loss": 0.6604, "step": 18360 }, { "epoch": 3.255978742249779, "grad_norm": 5.03373607670554, "learning_rate": 1.6352077599252508e-06, "loss": 0.6942, "step": 18380 }, { "epoch": 3.259521700620018, "grad_norm": 2.7267954903558462, "learning_rate": 1.6294084136148677e-06, "loss": 0.6245, "step": 18400 }, { "epoch": 3.263064658990257, "grad_norm": 3.422200189756036, "learning_rate": 1.6236143937582006e-06, "loss": 0.6454, "step": 18420 }, { "epoch": 3.266607617360496, "grad_norm": 3.004333600805996, "learning_rate": 1.6178257358042238e-06, "loss": 0.6308, "step": 18440 }, { "epoch": 3.270150575730735, "grad_norm": 3.205706761360872, "learning_rate": 1.6120424751691078e-06, "loss": 0.7113, "step": 18460 }, { "epoch": 3.2736935341009743, "grad_norm": 4.169462011619208, "learning_rate": 1.6062646472359967e-06, "loss": 0.6739, "step": 18480 }, { "epoch": 3.2772364924712134, "grad_norm": 5.474090442872105, "learning_rate": 1.6004922873548014e-06, "loss": 0.6459, "step": 18500 }, { "epoch": 3.2807794508414525, "grad_norm": 4.123370814980775, "learning_rate": 1.594725430841975e-06, "loss": 0.6329, "step": 18520 }, { "epoch": 3.2843224092116916, "grad_norm": 3.386294588511836, "learning_rate": 1.5889641129803013e-06, "loss": 0.6978, "step": 18540 }, { "epoch": 3.287865367581931, "grad_norm": 4.479997130104378, "learning_rate": 1.5832083690186763e-06, "loss": 0.6942, "step": 18560 }, { "epoch": 3.2914083259521703, "grad_norm": 3.403229229215997, "learning_rate": 1.5774582341718952e-06, "loss": 0.6561, "step": 18580 }, { "epoch": 3.2949512843224094, "grad_norm": 3.3840120900240045, "learning_rate": 1.571713743620435e-06, "loss": 0.6464, "step": 18600 }, { "epoch": 3.2984942426926485, "grad_norm": 2.166301971846268, "learning_rate": 1.5659749325102391e-06, "loss": 0.6633, "step": 18620 }, { "epoch": 3.3020372010628876, "grad_norm": 4.571502120958036, "learning_rate": 1.5602418359525029e-06, "loss": 0.6449, "step": 18640 }, { "epoch": 3.3055801594331267, "grad_norm": 6.850347513041369, "learning_rate": 1.5545144890234618e-06, "loss": 0.6375, "step": 18660 }, { "epoch": 3.3091231178033658, "grad_norm": 5.268732271957646, "learning_rate": 1.5487929267641688e-06, "loss": 0.6387, "step": 18680 }, { "epoch": 3.312666076173605, "grad_norm": 6.854691620973651, "learning_rate": 1.5430771841802894e-06, "loss": 0.6792, "step": 18700 }, { "epoch": 3.316209034543844, "grad_norm": 5.772199960663563, "learning_rate": 1.537367296241881e-06, "loss": 0.5957, "step": 18720 }, { "epoch": 3.319751992914083, "grad_norm": 4.543759004099246, "learning_rate": 1.531663297883183e-06, "loss": 0.6704, "step": 18740 }, { "epoch": 3.323294951284322, "grad_norm": 4.253141544006728, "learning_rate": 1.525965224002398e-06, "loss": 0.6591, "step": 18760 }, { "epoch": 3.3268379096545617, "grad_norm": 4.030607569941474, "learning_rate": 1.5202731094614848e-06, "loss": 0.6153, "step": 18780 }, { "epoch": 3.330380868024801, "grad_norm": 4.541306419220621, "learning_rate": 1.5145869890859404e-06, "loss": 0.6801, "step": 18800 }, { "epoch": 3.33392382639504, "grad_norm": 4.860276758268095, "learning_rate": 1.5089068976645876e-06, "loss": 0.6129, "step": 18820 }, { "epoch": 3.337466784765279, "grad_norm": 3.7898282834137875, "learning_rate": 1.503232869949364e-06, "loss": 0.647, "step": 18840 }, { "epoch": 3.341009743135518, "grad_norm": 4.475193399989839, "learning_rate": 1.4975649406551081e-06, "loss": 0.6015, "step": 18860 }, { "epoch": 3.3445527015057572, "grad_norm": 3.824693434860729, "learning_rate": 1.4919031444593458e-06, "loss": 0.6672, "step": 18880 }, { "epoch": 3.3480956598759963, "grad_norm": 3.5036219590743447, "learning_rate": 1.4862475160020806e-06, "loss": 0.6771, "step": 18900 }, { "epoch": 3.3516386182462354, "grad_norm": 4.1272200518774325, "learning_rate": 1.48059808988558e-06, "loss": 0.6757, "step": 18920 }, { "epoch": 3.3551815766164745, "grad_norm": 3.5389364787677957, "learning_rate": 1.4749549006741655e-06, "loss": 0.7042, "step": 18940 }, { "epoch": 3.358724534986714, "grad_norm": 2.5975704276819, "learning_rate": 1.4693179828939985e-06, "loss": 0.6987, "step": 18960 }, { "epoch": 3.362267493356953, "grad_norm": 3.8727633599446794, "learning_rate": 1.463687371032871e-06, "loss": 0.6685, "step": 18980 }, { "epoch": 3.3658104517271923, "grad_norm": 3.803175388434909, "learning_rate": 1.4580630995399949e-06, "loss": 0.6214, "step": 19000 }, { "epoch": 3.3693534100974314, "grad_norm": 3.1226214687691445, "learning_rate": 1.4524452028257884e-06, "loss": 0.6516, "step": 19020 }, { "epoch": 3.3728963684676705, "grad_norm": 3.9984001662113986, "learning_rate": 1.4468337152616712e-06, "loss": 0.6686, "step": 19040 }, { "epoch": 3.3764393268379096, "grad_norm": 5.2905284840587985, "learning_rate": 1.4412286711798473e-06, "loss": 0.643, "step": 19060 }, { "epoch": 3.3799822852081487, "grad_norm": 3.996754961897811, "learning_rate": 1.4356301048730987e-06, "loss": 0.6707, "step": 19080 }, { "epoch": 3.383525243578388, "grad_norm": 2.7287259969894757, "learning_rate": 1.4300380505945754e-06, "loss": 0.647, "step": 19100 }, { "epoch": 3.387068201948627, "grad_norm": 4.627474263327022, "learning_rate": 1.4244525425575862e-06, "loss": 0.6579, "step": 19120 }, { "epoch": 3.3906111603188664, "grad_norm": 6.3578848860264054, "learning_rate": 1.418873614935387e-06, "loss": 0.6214, "step": 19140 }, { "epoch": 3.3941541186891055, "grad_norm": 2.5409208272433292, "learning_rate": 1.4133013018609762e-06, "loss": 0.6916, "step": 19160 }, { "epoch": 3.3976970770593447, "grad_norm": 5.73185853916136, "learning_rate": 1.4077356374268808e-06, "loss": 0.639, "step": 19180 }, { "epoch": 3.4012400354295838, "grad_norm": 5.547561124073059, "learning_rate": 1.4021766556849492e-06, "loss": 0.6472, "step": 19200 }, { "epoch": 3.404782993799823, "grad_norm": 6.154343627624317, "learning_rate": 1.3966243906461477e-06, "loss": 0.632, "step": 19220 }, { "epoch": 3.408325952170062, "grad_norm": 3.352883710099438, "learning_rate": 1.3910788762803448e-06, "loss": 0.6399, "step": 19240 }, { "epoch": 3.411868910540301, "grad_norm": 5.151354471677131, "learning_rate": 1.3855401465161072e-06, "loss": 0.6439, "step": 19260 }, { "epoch": 3.41541186891054, "grad_norm": 3.533259346429167, "learning_rate": 1.3800082352404964e-06, "loss": 0.7011, "step": 19280 }, { "epoch": 3.4189548272807793, "grad_norm": 2.422586203772127, "learning_rate": 1.3744831762988492e-06, "loss": 0.6802, "step": 19300 }, { "epoch": 3.422497785651019, "grad_norm": 2.0987507209228773, "learning_rate": 1.368965003494586e-06, "loss": 0.653, "step": 19320 }, { "epoch": 3.426040744021258, "grad_norm": 5.860157027237845, "learning_rate": 1.3634537505889927e-06, "loss": 0.6517, "step": 19340 }, { "epoch": 3.429583702391497, "grad_norm": 4.150793117319128, "learning_rate": 1.3579494513010178e-06, "loss": 0.6702, "step": 19360 }, { "epoch": 3.433126660761736, "grad_norm": 2.376892166127197, "learning_rate": 1.352452139307068e-06, "loss": 0.6578, "step": 19380 }, { "epoch": 3.436669619131975, "grad_norm": 3.5677324284202974, "learning_rate": 1.3469618482407993e-06, "loss": 0.6466, "step": 19400 }, { "epoch": 3.4402125775022143, "grad_norm": 4.23134121034947, "learning_rate": 1.3414786116929102e-06, "loss": 0.6529, "step": 19420 }, { "epoch": 3.4437555358724534, "grad_norm": 5.745078327509347, "learning_rate": 1.3360024632109431e-06, "loss": 0.6484, "step": 19440 }, { "epoch": 3.4472984942426925, "grad_norm": 3.6442103656861864, "learning_rate": 1.3305334362990697e-06, "loss": 0.6669, "step": 19460 }, { "epoch": 3.4508414526129316, "grad_norm": 36.732566773014995, "learning_rate": 1.3250715644178926e-06, "loss": 0.6526, "step": 19480 }, { "epoch": 3.454384410983171, "grad_norm": 6.375922451901222, "learning_rate": 1.3196168809842384e-06, "loss": 0.6773, "step": 19500 }, { "epoch": 3.4579273693534103, "grad_norm": 4.596315770556884, "learning_rate": 1.314169419370952e-06, "loss": 0.6634, "step": 19520 }, { "epoch": 3.4614703277236494, "grad_norm": 5.605691895671033, "learning_rate": 1.3087292129066947e-06, "loss": 0.6925, "step": 19540 }, { "epoch": 3.4650132860938885, "grad_norm": 4.310812747641876, "learning_rate": 1.3032962948757406e-06, "loss": 0.6323, "step": 19560 }, { "epoch": 3.4685562444641276, "grad_norm": 2.539953889588533, "learning_rate": 1.2978706985177702e-06, "loss": 0.6603, "step": 19580 }, { "epoch": 3.4720992028343667, "grad_norm": 4.700922466636149, "learning_rate": 1.2924524570276676e-06, "loss": 0.6387, "step": 19600 }, { "epoch": 3.475642161204606, "grad_norm": 3.386892338712282, "learning_rate": 1.2870416035553213e-06, "loss": 0.665, "step": 19620 }, { "epoch": 3.479185119574845, "grad_norm": 4.047488784454614, "learning_rate": 1.2816381712054157e-06, "loss": 0.6442, "step": 19640 }, { "epoch": 3.482728077945084, "grad_norm": 4.456956526186442, "learning_rate": 1.2762421930372318e-06, "loss": 0.637, "step": 19660 }, { "epoch": 3.4862710363153235, "grad_norm": 4.141098829330102, "learning_rate": 1.2708537020644465e-06, "loss": 0.6384, "step": 19680 }, { "epoch": 3.4898139946855626, "grad_norm": 3.179545115166026, "learning_rate": 1.265472731254926e-06, "loss": 0.6259, "step": 19700 }, { "epoch": 3.4933569530558017, "grad_norm": 3.085506510833184, "learning_rate": 1.2600993135305278e-06, "loss": 0.6297, "step": 19720 }, { "epoch": 3.496899911426041, "grad_norm": 4.667399574209881, "learning_rate": 1.254733481766898e-06, "loss": 0.6576, "step": 19740 }, { "epoch": 3.50044286979628, "grad_norm": 4.264096999147888, "learning_rate": 1.2493752687932687e-06, "loss": 0.6778, "step": 19760 }, { "epoch": 3.503985828166519, "grad_norm": 2.0289989732438936, "learning_rate": 1.2440247073922627e-06, "loss": 0.6264, "step": 19780 }, { "epoch": 3.507528786536758, "grad_norm": 2.316066403465445, "learning_rate": 1.2386818302996847e-06, "loss": 0.6594, "step": 19800 }, { "epoch": 3.5110717449069972, "grad_norm": 4.626840288617283, "learning_rate": 1.233346670204327e-06, "loss": 0.691, "step": 19820 }, { "epoch": 3.5146147032772364, "grad_norm": 3.3340384188287193, "learning_rate": 1.228019259747769e-06, "loss": 0.6249, "step": 19840 }, { "epoch": 3.518157661647476, "grad_norm": 3.501104395294738, "learning_rate": 1.2226996315241743e-06, "loss": 0.6646, "step": 19860 }, { "epoch": 3.5217006200177146, "grad_norm": 1.7915799562011805, "learning_rate": 1.217387818080093e-06, "loss": 0.6616, "step": 19880 }, { "epoch": 3.525243578387954, "grad_norm": 4.085420259424753, "learning_rate": 1.2120838519142664e-06, "loss": 0.6475, "step": 19900 }, { "epoch": 3.528786536758193, "grad_norm": 2.889218336443782, "learning_rate": 1.2067877654774195e-06, "loss": 0.6577, "step": 19920 }, { "epoch": 3.5323294951284323, "grad_norm": 3.4338184164603773, "learning_rate": 1.20149959117207e-06, "loss": 0.6706, "step": 19940 }, { "epoch": 3.5358724534986714, "grad_norm": 5.515577025317211, "learning_rate": 1.196219361352329e-06, "loss": 0.646, "step": 19960 }, { "epoch": 3.5394154118689105, "grad_norm": 5.178906055641794, "learning_rate": 1.1909471083236999e-06, "loss": 0.6457, "step": 19980 }, { "epoch": 3.5429583702391496, "grad_norm": 2.503351819448841, "learning_rate": 1.1856828643428813e-06, "loss": 0.644, "step": 20000 }, { "epoch": 3.5429583702391496, "eval_loss": 0.8062734603881836, "eval_runtime": 374.2695, "eval_samples_per_second": 25.401, "eval_steps_per_second": 3.177, "step": 20000 }, { "epoch": 3.5465013286093887, "grad_norm": 4.448169389650755, "learning_rate": 1.1804266616175747e-06, "loss": 0.6384, "step": 20020 }, { "epoch": 3.5500442869796283, "grad_norm": 7.501920429960917, "learning_rate": 1.17517853230628e-06, "loss": 0.6745, "step": 20040 }, { "epoch": 3.553587245349867, "grad_norm": 4.264187361101836, "learning_rate": 1.169938508518103e-06, "loss": 0.6495, "step": 20060 }, { "epoch": 3.5571302037201065, "grad_norm": 4.121288134877933, "learning_rate": 1.1647066223125606e-06, "loss": 0.6297, "step": 20080 }, { "epoch": 3.5606731620903456, "grad_norm": 4.279937646373024, "learning_rate": 1.1594829056993794e-06, "loss": 0.6421, "step": 20100 }, { "epoch": 3.5642161204605847, "grad_norm": 2.387088275176141, "learning_rate": 1.1542673906383045e-06, "loss": 0.6768, "step": 20120 }, { "epoch": 3.5677590788308238, "grad_norm": 4.372760494789765, "learning_rate": 1.1490601090389014e-06, "loss": 0.6512, "step": 20140 }, { "epoch": 3.571302037201063, "grad_norm": 3.3326241965778425, "learning_rate": 1.1438610927603614e-06, "loss": 0.6615, "step": 20160 }, { "epoch": 3.574844995571302, "grad_norm": 3.04200662462561, "learning_rate": 1.1386703736113092e-06, "loss": 0.6343, "step": 20180 }, { "epoch": 3.578387953941541, "grad_norm": 3.4142428638907254, "learning_rate": 1.1334879833496033e-06, "loss": 0.6929, "step": 20200 }, { "epoch": 3.58193091231178, "grad_norm": 2.3882204523624484, "learning_rate": 1.1283139536821446e-06, "loss": 0.6017, "step": 20220 }, { "epoch": 3.5854738706820193, "grad_norm": 2.733962119410601, "learning_rate": 1.1231483162646851e-06, "loss": 0.6503, "step": 20240 }, { "epoch": 3.589016829052259, "grad_norm": 3.7061102390832383, "learning_rate": 1.1179911027016277e-06, "loss": 0.6049, "step": 20260 }, { "epoch": 3.592559787422498, "grad_norm": 4.725193440295439, "learning_rate": 1.1128423445458378e-06, "loss": 0.6488, "step": 20280 }, { "epoch": 3.596102745792737, "grad_norm": 6.502036568180023, "learning_rate": 1.1077020732984508e-06, "loss": 0.635, "step": 20300 }, { "epoch": 3.599645704162976, "grad_norm": 3.9545813460025676, "learning_rate": 1.1025703204086758e-06, "loss": 0.679, "step": 20320 }, { "epoch": 3.6031886625332152, "grad_norm": 3.6704467908903013, "learning_rate": 1.097447117273602e-06, "loss": 0.6222, "step": 20340 }, { "epoch": 3.6067316209034543, "grad_norm": 3.6721020929579655, "learning_rate": 1.0923324952380158e-06, "loss": 0.6313, "step": 20360 }, { "epoch": 3.6102745792736934, "grad_norm": 2.6332813999535247, "learning_rate": 1.0872264855941974e-06, "loss": 0.6067, "step": 20380 }, { "epoch": 3.6138175376439325, "grad_norm": 6.199873637972493, "learning_rate": 1.0821291195817368e-06, "loss": 0.6525, "step": 20400 }, { "epoch": 3.6173604960141716, "grad_norm": 5.810381931293109, "learning_rate": 1.077040428387341e-06, "loss": 0.6836, "step": 20420 }, { "epoch": 3.620903454384411, "grad_norm": 3.782224795545236, "learning_rate": 1.0719604431446424e-06, "loss": 0.6494, "step": 20440 }, { "epoch": 3.6244464127546503, "grad_norm": 3.813897952858666, "learning_rate": 1.0668891949340066e-06, "loss": 0.6666, "step": 20460 }, { "epoch": 3.6279893711248894, "grad_norm": 3.689851683344868, "learning_rate": 1.061826714782348e-06, "loss": 0.665, "step": 20480 }, { "epoch": 3.6315323294951285, "grad_norm": 2.9395205953490975, "learning_rate": 1.0567730336629332e-06, "loss": 0.6364, "step": 20500 }, { "epoch": 3.6350752878653676, "grad_norm": 3.6265579695009444, "learning_rate": 1.0517281824951958e-06, "loss": 0.6308, "step": 20520 }, { "epoch": 3.6386182462356067, "grad_norm": 3.827147896558186, "learning_rate": 1.0466921921445455e-06, "loss": 0.6372, "step": 20540 }, { "epoch": 3.642161204605846, "grad_norm": 4.1574203155116445, "learning_rate": 1.0416650934221797e-06, "loss": 0.6439, "step": 20560 }, { "epoch": 3.645704162976085, "grad_norm": 4.4390839250431515, "learning_rate": 1.0366469170848966e-06, "loss": 0.6009, "step": 20580 }, { "epoch": 3.649247121346324, "grad_norm": 5.0129876566121165, "learning_rate": 1.0316376938349037e-06, "loss": 0.692, "step": 20600 }, { "epoch": 3.6527900797165636, "grad_norm": 3.704661473447835, "learning_rate": 1.0266374543196312e-06, "loss": 0.6231, "step": 20620 }, { "epoch": 3.656333038086802, "grad_norm": 3.8011769756171954, "learning_rate": 1.021646229131548e-06, "loss": 0.6669, "step": 20640 }, { "epoch": 3.6598759964570418, "grad_norm": 4.1871605406787875, "learning_rate": 1.0166640488079682e-06, "loss": 0.6749, "step": 20660 }, { "epoch": 3.663418954827281, "grad_norm": 4.393069433206878, "learning_rate": 1.0116909438308689e-06, "loss": 0.6444, "step": 20680 }, { "epoch": 3.66696191319752, "grad_norm": 3.8353595768451725, "learning_rate": 1.006726944626704e-06, "loss": 0.6717, "step": 20700 }, { "epoch": 3.670504871567759, "grad_norm": 5.904031353317382, "learning_rate": 1.0017720815662137e-06, "loss": 0.634, "step": 20720 }, { "epoch": 3.674047829937998, "grad_norm": 4.827433370442803, "learning_rate": 9.968263849642434e-07, "loss": 0.6189, "step": 20740 }, { "epoch": 3.6775907883082373, "grad_norm": 5.58653647794514, "learning_rate": 9.91889885079555e-07, "loss": 0.6557, "step": 20760 }, { "epoch": 3.6811337466784764, "grad_norm": 6.026946025657904, "learning_rate": 9.869626121146442e-07, "loss": 0.6468, "step": 20780 }, { "epoch": 3.684676705048716, "grad_norm": 2.8168530647204384, "learning_rate": 9.820445962155526e-07, "loss": 0.6782, "step": 20800 }, { "epoch": 3.6882196634189546, "grad_norm": 5.029755822261837, "learning_rate": 9.771358674716886e-07, "loss": 0.6575, "step": 20820 }, { "epoch": 3.691762621789194, "grad_norm": 3.0952751296440826, "learning_rate": 9.722364559156373e-07, "loss": 0.6524, "step": 20840 }, { "epoch": 3.6953055801594332, "grad_norm": 3.5997851014244033, "learning_rate": 9.673463915229786e-07, "loss": 0.672, "step": 20860 }, { "epoch": 3.6988485385296723, "grad_norm": 2.1873288789656913, "learning_rate": 9.62465704212108e-07, "loss": 0.7021, "step": 20880 }, { "epoch": 3.7023914968999114, "grad_norm": 3.8549619707969636, "learning_rate": 9.575944238440473e-07, "loss": 0.6788, "step": 20900 }, { "epoch": 3.7059344552701505, "grad_norm": 2.517253170655585, "learning_rate": 9.527325802222651e-07, "loss": 0.6652, "step": 20920 }, { "epoch": 3.7094774136403896, "grad_norm": 4.344342864101945, "learning_rate": 9.478802030924964e-07, "loss": 0.66, "step": 20940 }, { "epoch": 3.7130203720106287, "grad_norm": 5.501462412298871, "learning_rate": 9.430373221425534e-07, "loss": 0.6083, "step": 20960 }, { "epoch": 3.7165633303808683, "grad_norm": 5.05343578594501, "learning_rate": 9.382039670021548e-07, "loss": 0.6454, "step": 20980 }, { "epoch": 3.720106288751107, "grad_norm": 3.501826208404435, "learning_rate": 9.333801672427339e-07, "loss": 0.6739, "step": 21000 }, { "epoch": 3.7236492471213465, "grad_norm": 2.7034068335281836, "learning_rate": 9.285659523772636e-07, "loss": 0.6527, "step": 21020 }, { "epoch": 3.7271922054915856, "grad_norm": 5.720702183074802, "learning_rate": 9.237613518600763e-07, "loss": 0.6369, "step": 21040 }, { "epoch": 3.7307351638618247, "grad_norm": 3.1813749076707207, "learning_rate": 9.189663950866795e-07, "loss": 0.6318, "step": 21060 }, { "epoch": 3.734278122232064, "grad_norm": 6.860403932855227, "learning_rate": 9.141811113935786e-07, "loss": 0.6501, "step": 21080 }, { "epoch": 3.737821080602303, "grad_norm": 5.995799501979168, "learning_rate": 9.094055300580992e-07, "loss": 0.686, "step": 21100 }, { "epoch": 3.741364038972542, "grad_norm": 3.980336679764352, "learning_rate": 9.046396802982041e-07, "loss": 0.6047, "step": 21120 }, { "epoch": 3.744906997342781, "grad_norm": 6.358653317261383, "learning_rate": 8.998835912723162e-07, "loss": 0.6597, "step": 21140 }, { "epoch": 3.7484499557130206, "grad_norm": 5.811283042903301, "learning_rate": 8.951372920791412e-07, "loss": 0.6643, "step": 21160 }, { "epoch": 3.7519929140832593, "grad_norm": 3.162264075637326, "learning_rate": 8.904008117574886e-07, "loss": 0.6319, "step": 21180 }, { "epoch": 3.755535872453499, "grad_norm": 4.2075636016893805, "learning_rate": 8.856741792860923e-07, "loss": 0.6902, "step": 21200 }, { "epoch": 3.759078830823738, "grad_norm": 4.321253377690551, "learning_rate": 8.80957423583439e-07, "loss": 0.7127, "step": 21220 }, { "epoch": 3.762621789193977, "grad_norm": 4.271237705673551, "learning_rate": 8.762505735075833e-07, "loss": 0.6617, "step": 21240 }, { "epoch": 3.766164747564216, "grad_norm": 3.0868138114437866, "learning_rate": 8.715536578559763e-07, "loss": 0.6178, "step": 21260 }, { "epoch": 3.7697077059344553, "grad_norm": 5.628060531285577, "learning_rate": 8.668667053652907e-07, "loss": 0.6439, "step": 21280 }, { "epoch": 3.7732506643046944, "grad_norm": 2.4998507467015245, "learning_rate": 8.621897447112395e-07, "loss": 0.6257, "step": 21300 }, { "epoch": 3.7767936226749335, "grad_norm": 3.5267634187350874, "learning_rate": 8.575228045084044e-07, "loss": 0.6537, "step": 21320 }, { "epoch": 3.7803365810451726, "grad_norm": 5.53390702998575, "learning_rate": 8.528659133100616e-07, "loss": 0.6343, "step": 21340 }, { "epoch": 3.7838795394154117, "grad_norm": 5.193506072777215, "learning_rate": 8.482190996080042e-07, "loss": 0.6457, "step": 21360 }, { "epoch": 3.787422497785651, "grad_norm": 3.306608753288234, "learning_rate": 8.435823918323682e-07, "loss": 0.674, "step": 21380 }, { "epoch": 3.7909654561558903, "grad_norm": 5.818341238525181, "learning_rate": 8.389558183514615e-07, "loss": 0.6551, "step": 21400 }, { "epoch": 3.7945084145261294, "grad_norm": 1.869734323352245, "learning_rate": 8.34339407471586e-07, "loss": 0.6328, "step": 21420 }, { "epoch": 3.7980513728963685, "grad_norm": 3.061312982527864, "learning_rate": 8.297331874368702e-07, "loss": 0.6127, "step": 21440 }, { "epoch": 3.8015943312666076, "grad_norm": 2.810183936591826, "learning_rate": 8.2513718642909e-07, "loss": 0.6226, "step": 21460 }, { "epoch": 3.8051372896368467, "grad_norm": 3.6433306189517127, "learning_rate": 8.205514325674993e-07, "loss": 0.6773, "step": 21480 }, { "epoch": 3.808680248007086, "grad_norm": 6.17382443818236, "learning_rate": 8.159759539086603e-07, "loss": 0.6604, "step": 21500 }, { "epoch": 3.812223206377325, "grad_norm": 3.3939319572629563, "learning_rate": 8.114107784462677e-07, "loss": 0.6187, "step": 21520 }, { "epoch": 3.815766164747564, "grad_norm": 5.234765372841271, "learning_rate": 8.068559341109791e-07, "loss": 0.6466, "step": 21540 }, { "epoch": 3.8193091231178036, "grad_norm": 3.1266699022361357, "learning_rate": 8.023114487702446e-07, "loss": 0.6708, "step": 21560 }, { "epoch": 3.8228520814880427, "grad_norm": 4.632817513631642, "learning_rate": 7.977773502281355e-07, "loss": 0.6564, "step": 21580 }, { "epoch": 3.8263950398582818, "grad_norm": 5.332789338782664, "learning_rate": 7.932536662251747e-07, "loss": 0.6521, "step": 21600 }, { "epoch": 3.829937998228521, "grad_norm": 4.360391036257879, "learning_rate": 7.887404244381683e-07, "loss": 0.6484, "step": 21620 }, { "epoch": 3.83348095659876, "grad_norm": 3.444603203101798, "learning_rate": 7.84237652480033e-07, "loss": 0.6651, "step": 21640 }, { "epoch": 3.837023914968999, "grad_norm": 4.102682609294379, "learning_rate": 7.797453778996284e-07, "loss": 0.6597, "step": 21660 }, { "epoch": 3.840566873339238, "grad_norm": 3.7772046806599917, "learning_rate": 7.752636281815923e-07, "loss": 0.669, "step": 21680 }, { "epoch": 3.8441098317094773, "grad_norm": 4.059355924000635, "learning_rate": 7.707924307461664e-07, "loss": 0.6333, "step": 21700 }, { "epoch": 3.8476527900797164, "grad_norm": 5.671547408240144, "learning_rate": 7.663318129490313e-07, "loss": 0.6299, "step": 21720 }, { "epoch": 3.851195748449956, "grad_norm": 3.8675019069565595, "learning_rate": 7.61881802081142e-07, "loss": 0.6915, "step": 21740 }, { "epoch": 3.8547387068201946, "grad_norm": 4.07995418510124, "learning_rate": 7.57442425368555e-07, "loss": 0.6051, "step": 21760 }, { "epoch": 3.858281665190434, "grad_norm": 4.316690207012028, "learning_rate": 7.53013709972267e-07, "loss": 0.6172, "step": 21780 }, { "epoch": 3.8618246235606732, "grad_norm": 2.973078463900836, "learning_rate": 7.485956829880455e-07, "loss": 0.6679, "step": 21800 }, { "epoch": 3.8653675819309123, "grad_norm": 5.5663765632065525, "learning_rate": 7.441883714462641e-07, "loss": 0.6259, "step": 21820 }, { "epoch": 3.8689105403011514, "grad_norm": 4.08577312555125, "learning_rate": 7.397918023117389e-07, "loss": 0.6318, "step": 21840 }, { "epoch": 3.8724534986713905, "grad_norm": 4.203797767658477, "learning_rate": 7.354060024835599e-07, "loss": 0.6391, "step": 21860 }, { "epoch": 3.8759964570416297, "grad_norm": 2.811964097058608, "learning_rate": 7.310309987949294e-07, "loss": 0.6946, "step": 21880 }, { "epoch": 3.8795394154118688, "grad_norm": 3.853792141294366, "learning_rate": 7.266668180129946e-07, "loss": 0.6468, "step": 21900 }, { "epoch": 3.8830823737821083, "grad_norm": 5.804907803894153, "learning_rate": 7.223134868386903e-07, "loss": 0.6124, "step": 21920 }, { "epoch": 3.886625332152347, "grad_norm": 5.778097135701537, "learning_rate": 7.179710319065672e-07, "loss": 0.7053, "step": 21940 }, { "epoch": 3.8901682905225865, "grad_norm": 3.6866863372568712, "learning_rate": 7.136394797846338e-07, "loss": 0.6541, "step": 21960 }, { "epoch": 3.8937112488928256, "grad_norm": 3.29536698985515, "learning_rate": 7.093188569741962e-07, "loss": 0.6287, "step": 21980 }, { "epoch": 3.8972542072630647, "grad_norm": 4.9327123681203275, "learning_rate": 7.050091899096869e-07, "loss": 0.6666, "step": 22000 }, { "epoch": 3.8972542072630647, "eval_loss": 0.8004346489906311, "eval_runtime": 378.4036, "eval_samples_per_second": 25.124, "eval_steps_per_second": 3.142, "step": 22000 } ], "logging_steps": 20, "max_steps": 28225, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2905457032298496.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }