{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0855545301558005, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00217143477552793, "grad_norm": 0.9825782179832458, "learning_rate": 3.6000000000000003e-06, "loss": 1.7137, "step": 10 }, { "epoch": 0.00434286955105586, "grad_norm": 1.4503270387649536, "learning_rate": 7.600000000000001e-06, "loss": 1.7044, "step": 20 }, { "epoch": 0.006514304326583791, "grad_norm": 0.7030352354049683, "learning_rate": 1.16e-05, "loss": 1.5721, "step": 30 }, { "epoch": 0.00868573910211172, "grad_norm": 0.9047777652740479, "learning_rate": 1.5600000000000003e-05, "loss": 1.4224, "step": 40 }, { "epoch": 0.01085717387763965, "grad_norm": 0.6958425641059875, "learning_rate": 1.9600000000000002e-05, "loss": 1.3783, "step": 50 }, { "epoch": 0.013028608653167581, "grad_norm": 0.683045506477356, "learning_rate": 1.9980353634577606e-05, "loss": 1.3706, "step": 60 }, { "epoch": 0.015200043428695511, "grad_norm": 0.5452519655227661, "learning_rate": 1.9958524339663828e-05, "loss": 1.3022, "step": 70 }, { "epoch": 0.01737147820422344, "grad_norm": 0.8496165871620178, "learning_rate": 1.9936695044750056e-05, "loss": 1.2474, "step": 80 }, { "epoch": 0.01954291297975137, "grad_norm": 0.5976231098175049, "learning_rate": 1.991486574983628e-05, "loss": 1.2646, "step": 90 }, { "epoch": 0.0217143477552793, "grad_norm": 0.7484721541404724, "learning_rate": 1.9893036454922506e-05, "loss": 1.2051, "step": 100 }, { "epoch": 0.02388578253080723, "grad_norm": 0.6440810561180115, "learning_rate": 1.9871207160008735e-05, "loss": 1.2289, "step": 110 }, { "epoch": 0.026057217306335163, "grad_norm": 0.7614450454711914, "learning_rate": 1.984937786509496e-05, "loss": 1.26, "step": 120 }, { "epoch": 0.028228652081863093, "grad_norm": 0.7417937517166138, "learning_rate": 1.9827548570181185e-05, "loss": 1.2034, "step": 130 }, { "epoch": 0.030400086857391023, "grad_norm": 0.7169002890586853, "learning_rate": 1.980571927526741e-05, "loss": 1.2561, "step": 140 }, { "epoch": 0.03257152163291895, "grad_norm": 0.6170061826705933, "learning_rate": 1.9783889980353638e-05, "loss": 1.1907, "step": 150 }, { "epoch": 0.03474295640844688, "grad_norm": 0.7643230557441711, "learning_rate": 1.976206068543986e-05, "loss": 1.2109, "step": 160 }, { "epoch": 0.03691439118397481, "grad_norm": 0.8660950660705566, "learning_rate": 1.9740231390526088e-05, "loss": 1.3197, "step": 170 }, { "epoch": 0.03908582595950274, "grad_norm": 0.7613770961761475, "learning_rate": 1.9718402095612313e-05, "loss": 1.2072, "step": 180 }, { "epoch": 0.04125726073503067, "grad_norm": 0.880974531173706, "learning_rate": 1.9696572800698538e-05, "loss": 1.226, "step": 190 }, { "epoch": 0.0434286955105586, "grad_norm": 0.8629663586616516, "learning_rate": 1.9674743505784766e-05, "loss": 1.2298, "step": 200 }, { "epoch": 0.04560013028608653, "grad_norm": 0.7981083393096924, "learning_rate": 1.965291421087099e-05, "loss": 1.145, "step": 210 }, { "epoch": 0.04777156506161446, "grad_norm": 0.8174938559532166, "learning_rate": 1.9631084915957216e-05, "loss": 1.1745, "step": 220 }, { "epoch": 0.04994299983714239, "grad_norm": 0.7619920969009399, "learning_rate": 1.960925562104344e-05, "loss": 1.1557, "step": 230 }, { "epoch": 0.052114434612670325, "grad_norm": 0.8618036508560181, "learning_rate": 1.958742632612967e-05, "loss": 1.2093, "step": 240 }, { "epoch": 0.05428586938819825, "grad_norm": 0.9684587717056274, "learning_rate": 1.956559703121589e-05, "loss": 1.1358, "step": 250 }, { "epoch": 0.056457304163726185, "grad_norm": 0.856431245803833, "learning_rate": 1.954376773630212e-05, "loss": 1.0864, "step": 260 }, { "epoch": 0.05862873893925411, "grad_norm": 0.8133667707443237, "learning_rate": 1.9521938441388345e-05, "loss": 1.1005, "step": 270 }, { "epoch": 0.060800173714782045, "grad_norm": 1.0199098587036133, "learning_rate": 1.950010914647457e-05, "loss": 1.1349, "step": 280 }, { "epoch": 0.06297160849030997, "grad_norm": 0.8546782732009888, "learning_rate": 1.9478279851560794e-05, "loss": 1.1272, "step": 290 }, { "epoch": 0.0651430432658379, "grad_norm": 1.0497276782989502, "learning_rate": 1.9456450556647023e-05, "loss": 1.176, "step": 300 }, { "epoch": 0.06731447804136584, "grad_norm": 0.9524215459823608, "learning_rate": 1.9434621261733248e-05, "loss": 1.1281, "step": 310 }, { "epoch": 0.06948591281689376, "grad_norm": 0.9029881954193115, "learning_rate": 1.9412791966819473e-05, "loss": 1.0668, "step": 320 }, { "epoch": 0.07165734759242169, "grad_norm": 1.0050421953201294, "learning_rate": 1.93909626719057e-05, "loss": 1.0993, "step": 330 }, { "epoch": 0.07382878236794962, "grad_norm": 0.8202849626541138, "learning_rate": 1.9369133376991923e-05, "loss": 1.1794, "step": 340 }, { "epoch": 0.07600021714347756, "grad_norm": 0.8110634684562683, "learning_rate": 1.934730408207815e-05, "loss": 1.1452, "step": 350 }, { "epoch": 0.07817165191900548, "grad_norm": 0.9648256301879883, "learning_rate": 1.9325474787164376e-05, "loss": 1.1503, "step": 360 }, { "epoch": 0.08034308669453341, "grad_norm": 0.969715416431427, "learning_rate": 1.93036454922506e-05, "loss": 1.129, "step": 370 }, { "epoch": 0.08251452147006134, "grad_norm": 1.0881967544555664, "learning_rate": 1.9281816197336826e-05, "loss": 1.1217, "step": 380 }, { "epoch": 0.08468595624558928, "grad_norm": 0.9472118616104126, "learning_rate": 1.9259986902423054e-05, "loss": 1.1206, "step": 390 }, { "epoch": 0.0868573910211172, "grad_norm": 1.0082671642303467, "learning_rate": 1.923815760750928e-05, "loss": 1.1371, "step": 400 }, { "epoch": 0.08902882579664513, "grad_norm": 1.0587445497512817, "learning_rate": 1.9216328312595504e-05, "loss": 1.0499, "step": 410 }, { "epoch": 0.09120026057217306, "grad_norm": 0.869490385055542, "learning_rate": 1.9194499017681733e-05, "loss": 1.0992, "step": 420 }, { "epoch": 0.093371695347701, "grad_norm": 1.024477243423462, "learning_rate": 1.9172669722767954e-05, "loss": 1.1033, "step": 430 }, { "epoch": 0.09554313012322892, "grad_norm": 0.7851136326789856, "learning_rate": 1.9150840427854183e-05, "loss": 1.1261, "step": 440 }, { "epoch": 0.09771456489875685, "grad_norm": 1.0576775074005127, "learning_rate": 1.9129011132940408e-05, "loss": 1.0772, "step": 450 }, { "epoch": 0.09988599967428478, "grad_norm": 0.9781667590141296, "learning_rate": 1.9107181838026633e-05, "loss": 1.0995, "step": 460 }, { "epoch": 0.10205743444981272, "grad_norm": 1.0188452005386353, "learning_rate": 1.9085352543112858e-05, "loss": 1.1518, "step": 470 }, { "epoch": 0.10422886922534065, "grad_norm": 1.052553653717041, "learning_rate": 1.9063523248199086e-05, "loss": 1.1514, "step": 480 }, { "epoch": 0.10640030400086857, "grad_norm": 0.9977424144744873, "learning_rate": 1.904169395328531e-05, "loss": 1.1605, "step": 490 }, { "epoch": 0.1085717387763965, "grad_norm": 0.9981403946876526, "learning_rate": 1.9019864658371536e-05, "loss": 1.0996, "step": 500 }, { "epoch": 0.11074317355192444, "grad_norm": 0.9761925339698792, "learning_rate": 1.899803536345776e-05, "loss": 1.048, "step": 510 }, { "epoch": 0.11291460832745237, "grad_norm": 0.9788073301315308, "learning_rate": 1.8976206068543986e-05, "loss": 1.1175, "step": 520 }, { "epoch": 0.11508604310298029, "grad_norm": 0.9808152914047241, "learning_rate": 1.8954376773630214e-05, "loss": 1.1271, "step": 530 }, { "epoch": 0.11725747787850822, "grad_norm": 0.9630600214004517, "learning_rate": 1.893254747871644e-05, "loss": 1.0699, "step": 540 }, { "epoch": 0.11942891265403616, "grad_norm": 1.1894537210464478, "learning_rate": 1.8910718183802664e-05, "loss": 1.0589, "step": 550 }, { "epoch": 0.12160034742956409, "grad_norm": 0.967409074306488, "learning_rate": 1.888888888888889e-05, "loss": 1.1132, "step": 560 }, { "epoch": 0.12377178220509201, "grad_norm": 0.9783412218093872, "learning_rate": 1.8867059593975117e-05, "loss": 1.0887, "step": 570 }, { "epoch": 0.12594321698061994, "grad_norm": 0.9031311869621277, "learning_rate": 1.8845230299061342e-05, "loss": 1.0836, "step": 580 }, { "epoch": 0.12811465175614786, "grad_norm": 1.2321075201034546, "learning_rate": 1.8823401004147567e-05, "loss": 1.084, "step": 590 }, { "epoch": 0.1302860865316758, "grad_norm": 0.9194741249084473, "learning_rate": 1.8801571709233792e-05, "loss": 1.0999, "step": 600 }, { "epoch": 0.13245752130720373, "grad_norm": 1.2474993467330933, "learning_rate": 1.877974241432002e-05, "loss": 1.0497, "step": 610 }, { "epoch": 0.13462895608273168, "grad_norm": 1.0515743494033813, "learning_rate": 1.8757913119406246e-05, "loss": 1.0983, "step": 620 }, { "epoch": 0.1368003908582596, "grad_norm": 1.2497025728225708, "learning_rate": 1.873608382449247e-05, "loss": 1.118, "step": 630 }, { "epoch": 0.13897182563378752, "grad_norm": 1.1340830326080322, "learning_rate": 1.8714254529578696e-05, "loss": 1.0629, "step": 640 }, { "epoch": 0.14114326040931546, "grad_norm": 1.1488502025604248, "learning_rate": 1.869242523466492e-05, "loss": 1.1057, "step": 650 }, { "epoch": 0.14331469518484338, "grad_norm": 1.1718027591705322, "learning_rate": 1.867059593975115e-05, "loss": 1.0895, "step": 660 }, { "epoch": 0.1454861299603713, "grad_norm": 0.8492761850357056, "learning_rate": 1.8648766644837374e-05, "loss": 1.0919, "step": 670 }, { "epoch": 0.14765756473589925, "grad_norm": 1.0783703327178955, "learning_rate": 1.86269373499236e-05, "loss": 1.0929, "step": 680 }, { "epoch": 0.14982899951142717, "grad_norm": 1.0920681953430176, "learning_rate": 1.8605108055009824e-05, "loss": 1.0545, "step": 690 }, { "epoch": 0.15200043428695512, "grad_norm": 1.0387171506881714, "learning_rate": 1.8583278760096052e-05, "loss": 1.0386, "step": 700 }, { "epoch": 0.15417186906248304, "grad_norm": 1.2252532243728638, "learning_rate": 1.8561449465182274e-05, "loss": 1.0743, "step": 710 }, { "epoch": 0.15634330383801096, "grad_norm": 1.0585488080978394, "learning_rate": 1.8539620170268502e-05, "loss": 1.1129, "step": 720 }, { "epoch": 0.1585147386135389, "grad_norm": 0.9711065292358398, "learning_rate": 1.8517790875354727e-05, "loss": 1.1111, "step": 730 }, { "epoch": 0.16068617338906682, "grad_norm": 1.1681485176086426, "learning_rate": 1.8495961580440952e-05, "loss": 1.072, "step": 740 }, { "epoch": 0.16285760816459477, "grad_norm": 0.9218672513961792, "learning_rate": 1.847413228552718e-05, "loss": 1.0748, "step": 750 }, { "epoch": 0.1650290429401227, "grad_norm": 0.9746413230895996, "learning_rate": 1.8452302990613406e-05, "loss": 1.0946, "step": 760 }, { "epoch": 0.1672004777156506, "grad_norm": 1.1038978099822998, "learning_rate": 1.843047369569963e-05, "loss": 1.1346, "step": 770 }, { "epoch": 0.16937191249117856, "grad_norm": 0.9651903510093689, "learning_rate": 1.8408644400785856e-05, "loss": 1.0662, "step": 780 }, { "epoch": 0.17154334726670648, "grad_norm": 1.1864938735961914, "learning_rate": 1.8386815105872084e-05, "loss": 1.1023, "step": 790 }, { "epoch": 0.1737147820422344, "grad_norm": 0.9629665017127991, "learning_rate": 1.8364985810958305e-05, "loss": 1.0739, "step": 800 }, { "epoch": 0.17588621681776234, "grad_norm": 1.2128831148147583, "learning_rate": 1.8343156516044534e-05, "loss": 1.0484, "step": 810 }, { "epoch": 0.17805765159329026, "grad_norm": 1.0595309734344482, "learning_rate": 1.832132722113076e-05, "loss": 1.0829, "step": 820 }, { "epoch": 0.1802290863688182, "grad_norm": 1.1851084232330322, "learning_rate": 1.8299497926216984e-05, "loss": 1.0791, "step": 830 }, { "epoch": 0.18240052114434613, "grad_norm": 0.9105240702629089, "learning_rate": 1.8277668631303212e-05, "loss": 1.0909, "step": 840 }, { "epoch": 0.18457195591987405, "grad_norm": 1.0232548713684082, "learning_rate": 1.8255839336389437e-05, "loss": 1.0585, "step": 850 }, { "epoch": 0.186743390695402, "grad_norm": 1.0697710514068604, "learning_rate": 1.8234010041475662e-05, "loss": 1.1133, "step": 860 }, { "epoch": 0.18891482547092991, "grad_norm": 0.9465317130088806, "learning_rate": 1.8212180746561887e-05, "loss": 1.0755, "step": 870 }, { "epoch": 0.19108626024645783, "grad_norm": 1.0849310159683228, "learning_rate": 1.8190351451648115e-05, "loss": 1.1369, "step": 880 }, { "epoch": 0.19325769502198578, "grad_norm": 1.0284308195114136, "learning_rate": 1.8168522156734337e-05, "loss": 1.0504, "step": 890 }, { "epoch": 0.1954291297975137, "grad_norm": 1.000159740447998, "learning_rate": 1.8146692861820565e-05, "loss": 1.0658, "step": 900 }, { "epoch": 0.19760056457304165, "grad_norm": 1.0055243968963623, "learning_rate": 1.812486356690679e-05, "loss": 1.0563, "step": 910 }, { "epoch": 0.19977199934856957, "grad_norm": 1.1526374816894531, "learning_rate": 1.8103034271993015e-05, "loss": 1.0802, "step": 920 }, { "epoch": 0.2019434341240975, "grad_norm": 0.8575794696807861, "learning_rate": 1.808120497707924e-05, "loss": 1.0893, "step": 930 }, { "epoch": 0.20411486889962543, "grad_norm": 0.985564649105072, "learning_rate": 1.805937568216547e-05, "loss": 1.0543, "step": 940 }, { "epoch": 0.20628630367515335, "grad_norm": 1.2791037559509277, "learning_rate": 1.8037546387251694e-05, "loss": 1.0984, "step": 950 }, { "epoch": 0.2084577384506813, "grad_norm": 1.1033849716186523, "learning_rate": 1.801571709233792e-05, "loss": 1.0456, "step": 960 }, { "epoch": 0.21062917322620922, "grad_norm": 1.1214113235473633, "learning_rate": 1.7993887797424147e-05, "loss": 1.0519, "step": 970 }, { "epoch": 0.21280060800173714, "grad_norm": 1.1759611368179321, "learning_rate": 1.797205850251037e-05, "loss": 1.0646, "step": 980 }, { "epoch": 0.2149720427772651, "grad_norm": 1.0244547128677368, "learning_rate": 1.7950229207596597e-05, "loss": 1.0995, "step": 990 }, { "epoch": 0.217143477552793, "grad_norm": 1.134796142578125, "learning_rate": 1.7928399912682822e-05, "loss": 1.0889, "step": 1000 }, { "epoch": 0.21931491232832093, "grad_norm": 1.0857653617858887, "learning_rate": 1.7906570617769047e-05, "loss": 1.0283, "step": 1010 }, { "epoch": 0.22148634710384887, "grad_norm": 1.1252498626708984, "learning_rate": 1.7884741322855272e-05, "loss": 1.0462, "step": 1020 }, { "epoch": 0.2236577818793768, "grad_norm": 1.0542049407958984, "learning_rate": 1.78629120279415e-05, "loss": 1.0499, "step": 1030 }, { "epoch": 0.22582921665490474, "grad_norm": 1.1074199676513672, "learning_rate": 1.7841082733027725e-05, "loss": 1.0394, "step": 1040 }, { "epoch": 0.22800065143043266, "grad_norm": 1.0936591625213623, "learning_rate": 1.781925343811395e-05, "loss": 1.0529, "step": 1050 }, { "epoch": 0.23017208620596058, "grad_norm": 1.032329797744751, "learning_rate": 1.779742414320018e-05, "loss": 1.0311, "step": 1060 }, { "epoch": 0.23234352098148853, "grad_norm": 1.6111783981323242, "learning_rate": 1.77755948482864e-05, "loss": 1.0481, "step": 1070 }, { "epoch": 0.23451495575701645, "grad_norm": 1.1454813480377197, "learning_rate": 1.775376555337263e-05, "loss": 1.0231, "step": 1080 }, { "epoch": 0.23668639053254437, "grad_norm": 1.0079035758972168, "learning_rate": 1.7731936258458853e-05, "loss": 1.0907, "step": 1090 }, { "epoch": 0.2388578253080723, "grad_norm": 1.0366803407669067, "learning_rate": 1.771010696354508e-05, "loss": 1.0368, "step": 1100 }, { "epoch": 0.24102926008360023, "grad_norm": 1.057990550994873, "learning_rate": 1.7688277668631303e-05, "loss": 1.0462, "step": 1110 }, { "epoch": 0.24320069485912818, "grad_norm": 0.9940240383148193, "learning_rate": 1.7666448373717532e-05, "loss": 1.089, "step": 1120 }, { "epoch": 0.2453721296346561, "grad_norm": 1.0284287929534912, "learning_rate": 1.7644619078803757e-05, "loss": 1.044, "step": 1130 }, { "epoch": 0.24754356441018402, "grad_norm": 1.2615009546279907, "learning_rate": 1.7622789783889982e-05, "loss": 1.0978, "step": 1140 }, { "epoch": 0.24971499918571197, "grad_norm": 1.1974271535873413, "learning_rate": 1.7600960488976207e-05, "loss": 1.1405, "step": 1150 }, { "epoch": 0.2518864339612399, "grad_norm": 1.1542342901229858, "learning_rate": 1.757913119406243e-05, "loss": 1.0874, "step": 1160 }, { "epoch": 0.25405786873676783, "grad_norm": 1.2193187475204468, "learning_rate": 1.755730189914866e-05, "loss": 1.06, "step": 1170 }, { "epoch": 0.2562293035122957, "grad_norm": 0.8851369619369507, "learning_rate": 1.7535472604234885e-05, "loss": 1.0793, "step": 1180 }, { "epoch": 0.2584007382878237, "grad_norm": 0.9751698970794678, "learning_rate": 1.751364330932111e-05, "loss": 1.085, "step": 1190 }, { "epoch": 0.2605721730633516, "grad_norm": 1.1651514768600464, "learning_rate": 1.7491814014407335e-05, "loss": 1.0548, "step": 1200 }, { "epoch": 0.26274360783887957, "grad_norm": 1.0106171369552612, "learning_rate": 1.7469984719493563e-05, "loss": 1.0162, "step": 1210 }, { "epoch": 0.26491504261440746, "grad_norm": 1.0185978412628174, "learning_rate": 1.7448155424579788e-05, "loss": 1.0557, "step": 1220 }, { "epoch": 0.2670864773899354, "grad_norm": 1.0883762836456299, "learning_rate": 1.7426326129666013e-05, "loss": 1.0507, "step": 1230 }, { "epoch": 0.26925791216546335, "grad_norm": 1.1618812084197998, "learning_rate": 1.7404496834752238e-05, "loss": 1.0452, "step": 1240 }, { "epoch": 0.27142934694099125, "grad_norm": 1.1427685022354126, "learning_rate": 1.7382667539838463e-05, "loss": 1.0772, "step": 1250 }, { "epoch": 0.2736007817165192, "grad_norm": 1.1353427171707153, "learning_rate": 1.7360838244924688e-05, "loss": 1.0152, "step": 1260 }, { "epoch": 0.27577221649204714, "grad_norm": 1.0538302659988403, "learning_rate": 1.7339008950010917e-05, "loss": 1.0453, "step": 1270 }, { "epoch": 0.27794365126757503, "grad_norm": 0.9475343823432922, "learning_rate": 1.731717965509714e-05, "loss": 1.0159, "step": 1280 }, { "epoch": 0.280115086043103, "grad_norm": 1.1043903827667236, "learning_rate": 1.7295350360183367e-05, "loss": 1.0767, "step": 1290 }, { "epoch": 0.2822865208186309, "grad_norm": 1.2597566843032837, "learning_rate": 1.7273521065269595e-05, "loss": 1.0976, "step": 1300 }, { "epoch": 0.2844579555941588, "grad_norm": 1.2252488136291504, "learning_rate": 1.725169177035582e-05, "loss": 1.1019, "step": 1310 }, { "epoch": 0.28662939036968677, "grad_norm": 1.4177309274673462, "learning_rate": 1.7229862475442045e-05, "loss": 1.1039, "step": 1320 }, { "epoch": 0.2888008251452147, "grad_norm": 1.2762172222137451, "learning_rate": 1.720803318052827e-05, "loss": 1.0195, "step": 1330 }, { "epoch": 0.2909722599207426, "grad_norm": 1.3187838792800903, "learning_rate": 1.7186203885614495e-05, "loss": 1.0851, "step": 1340 }, { "epoch": 0.29314369469627055, "grad_norm": 1.2178953886032104, "learning_rate": 1.716437459070072e-05, "loss": 1.0503, "step": 1350 }, { "epoch": 0.2953151294717985, "grad_norm": 0.9904911518096924, "learning_rate": 1.7142545295786948e-05, "loss": 1.0683, "step": 1360 }, { "epoch": 0.29748656424732645, "grad_norm": 0.9594365358352661, "learning_rate": 1.7120716000873173e-05, "loss": 1.052, "step": 1370 }, { "epoch": 0.29965799902285434, "grad_norm": 1.218839406967163, "learning_rate": 1.7098886705959398e-05, "loss": 1.0563, "step": 1380 }, { "epoch": 0.3018294337983823, "grad_norm": 1.2965632677078247, "learning_rate": 1.7077057411045626e-05, "loss": 1.1087, "step": 1390 }, { "epoch": 0.30400086857391023, "grad_norm": 1.0554800033569336, "learning_rate": 1.705522811613185e-05, "loss": 1.036, "step": 1400 }, { "epoch": 0.3061723033494381, "grad_norm": 1.1262216567993164, "learning_rate": 1.7033398821218076e-05, "loss": 1.0489, "step": 1410 }, { "epoch": 0.30834373812496607, "grad_norm": 1.044252872467041, "learning_rate": 1.70115695263043e-05, "loss": 1.048, "step": 1420 }, { "epoch": 0.310515172900494, "grad_norm": 1.4321969747543335, "learning_rate": 1.6989740231390526e-05, "loss": 1.0712, "step": 1430 }, { "epoch": 0.3126866076760219, "grad_norm": 0.9649491310119629, "learning_rate": 1.696791093647675e-05, "loss": 1.0366, "step": 1440 }, { "epoch": 0.31485804245154986, "grad_norm": 1.0629323720932007, "learning_rate": 1.694608164156298e-05, "loss": 1.0527, "step": 1450 }, { "epoch": 0.3170294772270778, "grad_norm": 1.1887277364730835, "learning_rate": 1.6924252346649205e-05, "loss": 1.0597, "step": 1460 }, { "epoch": 0.3192009120026057, "grad_norm": 1.0008701086044312, "learning_rate": 1.690242305173543e-05, "loss": 1.0733, "step": 1470 }, { "epoch": 0.32137234677813364, "grad_norm": 1.1184202432632446, "learning_rate": 1.6880593756821658e-05, "loss": 1.0093, "step": 1480 }, { "epoch": 0.3235437815536616, "grad_norm": 1.2822941541671753, "learning_rate": 1.6858764461907883e-05, "loss": 1.0505, "step": 1490 }, { "epoch": 0.32571521632918954, "grad_norm": 1.365919828414917, "learning_rate": 1.6836935166994108e-05, "loss": 1.0616, "step": 1500 }, { "epoch": 0.32788665110471743, "grad_norm": 1.4940375089645386, "learning_rate": 1.6815105872080333e-05, "loss": 1.0189, "step": 1510 }, { "epoch": 0.3300580858802454, "grad_norm": 1.443363070487976, "learning_rate": 1.679327657716656e-05, "loss": 1.0591, "step": 1520 }, { "epoch": 0.3322295206557733, "grad_norm": 1.0023658275604248, "learning_rate": 1.6771447282252783e-05, "loss": 1.0784, "step": 1530 }, { "epoch": 0.3344009554313012, "grad_norm": 1.0569523572921753, "learning_rate": 1.674961798733901e-05, "loss": 1.0167, "step": 1540 }, { "epoch": 0.33657239020682916, "grad_norm": 1.5533829927444458, "learning_rate": 1.6727788692425236e-05, "loss": 1.059, "step": 1550 }, { "epoch": 0.3387438249823571, "grad_norm": 1.2175540924072266, "learning_rate": 1.670595939751146e-05, "loss": 1.0566, "step": 1560 }, { "epoch": 0.340915259757885, "grad_norm": 1.2406116724014282, "learning_rate": 1.6684130102597686e-05, "loss": 1.0284, "step": 1570 }, { "epoch": 0.34308669453341295, "grad_norm": 1.1116441488265991, "learning_rate": 1.6662300807683914e-05, "loss": 1.0686, "step": 1580 }, { "epoch": 0.3452581293089409, "grad_norm": 0.9541231989860535, "learning_rate": 1.664047151277014e-05, "loss": 1.0569, "step": 1590 }, { "epoch": 0.3474295640844688, "grad_norm": 0.9048693180084229, "learning_rate": 1.6618642217856364e-05, "loss": 1.0503, "step": 1600 }, { "epoch": 0.34960099885999674, "grad_norm": 1.2782031297683716, "learning_rate": 1.6596812922942593e-05, "loss": 1.0462, "step": 1610 }, { "epoch": 0.3517724336355247, "grad_norm": 1.0912036895751953, "learning_rate": 1.6574983628028814e-05, "loss": 1.0488, "step": 1620 }, { "epoch": 0.3539438684110526, "grad_norm": 1.2449527978897095, "learning_rate": 1.6553154333115043e-05, "loss": 1.0283, "step": 1630 }, { "epoch": 0.3561153031865805, "grad_norm": 1.367113471031189, "learning_rate": 1.6531325038201268e-05, "loss": 1.0687, "step": 1640 }, { "epoch": 0.35828673796210847, "grad_norm": 1.4987077713012695, "learning_rate": 1.6509495743287493e-05, "loss": 1.0419, "step": 1650 }, { "epoch": 0.3604581727376364, "grad_norm": 1.0947941541671753, "learning_rate": 1.6487666448373718e-05, "loss": 1.0666, "step": 1660 }, { "epoch": 0.3626296075131643, "grad_norm": 1.199379801750183, "learning_rate": 1.6465837153459946e-05, "loss": 1.0062, "step": 1670 }, { "epoch": 0.36480104228869226, "grad_norm": 1.281281590461731, "learning_rate": 1.6444007858546168e-05, "loss": 1.0054, "step": 1680 }, { "epoch": 0.3669724770642202, "grad_norm": 1.00531804561615, "learning_rate": 1.6422178563632396e-05, "loss": 1.0951, "step": 1690 }, { "epoch": 0.3691439118397481, "grad_norm": 1.2501758337020874, "learning_rate": 1.6400349268718624e-05, "loss": 1.0289, "step": 1700 }, { "epoch": 0.37131534661527604, "grad_norm": 1.1288474798202515, "learning_rate": 1.6378519973804846e-05, "loss": 1.0539, "step": 1710 }, { "epoch": 0.373486781390804, "grad_norm": 1.2077093124389648, "learning_rate": 1.6356690678891074e-05, "loss": 1.0112, "step": 1720 }, { "epoch": 0.3756582161663319, "grad_norm": 1.0771955251693726, "learning_rate": 1.63348613839773e-05, "loss": 1.0527, "step": 1730 }, { "epoch": 0.37782965094185983, "grad_norm": 1.5062224864959717, "learning_rate": 1.6313032089063524e-05, "loss": 1.0023, "step": 1740 }, { "epoch": 0.3800010857173878, "grad_norm": 1.4642319679260254, "learning_rate": 1.629120279414975e-05, "loss": 1.0253, "step": 1750 }, { "epoch": 0.38217252049291567, "grad_norm": 1.17564058303833, "learning_rate": 1.6269373499235978e-05, "loss": 1.0378, "step": 1760 }, { "epoch": 0.3843439552684436, "grad_norm": 1.155928134918213, "learning_rate": 1.6247544204322203e-05, "loss": 1.0797, "step": 1770 }, { "epoch": 0.38651539004397156, "grad_norm": 1.161272406578064, "learning_rate": 1.6225714909408428e-05, "loss": 1.0645, "step": 1780 }, { "epoch": 0.3886868248194995, "grad_norm": 1.1190975904464722, "learning_rate": 1.6203885614494653e-05, "loss": 0.9895, "step": 1790 }, { "epoch": 0.3908582595950274, "grad_norm": 1.0364742279052734, "learning_rate": 1.6182056319580877e-05, "loss": 0.993, "step": 1800 }, { "epoch": 0.39302969437055535, "grad_norm": 0.9722704887390137, "learning_rate": 1.6160227024667106e-05, "loss": 1.074, "step": 1810 }, { "epoch": 0.3952011291460833, "grad_norm": 1.196349024772644, "learning_rate": 1.613839772975333e-05, "loss": 1.0192, "step": 1820 }, { "epoch": 0.3973725639216112, "grad_norm": 1.2496604919433594, "learning_rate": 1.6116568434839556e-05, "loss": 1.0539, "step": 1830 }, { "epoch": 0.39954399869713914, "grad_norm": 1.273461937904358, "learning_rate": 1.609473913992578e-05, "loss": 1.0283, "step": 1840 }, { "epoch": 0.4017154334726671, "grad_norm": 1.267354965209961, "learning_rate": 1.607290984501201e-05, "loss": 1.0265, "step": 1850 }, { "epoch": 0.403886868248195, "grad_norm": 1.1388341188430786, "learning_rate": 1.6051080550098234e-05, "loss": 1.0998, "step": 1860 }, { "epoch": 0.4060583030237229, "grad_norm": 1.7409948110580444, "learning_rate": 1.602925125518446e-05, "loss": 1.0246, "step": 1870 }, { "epoch": 0.40822973779925087, "grad_norm": 1.0280303955078125, "learning_rate": 1.6007421960270684e-05, "loss": 1.075, "step": 1880 }, { "epoch": 0.41040117257477876, "grad_norm": 1.093042016029358, "learning_rate": 1.598559266535691e-05, "loss": 1.018, "step": 1890 }, { "epoch": 0.4125726073503067, "grad_norm": 0.9621986746788025, "learning_rate": 1.5963763370443134e-05, "loss": 1.1104, "step": 1900 }, { "epoch": 0.41474404212583466, "grad_norm": 1.0159006118774414, "learning_rate": 1.5941934075529362e-05, "loss": 1.0231, "step": 1910 }, { "epoch": 0.4169154769013626, "grad_norm": 1.2041517496109009, "learning_rate": 1.5920104780615587e-05, "loss": 1.0426, "step": 1920 }, { "epoch": 0.4190869116768905, "grad_norm": 1.2012951374053955, "learning_rate": 1.5898275485701812e-05, "loss": 1.0376, "step": 1930 }, { "epoch": 0.42125834645241844, "grad_norm": 1.207979679107666, "learning_rate": 1.587644619078804e-05, "loss": 1.0054, "step": 1940 }, { "epoch": 0.4234297812279464, "grad_norm": 1.1251835823059082, "learning_rate": 1.5854616895874266e-05, "loss": 1.068, "step": 1950 }, { "epoch": 0.4256012160034743, "grad_norm": 1.2626091241836548, "learning_rate": 1.583278760096049e-05, "loss": 1.0705, "step": 1960 }, { "epoch": 0.42777265077900223, "grad_norm": 1.203305959701538, "learning_rate": 1.5810958306046716e-05, "loss": 1.0319, "step": 1970 }, { "epoch": 0.4299440855545302, "grad_norm": 1.3643816709518433, "learning_rate": 1.578912901113294e-05, "loss": 1.0383, "step": 1980 }, { "epoch": 0.43211552033005807, "grad_norm": 1.3260042667388916, "learning_rate": 1.5767299716219166e-05, "loss": 1.0949, "step": 1990 }, { "epoch": 0.434286955105586, "grad_norm": 1.0160613059997559, "learning_rate": 1.5745470421305394e-05, "loss": 1.0717, "step": 2000 }, { "epoch": 0.43645838988111396, "grad_norm": 0.9759429693222046, "learning_rate": 1.572364112639162e-05, "loss": 1.023, "step": 2010 }, { "epoch": 0.43862982465664185, "grad_norm": 1.268486738204956, "learning_rate": 1.5701811831477844e-05, "loss": 0.986, "step": 2020 }, { "epoch": 0.4408012594321698, "grad_norm": 1.326611876487732, "learning_rate": 1.5679982536564072e-05, "loss": 1.0641, "step": 2030 }, { "epoch": 0.44297269420769775, "grad_norm": 1.638113021850586, "learning_rate": 1.5658153241650297e-05, "loss": 1.0302, "step": 2040 }, { "epoch": 0.44514412898322564, "grad_norm": 1.5037381649017334, "learning_rate": 1.5636323946736522e-05, "loss": 1.0291, "step": 2050 }, { "epoch": 0.4473155637587536, "grad_norm": 1.1574699878692627, "learning_rate": 1.5614494651822747e-05, "loss": 1.0268, "step": 2060 }, { "epoch": 0.44948699853428153, "grad_norm": 1.1230093240737915, "learning_rate": 1.5592665356908972e-05, "loss": 1.0471, "step": 2070 }, { "epoch": 0.4516584333098095, "grad_norm": 1.134092092514038, "learning_rate": 1.5570836061995197e-05, "loss": 1.045, "step": 2080 }, { "epoch": 0.4538298680853374, "grad_norm": 1.4253817796707153, "learning_rate": 1.5549006767081425e-05, "loss": 1.0575, "step": 2090 }, { "epoch": 0.4560013028608653, "grad_norm": 1.322679877281189, "learning_rate": 1.552717747216765e-05, "loss": 1.005, "step": 2100 }, { "epoch": 0.45817273763639327, "grad_norm": 1.190661072731018, "learning_rate": 1.5505348177253875e-05, "loss": 1.0392, "step": 2110 }, { "epoch": 0.46034417241192116, "grad_norm": 1.10509192943573, "learning_rate": 1.54835188823401e-05, "loss": 1.046, "step": 2120 }, { "epoch": 0.4625156071874491, "grad_norm": 1.305440902709961, "learning_rate": 1.546168958742633e-05, "loss": 1.067, "step": 2130 }, { "epoch": 0.46468704196297705, "grad_norm": 0.9714760184288025, "learning_rate": 1.5439860292512554e-05, "loss": 1.0648, "step": 2140 }, { "epoch": 0.46685847673850495, "grad_norm": 1.249341368675232, "learning_rate": 1.541803099759878e-05, "loss": 1.0326, "step": 2150 }, { "epoch": 0.4690299115140329, "grad_norm": 1.182078242301941, "learning_rate": 1.5396201702685004e-05, "loss": 0.9649, "step": 2160 }, { "epoch": 0.47120134628956084, "grad_norm": 1.1989095211029053, "learning_rate": 1.537437240777123e-05, "loss": 1.0324, "step": 2170 }, { "epoch": 0.47337278106508873, "grad_norm": 1.1520359516143799, "learning_rate": 1.5352543112857457e-05, "loss": 1.0453, "step": 2180 }, { "epoch": 0.4755442158406167, "grad_norm": 0.9840512871742249, "learning_rate": 1.5330713817943682e-05, "loss": 1.028, "step": 2190 }, { "epoch": 0.4777156506161446, "grad_norm": 1.2729812860488892, "learning_rate": 1.5308884523029907e-05, "loss": 1.0866, "step": 2200 }, { "epoch": 0.4798870853916726, "grad_norm": 1.2075546979904175, "learning_rate": 1.5287055228116132e-05, "loss": 1.0633, "step": 2210 }, { "epoch": 0.48205852016720047, "grad_norm": 1.6592689752578735, "learning_rate": 1.526522593320236e-05, "loss": 1.0107, "step": 2220 }, { "epoch": 0.4842299549427284, "grad_norm": 1.2771036624908447, "learning_rate": 1.5243396638288585e-05, "loss": 0.9855, "step": 2230 }, { "epoch": 0.48640138971825636, "grad_norm": 1.4246838092803955, "learning_rate": 1.522156734337481e-05, "loss": 1.0405, "step": 2240 }, { "epoch": 0.48857282449378425, "grad_norm": 1.1746619939804077, "learning_rate": 1.5199738048461037e-05, "loss": 0.9615, "step": 2250 }, { "epoch": 0.4907442592693122, "grad_norm": 0.9854568243026733, "learning_rate": 1.5177908753547262e-05, "loss": 1.0748, "step": 2260 }, { "epoch": 0.49291569404484015, "grad_norm": 1.2437796592712402, "learning_rate": 1.5156079458633489e-05, "loss": 1.0428, "step": 2270 }, { "epoch": 0.49508712882036804, "grad_norm": 1.4417718648910522, "learning_rate": 1.5134250163719712e-05, "loss": 1.0339, "step": 2280 }, { "epoch": 0.497258563595896, "grad_norm": 1.5475140810012817, "learning_rate": 1.5112420868805939e-05, "loss": 1.0054, "step": 2290 }, { "epoch": 0.49942999837142393, "grad_norm": 1.2441282272338867, "learning_rate": 1.5090591573892164e-05, "loss": 1.0406, "step": 2300 }, { "epoch": 0.5016014331469518, "grad_norm": 1.3758796453475952, "learning_rate": 1.506876227897839e-05, "loss": 1.0709, "step": 2310 }, { "epoch": 0.5037728679224798, "grad_norm": 1.412845492362976, "learning_rate": 1.5046932984064615e-05, "loss": 1.0111, "step": 2320 }, { "epoch": 0.5059443026980077, "grad_norm": 1.2830110788345337, "learning_rate": 1.5025103689150842e-05, "loss": 1.0142, "step": 2330 }, { "epoch": 0.5081157374735357, "grad_norm": 1.1173603534698486, "learning_rate": 1.5003274394237068e-05, "loss": 1.0425, "step": 2340 }, { "epoch": 0.5102871722490636, "grad_norm": 1.3206751346588135, "learning_rate": 1.4981445099323293e-05, "loss": 1.0147, "step": 2350 }, { "epoch": 0.5124586070245915, "grad_norm": 1.1469355821609497, "learning_rate": 1.495961580440952e-05, "loss": 1.1042, "step": 2360 }, { "epoch": 0.5146300418001194, "grad_norm": 1.5979527235031128, "learning_rate": 1.4937786509495743e-05, "loss": 1.0293, "step": 2370 }, { "epoch": 0.5168014765756473, "grad_norm": 1.1847728490829468, "learning_rate": 1.491595721458197e-05, "loss": 1.0589, "step": 2380 }, { "epoch": 0.5189729113511753, "grad_norm": 1.1340694427490234, "learning_rate": 1.4894127919668195e-05, "loss": 1.008, "step": 2390 }, { "epoch": 0.5211443461267032, "grad_norm": 1.3193624019622803, "learning_rate": 1.4872298624754422e-05, "loss": 1.0455, "step": 2400 }, { "epoch": 0.5233157809022312, "grad_norm": 1.3973023891448975, "learning_rate": 1.4850469329840647e-05, "loss": 1.0455, "step": 2410 }, { "epoch": 0.5254872156777591, "grad_norm": 1.4529467821121216, "learning_rate": 1.4828640034926873e-05, "loss": 1.0726, "step": 2420 }, { "epoch": 0.527658650453287, "grad_norm": 1.2769255638122559, "learning_rate": 1.4806810740013098e-05, "loss": 1.0646, "step": 2430 }, { "epoch": 0.5298300852288149, "grad_norm": 1.4367311000823975, "learning_rate": 1.4784981445099325e-05, "loss": 0.9821, "step": 2440 }, { "epoch": 0.5320015200043429, "grad_norm": 1.6156344413757324, "learning_rate": 1.4763152150185552e-05, "loss": 1.0581, "step": 2450 }, { "epoch": 0.5341729547798708, "grad_norm": 1.093217372894287, "learning_rate": 1.4741322855271775e-05, "loss": 1.0125, "step": 2460 }, { "epoch": 0.5363443895553988, "grad_norm": 1.3095054626464844, "learning_rate": 1.4719493560358002e-05, "loss": 1.0268, "step": 2470 }, { "epoch": 0.5385158243309267, "grad_norm": 1.3275405168533325, "learning_rate": 1.4697664265444227e-05, "loss": 1.036, "step": 2480 }, { "epoch": 0.5406872591064545, "grad_norm": 1.1585111618041992, "learning_rate": 1.4675834970530453e-05, "loss": 1.0893, "step": 2490 }, { "epoch": 0.5428586938819825, "grad_norm": 1.1334049701690674, "learning_rate": 1.4654005675616678e-05, "loss": 1.0356, "step": 2500 }, { "epoch": 0.5450301286575104, "grad_norm": 1.1498132944107056, "learning_rate": 1.4632176380702905e-05, "loss": 1.0297, "step": 2510 }, { "epoch": 0.5472015634330384, "grad_norm": 1.3892987966537476, "learning_rate": 1.461034708578913e-05, "loss": 0.9885, "step": 2520 }, { "epoch": 0.5493729982085663, "grad_norm": 1.1444848775863647, "learning_rate": 1.4588517790875357e-05, "loss": 1.0221, "step": 2530 }, { "epoch": 0.5515444329840943, "grad_norm": 1.0999592542648315, "learning_rate": 1.456668849596158e-05, "loss": 1.0053, "step": 2540 }, { "epoch": 0.5537158677596221, "grad_norm": 1.2366653680801392, "learning_rate": 1.4544859201047807e-05, "loss": 0.9872, "step": 2550 }, { "epoch": 0.5558873025351501, "grad_norm": 1.066278100013733, "learning_rate": 1.4523029906134035e-05, "loss": 1.0833, "step": 2560 }, { "epoch": 0.558058737310678, "grad_norm": 1.418614149093628, "learning_rate": 1.4501200611220258e-05, "loss": 1.0316, "step": 2570 }, { "epoch": 0.560230172086206, "grad_norm": 1.2488312721252441, "learning_rate": 1.4479371316306485e-05, "loss": 1.0261, "step": 2580 }, { "epoch": 0.5624016068617339, "grad_norm": 1.1262556314468384, "learning_rate": 1.445754202139271e-05, "loss": 0.9926, "step": 2590 }, { "epoch": 0.5645730416372619, "grad_norm": 1.2547680139541626, "learning_rate": 1.4435712726478936e-05, "loss": 1.0211, "step": 2600 }, { "epoch": 0.5667444764127898, "grad_norm": 1.3836477994918823, "learning_rate": 1.4413883431565161e-05, "loss": 0.9906, "step": 2610 }, { "epoch": 0.5689159111883176, "grad_norm": 1.0498002767562866, "learning_rate": 1.4392054136651388e-05, "loss": 0.9824, "step": 2620 }, { "epoch": 0.5710873459638456, "grad_norm": 0.9665150046348572, "learning_rate": 1.4370224841737611e-05, "loss": 1.0113, "step": 2630 }, { "epoch": 0.5732587807393735, "grad_norm": 1.2889072895050049, "learning_rate": 1.434839554682384e-05, "loss": 0.9909, "step": 2640 }, { "epoch": 0.5754302155149015, "grad_norm": 1.5180598497390747, "learning_rate": 1.4326566251910063e-05, "loss": 1.0092, "step": 2650 }, { "epoch": 0.5776016502904294, "grad_norm": 1.6388850212097168, "learning_rate": 1.430473695699629e-05, "loss": 1.0116, "step": 2660 }, { "epoch": 0.5797730850659574, "grad_norm": 1.2516218423843384, "learning_rate": 1.4282907662082516e-05, "loss": 0.9775, "step": 2670 }, { "epoch": 0.5819445198414852, "grad_norm": 1.1634091138839722, "learning_rate": 1.4261078367168741e-05, "loss": 0.9898, "step": 2680 }, { "epoch": 0.5841159546170132, "grad_norm": 1.1283944845199585, "learning_rate": 1.4239249072254968e-05, "loss": 1.0265, "step": 2690 }, { "epoch": 0.5862873893925411, "grad_norm": 1.3887890577316284, "learning_rate": 1.4217419777341193e-05, "loss": 1.0433, "step": 2700 }, { "epoch": 0.588458824168069, "grad_norm": 1.2213870286941528, "learning_rate": 1.419559048242742e-05, "loss": 1.0116, "step": 2710 }, { "epoch": 0.590630258943597, "grad_norm": 1.2879663705825806, "learning_rate": 1.4173761187513645e-05, "loss": 0.9866, "step": 2720 }, { "epoch": 0.592801693719125, "grad_norm": 1.3769855499267578, "learning_rate": 1.4151931892599871e-05, "loss": 0.9735, "step": 2730 }, { "epoch": 0.5949731284946529, "grad_norm": 1.347123146057129, "learning_rate": 1.4130102597686095e-05, "loss": 1.0169, "step": 2740 }, { "epoch": 0.5971445632701807, "grad_norm": 1.093166708946228, "learning_rate": 1.4108273302772321e-05, "loss": 1.0288, "step": 2750 }, { "epoch": 0.5993159980457087, "grad_norm": 1.3573272228240967, "learning_rate": 1.4086444007858546e-05, "loss": 1.0042, "step": 2760 }, { "epoch": 0.6014874328212366, "grad_norm": 1.400972604751587, "learning_rate": 1.4064614712944773e-05, "loss": 1.0181, "step": 2770 }, { "epoch": 0.6036588675967646, "grad_norm": 1.1371185779571533, "learning_rate": 1.4042785418031e-05, "loss": 1.0504, "step": 2780 }, { "epoch": 0.6058303023722925, "grad_norm": 1.31002676486969, "learning_rate": 1.4020956123117225e-05, "loss": 1.0615, "step": 2790 }, { "epoch": 0.6080017371478205, "grad_norm": 1.557403326034546, "learning_rate": 1.3999126828203451e-05, "loss": 1.0286, "step": 2800 }, { "epoch": 0.6101731719233483, "grad_norm": 1.2506225109100342, "learning_rate": 1.3977297533289676e-05, "loss": 1.0316, "step": 2810 }, { "epoch": 0.6123446066988762, "grad_norm": 1.2750262022018433, "learning_rate": 1.3955468238375903e-05, "loss": 1.0691, "step": 2820 }, { "epoch": 0.6145160414744042, "grad_norm": 1.3119608163833618, "learning_rate": 1.3933638943462126e-05, "loss": 1.0107, "step": 2830 }, { "epoch": 0.6166874762499321, "grad_norm": 1.269987940788269, "learning_rate": 1.3911809648548353e-05, "loss": 1.0298, "step": 2840 }, { "epoch": 0.6188589110254601, "grad_norm": 1.1371833086013794, "learning_rate": 1.3889980353634578e-05, "loss": 1.0541, "step": 2850 }, { "epoch": 0.621030345800988, "grad_norm": 1.2296518087387085, "learning_rate": 1.3868151058720804e-05, "loss": 1.0225, "step": 2860 }, { "epoch": 0.623201780576516, "grad_norm": 1.5416007041931152, "learning_rate": 1.384632176380703e-05, "loss": 0.9838, "step": 2870 }, { "epoch": 0.6253732153520438, "grad_norm": 1.2770878076553345, "learning_rate": 1.3824492468893256e-05, "loss": 0.9917, "step": 2880 }, { "epoch": 0.6275446501275718, "grad_norm": 1.3633027076721191, "learning_rate": 1.3802663173979483e-05, "loss": 1.0636, "step": 2890 }, { "epoch": 0.6297160849030997, "grad_norm": 1.2924447059631348, "learning_rate": 1.3780833879065708e-05, "loss": 1.0151, "step": 2900 }, { "epoch": 0.6318875196786277, "grad_norm": 1.3453025817871094, "learning_rate": 1.3759004584151934e-05, "loss": 1.0053, "step": 2910 }, { "epoch": 0.6340589544541556, "grad_norm": 1.497462511062622, "learning_rate": 1.3737175289238158e-05, "loss": 0.9936, "step": 2920 }, { "epoch": 0.6362303892296836, "grad_norm": 1.0469037294387817, "learning_rate": 1.3715345994324384e-05, "loss": 1.0465, "step": 2930 }, { "epoch": 0.6384018240052114, "grad_norm": 1.4272680282592773, "learning_rate": 1.369351669941061e-05, "loss": 1.0634, "step": 2940 }, { "epoch": 0.6405732587807393, "grad_norm": 1.065047264099121, "learning_rate": 1.3671687404496836e-05, "loss": 1.0464, "step": 2950 }, { "epoch": 0.6427446935562673, "grad_norm": 1.3233064413070679, "learning_rate": 1.3649858109583061e-05, "loss": 1.07, "step": 2960 }, { "epoch": 0.6449161283317952, "grad_norm": 1.792734980583191, "learning_rate": 1.3628028814669288e-05, "loss": 0.9722, "step": 2970 }, { "epoch": 0.6470875631073232, "grad_norm": 1.7977020740509033, "learning_rate": 1.3606199519755514e-05, "loss": 0.9811, "step": 2980 }, { "epoch": 0.6492589978828511, "grad_norm": 1.2973439693450928, "learning_rate": 1.358437022484174e-05, "loss": 0.9958, "step": 2990 }, { "epoch": 0.6514304326583791, "grad_norm": 1.249764323234558, "learning_rate": 1.3562540929927966e-05, "loss": 1.0675, "step": 3000 }, { "epoch": 0.6536018674339069, "grad_norm": 1.343056559562683, "learning_rate": 1.354071163501419e-05, "loss": 1.0493, "step": 3010 }, { "epoch": 0.6557733022094349, "grad_norm": 1.6171714067459106, "learning_rate": 1.3518882340100416e-05, "loss": 1.044, "step": 3020 }, { "epoch": 0.6579447369849628, "grad_norm": 1.2323534488677979, "learning_rate": 1.3497053045186641e-05, "loss": 1.0386, "step": 3030 }, { "epoch": 0.6601161717604908, "grad_norm": 1.1134217977523804, "learning_rate": 1.3475223750272868e-05, "loss": 1.0225, "step": 3040 }, { "epoch": 0.6622876065360187, "grad_norm": 1.6027779579162598, "learning_rate": 1.3453394455359093e-05, "loss": 1.0195, "step": 3050 }, { "epoch": 0.6644590413115466, "grad_norm": 1.3403127193450928, "learning_rate": 1.343156516044532e-05, "loss": 0.9529, "step": 3060 }, { "epoch": 0.6666304760870745, "grad_norm": 1.3543404340744019, "learning_rate": 1.3409735865531544e-05, "loss": 0.9783, "step": 3070 }, { "epoch": 0.6688019108626024, "grad_norm": 1.1751652956008911, "learning_rate": 1.3387906570617771e-05, "loss": 1.0199, "step": 3080 }, { "epoch": 0.6709733456381304, "grad_norm": 1.44953453540802, "learning_rate": 1.3366077275703998e-05, "loss": 1.04, "step": 3090 }, { "epoch": 0.6731447804136583, "grad_norm": 1.2177844047546387, "learning_rate": 1.334424798079022e-05, "loss": 1.0228, "step": 3100 }, { "epoch": 0.6753162151891863, "grad_norm": 1.2051985263824463, "learning_rate": 1.3322418685876447e-05, "loss": 0.9834, "step": 3110 }, { "epoch": 0.6774876499647142, "grad_norm": 1.249619483947754, "learning_rate": 1.3300589390962672e-05, "loss": 1.0089, "step": 3120 }, { "epoch": 0.6796590847402422, "grad_norm": 1.3662210702896118, "learning_rate": 1.3278760096048899e-05, "loss": 1.0312, "step": 3130 }, { "epoch": 0.68183051951577, "grad_norm": 1.2683398723602295, "learning_rate": 1.3256930801135124e-05, "loss": 0.9806, "step": 3140 }, { "epoch": 0.684001954291298, "grad_norm": 1.3069689273834229, "learning_rate": 1.323510150622135e-05, "loss": 1.009, "step": 3150 }, { "epoch": 0.6861733890668259, "grad_norm": 1.4314109086990356, "learning_rate": 1.3213272211307576e-05, "loss": 0.9918, "step": 3160 }, { "epoch": 0.6883448238423538, "grad_norm": 1.2950971126556396, "learning_rate": 1.3191442916393802e-05, "loss": 1.0356, "step": 3170 }, { "epoch": 0.6905162586178818, "grad_norm": 1.4553663730621338, "learning_rate": 1.3169613621480026e-05, "loss": 1.0255, "step": 3180 }, { "epoch": 0.6926876933934097, "grad_norm": 1.439324140548706, "learning_rate": 1.3147784326566252e-05, "loss": 1.0561, "step": 3190 }, { "epoch": 0.6948591281689376, "grad_norm": 1.1153829097747803, "learning_rate": 1.3125955031652479e-05, "loss": 1.017, "step": 3200 }, { "epoch": 0.6970305629444655, "grad_norm": 1.1670260429382324, "learning_rate": 1.3104125736738704e-05, "loss": 1.0096, "step": 3210 }, { "epoch": 0.6992019977199935, "grad_norm": 1.400228500366211, "learning_rate": 1.308229644182493e-05, "loss": 1.0309, "step": 3220 }, { "epoch": 0.7013734324955214, "grad_norm": 1.1673344373703003, "learning_rate": 1.3060467146911156e-05, "loss": 1.0515, "step": 3230 }, { "epoch": 0.7035448672710494, "grad_norm": 1.152686357498169, "learning_rate": 1.3038637851997382e-05, "loss": 0.9687, "step": 3240 }, { "epoch": 0.7057163020465773, "grad_norm": 1.3322280645370483, "learning_rate": 1.3016808557083607e-05, "loss": 1.0255, "step": 3250 }, { "epoch": 0.7078877368221052, "grad_norm": 1.0270700454711914, "learning_rate": 1.2994979262169834e-05, "loss": 1.0124, "step": 3260 }, { "epoch": 0.7100591715976331, "grad_norm": 1.0481797456741333, "learning_rate": 1.2973149967256057e-05, "loss": 1.0298, "step": 3270 }, { "epoch": 0.712230606373161, "grad_norm": 1.358763337135315, "learning_rate": 1.2951320672342284e-05, "loss": 1.0009, "step": 3280 }, { "epoch": 0.714402041148689, "grad_norm": 1.3017981052398682, "learning_rate": 1.2929491377428509e-05, "loss": 1.0362, "step": 3290 }, { "epoch": 0.7165734759242169, "grad_norm": 1.4643291234970093, "learning_rate": 1.2907662082514736e-05, "loss": 0.96, "step": 3300 }, { "epoch": 0.7187449106997449, "grad_norm": 1.158682942390442, "learning_rate": 1.2885832787600962e-05, "loss": 0.9807, "step": 3310 }, { "epoch": 0.7209163454752728, "grad_norm": 1.2945632934570312, "learning_rate": 1.2864003492687187e-05, "loss": 0.977, "step": 3320 }, { "epoch": 0.7230877802508007, "grad_norm": 1.6654890775680542, "learning_rate": 1.2842174197773414e-05, "loss": 1.0128, "step": 3330 }, { "epoch": 0.7252592150263286, "grad_norm": 1.2067387104034424, "learning_rate": 1.2820344902859639e-05, "loss": 1.0261, "step": 3340 }, { "epoch": 0.7274306498018566, "grad_norm": 1.4484736919403076, "learning_rate": 1.2798515607945866e-05, "loss": 1.0055, "step": 3350 }, { "epoch": 0.7296020845773845, "grad_norm": 1.428499698638916, "learning_rate": 1.2776686313032089e-05, "loss": 1.0584, "step": 3360 }, { "epoch": 0.7317735193529125, "grad_norm": 1.454953670501709, "learning_rate": 1.2754857018118315e-05, "loss": 1.0327, "step": 3370 }, { "epoch": 0.7339449541284404, "grad_norm": 1.1868793964385986, "learning_rate": 1.273302772320454e-05, "loss": 1.019, "step": 3380 }, { "epoch": 0.7361163889039682, "grad_norm": 1.2822529077529907, "learning_rate": 1.2711198428290767e-05, "loss": 0.9966, "step": 3390 }, { "epoch": 0.7382878236794962, "grad_norm": 1.2787412405014038, "learning_rate": 1.2689369133376992e-05, "loss": 1.0473, "step": 3400 }, { "epoch": 0.7404592584550241, "grad_norm": 1.3772400617599487, "learning_rate": 1.2667539838463219e-05, "loss": 0.9689, "step": 3410 }, { "epoch": 0.7426306932305521, "grad_norm": 1.2161903381347656, "learning_rate": 1.2645710543549445e-05, "loss": 1.0082, "step": 3420 }, { "epoch": 0.74480212800608, "grad_norm": 1.489033579826355, "learning_rate": 1.262388124863567e-05, "loss": 1.0139, "step": 3430 }, { "epoch": 0.746973562781608, "grad_norm": 1.3982605934143066, "learning_rate": 1.2602051953721897e-05, "loss": 0.9921, "step": 3440 }, { "epoch": 0.7491449975571359, "grad_norm": 1.378158450126648, "learning_rate": 1.258022265880812e-05, "loss": 0.9678, "step": 3450 }, { "epoch": 0.7513164323326638, "grad_norm": 1.4947155714035034, "learning_rate": 1.2558393363894347e-05, "loss": 1.0051, "step": 3460 }, { "epoch": 0.7534878671081917, "grad_norm": 1.1531239748001099, "learning_rate": 1.2536564068980572e-05, "loss": 1.1186, "step": 3470 }, { "epoch": 0.7556593018837197, "grad_norm": 1.38021981716156, "learning_rate": 1.2514734774066799e-05, "loss": 0.9771, "step": 3480 }, { "epoch": 0.7578307366592476, "grad_norm": 1.2459088563919067, "learning_rate": 1.2492905479153024e-05, "loss": 1.0216, "step": 3490 }, { "epoch": 0.7600021714347756, "grad_norm": 2.1082191467285156, "learning_rate": 1.247107618423925e-05, "loss": 0.9956, "step": 3500 }, { "epoch": 0.7621736062103035, "grad_norm": 1.1670981645584106, "learning_rate": 1.2449246889325475e-05, "loss": 1.0518, "step": 3510 }, { "epoch": 0.7643450409858313, "grad_norm": 1.453430414199829, "learning_rate": 1.2427417594411702e-05, "loss": 0.986, "step": 3520 }, { "epoch": 0.7665164757613593, "grad_norm": 0.9967979788780212, "learning_rate": 1.2405588299497929e-05, "loss": 1.0468, "step": 3530 }, { "epoch": 0.7686879105368872, "grad_norm": 1.5002816915512085, "learning_rate": 1.2383759004584152e-05, "loss": 1.0078, "step": 3540 }, { "epoch": 0.7708593453124152, "grad_norm": 1.4501177072525024, "learning_rate": 1.236192970967038e-05, "loss": 0.9615, "step": 3550 }, { "epoch": 0.7730307800879431, "grad_norm": 1.377883791923523, "learning_rate": 1.2340100414756604e-05, "loss": 0.9432, "step": 3560 }, { "epoch": 0.7752022148634711, "grad_norm": 1.2856801748275757, "learning_rate": 1.231827111984283e-05, "loss": 0.9958, "step": 3570 }, { "epoch": 0.777373649638999, "grad_norm": 1.2119390964508057, "learning_rate": 1.2296441824929055e-05, "loss": 1.0029, "step": 3580 }, { "epoch": 0.7795450844145269, "grad_norm": 1.4396144151687622, "learning_rate": 1.2274612530015282e-05, "loss": 1.0159, "step": 3590 }, { "epoch": 0.7817165191900548, "grad_norm": 1.6045223474502563, "learning_rate": 1.2252783235101507e-05, "loss": 0.9767, "step": 3600 }, { "epoch": 0.7838879539655828, "grad_norm": 1.2426387071609497, "learning_rate": 1.2230953940187733e-05, "loss": 1.0287, "step": 3610 }, { "epoch": 0.7860593887411107, "grad_norm": 1.2435184717178345, "learning_rate": 1.2209124645273957e-05, "loss": 1.0052, "step": 3620 }, { "epoch": 0.7882308235166386, "grad_norm": 1.6939178705215454, "learning_rate": 1.2187295350360185e-05, "loss": 1.0216, "step": 3630 }, { "epoch": 0.7904022582921666, "grad_norm": 1.1843641996383667, "learning_rate": 1.2165466055446412e-05, "loss": 0.9738, "step": 3640 }, { "epoch": 0.7925736930676944, "grad_norm": 1.3802050352096558, "learning_rate": 1.2143636760532635e-05, "loss": 0.9216, "step": 3650 }, { "epoch": 0.7947451278432224, "grad_norm": 1.2471121549606323, "learning_rate": 1.2121807465618862e-05, "loss": 0.977, "step": 3660 }, { "epoch": 0.7969165626187503, "grad_norm": 1.3608779907226562, "learning_rate": 1.2099978170705087e-05, "loss": 1.009, "step": 3670 }, { "epoch": 0.7990879973942783, "grad_norm": 1.4472932815551758, "learning_rate": 1.2078148875791313e-05, "loss": 0.9946, "step": 3680 }, { "epoch": 0.8012594321698062, "grad_norm": 1.7036590576171875, "learning_rate": 1.2056319580877538e-05, "loss": 0.999, "step": 3690 }, { "epoch": 0.8034308669453342, "grad_norm": 1.258748173713684, "learning_rate": 1.2034490285963765e-05, "loss": 0.9968, "step": 3700 }, { "epoch": 0.8056023017208621, "grad_norm": 1.191994547843933, "learning_rate": 1.2012660991049988e-05, "loss": 0.9941, "step": 3710 }, { "epoch": 0.80777373649639, "grad_norm": 1.9393503665924072, "learning_rate": 1.1990831696136217e-05, "loss": 1.0167, "step": 3720 }, { "epoch": 0.8099451712719179, "grad_norm": 1.5484780073165894, "learning_rate": 1.196900240122244e-05, "loss": 0.9962, "step": 3730 }, { "epoch": 0.8121166060474458, "grad_norm": 1.2578662633895874, "learning_rate": 1.1947173106308667e-05, "loss": 1.0057, "step": 3740 }, { "epoch": 0.8142880408229738, "grad_norm": 1.3208587169647217, "learning_rate": 1.1925343811394893e-05, "loss": 1.0086, "step": 3750 }, { "epoch": 0.8164594755985017, "grad_norm": 1.2795675992965698, "learning_rate": 1.1903514516481118e-05, "loss": 1.051, "step": 3760 }, { "epoch": 0.8186309103740297, "grad_norm": 1.1958470344543457, "learning_rate": 1.1881685221567345e-05, "loss": 0.9974, "step": 3770 }, { "epoch": 0.8208023451495575, "grad_norm": 1.1479548215866089, "learning_rate": 1.185985592665357e-05, "loss": 0.9678, "step": 3780 }, { "epoch": 0.8229737799250855, "grad_norm": 1.616144061088562, "learning_rate": 1.1838026631739797e-05, "loss": 1.0586, "step": 3790 }, { "epoch": 0.8251452147006134, "grad_norm": 1.3224166631698608, "learning_rate": 1.1816197336826022e-05, "loss": 1.0533, "step": 3800 }, { "epoch": 0.8273166494761414, "grad_norm": 1.6158727407455444, "learning_rate": 1.1794368041912248e-05, "loss": 1.0128, "step": 3810 }, { "epoch": 0.8294880842516693, "grad_norm": 1.3982148170471191, "learning_rate": 1.1772538746998472e-05, "loss": 1.0249, "step": 3820 }, { "epoch": 0.8316595190271973, "grad_norm": 1.2071058750152588, "learning_rate": 1.1750709452084698e-05, "loss": 0.965, "step": 3830 }, { "epoch": 0.8338309538027252, "grad_norm": 1.3230708837509155, "learning_rate": 1.1728880157170925e-05, "loss": 0.9441, "step": 3840 }, { "epoch": 0.836002388578253, "grad_norm": 1.106053113937378, "learning_rate": 1.170705086225715e-05, "loss": 1.0488, "step": 3850 }, { "epoch": 0.838173823353781, "grad_norm": 1.5212702751159668, "learning_rate": 1.1685221567343377e-05, "loss": 1.0445, "step": 3860 }, { "epoch": 0.8403452581293089, "grad_norm": 1.3804950714111328, "learning_rate": 1.1663392272429601e-05, "loss": 1.0183, "step": 3870 }, { "epoch": 0.8425166929048369, "grad_norm": 1.3932008743286133, "learning_rate": 1.1641562977515828e-05, "loss": 1.0027, "step": 3880 }, { "epoch": 0.8446881276803648, "grad_norm": 1.3928159475326538, "learning_rate": 1.1619733682602053e-05, "loss": 0.9646, "step": 3890 }, { "epoch": 0.8468595624558928, "grad_norm": 1.3050851821899414, "learning_rate": 1.159790438768828e-05, "loss": 1.0099, "step": 3900 }, { "epoch": 0.8490309972314206, "grad_norm": 1.2780051231384277, "learning_rate": 1.1576075092774503e-05, "loss": 0.9859, "step": 3910 }, { "epoch": 0.8512024320069486, "grad_norm": 1.317460060119629, "learning_rate": 1.155424579786073e-05, "loss": 1.0021, "step": 3920 }, { "epoch": 0.8533738667824765, "grad_norm": 1.4765187501907349, "learning_rate": 1.1532416502946955e-05, "loss": 1.0305, "step": 3930 }, { "epoch": 0.8555453015580045, "grad_norm": 1.1514675617218018, "learning_rate": 1.1510587208033181e-05, "loss": 0.946, "step": 3940 }, { "epoch": 0.8577167363335324, "grad_norm": 1.3265900611877441, "learning_rate": 1.1488757913119408e-05, "loss": 0.9427, "step": 3950 }, { "epoch": 0.8598881711090604, "grad_norm": 1.4531445503234863, "learning_rate": 1.1466928618205633e-05, "loss": 1.0272, "step": 3960 }, { "epoch": 0.8620596058845882, "grad_norm": 1.0620979070663452, "learning_rate": 1.144509932329186e-05, "loss": 1.0114, "step": 3970 }, { "epoch": 0.8642310406601161, "grad_norm": 1.086349606513977, "learning_rate": 1.1423270028378085e-05, "loss": 0.9946, "step": 3980 }, { "epoch": 0.8664024754356441, "grad_norm": 1.3090065717697144, "learning_rate": 1.1401440733464311e-05, "loss": 0.9915, "step": 3990 }, { "epoch": 0.868573910211172, "grad_norm": 1.1086080074310303, "learning_rate": 1.1379611438550535e-05, "loss": 0.9599, "step": 4000 }, { "epoch": 0.8707453449867, "grad_norm": 1.4512288570404053, "learning_rate": 1.1357782143636761e-05, "loss": 1.0143, "step": 4010 }, { "epoch": 0.8729167797622279, "grad_norm": 1.2470262050628662, "learning_rate": 1.1335952848722986e-05, "loss": 0.9715, "step": 4020 }, { "epoch": 0.8750882145377559, "grad_norm": 1.5051038265228271, "learning_rate": 1.1314123553809213e-05, "loss": 1.0206, "step": 4030 }, { "epoch": 0.8772596493132837, "grad_norm": 1.607826828956604, "learning_rate": 1.1292294258895438e-05, "loss": 0.9833, "step": 4040 }, { "epoch": 0.8794310840888117, "grad_norm": 1.431874394416809, "learning_rate": 1.1270464963981665e-05, "loss": 1.0264, "step": 4050 }, { "epoch": 0.8816025188643396, "grad_norm": 1.440034031867981, "learning_rate": 1.1248635669067891e-05, "loss": 1.0013, "step": 4060 }, { "epoch": 0.8837739536398675, "grad_norm": 1.4963476657867432, "learning_rate": 1.1226806374154116e-05, "loss": 0.9861, "step": 4070 }, { "epoch": 0.8859453884153955, "grad_norm": 1.5683997869491577, "learning_rate": 1.1204977079240343e-05, "loss": 1.0247, "step": 4080 }, { "epoch": 0.8881168231909234, "grad_norm": 1.4047991037368774, "learning_rate": 1.1183147784326566e-05, "loss": 0.9966, "step": 4090 }, { "epoch": 0.8902882579664513, "grad_norm": 1.3178616762161255, "learning_rate": 1.1161318489412793e-05, "loss": 1.0107, "step": 4100 }, { "epoch": 0.8924596927419792, "grad_norm": 1.5227705240249634, "learning_rate": 1.1139489194499018e-05, "loss": 0.9826, "step": 4110 }, { "epoch": 0.8946311275175072, "grad_norm": 1.4800081253051758, "learning_rate": 1.1117659899585244e-05, "loss": 1.0544, "step": 4120 }, { "epoch": 0.8968025622930351, "grad_norm": 1.3340637683868408, "learning_rate": 1.109583060467147e-05, "loss": 1.0342, "step": 4130 }, { "epoch": 0.8989739970685631, "grad_norm": 1.6699985265731812, "learning_rate": 1.1074001309757696e-05, "loss": 0.9726, "step": 4140 }, { "epoch": 0.901145431844091, "grad_norm": 1.466199517250061, "learning_rate": 1.1052172014843921e-05, "loss": 0.9623, "step": 4150 }, { "epoch": 0.903316866619619, "grad_norm": 1.6779991388320923, "learning_rate": 1.1030342719930148e-05, "loss": 1.033, "step": 4160 }, { "epoch": 0.9054883013951468, "grad_norm": 1.338218331336975, "learning_rate": 1.1008513425016374e-05, "loss": 0.983, "step": 4170 }, { "epoch": 0.9076597361706747, "grad_norm": 1.430690884590149, "learning_rate": 1.0986684130102598e-05, "loss": 0.9673, "step": 4180 }, { "epoch": 0.9098311709462027, "grad_norm": 1.381343126296997, "learning_rate": 1.0964854835188824e-05, "loss": 0.9552, "step": 4190 }, { "epoch": 0.9120026057217306, "grad_norm": 1.2798620462417603, "learning_rate": 1.094302554027505e-05, "loss": 1.0115, "step": 4200 }, { "epoch": 0.9141740404972586, "grad_norm": 1.5903421640396118, "learning_rate": 1.0921196245361276e-05, "loss": 0.9903, "step": 4210 }, { "epoch": 0.9163454752727865, "grad_norm": 1.1908365488052368, "learning_rate": 1.0899366950447501e-05, "loss": 1.0046, "step": 4220 }, { "epoch": 0.9185169100483144, "grad_norm": 1.1967812776565552, "learning_rate": 1.0877537655533728e-05, "loss": 0.9842, "step": 4230 }, { "epoch": 0.9206883448238423, "grad_norm": 1.2975422143936157, "learning_rate": 1.0855708360619953e-05, "loss": 1.1223, "step": 4240 }, { "epoch": 0.9228597795993703, "grad_norm": 1.213766098022461, "learning_rate": 1.083387906570618e-05, "loss": 1.0106, "step": 4250 }, { "epoch": 0.9250312143748982, "grad_norm": 1.301695704460144, "learning_rate": 1.0812049770792403e-05, "loss": 0.9959, "step": 4260 }, { "epoch": 0.9272026491504262, "grad_norm": 1.3527394533157349, "learning_rate": 1.079022047587863e-05, "loss": 1.0124, "step": 4270 }, { "epoch": 0.9293740839259541, "grad_norm": 1.3432750701904297, "learning_rate": 1.0768391180964856e-05, "loss": 1.0047, "step": 4280 }, { "epoch": 0.9315455187014821, "grad_norm": 1.329483151435852, "learning_rate": 1.0746561886051081e-05, "loss": 1.0124, "step": 4290 }, { "epoch": 0.9337169534770099, "grad_norm": 1.430738091468811, "learning_rate": 1.0724732591137308e-05, "loss": 0.9462, "step": 4300 }, { "epoch": 0.9358883882525378, "grad_norm": 1.491452693939209, "learning_rate": 1.0702903296223533e-05, "loss": 0.9885, "step": 4310 }, { "epoch": 0.9380598230280658, "grad_norm": 1.4353605508804321, "learning_rate": 1.068107400130976e-05, "loss": 1.0131, "step": 4320 }, { "epoch": 0.9402312578035937, "grad_norm": 1.1809788942337036, "learning_rate": 1.0659244706395984e-05, "loss": 0.9926, "step": 4330 }, { "epoch": 0.9424026925791217, "grad_norm": 1.2355526685714722, "learning_rate": 1.0637415411482211e-05, "loss": 0.9945, "step": 4340 }, { "epoch": 0.9445741273546496, "grad_norm": 1.3314152956008911, "learning_rate": 1.0615586116568434e-05, "loss": 1.0037, "step": 4350 }, { "epoch": 0.9467455621301775, "grad_norm": 1.2427114248275757, "learning_rate": 1.059375682165466e-05, "loss": 1.0048, "step": 4360 }, { "epoch": 0.9489169969057054, "grad_norm": 1.298858642578125, "learning_rate": 1.0571927526740886e-05, "loss": 1.068, "step": 4370 }, { "epoch": 0.9510884316812334, "grad_norm": 1.432786226272583, "learning_rate": 1.0550098231827112e-05, "loss": 0.9922, "step": 4380 }, { "epoch": 0.9532598664567613, "grad_norm": 1.3567193746566772, "learning_rate": 1.0528268936913339e-05, "loss": 1.0097, "step": 4390 }, { "epoch": 0.9554313012322893, "grad_norm": 1.4737164974212646, "learning_rate": 1.0506439641999564e-05, "loss": 1.0053, "step": 4400 }, { "epoch": 0.9576027360078172, "grad_norm": 1.1993675231933594, "learning_rate": 1.048461034708579e-05, "loss": 1.0553, "step": 4410 }, { "epoch": 0.9597741707833451, "grad_norm": 1.483333945274353, "learning_rate": 1.0462781052172016e-05, "loss": 1.0163, "step": 4420 }, { "epoch": 0.961945605558873, "grad_norm": 1.4248449802398682, "learning_rate": 1.0440951757258242e-05, "loss": 1.0013, "step": 4430 }, { "epoch": 0.9641170403344009, "grad_norm": 1.4888718128204346, "learning_rate": 1.0419122462344466e-05, "loss": 1.0428, "step": 4440 }, { "epoch": 0.9662884751099289, "grad_norm": 1.2882726192474365, "learning_rate": 1.0397293167430692e-05, "loss": 0.9764, "step": 4450 }, { "epoch": 0.9684599098854568, "grad_norm": 1.3666644096374512, "learning_rate": 1.0375463872516917e-05, "loss": 1.0235, "step": 4460 }, { "epoch": 0.9706313446609848, "grad_norm": 1.5665643215179443, "learning_rate": 1.0353634577603144e-05, "loss": 0.9966, "step": 4470 }, { "epoch": 0.9728027794365127, "grad_norm": 1.4171271324157715, "learning_rate": 1.0331805282689369e-05, "loss": 1.021, "step": 4480 }, { "epoch": 0.9749742142120406, "grad_norm": 1.4926506280899048, "learning_rate": 1.0309975987775596e-05, "loss": 0.9794, "step": 4490 }, { "epoch": 0.9771456489875685, "grad_norm": 1.1166307926177979, "learning_rate": 1.0288146692861822e-05, "loss": 1.0138, "step": 4500 }, { "epoch": 0.9793170837630965, "grad_norm": 1.515855312347412, "learning_rate": 1.0266317397948047e-05, "loss": 0.9691, "step": 4510 }, { "epoch": 0.9814885185386244, "grad_norm": 1.421080231666565, "learning_rate": 1.0244488103034274e-05, "loss": 0.9646, "step": 4520 }, { "epoch": 0.9836599533141523, "grad_norm": 1.4241400957107544, "learning_rate": 1.0222658808120497e-05, "loss": 0.9447, "step": 4530 }, { "epoch": 0.9858313880896803, "grad_norm": 1.6205312013626099, "learning_rate": 1.0200829513206724e-05, "loss": 0.9843, "step": 4540 }, { "epoch": 0.9880028228652082, "grad_norm": 1.3039618730545044, "learning_rate": 1.0179000218292949e-05, "loss": 1.0065, "step": 4550 }, { "epoch": 0.9901742576407361, "grad_norm": 1.4685053825378418, "learning_rate": 1.0157170923379176e-05, "loss": 0.9925, "step": 4560 }, { "epoch": 0.992345692416264, "grad_norm": 1.2964003086090088, "learning_rate": 1.01353416284654e-05, "loss": 1.0104, "step": 4570 }, { "epoch": 0.994517127191792, "grad_norm": 1.4937127828598022, "learning_rate": 1.0113512333551627e-05, "loss": 1.0642, "step": 4580 }, { "epoch": 0.9966885619673199, "grad_norm": 1.2731589078903198, "learning_rate": 1.0091683038637854e-05, "loss": 1.0486, "step": 4590 }, { "epoch": 0.9988599967428479, "grad_norm": 1.3573518991470337, "learning_rate": 1.0069853743724079e-05, "loss": 0.9839, "step": 4600 }, { "epoch": 1.0008685739102112, "grad_norm": 1.6150940656661987, "learning_rate": 1.0048024448810306e-05, "loss": 1.0595, "step": 4610 }, { "epoch": 1.0030400086857392, "grad_norm": 1.435672640800476, "learning_rate": 1.0026195153896529e-05, "loss": 1.0018, "step": 4620 }, { "epoch": 1.0052114434612671, "grad_norm": 1.3522926568984985, "learning_rate": 1.0004365858982757e-05, "loss": 0.9845, "step": 4630 }, { "epoch": 1.007382878236795, "grad_norm": 1.327671766281128, "learning_rate": 9.98253656406898e-06, "loss": 0.947, "step": 4640 }, { "epoch": 1.0095543130123228, "grad_norm": 1.40632164478302, "learning_rate": 9.960707269155207e-06, "loss": 0.955, "step": 4650 }, { "epoch": 1.0117257477878507, "grad_norm": 1.7449159622192383, "learning_rate": 9.938877974241434e-06, "loss": 0.9406, "step": 4660 }, { "epoch": 1.0138971825633787, "grad_norm": 1.410897135734558, "learning_rate": 9.917048679327659e-06, "loss": 1.0253, "step": 4670 }, { "epoch": 1.0160686173389066, "grad_norm": 1.3368771076202393, "learning_rate": 9.895219384413884e-06, "loss": 0.9922, "step": 4680 }, { "epoch": 1.0182400521144346, "grad_norm": 1.2922542095184326, "learning_rate": 9.87339008950011e-06, "loss": 1.0153, "step": 4690 }, { "epoch": 1.0204114868899625, "grad_norm": 1.4930267333984375, "learning_rate": 9.851560794586335e-06, "loss": 0.9725, "step": 4700 }, { "epoch": 1.0225829216654905, "grad_norm": 1.2955012321472168, "learning_rate": 9.829731499672562e-06, "loss": 0.9858, "step": 4710 }, { "epoch": 1.0247543564410184, "grad_norm": 1.5806477069854736, "learning_rate": 9.807902204758787e-06, "loss": 0.9046, "step": 4720 }, { "epoch": 1.0269257912165464, "grad_norm": 1.3869348764419556, "learning_rate": 9.786072909845012e-06, "loss": 1.0147, "step": 4730 }, { "epoch": 1.0290972259920743, "grad_norm": 1.4592316150665283, "learning_rate": 9.764243614931239e-06, "loss": 0.9497, "step": 4740 }, { "epoch": 1.0312686607676023, "grad_norm": 1.9150491952896118, "learning_rate": 9.742414320017464e-06, "loss": 0.9584, "step": 4750 }, { "epoch": 1.0334400955431302, "grad_norm": 1.2069435119628906, "learning_rate": 9.72058502510369e-06, "loss": 0.9853, "step": 4760 }, { "epoch": 1.0356115303186582, "grad_norm": 1.521933674812317, "learning_rate": 9.698755730189915e-06, "loss": 0.9611, "step": 4770 }, { "epoch": 1.0377829650941859, "grad_norm": 1.9448108673095703, "learning_rate": 9.676926435276142e-06, "loss": 1.0092, "step": 4780 }, { "epoch": 1.0399543998697138, "grad_norm": 1.577696442604065, "learning_rate": 9.655097140362367e-06, "loss": 1.0072, "step": 4790 }, { "epoch": 1.0421258346452418, "grad_norm": 1.9846240282058716, "learning_rate": 9.633267845448594e-06, "loss": 0.9533, "step": 4800 }, { "epoch": 1.0442972694207697, "grad_norm": 1.4275234937667847, "learning_rate": 9.611438550534819e-06, "loss": 0.9662, "step": 4810 }, { "epoch": 1.0464687041962977, "grad_norm": 1.548954963684082, "learning_rate": 9.589609255621044e-06, "loss": 0.9454, "step": 4820 }, { "epoch": 1.0486401389718256, "grad_norm": 1.8117595911026, "learning_rate": 9.56777996070727e-06, "loss": 0.9623, "step": 4830 }, { "epoch": 1.0508115737473536, "grad_norm": 1.417375087738037, "learning_rate": 9.545950665793495e-06, "loss": 0.9791, "step": 4840 }, { "epoch": 1.0529830085228815, "grad_norm": 1.2770414352416992, "learning_rate": 9.52412137087972e-06, "loss": 0.8828, "step": 4850 }, { "epoch": 1.0551544432984095, "grad_norm": 1.3013825416564941, "learning_rate": 9.502292075965947e-06, "loss": 1.029, "step": 4860 }, { "epoch": 1.0573258780739374, "grad_norm": 1.5322422981262207, "learning_rate": 9.480462781052174e-06, "loss": 0.9592, "step": 4870 }, { "epoch": 1.0594973128494654, "grad_norm": 1.7801984548568726, "learning_rate": 9.458633486138398e-06, "loss": 0.9531, "step": 4880 }, { "epoch": 1.0616687476249933, "grad_norm": 2.0160224437713623, "learning_rate": 9.436804191224625e-06, "loss": 0.9471, "step": 4890 }, { "epoch": 1.0638401824005213, "grad_norm": 1.4919092655181885, "learning_rate": 9.41497489631085e-06, "loss": 0.9719, "step": 4900 }, { "epoch": 1.066011617176049, "grad_norm": 1.379225730895996, "learning_rate": 9.393145601397075e-06, "loss": 0.9406, "step": 4910 }, { "epoch": 1.068183051951577, "grad_norm": 1.4247862100601196, "learning_rate": 9.371316306483302e-06, "loss": 0.975, "step": 4920 }, { "epoch": 1.0703544867271049, "grad_norm": 1.290443778038025, "learning_rate": 9.349487011569527e-06, "loss": 0.9636, "step": 4930 }, { "epoch": 1.0725259215026328, "grad_norm": 1.2737443447113037, "learning_rate": 9.327657716655752e-06, "loss": 0.9779, "step": 4940 }, { "epoch": 1.0746973562781608, "grad_norm": 1.1298906803131104, "learning_rate": 9.305828421741978e-06, "loss": 0.9705, "step": 4950 }, { "epoch": 1.0768687910536887, "grad_norm": 1.368236780166626, "learning_rate": 9.283999126828203e-06, "loss": 0.9791, "step": 4960 }, { "epoch": 1.0790402258292167, "grad_norm": 1.3343724012374878, "learning_rate": 9.26216983191443e-06, "loss": 1.0074, "step": 4970 }, { "epoch": 1.0812116606047446, "grad_norm": 1.547235369682312, "learning_rate": 9.240340537000657e-06, "loss": 0.9545, "step": 4980 }, { "epoch": 1.0833830953802726, "grad_norm": 1.8547582626342773, "learning_rate": 9.218511242086882e-06, "loss": 0.9981, "step": 4990 }, { "epoch": 1.0855545301558005, "grad_norm": 1.3031221628189087, "learning_rate": 9.196681947173107e-06, "loss": 0.9537, "step": 5000 } ], "logging_steps": 10, "max_steps": 9212, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.20553175074816e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }