{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.000629326620516, "eval_steps": 500, "global_step": 1589, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0062932662051604785, "grad_norm": 39.40880584716797, "learning_rate": 5.660377358490567e-07, "loss": 1.2507, "step": 10 }, { "epoch": 0.012586532410320957, "grad_norm": 12.689669609069824, "learning_rate": 1.1949685534591195e-06, "loss": 0.9818, "step": 20 }, { "epoch": 0.018879798615481436, "grad_norm": 1.60954749584198, "learning_rate": 1.8238993710691824e-06, "loss": 0.6639, "step": 30 }, { "epoch": 0.025173064820641914, "grad_norm": 0.9736618995666504, "learning_rate": 2.4528301886792453e-06, "loss": 0.603, "step": 40 }, { "epoch": 0.03146633102580239, "grad_norm": 0.9699676632881165, "learning_rate": 3.0817610062893084e-06, "loss": 0.5679, "step": 50 }, { "epoch": 0.03775959723096287, "grad_norm": 0.8372435569763184, "learning_rate": 3.710691823899371e-06, "loss": 0.5549, "step": 60 }, { "epoch": 0.04405286343612335, "grad_norm": 0.8186138272285461, "learning_rate": 4.339622641509435e-06, "loss": 0.5552, "step": 70 }, { "epoch": 0.05034612964128383, "grad_norm": 0.7362136840820312, "learning_rate": 4.968553459119497e-06, "loss": 0.558, "step": 80 }, { "epoch": 0.056639395846444306, "grad_norm": 0.8293086290359497, "learning_rate": 5.59748427672956e-06, "loss": 0.5432, "step": 90 }, { "epoch": 0.06293266205160478, "grad_norm": 0.7764604091644287, "learning_rate": 6.226415094339623e-06, "loss": 0.541, "step": 100 }, { "epoch": 0.06922592825676527, "grad_norm": 0.8436954021453857, "learning_rate": 6.855345911949685e-06, "loss": 0.5457, "step": 110 }, { "epoch": 0.07551919446192575, "grad_norm": 0.7573267817497253, "learning_rate": 7.484276729559748e-06, "loss": 0.5285, "step": 120 }, { "epoch": 0.08181246066708622, "grad_norm": 0.8208069801330566, "learning_rate": 8.113207547169812e-06, "loss": 0.5352, "step": 130 }, { "epoch": 0.0881057268722467, "grad_norm": 0.759560227394104, "learning_rate": 8.742138364779875e-06, "loss": 0.5333, "step": 140 }, { "epoch": 0.09439899307740718, "grad_norm": 0.8434644341468811, "learning_rate": 9.371069182389939e-06, "loss": 0.5372, "step": 150 }, { "epoch": 0.10069225928256766, "grad_norm": 0.8114253878593445, "learning_rate": 1e-05, "loss": 0.5339, "step": 160 }, { "epoch": 0.10698552548772813, "grad_norm": 0.8041621446609497, "learning_rate": 9.998793436421342e-06, "loss": 0.5371, "step": 170 }, { "epoch": 0.11327879169288861, "grad_norm": 0.782455563545227, "learning_rate": 9.99517432800363e-06, "loss": 0.5224, "step": 180 }, { "epoch": 0.11957205789804909, "grad_norm": 0.802542507648468, "learning_rate": 9.98914442142063e-06, "loss": 0.5209, "step": 190 }, { "epoch": 0.12586532410320955, "grad_norm": 0.8419063091278076, "learning_rate": 9.980706626858607e-06, "loss": 0.5261, "step": 200 }, { "epoch": 0.13215859030837004, "grad_norm": 0.7716870307922363, "learning_rate": 9.9698650166118e-06, "loss": 0.522, "step": 210 }, { "epoch": 0.13845185651353054, "grad_norm": 0.774776816368103, "learning_rate": 9.956624823117036e-06, "loss": 0.5305, "step": 220 }, { "epoch": 0.144745122718691, "grad_norm": 0.7823233008384705, "learning_rate": 9.94099243642841e-06, "loss": 0.5247, "step": 230 }, { "epoch": 0.1510383889238515, "grad_norm": 0.7220829725265503, "learning_rate": 9.922975401133292e-06, "loss": 0.5286, "step": 240 }, { "epoch": 0.15733165512901195, "grad_norm": 0.7797294855117798, "learning_rate": 9.90258241271112e-06, "loss": 0.5299, "step": 250 }, { "epoch": 0.16362492133417245, "grad_norm": 0.7687580585479736, "learning_rate": 9.879823313336723e-06, "loss": 0.5262, "step": 260 }, { "epoch": 0.1699181875393329, "grad_norm": 0.7156737446784973, "learning_rate": 9.854709087130261e-06, "loss": 0.5227, "step": 270 }, { "epoch": 0.1762114537444934, "grad_norm": 0.747580885887146, "learning_rate": 9.827251854855992e-06, "loss": 0.5186, "step": 280 }, { "epoch": 0.18250471994965387, "grad_norm": 0.7566559910774231, "learning_rate": 9.797464868072489e-06, "loss": 0.5127, "step": 290 }, { "epoch": 0.18879798615481436, "grad_norm": 0.747591495513916, "learning_rate": 9.765362502737098e-06, "loss": 0.5167, "step": 300 }, { "epoch": 0.19509125235997482, "grad_norm": 0.732440173625946, "learning_rate": 9.730960252267744e-06, "loss": 0.5225, "step": 310 }, { "epoch": 0.2013845185651353, "grad_norm": 0.7387551069259644, "learning_rate": 9.6942747200654e-06, "loss": 0.5149, "step": 320 }, { "epoch": 0.20767778477029578, "grad_norm": 0.7358985543251038, "learning_rate": 9.655323611500876e-06, "loss": 0.518, "step": 330 }, { "epoch": 0.21397105097545627, "grad_norm": 0.7722839117050171, "learning_rate": 9.614125725369748e-06, "loss": 0.5095, "step": 340 }, { "epoch": 0.22026431718061673, "grad_norm": 0.677197277545929, "learning_rate": 9.570700944819584e-06, "loss": 0.5233, "step": 350 }, { "epoch": 0.22655758338577722, "grad_norm": 0.6825560331344604, "learning_rate": 9.525070227753835e-06, "loss": 0.5125, "step": 360 }, { "epoch": 0.2328508495909377, "grad_norm": 0.6920183300971985, "learning_rate": 9.477255596717012e-06, "loss": 0.5191, "step": 370 }, { "epoch": 0.23914411579609818, "grad_norm": 0.7336747646331787, "learning_rate": 9.427280128266049e-06, "loss": 0.5163, "step": 380 }, { "epoch": 0.24543738200125864, "grad_norm": 0.7665858268737793, "learning_rate": 9.375167941832974e-06, "loss": 0.5062, "step": 390 }, { "epoch": 0.2517306482064191, "grad_norm": 0.6906554102897644, "learning_rate": 9.320944188084241e-06, "loss": 0.518, "step": 400 }, { "epoch": 0.2580239144115796, "grad_norm": 0.7612572312355042, "learning_rate": 9.264635036782406e-06, "loss": 0.5042, "step": 410 }, { "epoch": 0.2643171806167401, "grad_norm": 0.7517194747924805, "learning_rate": 9.206267664155906e-06, "loss": 0.5221, "step": 420 }, { "epoch": 0.27061044682190055, "grad_norm": 0.7678345441818237, "learning_rate": 9.145870239783143e-06, "loss": 0.5172, "step": 430 }, { "epoch": 0.27690371302706107, "grad_norm": 0.7215328812599182, "learning_rate": 9.08347191299711e-06, "loss": 0.5143, "step": 440 }, { "epoch": 0.28319697923222154, "grad_norm": 0.6326926350593567, "learning_rate": 9.019102798817196e-06, "loss": 0.5164, "step": 450 }, { "epoch": 0.289490245437382, "grad_norm": 0.689453661441803, "learning_rate": 8.952793963414908e-06, "loss": 0.5179, "step": 460 }, { "epoch": 0.29578351164254246, "grad_norm": 0.7151985168457031, "learning_rate": 8.884577409120535e-06, "loss": 0.5073, "step": 470 }, { "epoch": 0.302076777847703, "grad_norm": 0.7656172513961792, "learning_rate": 8.814486058978035e-06, "loss": 0.5042, "step": 480 }, { "epoch": 0.30837004405286345, "grad_norm": 0.680549144744873, "learning_rate": 8.742553740855507e-06, "loss": 0.5191, "step": 490 }, { "epoch": 0.3146633102580239, "grad_norm": 0.7366177439689636, "learning_rate": 8.66881517111902e-06, "loss": 0.5163, "step": 500 }, { "epoch": 0.3209565764631844, "grad_norm": 0.8044482469558716, "learning_rate": 8.593305937877614e-06, "loss": 0.5152, "step": 510 }, { "epoch": 0.3272498426683449, "grad_norm": 0.6989796161651611, "learning_rate": 8.516062483807556e-06, "loss": 0.5192, "step": 520 }, { "epoch": 0.33354310887350536, "grad_norm": 0.705839216709137, "learning_rate": 8.437122088564197e-06, "loss": 0.5054, "step": 530 }, { "epoch": 0.3398363750786658, "grad_norm": 0.6799296736717224, "learning_rate": 8.356522850789852e-06, "loss": 0.5032, "step": 540 }, { "epoch": 0.3461296412838263, "grad_norm": 0.8019563555717468, "learning_rate": 8.274303669726427e-06, "loss": 0.5113, "step": 550 }, { "epoch": 0.3524229074889868, "grad_norm": 0.7109248638153076, "learning_rate": 8.190504226441654e-06, "loss": 0.5029, "step": 560 }, { "epoch": 0.35871617369414727, "grad_norm": 0.7193357944488525, "learning_rate": 8.105164964678009e-06, "loss": 0.5127, "step": 570 }, { "epoch": 0.36500943989930773, "grad_norm": 0.6586730480194092, "learning_rate": 8.018327071333521e-06, "loss": 0.5178, "step": 580 }, { "epoch": 0.3713027061044682, "grad_norm": 0.7992932796478271, "learning_rate": 7.930032456583931e-06, "loss": 0.5064, "step": 590 }, { "epoch": 0.3775959723096287, "grad_norm": 0.6866645812988281, "learning_rate": 7.84032373365578e-06, "loss": 0.5025, "step": 600 }, { "epoch": 0.3838892385147892, "grad_norm": 0.662344217300415, "learning_rate": 7.749244198260175e-06, "loss": 0.5103, "step": 610 }, { "epoch": 0.39018250471994964, "grad_norm": 0.6587361693382263, "learning_rate": 7.656837807697187e-06, "loss": 0.5129, "step": 620 }, { "epoch": 0.3964757709251101, "grad_norm": 0.6918533444404602, "learning_rate": 7.563149159640929e-06, "loss": 0.5053, "step": 630 }, { "epoch": 0.4027690371302706, "grad_norm": 0.6420175433158875, "learning_rate": 7.468223470615593e-06, "loss": 0.5223, "step": 640 }, { "epoch": 0.4090623033354311, "grad_norm": 0.7031328678131104, "learning_rate": 7.372106554172802e-06, "loss": 0.5024, "step": 650 }, { "epoch": 0.41535556954059155, "grad_norm": 0.7460775375366211, "learning_rate": 7.274844798780826e-06, "loss": 0.5123, "step": 660 }, { "epoch": 0.42164883574575207, "grad_norm": 0.6937898397445679, "learning_rate": 7.176485145436325e-06, "loss": 0.5051, "step": 670 }, { "epoch": 0.42794210195091253, "grad_norm": 0.623894453048706, "learning_rate": 7.0770750650094335e-06, "loss": 0.5059, "step": 680 }, { "epoch": 0.434235368156073, "grad_norm": 0.6496269106864929, "learning_rate": 6.976662535333107e-06, "loss": 0.4999, "step": 690 }, { "epoch": 0.44052863436123346, "grad_norm": 0.6723958253860474, "learning_rate": 6.87529601804781e-06, "loss": 0.5054, "step": 700 }, { "epoch": 0.446821900566394, "grad_norm": 0.6890814900398254, "learning_rate": 6.773024435212678e-06, "loss": 0.5066, "step": 710 }, { "epoch": 0.45311516677155445, "grad_norm": 0.6805148720741272, "learning_rate": 6.669897145694507e-06, "loss": 0.5086, "step": 720 }, { "epoch": 0.4594084329767149, "grad_norm": 0.6633646488189697, "learning_rate": 6.565963921345896e-06, "loss": 0.4939, "step": 730 }, { "epoch": 0.4657016991818754, "grad_norm": 0.6664919257164001, "learning_rate": 6.461274922984087e-06, "loss": 0.4995, "step": 740 }, { "epoch": 0.4719949653870359, "grad_norm": 0.6816923022270203, "learning_rate": 6.355880676182086e-06, "loss": 0.5038, "step": 750 }, { "epoch": 0.47828823159219636, "grad_norm": 0.6514876484870911, "learning_rate": 6.249832046883729e-06, "loss": 0.5011, "step": 760 }, { "epoch": 0.4845814977973568, "grad_norm": 0.6344130039215088, "learning_rate": 6.143180216854488e-06, "loss": 0.5034, "step": 770 }, { "epoch": 0.4908747640025173, "grad_norm": 0.6643583178520203, "learning_rate": 6.035976658979846e-06, "loss": 0.4956, "step": 780 }, { "epoch": 0.4971680302076778, "grad_norm": 0.7020254731178284, "learning_rate": 5.928273112423177e-06, "loss": 0.497, "step": 790 }, { "epoch": 1.0037759597230962, "grad_norm": 0.6873272657394409, "learning_rate": 5.820121557655109e-06, "loss": 0.5445, "step": 800 }, { "epoch": 1.0100692259282569, "grad_norm": 0.6778249144554138, "learning_rate": 5.711574191366427e-06, "loss": 0.4808, "step": 810 }, { "epoch": 1.0163624921334173, "grad_norm": 0.6917534470558167, "learning_rate": 5.6026834012766155e-06, "loss": 0.4871, "step": 820 }, { "epoch": 1.0226557583385778, "grad_norm": 0.6982170343399048, "learning_rate": 5.493501740850228e-06, "loss": 0.4768, "step": 830 }, { "epoch": 1.0289490245437383, "grad_norm": 0.622083842754364, "learning_rate": 5.384081903933235e-06, "loss": 0.4874, "step": 840 }, { "epoch": 1.0352422907488987, "grad_norm": 0.682299792766571, "learning_rate": 5.274476699321638e-06, "loss": 0.4787, "step": 850 }, { "epoch": 1.0415355569540592, "grad_norm": 0.719980776309967, "learning_rate": 5.164739025274604e-06, "loss": 0.4731, "step": 860 }, { "epoch": 1.0478288231592197, "grad_norm": 0.7684125304222107, "learning_rate": 5.0549218439844185e-06, "loss": 0.4858, "step": 870 }, { "epoch": 1.0541220893643801, "grad_norm": 0.6164060831069946, "learning_rate": 4.945078156015582e-06, "loss": 0.4803, "step": 880 }, { "epoch": 1.0604153555695406, "grad_norm": 0.7356188297271729, "learning_rate": 4.835260974725397e-06, "loss": 0.4756, "step": 890 }, { "epoch": 1.066708621774701, "grad_norm": 0.6549850106239319, "learning_rate": 4.7255233006783626e-06, "loss": 0.4665, "step": 900 }, { "epoch": 1.0730018879798615, "grad_norm": 0.6597391366958618, "learning_rate": 4.615918096066766e-06, "loss": 0.4669, "step": 910 }, { "epoch": 1.079295154185022, "grad_norm": 0.6646954417228699, "learning_rate": 4.506498259149774e-06, "loss": 0.4717, "step": 920 }, { "epoch": 1.0855884203901824, "grad_norm": 0.6768482327461243, "learning_rate": 4.397316598723385e-06, "loss": 0.4702, "step": 930 }, { "epoch": 1.091881686595343, "grad_norm": 0.7066707611083984, "learning_rate": 4.2884258086335755e-06, "loss": 0.4502, "step": 940 }, { "epoch": 1.0981749528005034, "grad_norm": 0.6753413677215576, "learning_rate": 4.179878442344892e-06, "loss": 0.4721, "step": 950 }, { "epoch": 1.104468219005664, "grad_norm": 0.6423931121826172, "learning_rate": 4.071726887576823e-06, "loss": 0.4661, "step": 960 }, { "epoch": 1.1107614852108245, "grad_norm": 0.686931848526001, "learning_rate": 3.9640233410201555e-06, "loss": 0.4684, "step": 970 }, { "epoch": 1.117054751415985, "grad_norm": 0.6282669901847839, "learning_rate": 3.856819783145514e-06, "loss": 0.4621, "step": 980 }, { "epoch": 1.1233480176211454, "grad_norm": 0.7436355352401733, "learning_rate": 3.750167953116272e-06, "loss": 0.4575, "step": 990 }, { "epoch": 1.129641283826306, "grad_norm": 0.6427481174468994, "learning_rate": 3.6441193238179152e-06, "loss": 0.4591, "step": 1000 }, { "epoch": 1.1359345500314664, "grad_norm": 0.6406493782997131, "learning_rate": 3.5387250770159152e-06, "loss": 0.4503, "step": 1010 }, { "epoch": 1.1422278162366268, "grad_norm": 0.6536152958869934, "learning_rate": 3.4340360786541067e-06, "loss": 0.454, "step": 1020 }, { "epoch": 1.1485210824417873, "grad_norm": 0.7142040729522705, "learning_rate": 3.3301028543054935e-06, "loss": 0.4527, "step": 1030 }, { "epoch": 1.1548143486469478, "grad_norm": 0.6241782903671265, "learning_rate": 3.226975564787322e-06, "loss": 0.4472, "step": 1040 }, { "epoch": 1.1611076148521082, "grad_norm": 0.6668282747268677, "learning_rate": 3.1247039819521907e-06, "loss": 0.4509, "step": 1050 }, { "epoch": 1.1674008810572687, "grad_norm": 0.6013820171356201, "learning_rate": 3.0233374646668935e-06, "loss": 0.443, "step": 1060 }, { "epoch": 1.1736941472624292, "grad_norm": 0.6819528341293335, "learning_rate": 2.9229249349905686e-06, "loss": 0.4598, "step": 1070 }, { "epoch": 1.1799874134675896, "grad_norm": 0.6443890929222107, "learning_rate": 2.8235148545636776e-06, "loss": 0.447, "step": 1080 }, { "epoch": 1.18628067967275, "grad_norm": 0.6288453340530396, "learning_rate": 2.7251552012191763e-06, "loss": 0.4563, "step": 1090 }, { "epoch": 1.1925739458779105, "grad_norm": 0.661491334438324, "learning_rate": 2.6278934458271998e-06, "loss": 0.4443, "step": 1100 }, { "epoch": 1.198867212083071, "grad_norm": 0.6220525503158569, "learning_rate": 2.531776529384407e-06, "loss": 0.446, "step": 1110 }, { "epoch": 1.2051604782882315, "grad_norm": 0.6786297559738159, "learning_rate": 2.436850840359073e-06, "loss": 0.4464, "step": 1120 }, { "epoch": 1.2114537444933922, "grad_norm": 0.7809886336326599, "learning_rate": 2.3431621923028146e-06, "loss": 0.4554, "step": 1130 }, { "epoch": 1.2177470106985526, "grad_norm": 0.7114007472991943, "learning_rate": 2.2507558017398263e-06, "loss": 0.4405, "step": 1140 }, { "epoch": 1.224040276903713, "grad_norm": 0.726741373538971, "learning_rate": 2.159676266344222e-06, "loss": 0.4463, "step": 1150 }, { "epoch": 1.2303335431088736, "grad_norm": 0.6209679841995239, "learning_rate": 2.06996754341607e-06, "loss": 0.4601, "step": 1160 }, { "epoch": 1.236626809314034, "grad_norm": 0.7238234877586365, "learning_rate": 1.98167292866648e-06, "loss": 0.4498, "step": 1170 }, { "epoch": 1.2429200755191945, "grad_norm": 0.663245439529419, "learning_rate": 1.8948350353219913e-06, "loss": 0.4507, "step": 1180 }, { "epoch": 1.249213341724355, "grad_norm": 0.6694537997245789, "learning_rate": 1.8094957735583463e-06, "loss": 0.4616, "step": 1190 }, { "epoch": 1.2555066079295154, "grad_norm": 0.6908589005470276, "learning_rate": 1.7256963302735752e-06, "loss": 0.439, "step": 1200 }, { "epoch": 1.2617998741346759, "grad_norm": 0.6885989308357239, "learning_rate": 1.6434771492101487e-06, "loss": 0.4444, "step": 1210 }, { "epoch": 1.2680931403398363, "grad_norm": 0.6936271786689758, "learning_rate": 1.5628779114358034e-06, "loss": 0.4535, "step": 1220 }, { "epoch": 1.2743864065449968, "grad_norm": 0.6543680429458618, "learning_rate": 1.4839375161924446e-06, "loss": 0.4584, "step": 1230 }, { "epoch": 1.2806796727501573, "grad_norm": 0.6488659381866455, "learning_rate": 1.406694062122389e-06, "loss": 0.4532, "step": 1240 }, { "epoch": 1.286972938955318, "grad_norm": 0.668251097202301, "learning_rate": 1.3311848288809815e-06, "loss": 0.4432, "step": 1250 }, { "epoch": 1.2932662051604784, "grad_norm": 0.6807104349136353, "learning_rate": 1.257446259144494e-06, "loss": 0.4324, "step": 1260 }, { "epoch": 1.2995594713656389, "grad_norm": 0.6319746971130371, "learning_rate": 1.1855139410219657e-06, "loss": 0.4493, "step": 1270 }, { "epoch": 1.3058527375707993, "grad_norm": 0.6163645386695862, "learning_rate": 1.115422590879464e-06, "loss": 0.4501, "step": 1280 }, { "epoch": 1.3121460037759598, "grad_norm": 0.6366046071052551, "learning_rate": 1.047206036585095e-06, "loss": 0.4423, "step": 1290 }, { "epoch": 1.3184392699811203, "grad_norm": 0.5919917821884155, "learning_rate": 9.808972011828055e-07, "loss": 0.4379, "step": 1300 }, { "epoch": 1.3247325361862807, "grad_norm": 0.6659424304962158, "learning_rate": 9.165280870028919e-07, "loss": 0.4548, "step": 1310 }, { "epoch": 1.3310258023914412, "grad_norm": 0.6823071837425232, "learning_rate": 8.541297602168591e-07, "loss": 0.464, "step": 1320 }, { "epoch": 1.3373190685966017, "grad_norm": 0.6970245838165283, "learning_rate": 7.937323358440935e-07, "loss": 0.4598, "step": 1330 }, { "epoch": 1.3436123348017621, "grad_norm": 0.6151891946792603, "learning_rate": 7.353649632175957e-07, "loss": 0.4569, "step": 1340 }, { "epoch": 1.3499056010069226, "grad_norm": 0.6397150754928589, "learning_rate": 6.790558119157597e-07, "loss": 0.4598, "step": 1350 }, { "epoch": 1.356198867212083, "grad_norm": 0.6420609951019287, "learning_rate": 6.248320581670281e-07, "loss": 0.4576, "step": 1360 }, { "epoch": 1.3624921334172435, "grad_norm": 0.6434526443481445, "learning_rate": 5.727198717339511e-07, "loss": 0.4544, "step": 1370 }, { "epoch": 1.368785399622404, "grad_norm": 0.6515443921089172, "learning_rate": 5.227444032829887e-07, "loss": 0.4462, "step": 1380 }, { "epoch": 1.3750786658275644, "grad_norm": 0.59568852186203, "learning_rate": 4.74929772246166e-07, "loss": 0.4697, "step": 1390 }, { "epoch": 1.381371932032725, "grad_norm": 0.6043298244476318, "learning_rate": 4.2929905518041714e-07, "loss": 0.4506, "step": 1400 }, { "epoch": 1.3876651982378854, "grad_norm": 0.5912688970565796, "learning_rate": 3.858742746302535e-07, "loss": 0.4358, "step": 1410 }, { "epoch": 1.3939584644430458, "grad_norm": 0.6092295050621033, "learning_rate": 3.44676388499125e-07, "loss": 0.4545, "step": 1420 }, { "epoch": 1.4002517306482063, "grad_norm": 0.6405302882194519, "learning_rate": 3.0572527993460054e-07, "loss": 0.4584, "step": 1430 }, { "epoch": 1.406544996853367, "grad_norm": 0.6120900511741638, "learning_rate": 2.6903974773225703e-07, "loss": 0.4461, "step": 1440 }, { "epoch": 1.4128382630585274, "grad_norm": 0.594711184501648, "learning_rate": 2.3463749726290287e-07, "loss": 0.4519, "step": 1450 }, { "epoch": 1.419131529263688, "grad_norm": 0.6510460376739502, "learning_rate": 2.0253513192751374e-07, "loss": 0.4509, "step": 1460 }, { "epoch": 1.4254247954688484, "grad_norm": 0.574932873249054, "learning_rate": 1.7274814514400995e-07, "loss": 0.4511, "step": 1470 }, { "epoch": 1.4317180616740088, "grad_norm": 0.6897699236869812, "learning_rate": 1.4529091286973994e-07, "loss": 0.4496, "step": 1480 }, { "epoch": 1.4380113278791693, "grad_norm": 0.6335061192512512, "learning_rate": 1.2017668666327752e-07, "loss": 0.4481, "step": 1490 }, { "epoch": 1.4443045940843298, "grad_norm": 0.6672943830490112, "learning_rate": 9.741758728888218e-08, "loss": 0.4518, "step": 1500 }, { "epoch": 1.4505978602894902, "grad_norm": 0.673734188079834, "learning_rate": 7.702459886670788e-08, "loss": 0.4495, "step": 1510 }, { "epoch": 1.4568911264946507, "grad_norm": 0.6775258183479309, "learning_rate": 5.900756357159143e-08, "loss": 0.458, "step": 1520 }, { "epoch": 1.4631843926998112, "grad_norm": 0.588603138923645, "learning_rate": 4.337517688296544e-08, "loss": 0.4543, "step": 1530 }, { "epoch": 1.4694776589049716, "grad_norm": 0.642877459526062, "learning_rate": 3.013498338820031e-08, "loss": 0.4522, "step": 1540 }, { "epoch": 1.475770925110132, "grad_norm": 0.6829048991203308, "learning_rate": 1.9293373141394124e-08, "loss": 0.4583, "step": 1550 }, { "epoch": 1.4820641913152928, "grad_norm": 0.6072986721992493, "learning_rate": 1.0855578579370696e-08, "loss": 0.4504, "step": 1560 }, { "epoch": 1.4883574575204532, "grad_norm": 0.6607363820075989, "learning_rate": 4.825671996370184e-09, "loss": 0.4618, "step": 1570 }, { "epoch": 1.4946507237256137, "grad_norm": 0.6756806969642639, "learning_rate": 1.2065635786595586e-09, "loss": 0.4539, "step": 1580 }, { "epoch": 2.000629326620516, "step": 1589, "total_flos": 208043401019392.0, "train_loss": 0.4968605734703299, "train_runtime": 50248.0079, "train_samples_per_second": 1.012, "train_steps_per_second": 0.032 } ], "logging_steps": 10, "max_steps": 1589, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 208043401019392.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }