{ "best_metric": 0.09759029000997543, "best_model_checkpoint": "./xlam_lora_new_2560_1_delete_over_size_3epoch_multi_t2/checkpoint-1384", "epoch": 2.9994592321095954, "eval_steps": 173, "global_step": 1560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019227302769933306, "grad_norm": 0.8343315720558167, "learning_rate": 3.846153846153846e-06, "loss": 0.6641, "step": 10 }, { "epoch": 0.03845460553986661, "grad_norm": 0.461275190114975, "learning_rate": 7.692307692307692e-06, "loss": 0.6363, "step": 20 }, { "epoch": 0.05768190830979992, "grad_norm": 0.5201115608215332, "learning_rate": 1.153846153846154e-05, "loss": 0.6127, "step": 30 }, { "epoch": 0.07690921107973323, "grad_norm": 0.3447195589542389, "learning_rate": 1.5384615384615384e-05, "loss": 0.5328, "step": 40 }, { "epoch": 0.09613651384966652, "grad_norm": 0.37843698263168335, "learning_rate": 1.923076923076923e-05, "loss": 0.4072, "step": 50 }, { "epoch": 0.11536381661959984, "grad_norm": 0.3421487510204315, "learning_rate": 2.307692307692308e-05, "loss": 0.3608, "step": 60 }, { "epoch": 0.13459111938953314, "grad_norm": 0.28197693824768066, "learning_rate": 2.6923076923076923e-05, "loss": 0.3121, "step": 70 }, { "epoch": 0.15381842215946645, "grad_norm": 0.2969784438610077, "learning_rate": 2.999986518943083e-05, "loss": 0.2089, "step": 80 }, { "epoch": 0.17304572492939974, "grad_norm": 0.35552915930747986, "learning_rate": 2.999514707393943e-05, "loss": 0.238, "step": 90 }, { "epoch": 0.19227302769933305, "grad_norm": 0.36513009667396545, "learning_rate": 2.9983690852978995e-05, "loss": 0.2228, "step": 100 }, { "epoch": 0.21150033046926636, "grad_norm": 0.3532279431819916, "learning_rate": 2.996550167443001e-05, "loss": 0.2321, "step": 110 }, { "epoch": 0.23072763323919968, "grad_norm": 0.35095521807670593, "learning_rate": 2.9940587711643693e-05, "loss": 0.1888, "step": 120 }, { "epoch": 0.24995493600913296, "grad_norm": 0.2918124198913574, "learning_rate": 2.9908960159769243e-05, "loss": 0.1585, "step": 130 }, { "epoch": 0.2691822387790663, "grad_norm": 0.372545063495636, "learning_rate": 2.9870633230723313e-05, "loss": 0.1973, "step": 140 }, { "epoch": 0.28840954154899956, "grad_norm": 0.36561667919158936, "learning_rate": 2.9825624146803807e-05, "loss": 0.1859, "step": 150 }, { "epoch": 0.3076368443189329, "grad_norm": 0.38816386461257935, "learning_rate": 2.977395313295105e-05, "loss": 0.1965, "step": 160 }, { "epoch": 0.3268641470888662, "grad_norm": 0.3820708096027374, "learning_rate": 2.971564340765961e-05, "loss": 0.1516, "step": 170 }, { "epoch": 0.3326323379198462, "eval_loss": 0.13869456946849823, "eval_runtime": 203.7684, "eval_samples_per_second": 5.261, "eval_steps_per_second": 5.261, "step": 173 }, { "epoch": 0.34609144985879947, "grad_norm": 0.41539186239242554, "learning_rate": 2.9650721172545048e-05, "loss": 0.1895, "step": 180 }, { "epoch": 0.3653187526287328, "grad_norm": 0.4073317348957062, "learning_rate": 2.9579215600570152e-05, "loss": 0.1545, "step": 190 }, { "epoch": 0.3845460553986661, "grad_norm": 0.3595292866230011, "learning_rate": 2.950115882293597e-05, "loss": 0.1629, "step": 200 }, { "epoch": 0.40377335816859944, "grad_norm": 0.28112220764160156, "learning_rate": 2.9416585914643627e-05, "loss": 0.1576, "step": 210 }, { "epoch": 0.4230006609385327, "grad_norm": 0.4690268933773041, "learning_rate": 2.932553487873324e-05, "loss": 0.1614, "step": 220 }, { "epoch": 0.442227963708466, "grad_norm": 0.5640648007392883, "learning_rate": 2.922804662920718e-05, "loss": 0.1689, "step": 230 }, { "epoch": 0.46145526647839935, "grad_norm": 0.6498159766197205, "learning_rate": 2.912416497264529e-05, "loss": 0.1704, "step": 240 }, { "epoch": 0.48068256924833264, "grad_norm": 0.46260717511177063, "learning_rate": 2.9013936588520235e-05, "loss": 0.1938, "step": 250 }, { "epoch": 0.4999098720182659, "grad_norm": 0.44871020317077637, "learning_rate": 2.8897411008222026e-05, "loss": 0.1674, "step": 260 }, { "epoch": 0.5191371747881992, "grad_norm": 0.5815374255180359, "learning_rate": 2.8774640592800948e-05, "loss": 0.1424, "step": 270 }, { "epoch": 0.5383644775581325, "grad_norm": 0.5914287567138672, "learning_rate": 2.864568050943899e-05, "loss": 0.1818, "step": 280 }, { "epoch": 0.5575917803280659, "grad_norm": 0.5079910755157471, "learning_rate": 2.8510588706660338e-05, "loss": 0.1633, "step": 290 }, { "epoch": 0.5768190830979991, "grad_norm": 0.5987181067466736, "learning_rate": 2.836942588829208e-05, "loss": 0.1455, "step": 300 }, { "epoch": 0.5960463858679325, "grad_norm": 0.6342700719833374, "learning_rate": 2.8222255486186798e-05, "loss": 0.1522, "step": 310 }, { "epoch": 0.6152736886378658, "grad_norm": 0.5555654764175415, "learning_rate": 2.8069143631719276e-05, "loss": 0.1394, "step": 320 }, { "epoch": 0.634500991407799, "grad_norm": 0.5811598300933838, "learning_rate": 2.7910159126070257e-05, "loss": 0.1623, "step": 330 }, { "epoch": 0.6537282941777324, "grad_norm": 0.7482380270957947, "learning_rate": 2.774537340931043e-05, "loss": 0.1458, "step": 340 }, { "epoch": 0.6652646758396924, "eval_loss": 0.11664145439863205, "eval_runtime": 203.7283, "eval_samples_per_second": 5.262, "eval_steps_per_second": 5.262, "step": 346 }, { "epoch": 0.6729555969476657, "grad_norm": 0.5891281962394714, "learning_rate": 2.7574860528298677e-05, "loss": 0.1406, "step": 350 }, { "epoch": 0.6921828997175989, "grad_norm": 0.5311967730522156, "learning_rate": 2.739869710340894e-05, "loss": 0.1525, "step": 360 }, { "epoch": 0.7114102024875323, "grad_norm": 0.5632440447807312, "learning_rate": 2.7216962294100668e-05, "loss": 0.1392, "step": 370 }, { "epoch": 0.7306375052574656, "grad_norm": 0.6121944785118103, "learning_rate": 2.7029737763348316e-05, "loss": 0.1602, "step": 380 }, { "epoch": 0.7498648080273989, "grad_norm": 0.6687933802604675, "learning_rate": 2.6837107640945904e-05, "loss": 0.1583, "step": 390 }, { "epoch": 0.7690921107973322, "grad_norm": 0.5596562623977661, "learning_rate": 2.6639158485703087e-05, "loss": 0.1667, "step": 400 }, { "epoch": 0.7883194135672655, "grad_norm": 0.7156023979187012, "learning_rate": 2.6435979246549727e-05, "loss": 0.1438, "step": 410 }, { "epoch": 0.8075467163371989, "grad_norm": 0.5293470621109009, "learning_rate": 2.6227661222566516e-05, "loss": 0.1865, "step": 420 }, { "epoch": 0.8267740191071321, "grad_norm": 0.5734898447990417, "learning_rate": 2.6014298021959482e-05, "loss": 0.1477, "step": 430 }, { "epoch": 0.8460013218770654, "grad_norm": 0.5020838975906372, "learning_rate": 2.5795985519996915e-05, "loss": 0.1303, "step": 440 }, { "epoch": 0.8652286246469988, "grad_norm": 0.6881216764450073, "learning_rate": 2.5572821815927615e-05, "loss": 0.1429, "step": 450 }, { "epoch": 0.884455927416932, "grad_norm": 0.4911053776741028, "learning_rate": 2.5344907188899715e-05, "loss": 0.1547, "step": 460 }, { "epoch": 0.9036832301868654, "grad_norm": 0.8948251605033875, "learning_rate": 2.511234405290005e-05, "loss": 0.136, "step": 470 }, { "epoch": 0.9229105329567987, "grad_norm": 0.4923257529735565, "learning_rate": 2.4875236910734145e-05, "loss": 0.123, "step": 480 }, { "epoch": 0.9421378357267319, "grad_norm": 0.6984175443649292, "learning_rate": 2.4633692307067654e-05, "loss": 0.1519, "step": 490 }, { "epoch": 0.9613651384966653, "grad_norm": 0.6080285310745239, "learning_rate": 2.4387818780550236e-05, "loss": 0.1267, "step": 500 }, { "epoch": 0.9805924412665986, "grad_norm": 0.6914392113685608, "learning_rate": 2.4137726815043483e-05, "loss": 0.1664, "step": 510 }, { "epoch": 0.9978970137595385, "eval_loss": 0.10869105905294418, "eval_runtime": 203.0899, "eval_samples_per_second": 5.278, "eval_steps_per_second": 5.278, "step": 519 }, { "epoch": 0.9998197440365318, "grad_norm": 0.5257564187049866, "learning_rate": 2.3883528789974703e-05, "loss": 0.1474, "step": 520 }, { "epoch": 1.0190470468064652, "grad_norm": 0.5023784041404724, "learning_rate": 2.3625338929838952e-05, "loss": 0.1057, "step": 530 }, { "epoch": 1.0382743495763984, "grad_norm": 0.6757215857505798, "learning_rate": 2.3363273252872003e-05, "loss": 0.1477, "step": 540 }, { "epoch": 1.0575016523463319, "grad_norm": 0.5462861657142639, "learning_rate": 2.3097449518917257e-05, "loss": 0.1205, "step": 550 }, { "epoch": 1.076728955116265, "grad_norm": 0.7116460800170898, "learning_rate": 2.2827987176510082e-05, "loss": 0.1164, "step": 560 }, { "epoch": 1.0959562578861983, "grad_norm": 0.5546866655349731, "learning_rate": 2.255500730920332e-05, "loss": 0.1304, "step": 570 }, { "epoch": 1.1151835606561318, "grad_norm": 0.4724363088607788, "learning_rate": 2.2278632581158095e-05, "loss": 0.1295, "step": 580 }, { "epoch": 1.134410863426065, "grad_norm": 1.0140602588653564, "learning_rate": 2.1998987182024384e-05, "loss": 0.1122, "step": 590 }, { "epoch": 1.1536381661959982, "grad_norm": 0.790867805480957, "learning_rate": 2.1716196771136115e-05, "loss": 0.1169, "step": 600 }, { "epoch": 1.1728654689659317, "grad_norm": 0.6885173320770264, "learning_rate": 2.1430388421045812e-05, "loss": 0.1352, "step": 610 }, { "epoch": 1.192092771735865, "grad_norm": 0.6807064414024353, "learning_rate": 2.1141690560424253e-05, "loss": 0.1226, "step": 620 }, { "epoch": 1.2113200745057981, "grad_norm": 0.5460578799247742, "learning_rate": 2.0850232916350735e-05, "loss": 0.1214, "step": 630 }, { "epoch": 1.2305473772757316, "grad_norm": 0.7612866163253784, "learning_rate": 2.05561464560199e-05, "loss": 0.1164, "step": 640 }, { "epoch": 1.2497746800456648, "grad_norm": 0.38693496584892273, "learning_rate": 2.025956332789132e-05, "loss": 0.1398, "step": 650 }, { "epoch": 1.269001982815598, "grad_norm": 0.5924756526947021, "learning_rate": 1.996061680230823e-05, "loss": 0.1214, "step": 660 }, { "epoch": 1.2882292855855315, "grad_norm": 0.7164785861968994, "learning_rate": 1.9659441211612234e-05, "loss": 0.1226, "step": 670 }, { "epoch": 1.3074565883554647, "grad_norm": 0.5729460716247559, "learning_rate": 1.93561718897807e-05, "loss": 0.1481, "step": 680 }, { "epoch": 1.326683891125398, "grad_norm": 0.6892575025558472, "learning_rate": 1.9050945111614142e-05, "loss": 0.1498, "step": 690 }, { "epoch": 1.3305293516793848, "eval_loss": 0.10392692685127258, "eval_runtime": 203.0603, "eval_samples_per_second": 5.279, "eval_steps_per_second": 5.279, "step": 692 }, { "epoch": 1.3459111938953314, "grad_norm": 0.5614696145057678, "learning_rate": 1.8743898031500772e-05, "loss": 0.1105, "step": 700 }, { "epoch": 1.3651384966652647, "grad_norm": 0.6355635523796082, "learning_rate": 1.843516862178589e-05, "loss": 0.1291, "step": 710 }, { "epoch": 1.3843657994351979, "grad_norm": 0.7176327109336853, "learning_rate": 1.8124895610773645e-05, "loss": 0.1387, "step": 720 }, { "epoch": 1.4035931022051313, "grad_norm": 0.9504517316818237, "learning_rate": 1.781321842038914e-05, "loss": 0.1346, "step": 730 }, { "epoch": 1.4228204049750646, "grad_norm": 0.7893795371055603, "learning_rate": 1.7500277103528883e-05, "loss": 0.1224, "step": 740 }, { "epoch": 1.4420477077449978, "grad_norm": 0.5944446921348572, "learning_rate": 1.718621228112764e-05, "loss": 0.1095, "step": 750 }, { "epoch": 1.4612750105149312, "grad_norm": 0.5783366560935974, "learning_rate": 1.6871165078970118e-05, "loss": 0.1116, "step": 760 }, { "epoch": 1.4805023132848645, "grad_norm": 0.6842564940452576, "learning_rate": 1.6555277064275717e-05, "loss": 0.1215, "step": 770 }, { "epoch": 1.4997296160547977, "grad_norm": 0.5818539261817932, "learning_rate": 1.623869018208499e-05, "loss": 0.1283, "step": 780 }, { "epoch": 1.5189569188247312, "grad_norm": 0.658789336681366, "learning_rate": 1.5921546691476264e-05, "loss": 0.1168, "step": 790 }, { "epoch": 1.5381842215946644, "grad_norm": 0.7144546508789062, "learning_rate": 1.5603989101641228e-05, "loss": 0.1247, "step": 800 }, { "epoch": 1.5574115243645976, "grad_norm": 0.5796612501144409, "learning_rate": 1.5286160107848036e-05, "loss": 0.1279, "step": 810 }, { "epoch": 1.576638827134531, "grad_norm": 0.6537405252456665, "learning_rate": 1.4968202527320868e-05, "loss": 0.1396, "step": 820 }, { "epoch": 1.5958661299044643, "grad_norm": 0.7590240836143494, "learning_rate": 1.4650259235064662e-05, "loss": 0.1183, "step": 830 }, { "epoch": 1.6150934326743975, "grad_norm": 0.6850148439407349, "learning_rate": 1.43324730996639e-05, "loss": 0.1277, "step": 840 }, { "epoch": 1.634320735444331, "grad_norm": 0.7500022053718567, "learning_rate": 1.4014986919084228e-05, "loss": 0.1285, "step": 850 }, { "epoch": 1.6535480382142642, "grad_norm": 0.6234251856803894, "learning_rate": 1.3697943356505897e-05, "loss": 0.1071, "step": 860 }, { "epoch": 1.663161689599231, "eval_loss": 0.10122876614332199, "eval_runtime": 203.2008, "eval_samples_per_second": 5.276, "eval_steps_per_second": 5.276, "step": 865 }, { "epoch": 1.6727753409841974, "grad_norm": 1.0110090970993042, "learning_rate": 1.3381484876217669e-05, "loss": 0.1252, "step": 870 }, { "epoch": 1.692002643754131, "grad_norm": 0.8749274611473083, "learning_rate": 1.3065753679600186e-05, "loss": 0.1086, "step": 880 }, { "epoch": 1.7112299465240641, "grad_norm": 0.563439667224884, "learning_rate": 1.2750891641227418e-05, "loss": 0.1273, "step": 890 }, { "epoch": 1.7304572492939974, "grad_norm": 0.6679959297180176, "learning_rate": 1.2437040245114966e-05, "loss": 0.1124, "step": 900 }, { "epoch": 1.7496845520639308, "grad_norm": 0.8824312090873718, "learning_rate": 1.2124340521143929e-05, "loss": 0.1275, "step": 910 }, { "epoch": 1.768911854833864, "grad_norm": 0.6557831168174744, "learning_rate": 1.1812932981688715e-05, "loss": 0.1207, "step": 920 }, { "epoch": 1.7881391576037973, "grad_norm": 0.5608255863189697, "learning_rate": 1.1502957558477537e-05, "loss": 0.1095, "step": 930 }, { "epoch": 1.8073664603737307, "grad_norm": 0.8327426910400391, "learning_rate": 1.119455353971371e-05, "loss": 0.1423, "step": 940 }, { "epoch": 1.826593763143664, "grad_norm": 0.7187633514404297, "learning_rate": 1.0887859507486183e-05, "loss": 0.1142, "step": 950 }, { "epoch": 1.8458210659135972, "grad_norm": 0.7449970841407776, "learning_rate": 1.0583013275497318e-05, "loss": 0.1315, "step": 960 }, { "epoch": 1.8650483686835306, "grad_norm": 0.5967345237731934, "learning_rate": 1.0280151827136e-05, "loss": 0.1147, "step": 970 }, { "epoch": 1.884275671453464, "grad_norm": 0.8269909024238586, "learning_rate": 9.979411253923813e-06, "loss": 0.131, "step": 980 }, { "epoch": 1.903502974223397, "grad_norm": 0.6085448861122131, "learning_rate": 9.680926694361966e-06, "loss": 0.1339, "step": 990 }, { "epoch": 1.9227302769933305, "grad_norm": 0.7495784163475037, "learning_rate": 9.384832273206514e-06, "loss": 0.1324, "step": 1000 }, { "epoch": 1.941957579763264, "grad_norm": 0.8735560178756714, "learning_rate": 9.091261041199051e-06, "loss": 0.1225, "step": 1010 }, { "epoch": 1.961184882533197, "grad_norm": 0.7350926995277405, "learning_rate": 8.80034491528005e-06, "loss": 0.1108, "step": 1020 }, { "epoch": 1.9804121853031305, "grad_norm": 0.7938897013664246, "learning_rate": 8.51221461931167e-06, "loss": 0.1416, "step": 1030 }, { "epoch": 1.995794027519077, "eval_loss": 0.09892405569553375, "eval_runtime": 202.9814, "eval_samples_per_second": 5.281, "eval_steps_per_second": 5.281, "step": 1038 }, { "epoch": 1.999639488073064, "grad_norm": 1.0423219203948975, "learning_rate": 8.226999625336663e-06, "loss": 0.1597, "step": 1040 }, { "epoch": 2.018866790842997, "grad_norm": 0.7138562202453613, "learning_rate": 7.944828095399802e-06, "loss": 0.1114, "step": 1050 }, { "epoch": 2.0380940936129304, "grad_norm": 0.6143700480461121, "learning_rate": 7.66582682395797e-06, "loss": 0.1066, "step": 1060 }, { "epoch": 2.057321396382864, "grad_norm": 0.8447745442390442, "learning_rate": 7.390121180904763e-06, "loss": 0.113, "step": 1070 }, { "epoch": 2.076548699152797, "grad_norm": 0.41642722487449646, "learning_rate": 7.117835055235195e-06, "loss": 0.1095, "step": 1080 }, { "epoch": 2.0957760019227303, "grad_norm": 0.7106382250785828, "learning_rate": 6.849090799375931e-06, "loss": 0.1214, "step": 1090 }, { "epoch": 2.1150033046926637, "grad_norm": 0.6263849139213562, "learning_rate": 6.584009174205888e-06, "loss": 0.1293, "step": 1100 }, { "epoch": 2.1342306074625967, "grad_norm": 0.7726497054100037, "learning_rate": 6.322709294792051e-06, "loss": 0.1394, "step": 1110 }, { "epoch": 2.15345791023253, "grad_norm": 0.7134016752243042, "learning_rate": 6.065308576864859e-06, "loss": 0.1039, "step": 1120 }, { "epoch": 2.1726852130024636, "grad_norm": 0.6412186026573181, "learning_rate": 5.811922684057118e-06, "loss": 0.1151, "step": 1130 }, { "epoch": 2.1919125157723967, "grad_norm": 0.9640927314758301, "learning_rate": 5.5626654759303085e-06, "loss": 0.1247, "step": 1140 }, { "epoch": 2.21113981854233, "grad_norm": 0.8550817370414734, "learning_rate": 5.3176489568115e-06, "loss": 0.1069, "step": 1150 }, { "epoch": 2.2303671213122636, "grad_norm": 0.6239781975746155, "learning_rate": 5.0769832254639355e-06, "loss": 0.1013, "step": 1160 }, { "epoch": 2.2495944240821966, "grad_norm": 0.7141818404197693, "learning_rate": 4.840776425613887e-06, "loss": 0.0976, "step": 1170 }, { "epoch": 2.26882172685213, "grad_norm": 0.48725616931915283, "learning_rate": 4.609134697356009e-06, "loss": 0.1049, "step": 1180 }, { "epoch": 2.2880490296220635, "grad_norm": 0.8563340902328491, "learning_rate": 4.382162129459055e-06, "loss": 0.0988, "step": 1190 }, { "epoch": 2.3072763323919965, "grad_norm": 0.7721908092498779, "learning_rate": 4.159960712593301e-06, "loss": 0.1022, "step": 1200 }, { "epoch": 2.32650363516193, "grad_norm": 0.6547017097473145, "learning_rate": 3.942630293500821e-06, "loss": 0.1321, "step": 1210 }, { "epoch": 2.3284263654389235, "eval_loss": 0.09838072210550308, "eval_runtime": 203.6342, "eval_samples_per_second": 5.264, "eval_steps_per_second": 5.264, "step": 1211 }, { "epoch": 2.3457309379318634, "grad_norm": 0.6637281775474548, "learning_rate": 3.730268530129097e-06, "loss": 0.0987, "step": 1220 }, { "epoch": 2.3649582407017964, "grad_norm": 0.9455267786979675, "learning_rate": 3.522970847748196e-06, "loss": 0.1286, "step": 1230 }, { "epoch": 2.38418554347173, "grad_norm": 0.7048280239105225, "learning_rate": 3.3208303960711895e-06, "loss": 0.0998, "step": 1240 }, { "epoch": 2.4034128462416633, "grad_norm": 0.8539944291114807, "learning_rate": 3.1239380073971e-06, "loss": 0.114, "step": 1250 }, { "epoch": 2.4226401490115963, "grad_norm": 0.8466408252716064, "learning_rate": 2.9323821557952007e-06, "loss": 0.0986, "step": 1260 }, { "epoch": 2.4418674517815298, "grad_norm": 0.6942047476768494, "learning_rate": 2.7462489173489636e-06, "loss": 0.1112, "step": 1270 }, { "epoch": 2.461094754551463, "grad_norm": 0.7220749258995056, "learning_rate": 2.5656219314775886e-06, "loss": 0.1083, "step": 1280 }, { "epoch": 2.480322057321396, "grad_norm": 0.8154662847518921, "learning_rate": 2.3905823633523997e-06, "loss": 0.0981, "step": 1290 }, { "epoch": 2.4995493600913297, "grad_norm": 0.7933881282806396, "learning_rate": 2.221208867425096e-06, "loss": 0.0965, "step": 1300 }, { "epoch": 2.518776662861263, "grad_norm": 0.770427405834198, "learning_rate": 2.0575775520841878e-06, "loss": 0.1399, "step": 1310 }, { "epoch": 2.538003965631196, "grad_norm": 0.7757827043533325, "learning_rate": 1.8997619454554955e-06, "loss": 0.1022, "step": 1320 }, { "epoch": 2.5572312684011296, "grad_norm": 0.8519064784049988, "learning_rate": 1.7478329623621226e-06, "loss": 0.1114, "step": 1330 }, { "epoch": 2.576458571171063, "grad_norm": 0.4863261878490448, "learning_rate": 1.601858872458702e-06, "loss": 0.0964, "step": 1340 }, { "epoch": 2.5956858739409965, "grad_norm": 0.7736539244651794, "learning_rate": 1.4619052695542612e-06, "loss": 0.1062, "step": 1350 }, { "epoch": 2.6149131767109295, "grad_norm": 0.8441415429115295, "learning_rate": 1.3280350421374888e-06, "loss": 0.1158, "step": 1360 }, { "epoch": 2.634140479480863, "grad_norm": 1.0856326818466187, "learning_rate": 1.2003083451176366e-06, "loss": 0.1314, "step": 1370 }, { "epoch": 2.653367782250796, "grad_norm": 0.6666831374168396, "learning_rate": 1.0787825727937783e-06, "loss": 0.0889, "step": 1380 }, { "epoch": 2.6610587033587696, "eval_loss": 0.09759029000997543, "eval_runtime": 202.9122, "eval_samples_per_second": 5.283, "eval_steps_per_second": 5.283, "step": 1384 }, { "epoch": 2.6725950850207294, "grad_norm": 0.7304587960243225, "learning_rate": 9.635123330645218e-07, "loss": 0.1098, "step": 1390 }, { "epoch": 2.691822387790663, "grad_norm": 0.7729344964027405, "learning_rate": 8.545494228898448e-07, "loss": 0.0874, "step": 1400 }, { "epoch": 2.7110496905605963, "grad_norm": 0.8187854886054993, "learning_rate": 7.519428050159765e-07, "loss": 0.1295, "step": 1410 }, { "epoch": 2.7302769933305293, "grad_norm": 0.5940708518028259, "learning_rate": 6.557385859738985e-07, "loss": 0.1194, "step": 1420 }, { "epoch": 2.7495042961004628, "grad_norm": 0.8048242926597595, "learning_rate": 5.659799953612438e-07, "loss": 0.1112, "step": 1430 }, { "epoch": 2.7687315988703958, "grad_norm": 0.65644371509552, "learning_rate": 4.827073664169812e-07, "loss": 0.1061, "step": 1440 }, { "epoch": 2.7879589016403292, "grad_norm": 0.733321487903595, "learning_rate": 4.059581178975741e-07, "loss": 0.1187, "step": 1450 }, { "epoch": 2.8071862044102627, "grad_norm": 0.48416727781295776, "learning_rate": 3.357667372627754e-07, "loss": 0.1183, "step": 1460 }, { "epoch": 2.826413507180196, "grad_norm": 0.5195744633674622, "learning_rate": 2.7216476517860245e-07, "loss": 0.0869, "step": 1470 }, { "epoch": 2.845640809950129, "grad_norm": 0.6493935585021973, "learning_rate": 2.151807813444606e-07, "loss": 0.0867, "step": 1480 }, { "epoch": 2.8648681127200626, "grad_norm": 0.7516520023345947, "learning_rate": 1.6484039165079455e-07, "loss": 0.1259, "step": 1490 }, { "epoch": 2.8840954154899956, "grad_norm": 0.7778034210205078, "learning_rate": 1.211662166730071e-07, "loss": 0.1229, "step": 1500 }, { "epoch": 2.903322718259929, "grad_norm": 0.6024238467216492, "learning_rate": 8.417788150686001e-08, "loss": 0.0972, "step": 1510 }, { "epoch": 2.9225500210298625, "grad_norm": 0.6645819544792175, "learning_rate": 5.389200694988494e-08, "loss": 0.1297, "step": 1520 }, { "epoch": 2.941777323799796, "grad_norm": 0.9006750583648682, "learning_rate": 3.032220203278924e-08, "loss": 0.1088, "step": 1530 }, { "epoch": 2.961004626569729, "grad_norm": 0.7198818325996399, "learning_rate": 1.3479057904204339e-08, "loss": 0.1041, "step": 1540 }, { "epoch": 2.9802319293396624, "grad_norm": 0.6966450810432434, "learning_rate": 3.3701430715277202e-09, "loss": 0.09, "step": 1550 }, { "epoch": 2.9936910412786157, "eval_loss": 0.09760043770074844, "eval_runtime": 202.9049, "eval_samples_per_second": 5.283, "eval_steps_per_second": 5.283, "step": 1557 }, { "epoch": 2.9994592321095954, "grad_norm": 0.7270023822784424, "learning_rate": 0.0, "loss": 0.1184, "step": 1560 }, { "epoch": 2.9994592321095954, "step": 1560, "total_flos": 1.3623219564340838e+18, "train_loss": 0.1486816066579941, "train_runtime": 34006.2437, "train_samples_per_second": 1.468, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 1560, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 173, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3623219564340838e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }