{
  "best_metric": 0.09759029000997543,
  "best_model_checkpoint": "./xlam_lora_new_2560_1_delete_over_size_3epoch_multi_t2/checkpoint-1384",
  "epoch": 2.9994592321095954,
  "eval_steps": 173,
  "global_step": 1560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019227302769933306,
      "grad_norm": 0.8343315720558167,
      "learning_rate": 3.846153846153846e-06,
      "loss": 0.6641,
      "step": 10
    },
    {
      "epoch": 0.03845460553986661,
      "grad_norm": 0.461275190114975,
      "learning_rate": 7.692307692307692e-06,
      "loss": 0.6363,
      "step": 20
    },
    {
      "epoch": 0.05768190830979992,
      "grad_norm": 0.5201115608215332,
      "learning_rate": 1.153846153846154e-05,
      "loss": 0.6127,
      "step": 30
    },
    {
      "epoch": 0.07690921107973323,
      "grad_norm": 0.3447195589542389,
      "learning_rate": 1.5384615384615384e-05,
      "loss": 0.5328,
      "step": 40
    },
    {
      "epoch": 0.09613651384966652,
      "grad_norm": 0.37843698263168335,
      "learning_rate": 1.923076923076923e-05,
      "loss": 0.4072,
      "step": 50
    },
    {
      "epoch": 0.11536381661959984,
      "grad_norm": 0.3421487510204315,
      "learning_rate": 2.307692307692308e-05,
      "loss": 0.3608,
      "step": 60
    },
    {
      "epoch": 0.13459111938953314,
      "grad_norm": 0.28197693824768066,
      "learning_rate": 2.6923076923076923e-05,
      "loss": 0.3121,
      "step": 70
    },
    {
      "epoch": 0.15381842215946645,
      "grad_norm": 0.2969784438610077,
      "learning_rate": 2.999986518943083e-05,
      "loss": 0.2089,
      "step": 80
    },
    {
      "epoch": 0.17304572492939974,
      "grad_norm": 0.35552915930747986,
      "learning_rate": 2.999514707393943e-05,
      "loss": 0.238,
      "step": 90
    },
    {
      "epoch": 0.19227302769933305,
      "grad_norm": 0.36513009667396545,
      "learning_rate": 2.9983690852978995e-05,
      "loss": 0.2228,
      "step": 100
    },
    {
      "epoch": 0.21150033046926636,
      "grad_norm": 0.3532279431819916,
      "learning_rate": 2.996550167443001e-05,
      "loss": 0.2321,
      "step": 110
    },
    {
      "epoch": 0.23072763323919968,
      "grad_norm": 0.35095521807670593,
      "learning_rate": 2.9940587711643693e-05,
      "loss": 0.1888,
      "step": 120
    },
    {
      "epoch": 0.24995493600913296,
      "grad_norm": 0.2918124198913574,
      "learning_rate": 2.9908960159769243e-05,
      "loss": 0.1585,
      "step": 130
    },
    {
      "epoch": 0.2691822387790663,
      "grad_norm": 0.372545063495636,
      "learning_rate": 2.9870633230723313e-05,
      "loss": 0.1973,
      "step": 140
    },
    {
      "epoch": 0.28840954154899956,
      "grad_norm": 0.36561667919158936,
      "learning_rate": 2.9825624146803807e-05,
      "loss": 0.1859,
      "step": 150
    },
    {
      "epoch": 0.3076368443189329,
      "grad_norm": 0.38816386461257935,
      "learning_rate": 2.977395313295105e-05,
      "loss": 0.1965,
      "step": 160
    },
    {
      "epoch": 0.3268641470888662,
      "grad_norm": 0.3820708096027374,
      "learning_rate": 2.971564340765961e-05,
      "loss": 0.1516,
      "step": 170
    },
    {
      "epoch": 0.3326323379198462,
      "eval_loss": 0.13869456946849823,
      "eval_runtime": 203.7684,
      "eval_samples_per_second": 5.261,
      "eval_steps_per_second": 5.261,
      "step": 173
    },
    {
      "epoch": 0.34609144985879947,
      "grad_norm": 0.41539186239242554,
      "learning_rate": 2.9650721172545048e-05,
      "loss": 0.1895,
      "step": 180
    },
    {
      "epoch": 0.3653187526287328,
      "grad_norm": 0.4073317348957062,
      "learning_rate": 2.9579215600570152e-05,
      "loss": 0.1545,
      "step": 190
    },
    {
      "epoch": 0.3845460553986661,
      "grad_norm": 0.3595292866230011,
      "learning_rate": 2.950115882293597e-05,
      "loss": 0.1629,
      "step": 200
    },
    {
      "epoch": 0.40377335816859944,
      "grad_norm": 0.28112220764160156,
      "learning_rate": 2.9416585914643627e-05,
      "loss": 0.1576,
      "step": 210
    },
    {
      "epoch": 0.4230006609385327,
      "grad_norm": 0.4690268933773041,
      "learning_rate": 2.932553487873324e-05,
      "loss": 0.1614,
      "step": 220
    },
    {
      "epoch": 0.442227963708466,
      "grad_norm": 0.5640648007392883,
      "learning_rate": 2.922804662920718e-05,
      "loss": 0.1689,
      "step": 230
    },
    {
      "epoch": 0.46145526647839935,
      "grad_norm": 0.6498159766197205,
      "learning_rate": 2.912416497264529e-05,
      "loss": 0.1704,
      "step": 240
    },
    {
      "epoch": 0.48068256924833264,
      "grad_norm": 0.46260717511177063,
      "learning_rate": 2.9013936588520235e-05,
      "loss": 0.1938,
      "step": 250
    },
    {
      "epoch": 0.4999098720182659,
      "grad_norm": 0.44871020317077637,
      "learning_rate": 2.8897411008222026e-05,
      "loss": 0.1674,
      "step": 260
    },
    {
      "epoch": 0.5191371747881992,
      "grad_norm": 0.5815374255180359,
      "learning_rate": 2.8774640592800948e-05,
      "loss": 0.1424,
      "step": 270
    },
    {
      "epoch": 0.5383644775581325,
      "grad_norm": 0.5914287567138672,
      "learning_rate": 2.864568050943899e-05,
      "loss": 0.1818,
      "step": 280
    },
    {
      "epoch": 0.5575917803280659,
      "grad_norm": 0.5079910755157471,
      "learning_rate": 2.8510588706660338e-05,
      "loss": 0.1633,
      "step": 290
    },
    {
      "epoch": 0.5768190830979991,
      "grad_norm": 0.5987181067466736,
      "learning_rate": 2.836942588829208e-05,
      "loss": 0.1455,
      "step": 300
    },
    {
      "epoch": 0.5960463858679325,
      "grad_norm": 0.6342700719833374,
      "learning_rate": 2.8222255486186798e-05,
      "loss": 0.1522,
      "step": 310
    },
    {
      "epoch": 0.6152736886378658,
      "grad_norm": 0.5555654764175415,
      "learning_rate": 2.8069143631719276e-05,
      "loss": 0.1394,
      "step": 320
    },
    {
      "epoch": 0.634500991407799,
      "grad_norm": 0.5811598300933838,
      "learning_rate": 2.7910159126070257e-05,
      "loss": 0.1623,
      "step": 330
    },
    {
      "epoch": 0.6537282941777324,
      "grad_norm": 0.7482380270957947,
      "learning_rate": 2.774537340931043e-05,
      "loss": 0.1458,
      "step": 340
    },
    {
      "epoch": 0.6652646758396924,
      "eval_loss": 0.11664145439863205,
      "eval_runtime": 203.7283,
      "eval_samples_per_second": 5.262,
      "eval_steps_per_second": 5.262,
      "step": 346
    },
    {
      "epoch": 0.6729555969476657,
      "grad_norm": 0.5891281962394714,
      "learning_rate": 2.7574860528298677e-05,
      "loss": 0.1406,
      "step": 350
    },
    {
      "epoch": 0.6921828997175989,
      "grad_norm": 0.5311967730522156,
      "learning_rate": 2.739869710340894e-05,
      "loss": 0.1525,
      "step": 360
    },
    {
      "epoch": 0.7114102024875323,
      "grad_norm": 0.5632440447807312,
      "learning_rate": 2.7216962294100668e-05,
      "loss": 0.1392,
      "step": 370
    },
    {
      "epoch": 0.7306375052574656,
      "grad_norm": 0.6121944785118103,
      "learning_rate": 2.7029737763348316e-05,
      "loss": 0.1602,
      "step": 380
    },
    {
      "epoch": 0.7498648080273989,
      "grad_norm": 0.6687933802604675,
      "learning_rate": 2.6837107640945904e-05,
      "loss": 0.1583,
      "step": 390
    },
    {
      "epoch": 0.7690921107973322,
      "grad_norm": 0.5596562623977661,
      "learning_rate": 2.6639158485703087e-05,
      "loss": 0.1667,
      "step": 400
    },
    {
      "epoch": 0.7883194135672655,
      "grad_norm": 0.7156023979187012,
      "learning_rate": 2.6435979246549727e-05,
      "loss": 0.1438,
      "step": 410
    },
    {
      "epoch": 0.8075467163371989,
      "grad_norm": 0.5293470621109009,
      "learning_rate": 2.6227661222566516e-05,
      "loss": 0.1865,
      "step": 420
    },
    {
      "epoch": 0.8267740191071321,
      "grad_norm": 0.5734898447990417,
      "learning_rate": 2.6014298021959482e-05,
      "loss": 0.1477,
      "step": 430
    },
    {
      "epoch": 0.8460013218770654,
      "grad_norm": 0.5020838975906372,
      "learning_rate": 2.5795985519996915e-05,
      "loss": 0.1303,
      "step": 440
    },
    {
      "epoch": 0.8652286246469988,
      "grad_norm": 0.6881216764450073,
      "learning_rate": 2.5572821815927615e-05,
      "loss": 0.1429,
      "step": 450
    },
    {
      "epoch": 0.884455927416932,
      "grad_norm": 0.4911053776741028,
      "learning_rate": 2.5344907188899715e-05,
      "loss": 0.1547,
      "step": 460
    },
    {
      "epoch": 0.9036832301868654,
      "grad_norm": 0.8948251605033875,
      "learning_rate": 2.511234405290005e-05,
      "loss": 0.136,
      "step": 470
    },
    {
      "epoch": 0.9229105329567987,
      "grad_norm": 0.4923257529735565,
      "learning_rate": 2.4875236910734145e-05,
      "loss": 0.123,
      "step": 480
    },
    {
      "epoch": 0.9421378357267319,
      "grad_norm": 0.6984175443649292,
      "learning_rate": 2.4633692307067654e-05,
      "loss": 0.1519,
      "step": 490
    },
    {
      "epoch": 0.9613651384966653,
      "grad_norm": 0.6080285310745239,
      "learning_rate": 2.4387818780550236e-05,
      "loss": 0.1267,
      "step": 500
    },
    {
      "epoch": 0.9805924412665986,
      "grad_norm": 0.6914392113685608,
      "learning_rate": 2.4137726815043483e-05,
      "loss": 0.1664,
      "step": 510
    },
    {
      "epoch": 0.9978970137595385,
      "eval_loss": 0.10869105905294418,
      "eval_runtime": 203.0899,
      "eval_samples_per_second": 5.278,
      "eval_steps_per_second": 5.278,
      "step": 519
    },
    {
      "epoch": 0.9998197440365318,
      "grad_norm": 0.5257564187049866,
      "learning_rate": 2.3883528789974703e-05,
      "loss": 0.1474,
      "step": 520
    },
    {
      "epoch": 1.0190470468064652,
      "grad_norm": 0.5023784041404724,
      "learning_rate": 2.3625338929838952e-05,
      "loss": 0.1057,
      "step": 530
    },
    {
      "epoch": 1.0382743495763984,
      "grad_norm": 0.6757215857505798,
      "learning_rate": 2.3363273252872003e-05,
      "loss": 0.1477,
      "step": 540
    },
    {
      "epoch": 1.0575016523463319,
      "grad_norm": 0.5462861657142639,
      "learning_rate": 2.3097449518917257e-05,
      "loss": 0.1205,
      "step": 550
    },
    {
      "epoch": 1.076728955116265,
      "grad_norm": 0.7116460800170898,
      "learning_rate": 2.2827987176510082e-05,
      "loss": 0.1164,
      "step": 560
    },
    {
      "epoch": 1.0959562578861983,
      "grad_norm": 0.5546866655349731,
      "learning_rate": 2.255500730920332e-05,
      "loss": 0.1304,
      "step": 570
    },
    {
      "epoch": 1.1151835606561318,
      "grad_norm": 0.4724363088607788,
      "learning_rate": 2.2278632581158095e-05,
      "loss": 0.1295,
      "step": 580
    },
    {
      "epoch": 1.134410863426065,
      "grad_norm": 1.0140602588653564,
      "learning_rate": 2.1998987182024384e-05,
      "loss": 0.1122,
      "step": 590
    },
    {
      "epoch": 1.1536381661959982,
      "grad_norm": 0.790867805480957,
      "learning_rate": 2.1716196771136115e-05,
      "loss": 0.1169,
      "step": 600
    },
    {
      "epoch": 1.1728654689659317,
      "grad_norm": 0.6885173320770264,
      "learning_rate": 2.1430388421045812e-05,
      "loss": 0.1352,
      "step": 610
    },
    {
      "epoch": 1.192092771735865,
      "grad_norm": 0.6807064414024353,
      "learning_rate": 2.1141690560424253e-05,
      "loss": 0.1226,
      "step": 620
    },
    {
      "epoch": 1.2113200745057981,
      "grad_norm": 0.5460578799247742,
      "learning_rate": 2.0850232916350735e-05,
      "loss": 0.1214,
      "step": 630
    },
    {
      "epoch": 1.2305473772757316,
      "grad_norm": 0.7612866163253784,
      "learning_rate": 2.05561464560199e-05,
      "loss": 0.1164,
      "step": 640
    },
    {
      "epoch": 1.2497746800456648,
      "grad_norm": 0.38693496584892273,
      "learning_rate": 2.025956332789132e-05,
      "loss": 0.1398,
      "step": 650
    },
    {
      "epoch": 1.269001982815598,
      "grad_norm": 0.5924756526947021,
      "learning_rate": 1.996061680230823e-05,
      "loss": 0.1214,
      "step": 660
    },
    {
      "epoch": 1.2882292855855315,
      "grad_norm": 0.7164785861968994,
      "learning_rate": 1.9659441211612234e-05,
      "loss": 0.1226,
      "step": 670
    },
    {
      "epoch": 1.3074565883554647,
      "grad_norm": 0.5729460716247559,
      "learning_rate": 1.93561718897807e-05,
      "loss": 0.1481,
      "step": 680
    },
    {
      "epoch": 1.326683891125398,
      "grad_norm": 0.6892575025558472,
      "learning_rate": 1.9050945111614142e-05,
      "loss": 0.1498,
      "step": 690
    },
    {
      "epoch": 1.3305293516793848,
      "eval_loss": 0.10392692685127258,
      "eval_runtime": 203.0603,
      "eval_samples_per_second": 5.279,
      "eval_steps_per_second": 5.279,
      "step": 692
    },
    {
      "epoch": 1.3459111938953314,
      "grad_norm": 0.5614696145057678,
      "learning_rate": 1.8743898031500772e-05,
      "loss": 0.1105,
      "step": 700
    },
    {
      "epoch": 1.3651384966652647,
      "grad_norm": 0.6355635523796082,
      "learning_rate": 1.843516862178589e-05,
      "loss": 0.1291,
      "step": 710
    },
    {
      "epoch": 1.3843657994351979,
      "grad_norm": 0.7176327109336853,
      "learning_rate": 1.8124895610773645e-05,
      "loss": 0.1387,
      "step": 720
    },
    {
      "epoch": 1.4035931022051313,
      "grad_norm": 0.9504517316818237,
      "learning_rate": 1.781321842038914e-05,
      "loss": 0.1346,
      "step": 730
    },
    {
      "epoch": 1.4228204049750646,
      "grad_norm": 0.7893795371055603,
      "learning_rate": 1.7500277103528883e-05,
      "loss": 0.1224,
      "step": 740
    },
    {
      "epoch": 1.4420477077449978,
      "grad_norm": 0.5944446921348572,
      "learning_rate": 1.718621228112764e-05,
      "loss": 0.1095,
      "step": 750
    },
    {
      "epoch": 1.4612750105149312,
      "grad_norm": 0.5783366560935974,
      "learning_rate": 1.6871165078970118e-05,
      "loss": 0.1116,
      "step": 760
    },
    {
      "epoch": 1.4805023132848645,
      "grad_norm": 0.6842564940452576,
      "learning_rate": 1.6555277064275717e-05,
      "loss": 0.1215,
      "step": 770
    },
    {
      "epoch": 1.4997296160547977,
      "grad_norm": 0.5818539261817932,
      "learning_rate": 1.623869018208499e-05,
      "loss": 0.1283,
      "step": 780
    },
    {
      "epoch": 1.5189569188247312,
      "grad_norm": 0.658789336681366,
      "learning_rate": 1.5921546691476264e-05,
      "loss": 0.1168,
      "step": 790
    },
    {
      "epoch": 1.5381842215946644,
      "grad_norm": 0.7144546508789062,
      "learning_rate": 1.5603989101641228e-05,
      "loss": 0.1247,
      "step": 800
    },
    {
      "epoch": 1.5574115243645976,
      "grad_norm": 0.5796612501144409,
      "learning_rate": 1.5286160107848036e-05,
      "loss": 0.1279,
      "step": 810
    },
    {
      "epoch": 1.576638827134531,
      "grad_norm": 0.6537405252456665,
      "learning_rate": 1.4968202527320868e-05,
      "loss": 0.1396,
      "step": 820
    },
    {
      "epoch": 1.5958661299044643,
      "grad_norm": 0.7590240836143494,
      "learning_rate": 1.4650259235064662e-05,
      "loss": 0.1183,
      "step": 830
    },
    {
      "epoch": 1.6150934326743975,
      "grad_norm": 0.6850148439407349,
      "learning_rate": 1.43324730996639e-05,
      "loss": 0.1277,
      "step": 840
    },
    {
      "epoch": 1.634320735444331,
      "grad_norm": 0.7500022053718567,
      "learning_rate": 1.4014986919084228e-05,
      "loss": 0.1285,
      "step": 850
    },
    {
      "epoch": 1.6535480382142642,
      "grad_norm": 0.6234251856803894,
      "learning_rate": 1.3697943356505897e-05,
      "loss": 0.1071,
      "step": 860
    },
    {
      "epoch": 1.663161689599231,
      "eval_loss": 0.10122876614332199,
      "eval_runtime": 203.2008,
      "eval_samples_per_second": 5.276,
      "eval_steps_per_second": 5.276,
      "step": 865
    },
    {
      "epoch": 1.6727753409841974,
      "grad_norm": 1.0110090970993042,
      "learning_rate": 1.3381484876217669e-05,
      "loss": 0.1252,
      "step": 870
    },
    {
      "epoch": 1.692002643754131,
      "grad_norm": 0.8749274611473083,
      "learning_rate": 1.3065753679600186e-05,
      "loss": 0.1086,
      "step": 880
    },
    {
      "epoch": 1.7112299465240641,
      "grad_norm": 0.563439667224884,
      "learning_rate": 1.2750891641227418e-05,
      "loss": 0.1273,
      "step": 890
    },
    {
      "epoch": 1.7304572492939974,
      "grad_norm": 0.6679959297180176,
      "learning_rate": 1.2437040245114966e-05,
      "loss": 0.1124,
      "step": 900
    },
    {
      "epoch": 1.7496845520639308,
      "grad_norm": 0.8824312090873718,
      "learning_rate": 1.2124340521143929e-05,
      "loss": 0.1275,
      "step": 910
    },
    {
      "epoch": 1.768911854833864,
      "grad_norm": 0.6557831168174744,
      "learning_rate": 1.1812932981688715e-05,
      "loss": 0.1207,
      "step": 920
    },
    {
      "epoch": 1.7881391576037973,
      "grad_norm": 0.5608255863189697,
      "learning_rate": 1.1502957558477537e-05,
      "loss": 0.1095,
      "step": 930
    },
    {
      "epoch": 1.8073664603737307,
      "grad_norm": 0.8327426910400391,
      "learning_rate": 1.119455353971371e-05,
      "loss": 0.1423,
      "step": 940
    },
    {
      "epoch": 1.826593763143664,
      "grad_norm": 0.7187633514404297,
      "learning_rate": 1.0887859507486183e-05,
      "loss": 0.1142,
      "step": 950
    },
    {
      "epoch": 1.8458210659135972,
      "grad_norm": 0.7449970841407776,
      "learning_rate": 1.0583013275497318e-05,
      "loss": 0.1315,
      "step": 960
    },
    {
      "epoch": 1.8650483686835306,
      "grad_norm": 0.5967345237731934,
      "learning_rate": 1.0280151827136e-05,
      "loss": 0.1147,
      "step": 970
    },
    {
      "epoch": 1.884275671453464,
      "grad_norm": 0.8269909024238586,
      "learning_rate": 9.979411253923813e-06,
      "loss": 0.131,
      "step": 980
    },
    {
      "epoch": 1.903502974223397,
      "grad_norm": 0.6085448861122131,
      "learning_rate": 9.680926694361966e-06,
      "loss": 0.1339,
      "step": 990
    },
    {
      "epoch": 1.9227302769933305,
      "grad_norm": 0.7495784163475037,
      "learning_rate": 9.384832273206514e-06,
      "loss": 0.1324,
      "step": 1000
    },
    {
      "epoch": 1.941957579763264,
      "grad_norm": 0.8735560178756714,
      "learning_rate": 9.091261041199051e-06,
      "loss": 0.1225,
      "step": 1010
    },
    {
      "epoch": 1.961184882533197,
      "grad_norm": 0.7350926995277405,
      "learning_rate": 8.80034491528005e-06,
      "loss": 0.1108,
      "step": 1020
    },
    {
      "epoch": 1.9804121853031305,
      "grad_norm": 0.7938897013664246,
      "learning_rate": 8.51221461931167e-06,
      "loss": 0.1416,
      "step": 1030
    },
    {
      "epoch": 1.995794027519077,
      "eval_loss": 0.09892405569553375,
      "eval_runtime": 202.9814,
      "eval_samples_per_second": 5.281,
      "eval_steps_per_second": 5.281,
      "step": 1038
    },
    {
      "epoch": 1.999639488073064,
      "grad_norm": 1.0423219203948975,
      "learning_rate": 8.226999625336663e-06,
      "loss": 0.1597,
      "step": 1040
    },
    {
      "epoch": 2.018866790842997,
      "grad_norm": 0.7138562202453613,
      "learning_rate": 7.944828095399802e-06,
      "loss": 0.1114,
      "step": 1050
    },
    {
      "epoch": 2.0380940936129304,
      "grad_norm": 0.6143700480461121,
      "learning_rate": 7.66582682395797e-06,
      "loss": 0.1066,
      "step": 1060
    },
    {
      "epoch": 2.057321396382864,
      "grad_norm": 0.8447745442390442,
      "learning_rate": 7.390121180904763e-06,
      "loss": 0.113,
      "step": 1070
    },
    {
      "epoch": 2.076548699152797,
      "grad_norm": 0.41642722487449646,
      "learning_rate": 7.117835055235195e-06,
      "loss": 0.1095,
      "step": 1080
    },
    {
      "epoch": 2.0957760019227303,
      "grad_norm": 0.7106382250785828,
      "learning_rate": 6.849090799375931e-06,
      "loss": 0.1214,
      "step": 1090
    },
    {
      "epoch": 2.1150033046926637,
      "grad_norm": 0.6263849139213562,
      "learning_rate": 6.584009174205888e-06,
      "loss": 0.1293,
      "step": 1100
    },
    {
      "epoch": 2.1342306074625967,
      "grad_norm": 0.7726497054100037,
      "learning_rate": 6.322709294792051e-06,
      "loss": 0.1394,
      "step": 1110
    },
    {
      "epoch": 2.15345791023253,
      "grad_norm": 0.7134016752243042,
      "learning_rate": 6.065308576864859e-06,
      "loss": 0.1039,
      "step": 1120
    },
    {
      "epoch": 2.1726852130024636,
      "grad_norm": 0.6412186026573181,
      "learning_rate": 5.811922684057118e-06,
      "loss": 0.1151,
      "step": 1130
    },
    {
      "epoch": 2.1919125157723967,
      "grad_norm": 0.9640927314758301,
      "learning_rate": 5.5626654759303085e-06,
      "loss": 0.1247,
      "step": 1140
    },
    {
      "epoch": 2.21113981854233,
      "grad_norm": 0.8550817370414734,
      "learning_rate": 5.3176489568115e-06,
      "loss": 0.1069,
      "step": 1150
    },
    {
      "epoch": 2.2303671213122636,
      "grad_norm": 0.6239781975746155,
      "learning_rate": 5.0769832254639355e-06,
      "loss": 0.1013,
      "step": 1160
    },
    {
      "epoch": 2.2495944240821966,
      "grad_norm": 0.7141818404197693,
      "learning_rate": 4.840776425613887e-06,
      "loss": 0.0976,
      "step": 1170
    },
    {
      "epoch": 2.26882172685213,
      "grad_norm": 0.48725616931915283,
      "learning_rate": 4.609134697356009e-06,
      "loss": 0.1049,
      "step": 1180
    },
    {
      "epoch": 2.2880490296220635,
      "grad_norm": 0.8563340902328491,
      "learning_rate": 4.382162129459055e-06,
      "loss": 0.0988,
      "step": 1190
    },
    {
      "epoch": 2.3072763323919965,
      "grad_norm": 0.7721908092498779,
      "learning_rate": 4.159960712593301e-06,
      "loss": 0.1022,
      "step": 1200
    },
    {
      "epoch": 2.32650363516193,
      "grad_norm": 0.6547017097473145,
      "learning_rate": 3.942630293500821e-06,
      "loss": 0.1321,
      "step": 1210
    },
    {
      "epoch": 2.3284263654389235,
      "eval_loss": 0.09838072210550308,
      "eval_runtime": 203.6342,
      "eval_samples_per_second": 5.264,
      "eval_steps_per_second": 5.264,
      "step": 1211
    },
    {
      "epoch": 2.3457309379318634,
      "grad_norm": 0.6637281775474548,
      "learning_rate": 3.730268530129097e-06,
      "loss": 0.0987,
      "step": 1220
    },
    {
      "epoch": 2.3649582407017964,
      "grad_norm": 0.9455267786979675,
      "learning_rate": 3.522970847748196e-06,
      "loss": 0.1286,
      "step": 1230
    },
    {
      "epoch": 2.38418554347173,
      "grad_norm": 0.7048280239105225,
      "learning_rate": 3.3208303960711895e-06,
      "loss": 0.0998,
      "step": 1240
    },
    {
      "epoch": 2.4034128462416633,
      "grad_norm": 0.8539944291114807,
      "learning_rate": 3.1239380073971e-06,
      "loss": 0.114,
      "step": 1250
    },
    {
      "epoch": 2.4226401490115963,
      "grad_norm": 0.8466408252716064,
      "learning_rate": 2.9323821557952007e-06,
      "loss": 0.0986,
      "step": 1260
    },
    {
      "epoch": 2.4418674517815298,
      "grad_norm": 0.6942047476768494,
      "learning_rate": 2.7462489173489636e-06,
      "loss": 0.1112,
      "step": 1270
    },
    {
      "epoch": 2.461094754551463,
      "grad_norm": 0.7220749258995056,
      "learning_rate": 2.5656219314775886e-06,
      "loss": 0.1083,
      "step": 1280
    },
    {
      "epoch": 2.480322057321396,
      "grad_norm": 0.8154662847518921,
      "learning_rate": 2.3905823633523997e-06,
      "loss": 0.0981,
      "step": 1290
    },
    {
      "epoch": 2.4995493600913297,
      "grad_norm": 0.7933881282806396,
      "learning_rate": 2.221208867425096e-06,
      "loss": 0.0965,
      "step": 1300
    },
    {
      "epoch": 2.518776662861263,
      "grad_norm": 0.770427405834198,
      "learning_rate": 2.0575775520841878e-06,
      "loss": 0.1399,
      "step": 1310
    },
    {
      "epoch": 2.538003965631196,
      "grad_norm": 0.7757827043533325,
      "learning_rate": 1.8997619454554955e-06,
      "loss": 0.1022,
      "step": 1320
    },
    {
      "epoch": 2.5572312684011296,
      "grad_norm": 0.8519064784049988,
      "learning_rate": 1.7478329623621226e-06,
      "loss": 0.1114,
      "step": 1330
    },
    {
      "epoch": 2.576458571171063,
      "grad_norm": 0.4863261878490448,
      "learning_rate": 1.601858872458702e-06,
      "loss": 0.0964,
      "step": 1340
    },
    {
      "epoch": 2.5956858739409965,
      "grad_norm": 0.7736539244651794,
      "learning_rate": 1.4619052695542612e-06,
      "loss": 0.1062,
      "step": 1350
    },
    {
      "epoch": 2.6149131767109295,
      "grad_norm": 0.8441415429115295,
      "learning_rate": 1.3280350421374888e-06,
      "loss": 0.1158,
      "step": 1360
    },
    {
      "epoch": 2.634140479480863,
      "grad_norm": 1.0856326818466187,
      "learning_rate": 1.2003083451176366e-06,
      "loss": 0.1314,
      "step": 1370
    },
    {
      "epoch": 2.653367782250796,
      "grad_norm": 0.6666831374168396,
      "learning_rate": 1.0787825727937783e-06,
      "loss": 0.0889,
      "step": 1380
    },
    {
      "epoch": 2.6610587033587696,
      "eval_loss": 0.09759029000997543,
      "eval_runtime": 202.9122,
      "eval_samples_per_second": 5.283,
      "eval_steps_per_second": 5.283,
      "step": 1384
    },
    {
      "epoch": 2.6725950850207294,
      "grad_norm": 0.7304587960243225,
      "learning_rate": 9.635123330645218e-07,
      "loss": 0.1098,
      "step": 1390
    },
    {
      "epoch": 2.691822387790663,
      "grad_norm": 0.7729344964027405,
      "learning_rate": 8.545494228898448e-07,
      "loss": 0.0874,
      "step": 1400
    },
    {
      "epoch": 2.7110496905605963,
      "grad_norm": 0.8187854886054993,
      "learning_rate": 7.519428050159765e-07,
      "loss": 0.1295,
      "step": 1410
    },
    {
      "epoch": 2.7302769933305293,
      "grad_norm": 0.5940708518028259,
      "learning_rate": 6.557385859738985e-07,
      "loss": 0.1194,
      "step": 1420
    },
    {
      "epoch": 2.7495042961004628,
      "grad_norm": 0.8048242926597595,
      "learning_rate": 5.659799953612438e-07,
      "loss": 0.1112,
      "step": 1430
    },
    {
      "epoch": 2.7687315988703958,
      "grad_norm": 0.65644371509552,
      "learning_rate": 4.827073664169812e-07,
      "loss": 0.1061,
      "step": 1440
    },
    {
      "epoch": 2.7879589016403292,
      "grad_norm": 0.733321487903595,
      "learning_rate": 4.059581178975741e-07,
      "loss": 0.1187,
      "step": 1450
    },
    {
      "epoch": 2.8071862044102627,
      "grad_norm": 0.48416727781295776,
      "learning_rate": 3.357667372627754e-07,
      "loss": 0.1183,
      "step": 1460
    },
    {
      "epoch": 2.826413507180196,
      "grad_norm": 0.5195744633674622,
      "learning_rate": 2.7216476517860245e-07,
      "loss": 0.0869,
      "step": 1470
    },
    {
      "epoch": 2.845640809950129,
      "grad_norm": 0.6493935585021973,
      "learning_rate": 2.151807813444606e-07,
      "loss": 0.0867,
      "step": 1480
    },
    {
      "epoch": 2.8648681127200626,
      "grad_norm": 0.7516520023345947,
      "learning_rate": 1.6484039165079455e-07,
      "loss": 0.1259,
      "step": 1490
    },
    {
      "epoch": 2.8840954154899956,
      "grad_norm": 0.7778034210205078,
      "learning_rate": 1.211662166730071e-07,
      "loss": 0.1229,
      "step": 1500
    },
    {
      "epoch": 2.903322718259929,
      "grad_norm": 0.6024238467216492,
      "learning_rate": 8.417788150686001e-08,
      "loss": 0.0972,
      "step": 1510
    },
    {
      "epoch": 2.9225500210298625,
      "grad_norm": 0.6645819544792175,
      "learning_rate": 5.389200694988494e-08,
      "loss": 0.1297,
      "step": 1520
    },
    {
      "epoch": 2.941777323799796,
      "grad_norm": 0.9006750583648682,
      "learning_rate": 3.032220203278924e-08,
      "loss": 0.1088,
      "step": 1530
    },
    {
      "epoch": 2.961004626569729,
      "grad_norm": 0.7198818325996399,
      "learning_rate": 1.3479057904204339e-08,
      "loss": 0.1041,
      "step": 1540
    },
    {
      "epoch": 2.9802319293396624,
      "grad_norm": 0.6966450810432434,
      "learning_rate": 3.3701430715277202e-09,
      "loss": 0.09,
      "step": 1550
    },
    {
      "epoch": 2.9936910412786157,
      "eval_loss": 0.09760043770074844,
      "eval_runtime": 202.9049,
      "eval_samples_per_second": 5.283,
      "eval_steps_per_second": 5.283,
      "step": 1557
    },
    {
      "epoch": 2.9994592321095954,
      "grad_norm": 0.7270023822784424,
      "learning_rate": 0.0,
      "loss": 0.1184,
      "step": 1560
    },
    {
      "epoch": 2.9994592321095954,
      "step": 1560,
      "total_flos": 1.3623219564340838e+18,
      "train_loss": 0.1486816066579941,
      "train_runtime": 34006.2437,
      "train_samples_per_second": 1.468,
      "train_steps_per_second": 0.046
    }
  ],
  "logging_steps": 10,
  "max_steps": 1560,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 173,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3623219564340838e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}