{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.16940702497959137, "learning_rate": 9e-07, "loss": 1.208, "step": 10 }, { "grad_norm": 0.14846493303775787, "learning_rate": 1.9e-06, "loss": 1.207, "step": 20 }, { "grad_norm": 0.1575060486793518, "learning_rate": 2.9e-06, "loss": 1.2031, "step": 30 }, { "grad_norm": 0.18646641075611115, "learning_rate": 3.9e-06, "loss": 1.19, "step": 40 }, { "grad_norm": 0.30270645022392273, "learning_rate": 4.9000000000000005e-06, "loss": 1.1729, "step": 50 }, { "grad_norm": 0.39245566725730896, "learning_rate": 5.9e-06, "loss": 1.1582, "step": 60 }, { "grad_norm": 0.44334590435028076, "learning_rate": 6.900000000000001e-06, "loss": 1.1293, "step": 70 }, { "grad_norm": 0.30264705419540405, "learning_rate": 7.9e-06, "loss": 1.1102, "step": 80 }, { "grad_norm": 0.24947984516620636, "learning_rate": 8.9e-06, "loss": 1.1029, "step": 90 }, { "grad_norm": 0.3442651927471161, "learning_rate": 9.900000000000002e-06, "loss": 1.0967, "step": 100 }, { "grad_norm": 0.2968611717224121, "learning_rate": 1.09e-05, "loss": 1.092, "step": 110 }, { "grad_norm": 0.8635988235473633, "learning_rate": 1.19e-05, "loss": 1.0971, "step": 120 }, { "grad_norm": 0.549736738204956, "learning_rate": 1.29e-05, "loss": 1.0877, "step": 130 }, { "grad_norm": 0.3879706859588623, "learning_rate": 1.3900000000000002e-05, "loss": 1.0965, "step": 140 }, { "grad_norm": 0.8023251295089722, "learning_rate": 1.49e-05, "loss": 1.0902, "step": 150 }, { "grad_norm": 0.3089507222175598, "learning_rate": 1.59e-05, "loss": 1.0938, "step": 160 }, { "grad_norm": 0.2845019996166229, "learning_rate": 1.69e-05, "loss": 1.0857, "step": 170 }, { "grad_norm": 0.5286039710044861, "learning_rate": 1.79e-05, "loss": 1.0811, "step": 180 }, { "grad_norm": 0.3651806712150574, "learning_rate": 1.8900000000000002e-05, "loss": 1.0711, "step": 190 }, { "grad_norm": 0.49551165103912354, "learning_rate": 1.9900000000000003e-05, "loss": 1.0568, "step": 200 }, { "grad_norm": 0.48400798439979553, "learning_rate": 2.09e-05, "loss": 1.0261, "step": 210 }, { "grad_norm": 0.5731498599052429, "learning_rate": 2.19e-05, "loss": 1.0099, "step": 220 }, { "grad_norm": 0.5158259868621826, "learning_rate": 2.29e-05, "loss": 0.9947, "step": 230 }, { "grad_norm": 0.8482366800308228, "learning_rate": 2.39e-05, "loss": 0.9701, "step": 240 }, { "grad_norm": 0.668360710144043, "learning_rate": 2.4900000000000002e-05, "loss": 0.933, "step": 250 }, { "grad_norm": 1.0341784954071045, "learning_rate": 2.5900000000000003e-05, "loss": 0.9101, "step": 260 }, { "grad_norm": 0.8576267957687378, "learning_rate": 2.6900000000000003e-05, "loss": 0.8772, "step": 270 }, { "grad_norm": 1.177884578704834, "learning_rate": 2.7900000000000004e-05, "loss": 0.8447, "step": 280 }, { "grad_norm": 1.2616709470748901, "learning_rate": 2.8899999999999998e-05, "loss": 0.8277, "step": 290 }, { "grad_norm": 0.9310820698738098, "learning_rate": 2.9900000000000002e-05, "loss": 0.8179, "step": 300 }, { "grad_norm": 0.9291635751724243, "learning_rate": 3.09e-05, "loss": 0.7966, "step": 310 }, { "grad_norm": 0.9610940217971802, "learning_rate": 3.19e-05, "loss": 0.7595, "step": 320 }, { "grad_norm": 1.082502841949463, "learning_rate": 3.29e-05, "loss": 0.7442, "step": 330 }, { "grad_norm": 1.0246247053146362, "learning_rate": 3.3900000000000004e-05, "loss": 0.7153, "step": 340 }, { "grad_norm": 1.1535388231277466, "learning_rate": 3.49e-05, "loss": 0.7008, "step": 350 }, { "grad_norm": 1.1344460248947144, "learning_rate": 3.59e-05, "loss": 0.6781, "step": 360 }, { "grad_norm": 1.0874427556991577, "learning_rate": 3.69e-05, "loss": 0.6773, "step": 370 }, { "grad_norm": 1.1591013669967651, "learning_rate": 3.79e-05, "loss": 0.6567, "step": 380 }, { "grad_norm": 1.2492725849151611, "learning_rate": 3.8900000000000004e-05, "loss": 0.6557, "step": 390 }, { "grad_norm": 1.333844542503357, "learning_rate": 3.99e-05, "loss": 0.6557, "step": 400 }, { "grad_norm": 1.157532811164856, "learning_rate": 4.09e-05, "loss": 0.6327, "step": 410 }, { "grad_norm": 1.0811901092529297, "learning_rate": 4.19e-05, "loss": 0.6181, "step": 420 }, { "grad_norm": 1.211959958076477, "learning_rate": 4.29e-05, "loss": 0.6222, "step": 430 }, { "grad_norm": 1.1791653633117676, "learning_rate": 4.39e-05, "loss": 0.6018, "step": 440 }, { "grad_norm": 1.483304500579834, "learning_rate": 4.49e-05, "loss": 0.5883, "step": 450 }, { "grad_norm": 1.136581540107727, "learning_rate": 4.5900000000000004e-05, "loss": 0.5781, "step": 460 }, { "grad_norm": 1.0122281312942505, "learning_rate": 4.69e-05, "loss": 0.5833, "step": 470 }, { "grad_norm": 1.294203519821167, "learning_rate": 4.79e-05, "loss": 0.5811, "step": 480 }, { "grad_norm": 1.036759614944458, "learning_rate": 4.89e-05, "loss": 0.5826, "step": 490 }, { "eval/loss": 0.540949667096138, "step": 500 }, { "grad_norm": 1.4752445220947266, "learning_rate": 4.99e-05, "loss": 0.5589, "step": 500 }, { "grad_norm": 0.9996066093444824, "learning_rate": 5.0900000000000004e-05, "loss": 0.5726, "step": 510 }, { "grad_norm": 1.1314283609390259, "learning_rate": 5.19e-05, "loss": 0.5489, "step": 520 }, { "grad_norm": 1.0463645458221436, "learning_rate": 5.2900000000000005e-05, "loss": 0.537, "step": 530 }, { "grad_norm": 1.1870821714401245, "learning_rate": 5.390000000000001e-05, "loss": 0.5419, "step": 540 }, { "grad_norm": 1.0127766132354736, "learning_rate": 5.4900000000000006e-05, "loss": 0.5383, "step": 550 }, { "grad_norm": 1.1530522108078003, "learning_rate": 5.590000000000001e-05, "loss": 0.5255, "step": 560 }, { "grad_norm": 1.6963386535644531, "learning_rate": 5.69e-05, "loss": 0.5248, "step": 570 }, { "grad_norm": 1.5842453241348267, "learning_rate": 5.79e-05, "loss": 0.5243, "step": 580 }, { "grad_norm": 1.3649457693099976, "learning_rate": 5.89e-05, "loss": 0.5147, "step": 590 }, { "grad_norm": 1.018904447555542, "learning_rate": 5.99e-05, "loss": 0.5042, "step": 600 }, { "grad_norm": 1.252278208732605, "learning_rate": 6.09e-05, "loss": 0.5213, "step": 610 }, { "grad_norm": 1.2415512800216675, "learning_rate": 6.19e-05, "loss": 0.4769, "step": 620 }, { "grad_norm": 1.3829114437103271, "learning_rate": 6.29e-05, "loss": 0.4806, "step": 630 }, { "grad_norm": 1.2860313653945923, "learning_rate": 6.390000000000001e-05, "loss": 0.4687, "step": 640 }, { "grad_norm": 1.1453088521957397, "learning_rate": 6.49e-05, "loss": 0.477, "step": 650 }, { "grad_norm": 1.2535901069641113, "learning_rate": 6.59e-05, "loss": 0.4541, "step": 660 }, { "grad_norm": 1.2619575262069702, "learning_rate": 6.690000000000001e-05, "loss": 0.4565, "step": 670 }, { "grad_norm": 1.1378668546676636, "learning_rate": 6.790000000000001e-05, "loss": 0.4395, "step": 680 }, { "grad_norm": 1.0631095170974731, "learning_rate": 6.89e-05, "loss": 0.4185, "step": 690 }, { "grad_norm": 1.1509623527526855, "learning_rate": 6.99e-05, "loss": 0.437, "step": 700 }, { "grad_norm": 1.249911904335022, "learning_rate": 7.09e-05, "loss": 0.4273, "step": 710 }, { "grad_norm": 1.1548298597335815, "learning_rate": 7.19e-05, "loss": 0.4296, "step": 720 }, { "grad_norm": 1.0660429000854492, "learning_rate": 7.29e-05, "loss": 0.4438, "step": 730 }, { "grad_norm": 1.2336221933364868, "learning_rate": 7.390000000000001e-05, "loss": 0.407, "step": 740 }, { "grad_norm": 1.073397159576416, "learning_rate": 7.49e-05, "loss": 0.3936, "step": 750 }, { "grad_norm": 1.2548182010650635, "learning_rate": 7.59e-05, "loss": 0.3991, "step": 760 }, { "grad_norm": 1.4380117654800415, "learning_rate": 7.69e-05, "loss": 0.3957, "step": 770 }, { "grad_norm": 1.2932844161987305, "learning_rate": 7.790000000000001e-05, "loss": 0.4081, "step": 780 }, { "grad_norm": 1.1372441053390503, "learning_rate": 7.890000000000001e-05, "loss": 0.3812, "step": 790 }, { "grad_norm": 1.1620570421218872, "learning_rate": 7.99e-05, "loss": 0.3952, "step": 800 }, { "grad_norm": 1.1965490579605103, "learning_rate": 8.090000000000001e-05, "loss": 0.3677, "step": 810 }, { "grad_norm": 1.176527738571167, "learning_rate": 8.19e-05, "loss": 0.3798, "step": 820 }, { "grad_norm": 1.153993010520935, "learning_rate": 8.29e-05, "loss": 0.3629, "step": 830 }, { "grad_norm": 1.3327205181121826, "learning_rate": 8.39e-05, "loss": 0.3578, "step": 840 }, { "grad_norm": 1.1645392179489136, "learning_rate": 8.49e-05, "loss": 0.3542, "step": 850 }, { "grad_norm": 1.1183959245681763, "learning_rate": 8.59e-05, "loss": 0.3452, "step": 860 }, { "grad_norm": 1.4171571731567383, "learning_rate": 8.69e-05, "loss": 0.328, "step": 870 }, { "grad_norm": 1.2265501022338867, "learning_rate": 8.790000000000001e-05, "loss": 0.3427, "step": 880 }, { "grad_norm": 1.3434756994247437, "learning_rate": 8.89e-05, "loss": 0.3333, "step": 890 }, { "grad_norm": 1.3676091432571411, "learning_rate": 8.99e-05, "loss": 0.3142, "step": 900 }, { "grad_norm": 1.0545670986175537, "learning_rate": 9.090000000000001e-05, "loss": 0.3242, "step": 910 }, { "grad_norm": 1.1802937984466553, "learning_rate": 9.190000000000001e-05, "loss": 0.3419, "step": 920 }, { "grad_norm": 1.2357131242752075, "learning_rate": 9.290000000000001e-05, "loss": 0.2918, "step": 930 }, { "grad_norm": 1.2467869520187378, "learning_rate": 9.39e-05, "loss": 0.2812, "step": 940 }, { "grad_norm": 1.2177903652191162, "learning_rate": 9.49e-05, "loss": 0.2786, "step": 950 }, { "grad_norm": 1.2031254768371582, "learning_rate": 9.59e-05, "loss": 0.2801, "step": 960 }, { "grad_norm": 1.27996826171875, "learning_rate": 9.69e-05, "loss": 0.2944, "step": 970 }, { "grad_norm": 1.4937174320220947, "learning_rate": 9.790000000000001e-05, "loss": 0.259, "step": 980 }, { "grad_norm": 1.2263216972351074, "learning_rate": 9.89e-05, "loss": 0.269, "step": 990 }, { "eval/loss": 0.2459017077088356, "step": 1000 }, { "grad_norm": 1.1868503093719482, "learning_rate": 9.99e-05, "loss": 0.2673, "step": 1000 }, { "grad_norm": 1.2517995834350586, "learning_rate": 9.999994463727085e-05, "loss": 0.2957, "step": 1010 }, { "grad_norm": 1.1621079444885254, "learning_rate": 9.999975326009292e-05, "loss": 0.2406, "step": 1020 }, { "grad_norm": 1.2248700857162476, "learning_rate": 9.999942518549879e-05, "loss": 0.2588, "step": 1030 }, { "grad_norm": 1.1486198902130127, "learning_rate": 9.999896041438544e-05, "loss": 0.2869, "step": 1040 }, { "grad_norm": 1.1869938373565674, "learning_rate": 9.999835894802353e-05, "loss": 0.2613, "step": 1050 }, { "grad_norm": 1.2058380842208862, "learning_rate": 9.999762078805743e-05, "loss": 0.2367, "step": 1060 }, { "grad_norm": 1.2073358297348022, "learning_rate": 9.999674593650526e-05, "loss": 0.2343, "step": 1070 }, { "grad_norm": 1.3462257385253906, "learning_rate": 9.99957343957588e-05, "loss": 0.2043, "step": 1080 }, { "grad_norm": 1.21333646774292, "learning_rate": 9.99945861685836e-05, "loss": 0.22, "step": 1090 }, { "grad_norm": 1.172276496887207, "learning_rate": 9.999330125811884e-05, "loss": 0.2268, "step": 1100 }, { "grad_norm": 1.5802624225616455, "learning_rate": 9.999187966787744e-05, "loss": 0.2389, "step": 1110 }, { "grad_norm": 1.0722038745880127, "learning_rate": 9.999032140174595e-05, "loss": 0.2069, "step": 1120 }, { "grad_norm": 1.2428017854690552, "learning_rate": 9.998862646398464e-05, "loss": 0.2105, "step": 1130 }, { "grad_norm": 1.109406590461731, "learning_rate": 9.998679485922739e-05, "loss": 0.204, "step": 1140 }, { "grad_norm": 1.133062720298767, "learning_rate": 9.998482659248174e-05, "loss": 0.1862, "step": 1150 }, { "grad_norm": 1.185992956161499, "learning_rate": 9.998272166912883e-05, "loss": 0.1944, "step": 1160 }, { "grad_norm": 1.0539828538894653, "learning_rate": 9.998048009492347e-05, "loss": 0.1603, "step": 1170 }, { "grad_norm": 1.2745673656463623, "learning_rate": 9.997810187599403e-05, "loss": 0.1815, "step": 1180 }, { "grad_norm": 1.2294188737869263, "learning_rate": 9.997558701884249e-05, "loss": 0.1774, "step": 1190 }, { "grad_norm": 1.6289048194885254, "learning_rate": 9.997293553034433e-05, "loss": 0.171, "step": 1200 }, { "grad_norm": 1.2011067867279053, "learning_rate": 9.997014741774866e-05, "loss": 0.1657, "step": 1210 }, { "grad_norm": 1.1529210805892944, "learning_rate": 9.996722268867803e-05, "loss": 0.1642, "step": 1220 }, { "grad_norm": 0.8735513091087341, "learning_rate": 9.996416135112858e-05, "loss": 0.1393, "step": 1230 }, { "grad_norm": 1.3112437725067139, "learning_rate": 9.996096341346988e-05, "loss": 0.1392, "step": 1240 }, { "grad_norm": 1.2347687482833862, "learning_rate": 9.995762888444495e-05, "loss": 0.1674, "step": 1250 }, { "grad_norm": 1.539437174797058, "learning_rate": 9.995415777317027e-05, "loss": 0.1625, "step": 1260 }, { "grad_norm": 1.2333788871765137, "learning_rate": 9.995055008913574e-05, "loss": 0.1328, "step": 1270 }, { "grad_norm": 1.1541303396224976, "learning_rate": 9.994680584220463e-05, "loss": 0.1294, "step": 1280 }, { "grad_norm": 1.0528708696365356, "learning_rate": 9.994292504261355e-05, "loss": 0.1441, "step": 1290 }, { "grad_norm": 1.0454338788986206, "learning_rate": 9.993890770097247e-05, "loss": 0.1266, "step": 1300 }, { "grad_norm": 1.1280555725097656, "learning_rate": 9.993475382826467e-05, "loss": 0.1426, "step": 1310 }, { "grad_norm": 1.187239170074463, "learning_rate": 9.993046343584664e-05, "loss": 0.1422, "step": 1320 }, { "grad_norm": 1.0262149572372437, "learning_rate": 9.992603653544816e-05, "loss": 0.1161, "step": 1330 }, { "grad_norm": 1.1586066484451294, "learning_rate": 9.992147313917222e-05, "loss": 0.1408, "step": 1340 }, { "grad_norm": 0.9765651226043701, "learning_rate": 9.991677325949497e-05, "loss": 0.1611, "step": 1350 }, { "grad_norm": 0.9763075709342957, "learning_rate": 9.991193690926568e-05, "loss": 0.1464, "step": 1360 }, { "grad_norm": 1.2092801332473755, "learning_rate": 9.990696410170678e-05, "loss": 0.1466, "step": 1370 }, { "grad_norm": 1.0392274856567383, "learning_rate": 9.990185485041371e-05, "loss": 0.1263, "step": 1380 }, { "grad_norm": 1.0358021259307861, "learning_rate": 9.989660916935498e-05, "loss": 0.1282, "step": 1390 }, { "grad_norm": 1.0262398719787598, "learning_rate": 9.989122707287208e-05, "loss": 0.1391, "step": 1400 }, { "grad_norm": 1.1978421211242676, "learning_rate": 9.988570857567945e-05, "loss": 0.1218, "step": 1410 }, { "grad_norm": 0.9296699166297913, "learning_rate": 9.988005369286446e-05, "loss": 0.1331, "step": 1420 }, { "grad_norm": 1.0004020929336548, "learning_rate": 9.987426243988734e-05, "loss": 0.1372, "step": 1430 }, { "grad_norm": 1.0646557807922363, "learning_rate": 9.986833483258114e-05, "loss": 0.1334, "step": 1440 }, { "grad_norm": 0.9959461688995361, "learning_rate": 9.986227088715173e-05, "loss": 0.1187, "step": 1450 }, { "grad_norm": 1.0928994417190552, "learning_rate": 9.98560706201777e-05, "loss": 0.1455, "step": 1460 }, { "grad_norm": 1.1495130062103271, "learning_rate": 9.984973404861036e-05, "loss": 0.1098, "step": 1470 }, { "grad_norm": 1.3037567138671875, "learning_rate": 9.984326118977361e-05, "loss": 0.1255, "step": 1480 }, { "grad_norm": 0.923818051815033, "learning_rate": 9.983665206136406e-05, "loss": 0.1439, "step": 1490 }, { "eval/loss": 0.12131376132369041, "step": 1500 }, { "grad_norm": 0.9496746063232422, "learning_rate": 9.982990668145075e-05, "loss": 0.123, "step": 1500 }, { "grad_norm": 1.0832738876342773, "learning_rate": 9.982302506847534e-05, "loss": 0.1408, "step": 1510 }, { "grad_norm": 1.033626914024353, "learning_rate": 9.981600724125189e-05, "loss": 0.1108, "step": 1520 }, { "grad_norm": 1.0005450248718262, "learning_rate": 9.980885321896685e-05, "loss": 0.1274, "step": 1530 }, { "grad_norm": 1.0442663431167603, "learning_rate": 9.980156302117905e-05, "loss": 0.1238, "step": 1540 }, { "grad_norm": 0.8260471820831299, "learning_rate": 9.979413666781963e-05, "loss": 0.1231, "step": 1550 }, { "grad_norm": 0.884735107421875, "learning_rate": 9.978657417919193e-05, "loss": 0.1495, "step": 1560 }, { "grad_norm": 0.9236319661140442, "learning_rate": 9.977887557597153e-05, "loss": 0.1327, "step": 1570 }, { "grad_norm": 0.9460572004318237, "learning_rate": 9.97710408792061e-05, "loss": 0.1263, "step": 1580 }, { "grad_norm": 0.9749200344085693, "learning_rate": 9.976307011031542e-05, "loss": 0.12, "step": 1590 }, { "grad_norm": 1.1136820316314697, "learning_rate": 9.975496329109126e-05, "loss": 0.1323, "step": 1600 }, { "grad_norm": 0.8567096590995789, "learning_rate": 9.974672044369732e-05, "loss": 0.125, "step": 1610 }, { "grad_norm": 1.0884920358657837, "learning_rate": 9.97383415906693e-05, "loss": 0.115, "step": 1620 }, { "grad_norm": 1.0339338779449463, "learning_rate": 9.97298267549146e-05, "loss": 0.1386, "step": 1630 }, { "grad_norm": 0.9121850728988647, "learning_rate": 9.972117595971249e-05, "loss": 0.1249, "step": 1640 }, { "grad_norm": 0.9620202779769897, "learning_rate": 9.971238922871391e-05, "loss": 0.1322, "step": 1650 }, { "grad_norm": 1.0946760177612305, "learning_rate": 9.970346658594142e-05, "loss": 0.1123, "step": 1660 }, { "grad_norm": 0.9470517635345459, "learning_rate": 9.969440805578923e-05, "loss": 0.1308, "step": 1670 }, { "grad_norm": 0.7607911229133606, "learning_rate": 9.968521366302298e-05, "loss": 0.1212, "step": 1680 }, { "grad_norm": 1.0321109294891357, "learning_rate": 9.967588343277981e-05, "loss": 0.1351, "step": 1690 }, { "grad_norm": 1.2910419702529907, "learning_rate": 9.966641739056818e-05, "loss": 0.1594, "step": 1700 }, { "grad_norm": 0.7687619924545288, "learning_rate": 9.965681556226793e-05, "loss": 0.1442, "step": 1710 }, { "grad_norm": 0.882544755935669, "learning_rate": 9.964707797413006e-05, "loss": 0.1131, "step": 1720 }, { "grad_norm": 1.0736138820648193, "learning_rate": 9.963720465277679e-05, "loss": 0.0997, "step": 1730 }, { "grad_norm": 1.0465947389602661, "learning_rate": 9.96271956252014e-05, "loss": 0.1114, "step": 1740 }, { "grad_norm": 0.8517502546310425, "learning_rate": 9.961705091876816e-05, "loss": 0.1054, "step": 1750 }, { "grad_norm": 0.8722822666168213, "learning_rate": 9.960677056121235e-05, "loss": 0.1409, "step": 1760 }, { "grad_norm": 1.3352707624435425, "learning_rate": 9.959635458064005e-05, "loss": 0.1207, "step": 1770 }, { "grad_norm": 1.0375741720199585, "learning_rate": 9.958580300552815e-05, "loss": 0.1018, "step": 1780 }, { "grad_norm": 0.9663418531417847, "learning_rate": 9.957511586472426e-05, "loss": 0.1131, "step": 1790 }, { "grad_norm": 0.9925614595413208, "learning_rate": 9.956429318744662e-05, "loss": 0.0994, "step": 1800 }, { "grad_norm": 0.9052272439002991, "learning_rate": 9.955333500328404e-05, "loss": 0.1439, "step": 1810 }, { "grad_norm": 1.1061209440231323, "learning_rate": 9.95422413421957e-05, "loss": 0.099, "step": 1820 }, { "grad_norm": 0.9718905687332153, "learning_rate": 9.953101223451133e-05, "loss": 0.1334, "step": 1830 }, { "grad_norm": 1.041150450706482, "learning_rate": 9.951964771093085e-05, "loss": 0.102, "step": 1840 }, { "grad_norm": 0.7296974658966064, "learning_rate": 9.950814780252442e-05, "loss": 0.1216, "step": 1850 }, { "grad_norm": 0.956504225730896, "learning_rate": 9.949651254073236e-05, "loss": 0.1504, "step": 1860 }, { "grad_norm": 0.9494929909706116, "learning_rate": 9.948474195736504e-05, "loss": 0.1176, "step": 1870 }, { "grad_norm": 1.073912501335144, "learning_rate": 9.947283608460277e-05, "loss": 0.0968, "step": 1880 }, { "grad_norm": 0.9738394618034363, "learning_rate": 9.946079495499577e-05, "loss": 0.1157, "step": 1890 }, { "grad_norm": 0.9605513215065002, "learning_rate": 9.944861860146401e-05, "loss": 0.122, "step": 1900 }, { "grad_norm": 0.8335412740707397, "learning_rate": 9.943630705729719e-05, "loss": 0.0999, "step": 1910 }, { "grad_norm": 0.881373405456543, "learning_rate": 9.942386035615459e-05, "loss": 0.1014, "step": 1920 }, { "grad_norm": 0.7942837476730347, "learning_rate": 9.941127853206503e-05, "loss": 0.1095, "step": 1930 }, { "grad_norm": 1.0058091878890991, "learning_rate": 9.939856161942673e-05, "loss": 0.118, "step": 1940 }, { "grad_norm": 0.8951269388198853, "learning_rate": 9.938570965300724e-05, "loss": 0.1093, "step": 1950 }, { "grad_norm": 0.8100780844688416, "learning_rate": 9.937272266794335e-05, "loss": 0.1344, "step": 1960 }, { "grad_norm": 0.8485944867134094, "learning_rate": 9.935960069974096e-05, "loss": 0.1062, "step": 1970 }, { "grad_norm": 1.0462373495101929, "learning_rate": 9.934634378427506e-05, "loss": 0.1073, "step": 1980 }, { "grad_norm": 0.7918229103088379, "learning_rate": 9.933295195778954e-05, "loss": 0.0865, "step": 1990 }, { "eval/loss": 0.10448359854519368, "step": 2000 }, { "grad_norm": 1.0324554443359375, "learning_rate": 9.931942525689715e-05, "loss": 0.1235, "step": 2000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }