{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3, "eval_steps": 500, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.16940702497959137, "learning_rate": 9e-07, "loss": 1.208, "step": 10 }, { "grad_norm": 0.14846493303775787, "learning_rate": 1.9e-06, "loss": 1.207, "step": 20 }, { "grad_norm": 0.1575060486793518, "learning_rate": 2.9e-06, "loss": 1.2031, "step": 30 }, { "grad_norm": 0.18646641075611115, "learning_rate": 3.9e-06, "loss": 1.19, "step": 40 }, { "grad_norm": 0.30270645022392273, "learning_rate": 4.9000000000000005e-06, "loss": 1.1729, "step": 50 }, { "grad_norm": 0.39245566725730896, "learning_rate": 5.9e-06, "loss": 1.1582, "step": 60 }, { "grad_norm": 0.44334590435028076, "learning_rate": 6.900000000000001e-06, "loss": 1.1293, "step": 70 }, { "grad_norm": 0.30264705419540405, "learning_rate": 7.9e-06, "loss": 1.1102, "step": 80 }, { "grad_norm": 0.24947984516620636, "learning_rate": 8.9e-06, "loss": 1.1029, "step": 90 }, { "grad_norm": 0.3442651927471161, "learning_rate": 9.900000000000002e-06, "loss": 1.0967, "step": 100 }, { "grad_norm": 0.2968611717224121, "learning_rate": 1.09e-05, "loss": 1.092, "step": 110 }, { "grad_norm": 0.8635988235473633, "learning_rate": 1.19e-05, "loss": 1.0971, "step": 120 }, { "grad_norm": 0.549736738204956, "learning_rate": 1.29e-05, "loss": 1.0877, "step": 130 }, { "grad_norm": 0.3879706859588623, "learning_rate": 1.3900000000000002e-05, "loss": 1.0965, "step": 140 }, { "grad_norm": 0.8023251295089722, "learning_rate": 1.49e-05, "loss": 1.0902, "step": 150 }, { "grad_norm": 0.3089507222175598, "learning_rate": 1.59e-05, "loss": 1.0938, "step": 160 }, { "grad_norm": 0.2845019996166229, "learning_rate": 1.69e-05, "loss": 1.0857, "step": 170 }, { "grad_norm": 0.5286039710044861, "learning_rate": 1.79e-05, "loss": 1.0811, "step": 180 }, { "grad_norm": 0.3651806712150574, "learning_rate": 1.8900000000000002e-05, "loss": 1.0711, "step": 190 }, { "grad_norm": 0.49551165103912354, "learning_rate": 1.9900000000000003e-05, "loss": 1.0568, "step": 200 }, { "grad_norm": 0.48400798439979553, "learning_rate": 2.09e-05, "loss": 1.0261, "step": 210 }, { "grad_norm": 0.5731498599052429, "learning_rate": 2.19e-05, "loss": 1.0099, "step": 220 }, { "grad_norm": 0.5158259868621826, "learning_rate": 2.29e-05, "loss": 0.9947, "step": 230 }, { "grad_norm": 0.8482366800308228, "learning_rate": 2.39e-05, "loss": 0.9701, "step": 240 }, { "grad_norm": 0.668360710144043, "learning_rate": 2.4900000000000002e-05, "loss": 0.933, "step": 250 }, { "grad_norm": 1.0341784954071045, "learning_rate": 2.5900000000000003e-05, "loss": 0.9101, "step": 260 }, { "grad_norm": 0.8576267957687378, "learning_rate": 2.6900000000000003e-05, "loss": 0.8772, "step": 270 }, { "grad_norm": 1.177884578704834, "learning_rate": 2.7900000000000004e-05, "loss": 0.8447, "step": 280 }, { "grad_norm": 1.2616709470748901, "learning_rate": 2.8899999999999998e-05, "loss": 0.8277, "step": 290 }, { "grad_norm": 0.9310820698738098, "learning_rate": 2.9900000000000002e-05, "loss": 0.8179, "step": 300 }, { "grad_norm": 0.9291635751724243, "learning_rate": 3.09e-05, "loss": 0.7966, "step": 310 }, { "grad_norm": 0.9610940217971802, "learning_rate": 3.19e-05, "loss": 0.7595, "step": 320 }, { "grad_norm": 1.082502841949463, "learning_rate": 3.29e-05, "loss": 0.7442, "step": 330 }, { "grad_norm": 1.0246247053146362, "learning_rate": 3.3900000000000004e-05, "loss": 0.7153, "step": 340 }, { "grad_norm": 1.1535388231277466, "learning_rate": 3.49e-05, "loss": 0.7008, "step": 350 }, { "grad_norm": 1.1344460248947144, "learning_rate": 3.59e-05, "loss": 0.6781, "step": 360 }, { "grad_norm": 1.0874427556991577, "learning_rate": 3.69e-05, "loss": 0.6773, "step": 370 }, { "grad_norm": 1.1591013669967651, "learning_rate": 3.79e-05, "loss": 0.6567, "step": 380 }, { "grad_norm": 1.2492725849151611, "learning_rate": 3.8900000000000004e-05, "loss": 0.6557, "step": 390 }, { "grad_norm": 1.333844542503357, "learning_rate": 3.99e-05, "loss": 0.6557, "step": 400 }, { "grad_norm": 1.157532811164856, "learning_rate": 4.09e-05, "loss": 0.6327, "step": 410 }, { "grad_norm": 1.0811901092529297, "learning_rate": 4.19e-05, "loss": 0.6181, "step": 420 }, { "grad_norm": 1.211959958076477, "learning_rate": 4.29e-05, "loss": 0.6222, "step": 430 }, { "grad_norm": 1.1791653633117676, "learning_rate": 4.39e-05, "loss": 0.6018, "step": 440 }, { "grad_norm": 1.483304500579834, "learning_rate": 4.49e-05, "loss": 0.5883, "step": 450 }, { "grad_norm": 1.136581540107727, "learning_rate": 4.5900000000000004e-05, "loss": 0.5781, "step": 460 }, { "grad_norm": 1.0122281312942505, "learning_rate": 4.69e-05, "loss": 0.5833, "step": 470 }, { "grad_norm": 1.294203519821167, "learning_rate": 4.79e-05, "loss": 0.5811, "step": 480 }, { "grad_norm": 1.036759614944458, "learning_rate": 4.89e-05, "loss": 0.5826, "step": 490 }, { "eval/loss": 0.540949667096138, "step": 500 }, { "grad_norm": 1.4752445220947266, "learning_rate": 4.99e-05, "loss": 0.5589, "step": 500 }, { "grad_norm": 0.9996066093444824, "learning_rate": 5.0900000000000004e-05, "loss": 0.5726, "step": 510 }, { "grad_norm": 1.1314283609390259, "learning_rate": 5.19e-05, "loss": 0.5489, "step": 520 }, { "grad_norm": 1.0463645458221436, "learning_rate": 5.2900000000000005e-05, "loss": 0.537, "step": 530 }, { "grad_norm": 1.1870821714401245, "learning_rate": 5.390000000000001e-05, "loss": 0.5419, "step": 540 }, { "grad_norm": 1.0127766132354736, "learning_rate": 5.4900000000000006e-05, "loss": 0.5383, "step": 550 }, { "grad_norm": 1.1530522108078003, "learning_rate": 5.590000000000001e-05, "loss": 0.5255, "step": 560 }, { "grad_norm": 1.6963386535644531, "learning_rate": 5.69e-05, "loss": 0.5248, "step": 570 }, { "grad_norm": 1.5842453241348267, "learning_rate": 5.79e-05, "loss": 0.5243, "step": 580 }, { "grad_norm": 1.3649457693099976, "learning_rate": 5.89e-05, "loss": 0.5147, "step": 590 }, { "grad_norm": 1.018904447555542, "learning_rate": 5.99e-05, "loss": 0.5042, "step": 600 }, { "grad_norm": 1.252278208732605, "learning_rate": 6.09e-05, "loss": 0.5213, "step": 610 }, { "grad_norm": 1.2415512800216675, "learning_rate": 6.19e-05, "loss": 0.4769, "step": 620 }, { "grad_norm": 1.3829114437103271, "learning_rate": 6.29e-05, "loss": 0.4806, "step": 630 }, { "grad_norm": 1.2860313653945923, "learning_rate": 6.390000000000001e-05, "loss": 0.4687, "step": 640 }, { "grad_norm": 1.1453088521957397, "learning_rate": 6.49e-05, "loss": 0.477, "step": 650 }, { "grad_norm": 1.2535901069641113, "learning_rate": 6.59e-05, "loss": 0.4541, "step": 660 }, { "grad_norm": 1.2619575262069702, "learning_rate": 6.690000000000001e-05, "loss": 0.4565, "step": 670 }, { "grad_norm": 1.1378668546676636, "learning_rate": 6.790000000000001e-05, "loss": 0.4395, "step": 680 }, { "grad_norm": 1.0631095170974731, "learning_rate": 6.89e-05, "loss": 0.4185, "step": 690 }, { "grad_norm": 1.1509623527526855, "learning_rate": 6.99e-05, "loss": 0.437, "step": 700 }, { "grad_norm": 1.249911904335022, "learning_rate": 7.09e-05, "loss": 0.4273, "step": 710 }, { "grad_norm": 1.1548298597335815, "learning_rate": 7.19e-05, "loss": 0.4296, "step": 720 }, { "grad_norm": 1.0660429000854492, "learning_rate": 7.29e-05, "loss": 0.4438, "step": 730 }, { "grad_norm": 1.2336221933364868, "learning_rate": 7.390000000000001e-05, "loss": 0.407, "step": 740 }, { "grad_norm": 1.073397159576416, "learning_rate": 7.49e-05, "loss": 0.3936, "step": 750 }, { "grad_norm": 1.2548182010650635, "learning_rate": 7.59e-05, "loss": 0.3991, "step": 760 }, { "grad_norm": 1.4380117654800415, "learning_rate": 7.69e-05, "loss": 0.3957, "step": 770 }, { "grad_norm": 1.2932844161987305, "learning_rate": 7.790000000000001e-05, "loss": 0.4081, "step": 780 }, { "grad_norm": 1.1372441053390503, "learning_rate": 7.890000000000001e-05, "loss": 0.3812, "step": 790 }, { "grad_norm": 1.1620570421218872, "learning_rate": 7.99e-05, "loss": 0.3952, "step": 800 }, { "grad_norm": 1.1965490579605103, "learning_rate": 8.090000000000001e-05, "loss": 0.3677, "step": 810 }, { "grad_norm": 1.176527738571167, "learning_rate": 8.19e-05, "loss": 0.3798, "step": 820 }, { "grad_norm": 1.153993010520935, "learning_rate": 8.29e-05, "loss": 0.3629, "step": 830 }, { "grad_norm": 1.3327205181121826, "learning_rate": 8.39e-05, "loss": 0.3578, "step": 840 }, { "grad_norm": 1.1645392179489136, "learning_rate": 8.49e-05, "loss": 0.3542, "step": 850 }, { "grad_norm": 1.1183959245681763, "learning_rate": 8.59e-05, "loss": 0.3452, "step": 860 }, { "grad_norm": 1.4171571731567383, "learning_rate": 8.69e-05, "loss": 0.328, "step": 870 }, { "grad_norm": 1.2265501022338867, "learning_rate": 8.790000000000001e-05, "loss": 0.3427, "step": 880 }, { "grad_norm": 1.3434756994247437, "learning_rate": 8.89e-05, "loss": 0.3333, "step": 890 }, { "grad_norm": 1.3676091432571411, "learning_rate": 8.99e-05, "loss": 0.3142, "step": 900 }, { "grad_norm": 1.0545670986175537, "learning_rate": 9.090000000000001e-05, "loss": 0.3242, "step": 910 }, { "grad_norm": 1.1802937984466553, "learning_rate": 9.190000000000001e-05, "loss": 0.3419, "step": 920 }, { "grad_norm": 1.2357131242752075, "learning_rate": 9.290000000000001e-05, "loss": 0.2918, "step": 930 }, { "grad_norm": 1.2467869520187378, "learning_rate": 9.39e-05, "loss": 0.2812, "step": 940 }, { "grad_norm": 1.2177903652191162, "learning_rate": 9.49e-05, "loss": 0.2786, "step": 950 }, { "grad_norm": 1.2031254768371582, "learning_rate": 9.59e-05, "loss": 0.2801, "step": 960 }, { "grad_norm": 1.27996826171875, "learning_rate": 9.69e-05, "loss": 0.2944, "step": 970 }, { "grad_norm": 1.4937174320220947, "learning_rate": 9.790000000000001e-05, "loss": 0.259, "step": 980 }, { "grad_norm": 1.2263216972351074, "learning_rate": 9.89e-05, "loss": 0.269, "step": 990 }, { "eval/loss": 0.2459017077088356, "step": 1000 }, { "grad_norm": 1.1868503093719482, "learning_rate": 9.99e-05, "loss": 0.2673, "step": 1000 }, { "grad_norm": 1.2517995834350586, "learning_rate": 9.999994463727085e-05, "loss": 0.2957, "step": 1010 }, { "grad_norm": 1.1621079444885254, "learning_rate": 9.999975326009292e-05, "loss": 0.2406, "step": 1020 }, { "grad_norm": 1.2248700857162476, "learning_rate": 9.999942518549879e-05, "loss": 0.2588, "step": 1030 }, { "grad_norm": 1.1486198902130127, "learning_rate": 9.999896041438544e-05, "loss": 0.2869, "step": 1040 }, { "grad_norm": 1.1869938373565674, "learning_rate": 9.999835894802353e-05, "loss": 0.2613, "step": 1050 }, { "grad_norm": 1.2058380842208862, "learning_rate": 9.999762078805743e-05, "loss": 0.2367, "step": 1060 }, { "grad_norm": 1.2073358297348022, "learning_rate": 9.999674593650526e-05, "loss": 0.2343, "step": 1070 }, { "grad_norm": 1.3462257385253906, "learning_rate": 9.99957343957588e-05, "loss": 0.2043, "step": 1080 }, { "grad_norm": 1.21333646774292, "learning_rate": 9.99945861685836e-05, "loss": 0.22, "step": 1090 }, { "grad_norm": 1.172276496887207, "learning_rate": 9.999330125811884e-05, "loss": 0.2268, "step": 1100 }, { "grad_norm": 1.5802624225616455, "learning_rate": 9.999187966787744e-05, "loss": 0.2389, "step": 1110 }, { "grad_norm": 1.0722038745880127, "learning_rate": 9.999032140174595e-05, "loss": 0.2069, "step": 1120 }, { "grad_norm": 1.2428017854690552, "learning_rate": 9.998862646398464e-05, "loss": 0.2105, "step": 1130 }, { "grad_norm": 1.109406590461731, "learning_rate": 9.998679485922739e-05, "loss": 0.204, "step": 1140 }, { "grad_norm": 1.133062720298767, "learning_rate": 9.998482659248174e-05, "loss": 0.1862, "step": 1150 }, { "grad_norm": 1.185992956161499, "learning_rate": 9.998272166912883e-05, "loss": 0.1944, "step": 1160 }, { "grad_norm": 1.0539828538894653, "learning_rate": 9.998048009492347e-05, "loss": 0.1603, "step": 1170 }, { "grad_norm": 1.2745673656463623, "learning_rate": 9.997810187599403e-05, "loss": 0.1815, "step": 1180 }, { "grad_norm": 1.2294188737869263, "learning_rate": 9.997558701884249e-05, "loss": 0.1774, "step": 1190 }, { "grad_norm": 1.6289048194885254, "learning_rate": 9.997293553034433e-05, "loss": 0.171, "step": 1200 }, { "grad_norm": 1.2011067867279053, "learning_rate": 9.997014741774866e-05, "loss": 0.1657, "step": 1210 }, { "grad_norm": 1.1529210805892944, "learning_rate": 9.996722268867803e-05, "loss": 0.1642, "step": 1220 }, { "grad_norm": 0.8735513091087341, "learning_rate": 9.996416135112858e-05, "loss": 0.1393, "step": 1230 }, { "grad_norm": 1.3112437725067139, "learning_rate": 9.996096341346988e-05, "loss": 0.1392, "step": 1240 }, { "grad_norm": 1.2347687482833862, "learning_rate": 9.995762888444495e-05, "loss": 0.1674, "step": 1250 }, { "grad_norm": 1.539437174797058, "learning_rate": 9.995415777317027e-05, "loss": 0.1625, "step": 1260 }, { "grad_norm": 1.2333788871765137, "learning_rate": 9.995055008913574e-05, "loss": 0.1328, "step": 1270 }, { "grad_norm": 1.1541303396224976, "learning_rate": 9.994680584220463e-05, "loss": 0.1294, "step": 1280 }, { "grad_norm": 1.0528708696365356, "learning_rate": 9.994292504261355e-05, "loss": 0.1441, "step": 1290 }, { "grad_norm": 1.0454338788986206, "learning_rate": 9.993890770097247e-05, "loss": 0.1266, "step": 1300 }, { "grad_norm": 1.1280555725097656, "learning_rate": 9.993475382826467e-05, "loss": 0.1426, "step": 1310 }, { "grad_norm": 1.187239170074463, "learning_rate": 9.993046343584664e-05, "loss": 0.1422, "step": 1320 }, { "grad_norm": 1.0262149572372437, "learning_rate": 9.992603653544816e-05, "loss": 0.1161, "step": 1330 }, { "grad_norm": 1.1586066484451294, "learning_rate": 9.992147313917222e-05, "loss": 0.1408, "step": 1340 }, { "grad_norm": 0.9765651226043701, "learning_rate": 9.991677325949497e-05, "loss": 0.1611, "step": 1350 }, { "grad_norm": 0.9763075709342957, "learning_rate": 9.991193690926568e-05, "loss": 0.1464, "step": 1360 }, { "grad_norm": 1.2092801332473755, "learning_rate": 9.990696410170678e-05, "loss": 0.1466, "step": 1370 }, { "grad_norm": 1.0392274856567383, "learning_rate": 9.990185485041371e-05, "loss": 0.1263, "step": 1380 }, { "grad_norm": 1.0358021259307861, "learning_rate": 9.989660916935498e-05, "loss": 0.1282, "step": 1390 }, { "grad_norm": 1.0262398719787598, "learning_rate": 9.989122707287208e-05, "loss": 0.1391, "step": 1400 }, { "grad_norm": 1.1978421211242676, "learning_rate": 9.988570857567945e-05, "loss": 0.1218, "step": 1410 }, { "grad_norm": 0.9296699166297913, "learning_rate": 9.988005369286446e-05, "loss": 0.1331, "step": 1420 }, { "grad_norm": 1.0004020929336548, "learning_rate": 9.987426243988734e-05, "loss": 0.1372, "step": 1430 }, { "grad_norm": 1.0646557807922363, "learning_rate": 9.986833483258114e-05, "loss": 0.1334, "step": 1440 }, { "grad_norm": 0.9959461688995361, "learning_rate": 9.986227088715173e-05, "loss": 0.1187, "step": 1450 }, { "grad_norm": 1.0928994417190552, "learning_rate": 9.98560706201777e-05, "loss": 0.1455, "step": 1460 }, { "grad_norm": 1.1495130062103271, "learning_rate": 9.984973404861036e-05, "loss": 0.1098, "step": 1470 }, { "grad_norm": 1.3037567138671875, "learning_rate": 9.984326118977361e-05, "loss": 0.1255, "step": 1480 }, { "grad_norm": 0.923818051815033, "learning_rate": 9.983665206136406e-05, "loss": 0.1439, "step": 1490 }, { "eval/loss": 0.12131376132369041, "step": 1500 }, { "grad_norm": 0.9496746063232422, "learning_rate": 9.982990668145075e-05, "loss": 0.123, "step": 1500 }, { "grad_norm": 1.0832738876342773, "learning_rate": 9.982302506847534e-05, "loss": 0.1408, "step": 1510 }, { "grad_norm": 1.033626914024353, "learning_rate": 9.981600724125189e-05, "loss": 0.1108, "step": 1520 }, { "grad_norm": 1.0005450248718262, "learning_rate": 9.980885321896685e-05, "loss": 0.1274, "step": 1530 }, { "grad_norm": 1.0442663431167603, "learning_rate": 9.980156302117905e-05, "loss": 0.1238, "step": 1540 }, { "grad_norm": 0.8260471820831299, "learning_rate": 9.979413666781963e-05, "loss": 0.1231, "step": 1550 }, { "grad_norm": 0.884735107421875, "learning_rate": 9.978657417919193e-05, "loss": 0.1495, "step": 1560 }, { "grad_norm": 0.9236319661140442, "learning_rate": 9.977887557597153e-05, "loss": 0.1327, "step": 1570 }, { "grad_norm": 0.9460572004318237, "learning_rate": 9.97710408792061e-05, "loss": 0.1263, "step": 1580 }, { "grad_norm": 0.9749200344085693, "learning_rate": 9.976307011031542e-05, "loss": 0.12, "step": 1590 }, { "grad_norm": 1.1136820316314697, "learning_rate": 9.975496329109126e-05, "loss": 0.1323, "step": 1600 }, { "grad_norm": 0.8567096590995789, "learning_rate": 9.974672044369732e-05, "loss": 0.125, "step": 1610 }, { "grad_norm": 1.0884920358657837, "learning_rate": 9.97383415906693e-05, "loss": 0.115, "step": 1620 }, { "grad_norm": 1.0339338779449463, "learning_rate": 9.97298267549146e-05, "loss": 0.1386, "step": 1630 }, { "grad_norm": 0.9121850728988647, "learning_rate": 9.972117595971249e-05, "loss": 0.1249, "step": 1640 }, { "grad_norm": 0.9620202779769897, "learning_rate": 9.971238922871391e-05, "loss": 0.1322, "step": 1650 }, { "grad_norm": 1.0946760177612305, "learning_rate": 9.970346658594142e-05, "loss": 0.1123, "step": 1660 }, { "grad_norm": 0.9470517635345459, "learning_rate": 9.969440805578923e-05, "loss": 0.1308, "step": 1670 }, { "grad_norm": 0.7607911229133606, "learning_rate": 9.968521366302298e-05, "loss": 0.1212, "step": 1680 }, { "grad_norm": 1.0321109294891357, "learning_rate": 9.967588343277981e-05, "loss": 0.1351, "step": 1690 }, { "grad_norm": 1.2910419702529907, "learning_rate": 9.966641739056818e-05, "loss": 0.1594, "step": 1700 }, { "grad_norm": 0.7687619924545288, "learning_rate": 9.965681556226793e-05, "loss": 0.1442, "step": 1710 }, { "grad_norm": 0.882544755935669, "learning_rate": 9.964707797413006e-05, "loss": 0.1131, "step": 1720 }, { "grad_norm": 1.0736138820648193, "learning_rate": 9.963720465277679e-05, "loss": 0.0997, "step": 1730 }, { "grad_norm": 1.0465947389602661, "learning_rate": 9.96271956252014e-05, "loss": 0.1114, "step": 1740 }, { "grad_norm": 0.8517502546310425, "learning_rate": 9.961705091876816e-05, "loss": 0.1054, "step": 1750 }, { "grad_norm": 0.8722822666168213, "learning_rate": 9.960677056121235e-05, "loss": 0.1409, "step": 1760 }, { "grad_norm": 1.3352707624435425, "learning_rate": 9.959635458064005e-05, "loss": 0.1207, "step": 1770 }, { "grad_norm": 1.0375741720199585, "learning_rate": 9.958580300552815e-05, "loss": 0.1018, "step": 1780 }, { "grad_norm": 0.9663418531417847, "learning_rate": 9.957511586472426e-05, "loss": 0.1131, "step": 1790 }, { "grad_norm": 0.9925614595413208, "learning_rate": 9.956429318744662e-05, "loss": 0.0994, "step": 1800 }, { "grad_norm": 0.9052272439002991, "learning_rate": 9.955333500328404e-05, "loss": 0.1439, "step": 1810 }, { "grad_norm": 1.1061209440231323, "learning_rate": 9.95422413421957e-05, "loss": 0.099, "step": 1820 }, { "grad_norm": 0.9718905687332153, "learning_rate": 9.953101223451133e-05, "loss": 0.1334, "step": 1830 }, { "grad_norm": 1.041150450706482, "learning_rate": 9.951964771093085e-05, "loss": 0.102, "step": 1840 }, { "grad_norm": 0.7296974658966064, "learning_rate": 9.950814780252442e-05, "loss": 0.1216, "step": 1850 }, { "grad_norm": 0.956504225730896, "learning_rate": 9.949651254073236e-05, "loss": 0.1504, "step": 1860 }, { "grad_norm": 0.9494929909706116, "learning_rate": 9.948474195736504e-05, "loss": 0.1176, "step": 1870 }, { "grad_norm": 1.073912501335144, "learning_rate": 9.947283608460277e-05, "loss": 0.0968, "step": 1880 }, { "grad_norm": 0.9738394618034363, "learning_rate": 9.946079495499577e-05, "loss": 0.1157, "step": 1890 }, { "grad_norm": 0.9605513215065002, "learning_rate": 9.944861860146401e-05, "loss": 0.122, "step": 1900 }, { "grad_norm": 0.8335412740707397, "learning_rate": 9.943630705729719e-05, "loss": 0.0999, "step": 1910 }, { "grad_norm": 0.881373405456543, "learning_rate": 9.942386035615459e-05, "loss": 0.1014, "step": 1920 }, { "grad_norm": 0.7942837476730347, "learning_rate": 9.941127853206503e-05, "loss": 0.1095, "step": 1930 }, { "grad_norm": 1.0058091878890991, "learning_rate": 9.939856161942673e-05, "loss": 0.118, "step": 1940 }, { "grad_norm": 0.8951269388198853, "learning_rate": 9.938570965300724e-05, "loss": 0.1093, "step": 1950 }, { "grad_norm": 0.8100780844688416, "learning_rate": 9.937272266794335e-05, "loss": 0.1344, "step": 1960 }, { "grad_norm": 0.8485944867134094, "learning_rate": 9.935960069974096e-05, "loss": 0.1062, "step": 1970 }, { "grad_norm": 1.0462373495101929, "learning_rate": 9.934634378427506e-05, "loss": 0.1073, "step": 1980 }, { "grad_norm": 0.7918229103088379, "learning_rate": 9.933295195778954e-05, "loss": 0.0865, "step": 1990 }, { "eval/loss": 0.10448359854519368, "step": 2000 }, { "grad_norm": 1.0324554443359375, "learning_rate": 9.931942525689715e-05, "loss": 0.1235, "step": 2000 }, { "grad_norm": 0.9461175203323364, "learning_rate": 9.930576371857936e-05, "loss": 0.1044, "step": 2010 }, { "grad_norm": 0.909123420715332, "learning_rate": 9.929196738018629e-05, "loss": 0.1025, "step": 2020 }, { "grad_norm": 0.777509868144989, "learning_rate": 9.927803627943662e-05, "loss": 0.1044, "step": 2030 }, { "grad_norm": 1.0725702047348022, "learning_rate": 9.926397045441744e-05, "loss": 0.1066, "step": 2040 }, { "grad_norm": 0.9734055399894714, "learning_rate": 9.924976994358417e-05, "loss": 0.1145, "step": 2050 }, { "grad_norm": 0.9912729859352112, "learning_rate": 9.923543478576048e-05, "loss": 0.0964, "step": 2060 }, { "grad_norm": 0.7638317346572876, "learning_rate": 9.922096502013813e-05, "loss": 0.0896, "step": 2070 }, { "grad_norm": 0.8764864802360535, "learning_rate": 9.92063606862769e-05, "loss": 0.0934, "step": 2080 }, { "grad_norm": 0.8820127248764038, "learning_rate": 9.919162182410453e-05, "loss": 0.1327, "step": 2090 }, { "grad_norm": 0.9800596237182617, "learning_rate": 9.917674847391645e-05, "loss": 0.0862, "step": 2100 }, { "grad_norm": 0.844849705696106, "learning_rate": 9.916174067637584e-05, "loss": 0.1044, "step": 2110 }, { "grad_norm": 0.8713732957839966, "learning_rate": 9.914659847251348e-05, "loss": 0.0839, "step": 2120 }, { "grad_norm": 0.8485847115516663, "learning_rate": 9.913132190372753e-05, "loss": 0.1138, "step": 2130 }, { "grad_norm": 0.8496665358543396, "learning_rate": 9.911591101178359e-05, "loss": 0.097, "step": 2140 }, { "grad_norm": 0.936180830001831, "learning_rate": 9.910036583881443e-05, "loss": 0.1023, "step": 2150 }, { "grad_norm": 0.9755275249481201, "learning_rate": 9.908468642731995e-05, "loss": 0.1117, "step": 2160 }, { "grad_norm": 0.9466411471366882, "learning_rate": 9.906887282016707e-05, "loss": 0.1055, "step": 2170 }, { "grad_norm": 0.8077055811882019, "learning_rate": 9.90529250605896e-05, "loss": 0.1113, "step": 2180 }, { "grad_norm": 0.8465186953544617, "learning_rate": 9.903684319218809e-05, "loss": 0.1205, "step": 2190 }, { "grad_norm": 0.7864586114883423, "learning_rate": 9.902062725892976e-05, "loss": 0.1043, "step": 2200 }, { "grad_norm": 0.8424517512321472, "learning_rate": 9.900427730514834e-05, "loss": 0.1116, "step": 2210 }, { "grad_norm": 0.9575174450874329, "learning_rate": 9.8987793375544e-05, "loss": 0.1438, "step": 2220 }, { "grad_norm": 0.8532448410987854, "learning_rate": 9.897117551518318e-05, "loss": 0.1122, "step": 2230 }, { "grad_norm": 0.994422435760498, "learning_rate": 9.895442376949844e-05, "loss": 0.1269, "step": 2240 }, { "grad_norm": 0.7749019861221313, "learning_rate": 9.893753818428845e-05, "loss": 0.109, "step": 2250 }, { "grad_norm": 0.914318323135376, "learning_rate": 9.892051880571773e-05, "loss": 0.0931, "step": 2260 }, { "grad_norm": 0.9681793451309204, "learning_rate": 9.890336568031663e-05, "loss": 0.0974, "step": 2270 }, { "grad_norm": 0.7713805437088013, "learning_rate": 9.888607885498113e-05, "loss": 0.1163, "step": 2280 }, { "grad_norm": 0.9047562479972839, "learning_rate": 9.886865837697275e-05, "loss": 0.0984, "step": 2290 }, { "grad_norm": 0.9030624628067017, "learning_rate": 9.88511042939184e-05, "loss": 0.088, "step": 2300 }, { "grad_norm": 0.863299548625946, "learning_rate": 9.883341665381028e-05, "loss": 0.0892, "step": 2310 }, { "grad_norm": 0.6943339705467224, "learning_rate": 9.881559550500575e-05, "loss": 0.0984, "step": 2320 }, { "grad_norm": 0.8274405002593994, "learning_rate": 9.879764089622712e-05, "loss": 0.0853, "step": 2330 }, { "grad_norm": 1.09088134765625, "learning_rate": 9.87795528765616e-05, "loss": 0.0974, "step": 2340 }, { "grad_norm": 0.7990790605545044, "learning_rate": 9.876133149546118e-05, "loss": 0.1104, "step": 2350 }, { "grad_norm": 0.7597699165344238, "learning_rate": 9.874297680274238e-05, "loss": 0.1052, "step": 2360 }, { "grad_norm": 0.850264847278595, "learning_rate": 9.872448884858624e-05, "loss": 0.1015, "step": 2370 }, { "grad_norm": 0.779171347618103, "learning_rate": 9.870586768353815e-05, "loss": 0.1039, "step": 2380 }, { "grad_norm": 0.8852418661117554, "learning_rate": 9.868711335850764e-05, "loss": 0.0949, "step": 2390 }, { "grad_norm": 0.780746579170227, "learning_rate": 9.866822592476833e-05, "loss": 0.112, "step": 2400 }, { "grad_norm": 0.9411232471466064, "learning_rate": 9.86492054339577e-05, "loss": 0.1031, "step": 2410 }, { "grad_norm": 0.8456240296363831, "learning_rate": 9.863005193807711e-05, "loss": 0.0854, "step": 2420 }, { "grad_norm": 0.960747480392456, "learning_rate": 9.861076548949143e-05, "loss": 0.0884, "step": 2430 }, { "grad_norm": 0.8031632304191589, "learning_rate": 9.859134614092912e-05, "loss": 0.1086, "step": 2440 }, { "grad_norm": 0.918484628200531, "learning_rate": 9.857179394548191e-05, "loss": 0.1083, "step": 2450 }, { "grad_norm": 0.7876137495040894, "learning_rate": 9.855210895660477e-05, "loss": 0.1088, "step": 2460 }, { "grad_norm": 0.8716227412223816, "learning_rate": 9.853229122811568e-05, "loss": 0.1157, "step": 2470 }, { "grad_norm": 0.680898904800415, "learning_rate": 9.851234081419559e-05, "loss": 0.1339, "step": 2480 }, { "grad_norm": 0.8554558753967285, "learning_rate": 9.849225776938814e-05, "loss": 0.0945, "step": 2490 }, { "eval/loss": 0.10325490295886994, "step": 2500 }, { "grad_norm": 0.82692551612854, "learning_rate": 9.847204214859964e-05, "loss": 0.0895, "step": 2500 }, { "grad_norm": 0.7433873414993286, "learning_rate": 9.845169400709879e-05, "loss": 0.0943, "step": 2510 }, { "grad_norm": 0.9095647931098938, "learning_rate": 9.843121340051664e-05, "loss": 0.0963, "step": 2520 }, { "grad_norm": 0.8508788347244263, "learning_rate": 9.841060038484641e-05, "loss": 0.1102, "step": 2530 }, { "grad_norm": 0.8506011366844177, "learning_rate": 9.838985501644328e-05, "loss": 0.1057, "step": 2540 }, { "grad_norm": 0.8930569887161255, "learning_rate": 9.83689773520243e-05, "loss": 0.0895, "step": 2550 }, { "grad_norm": 0.696980357170105, "learning_rate": 9.834796744866819e-05, "loss": 0.1297, "step": 2560 }, { "grad_norm": 0.9346020221710205, "learning_rate": 9.832682536381525e-05, "loss": 0.1123, "step": 2570 }, { "grad_norm": 0.829645037651062, "learning_rate": 9.830555115526711e-05, "loss": 0.0952, "step": 2580 }, { "grad_norm": 0.6775968074798584, "learning_rate": 9.828414488118667e-05, "loss": 0.1127, "step": 2590 }, { "grad_norm": 0.7343615293502808, "learning_rate": 9.826260660009785e-05, "loss": 0.1101, "step": 2600 }, { "grad_norm": 0.9343723058700562, "learning_rate": 9.824093637088547e-05, "loss": 0.1, "step": 2610 }, { "grad_norm": 0.8306047320365906, "learning_rate": 9.821913425279514e-05, "loss": 0.0839, "step": 2620 }, { "grad_norm": 0.8243780732154846, "learning_rate": 9.8197200305433e-05, "loss": 0.0916, "step": 2630 }, { "grad_norm": 0.6883335709571838, "learning_rate": 9.817513458876564e-05, "loss": 0.0898, "step": 2640 }, { "grad_norm": 0.8698527216911316, "learning_rate": 9.815293716311987e-05, "loss": 0.0995, "step": 2650 }, { "grad_norm": 0.6949645280838013, "learning_rate": 9.813060808918262e-05, "loss": 0.0867, "step": 2660 }, { "grad_norm": 1.0080634355545044, "learning_rate": 9.810814742800069e-05, "loss": 0.1031, "step": 2670 }, { "grad_norm": 0.8581037521362305, "learning_rate": 9.808555524098074e-05, "loss": 0.0745, "step": 2680 }, { "grad_norm": 0.7769279479980469, "learning_rate": 9.806283158988887e-05, "loss": 0.0808, "step": 2690 }, { "grad_norm": 0.6746631264686584, "learning_rate": 9.803997653685072e-05, "loss": 0.0777, "step": 2700 }, { "grad_norm": 0.848559558391571, "learning_rate": 9.801699014435112e-05, "loss": 0.0877, "step": 2710 }, { "grad_norm": 0.8104966878890991, "learning_rate": 9.799387247523398e-05, "loss": 0.0807, "step": 2720 }, { "grad_norm": 0.7445914149284363, "learning_rate": 9.797062359270215e-05, "loss": 0.0924, "step": 2730 }, { "grad_norm": 0.7468881607055664, "learning_rate": 9.794724356031715e-05, "loss": 0.1078, "step": 2740 }, { "grad_norm": 0.8936640024185181, "learning_rate": 9.792373244199913e-05, "loss": 0.0991, "step": 2750 }, { "grad_norm": 0.6333166360855103, "learning_rate": 9.790009030202658e-05, "loss": 0.0737, "step": 2760 }, { "grad_norm": 0.7722197771072388, "learning_rate": 9.78763172050362e-05, "loss": 0.0841, "step": 2770 }, { "grad_norm": 0.8541356921195984, "learning_rate": 9.785241321602274e-05, "loss": 0.0873, "step": 2780 }, { "grad_norm": 0.7936137318611145, "learning_rate": 9.782837840033879e-05, "loss": 0.0894, "step": 2790 }, { "grad_norm": 0.6794918775558472, "learning_rate": 9.780421282369461e-05, "loss": 0.0933, "step": 2800 }, { "grad_norm": 0.8522564768791199, "learning_rate": 9.777991655215797e-05, "loss": 0.0938, "step": 2810 }, { "grad_norm": 0.6844787001609802, "learning_rate": 9.775548965215394e-05, "loss": 0.0723, "step": 2820 }, { "grad_norm": 0.8117145895957947, "learning_rate": 9.773093219046474e-05, "loss": 0.0949, "step": 2830 }, { "grad_norm": 0.7425336837768555, "learning_rate": 9.770624423422954e-05, "loss": 0.1161, "step": 2840 }, { "grad_norm": 0.780025839805603, "learning_rate": 9.768142585094426e-05, "loss": 0.0932, "step": 2850 }, { "grad_norm": 0.962777853012085, "learning_rate": 9.765647710846142e-05, "loss": 0.074, "step": 2860 }, { "grad_norm": 0.85185706615448, "learning_rate": 9.763139807498991e-05, "loss": 0.0777, "step": 2870 }, { "grad_norm": 0.6887856125831604, "learning_rate": 9.760618881909487e-05, "loss": 0.0797, "step": 2880 }, { "grad_norm": 0.752108097076416, "learning_rate": 9.758084940969744e-05, "loss": 0.0902, "step": 2890 }, { "grad_norm": 0.7339804172515869, "learning_rate": 9.755537991607459e-05, "loss": 0.0835, "step": 2900 }, { "grad_norm": 0.8898280262947083, "learning_rate": 9.752978040785895e-05, "loss": 0.0949, "step": 2910 }, { "grad_norm": 0.899470329284668, "learning_rate": 9.750405095503859e-05, "loss": 0.0803, "step": 2920 }, { "grad_norm": 0.8418382406234741, "learning_rate": 9.747819162795686e-05, "loss": 0.1044, "step": 2930 }, { "grad_norm": 0.7426697015762329, "learning_rate": 9.745220249731217e-05, "loss": 0.0953, "step": 2940 }, { "grad_norm": 0.8499985933303833, "learning_rate": 9.742608363415781e-05, "loss": 0.0967, "step": 2950 }, { "grad_norm": 0.8468800783157349, "learning_rate": 9.739983510990176e-05, "loss": 0.0888, "step": 2960 }, { "grad_norm": 0.7636969089508057, "learning_rate": 9.737345699630647e-05, "loss": 0.1128, "step": 2970 }, { "grad_norm": 0.7591195106506348, "learning_rate": 9.734694936548869e-05, "loss": 0.1036, "step": 2980 }, { "grad_norm": 0.756714403629303, "learning_rate": 9.732031228991932e-05, "loss": 0.0833, "step": 2990 }, { "eval/loss": 0.08892382547259331, "step": 3000 }, { "grad_norm": 0.8152308464050293, "learning_rate": 9.729354584242302e-05, "loss": 0.0923, "step": 3000 }, { "grad_norm": 0.6225714683532715, "learning_rate": 9.726665009617832e-05, "loss": 0.0941, "step": 3010 }, { "grad_norm": 0.9567254185676575, "learning_rate": 9.723962512471714e-05, "loss": 0.116, "step": 3020 }, { "grad_norm": 0.7935624122619629, "learning_rate": 9.72124710019247e-05, "loss": 0.0843, "step": 3030 }, { "grad_norm": 0.9391128420829773, "learning_rate": 9.718518780203934e-05, "loss": 0.0871, "step": 3040 }, { "grad_norm": 0.8182015419006348, "learning_rate": 9.715777559965228e-05, "loss": 0.0915, "step": 3050 }, { "grad_norm": 0.6948622465133667, "learning_rate": 9.713023446970746e-05, "loss": 0.0814, "step": 3060 }, { "grad_norm": 0.8153758645057678, "learning_rate": 9.710256448750126e-05, "loss": 0.089, "step": 3070 }, { "grad_norm": 0.6592750549316406, "learning_rate": 9.707476572868235e-05, "loss": 0.1341, "step": 3080 }, { "grad_norm": 0.760163426399231, "learning_rate": 9.704683826925149e-05, "loss": 0.0784, "step": 3090 }, { "grad_norm": 0.6800974607467651, "learning_rate": 9.701878218556129e-05, "loss": 0.0969, "step": 3100 }, { "grad_norm": 0.7948629260063171, "learning_rate": 9.699059755431598e-05, "loss": 0.0847, "step": 3110 }, { "grad_norm": 0.7480330467224121, "learning_rate": 9.696228445257132e-05, "loss": 0.0823, "step": 3120 }, { "grad_norm": 0.8076481223106384, "learning_rate": 9.693384295773419e-05, "loss": 0.0859, "step": 3130 }, { "grad_norm": 0.9288508892059326, "learning_rate": 9.690527314756259e-05, "loss": 0.1012, "step": 3140 }, { "grad_norm": 0.833281397819519, "learning_rate": 9.687657510016527e-05, "loss": 0.0921, "step": 3150 }, { "grad_norm": 0.786363422870636, "learning_rate": 9.684774889400161e-05, "loss": 0.0769, "step": 3160 }, { "grad_norm": 0.8035467863082886, "learning_rate": 9.681879460788135e-05, "loss": 0.1043, "step": 3170 }, { "grad_norm": 0.8117609620094299, "learning_rate": 9.67897123209644e-05, "loss": 0.0946, "step": 3180 }, { "grad_norm": 0.8063046932220459, "learning_rate": 9.676050211276062e-05, "loss": 0.0901, "step": 3190 }, { "grad_norm": 0.6972727179527283, "learning_rate": 9.673116406312962e-05, "loss": 0.0719, "step": 3200 }, { "grad_norm": 0.7159572839736938, "learning_rate": 9.67016982522805e-05, "loss": 0.0821, "step": 3210 }, { "grad_norm": 0.7346596717834473, "learning_rate": 9.667210476077164e-05, "loss": 0.0872, "step": 3220 }, { "grad_norm": 0.6855632662773132, "learning_rate": 9.664238366951055e-05, "loss": 0.089, "step": 3230 }, { "grad_norm": 0.8691261410713196, "learning_rate": 9.661253505975355e-05, "loss": 0.0848, "step": 3240 }, { "grad_norm": 0.7525714039802551, "learning_rate": 9.658255901310557e-05, "loss": 0.0898, "step": 3250 }, { "grad_norm": 0.7712537050247192, "learning_rate": 9.655245561152e-05, "loss": 0.0661, "step": 3260 }, { "grad_norm": 0.8987488746643066, "learning_rate": 9.65222249372984e-05, "loss": 0.0946, "step": 3270 }, { "grad_norm": 0.7688019871711731, "learning_rate": 9.649186707309026e-05, "loss": 0.1034, "step": 3280 }, { "grad_norm": 0.8300652503967285, "learning_rate": 9.646138210189283e-05, "loss": 0.0984, "step": 3290 }, { "grad_norm": 0.7578057646751404, "learning_rate": 9.643077010705087e-05, "loss": 0.0892, "step": 3300 }, { "grad_norm": 0.6529524326324463, "learning_rate": 9.640003117225637e-05, "loss": 0.0895, "step": 3310 }, { "grad_norm": 0.8418911695480347, "learning_rate": 9.636916538154846e-05, "loss": 0.0809, "step": 3320 }, { "grad_norm": 0.7712683081626892, "learning_rate": 9.633817281931296e-05, "loss": 0.0686, "step": 3330 }, { "grad_norm": 0.7736837863922119, "learning_rate": 9.630705357028242e-05, "loss": 0.0807, "step": 3340 }, { "grad_norm": 0.6950215101242065, "learning_rate": 9.627580771953563e-05, "loss": 0.0891, "step": 3350 }, { "grad_norm": 0.6912685632705688, "learning_rate": 9.624443535249759e-05, "loss": 0.0837, "step": 3360 }, { "grad_norm": 0.8387035727500916, "learning_rate": 9.621293655493913e-05, "loss": 0.0944, "step": 3370 }, { "grad_norm": 0.7013605237007141, "learning_rate": 9.618131141297675e-05, "loss": 0.0868, "step": 3380 }, { "grad_norm": 0.8324646353721619, "learning_rate": 9.614956001307242e-05, "loss": 0.0789, "step": 3390 }, { "grad_norm": 0.7250398993492126, "learning_rate": 9.611768244203321e-05, "loss": 0.0795, "step": 3400 }, { "grad_norm": 0.8336584568023682, "learning_rate": 9.60856787870112e-05, "loss": 0.0816, "step": 3410 }, { "grad_norm": 0.8211973309516907, "learning_rate": 9.605354913550318e-05, "loss": 0.089, "step": 3420 }, { "grad_norm": 0.9170548915863037, "learning_rate": 9.602129357535037e-05, "loss": 0.0747, "step": 3430 }, { "grad_norm": 0.7421762943267822, "learning_rate": 9.598891219473825e-05, "loss": 0.0767, "step": 3440 }, { "grad_norm": 0.7628731727600098, "learning_rate": 9.595640508219625e-05, "loss": 0.0889, "step": 3450 }, { "grad_norm": 0.5860986113548279, "learning_rate": 9.592377232659761e-05, "loss": 0.0737, "step": 3460 }, { "grad_norm": 0.6898937821388245, "learning_rate": 9.589101401715904e-05, "loss": 0.0938, "step": 3470 }, { "grad_norm": 0.7277891635894775, "learning_rate": 9.585813024344045e-05, "loss": 0.0868, "step": 3480 }, { "grad_norm": 0.6677371263504028, "learning_rate": 9.58251210953449e-05, "loss": 0.0785, "step": 3490 }, { "eval/loss": 0.09203485958278179, "step": 3500 }, { "grad_norm": 0.7086572051048279, "learning_rate": 9.579198666311809e-05, "loss": 0.1026, "step": 3500 }, { "grad_norm": 0.6426539421081543, "learning_rate": 9.575872703734832e-05, "loss": 0.072, "step": 3510 }, { "grad_norm": 0.705496609210968, "learning_rate": 9.572534230896611e-05, "loss": 0.0736, "step": 3520 }, { "grad_norm": 0.861102283000946, "learning_rate": 9.569183256924403e-05, "loss": 0.0712, "step": 3530 }, { "grad_norm": 0.8250711560249329, "learning_rate": 9.565819790979646e-05, "loss": 0.0907, "step": 3540 }, { "grad_norm": 0.76627117395401, "learning_rate": 9.562443842257925e-05, "loss": 0.0833, "step": 3550 }, { "grad_norm": 0.599372148513794, "learning_rate": 9.559055419988956e-05, "loss": 0.0809, "step": 3560 }, { "grad_norm": 0.8596682548522949, "learning_rate": 9.555654533436557e-05, "loss": 0.091, "step": 3570 }, { "grad_norm": 0.8261439800262451, "learning_rate": 9.552241191898621e-05, "loss": 0.0799, "step": 3580 }, { "grad_norm": 0.6706359386444092, "learning_rate": 9.548815404707092e-05, "loss": 0.0991, "step": 3590 }, { "grad_norm": 0.7521600723266602, "learning_rate": 9.545377181227942e-05, "loss": 0.0848, "step": 3600 }, { "grad_norm": 0.8199614882469177, "learning_rate": 9.541926530861145e-05, "loss": 0.0956, "step": 3610 }, { "grad_norm": 0.7299994230270386, "learning_rate": 9.538463463040645e-05, "loss": 0.0728, "step": 3620 }, { "grad_norm": 0.7606593370437622, "learning_rate": 9.534987987234337e-05, "loss": 0.0846, "step": 3630 }, { "grad_norm": 0.7459183931350708, "learning_rate": 9.53150011294404e-05, "loss": 0.0862, "step": 3640 }, { "grad_norm": 0.6445205807685852, "learning_rate": 9.527999849705471e-05, "loss": 0.0853, "step": 3650 }, { "grad_norm": 0.7891181707382202, "learning_rate": 9.524487207088213e-05, "loss": 0.0746, "step": 3660 }, { "grad_norm": 0.7532823085784912, "learning_rate": 9.520962194695698e-05, "loss": 0.0804, "step": 3670 }, { "grad_norm": 0.6685933470726013, "learning_rate": 9.517424822165175e-05, "loss": 0.0881, "step": 3680 }, { "grad_norm": 0.7136467695236206, "learning_rate": 9.513875099167685e-05, "loss": 0.0589, "step": 3690 }, { "grad_norm": 0.7880435585975647, "learning_rate": 9.510313035408035e-05, "loss": 0.0866, "step": 3700 }, { "grad_norm": 0.681725263595581, "learning_rate": 9.506738640624775e-05, "loss": 0.0821, "step": 3710 }, { "grad_norm": 0.7891600131988525, "learning_rate": 9.50315192459016e-05, "loss": 0.0673, "step": 3720 }, { "grad_norm": 0.8075012564659119, "learning_rate": 9.499552897110136e-05, "loss": 0.0909, "step": 3730 }, { "grad_norm": 0.5734759569168091, "learning_rate": 9.495941568024304e-05, "loss": 0.0831, "step": 3740 }, { "grad_norm": 0.6696135997772217, "learning_rate": 9.492317947205904e-05, "loss": 0.0726, "step": 3750 }, { "grad_norm": 0.6502518057823181, "learning_rate": 9.488682044561775e-05, "loss": 0.0813, "step": 3760 }, { "grad_norm": 0.7042556405067444, "learning_rate": 9.485033870032335e-05, "loss": 0.0871, "step": 3770 }, { "grad_norm": 0.666341245174408, "learning_rate": 9.481373433591556e-05, "loss": 0.0794, "step": 3780 }, { "grad_norm": 0.8834477663040161, "learning_rate": 9.47770074524693e-05, "loss": 0.0833, "step": 3790 }, { "grad_norm": 0.6137105226516724, "learning_rate": 9.474015815039446e-05, "loss": 0.0939, "step": 3800 }, { "grad_norm": 0.6735588312149048, "learning_rate": 9.470318653043565e-05, "loss": 0.076, "step": 3810 }, { "grad_norm": 0.7587777972221375, "learning_rate": 9.466609269367185e-05, "loss": 0.0701, "step": 3820 }, { "grad_norm": 0.6362771987915039, "learning_rate": 9.46288767415162e-05, "loss": 0.0718, "step": 3830 }, { "grad_norm": 0.7394053339958191, "learning_rate": 9.459153877571567e-05, "loss": 0.0756, "step": 3840 }, { "grad_norm": 0.7164848446846008, "learning_rate": 9.455407889835087e-05, "loss": 0.0734, "step": 3850 }, { "grad_norm": 0.6653488278388977, "learning_rate": 9.451649721183564e-05, "loss": 0.0755, "step": 3860 }, { "grad_norm": 0.7268480658531189, "learning_rate": 9.447879381891692e-05, "loss": 0.0937, "step": 3870 }, { "grad_norm": 0.6954826712608337, "learning_rate": 9.444096882267428e-05, "loss": 0.1, "step": 3880 }, { "grad_norm": 0.6395136117935181, "learning_rate": 9.440302232651988e-05, "loss": 0.0955, "step": 3890 }, { "grad_norm": 0.5530162453651428, "learning_rate": 9.436495443419795e-05, "loss": 0.0884, "step": 3900 }, { "grad_norm": 0.6270701885223389, "learning_rate": 9.432676524978466e-05, "loss": 0.0939, "step": 3910 }, { "grad_norm": 0.6683644652366638, "learning_rate": 9.42884548776878e-05, "loss": 0.0846, "step": 3920 }, { "grad_norm": 0.5946120619773865, "learning_rate": 9.425002342264646e-05, "loss": 0.0716, "step": 3930 }, { "grad_norm": 0.643402099609375, "learning_rate": 9.421147098973077e-05, "loss": 0.0779, "step": 3940 }, { "grad_norm": 0.604381263256073, "learning_rate": 9.41727976843416e-05, "loss": 0.0651, "step": 3950 }, { "grad_norm": 0.5524080991744995, "learning_rate": 9.413400361221029e-05, "loss": 0.0781, "step": 3960 }, { "grad_norm": 0.6096197366714478, "learning_rate": 9.409508887939835e-05, "loss": 0.1109, "step": 3970 }, { "grad_norm": 0.7158094048500061, "learning_rate": 9.40560535922972e-05, "loss": 0.0746, "step": 3980 }, { "grad_norm": 0.6933304667472839, "learning_rate": 9.40168978576278e-05, "loss": 0.0624, "step": 3990 }, { "eval/loss": 0.09422188766300678, "step": 4000 }, { "grad_norm": 0.690434455871582, "learning_rate": 9.397762178244043e-05, "loss": 0.0626, "step": 4000 }, { "grad_norm": 0.6562466621398926, "learning_rate": 9.393822547411439e-05, "loss": 0.0777, "step": 4010 }, { "grad_norm": 0.7862213253974915, "learning_rate": 9.389870904035769e-05, "loss": 0.0967, "step": 4020 }, { "grad_norm": 0.6593216061592102, "learning_rate": 9.385907258920672e-05, "loss": 0.0841, "step": 4030 }, { "grad_norm": 0.8023529052734375, "learning_rate": 9.381931622902607e-05, "loss": 0.081, "step": 4040 }, { "grad_norm": 0.7804498076438904, "learning_rate": 9.377944006850807e-05, "loss": 0.0929, "step": 4050 }, { "grad_norm": 0.7192986011505127, "learning_rate": 9.373944421667265e-05, "loss": 0.0822, "step": 4060 }, { "grad_norm": 0.621648907661438, "learning_rate": 9.369932878286691e-05, "loss": 0.0744, "step": 4070 }, { "grad_norm": 0.7035024762153625, "learning_rate": 9.365909387676494e-05, "loss": 0.0845, "step": 4080 }, { "grad_norm": 0.7409992814064026, "learning_rate": 9.361873960836744e-05, "loss": 0.0848, "step": 4090 }, { "grad_norm": 0.6578088998794556, "learning_rate": 9.357826608800142e-05, "loss": 0.0796, "step": 4100 }, { "grad_norm": 0.8029019236564636, "learning_rate": 9.353767342631994e-05, "loss": 0.1066, "step": 4110 }, { "grad_norm": 0.6449981927871704, "learning_rate": 9.34969617343018e-05, "loss": 0.0828, "step": 4120 }, { "grad_norm": 0.6862790584564209, "learning_rate": 9.345613112325122e-05, "loss": 0.08, "step": 4130 }, { "grad_norm": 0.7120580673217773, "learning_rate": 9.34151817047975e-05, "loss": 0.097, "step": 4140 }, { "grad_norm": 0.5801737308502197, "learning_rate": 9.33741135908948e-05, "loss": 0.0925, "step": 4150 }, { "grad_norm": 0.6498845219612122, "learning_rate": 9.33329268938218e-05, "loss": 0.0738, "step": 4160 }, { "grad_norm": 0.6008553504943848, "learning_rate": 9.329162172618132e-05, "loss": 0.0951, "step": 4170 }, { "grad_norm": 0.6471865177154541, "learning_rate": 9.325019820090013e-05, "loss": 0.0798, "step": 4180 }, { "grad_norm": 0.5920547842979431, "learning_rate": 9.320865643122855e-05, "loss": 0.0843, "step": 4190 }, { "grad_norm": 0.672496497631073, "learning_rate": 9.316699653074023e-05, "loss": 0.08, "step": 4200 }, { "grad_norm": 0.6855069398880005, "learning_rate": 9.312521861333172e-05, "loss": 0.085, "step": 4210 }, { "grad_norm": 0.8400130271911621, "learning_rate": 9.308332279322224e-05, "loss": 0.0698, "step": 4220 }, { "grad_norm": 0.7486523389816284, "learning_rate": 9.304130918495338e-05, "loss": 0.0915, "step": 4230 }, { "grad_norm": 0.619810163974762, "learning_rate": 9.299917790338874e-05, "loss": 0.0884, "step": 4240 }, { "grad_norm": 0.7251216173171997, "learning_rate": 9.295692906371363e-05, "loss": 0.0703, "step": 4250 }, { "grad_norm": 0.7287355065345764, "learning_rate": 9.291456278143476e-05, "loss": 0.083, "step": 4260 }, { "grad_norm": 0.6064874529838562, "learning_rate": 9.287207917237994e-05, "loss": 0.0807, "step": 4270 }, { "grad_norm": 0.6620915532112122, "learning_rate": 9.282947835269773e-05, "loss": 0.0823, "step": 4280 }, { "grad_norm": 0.7679476141929626, "learning_rate": 9.278676043885715e-05, "loss": 0.088, "step": 4290 }, { "grad_norm": 0.6529974341392517, "learning_rate": 9.274392554764733e-05, "loss": 0.0792, "step": 4300 }, { "grad_norm": 0.6754187941551208, "learning_rate": 9.270097379617723e-05, "loss": 0.0803, "step": 4310 }, { "grad_norm": 0.7030357718467712, "learning_rate": 9.26579053018753e-05, "loss": 0.0752, "step": 4320 }, { "grad_norm": 0.748179018497467, "learning_rate": 9.261472018248918e-05, "loss": 0.0946, "step": 4330 }, { "grad_norm": 0.6837642192840576, "learning_rate": 9.25714185560853e-05, "loss": 0.0841, "step": 4340 }, { "grad_norm": 0.7583628296852112, "learning_rate": 9.252800054104868e-05, "loss": 0.0699, "step": 4350 }, { "grad_norm": 0.7082024812698364, "learning_rate": 9.248446625608252e-05, "loss": 0.0863, "step": 4360 }, { "grad_norm": 0.7911331057548523, "learning_rate": 9.244081582020789e-05, "loss": 0.0877, "step": 4370 }, { "grad_norm": 0.6207730174064636, "learning_rate": 9.239704935276339e-05, "loss": 0.0998, "step": 4380 }, { "grad_norm": 0.5901274085044861, "learning_rate": 9.235316697340489e-05, "loss": 0.0684, "step": 4390 }, { "grad_norm": 0.5889009237289429, "learning_rate": 9.230916880210512e-05, "loss": 0.0771, "step": 4400 }, { "grad_norm": 0.6842628717422485, "learning_rate": 9.226505495915342e-05, "loss": 0.0766, "step": 4410 }, { "grad_norm": 0.6028392910957336, "learning_rate": 9.222082556515536e-05, "loss": 0.0872, "step": 4420 }, { "grad_norm": 0.5874050259590149, "learning_rate": 9.217648074103242e-05, "loss": 0.0767, "step": 4430 }, { "grad_norm": 0.6363229751586914, "learning_rate": 9.213202060802161e-05, "loss": 0.0752, "step": 4440 }, { "grad_norm": 0.7535254955291748, "learning_rate": 9.208744528767528e-05, "loss": 0.0726, "step": 4450 }, { "grad_norm": 0.6533192992210388, "learning_rate": 9.204275490186064e-05, "loss": 0.0655, "step": 4460 }, { "grad_norm": 0.6523351073265076, "learning_rate": 9.199794957275949e-05, "loss": 0.0724, "step": 4470 }, { "grad_norm": 0.6989829540252686, "learning_rate": 9.19530294228679e-05, "loss": 0.0739, "step": 4480 }, { "grad_norm": 0.7622507214546204, "learning_rate": 9.190799457499583e-05, "loss": 0.0953, "step": 4490 }, { "eval/loss": 0.09806343361735344, "step": 4500 }, { "grad_norm": 0.8140520453453064, "learning_rate": 9.186284515226686e-05, "loss": 0.0715, "step": 4500 }, { "grad_norm": 0.6975813508033752, "learning_rate": 9.181758127811777e-05, "loss": 0.0878, "step": 4510 }, { "grad_norm": 0.7301797270774841, "learning_rate": 9.177220307629825e-05, "loss": 0.0726, "step": 4520 }, { "grad_norm": 0.8559888005256653, "learning_rate": 9.172671067087059e-05, "loss": 0.0875, "step": 4530 }, { "grad_norm": 0.6034440994262695, "learning_rate": 9.16811041862093e-05, "loss": 0.0769, "step": 4540 }, { "grad_norm": 0.8451995253562927, "learning_rate": 9.163538374700076e-05, "loss": 0.0919, "step": 4550 }, { "grad_norm": 0.7010396718978882, "learning_rate": 9.158954947824287e-05, "loss": 0.0864, "step": 4560 }, { "grad_norm": 0.765733540058136, "learning_rate": 9.154360150524482e-05, "loss": 0.0822, "step": 4570 }, { "grad_norm": 0.6512400507926941, "learning_rate": 9.14975399536266e-05, "loss": 0.0655, "step": 4580 }, { "grad_norm": 0.619882345199585, "learning_rate": 9.14513649493187e-05, "loss": 0.0743, "step": 4590 }, { "grad_norm": 0.6765198111534119, "learning_rate": 9.140507661856187e-05, "loss": 0.0779, "step": 4600 }, { "grad_norm": 0.5520381927490234, "learning_rate": 9.135867508790661e-05, "loss": 0.068, "step": 4610 }, { "grad_norm": 0.6381470561027527, "learning_rate": 9.131216048421291e-05, "loss": 0.095, "step": 4620 }, { "grad_norm": 0.6084108948707581, "learning_rate": 9.126553293464998e-05, "loss": 0.0855, "step": 4630 }, { "grad_norm": 0.6895321607589722, "learning_rate": 9.121879256669572e-05, "loss": 0.0674, "step": 4640 }, { "grad_norm": 0.7201507091522217, "learning_rate": 9.117193950813652e-05, "loss": 0.072, "step": 4650 }, { "grad_norm": 0.6582661271095276, "learning_rate": 9.112497388706685e-05, "loss": 0.0912, "step": 4660 }, { "grad_norm": 0.6580666899681091, "learning_rate": 9.10778958318889e-05, "loss": 0.0973, "step": 4670 }, { "grad_norm": 0.8242357969284058, "learning_rate": 9.103070547131232e-05, "loss": 0.0891, "step": 4680 }, { "grad_norm": 0.6375697255134583, "learning_rate": 9.098340293435375e-05, "loss": 0.0804, "step": 4690 }, { "grad_norm": 0.5634434819221497, "learning_rate": 9.093598835033649e-05, "loss": 0.0701, "step": 4700 }, { "grad_norm": 0.6834983229637146, "learning_rate": 9.088846184889021e-05, "loss": 0.1067, "step": 4710 }, { "grad_norm": 0.6781161427497864, "learning_rate": 9.084082355995057e-05, "loss": 0.0644, "step": 4720 }, { "grad_norm": 0.6638842225074768, "learning_rate": 9.079307361375882e-05, "loss": 0.0663, "step": 4730 }, { "grad_norm": 0.6190152168273926, "learning_rate": 9.074521214086149e-05, "loss": 0.0981, "step": 4740 }, { "grad_norm": 0.6870885491371155, "learning_rate": 9.069723927211001e-05, "loss": 0.0627, "step": 4750 }, { "grad_norm": 0.7301667928695679, "learning_rate": 9.064915513866037e-05, "loss": 0.0849, "step": 4760 }, { "grad_norm": 0.6799473166465759, "learning_rate": 9.060095987197279e-05, "loss": 0.0717, "step": 4770 }, { "grad_norm": 0.708286702632904, "learning_rate": 9.055265360381126e-05, "loss": 0.0706, "step": 4780 }, { "grad_norm": 0.6627357006072998, "learning_rate": 9.050423646624326e-05, "loss": 0.0802, "step": 4790 }, { "grad_norm": 0.5568764805793762, "learning_rate": 9.045570859163943e-05, "loss": 0.0928, "step": 4800 }, { "grad_norm": 0.6857120394706726, "learning_rate": 9.04070701126731e-05, "loss": 0.0761, "step": 4810 }, { "grad_norm": 0.6400876641273499, "learning_rate": 9.035832116232001e-05, "loss": 0.0757, "step": 4820 }, { "grad_norm": 0.7173035740852356, "learning_rate": 9.030946187385796e-05, "loss": 0.0761, "step": 4830 }, { "grad_norm": 0.5821941494941711, "learning_rate": 9.026049238086635e-05, "loss": 0.0786, "step": 4840 }, { "grad_norm": 0.6988785862922668, "learning_rate": 9.021141281722591e-05, "loss": 0.0672, "step": 4850 }, { "grad_norm": 0.6213851571083069, "learning_rate": 9.01622233171183e-05, "loss": 0.0638, "step": 4860 }, { "grad_norm": 0.5618359446525574, "learning_rate": 9.011292401502574e-05, "loss": 0.0885, "step": 4870 }, { "grad_norm": 0.5900238752365112, "learning_rate": 9.006351504573063e-05, "loss": 0.0781, "step": 4880 }, { "grad_norm": 0.6234111785888672, "learning_rate": 9.001399654431519e-05, "loss": 0.0765, "step": 4890 }, { "grad_norm": 0.6283660531044006, "learning_rate": 8.996436864616116e-05, "loss": 0.0619, "step": 4900 }, { "grad_norm": 0.7293394207954407, "learning_rate": 8.991463148694925e-05, "loss": 0.0773, "step": 4910 }, { "grad_norm": 0.6040658950805664, "learning_rate": 8.986478520265902e-05, "loss": 0.0632, "step": 4920 }, { "grad_norm": 0.6519870758056641, "learning_rate": 8.981482992956827e-05, "loss": 0.0616, "step": 4930 }, { "grad_norm": 0.6233829855918884, "learning_rate": 8.976476580425282e-05, "loss": 0.0746, "step": 4940 }, { "grad_norm": 0.7341985702514648, "learning_rate": 8.971459296358606e-05, "loss": 0.071, "step": 4950 }, { "grad_norm": 0.6652783155441284, "learning_rate": 8.966431154473864e-05, "loss": 0.0664, "step": 4960 }, { "grad_norm": 0.6612856984138489, "learning_rate": 8.961392168517803e-05, "loss": 0.0728, "step": 4970 }, { "grad_norm": 0.6548994779586792, "learning_rate": 8.956342352266821e-05, "loss": 0.0764, "step": 4980 }, { "grad_norm": 0.7572376132011414, "learning_rate": 8.95128171952692e-05, "loss": 0.0875, "step": 4990 }, { "eval/loss": 0.08772104680538177, "step": 5000 }, { "grad_norm": 0.7272939682006836, "learning_rate": 8.946210284133676e-05, "loss": 0.0566, "step": 5000 }, { "grad_norm": 0.5917906761169434, "learning_rate": 8.941128059952201e-05, "loss": 0.074, "step": 5010 }, { "grad_norm": 0.6331184506416321, "learning_rate": 8.936035060877102e-05, "loss": 0.0649, "step": 5020 }, { "grad_norm": 0.5847299695014954, "learning_rate": 8.930931300832443e-05, "loss": 0.0692, "step": 5030 }, { "grad_norm": 0.8006874322891235, "learning_rate": 8.925816793771711e-05, "loss": 0.0679, "step": 5040 }, { "grad_norm": 0.6630862951278687, "learning_rate": 8.92069155367777e-05, "loss": 0.0706, "step": 5050 }, { "grad_norm": 0.6758430600166321, "learning_rate": 8.915555594562834e-05, "loss": 0.0747, "step": 5060 }, { "grad_norm": 0.6669508218765259, "learning_rate": 8.910408930468416e-05, "loss": 0.0799, "step": 5070 }, { "grad_norm": 0.763920783996582, "learning_rate": 8.905251575465303e-05, "loss": 0.088, "step": 5080 }, { "grad_norm": 0.5095866322517395, "learning_rate": 8.900083543653502e-05, "loss": 0.0687, "step": 5090 }, { "grad_norm": 0.5406675934791565, "learning_rate": 8.894904849162218e-05, "loss": 0.0893, "step": 5100 }, { "grad_norm": 0.6344950795173645, "learning_rate": 8.889715506149802e-05, "loss": 0.0698, "step": 5110 }, { "grad_norm": 0.6500484943389893, "learning_rate": 8.884515528803722e-05, "loss": 0.0651, "step": 5120 }, { "grad_norm": 0.646142840385437, "learning_rate": 8.879304931340517e-05, "loss": 0.0607, "step": 5130 }, { "grad_norm": 0.7101523876190186, "learning_rate": 8.874083728005759e-05, "loss": 0.0665, "step": 5140 }, { "grad_norm": 0.7680063843727112, "learning_rate": 8.868851933074021e-05, "loss": 0.0925, "step": 5150 }, { "grad_norm": 0.5732299089431763, "learning_rate": 8.863609560848829e-05, "loss": 0.0754, "step": 5160 }, { "grad_norm": 0.6689339280128479, "learning_rate": 8.85835662566263e-05, "loss": 0.0703, "step": 5170 }, { "grad_norm": 0.7001439929008484, "learning_rate": 8.853093141876747e-05, "loss": 0.0821, "step": 5180 }, { "grad_norm": 0.6817865371704102, "learning_rate": 8.847819123881343e-05, "loss": 0.0772, "step": 5190 }, { "grad_norm": 0.6189659237861633, "learning_rate": 8.842534586095383e-05, "loss": 0.0658, "step": 5200 }, { "grad_norm": 0.6852701902389526, "learning_rate": 8.837239542966593e-05, "loss": 0.0815, "step": 5210 }, { "grad_norm": 0.5887537598609924, "learning_rate": 8.831934008971417e-05, "loss": 0.0699, "step": 5220 }, { "grad_norm": 0.6554272174835205, "learning_rate": 8.826617998614982e-05, "loss": 0.0734, "step": 5230 }, { "grad_norm": 0.6582019925117493, "learning_rate": 8.821291526431056e-05, "loss": 0.0929, "step": 5240 }, { "grad_norm": 0.5610940456390381, "learning_rate": 8.815954606982015e-05, "loss": 0.0704, "step": 5250 }, { "grad_norm": 0.6148430109024048, "learning_rate": 8.810607254858789e-05, "loss": 0.0653, "step": 5260 }, { "grad_norm": 0.5456112623214722, "learning_rate": 8.805249484680838e-05, "loss": 0.0789, "step": 5270 }, { "grad_norm": 0.6438539028167725, "learning_rate": 8.799881311096096e-05, "loss": 0.0835, "step": 5280 }, { "grad_norm": 0.5814638733863831, "learning_rate": 8.794502748780949e-05, "loss": 0.064, "step": 5290 }, { "grad_norm": 0.5223795175552368, "learning_rate": 8.78911381244018e-05, "loss": 0.0812, "step": 5300 }, { "grad_norm": 0.5664160847663879, "learning_rate": 8.783714516806933e-05, "loss": 0.0701, "step": 5310 }, { "grad_norm": 0.6536245942115784, "learning_rate": 8.77830487664268e-05, "loss": 0.073, "step": 5320 }, { "grad_norm": 0.6739161610603333, "learning_rate": 8.772884906737167e-05, "loss": 0.0774, "step": 5330 }, { "grad_norm": 0.6405242085456848, "learning_rate": 8.767454621908387e-05, "loss": 0.0667, "step": 5340 }, { "grad_norm": 0.6425337195396423, "learning_rate": 8.76201403700253e-05, "loss": 0.0644, "step": 5350 }, { "grad_norm": 0.5212625861167908, "learning_rate": 8.756563166893949e-05, "loss": 0.0762, "step": 5360 }, { "grad_norm": 0.5699160099029541, "learning_rate": 8.751102026485113e-05, "loss": 0.0725, "step": 5370 }, { "grad_norm": 0.6521324515342712, "learning_rate": 8.745630630706571e-05, "loss": 0.0865, "step": 5380 }, { "grad_norm": 0.5155470967292786, "learning_rate": 8.740148994516912e-05, "loss": 0.0748, "step": 5390 }, { "grad_norm": 0.5227314829826355, "learning_rate": 8.73465713290272e-05, "loss": 0.087, "step": 5400 }, { "grad_norm": 0.7070881724357605, "learning_rate": 8.729155060878533e-05, "loss": 0.0879, "step": 5410 }, { "grad_norm": 0.6618897318840027, "learning_rate": 8.723642793486809e-05, "loss": 0.0605, "step": 5420 }, { "grad_norm": 0.6139546036720276, "learning_rate": 8.718120345797873e-05, "loss": 0.0671, "step": 5430 }, { "grad_norm": 0.6994019150733948, "learning_rate": 8.712587732909889e-05, "loss": 0.0733, "step": 5440 }, { "grad_norm": 0.6827563643455505, "learning_rate": 8.707044969948806e-05, "loss": 0.0711, "step": 5450 }, { "grad_norm": 0.520524263381958, "learning_rate": 8.701492072068329e-05, "loss": 0.0621, "step": 5460 }, { "grad_norm": 0.7971377968788147, "learning_rate": 8.695929054449869e-05, "loss": 0.0807, "step": 5470 }, { "grad_norm": 0.7996975779533386, "learning_rate": 8.690355932302501e-05, "loss": 0.0882, "step": 5480 }, { "grad_norm": 0.7133775353431702, "learning_rate": 8.684772720862931e-05, "loss": 0.0852, "step": 5490 }, { "eval/loss": 0.09252125598490238, "step": 5500 }, { "grad_norm": 0.5511888861656189, "learning_rate": 8.679179435395446e-05, "loss": 0.096, "step": 5500 }, { "grad_norm": 0.6047679781913757, "learning_rate": 8.673576091191874e-05, "loss": 0.0626, "step": 5510 }, { "grad_norm": 0.5799412131309509, "learning_rate": 8.667962703571541e-05, "loss": 0.0729, "step": 5520 }, { "grad_norm": 0.6674461960792542, "learning_rate": 8.662339287881238e-05, "loss": 0.0865, "step": 5530 }, { "grad_norm": 0.7157339453697205, "learning_rate": 8.656705859495169e-05, "loss": 0.0822, "step": 5540 }, { "grad_norm": 0.7077901363372803, "learning_rate": 8.651062433814912e-05, "loss": 0.0875, "step": 5550 }, { "grad_norm": 0.7879642248153687, "learning_rate": 8.645409026269375e-05, "loss": 0.06, "step": 5560 }, { "grad_norm": 0.6865609288215637, "learning_rate": 8.639745652314759e-05, "loss": 0.0698, "step": 5570 }, { "grad_norm": 0.7282442450523376, "learning_rate": 8.634072327434515e-05, "loss": 0.0636, "step": 5580 }, { "grad_norm": 0.6840656995773315, "learning_rate": 8.628389067139294e-05, "loss": 0.0733, "step": 5590 }, { "grad_norm": 0.5570817589759827, "learning_rate": 8.622695886966911e-05, "loss": 0.1039, "step": 5600 }, { "grad_norm": 0.5252546668052673, "learning_rate": 8.616992802482308e-05, "loss": 0.1104, "step": 5610 }, { "grad_norm": 0.7306527495384216, "learning_rate": 8.611279829277496e-05, "loss": 0.0747, "step": 5620 }, { "grad_norm": 0.5882008671760559, "learning_rate": 8.605556982971528e-05, "loss": 0.0618, "step": 5630 }, { "grad_norm": 0.8575307130813599, "learning_rate": 8.599824279210447e-05, "loss": 0.0909, "step": 5640 }, { "grad_norm": 0.5907726287841797, "learning_rate": 8.594081733667243e-05, "loss": 0.0667, "step": 5650 }, { "grad_norm": 0.6109846830368042, "learning_rate": 8.58832936204182e-05, "loss": 0.0871, "step": 5660 }, { "grad_norm": 0.6820570230484009, "learning_rate": 8.582567180060942e-05, "loss": 0.0572, "step": 5670 }, { "grad_norm": 0.5582043528556824, "learning_rate": 8.576795203478194e-05, "loss": 0.0732, "step": 5680 }, { "grad_norm": 0.608518123626709, "learning_rate": 8.571013448073939e-05, "loss": 0.0793, "step": 5690 }, { "grad_norm": 0.5402862429618835, "learning_rate": 8.565221929655275e-05, "loss": 0.074, "step": 5700 }, { "grad_norm": 0.5287830233573914, "learning_rate": 8.559420664055992e-05, "loss": 0.0572, "step": 5710 }, { "grad_norm": 0.5307794213294983, "learning_rate": 8.553609667136532e-05, "loss": 0.0658, "step": 5720 }, { "grad_norm": 0.6246611475944519, "learning_rate": 8.547788954783936e-05, "loss": 0.0707, "step": 5730 }, { "grad_norm": 0.7108138203620911, "learning_rate": 8.541958542911808e-05, "loss": 0.0676, "step": 5740 }, { "grad_norm": 0.6190049648284912, "learning_rate": 8.536118447460275e-05, "loss": 0.0651, "step": 5750 }, { "grad_norm": 0.6700112223625183, "learning_rate": 8.530268684395932e-05, "loss": 0.0664, "step": 5760 }, { "grad_norm": 0.7006531953811646, "learning_rate": 8.524409269711807e-05, "loss": 0.078, "step": 5770 }, { "grad_norm": 0.7312558889389038, "learning_rate": 8.51854021942732e-05, "loss": 0.0602, "step": 5780 }, { "grad_norm": 0.6549461483955383, "learning_rate": 8.512661549588227e-05, "loss": 0.0801, "step": 5790 }, { "grad_norm": 0.6290611028671265, "learning_rate": 8.506773276266588e-05, "loss": 0.074, "step": 5800 }, { "grad_norm": 0.6408827900886536, "learning_rate": 8.500875415560721e-05, "loss": 0.078, "step": 5810 }, { "grad_norm": 0.6235644817352295, "learning_rate": 8.494967983595144e-05, "loss": 0.0645, "step": 5820 }, { "grad_norm": 0.6140525937080383, "learning_rate": 8.489050996520558e-05, "loss": 0.0703, "step": 5830 }, { "grad_norm": 0.7021300196647644, "learning_rate": 8.483124470513775e-05, "loss": 0.08, "step": 5840 }, { "grad_norm": 0.6813769340515137, "learning_rate": 8.477188421777692e-05, "loss": 0.0792, "step": 5850 }, { "grad_norm": 0.5469093918800354, "learning_rate": 8.47124286654124e-05, "loss": 0.0681, "step": 5860 }, { "grad_norm": 0.618033766746521, "learning_rate": 8.465287821059341e-05, "loss": 0.0609, "step": 5870 }, { "grad_norm": 0.44991111755371094, "learning_rate": 8.45932330161286e-05, "loss": 0.0692, "step": 5880 }, { "grad_norm": 0.6399261951446533, "learning_rate": 8.453349324508567e-05, "loss": 0.0615, "step": 5890 }, { "grad_norm": 0.6941473484039307, "learning_rate": 8.447365906079088e-05, "loss": 0.0676, "step": 5900 }, { "grad_norm": 0.6344639658927917, "learning_rate": 8.441373062682856e-05, "loss": 0.065, "step": 5910 }, { "grad_norm": 0.6065757274627686, "learning_rate": 8.43537081070408e-05, "loss": 0.0687, "step": 5920 }, { "grad_norm": 0.6268478035926819, "learning_rate": 8.429359166552689e-05, "loss": 0.0603, "step": 5930 }, { "grad_norm": 0.7667990922927856, "learning_rate": 8.423338146664284e-05, "loss": 0.0865, "step": 5940 }, { "grad_norm": 0.5977605581283569, "learning_rate": 8.417307767500107e-05, "loss": 0.0561, "step": 5950 }, { "grad_norm": 0.5928356647491455, "learning_rate": 8.411268045546983e-05, "loss": 0.0616, "step": 5960 }, { "grad_norm": 0.5978808999061584, "learning_rate": 8.405218997317281e-05, "loss": 0.0614, "step": 5970 }, { "grad_norm": 0.5177515745162964, "learning_rate": 8.399160639348869e-05, "loss": 0.0628, "step": 5980 }, { "grad_norm": 0.68746417760849, "learning_rate": 8.393092988205065e-05, "loss": 0.0789, "step": 5990 }, { "eval/loss": 0.0932412301003933, "step": 6000 }, { "grad_norm": 0.6368496417999268, "learning_rate": 8.387016060474597e-05, "loss": 0.0581, "step": 6000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }