| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.1, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "grad_norm": 0.16940702497959137, |
| "learning_rate": 9e-07, |
| "loss": 1.208, |
| "step": 10 |
| }, |
| { |
| "grad_norm": 0.14846493303775787, |
| "learning_rate": 1.9e-06, |
| "loss": 1.207, |
| "step": 20 |
| }, |
| { |
| "grad_norm": 0.1575060486793518, |
| "learning_rate": 2.9e-06, |
| "loss": 1.2031, |
| "step": 30 |
| }, |
| { |
| "grad_norm": 0.18646641075611115, |
| "learning_rate": 3.9e-06, |
| "loss": 1.19, |
| "step": 40 |
| }, |
| { |
| "grad_norm": 0.30270645022392273, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 1.1729, |
| "step": 50 |
| }, |
| { |
| "grad_norm": 0.39245566725730896, |
| "learning_rate": 5.9e-06, |
| "loss": 1.1582, |
| "step": 60 |
| }, |
| { |
| "grad_norm": 0.44334590435028076, |
| "learning_rate": 6.900000000000001e-06, |
| "loss": 1.1293, |
| "step": 70 |
| }, |
| { |
| "grad_norm": 0.30264705419540405, |
| "learning_rate": 7.9e-06, |
| "loss": 1.1102, |
| "step": 80 |
| }, |
| { |
| "grad_norm": 0.24947984516620636, |
| "learning_rate": 8.9e-06, |
| "loss": 1.1029, |
| "step": 90 |
| }, |
| { |
| "grad_norm": 0.3442651927471161, |
| "learning_rate": 9.900000000000002e-06, |
| "loss": 1.0967, |
| "step": 100 |
| }, |
| { |
| "grad_norm": 0.2968611717224121, |
| "learning_rate": 1.09e-05, |
| "loss": 1.092, |
| "step": 110 |
| }, |
| { |
| "grad_norm": 0.8635988235473633, |
| "learning_rate": 1.19e-05, |
| "loss": 1.0971, |
| "step": 120 |
| }, |
| { |
| "grad_norm": 0.549736738204956, |
| "learning_rate": 1.29e-05, |
| "loss": 1.0877, |
| "step": 130 |
| }, |
| { |
| "grad_norm": 0.3879706859588623, |
| "learning_rate": 1.3900000000000002e-05, |
| "loss": 1.0965, |
| "step": 140 |
| }, |
| { |
| "grad_norm": 0.8023251295089722, |
| "learning_rate": 1.49e-05, |
| "loss": 1.0902, |
| "step": 150 |
| }, |
| { |
| "grad_norm": 0.3089507222175598, |
| "learning_rate": 1.59e-05, |
| "loss": 1.0938, |
| "step": 160 |
| }, |
| { |
| "grad_norm": 0.2845019996166229, |
| "learning_rate": 1.69e-05, |
| "loss": 1.0857, |
| "step": 170 |
| }, |
| { |
| "grad_norm": 0.5286039710044861, |
| "learning_rate": 1.79e-05, |
| "loss": 1.0811, |
| "step": 180 |
| }, |
| { |
| "grad_norm": 0.3651806712150574, |
| "learning_rate": 1.8900000000000002e-05, |
| "loss": 1.0711, |
| "step": 190 |
| }, |
| { |
| "grad_norm": 0.49551165103912354, |
| "learning_rate": 1.9900000000000003e-05, |
| "loss": 1.0568, |
| "step": 200 |
| }, |
| { |
| "grad_norm": 0.48400798439979553, |
| "learning_rate": 2.09e-05, |
| "loss": 1.0261, |
| "step": 210 |
| }, |
| { |
| "grad_norm": 0.5731498599052429, |
| "learning_rate": 2.19e-05, |
| "loss": 1.0099, |
| "step": 220 |
| }, |
| { |
| "grad_norm": 0.5158259868621826, |
| "learning_rate": 2.29e-05, |
| "loss": 0.9947, |
| "step": 230 |
| }, |
| { |
| "grad_norm": 0.8482366800308228, |
| "learning_rate": 2.39e-05, |
| "loss": 0.9701, |
| "step": 240 |
| }, |
| { |
| "grad_norm": 0.668360710144043, |
| "learning_rate": 2.4900000000000002e-05, |
| "loss": 0.933, |
| "step": 250 |
| }, |
| { |
| "grad_norm": 1.0341784954071045, |
| "learning_rate": 2.5900000000000003e-05, |
| "loss": 0.9101, |
| "step": 260 |
| }, |
| { |
| "grad_norm": 0.8576267957687378, |
| "learning_rate": 2.6900000000000003e-05, |
| "loss": 0.8772, |
| "step": 270 |
| }, |
| { |
| "grad_norm": 1.177884578704834, |
| "learning_rate": 2.7900000000000004e-05, |
| "loss": 0.8447, |
| "step": 280 |
| }, |
| { |
| "grad_norm": 1.2616709470748901, |
| "learning_rate": 2.8899999999999998e-05, |
| "loss": 0.8277, |
| "step": 290 |
| }, |
| { |
| "grad_norm": 0.9310820698738098, |
| "learning_rate": 2.9900000000000002e-05, |
| "loss": 0.8179, |
| "step": 300 |
| }, |
| { |
| "grad_norm": 0.9291635751724243, |
| "learning_rate": 3.09e-05, |
| "loss": 0.7966, |
| "step": 310 |
| }, |
| { |
| "grad_norm": 0.9610940217971802, |
| "learning_rate": 3.19e-05, |
| "loss": 0.7595, |
| "step": 320 |
| }, |
| { |
| "grad_norm": 1.082502841949463, |
| "learning_rate": 3.29e-05, |
| "loss": 0.7442, |
| "step": 330 |
| }, |
| { |
| "grad_norm": 1.0246247053146362, |
| "learning_rate": 3.3900000000000004e-05, |
| "loss": 0.7153, |
| "step": 340 |
| }, |
| { |
| "grad_norm": 1.1535388231277466, |
| "learning_rate": 3.49e-05, |
| "loss": 0.7008, |
| "step": 350 |
| }, |
| { |
| "grad_norm": 1.1344460248947144, |
| "learning_rate": 3.59e-05, |
| "loss": 0.6781, |
| "step": 360 |
| }, |
| { |
| "grad_norm": 1.0874427556991577, |
| "learning_rate": 3.69e-05, |
| "loss": 0.6773, |
| "step": 370 |
| }, |
| { |
| "grad_norm": 1.1591013669967651, |
| "learning_rate": 3.79e-05, |
| "loss": 0.6567, |
| "step": 380 |
| }, |
| { |
| "grad_norm": 1.2492725849151611, |
| "learning_rate": 3.8900000000000004e-05, |
| "loss": 0.6557, |
| "step": 390 |
| }, |
| { |
| "grad_norm": 1.333844542503357, |
| "learning_rate": 3.99e-05, |
| "loss": 0.6557, |
| "step": 400 |
| }, |
| { |
| "grad_norm": 1.157532811164856, |
| "learning_rate": 4.09e-05, |
| "loss": 0.6327, |
| "step": 410 |
| }, |
| { |
| "grad_norm": 1.0811901092529297, |
| "learning_rate": 4.19e-05, |
| "loss": 0.6181, |
| "step": 420 |
| }, |
| { |
| "grad_norm": 1.211959958076477, |
| "learning_rate": 4.29e-05, |
| "loss": 0.6222, |
| "step": 430 |
| }, |
| { |
| "grad_norm": 1.1791653633117676, |
| "learning_rate": 4.39e-05, |
| "loss": 0.6018, |
| "step": 440 |
| }, |
| { |
| "grad_norm": 1.483304500579834, |
| "learning_rate": 4.49e-05, |
| "loss": 0.5883, |
| "step": 450 |
| }, |
| { |
| "grad_norm": 1.136581540107727, |
| "learning_rate": 4.5900000000000004e-05, |
| "loss": 0.5781, |
| "step": 460 |
| }, |
| { |
| "grad_norm": 1.0122281312942505, |
| "learning_rate": 4.69e-05, |
| "loss": 0.5833, |
| "step": 470 |
| }, |
| { |
| "grad_norm": 1.294203519821167, |
| "learning_rate": 4.79e-05, |
| "loss": 0.5811, |
| "step": 480 |
| }, |
| { |
| "grad_norm": 1.036759614944458, |
| "learning_rate": 4.89e-05, |
| "loss": 0.5826, |
| "step": 490 |
| }, |
| { |
| "eval/loss": 0.540949667096138, |
| "step": 500 |
| }, |
| { |
| "grad_norm": 1.4752445220947266, |
| "learning_rate": 4.99e-05, |
| "loss": 0.5589, |
| "step": 500 |
| }, |
| { |
| "grad_norm": 0.9996066093444824, |
| "learning_rate": 5.0900000000000004e-05, |
| "loss": 0.5726, |
| "step": 510 |
| }, |
| { |
| "grad_norm": 1.1314283609390259, |
| "learning_rate": 5.19e-05, |
| "loss": 0.5489, |
| "step": 520 |
| }, |
| { |
| "grad_norm": 1.0463645458221436, |
| "learning_rate": 5.2900000000000005e-05, |
| "loss": 0.537, |
| "step": 530 |
| }, |
| { |
| "grad_norm": 1.1870821714401245, |
| "learning_rate": 5.390000000000001e-05, |
| "loss": 0.5419, |
| "step": 540 |
| }, |
| { |
| "grad_norm": 1.0127766132354736, |
| "learning_rate": 5.4900000000000006e-05, |
| "loss": 0.5383, |
| "step": 550 |
| }, |
| { |
| "grad_norm": 1.1530522108078003, |
| "learning_rate": 5.590000000000001e-05, |
| "loss": 0.5255, |
| "step": 560 |
| }, |
| { |
| "grad_norm": 1.6963386535644531, |
| "learning_rate": 5.69e-05, |
| "loss": 0.5248, |
| "step": 570 |
| }, |
| { |
| "grad_norm": 1.5842453241348267, |
| "learning_rate": 5.79e-05, |
| "loss": 0.5243, |
| "step": 580 |
| }, |
| { |
| "grad_norm": 1.3649457693099976, |
| "learning_rate": 5.89e-05, |
| "loss": 0.5147, |
| "step": 590 |
| }, |
| { |
| "grad_norm": 1.018904447555542, |
| "learning_rate": 5.99e-05, |
| "loss": 0.5042, |
| "step": 600 |
| }, |
| { |
| "grad_norm": 1.252278208732605, |
| "learning_rate": 6.09e-05, |
| "loss": 0.5213, |
| "step": 610 |
| }, |
| { |
| "grad_norm": 1.2415512800216675, |
| "learning_rate": 6.19e-05, |
| "loss": 0.4769, |
| "step": 620 |
| }, |
| { |
| "grad_norm": 1.3829114437103271, |
| "learning_rate": 6.29e-05, |
| "loss": 0.4806, |
| "step": 630 |
| }, |
| { |
| "grad_norm": 1.2860313653945923, |
| "learning_rate": 6.390000000000001e-05, |
| "loss": 0.4687, |
| "step": 640 |
| }, |
| { |
| "grad_norm": 1.1453088521957397, |
| "learning_rate": 6.49e-05, |
| "loss": 0.477, |
| "step": 650 |
| }, |
| { |
| "grad_norm": 1.2535901069641113, |
| "learning_rate": 6.59e-05, |
| "loss": 0.4541, |
| "step": 660 |
| }, |
| { |
| "grad_norm": 1.2619575262069702, |
| "learning_rate": 6.690000000000001e-05, |
| "loss": 0.4565, |
| "step": 670 |
| }, |
| { |
| "grad_norm": 1.1378668546676636, |
| "learning_rate": 6.790000000000001e-05, |
| "loss": 0.4395, |
| "step": 680 |
| }, |
| { |
| "grad_norm": 1.0631095170974731, |
| "learning_rate": 6.89e-05, |
| "loss": 0.4185, |
| "step": 690 |
| }, |
| { |
| "grad_norm": 1.1509623527526855, |
| "learning_rate": 6.99e-05, |
| "loss": 0.437, |
| "step": 700 |
| }, |
| { |
| "grad_norm": 1.249911904335022, |
| "learning_rate": 7.09e-05, |
| "loss": 0.4273, |
| "step": 710 |
| }, |
| { |
| "grad_norm": 1.1548298597335815, |
| "learning_rate": 7.19e-05, |
| "loss": 0.4296, |
| "step": 720 |
| }, |
| { |
| "grad_norm": 1.0660429000854492, |
| "learning_rate": 7.29e-05, |
| "loss": 0.4438, |
| "step": 730 |
| }, |
| { |
| "grad_norm": 1.2336221933364868, |
| "learning_rate": 7.390000000000001e-05, |
| "loss": 0.407, |
| "step": 740 |
| }, |
| { |
| "grad_norm": 1.073397159576416, |
| "learning_rate": 7.49e-05, |
| "loss": 0.3936, |
| "step": 750 |
| }, |
| { |
| "grad_norm": 1.2548182010650635, |
| "learning_rate": 7.59e-05, |
| "loss": 0.3991, |
| "step": 760 |
| }, |
| { |
| "grad_norm": 1.4380117654800415, |
| "learning_rate": 7.69e-05, |
| "loss": 0.3957, |
| "step": 770 |
| }, |
| { |
| "grad_norm": 1.2932844161987305, |
| "learning_rate": 7.790000000000001e-05, |
| "loss": 0.4081, |
| "step": 780 |
| }, |
| { |
| "grad_norm": 1.1372441053390503, |
| "learning_rate": 7.890000000000001e-05, |
| "loss": 0.3812, |
| "step": 790 |
| }, |
| { |
| "grad_norm": 1.1620570421218872, |
| "learning_rate": 7.99e-05, |
| "loss": 0.3952, |
| "step": 800 |
| }, |
| { |
| "grad_norm": 1.1965490579605103, |
| "learning_rate": 8.090000000000001e-05, |
| "loss": 0.3677, |
| "step": 810 |
| }, |
| { |
| "grad_norm": 1.176527738571167, |
| "learning_rate": 8.19e-05, |
| "loss": 0.3798, |
| "step": 820 |
| }, |
| { |
| "grad_norm": 1.153993010520935, |
| "learning_rate": 8.29e-05, |
| "loss": 0.3629, |
| "step": 830 |
| }, |
| { |
| "grad_norm": 1.3327205181121826, |
| "learning_rate": 8.39e-05, |
| "loss": 0.3578, |
| "step": 840 |
| }, |
| { |
| "grad_norm": 1.1645392179489136, |
| "learning_rate": 8.49e-05, |
| "loss": 0.3542, |
| "step": 850 |
| }, |
| { |
| "grad_norm": 1.1183959245681763, |
| "learning_rate": 8.59e-05, |
| "loss": 0.3452, |
| "step": 860 |
| }, |
| { |
| "grad_norm": 1.4171571731567383, |
| "learning_rate": 8.69e-05, |
| "loss": 0.328, |
| "step": 870 |
| }, |
| { |
| "grad_norm": 1.2265501022338867, |
| "learning_rate": 8.790000000000001e-05, |
| "loss": 0.3427, |
| "step": 880 |
| }, |
| { |
| "grad_norm": 1.3434756994247437, |
| "learning_rate": 8.89e-05, |
| "loss": 0.3333, |
| "step": 890 |
| }, |
| { |
| "grad_norm": 1.3676091432571411, |
| "learning_rate": 8.99e-05, |
| "loss": 0.3142, |
| "step": 900 |
| }, |
| { |
| "grad_norm": 1.0545670986175537, |
| "learning_rate": 9.090000000000001e-05, |
| "loss": 0.3242, |
| "step": 910 |
| }, |
| { |
| "grad_norm": 1.1802937984466553, |
| "learning_rate": 9.190000000000001e-05, |
| "loss": 0.3419, |
| "step": 920 |
| }, |
| { |
| "grad_norm": 1.2357131242752075, |
| "learning_rate": 9.290000000000001e-05, |
| "loss": 0.2918, |
| "step": 930 |
| }, |
| { |
| "grad_norm": 1.2467869520187378, |
| "learning_rate": 9.39e-05, |
| "loss": 0.2812, |
| "step": 940 |
| }, |
| { |
| "grad_norm": 1.2177903652191162, |
| "learning_rate": 9.49e-05, |
| "loss": 0.2786, |
| "step": 950 |
| }, |
| { |
| "grad_norm": 1.2031254768371582, |
| "learning_rate": 9.59e-05, |
| "loss": 0.2801, |
| "step": 960 |
| }, |
| { |
| "grad_norm": 1.27996826171875, |
| "learning_rate": 9.69e-05, |
| "loss": 0.2944, |
| "step": 970 |
| }, |
| { |
| "grad_norm": 1.4937174320220947, |
| "learning_rate": 9.790000000000001e-05, |
| "loss": 0.259, |
| "step": 980 |
| }, |
| { |
| "grad_norm": 1.2263216972351074, |
| "learning_rate": 9.89e-05, |
| "loss": 0.269, |
| "step": 990 |
| }, |
| { |
| "eval/loss": 0.2459017077088356, |
| "step": 1000 |
| }, |
| { |
| "grad_norm": 1.1868503093719482, |
| "learning_rate": 9.99e-05, |
| "loss": 0.2673, |
| "step": 1000 |
| }, |
| { |
| "grad_norm": 1.2517995834350586, |
| "learning_rate": 9.999994463727085e-05, |
| "loss": 0.2957, |
| "step": 1010 |
| }, |
| { |
| "grad_norm": 1.1621079444885254, |
| "learning_rate": 9.999975326009292e-05, |
| "loss": 0.2406, |
| "step": 1020 |
| }, |
| { |
| "grad_norm": 1.2248700857162476, |
| "learning_rate": 9.999942518549879e-05, |
| "loss": 0.2588, |
| "step": 1030 |
| }, |
| { |
| "grad_norm": 1.1486198902130127, |
| "learning_rate": 9.999896041438544e-05, |
| "loss": 0.2869, |
| "step": 1040 |
| }, |
| { |
| "grad_norm": 1.1869938373565674, |
| "learning_rate": 9.999835894802353e-05, |
| "loss": 0.2613, |
| "step": 1050 |
| }, |
| { |
| "grad_norm": 1.2058380842208862, |
| "learning_rate": 9.999762078805743e-05, |
| "loss": 0.2367, |
| "step": 1060 |
| }, |
| { |
| "grad_norm": 1.2073358297348022, |
| "learning_rate": 9.999674593650526e-05, |
| "loss": 0.2343, |
| "step": 1070 |
| }, |
| { |
| "grad_norm": 1.3462257385253906, |
| "learning_rate": 9.99957343957588e-05, |
| "loss": 0.2043, |
| "step": 1080 |
| }, |
| { |
| "grad_norm": 1.21333646774292, |
| "learning_rate": 9.99945861685836e-05, |
| "loss": 0.22, |
| "step": 1090 |
| }, |
| { |
| "grad_norm": 1.172276496887207, |
| "learning_rate": 9.999330125811884e-05, |
| "loss": 0.2268, |
| "step": 1100 |
| }, |
| { |
| "grad_norm": 1.5802624225616455, |
| "learning_rate": 9.999187966787744e-05, |
| "loss": 0.2389, |
| "step": 1110 |
| }, |
| { |
| "grad_norm": 1.0722038745880127, |
| "learning_rate": 9.999032140174595e-05, |
| "loss": 0.2069, |
| "step": 1120 |
| }, |
| { |
| "grad_norm": 1.2428017854690552, |
| "learning_rate": 9.998862646398464e-05, |
| "loss": 0.2105, |
| "step": 1130 |
| }, |
| { |
| "grad_norm": 1.109406590461731, |
| "learning_rate": 9.998679485922739e-05, |
| "loss": 0.204, |
| "step": 1140 |
| }, |
| { |
| "grad_norm": 1.133062720298767, |
| "learning_rate": 9.998482659248174e-05, |
| "loss": 0.1862, |
| "step": 1150 |
| }, |
| { |
| "grad_norm": 1.185992956161499, |
| "learning_rate": 9.998272166912883e-05, |
| "loss": 0.1944, |
| "step": 1160 |
| }, |
| { |
| "grad_norm": 1.0539828538894653, |
| "learning_rate": 9.998048009492347e-05, |
| "loss": 0.1603, |
| "step": 1170 |
| }, |
| { |
| "grad_norm": 1.2745673656463623, |
| "learning_rate": 9.997810187599403e-05, |
| "loss": 0.1815, |
| "step": 1180 |
| }, |
| { |
| "grad_norm": 1.2294188737869263, |
| "learning_rate": 9.997558701884249e-05, |
| "loss": 0.1774, |
| "step": 1190 |
| }, |
| { |
| "grad_norm": 1.6289048194885254, |
| "learning_rate": 9.997293553034433e-05, |
| "loss": 0.171, |
| "step": 1200 |
| }, |
| { |
| "grad_norm": 1.2011067867279053, |
| "learning_rate": 9.997014741774866e-05, |
| "loss": 0.1657, |
| "step": 1210 |
| }, |
| { |
| "grad_norm": 1.1529210805892944, |
| "learning_rate": 9.996722268867803e-05, |
| "loss": 0.1642, |
| "step": 1220 |
| }, |
| { |
| "grad_norm": 0.8735513091087341, |
| "learning_rate": 9.996416135112858e-05, |
| "loss": 0.1393, |
| "step": 1230 |
| }, |
| { |
| "grad_norm": 1.3112437725067139, |
| "learning_rate": 9.996096341346988e-05, |
| "loss": 0.1392, |
| "step": 1240 |
| }, |
| { |
| "grad_norm": 1.2347687482833862, |
| "learning_rate": 9.995762888444495e-05, |
| "loss": 0.1674, |
| "step": 1250 |
| }, |
| { |
| "grad_norm": 1.539437174797058, |
| "learning_rate": 9.995415777317027e-05, |
| "loss": 0.1625, |
| "step": 1260 |
| }, |
| { |
| "grad_norm": 1.2333788871765137, |
| "learning_rate": 9.995055008913574e-05, |
| "loss": 0.1328, |
| "step": 1270 |
| }, |
| { |
| "grad_norm": 1.1541303396224976, |
| "learning_rate": 9.994680584220463e-05, |
| "loss": 0.1294, |
| "step": 1280 |
| }, |
| { |
| "grad_norm": 1.0528708696365356, |
| "learning_rate": 9.994292504261355e-05, |
| "loss": 0.1441, |
| "step": 1290 |
| }, |
| { |
| "grad_norm": 1.0454338788986206, |
| "learning_rate": 9.993890770097247e-05, |
| "loss": 0.1266, |
| "step": 1300 |
| }, |
| { |
| "grad_norm": 1.1280555725097656, |
| "learning_rate": 9.993475382826467e-05, |
| "loss": 0.1426, |
| "step": 1310 |
| }, |
| { |
| "grad_norm": 1.187239170074463, |
| "learning_rate": 9.993046343584664e-05, |
| "loss": 0.1422, |
| "step": 1320 |
| }, |
| { |
| "grad_norm": 1.0262149572372437, |
| "learning_rate": 9.992603653544816e-05, |
| "loss": 0.1161, |
| "step": 1330 |
| }, |
| { |
| "grad_norm": 1.1586066484451294, |
| "learning_rate": 9.992147313917222e-05, |
| "loss": 0.1408, |
| "step": 1340 |
| }, |
| { |
| "grad_norm": 0.9765651226043701, |
| "learning_rate": 9.991677325949497e-05, |
| "loss": 0.1611, |
| "step": 1350 |
| }, |
| { |
| "grad_norm": 0.9763075709342957, |
| "learning_rate": 9.991193690926568e-05, |
| "loss": 0.1464, |
| "step": 1360 |
| }, |
| { |
| "grad_norm": 1.2092801332473755, |
| "learning_rate": 9.990696410170678e-05, |
| "loss": 0.1466, |
| "step": 1370 |
| }, |
| { |
| "grad_norm": 1.0392274856567383, |
| "learning_rate": 9.990185485041371e-05, |
| "loss": 0.1263, |
| "step": 1380 |
| }, |
| { |
| "grad_norm": 1.0358021259307861, |
| "learning_rate": 9.989660916935498e-05, |
| "loss": 0.1282, |
| "step": 1390 |
| }, |
| { |
| "grad_norm": 1.0262398719787598, |
| "learning_rate": 9.989122707287208e-05, |
| "loss": 0.1391, |
| "step": 1400 |
| }, |
| { |
| "grad_norm": 1.1978421211242676, |
| "learning_rate": 9.988570857567945e-05, |
| "loss": 0.1218, |
| "step": 1410 |
| }, |
| { |
| "grad_norm": 0.9296699166297913, |
| "learning_rate": 9.988005369286446e-05, |
| "loss": 0.1331, |
| "step": 1420 |
| }, |
| { |
| "grad_norm": 1.0004020929336548, |
| "learning_rate": 9.987426243988734e-05, |
| "loss": 0.1372, |
| "step": 1430 |
| }, |
| { |
| "grad_norm": 1.0646557807922363, |
| "learning_rate": 9.986833483258114e-05, |
| "loss": 0.1334, |
| "step": 1440 |
| }, |
| { |
| "grad_norm": 0.9959461688995361, |
| "learning_rate": 9.986227088715173e-05, |
| "loss": 0.1187, |
| "step": 1450 |
| }, |
| { |
| "grad_norm": 1.0928994417190552, |
| "learning_rate": 9.98560706201777e-05, |
| "loss": 0.1455, |
| "step": 1460 |
| }, |
| { |
| "grad_norm": 1.1495130062103271, |
| "learning_rate": 9.984973404861036e-05, |
| "loss": 0.1098, |
| "step": 1470 |
| }, |
| { |
| "grad_norm": 1.3037567138671875, |
| "learning_rate": 9.984326118977361e-05, |
| "loss": 0.1255, |
| "step": 1480 |
| }, |
| { |
| "grad_norm": 0.923818051815033, |
| "learning_rate": 9.983665206136406e-05, |
| "loss": 0.1439, |
| "step": 1490 |
| }, |
| { |
| "eval/loss": 0.12131376132369041, |
| "step": 1500 |
| }, |
| { |
| "grad_norm": 0.9496746063232422, |
| "learning_rate": 9.982990668145075e-05, |
| "loss": 0.123, |
| "step": 1500 |
| }, |
| { |
| "grad_norm": 1.0832738876342773, |
| "learning_rate": 9.982302506847534e-05, |
| "loss": 0.1408, |
| "step": 1510 |
| }, |
| { |
| "grad_norm": 1.033626914024353, |
| "learning_rate": 9.981600724125189e-05, |
| "loss": 0.1108, |
| "step": 1520 |
| }, |
| { |
| "grad_norm": 1.0005450248718262, |
| "learning_rate": 9.980885321896685e-05, |
| "loss": 0.1274, |
| "step": 1530 |
| }, |
| { |
| "grad_norm": 1.0442663431167603, |
| "learning_rate": 9.980156302117905e-05, |
| "loss": 0.1238, |
| "step": 1540 |
| }, |
| { |
| "grad_norm": 0.8260471820831299, |
| "learning_rate": 9.979413666781963e-05, |
| "loss": 0.1231, |
| "step": 1550 |
| }, |
| { |
| "grad_norm": 0.884735107421875, |
| "learning_rate": 9.978657417919193e-05, |
| "loss": 0.1495, |
| "step": 1560 |
| }, |
| { |
| "grad_norm": 0.9236319661140442, |
| "learning_rate": 9.977887557597153e-05, |
| "loss": 0.1327, |
| "step": 1570 |
| }, |
| { |
| "grad_norm": 0.9460572004318237, |
| "learning_rate": 9.97710408792061e-05, |
| "loss": 0.1263, |
| "step": 1580 |
| }, |
| { |
| "grad_norm": 0.9749200344085693, |
| "learning_rate": 9.976307011031542e-05, |
| "loss": 0.12, |
| "step": 1590 |
| }, |
| { |
| "grad_norm": 1.1136820316314697, |
| "learning_rate": 9.975496329109126e-05, |
| "loss": 0.1323, |
| "step": 1600 |
| }, |
| { |
| "grad_norm": 0.8567096590995789, |
| "learning_rate": 9.974672044369732e-05, |
| "loss": 0.125, |
| "step": 1610 |
| }, |
| { |
| "grad_norm": 1.0884920358657837, |
| "learning_rate": 9.97383415906693e-05, |
| "loss": 0.115, |
| "step": 1620 |
| }, |
| { |
| "grad_norm": 1.0339338779449463, |
| "learning_rate": 9.97298267549146e-05, |
| "loss": 0.1386, |
| "step": 1630 |
| }, |
| { |
| "grad_norm": 0.9121850728988647, |
| "learning_rate": 9.972117595971249e-05, |
| "loss": 0.1249, |
| "step": 1640 |
| }, |
| { |
| "grad_norm": 0.9620202779769897, |
| "learning_rate": 9.971238922871391e-05, |
| "loss": 0.1322, |
| "step": 1650 |
| }, |
| { |
| "grad_norm": 1.0946760177612305, |
| "learning_rate": 9.970346658594142e-05, |
| "loss": 0.1123, |
| "step": 1660 |
| }, |
| { |
| "grad_norm": 0.9470517635345459, |
| "learning_rate": 9.969440805578923e-05, |
| "loss": 0.1308, |
| "step": 1670 |
| }, |
| { |
| "grad_norm": 0.7607911229133606, |
| "learning_rate": 9.968521366302298e-05, |
| "loss": 0.1212, |
| "step": 1680 |
| }, |
| { |
| "grad_norm": 1.0321109294891357, |
| "learning_rate": 9.967588343277981e-05, |
| "loss": 0.1351, |
| "step": 1690 |
| }, |
| { |
| "grad_norm": 1.2910419702529907, |
| "learning_rate": 9.966641739056818e-05, |
| "loss": 0.1594, |
| "step": 1700 |
| }, |
| { |
| "grad_norm": 0.7687619924545288, |
| "learning_rate": 9.965681556226793e-05, |
| "loss": 0.1442, |
| "step": 1710 |
| }, |
| { |
| "grad_norm": 0.882544755935669, |
| "learning_rate": 9.964707797413006e-05, |
| "loss": 0.1131, |
| "step": 1720 |
| }, |
| { |
| "grad_norm": 1.0736138820648193, |
| "learning_rate": 9.963720465277679e-05, |
| "loss": 0.0997, |
| "step": 1730 |
| }, |
| { |
| "grad_norm": 1.0465947389602661, |
| "learning_rate": 9.96271956252014e-05, |
| "loss": 0.1114, |
| "step": 1740 |
| }, |
| { |
| "grad_norm": 0.8517502546310425, |
| "learning_rate": 9.961705091876816e-05, |
| "loss": 0.1054, |
| "step": 1750 |
| }, |
| { |
| "grad_norm": 0.8722822666168213, |
| "learning_rate": 9.960677056121235e-05, |
| "loss": 0.1409, |
| "step": 1760 |
| }, |
| { |
| "grad_norm": 1.3352707624435425, |
| "learning_rate": 9.959635458064005e-05, |
| "loss": 0.1207, |
| "step": 1770 |
| }, |
| { |
| "grad_norm": 1.0375741720199585, |
| "learning_rate": 9.958580300552815e-05, |
| "loss": 0.1018, |
| "step": 1780 |
| }, |
| { |
| "grad_norm": 0.9663418531417847, |
| "learning_rate": 9.957511586472426e-05, |
| "loss": 0.1131, |
| "step": 1790 |
| }, |
| { |
| "grad_norm": 0.9925614595413208, |
| "learning_rate": 9.956429318744662e-05, |
| "loss": 0.0994, |
| "step": 1800 |
| }, |
| { |
| "grad_norm": 0.9052272439002991, |
| "learning_rate": 9.955333500328404e-05, |
| "loss": 0.1439, |
| "step": 1810 |
| }, |
| { |
| "grad_norm": 1.1061209440231323, |
| "learning_rate": 9.95422413421957e-05, |
| "loss": 0.099, |
| "step": 1820 |
| }, |
| { |
| "grad_norm": 0.9718905687332153, |
| "learning_rate": 9.953101223451133e-05, |
| "loss": 0.1334, |
| "step": 1830 |
| }, |
| { |
| "grad_norm": 1.041150450706482, |
| "learning_rate": 9.951964771093085e-05, |
| "loss": 0.102, |
| "step": 1840 |
| }, |
| { |
| "grad_norm": 0.7296974658966064, |
| "learning_rate": 9.950814780252442e-05, |
| "loss": 0.1216, |
| "step": 1850 |
| }, |
| { |
| "grad_norm": 0.956504225730896, |
| "learning_rate": 9.949651254073236e-05, |
| "loss": 0.1504, |
| "step": 1860 |
| }, |
| { |
| "grad_norm": 0.9494929909706116, |
| "learning_rate": 9.948474195736504e-05, |
| "loss": 0.1176, |
| "step": 1870 |
| }, |
| { |
| "grad_norm": 1.073912501335144, |
| "learning_rate": 9.947283608460277e-05, |
| "loss": 0.0968, |
| "step": 1880 |
| }, |
| { |
| "grad_norm": 0.9738394618034363, |
| "learning_rate": 9.946079495499577e-05, |
| "loss": 0.1157, |
| "step": 1890 |
| }, |
| { |
| "grad_norm": 0.9605513215065002, |
| "learning_rate": 9.944861860146401e-05, |
| "loss": 0.122, |
| "step": 1900 |
| }, |
| { |
| "grad_norm": 0.8335412740707397, |
| "learning_rate": 9.943630705729719e-05, |
| "loss": 0.0999, |
| "step": 1910 |
| }, |
| { |
| "grad_norm": 0.881373405456543, |
| "learning_rate": 9.942386035615459e-05, |
| "loss": 0.1014, |
| "step": 1920 |
| }, |
| { |
| "grad_norm": 0.7942837476730347, |
| "learning_rate": 9.941127853206503e-05, |
| "loss": 0.1095, |
| "step": 1930 |
| }, |
| { |
| "grad_norm": 1.0058091878890991, |
| "learning_rate": 9.939856161942673e-05, |
| "loss": 0.118, |
| "step": 1940 |
| }, |
| { |
| "grad_norm": 0.8951269388198853, |
| "learning_rate": 9.938570965300724e-05, |
| "loss": 0.1093, |
| "step": 1950 |
| }, |
| { |
| "grad_norm": 0.8100780844688416, |
| "learning_rate": 9.937272266794335e-05, |
| "loss": 0.1344, |
| "step": 1960 |
| }, |
| { |
| "grad_norm": 0.8485944867134094, |
| "learning_rate": 9.935960069974096e-05, |
| "loss": 0.1062, |
| "step": 1970 |
| }, |
| { |
| "grad_norm": 1.0462373495101929, |
| "learning_rate": 9.934634378427506e-05, |
| "loss": 0.1073, |
| "step": 1980 |
| }, |
| { |
| "grad_norm": 0.7918229103088379, |
| "learning_rate": 9.933295195778954e-05, |
| "loss": 0.0865, |
| "step": 1990 |
| }, |
| { |
| "eval/loss": 0.10448359854519368, |
| "step": 2000 |
| }, |
| { |
| "grad_norm": 1.0324554443359375, |
| "learning_rate": 9.931942525689715e-05, |
| "loss": 0.1235, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 20000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|