| [ |
| { |
| "loss": 3.062, |
| "grad_norm": 9.917159080505371, |
| "learning_rate": 9e-06, |
| "epoch": 0.01002004008016032, |
| "step": 10 |
| }, |
| { |
| "loss": 2.0797, |
| "grad_norm": 1.8970115184783936, |
| "learning_rate": 1.9e-05, |
| "epoch": 0.02004008016032064, |
| "step": 20 |
| }, |
| { |
| "loss": 1.6129, |
| "grad_norm": 0.81168532371521, |
| "learning_rate": 2.9e-05, |
| "epoch": 0.03006012024048096, |
| "step": 30 |
| }, |
| { |
| "loss": 1.4949, |
| "grad_norm": 0.470950186252594, |
| "learning_rate": 3.9000000000000006e-05, |
| "epoch": 0.04008016032064128, |
| "step": 40 |
| }, |
| { |
| "loss": 1.4555, |
| "grad_norm": 0.42354708909988403, |
| "learning_rate": 4.9e-05, |
| "epoch": 0.050100200400801605, |
| "step": 50 |
| }, |
| { |
| "loss": 1.4276, |
| "grad_norm": 0.4629833698272705, |
| "learning_rate": 5.9e-05, |
| "epoch": 0.06012024048096192, |
| "step": 60 |
| }, |
| { |
| "loss": 1.3983, |
| "grad_norm": 0.6642730236053467, |
| "learning_rate": 6.9e-05, |
| "epoch": 0.07014028056112225, |
| "step": 70 |
| }, |
| { |
| "loss": 1.3804, |
| "grad_norm": 0.33904388546943665, |
| "learning_rate": 7.900000000000001e-05, |
| "epoch": 0.08016032064128256, |
| "step": 80 |
| }, |
| { |
| "loss": 1.3628, |
| "grad_norm": 0.37728458642959595, |
| "learning_rate": 8.900000000000001e-05, |
| "epoch": 0.09018036072144289, |
| "step": 90 |
| }, |
| { |
| "loss": 1.3522, |
| "grad_norm": 0.3425058126449585, |
| "learning_rate": 9.900000000000001e-05, |
| "epoch": 0.10020040080160321, |
| "step": 100 |
| }, |
| { |
| "loss": 1.346, |
| "grad_norm": 0.3616705536842346, |
| "learning_rate": 0.000109, |
| "epoch": 0.11022044088176353, |
| "step": 110 |
| }, |
| { |
| "loss": 1.3488, |
| "grad_norm": 0.3829646408557892, |
| "learning_rate": 0.000119, |
| "epoch": 0.12024048096192384, |
| "step": 120 |
| }, |
| { |
| "loss": 1.3226, |
| "grad_norm": 0.3760342299938202, |
| "learning_rate": 0.00012900000000000002, |
| "epoch": 0.13026052104208416, |
| "step": 130 |
| }, |
| { |
| "eval_loss": 1.3178932666778564, |
| "eval_runtime": 84.6431, |
| "eval_samples_per_second": 29.784, |
| "eval_steps_per_second": 7.455, |
| "epoch": 0.13326653306613226, |
| "step": 133 |
| }, |
| { |
| "loss": 1.3309, |
| "grad_norm": 0.37514758110046387, |
| "learning_rate": 0.000139, |
| "epoch": 0.1402805611222445, |
| "step": 140 |
| }, |
| { |
| "loss": 1.3105, |
| "grad_norm": 0.7152091264724731, |
| "learning_rate": 0.00014900000000000002, |
| "epoch": 0.15030060120240482, |
| "step": 150 |
| }, |
| { |
| "loss": 1.3244, |
| "grad_norm": 0.3840025067329407, |
| "learning_rate": 0.00015900000000000002, |
| "epoch": 0.16032064128256512, |
| "step": 160 |
| }, |
| { |
| "loss": 1.3289, |
| "grad_norm": 0.35113897919654846, |
| "learning_rate": 0.00016900000000000002, |
| "epoch": 0.17034068136272545, |
| "step": 170 |
| }, |
| { |
| "loss": 1.2968, |
| "grad_norm": 0.38480180501937866, |
| "learning_rate": 0.00017900000000000001, |
| "epoch": 0.18036072144288579, |
| "step": 180 |
| }, |
| { |
| "loss": 1.296, |
| "grad_norm": 0.3818041980266571, |
| "learning_rate": 0.00018899999999999999, |
| "epoch": 0.1903807615230461, |
| "step": 190 |
| }, |
| { |
| "loss": 1.3032, |
| "grad_norm": 0.4464828073978424, |
| "learning_rate": 0.000199, |
| "epoch": 0.20040080160320642, |
| "step": 200 |
| }, |
| { |
| "loss": 1.2942, |
| "grad_norm": 0.3449303209781647, |
| "learning_rate": 0.0001999876082359844, |
| "epoch": 0.21042084168336672, |
| "step": 210 |
| }, |
| { |
| "loss": 1.288, |
| "grad_norm": 0.3817189633846283, |
| "learning_rate": 0.0001999447764513578, |
| "epoch": 0.22044088176352705, |
| "step": 220 |
| }, |
| { |
| "loss": 1.2808, |
| "grad_norm": 0.5007427930831909, |
| "learning_rate": 0.00019987136476341398, |
| "epoch": 0.23046092184368738, |
| "step": 230 |
| }, |
| { |
| "loss": 1.2796, |
| "grad_norm": 0.33001911640167236, |
| "learning_rate": 0.00019976739563378035, |
| "epoch": 0.24048096192384769, |
| "step": 240 |
| }, |
| { |
| "loss": 1.2582, |
| "grad_norm": 0.33476898074150085, |
| "learning_rate": 0.00019963290087368342, |
| "epoch": 0.250501002004008, |
| "step": 250 |
| }, |
| { |
| "loss": 1.2583, |
| "grad_norm": 0.34601467847824097, |
| "learning_rate": 0.00019946792163421596, |
| "epoch": 0.2605210420841683, |
| "step": 260 |
| }, |
| { |
| "eval_loss": 1.2609914541244507, |
| "eval_runtime": 68.3146, |
| "eval_samples_per_second": 36.903, |
| "eval_steps_per_second": 9.237, |
| "epoch": 0.2665330661322645, |
| "step": 266 |
| }, |
| { |
| "loss": 1.2781, |
| "grad_norm": 0.3411717116832733, |
| "learning_rate": 0.00019927250839374582, |
| "epoch": 0.27054108216432865, |
| "step": 270 |
| }, |
| { |
| "loss": 1.267, |
| "grad_norm": 0.3507177233695984, |
| "learning_rate": 0.00019904672094247128, |
| "epoch": 0.280561122244489, |
| "step": 280 |
| }, |
| { |
| "loss": 1.275, |
| "grad_norm": 0.3237634599208832, |
| "learning_rate": 0.0001987906283641271, |
| "epoch": 0.2905811623246493, |
| "step": 290 |
| }, |
| { |
| "loss": 1.252, |
| "grad_norm": 0.3348980247974396, |
| "learning_rate": 0.0001985043090148472, |
| "epoch": 0.30060120240480964, |
| "step": 300 |
| }, |
| { |
| "loss": 1.2548, |
| "grad_norm": 0.32034358382225037, |
| "learning_rate": 0.0001981878504991901, |
| "epoch": 0.3106212424849699, |
| "step": 310 |
| }, |
| { |
| "loss": 1.2595, |
| "grad_norm": 0.3158237040042877, |
| "learning_rate": 0.0001978413496433348, |
| "epoch": 0.32064128256513025, |
| "step": 320 |
| }, |
| { |
| "loss": 1.2555, |
| "grad_norm": 0.31501418352127075, |
| "learning_rate": 0.00019746491246545503, |
| "epoch": 0.3306613226452906, |
| "step": 330 |
| }, |
| { |
| "loss": 1.2511, |
| "grad_norm": 0.33747926354408264, |
| "learning_rate": 0.00019705865414328103, |
| "epoch": 0.3406813627254509, |
| "step": 340 |
| }, |
| { |
| "loss": 1.2497, |
| "grad_norm": 0.3418786823749542, |
| "learning_rate": 0.0001966226989788589, |
| "epoch": 0.35070140280561124, |
| "step": 350 |
| }, |
| { |
| "loss": 1.2495, |
| "grad_norm": 0.3279886543750763, |
| "learning_rate": 0.00019615718036051827, |
| "epoch": 0.36072144288577157, |
| "step": 360 |
| }, |
| { |
| "loss": 1.2427, |
| "grad_norm": 0.3327982723712921, |
| "learning_rate": 0.00019566224072205954, |
| "epoch": 0.37074148296593185, |
| "step": 370 |
| }, |
| { |
| "loss": 1.2357, |
| "grad_norm": 5.1672163009643555, |
| "learning_rate": 0.00019513803149917377, |
| "epoch": 0.3807615230460922, |
| "step": 380 |
| }, |
| { |
| "loss": 1.246, |
| "grad_norm": 0.3257956802845001, |
| "learning_rate": 0.00019458471308310822, |
| "epoch": 0.3907815631262525, |
| "step": 390 |
| }, |
| { |
| "eval_loss": 1.229310154914856, |
| "eval_runtime": 68.3539, |
| "eval_samples_per_second": 36.882, |
| "eval_steps_per_second": 9.231, |
| "epoch": 0.3997995991983968, |
| "step": 399 |
| }, |
| { |
| "loss": 1.2415, |
| "grad_norm": 0.32486265897750854, |
| "learning_rate": 0.0001940024547715918, |
| "epoch": 0.40080160320641284, |
| "step": 400 |
| }, |
| { |
| "loss": 1.2388, |
| "grad_norm": 0.3178859055042267, |
| "learning_rate": 0.00019339143471703532, |
| "epoch": 0.41082164328657317, |
| "step": 410 |
| }, |
| { |
| "loss": 1.2318, |
| "grad_norm": 0.31442832946777344, |
| "learning_rate": 0.00019275183987202255, |
| "epoch": 0.42084168336673344, |
| "step": 420 |
| }, |
| { |
| "loss": 1.2061, |
| "grad_norm": 0.3233127295970917, |
| "learning_rate": 0.00019208386593210874, |
| "epoch": 0.4308617234468938, |
| "step": 430 |
| }, |
| { |
| "loss": 1.2186, |
| "grad_norm": 0.3286103308200836, |
| "learning_rate": 0.00019138771727594405, |
| "epoch": 0.4408817635270541, |
| "step": 440 |
| }, |
| { |
| "loss": 1.2284, |
| "grad_norm": 0.32425037026405334, |
| "learning_rate": 0.00019066360690274018, |
| "epoch": 0.45090180360721444, |
| "step": 450 |
| }, |
| { |
| "loss": 1.2281, |
| "grad_norm": 0.3199349641799927, |
| "learning_rate": 0.00018991175636709953, |
| "epoch": 0.46092184368737477, |
| "step": 460 |
| }, |
| { |
| "loss": 1.2332, |
| "grad_norm": 0.36368799209594727, |
| "learning_rate": 0.0001891323957112264, |
| "epoch": 0.4709418837675351, |
| "step": 470 |
| }, |
| { |
| "loss": 1.2067, |
| "grad_norm": 0.34387969970703125, |
| "learning_rate": 0.00018832576339454166, |
| "epoch": 0.48096192384769537, |
| "step": 480 |
| }, |
| { |
| "loss": 1.2294, |
| "grad_norm": 0.34961533546447754, |
| "learning_rate": 0.00018749210622072155, |
| "epoch": 0.4909819639278557, |
| "step": 490 |
| }, |
| { |
| "loss": 1.2248, |
| "grad_norm": 0.3662363886833191, |
| "learning_rate": 0.00018663167926218392, |
| "epoch": 0.501002004008016, |
| "step": 500 |
| }, |
| { |
| "loss": 1.2496, |
| "grad_norm": 0.356840580701828, |
| "learning_rate": 0.00018574474578204403, |
| "epoch": 0.5110220440881763, |
| "step": 510 |
| }, |
| { |
| "loss": 1.219, |
| "grad_norm": 0.348263144493103, |
| "learning_rate": 0.00018483157715356457, |
| "epoch": 0.5210420841683366, |
| "step": 520 |
| }, |
| { |
| "loss": 1.2229, |
| "grad_norm": 0.3213677704334259, |
| "learning_rate": 0.00018389245277712396, |
| "epoch": 0.531062124248497, |
| "step": 530 |
| }, |
| { |
| "eval_loss": 1.204853892326355, |
| "eval_runtime": 68.393, |
| "eval_samples_per_second": 36.86, |
| "eval_steps_per_second": 9.226, |
| "epoch": 0.533066132264529, |
| "step": 532 |
| }, |
| { |
| "loss": 1.2225, |
| "grad_norm": 0.3370509147644043, |
| "learning_rate": 0.0001829276599947291, |
| "epoch": 0.5410821643286573, |
| "step": 540 |
| }, |
| { |
| "loss": 1.2177, |
| "grad_norm": 0.32882654666900635, |
| "learning_rate": 0.00018193749400209757, |
| "epoch": 0.5511022044088176, |
| "step": 550 |
| }, |
| { |
| "loss": 1.1929, |
| "grad_norm": 0.3255807161331177, |
| "learning_rate": 0.00018092225775833733, |
| "epoch": 0.561122244488978, |
| "step": 560 |
| }, |
| { |
| "loss": 1.2155, |
| "grad_norm": 0.31887778639793396, |
| "learning_rate": 0.00017988226189325103, |
| "epoch": 0.5711422845691383, |
| "step": 570 |
| }, |
| { |
| "loss": 1.2117, |
| "grad_norm": 0.3166080713272095, |
| "learning_rate": 0.000178817824612293, |
| "epoch": 0.5811623246492986, |
| "step": 580 |
| }, |
| { |
| "loss": 1.2088, |
| "grad_norm": 0.3295918107032776, |
| "learning_rate": 0.0001777292715992088, |
| "epoch": 0.591182364729459, |
| "step": 590 |
| }, |
| { |
| "loss": 1.2201, |
| "grad_norm": 0.3369976282119751, |
| "learning_rate": 0.00017661693591638618, |
| "epoch": 0.6012024048096193, |
| "step": 600 |
| }, |
| { |
| "loss": 1.2027, |
| "grad_norm": 0.33591219782829285, |
| "learning_rate": 0.00017548115790294895, |
| "epoch": 0.6112224448897795, |
| "step": 610 |
| }, |
| { |
| "loss": 1.1963, |
| "grad_norm": 0.332086443901062, |
| "learning_rate": 0.0001743222850706238, |
| "epoch": 0.6212424849699398, |
| "step": 620 |
| }, |
| { |
| "loss": 1.1835, |
| "grad_norm": 0.321736603975296, |
| "learning_rate": 0.00017314067199741291, |
| "epoch": 0.6312625250501002, |
| "step": 630 |
| }, |
| { |
| "loss": 1.213, |
| "grad_norm": 0.3286183476448059, |
| "learning_rate": 0.0001719366802191046, |
| "epoch": 0.6412825651302605, |
| "step": 640 |
| }, |
| { |
| "loss": 1.1896, |
| "grad_norm": 0.31868550181388855, |
| "learning_rate": 0.00017071067811865476, |
| "epoch": 0.6513026052104208, |
| "step": 650 |
| }, |
| { |
| "loss": 1.2021, |
| "grad_norm": 0.328276664018631, |
| "learning_rate": 0.00016946304081347352, |
| "epoch": 0.6613226452905812, |
| "step": 660 |
| }, |
| { |
| "eval_loss": 1.1863322257995605, |
| "eval_runtime": 68.3185, |
| "eval_samples_per_second": 36.901, |
| "eval_steps_per_second": 9.236, |
| "epoch": 0.6663326653306614, |
| "step": 665 |
| }, |
| { |
| "loss": 1.1909, |
| "grad_norm": 0.3365529477596283, |
| "learning_rate": 0.0001681941500406513, |
| "epoch": 0.6713426853707415, |
| "step": 670 |
| }, |
| { |
| "loss": 1.2118, |
| "grad_norm": 0.3334924876689911, |
| "learning_rate": 0.00016690439404015955, |
| "epoch": 0.6813627254509018, |
| "step": 680 |
| }, |
| { |
| "loss": 1.1966, |
| "grad_norm": 0.3221626281738281, |
| "learning_rate": 0.00016559416743606163, |
| "epoch": 0.6913827655310621, |
| "step": 690 |
| }, |
| { |
| "loss": 1.1976, |
| "grad_norm": 0.3202773332595825, |
| "learning_rate": 0.0001642638711157706, |
| "epoch": 0.7014028056112225, |
| "step": 700 |
| }, |
| { |
| "loss": 1.1894, |
| "grad_norm": 0.32952797412872314, |
| "learning_rate": 0.00016291391210739034, |
| "epoch": 0.7114228456913828, |
| "step": 710 |
| }, |
| { |
| "loss": 1.1867, |
| "grad_norm": 0.33879393339157104, |
| "learning_rate": 0.0001615447034551782, |
| "epoch": 0.7214428857715431, |
| "step": 720 |
| }, |
| { |
| "loss": 1.1928, |
| "grad_norm": 0.3320337235927582, |
| "learning_rate": 0.00016015666409316642, |
| "epoch": 0.7314629258517034, |
| "step": 730 |
| }, |
| { |
| "loss": 1.2109, |
| "grad_norm": 0.32818353176116943, |
| "learning_rate": 0.00015875021871698195, |
| "epoch": 0.7414829659318637, |
| "step": 740 |
| }, |
| { |
| "loss": 1.1959, |
| "grad_norm": 0.32380595803260803, |
| "learning_rate": 0.00015732579765390296, |
| "epoch": 0.751503006012024, |
| "step": 750 |
| }, |
| { |
| "loss": 1.1632, |
| "grad_norm": 0.333734393119812, |
| "learning_rate": 0.00015588383673119274, |
| "epoch": 0.7615230460921844, |
| "step": 760 |
| }, |
| { |
| "loss": 1.1787, |
| "grad_norm": 0.3443449139595032, |
| "learning_rate": 0.00015442477714275023, |
| "epoch": 0.7715430861723447, |
| "step": 770 |
| }, |
| { |
| "loss": 1.178, |
| "grad_norm": 0.34319791197776794, |
| "learning_rate": 0.00015294906531411888, |
| "epoch": 0.781563126252505, |
| "step": 780 |
| }, |
| { |
| "loss": 1.1906, |
| "grad_norm": 0.34300360083580017, |
| "learning_rate": 0.00015145715276589487, |
| "epoch": 0.7915831663326653, |
| "step": 790 |
| }, |
| { |
| "eval_loss": 1.1701182126998901, |
| "eval_runtime": 68.3162, |
| "eval_samples_per_second": 36.902, |
| "eval_steps_per_second": 9.236, |
| "epoch": 0.7995991983967936, |
| "step": 798 |
| }, |
| { |
| "loss": 1.1745, |
| "grad_norm": 0.3376515507698059, |
| "learning_rate": 0.00014994949597557612, |
| "epoch": 0.8016032064128257, |
| "step": 800 |
| }, |
| { |
| "loss": 1.1681, |
| "grad_norm": 0.3189542591571808, |
| "learning_rate": 0.0001484265562378947, |
| "epoch": 0.811623246492986, |
| "step": 810 |
| }, |
| { |
| "loss": 1.1915, |
| "grad_norm": 0.33153972029685974, |
| "learning_rate": 0.00014688879952367572, |
| "epoch": 0.8216432865731463, |
| "step": 820 |
| }, |
| { |
| "loss": 1.1668, |
| "grad_norm": 0.3349854052066803, |
| "learning_rate": 0.00014533669633726474, |
| "epoch": 0.8316633266533067, |
| "step": 830 |
| }, |
| { |
| "loss": 1.1678, |
| "grad_norm": 0.33402037620544434, |
| "learning_rate": 0.0001437707215725688, |
| "epoch": 0.8416833667334669, |
| "step": 840 |
| }, |
| { |
| "loss": 1.173, |
| "grad_norm": 0.34636396169662476, |
| "learning_rate": 0.00014219135436775412, |
| "epoch": 0.8517034068136272, |
| "step": 850 |
| }, |
| { |
| "loss": 1.1712, |
| "grad_norm": 0.3418276607990265, |
| "learning_rate": 0.00014059907795864487, |
| "epoch": 0.8617234468937875, |
| "step": 860 |
| }, |
| { |
| "loss": 1.1788, |
| "grad_norm": 0.3244767189025879, |
| "learning_rate": 0.00013899437953086865, |
| "epoch": 0.8717434869739479, |
| "step": 870 |
| }, |
| { |
| "loss": 1.1555, |
| "grad_norm": 0.3318007290363312, |
| "learning_rate": 0.00013737775007079334, |
| "epoch": 0.8817635270541082, |
| "step": 880 |
| }, |
| { |
| "loss": 1.1756, |
| "grad_norm": 0.3220977485179901, |
| "learning_rate": 0.00013574968421530088, |
| "epoch": 0.8917835671342685, |
| "step": 890 |
| }, |
| { |
| "loss": 1.1644, |
| "grad_norm": 0.32579556107521057, |
| "learning_rate": 0.0001341106801004442, |
| "epoch": 0.9018036072144289, |
| "step": 900 |
| }, |
| { |
| "loss": 1.1579, |
| "grad_norm": 0.3344557285308838, |
| "learning_rate": 0.00013246123920903358, |
| "epoch": 0.9118236472945892, |
| "step": 910 |
| }, |
| { |
| "loss": 1.1442, |
| "grad_norm": 0.3250351846218109, |
| "learning_rate": 0.000130801866217199, |
| "epoch": 0.9218436873747495, |
| "step": 920 |
| }, |
| { |
| "loss": 1.1608, |
| "grad_norm": 0.3367891013622284, |
| "learning_rate": 0.00012913306883997528, |
| "epoch": 0.9318637274549099, |
| "step": 930 |
| }, |
| { |
| "eval_loss": 1.156960368156433, |
| "eval_runtime": 68.2656, |
| "eval_samples_per_second": 36.929, |
| "eval_steps_per_second": 9.243, |
| "epoch": 0.9328657314629258, |
| "step": 931 |
| }, |
| { |
| "loss": 1.1916, |
| "grad_norm": 0.33810174465179443, |
| "learning_rate": 0.00012745535767595754, |
| "epoch": 0.9418837675350702, |
| "step": 940 |
| }, |
| { |
| "loss": 1.166, |
| "grad_norm": 0.32210567593574524, |
| "learning_rate": 0.00012576924605107456, |
| "epoch": 0.9519038076152304, |
| "step": 950 |
| }, |
| { |
| "loss": 1.1662, |
| "grad_norm": 0.33408471941947937, |
| "learning_rate": 0.0001240752498615272, |
| "epoch": 0.9619238476953907, |
| "step": 960 |
| }, |
| { |
| "loss": 1.1691, |
| "grad_norm": 0.3320842683315277, |
| "learning_rate": 0.00012237388741594078, |
| "epoch": 0.9719438877755511, |
| "step": 970 |
| }, |
| { |
| "loss": 1.1608, |
| "grad_norm": 0.35282036662101746, |
| "learning_rate": 0.00012066567927677938, |
| "epoch": 0.9819639278557114, |
| "step": 980 |
| }, |
| { |
| "loss": 1.1759, |
| "grad_norm": 0.3290286958217621, |
| "learning_rate": 0.00011895114810107015, |
| "epoch": 0.9919839679358717, |
| "step": 990 |
| }, |
| { |
| "loss": 1.1491, |
| "grad_norm": 0.33208462595939636, |
| "learning_rate": 0.0001172308184804871, |
| "epoch": 1.002004008016032, |
| "step": 1000 |
| }, |
| { |
| "loss": 1.1075, |
| "grad_norm": 0.3253942131996155, |
| "learning_rate": 0.00011550521678084279, |
| "epoch": 1.0120240480961924, |
| "step": 1010 |
| }, |
| { |
| "loss": 1.0982, |
| "grad_norm": 0.3393993377685547, |
| "learning_rate": 0.00011377487098103735, |
| "epoch": 1.0220440881763526, |
| "step": 1020 |
| }, |
| { |
| "loss": 1.0971, |
| "grad_norm": 0.3415449857711792, |
| "learning_rate": 0.00011204031051151364, |
| "epoch": 1.032064128256513, |
| "step": 1030 |
| }, |
| { |
| "loss": 1.0864, |
| "grad_norm": 0.3586030900478363, |
| "learning_rate": 0.00011030206609226868, |
| "epoch": 1.0420841683366733, |
| "step": 1040 |
| }, |
| { |
| "loss": 1.1017, |
| "grad_norm": 0.3494417369365692, |
| "learning_rate": 0.0001085606695704701, |
| "epoch": 1.0521042084168337, |
| "step": 1050 |
| }, |
| { |
| "loss": 1.0877, |
| "grad_norm": 0.35265541076660156, |
| "learning_rate": 0.0001068166537577282, |
| "epoch": 1.062124248496994, |
| "step": 1060 |
| }, |
| { |
| "eval_loss": 1.1489150524139404, |
| "eval_runtime": 68.2603, |
| "eval_samples_per_second": 36.932, |
| "eval_steps_per_second": 9.244, |
| "epoch": 1.066132264529058, |
| "step": 1064 |
| }, |
| { |
| "loss": 1.083, |
| "grad_norm": 0.35727137327194214, |
| "learning_rate": 0.00010507055226707235, |
| "epoch": 1.0721442885771544, |
| "step": 1070 |
| }, |
| { |
| "loss": 1.0813, |
| "grad_norm": 0.34173229336738586, |
| "learning_rate": 0.0001033228993496827, |
| "epoch": 1.0821643286573146, |
| "step": 1080 |
| }, |
| { |
| "loss": 1.1103, |
| "grad_norm": 0.35060715675354004, |
| "learning_rate": 0.00010157422973142629, |
| "epoch": 1.092184368737475, |
| "step": 1090 |
| }, |
| { |
| "loss": 1.0888, |
| "grad_norm": 0.3494425415992737, |
| "learning_rate": 9.982507844924809e-05, |
| "epoch": 1.1022044088176353, |
| "step": 1100 |
| }, |
| { |
| "loss": 1.0822, |
| "grad_norm": 0.3451234698295593, |
| "learning_rate": 9.807598068746686e-05, |
| "epoch": 1.1122244488977957, |
| "step": 1110 |
| }, |
| { |
| "loss": 1.0956, |
| "grad_norm": 0.357877254486084, |
| "learning_rate": 9.632747161402581e-05, |
| "epoch": 1.122244488977956, |
| "step": 1120 |
| }, |
| { |
| "loss": 1.0965, |
| "grad_norm": 0.3512496054172516, |
| "learning_rate": 9.458008621674833e-05, |
| "epoch": 1.1322645290581161, |
| "step": 1130 |
| }, |
| { |
| "loss": 1.0973, |
| "grad_norm": 0.3644641637802124, |
| "learning_rate": 9.283435913964887e-05, |
| "epoch": 1.1422845691382766, |
| "step": 1140 |
| }, |
| { |
| "loss": 1.0925, |
| "grad_norm": 0.36287280917167664, |
| "learning_rate": 9.109082451934903e-05, |
| "epoch": 1.1523046092184368, |
| "step": 1150 |
| }, |
| { |
| "loss": 1.0843, |
| "grad_norm": 0.3538283407688141, |
| "learning_rate": 8.935001582164876e-05, |
| "epoch": 1.1623246492985972, |
| "step": 1160 |
| }, |
| { |
| "loss": 1.0986, |
| "grad_norm": 0.35994771122932434, |
| "learning_rate": 8.761246567830283e-05, |
| "epoch": 1.1723446893787575, |
| "step": 1170 |
| }, |
| { |
| "loss": 1.091, |
| "grad_norm": 0.3535645604133606, |
| "learning_rate": 8.587870572405278e-05, |
| "epoch": 1.182364729458918, |
| "step": 1180 |
| }, |
| { |
| "loss": 1.0724, |
| "grad_norm": 0.35088545083999634, |
| "learning_rate": 8.414926643396355e-05, |
| "epoch": 1.1923847695390781, |
| "step": 1190 |
| }, |
| { |
| "eval_loss": 1.1411553621292114, |
| "eval_runtime": 68.2204, |
| "eval_samples_per_second": 36.954, |
| "eval_steps_per_second": 9.249, |
| "epoch": 1.1993987975951903, |
| "step": 1197 |
| }, |
| { |
| "loss": 1.1034, |
| "grad_norm": 0.349630743265152, |
| "learning_rate": 8.2424676961115e-05, |
| "epoch": 1.2024048096192386, |
| "step": 1200 |
| }, |
| { |
| "loss": 1.0913, |
| "grad_norm": 0.34191203117370605, |
| "learning_rate": 8.070546497469829e-05, |
| "epoch": 1.2124248496993988, |
| "step": 1210 |
| }, |
| { |
| "loss": 1.0882, |
| "grad_norm": 0.36975690722465515, |
| "learning_rate": 7.89921564985657e-05, |
| "epoch": 1.2224448897795592, |
| "step": 1220 |
| }, |
| { |
| "loss": 1.0959, |
| "grad_norm": 0.35831448435783386, |
| "learning_rate": 7.728527575028426e-05, |
| "epoch": 1.2324649298597194, |
| "step": 1230 |
| }, |
| { |
| "loss": 1.099, |
| "grad_norm": 0.3613387644290924, |
| "learning_rate": 7.558534498074204e-05, |
| "epoch": 1.2424849699398797, |
| "step": 1240 |
| }, |
| { |
| "loss": 1.0867, |
| "grad_norm": 0.3645046055316925, |
| "learning_rate": 7.389288431435603e-05, |
| "epoch": 1.25250501002004, |
| "step": 1250 |
| }, |
| { |
| "loss": 1.0901, |
| "grad_norm": 0.3511161208152771, |
| "learning_rate": 7.220841158993056e-05, |
| "epoch": 1.2625250501002003, |
| "step": 1260 |
| }, |
| { |
| "loss": 1.0928, |
| "grad_norm": 0.3355962932109833, |
| "learning_rate": 7.053244220221546e-05, |
| "epoch": 1.2725450901803608, |
| "step": 1270 |
| }, |
| { |
| "loss": 1.1051, |
| "grad_norm": 0.3608781695365906, |
| "learning_rate": 6.886548894421166e-05, |
| "epoch": 1.282565130260521, |
| "step": 1280 |
| }, |
| { |
| "loss": 1.0771, |
| "grad_norm": 0.3543643057346344, |
| "learning_rate": 6.720806185027281e-05, |
| "epoch": 1.2925851703406814, |
| "step": 1290 |
| }, |
| { |
| "loss": 1.1026, |
| "grad_norm": 0.3539983034133911, |
| "learning_rate": 6.55606680400513e-05, |
| "epoch": 1.3026052104208417, |
| "step": 1300 |
| }, |
| { |
| "loss": 1.0764, |
| "grad_norm": 0.35117045044898987, |
| "learning_rate": 6.392381156333572e-05, |
| "epoch": 1.3126252505010019, |
| "step": 1310 |
| }, |
| { |
| "loss": 1.0734, |
| "grad_norm": 0.35716721415519714, |
| "learning_rate": 6.229799324582782e-05, |
| "epoch": 1.3226452905811623, |
| "step": 1320 |
| }, |
| { |
| "loss": 1.0889, |
| "grad_norm": 0.34911802411079407, |
| "learning_rate": 6.068371053590582e-05, |
| "epoch": 1.3326653306613228, |
| "step": 1330 |
| }, |
| { |
| "eval_loss": 1.1338779926300049, |
| "eval_runtime": 67.8228, |
| "eval_samples_per_second": 37.17, |
| "eval_steps_per_second": 9.304, |
| "epoch": 1.3326653306613228, |
| "step": 1330 |
| }, |
| { |
| "loss": 1.0902, |
| "grad_norm": 0.3515984117984772, |
| "learning_rate": 5.9081457352421254e-05, |
| "epoch": 1.342685370741483, |
| "step": 1340 |
| }, |
| { |
| "loss": 1.0822, |
| "grad_norm": 0.353769987821579, |
| "learning_rate": 5.7491723933575395e-05, |
| "epoch": 1.3527054108216432, |
| "step": 1350 |
| }, |
| { |
| "loss": 1.0729, |
| "grad_norm": 0.36617526412010193, |
| "learning_rate": 5.5914996686922305e-05, |
| "epoch": 1.3627254509018036, |
| "step": 1360 |
| }, |
| { |
| "loss": 1.0911, |
| "grad_norm": 0.3742637038230896, |
| "learning_rate": 5.4351758040543424e-05, |
| "epoch": 1.3727454909819639, |
| "step": 1370 |
| }, |
| { |
| "loss": 1.0858, |
| "grad_norm": 0.3658939003944397, |
| "learning_rate": 5.280248629544027e-05, |
| "epoch": 1.3827655310621243, |
| "step": 1380 |
| }, |
| { |
| "loss": 1.0966, |
| "grad_norm": 0.35165053606033325, |
| "learning_rate": 5.1267655479189416e-05, |
| "epoch": 1.3927855711422845, |
| "step": 1390 |
| }, |
| { |
| "loss": 1.0738, |
| "grad_norm": 0.359625905752182, |
| "learning_rate": 4.974773520090541e-05, |
| "epoch": 1.402805611222445, |
| "step": 1400 |
| }, |
| { |
| "loss": 1.0833, |
| "grad_norm": 0.3560248613357544, |
| "learning_rate": 4.8243190507555314e-05, |
| "epoch": 1.4128256513026052, |
| "step": 1410 |
| }, |
| { |
| "loss": 1.0768, |
| "grad_norm": 0.3535526394844055, |
| "learning_rate": 4.675448174166912e-05, |
| "epoch": 1.4228456913827654, |
| "step": 1420 |
| }, |
| { |
| "loss": 1.0757, |
| "grad_norm": 0.3635563850402832, |
| "learning_rate": 4.5282064400489943e-05, |
| "epoch": 1.4328657314629258, |
| "step": 1430 |
| }, |
| { |
| "loss": 1.0941, |
| "grad_norm": 0.34245678782463074, |
| "learning_rate": 4.382638899660613e-05, |
| "epoch": 1.4428857715430863, |
| "step": 1440 |
| }, |
| { |
| "loss": 1.082, |
| "grad_norm": 0.3664350211620331, |
| "learning_rate": 4.238790092010897e-05, |
| "epoch": 1.4529058116232465, |
| "step": 1450 |
| }, |
| { |
| "loss": 1.0803, |
| "grad_norm": 0.35511618852615356, |
| "learning_rate": 4.096704030231767e-05, |
| "epoch": 1.4629258517034067, |
| "step": 1460 |
| }, |
| { |
| "eval_loss": 1.1275933980941772, |
| "eval_runtime": 68.2799, |
| "eval_samples_per_second": 36.922, |
| "eval_steps_per_second": 9.241, |
| "epoch": 1.465931863727455, |
| "step": 1463 |
| }, |
| { |
| "loss": 1.0844, |
| "grad_norm": 0.36960098147392273, |
| "learning_rate": 3.956424188111314e-05, |
| "epoch": 1.4729458917835672, |
| "step": 1470 |
| }, |
| { |
| "loss": 1.0896, |
| "grad_norm": 0.35100260376930237, |
| "learning_rate": 3.8179934867922016e-05, |
| "epoch": 1.4829659318637274, |
| "step": 1480 |
| }, |
| { |
| "loss": 1.0804, |
| "grad_norm": 0.3652225732803345, |
| "learning_rate": 3.681454281639195e-05, |
| "epoch": 1.4929859719438878, |
| "step": 1490 |
| }, |
| { |
| "loss": 1.0912, |
| "grad_norm": 0.36614513397216797, |
| "learning_rate": 3.54684834927976e-05, |
| "epoch": 1.503006012024048, |
| "step": 1500 |
| }, |
| { |
| "loss": 1.0817, |
| "grad_norm": 0.3566560447216034, |
| "learning_rate": 3.4142168748217405e-05, |
| "epoch": 1.5130260521042085, |
| "step": 1510 |
| }, |
| { |
| "loss": 1.0955, |
| "grad_norm": 0.34692510962486267, |
| "learning_rate": 3.2836004392520624e-05, |
| "epoch": 1.5230460921843687, |
| "step": 1520 |
| }, |
| { |
| "loss": 1.0758, |
| "grad_norm": 0.3460650146007538, |
| "learning_rate": 3.1550390070202255e-05, |
| "epoch": 1.533066132264529, |
| "step": 1530 |
| }, |
| { |
| "loss": 1.0842, |
| "grad_norm": 0.3645261228084564, |
| "learning_rate": 3.0285719138104628e-05, |
| "epoch": 1.5430861723446894, |
| "step": 1540 |
| }, |
| { |
| "loss": 1.0795, |
| "grad_norm": 0.3634505867958069, |
| "learning_rate": 2.9042378545063e-05, |
| "epoch": 1.5531062124248498, |
| "step": 1550 |
| }, |
| { |
| "loss": 1.0841, |
| "grad_norm": 0.36353570222854614, |
| "learning_rate": 2.7820748713511414e-05, |
| "epoch": 1.56312625250501, |
| "step": 1560 |
| }, |
| { |
| "loss": 1.0884, |
| "grad_norm": 0.3511188328266144, |
| "learning_rate": 2.662120342308557e-05, |
| "epoch": 1.5731462925851702, |
| "step": 1570 |
| }, |
| { |
| "loss": 1.0697, |
| "grad_norm": 0.37336093187332153, |
| "learning_rate": 2.5444109696258434e-05, |
| "epoch": 1.5831663326653307, |
| "step": 1580 |
| }, |
| { |
| "loss": 1.0713, |
| "grad_norm": 0.370236873626709, |
| "learning_rate": 2.428982768604281e-05, |
| "epoch": 1.5931863727454911, |
| "step": 1590 |
| }, |
| { |
| "eval_loss": 1.1230673789978027, |
| "eval_runtime": 68.22, |
| "eval_samples_per_second": 36.954, |
| "eval_steps_per_second": 9.249, |
| "epoch": 1.5991983967935872, |
| "step": 1596 |
| }, |
| { |
| "loss": 1.0832, |
| "grad_norm": 0.3588917553424835, |
| "learning_rate": 2.3158710565796348e-05, |
| "epoch": 1.6032064128256514, |
| "step": 1600 |
| }, |
| { |
| "loss": 1.0861, |
| "grad_norm": 0.34933748841285706, |
| "learning_rate": 2.2051104421161607e-05, |
| "epoch": 1.6132264529058116, |
| "step": 1610 |
| }, |
| { |
| "loss": 1.0667, |
| "grad_norm": 0.36681294441223145, |
| "learning_rate": 2.0967348144174924e-05, |
| "epoch": 1.623246492985972, |
| "step": 1620 |
| }, |
| { |
| "loss": 1.0567, |
| "grad_norm": 0.34613531827926636, |
| "learning_rate": 1.9907773329576375e-05, |
| "epoch": 1.6332665330661322, |
| "step": 1630 |
| }, |
| { |
| "loss": 1.082, |
| "grad_norm": 0.35560983419418335, |
| "learning_rate": 1.887270417335241e-05, |
| "epoch": 1.6432865731462925, |
| "step": 1640 |
| }, |
| { |
| "loss": 1.0606, |
| "grad_norm": 0.3562234342098236, |
| "learning_rate": 1.7862457373542095e-05, |
| "epoch": 1.653306613226453, |
| "step": 1650 |
| }, |
| { |
| "loss": 1.0716, |
| "grad_norm": 0.3679576516151428, |
| "learning_rate": 1.6877342033337872e-05, |
| "epoch": 1.6633266533066133, |
| "step": 1660 |
| }, |
| { |
| "loss": 1.0546, |
| "grad_norm": 0.36080437898635864, |
| "learning_rate": 1.5917659566509746e-05, |
| "epoch": 1.6733466933867736, |
| "step": 1670 |
| }, |
| { |
| "loss": 1.0934, |
| "grad_norm": 0.36290815472602844, |
| "learning_rate": 1.4983703605182242e-05, |
| "epoch": 1.6833667334669338, |
| "step": 1680 |
| }, |
| { |
| "loss": 1.0773, |
| "grad_norm": 0.365818053483963, |
| "learning_rate": 1.4075759909992548e-05, |
| "epoch": 1.6933867735470942, |
| "step": 1690 |
| }, |
| { |
| "loss": 1.0882, |
| "grad_norm": 0.35331591963768005, |
| "learning_rate": 1.3194106282656827e-05, |
| "epoch": 1.7034068136272547, |
| "step": 1700 |
| }, |
| { |
| "loss": 1.065, |
| "grad_norm": 0.3647233247756958, |
| "learning_rate": 1.2339012480971712e-05, |
| "epoch": 1.7134268537074149, |
| "step": 1710 |
| }, |
| { |
| "loss": 1.0634, |
| "grad_norm": 0.36989837884902954, |
| "learning_rate": 1.1510740136277109e-05, |
| "epoch": 1.723446893787575, |
| "step": 1720 |
| }, |
| { |
| "eval_loss": 1.1204578876495361, |
| "eval_runtime": 68.1731, |
| "eval_samples_per_second": 36.979, |
| "eval_steps_per_second": 9.256, |
| "epoch": 1.7324649298597194, |
| "step": 1729 |
| }, |
| { |
| "loss": 1.066, |
| "grad_norm": 0.36859068274497986, |
| "learning_rate": 1.070954267340547e-05, |
| "epoch": 1.7334669338677355, |
| "step": 1730 |
| }, |
| { |
| "loss": 1.0864, |
| "grad_norm": 0.3574073314666748, |
| "learning_rate": 9.9356652331417e-06, |
| "epoch": 1.7434869739478958, |
| "step": 1740 |
| }, |
| { |
| "loss": 1.0706, |
| "grad_norm": 0.3612259030342102, |
| "learning_rate": 9.189344597218153e-06, |
| "epoch": 1.753507014028056, |
| "step": 1750 |
| }, |
| { |
| "loss": 1.0713, |
| "grad_norm": 0.3700193464756012, |
| "learning_rate": 8.470809115866818e-06, |
| "epoch": 1.7635270541082164, |
| "step": 1760 |
| }, |
| { |
| "loss": 1.0803, |
| "grad_norm": 0.3535062074661255, |
| "learning_rate": 7.780278637951521e-06, |
| "epoch": 1.7735470941883769, |
| "step": 1770 |
| }, |
| { |
| "loss": 1.0647, |
| "grad_norm": 0.35681986808776855, |
| "learning_rate": 7.117964443701242e-06, |
| "epoch": 1.783567134268537, |
| "step": 1780 |
| }, |
| { |
| "loss": 1.0669, |
| "grad_norm": 0.3624560832977295, |
| "learning_rate": 6.484069180065055e-06, |
| "epoch": 1.7935871743486973, |
| "step": 1790 |
| }, |
| { |
| "loss": 1.0719, |
| "grad_norm": 0.35827717185020447, |
| "learning_rate": 5.8787867987087355e-06, |
| "epoch": 1.8036072144288577, |
| "step": 1800 |
| }, |
| { |
| "loss": 1.0855, |
| "grad_norm": 0.3589949905872345, |
| "learning_rate": 5.302302496671641e-06, |
| "epoch": 1.8136272545090182, |
| "step": 1810 |
| }, |
| { |
| "loss": 1.0722, |
| "grad_norm": 0.358090341091156, |
| "learning_rate": 4.754792659702468e-06, |
| "epoch": 1.8236472945891784, |
| "step": 1820 |
| }, |
| { |
| "loss": 1.0719, |
| "grad_norm": 0.3602781593799591, |
| "learning_rate": 4.236424808290751e-06, |
| "epoch": 1.8336673346693386, |
| "step": 1830 |
| }, |
| { |
| "loss": 1.0815, |
| "grad_norm": 0.3476168215274811, |
| "learning_rate": 3.7473575464110455e-06, |
| "epoch": 1.843687374749499, |
| "step": 1840 |
| }, |
| { |
| "loss": 1.0774, |
| "grad_norm": 0.3502495288848877, |
| "learning_rate": 3.2877405129950967e-06, |
| "epoch": 1.8537074148296593, |
| "step": 1850 |
| }, |
| { |
| "loss": 1.0736, |
| "grad_norm": 0.3665917217731476, |
| "learning_rate": 2.857714336147188e-06, |
| "epoch": 1.8637274549098195, |
| "step": 1860 |
| }, |
| { |
| "eval_loss": 1.1194243431091309, |
| "eval_runtime": 68.2716, |
| "eval_samples_per_second": 36.926, |
| "eval_steps_per_second": 9.242, |
| "epoch": 1.8657314629258517, |
| "step": 1862 |
| }, |
| { |
| "loss": 1.0802, |
| "grad_norm": 0.36012518405914307, |
| "learning_rate": 2.457410590116427e-06, |
| "epoch": 1.87374749498998, |
| "step": 1870 |
| }, |
| { |
| "loss": 1.0787, |
| "grad_norm": 0.3560042977333069, |
| "learning_rate": 2.086951755039168e-06, |
| "epoch": 1.8837675350701404, |
| "step": 1880 |
| }, |
| { |
| "loss": 1.0742, |
| "grad_norm": 0.36220309138298035, |
| "learning_rate": 1.746451179464137e-06, |
| "epoch": 1.8937875751503006, |
| "step": 1890 |
| }, |
| { |
| "loss": 1.0727, |
| "grad_norm": 0.3498152196407318, |
| "learning_rate": 1.4360130456712695e-06, |
| "epoch": 1.9038076152304608, |
| "step": 1900 |
| }, |
| { |
| "loss": 1.0738, |
| "grad_norm": 0.3657923936843872, |
| "learning_rate": 1.1557323377953456e-06, |
| "epoch": 1.9138276553106213, |
| "step": 1910 |
| }, |
| { |
| "loss": 1.0568, |
| "grad_norm": 0.36668485403060913, |
| "learning_rate": 9.056948127638687e-07, |
| "epoch": 1.9238476953907817, |
| "step": 1920 |
| }, |
| { |
| "loss": 1.0568, |
| "grad_norm": 0.3519572615623474, |
| "learning_rate": 6.859769740582e-07, |
| "epoch": 1.933867735470942, |
| "step": 1930 |
| }, |
| { |
| "loss": 1.0621, |
| "grad_norm": 0.3648887574672699, |
| "learning_rate": 4.966460483059044e-07, |
| "epoch": 1.9438877755511021, |
| "step": 1940 |
| }, |
| { |
| "loss": 1.0688, |
| "grad_norm": 0.3495730757713318, |
| "learning_rate": 3.3775996471160366e-07, |
| "epoch": 1.9539078156312626, |
| "step": 1950 |
| }, |
| { |
| "loss": 1.0652, |
| "grad_norm": 0.3570130169391632, |
| "learning_rate": 2.093673373324334e-07, |
| "epoch": 1.9639278557114228, |
| "step": 1960 |
| }, |
| { |
| "loss": 1.0822, |
| "grad_norm": 0.3690868020057678, |
| "learning_rate": 1.1150745020376275e-07, |
| "epoch": 1.973947895791583, |
| "step": 1970 |
| }, |
| { |
| "loss": 1.0665, |
| "grad_norm": 0.3751429319381714, |
| "learning_rate": 4.421024531948703e-08, |
| "epoch": 1.9839679358717435, |
| "step": 1980 |
| }, |
| { |
| "loss": 1.065, |
| "grad_norm": 0.3617876172065735, |
| "learning_rate": 7.496313470778393e-09, |
| "epoch": 1.993987975951904, |
| "step": 1990 |
| }, |
| { |
| "eval_loss": 1.1191450357437134, |
| "eval_runtime": 68.3979, |
| "eval_samples_per_second": 36.858, |
| "eval_steps_per_second": 9.225, |
| "epoch": 1.9989979959919841, |
| "step": 1995 |
| }, |
| { |
| "train_runtime": 6834.1445, |
| "train_samples_per_second": 14.016, |
| "train_steps_per_second": 0.292, |
| "total_flos": 7.260644379510651e+17, |
| "train_loss": 1.1743444665400442, |
| "epoch": 2.0, |
| "step": 1996 |
| } |
| ] |