[
{
"loss": 3.062,
"grad_norm": 9.917159080505371,
"learning_rate": 9e-06,
"epoch": 0.01002004008016032,
"step": 10
},
{
"loss": 2.0797,
"grad_norm": 1.8970115184783936,
"learning_rate": 1.9e-05,
"epoch": 0.02004008016032064,
"step": 20
},
{
"loss": 1.6129,
"grad_norm": 0.81168532371521,
"learning_rate": 2.9e-05,
"epoch": 0.03006012024048096,
"step": 30
},
{
"loss": 1.4949,
"grad_norm": 0.470950186252594,
"learning_rate": 3.9000000000000006e-05,
"epoch": 0.04008016032064128,
"step": 40
},
{
"loss": 1.4555,
"grad_norm": 0.42354708909988403,
"learning_rate": 4.9e-05,
"epoch": 0.050100200400801605,
"step": 50
},
{
"loss": 1.4276,
"grad_norm": 0.4629833698272705,
"learning_rate": 5.9e-05,
"epoch": 0.06012024048096192,
"step": 60
},
{
"loss": 1.3983,
"grad_norm": 0.6642730236053467,
"learning_rate": 6.9e-05,
"epoch": 0.07014028056112225,
"step": 70
},
{
"loss": 1.3804,
"grad_norm": 0.33904388546943665,
"learning_rate": 7.900000000000001e-05,
"epoch": 0.08016032064128256,
"step": 80
},
{
"loss": 1.3628,
"grad_norm": 0.37728458642959595,
"learning_rate": 8.900000000000001e-05,
"epoch": 0.09018036072144289,
"step": 90
},
{
"loss": 1.3522,
"grad_norm": 0.3425058126449585,
"learning_rate": 9.900000000000001e-05,
"epoch": 0.10020040080160321,
"step": 100
},
{
"loss": 1.346,
"grad_norm": 0.3616705536842346,
"learning_rate": 0.000109,
"epoch": 0.11022044088176353,
"step": 110
},
{
"loss": 1.3488,
"grad_norm": 0.3829646408557892,
"learning_rate": 0.000119,
"epoch": 0.12024048096192384,
"step": 120
},
{
"loss": 1.3226,
"grad_norm": 0.3760342299938202,
"learning_rate": 0.00012900000000000002,
"epoch": 0.13026052104208416,
"step": 130
},
{
"eval_loss": 1.3178932666778564,
"eval_runtime": 84.6431,
"eval_samples_per_second": 29.784,
"eval_steps_per_second": 7.455,
"epoch": 0.13326653306613226,
"step": 133
},
{
"loss": 1.3309,
"grad_norm": 0.37514758110046387,
"learning_rate": 0.000139,
"epoch": 0.1402805611222445,
"step": 140
},
{
"loss": 1.3105,
"grad_norm": 0.7152091264724731,
"learning_rate": 0.00014900000000000002,
"epoch": 0.15030060120240482,
"step": 150
},
{
"loss": 1.3244,
"grad_norm": 0.3840025067329407,
"learning_rate": 0.00015900000000000002,
"epoch": 0.16032064128256512,
"step": 160
},
{
"loss": 1.3289,
"grad_norm": 0.35113897919654846,
"learning_rate": 0.00016900000000000002,
"epoch": 0.17034068136272545,
"step": 170
},
{
"loss": 1.2968,
"grad_norm": 0.38480180501937866,
"learning_rate": 0.00017900000000000001,
"epoch": 0.18036072144288579,
"step": 180
},
{
"loss": 1.296,
"grad_norm": 0.3818041980266571,
"learning_rate": 0.00018899999999999999,
"epoch": 0.1903807615230461,
"step": 190
},
{
"loss": 1.3032,
"grad_norm": 0.4464828073978424,
"learning_rate": 0.000199,
"epoch": 0.20040080160320642,
"step": 200
},
{
"loss": 1.2942,
"grad_norm": 0.3449303209781647,
"learning_rate": 0.0001999876082359844,
"epoch": 0.21042084168336672,
"step": 210
},
{
"loss": 1.288,
"grad_norm": 0.3817189633846283,
"learning_rate": 0.0001999447764513578,
"epoch": 0.22044088176352705,
"step": 220
},
{
"loss": 1.2808,
"grad_norm": 0.5007427930831909,
"learning_rate": 0.00019987136476341398,
"epoch": 0.23046092184368738,
"step": 230
},
{
"loss": 1.2796,
"grad_norm": 0.33001911640167236,
"learning_rate": 0.00019976739563378035,
"epoch": 0.24048096192384769,
"step": 240
},
{
"loss": 1.2582,
"grad_norm": 0.33476898074150085,
"learning_rate": 0.00019963290087368342,
"epoch": 0.250501002004008,
"step": 250
},
{
"loss": 1.2583,
"grad_norm": 0.34601467847824097,
"learning_rate": 0.00019946792163421596,
"epoch": 0.2605210420841683,
"step": 260
},
{
"eval_loss": 1.2609914541244507,
"eval_runtime": 68.3146,
"eval_samples_per_second": 36.903,
"eval_steps_per_second": 9.237,
"epoch": 0.2665330661322645,
"step": 266
},
{
"loss": 1.2781,
"grad_norm": 0.3411717116832733,
"learning_rate": 0.00019927250839374582,
"epoch": 0.27054108216432865,
"step": 270
},
{
"loss": 1.267,
"grad_norm": 0.3507177233695984,
"learning_rate": 0.00019904672094247128,
"epoch": 0.280561122244489,
"step": 280
},
{
"loss": 1.275,
"grad_norm": 0.3237634599208832,
"learning_rate": 0.0001987906283641271,
"epoch": 0.2905811623246493,
"step": 290
},
{
"loss": 1.252,
"grad_norm": 0.3348980247974396,
"learning_rate": 0.0001985043090148472,
"epoch": 0.30060120240480964,
"step": 300
},
{
"loss": 1.2548,
"grad_norm": 0.32034358382225037,
"learning_rate": 0.0001981878504991901,
"epoch": 0.3106212424849699,
"step": 310
},
{
"loss": 1.2595,
"grad_norm": 0.3158237040042877,
"learning_rate": 0.0001978413496433348,
"epoch": 0.32064128256513025,
"step": 320
},
{
"loss": 1.2555,
"grad_norm": 0.31501418352127075,
"learning_rate": 0.00019746491246545503,
"epoch": 0.3306613226452906,
"step": 330
},
{
"loss": 1.2511,
"grad_norm": 0.33747926354408264,
"learning_rate": 0.00019705865414328103,
"epoch": 0.3406813627254509,
"step": 340
},
{
"loss": 1.2497,
"grad_norm": 0.3418786823749542,
"learning_rate": 0.0001966226989788589,
"epoch": 0.35070140280561124,
"step": 350
},
{
"loss": 1.2495,
"grad_norm": 0.3279886543750763,
"learning_rate": 0.00019615718036051827,
"epoch": 0.36072144288577157,
"step": 360
},
{
"loss": 1.2427,
"grad_norm": 0.3327982723712921,
"learning_rate": 0.00019566224072205954,
"epoch": 0.37074148296593185,
"step": 370
},
{
"loss": 1.2357,
"grad_norm": 5.1672163009643555,
"learning_rate": 0.00019513803149917377,
"epoch": 0.3807615230460922,
"step": 380
},
{
"loss": 1.246,
"grad_norm": 0.3257956802845001,
"learning_rate": 0.00019458471308310822,
"epoch": 0.3907815631262525,
"step": 390
},
{
"eval_loss": 1.229310154914856,
"eval_runtime": 68.3539,
"eval_samples_per_second": 36.882,
"eval_steps_per_second": 9.231,
"epoch": 0.3997995991983968,
"step": 399
},
{
"loss": 1.2415,
"grad_norm": 0.32486265897750854,
"learning_rate": 0.0001940024547715918,
"epoch": 0.40080160320641284,
"step": 400
},
{
"loss": 1.2388,
"grad_norm": 0.3178859055042267,
"learning_rate": 0.00019339143471703532,
"epoch": 0.41082164328657317,
"step": 410
},
{
"loss": 1.2318,
"grad_norm": 0.31442832946777344,
"learning_rate": 0.00019275183987202255,
"epoch": 0.42084168336673344,
"step": 420
},
{
"loss": 1.2061,
"grad_norm": 0.3233127295970917,
"learning_rate": 0.00019208386593210874,
"epoch": 0.4308617234468938,
"step": 430
},
{
"loss": 1.2186,
"grad_norm": 0.3286103308200836,
"learning_rate": 0.00019138771727594405,
"epoch": 0.4408817635270541,
"step": 440
},
{
"loss": 1.2284,
"grad_norm": 0.32425037026405334,
"learning_rate": 0.00019066360690274018,
"epoch": 0.45090180360721444,
"step": 450
},
{
"loss": 1.2281,
"grad_norm": 0.3199349641799927,
"learning_rate": 0.00018991175636709953,
"epoch": 0.46092184368737477,
"step": 460
},
{
"loss": 1.2332,
"grad_norm": 0.36368799209594727,
"learning_rate": 0.0001891323957112264,
"epoch": 0.4709418837675351,
"step": 470
},
{
"loss": 1.2067,
"grad_norm": 0.34387969970703125,
"learning_rate": 0.00018832576339454166,
"epoch": 0.48096192384769537,
"step": 480
},
{
"loss": 1.2294,
"grad_norm": 0.34961533546447754,
"learning_rate": 0.00018749210622072155,
"epoch": 0.4909819639278557,
"step": 490
},
{
"loss": 1.2248,
"grad_norm": 0.3662363886833191,
"learning_rate": 0.00018663167926218392,
"epoch": 0.501002004008016,
"step": 500
},
{
"loss": 1.2496,
"grad_norm": 0.356840580701828,
"learning_rate": 0.00018574474578204403,
"epoch": 0.5110220440881763,
"step": 510
},
{
"loss": 1.219,
"grad_norm": 0.348263144493103,
"learning_rate": 0.00018483157715356457,
"epoch": 0.5210420841683366,
"step": 520
},
{
"loss": 1.2229,
"grad_norm": 0.3213677704334259,
"learning_rate": 0.00018389245277712396,
"epoch": 0.531062124248497,
"step": 530
},
{
"eval_loss": 1.204853892326355,
"eval_runtime": 68.393,
"eval_samples_per_second": 36.86,
"eval_steps_per_second": 9.226,
"epoch": 0.533066132264529,
"step": 532
},
{
"loss": 1.2225,
"grad_norm": 0.3370509147644043,
"learning_rate": 0.0001829276599947291,
"epoch": 0.5410821643286573,
"step": 540
},
{
"loss": 1.2177,
"grad_norm": 0.32882654666900635,
"learning_rate": 0.00018193749400209757,
"epoch": 0.5511022044088176,
"step": 550
},
{
"loss": 1.1929,
"grad_norm": 0.3255807161331177,
"learning_rate": 0.00018092225775833733,
"epoch": 0.561122244488978,
"step": 560
},
{
"loss": 1.2155,
"grad_norm": 0.31887778639793396,
"learning_rate": 0.00017988226189325103,
"epoch": 0.5711422845691383,
"step": 570
},
{
"loss": 1.2117,
"grad_norm": 0.3166080713272095,
"learning_rate": 0.000178817824612293,
"epoch": 0.5811623246492986,
"step": 580
},
{
"loss": 1.2088,
"grad_norm": 0.3295918107032776,
"learning_rate": 0.0001777292715992088,
"epoch": 0.591182364729459,
"step": 590
},
{
"loss": 1.2201,
"grad_norm": 0.3369976282119751,
"learning_rate": 0.00017661693591638618,
"epoch": 0.6012024048096193,
"step": 600
},
{
"loss": 1.2027,
"grad_norm": 0.33591219782829285,
"learning_rate": 0.00017548115790294895,
"epoch": 0.6112224448897795,
"step": 610
},
{
"loss": 1.1963,
"grad_norm": 0.332086443901062,
"learning_rate": 0.0001743222850706238,
"epoch": 0.6212424849699398,
"step": 620
},
{
"loss": 1.1835,
"grad_norm": 0.321736603975296,
"learning_rate": 0.00017314067199741291,
"epoch": 0.6312625250501002,
"step": 630
},
{
"loss": 1.213,
"grad_norm": 0.3286183476448059,
"learning_rate": 0.0001719366802191046,
"epoch": 0.6412825651302605,
"step": 640
},
{
"loss": 1.1896,
"grad_norm": 0.31868550181388855,
"learning_rate": 0.00017071067811865476,
"epoch": 0.6513026052104208,
"step": 650
},
{
"loss": 1.2021,
"grad_norm": 0.328276664018631,
"learning_rate": 0.00016946304081347352,
"epoch": 0.6613226452905812,
"step": 660
},
{
"eval_loss": 1.1863322257995605,
"eval_runtime": 68.3185,
"eval_samples_per_second": 36.901,
"eval_steps_per_second": 9.236,
"epoch": 0.6663326653306614,
"step": 665
},
{
"loss": 1.1909,
"grad_norm": 0.3365529477596283,
"learning_rate": 0.0001681941500406513,
"epoch": 0.6713426853707415,
"step": 670
},
{
"loss": 1.2118,
"grad_norm": 0.3334924876689911,
"learning_rate": 0.00016690439404015955,
"epoch": 0.6813627254509018,
"step": 680
},
{
"loss": 1.1966,
"grad_norm": 0.3221626281738281,
"learning_rate": 0.00016559416743606163,
"epoch": 0.6913827655310621,
"step": 690
},
{
"loss": 1.1976,
"grad_norm": 0.3202773332595825,
"learning_rate": 0.0001642638711157706,
"epoch": 0.7014028056112225,
"step": 700
},
{
"loss": 1.1894,
"grad_norm": 0.32952797412872314,
"learning_rate": 0.00016291391210739034,
"epoch": 0.7114228456913828,
"step": 710
},
{
"loss": 1.1867,
"grad_norm": 0.33879393339157104,
"learning_rate": 0.0001615447034551782,
"epoch": 0.7214428857715431,
"step": 720
},
{
"loss": 1.1928,
"grad_norm": 0.3320337235927582,
"learning_rate": 0.00016015666409316642,
"epoch": 0.7314629258517034,
"step": 730
},
{
"loss": 1.2109,
"grad_norm": 0.32818353176116943,
"learning_rate": 0.00015875021871698195,
"epoch": 0.7414829659318637,
"step": 740
},
{
"loss": 1.1959,
"grad_norm": 0.32380595803260803,
"learning_rate": 0.00015732579765390296,
"epoch": 0.751503006012024,
"step": 750
},
{
"loss": 1.1632,
"grad_norm": 0.333734393119812,
"learning_rate": 0.00015588383673119274,
"epoch": 0.7615230460921844,
"step": 760
},
{
"loss": 1.1787,
"grad_norm": 0.3443449139595032,
"learning_rate": 0.00015442477714275023,
"epoch": 0.7715430861723447,
"step": 770
},
{
"loss": 1.178,
"grad_norm": 0.34319791197776794,
"learning_rate": 0.00015294906531411888,
"epoch": 0.781563126252505,
"step": 780
},
{
"loss": 1.1906,
"grad_norm": 0.34300360083580017,
"learning_rate": 0.00015145715276589487,
"epoch": 0.7915831663326653,
"step": 790
},
{
"eval_loss": 1.1701182126998901,
"eval_runtime": 68.3162,
"eval_samples_per_second": 36.902,
"eval_steps_per_second": 9.236,
"epoch": 0.7995991983967936,
"step": 798
},
{
"loss": 1.1745,
"grad_norm": 0.3376515507698059,
"learning_rate": 0.00014994949597557612,
"epoch": 0.8016032064128257,
"step": 800
},
{
"loss": 1.1681,
"grad_norm": 0.3189542591571808,
"learning_rate": 0.0001484265562378947,
"epoch": 0.811623246492986,
"step": 810
},
{
"loss": 1.1915,
"grad_norm": 0.33153972029685974,
"learning_rate": 0.00014688879952367572,
"epoch": 0.8216432865731463,
"step": 820
},
{
"loss": 1.1668,
"grad_norm": 0.3349854052066803,
"learning_rate": 0.00014533669633726474,
"epoch": 0.8316633266533067,
"step": 830
},
{
"loss": 1.1678,
"grad_norm": 0.33402037620544434,
"learning_rate": 0.0001437707215725688,
"epoch": 0.8416833667334669,
"step": 840
},
{
"loss": 1.173,
"grad_norm": 0.34636396169662476,
"learning_rate": 0.00014219135436775412,
"epoch": 0.8517034068136272,
"step": 850
},
{
"loss": 1.1712,
"grad_norm": 0.3418276607990265,
"learning_rate": 0.00014059907795864487,
"epoch": 0.8617234468937875,
"step": 860
},
{
"loss": 1.1788,
"grad_norm": 0.3244767189025879,
"learning_rate": 0.00013899437953086865,
"epoch": 0.8717434869739479,
"step": 870
},
{
"loss": 1.1555,
"grad_norm": 0.3318007290363312,
"learning_rate": 0.00013737775007079334,
"epoch": 0.8817635270541082,
"step": 880
},
{
"loss": 1.1756,
"grad_norm": 0.3220977485179901,
"learning_rate": 0.00013574968421530088,
"epoch": 0.8917835671342685,
"step": 890
},
{
"loss": 1.1644,
"grad_norm": 0.32579556107521057,
"learning_rate": 0.0001341106801004442,
"epoch": 0.9018036072144289,
"step": 900
},
{
"loss": 1.1579,
"grad_norm": 0.3344557285308838,
"learning_rate": 0.00013246123920903358,
"epoch": 0.9118236472945892,
"step": 910
},
{
"loss": 1.1442,
"grad_norm": 0.3250351846218109,
"learning_rate": 0.000130801866217199,
"epoch": 0.9218436873747495,
"step": 920
},
{
"loss": 1.1608,
"grad_norm": 0.3367891013622284,
"learning_rate": 0.00012913306883997528,
"epoch": 0.9318637274549099,
"step": 930
},
{
"eval_loss": 1.156960368156433,
"eval_runtime": 68.2656,
"eval_samples_per_second": 36.929,
"eval_steps_per_second": 9.243,
"epoch": 0.9328657314629258,
"step": 931
},
{
"loss": 1.1916,
"grad_norm": 0.33810174465179443,
"learning_rate": 0.00012745535767595754,
"epoch": 0.9418837675350702,
"step": 940
},
{
"loss": 1.166,
"grad_norm": 0.32210567593574524,
"learning_rate": 0.00012576924605107456,
"epoch": 0.9519038076152304,
"step": 950
},
{
"loss": 1.1662,
"grad_norm": 0.33408471941947937,
"learning_rate": 0.0001240752498615272,
"epoch": 0.9619238476953907,
"step": 960
},
{
"loss": 1.1691,
"grad_norm": 0.3320842683315277,
"learning_rate": 0.00012237388741594078,
"epoch": 0.9719438877755511,
"step": 970
},
{
"loss": 1.1608,
"grad_norm": 0.35282036662101746,
"learning_rate": 0.00012066567927677938,
"epoch": 0.9819639278557114,
"step": 980
},
{
"loss": 1.1759,
"grad_norm": 0.3290286958217621,
"learning_rate": 0.00011895114810107015,
"epoch": 0.9919839679358717,
"step": 990
},
{
"loss": 1.1491,
"grad_norm": 0.33208462595939636,
"learning_rate": 0.0001172308184804871,
"epoch": 1.002004008016032,
"step": 1000
},
{
"loss": 1.1075,
"grad_norm": 0.3253942131996155,
"learning_rate": 0.00011550521678084279,
"epoch": 1.0120240480961924,
"step": 1010
},
{
"loss": 1.0982,
"grad_norm": 0.3393993377685547,
"learning_rate": 0.00011377487098103735,
"epoch": 1.0220440881763526,
"step": 1020
},
{
"loss": 1.0971,
"grad_norm": 0.3415449857711792,
"learning_rate": 0.00011204031051151364,
"epoch": 1.032064128256513,
"step": 1030
},
{
"loss": 1.0864,
"grad_norm": 0.3586030900478363,
"learning_rate": 0.00011030206609226868,
"epoch": 1.0420841683366733,
"step": 1040
},
{
"loss": 1.1017,
"grad_norm": 0.3494417369365692,
"learning_rate": 0.0001085606695704701,
"epoch": 1.0521042084168337,
"step": 1050
},
{
"loss": 1.0877,
"grad_norm": 0.35265541076660156,
"learning_rate": 0.0001068166537577282,
"epoch": 1.062124248496994,
"step": 1060
},
{
"eval_loss": 1.1489150524139404,
"eval_runtime": 68.2603,
"eval_samples_per_second": 36.932,
"eval_steps_per_second": 9.244,
"epoch": 1.066132264529058,
"step": 1064
},
{
"loss": 1.083,
"grad_norm": 0.35727137327194214,
"learning_rate": 0.00010507055226707235,
"epoch": 1.0721442885771544,
"step": 1070
},
{
"loss": 1.0813,
"grad_norm": 0.34173229336738586,
"learning_rate": 0.0001033228993496827,
"epoch": 1.0821643286573146,
"step": 1080
},
{
"loss": 1.1103,
"grad_norm": 0.35060715675354004,
"learning_rate": 0.00010157422973142629,
"epoch": 1.092184368737475,
"step": 1090
},
{
"loss": 1.0888,
"grad_norm": 0.3494425415992737,
"learning_rate": 9.982507844924809e-05,
"epoch": 1.1022044088176353,
"step": 1100
},
{
"loss": 1.0822,
"grad_norm": 0.3451234698295593,
"learning_rate": 9.807598068746686e-05,
"epoch": 1.1122244488977957,
"step": 1110
},
{
"loss": 1.0956,
"grad_norm": 0.357877254486084,
"learning_rate": 9.632747161402581e-05,
"epoch": 1.122244488977956,
"step": 1120
},
{
"loss": 1.0965,
"grad_norm": 0.3512496054172516,
"learning_rate": 9.458008621674833e-05,
"epoch": 1.1322645290581161,
"step": 1130
},
{
"loss": 1.0973,
"grad_norm": 0.3644641637802124,
"learning_rate": 9.283435913964887e-05,
"epoch": 1.1422845691382766,
"step": 1140
},
{
"loss": 1.0925,
"grad_norm": 0.36287280917167664,
"learning_rate": 9.109082451934903e-05,
"epoch": 1.1523046092184368,
"step": 1150
},
{
"loss": 1.0843,
"grad_norm": 0.3538283407688141,
"learning_rate": 8.935001582164876e-05,
"epoch": 1.1623246492985972,
"step": 1160
},
{
"loss": 1.0986,
"grad_norm": 0.35994771122932434,
"learning_rate": 8.761246567830283e-05,
"epoch": 1.1723446893787575,
"step": 1170
},
{
"loss": 1.091,
"grad_norm": 0.3535645604133606,
"learning_rate": 8.587870572405278e-05,
"epoch": 1.182364729458918,
"step": 1180
},
{
"loss": 1.0724,
"grad_norm": 0.35088545083999634,
"learning_rate": 8.414926643396355e-05,
"epoch": 1.1923847695390781,
"step": 1190
},
{
"eval_loss": 1.1411553621292114,
"eval_runtime": 68.2204,
"eval_samples_per_second": 36.954,
"eval_steps_per_second": 9.249,
"epoch": 1.1993987975951903,
"step": 1197
},
{
"loss": 1.1034,
"grad_norm": 0.349630743265152,
"learning_rate": 8.2424676961115e-05,
"epoch": 1.2024048096192386,
"step": 1200
},
{
"loss": 1.0913,
"grad_norm": 0.34191203117370605,
"learning_rate": 8.070546497469829e-05,
"epoch": 1.2124248496993988,
"step": 1210
},
{
"loss": 1.0882,
"grad_norm": 0.36975690722465515,
"learning_rate": 7.89921564985657e-05,
"epoch": 1.2224448897795592,
"step": 1220
},
{
"loss": 1.0959,
"grad_norm": 0.35831448435783386,
"learning_rate": 7.728527575028426e-05,
"epoch": 1.2324649298597194,
"step": 1230
},
{
"loss": 1.099,
"grad_norm": 0.3613387644290924,
"learning_rate": 7.558534498074204e-05,
"epoch": 1.2424849699398797,
"step": 1240
},
{
"loss": 1.0867,
"grad_norm": 0.3645046055316925,
"learning_rate": 7.389288431435603e-05,
"epoch": 1.25250501002004,
"step": 1250
},
{
"loss": 1.0901,
"grad_norm": 0.3511161208152771,
"learning_rate": 7.220841158993056e-05,
"epoch": 1.2625250501002003,
"step": 1260
},
{
"loss": 1.0928,
"grad_norm": 0.3355962932109833,
"learning_rate": 7.053244220221546e-05,
"epoch": 1.2725450901803608,
"step": 1270
},
{
"loss": 1.1051,
"grad_norm": 0.3608781695365906,
"learning_rate": 6.886548894421166e-05,
"epoch": 1.282565130260521,
"step": 1280
},
{
"loss": 1.0771,
"grad_norm": 0.3543643057346344,
"learning_rate": 6.720806185027281e-05,
"epoch": 1.2925851703406814,
"step": 1290
},
{
"loss": 1.1026,
"grad_norm": 0.3539983034133911,
"learning_rate": 6.55606680400513e-05,
"epoch": 1.3026052104208417,
"step": 1300
},
{
"loss": 1.0764,
"grad_norm": 0.35117045044898987,
"learning_rate": 6.392381156333572e-05,
"epoch": 1.3126252505010019,
"step": 1310
},
{
"loss": 1.0734,
"grad_norm": 0.35716721415519714,
"learning_rate": 6.229799324582782e-05,
"epoch": 1.3226452905811623,
"step": 1320
},
{
"loss": 1.0889,
"grad_norm": 0.34911802411079407,
"learning_rate": 6.068371053590582e-05,
"epoch": 1.3326653306613228,
"step": 1330
},
{
"eval_loss": 1.1338779926300049,
"eval_runtime": 67.8228,
"eval_samples_per_second": 37.17,
"eval_steps_per_second": 9.304,
"epoch": 1.3326653306613228,
"step": 1330
},
{
"loss": 1.0902,
"grad_norm": 0.3515984117984772,
"learning_rate": 5.9081457352421254e-05,
"epoch": 1.342685370741483,
"step": 1340
},
{
"loss": 1.0822,
"grad_norm": 0.353769987821579,
"learning_rate": 5.7491723933575395e-05,
"epoch": 1.3527054108216432,
"step": 1350
},
{
"loss": 1.0729,
"grad_norm": 0.36617526412010193,
"learning_rate": 5.5914996686922305e-05,
"epoch": 1.3627254509018036,
"step": 1360
},
{
"loss": 1.0911,
"grad_norm": 0.3742637038230896,
"learning_rate": 5.4351758040543424e-05,
"epoch": 1.3727454909819639,
"step": 1370
},
{
"loss": 1.0858,
"grad_norm": 0.3658939003944397,
"learning_rate": 5.280248629544027e-05,
"epoch": 1.3827655310621243,
"step": 1380
},
{
"loss": 1.0966,
"grad_norm": 0.35165053606033325,
"learning_rate": 5.1267655479189416e-05,
"epoch": 1.3927855711422845,
"step": 1390
},
{
"loss": 1.0738,
"grad_norm": 0.359625905752182,
"learning_rate": 4.974773520090541e-05,
"epoch": 1.402805611222445,
"step": 1400
},
{
"loss": 1.0833,
"grad_norm": 0.3560248613357544,
"learning_rate": 4.8243190507555314e-05,
"epoch": 1.4128256513026052,
"step": 1410
},
{
"loss": 1.0768,
"grad_norm": 0.3535526394844055,
"learning_rate": 4.675448174166912e-05,
"epoch": 1.4228456913827654,
"step": 1420
},
{
"loss": 1.0757,
"grad_norm": 0.3635563850402832,
"learning_rate": 4.5282064400489943e-05,
"epoch": 1.4328657314629258,
"step": 1430
},
{
"loss": 1.0941,
"grad_norm": 0.34245678782463074,
"learning_rate": 4.382638899660613e-05,
"epoch": 1.4428857715430863,
"step": 1440
},
{
"loss": 1.082,
"grad_norm": 0.3664350211620331,
"learning_rate": 4.238790092010897e-05,
"epoch": 1.4529058116232465,
"step": 1450
},
{
"loss": 1.0803,
"grad_norm": 0.35511618852615356,
"learning_rate": 4.096704030231767e-05,
"epoch": 1.4629258517034067,
"step": 1460
},
{
"eval_loss": 1.1275933980941772,
"eval_runtime": 68.2799,
"eval_samples_per_second": 36.922,
"eval_steps_per_second": 9.241,
"epoch": 1.465931863727455,
"step": 1463
},
{
"loss": 1.0844,
"grad_norm": 0.36960098147392273,
"learning_rate": 3.956424188111314e-05,
"epoch": 1.4729458917835672,
"step": 1470
},
{
"loss": 1.0896,
"grad_norm": 0.35100260376930237,
"learning_rate": 3.8179934867922016e-05,
"epoch": 1.4829659318637274,
"step": 1480
},
{
"loss": 1.0804,
"grad_norm": 0.3652225732803345,
"learning_rate": 3.681454281639195e-05,
"epoch": 1.4929859719438878,
"step": 1490
},
{
"loss": 1.0912,
"grad_norm": 0.36614513397216797,
"learning_rate": 3.54684834927976e-05,
"epoch": 1.503006012024048,
"step": 1500
},
{
"loss": 1.0817,
"grad_norm": 0.3566560447216034,
"learning_rate": 3.4142168748217405e-05,
"epoch": 1.5130260521042085,
"step": 1510
},
{
"loss": 1.0955,
"grad_norm": 0.34692510962486267,
"learning_rate": 3.2836004392520624e-05,
"epoch": 1.5230460921843687,
"step": 1520
},
{
"loss": 1.0758,
"grad_norm": 0.3460650146007538,
"learning_rate": 3.1550390070202255e-05,
"epoch": 1.533066132264529,
"step": 1530
},
{
"loss": 1.0842,
"grad_norm": 0.3645261228084564,
"learning_rate": 3.0285719138104628e-05,
"epoch": 1.5430861723446894,
"step": 1540
},
{
"loss": 1.0795,
"grad_norm": 0.3634505867958069,
"learning_rate": 2.9042378545063e-05,
"epoch": 1.5531062124248498,
"step": 1550
},
{
"loss": 1.0841,
"grad_norm": 0.36353570222854614,
"learning_rate": 2.7820748713511414e-05,
"epoch": 1.56312625250501,
"step": 1560
},
{
"loss": 1.0884,
"grad_norm": 0.3511188328266144,
"learning_rate": 2.662120342308557e-05,
"epoch": 1.5731462925851702,
"step": 1570
},
{
"loss": 1.0697,
"grad_norm": 0.37336093187332153,
"learning_rate": 2.5444109696258434e-05,
"epoch": 1.5831663326653307,
"step": 1580
},
{
"loss": 1.0713,
"grad_norm": 0.370236873626709,
"learning_rate": 2.428982768604281e-05,
"epoch": 1.5931863727454911,
"step": 1590
},
{
"eval_loss": 1.1230673789978027,
"eval_runtime": 68.22,
"eval_samples_per_second": 36.954,
"eval_steps_per_second": 9.249,
"epoch": 1.5991983967935872,
"step": 1596
},
{
"loss": 1.0832,
"grad_norm": 0.3588917553424835,
"learning_rate": 2.3158710565796348e-05,
"epoch": 1.6032064128256514,
"step": 1600
},
{
"loss": 1.0861,
"grad_norm": 0.34933748841285706,
"learning_rate": 2.2051104421161607e-05,
"epoch": 1.6132264529058116,
"step": 1610
},
{
"loss": 1.0667,
"grad_norm": 0.36681294441223145,
"learning_rate": 2.0967348144174924e-05,
"epoch": 1.623246492985972,
"step": 1620
},
{
"loss": 1.0567,
"grad_norm": 0.34613531827926636,
"learning_rate": 1.9907773329576375e-05,
"epoch": 1.6332665330661322,
"step": 1630
},
{
"loss": 1.082,
"grad_norm": 0.35560983419418335,
"learning_rate": 1.887270417335241e-05,
"epoch": 1.6432865731462925,
"step": 1640
},
{
"loss": 1.0606,
"grad_norm": 0.3562234342098236,
"learning_rate": 1.7862457373542095e-05,
"epoch": 1.653306613226453,
"step": 1650
},
{
"loss": 1.0716,
"grad_norm": 0.3679576516151428,
"learning_rate": 1.6877342033337872e-05,
"epoch": 1.6633266533066133,
"step": 1660
},
{
"loss": 1.0546,
"grad_norm": 0.36080437898635864,
"learning_rate": 1.5917659566509746e-05,
"epoch": 1.6733466933867736,
"step": 1670
},
{
"loss": 1.0934,
"grad_norm": 0.36290815472602844,
"learning_rate": 1.4983703605182242e-05,
"epoch": 1.6833667334669338,
"step": 1680
},
{
"loss": 1.0773,
"grad_norm": 0.365818053483963,
"learning_rate": 1.4075759909992548e-05,
"epoch": 1.6933867735470942,
"step": 1690
},
{
"loss": 1.0882,
"grad_norm": 0.35331591963768005,
"learning_rate": 1.3194106282656827e-05,
"epoch": 1.7034068136272547,
"step": 1700
},
{
"loss": 1.065,
"grad_norm": 0.3647233247756958,
"learning_rate": 1.2339012480971712e-05,
"epoch": 1.7134268537074149,
"step": 1710
},
{
"loss": 1.0634,
"grad_norm": 0.36989837884902954,
"learning_rate": 1.1510740136277109e-05,
"epoch": 1.723446893787575,
"step": 1720
},
{
"eval_loss": 1.1204578876495361,
"eval_runtime": 68.1731,
"eval_samples_per_second": 36.979,
"eval_steps_per_second": 9.256,
"epoch": 1.7324649298597194,
"step": 1729
},
{
"loss": 1.066,
"grad_norm": 0.36859068274497986,
"learning_rate": 1.070954267340547e-05,
"epoch": 1.7334669338677355,
"step": 1730
},
{
"loss": 1.0864,
"grad_norm": 0.3574073314666748,
"learning_rate": 9.9356652331417e-06,
"epoch": 1.7434869739478958,
"step": 1740
},
{
"loss": 1.0706,
"grad_norm": 0.3612259030342102,
"learning_rate": 9.189344597218153e-06,
"epoch": 1.753507014028056,
"step": 1750
},
{
"loss": 1.0713,
"grad_norm": 0.3700193464756012,
"learning_rate": 8.470809115866818e-06,
"epoch": 1.7635270541082164,
"step": 1760
},
{
"loss": 1.0803,
"grad_norm": 0.3535062074661255,
"learning_rate": 7.780278637951521e-06,
"epoch": 1.7735470941883769,
"step": 1770
},
{
"loss": 1.0647,
"grad_norm": 0.35681986808776855,
"learning_rate": 7.117964443701242e-06,
"epoch": 1.783567134268537,
"step": 1780
},
{
"loss": 1.0669,
"grad_norm": 0.3624560832977295,
"learning_rate": 6.484069180065055e-06,
"epoch": 1.7935871743486973,
"step": 1790
},
{
"loss": 1.0719,
"grad_norm": 0.35827717185020447,
"learning_rate": 5.8787867987087355e-06,
"epoch": 1.8036072144288577,
"step": 1800
},
{
"loss": 1.0855,
"grad_norm": 0.3589949905872345,
"learning_rate": 5.302302496671641e-06,
"epoch": 1.8136272545090182,
"step": 1810
},
{
"loss": 1.0722,
"grad_norm": 0.358090341091156,
"learning_rate": 4.754792659702468e-06,
"epoch": 1.8236472945891784,
"step": 1820
},
{
"loss": 1.0719,
"grad_norm": 0.3602781593799591,
"learning_rate": 4.236424808290751e-06,
"epoch": 1.8336673346693386,
"step": 1830
},
{
"loss": 1.0815,
"grad_norm": 0.3476168215274811,
"learning_rate": 3.7473575464110455e-06,
"epoch": 1.843687374749499,
"step": 1840
},
{
"loss": 1.0774,
"grad_norm": 0.3502495288848877,
"learning_rate": 3.2877405129950967e-06,
"epoch": 1.8537074148296593,
"step": 1850
},
{
"loss": 1.0736,
"grad_norm": 0.3665917217731476,
"learning_rate": 2.857714336147188e-06,
"epoch": 1.8637274549098195,
"step": 1860
},
{
"eval_loss": 1.1194243431091309,
"eval_runtime": 68.2716,
"eval_samples_per_second": 36.926,
"eval_steps_per_second": 9.242,
"epoch": 1.8657314629258517,
"step": 1862
},
{
"loss": 1.0802,
"grad_norm": 0.36012518405914307,
"learning_rate": 2.457410590116427e-06,
"epoch": 1.87374749498998,
"step": 1870
},
{
"loss": 1.0787,
"grad_norm": 0.3560042977333069,
"learning_rate": 2.086951755039168e-06,
"epoch": 1.8837675350701404,
"step": 1880
},
{
"loss": 1.0742,
"grad_norm": 0.36220309138298035,
"learning_rate": 1.746451179464137e-06,
"epoch": 1.8937875751503006,
"step": 1890
},
{
"loss": 1.0727,
"grad_norm": 0.3498152196407318,
"learning_rate": 1.4360130456712695e-06,
"epoch": 1.9038076152304608,
"step": 1900
},
{
"loss": 1.0738,
"grad_norm": 0.3657923936843872,
"learning_rate": 1.1557323377953456e-06,
"epoch": 1.9138276553106213,
"step": 1910
},
{
"loss": 1.0568,
"grad_norm": 0.36668485403060913,
"learning_rate": 9.056948127638687e-07,
"epoch": 1.9238476953907817,
"step": 1920
},
{
"loss": 1.0568,
"grad_norm": 0.3519572615623474,
"learning_rate": 6.859769740582e-07,
"epoch": 1.933867735470942,
"step": 1930
},
{
"loss": 1.0621,
"grad_norm": 0.3648887574672699,
"learning_rate": 4.966460483059044e-07,
"epoch": 1.9438877755511021,
"step": 1940
},
{
"loss": 1.0688,
"grad_norm": 0.3495730757713318,
"learning_rate": 3.3775996471160366e-07,
"epoch": 1.9539078156312626,
"step": 1950
},
{
"loss": 1.0652,
"grad_norm": 0.3570130169391632,
"learning_rate": 2.093673373324334e-07,
"epoch": 1.9639278557114228,
"step": 1960
},
{
"loss": 1.0822,
"grad_norm": 0.3690868020057678,
"learning_rate": 1.1150745020376275e-07,
"epoch": 1.973947895791583,
"step": 1970
},
{
"loss": 1.0665,
"grad_norm": 0.3751429319381714,
"learning_rate": 4.421024531948703e-08,
"epoch": 1.9839679358717435,
"step": 1980
},
{
"loss": 1.065,
"grad_norm": 0.3617876172065735,
"learning_rate": 7.496313470778393e-09,
"epoch": 1.993987975951904,
"step": 1990
},
{
"eval_loss": 1.1191450357437134,
"eval_runtime": 68.3979,
"eval_samples_per_second": 36.858,
"eval_steps_per_second": 9.225,
"epoch": 1.9989979959919841,
"step": 1995
},
{
"train_runtime": 6834.1445,
"train_samples_per_second": 14.016,
"train_steps_per_second": 0.292,
"total_flos": 7.260644379510651e+17,
"train_loss": 1.1743444665400442,
"epoch": 2.0,
"step": 1996
}
]