kth8's picture
Upload folder using huggingface_hub
7f43fd7 verified
Raw
History Blame Contribute Delete
29.8 kB
[
{
"eval_loss": 4.586892604827881,
"eval_runtime": 89.6197,
"eval_samples_per_second": 5.858,
"eval_steps_per_second": 1.473,
"epoch": 0,
"step": 0
},
{
"loss": 4.1431,
"grad_norm": 32.154510498046875,
"learning_rate": 1.8e-05,
"epoch": 0.006222775357809583,
"step": 10
},
{
"loss": 2.171,
"grad_norm": 8.32018756866455,
"learning_rate": 3.8e-05,
"epoch": 0.012445550715619166,
"step": 20
},
{
"loss": 1.5115,
"grad_norm": 7.762565612792969,
"learning_rate": 5.8e-05,
"epoch": 0.018668326073428748,
"step": 30
},
{
"loss": 1.6593,
"grad_norm": 10.723950386047363,
"learning_rate": 7.800000000000001e-05,
"epoch": 0.024891101431238332,
"step": 40
},
{
"loss": 1.4035,
"grad_norm": 11.399044036865234,
"learning_rate": 9.8e-05,
"epoch": 0.031113876789047916,
"step": 50
},
{
"loss": 1.3391,
"grad_norm": 7.819660663604736,
"learning_rate": 0.000118,
"epoch": 0.037336652146857496,
"step": 60
},
{
"loss": 1.3462,
"grad_norm": 8.268921852111816,
"learning_rate": 0.000138,
"epoch": 0.043559427504667084,
"step": 70
},
{
"loss": 1.3119,
"grad_norm": 5.696014881134033,
"learning_rate": 0.00015800000000000002,
"epoch": 0.049782202862476664,
"step": 80
},
{
"eval_loss": 1.303046703338623,
"eval_runtime": 18.9705,
"eval_samples_per_second": 27.675,
"eval_steps_per_second": 6.958,
"epoch": 0.049782202862476664,
"step": 80
},
{
"loss": 1.4399,
"grad_norm": 4.720869541168213,
"learning_rate": 0.00017800000000000002,
"epoch": 0.056004978220286245,
"step": 90
},
{
"loss": 1.515,
"grad_norm": 6.697354793548584,
"learning_rate": 0.00019800000000000002,
"epoch": 0.06222775357809583,
"step": 100
},
{
"loss": 1.2566,
"grad_norm": 4.759271621704102,
"learning_rate": 0.00019998239988424423,
"epoch": 0.06845052893590542,
"step": 110
},
{
"loss": 1.2804,
"grad_norm": 4.9139723777771,
"learning_rate": 0.0001999215679316913,
"epoch": 0.07467330429371499,
"step": 120
},
{
"loss": 1.4076,
"grad_norm": 7.784193992614746,
"learning_rate": 0.00019981731328627206,
"epoch": 0.08089607965152458,
"step": 130
},
{
"loss": 1.2333,
"grad_norm": 4.605452060699463,
"learning_rate": 0.00019966968125369522,
"epoch": 0.08711885500933417,
"step": 140
},
{
"loss": 1.223,
"grad_norm": 6.003992557525635,
"learning_rate": 0.00019947873599008388,
"epoch": 0.09334163036714374,
"step": 150
},
{
"loss": 1.2006,
"grad_norm": 3.7916181087493896,
"learning_rate": 0.00019924456047409517,
"epoch": 0.09956440572495333,
"step": 160
},
{
"eval_loss": 1.2743773460388184,
"eval_runtime": 19.0399,
"eval_samples_per_second": 27.574,
"eval_steps_per_second": 6.933,
"epoch": 0.09956440572495333,
"step": 160
},
{
"loss": 1.2961,
"grad_norm": 4.582925796508789,
"learning_rate": 0.00019896725647086072,
"epoch": 0.10578718108276292,
"step": 170
},
{
"loss": 1.3238,
"grad_norm": 4.96942138671875,
"learning_rate": 0.0001986469444877626,
"epoch": 0.11200995644057249,
"step": 180
},
{
"loss": 1.2806,
"grad_norm": 5.652052879333496,
"learning_rate": 0.0001982837637220647,
"epoch": 0.11823273179838208,
"step": 190
},
{
"loss": 1.2519,
"grad_norm": 3.8876349925994873,
"learning_rate": 0.00019787787200042223,
"epoch": 0.12445550715619166,
"step": 200
},
{
"loss": 1.3565,
"grad_norm": 5.234771251678467,
"learning_rate": 0.00019742944571029517,
"epoch": 0.13067828251400124,
"step": 210
},
{
"loss": 1.3429,
"grad_norm": 4.472705364227295,
"learning_rate": 0.00019693867972329598,
"epoch": 0.13690105787181084,
"step": 220
},
{
"loss": 1.1896,
"grad_norm": 5.7520952224731445,
"learning_rate": 0.00019640578731050488,
"epoch": 0.1431238332296204,
"step": 230
},
{
"loss": 1.2823,
"grad_norm": 5.895481586456299,
"learning_rate": 0.00019583100004978886,
"epoch": 0.14934660858742999,
"step": 240
},
{
"eval_loss": 1.2229551076889038,
"eval_runtime": 19.0105,
"eval_samples_per_second": 27.616,
"eval_steps_per_second": 6.944,
"epoch": 0.14934660858742999,
"step": 240
},
{
"loss": 1.2715,
"grad_norm": 4.751076698303223,
"learning_rate": 0.00019521456772516552,
"epoch": 0.1555693839452396,
"step": 250
},
{
"loss": 1.2688,
"grad_norm": 4.310879230499268,
"learning_rate": 0.0001945567582182551,
"epoch": 0.16179215930304916,
"step": 260
},
{
"loss": 1.2177,
"grad_norm": 4.900815010070801,
"learning_rate": 0.00019385785739186746,
"epoch": 0.16801493466085873,
"step": 270
},
{
"loss": 1.1152,
"grad_norm": 9.008260726928711,
"learning_rate": 0.0001931181689657756,
"epoch": 0.17423771001866833,
"step": 280
},
{
"loss": 1.2861,
"grad_norm": 4.527481555938721,
"learning_rate": 0.00019233801438472875,
"epoch": 0.1804604853764779,
"step": 290
},
{
"loss": 1.1138,
"grad_norm": 3.7949774265289307,
"learning_rate": 0.00019151773267876273,
"epoch": 0.18668326073428748,
"step": 300
},
{
"loss": 1.1897,
"grad_norm": 4.003747463226318,
"learning_rate": 0.0001906576803158686,
"epoch": 0.19290603609209708,
"step": 310
},
{
"loss": 1.2305,
"grad_norm": 3.926623582839966,
"learning_rate": 0.00018975823104708313,
"epoch": 0.19912881144990666,
"step": 320
},
{
"eval_loss": 1.1955150365829468,
"eval_runtime": 19.0399,
"eval_samples_per_second": 27.574,
"eval_steps_per_second": 6.933,
"epoch": 0.19912881144990666,
"step": 320
},
{
"loss": 1.221,
"grad_norm": 4.546780109405518,
"learning_rate": 0.00018881977574406838,
"epoch": 0.20535158680771623,
"step": 330
},
{
"loss": 1.1211,
"grad_norm": 4.058185577392578,
"learning_rate": 0.00018784272222925198,
"epoch": 0.21157436216552583,
"step": 340
},
{
"loss": 1.2339,
"grad_norm": 6.297555923461914,
"learning_rate": 0.00018682749509860012,
"epoch": 0.2177971375233354,
"step": 350
},
{
"loss": 1.1462,
"grad_norm": 4.3970160484313965,
"learning_rate": 0.00018577453553710215,
"epoch": 0.22401991288114498,
"step": 360
},
{
"loss": 1.293,
"grad_norm": 3.81414532661438,
"learning_rate": 0.00018468430112704573,
"epoch": 0.23024268823895458,
"step": 370
},
{
"loss": 1.1787,
"grad_norm": 3.973546266555786,
"learning_rate": 0.00018355726564916628,
"epoch": 0.23646546359676415,
"step": 380
},
{
"loss": 1.3247,
"grad_norm": 4.137118339538574,
"learning_rate": 0.00018239391887675722,
"epoch": 0.24268823895457373,
"step": 390
},
{
"loss": 1.2655,
"grad_norm": 4.191527843475342,
"learning_rate": 0.00018119476636283018,
"epoch": 0.24891101431238333,
"step": 400
},
{
"eval_loss": 1.1952359676361084,
"eval_runtime": 19.1429,
"eval_samples_per_second": 27.425,
"eval_steps_per_second": 6.896,
"epoch": 0.24891101431238333,
"step": 400
},
{
"loss": 1.1443,
"grad_norm": 4.531196117401123,
"learning_rate": 0.00017996032922041797,
"epoch": 0.2551337896701929,
"step": 410
},
{
"loss": 1.0542,
"grad_norm": 3.887230634689331,
"learning_rate": 0.00017869114389611575,
"epoch": 0.2613565650280025,
"step": 420
},
{
"loss": 1.1879,
"grad_norm": 4.009781837463379,
"learning_rate": 0.00017738776193695853,
"epoch": 0.26757934038581205,
"step": 430
},
{
"loss": 1.2122,
"grad_norm": 4.212757587432861,
"learning_rate": 0.00017605074975073664,
"epoch": 0.2738021157436217,
"step": 440
},
{
"loss": 1.1998,
"grad_norm": 5.043585300445557,
"learning_rate": 0.00017468068835985325,
"epoch": 0.28002489110143125,
"step": 450
},
{
"loss": 1.1861,
"grad_norm": 3.5380918979644775,
"learning_rate": 0.00017327817314883055,
"epoch": 0.2862476664592408,
"step": 460
},
{
"loss": 1.1754,
"grad_norm": 4.334512233734131,
"learning_rate": 0.00017184381360557498,
"epoch": 0.2924704418170504,
"step": 470
},
{
"loss": 1.0953,
"grad_norm": 3.720323085784912,
"learning_rate": 0.00017037823305651343,
"epoch": 0.29869321717485997,
"step": 480
},
{
"eval_loss": 1.1778477430343628,
"eval_runtime": 18.9601,
"eval_samples_per_second": 27.69,
"eval_steps_per_second": 6.962,
"epoch": 0.29869321717485997,
"step": 480
},
{
"loss": 1.0638,
"grad_norm": 3.9478025436401367,
"learning_rate": 0.0001688820683957156,
"epoch": 0.30491599253266954,
"step": 490
},
{
"loss": 1.1504,
"grad_norm": 4.221723556518555,
"learning_rate": 0.00016735596980812047,
"epoch": 0.3111387678904792,
"step": 500
},
{
"loss": 1.2274,
"grad_norm": 3.9657983779907227,
"learning_rate": 0.0001658006004869867,
"epoch": 0.31736154324828875,
"step": 510
},
{
"loss": 1.1862,
"grad_norm": 4.158371925354004,
"learning_rate": 0.00016421663634569046,
"epoch": 0.3235843186060983,
"step": 520
},
{
"loss": 1.2757,
"grad_norm": 6.26188325881958,
"learning_rate": 0.00016260476572399496,
"epoch": 0.3298070939639079,
"step": 530
},
{
"loss": 1.0873,
"grad_norm": 4.029411315917969,
"learning_rate": 0.00016096568908892047,
"epoch": 0.33602986932171747,
"step": 540
},
{
"loss": 1.1004,
"grad_norm": 3.944244861602783,
"learning_rate": 0.00015930011873034375,
"epoch": 0.3422526446795271,
"step": 550
},
{
"loss": 1.1847,
"grad_norm": 5.218636512756348,
"learning_rate": 0.00015760877845145995,
"epoch": 0.34847542003733667,
"step": 560
},
{
"eval_loss": 1.1579095125198364,
"eval_runtime": 18.9511,
"eval_samples_per_second": 27.703,
"eval_steps_per_second": 6.965,
"epoch": 0.34847542003733667,
"step": 560
},
{
"loss": 1.2818,
"grad_norm": 4.242026329040527,
"learning_rate": 0.00015589240325424088,
"epoch": 0.35469819539514624,
"step": 570
},
{
"loss": 1.1621,
"grad_norm": 4.167393207550049,
"learning_rate": 0.00015415173902002703,
"epoch": 0.3609209707529558,
"step": 580
},
{
"loss": 1.0553,
"grad_norm": 3.5119478702545166,
"learning_rate": 0.00015238754218539156,
"epoch": 0.3671437461107654,
"step": 590
},
{
"loss": 1.2139,
"grad_norm": 3.9878628253936768,
"learning_rate": 0.00015060057941341718,
"epoch": 0.37336652146857496,
"step": 600
},
{
"loss": 1.1846,
"grad_norm": 4.306042671203613,
"learning_rate": 0.00014879162726052928,
"epoch": 0.3795892968263846,
"step": 610
},
{
"loss": 1.1916,
"grad_norm": 5.4814276695251465,
"learning_rate": 0.0001469614718390295,
"epoch": 0.38581207218419417,
"step": 620
},
{
"loss": 1.1762,
"grad_norm": 4.104554653167725,
"learning_rate": 0.00014511090847547643,
"epoch": 0.39203484754200374,
"step": 630
},
{
"loss": 1.0668,
"grad_norm": 4.206943511962891,
"learning_rate": 0.00014324074136506284,
"epoch": 0.3982576228998133,
"step": 640
},
{
"eval_loss": 1.122727632522583,
"eval_runtime": 19.0559,
"eval_samples_per_second": 27.551,
"eval_steps_per_second": 6.927,
"epoch": 0.3982576228998133,
"step": 640
},
{
"loss": 1.0995,
"grad_norm": 4.80627965927124,
"learning_rate": 0.00014135178322213765,
"epoch": 0.4044803982576229,
"step": 650
},
{
"loss": 1.2397,
"grad_norm": 5.690910339355469,
"learning_rate": 0.00013944485492702716,
"epoch": 0.41070317361543246,
"step": 660
},
{
"loss": 1.1212,
"grad_norm": 4.302467346191406,
"learning_rate": 0.00013752078516930652,
"epoch": 0.4169259489732421,
"step": 670
},
{
"loss": 1.2482,
"grad_norm": 4.4525604248046875,
"learning_rate": 0.00013558041008767798,
"epoch": 0.42314872433105166,
"step": 680
},
{
"loss": 1.1418,
"grad_norm": 3.8571488857269287,
"learning_rate": 0.00013362457290661215,
"epoch": 0.42937149968886124,
"step": 690
},
{
"loss": 1.1032,
"grad_norm": 3.5000720024108887,
"learning_rate": 0.00013165412356990955,
"epoch": 0.4355942750466708,
"step": 700
},
{
"loss": 1.0484,
"grad_norm": 4.838465690612793,
"learning_rate": 0.0001296699183713427,
"epoch": 0.4418170504044804,
"step": 710
},
{
"loss": 1.097,
"grad_norm": 3.3884267807006836,
"learning_rate": 0.0001276728195825383,
"epoch": 0.44803982576228996,
"step": 720
},
{
"eval_loss": 1.117976427078247,
"eval_runtime": 19.0566,
"eval_samples_per_second": 27.549,
"eval_steps_per_second": 6.927,
"epoch": 0.44803982576228996,
"step": 720
},
{
"loss": 1.103,
"grad_norm": 3.7878074645996094,
"learning_rate": 0.00012566369507826175,
"epoch": 0.4542626011200996,
"step": 730
},
{
"loss": 1.1819,
"grad_norm": 4.605463027954102,
"learning_rate": 0.00012364341795926683,
"epoch": 0.46048537647790916,
"step": 740
},
{
"loss": 1.0844,
"grad_norm": 4.262099742889404,
"learning_rate": 0.00012161286617287419,
"epoch": 0.46670815183571873,
"step": 750
},
{
"loss": 1.2108,
"grad_norm": 5.657451152801514,
"learning_rate": 0.00011957292213144385,
"epoch": 0.4729309271935283,
"step": 760
},
{
"loss": 1.1722,
"grad_norm": 3.5178189277648926,
"learning_rate": 0.00011752447232890702,
"epoch": 0.4791537025513379,
"step": 770
},
{
"loss": 1.0685,
"grad_norm": 3.929398775100708,
"learning_rate": 0.00011546840695552466,
"epoch": 0.48537647790914745,
"step": 780
},
{
"loss": 1.1232,
"grad_norm": 5.15775728225708,
"learning_rate": 0.0001134056195110393,
"epoch": 0.4915992532669571,
"step": 790
},
{
"loss": 1.175,
"grad_norm": 4.8148980140686035,
"learning_rate": 0.00011133700641638891,
"epoch": 0.49782202862476665,
"step": 800
},
{
"eval_loss": 1.068037986755371,
"eval_runtime": 19.1937,
"eval_samples_per_second": 27.353,
"eval_steps_per_second": 6.877,
"epoch": 0.49782202862476665,
"step": 800
},
{
"loss": 1.0508,
"grad_norm": 3.6205997467041016,
"learning_rate": 0.0001092634666241513,
"epoch": 0.5040448039825762,
"step": 810
},
{
"loss": 1.0613,
"grad_norm": 5.062050819396973,
"learning_rate": 0.00010718590122788821,
"epoch": 0.5102675793403858,
"step": 820
},
{
"loss": 1.1868,
"grad_norm": 4.122339725494385,
"learning_rate": 0.00010510521307055914,
"epoch": 0.5164903546981954,
"step": 830
},
{
"loss": 1.0381,
"grad_norm": 3.8383567333221436,
"learning_rate": 0.000103022306352175,
"epoch": 0.522713130056005,
"step": 840
},
{
"loss": 1.1625,
"grad_norm": 3.7978572845458984,
"learning_rate": 0.00010093808623686165,
"epoch": 0.5289359054138145,
"step": 850
},
{
"loss": 1.1144,
"grad_norm": 2.841909408569336,
"learning_rate": 9.88534584595051e-05,
"epoch": 0.5351586807716241,
"step": 860
},
{
"loss": 1.001,
"grad_norm": 4.133490085601807,
"learning_rate": 9.676932893214805e-05,
"epoch": 0.5413814561294338,
"step": 870
},
{
"loss": 1.1545,
"grad_norm": 5.40682315826416,
"learning_rate": 9.46866033503098e-05,
"epoch": 0.5476042314872434,
"step": 880
},
{
"eval_loss": 1.0770765542984009,
"eval_runtime": 19.0434,
"eval_samples_per_second": 27.569,
"eval_steps_per_second": 6.932,
"epoch": 0.5476042314872434,
"step": 880
},
{
"loss": 1.0768,
"grad_norm": 7.381899833679199,
"learning_rate": 9.260618679940025e-05,
"epoch": 0.5538270068450529,
"step": 890
},
{
"loss": 1.1314,
"grad_norm": 5.5879621505737305,
"learning_rate": 9.05289833613988e-05,
"epoch": 0.5600497822028625,
"step": 900
},
{
"loss": 1.1605,
"grad_norm": 6.610466480255127,
"learning_rate": 8.845589572196961e-05,
"epoch": 0.5662725575606721,
"step": 910
},
{
"loss": 1.1251,
"grad_norm": 3.7799007892608643,
"learning_rate": 8.638782477818334e-05,
"epoch": 0.5724953329184816,
"step": 920
},
{
"loss": 1.0761,
"grad_norm": 3.294947385787964,
"learning_rate": 8.432566924701659e-05,
"epoch": 0.5787181082762912,
"step": 930
},
{
"loss": 1.099,
"grad_norm": 3.8454885482788086,
"learning_rate": 8.227032527479806e-05,
"epoch": 0.5849408836341008,
"step": 940
},
{
"loss": 1.0603,
"grad_norm": 3.116297483444214,
"learning_rate": 8.022268604777271e-05,
"epoch": 0.5911636589919104,
"step": 950
},
{
"loss": 0.9529,
"grad_norm": 3.8308541774749756,
"learning_rate": 7.818364140395137e-05,
"epoch": 0.5973864343497199,
"step": 960
},
{
"eval_loss": 1.0448600053787231,
"eval_runtime": 18.8462,
"eval_samples_per_second": 27.857,
"eval_steps_per_second": 7.004,
"epoch": 0.5973864343497199,
"step": 960
},
{
"loss": 1.0266,
"grad_norm": 3.5868051052093506,
"learning_rate": 7.615407744641619e-05,
"epoch": 0.6036092097075295,
"step": 970
},
{
"loss": 1.0455,
"grad_norm": 4.006646633148193,
"learning_rate": 7.413487615824847e-05,
"epoch": 0.6098319850653391,
"step": 980
},
{
"loss": 0.9692,
"grad_norm": 3.410396099090576,
"learning_rate": 7.212691501924753e-05,
"epoch": 0.6160547604231488,
"step": 990
},
{
"loss": 1.1257,
"grad_norm": 4.496327877044678,
"learning_rate": 7.013106662460604e-05,
"epoch": 0.6222775357809583,
"step": 1000
},
{
"loss": 1.0382,
"grad_norm": 3.0930795669555664,
"learning_rate": 6.81481983057085e-05,
"epoch": 0.6285003111387679,
"step": 1010
},
{
"loss": 1.1206,
"grad_norm": 3.9784181118011475,
"learning_rate": 6.617917175321669e-05,
"epoch": 0.6347230864965775,
"step": 1020
},
{
"loss": 1.0582,
"grad_norm": 3.438899278640747,
"learning_rate": 6.422484264260698e-05,
"epoch": 0.6409458618543871,
"step": 1030
},
{
"loss": 1.0783,
"grad_norm": 4.454411029815674,
"learning_rate": 6.228606026232118e-05,
"epoch": 0.6471686372121966,
"step": 1040
},
{
"eval_loss": 1.0363072156906128,
"eval_runtime": 19.4555,
"eval_samples_per_second": 26.985,
"eval_steps_per_second": 6.785,
"epoch": 0.6471686372121966,
"step": 1040
},
{
"loss": 1.1599,
"grad_norm": 4.955887794494629,
"learning_rate": 6.0363667144693105e-05,
"epoch": 0.6533914125700062,
"step": 1050
},
{
"loss": 1.0642,
"grad_norm": 4.835966110229492,
"learning_rate": 5.845849869981137e-05,
"epoch": 0.6596141879278158,
"step": 1060
},
{
"loss": 1.0944,
"grad_norm": 4.23394250869751,
"learning_rate": 5.657138285247687e-05,
"epoch": 0.6658369632856254,
"step": 1070
},
{
"loss": 1.1125,
"grad_norm": 4.8603973388671875,
"learning_rate": 5.4703139682413586e-05,
"epoch": 0.6720597386434349,
"step": 1080
},
{
"loss": 1.1036,
"grad_norm": 3.6753642559051514,
"learning_rate": 5.285458106788807e-05,
"epoch": 0.6782825140012445,
"step": 1090
},
{
"loss": 1.0248,
"grad_norm": 3.8071329593658447,
"learning_rate": 5.10265103328937e-05,
"epoch": 0.6845052893590542,
"step": 1100
},
{
"loss": 0.9752,
"grad_norm": 3.280271530151367,
"learning_rate": 4.921972189805154e-05,
"epoch": 0.6907280647168638,
"step": 1110
},
{
"loss": 1.0233,
"grad_norm": 3.587294101715088,
"learning_rate": 4.7435000935381115e-05,
"epoch": 0.6969508400746733,
"step": 1120
},
{
"eval_loss": 1.0311274528503418,
"eval_runtime": 19.0411,
"eval_samples_per_second": 27.572,
"eval_steps_per_second": 6.932,
"epoch": 0.6969508400746733,
"step": 1120
},
{
"loss": 0.9725,
"grad_norm": 3.9192657470703125,
"learning_rate": 4.567312302708965e-05,
"epoch": 0.7031736154324829,
"step": 1130
},
{
"loss": 1.0536,
"grad_norm": 3.6675221920013428,
"learning_rate": 4.393485382852935e-05,
"epoch": 0.7093963907902925,
"step": 1140
},
{
"loss": 0.8942,
"grad_norm": 3.954765796661377,
"learning_rate": 4.2220948735467967e-05,
"epoch": 0.7156191661481021,
"step": 1150
},
{
"loss": 1.0628,
"grad_norm": 3.2176241874694824,
"learning_rate": 4.053215255581844e-05,
"epoch": 0.7218419415059116,
"step": 1160
},
{
"loss": 0.9189,
"grad_norm": 4.0021443367004395,
"learning_rate": 3.886919918596894e-05,
"epoch": 0.7280647168637212,
"step": 1170
},
{
"loss": 1.017,
"grad_norm": 5.141781330108643,
"learning_rate": 3.723281129185574e-05,
"epoch": 0.7342874922215308,
"step": 1180
},
{
"loss": 1.0003,
"grad_norm": 3.7216849327087402,
"learning_rate": 3.562369999491536e-05,
"epoch": 0.7405102675793404,
"step": 1190
},
{
"loss": 1.0913,
"grad_norm": 4.026198863983154,
"learning_rate": 3.4042564563054526e-05,
"epoch": 0.7467330429371499,
"step": 1200
},
{
"eval_loss": 1.0212879180908203,
"eval_runtime": 19.4048,
"eval_samples_per_second": 27.055,
"eval_steps_per_second": 6.802,
"epoch": 0.7467330429371499,
"step": 1200
},
{
"loss": 1.0205,
"grad_norm": 3.0943052768707275,
"learning_rate": 3.249009210677054e-05,
"epoch": 0.7529558182949595,
"step": 1210
},
{
"loss": 1.1132,
"grad_norm": 3.125500202178955,
"learning_rate": 3.096695728055536e-05,
"epoch": 0.7591785936527692,
"step": 1220
},
{
"loss": 1.0034,
"grad_norm": 3.151357889175415,
"learning_rate": 2.9473821989712625e-05,
"epoch": 0.7654013690105788,
"step": 1230
},
{
"loss": 1.0259,
"grad_norm": 4.394558906555176,
"learning_rate": 2.801133510271463e-05,
"epoch": 0.7716241443683883,
"step": 1240
},
{
"loss": 1.0123,
"grad_norm": 3.69354510307312,
"learning_rate": 2.6580132169225335e-05,
"epoch": 0.7778469197261979,
"step": 1250
},
{
"loss": 0.9852,
"grad_norm": 3.7997825145721436,
"learning_rate": 2.5180835143910732e-05,
"epoch": 0.7840696950840075,
"step": 1260
},
{
"loss": 1.0357,
"grad_norm": 3.2746288776397705,
"learning_rate": 2.3814052116157492e-05,
"epoch": 0.790292470441817,
"step": 1270
},
{
"loss": 0.9234,
"grad_norm": 2.5498299598693848,
"learning_rate": 2.248037704581686e-05,
"epoch": 0.7965152457996266,
"step": 1280
},
{
"eval_loss": 1.0135233402252197,
"eval_runtime": 19.2289,
"eval_samples_per_second": 27.303,
"eval_steps_per_second": 6.865,
"epoch": 0.7965152457996266,
"step": 1280
},
{
"loss": 0.9819,
"grad_norm": 4.244918346405029,
"learning_rate": 2.1180389505089004e-05,
"epoch": 0.8027380211574362,
"step": 1290
},
{
"loss": 0.9955,
"grad_norm": 3.978168249130249,
"learning_rate": 1.9914654426659374e-05,
"epoch": 0.8089607965152458,
"step": 1300
},
{
"loss": 1.0539,
"grad_norm": 4.090443134307861,
"learning_rate": 1.8683721858197366e-05,
"epoch": 0.8151835718730553,
"step": 1310
},
{
"loss": 1.0301,
"grad_norm": 4.159254550933838,
"learning_rate": 1.7488126723323183e-05,
"epoch": 0.8214063472308649,
"step": 1320
},
{
"loss": 0.9817,
"grad_norm": 3.623256206512451,
"learning_rate": 1.632838858914747e-05,
"epoch": 0.8276291225886746,
"step": 1330
},
{
"loss": 1.0417,
"grad_norm": 3.4594295024871826,
"learning_rate": 1.5205011440483929e-05,
"epoch": 0.8338518979464842,
"step": 1340
},
{
"loss": 0.9497,
"grad_norm": 3.782587766647339,
"learning_rate": 1.4118483460834064e-05,
"epoch": 0.8400746733042938,
"step": 1350
},
{
"loss": 0.9819,
"grad_norm": 4.124022960662842,
"learning_rate": 1.3069276820237997e-05,
"epoch": 0.8462974486621033,
"step": 1360
},
{
"eval_loss": 1.0055181980133057,
"eval_runtime": 19.0381,
"eval_samples_per_second": 27.576,
"eval_steps_per_second": 6.933,
"epoch": 0.8462974486621033,
"step": 1360
},
{
"loss": 0.9526,
"grad_norm": 3.329490900039673,
"learning_rate": 1.2057847470084993e-05,
"epoch": 0.8525202240199129,
"step": 1370
},
{
"loss": 1.0375,
"grad_norm": 3.4906861782073975,
"learning_rate": 1.108463494497135e-05,
"epoch": 0.8587429993777225,
"step": 1380
},
{
"loss": 1.143,
"grad_norm": 3.604809045791626,
"learning_rate": 1.0150062171693076e-05,
"epoch": 0.864965774735532,
"step": 1390
},
{
"loss": 1.0352,
"grad_norm": 3.657939910888672,
"learning_rate": 9.254535285455334e-06,
"epoch": 0.8711885500933416,
"step": 1400
},
{
"loss": 0.9611,
"grad_norm": 4.000196933746338,
"learning_rate": 8.398443453379267e-06,
"epoch": 0.8774113254511512,
"step": 1410
},
{
"loss": 1.02,
"grad_norm": 2.971421241760254,
"learning_rate": 7.582158705382581e-06,
"epoch": 0.8836341008089608,
"step": 1420
},
{
"loss": 1.095,
"grad_norm": 3.4433512687683105,
"learning_rate": 6.806035772507169e-06,
"epoch": 0.8898568761667703,
"step": 1430
},
{
"loss": 0.9563,
"grad_norm": 4.7456746101379395,
"learning_rate": 6.070411932764586e-06,
"epoch": 0.8960796515245799,
"step": 1440
},
{
"eval_loss": 1.0012239217758179,
"eval_runtime": 19.2942,
"eval_samples_per_second": 27.21,
"eval_steps_per_second": 6.841,
"epoch": 0.8960796515245799,
"step": 1440
},
{
"loss": 1.0647,
"grad_norm": 3.7112457752227783,
"learning_rate": 5.375606864565785e-06,
"epoch": 0.9023024268823896,
"step": 1450
},
{
"loss": 1.0923,
"grad_norm": 3.5192649364471436,
"learning_rate": 4.721922507799248e-06,
"epoch": 0.9085252022401992,
"step": 1460
},
{
"loss": 1.0141,
"grad_norm": 3.898160696029663,
"learning_rate": 4.10964293261763e-06,
"epoch": 0.9147479775980087,
"step": 1470
},
{
"loss": 0.9584,
"grad_norm": 3.5476901531219482,
"learning_rate": 3.5390342159900223e-06,
"epoch": 0.9209707529558183,
"step": 1480
},
{
"loss": 0.937,
"grad_norm": 3.9498867988586426,
"learning_rate": 3.0103443260734554e-06,
"epoch": 0.9271935283136279,
"step": 1490
},
{
"loss": 0.9892,
"grad_norm": 4.293119430541992,
"learning_rate": 2.5238030144539737e-06,
"epoch": 0.9334163036714375,
"step": 1500
},
{
"loss": 0.9698,
"grad_norm": 3.3624675273895264,
"learning_rate": 2.079621716303959e-06,
"epoch": 0.939639079029247,
"step": 1510
},
{
"loss": 0.968,
"grad_norm": 3.809082269668579,
"learning_rate": 1.6779934584992718e-06,
"epoch": 0.9458618543870566,
"step": 1520
},
{
"eval_loss": 1.0003483295440674,
"eval_runtime": 19.1326,
"eval_samples_per_second": 27.44,
"eval_steps_per_second": 6.899,
"epoch": 0.9458618543870566,
"step": 1520
},
{
"loss": 0.9195,
"grad_norm": 3.8601574897766113,
"learning_rate": 1.3190927757358973e-06,
"epoch": 0.9520846297448662,
"step": 1530
},
{
"loss": 1.0223,
"grad_norm": 3.9802134037017822,
"learning_rate": 1.0030756346829151e-06,
"epoch": 0.9583074051026758,
"step": 1540
},
{
"loss": 1.0011,
"grad_norm": 4.339512825012207,
"learning_rate": 7.300793662043282e-07,
"epoch": 0.9645301804604853,
"step": 1550
},
{
"loss": 0.9674,
"grad_norm": 3.0537281036376953,
"learning_rate": 5.002226056795123e-07,
"epoch": 0.9707529558182949,
"step": 1560
},
{
"loss": 1.0434,
"grad_norm": 4.255007266998291,
"learning_rate": 3.1360524144810055e-07,
"epoch": 0.9769757311761046,
"step": 1570
},
{
"loss": 1.0385,
"grad_norm": 4.374144554138184,
"learning_rate": 1.703083714017617e-07,
"epoch": 0.9831985065339142,
"step": 1580
},
{
"loss": 1.0528,
"grad_norm": 3.5838091373443604,
"learning_rate": 7.039426774164693e-08,
"epoch": 0.9894212818917237,
"step": 1590
},
{
"loss": 1.021,
"grad_norm": 3.116870880126953,
"learning_rate": 1.3906349916881222e-08,
"epoch": 0.9956440572495333,
"step": 1600
},
{
"eval_loss": 0.9999628067016602,
"eval_runtime": 18.979,
"eval_samples_per_second": 27.662,
"eval_steps_per_second": 6.955,
"epoch": 0.9956440572495333,
"step": 1600
},
{
"train_runtime": 2534.2207,
"train_samples_per_second": 10.144,
"train_steps_per_second": 0.634,
"total_flos": 7.873650806298163e+16,
"train_loss": 1.1504224986116116,
"epoch": 1.0,
"step": 1607
}
]