[ { "eval_loss": 4.586892604827881, "eval_runtime": 89.6197, "eval_samples_per_second": 5.858, "eval_steps_per_second": 1.473, "epoch": 0, "step": 0 }, { "loss": 4.1431, "grad_norm": 32.154510498046875, "learning_rate": 1.8e-05, "epoch": 0.006222775357809583, "step": 10 }, { "loss": 2.171, "grad_norm": 8.32018756866455, "learning_rate": 3.8e-05, "epoch": 0.012445550715619166, "step": 20 }, { "loss": 1.5115, "grad_norm": 7.762565612792969, "learning_rate": 5.8e-05, "epoch": 0.018668326073428748, "step": 30 }, { "loss": 1.6593, "grad_norm": 10.723950386047363, "learning_rate": 7.800000000000001e-05, "epoch": 0.024891101431238332, "step": 40 }, { "loss": 1.4035, "grad_norm": 11.399044036865234, "learning_rate": 9.8e-05, "epoch": 0.031113876789047916, "step": 50 }, { "loss": 1.3391, "grad_norm": 7.819660663604736, "learning_rate": 0.000118, "epoch": 0.037336652146857496, "step": 60 }, { "loss": 1.3462, "grad_norm": 8.268921852111816, "learning_rate": 0.000138, "epoch": 0.043559427504667084, "step": 70 }, { "loss": 1.3119, "grad_norm": 5.696014881134033, "learning_rate": 0.00015800000000000002, "epoch": 0.049782202862476664, "step": 80 }, { "eval_loss": 1.303046703338623, "eval_runtime": 18.9705, "eval_samples_per_second": 27.675, "eval_steps_per_second": 6.958, "epoch": 0.049782202862476664, "step": 80 }, { "loss": 1.4399, "grad_norm": 4.720869541168213, "learning_rate": 0.00017800000000000002, "epoch": 0.056004978220286245, "step": 90 }, { "loss": 1.515, "grad_norm": 6.697354793548584, "learning_rate": 0.00019800000000000002, "epoch": 0.06222775357809583, "step": 100 }, { "loss": 1.2566, "grad_norm": 4.759271621704102, "learning_rate": 0.00019998239988424423, "epoch": 0.06845052893590542, "step": 110 }, { "loss": 1.2804, "grad_norm": 4.9139723777771, "learning_rate": 0.0001999215679316913, "epoch": 0.07467330429371499, "step": 120 }, { "loss": 1.4076, "grad_norm": 7.784193992614746, "learning_rate": 0.00019981731328627206, "epoch": 0.08089607965152458, "step": 130 }, { "loss": 1.2333, "grad_norm": 4.605452060699463, "learning_rate": 0.00019966968125369522, "epoch": 0.08711885500933417, "step": 140 }, { "loss": 1.223, "grad_norm": 6.003992557525635, "learning_rate": 0.00019947873599008388, "epoch": 0.09334163036714374, "step": 150 }, { "loss": 1.2006, "grad_norm": 3.7916181087493896, "learning_rate": 0.00019924456047409517, "epoch": 0.09956440572495333, "step": 160 }, { "eval_loss": 1.2743773460388184, "eval_runtime": 19.0399, "eval_samples_per_second": 27.574, "eval_steps_per_second": 6.933, "epoch": 0.09956440572495333, "step": 160 }, { "loss": 1.2961, "grad_norm": 4.582925796508789, "learning_rate": 0.00019896725647086072, "epoch": 0.10578718108276292, "step": 170 }, { "loss": 1.3238, "grad_norm": 4.96942138671875, "learning_rate": 0.0001986469444877626, "epoch": 0.11200995644057249, "step": 180 }, { "loss": 1.2806, "grad_norm": 5.652052879333496, "learning_rate": 0.0001982837637220647, "epoch": 0.11823273179838208, "step": 190 }, { "loss": 1.2519, "grad_norm": 3.8876349925994873, "learning_rate": 0.00019787787200042223, "epoch": 0.12445550715619166, "step": 200 }, { "loss": 1.3565, "grad_norm": 5.234771251678467, "learning_rate": 0.00019742944571029517, "epoch": 0.13067828251400124, "step": 210 }, { "loss": 1.3429, "grad_norm": 4.472705364227295, "learning_rate": 0.00019693867972329598, "epoch": 0.13690105787181084, "step": 220 }, { "loss": 1.1896, "grad_norm": 5.7520952224731445, "learning_rate": 0.00019640578731050488, "epoch": 0.1431238332296204, "step": 230 }, { "loss": 1.2823, "grad_norm": 5.895481586456299, "learning_rate": 0.00019583100004978886, "epoch": 0.14934660858742999, "step": 240 }, { "eval_loss": 1.2229551076889038, "eval_runtime": 19.0105, "eval_samples_per_second": 27.616, "eval_steps_per_second": 6.944, "epoch": 0.14934660858742999, "step": 240 }, { "loss": 1.2715, "grad_norm": 4.751076698303223, "learning_rate": 0.00019521456772516552, "epoch": 0.1555693839452396, "step": 250 }, { "loss": 1.2688, "grad_norm": 4.310879230499268, "learning_rate": 0.0001945567582182551, "epoch": 0.16179215930304916, "step": 260 }, { "loss": 1.2177, "grad_norm": 4.900815010070801, "learning_rate": 0.00019385785739186746, "epoch": 0.16801493466085873, "step": 270 }, { "loss": 1.1152, "grad_norm": 9.008260726928711, "learning_rate": 0.0001931181689657756, "epoch": 0.17423771001866833, "step": 280 }, { "loss": 1.2861, "grad_norm": 4.527481555938721, "learning_rate": 0.00019233801438472875, "epoch": 0.1804604853764779, "step": 290 }, { "loss": 1.1138, "grad_norm": 3.7949774265289307, "learning_rate": 0.00019151773267876273, "epoch": 0.18668326073428748, "step": 300 }, { "loss": 1.1897, "grad_norm": 4.003747463226318, "learning_rate": 0.0001906576803158686, "epoch": 0.19290603609209708, "step": 310 }, { "loss": 1.2305, "grad_norm": 3.926623582839966, "learning_rate": 0.00018975823104708313, "epoch": 0.19912881144990666, "step": 320 }, { "eval_loss": 1.1955150365829468, "eval_runtime": 19.0399, "eval_samples_per_second": 27.574, "eval_steps_per_second": 6.933, "epoch": 0.19912881144990666, "step": 320 }, { "loss": 1.221, "grad_norm": 4.546780109405518, "learning_rate": 0.00018881977574406838, "epoch": 0.20535158680771623, "step": 330 }, { "loss": 1.1211, "grad_norm": 4.058185577392578, "learning_rate": 0.00018784272222925198, "epoch": 0.21157436216552583, "step": 340 }, { "loss": 1.2339, "grad_norm": 6.297555923461914, "learning_rate": 0.00018682749509860012, "epoch": 0.2177971375233354, "step": 350 }, { "loss": 1.1462, "grad_norm": 4.3970160484313965, "learning_rate": 0.00018577453553710215, "epoch": 0.22401991288114498, "step": 360 }, { "loss": 1.293, "grad_norm": 3.81414532661438, "learning_rate": 0.00018468430112704573, "epoch": 0.23024268823895458, "step": 370 }, { "loss": 1.1787, "grad_norm": 3.973546266555786, "learning_rate": 0.00018355726564916628, "epoch": 0.23646546359676415, "step": 380 }, { "loss": 1.3247, "grad_norm": 4.137118339538574, "learning_rate": 0.00018239391887675722, "epoch": 0.24268823895457373, "step": 390 }, { "loss": 1.2655, "grad_norm": 4.191527843475342, "learning_rate": 0.00018119476636283018, "epoch": 0.24891101431238333, "step": 400 }, { "eval_loss": 1.1952359676361084, "eval_runtime": 19.1429, "eval_samples_per_second": 27.425, "eval_steps_per_second": 6.896, "epoch": 0.24891101431238333, "step": 400 }, { "loss": 1.1443, "grad_norm": 4.531196117401123, "learning_rate": 0.00017996032922041797, "epoch": 0.2551337896701929, "step": 410 }, { "loss": 1.0542, "grad_norm": 3.887230634689331, "learning_rate": 0.00017869114389611575, "epoch": 0.2613565650280025, "step": 420 }, { "loss": 1.1879, "grad_norm": 4.009781837463379, "learning_rate": 0.00017738776193695853, "epoch": 0.26757934038581205, "step": 430 }, { "loss": 1.2122, "grad_norm": 4.212757587432861, "learning_rate": 0.00017605074975073664, "epoch": 0.2738021157436217, "step": 440 }, { "loss": 1.1998, "grad_norm": 5.043585300445557, "learning_rate": 0.00017468068835985325, "epoch": 0.28002489110143125, "step": 450 }, { "loss": 1.1861, "grad_norm": 3.5380918979644775, "learning_rate": 0.00017327817314883055, "epoch": 0.2862476664592408, "step": 460 }, { "loss": 1.1754, "grad_norm": 4.334512233734131, "learning_rate": 0.00017184381360557498, "epoch": 0.2924704418170504, "step": 470 }, { "loss": 1.0953, "grad_norm": 3.720323085784912, "learning_rate": 0.00017037823305651343, "epoch": 0.29869321717485997, "step": 480 }, { "eval_loss": 1.1778477430343628, "eval_runtime": 18.9601, "eval_samples_per_second": 27.69, "eval_steps_per_second": 6.962, "epoch": 0.29869321717485997, "step": 480 }, { "loss": 1.0638, "grad_norm": 3.9478025436401367, "learning_rate": 0.0001688820683957156, "epoch": 0.30491599253266954, "step": 490 }, { "loss": 1.1504, "grad_norm": 4.221723556518555, "learning_rate": 0.00016735596980812047, "epoch": 0.3111387678904792, "step": 500 }, { "loss": 1.2274, "grad_norm": 3.9657983779907227, "learning_rate": 0.0001658006004869867, "epoch": 0.31736154324828875, "step": 510 }, { "loss": 1.1862, "grad_norm": 4.158371925354004, "learning_rate": 0.00016421663634569046, "epoch": 0.3235843186060983, "step": 520 }, { "loss": 1.2757, "grad_norm": 6.26188325881958, "learning_rate": 0.00016260476572399496, "epoch": 0.3298070939639079, "step": 530 }, { "loss": 1.0873, "grad_norm": 4.029411315917969, "learning_rate": 0.00016096568908892047, "epoch": 0.33602986932171747, "step": 540 }, { "loss": 1.1004, "grad_norm": 3.944244861602783, "learning_rate": 0.00015930011873034375, "epoch": 0.3422526446795271, "step": 550 }, { "loss": 1.1847, "grad_norm": 5.218636512756348, "learning_rate": 0.00015760877845145995, "epoch": 0.34847542003733667, "step": 560 }, { "eval_loss": 1.1579095125198364, "eval_runtime": 18.9511, "eval_samples_per_second": 27.703, "eval_steps_per_second": 6.965, "epoch": 0.34847542003733667, "step": 560 }, { "loss": 1.2818, "grad_norm": 4.242026329040527, "learning_rate": 0.00015589240325424088, "epoch": 0.35469819539514624, "step": 570 }, { "loss": 1.1621, "grad_norm": 4.167393207550049, "learning_rate": 0.00015415173902002703, "epoch": 0.3609209707529558, "step": 580 }, { "loss": 1.0553, "grad_norm": 3.5119478702545166, "learning_rate": 0.00015238754218539156, "epoch": 0.3671437461107654, "step": 590 }, { "loss": 1.2139, "grad_norm": 3.9878628253936768, "learning_rate": 0.00015060057941341718, "epoch": 0.37336652146857496, "step": 600 }, { "loss": 1.1846, "grad_norm": 4.306042671203613, "learning_rate": 0.00014879162726052928, "epoch": 0.3795892968263846, "step": 610 }, { "loss": 1.1916, "grad_norm": 5.4814276695251465, "learning_rate": 0.0001469614718390295, "epoch": 0.38581207218419417, "step": 620 }, { "loss": 1.1762, "grad_norm": 4.104554653167725, "learning_rate": 0.00014511090847547643, "epoch": 0.39203484754200374, "step": 630 }, { "loss": 1.0668, "grad_norm": 4.206943511962891, "learning_rate": 0.00014324074136506284, "epoch": 0.3982576228998133, "step": 640 }, { "eval_loss": 1.122727632522583, "eval_runtime": 19.0559, "eval_samples_per_second": 27.551, "eval_steps_per_second": 6.927, "epoch": 0.3982576228998133, "step": 640 }, { "loss": 1.0995, "grad_norm": 4.80627965927124, "learning_rate": 0.00014135178322213765, "epoch": 0.4044803982576229, "step": 650 }, { "loss": 1.2397, "grad_norm": 5.690910339355469, "learning_rate": 0.00013944485492702716, "epoch": 0.41070317361543246, "step": 660 }, { "loss": 1.1212, "grad_norm": 4.302467346191406, "learning_rate": 0.00013752078516930652, "epoch": 0.4169259489732421, "step": 670 }, { "loss": 1.2482, "grad_norm": 4.4525604248046875, "learning_rate": 0.00013558041008767798, "epoch": 0.42314872433105166, "step": 680 }, { "loss": 1.1418, "grad_norm": 3.8571488857269287, "learning_rate": 0.00013362457290661215, "epoch": 0.42937149968886124, "step": 690 }, { "loss": 1.1032, "grad_norm": 3.5000720024108887, "learning_rate": 0.00013165412356990955, "epoch": 0.4355942750466708, "step": 700 }, { "loss": 1.0484, "grad_norm": 4.838465690612793, "learning_rate": 0.0001296699183713427, "epoch": 0.4418170504044804, "step": 710 }, { "loss": 1.097, "grad_norm": 3.3884267807006836, "learning_rate": 0.0001276728195825383, "epoch": 0.44803982576228996, "step": 720 }, { "eval_loss": 1.117976427078247, "eval_runtime": 19.0566, "eval_samples_per_second": 27.549, "eval_steps_per_second": 6.927, "epoch": 0.44803982576228996, "step": 720 }, { "loss": 1.103, "grad_norm": 3.7878074645996094, "learning_rate": 0.00012566369507826175, "epoch": 0.4542626011200996, "step": 730 }, { "loss": 1.1819, "grad_norm": 4.605463027954102, "learning_rate": 0.00012364341795926683, "epoch": 0.46048537647790916, "step": 740 }, { "loss": 1.0844, "grad_norm": 4.262099742889404, "learning_rate": 0.00012161286617287419, "epoch": 0.46670815183571873, "step": 750 }, { "loss": 1.2108, "grad_norm": 5.657451152801514, "learning_rate": 0.00011957292213144385, "epoch": 0.4729309271935283, "step": 760 }, { "loss": 1.1722, "grad_norm": 3.5178189277648926, "learning_rate": 0.00011752447232890702, "epoch": 0.4791537025513379, "step": 770 }, { "loss": 1.0685, "grad_norm": 3.929398775100708, "learning_rate": 0.00011546840695552466, "epoch": 0.48537647790914745, "step": 780 }, { "loss": 1.1232, "grad_norm": 5.15775728225708, "learning_rate": 0.0001134056195110393, "epoch": 0.4915992532669571, "step": 790 }, { "loss": 1.175, "grad_norm": 4.8148980140686035, "learning_rate": 0.00011133700641638891, "epoch": 0.49782202862476665, "step": 800 }, { "eval_loss": 1.068037986755371, "eval_runtime": 19.1937, "eval_samples_per_second": 27.353, "eval_steps_per_second": 6.877, "epoch": 0.49782202862476665, "step": 800 }, { "loss": 1.0508, "grad_norm": 3.6205997467041016, "learning_rate": 0.0001092634666241513, "epoch": 0.5040448039825762, "step": 810 }, { "loss": 1.0613, "grad_norm": 5.062050819396973, "learning_rate": 0.00010718590122788821, "epoch": 0.5102675793403858, "step": 820 }, { "loss": 1.1868, "grad_norm": 4.122339725494385, "learning_rate": 0.00010510521307055914, "epoch": 0.5164903546981954, "step": 830 }, { "loss": 1.0381, "grad_norm": 3.8383567333221436, "learning_rate": 0.000103022306352175, "epoch": 0.522713130056005, "step": 840 }, { "loss": 1.1625, "grad_norm": 3.7978572845458984, "learning_rate": 0.00010093808623686165, "epoch": 0.5289359054138145, "step": 850 }, { "loss": 1.1144, "grad_norm": 2.841909408569336, "learning_rate": 9.88534584595051e-05, "epoch": 0.5351586807716241, "step": 860 }, { "loss": 1.001, "grad_norm": 4.133490085601807, "learning_rate": 9.676932893214805e-05, "epoch": 0.5413814561294338, "step": 870 }, { "loss": 1.1545, "grad_norm": 5.40682315826416, "learning_rate": 9.46866033503098e-05, "epoch": 0.5476042314872434, "step": 880 }, { "eval_loss": 1.0770765542984009, "eval_runtime": 19.0434, "eval_samples_per_second": 27.569, "eval_steps_per_second": 6.932, "epoch": 0.5476042314872434, "step": 880 }, { "loss": 1.0768, "grad_norm": 7.381899833679199, "learning_rate": 9.260618679940025e-05, "epoch": 0.5538270068450529, "step": 890 }, { "loss": 1.1314, "grad_norm": 5.5879621505737305, "learning_rate": 9.05289833613988e-05, "epoch": 0.5600497822028625, "step": 900 }, { "loss": 1.1605, "grad_norm": 6.610466480255127, "learning_rate": 8.845589572196961e-05, "epoch": 0.5662725575606721, "step": 910 }, { "loss": 1.1251, "grad_norm": 3.7799007892608643, "learning_rate": 8.638782477818334e-05, "epoch": 0.5724953329184816, "step": 920 }, { "loss": 1.0761, "grad_norm": 3.294947385787964, "learning_rate": 8.432566924701659e-05, "epoch": 0.5787181082762912, "step": 930 }, { "loss": 1.099, "grad_norm": 3.8454885482788086, "learning_rate": 8.227032527479806e-05, "epoch": 0.5849408836341008, "step": 940 }, { "loss": 1.0603, "grad_norm": 3.116297483444214, "learning_rate": 8.022268604777271e-05, "epoch": 0.5911636589919104, "step": 950 }, { "loss": 0.9529, "grad_norm": 3.8308541774749756, "learning_rate": 7.818364140395137e-05, "epoch": 0.5973864343497199, "step": 960 }, { "eval_loss": 1.0448600053787231, "eval_runtime": 18.8462, "eval_samples_per_second": 27.857, "eval_steps_per_second": 7.004, "epoch": 0.5973864343497199, "step": 960 }, { "loss": 1.0266, "grad_norm": 3.5868051052093506, "learning_rate": 7.615407744641619e-05, "epoch": 0.6036092097075295, "step": 970 }, { "loss": 1.0455, "grad_norm": 4.006646633148193, "learning_rate": 7.413487615824847e-05, "epoch": 0.6098319850653391, "step": 980 }, { "loss": 0.9692, "grad_norm": 3.410396099090576, "learning_rate": 7.212691501924753e-05, "epoch": 0.6160547604231488, "step": 990 }, { "loss": 1.1257, "grad_norm": 4.496327877044678, "learning_rate": 7.013106662460604e-05, "epoch": 0.6222775357809583, "step": 1000 }, { "loss": 1.0382, "grad_norm": 3.0930795669555664, "learning_rate": 6.81481983057085e-05, "epoch": 0.6285003111387679, "step": 1010 }, { "loss": 1.1206, "grad_norm": 3.9784181118011475, "learning_rate": 6.617917175321669e-05, "epoch": 0.6347230864965775, "step": 1020 }, { "loss": 1.0582, "grad_norm": 3.438899278640747, "learning_rate": 6.422484264260698e-05, "epoch": 0.6409458618543871, "step": 1030 }, { "loss": 1.0783, "grad_norm": 4.454411029815674, "learning_rate": 6.228606026232118e-05, "epoch": 0.6471686372121966, "step": 1040 }, { "eval_loss": 1.0363072156906128, "eval_runtime": 19.4555, "eval_samples_per_second": 26.985, "eval_steps_per_second": 6.785, "epoch": 0.6471686372121966, "step": 1040 }, { "loss": 1.1599, "grad_norm": 4.955887794494629, "learning_rate": 6.0363667144693105e-05, "epoch": 0.6533914125700062, "step": 1050 }, { "loss": 1.0642, "grad_norm": 4.835966110229492, "learning_rate": 5.845849869981137e-05, "epoch": 0.6596141879278158, "step": 1060 }, { "loss": 1.0944, "grad_norm": 4.23394250869751, "learning_rate": 5.657138285247687e-05, "epoch": 0.6658369632856254, "step": 1070 }, { "loss": 1.1125, "grad_norm": 4.8603973388671875, "learning_rate": 5.4703139682413586e-05, "epoch": 0.6720597386434349, "step": 1080 }, { "loss": 1.1036, "grad_norm": 3.6753642559051514, "learning_rate": 5.285458106788807e-05, "epoch": 0.6782825140012445, "step": 1090 }, { "loss": 1.0248, "grad_norm": 3.8071329593658447, "learning_rate": 5.10265103328937e-05, "epoch": 0.6845052893590542, "step": 1100 }, { "loss": 0.9752, "grad_norm": 3.280271530151367, "learning_rate": 4.921972189805154e-05, "epoch": 0.6907280647168638, "step": 1110 }, { "loss": 1.0233, "grad_norm": 3.587294101715088, "learning_rate": 4.7435000935381115e-05, "epoch": 0.6969508400746733, "step": 1120 }, { "eval_loss": 1.0311274528503418, "eval_runtime": 19.0411, "eval_samples_per_second": 27.572, "eval_steps_per_second": 6.932, "epoch": 0.6969508400746733, "step": 1120 }, { "loss": 0.9725, "grad_norm": 3.9192657470703125, "learning_rate": 4.567312302708965e-05, "epoch": 0.7031736154324829, "step": 1130 }, { "loss": 1.0536, "grad_norm": 3.6675221920013428, "learning_rate": 4.393485382852935e-05, "epoch": 0.7093963907902925, "step": 1140 }, { "loss": 0.8942, "grad_norm": 3.954765796661377, "learning_rate": 4.2220948735467967e-05, "epoch": 0.7156191661481021, "step": 1150 }, { "loss": 1.0628, "grad_norm": 3.2176241874694824, "learning_rate": 4.053215255581844e-05, "epoch": 0.7218419415059116, "step": 1160 }, { "loss": 0.9189, "grad_norm": 4.0021443367004395, "learning_rate": 3.886919918596894e-05, "epoch": 0.7280647168637212, "step": 1170 }, { "loss": 1.017, "grad_norm": 5.141781330108643, "learning_rate": 3.723281129185574e-05, "epoch": 0.7342874922215308, "step": 1180 }, { "loss": 1.0003, "grad_norm": 3.7216849327087402, "learning_rate": 3.562369999491536e-05, "epoch": 0.7405102675793404, "step": 1190 }, { "loss": 1.0913, "grad_norm": 4.026198863983154, "learning_rate": 3.4042564563054526e-05, "epoch": 0.7467330429371499, "step": 1200 }, { "eval_loss": 1.0212879180908203, "eval_runtime": 19.4048, "eval_samples_per_second": 27.055, "eval_steps_per_second": 6.802, "epoch": 0.7467330429371499, "step": 1200 }, { "loss": 1.0205, "grad_norm": 3.0943052768707275, "learning_rate": 3.249009210677054e-05, "epoch": 0.7529558182949595, "step": 1210 }, { "loss": 1.1132, "grad_norm": 3.125500202178955, "learning_rate": 3.096695728055536e-05, "epoch": 0.7591785936527692, "step": 1220 }, { "loss": 1.0034, "grad_norm": 3.151357889175415, "learning_rate": 2.9473821989712625e-05, "epoch": 0.7654013690105788, "step": 1230 }, { "loss": 1.0259, "grad_norm": 4.394558906555176, "learning_rate": 2.801133510271463e-05, "epoch": 0.7716241443683883, "step": 1240 }, { "loss": 1.0123, "grad_norm": 3.69354510307312, "learning_rate": 2.6580132169225335e-05, "epoch": 0.7778469197261979, "step": 1250 }, { "loss": 0.9852, "grad_norm": 3.7997825145721436, "learning_rate": 2.5180835143910732e-05, "epoch": 0.7840696950840075, "step": 1260 }, { "loss": 1.0357, "grad_norm": 3.2746288776397705, "learning_rate": 2.3814052116157492e-05, "epoch": 0.790292470441817, "step": 1270 }, { "loss": 0.9234, "grad_norm": 2.5498299598693848, "learning_rate": 2.248037704581686e-05, "epoch": 0.7965152457996266, "step": 1280 }, { "eval_loss": 1.0135233402252197, "eval_runtime": 19.2289, "eval_samples_per_second": 27.303, "eval_steps_per_second": 6.865, "epoch": 0.7965152457996266, "step": 1280 }, { "loss": 0.9819, "grad_norm": 4.244918346405029, "learning_rate": 2.1180389505089004e-05, "epoch": 0.8027380211574362, "step": 1290 }, { "loss": 0.9955, "grad_norm": 3.978168249130249, "learning_rate": 1.9914654426659374e-05, "epoch": 0.8089607965152458, "step": 1300 }, { "loss": 1.0539, "grad_norm": 4.090443134307861, "learning_rate": 1.8683721858197366e-05, "epoch": 0.8151835718730553, "step": 1310 }, { "loss": 1.0301, "grad_norm": 4.159254550933838, "learning_rate": 1.7488126723323183e-05, "epoch": 0.8214063472308649, "step": 1320 }, { "loss": 0.9817, "grad_norm": 3.623256206512451, "learning_rate": 1.632838858914747e-05, "epoch": 0.8276291225886746, "step": 1330 }, { "loss": 1.0417, "grad_norm": 3.4594295024871826, "learning_rate": 1.5205011440483929e-05, "epoch": 0.8338518979464842, "step": 1340 }, { "loss": 0.9497, "grad_norm": 3.782587766647339, "learning_rate": 1.4118483460834064e-05, "epoch": 0.8400746733042938, "step": 1350 }, { "loss": 0.9819, "grad_norm": 4.124022960662842, "learning_rate": 1.3069276820237997e-05, "epoch": 0.8462974486621033, "step": 1360 }, { "eval_loss": 1.0055181980133057, "eval_runtime": 19.0381, "eval_samples_per_second": 27.576, "eval_steps_per_second": 6.933, "epoch": 0.8462974486621033, "step": 1360 }, { "loss": 0.9526, "grad_norm": 3.329490900039673, "learning_rate": 1.2057847470084993e-05, "epoch": 0.8525202240199129, "step": 1370 }, { "loss": 1.0375, "grad_norm": 3.4906861782073975, "learning_rate": 1.108463494497135e-05, "epoch": 0.8587429993777225, "step": 1380 }, { "loss": 1.143, "grad_norm": 3.604809045791626, "learning_rate": 1.0150062171693076e-05, "epoch": 0.864965774735532, "step": 1390 }, { "loss": 1.0352, "grad_norm": 3.657939910888672, "learning_rate": 9.254535285455334e-06, "epoch": 0.8711885500933416, "step": 1400 }, { "loss": 0.9611, "grad_norm": 4.000196933746338, "learning_rate": 8.398443453379267e-06, "epoch": 0.8774113254511512, "step": 1410 }, { "loss": 1.02, "grad_norm": 2.971421241760254, "learning_rate": 7.582158705382581e-06, "epoch": 0.8836341008089608, "step": 1420 }, { "loss": 1.095, "grad_norm": 3.4433512687683105, "learning_rate": 6.806035772507169e-06, "epoch": 0.8898568761667703, "step": 1430 }, { "loss": 0.9563, "grad_norm": 4.7456746101379395, "learning_rate": 6.070411932764586e-06, "epoch": 0.8960796515245799, "step": 1440 }, { "eval_loss": 1.0012239217758179, "eval_runtime": 19.2942, "eval_samples_per_second": 27.21, "eval_steps_per_second": 6.841, "epoch": 0.8960796515245799, "step": 1440 }, { "loss": 1.0647, "grad_norm": 3.7112457752227783, "learning_rate": 5.375606864565785e-06, "epoch": 0.9023024268823896, "step": 1450 }, { "loss": 1.0923, "grad_norm": 3.5192649364471436, "learning_rate": 4.721922507799248e-06, "epoch": 0.9085252022401992, "step": 1460 }, { "loss": 1.0141, "grad_norm": 3.898160696029663, "learning_rate": 4.10964293261763e-06, "epoch": 0.9147479775980087, "step": 1470 }, { "loss": 0.9584, "grad_norm": 3.5476901531219482, "learning_rate": 3.5390342159900223e-06, "epoch": 0.9209707529558183, "step": 1480 }, { "loss": 0.937, "grad_norm": 3.9498867988586426, "learning_rate": 3.0103443260734554e-06, "epoch": 0.9271935283136279, "step": 1490 }, { "loss": 0.9892, "grad_norm": 4.293119430541992, "learning_rate": 2.5238030144539737e-06, "epoch": 0.9334163036714375, "step": 1500 }, { "loss": 0.9698, "grad_norm": 3.3624675273895264, "learning_rate": 2.079621716303959e-06, "epoch": 0.939639079029247, "step": 1510 }, { "loss": 0.968, "grad_norm": 3.809082269668579, "learning_rate": 1.6779934584992718e-06, "epoch": 0.9458618543870566, "step": 1520 }, { "eval_loss": 1.0003483295440674, "eval_runtime": 19.1326, "eval_samples_per_second": 27.44, "eval_steps_per_second": 6.899, "epoch": 0.9458618543870566, "step": 1520 }, { "loss": 0.9195, "grad_norm": 3.8601574897766113, "learning_rate": 1.3190927757358973e-06, "epoch": 0.9520846297448662, "step": 1530 }, { "loss": 1.0223, "grad_norm": 3.9802134037017822, "learning_rate": 1.0030756346829151e-06, "epoch": 0.9583074051026758, "step": 1540 }, { "loss": 1.0011, "grad_norm": 4.339512825012207, "learning_rate": 7.300793662043282e-07, "epoch": 0.9645301804604853, "step": 1550 }, { "loss": 0.9674, "grad_norm": 3.0537281036376953, "learning_rate": 5.002226056795123e-07, "epoch": 0.9707529558182949, "step": 1560 }, { "loss": 1.0434, "grad_norm": 4.255007266998291, "learning_rate": 3.1360524144810055e-07, "epoch": 0.9769757311761046, "step": 1570 }, { "loss": 1.0385, "grad_norm": 4.374144554138184, "learning_rate": 1.703083714017617e-07, "epoch": 0.9831985065339142, "step": 1580 }, { "loss": 1.0528, "grad_norm": 3.5838091373443604, "learning_rate": 7.039426774164693e-08, "epoch": 0.9894212818917237, "step": 1590 }, { "loss": 1.021, "grad_norm": 3.116870880126953, "learning_rate": 1.3906349916881222e-08, "epoch": 0.9956440572495333, "step": 1600 }, { "eval_loss": 0.9999628067016602, "eval_runtime": 18.979, "eval_samples_per_second": 27.662, "eval_steps_per_second": 6.955, "epoch": 0.9956440572495333, "step": 1600 }, { "train_runtime": 2534.2207, "train_samples_per_second": 10.144, "train_steps_per_second": 0.634, "total_flos": 7.873650806298163e+16, "train_loss": 1.1504224986116116, "epoch": 1.0, "step": 1607 } ]