diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3797 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 1000, + "global_step": 1874, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.6057937383651733, + "epoch": 0.0026680896478121665, + "grad_norm": 40.55179553800617, + "learning_rate": 4.2553191489361704e-07, + "loss": 2.3272, + "mean_token_accuracy": 0.5414203643798828, + "num_tokens": 314269.0, + "step": 5 + }, + { + "entropy": 1.6647388219833374, + "epoch": 0.005336179295624333, + "grad_norm": 43.06403399421523, + "learning_rate": 9.574468085106384e-07, + "loss": 2.1578, + "mean_token_accuracy": 0.5552075743675232, + "num_tokens": 626887.0, + "step": 10 + }, + { + "entropy": 1.6634229183197022, + "epoch": 0.0080042689434365, + "grad_norm": 23.775506080571247, + "learning_rate": 1.4893617021276596e-06, + "loss": 1.753, + "mean_token_accuracy": 0.6102993965148926, + "num_tokens": 938506.0, + "step": 15 + }, + { + "entropy": 1.3806034088134767, + "epoch": 0.010672358591248666, + "grad_norm": 15.144339226914607, + "learning_rate": 2.021276595744681e-06, + "loss": 1.254, + "mean_token_accuracy": 0.7032846212387085, + "num_tokens": 1253413.0, + "step": 20 + }, + { + "entropy": 0.8211405515670777, + "epoch": 0.013340448239060833, + "grad_norm": 3.9107969207090885, + "learning_rate": 2.553191489361702e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.8092005133628846, + "num_tokens": 1567018.0, + "step": 25 + }, + { + "entropy": 0.4903128743171692, + "epoch": 0.016008537886873, + "grad_norm": 2.675059421456141, + "learning_rate": 3.0851063829787237e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8781713724136353, + "num_tokens": 1879087.0, + "step": 30 + }, + { + "entropy": 0.3075927495956421, + "epoch": 0.018676627534685165, + "grad_norm": 1.6301455165037353, + "learning_rate": 3.6170212765957453e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.9133403658866882, + "num_tokens": 2193371.0, + "step": 35 + }, + { + "entropy": 0.22246569097042085, + "epoch": 0.021344717182497332, + "grad_norm": 1.1432473923317141, + "learning_rate": 4.148936170212766e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9255786180496216, + "num_tokens": 2505600.0, + "step": 40 + }, + { + "entropy": 0.20077281594276428, + "epoch": 0.0240128068303095, + "grad_norm": 1.323878562688497, + "learning_rate": 4.680851063829788e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9290306806564331, + "num_tokens": 2816122.0, + "step": 45 + }, + { + "entropy": 0.18968549370765686, + "epoch": 0.026680896478121666, + "grad_norm": 1.2515659600838258, + "learning_rate": 5.212765957446809e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9300891041755677, + "num_tokens": 3123042.0, + "step": 50 + }, + { + "entropy": 0.18321285843849183, + "epoch": 0.02934898612593383, + "grad_norm": 0.7840850203555432, + "learning_rate": 5.744680851063831e-06, + "loss": 0.1803, + "mean_token_accuracy": 0.9325991153717041, + "num_tokens": 3437738.0, + "step": 55 + }, + { + "entropy": 0.17623865604400635, + "epoch": 0.032017075773746, + "grad_norm": 0.5801801335679965, + "learning_rate": 6.276595744680851e-06, + "loss": 0.1764, + "mean_token_accuracy": 0.9335901737213135, + "num_tokens": 3748405.0, + "step": 60 + }, + { + "entropy": 0.17049630880355834, + "epoch": 0.03468516542155817, + "grad_norm": 0.4992046369679618, + "learning_rate": 6.808510638297873e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.9349357962608338, + "num_tokens": 4064833.0, + "step": 65 + }, + { + "entropy": 0.1735348016023636, + "epoch": 0.03735325506937033, + "grad_norm": 0.6222605436746601, + "learning_rate": 7.340425531914894e-06, + "loss": 0.1741, + "mean_token_accuracy": 0.9334915518760681, + "num_tokens": 4374963.0, + "step": 70 + }, + { + "entropy": 0.17080763280391692, + "epoch": 0.040021344717182494, + "grad_norm": 0.5219735259020631, + "learning_rate": 7.872340425531916e-06, + "loss": 0.1714, + "mean_token_accuracy": 0.9345339298248291, + "num_tokens": 4686411.0, + "step": 75 + }, + { + "entropy": 0.1672375589609146, + "epoch": 0.042689434364994665, + "grad_norm": 0.7168995037550425, + "learning_rate": 8.404255319148937e-06, + "loss": 0.1692, + "mean_token_accuracy": 0.9349394798278808, + "num_tokens": 4999106.0, + "step": 80 + }, + { + "entropy": 0.16997842788696288, + "epoch": 0.04535752401280683, + "grad_norm": 0.5830639448822376, + "learning_rate": 8.936170212765958e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9341935753822327, + "num_tokens": 5314232.0, + "step": 85 + }, + { + "entropy": 0.17112229466438295, + "epoch": 0.048025613660619, + "grad_norm": 0.47473599128466865, + "learning_rate": 9.46808510638298e-06, + "loss": 0.1693, + "mean_token_accuracy": 0.9356699824333191, + "num_tokens": 5627433.0, + "step": 90 + }, + { + "entropy": 0.16550145745277406, + "epoch": 0.05069370330843116, + "grad_norm": 0.48443350956433884, + "learning_rate": 1e-05, + "loss": 0.1662, + "mean_token_accuracy": 0.9359866142272949, + "num_tokens": 5943052.0, + "step": 95 + }, + { + "entropy": 0.1691408634185791, + "epoch": 0.05336179295624333, + "grad_norm": 0.4462949644678706, + "learning_rate": 1.0531914893617022e-05, + "loss": 0.1661, + "mean_token_accuracy": 0.9358510494232177, + "num_tokens": 6256522.0, + "step": 100 + }, + { + "entropy": 0.16405502259731292, + "epoch": 0.056029882604055496, + "grad_norm": 0.5476258963434574, + "learning_rate": 1.1063829787234044e-05, + "loss": 0.1648, + "mean_token_accuracy": 0.9364481329917907, + "num_tokens": 6564699.0, + "step": 105 + }, + { + "entropy": 0.16443894803524017, + "epoch": 0.05869797225186766, + "grad_norm": 0.4080625646763937, + "learning_rate": 1.1595744680851065e-05, + "loss": 0.1642, + "mean_token_accuracy": 0.9364275813102723, + "num_tokens": 6880930.0, + "step": 110 + }, + { + "entropy": 0.16441120505332946, + "epoch": 0.06136606189967983, + "grad_norm": 0.40503117807901384, + "learning_rate": 1.2127659574468087e-05, + "loss": 0.1641, + "mean_token_accuracy": 0.9364349961280822, + "num_tokens": 7195225.0, + "step": 115 + }, + { + "entropy": 0.16429885029792785, + "epoch": 0.064034151547492, + "grad_norm": 0.3948105540476529, + "learning_rate": 1.2659574468085108e-05, + "loss": 0.1637, + "mean_token_accuracy": 0.9367253661155701, + "num_tokens": 7505840.0, + "step": 120 + }, + { + "entropy": 0.16231226325035095, + "epoch": 0.06670224119530416, + "grad_norm": 0.3707777775484136, + "learning_rate": 1.3191489361702127e-05, + "loss": 0.1634, + "mean_token_accuracy": 0.93640695810318, + "num_tokens": 7819265.0, + "step": 125 + }, + { + "entropy": 0.16570849120616912, + "epoch": 0.06937033084311633, + "grad_norm": 0.3440472771757231, + "learning_rate": 1.372340425531915e-05, + "loss": 0.1635, + "mean_token_accuracy": 0.9364205598831177, + "num_tokens": 8129451.0, + "step": 130 + }, + { + "entropy": 0.16542819142341614, + "epoch": 0.0720384204909285, + "grad_norm": 0.4679394891059294, + "learning_rate": 1.425531914893617e-05, + "loss": 0.1663, + "mean_token_accuracy": 0.935624098777771, + "num_tokens": 8439222.0, + "step": 135 + }, + { + "entropy": 0.16903293132781982, + "epoch": 0.07470651013874066, + "grad_norm": 11.697052768897619, + "learning_rate": 1.4787234042553193e-05, + "loss": 0.1761, + "mean_token_accuracy": 0.9346019506454468, + "num_tokens": 8753787.0, + "step": 140 + }, + { + "entropy": 0.18029699921607972, + "epoch": 0.07737459978655283, + "grad_norm": 3.2701838952682594, + "learning_rate": 1.5319148936170214e-05, + "loss": 0.1896, + "mean_token_accuracy": 0.928356921672821, + "num_tokens": 9065692.0, + "step": 145 + }, + { + "entropy": 0.1697184056043625, + "epoch": 0.08004268943436499, + "grad_norm": 0.5940807634293421, + "learning_rate": 1.5851063829787235e-05, + "loss": 0.1674, + "mean_token_accuracy": 0.9348207354545593, + "num_tokens": 9379119.0, + "step": 150 + }, + { + "entropy": 0.16092735528945923, + "epoch": 0.08271077908217717, + "grad_norm": 0.3832395625093914, + "learning_rate": 1.6382978723404255e-05, + "loss": 0.1624, + "mean_token_accuracy": 0.9370127081871032, + "num_tokens": 9691655.0, + "step": 155 + }, + { + "entropy": 0.16433717608451842, + "epoch": 0.08537886872998933, + "grad_norm": 0.5605090252886916, + "learning_rate": 1.6914893617021276e-05, + "loss": 0.1642, + "mean_token_accuracy": 0.9362092971801758, + "num_tokens": 10005418.0, + "step": 160 + }, + { + "entropy": 0.16236138641834258, + "epoch": 0.08804695837780149, + "grad_norm": 0.5371554214637537, + "learning_rate": 1.74468085106383e-05, + "loss": 0.1643, + "mean_token_accuracy": 0.9369156122207641, + "num_tokens": 10314605.0, + "step": 165 + }, + { + "entropy": 0.16272085011005402, + "epoch": 0.09071504802561366, + "grad_norm": 0.6100043143177375, + "learning_rate": 1.797872340425532e-05, + "loss": 0.1649, + "mean_token_accuracy": 0.9354394793510437, + "num_tokens": 10627864.0, + "step": 170 + }, + { + "entropy": 0.16350803077220916, + "epoch": 0.09338313767342583, + "grad_norm": 0.7545661359169421, + "learning_rate": 1.8510638297872342e-05, + "loss": 0.1705, + "mean_token_accuracy": 0.935249638557434, + "num_tokens": 10942449.0, + "step": 175 + }, + { + "entropy": 0.16734544038772584, + "epoch": 0.096051227321238, + "grad_norm": 0.47886967480805687, + "learning_rate": 1.9042553191489363e-05, + "loss": 0.1668, + "mean_token_accuracy": 0.9357113361358642, + "num_tokens": 11254628.0, + "step": 180 + }, + { + "entropy": 0.16646199226379393, + "epoch": 0.09871931696905016, + "grad_norm": 0.8950938356931862, + "learning_rate": 1.9574468085106384e-05, + "loss": 0.1661, + "mean_token_accuracy": 0.9356560349464417, + "num_tokens": 11565553.0, + "step": 185 + }, + { + "entropy": 0.16376915574073792, + "epoch": 0.10138740661686232, + "grad_norm": 0.49417911396521913, + "learning_rate": 1.9999982639809622e-05, + "loss": 0.1651, + "mean_token_accuracy": 0.9368755459785462, + "num_tokens": 11876584.0, + "step": 190 + }, + { + "entropy": 0.16318838894367219, + "epoch": 0.10405549626467449, + "grad_norm": 0.39442525016455715, + "learning_rate": 1.9999375039475278e-05, + "loss": 0.1641, + "mean_token_accuracy": 0.9363178849220276, + "num_tokens": 12190388.0, + "step": 195 + }, + { + "entropy": 0.1615314543247223, + "epoch": 0.10672358591248667, + "grad_norm": 0.4034435704573861, + "learning_rate": 1.999789948989634e-05, + "loss": 0.1632, + "mean_token_accuracy": 0.9353866577148438, + "num_tokens": 12503581.0, + "step": 200 + }, + { + "entropy": 0.1635851889848709, + "epoch": 0.10939167556029883, + "grad_norm": 0.322362019370726, + "learning_rate": 1.9995556119151032e-05, + "loss": 0.1632, + "mean_token_accuracy": 0.9364137291908264, + "num_tokens": 12815167.0, + "step": 205 + }, + { + "entropy": 0.1606872260570526, + "epoch": 0.11205976520811099, + "grad_norm": 0.7322225604735055, + "learning_rate": 1.999234513064475e-05, + "loss": 0.1605, + "mean_token_accuracy": 0.9371747970581055, + "num_tokens": 13133511.0, + "step": 210 + }, + { + "entropy": 0.16151292622089386, + "epoch": 0.11472785485592316, + "grad_norm": 0.38224695727957775, + "learning_rate": 1.998826680309242e-05, + "loss": 0.1626, + "mean_token_accuracy": 0.9366207599639893, + "num_tokens": 13448143.0, + "step": 215 + }, + { + "entropy": 0.16145381927490235, + "epoch": 0.11739594450373532, + "grad_norm": 0.44622077648668956, + "learning_rate": 1.9983321490494292e-05, + "loss": 0.1656, + "mean_token_accuracy": 0.9352792501449585, + "num_tokens": 13760937.0, + "step": 220 + }, + { + "entropy": 0.16521975994110108, + "epoch": 0.1200640341515475, + "grad_norm": 0.35843161110444777, + "learning_rate": 1.9977509622105233e-05, + "loss": 0.1637, + "mean_token_accuracy": 0.936168372631073, + "num_tokens": 14072855.0, + "step": 225 + }, + { + "entropy": 0.16004987359046935, + "epoch": 0.12273212379935966, + "grad_norm": 0.29808835226604685, + "learning_rate": 1.9970831702397447e-05, + "loss": 0.161, + "mean_token_accuracy": 0.9368571877479553, + "num_tokens": 14385910.0, + "step": 230 + }, + { + "entropy": 0.16051293015480042, + "epoch": 0.12540021344717184, + "grad_norm": 0.3517417621856074, + "learning_rate": 1.9963288311016695e-05, + "loss": 0.1606, + "mean_token_accuracy": 0.937283992767334, + "num_tokens": 14696377.0, + "step": 235 + }, + { + "entropy": 0.15953898727893828, + "epoch": 0.128068303094984, + "grad_norm": 0.3030873385645047, + "learning_rate": 1.995488010273198e-05, + "loss": 0.161, + "mean_token_accuracy": 0.9370529532432557, + "num_tokens": 15007831.0, + "step": 240 + }, + { + "entropy": 0.1591504067182541, + "epoch": 0.13073639274279616, + "grad_norm": 0.3185241274110224, + "learning_rate": 1.9945607807378717e-05, + "loss": 0.1614, + "mean_token_accuracy": 0.9368512272834778, + "num_tokens": 15319086.0, + "step": 245 + }, + { + "entropy": 0.16048753559589385, + "epoch": 0.13340448239060831, + "grad_norm": 0.30739582985703917, + "learning_rate": 1.9935472229795386e-05, + "loss": 0.1598, + "mean_token_accuracy": 0.9372594594955445, + "num_tokens": 15631096.0, + "step": 250 + }, + { + "entropy": 0.15844869911670684, + "epoch": 0.1360725720384205, + "grad_norm": 0.28671630040460405, + "learning_rate": 1.9924474249753656e-05, + "loss": 0.1592, + "mean_token_accuracy": 0.9375737190246582, + "num_tokens": 15948578.0, + "step": 255 + }, + { + "entropy": 0.15891876220703124, + "epoch": 0.13874066168623267, + "grad_norm": 0.33123746638711155, + "learning_rate": 1.991261482188203e-05, + "loss": 0.1603, + "mean_token_accuracy": 0.9368638515472412, + "num_tokens": 16264642.0, + "step": 260 + }, + { + "entropy": 0.16164786219596863, + "epoch": 0.14140875133404482, + "grad_norm": 0.3180618039206802, + "learning_rate": 1.9899894975582997e-05, + "loss": 0.1613, + "mean_token_accuracy": 0.9373221755027771, + "num_tokens": 16574897.0, + "step": 265 + }, + { + "entropy": 0.15974415242671966, + "epoch": 0.144076840981857, + "grad_norm": 0.31502539896563436, + "learning_rate": 1.988631581494365e-05, + "loss": 0.1617, + "mean_token_accuracy": 0.9362964034080505, + "num_tokens": 16885412.0, + "step": 270 + }, + { + "entropy": 0.16087310314178466, + "epoch": 0.14674493062966915, + "grad_norm": 0.3042949940918383, + "learning_rate": 1.9871878518639875e-05, + "loss": 0.16, + "mean_token_accuracy": 0.9370861053466797, + "num_tokens": 17196171.0, + "step": 275 + }, + { + "entropy": 0.15577465891838074, + "epoch": 0.14941302027748132, + "grad_norm": 0.2837059066482121, + "learning_rate": 1.985658433983403e-05, + "loss": 0.158, + "mean_token_accuracy": 0.938635504245758, + "num_tokens": 17512272.0, + "step": 280 + }, + { + "entropy": 0.1611830174922943, + "epoch": 0.1520811099252935, + "grad_norm": 2.07322747055291, + "learning_rate": 1.9840434606066182e-05, + "loss": 0.1696, + "mean_token_accuracy": 0.9359327912330627, + "num_tokens": 17824349.0, + "step": 285 + }, + { + "entropy": 0.16292799413204193, + "epoch": 0.15474919957310565, + "grad_norm": 0.3567691951667491, + "learning_rate": 1.9823430719138856e-05, + "loss": 0.1618, + "mean_token_accuracy": 0.9365374684333801, + "num_tokens": 18134612.0, + "step": 290 + }, + { + "entropy": 0.15836259722709656, + "epoch": 0.15741728922091783, + "grad_norm": 0.3306976011964114, + "learning_rate": 1.980557415499537e-05, + "loss": 0.1613, + "mean_token_accuracy": 0.9377595663070679, + "num_tokens": 18444174.0, + "step": 295 + }, + { + "entropy": 0.16409131288528442, + "epoch": 0.16008537886872998, + "grad_norm": 0.8223816526857904, + "learning_rate": 1.978686646359173e-05, + "loss": 0.1635, + "mean_token_accuracy": 0.9360223889350892, + "num_tokens": 18757377.0, + "step": 300 + }, + { + "entropy": 0.16573355793952943, + "epoch": 0.16275346851654215, + "grad_norm": 0.4570923592099133, + "learning_rate": 1.976730926876208e-05, + "loss": 0.1647, + "mean_token_accuracy": 0.9364067673683166, + "num_tokens": 19071020.0, + "step": 305 + }, + { + "entropy": 0.1585041582584381, + "epoch": 0.16542155816435433, + "grad_norm": 0.3847935585188154, + "learning_rate": 1.9746904268077766e-05, + "loss": 0.1608, + "mean_token_accuracy": 0.9372907161712647, + "num_tokens": 19384510.0, + "step": 310 + }, + { + "entropy": 0.15850078761577607, + "epoch": 0.16808964781216648, + "grad_norm": 0.27644993691755454, + "learning_rate": 1.9725653232699962e-05, + "loss": 0.1586, + "mean_token_accuracy": 0.9377372860908508, + "num_tokens": 19700894.0, + "step": 315 + }, + { + "entropy": 0.1582118660211563, + "epoch": 0.17075773745997866, + "grad_norm": 0.3068511012896705, + "learning_rate": 1.9703558007225964e-05, + "loss": 0.1601, + "mean_token_accuracy": 0.9376296758651733, + "num_tokens": 20012882.0, + "step": 320 + }, + { + "entropy": 0.1584922641515732, + "epoch": 0.1734258271077908, + "grad_norm": 0.27025656430048917, + "learning_rate": 1.968062050952906e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.9380206942558289, + "num_tokens": 20324153.0, + "step": 325 + }, + { + "entropy": 0.15942637920379638, + "epoch": 0.17609391675560299, + "grad_norm": 0.29485986752985277, + "learning_rate": 1.9656842730592046e-05, + "loss": 0.1585, + "mean_token_accuracy": 0.9375697135925293, + "num_tokens": 20635204.0, + "step": 330 + }, + { + "entropy": 0.15870639085769653, + "epoch": 0.17876200640341516, + "grad_norm": 0.332325793174125, + "learning_rate": 1.963222673433445e-05, + "loss": 0.1606, + "mean_token_accuracy": 0.9369964957237243, + "num_tokens": 20946049.0, + "step": 335 + }, + { + "entropy": 0.1566277861595154, + "epoch": 0.1814300960512273, + "grad_norm": 0.35414878529407523, + "learning_rate": 1.9606774657433335e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.9371545314788818, + "num_tokens": 21257058.0, + "step": 340 + }, + { + "entropy": 0.15877649188041687, + "epoch": 0.1840981856990395, + "grad_norm": 0.2720083540153981, + "learning_rate": 1.958048870913786e-05, + "loss": 0.1588, + "mean_token_accuracy": 0.9378036260604858, + "num_tokens": 21567622.0, + "step": 345 + }, + { + "entropy": 0.15873935520648957, + "epoch": 0.18676627534685167, + "grad_norm": 0.33426473267402307, + "learning_rate": 1.9553371171077523e-05, + "loss": 0.1598, + "mean_token_accuracy": 0.937464439868927, + "num_tokens": 21877214.0, + "step": 350 + }, + { + "entropy": 0.1577691912651062, + "epoch": 0.18943436499466382, + "grad_norm": 0.30962801826583497, + "learning_rate": 1.9525424397064082e-05, + "loss": 0.1586, + "mean_token_accuracy": 0.9379173636436462, + "num_tokens": 22188854.0, + "step": 355 + }, + { + "entropy": 0.15920698940753936, + "epoch": 0.192102454642476, + "grad_norm": 0.24858466270630716, + "learning_rate": 1.9496650812887293e-05, + "loss": 0.1585, + "mean_token_accuracy": 0.937783944606781, + "num_tokens": 22503817.0, + "step": 360 + }, + { + "entropy": 0.15488786101341248, + "epoch": 0.19477054429028814, + "grad_norm": 0.25132425246934453, + "learning_rate": 1.9467052916104297e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.938006055355072, + "num_tokens": 22820485.0, + "step": 365 + }, + { + "entropy": 0.1576866924762726, + "epoch": 0.19743863393810032, + "grad_norm": 0.28007319047792106, + "learning_rate": 1.9436633275822876e-05, + "loss": 0.1579, + "mean_token_accuracy": 0.937552809715271, + "num_tokens": 23137566.0, + "step": 370 + }, + { + "entropy": 0.1562004268169403, + "epoch": 0.2001067235859125, + "grad_norm": 0.26412703181082453, + "learning_rate": 1.9405394532478422e-05, + "loss": 0.1571, + "mean_token_accuracy": 0.9379514217376709, + "num_tokens": 23454287.0, + "step": 375 + }, + { + "entropy": 0.15889868140220642, + "epoch": 0.20277481323372465, + "grad_norm": 0.2814723241928714, + "learning_rate": 1.937333939760477e-05, + "loss": 0.1586, + "mean_token_accuracy": 0.9374628782272338, + "num_tokens": 23765850.0, + "step": 380 + }, + { + "entropy": 0.15611255168914795, + "epoch": 0.20544290288153683, + "grad_norm": 0.2591709773637703, + "learning_rate": 1.934047065359881e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9383575320243835, + "num_tokens": 24079680.0, + "step": 385 + }, + { + "entropy": 0.15731624364852906, + "epoch": 0.20811099252934898, + "grad_norm": 0.3274322630296524, + "learning_rate": 1.9306791153479007e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9381836175918579, + "num_tokens": 24391912.0, + "step": 390 + }, + { + "entropy": 0.1555784285068512, + "epoch": 0.21077908217716115, + "grad_norm": 0.26139305778817185, + "learning_rate": 1.9272303820637727e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9382982850074768, + "num_tokens": 24706470.0, + "step": 395 + }, + { + "entropy": 0.1561396062374115, + "epoch": 0.21344717182497333, + "grad_norm": 0.2937863844138795, + "learning_rate": 1.9237011648587496e-05, + "loss": 0.1563, + "mean_token_accuracy": 0.9385831475257873, + "num_tokens": 25022583.0, + "step": 400 + }, + { + "entropy": 0.1557897746562958, + "epoch": 0.21611526147278548, + "grad_norm": 0.2606893621813393, + "learning_rate": 1.9200917700701176e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9378427028656006, + "num_tokens": 25336503.0, + "step": 405 + }, + { + "entropy": 0.15799925923347474, + "epoch": 0.21878335112059766, + "grad_norm": 0.26497991242135527, + "learning_rate": 1.9164025109946042e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9383312106132508, + "num_tokens": 25649817.0, + "step": 410 + }, + { + "entropy": 0.15797626674175264, + "epoch": 0.2214514407684098, + "grad_norm": 0.27467565279203027, + "learning_rate": 1.9126337078611855e-05, + "loss": 0.159, + "mean_token_accuracy": 0.9374309062957764, + "num_tokens": 25962652.0, + "step": 415 + }, + { + "entropy": 0.15986525714397432, + "epoch": 0.22411953041622198, + "grad_norm": 0.2969689886430783, + "learning_rate": 1.908785687803289e-05, + "loss": 0.1596, + "mean_token_accuracy": 0.9372278928756714, + "num_tokens": 26271673.0, + "step": 420 + }, + { + "entropy": 0.1557195782661438, + "epoch": 0.22678762006403416, + "grad_norm": 0.23040213059715953, + "learning_rate": 1.9048587848303995e-05, + "loss": 0.1571, + "mean_token_accuracy": 0.9377826452255249, + "num_tokens": 26584515.0, + "step": 425 + }, + { + "entropy": 0.15680642426013947, + "epoch": 0.2294557097118463, + "grad_norm": 0.22629954502268448, + "learning_rate": 1.900853339799066e-05, + "loss": 0.1581, + "mean_token_accuracy": 0.9378763437271118, + "num_tokens": 26895984.0, + "step": 430 + }, + { + "entropy": 0.15606102645397185, + "epoch": 0.2321237993596585, + "grad_norm": 0.23559172310943693, + "learning_rate": 1.8967697003833156e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9387164950370789, + "num_tokens": 27208162.0, + "step": 435 + }, + { + "entropy": 0.15733987092971802, + "epoch": 0.23479188900747064, + "grad_norm": 0.3197383053118265, + "learning_rate": 1.8926082210444755e-05, + "loss": 0.1578, + "mean_token_accuracy": 0.9378737568855285, + "num_tokens": 27519484.0, + "step": 440 + }, + { + "entropy": 0.1580660492181778, + "epoch": 0.23745997865528282, + "grad_norm": 0.2737127367874776, + "learning_rate": 1.8883692630004046e-05, + "loss": 0.1578, + "mean_token_accuracy": 0.9377901077270507, + "num_tokens": 27831566.0, + "step": 445 + }, + { + "entropy": 0.15822867751121522, + "epoch": 0.240128068303095, + "grad_norm": 1.0659710021994102, + "learning_rate": 1.884053194194142e-05, + "loss": 0.1633, + "mean_token_accuracy": 0.936628234386444, + "num_tokens": 28140071.0, + "step": 450 + }, + { + "entropy": 0.16525934636592865, + "epoch": 0.24279615795090714, + "grad_norm": 0.6779597502538339, + "learning_rate": 1.8796603892619668e-05, + "loss": 0.164, + "mean_token_accuracy": 0.9368445754051209, + "num_tokens": 28451414.0, + "step": 455 + }, + { + "entropy": 0.1622500389814377, + "epoch": 0.24546424759871932, + "grad_norm": 0.38034546828877175, + "learning_rate": 1.8751912295008804e-05, + "loss": 0.1623, + "mean_token_accuracy": 0.9371641993522644, + "num_tokens": 28762308.0, + "step": 460 + }, + { + "entropy": 0.15895282626152038, + "epoch": 0.24813233724653147, + "grad_norm": 0.3511065822256795, + "learning_rate": 1.8706461028355107e-05, + "loss": 0.1611, + "mean_token_accuracy": 0.9371441841125489, + "num_tokens": 29072105.0, + "step": 465 + }, + { + "entropy": 0.15918091535568238, + "epoch": 0.2508004268943437, + "grad_norm": 0.2924598212092513, + "learning_rate": 1.866025403784439e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9381168961524964, + "num_tokens": 29382690.0, + "step": 470 + }, + { + "entropy": 0.15696989595890046, + "epoch": 0.2534685165421558, + "grad_norm": 0.2711360547796898, + "learning_rate": 1.861329533425956e-05, + "loss": 0.1582, + "mean_token_accuracy": 0.9376309752464295, + "num_tokens": 29694957.0, + "step": 475 + }, + { + "entropy": 0.15944585502147673, + "epoch": 0.256136606189968, + "grad_norm": 0.2572014322194392, + "learning_rate": 1.8565588993632488e-05, + "loss": 0.1593, + "mean_token_accuracy": 0.9372217178344726, + "num_tokens": 30008241.0, + "step": 480 + }, + { + "entropy": 0.15602820217609406, + "epoch": 0.25880469583778015, + "grad_norm": 0.26005795926801584, + "learning_rate": 1.8517139156890196e-05, + "loss": 0.158, + "mean_token_accuracy": 0.9383147120475769, + "num_tokens": 30319838.0, + "step": 485 + }, + { + "entropy": 0.15950543880462648, + "epoch": 0.26147278548559233, + "grad_norm": 0.2765594574846301, + "learning_rate": 1.846795002949543e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9381031513214111, + "num_tokens": 30633240.0, + "step": 490 + }, + { + "entropy": 0.1567516565322876, + "epoch": 0.2641408751334045, + "grad_norm": 0.23855526184988263, + "learning_rate": 1.8418025881081612e-05, + "loss": 0.1583, + "mean_token_accuracy": 0.9379245042800903, + "num_tokens": 30944472.0, + "step": 495 + }, + { + "entropy": 0.15918792486190797, + "epoch": 0.26680896478121663, + "grad_norm": 0.2579721676572744, + "learning_rate": 1.8367371045082265e-05, + "loss": 0.1605, + "mean_token_accuracy": 0.937362265586853, + "num_tokens": 31253331.0, + "step": 500 + }, + { + "entropy": 0.15768127143383026, + "epoch": 0.2694770544290288, + "grad_norm": 0.24642750080557177, + "learning_rate": 1.8315989918354835e-05, + "loss": 0.1576, + "mean_token_accuracy": 0.9381099939346313, + "num_tokens": 31567073.0, + "step": 505 + }, + { + "entropy": 0.15596598982810975, + "epoch": 0.272145144076841, + "grad_norm": 0.25681503988190824, + "learning_rate": 1.8263886960799062e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9381922602653503, + "num_tokens": 31880319.0, + "step": 510 + }, + { + "entropy": 0.15669248402118682, + "epoch": 0.27481323372465316, + "grad_norm": 0.22065427811092975, + "learning_rate": 1.8211066694969855e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.938100790977478, + "num_tokens": 32192951.0, + "step": 515 + }, + { + "entropy": 0.15724363029003144, + "epoch": 0.27748132337246534, + "grad_norm": 0.2390844464658894, + "learning_rate": 1.8157533705684724e-05, + "loss": 0.158, + "mean_token_accuracy": 0.938084626197815, + "num_tokens": 32503369.0, + "step": 520 + }, + { + "entropy": 0.15720780789852143, + "epoch": 0.28014941302027746, + "grad_norm": 0.24549164685428548, + "learning_rate": 1.810329263962584e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9381688714027405, + "num_tokens": 32817261.0, + "step": 525 + }, + { + "entropy": 0.15599971413612365, + "epoch": 0.28281750266808964, + "grad_norm": 0.2976212854060201, + "learning_rate": 1.804834820493666e-05, + "loss": 0.1569, + "mean_token_accuracy": 0.9384048819541931, + "num_tokens": 33128065.0, + "step": 530 + }, + { + "entropy": 0.15751757621765136, + "epoch": 0.2854855923159018, + "grad_norm": 0.23632025616502753, + "learning_rate": 1.7992705170813306e-05, + "loss": 0.1582, + "mean_token_accuracy": 0.9379205465316772, + "num_tokens": 33438675.0, + "step": 535 + }, + { + "entropy": 0.1583792746067047, + "epoch": 0.288153681963714, + "grad_norm": 0.25593143055297085, + "learning_rate": 1.793636836709057e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.937691605091095, + "num_tokens": 33749618.0, + "step": 540 + }, + { + "entropy": 0.15800465345382692, + "epoch": 0.29082177161152617, + "grad_norm": 0.2572762932952313, + "learning_rate": 1.7879342683822683e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.937437629699707, + "num_tokens": 34059695.0, + "step": 545 + }, + { + "entropy": 0.15476928949356078, + "epoch": 0.2934898612593383, + "grad_norm": 0.23558110079948988, + "learning_rate": 1.7821633070858855e-05, + "loss": 0.1559, + "mean_token_accuracy": 0.9386150956153869, + "num_tokens": 34371953.0, + "step": 550 + }, + { + "entropy": 0.15539830029010773, + "epoch": 0.29615795090715047, + "grad_norm": 0.278660274756863, + "learning_rate": 1.776324453741365e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9385053515434265, + "num_tokens": 34685055.0, + "step": 555 + }, + { + "entropy": 0.15808961391448975, + "epoch": 0.29882604055496265, + "grad_norm": 0.27320357497494924, + "learning_rate": 1.770418215163215e-05, + "loss": 0.1579, + "mean_token_accuracy": 0.9380279421806336, + "num_tokens": 34997194.0, + "step": 560 + }, + { + "entropy": 0.15648388266563415, + "epoch": 0.3014941302027748, + "grad_norm": 0.26423644669523827, + "learning_rate": 1.764445104015007e-05, + "loss": 0.1574, + "mean_token_accuracy": 0.9378781914710999, + "num_tokens": 35304669.0, + "step": 565 + }, + { + "entropy": 0.15613919198513032, + "epoch": 0.304162219850587, + "grad_norm": 0.2322264198496824, + "learning_rate": 1.7584056387648727e-05, + "loss": 0.1574, + "mean_token_accuracy": 0.9375425100326538, + "num_tokens": 35615302.0, + "step": 570 + }, + { + "entropy": 0.15673792064189912, + "epoch": 0.3068303094983991, + "grad_norm": 0.27604927624321784, + "learning_rate": 1.7523003436405055e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9383435010910034, + "num_tokens": 35928610.0, + "step": 575 + }, + { + "entropy": 0.15415124595165253, + "epoch": 0.3094983991462113, + "grad_norm": 0.28251874667852384, + "learning_rate": 1.7461297485836537e-05, + "loss": 0.1548, + "mean_token_accuracy": 0.938884687423706, + "num_tokens": 36243859.0, + "step": 580 + }, + { + "entropy": 0.15712933540344237, + "epoch": 0.3121664887940235, + "grad_norm": 0.22839995512002847, + "learning_rate": 1.7398943892041223e-05, + "loss": 0.1571, + "mean_token_accuracy": 0.9381480932235717, + "num_tokens": 36554595.0, + "step": 585 + }, + { + "entropy": 0.1551433563232422, + "epoch": 0.31483457844183566, + "grad_norm": 0.280231703238132, + "learning_rate": 1.7335948067332827e-05, + "loss": 0.1561, + "mean_token_accuracy": 0.9383224487304688, + "num_tokens": 36869542.0, + "step": 590 + }, + { + "entropy": 0.15621574819087983, + "epoch": 0.31750266808964783, + "grad_norm": 0.23211378761666593, + "learning_rate": 1.7272315479770925e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9376968264579773, + "num_tokens": 37179670.0, + "step": 595 + }, + { + "entropy": 0.15389657020568848, + "epoch": 0.32017075773745995, + "grad_norm": 0.23153822744101846, + "learning_rate": 1.7208051652686335e-05, + "loss": 0.1533, + "mean_token_accuracy": 0.9396386504173279, + "num_tokens": 37496771.0, + "step": 600 + }, + { + "entropy": 0.1562306672334671, + "epoch": 0.32283884738527213, + "grad_norm": 0.23042754503602308, + "learning_rate": 1.714316216420169e-05, + "loss": 0.1565, + "mean_token_accuracy": 0.9387167930603028, + "num_tokens": 37810099.0, + "step": 605 + }, + { + "entropy": 0.15647551119327546, + "epoch": 0.3255069370330843, + "grad_norm": 0.22042753143681787, + "learning_rate": 1.707765264674724e-05, + "loss": 0.1569, + "mean_token_accuracy": 0.9378628253936767, + "num_tokens": 38121465.0, + "step": 610 + }, + { + "entropy": 0.1532812863588333, + "epoch": 0.3281750266808965, + "grad_norm": 0.26050597368266426, + "learning_rate": 1.701152878657197e-05, + "loss": 0.1546, + "mean_token_accuracy": 0.9390907406806945, + "num_tokens": 38434404.0, + "step": 615 + }, + { + "entropy": 0.15821815133094788, + "epoch": 0.33084311632870866, + "grad_norm": 0.26774466711519207, + "learning_rate": 1.6944796323250036e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9375768661499023, + "num_tokens": 38745997.0, + "step": 620 + }, + { + "entropy": 0.15520448386669158, + "epoch": 0.3335112059765208, + "grad_norm": 0.2555654904241375, + "learning_rate": 1.687746104918255e-05, + "loss": 0.1565, + "mean_token_accuracy": 0.938270628452301, + "num_tokens": 39058084.0, + "step": 625 + }, + { + "entropy": 0.15563291013240815, + "epoch": 0.33617929562433296, + "grad_norm": 0.24561720528977696, + "learning_rate": 1.6809528809094808e-05, + "loss": 0.1554, + "mean_token_accuracy": 0.9383365511894226, + "num_tokens": 39374950.0, + "step": 630 + }, + { + "entropy": 0.15381639301776887, + "epoch": 0.33884738527214514, + "grad_norm": 0.2752577369716408, + "learning_rate": 1.674100549952897e-05, + "loss": 0.1553, + "mean_token_accuracy": 0.9386968612670898, + "num_tokens": 39687975.0, + "step": 635 + }, + { + "entropy": 0.15738414824008942, + "epoch": 0.3415154749199573, + "grad_norm": 0.2557558969973054, + "learning_rate": 1.6671897068332232e-05, + "loss": 0.1558, + "mean_token_accuracy": 0.9381865382194519, + "num_tokens": 40001067.0, + "step": 640 + }, + { + "entropy": 0.15551066398620605, + "epoch": 0.3441835645677695, + "grad_norm": 0.22019467451935834, + "learning_rate": 1.6602209514140552e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9382826447486877, + "num_tokens": 40312428.0, + "step": 645 + }, + { + "entropy": 0.15661275684833526, + "epoch": 0.3468516542155816, + "grad_norm": 0.22547104678195906, + "learning_rate": 1.6531948885857957e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9382068991661072, + "num_tokens": 40621016.0, + "step": 650 + }, + { + "entropy": 0.15573394000530244, + "epoch": 0.3495197438633938, + "grad_norm": 0.22623521473974365, + "learning_rate": 1.6461121282131518e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9383347630500793, + "num_tokens": 40935624.0, + "step": 655 + }, + { + "entropy": 0.15639272928237916, + "epoch": 0.35218783351120597, + "grad_norm": 0.21033908601468662, + "learning_rate": 1.6389732850821967e-05, + "loss": 0.156, + "mean_token_accuracy": 0.938245439529419, + "num_tokens": 41247131.0, + "step": 660 + }, + { + "entropy": 0.15640448331832885, + "epoch": 0.35485592315901815, + "grad_norm": 0.23057738411070072, + "learning_rate": 1.6317789788470063e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9381119847297669, + "num_tokens": 41558078.0, + "step": 665 + }, + { + "entropy": 0.15540762543678283, + "epoch": 0.3575240128068303, + "grad_norm": 0.2034575643460659, + "learning_rate": 1.624529833975874e-05, + "loss": 0.1554, + "mean_token_accuracy": 0.9385368585586548, + "num_tokens": 41871920.0, + "step": 670 + }, + { + "entropy": 0.15528062880039215, + "epoch": 0.36019210245464245, + "grad_norm": 0.20896524280441509, + "learning_rate": 1.617226479697105e-05, + "loss": 0.1559, + "mean_token_accuracy": 0.9384117603302002, + "num_tokens": 42181375.0, + "step": 675 + }, + { + "entropy": 0.15409398078918457, + "epoch": 0.3628601921024546, + "grad_norm": 0.2153970667083397, + "learning_rate": 1.609869549944401e-05, + "loss": 0.1554, + "mean_token_accuracy": 0.938829505443573, + "num_tokens": 42495107.0, + "step": 680 + }, + { + "entropy": 0.1557753324508667, + "epoch": 0.3655282817502668, + "grad_norm": 0.25314972577609657, + "learning_rate": 1.6024596833018335e-05, + "loss": 0.1546, + "mean_token_accuracy": 0.938931691646576, + "num_tokens": 42807362.0, + "step": 685 + }, + { + "entropy": 0.15671408474445342, + "epoch": 0.368196371398079, + "grad_norm": 0.23786555264931808, + "learning_rate": 1.5949975229484132e-05, + "loss": 0.1569, + "mean_token_accuracy": 0.9379519820213318, + "num_tokens": 43118231.0, + "step": 690 + }, + { + "entropy": 0.1554604947566986, + "epoch": 0.37086446104589116, + "grad_norm": 0.21787664753869665, + "learning_rate": 1.587483716602265e-05, + "loss": 0.1566, + "mean_token_accuracy": 0.9381156921386719, + "num_tokens": 43429978.0, + "step": 695 + }, + { + "entropy": 0.1558491349220276, + "epoch": 0.37353255069370334, + "grad_norm": 0.20339100359830212, + "learning_rate": 1.5799189164644024e-05, + "loss": 0.1573, + "mean_token_accuracy": 0.9377960801124573, + "num_tokens": 43739241.0, + "step": 700 + }, + { + "entropy": 0.15804753303527833, + "epoch": 0.37620064034151546, + "grad_norm": 0.19712038739819182, + "learning_rate": 1.5723037791621193e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9380033731460571, + "num_tokens": 44055301.0, + "step": 705 + }, + { + "entropy": 0.15700536370277404, + "epoch": 0.37886872998932764, + "grad_norm": 0.2134151511615338, + "learning_rate": 1.5646389656919912e-05, + "loss": 0.1574, + "mean_token_accuracy": 0.937994635105133, + "num_tokens": 44367651.0, + "step": 710 + }, + { + "entropy": 0.15613543689250947, + "epoch": 0.3815368196371398, + "grad_norm": 0.21307800352503886, + "learning_rate": 1.5569251413625043e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9383546113967896, + "num_tokens": 44679783.0, + "step": 715 + }, + { + "entropy": 0.1584879845380783, + "epoch": 0.384204909284952, + "grad_norm": 0.2044418124578672, + "learning_rate": 1.5491629757363033e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9375457048416138, + "num_tokens": 44990142.0, + "step": 720 + }, + { + "entropy": 0.15449098348617554, + "epoch": 0.38687299893276417, + "grad_norm": 0.1916484977488963, + "learning_rate": 1.5413531425720747e-05, + "loss": 0.1549, + "mean_token_accuracy": 0.938965666294098, + "num_tokens": 45300580.0, + "step": 725 + }, + { + "entropy": 0.1531301200389862, + "epoch": 0.3895410885805763, + "grad_norm": 0.19224448782508818, + "learning_rate": 1.533496319766064e-05, + "loss": 0.1551, + "mean_token_accuracy": 0.9387568950653076, + "num_tokens": 45613328.0, + "step": 730 + }, + { + "entropy": 0.1593233525753021, + "epoch": 0.39220917822838847, + "grad_norm": 0.22677980229806463, + "learning_rate": 1.5255931892932333e-05, + "loss": 0.1591, + "mean_token_accuracy": 0.9373407602310181, + "num_tokens": 45919600.0, + "step": 735 + }, + { + "entropy": 0.15600906312465668, + "epoch": 0.39487726787620064, + "grad_norm": 0.21468441059311985, + "learning_rate": 1.5176444371480673e-05, + "loss": 0.1563, + "mean_token_accuracy": 0.9382573127746582, + "num_tokens": 46231651.0, + "step": 740 + }, + { + "entropy": 0.1550221174955368, + "epoch": 0.3975453575240128, + "grad_norm": 0.2628525776434047, + "learning_rate": 1.5096507532850275e-05, + "loss": 0.1549, + "mean_token_accuracy": 0.9392532229423523, + "num_tokens": 46545633.0, + "step": 745 + }, + { + "entropy": 0.15294830203056337, + "epoch": 0.400213447171825, + "grad_norm": 0.23137778372846649, + "learning_rate": 1.501612831558664e-05, + "loss": 0.1541, + "mean_token_accuracy": 0.9392737984657288, + "num_tokens": 46861243.0, + "step": 750 + }, + { + "entropy": 0.15558450818061828, + "epoch": 0.4028815368196371, + "grad_norm": 0.19570194195754864, + "learning_rate": 1.493531369663389e-05, + "loss": 0.1547, + "mean_token_accuracy": 0.9386296153068543, + "num_tokens": 47176657.0, + "step": 755 + }, + { + "entropy": 0.15616436898708344, + "epoch": 0.4055496264674493, + "grad_norm": 0.20951796443416532, + "learning_rate": 1.4854070690729165e-05, + "loss": 0.1558, + "mean_token_accuracy": 0.9383723020553589, + "num_tokens": 47490967.0, + "step": 760 + }, + { + "entropy": 0.15231897830963134, + "epoch": 0.4082177161152615, + "grad_norm": 0.2117819997652048, + "learning_rate": 1.4772406349793744e-05, + "loss": 0.1551, + "mean_token_accuracy": 0.9383639574050904, + "num_tokens": 47801604.0, + "step": 765 + }, + { + "entropy": 0.15625073611736298, + "epoch": 0.41088580576307365, + "grad_norm": 0.21291868754948795, + "learning_rate": 1.4690327762320931e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9382084488868714, + "num_tokens": 48114796.0, + "step": 770 + }, + { + "entropy": 0.15438767075538634, + "epoch": 0.41355389541088583, + "grad_norm": 0.2018230333685515, + "learning_rate": 1.4607842052760777e-05, + "loss": 0.154, + "mean_token_accuracy": 0.9390445947647095, + "num_tokens": 48429003.0, + "step": 775 + }, + { + "entropy": 0.1527568519115448, + "epoch": 0.41622198505869795, + "grad_norm": 0.21050953897057828, + "learning_rate": 1.4524956380901669e-05, + "loss": 0.154, + "mean_token_accuracy": 0.9386907339096069, + "num_tokens": 48742907.0, + "step": 780 + }, + { + "entropy": 0.15494268834590913, + "epoch": 0.41889007470651013, + "grad_norm": 0.24431428774607528, + "learning_rate": 1.4441677941248862e-05, + "loss": 0.155, + "mean_token_accuracy": 0.938653540611267, + "num_tokens": 49054937.0, + "step": 785 + }, + { + "entropy": 0.15730642080307006, + "epoch": 0.4215581643543223, + "grad_norm": 0.21834695991194622, + "learning_rate": 1.4358013962399998e-05, + "loss": 0.1592, + "mean_token_accuracy": 0.9372829079627991, + "num_tokens": 49363498.0, + "step": 790 + }, + { + "entropy": 0.15722571909427643, + "epoch": 0.4242262540021345, + "grad_norm": 0.20605119981337897, + "learning_rate": 1.4273971706417648e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9383088946342468, + "num_tokens": 49675352.0, + "step": 795 + }, + { + "entropy": 0.15412231981754304, + "epoch": 0.42689434364994666, + "grad_norm": 0.2264335394675343, + "learning_rate": 1.4189558468198973e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.93870609998703, + "num_tokens": 49987958.0, + "step": 800 + }, + { + "entropy": 0.1554412692785263, + "epoch": 0.4295624332977588, + "grad_norm": 0.26579388260223064, + "learning_rate": 1.4104781574842524e-05, + "loss": 0.1571, + "mean_token_accuracy": 0.9380474090576172, + "num_tokens": 50301027.0, + "step": 805 + }, + { + "entropy": 0.1543067067861557, + "epoch": 0.43223052294557096, + "grad_norm": 0.2559081768640021, + "learning_rate": 1.4019648385012243e-05, + "loss": 0.1537, + "mean_token_accuracy": 0.9395334362983704, + "num_tokens": 50614978.0, + "step": 810 + }, + { + "entropy": 0.15630431473255157, + "epoch": 0.43489861259338314, + "grad_norm": 0.25616045587986713, + "learning_rate": 1.3934166288298729e-05, + "loss": 0.1558, + "mean_token_accuracy": 0.9386638164520263, + "num_tokens": 50929542.0, + "step": 815 + }, + { + "entropy": 0.15396025478839875, + "epoch": 0.4375667022411953, + "grad_norm": 0.23184443832119792, + "learning_rate": 1.3848342704577824e-05, + "loss": 0.1555, + "mean_token_accuracy": 0.9384626746177673, + "num_tokens": 51241435.0, + "step": 820 + }, + { + "entropy": 0.15652641355991365, + "epoch": 0.4402347918890075, + "grad_norm": 0.22777748094798117, + "learning_rate": 1.3762185083366557e-05, + "loss": 0.156, + "mean_token_accuracy": 0.938478434085846, + "num_tokens": 51553060.0, + "step": 825 + }, + { + "entropy": 0.15561273396015168, + "epoch": 0.4429028815368196, + "grad_norm": 0.22271913073510122, + "learning_rate": 1.3675700903176534e-05, + "loss": 0.1558, + "mean_token_accuracy": 0.9381041049957275, + "num_tokens": 51865184.0, + "step": 830 + }, + { + "entropy": 0.15462508499622346, + "epoch": 0.4455709711846318, + "grad_norm": 0.22905739195488695, + "learning_rate": 1.3588897670864787e-05, + "loss": 0.156, + "mean_token_accuracy": 0.9383329391479492, + "num_tokens": 52179580.0, + "step": 835 + }, + { + "entropy": 0.15535084903240204, + "epoch": 0.44823906083244397, + "grad_norm": 0.20758795386301682, + "learning_rate": 1.3501782920982185e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9387192964553833, + "num_tokens": 52493650.0, + "step": 840 + }, + { + "entropy": 0.15626255571842193, + "epoch": 0.45090715048025615, + "grad_norm": 0.26357886940476394, + "learning_rate": 1.3414364215119436e-05, + "loss": 0.157, + "mean_token_accuracy": 0.9378098011016845, + "num_tokens": 52805986.0, + "step": 845 + }, + { + "entropy": 0.15512668788433076, + "epoch": 0.4535752401280683, + "grad_norm": 0.2687039281213037, + "learning_rate": 1.3326649141250731e-05, + "loss": 0.1547, + "mean_token_accuracy": 0.9390361428260803, + "num_tokens": 53119855.0, + "step": 850 + }, + { + "entropy": 0.15508779585361482, + "epoch": 0.45624332977588045, + "grad_norm": 0.23811379871161598, + "learning_rate": 1.3238645313075104e-05, + "loss": 0.1555, + "mean_token_accuracy": 0.9387191772460938, + "num_tokens": 53430994.0, + "step": 855 + }, + { + "entropy": 0.15608465075492858, + "epoch": 0.4589114194236926, + "grad_norm": 0.19296201717409298, + "learning_rate": 1.315036036935557e-05, + "loss": 0.1567, + "mean_token_accuracy": 0.9383533000946045, + "num_tokens": 53743754.0, + "step": 860 + }, + { + "entropy": 0.15427376329898834, + "epoch": 0.4615795090715048, + "grad_norm": 0.19057043944008742, + "learning_rate": 1.3061801973256066e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9388363599777222, + "num_tokens": 54060087.0, + "step": 865 + }, + { + "entropy": 0.15521352589130402, + "epoch": 0.464247598719317, + "grad_norm": 0.26017936933633373, + "learning_rate": 1.2972977811676286e-05, + "loss": 0.1557, + "mean_token_accuracy": 0.9386460781097412, + "num_tokens": 54371977.0, + "step": 870 + }, + { + "entropy": 0.15631742775440216, + "epoch": 0.46691568836712916, + "grad_norm": 0.21790366736450598, + "learning_rate": 1.2883895594584474e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9381857514381409, + "num_tokens": 54682077.0, + "step": 875 + }, + { + "entropy": 0.15416281521320344, + "epoch": 0.4695837780149413, + "grad_norm": 0.1978743033527956, + "learning_rate": 1.2794563054348173e-05, + "loss": 0.1534, + "mean_token_accuracy": 0.9395832657814026, + "num_tokens": 54996694.0, + "step": 880 + }, + { + "entropy": 0.15204716920852662, + "epoch": 0.47225186766275346, + "grad_norm": 0.20919665202064539, + "learning_rate": 1.270498794506307e-05, + "loss": 0.1535, + "mean_token_accuracy": 0.9389811158180237, + "num_tokens": 55311736.0, + "step": 885 + }, + { + "entropy": 0.15359111726284028, + "epoch": 0.47491995731056563, + "grad_norm": 0.1959526143243109, + "learning_rate": 1.2615178041879918e-05, + "loss": 0.1536, + "mean_token_accuracy": 0.9392480969429016, + "num_tokens": 55627584.0, + "step": 890 + }, + { + "entropy": 0.1572214722633362, + "epoch": 0.4775880469583778, + "grad_norm": 0.21946848825899526, + "learning_rate": 1.2525141140329673e-05, + "loss": 0.1573, + "mean_token_accuracy": 0.938277006149292, + "num_tokens": 55937026.0, + "step": 895 + }, + { + "entropy": 0.15423589944839478, + "epoch": 0.48025613660619, + "grad_norm": 0.22980710459603573, + "learning_rate": 1.2434885055646823e-05, + "loss": 0.1545, + "mean_token_accuracy": 0.9389993071556091, + "num_tokens": 56250825.0, + "step": 900 + }, + { + "entropy": 0.15658611953258514, + "epoch": 0.4829242262540021, + "grad_norm": 0.24487921513464042, + "learning_rate": 1.2344417622091032e-05, + "loss": 0.156, + "mean_token_accuracy": 0.9381351351737977, + "num_tokens": 56560445.0, + "step": 905 + }, + { + "entropy": 0.15108625292778016, + "epoch": 0.4855923159018143, + "grad_norm": 0.2301434106979841, + "learning_rate": 1.2253746692267119e-05, + "loss": 0.1528, + "mean_token_accuracy": 0.9396188139915467, + "num_tokens": 56877321.0, + "step": 910 + }, + { + "entropy": 0.1589755594730377, + "epoch": 0.48826040554962646, + "grad_norm": 0.25294725895747705, + "learning_rate": 1.2162880136443447e-05, + "loss": 0.1589, + "mean_token_accuracy": 0.9371992588043213, + "num_tokens": 57185139.0, + "step": 915 + }, + { + "entropy": 0.15456578135490417, + "epoch": 0.49092849519743864, + "grad_norm": 0.20948010370122902, + "learning_rate": 1.2071825841868788e-05, + "loss": 0.1543, + "mean_token_accuracy": 0.9389463782310485, + "num_tokens": 57499589.0, + "step": 920 + }, + { + "entropy": 0.15248561203479766, + "epoch": 0.4935965848452508, + "grad_norm": 0.19733552881311944, + "learning_rate": 1.19805917120877e-05, + "loss": 0.1534, + "mean_token_accuracy": 0.9392978549003601, + "num_tokens": 57817486.0, + "step": 925 + }, + { + "entropy": 0.15405131876468658, + "epoch": 0.49626467449306294, + "grad_norm": 0.18714177298536025, + "learning_rate": 1.1889185666254505e-05, + "loss": 0.1539, + "mean_token_accuracy": 0.9388469815254211, + "num_tokens": 58133143.0, + "step": 930 + }, + { + "entropy": 0.15573233962059022, + "epoch": 0.4989327641408751, + "grad_norm": 0.21350853503849604, + "learning_rate": 1.179761563844589e-05, + "loss": 0.1547, + "mean_token_accuracy": 0.9387565493583679, + "num_tokens": 58448138.0, + "step": 935 + }, + { + "entropy": 0.15494557321071625, + "epoch": 0.5016008537886874, + "grad_norm": 0.17864580074050054, + "learning_rate": 1.1705889576972243e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9387572526931762, + "num_tokens": 58759567.0, + "step": 940 + }, + { + "entropy": 0.15362054109573364, + "epoch": 0.5042689434364994, + "grad_norm": 0.17875371335143, + "learning_rate": 1.1614015443687723e-05, + "loss": 0.1541, + "mean_token_accuracy": 0.9391363382339477, + "num_tokens": 59072420.0, + "step": 945 + }, + { + "entropy": 0.15368084311485292, + "epoch": 0.5069370330843116, + "grad_norm": 0.20278372952945256, + "learning_rate": 1.1522001213299176e-05, + "loss": 0.1541, + "mean_token_accuracy": 0.9390037655830383, + "num_tokens": 59385414.0, + "step": 950 + }, + { + "entropy": 0.15587687492370605, + "epoch": 0.5096051227321238, + "grad_norm": 0.20036906160941148, + "learning_rate": 1.1429854872673927e-05, + "loss": 0.1557, + "mean_token_accuracy": 0.9383829355239868, + "num_tokens": 59698209.0, + "step": 955 + }, + { + "entropy": 0.15657925903797149, + "epoch": 0.512273212379936, + "grad_norm": 0.2518678485978799, + "learning_rate": 1.133758442014651e-05, + "loss": 0.1575, + "mean_token_accuracy": 0.9377983450889588, + "num_tokens": 60007386.0, + "step": 960 + }, + { + "entropy": 0.1564165860414505, + "epoch": 0.5149413020277481, + "grad_norm": 0.21361188248461715, + "learning_rate": 1.1245197864824426e-05, + "loss": 0.1567, + "mean_token_accuracy": 0.9377204418182373, + "num_tokens": 60317545.0, + "step": 965 + }, + { + "entropy": 0.1573189228773117, + "epoch": 0.5176093916755603, + "grad_norm": 0.21766212031152615, + "learning_rate": 1.1152703225892929e-05, + "loss": 0.1563, + "mean_token_accuracy": 0.938551950454712, + "num_tokens": 60629190.0, + "step": 970 + }, + { + "entropy": 0.15436407625675203, + "epoch": 0.5202774813233725, + "grad_norm": 0.260368916857354, + "learning_rate": 1.1060108531918972e-05, + "loss": 0.1553, + "mean_token_accuracy": 0.9390477299690246, + "num_tokens": 60941397.0, + "step": 975 + }, + { + "entropy": 0.15507691204547883, + "epoch": 0.5229455709711847, + "grad_norm": 0.1962543554869463, + "learning_rate": 1.0967421820154319e-05, + "loss": 0.1564, + "mean_token_accuracy": 0.9386341333389282, + "num_tokens": 61250900.0, + "step": 980 + }, + { + "entropy": 0.15573692321777344, + "epoch": 0.5256136606189968, + "grad_norm": 0.22480820207139385, + "learning_rate": 1.0874651135837912e-05, + "loss": 0.1557, + "mean_token_accuracy": 0.9386470794677735, + "num_tokens": 61562372.0, + "step": 985 + }, + { + "entropy": 0.15311691761016846, + "epoch": 0.528281750266809, + "grad_norm": 0.20634597184754916, + "learning_rate": 1.078180453149754e-05, + "loss": 0.1527, + "mean_token_accuracy": 0.9395962953567505, + "num_tokens": 61880598.0, + "step": 990 + }, + { + "entropy": 0.15343199968338012, + "epoch": 0.5309498399146211, + "grad_norm": 0.1883659554769896, + "learning_rate": 1.0688890066250876e-05, + "loss": 0.1528, + "mean_token_accuracy": 0.9397060394287109, + "num_tokens": 62193244.0, + "step": 995 + }, + { + "entropy": 0.15367394387722016, + "epoch": 0.5336179295624333, + "grad_norm": 0.199812419140156, + "learning_rate": 1.0595915805105945e-05, + "loss": 0.1543, + "mean_token_accuracy": 0.9393554091453552, + "num_tokens": 62507530.0, + "step": 1000 + }, + { + "epoch": 0.5336179295624333, + "eval_entropy": 0.1538348892067052, + "eval_loss": 0.15534012019634247, + "eval_mean_token_accuracy": 0.9388755446747888, + "eval_num_tokens": 62507530.0, + "eval_runtime": 17.6249, + "eval_samples_per_second": 283.69, + "eval_steps_per_second": 4.482, + "step": 1000 + }, + { + "entropy": 0.15436525344848634, + "epoch": 0.5362860192102454, + "grad_norm": 0.19576898569996617, + "learning_rate": 1.0502889818261075e-05, + "loss": 0.1554, + "mean_token_accuracy": 0.9384722709655762, + "num_tokens": 62818270.0, + "step": 1005 + }, + { + "entropy": 0.15489275455474855, + "epoch": 0.5389541088580576, + "grad_norm": 0.20047233121126237, + "learning_rate": 1.04098201804044e-05, + "loss": 0.1542, + "mean_token_accuracy": 0.9395401954650879, + "num_tokens": 63129173.0, + "step": 1010 + }, + { + "entropy": 0.15460718274116517, + "epoch": 0.5416221985058698, + "grad_norm": 0.19314743208982307, + "learning_rate": 1.031671497001298e-05, + "loss": 0.1548, + "mean_token_accuracy": 0.9389819502830505, + "num_tokens": 63440789.0, + "step": 1015 + }, + { + "entropy": 0.15471943616867065, + "epoch": 0.544290288153682, + "grad_norm": 0.23882474744234125, + "learning_rate": 1.0223582268651585e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.9385024428367614, + "num_tokens": 63752159.0, + "step": 1020 + }, + { + "entropy": 0.15407437086105347, + "epoch": 0.5469583778014941, + "grad_norm": 0.20795985933204394, + "learning_rate": 1.0130430160271215e-05, + "loss": 0.1551, + "mean_token_accuracy": 0.9385804533958435, + "num_tokens": 64063845.0, + "step": 1025 + }, + { + "entropy": 0.1559080123901367, + "epoch": 0.5496264674493063, + "grad_norm": 0.19061876365796376, + "learning_rate": 1.0037266730507394e-05, + "loss": 0.1551, + "mean_token_accuracy": 0.9385924577713013, + "num_tokens": 64376046.0, + "step": 1030 + }, + { + "entropy": 0.15590772330760955, + "epoch": 0.5522945570971185, + "grad_norm": 0.19744593251536702, + "learning_rate": 9.944100065978351e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.938746166229248, + "num_tokens": 64687857.0, + "step": 1035 + }, + { + "entropy": 0.15375302731990814, + "epoch": 0.5549626467449307, + "grad_norm": 0.19950393668783845, + "learning_rate": 9.850938253583102e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9387599468231201, + "num_tokens": 64998447.0, + "step": 1040 + }, + { + "entropy": 0.15291616320610046, + "epoch": 0.5576307363927427, + "grad_norm": 0.19509972625285316, + "learning_rate": 9.757789379799482e-06, + "loss": 0.1532, + "mean_token_accuracy": 0.9396419167518616, + "num_tokens": 65313380.0, + "step": 1045 + }, + { + "entropy": 0.15656664967536926, + "epoch": 0.5602988260405549, + "grad_norm": 0.20136336987602882, + "learning_rate": 9.664661529982261e-06, + "loss": 0.1559, + "mean_token_accuracy": 0.9382720947265625, + "num_tokens": 65624880.0, + "step": 1050 + }, + { + "entropy": 0.15257844924926758, + "epoch": 0.5629669156883671, + "grad_norm": 0.1796732976273371, + "learning_rate": 9.571562787661316e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9397455334663392, + "num_tokens": 65938756.0, + "step": 1055 + }, + { + "entropy": 0.15621959567070007, + "epoch": 0.5656350053361793, + "grad_norm": 0.190132157006254, + "learning_rate": 9.47850123383999e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9388103485107422, + "num_tokens": 66247340.0, + "step": 1060 + }, + { + "entropy": 0.15296345055103303, + "epoch": 0.5683030949839915, + "grad_norm": 0.17706775442899994, + "learning_rate": 9.385484946293636e-06, + "loss": 0.154, + "mean_token_accuracy": 0.9390620470046998, + "num_tokens": 66560725.0, + "step": 1065 + }, + { + "entropy": 0.1538401484489441, + "epoch": 0.5709711846318036, + "grad_norm": 0.23749640187460702, + "learning_rate": 9.292521998868498e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9392606258392334, + "num_tokens": 66871322.0, + "step": 1070 + }, + { + "entropy": 0.1535812646150589, + "epoch": 0.5736392742796158, + "grad_norm": 0.224556333435221, + "learning_rate": 9.199620460780867e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.93891681432724, + "num_tokens": 67181595.0, + "step": 1075 + }, + { + "entropy": 0.1573965609073639, + "epoch": 0.576307363927428, + "grad_norm": 0.19815175316382136, + "learning_rate": 9.106788395916679e-06, + "loss": 0.1565, + "mean_token_accuracy": 0.9381605625152588, + "num_tokens": 67492926.0, + "step": 1080 + }, + { + "entropy": 0.15403612852096557, + "epoch": 0.5789754535752402, + "grad_norm": 0.1840244632498643, + "learning_rate": 9.014033862131583e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9394134044647217, + "num_tokens": 67808031.0, + "step": 1085 + }, + { + "entropy": 0.15346834659576417, + "epoch": 0.5816435432230523, + "grad_norm": 0.21717458002978238, + "learning_rate": 8.92136491055149e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9388310194015503, + "num_tokens": 68119639.0, + "step": 1090 + }, + { + "entropy": 0.15523187518119813, + "epoch": 0.5843116328708644, + "grad_norm": 0.1840200166212005, + "learning_rate": 8.828789584873754e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9386077761650086, + "num_tokens": 68429603.0, + "step": 1095 + }, + { + "entropy": 0.15351728796958924, + "epoch": 0.5869797225186766, + "grad_norm": 0.21035998185165955, + "learning_rate": 8.736315920668968e-06, + "loss": 0.153, + "mean_token_accuracy": 0.939343523979187, + "num_tokens": 68741574.0, + "step": 1100 + }, + { + "entropy": 0.15493020713329314, + "epoch": 0.5896478121664888, + "grad_norm": 0.2262869104676659, + "learning_rate": 8.643951944683466e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9387609839439393, + "num_tokens": 69052450.0, + "step": 1105 + }, + { + "entropy": 0.1536796510219574, + "epoch": 0.5923159018143009, + "grad_norm": 0.2588445878760737, + "learning_rate": 8.551705674142618e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9383743643760681, + "num_tokens": 69363673.0, + "step": 1110 + }, + { + "entropy": 0.15553466677665712, + "epoch": 0.5949839914621131, + "grad_norm": 0.24222053348739628, + "learning_rate": 8.459585116054903e-06, + "loss": 0.1559, + "mean_token_accuracy": 0.9381522536277771, + "num_tokens": 69676750.0, + "step": 1115 + }, + { + "entropy": 0.15311868786811828, + "epoch": 0.5976520811099253, + "grad_norm": 0.17614863141030848, + "learning_rate": 8.367598266516934e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9392088413238525, + "num_tokens": 69991603.0, + "step": 1120 + }, + { + "entropy": 0.15428132712841033, + "epoch": 0.6003201707577375, + "grad_norm": 0.1998512753996338, + "learning_rate": 8.275753110019367e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.938707685470581, + "num_tokens": 70306692.0, + "step": 1125 + }, + { + "entropy": 0.15532322824001313, + "epoch": 0.6029882604055496, + "grad_norm": 0.24332810412676226, + "learning_rate": 8.184057618753849e-06, + "loss": 0.1556, + "mean_token_accuracy": 0.9380664229393005, + "num_tokens": 70618028.0, + "step": 1130 + }, + { + "entropy": 0.15342488884925842, + "epoch": 0.6056563500533618, + "grad_norm": 0.17933966138875423, + "learning_rate": 8.09251975192104e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9395575881004333, + "num_tokens": 70931744.0, + "step": 1135 + }, + { + "entropy": 0.1545114040374756, + "epoch": 0.608324439701174, + "grad_norm": 0.21466920390671304, + "learning_rate": 8.001147455039735e-06, + "loss": 0.1553, + "mean_token_accuracy": 0.9387154817581177, + "num_tokens": 71244224.0, + "step": 1140 + }, + { + "entropy": 0.1548311859369278, + "epoch": 0.6109925293489862, + "grad_norm": 0.24490082669063587, + "learning_rate": 7.909948659257213e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9394757270812988, + "num_tokens": 71558782.0, + "step": 1145 + }, + { + "entropy": 0.15591164231300353, + "epoch": 0.6136606189967982, + "grad_norm": 0.19745696634708404, + "learning_rate": 7.818931280660781e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9384572505950928, + "num_tokens": 71868069.0, + "step": 1150 + }, + { + "entropy": 0.15378665030002595, + "epoch": 0.6163287086446104, + "grad_norm": 0.18610092854623703, + "learning_rate": 7.72810321959068e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9386947751045227, + "num_tokens": 72179019.0, + "step": 1155 + }, + { + "entropy": 0.15749258995056153, + "epoch": 0.6189967982924226, + "grad_norm": 0.19344944196983696, + "learning_rate": 7.63747235995433e-06, + "loss": 0.1574, + "mean_token_accuracy": 0.9379115223884582, + "num_tokens": 72487777.0, + "step": 1160 + }, + { + "entropy": 0.151074355840683, + "epoch": 0.6216648879402348, + "grad_norm": 0.21666750191470913, + "learning_rate": 7.54704656854198e-06, + "loss": 0.1518, + "mean_token_accuracy": 0.9397258996963501, + "num_tokens": 72805762.0, + "step": 1165 + }, + { + "entropy": 0.1522180676460266, + "epoch": 0.624332977588047, + "grad_norm": 0.1819485518927904, + "learning_rate": 7.4568336943439055e-06, + "loss": 0.1524, + "mean_token_accuracy": 0.939635157585144, + "num_tokens": 73118436.0, + "step": 1170 + }, + { + "entropy": 0.15505608022212983, + "epoch": 0.6270010672358591, + "grad_norm": 0.17663094640924515, + "learning_rate": 7.3668415678690745e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9385998487472534, + "num_tokens": 73431334.0, + "step": 1175 + }, + { + "entropy": 0.15457534790039062, + "epoch": 0.6296691568836713, + "grad_norm": 0.19274527895852187, + "learning_rate": 7.277078000465487e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9389039039611816, + "num_tokens": 73742853.0, + "step": 1180 + }, + { + "entropy": 0.15383411943912506, + "epoch": 0.6323372465314835, + "grad_norm": 0.2045826633338559, + "learning_rate": 7.187550783642141e-06, + "loss": 0.1539, + "mean_token_accuracy": 0.9392240285873413, + "num_tokens": 74056292.0, + "step": 1185 + }, + { + "entropy": 0.15572645366191865, + "epoch": 0.6350053361792957, + "grad_norm": 0.1740911980701786, + "learning_rate": 7.098267688392702e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9382194876670837, + "num_tokens": 74369090.0, + "step": 1190 + }, + { + "entropy": 0.15480996966361998, + "epoch": 0.6376734258271078, + "grad_norm": 0.1958267352216853, + "learning_rate": 7.009236464521021e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.9386586546897888, + "num_tokens": 74680567.0, + "step": 1195 + }, + { + "entropy": 0.1536591023206711, + "epoch": 0.6403415154749199, + "grad_norm": 0.20969055435632733, + "learning_rate": 6.920464839968405e-06, + "loss": 0.1534, + "mean_token_accuracy": 0.9394211530685425, + "num_tokens": 74990800.0, + "step": 1200 + }, + { + "entropy": 0.1548810452222824, + "epoch": 0.6430096051227321, + "grad_norm": 0.17777373225867352, + "learning_rate": 6.831960520142868e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9388859510421753, + "num_tokens": 75303707.0, + "step": 1205 + }, + { + "entropy": 0.15442072451114655, + "epoch": 0.6456776947705443, + "grad_norm": 0.2054630769538203, + "learning_rate": 6.743731187250262e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9383596777915955, + "num_tokens": 75615973.0, + "step": 1210 + }, + { + "entropy": 0.15551903247833251, + "epoch": 0.6483457844183564, + "grad_norm": 0.2388470280787671, + "learning_rate": 6.655784499627491e-06, + "loss": 0.1555, + "mean_token_accuracy": 0.9386810302734375, + "num_tokens": 75926254.0, + "step": 1215 + }, + { + "entropy": 0.15150526762008668, + "epoch": 0.6510138740661686, + "grad_norm": 0.1896593043296809, + "learning_rate": 6.56812809107775e-06, + "loss": 0.1522, + "mean_token_accuracy": 0.9392547249794007, + "num_tokens": 76241390.0, + "step": 1220 + }, + { + "entropy": 0.15170768201351165, + "epoch": 0.6536819637139808, + "grad_norm": 0.17975435195878658, + "learning_rate": 6.480769570207897e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.939859139919281, + "num_tokens": 76556998.0, + "step": 1225 + }, + { + "entropy": 0.15482379198074342, + "epoch": 0.656350053361793, + "grad_norm": 0.2067120427304762, + "learning_rate": 6.393716519768047e-06, + "loss": 0.156, + "mean_token_accuracy": 0.9383298635482789, + "num_tokens": 76868707.0, + "step": 1230 + }, + { + "entropy": 0.1551853358745575, + "epoch": 0.6590181430096052, + "grad_norm": 0.17199612564283812, + "learning_rate": 6.306976495993373e-06, + "loss": 0.1531, + "mean_token_accuracy": 0.9393068075180053, + "num_tokens": 77181461.0, + "step": 1235 + }, + { + "entropy": 0.1531525045633316, + "epoch": 0.6616862326574173, + "grad_norm": 0.20032722239251838, + "learning_rate": 6.220557027948222e-06, + "loss": 0.1527, + "mean_token_accuracy": 0.940034294128418, + "num_tokens": 77494717.0, + "step": 1240 + }, + { + "entropy": 0.1529313951730728, + "epoch": 0.6643543223052295, + "grad_norm": 0.18132538819196686, + "learning_rate": 6.134465616872598e-06, + "loss": 0.1539, + "mean_token_accuracy": 0.9392167925834656, + "num_tokens": 77807893.0, + "step": 1245 + }, + { + "entropy": 0.154258593916893, + "epoch": 0.6670224119530416, + "grad_norm": 0.18572593550286834, + "learning_rate": 6.04870973553103e-06, + "loss": 0.155, + "mean_token_accuracy": 0.9384752988815308, + "num_tokens": 78115560.0, + "step": 1250 + }, + { + "entropy": 0.157048237323761, + "epoch": 0.6696905016008537, + "grad_norm": 0.19007712881646377, + "learning_rate": 5.963296827563969e-06, + "loss": 0.1565, + "mean_token_accuracy": 0.938205361366272, + "num_tokens": 78425131.0, + "step": 1255 + }, + { + "entropy": 0.15400311648845671, + "epoch": 0.6723585912486659, + "grad_norm": 0.17992885462267433, + "learning_rate": 5.878234306841637e-06, + "loss": 0.1539, + "mean_token_accuracy": 0.9387377262115478, + "num_tokens": 78740041.0, + "step": 1260 + }, + { + "entropy": 0.15463869869709015, + "epoch": 0.6750266808964781, + "grad_norm": 0.16967767258074606, + "learning_rate": 5.793529556820538e-06, + "loss": 0.1542, + "mean_token_accuracy": 0.9388965487480163, + "num_tokens": 79050088.0, + "step": 1265 + }, + { + "entropy": 0.1539893090724945, + "epoch": 0.6776947705442903, + "grad_norm": 0.2124727900050765, + "learning_rate": 5.709189929902551e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9387233018875122, + "num_tokens": 79363843.0, + "step": 1270 + }, + { + "entropy": 0.15476358234882354, + "epoch": 0.6803628601921025, + "grad_norm": 0.20572465697212475, + "learning_rate": 5.62522274679673e-06, + "loss": 0.1545, + "mean_token_accuracy": 0.9390263795852661, + "num_tokens": 79675480.0, + "step": 1275 + }, + { + "entropy": 0.15502644181251526, + "epoch": 0.6830309498399146, + "grad_norm": 0.16772458380589314, + "learning_rate": 5.541635295883889e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.9388966798782349, + "num_tokens": 79988876.0, + "step": 1280 + }, + { + "entropy": 0.15343300998210907, + "epoch": 0.6856990394877268, + "grad_norm": 0.17283095303374088, + "learning_rate": 5.45843483258395e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9392352104187012, + "num_tokens": 80301888.0, + "step": 1285 + }, + { + "entropy": 0.15344253182411194, + "epoch": 0.688367129135539, + "grad_norm": 0.1947170351124985, + "learning_rate": 5.375628578726181e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9390166163444519, + "num_tokens": 80613143.0, + "step": 1290 + }, + { + "entropy": 0.1538625419139862, + "epoch": 0.6910352187833512, + "grad_norm": 0.18817183057447215, + "learning_rate": 5.293223721922319e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9387935757637024, + "num_tokens": 80926795.0, + "step": 1295 + }, + { + "entropy": 0.15280040502548217, + "epoch": 0.6937033084311632, + "grad_norm": 0.1717728161068434, + "learning_rate": 5.21122741494271e-06, + "loss": 0.1532, + "mean_token_accuracy": 0.9387828230857849, + "num_tokens": 81239514.0, + "step": 1300 + }, + { + "entropy": 0.15432974696159363, + "epoch": 0.6963713980789754, + "grad_norm": 0.21208211624998147, + "learning_rate": 5.129646775095432e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9388492107391357, + "num_tokens": 81551899.0, + "step": 1305 + }, + { + "entropy": 0.15256355106830596, + "epoch": 0.6990394877267876, + "grad_norm": 0.18456842506003865, + "learning_rate": 5.048488883608496e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9395634889602661, + "num_tokens": 81868789.0, + "step": 1310 + }, + { + "entropy": 0.15440889596939086, + "epoch": 0.7017075773745998, + "grad_norm": 0.1800146180845423, + "learning_rate": 4.96776078501523e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.939136517047882, + "num_tokens": 82179997.0, + "step": 1315 + }, + { + "entropy": 0.15329372584819795, + "epoch": 0.7043756670224119, + "grad_norm": 0.19968494109740412, + "learning_rate": 4.8874694865427676e-06, + "loss": 0.1527, + "mean_token_accuracy": 0.939085054397583, + "num_tokens": 82496847.0, + "step": 1320 + }, + { + "entropy": 0.15478273034095763, + "epoch": 0.7070437566702241, + "grad_norm": 0.16598727253128187, + "learning_rate": 4.807621957503853e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.938567292690277, + "num_tokens": 82809223.0, + "step": 1325 + }, + { + "entropy": 0.15093289017677308, + "epoch": 0.7097118463180363, + "grad_norm": 0.17117692227910006, + "learning_rate": 4.728225128691888e-06, + "loss": 0.152, + "mean_token_accuracy": 0.9395733594894409, + "num_tokens": 83123357.0, + "step": 1330 + }, + { + "entropy": 0.1547090470790863, + "epoch": 0.7123799359658485, + "grad_norm": 0.19535886780678244, + "learning_rate": 4.649285891779327e-06, + "loss": 0.1534, + "mean_token_accuracy": 0.9391082644462585, + "num_tokens": 83436086.0, + "step": 1335 + }, + { + "entropy": 0.15347299575805665, + "epoch": 0.7150480256136607, + "grad_norm": 0.17698403460692924, + "learning_rate": 4.570811098719502e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9396489024162292, + "num_tokens": 83749042.0, + "step": 1340 + }, + { + "entropy": 0.15381872951984404, + "epoch": 0.7177161152614728, + "grad_norm": 0.17372376390391425, + "learning_rate": 4.492807561151841e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.938352620601654, + "num_tokens": 84063674.0, + "step": 1345 + }, + { + "entropy": 0.15317289531230927, + "epoch": 0.7203842049092849, + "grad_norm": 0.18379765550528124, + "learning_rate": 4.415282049810644e-06, + "loss": 0.1534, + "mean_token_accuracy": 0.9391525387763977, + "num_tokens": 84378034.0, + "step": 1350 + }, + { + "entropy": 0.15499058365821838, + "epoch": 0.7230522945570971, + "grad_norm": 0.16958667153352305, + "learning_rate": 4.338241293937362e-06, + "loss": 0.1543, + "mean_token_accuracy": 0.9386964082717896, + "num_tokens": 84690531.0, + "step": 1355 + }, + { + "entropy": 0.15222392082214356, + "epoch": 0.7257203842049093, + "grad_norm": 0.19889195844150365, + "learning_rate": 4.261691980696502e-06, + "loss": 0.1522, + "mean_token_accuracy": 0.939692759513855, + "num_tokens": 85003751.0, + "step": 1360 + }, + { + "entropy": 0.1543875366449356, + "epoch": 0.7283884738527214, + "grad_norm": 0.17327562385573972, + "learning_rate": 4.185640754595183e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9386188387870789, + "num_tokens": 85315782.0, + "step": 1365 + }, + { + "entropy": 0.15334829092025756, + "epoch": 0.7310565635005336, + "grad_norm": 0.18169650993402348, + "learning_rate": 4.110094216906377e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9394148349761963, + "num_tokens": 85629325.0, + "step": 1370 + }, + { + "entropy": 0.15514211356639862, + "epoch": 0.7337246531483458, + "grad_norm": 0.1769358845517928, + "learning_rate": 4.03505892509594e-06, + "loss": 0.1556, + "mean_token_accuracy": 0.9386692762374877, + "num_tokens": 85936223.0, + "step": 1375 + }, + { + "entropy": 0.15412222743034362, + "epoch": 0.736392742796158, + "grad_norm": 0.18204823506406237, + "learning_rate": 3.960541392253387e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9394712686538697, + "num_tokens": 86249477.0, + "step": 1380 + }, + { + "entropy": 0.15360663831233978, + "epoch": 0.7390608324439701, + "grad_norm": 0.14263985064175763, + "learning_rate": 3.886548086526594e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9389709830284119, + "num_tokens": 86559839.0, + "step": 1385 + }, + { + "entropy": 0.1553943246603012, + "epoch": 0.7417289220917823, + "grad_norm": 0.17680317343436647, + "learning_rate": 3.8130854305603325e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.938764214515686, + "num_tokens": 86870592.0, + "step": 1390 + }, + { + "entropy": 0.15353876650333403, + "epoch": 0.7443970117395945, + "grad_norm": 0.18044428125254167, + "learning_rate": 3.740159800938784e-06, + "loss": 0.155, + "mean_token_accuracy": 0.9386942744255066, + "num_tokens": 87178900.0, + "step": 1395 + }, + { + "entropy": 0.15736522674560546, + "epoch": 0.7470651013874067, + "grad_norm": 0.19735522854846968, + "learning_rate": 3.667777527632066e-06, + "loss": 0.1564, + "mean_token_accuracy": 0.9383513927459717, + "num_tokens": 87489168.0, + "step": 1400 + }, + { + "entropy": 0.1552681565284729, + "epoch": 0.7497331910352187, + "grad_norm": 0.16086661998527077, + "learning_rate": 3.595944893446768e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9392289519309998, + "num_tokens": 87800261.0, + "step": 1405 + }, + { + "entropy": 0.15531840324401855, + "epoch": 0.7524012806830309, + "grad_norm": 0.1626277417036322, + "learning_rate": 3.5246681334806177e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.9387146353721618, + "num_tokens": 88110224.0, + "step": 1410 + }, + { + "entropy": 0.153993684053421, + "epoch": 0.7550693703308431, + "grad_norm": 0.17744646490172236, + "learning_rate": 3.453953434581271e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9392658829689026, + "num_tokens": 88421782.0, + "step": 1415 + }, + { + "entropy": 0.15304548144340516, + "epoch": 0.7577374599786553, + "grad_norm": 0.16562692978370455, + "learning_rate": 3.383806934809274e-06, + "loss": 0.1527, + "mean_token_accuracy": 0.939368200302124, + "num_tokens": 88735130.0, + "step": 1420 + }, + { + "entropy": 0.15535095632076262, + "epoch": 0.7604055496264674, + "grad_norm": 0.19753091851214719, + "learning_rate": 3.314234722905302e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.9386848092079163, + "num_tokens": 89046661.0, + "step": 1425 + }, + { + "entropy": 0.15178467333316803, + "epoch": 0.7630736392742796, + "grad_norm": 0.2163597483413995, + "learning_rate": 3.2452428377616373e-06, + "loss": 0.153, + "mean_token_accuracy": 0.939287257194519, + "num_tokens": 89360012.0, + "step": 1430 + }, + { + "entropy": 0.153414848446846, + "epoch": 0.7657417289220918, + "grad_norm": 0.1642251137191811, + "learning_rate": 3.176837267898002e-06, + "loss": 0.1526, + "mean_token_accuracy": 0.9395386815071106, + "num_tokens": 89673339.0, + "step": 1435 + }, + { + "entropy": 0.1548197388648987, + "epoch": 0.768409818569904, + "grad_norm": 0.18337676685613638, + "learning_rate": 3.1090239509417364e-06, + "loss": 0.1552, + "mean_token_accuracy": 0.9383951425552368, + "num_tokens": 89982235.0, + "step": 1440 + }, + { + "entropy": 0.15227739810943602, + "epoch": 0.7710779082177162, + "grad_norm": 0.15525725696787246, + "learning_rate": 3.0418087731124324e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9399517059326172, + "num_tokens": 90294967.0, + "step": 1445 + }, + { + "entropy": 0.15378317534923552, + "epoch": 0.7737459978655283, + "grad_norm": 0.16334353326521453, + "learning_rate": 2.9751975687109956e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9399662017822266, + "num_tokens": 90610717.0, + "step": 1450 + }, + { + "entropy": 0.15547354519367218, + "epoch": 0.7764140875133404, + "grad_norm": 0.18839341400355433, + "learning_rate": 2.909196119613218e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9385842323303223, + "num_tokens": 90923000.0, + "step": 1455 + }, + { + "entropy": 0.15278392732143403, + "epoch": 0.7790821771611526, + "grad_norm": 0.17757852890585457, + "learning_rate": 2.843810154767932e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9389323234558106, + "num_tokens": 91235879.0, + "step": 1460 + }, + { + "entropy": 0.15397576689720155, + "epoch": 0.7817502668089648, + "grad_norm": 0.15492266609314415, + "learning_rate": 2.779045349699708e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.9384764671325684, + "num_tokens": 91547713.0, + "step": 1465 + }, + { + "entropy": 0.1529139518737793, + "epoch": 0.7844183564567769, + "grad_norm": 0.17336896184779682, + "learning_rate": 2.7149073260162416e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9394277930259705, + "num_tokens": 91859166.0, + "step": 1470 + }, + { + "entropy": 0.15577988028526307, + "epoch": 0.7870864461045891, + "grad_norm": 0.18392391814457118, + "learning_rate": 2.6514016509203843e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.939464557170868, + "num_tokens": 92173028.0, + "step": 1475 + }, + { + "entropy": 0.15315627455711364, + "epoch": 0.7897545357524013, + "grad_norm": 0.16770812808389177, + "learning_rate": 2.588533836726901e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9396589040756226, + "num_tokens": 92487360.0, + "step": 1480 + }, + { + "entropy": 0.15086018443107604, + "epoch": 0.7924226254002135, + "grad_norm": 0.1710231697211091, + "learning_rate": 2.5263093403840145e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9400036692619324, + "num_tokens": 92799729.0, + "step": 1485 + }, + { + "entropy": 0.15368268489837647, + "epoch": 0.7950907150480256, + "grad_norm": 0.16328574540278648, + "learning_rate": 2.464733562999737e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9391217470169068, + "num_tokens": 93109737.0, + "step": 1490 + }, + { + "entropy": 0.15320818424224852, + "epoch": 0.7977588046958378, + "grad_norm": 0.16400834989615978, + "learning_rate": 2.4038118493730366e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9395870327949524, + "num_tokens": 93418942.0, + "step": 1495 + }, + { + "entropy": 0.1534988135099411, + "epoch": 0.80042689434365, + "grad_norm": 0.17327724145407505, + "learning_rate": 2.3435494875299315e-06, + "loss": 0.1529, + "mean_token_accuracy": 0.9397768259048462, + "num_tokens": 93734428.0, + "step": 1500 + }, + { + "entropy": 0.15331260859966278, + "epoch": 0.8030949839914621, + "grad_norm": 0.1772078817717979, + "learning_rate": 2.283951708264468e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9394746541976928, + "num_tokens": 94046957.0, + "step": 1505 + }, + { + "entropy": 0.15226254165172576, + "epoch": 0.8057630736392742, + "grad_norm": 0.148092043865225, + "learning_rate": 2.2250236846846964e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9395742535591125, + "num_tokens": 94360546.0, + "step": 1510 + }, + { + "entropy": 0.1530079126358032, + "epoch": 0.8084311632870864, + "grad_norm": 0.16893142328693095, + "learning_rate": 2.1667705317636333e-06, + "loss": 0.153, + "mean_token_accuracy": 0.9392720460891724, + "num_tokens": 94675654.0, + "step": 1515 + }, + { + "entropy": 0.15376291871070863, + "epoch": 0.8110992529348986, + "grad_norm": 0.16634852143118453, + "learning_rate": 2.109197305895295e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.9388089418411255, + "num_tokens": 94985985.0, + "step": 1520 + }, + { + "entropy": 0.15282466411590576, + "epoch": 0.8137673425827108, + "grad_norm": 0.16251016165730156, + "learning_rate": 2.0523090044557824e-06, + "loss": 0.1529, + "mean_token_accuracy": 0.9392230749130249, + "num_tokens": 95299780.0, + "step": 1525 + }, + { + "entropy": 0.15135302543640136, + "epoch": 0.816435432230523, + "grad_norm": 0.16157546003997317, + "learning_rate": 1.996110565369527e-06, + "loss": 0.1512, + "mean_token_accuracy": 0.9394175171852112, + "num_tokens": 95614445.0, + "step": 1530 + }, + { + "entropy": 0.15374623239040375, + "epoch": 0.8191035218783351, + "grad_norm": 0.18259722867426076, + "learning_rate": 1.940606866680663e-06, + "loss": 0.152, + "mean_token_accuracy": 0.9394948601722717, + "num_tokens": 95931805.0, + "step": 1535 + }, + { + "entropy": 0.1537939041852951, + "epoch": 0.8217716115261473, + "grad_norm": 0.16334117902599396, + "learning_rate": 1.8858027261296108e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.939476215839386, + "num_tokens": 96245824.0, + "step": 1540 + }, + { + "entropy": 0.15337570607662201, + "epoch": 0.8244397011739595, + "grad_norm": 0.19316772677973434, + "learning_rate": 1.8317029007349086e-06, + "loss": 0.1541, + "mean_token_accuracy": 0.9392568707466126, + "num_tokens": 96556337.0, + "step": 1545 + }, + { + "entropy": 0.15260836780071257, + "epoch": 0.8271077908217717, + "grad_norm": 0.17986671972446405, + "learning_rate": 1.778312086380285e-06, + "loss": 0.1529, + "mean_token_accuracy": 0.9390943288803101, + "num_tokens": 96868925.0, + "step": 1550 + }, + { + "entropy": 0.15477918088436127, + "epoch": 0.8297758804695837, + "grad_norm": 0.16799503209068373, + "learning_rate": 1.7256349174070685e-06, + "loss": 0.1543, + "mean_token_accuracy": 0.9389909029006958, + "num_tokens": 97179267.0, + "step": 1555 + }, + { + "entropy": 0.15401456654071807, + "epoch": 0.8324439701173959, + "grad_norm": 0.1720200370324775, + "learning_rate": 1.6736759662119183e-06, + "loss": 0.1544, + "mean_token_accuracy": 0.9388466477394104, + "num_tokens": 97489742.0, + "step": 1560 + }, + { + "entropy": 0.15319326817989348, + "epoch": 0.8351120597652081, + "grad_norm": 0.15450019714705887, + "learning_rate": 1.6224397428499394e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.939484441280365, + "num_tokens": 97801684.0, + "step": 1565 + }, + { + "entropy": 0.15360201597213746, + "epoch": 0.8377801494130203, + "grad_norm": 0.177662173258833, + "learning_rate": 1.571930694643209e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9389990210533142, + "num_tokens": 98114314.0, + "step": 1570 + }, + { + "entropy": 0.15317412316799164, + "epoch": 0.8404482390608324, + "grad_norm": 0.15123693889172668, + "learning_rate": 1.522153205794742e-06, + "loss": 0.1526, + "mean_token_accuracy": 0.9393292784690856, + "num_tokens": 98431194.0, + "step": 1575 + }, + { + "entropy": 0.1568188637495041, + "epoch": 0.8431163287086446, + "grad_norm": 0.17518595443649973, + "learning_rate": 1.4731115970079557e-06, + "loss": 0.1566, + "mean_token_accuracy": 0.9382672905921936, + "num_tokens": 98738019.0, + "step": 1580 + }, + { + "entropy": 0.15534501969814302, + "epoch": 0.8457844183564568, + "grad_norm": 0.1789791259956949, + "learning_rate": 1.4248101251116075e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.9392199039459228, + "num_tokens": 99046421.0, + "step": 1585 + }, + { + "entropy": 0.15555815100669862, + "epoch": 0.848452508004269, + "grad_norm": 0.16712869095236838, + "learning_rate": 1.377252982690327e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9391135573387146, + "num_tokens": 99356306.0, + "step": 1590 + }, + { + "entropy": 0.15490398705005645, + "epoch": 0.8511205976520811, + "grad_norm": 0.1854224178958908, + "learning_rate": 1.3304442977206822e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9389813899993896, + "num_tokens": 99664488.0, + "step": 1595 + }, + { + "entropy": 0.15554279088974, + "epoch": 0.8537886872998933, + "grad_norm": 0.16236663288469788, + "learning_rate": 1.2843881332128649e-06, + "loss": 0.155, + "mean_token_accuracy": 0.9386430382728577, + "num_tokens": 99976581.0, + "step": 1600 + }, + { + "entropy": 0.151815003156662, + "epoch": 0.8564567769477054, + "grad_norm": 0.1704052427325928, + "learning_rate": 1.23908848685804e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9397621631622315, + "num_tokens": 100292769.0, + "step": 1605 + }, + { + "entropy": 0.15176697075366974, + "epoch": 0.8591248665955176, + "grad_norm": 0.1786854779796353, + "learning_rate": 1.1945492906813228e-06, + "loss": 0.1513, + "mean_token_accuracy": 0.940117597579956, + "num_tokens": 100607024.0, + "step": 1610 + }, + { + "entropy": 0.15364796221256255, + "epoch": 0.8617929562433297, + "grad_norm": 0.15429994926849608, + "learning_rate": 1.1507744107004937e-06, + "loss": 0.1533, + "mean_token_accuracy": 0.9390743613243103, + "num_tokens": 100919943.0, + "step": 1615 + }, + { + "entropy": 0.1543932318687439, + "epoch": 0.8644610458911419, + "grad_norm": 0.15625280035216202, + "learning_rate": 1.1077676465904209e-06, + "loss": 0.1538, + "mean_token_accuracy": 0.9391561985015869, + "num_tokens": 101227280.0, + "step": 1620 + }, + { + "entropy": 0.15176737308502197, + "epoch": 0.8671291355389541, + "grad_norm": 0.1456026421545509, + "learning_rate": 1.0655327313532393e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9398553967475891, + "num_tokens": 101541751.0, + "step": 1625 + }, + { + "entropy": 0.1538877785205841, + "epoch": 0.8697972251867663, + "grad_norm": 0.18290798792723664, + "learning_rate": 1.0240733309943408e-06, + "loss": 0.1536, + "mean_token_accuracy": 0.9390178799629212, + "num_tokens": 101852766.0, + "step": 1630 + }, + { + "entropy": 0.15422819554805756, + "epoch": 0.8724653148345785, + "grad_norm": 0.17873737576182977, + "learning_rate": 9.833930442041506e-07, + "loss": 0.1533, + "mean_token_accuracy": 0.9390836715698242, + "num_tokens": 102163302.0, + "step": 1635 + }, + { + "entropy": 0.15171377062797547, + "epoch": 0.8751334044823906, + "grad_norm": 0.15855802539068972, + "learning_rate": 9.434954020457676e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9397657155990601, + "num_tokens": 102475125.0, + "step": 1640 + }, + { + "entropy": 0.15260193645954132, + "epoch": 0.8778014941302028, + "grad_norm": 0.1726076552166529, + "learning_rate": 9.043838676484562e-07, + "loss": 0.1521, + "mean_token_accuracy": 0.9395378589630127, + "num_tokens": 102788987.0, + "step": 1645 + }, + { + "entropy": 0.15317799746990204, + "epoch": 0.880469583778015, + "grad_norm": 0.16488636114365046, + "learning_rate": 8.660618359070605e-07, + "loss": 0.1527, + "mean_token_accuracy": 0.9392109036445617, + "num_tokens": 103101810.0, + "step": 1650 + }, + { + "entropy": 0.15190463960170747, + "epoch": 0.8831376734258272, + "grad_norm": 0.18861785294109573, + "learning_rate": 8.285326331873156e-07, + "loss": 0.1522, + "mean_token_accuracy": 0.9398165941238403, + "num_tokens": 103415395.0, + "step": 1655 + }, + { + "entropy": 0.15620527565479278, + "epoch": 0.8858057630736392, + "grad_norm": 0.1661916421753367, + "learning_rate": 7.917995170371196e-07, + "loss": 0.1557, + "mean_token_accuracy": 0.9382588505744934, + "num_tokens": 103726299.0, + "step": 1660 + }, + { + "entropy": 0.15179810523986817, + "epoch": 0.8884738527214514, + "grad_norm": 0.156852390106292, + "learning_rate": 7.558656759037796e-07, + "loss": 0.152, + "mean_token_accuracy": 0.9394199013710022, + "num_tokens": 104041096.0, + "step": 1665 + }, + { + "entropy": 0.15225241482257842, + "epoch": 0.8911419423692636, + "grad_norm": 0.19547586460594443, + "learning_rate": 7.207342288572505e-07, + "loss": 0.1528, + "mean_token_accuracy": 0.9393445611000061, + "num_tokens": 104354220.0, + "step": 1670 + }, + { + "entropy": 0.15344537496566774, + "epoch": 0.8938100320170758, + "grad_norm": 0.16355590508798246, + "learning_rate": 6.864082253194027e-07, + "loss": 0.1531, + "mean_token_accuracy": 0.939066469669342, + "num_tokens": 104665581.0, + "step": 1675 + }, + { + "entropy": 0.1540704697370529, + "epoch": 0.8964781216648879, + "grad_norm": 0.14321416528661213, + "learning_rate": 6.528906447993289e-07, + "loss": 0.1542, + "mean_token_accuracy": 0.9388607501983642, + "num_tokens": 104975597.0, + "step": 1680 + }, + { + "entropy": 0.15550175607204436, + "epoch": 0.8991462113127001, + "grad_norm": 0.15605416020222349, + "learning_rate": 6.201843966347176e-07, + "loss": 0.1539, + "mean_token_accuracy": 0.9388849854469299, + "num_tokens": 105288459.0, + "step": 1685 + }, + { + "entropy": 0.15473953485488892, + "epoch": 0.9018143009605123, + "grad_norm": 0.17414650381201083, + "learning_rate": 5.882923197393331e-07, + "loss": 0.1539, + "mean_token_accuracy": 0.9390599966049195, + "num_tokens": 105602719.0, + "step": 1690 + }, + { + "entropy": 0.15401538014411925, + "epoch": 0.9044823906083245, + "grad_norm": 0.15830882724974832, + "learning_rate": 5.572171823565797e-07, + "loss": 0.1527, + "mean_token_accuracy": 0.9393938302993774, + "num_tokens": 105914271.0, + "step": 1695 + }, + { + "entropy": 0.15360169410705565, + "epoch": 0.9071504802561366, + "grad_norm": 0.15926528205721613, + "learning_rate": 5.269616818192335e-07, + "loss": 0.1535, + "mean_token_accuracy": 0.9388334393501282, + "num_tokens": 106227117.0, + "step": 1700 + }, + { + "entropy": 0.15220762491226197, + "epoch": 0.9098185699039488, + "grad_norm": 0.1791864739200382, + "learning_rate": 4.975284443153062e-07, + "loss": 0.1517, + "mean_token_accuracy": 0.9400452375411987, + "num_tokens": 106540833.0, + "step": 1705 + }, + { + "entropy": 0.15139750838279725, + "epoch": 0.9124866595517609, + "grad_norm": 0.17296662165772123, + "learning_rate": 4.6892002466008666e-07, + "loss": 0.1519, + "mean_token_accuracy": 0.9396615028381348, + "num_tokens": 106854815.0, + "step": 1710 + }, + { + "entropy": 0.1526773154735565, + "epoch": 0.9151547491995731, + "grad_norm": 0.16734739463180334, + "learning_rate": 4.4113890607439315e-07, + "loss": 0.1524, + "mean_token_accuracy": 0.9392900347709656, + "num_tokens": 107167462.0, + "step": 1715 + }, + { + "entropy": 0.15315515995025636, + "epoch": 0.9178228388473852, + "grad_norm": 0.1537273254608466, + "learning_rate": 4.141874999690143e-07, + "loss": 0.1533, + "mean_token_accuracy": 0.9392496705055237, + "num_tokens": 107479911.0, + "step": 1720 + }, + { + "entropy": 0.15481925010681152, + "epoch": 0.9204909284951974, + "grad_norm": 0.17050365912615373, + "learning_rate": 3.8806814573541185e-07, + "loss": 0.1548, + "mean_token_accuracy": 0.9386554837226868, + "num_tokens": 107788344.0, + "step": 1725 + }, + { + "entropy": 0.15229442715644836, + "epoch": 0.9231590181430096, + "grad_norm": 0.17679327443076584, + "learning_rate": 3.627831105426527e-07, + "loss": 0.1521, + "mean_token_accuracy": 0.9396791219711303, + "num_tokens": 108105431.0, + "step": 1730 + }, + { + "entropy": 0.15279924273490905, + "epoch": 0.9258271077908218, + "grad_norm": 0.16359512895424472, + "learning_rate": 3.383345891406176e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9397311925888061, + "num_tokens": 108419352.0, + "step": 1735 + }, + { + "entropy": 0.15270444452762605, + "epoch": 0.928495197438634, + "grad_norm": 0.16633402809686326, + "learning_rate": 3.147247036695034e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9400157809257508, + "num_tokens": 108732363.0, + "step": 1740 + }, + { + "entropy": 0.15398914813995362, + "epoch": 0.9311632870864461, + "grad_norm": 0.18184961530665641, + "learning_rate": 2.919555034756083e-07, + "loss": 0.1534, + "mean_token_accuracy": 0.9391607284545899, + "num_tokens": 109045942.0, + "step": 1745 + }, + { + "entropy": 0.15341159105300903, + "epoch": 0.9338313767342583, + "grad_norm": 0.15610763899581404, + "learning_rate": 2.7002896493346196e-07, + "loss": 0.1531, + "mean_token_accuracy": 0.9388864755630493, + "num_tokens": 109357605.0, + "step": 1750 + }, + { + "entropy": 0.15386421084403992, + "epoch": 0.9364994663820705, + "grad_norm": 0.17064992241605773, + "learning_rate": 2.489469912742637e-07, + "loss": 0.1539, + "mean_token_accuracy": 0.9389752864837646, + "num_tokens": 109669499.0, + "step": 1755 + }, + { + "entropy": 0.1526539146900177, + "epoch": 0.9391675560298826, + "grad_norm": 0.15419786967921834, + "learning_rate": 2.2871141242068794e-07, + "loss": 0.1526, + "mean_token_accuracy": 0.9394209742546081, + "num_tokens": 109982520.0, + "step": 1760 + }, + { + "entropy": 0.15407957434654235, + "epoch": 0.9418356456776947, + "grad_norm": 0.1607104970352119, + "learning_rate": 2.0932398482804374e-07, + "loss": 0.154, + "mean_token_accuracy": 0.9395723819732666, + "num_tokens": 110291773.0, + "step": 1765 + }, + { + "entropy": 0.1525971472263336, + "epoch": 0.9445037353255069, + "grad_norm": 0.16079202187464134, + "learning_rate": 1.9078639133181532e-07, + "loss": 0.1525, + "mean_token_accuracy": 0.9390587091445923, + "num_tokens": 110606679.0, + "step": 1770 + }, + { + "entropy": 0.15261533558368684, + "epoch": 0.9471718249733191, + "grad_norm": 0.14880163094849888, + "learning_rate": 1.731002410015914e-07, + "loss": 0.1529, + "mean_token_accuracy": 0.9391334652900696, + "num_tokens": 110916597.0, + "step": 1775 + }, + { + "entropy": 0.15149607956409455, + "epoch": 0.9498399146211313, + "grad_norm": 0.1592225130522692, + "learning_rate": 1.5626706900139344e-07, + "loss": 0.1506, + "mean_token_accuracy": 0.9401805758476257, + "num_tokens": 111232923.0, + "step": 1780 + }, + { + "entropy": 0.15343788266181946, + "epoch": 0.9525080042689434, + "grad_norm": 0.17284457111616702, + "learning_rate": 1.4028833645643113e-07, + "loss": 0.1529, + "mean_token_accuracy": 0.9393085956573486, + "num_tokens": 111542260.0, + "step": 1785 + }, + { + "entropy": 0.15213734805583953, + "epoch": 0.9551760939167556, + "grad_norm": 0.18244908291766967, + "learning_rate": 1.2516543032626393e-07, + "loss": 0.1516, + "mean_token_accuracy": 0.9401678204536438, + "num_tokens": 111855933.0, + "step": 1790 + }, + { + "entropy": 0.1516744911670685, + "epoch": 0.9578441835645678, + "grad_norm": 0.15801809576170697, + "learning_rate": 1.1089966328442836e-07, + "loss": 0.1518, + "mean_token_accuracy": 0.9402112603187561, + "num_tokens": 112167971.0, + "step": 1795 + }, + { + "entropy": 0.15328632593154906, + "epoch": 0.96051227321238, + "grad_norm": 0.1582365808562706, + "learning_rate": 9.749227360448143e-08, + "loss": 0.153, + "mean_token_accuracy": 0.9395403981208801, + "num_tokens": 112479502.0, + "step": 1800 + }, + { + "entropy": 0.1515852242708206, + "epoch": 0.9631803628601922, + "grad_norm": 0.157383395307447, + "learning_rate": 8.494442505252998e-08, + "loss": 0.1517, + "mean_token_accuracy": 0.939778745174408, + "num_tokens": 112794798.0, + "step": 1805 + }, + { + "entropy": 0.15518470108509064, + "epoch": 0.9658484525080042, + "grad_norm": 0.1608204572688974, + "learning_rate": 7.325720678620807e-08, + "loss": 0.1541, + "mean_token_accuracy": 0.938652515411377, + "num_tokens": 113106794.0, + "step": 1810 + }, + { + "entropy": 0.15322840213775635, + "epoch": 0.9685165421558164, + "grad_norm": 0.1729188603078381, + "learning_rate": 6.243163326014268e-08, + "loss": 0.153, + "mean_token_accuracy": 0.9392426013946533, + "num_tokens": 113417776.0, + "step": 1815 + }, + { + "entropy": 0.1549760401248932, + "epoch": 0.9711846318036286, + "grad_norm": 0.1553030515697785, + "learning_rate": 5.2468644137894096e-08, + "loss": 0.1544, + "mean_token_accuracy": 0.9385666251182556, + "num_tokens": 113727813.0, + "step": 1820 + }, + { + "entropy": 0.15276177525520324, + "epoch": 0.9738527214514408, + "grad_norm": 0.16417299638392785, + "learning_rate": 4.336910421039897e-08, + "loss": 0.1523, + "mean_token_accuracy": 0.9392683863639831, + "num_tokens": 114040983.0, + "step": 1825 + }, + { + "entropy": 0.15231212675571443, + "epoch": 0.9765208110992529, + "grad_norm": 0.16508718962458294, + "learning_rate": 3.5133803320897e-08, + "loss": 0.1533, + "mean_token_accuracy": 0.9389925718307495, + "num_tokens": 114355331.0, + "step": 1830 + }, + { + "entropy": 0.1533448129892349, + "epoch": 0.9791889007470651, + "grad_norm": 0.15896056804887643, + "learning_rate": 2.776345629638355e-08, + "loss": 0.1523, + "mean_token_accuracy": 0.9398018479347229, + "num_tokens": 114668137.0, + "step": 1835 + }, + { + "entropy": 0.15175502002239227, + "epoch": 0.9818569903948773, + "grad_norm": 0.15113393376615555, + "learning_rate": 2.1258702885551542e-08, + "loss": 0.152, + "mean_token_accuracy": 0.9397669315338135, + "num_tokens": 114980396.0, + "step": 1840 + }, + { + "entropy": 0.15319037735462188, + "epoch": 0.9845250800426895, + "grad_norm": 0.15994427039320552, + "learning_rate": 1.562010770326916e-08, + "loss": 0.1521, + "mean_token_accuracy": 0.9401495337486268, + "num_tokens": 115291239.0, + "step": 1845 + }, + { + "entropy": 0.153715580701828, + "epoch": 0.9871931696905016, + "grad_norm": 0.16228745702016076, + "learning_rate": 1.084816018156576e-08, + "loss": 0.1529, + "mean_token_accuracy": 0.9394945859909057, + "num_tokens": 115604910.0, + "step": 1850 + }, + { + "entropy": 0.1548271507024765, + "epoch": 0.9898612593383138, + "grad_norm": 0.16123378641612207, + "learning_rate": 6.9432745271535986e-09, + "loss": 0.1546, + "mean_token_accuracy": 0.9384868979454041, + "num_tokens": 115915463.0, + "step": 1855 + }, + { + "entropy": 0.15217451751232147, + "epoch": 0.9925293489861259, + "grad_norm": 0.16271968941599155, + "learning_rate": 3.905789685471062e-09, + "loss": 0.1524, + "mean_token_accuracy": 0.9392788529396057, + "num_tokens": 116228046.0, + "step": 1860 + }, + { + "entropy": 0.15162586271762848, + "epoch": 0.9951974386339381, + "grad_norm": 0.1602092482993034, + "learning_rate": 1.7359693112606323e-09, + "loss": 0.1517, + "mean_token_accuracy": 0.9397766590118408, + "num_tokens": 116542893.0, + "step": 1865 + }, + { + "entropy": 0.15208709836006165, + "epoch": 0.9978655282817502, + "grad_norm": 0.16954975469697164, + "learning_rate": 4.3400174569052657e-10, + "loss": 0.1517, + "mean_token_accuracy": 0.9398512244224548, + "num_tokens": 116858029.0, + "step": 1870 + }, + { + "entropy": 0.15305225551128387, + "epoch": 1.0, + "mean_token_accuracy": 0.9391315132379532, + "num_tokens": 117107840.0, + "step": 1874, + "total_flos": 246067485081600.0, + "train_loss": 0.17793139662661192, + "train_runtime": 3132.2908, + "train_samples_per_second": 76.578, + "train_steps_per_second": 0.598 + } + ], + "logging_steps": 5, + "max_steps": 1874, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 246067485081600.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}