{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3036, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004941321803582459, "grad_norm": 3.079270839691162, "learning_rate": 1.092896174863388e-06, "loss": 1.004, "mean_token_accuracy": 0.7358035773038865, "num_tokens": 10405489.0, "step": 5 }, { "epoch": 0.009882643607164917, "grad_norm": 1.1023986339569092, "learning_rate": 2.459016393442623e-06, "loss": 0.967, "mean_token_accuracy": 0.7401202037930489, "num_tokens": 20812735.0, "step": 10 }, { "epoch": 0.014823965410747375, "grad_norm": 1.2933658361434937, "learning_rate": 3.825136612021858e-06, "loss": 0.9421, "mean_token_accuracy": 0.7410167530179024, "num_tokens": 31206554.0, "step": 15 }, { "epoch": 0.019765287214329835, "grad_norm": 1.6906228065490723, "learning_rate": 5.191256830601094e-06, "loss": 0.9049, "mean_token_accuracy": 0.7487473994493484, "num_tokens": 41618401.0, "step": 20 }, { "epoch": 0.02470660901791229, "grad_norm": 0.48920488357543945, "learning_rate": 6.557377049180328e-06, "loss": 0.8892, "mean_token_accuracy": 0.7520575806498527, "num_tokens": 52016212.0, "step": 25 }, { "epoch": 0.02964793082149475, "grad_norm": 0.4231764078140259, "learning_rate": 7.923497267759564e-06, "loss": 0.9592, "mean_token_accuracy": 0.7517324000597, "num_tokens": 62414110.0, "step": 30 }, { "epoch": 0.034589252625077206, "grad_norm": 0.3153907358646393, "learning_rate": 9.2896174863388e-06, "loss": 0.8669, "mean_token_accuracy": 0.7558081388473511, "num_tokens": 72804943.0, "step": 35 }, { "epoch": 0.03953057442865967, "grad_norm": 0.26243749260902405, "learning_rate": 1.0655737704918032e-05, "loss": 0.8518, "mean_token_accuracy": 0.7586903437972069, "num_tokens": 83189221.0, "step": 40 }, { "epoch": 0.044471896232242125, "grad_norm": 0.23591159284114838, "learning_rate": 1.2021857923497268e-05, "loss": 0.8462, "mean_token_accuracy": 0.759694167971611, "num_tokens": 93605213.0, "step": 45 }, { "epoch": 0.04941321803582458, "grad_norm": 0.22021469473838806, "learning_rate": 1.3387978142076505e-05, "loss": 0.8408, "mean_token_accuracy": 0.7603029757738113, "num_tokens": 104010903.0, "step": 50 }, { "epoch": 0.054354539839407044, "grad_norm": 0.20398813486099243, "learning_rate": 1.4754098360655739e-05, "loss": 0.8297, "mean_token_accuracy": 0.7627619341015816, "num_tokens": 114411921.0, "step": 55 }, { "epoch": 0.0592958616429895, "grad_norm": 0.21533672511577606, "learning_rate": 1.6120218579234975e-05, "loss": 0.8144, "mean_token_accuracy": 0.7663307622075081, "num_tokens": 124801309.0, "step": 60 }, { "epoch": 0.06423718344657196, "grad_norm": 0.21680276095867157, "learning_rate": 1.7486338797814207e-05, "loss": 0.8258, "mean_token_accuracy": 0.7630844265222549, "num_tokens": 135229970.0, "step": 65 }, { "epoch": 0.06917850525015441, "grad_norm": 0.19620661437511444, "learning_rate": 1.8852459016393442e-05, "loss": 0.8129, "mean_token_accuracy": 0.7654996693134308, "num_tokens": 145598239.0, "step": 70 }, { "epoch": 0.07411982705373688, "grad_norm": 0.2198481261730194, "learning_rate": 2.0218579234972678e-05, "loss": 0.8055, "mean_token_accuracy": 0.7672501325607299, "num_tokens": 155991018.0, "step": 75 }, { "epoch": 0.07906114885731934, "grad_norm": 0.25267091393470764, "learning_rate": 2.1584699453551914e-05, "loss": 0.8007, "mean_token_accuracy": 0.7685669183731079, "num_tokens": 166394949.0, "step": 80 }, { "epoch": 0.08400247066090179, "grad_norm": 0.3027536869049072, "learning_rate": 2.295081967213115e-05, "loss": 0.8084, "mean_token_accuracy": 0.7663731932640075, "num_tokens": 176796419.0, "step": 85 }, { "epoch": 0.08894379246448425, "grad_norm": 0.37499961256980896, "learning_rate": 2.431693989071038e-05, "loss": 0.8014, "mean_token_accuracy": 0.7681452915072441, "num_tokens": 187215188.0, "step": 90 }, { "epoch": 0.09388511426806671, "grad_norm": 0.49170613288879395, "learning_rate": 2.568306010928962e-05, "loss": 0.8062, "mean_token_accuracy": 0.7694577813148499, "num_tokens": 197599168.0, "step": 95 }, { "epoch": 0.09882643607164916, "grad_norm": 7.697641372680664, "learning_rate": 2.7049180327868856e-05, "loss": 0.8025, "mean_token_accuracy": 0.7678478330373764, "num_tokens": 208023285.0, "step": 100 }, { "epoch": 0.10376775787523163, "grad_norm": 0.9521784782409668, "learning_rate": 2.841530054644809e-05, "loss": 0.7994, "mean_token_accuracy": 0.7675640687346459, "num_tokens": 218443542.0, "step": 105 }, { "epoch": 0.10870907967881409, "grad_norm": 0.48149237036705017, "learning_rate": 2.9781420765027324e-05, "loss": 0.8044, "mean_token_accuracy": 0.7670532912015915, "num_tokens": 228855088.0, "step": 110 }, { "epoch": 0.11365040148239654, "grad_norm": 0.5225210189819336, "learning_rate": 3.114754098360656e-05, "loss": 0.7831, "mean_token_accuracy": 0.7721872299909591, "num_tokens": 239245255.0, "step": 115 }, { "epoch": 0.118591723285979, "grad_norm": 0.6328762769699097, "learning_rate": 3.251366120218579e-05, "loss": 0.792, "mean_token_accuracy": 0.7695287197828293, "num_tokens": 249641901.0, "step": 120 }, { "epoch": 0.12353304508956146, "grad_norm": 0.47159990668296814, "learning_rate": 3.387978142076503e-05, "loss": 0.7973, "mean_token_accuracy": 0.76757872402668, "num_tokens": 260014931.0, "step": 125 }, { "epoch": 0.12847436689314393, "grad_norm": 0.26565825939178467, "learning_rate": 3.524590163934427e-05, "loss": 0.7784, "mean_token_accuracy": 0.7727497264742851, "num_tokens": 270399134.0, "step": 130 }, { "epoch": 0.13341568869672638, "grad_norm": 0.34533897042274475, "learning_rate": 3.66120218579235e-05, "loss": 0.7761, "mean_token_accuracy": 0.7732908591628075, "num_tokens": 280819722.0, "step": 135 }, { "epoch": 0.13835701050030882, "grad_norm": 0.32269155979156494, "learning_rate": 3.797814207650273e-05, "loss": 0.7743, "mean_token_accuracy": 0.7737134978175163, "num_tokens": 291234416.0, "step": 140 }, { "epoch": 0.1432983323038913, "grad_norm": 0.418888658285141, "learning_rate": 3.934426229508197e-05, "loss": 0.7832, "mean_token_accuracy": 0.7711038008332253, "num_tokens": 301649374.0, "step": 145 }, { "epoch": 0.14823965410747375, "grad_norm": 2.6427414417266846, "learning_rate": 4.07103825136612e-05, "loss": 0.825, "mean_token_accuracy": 0.766799908876419, "num_tokens": 312034521.0, "step": 150 }, { "epoch": 0.1531809759110562, "grad_norm": 2.3590986728668213, "learning_rate": 4.207650273224044e-05, "loss": 0.8059, "mean_token_accuracy": 0.7664751559495926, "num_tokens": 322426237.0, "step": 155 }, { "epoch": 0.15812229771463868, "grad_norm": 2.972717523574829, "learning_rate": 4.3442622950819674e-05, "loss": 0.7977, "mean_token_accuracy": 0.7695314347743988, "num_tokens": 332818660.0, "step": 160 }, { "epoch": 0.16306361951822113, "grad_norm": 0.5393229722976685, "learning_rate": 4.4808743169398906e-05, "loss": 0.7903, "mean_token_accuracy": 0.7694852471351623, "num_tokens": 343223659.0, "step": 165 }, { "epoch": 0.16800494132180357, "grad_norm": 0.5978872776031494, "learning_rate": 4.6174863387978145e-05, "loss": 0.7696, "mean_token_accuracy": 0.7744763985276222, "num_tokens": 353601938.0, "step": 170 }, { "epoch": 0.17294626312538605, "grad_norm": 0.36113736033439636, "learning_rate": 4.754098360655738e-05, "loss": 0.7702, "mean_token_accuracy": 0.773854723572731, "num_tokens": 363963995.0, "step": 175 }, { "epoch": 0.1778875849289685, "grad_norm": 0.296596497297287, "learning_rate": 4.890710382513661e-05, "loss": 0.7635, "mean_token_accuracy": 0.7756986305117607, "num_tokens": 374367241.0, "step": 180 }, { "epoch": 0.18282890673255095, "grad_norm": 0.35513490438461304, "learning_rate": 4.9999984843247074e-05, "loss": 0.7694, "mean_token_accuracy": 0.7740960389375686, "num_tokens": 384787359.0, "step": 185 }, { "epoch": 0.18777022853613343, "grad_norm": 0.2819320261478424, "learning_rate": 4.999945435882428e-05, "loss": 0.759, "mean_token_accuracy": 0.7767567992210388, "num_tokens": 395178532.0, "step": 190 }, { "epoch": 0.19271155033971588, "grad_norm": 0.25831785798072815, "learning_rate": 4.9998166055133136e-05, "loss": 0.7679, "mean_token_accuracy": 0.7745850294828415, "num_tokens": 405586629.0, "step": 195 }, { "epoch": 0.19765287214329832, "grad_norm": 0.2429356724023819, "learning_rate": 4.9996119971226544e-05, "loss": 0.7596, "mean_token_accuracy": 0.776530908048153, "num_tokens": 415992685.0, "step": 200 }, { "epoch": 0.2025941939468808, "grad_norm": 0.29596009850502014, "learning_rate": 4.9993316169128334e-05, "loss": 0.7434, "mean_token_accuracy": 0.7808976873755455, "num_tokens": 426380223.0, "step": 205 }, { "epoch": 0.20753551575046325, "grad_norm": 0.32271862030029297, "learning_rate": 4.9989754733831366e-05, "loss": 0.7709, "mean_token_accuracy": 0.7732070550322533, "num_tokens": 436791731.0, "step": 210 }, { "epoch": 0.2124768375540457, "grad_norm": 0.4768483340740204, "learning_rate": 4.9985435773294975e-05, "loss": 0.7609, "mean_token_accuracy": 0.7759308516979218, "num_tokens": 447209022.0, "step": 215 }, { "epoch": 0.21741815935762818, "grad_norm": 0.38842251896858215, "learning_rate": 4.998035941844167e-05, "loss": 0.7515, "mean_token_accuracy": 0.7781018510460853, "num_tokens": 457588195.0, "step": 220 }, { "epoch": 0.22235948116121063, "grad_norm": 0.3555106818675995, "learning_rate": 4.9974525823153194e-05, "loss": 0.7632, "mean_token_accuracy": 0.7750346094369889, "num_tokens": 467968939.0, "step": 225 }, { "epoch": 0.22730080296479308, "grad_norm": 0.34921368956565857, "learning_rate": 4.9967935164265854e-05, "loss": 0.7542, "mean_token_accuracy": 0.7772198930382729, "num_tokens": 478327236.0, "step": 230 }, { "epoch": 0.23224212476837555, "grad_norm": 0.8633297681808472, "learning_rate": 4.9960587641565125e-05, "loss": 0.7599, "mean_token_accuracy": 0.7760145485401153, "num_tokens": 488716484.0, "step": 235 }, { "epoch": 0.237183446571958, "grad_norm": 0.6466217041015625, "learning_rate": 4.9952483477779654e-05, "loss": 0.7544, "mean_token_accuracy": 0.7774667114019393, "num_tokens": 499127155.0, "step": 240 }, { "epoch": 0.24212476837554045, "grad_norm": 0.6501172780990601, "learning_rate": 4.994362291857445e-05, "loss": 0.7658, "mean_token_accuracy": 0.7741294875741005, "num_tokens": 509553916.0, "step": 245 }, { "epoch": 0.24706609017912293, "grad_norm": 0.9864006638526917, "learning_rate": 4.993400623254347e-05, "loss": 0.7816, "mean_token_accuracy": 0.7745596036314965, "num_tokens": 519972291.0, "step": 250 }, { "epoch": 0.2520074119827054, "grad_norm": 0.7591823935508728, "learning_rate": 4.99236337112015e-05, "loss": 0.7593, "mean_token_accuracy": 0.7757471099495887, "num_tokens": 530382104.0, "step": 255 }, { "epoch": 0.25694873378628785, "grad_norm": 0.8461949825286865, "learning_rate": 4.9912505668975245e-05, "loss": 0.7775, "mean_token_accuracy": 0.7719829902052879, "num_tokens": 540772176.0, "step": 260 }, { "epoch": 0.2618900555898703, "grad_norm": 0.5740395188331604, "learning_rate": 4.990062244319387e-05, "loss": 0.7519, "mean_token_accuracy": 0.7776331752538681, "num_tokens": 551147852.0, "step": 265 }, { "epoch": 0.26683137739345275, "grad_norm": 0.3479936420917511, "learning_rate": 4.988798439407872e-05, "loss": 0.7569, "mean_token_accuracy": 0.7760330036282539, "num_tokens": 561569474.0, "step": 270 }, { "epoch": 0.27177269919703523, "grad_norm": 0.29772821068763733, "learning_rate": 4.9874591904732446e-05, "loss": 0.756, "mean_token_accuracy": 0.7763150066137314, "num_tokens": 571935576.0, "step": 275 }, { "epoch": 0.27671402100061765, "grad_norm": 0.5030266046524048, "learning_rate": 4.9860445381127385e-05, "loss": 0.7792, "mean_token_accuracy": 0.7774720788002014, "num_tokens": 582349510.0, "step": 280 }, { "epoch": 0.2816553428042001, "grad_norm": 0.3081815242767334, "learning_rate": 4.984554525209321e-05, "loss": 0.7452, "mean_token_accuracy": 0.7791904672980309, "num_tokens": 592706344.0, "step": 285 }, { "epoch": 0.2865966646077826, "grad_norm": 0.29467764496803284, "learning_rate": 4.9829891969303973e-05, "loss": 0.7573, "mean_token_accuracy": 0.7754112169146538, "num_tokens": 603106180.0, "step": 290 }, { "epoch": 0.291537986411365, "grad_norm": 0.2687968313694, "learning_rate": 4.981348600726441e-05, "loss": 0.7427, "mean_token_accuracy": 0.7795338571071625, "num_tokens": 613491192.0, "step": 295 }, { "epoch": 0.2964793082149475, "grad_norm": 0.3151031732559204, "learning_rate": 4.9796327863295536e-05, "loss": 0.7648, "mean_token_accuracy": 0.7766805797815323, "num_tokens": 623915573.0, "step": 300 }, { "epoch": 0.30142063001853, "grad_norm": 0.23440544307231903, "learning_rate": 4.9778418057519595e-05, "loss": 0.7398, "mean_token_accuracy": 0.7802766650915146, "num_tokens": 634319400.0, "step": 305 }, { "epoch": 0.3063619518221124, "grad_norm": 0.24154525995254517, "learning_rate": 4.9759757132844256e-05, "loss": 0.7369, "mean_token_accuracy": 0.7812311604619027, "num_tokens": 644726414.0, "step": 310 }, { "epoch": 0.3113032736256949, "grad_norm": 0.22309941053390503, "learning_rate": 4.974034565494621e-05, "loss": 0.7365, "mean_token_accuracy": 0.7811038464307785, "num_tokens": 655151755.0, "step": 315 }, { "epoch": 0.31624459542927735, "grad_norm": 0.22005286812782288, "learning_rate": 4.972018421225397e-05, "loss": 0.7449, "mean_token_accuracy": 0.7791362345218659, "num_tokens": 665495983.0, "step": 320 }, { "epoch": 0.3211859172328598, "grad_norm": 0.3202398121356964, "learning_rate": 4.969927341593008e-05, "loss": 0.7485, "mean_token_accuracy": 0.7779804199934006, "num_tokens": 675879481.0, "step": 325 }, { "epoch": 0.32612723903644225, "grad_norm": 0.25242334604263306, "learning_rate": 4.9677613899852535e-05, "loss": 0.7419, "mean_token_accuracy": 0.7794451892375946, "num_tokens": 686284515.0, "step": 330 }, { "epoch": 0.33106856084002473, "grad_norm": 0.3325527012348175, "learning_rate": 4.965520632059562e-05, "loss": 0.7633, "mean_token_accuracy": 0.7767202571034432, "num_tokens": 696699232.0, "step": 335 }, { "epoch": 0.33600988264360715, "grad_norm": 0.39707615971565247, "learning_rate": 4.963205135740997e-05, "loss": 0.7493, "mean_token_accuracy": 0.7775179639458656, "num_tokens": 707076497.0, "step": 340 }, { "epoch": 0.3409512044471896, "grad_norm": 0.346334308385849, "learning_rate": 4.960814971220199e-05, "loss": 0.7417, "mean_token_accuracy": 0.7795804455876351, "num_tokens": 717474591.0, "step": 345 }, { "epoch": 0.3458925262507721, "grad_norm": 4.010451316833496, "learning_rate": 4.958350210951259e-05, "loss": 0.7478, "mean_token_accuracy": 0.778372198343277, "num_tokens": 727885602.0, "step": 350 }, { "epoch": 0.3508338480543545, "grad_norm": 0.3823387026786804, "learning_rate": 4.95581092964952e-05, "loss": 0.7487, "mean_token_accuracy": 0.7780078485608101, "num_tokens": 738268731.0, "step": 355 }, { "epoch": 0.355775169857937, "grad_norm": 0.33636268973350525, "learning_rate": 4.953197204289315e-05, "loss": 0.7465, "mean_token_accuracy": 0.7781498372554779, "num_tokens": 748674599.0, "step": 360 }, { "epoch": 0.3607164916615195, "grad_norm": 0.34202906489372253, "learning_rate": 4.9505091141016305e-05, "loss": 0.7494, "mean_token_accuracy": 0.7772357612848282, "num_tokens": 759049314.0, "step": 365 }, { "epoch": 0.3656578134651019, "grad_norm": 0.4149486720561981, "learning_rate": 4.947746740571706e-05, "loss": 0.7333, "mean_token_accuracy": 0.7816486582159996, "num_tokens": 769428740.0, "step": 370 }, { "epoch": 0.3705991352686844, "grad_norm": 1.6017032861709595, "learning_rate": 4.9449101674365643e-05, "loss": 0.7588, "mean_token_accuracy": 0.7753262847661972, "num_tokens": 779800245.0, "step": 375 }, { "epoch": 0.37554045707226685, "grad_norm": 0.9440131783485413, "learning_rate": 4.941999480682474e-05, "loss": 0.7398, "mean_token_accuracy": 0.7795335426926613, "num_tokens": 790196388.0, "step": 380 }, { "epoch": 0.3804817788758493, "grad_norm": 0.8744800090789795, "learning_rate": 4.939014768542342e-05, "loss": 0.7344, "mean_token_accuracy": 0.7812343299388885, "num_tokens": 800569083.0, "step": 385 }, { "epoch": 0.38542310067943175, "grad_norm": 0.825480043888092, "learning_rate": 4.935956121493036e-05, "loss": 0.7355, "mean_token_accuracy": 0.7809210374951363, "num_tokens": 810992288.0, "step": 390 }, { "epoch": 0.39036442248301423, "grad_norm": 0.3783267140388489, "learning_rate": 4.9328236322526475e-05, "loss": 0.741, "mean_token_accuracy": 0.7795790642499923, "num_tokens": 821390810.0, "step": 395 }, { "epoch": 0.39530574428659665, "grad_norm": 18.137908935546875, "learning_rate": 4.9296173957776776e-05, "loss": 0.7699, "mean_token_accuracy": 0.7768698945641518, "num_tokens": 831805263.0, "step": 400 }, { "epoch": 0.4002470660901791, "grad_norm": 1.4189810752868652, "learning_rate": 4.926337509260157e-05, "loss": 0.7553, "mean_token_accuracy": 0.7770886451005936, "num_tokens": 842216817.0, "step": 405 }, { "epoch": 0.4051883878937616, "grad_norm": 2.034182071685791, "learning_rate": 4.9229840721247054e-05, "loss": 0.7499, "mean_token_accuracy": 0.7768227070569992, "num_tokens": 852576765.0, "step": 410 }, { "epoch": 0.410129709697344, "grad_norm": 0.6896814703941345, "learning_rate": 4.919557186025512e-05, "loss": 0.7449, "mean_token_accuracy": 0.7783482626080513, "num_tokens": 862986504.0, "step": 415 }, { "epoch": 0.4150710315009265, "grad_norm": 0.46286579966545105, "learning_rate": 4.9160569548432556e-05, "loss": 0.7408, "mean_token_accuracy": 0.7793940395116806, "num_tokens": 873406106.0, "step": 420 }, { "epoch": 0.420012353304509, "grad_norm": 0.41379502415657043, "learning_rate": 4.912483484681959e-05, "loss": 0.7559, "mean_token_accuracy": 0.7800678476691246, "num_tokens": 883753362.0, "step": 425 }, { "epoch": 0.4249536751080914, "grad_norm": 0.5154562592506409, "learning_rate": 4.908836883865768e-05, "loss": 0.7387, "mean_token_accuracy": 0.7794730111956596, "num_tokens": 894178144.0, "step": 430 }, { "epoch": 0.4298949969116739, "grad_norm": 0.38851067423820496, "learning_rate": 4.905117262935669e-05, "loss": 0.7407, "mean_token_accuracy": 0.7789692118763923, "num_tokens": 904574122.0, "step": 435 }, { "epoch": 0.43483631871525635, "grad_norm": 2.381132125854492, "learning_rate": 4.901324734646139e-05, "loss": 0.7344, "mean_token_accuracy": 0.7810276612639427, "num_tokens": 914977007.0, "step": 440 }, { "epoch": 0.4397776405188388, "grad_norm": 0.43396854400634766, "learning_rate": 4.897459413961729e-05, "loss": 0.7296, "mean_token_accuracy": 0.7823416009545326, "num_tokens": 925387385.0, "step": 445 }, { "epoch": 0.44471896232242125, "grad_norm": 0.38089531660079956, "learning_rate": 4.893521418053575e-05, "loss": 0.7695, "mean_token_accuracy": 0.7785934925079345, "num_tokens": 935815137.0, "step": 450 }, { "epoch": 0.44966028412600373, "grad_norm": 0.3441498875617981, "learning_rate": 4.88951086629585e-05, "loss": 0.7305, "mean_token_accuracy": 0.7820696160197258, "num_tokens": 946244183.0, "step": 455 }, { "epoch": 0.45460160592958615, "grad_norm": 0.29439103603363037, "learning_rate": 4.885427880262144e-05, "loss": 0.7453, "mean_token_accuracy": 0.7779595762491226, "num_tokens": 956604208.0, "step": 460 }, { "epoch": 0.4595429277331686, "grad_norm": 5.913498878479004, "learning_rate": 4.881272583721776e-05, "loss": 0.7628, "mean_token_accuracy": 0.777838508784771, "num_tokens": 967017106.0, "step": 465 }, { "epoch": 0.4644842495367511, "grad_norm": 0.28903448581695557, "learning_rate": 4.8770451026360495e-05, "loss": 0.7393, "mean_token_accuracy": 0.7796890079975128, "num_tokens": 977403787.0, "step": 470 }, { "epoch": 0.4694255713403335, "grad_norm": 0.24736757576465607, "learning_rate": 4.872745565154424e-05, "loss": 0.7302, "mean_token_accuracy": 0.7817555531859398, "num_tokens": 987801619.0, "step": 475 }, { "epoch": 0.474366893143916, "grad_norm": 0.260588139295578, "learning_rate": 4.868374101610638e-05, "loss": 0.7282, "mean_token_accuracy": 0.7825865045189857, "num_tokens": 998222682.0, "step": 480 }, { "epoch": 0.4793082149474985, "grad_norm": 0.589485764503479, "learning_rate": 4.863930844518757e-05, "loss": 0.7409, "mean_token_accuracy": 0.7823499292135239, "num_tokens": 1008614437.0, "step": 485 }, { "epoch": 0.4842495367510809, "grad_norm": 1.0684598684310913, "learning_rate": 4.8594159285691546e-05, "loss": 0.7398, "mean_token_accuracy": 0.779399824142456, "num_tokens": 1019023581.0, "step": 490 }, { "epoch": 0.4891908585546634, "grad_norm": 0.2995496094226837, "learning_rate": 4.8548294906244285e-05, "loss": 0.7348, "mean_token_accuracy": 0.7809380605816841, "num_tokens": 1029439322.0, "step": 495 }, { "epoch": 0.49413218035824585, "grad_norm": 0.2681543231010437, "learning_rate": 4.8501716697152555e-05, "loss": 0.7218, "mean_token_accuracy": 0.7838469684123993, "num_tokens": 1039851383.0, "step": 500 }, { "epoch": 0.4990735021618283, "grad_norm": 0.30591872334480286, "learning_rate": 4.845442607036176e-05, "loss": 0.7391, "mean_token_accuracy": 0.7794045254588127, "num_tokens": 1050273096.0, "step": 505 }, { "epoch": 0.5040148239654108, "grad_norm": 0.43375658988952637, "learning_rate": 4.840642445941309e-05, "loss": 0.7443, "mean_token_accuracy": 0.778987355530262, "num_tokens": 1060667850.0, "step": 510 }, { "epoch": 0.5089561457689932, "grad_norm": 0.35767972469329834, "learning_rate": 4.8357713319400155e-05, "loss": 0.7319, "mean_token_accuracy": 0.7822049915790558, "num_tokens": 1071070190.0, "step": 515 }, { "epoch": 0.5138974675725757, "grad_norm": 0.44550326466560364, "learning_rate": 4.8308294126924794e-05, "loss": 0.7213, "mean_token_accuracy": 0.7840958744287491, "num_tokens": 1081476164.0, "step": 520 }, { "epoch": 0.5188387893761581, "grad_norm": 0.28565794229507446, "learning_rate": 4.825816838005235e-05, "loss": 0.7312, "mean_token_accuracy": 0.7816817224025726, "num_tokens": 1091865310.0, "step": 525 }, { "epoch": 0.5237801111797405, "grad_norm": 0.2593757212162018, "learning_rate": 4.820733759826626e-05, "loss": 0.7295, "mean_token_accuracy": 0.7820295438170433, "num_tokens": 1102247757.0, "step": 530 }, { "epoch": 0.5287214329833231, "grad_norm": 0.33316972851753235, "learning_rate": 4.815580332242199e-05, "loss": 0.7149, "mean_token_accuracy": 0.7861398920416832, "num_tokens": 1112634196.0, "step": 535 }, { "epoch": 0.5336627547869055, "grad_norm": 0.2702305316925049, "learning_rate": 4.810356711470033e-05, "loss": 0.7209, "mean_token_accuracy": 0.7841956496238709, "num_tokens": 1123039113.0, "step": 540 }, { "epoch": 0.5386040765904879, "grad_norm": 3.4884896278381348, "learning_rate": 4.8050630558560026e-05, "loss": 0.7353, "mean_token_accuracy": 0.7829997181892395, "num_tokens": 1133440814.0, "step": 545 }, { "epoch": 0.5435453983940705, "grad_norm": 0.724455714225769, "learning_rate": 4.799699525868979e-05, "loss": 0.7273, "mean_token_accuracy": 0.782489824295044, "num_tokens": 1143853971.0, "step": 550 }, { "epoch": 0.5484867201976529, "grad_norm": 0.3589349091053009, "learning_rate": 4.7942662840959654e-05, "loss": 0.7334, "mean_token_accuracy": 0.7808837026357651, "num_tokens": 1154257665.0, "step": 555 }, { "epoch": 0.5534280420012353, "grad_norm": 0.27754878997802734, "learning_rate": 4.7887634952371684e-05, "loss": 0.7214, "mean_token_accuracy": 0.7839814886450768, "num_tokens": 1164628485.0, "step": 560 }, { "epoch": 0.5583693638048178, "grad_norm": 0.4044415354728699, "learning_rate": 4.7831913261010066e-05, "loss": 0.7541, "mean_token_accuracy": 0.7808719158172608, "num_tokens": 1174996101.0, "step": 565 }, { "epoch": 0.5633106856084003, "grad_norm": 0.3834975063800812, "learning_rate": 4.777549945599051e-05, "loss": 0.7407, "mean_token_accuracy": 0.7786987662315369, "num_tokens": 1185426572.0, "step": 570 }, { "epoch": 0.5682520074119827, "grad_norm": 0.24947196245193481, "learning_rate": 4.7718395247409095e-05, "loss": 0.7264, "mean_token_accuracy": 0.7827806279063225, "num_tokens": 1195853750.0, "step": 575 }, { "epoch": 0.5731933292155652, "grad_norm": 0.24807173013687134, "learning_rate": 4.766060236629037e-05, "loss": 0.7369, "mean_token_accuracy": 0.7798179477453232, "num_tokens": 1206269807.0, "step": 580 }, { "epoch": 0.5781346510191476, "grad_norm": 0.21170632541179657, "learning_rate": 4.760212256453493e-05, "loss": 0.733, "mean_token_accuracy": 0.7807521566748619, "num_tokens": 1216679925.0, "step": 585 }, { "epoch": 0.58307597282273, "grad_norm": 0.1931750476360321, "learning_rate": 4.7542957614866296e-05, "loss": 0.7288, "mean_token_accuracy": 0.7816975250840187, "num_tokens": 1227085555.0, "step": 590 }, { "epoch": 0.5880172946263126, "grad_norm": 0.19650745391845703, "learning_rate": 4.7483109310777165e-05, "loss": 0.7196, "mean_token_accuracy": 0.784217968583107, "num_tokens": 1237498455.0, "step": 595 }, { "epoch": 0.592958616429895, "grad_norm": 0.20376770198345184, "learning_rate": 4.7422579466475035e-05, "loss": 0.7173, "mean_token_accuracy": 0.7853407070040703, "num_tokens": 1247922816.0, "step": 600 }, { "epoch": 0.5978999382334774, "grad_norm": 0.2214794009923935, "learning_rate": 4.736136991682727e-05, "loss": 0.7219, "mean_token_accuracy": 0.7833896622061729, "num_tokens": 1258329220.0, "step": 605 }, { "epoch": 0.60284126003706, "grad_norm": 0.19553132355213165, "learning_rate": 4.7299482517305404e-05, "loss": 0.7157, "mean_token_accuracy": 0.7851592242717743, "num_tokens": 1268717321.0, "step": 610 }, { "epoch": 0.6077825818406424, "grad_norm": 0.3079369068145752, "learning_rate": 4.723691914392893e-05, "loss": 0.7296, "mean_token_accuracy": 0.7822741761803627, "num_tokens": 1279092136.0, "step": 615 }, { "epoch": 0.6127239036442248, "grad_norm": 0.46084290742874146, "learning_rate": 4.7173681693208444e-05, "loss": 0.753, "mean_token_accuracy": 0.7799653187394142, "num_tokens": 1289484661.0, "step": 620 }, { "epoch": 0.6176652254478073, "grad_norm": 11.295252799987793, "learning_rate": 4.710977208208812e-05, "loss": 0.7715, "mean_token_accuracy": 0.778912840783596, "num_tokens": 1299870309.0, "step": 625 }, { "epoch": 0.6226065472513898, "grad_norm": 0.36179018020629883, "learning_rate": 4.7045192247887634e-05, "loss": 0.7139, "mean_token_accuracy": 0.7855943024158478, "num_tokens": 1310266867.0, "step": 630 }, { "epoch": 0.6275478690549722, "grad_norm": 0.3986701965332031, "learning_rate": 4.697994414824343e-05, "loss": 0.7224, "mean_token_accuracy": 0.7834179788827896, "num_tokens": 1320695348.0, "step": 635 }, { "epoch": 0.6324891908585547, "grad_norm": 0.28618842363357544, "learning_rate": 4.6914029761049357e-05, "loss": 0.7199, "mean_token_accuracy": 0.7842305526137352, "num_tokens": 1331087810.0, "step": 640 }, { "epoch": 0.6374305126621371, "grad_norm": 0.3706783652305603, "learning_rate": 4.6847451084396724e-05, "loss": 0.7151, "mean_token_accuracy": 0.7855269074440002, "num_tokens": 1341477976.0, "step": 645 }, { "epoch": 0.6423718344657195, "grad_norm": 0.2638409435749054, "learning_rate": 4.678021013651375e-05, "loss": 0.7239, "mean_token_accuracy": 0.7827935367822647, "num_tokens": 1351900891.0, "step": 650 }, { "epoch": 0.6473131562693021, "grad_norm": 0.20985962450504303, "learning_rate": 4.6712308955704346e-05, "loss": 0.7088, "mean_token_accuracy": 0.7867402359843254, "num_tokens": 1362281947.0, "step": 655 }, { "epoch": 0.6522544780728845, "grad_norm": 0.2701743245124817, "learning_rate": 4.664374960028638e-05, "loss": 0.7351, "mean_token_accuracy": 0.7816146582365036, "num_tokens": 1372687416.0, "step": 660 }, { "epoch": 0.6571957998764669, "grad_norm": 0.3363015651702881, "learning_rate": 4.6574534148529225e-05, "loss": 0.7438, "mean_token_accuracy": 0.7821772992610931, "num_tokens": 1383065396.0, "step": 665 }, { "epoch": 0.6621371216800495, "grad_norm": 0.49438920617103577, "learning_rate": 4.650466469859079e-05, "loss": 0.7798, "mean_token_accuracy": 0.7778581961989403, "num_tokens": 1393456689.0, "step": 670 }, { "epoch": 0.6670784434836319, "grad_norm": 0.27729520201683044, "learning_rate": 4.643414336845394e-05, "loss": 0.7271, "mean_token_accuracy": 0.7821262225508689, "num_tokens": 1403871987.0, "step": 675 }, { "epoch": 0.6720197652872143, "grad_norm": 0.23920652270317078, "learning_rate": 4.6362972295862225e-05, "loss": 0.7128, "mean_token_accuracy": 0.7859095022082329, "num_tokens": 1414264057.0, "step": 680 }, { "epoch": 0.6769610870907968, "grad_norm": 0.22164705395698547, "learning_rate": 4.629115363825514e-05, "loss": 0.721, "mean_token_accuracy": 0.7835411220788956, "num_tokens": 1424682063.0, "step": 685 }, { "epoch": 0.6819024088943793, "grad_norm": 0.19232025742530823, "learning_rate": 4.6218689572702715e-05, "loss": 0.7364, "mean_token_accuracy": 0.7843156576156616, "num_tokens": 1435058723.0, "step": 690 }, { "epoch": 0.6868437306979617, "grad_norm": 0.21504122018814087, "learning_rate": 4.614558229583948e-05, "loss": 0.7234, "mean_token_accuracy": 0.7829699948430061, "num_tokens": 1445469769.0, "step": 695 }, { "epoch": 0.6917850525015442, "grad_norm": 0.21127116680145264, "learning_rate": 4.607183402379794e-05, "loss": 0.7055, "mean_token_accuracy": 0.7875325232744217, "num_tokens": 1455878611.0, "step": 700 }, { "epoch": 0.6967263743051266, "grad_norm": 0.221888467669487, "learning_rate": 4.599744699214136e-05, "loss": 0.7163, "mean_token_accuracy": 0.7848389104008675, "num_tokens": 1466265360.0, "step": 705 }, { "epoch": 0.701667696108709, "grad_norm": 11.019208908081055, "learning_rate": 4.5922423455795966e-05, "loss": 0.7403, "mean_token_accuracy": 0.7839446976780892, "num_tokens": 1476653925.0, "step": 710 }, { "epoch": 0.7066090179122916, "grad_norm": 0.2727574408054352, "learning_rate": 4.584676568898267e-05, "loss": 0.7104, "mean_token_accuracy": 0.7866337120532989, "num_tokens": 1487058650.0, "step": 715 }, { "epoch": 0.711550339715874, "grad_norm": 1.0687726736068726, "learning_rate": 4.5770475985148056e-05, "loss": 0.7324, "mean_token_accuracy": 0.7805073946714401, "num_tokens": 1497461614.0, "step": 720 }, { "epoch": 0.7164916615194564, "grad_norm": 5.212482452392578, "learning_rate": 4.5693556656894907e-05, "loss": 0.7587, "mean_token_accuracy": 0.7814112335443497, "num_tokens": 1507862236.0, "step": 725 }, { "epoch": 0.721432983323039, "grad_norm": 0.4863557815551758, "learning_rate": 4.561601003591208e-05, "loss": 0.7297, "mean_token_accuracy": 0.7825336411595345, "num_tokens": 1518272002.0, "step": 730 }, { "epoch": 0.7263743051266214, "grad_norm": 0.4574221074581146, "learning_rate": 4.5537838472903814e-05, "loss": 0.7252, "mean_token_accuracy": 0.7830310598015785, "num_tokens": 1528707308.0, "step": 735 }, { "epoch": 0.7313156269302038, "grad_norm": 0.34816044569015503, "learning_rate": 4.54590443375185e-05, "loss": 0.7118, "mean_token_accuracy": 0.7860510513186455, "num_tokens": 1539091834.0, "step": 740 }, { "epoch": 0.7362569487337863, "grad_norm": 0.2924805283546448, "learning_rate": 4.5379630018276834e-05, "loss": 0.7141, "mean_token_accuracy": 0.7856177687644958, "num_tokens": 1549484512.0, "step": 745 }, { "epoch": 0.7411982705373688, "grad_norm": 0.21858909726142883, "learning_rate": 4.5299597922499396e-05, "loss": 0.7144, "mean_token_accuracy": 0.7855332553386688, "num_tokens": 1559908828.0, "step": 750 }, { "epoch": 0.7461395923409512, "grad_norm": 0.21600526571273804, "learning_rate": 4.521895047623372e-05, "loss": 0.7139, "mean_token_accuracy": 0.7855884119868278, "num_tokens": 1570311228.0, "step": 755 }, { "epoch": 0.7510809141445337, "grad_norm": 2.1125078201293945, "learning_rate": 4.513769012418071e-05, "loss": 0.7108, "mean_token_accuracy": 0.7866385370492935, "num_tokens": 1580708990.0, "step": 760 }, { "epoch": 0.7560222359481161, "grad_norm": 0.2704427242279053, "learning_rate": 4.505581932962054e-05, "loss": 0.7064, "mean_token_accuracy": 0.7872796177864074, "num_tokens": 1591096704.0, "step": 765 }, { "epoch": 0.7609635577516985, "grad_norm": 0.6448246240615845, "learning_rate": 4.4973340574338016e-05, "loss": 0.7259, "mean_token_accuracy": 0.7822867006063461, "num_tokens": 1601474574.0, "step": 770 }, { "epoch": 0.7659048795552811, "grad_norm": 0.28744909167289734, "learning_rate": 4.4890256358547304e-05, "loss": 0.7156, "mean_token_accuracy": 0.7849863648414612, "num_tokens": 1611883617.0, "step": 775 }, { "epoch": 0.7708462013588635, "grad_norm": 1.0210628509521484, "learning_rate": 4.480656920081615e-05, "loss": 0.7458, "mean_token_accuracy": 0.7787564381957054, "num_tokens": 1622289561.0, "step": 780 }, { "epoch": 0.7757875231624459, "grad_norm": 0.6114581227302551, "learning_rate": 4.472228163798956e-05, "loss": 0.7247, "mean_token_accuracy": 0.7828380897641182, "num_tokens": 1632700884.0, "step": 785 }, { "epoch": 0.7807288449660285, "grad_norm": 0.3882652223110199, "learning_rate": 4.4637396225112846e-05, "loss": 0.7328, "mean_token_accuracy": 0.7821167722344399, "num_tokens": 1643078204.0, "step": 790 }, { "epoch": 0.7856701667696109, "grad_norm": 1.0247176885604858, "learning_rate": 4.4551915535354256e-05, "loss": 0.7134, "mean_token_accuracy": 0.7853814095258713, "num_tokens": 1653491014.0, "step": 795 }, { "epoch": 0.7906114885731933, "grad_norm": 27.41879653930664, "learning_rate": 4.446584215992687e-05, "loss": 5.9935, "mean_token_accuracy": 0.32389395159407286, "num_tokens": 1663888090.0, "step": 800 }, { "epoch": 0.7955528103767758, "grad_norm": 64.84490203857422, "learning_rate": 4.437917870801015e-05, "loss": 7.1083, "mean_token_accuracy": 0.08303709244355559, "num_tokens": 1674313567.0, "step": 805 }, { "epoch": 0.8004941321803583, "grad_norm": 11.887207984924316, "learning_rate": 4.429192780667077e-05, "loss": 2.6923, "mean_token_accuracy": 0.5082879170775414, "num_tokens": 1684731527.0, "step": 810 }, { "epoch": 0.8054354539839407, "grad_norm": 2.350527763366699, "learning_rate": 4.4204092100783033e-05, "loss": 0.8951, "mean_token_accuracy": 0.7543378055095673, "num_tokens": 1695152743.0, "step": 815 }, { "epoch": 0.8103767757875232, "grad_norm": 5.608152866363525, "learning_rate": 4.411567425294867e-05, "loss": 0.8052, "mean_token_accuracy": 0.7724114000797272, "num_tokens": 1705562199.0, "step": 820 }, { "epoch": 0.8153180975911056, "grad_norm": 0.6283033490180969, "learning_rate": 4.402667694341611e-05, "loss": 0.7615, "mean_token_accuracy": 0.7743734747171402, "num_tokens": 1715941127.0, "step": 825 }, { "epoch": 0.820259419394688, "grad_norm": 2.0349133014678955, "learning_rate": 4.393710286999929e-05, "loss": 0.7722, "mean_token_accuracy": 0.7739002510905266, "num_tokens": 1726358797.0, "step": 830 }, { "epoch": 0.8252007411982706, "grad_norm": 1.4279297590255737, "learning_rate": 4.3846954747995825e-05, "loss": 0.7408, "mean_token_accuracy": 0.778858907520771, "num_tokens": 1736768844.0, "step": 835 }, { "epoch": 0.830142063001853, "grad_norm": 0.9458789825439453, "learning_rate": 4.375623531010471e-05, "loss": 0.7382, "mean_token_accuracy": 0.7807383254170418, "num_tokens": 1747158903.0, "step": 840 }, { "epoch": 0.8350833848054354, "grad_norm": 0.6119948625564575, "learning_rate": 4.366494730634348e-05, "loss": 0.7256, "mean_token_accuracy": 0.7840852901339531, "num_tokens": 1757542631.0, "step": 845 }, { "epoch": 0.840024706609018, "grad_norm": 0.30982956290245056, "learning_rate": 4.357309350396488e-05, "loss": 0.719, "mean_token_accuracy": 0.7841277673840523, "num_tokens": 1767957528.0, "step": 850 }, { "epoch": 0.8449660284126004, "grad_norm": 0.28105172514915466, "learning_rate": 4.3480676687372915e-05, "loss": 0.7186, "mean_token_accuracy": 0.7848882034420968, "num_tokens": 1778374482.0, "step": 855 }, { "epoch": 0.8499073502161828, "grad_norm": 0.23508192598819733, "learning_rate": 4.3387699658038506e-05, "loss": 0.737, "mean_token_accuracy": 0.7816414266824723, "num_tokens": 1788794176.0, "step": 860 }, { "epoch": 0.8548486720197653, "grad_norm": 0.263019323348999, "learning_rate": 4.329416523441454e-05, "loss": 0.7297, "mean_token_accuracy": 0.7812557741999626, "num_tokens": 1799211612.0, "step": 865 }, { "epoch": 0.8597899938233478, "grad_norm": 0.2726346254348755, "learning_rate": 4.3200076251850455e-05, "loss": 0.7165, "mean_token_accuracy": 0.7844896405935288, "num_tokens": 1809623863.0, "step": 870 }, { "epoch": 0.8647313156269302, "grad_norm": 0.22352498769760132, "learning_rate": 4.310543556250624e-05, "loss": 0.7166, "mean_token_accuracy": 0.7844405427575112, "num_tokens": 1820035541.0, "step": 875 }, { "epoch": 0.8696726374305127, "grad_norm": 0.21182860434055328, "learning_rate": 4.301024603526603e-05, "loss": 0.7179, "mean_token_accuracy": 0.78359464854002, "num_tokens": 1830385910.0, "step": 880 }, { "epoch": 0.8746139592340951, "grad_norm": 0.1831175982952118, "learning_rate": 4.291451055565113e-05, "loss": 0.6931, "mean_token_accuracy": 0.792209367454052, "num_tokens": 1840794040.0, "step": 885 }, { "epoch": 0.8795552810376775, "grad_norm": 0.17403781414031982, "learning_rate": 4.281823202573252e-05, "loss": 0.7116, "mean_token_accuracy": 0.7860970973968506, "num_tokens": 1851206662.0, "step": 890 }, { "epoch": 0.8844966028412601, "grad_norm": 0.21906058490276337, "learning_rate": 4.272141336404289e-05, "loss": 0.7008, "mean_token_accuracy": 0.7892033144831657, "num_tokens": 1861585024.0, "step": 895 }, { "epoch": 0.8894379246448425, "grad_norm": 0.20309416949748993, "learning_rate": 4.2624057505488216e-05, "loss": 0.7186, "mean_token_accuracy": 0.7835696578025818, "num_tokens": 1872004992.0, "step": 900 }, { "epoch": 0.8943792464484249, "grad_norm": 0.18816368281841278, "learning_rate": 4.252616740125871e-05, "loss": 0.6973, "mean_token_accuracy": 0.7894984766840935, "num_tokens": 1882403165.0, "step": 905 }, { "epoch": 0.8993205682520075, "grad_norm": 0.1928325891494751, "learning_rate": 4.242774601873943e-05, "loss": 0.7156, "mean_token_accuracy": 0.784919947385788, "num_tokens": 1892805219.0, "step": 910 }, { "epoch": 0.9042618900555899, "grad_norm": 0.18554538488388062, "learning_rate": 4.23287963414203e-05, "loss": 0.7102, "mean_token_accuracy": 0.7861207097768783, "num_tokens": 1903158706.0, "step": 915 }, { "epoch": 0.9092032118591723, "grad_norm": 0.1917434185743332, "learning_rate": 4.222932136880566e-05, "loss": 0.7098, "mean_token_accuracy": 0.7864427953958512, "num_tokens": 1913578297.0, "step": 920 }, { "epoch": 0.9141445336627548, "grad_norm": 0.18963773548603058, "learning_rate": 4.212932411632336e-05, "loss": 0.7249, "mean_token_accuracy": 0.7831475377082825, "num_tokens": 1923979178.0, "step": 925 }, { "epoch": 0.9190858554663373, "grad_norm": 0.18606819212436676, "learning_rate": 4.202880761523337e-05, "loss": 0.7001, "mean_token_accuracy": 0.7890260085463524, "num_tokens": 1934367962.0, "step": 930 }, { "epoch": 0.9240271772699197, "grad_norm": 0.19036024808883667, "learning_rate": 4.1927774912535825e-05, "loss": 0.71, "mean_token_accuracy": 0.7863813400268554, "num_tokens": 1944753938.0, "step": 935 }, { "epoch": 0.9289684990735022, "grad_norm": 0.20415684580802917, "learning_rate": 4.1826229070878716e-05, "loss": 0.7295, "mean_token_accuracy": 0.7834419712424279, "num_tokens": 1955175313.0, "step": 940 }, { "epoch": 0.9339098208770846, "grad_norm": 0.327197402715683, "learning_rate": 4.1724173168465064e-05, "loss": 0.7156, "mean_token_accuracy": 0.78517996519804, "num_tokens": 1965584016.0, "step": 945 }, { "epoch": 0.938851142680667, "grad_norm": 0.4138229787349701, "learning_rate": 4.1621610298959556e-05, "loss": 0.7201, "mean_token_accuracy": 0.7838375464081764, "num_tokens": 1975979152.0, "step": 950 }, { "epoch": 0.9437924644842496, "grad_norm": 0.36716070771217346, "learning_rate": 4.1518543571394796e-05, "loss": 0.7119, "mean_token_accuracy": 0.7862480774521827, "num_tokens": 1986388324.0, "step": 955 }, { "epoch": 0.948733786287832, "grad_norm": 0.2444203794002533, "learning_rate": 4.141497611007705e-05, "loss": 0.7307, "mean_token_accuracy": 0.7824576959013939, "num_tokens": 1996795931.0, "step": 960 }, { "epoch": 0.9536751080914144, "grad_norm": 0.20585046708583832, "learning_rate": 4.131091105449156e-05, "loss": 0.7152, "mean_token_accuracy": 0.7847880542278289, "num_tokens": 2007173184.0, "step": 965 }, { "epoch": 0.958616429894997, "grad_norm": 0.19417141377925873, "learning_rate": 4.120635155920735e-05, "loss": 0.7073, "mean_token_accuracy": 0.7870074450969696, "num_tokens": 2017580798.0, "step": 970 }, { "epoch": 0.9635577516985794, "grad_norm": 0.19215287268161774, "learning_rate": 4.110130079378159e-05, "loss": 0.7111, "mean_token_accuracy": 0.7853602975606918, "num_tokens": 2027947222.0, "step": 975 }, { "epoch": 0.9684990735021618, "grad_norm": 0.1740170270204544, "learning_rate": 4.099576194266357e-05, "loss": 0.7136, "mean_token_accuracy": 0.7852953299880028, "num_tokens": 2038360867.0, "step": 980 }, { "epoch": 0.9734403953057443, "grad_norm": 0.5138667225837708, "learning_rate": 4.0889738205098105e-05, "loss": 0.7247, "mean_token_accuracy": 0.7842022761702537, "num_tokens": 2048762177.0, "step": 985 }, { "epoch": 0.9783817171093268, "grad_norm": 2.790609121322632, "learning_rate": 4.078323279502858e-05, "loss": 0.7154, "mean_token_accuracy": 0.7841217175126076, "num_tokens": 2059178406.0, "step": 990 }, { "epoch": 0.9833230389129092, "grad_norm": 0.30142104625701904, "learning_rate": 4.067624894099956e-05, "loss": 0.7053, "mean_token_accuracy": 0.787280821800232, "num_tokens": 2069556358.0, "step": 995 }, { "epoch": 0.9882643607164917, "grad_norm": 0.29179099202156067, "learning_rate": 4.056878988605884e-05, "loss": 0.7086, "mean_token_accuracy": 0.7862989202141761, "num_tokens": 2079942474.0, "step": 1000 }, { "epoch": 0.9932056825200741, "grad_norm": 0.2190045565366745, "learning_rate": 4.0460858887659225e-05, "loss": 0.7083, "mean_token_accuracy": 0.786419989168644, "num_tokens": 2090327369.0, "step": 1005 }, { "epoch": 0.9981470043236566, "grad_norm": 0.18355406820774078, "learning_rate": 4.0352459217559747e-05, "loss": 0.7114, "mean_token_accuracy": 0.7855605989694595, "num_tokens": 2100728668.0, "step": 1010 }, { "epoch": 1.0029647930821495, "grad_norm": 0.2185661494731903, "learning_rate": 4.024359416172644e-05, "loss": 0.6858, "mean_token_accuracy": 0.7917264806918609, "num_tokens": 2110869886.0, "step": 1015 }, { "epoch": 1.007906114885732, "grad_norm": 0.18818652629852295, "learning_rate": 4.013426702023284e-05, "loss": 0.6754, "mean_token_accuracy": 0.7937879115343094, "num_tokens": 2121248219.0, "step": 1020 }, { "epoch": 1.0128474366893143, "grad_norm": 0.1852264106273651, "learning_rate": 4.0024481107159836e-05, "loss": 0.6724, "mean_token_accuracy": 0.7944735899567604, "num_tokens": 2131667569.0, "step": 1025 }, { "epoch": 1.0177887584928969, "grad_norm": 0.24998919665813446, "learning_rate": 3.991423975049527e-05, "loss": 0.7023, "mean_token_accuracy": 0.7886683195829391, "num_tokens": 2142075128.0, "step": 1030 }, { "epoch": 1.0227300802964794, "grad_norm": 0.19213631749153137, "learning_rate": 3.980354629203307e-05, "loss": 0.6727, "mean_token_accuracy": 0.7945450767874718, "num_tokens": 2152492146.0, "step": 1035 }, { "epoch": 1.0276714021000617, "grad_norm": 0.1905544400215149, "learning_rate": 3.9692404087271896e-05, "loss": 0.6783, "mean_token_accuracy": 0.7932624593377113, "num_tokens": 2162896460.0, "step": 1040 }, { "epoch": 1.0326127239036442, "grad_norm": 3.515601396560669, "learning_rate": 3.958081650531343e-05, "loss": 0.7021, "mean_token_accuracy": 0.7893586799502372, "num_tokens": 2173297613.0, "step": 1045 }, { "epoch": 1.0375540457072268, "grad_norm": 0.19767682254314423, "learning_rate": 3.9468786928760316e-05, "loss": 0.6863, "mean_token_accuracy": 0.7907787173986435, "num_tokens": 2183711345.0, "step": 1050 }, { "epoch": 1.042495367510809, "grad_norm": 0.23874838650226593, "learning_rate": 3.9356318753613525e-05, "loss": 0.6753, "mean_token_accuracy": 0.7938481122255325, "num_tokens": 2194098086.0, "step": 1055 }, { "epoch": 1.0474366893143916, "grad_norm": 0.208620086312294, "learning_rate": 3.924341538916948e-05, "loss": 0.6895, "mean_token_accuracy": 0.7899346083402634, "num_tokens": 2204491785.0, "step": 1060 }, { "epoch": 1.0523780111179741, "grad_norm": 1.1455881595611572, "learning_rate": 3.913008025791669e-05, "loss": 0.6783, "mean_token_accuracy": 0.7934534177184105, "num_tokens": 2214880711.0, "step": 1065 }, { "epoch": 1.0573193329215564, "grad_norm": 0.19406479597091675, "learning_rate": 3.901631679543198e-05, "loss": 0.677, "mean_token_accuracy": 0.793326124548912, "num_tokens": 2225265608.0, "step": 1070 }, { "epoch": 1.062260654725139, "grad_norm": 0.18829670548439026, "learning_rate": 3.890212845027637e-05, "loss": 0.6706, "mean_token_accuracy": 0.7950180992484093, "num_tokens": 2235613832.0, "step": 1075 }, { "epoch": 1.0672019765287215, "grad_norm": 0.19113752245903015, "learning_rate": 3.8787518683890536e-05, "loss": 0.6791, "mean_token_accuracy": 0.7926081731915474, "num_tokens": 2246002617.0, "step": 1080 }, { "epoch": 1.0721432983323038, "grad_norm": 0.19079595804214478, "learning_rate": 3.867249097048989e-05, "loss": 0.6823, "mean_token_accuracy": 0.7948415905237198, "num_tokens": 2256415299.0, "step": 1085 }, { "epoch": 1.0770846201358864, "grad_norm": 0.31613099575042725, "learning_rate": 3.855704879695923e-05, "loss": 0.6923, "mean_token_accuracy": 0.7909860432147979, "num_tokens": 2266807650.0, "step": 1090 }, { "epoch": 1.0820259419394689, "grad_norm": 0.2967878580093384, "learning_rate": 3.844119566274707e-05, "loss": 0.7076, "mean_token_accuracy": 0.7886807918548584, "num_tokens": 2277220430.0, "step": 1095 }, { "epoch": 1.0869672637430512, "grad_norm": 0.4129527807235718, "learning_rate": 3.8324935079759555e-05, "loss": 0.6778, "mean_token_accuracy": 0.7931554391980171, "num_tokens": 2287644164.0, "step": 1100 }, { "epoch": 1.0919085855466337, "grad_norm": 0.39800935983657837, "learning_rate": 3.820827057225401e-05, "loss": 0.6771, "mean_token_accuracy": 0.7932255193591118, "num_tokens": 2298058744.0, "step": 1105 }, { "epoch": 1.0968499073502163, "grad_norm": 0.23592214286327362, "learning_rate": 3.809120567673209e-05, "loss": 0.6731, "mean_token_accuracy": 0.7948127672076225, "num_tokens": 2308457445.0, "step": 1110 }, { "epoch": 1.1017912291537986, "grad_norm": 0.3244028091430664, "learning_rate": 3.797374394183257e-05, "loss": 0.6903, "mean_token_accuracy": 0.790345923602581, "num_tokens": 2318859279.0, "step": 1115 }, { "epoch": 1.106732550957381, "grad_norm": 0.24913185834884644, "learning_rate": 3.785588892822383e-05, "loss": 0.683, "mean_token_accuracy": 0.7915908902883529, "num_tokens": 2329278923.0, "step": 1120 }, { "epoch": 1.1116738727609636, "grad_norm": 0.21519848704338074, "learning_rate": 3.7737644208495835e-05, "loss": 0.678, "mean_token_accuracy": 0.7930225506424904, "num_tokens": 2339685854.0, "step": 1125 }, { "epoch": 1.116615194564546, "grad_norm": 0.24836380779743195, "learning_rate": 3.76190133670519e-05, "loss": 0.6884, "mean_token_accuracy": 0.7918792888522148, "num_tokens": 2350067715.0, "step": 1130 }, { "epoch": 1.1215565163681285, "grad_norm": 0.31333428621292114, "learning_rate": 3.7500000000000003e-05, "loss": 0.681, "mean_token_accuracy": 0.7937313199043274, "num_tokens": 2360468748.0, "step": 1135 }, { "epoch": 1.126497838171711, "grad_norm": 3.056119918823242, "learning_rate": 3.73806077150438e-05, "loss": 0.7114, "mean_token_accuracy": 0.7900054767727852, "num_tokens": 2370874127.0, "step": 1140 }, { "epoch": 1.1314391599752933, "grad_norm": 0.37440919876098633, "learning_rate": 3.7260840131373255e-05, "loss": 0.6803, "mean_token_accuracy": 0.7923753753304481, "num_tokens": 2381252025.0, "step": 1145 }, { "epoch": 1.1363804817788759, "grad_norm": 0.24887527525424957, "learning_rate": 3.714070087955489e-05, "loss": 0.6899, "mean_token_accuracy": 0.791276590526104, "num_tokens": 2391657101.0, "step": 1150 }, { "epoch": 1.1413218035824584, "grad_norm": 0.2685694098472595, "learning_rate": 3.702019360142181e-05, "loss": 0.6717, "mean_token_accuracy": 0.7948125019669533, "num_tokens": 2402071516.0, "step": 1155 }, { "epoch": 1.1462631253860407, "grad_norm": 0.20285147428512573, "learning_rate": 3.689932194996322e-05, "loss": 0.689, "mean_token_accuracy": 0.7898005157709121, "num_tokens": 2412437076.0, "step": 1160 }, { "epoch": 1.1512044471896232, "grad_norm": 0.6126241683959961, "learning_rate": 3.677808958921375e-05, "loss": 0.6818, "mean_token_accuracy": 0.793350088596344, "num_tokens": 2422854087.0, "step": 1165 }, { "epoch": 1.1561457689932058, "grad_norm": 0.5088633894920349, "learning_rate": 3.665650019414239e-05, "loss": 0.6814, "mean_token_accuracy": 0.7923238545656204, "num_tokens": 2433241676.0, "step": 1170 }, { "epoch": 1.161087090796788, "grad_norm": 0.34475672245025635, "learning_rate": 3.653455745054101e-05, "loss": 0.6835, "mean_token_accuracy": 0.7914653971791268, "num_tokens": 2443645430.0, "step": 1175 }, { "epoch": 1.1660284126003706, "grad_norm": 0.39755386114120483, "learning_rate": 3.641226505491273e-05, "loss": 0.7204, "mean_token_accuracy": 0.7898602604866027, "num_tokens": 2454036623.0, "step": 1180 }, { "epoch": 1.1709697344039531, "grad_norm": 0.35569798946380615, "learning_rate": 3.6289626714359815e-05, "loss": 0.6794, "mean_token_accuracy": 0.7927926614880562, "num_tokens": 2464459884.0, "step": 1185 }, { "epoch": 1.1759110562075354, "grad_norm": 0.36636069416999817, "learning_rate": 3.616664614647129e-05, "loss": 0.6917, "mean_token_accuracy": 0.7891320005059242, "num_tokens": 2474874175.0, "step": 1190 }, { "epoch": 1.180852378011118, "grad_norm": 0.2763560116291046, "learning_rate": 3.60433270792103e-05, "loss": 0.6904, "mean_token_accuracy": 0.791066774725914, "num_tokens": 2485269212.0, "step": 1195 }, { "epoch": 1.1857936998147005, "grad_norm": 0.3080281615257263, "learning_rate": 3.591967325080104e-05, "loss": 0.6788, "mean_token_accuracy": 0.7932477965950966, "num_tokens": 2495685298.0, "step": 1200 }, { "epoch": 1.1907350216182828, "grad_norm": 0.3292888104915619, "learning_rate": 3.5795688409615464e-05, "loss": 0.6702, "mean_token_accuracy": 0.7956741228699684, "num_tokens": 2506084349.0, "step": 1205 }, { "epoch": 1.1956763434218654, "grad_norm": 0.2559851109981537, "learning_rate": 3.567137631405967e-05, "loss": 0.6843, "mean_token_accuracy": 0.7913395538926125, "num_tokens": 2516495078.0, "step": 1210 }, { "epoch": 1.2006176652254479, "grad_norm": 0.22338271141052246, "learning_rate": 3.554674073245996e-05, "loss": 0.6747, "mean_token_accuracy": 0.7942400932312011, "num_tokens": 2526877043.0, "step": 1215 }, { "epoch": 1.2055589870290302, "grad_norm": 0.22159652411937714, "learning_rate": 3.542178544294861e-05, "loss": 0.6806, "mean_token_accuracy": 0.7920667245984078, "num_tokens": 2537271309.0, "step": 1220 }, { "epoch": 1.2105003088326127, "grad_norm": 0.18364663422107697, "learning_rate": 3.529651423334932e-05, "loss": 0.6792, "mean_token_accuracy": 0.7927497401833534, "num_tokens": 2547657770.0, "step": 1225 }, { "epoch": 1.2154416306361953, "grad_norm": 0.17027516663074493, "learning_rate": 3.5170930901062436e-05, "loss": 0.6792, "mean_token_accuracy": 0.7929888218641281, "num_tokens": 2558054962.0, "step": 1230 }, { "epoch": 1.2203829524397776, "grad_norm": 0.2822136878967285, "learning_rate": 3.5045039252949795e-05, "loss": 0.7004, "mean_token_accuracy": 0.7900258719921112, "num_tokens": 2568444606.0, "step": 1235 }, { "epoch": 1.22532427424336, "grad_norm": 0.3928567171096802, "learning_rate": 3.491884310521938e-05, "loss": 0.7061, "mean_token_accuracy": 0.7906710639595985, "num_tokens": 2578834015.0, "step": 1240 }, { "epoch": 1.2302655960469426, "grad_norm": 0.2886963188648224, "learning_rate": 3.479234628330955e-05, "loss": 0.6705, "mean_token_accuracy": 0.7952982589602471, "num_tokens": 2589221980.0, "step": 1245 }, { "epoch": 1.235206917850525, "grad_norm": 0.2760990858078003, "learning_rate": 3.4665552621773165e-05, "loss": 0.6821, "mean_token_accuracy": 0.7916675806045532, "num_tokens": 2599648424.0, "step": 1250 }, { "epoch": 1.2401482396541075, "grad_norm": 0.22312766313552856, "learning_rate": 3.453846596416131e-05, "loss": 0.6707, "mean_token_accuracy": 0.7951520264148713, "num_tokens": 2610054239.0, "step": 1255 }, { "epoch": 1.24508956145769, "grad_norm": 0.20437319576740265, "learning_rate": 3.441109016290679e-05, "loss": 0.6859, "mean_token_accuracy": 0.791742579638958, "num_tokens": 2620401721.0, "step": 1260 }, { "epoch": 1.2500308832612723, "grad_norm": 0.19419367611408234, "learning_rate": 3.428342907920732e-05, "loss": 0.6855, "mean_token_accuracy": 0.7908361554145813, "num_tokens": 2630806140.0, "step": 1265 }, { "epoch": 1.2549722050648549, "grad_norm": 0.1733561009168625, "learning_rate": 3.4155486582908535e-05, "loss": 0.6649, "mean_token_accuracy": 0.7967527687549592, "num_tokens": 2641221965.0, "step": 1270 }, { "epoch": 1.2599135268684374, "grad_norm": 0.18513615429401398, "learning_rate": 3.402726655238665e-05, "loss": 0.6762, "mean_token_accuracy": 0.7932528495788574, "num_tokens": 2651649647.0, "step": 1275 }, { "epoch": 1.2648548486720197, "grad_norm": 0.2018139809370041, "learning_rate": 3.389877287443086e-05, "loss": 0.6746, "mean_token_accuracy": 0.7940797954797745, "num_tokens": 2662073524.0, "step": 1280 }, { "epoch": 1.2697961704756022, "grad_norm": 1.0830366611480713, "learning_rate": 3.37700094441256e-05, "loss": 0.6822, "mean_token_accuracy": 0.7922612771391868, "num_tokens": 2672493582.0, "step": 1285 }, { "epoch": 1.2747374922791848, "grad_norm": 0.2617737948894501, "learning_rate": 3.3640980164732395e-05, "loss": 0.687, "mean_token_accuracy": 0.7908132255077363, "num_tokens": 2682914502.0, "step": 1290 }, { "epoch": 1.279678814082767, "grad_norm": 0.21003659069538116, "learning_rate": 3.351168894757157e-05, "loss": 0.6911, "mean_token_accuracy": 0.7897532656788826, "num_tokens": 2693336417.0, "step": 1295 }, { "epoch": 1.2846201358863496, "grad_norm": 0.21736383438110352, "learning_rate": 3.33821397119037e-05, "loss": 0.6827, "mean_token_accuracy": 0.7918270066380501, "num_tokens": 2703706020.0, "step": 1300 }, { "epoch": 1.2895614576899321, "grad_norm": 0.21032196283340454, "learning_rate": 3.325233638481078e-05, "loss": 0.681, "mean_token_accuracy": 0.792155022919178, "num_tokens": 2714082196.0, "step": 1305 }, { "epoch": 1.2945027794935144, "grad_norm": 0.18472126126289368, "learning_rate": 3.312228290107717e-05, "loss": 0.6812, "mean_token_accuracy": 0.7917923405766487, "num_tokens": 2724452502.0, "step": 1310 }, { "epoch": 1.299444101297097, "grad_norm": 0.17233245074748993, "learning_rate": 3.299198320307036e-05, "loss": 0.6793, "mean_token_accuracy": 0.7928365305066108, "num_tokens": 2734827605.0, "step": 1315 }, { "epoch": 1.3043854231006795, "grad_norm": 0.18500040471553802, "learning_rate": 3.286144124062143e-05, "loss": 0.681, "mean_token_accuracy": 0.7920305550098419, "num_tokens": 2745243883.0, "step": 1320 }, { "epoch": 1.3093267449042618, "grad_norm": 0.2191830277442932, "learning_rate": 3.27306609709053e-05, "loss": 0.6651, "mean_token_accuracy": 0.7963899701833725, "num_tokens": 2755631702.0, "step": 1325 }, { "epoch": 1.3142680667078444, "grad_norm": 0.19073499739170074, "learning_rate": 3.2599646358320874e-05, "loss": 0.6763, "mean_token_accuracy": 0.7934411048889161, "num_tokens": 2766039150.0, "step": 1330 }, { "epoch": 1.3192093885114269, "grad_norm": 0.1874350607395172, "learning_rate": 3.246840137437072e-05, "loss": 0.6588, "mean_token_accuracy": 0.7982455238699913, "num_tokens": 2776460737.0, "step": 1335 }, { "epoch": 1.3241507103150092, "grad_norm": 0.2026117742061615, "learning_rate": 3.233692999754077e-05, "loss": 0.6922, "mean_token_accuracy": 0.7905497863888741, "num_tokens": 2786856139.0, "step": 1340 }, { "epoch": 1.3290920321185917, "grad_norm": 0.19188478589057922, "learning_rate": 3.2205236213179734e-05, "loss": 0.6672, "mean_token_accuracy": 0.7957891747355461, "num_tokens": 2797249229.0, "step": 1345 }, { "epoch": 1.3340333539221743, "grad_norm": 0.20345532894134521, "learning_rate": 3.207332401337823e-05, "loss": 0.6749, "mean_token_accuracy": 0.79354357868433, "num_tokens": 2807623382.0, "step": 1350 }, { "epoch": 1.3389746757257566, "grad_norm": 0.21133118867874146, "learning_rate": 3.194119739684779e-05, "loss": 0.6664, "mean_token_accuracy": 0.7959961429238319, "num_tokens": 2818024758.0, "step": 1355 }, { "epoch": 1.343915997529339, "grad_norm": 0.19039379060268402, "learning_rate": 3.1808860368799674e-05, "loss": 0.6907, "mean_token_accuracy": 0.7892735093832016, "num_tokens": 2828425993.0, "step": 1360 }, { "epoch": 1.3488573193329216, "grad_norm": 0.1846419721841812, "learning_rate": 3.1676316940823426e-05, "loss": 0.6681, "mean_token_accuracy": 0.7955585345625877, "num_tokens": 2838840590.0, "step": 1365 }, { "epoch": 1.353798641136504, "grad_norm": 0.17371538281440735, "learning_rate": 3.154357113076527e-05, "loss": 0.6773, "mean_token_accuracy": 0.7934020891785621, "num_tokens": 2849258862.0, "step": 1370 }, { "epoch": 1.3587399629400865, "grad_norm": 0.18384847044944763, "learning_rate": 3.141062696260636e-05, "loss": 0.6727, "mean_token_accuracy": 0.7949780642986297, "num_tokens": 2859638155.0, "step": 1375 }, { "epoch": 1.363681284743669, "grad_norm": 0.18146079778671265, "learning_rate": 3.1277488466340746e-05, "loss": 0.6735, "mean_token_accuracy": 0.7942267686128617, "num_tokens": 2870014943.0, "step": 1380 }, { "epoch": 1.3686226065472513, "grad_norm": 0.2187763750553131, "learning_rate": 3.11441596778532e-05, "loss": 0.6785, "mean_token_accuracy": 0.793820746243, "num_tokens": 2880394786.0, "step": 1385 }, { "epoch": 1.3735639283508339, "grad_norm": 0.22467771172523499, "learning_rate": 3.1010644638796956e-05, "loss": 0.665, "mean_token_accuracy": 0.7965216785669327, "num_tokens": 2890806878.0, "step": 1390 }, { "epoch": 1.3785052501544164, "grad_norm": 0.17766356468200684, "learning_rate": 3.08769473964711e-05, "loss": 0.6724, "mean_token_accuracy": 0.7943286210298538, "num_tokens": 2901194944.0, "step": 1395 }, { "epoch": 1.3834465719579987, "grad_norm": 0.18762089312076569, "learning_rate": 3.0743072003697946e-05, "loss": 0.6681, "mean_token_accuracy": 0.7956297710537911, "num_tokens": 2911601255.0, "step": 1400 }, { "epoch": 1.3883878937615812, "grad_norm": 0.2048417329788208, "learning_rate": 3.060902251870017e-05, "loss": 0.6734, "mean_token_accuracy": 0.7950954049825668, "num_tokens": 2922021556.0, "step": 1405 }, { "epoch": 1.3933292155651638, "grad_norm": 0.2158186286687851, "learning_rate": 3.0474803004977748e-05, "loss": 0.6787, "mean_token_accuracy": 0.7928067833185196, "num_tokens": 2932410781.0, "step": 1410 }, { "epoch": 1.398270537368746, "grad_norm": 0.25561225414276123, "learning_rate": 3.0340417531184832e-05, "loss": 0.687, "mean_token_accuracy": 0.7903619438409806, "num_tokens": 2942802115.0, "step": 1415 }, { "epoch": 1.4032118591723286, "grad_norm": 0.21336261928081512, "learning_rate": 3.02058701710064e-05, "loss": 0.6757, "mean_token_accuracy": 0.7946703806519508, "num_tokens": 2953223325.0, "step": 1420 }, { "epoch": 1.4081531809759111, "grad_norm": 0.2261497527360916, "learning_rate": 3.007116500303475e-05, "loss": 0.6659, "mean_token_accuracy": 0.7963157400488854, "num_tokens": 2963627568.0, "step": 1425 }, { "epoch": 1.4130945027794934, "grad_norm": 0.20056448876857758, "learning_rate": 2.9936306110645867e-05, "loss": 0.68, "mean_token_accuracy": 0.7920198395848275, "num_tokens": 2974004296.0, "step": 1430 }, { "epoch": 1.418035824583076, "grad_norm": 4.7345452308654785, "learning_rate": 2.980129758187567e-05, "loss": 0.6964, "mean_token_accuracy": 0.7900218307971955, "num_tokens": 2984403518.0, "step": 1435 }, { "epoch": 1.4229771463866585, "grad_norm": 0.1929967850446701, "learning_rate": 2.9666143509296057e-05, "loss": 0.6775, "mean_token_accuracy": 0.7929817318916321, "num_tokens": 2994812742.0, "step": 1440 }, { "epoch": 1.4279184681902408, "grad_norm": 0.20712681114673615, "learning_rate": 2.9530847989890865e-05, "loss": 0.6712, "mean_token_accuracy": 0.7948137015104294, "num_tokens": 3005236938.0, "step": 1445 }, { "epoch": 1.4328597899938234, "grad_norm": 0.19081586599349976, "learning_rate": 2.939541512493167e-05, "loss": 0.6747, "mean_token_accuracy": 0.7935641020536423, "num_tokens": 3015651863.0, "step": 1450 }, { "epoch": 1.4378011117974059, "grad_norm": 0.1719520092010498, "learning_rate": 2.9259849019853458e-05, "loss": 0.6733, "mean_token_accuracy": 0.7942060053348541, "num_tokens": 3026070210.0, "step": 1455 }, { "epoch": 1.4427424336009882, "grad_norm": 0.22839383780956268, "learning_rate": 2.9124153784130193e-05, "loss": 0.6839, "mean_token_accuracy": 0.7911037534475327, "num_tokens": 3036453223.0, "step": 1460 }, { "epoch": 1.4476837554045707, "grad_norm": 0.16924144327640533, "learning_rate": 2.898833353115021e-05, "loss": 0.6688, "mean_token_accuracy": 0.7952630639076232, "num_tokens": 3046881199.0, "step": 1465 }, { "epoch": 1.4526250772081533, "grad_norm": 0.1802128255367279, "learning_rate": 2.8852392378091564e-05, "loss": 0.6756, "mean_token_accuracy": 0.7934101119637489, "num_tokens": 3057283547.0, "step": 1470 }, { "epoch": 1.4575663990117356, "grad_norm": 0.2060791701078415, "learning_rate": 2.8716334445797195e-05, "loss": 0.6644, "mean_token_accuracy": 0.7965093940496445, "num_tokens": 3067693639.0, "step": 1475 }, { "epoch": 1.462507720815318, "grad_norm": 0.17594724893569946, "learning_rate": 2.8580163858650038e-05, "loss": 0.6726, "mean_token_accuracy": 0.7946979507803917, "num_tokens": 3078082156.0, "step": 1480 }, { "epoch": 1.4674490426189006, "grad_norm": 0.18219222128391266, "learning_rate": 2.8443884744447974e-05, "loss": 0.6704, "mean_token_accuracy": 0.7950583577156067, "num_tokens": 3088511458.0, "step": 1485 }, { "epoch": 1.472390364422483, "grad_norm": 0.16935332119464874, "learning_rate": 2.83075012342787e-05, "loss": 0.6775, "mean_token_accuracy": 0.7929304778575897, "num_tokens": 3098888388.0, "step": 1490 }, { "epoch": 1.4773316862260655, "grad_norm": 6.201488018035889, "learning_rate": 2.8171017462394546e-05, "loss": 0.6624, "mean_token_accuracy": 0.7975944474339485, "num_tokens": 3109255613.0, "step": 1495 }, { "epoch": 1.482273008029648, "grad_norm": 0.5504547953605652, "learning_rate": 2.803443756608707e-05, "loss": 0.6719, "mean_token_accuracy": 0.7945632874965668, "num_tokens": 3119631952.0, "step": 1500 }, { "epoch": 1.4872143298332303, "grad_norm": 0.21276314556598663, "learning_rate": 2.789776568556173e-05, "loss": 0.6828, "mean_token_accuracy": 0.791711862385273, "num_tokens": 3130027225.0, "step": 1505 }, { "epoch": 1.4921556516368129, "grad_norm": 0.23470132052898407, "learning_rate": 2.7761005963812337e-05, "loss": 0.6635, "mean_token_accuracy": 0.7968768313527107, "num_tokens": 3140453678.0, "step": 1510 }, { "epoch": 1.4970969734403954, "grad_norm": 0.2083427459001541, "learning_rate": 2.762416254649545e-05, "loss": 0.6643, "mean_token_accuracy": 0.7962050586938858, "num_tokens": 3150827695.0, "step": 1515 }, { "epoch": 1.502038295243978, "grad_norm": 0.2057102769613266, "learning_rate": 2.7487239581804753e-05, "loss": 0.6746, "mean_token_accuracy": 0.7935194700956345, "num_tokens": 3161227090.0, "step": 1520 }, { "epoch": 1.5069796170475602, "grad_norm": 0.1747300773859024, "learning_rate": 2.7350241220345274e-05, "loss": 0.682, "mean_token_accuracy": 0.7913977935910225, "num_tokens": 3171627806.0, "step": 1525 }, { "epoch": 1.5119209388511425, "grad_norm": 0.16887561976909637, "learning_rate": 2.7213171615007566e-05, "loss": 0.6723, "mean_token_accuracy": 0.7942707300186157, "num_tokens": 3182049340.0, "step": 1530 }, { "epoch": 1.516862260654725, "grad_norm": 0.20756720006465912, "learning_rate": 2.7076034920841836e-05, "loss": 0.6701, "mean_token_accuracy": 0.794575659930706, "num_tokens": 3192485069.0, "step": 1535 }, { "epoch": 1.5218035824583076, "grad_norm": 0.18000604212284088, "learning_rate": 2.6938835294931996e-05, "loss": 0.665, "mean_token_accuracy": 0.7962401360273361, "num_tokens": 3202862965.0, "step": 1540 }, { "epoch": 1.5267449042618901, "grad_norm": 0.1736893653869629, "learning_rate": 2.680157689626961e-05, "loss": 0.6747, "mean_token_accuracy": 0.7937764227390289, "num_tokens": 3213276418.0, "step": 1545 }, { "epoch": 1.5316862260654727, "grad_norm": 0.16604070365428925, "learning_rate": 2.6664263885627865e-05, "loss": 0.7027, "mean_token_accuracy": 0.7897267058491707, "num_tokens": 3223670758.0, "step": 1550 }, { "epoch": 1.536627547869055, "grad_norm": 0.1939777284860611, "learning_rate": 2.6526900425435425e-05, "loss": 0.6742, "mean_token_accuracy": 0.7940390273928642, "num_tokens": 3234097782.0, "step": 1555 }, { "epoch": 1.5415688696726373, "grad_norm": 0.18386848270893097, "learning_rate": 2.6389490679650236e-05, "loss": 0.6782, "mean_token_accuracy": 0.7937902882695198, "num_tokens": 3244506053.0, "step": 1560 }, { "epoch": 1.5465101914762198, "grad_norm": 0.20481400191783905, "learning_rate": 2.625203881363334e-05, "loss": 0.6826, "mean_token_accuracy": 0.7955172687768937, "num_tokens": 3254929952.0, "step": 1565 }, { "epoch": 1.5514515132798024, "grad_norm": 0.17962992191314697, "learning_rate": 2.6114548994022576e-05, "loss": 0.6866, "mean_token_accuracy": 0.7924586087465286, "num_tokens": 3265334962.0, "step": 1570 }, { "epoch": 1.5563928350833849, "grad_norm": 0.17282456159591675, "learning_rate": 2.5977025388606286e-05, "loss": 0.6717, "mean_token_accuracy": 0.7941754475235939, "num_tokens": 3275733927.0, "step": 1575 }, { "epoch": 1.5613341568869674, "grad_norm": 0.22649475932121277, "learning_rate": 2.5839472166196977e-05, "loss": 0.6776, "mean_token_accuracy": 0.7933913409709931, "num_tokens": 3286097045.0, "step": 1580 }, { "epoch": 1.5662754786905497, "grad_norm": 0.17881017923355103, "learning_rate": 2.5701893496504953e-05, "loss": 0.6816, "mean_token_accuracy": 0.7914034590125084, "num_tokens": 3296524291.0, "step": 1585 }, { "epoch": 1.571216800494132, "grad_norm": 0.16787505149841309, "learning_rate": 2.5564293550011913e-05, "loss": 0.6684, "mean_token_accuracy": 0.7956008955836296, "num_tokens": 3306916064.0, "step": 1590 }, { "epoch": 1.5761581222977146, "grad_norm": 0.16417014598846436, "learning_rate": 2.5426676497844515e-05, "loss": 0.6595, "mean_token_accuracy": 0.7979877829551697, "num_tokens": 3317295824.0, "step": 1595 }, { "epoch": 1.581099444101297, "grad_norm": 0.1576405167579651, "learning_rate": 2.5289046511647972e-05, "loss": 0.664, "mean_token_accuracy": 0.7966679364442826, "num_tokens": 3327702026.0, "step": 1600 }, { "epoch": 1.5860407659048796, "grad_norm": 0.1823538988828659, "learning_rate": 2.515140776345956e-05, "loss": 0.6742, "mean_token_accuracy": 0.7936360001564026, "num_tokens": 3338102467.0, "step": 1605 }, { "epoch": 1.5909820877084622, "grad_norm": 0.17999356985092163, "learning_rate": 2.501376442558215e-05, "loss": 0.6707, "mean_token_accuracy": 0.7947127357125282, "num_tokens": 3348517918.0, "step": 1610 }, { "epoch": 1.5959234095120445, "grad_norm": 0.15889112651348114, "learning_rate": 2.4876120670457754e-05, "loss": 0.6498, "mean_token_accuracy": 0.8010415449738503, "num_tokens": 3358940680.0, "step": 1615 }, { "epoch": 1.6008647313156268, "grad_norm": 0.16011574864387512, "learning_rate": 2.4738480670541024e-05, "loss": 0.678, "mean_token_accuracy": 0.7928107127547264, "num_tokens": 3369326978.0, "step": 1620 }, { "epoch": 1.6058060531192093, "grad_norm": 0.5964378714561462, "learning_rate": 2.460084859817281e-05, "loss": 0.6659, "mean_token_accuracy": 0.7966329261660576, "num_tokens": 3379732990.0, "step": 1625 }, { "epoch": 1.6107473749227919, "grad_norm": 0.15916962921619415, "learning_rate": 2.4463228625453607e-05, "loss": 0.6696, "mean_token_accuracy": 0.7948744997382164, "num_tokens": 3390132716.0, "step": 1630 }, { "epoch": 1.6156886967263744, "grad_norm": 0.17995233833789825, "learning_rate": 2.4325624924117142e-05, "loss": 0.6807, "mean_token_accuracy": 0.7919593781232834, "num_tokens": 3400548833.0, "step": 1635 }, { "epoch": 1.620630018529957, "grad_norm": 0.17512169480323792, "learning_rate": 2.4188041665403925e-05, "loss": 0.6673, "mean_token_accuracy": 0.7959037974476815, "num_tokens": 3410963830.0, "step": 1640 }, { "epoch": 1.6255713403335392, "grad_norm": 0.17564837634563446, "learning_rate": 2.4050483019934737e-05, "loss": 0.665, "mean_token_accuracy": 0.7963264405727386, "num_tokens": 3421337051.0, "step": 1645 }, { "epoch": 1.6305126621371215, "grad_norm": 0.17777878046035767, "learning_rate": 2.3912953157584304e-05, "loss": 0.6726, "mean_token_accuracy": 0.7940599545836449, "num_tokens": 3431747692.0, "step": 1650 }, { "epoch": 1.635453983940704, "grad_norm": 0.1665956974029541, "learning_rate": 2.3775456247354765e-05, "loss": 0.664, "mean_token_accuracy": 0.7964100062847137, "num_tokens": 3442105623.0, "step": 1655 }, { "epoch": 1.6403953057442866, "grad_norm": 2.0348362922668457, "learning_rate": 2.3637996457249434e-05, "loss": 0.6786, "mean_token_accuracy": 0.7920338585972786, "num_tokens": 3452509019.0, "step": 1660 }, { "epoch": 1.6453366275478691, "grad_norm": 0.17581728100776672, "learning_rate": 2.3500577954146356e-05, "loss": 0.6664, "mean_token_accuracy": 0.795609450340271, "num_tokens": 3462892485.0, "step": 1665 }, { "epoch": 1.6502779493514517, "grad_norm": 0.16040699183940887, "learning_rate": 2.3363204903672002e-05, "loss": 0.6852, "mean_token_accuracy": 0.791227325797081, "num_tokens": 3473319700.0, "step": 1670 }, { "epoch": 1.655219271155034, "grad_norm": 0.1749061942100525, "learning_rate": 2.3225881470075075e-05, "loss": 0.6796, "mean_token_accuracy": 0.7921550258994102, "num_tokens": 3483743564.0, "step": 1675 }, { "epoch": 1.6601605929586163, "grad_norm": 0.17449209094047546, "learning_rate": 2.308861181610017e-05, "loss": 0.6701, "mean_token_accuracy": 0.7946789160370826, "num_tokens": 3494099528.0, "step": 1680 }, { "epoch": 1.6651019147621988, "grad_norm": 0.16798865795135498, "learning_rate": 2.2951400102861664e-05, "loss": 0.6674, "mean_token_accuracy": 0.795718927681446, "num_tokens": 3504480588.0, "step": 1685 }, { "epoch": 1.6700432365657814, "grad_norm": 0.16419844329357147, "learning_rate": 2.2814250489717536e-05, "loss": 0.6771, "mean_token_accuracy": 0.7932738527655602, "num_tokens": 3514901417.0, "step": 1690 }, { "epoch": 1.6749845583693639, "grad_norm": 0.16027598083019257, "learning_rate": 2.267716713414332e-05, "loss": 0.6781, "mean_token_accuracy": 0.7923995703458786, "num_tokens": 3525288452.0, "step": 1695 }, { "epoch": 1.6799258801729464, "grad_norm": 0.16417835652828217, "learning_rate": 2.2540154191606028e-05, "loss": 0.671, "mean_token_accuracy": 0.7948248594999313, "num_tokens": 3535694493.0, "step": 1700 }, { "epoch": 1.6848672019765287, "grad_norm": 0.1824580430984497, "learning_rate": 2.240321581543822e-05, "loss": 0.6724, "mean_token_accuracy": 0.794626134634018, "num_tokens": 3546068917.0, "step": 1705 }, { "epoch": 1.689808523780111, "grad_norm": 0.16408054530620575, "learning_rate": 2.226635615671211e-05, "loss": 0.6643, "mean_token_accuracy": 0.7960787147283555, "num_tokens": 3556447087.0, "step": 1710 }, { "epoch": 1.6947498455836936, "grad_norm": 0.16972069442272186, "learning_rate": 2.2129579364113692e-05, "loss": 0.6594, "mean_token_accuracy": 0.7978889897465706, "num_tokens": 3566863532.0, "step": 1715 }, { "epoch": 1.699691167387276, "grad_norm": 0.1671704351902008, "learning_rate": 2.1992889583817023e-05, "loss": 0.6655, "mean_token_accuracy": 0.7957222029566765, "num_tokens": 3577293405.0, "step": 1720 }, { "epoch": 1.7046324891908586, "grad_norm": 0.21393845975399017, "learning_rate": 2.1856290959358504e-05, "loss": 0.6873, "mean_token_accuracy": 0.793942141532898, "num_tokens": 3587700784.0, "step": 1725 }, { "epoch": 1.7095738109944412, "grad_norm": 0.3951685428619385, "learning_rate": 2.1719787631511302e-05, "loss": 0.6751, "mean_token_accuracy": 0.7932202309370041, "num_tokens": 3598073941.0, "step": 1730 }, { "epoch": 1.7145151327980235, "grad_norm": 0.1711612492799759, "learning_rate": 2.1583383738159812e-05, "loss": 0.6762, "mean_token_accuracy": 0.7930672898888588, "num_tokens": 3608444988.0, "step": 1735 }, { "epoch": 1.7194564546016058, "grad_norm": 0.1807597279548645, "learning_rate": 2.1447083414174212e-05, "loss": 0.6646, "mean_token_accuracy": 0.7962919890880584, "num_tokens": 3618832648.0, "step": 1740 }, { "epoch": 1.7243977764051883, "grad_norm": 0.16536164283752441, "learning_rate": 2.1310890791285168e-05, "loss": 0.6767, "mean_token_accuracy": 0.7927768990397454, "num_tokens": 3629210358.0, "step": 1745 }, { "epoch": 1.7293390982087709, "grad_norm": 0.17039009928703308, "learning_rate": 2.117480999795853e-05, "loss": 0.676, "mean_token_accuracy": 0.7928067699074746, "num_tokens": 3639593453.0, "step": 1750 }, { "epoch": 1.7342804200123534, "grad_norm": 0.1626947671175003, "learning_rate": 2.103884515927023e-05, "loss": 0.675, "mean_token_accuracy": 0.7935326501727105, "num_tokens": 3649995994.0, "step": 1755 }, { "epoch": 1.739221741815936, "grad_norm": 0.15911497175693512, "learning_rate": 2.090300039678119e-05, "loss": 0.6669, "mean_token_accuracy": 0.7955508276820182, "num_tokens": 3660372590.0, "step": 1760 }, { "epoch": 1.7441630636195182, "grad_norm": 0.16413941979408264, "learning_rate": 2.0767279828412442e-05, "loss": 0.6708, "mean_token_accuracy": 0.7943277865648269, "num_tokens": 3670778469.0, "step": 1765 }, { "epoch": 1.7491043854231005, "grad_norm": 0.14908325672149658, "learning_rate": 2.0631687568320258e-05, "loss": 0.6613, "mean_token_accuracy": 0.7970643430948258, "num_tokens": 3681177969.0, "step": 1770 }, { "epoch": 1.754045707226683, "grad_norm": 0.16836871206760406, "learning_rate": 2.0496227726771415e-05, "loss": 0.6851, "mean_token_accuracy": 0.7903355419635772, "num_tokens": 3691608197.0, "step": 1775 }, { "epoch": 1.7589870290302656, "grad_norm": 0.15650926530361176, "learning_rate": 2.0360904410018676e-05, "loss": 0.6669, "mean_token_accuracy": 0.7956840336322785, "num_tokens": 3702023632.0, "step": 1780 }, { "epoch": 1.7639283508338481, "grad_norm": 0.16152553260326385, "learning_rate": 2.0225721720176244e-05, "loss": 0.6722, "mean_token_accuracy": 0.7941829591989518, "num_tokens": 3712430248.0, "step": 1785 }, { "epoch": 1.7688696726374307, "grad_norm": 0.18243561685085297, "learning_rate": 2.009068375509544e-05, "loss": 0.675, "mean_token_accuracy": 0.7927650153636933, "num_tokens": 3722829050.0, "step": 1790 }, { "epoch": 1.773810994441013, "grad_norm": 0.18079873919487, "learning_rate": 1.995579460824048e-05, "loss": 0.6764, "mean_token_accuracy": 0.7926057055592537, "num_tokens": 3733231074.0, "step": 1795 }, { "epoch": 1.7787523162445953, "grad_norm": 0.17142578959465027, "learning_rate": 1.982105836856441e-05, "loss": 0.669, "mean_token_accuracy": 0.7976492524147034, "num_tokens": 3743604338.0, "step": 1800 }, { "epoch": 1.7836936380481778, "grad_norm": 0.17926448583602905, "learning_rate": 1.9686479120385087e-05, "loss": 0.6609, "mean_token_accuracy": 0.7971063464879989, "num_tokens": 3754021152.0, "step": 1805 }, { "epoch": 1.7886349598517604, "grad_norm": 0.16492047905921936, "learning_rate": 1.9552060943261456e-05, "loss": 0.672, "mean_token_accuracy": 0.7941944986581803, "num_tokens": 3764394412.0, "step": 1810 }, { "epoch": 1.7935762816553429, "grad_norm": 0.1678888201713562, "learning_rate": 1.941780791186985e-05, "loss": 0.6674, "mean_token_accuracy": 0.7951693952083587, "num_tokens": 3774799854.0, "step": 1815 }, { "epoch": 1.7985176034589254, "grad_norm": 0.17829807102680206, "learning_rate": 1.928372409588043e-05, "loss": 0.6691, "mean_token_accuracy": 0.7948068961501121, "num_tokens": 3785193247.0, "step": 1820 }, { "epoch": 1.8034589252625077, "grad_norm": 0.16471394896507263, "learning_rate": 1.9149813559833897e-05, "loss": 0.6637, "mean_token_accuracy": 0.7964715838432312, "num_tokens": 3795565687.0, "step": 1825 }, { "epoch": 1.80840024706609, "grad_norm": 0.16672302782535553, "learning_rate": 1.9016080363018214e-05, "loss": 0.6655, "mean_token_accuracy": 0.7956433966755867, "num_tokens": 3805974954.0, "step": 1830 }, { "epoch": 1.8133415688696726, "grad_norm": 0.1635795533657074, "learning_rate": 1.8882528559345604e-05, "loss": 0.6752, "mean_token_accuracy": 0.7931825637817382, "num_tokens": 3816394445.0, "step": 1835 }, { "epoch": 1.818282890673255, "grad_norm": 0.1563975065946579, "learning_rate": 1.8749162197229626e-05, "loss": 0.6736, "mean_token_accuracy": 0.7934539332985878, "num_tokens": 3826795756.0, "step": 1840 }, { "epoch": 1.8232242124768376, "grad_norm": 0.1613382250070572, "learning_rate": 1.8615985319462486e-05, "loss": 0.6666, "mean_token_accuracy": 0.7956288605928421, "num_tokens": 3837209980.0, "step": 1845 }, { "epoch": 1.8281655342804202, "grad_norm": 0.18940430879592896, "learning_rate": 1.848300196309245e-05, "loss": 0.675, "mean_token_accuracy": 0.7961811035871506, "num_tokens": 3847604542.0, "step": 1850 }, { "epoch": 1.8331068560840025, "grad_norm": 0.1619659811258316, "learning_rate": 1.8350216159301483e-05, "loss": 0.6665, "mean_token_accuracy": 0.7956989362835885, "num_tokens": 3858008174.0, "step": 1855 }, { "epoch": 1.8380481778875848, "grad_norm": 0.17836914956569672, "learning_rate": 1.821763193328309e-05, "loss": 0.6719, "mean_token_accuracy": 0.7940696641802788, "num_tokens": 3868405303.0, "step": 1860 }, { "epoch": 1.8429894996911673, "grad_norm": 0.1670689433813095, "learning_rate": 1.8085253304120213e-05, "loss": 0.6555, "mean_token_accuracy": 0.7987230405211448, "num_tokens": 3878781192.0, "step": 1865 }, { "epoch": 1.8479308214947499, "grad_norm": 0.17044250667095184, "learning_rate": 1.7953084284663486e-05, "loss": 0.6698, "mean_token_accuracy": 0.794747294485569, "num_tokens": 3889178794.0, "step": 1870 }, { "epoch": 1.8528721432983324, "grad_norm": 0.16052749752998352, "learning_rate": 1.782112888140952e-05, "loss": 0.6691, "mean_token_accuracy": 0.7945168077945709, "num_tokens": 3899590027.0, "step": 1875 }, { "epoch": 1.857813465101915, "grad_norm": 0.15389318764209747, "learning_rate": 1.7689391094379534e-05, "loss": 0.668, "mean_token_accuracy": 0.7955880552530289, "num_tokens": 3909974381.0, "step": 1880 }, { "epoch": 1.8627547869054972, "grad_norm": 0.1830136477947235, "learning_rate": 1.7557874916997996e-05, "loss": 0.6871, "mean_token_accuracy": 0.7958386242389679, "num_tokens": 3920358396.0, "step": 1885 }, { "epoch": 1.8676961087090795, "grad_norm": 0.14622418582439423, "learning_rate": 1.7426584335971658e-05, "loss": 0.6667, "mean_token_accuracy": 0.7957353934645652, "num_tokens": 3930754434.0, "step": 1890 }, { "epoch": 1.872637430512662, "grad_norm": 0.15185917913913727, "learning_rate": 1.7295523331168673e-05, "loss": 0.6626, "mean_token_accuracy": 0.7965810731053352, "num_tokens": 3941174197.0, "step": 1895 }, { "epoch": 1.8775787523162446, "grad_norm": 0.1555299609899521, "learning_rate": 1.7164695875497928e-05, "loss": 0.6641, "mean_token_accuracy": 0.7962276056408882, "num_tokens": 3951578601.0, "step": 1900 }, { "epoch": 1.8825200741198271, "grad_norm": 0.1599041372537613, "learning_rate": 1.703410593478867e-05, "loss": 0.6737, "mean_token_accuracy": 0.793473419547081, "num_tokens": 3961967546.0, "step": 1905 }, { "epoch": 1.8874613959234097, "grad_norm": 0.15561096370220184, "learning_rate": 1.6903757467670215e-05, "loss": 0.6707, "mean_token_accuracy": 0.7944751814007759, "num_tokens": 3972373879.0, "step": 1910 }, { "epoch": 1.892402717726992, "grad_norm": 0.14810524880886078, "learning_rate": 1.6773654425452007e-05, "loss": 0.6618, "mean_token_accuracy": 0.7968007609248161, "num_tokens": 3982774226.0, "step": 1915 }, { "epoch": 1.8973440395305743, "grad_norm": 0.15639038383960724, "learning_rate": 1.6643800752003824e-05, "loss": 0.6709, "mean_token_accuracy": 0.7942842915654182, "num_tokens": 3993189170.0, "step": 1920 }, { "epoch": 1.9022853613341568, "grad_norm": 0.15391488373279572, "learning_rate": 1.6514200383636192e-05, "loss": 0.6662, "mean_token_accuracy": 0.7959662467241287, "num_tokens": 4003600898.0, "step": 1925 }, { "epoch": 1.9072266831377394, "grad_norm": 0.15802597999572754, "learning_rate": 1.638485724898112e-05, "loss": 0.662, "mean_token_accuracy": 0.7968731462955475, "num_tokens": 4013990867.0, "step": 1930 }, { "epoch": 1.9121680049413219, "grad_norm": 0.14575393497943878, "learning_rate": 1.6255775268872963e-05, "loss": 0.66, "mean_token_accuracy": 0.797378021478653, "num_tokens": 4024405781.0, "step": 1935 }, { "epoch": 1.9171093267449044, "grad_norm": 0.17408694326877594, "learning_rate": 1.6126958356229604e-05, "loss": 0.6541, "mean_token_accuracy": 0.7994588032364845, "num_tokens": 4034815424.0, "step": 1940 }, { "epoch": 1.9220506485484867, "grad_norm": 0.16843891143798828, "learning_rate": 1.5998410415933794e-05, "loss": 0.6704, "mean_token_accuracy": 0.7947205558419228, "num_tokens": 4045217255.0, "step": 1945 }, { "epoch": 1.926991970352069, "grad_norm": 0.16394077241420746, "learning_rate": 1.587013534471485e-05, "loss": 0.6638, "mean_token_accuracy": 0.7960691630840302, "num_tokens": 4055614692.0, "step": 1950 }, { "epoch": 1.9319332921556516, "grad_norm": 1.020789623260498, "learning_rate": 1.5742137031030436e-05, "loss": 0.6735, "mean_token_accuracy": 0.7944559305906296, "num_tokens": 4066027413.0, "step": 1955 }, { "epoch": 1.936874613959234, "grad_norm": 0.22389452159404755, "learning_rate": 1.5614419354948783e-05, "loss": 0.6778, "mean_token_accuracy": 0.7925828203558922, "num_tokens": 4076425169.0, "step": 1960 }, { "epoch": 1.9418159357628166, "grad_norm": 0.17207863926887512, "learning_rate": 1.548698618803104e-05, "loss": 0.6627, "mean_token_accuracy": 0.7969025999307633, "num_tokens": 4086832157.0, "step": 1965 }, { "epoch": 1.9467572575663992, "grad_norm": 0.16408152878284454, "learning_rate": 1.535984139321386e-05, "loss": 0.6611, "mean_token_accuracy": 0.7972674682736397, "num_tokens": 4097246660.0, "step": 1970 }, { "epoch": 1.9516985793699815, "grad_norm": 0.1568727344274521, "learning_rate": 1.5232988824692406e-05, "loss": 0.6543, "mean_token_accuracy": 0.7987544611096382, "num_tokens": 4107628644.0, "step": 1975 }, { "epoch": 1.9566399011735638, "grad_norm": 0.14633381366729736, "learning_rate": 1.5106432327803417e-05, "loss": 0.6626, "mean_token_accuracy": 0.7965556159615517, "num_tokens": 4118038279.0, "step": 1980 }, { "epoch": 1.9615812229771463, "grad_norm": 0.17039474844932556, "learning_rate": 1.4980175738908711e-05, "loss": 0.6698, "mean_token_accuracy": 0.79447071403265, "num_tokens": 4128440012.0, "step": 1985 }, { "epoch": 1.9665225447807289, "grad_norm": 0.17251941561698914, "learning_rate": 1.4854222885278842e-05, "loss": 0.7007, "mean_token_accuracy": 0.7944662183523178, "num_tokens": 4138816871.0, "step": 1990 }, { "epoch": 1.9714638665843114, "grad_norm": 0.17853756248950958, "learning_rate": 1.4728577584977118e-05, "loss": 0.673, "mean_token_accuracy": 0.793654565513134, "num_tokens": 4149219587.0, "step": 1995 }, { "epoch": 1.976405188387894, "grad_norm": 0.15176187455654144, "learning_rate": 1.4603243646743859e-05, "loss": 0.6587, "mean_token_accuracy": 0.797548696398735, "num_tokens": 4159615480.0, "step": 2000 }, { "epoch": 1.9813465101914762, "grad_norm": 0.14701396226882935, "learning_rate": 1.4478224869880908e-05, "loss": 0.6653, "mean_token_accuracy": 0.7960563406348229, "num_tokens": 4170006610.0, "step": 2005 }, { "epoch": 1.9862878319950585, "grad_norm": 0.1599283367395401, "learning_rate": 1.4353525044136514e-05, "loss": 0.6673, "mean_token_accuracy": 0.7951024606823921, "num_tokens": 4180405937.0, "step": 2010 }, { "epoch": 1.991229153798641, "grad_norm": 0.16183249652385712, "learning_rate": 1.4229147949590393e-05, "loss": 0.6651, "mean_token_accuracy": 0.7957586273550987, "num_tokens": 4190804735.0, "step": 2015 }, { "epoch": 1.9961704756022236, "grad_norm": 0.1604955494403839, "learning_rate": 1.4105097356539203e-05, "loss": 0.6761, "mean_token_accuracy": 0.794134946167469, "num_tokens": 4201192556.0, "step": 2020 }, { "epoch": 2.0009882643607164, "grad_norm": 0.2795203924179077, "learning_rate": 1.3981377025382186e-05, "loss": 0.665, "mean_token_accuracy": 0.7965775315578167, "num_tokens": 4211344649.0, "step": 2025 }, { "epoch": 2.005929586164299, "grad_norm": 0.19513043761253357, "learning_rate": 1.385799070650724e-05, "loss": 0.6361, "mean_token_accuracy": 0.8029673993587494, "num_tokens": 4221754280.0, "step": 2030 }, { "epoch": 2.0108709079678815, "grad_norm": 0.18438509106636047, "learning_rate": 1.3734942140177201e-05, "loss": 0.6424, "mean_token_accuracy": 0.8008751258254051, "num_tokens": 4232151391.0, "step": 2035 }, { "epoch": 2.015812229771464, "grad_norm": 0.18968532979488373, "learning_rate": 1.3612235056416442e-05, "loss": 0.6534, "mean_token_accuracy": 0.8006501421332359, "num_tokens": 4242553642.0, "step": 2040 }, { "epoch": 2.020753551575046, "grad_norm": 0.1811179518699646, "learning_rate": 1.3489873174897862e-05, "loss": 0.647, "mean_token_accuracy": 0.7995884954929352, "num_tokens": 4252982344.0, "step": 2045 }, { "epoch": 2.0256948733786286, "grad_norm": 0.17677195370197296, "learning_rate": 1.3367860204830063e-05, "loss": 0.6405, "mean_token_accuracy": 0.803388424217701, "num_tokens": 4263377952.0, "step": 2050 }, { "epoch": 2.030636195182211, "grad_norm": 0.1655615121126175, "learning_rate": 1.3246199844844964e-05, "loss": 0.6291, "mean_token_accuracy": 0.8047855436801911, "num_tokens": 4273790892.0, "step": 2055 }, { "epoch": 2.0355775169857937, "grad_norm": 0.16419780254364014, "learning_rate": 1.3124895782885668e-05, "loss": 0.6324, "mean_token_accuracy": 0.8037776455283165, "num_tokens": 4284212585.0, "step": 2060 }, { "epoch": 2.0405188387893762, "grad_norm": 0.18495163321495056, "learning_rate": 1.300395169609463e-05, "loss": 0.6434, "mean_token_accuracy": 0.8021784752607346, "num_tokens": 4294596888.0, "step": 2065 }, { "epoch": 2.0454601605929588, "grad_norm": 0.19491487741470337, "learning_rate": 1.2883371250702264e-05, "loss": 0.6454, "mean_token_accuracy": 0.802114674448967, "num_tokens": 4305018504.0, "step": 2070 }, { "epoch": 2.0504014823965413, "grad_norm": 0.1656065136194229, "learning_rate": 1.2763158101915718e-05, "loss": 0.6697, "mean_token_accuracy": 0.7992842480540275, "num_tokens": 4315416503.0, "step": 2075 }, { "epoch": 2.0553428042001234, "grad_norm": 0.17609845101833344, "learning_rate": 1.2643315893808172e-05, "loss": 0.631, "mean_token_accuracy": 0.8045080795884132, "num_tokens": 4325842711.0, "step": 2080 }, { "epoch": 2.060284126003706, "grad_norm": 0.16532674431800842, "learning_rate": 1.252384825920827e-05, "loss": 0.63, "mean_token_accuracy": 0.8041578352451324, "num_tokens": 4336237520.0, "step": 2085 }, { "epoch": 2.0652254478072885, "grad_norm": 0.16881072521209717, "learning_rate": 1.240475881959008e-05, "loss": 0.6342, "mean_token_accuracy": 0.8036715492606163, "num_tokens": 4346647661.0, "step": 2090 }, { "epoch": 2.070166769610871, "grad_norm": 0.1476944535970688, "learning_rate": 1.2286051184963273e-05, "loss": 0.6305, "mean_token_accuracy": 0.8041099295020103, "num_tokens": 4357023546.0, "step": 2095 }, { "epoch": 2.0751080914144535, "grad_norm": 0.15514177083969116, "learning_rate": 1.2167728953763714e-05, "loss": 0.6311, "mean_token_accuracy": 0.8037099212408065, "num_tokens": 4367426888.0, "step": 2100 }, { "epoch": 2.0800494132180356, "grad_norm": 0.16117146611213684, "learning_rate": 1.2049795712744336e-05, "loss": 0.6312, "mean_token_accuracy": 0.8039769634604454, "num_tokens": 4377836621.0, "step": 2105 }, { "epoch": 2.084990735021618, "grad_norm": 0.15188126266002655, "learning_rate": 1.1932255036866458e-05, "loss": 0.6323, "mean_token_accuracy": 0.8038640111684799, "num_tokens": 4388192404.0, "step": 2110 }, { "epoch": 2.0899320568252007, "grad_norm": 0.15676775574684143, "learning_rate": 1.181511048919141e-05, "loss": 0.6466, "mean_token_accuracy": 0.799469843506813, "num_tokens": 4398610857.0, "step": 2115 }, { "epoch": 2.094873378628783, "grad_norm": 0.2041247934103012, "learning_rate": 1.1698365620772523e-05, "loss": 0.6429, "mean_token_accuracy": 0.800723274052143, "num_tokens": 4409031770.0, "step": 2120 }, { "epoch": 2.0998147004323657, "grad_norm": 0.1508302092552185, "learning_rate": 1.1582023970547464e-05, "loss": 0.6307, "mean_token_accuracy": 0.8041250929236412, "num_tokens": 4419434453.0, "step": 2125 }, { "epoch": 2.1047560222359483, "grad_norm": 0.16123157739639282, "learning_rate": 1.1466089065230968e-05, "loss": 0.6328, "mean_token_accuracy": 0.8038171142339706, "num_tokens": 4429862339.0, "step": 2130 }, { "epoch": 2.109697344039531, "grad_norm": 0.15946310758590698, "learning_rate": 1.1350564419207953e-05, "loss": 0.6439, "mean_token_accuracy": 0.8009535774588585, "num_tokens": 4440241343.0, "step": 2135 }, { "epoch": 2.114638665843113, "grad_norm": 0.15749602019786835, "learning_rate": 1.123545353442696e-05, "loss": 0.6322, "mean_token_accuracy": 0.8037322282791137, "num_tokens": 4450633333.0, "step": 2140 }, { "epoch": 2.1195799876466954, "grad_norm": 1.4616570472717285, "learning_rate": 1.112075990029398e-05, "loss": 0.652, "mean_token_accuracy": 0.8011793598532677, "num_tokens": 4461024621.0, "step": 2145 }, { "epoch": 2.124521309450278, "grad_norm": 2.4609215259552, "learning_rate": 1.1006486993566774e-05, "loss": 0.6475, "mean_token_accuracy": 0.8001800090074539, "num_tokens": 4471419630.0, "step": 2150 }, { "epoch": 2.1294626312538605, "grad_norm": 0.15085525810718536, "learning_rate": 1.089263827824934e-05, "loss": 0.654, "mean_token_accuracy": 0.7973979458212852, "num_tokens": 4481813205.0, "step": 2155 }, { "epoch": 2.134403953057443, "grad_norm": 0.16060177981853485, "learning_rate": 1.0779217205487025e-05, "loss": 0.6337, "mean_token_accuracy": 0.8035554558038711, "num_tokens": 4492232981.0, "step": 2160 }, { "epoch": 2.139345274861025, "grad_norm": 0.14307548105716705, "learning_rate": 1.0666227213461827e-05, "loss": 0.6278, "mean_token_accuracy": 0.8049931466579437, "num_tokens": 4502573601.0, "step": 2165 }, { "epoch": 2.1442865966646076, "grad_norm": 0.1611761599779129, "learning_rate": 1.0553671727288243e-05, "loss": 0.6347, "mean_token_accuracy": 0.8029251515865325, "num_tokens": 4512976405.0, "step": 2170 }, { "epoch": 2.14922791846819, "grad_norm": 0.17778734862804413, "learning_rate": 1.044155415890937e-05, "loss": 0.6493, "mean_token_accuracy": 0.8001389935612678, "num_tokens": 4523382809.0, "step": 2175 }, { "epoch": 2.1541692402717727, "grad_norm": 0.15306136012077332, "learning_rate": 1.0329877906993537e-05, "loss": 0.6389, "mean_token_accuracy": 0.8018682345747947, "num_tokens": 4533792085.0, "step": 2180 }, { "epoch": 2.1591105620753552, "grad_norm": 0.15490379929542542, "learning_rate": 1.0218646356831269e-05, "loss": 0.638, "mean_token_accuracy": 0.8019036680459977, "num_tokens": 4544217327.0, "step": 2185 }, { "epoch": 2.1640518838789378, "grad_norm": 0.16301098465919495, "learning_rate": 1.0107862880232608e-05, "loss": 0.636, "mean_token_accuracy": 0.803830087184906, "num_tokens": 4554626069.0, "step": 2190 }, { "epoch": 2.1689932056825203, "grad_norm": 0.15061938762664795, "learning_rate": 9.997530835425e-06, "loss": 0.6367, "mean_token_accuracy": 0.8023175925016404, "num_tokens": 4565012403.0, "step": 2195 }, { "epoch": 2.1739345274861024, "grad_norm": 0.15271735191345215, "learning_rate": 9.887653566951405e-06, "loss": 0.6386, "mean_token_accuracy": 0.8026603817939758, "num_tokens": 4575390772.0, "step": 2200 }, { "epoch": 2.178875849289685, "grad_norm": 0.16427479684352875, "learning_rate": 9.778234405568972e-06, "loss": 0.6448, "mean_token_accuracy": 0.8001790478825569, "num_tokens": 4585791161.0, "step": 2205 }, { "epoch": 2.1838171710932675, "grad_norm": 0.14360326528549194, "learning_rate": 9.669276668148056e-06, "loss": 0.6203, "mean_token_accuracy": 0.8070179298520088, "num_tokens": 4596212247.0, "step": 2210 }, { "epoch": 2.18875849289685, "grad_norm": 0.15808238089084625, "learning_rate": 9.560783657571642e-06, "loss": 0.6616, "mean_token_accuracy": 0.7990046426653862, "num_tokens": 4606596210.0, "step": 2215 }, { "epoch": 2.1936998147004325, "grad_norm": 0.14959020912647247, "learning_rate": 9.452758662635283e-06, "loss": 0.6398, "mean_token_accuracy": 0.801710894703865, "num_tokens": 4616980639.0, "step": 2220 }, { "epoch": 2.1986411365040146, "grad_norm": 0.15106073021888733, "learning_rate": 9.34520495794734e-06, "loss": 0.6333, "mean_token_accuracy": 0.803425170481205, "num_tokens": 4627395951.0, "step": 2225 }, { "epoch": 2.203582458307597, "grad_norm": 0.1476610153913498, "learning_rate": 9.238125803829775e-06, "loss": 0.6412, "mean_token_accuracy": 0.800981068611145, "num_tokens": 4637804875.0, "step": 2230 }, { "epoch": 2.2085237801111797, "grad_norm": 0.1471738964319229, "learning_rate": 9.131524446219272e-06, "loss": 0.6258, "mean_token_accuracy": 0.8057189077138901, "num_tokens": 4648226582.0, "step": 2235 }, { "epoch": 2.213465101914762, "grad_norm": 0.14815585315227509, "learning_rate": 9.025404116568872e-06, "loss": 0.6284, "mean_token_accuracy": 0.8047678738832473, "num_tokens": 4658619856.0, "step": 2240 }, { "epoch": 2.2184064237183447, "grad_norm": 0.15571852028369904, "learning_rate": 8.919768031750025e-06, "loss": 0.645, "mean_token_accuracy": 0.7999783381819725, "num_tokens": 4669035963.0, "step": 2245 }, { "epoch": 2.2233477455219273, "grad_norm": 0.3053502142429352, "learning_rate": 8.814619393955023e-06, "loss": 0.6337, "mean_token_accuracy": 0.8034562259912491, "num_tokens": 4679422184.0, "step": 2250 }, { "epoch": 2.22828906732551, "grad_norm": 0.14345481991767883, "learning_rate": 8.709961390599997e-06, "loss": 0.6421, "mean_token_accuracy": 0.8010098516941071, "num_tokens": 4689831886.0, "step": 2255 }, { "epoch": 2.233230389129092, "grad_norm": 0.14696592092514038, "learning_rate": 8.605797194228234e-06, "loss": 0.66, "mean_token_accuracy": 0.7998679220676422, "num_tokens": 4700259043.0, "step": 2260 }, { "epoch": 2.2381717109326744, "grad_norm": 0.14800859987735748, "learning_rate": 8.502129962414068e-06, "loss": 0.6385, "mean_token_accuracy": 0.8022643268108368, "num_tokens": 4710648425.0, "step": 2265 }, { "epoch": 2.243113032736257, "grad_norm": 3.7514922618865967, "learning_rate": 8.39896283766711e-06, "loss": 0.6549, "mean_token_accuracy": 0.8020106881856919, "num_tokens": 4721011109.0, "step": 2270 }, { "epoch": 2.2480543545398395, "grad_norm": 0.14842398464679718, "learning_rate": 8.296298947337029e-06, "loss": 0.6297, "mean_token_accuracy": 0.8041480585932732, "num_tokens": 4731421318.0, "step": 2275 }, { "epoch": 2.252995676343422, "grad_norm": 0.14325624704360962, "learning_rate": 8.194141403518709e-06, "loss": 0.6421, "mean_token_accuracy": 0.8011141166090965, "num_tokens": 4741847592.0, "step": 2280 }, { "epoch": 2.257936998147004, "grad_norm": 0.1525031179189682, "learning_rate": 8.092493302957935e-06, "loss": 0.6503, "mean_token_accuracy": 0.800957977771759, "num_tokens": 4752234243.0, "step": 2285 }, { "epoch": 2.2628783199505866, "grad_norm": 0.14679549634456635, "learning_rate": 7.991357726957542e-06, "loss": 0.6405, "mean_token_accuracy": 0.8013312935829162, "num_tokens": 4762670151.0, "step": 2290 }, { "epoch": 2.267819641754169, "grad_norm": 0.14465415477752686, "learning_rate": 7.890737741283952e-06, "loss": 0.6334, "mean_token_accuracy": 0.8034225985407829, "num_tokens": 4773051063.0, "step": 2295 }, { "epoch": 2.2727609635577517, "grad_norm": 0.15005916357040405, "learning_rate": 7.790636396074308e-06, "loss": 0.6363, "mean_token_accuracy": 0.8025058448314667, "num_tokens": 4783445371.0, "step": 2300 }, { "epoch": 2.2777022853613342, "grad_norm": 0.14713406562805176, "learning_rate": 7.691056725743958e-06, "loss": 0.6377, "mean_token_accuracy": 0.8021243140101433, "num_tokens": 4793870222.0, "step": 2305 }, { "epoch": 2.2826436071649168, "grad_norm": 1.410751461982727, "learning_rate": 7.5920017488945145e-06, "loss": 0.6391, "mean_token_accuracy": 0.8034495621919632, "num_tokens": 4804277953.0, "step": 2310 }, { "epoch": 2.2875849289684993, "grad_norm": 0.14937101304531097, "learning_rate": 7.4934744682223085e-06, "loss": 0.6341, "mean_token_accuracy": 0.8032669469714164, "num_tokens": 4814689141.0, "step": 2315 }, { "epoch": 2.2925262507720814, "grad_norm": 0.1451476365327835, "learning_rate": 7.395477870427387e-06, "loss": 0.6317, "mean_token_accuracy": 0.8038819208741188, "num_tokens": 4825078906.0, "step": 2320 }, { "epoch": 2.297467572575664, "grad_norm": 0.14649330079555511, "learning_rate": 7.2980149261229955e-06, "loss": 0.6317, "mean_token_accuracy": 0.8046122491359711, "num_tokens": 4835469304.0, "step": 2325 }, { "epoch": 2.3024088943792465, "grad_norm": 0.14319194853305817, "learning_rate": 7.201088589745503e-06, "loss": 0.6398, "mean_token_accuracy": 0.8018277272582054, "num_tokens": 4845886048.0, "step": 2330 }, { "epoch": 2.307350216182829, "grad_norm": 0.15091899037361145, "learning_rate": 7.104701799464855e-06, "loss": 0.638, "mean_token_accuracy": 0.8017465516924858, "num_tokens": 4856217841.0, "step": 2335 }, { "epoch": 2.3122915379864115, "grad_norm": 0.14482976496219635, "learning_rate": 7.0088574770954874e-06, "loss": 0.636, "mean_token_accuracy": 0.8024977222084999, "num_tokens": 4866616897.0, "step": 2340 }, { "epoch": 2.3172328597899936, "grad_norm": 0.15366806089878082, "learning_rate": 6.913558528007791e-06, "loss": 0.6251, "mean_token_accuracy": 0.8055261209607124, "num_tokens": 4876985578.0, "step": 2345 }, { "epoch": 2.322174181593576, "grad_norm": 0.16261132061481476, "learning_rate": 6.818807841040001e-06, "loss": 0.6347, "mean_token_accuracy": 0.802975694835186, "num_tokens": 4887408783.0, "step": 2350 }, { "epoch": 2.3271155033971587, "grad_norm": 0.141385018825531, "learning_rate": 6.724608288410661e-06, "loss": 0.6283, "mean_token_accuracy": 0.8047755777835846, "num_tokens": 4897810849.0, "step": 2355 }, { "epoch": 2.332056825200741, "grad_norm": 0.14989116787910461, "learning_rate": 6.630962725631543e-06, "loss": 0.6246, "mean_token_accuracy": 0.8062204629182815, "num_tokens": 4908237163.0, "step": 2360 }, { "epoch": 2.3369981470043237, "grad_norm": 0.1412249654531479, "learning_rate": 6.537873991421068e-06, "loss": 0.6255, "mean_token_accuracy": 0.8058675542473793, "num_tokens": 4918628428.0, "step": 2365 }, { "epoch": 2.3419394688079063, "grad_norm": 0.14880429208278656, "learning_rate": 6.4453449076182946e-06, "loss": 0.6306, "mean_token_accuracy": 0.8041801512241363, "num_tokens": 4929027480.0, "step": 2370 }, { "epoch": 2.346880790611489, "grad_norm": 0.41716793179512024, "learning_rate": 6.35337827909733e-06, "loss": 0.6426, "mean_token_accuracy": 0.8006162449717522, "num_tokens": 4939428046.0, "step": 2375 }, { "epoch": 2.351822112415071, "grad_norm": 0.14712265133857727, "learning_rate": 6.2619768936823616e-06, "loss": 0.6429, "mean_token_accuracy": 0.8013240069150924, "num_tokens": 4949815912.0, "step": 2380 }, { "epoch": 2.3567634342186534, "grad_norm": 0.17049653828144073, "learning_rate": 6.171143522063089e-06, "loss": 0.6254, "mean_token_accuracy": 0.8054996937513351, "num_tokens": 4960242136.0, "step": 2385 }, { "epoch": 2.361704756022236, "grad_norm": 0.14510297775268555, "learning_rate": 6.08088091771078e-06, "loss": 0.6348, "mean_token_accuracy": 0.803258067369461, "num_tokens": 4970639720.0, "step": 2390 }, { "epoch": 2.3666460778258185, "grad_norm": 2.2291486263275146, "learning_rate": 5.991191816794794e-06, "loss": 0.6436, "mean_token_accuracy": 0.8042269602417946, "num_tokens": 4981043766.0, "step": 2395 }, { "epoch": 2.371587399629401, "grad_norm": 4.8146796226501465, "learning_rate": 5.902078938099611e-06, "loss": 0.6346, "mean_token_accuracy": 0.8034033760428428, "num_tokens": 4991455153.0, "step": 2400 }, { "epoch": 2.376528721432983, "grad_norm": 0.14338742196559906, "learning_rate": 5.813544982942465e-06, "loss": 0.6359, "mean_token_accuracy": 0.8027326971292496, "num_tokens": 5001859050.0, "step": 2405 }, { "epoch": 2.3814700432365656, "grad_norm": 0.13393332064151764, "learning_rate": 5.725592635091398e-06, "loss": 0.6301, "mean_token_accuracy": 0.8042582124471664, "num_tokens": 5012262599.0, "step": 2410 }, { "epoch": 2.386411365040148, "grad_norm": 0.14737042784690857, "learning_rate": 5.638224560683966e-06, "loss": 0.64, "mean_token_accuracy": 0.8013565048575402, "num_tokens": 5022644887.0, "step": 2415 }, { "epoch": 2.3913526868437307, "grad_norm": 0.14444060623645782, "learning_rate": 5.5514434081463815e-06, "loss": 0.6318, "mean_token_accuracy": 0.8037166804075241, "num_tokens": 5033062786.0, "step": 2420 }, { "epoch": 2.3962940086473132, "grad_norm": 0.14431585371494293, "learning_rate": 5.465251808113247e-06, "loss": 0.6387, "mean_token_accuracy": 0.8019110590219498, "num_tokens": 5043484013.0, "step": 2425 }, { "epoch": 2.4012353304508958, "grad_norm": 0.14653638005256653, "learning_rate": 5.379652373347793e-06, "loss": 0.6364, "mean_token_accuracy": 0.8026723086833953, "num_tokens": 5053900530.0, "step": 2430 }, { "epoch": 2.4061766522544783, "grad_norm": 0.1418423056602478, "learning_rate": 5.294647698662686e-06, "loss": 0.6325, "mean_token_accuracy": 0.8036406919360161, "num_tokens": 5064311977.0, "step": 2435 }, { "epoch": 2.4111179740580604, "grad_norm": 0.13966286182403564, "learning_rate": 5.210240360841392e-06, "loss": 0.6395, "mean_token_accuracy": 0.8020109251141548, "num_tokens": 5074703925.0, "step": 2440 }, { "epoch": 2.416059295861643, "grad_norm": 0.14265727996826172, "learning_rate": 5.1264329185600285e-06, "loss": 0.6488, "mean_token_accuracy": 0.7987211719155312, "num_tokens": 5085097312.0, "step": 2445 }, { "epoch": 2.4210006176652255, "grad_norm": 0.1417674571275711, "learning_rate": 5.0432279123098284e-06, "loss": 0.624, "mean_token_accuracy": 0.8060004249215126, "num_tokens": 5095474470.0, "step": 2450 }, { "epoch": 2.425941939468808, "grad_norm": 0.14316117763519287, "learning_rate": 4.960627864320122e-06, "loss": 0.6447, "mean_token_accuracy": 0.8003643557429314, "num_tokens": 5105827653.0, "step": 2455 }, { "epoch": 2.4308832612723905, "grad_norm": 0.14158298075199127, "learning_rate": 4.87863527848188e-06, "loss": 0.6327, "mean_token_accuracy": 0.8037482813000679, "num_tokens": 5116224584.0, "step": 2460 }, { "epoch": 2.4358245830759726, "grad_norm": 0.14278945326805115, "learning_rate": 4.797252640271802e-06, "loss": 0.6325, "mean_token_accuracy": 0.8035951778292656, "num_tokens": 5126633656.0, "step": 2465 }, { "epoch": 2.440765904879555, "grad_norm": 0.1477539986371994, "learning_rate": 4.7164824166769735e-06, "loss": 0.6496, "mean_token_accuracy": 0.7990917310118675, "num_tokens": 5137002575.0, "step": 2470 }, { "epoch": 2.4457072266831377, "grad_norm": 0.1400897353887558, "learning_rate": 4.6363270561201185e-06, "loss": 0.629, "mean_token_accuracy": 0.8048480406403542, "num_tokens": 5147386449.0, "step": 2475 }, { "epoch": 2.45064854848672, "grad_norm": 0.14131523668766022, "learning_rate": 4.556788988385327e-06, "loss": 0.6477, "mean_token_accuracy": 0.799259965121746, "num_tokens": 5157825534.0, "step": 2480 }, { "epoch": 2.4555898702903027, "grad_norm": 0.1450231820344925, "learning_rate": 4.4778706245444475e-06, "loss": 0.6351, "mean_token_accuracy": 0.803033995628357, "num_tokens": 5168222793.0, "step": 2485 }, { "epoch": 2.4605311920938853, "grad_norm": 0.14175313711166382, "learning_rate": 4.399574356883946e-06, "loss": 0.6414, "mean_token_accuracy": 0.8010082706809044, "num_tokens": 5178625053.0, "step": 2490 }, { "epoch": 2.465472513897468, "grad_norm": 0.1430099755525589, "learning_rate": 4.32190255883245e-06, "loss": 0.6361, "mean_token_accuracy": 0.8026632949709892, "num_tokens": 5189044012.0, "step": 2495 }, { "epoch": 2.47041383570105, "grad_norm": 0.14343580603599548, "learning_rate": 4.244857584888748e-06, "loss": 0.6301, "mean_token_accuracy": 0.8043952465057373, "num_tokens": 5199460528.0, "step": 2500 }, { "epoch": 2.4753551575046324, "grad_norm": 0.14571450650691986, "learning_rate": 4.168441770550438e-06, "loss": 0.6353, "mean_token_accuracy": 0.8028825014829636, "num_tokens": 5209861437.0, "step": 2505 }, { "epoch": 2.480296479308215, "grad_norm": 0.13535760343074799, "learning_rate": 4.092657432243144e-06, "loss": 0.6195, "mean_token_accuracy": 0.807587580382824, "num_tokens": 5220231445.0, "step": 2510 }, { "epoch": 2.4852378011117975, "grad_norm": 0.14562451839447021, "learning_rate": 4.0175068672502784e-06, "loss": 0.6306, "mean_token_accuracy": 0.8039803951978683, "num_tokens": 5230632565.0, "step": 2515 }, { "epoch": 2.49017912291538, "grad_norm": 0.14036355912685394, "learning_rate": 3.942992353643415e-06, "loss": 0.6255, "mean_token_accuracy": 0.8056451350450515, "num_tokens": 5241008768.0, "step": 2520 }, { "epoch": 2.495120444718962, "grad_norm": 0.14074988663196564, "learning_rate": 3.869116150213212e-06, "loss": 0.6373, "mean_token_accuracy": 0.8024703413248062, "num_tokens": 5251408473.0, "step": 2525 }, { "epoch": 2.5000617665225446, "grad_norm": 0.1382495015859604, "learning_rate": 3.7958804964009692e-06, "loss": 0.6151, "mean_token_accuracy": 0.8083513364195823, "num_tokens": 5261777092.0, "step": 2530 }, { "epoch": 2.505003088326127, "grad_norm": 0.1404552012681961, "learning_rate": 3.7232876122307165e-06, "loss": 0.6267, "mean_token_accuracy": 0.8052102610468864, "num_tokens": 5272171725.0, "step": 2535 }, { "epoch": 2.5099444101297097, "grad_norm": 0.13998018205165863, "learning_rate": 3.651339698241943e-06, "loss": 0.6342, "mean_token_accuracy": 0.8030717715620994, "num_tokens": 5282583777.0, "step": 2540 }, { "epoch": 2.5148857319332922, "grad_norm": 0.1403067409992218, "learning_rate": 3.5800389354228748e-06, "loss": 0.6311, "mean_token_accuracy": 0.8037037447094917, "num_tokens": 5292979910.0, "step": 2545 }, { "epoch": 2.5198270537368748, "grad_norm": 0.1378946155309677, "learning_rate": 3.5093874851443497e-06, "loss": 0.6392, "mean_token_accuracy": 0.8019730687141419, "num_tokens": 5303396667.0, "step": 2550 }, { "epoch": 2.5247683755404573, "grad_norm": 0.13942013680934906, "learning_rate": 3.4393874890943424e-06, "loss": 0.6419, "mean_token_accuracy": 0.8008860290050507, "num_tokens": 5313787259.0, "step": 2555 }, { "epoch": 2.5297096973440394, "grad_norm": 0.14019039273262024, "learning_rate": 3.3700410692129815e-06, "loss": 0.6458, "mean_token_accuracy": 0.79988262206316, "num_tokens": 5324192735.0, "step": 2560 }, { "epoch": 2.534651019147622, "grad_norm": 0.14086246490478516, "learning_rate": 3.3013503276282805e-06, "loss": 0.621, "mean_token_accuracy": 0.8067845925688744, "num_tokens": 5334573931.0, "step": 2565 }, { "epoch": 2.5395923409512045, "grad_norm": 0.1609017550945282, "learning_rate": 3.233317346592385e-06, "loss": 0.6284, "mean_token_accuracy": 0.8044478759169579, "num_tokens": 5344981148.0, "step": 2570 }, { "epoch": 2.544533662754787, "grad_norm": 0.13504938781261444, "learning_rate": 3.165944188418474e-06, "loss": 0.6328, "mean_token_accuracy": 0.8033879026770592, "num_tokens": 5355403276.0, "step": 2575 }, { "epoch": 2.5494749845583695, "grad_norm": 0.1396746188402176, "learning_rate": 3.099232895418211e-06, "loss": 0.6252, "mean_token_accuracy": 0.8057782351970673, "num_tokens": 5365798505.0, "step": 2580 }, { "epoch": 2.5544163063619516, "grad_norm": 0.1381690502166748, "learning_rate": 3.033185489839857e-06, "loss": 0.6213, "mean_token_accuracy": 0.8066289514303208, "num_tokens": 5376169697.0, "step": 2585 }, { "epoch": 2.559357628165534, "grad_norm": 0.13640715181827545, "learning_rate": 2.9678039738069845e-06, "loss": 0.6259, "mean_token_accuracy": 0.8056413754820824, "num_tokens": 5386549150.0, "step": 2590 }, { "epoch": 2.5642989499691167, "grad_norm": 0.13234998285770416, "learning_rate": 2.903090329257746e-06, "loss": 0.6303, "mean_token_accuracy": 0.8042984798550605, "num_tokens": 5396948060.0, "step": 2595 }, { "epoch": 2.569240271772699, "grad_norm": 0.14382146298885345, "learning_rate": 2.8390465178848304e-06, "loss": 0.6302, "mean_token_accuracy": 0.803765270113945, "num_tokens": 5407352335.0, "step": 2600 }, { "epoch": 2.5741815935762817, "grad_norm": 0.1451369673013687, "learning_rate": 2.7756744810759823e-06, "loss": 0.6359, "mean_token_accuracy": 0.8030797064304351, "num_tokens": 5417765250.0, "step": 2605 }, { "epoch": 2.5791229153798643, "grad_norm": 0.13900776207447052, "learning_rate": 2.7129761398551556e-06, "loss": 0.629, "mean_token_accuracy": 0.8044662460684776, "num_tokens": 5428196382.0, "step": 2610 }, { "epoch": 2.584064237183447, "grad_norm": 0.14031672477722168, "learning_rate": 2.650953394824274e-06, "loss": 0.644, "mean_token_accuracy": 0.8001787707209587, "num_tokens": 5438603443.0, "step": 2615 }, { "epoch": 2.589005558987029, "grad_norm": 0.14090599119663239, "learning_rate": 2.5896081261056138e-06, "loss": 0.6373, "mean_token_accuracy": 0.8019549712538719, "num_tokens": 5449000992.0, "step": 2620 }, { "epoch": 2.5939468807906114, "grad_norm": 0.13333368301391602, "learning_rate": 2.5289421932848336e-06, "loss": 0.6282, "mean_token_accuracy": 0.8051638424396514, "num_tokens": 5459421463.0, "step": 2625 }, { "epoch": 2.598888202594194, "grad_norm": 0.14006465673446655, "learning_rate": 2.468957435354585e-06, "loss": 0.6206, "mean_token_accuracy": 0.8069762364029884, "num_tokens": 5469817660.0, "step": 2630 }, { "epoch": 2.6038295243977765, "grad_norm": 5.3190999031066895, "learning_rate": 2.4096556706587726e-06, "loss": 0.6435, "mean_token_accuracy": 0.8018386244773865, "num_tokens": 5480213057.0, "step": 2635 }, { "epoch": 2.608770846201359, "grad_norm": 0.1389482617378235, "learning_rate": 2.351038696837421e-06, "loss": 0.6406, "mean_token_accuracy": 0.8012316897511482, "num_tokens": 5490632229.0, "step": 2640 }, { "epoch": 2.613712168004941, "grad_norm": 0.1365329474210739, "learning_rate": 2.2931082907722055e-06, "loss": 0.6383, "mean_token_accuracy": 0.8020335495471954, "num_tokens": 5501003433.0, "step": 2645 }, { "epoch": 2.6186534898085236, "grad_norm": 0.13382576406002045, "learning_rate": 2.2358662085325723e-06, "loss": 0.6219, "mean_token_accuracy": 0.8068993985652924, "num_tokens": 5511428459.0, "step": 2650 }, { "epoch": 2.623594811612106, "grad_norm": 0.13468101620674133, "learning_rate": 2.1793141853224978e-06, "loss": 0.6259, "mean_token_accuracy": 0.8055090084671974, "num_tokens": 5521834701.0, "step": 2655 }, { "epoch": 2.6285361334156887, "grad_norm": 0.13747040927410126, "learning_rate": 2.1234539354279214e-06, "loss": 0.6199, "mean_token_accuracy": 0.8072109371423721, "num_tokens": 5532228387.0, "step": 2660 }, { "epoch": 2.6334774552192712, "grad_norm": 0.13710159063339233, "learning_rate": 2.068287152164747e-06, "loss": 0.6351, "mean_token_accuracy": 0.8026650249958038, "num_tokens": 5542623902.0, "step": 2665 }, { "epoch": 2.6384187770228538, "grad_norm": 0.13862751424312592, "learning_rate": 2.0138155078275293e-06, "loss": 0.6341, "mean_token_accuracy": 0.8030014872550965, "num_tokens": 5553026750.0, "step": 2670 }, { "epoch": 2.6433600988264363, "grad_norm": 0.13887560367584229, "learning_rate": 1.96004065363877e-06, "loss": 0.6402, "mean_token_accuracy": 0.8015902519226075, "num_tokens": 5563408728.0, "step": 2675 }, { "epoch": 2.6483014206300184, "grad_norm": 0.13534307479858398, "learning_rate": 1.9069642196988757e-06, "loss": 0.6332, "mean_token_accuracy": 0.8032783582806587, "num_tokens": 5573792880.0, "step": 2680 }, { "epoch": 2.653242742433601, "grad_norm": 0.13488322496414185, "learning_rate": 1.8545878149367285e-06, "loss": 0.6193, "mean_token_accuracy": 0.80726078748703, "num_tokens": 5584202613.0, "step": 2685 }, { "epoch": 2.6581840642371835, "grad_norm": 0.13911563158035278, "learning_rate": 1.80291302706094e-06, "loss": 0.634, "mean_token_accuracy": 0.8030529797077179, "num_tokens": 5594573558.0, "step": 2690 }, { "epoch": 2.663125386040766, "grad_norm": 0.13639000058174133, "learning_rate": 1.7519414225116937e-06, "loss": 0.6274, "mean_token_accuracy": 0.8051569849252701, "num_tokens": 5604952424.0, "step": 2695 }, { "epoch": 2.6680667078443485, "grad_norm": 0.14200198650360107, "learning_rate": 1.7016745464132732e-06, "loss": 0.6306, "mean_token_accuracy": 0.8043223142623901, "num_tokens": 5615322036.0, "step": 2700 }, { "epoch": 2.6730080296479306, "grad_norm": 0.13950730860233307, "learning_rate": 1.6521139225272292e-06, "loss": 0.638, "mean_token_accuracy": 0.8019472226500511, "num_tokens": 5625725163.0, "step": 2705 }, { "epoch": 2.677949351451513, "grad_norm": 0.13591845333576202, "learning_rate": 1.603261053206176e-06, "loss": 0.631, "mean_token_accuracy": 0.8039070650935173, "num_tokens": 5636090693.0, "step": 2710 }, { "epoch": 2.6828906732550957, "grad_norm": 0.13798172771930695, "learning_rate": 1.5551174193482677e-06, "loss": 0.6335, "mean_token_accuracy": 0.8032683923840522, "num_tokens": 5646434352.0, "step": 2715 }, { "epoch": 2.687831995058678, "grad_norm": 0.1333172768354416, "learning_rate": 1.5076844803522922e-06, "loss": 0.6275, "mean_token_accuracy": 0.8049399971961975, "num_tokens": 5656826072.0, "step": 2720 }, { "epoch": 2.6927733168622607, "grad_norm": 0.13734152913093567, "learning_rate": 1.4609636740734316e-06, "loss": 0.6315, "mean_token_accuracy": 0.803749541938305, "num_tokens": 5667245386.0, "step": 2725 }, { "epoch": 2.6977146386658433, "grad_norm": 0.13476236164569855, "learning_rate": 1.414956416779692e-06, "loss": 0.6424, "mean_token_accuracy": 0.8015404507517815, "num_tokens": 5677647700.0, "step": 2730 }, { "epoch": 2.702655960469426, "grad_norm": 0.1344069540500641, "learning_rate": 1.3696641031089501e-06, "loss": 0.6309, "mean_token_accuracy": 0.8040262326598168, "num_tokens": 5688057469.0, "step": 2735 }, { "epoch": 2.707597282273008, "grad_norm": 0.1328783482313156, "learning_rate": 1.3250881060266952e-06, "loss": 0.6335, "mean_token_accuracy": 0.8033808618783951, "num_tokens": 5698464888.0, "step": 2740 }, { "epoch": 2.7125386040765904, "grad_norm": 0.14123857021331787, "learning_rate": 1.2812297767843956e-06, "loss": 0.6319, "mean_token_accuracy": 0.8036202192306519, "num_tokens": 5708857391.0, "step": 2745 }, { "epoch": 2.717479925880173, "grad_norm": 0.1333305686712265, "learning_rate": 1.2380904448785507e-06, "loss": 0.6284, "mean_token_accuracy": 0.8048888012766838, "num_tokens": 5719273038.0, "step": 2750 }, { "epoch": 2.7224212476837555, "grad_norm": 0.13743609189987183, "learning_rate": 1.19567141801038e-06, "loss": 0.6367, "mean_token_accuracy": 0.802096837759018, "num_tokens": 5729672047.0, "step": 2755 }, { "epoch": 2.727362569487338, "grad_norm": 0.13438722491264343, "learning_rate": 1.1539739820461804e-06, "loss": 0.6378, "mean_token_accuracy": 0.8020210683345794, "num_tokens": 5740065329.0, "step": 2760 }, { "epoch": 2.73230389129092, "grad_norm": 0.13518624007701874, "learning_rate": 1.1129994009783624e-06, "loss": 0.6281, "mean_token_accuracy": 0.8048671677708625, "num_tokens": 5750466926.0, "step": 2765 }, { "epoch": 2.7372452130945026, "grad_norm": 0.13526305556297302, "learning_rate": 1.0727489168871092e-06, "loss": 0.6384, "mean_token_accuracy": 0.801820358633995, "num_tokens": 5760840649.0, "step": 2770 }, { "epoch": 2.742186534898085, "grad_norm": 0.134367436170578, "learning_rate": 1.0332237499027508e-06, "loss": 0.6319, "mean_token_accuracy": 0.8037395715713501, "num_tokens": 5771252260.0, "step": 2775 }, { "epoch": 2.7471278567016677, "grad_norm": 0.13103972375392914, "learning_rate": 9.944250981687664e-07, "loss": 0.6185, "mean_token_accuracy": 0.8079636707901955, "num_tokens": 5781659388.0, "step": 2780 }, { "epoch": 2.7520691785052502, "grad_norm": 0.13419899344444275, "learning_rate": 9.56354137805457e-07, "loss": 0.639, "mean_token_accuracy": 0.801702830195427, "num_tokens": 5792085860.0, "step": 2785 }, { "epoch": 2.7570105003088328, "grad_norm": 0.1617174595594406, "learning_rate": 9.190120228743049e-07, "loss": 0.6373, "mean_token_accuracy": 0.8023781910538673, "num_tokens": 5802520289.0, "step": 2790 }, { "epoch": 2.7619518221124153, "grad_norm": 0.13774944841861725, "learning_rate": 8.823998853429799e-07, "loss": 0.635, "mean_token_accuracy": 0.803001980483532, "num_tokens": 5812910560.0, "step": 2795 }, { "epoch": 2.7668931439159974, "grad_norm": 0.1353764832019806, "learning_rate": 8.465188350510411e-07, "loss": 0.6489, "mean_token_accuracy": 0.803003454208374, "num_tokens": 5823303672.0, "step": 2800 }, { "epoch": 2.77183446571958, "grad_norm": 0.1379646211862564, "learning_rate": 8.11369959676278e-07, "loss": 0.6318, "mean_token_accuracy": 0.803684464097023, "num_tokens": 5833728533.0, "step": 2805 }, { "epoch": 2.7767757875231625, "grad_norm": 0.1382586508989334, "learning_rate": 7.769543247017452e-07, "loss": 0.6409, "mean_token_accuracy": 0.8013983756303787, "num_tokens": 5844099051.0, "step": 2810 }, { "epoch": 2.781717109326745, "grad_norm": 0.1347196102142334, "learning_rate": 7.432729733834631e-07, "loss": 0.6365, "mean_token_accuracy": 0.8025758102536201, "num_tokens": 5854513394.0, "step": 2815 }, { "epoch": 2.7866584311303275, "grad_norm": 0.13394369184970856, "learning_rate": 7.103269267188045e-07, "loss": 0.6409, "mean_token_accuracy": 0.8010724946856499, "num_tokens": 5864893930.0, "step": 2820 }, { "epoch": 2.7915997529339096, "grad_norm": 0.13620924949645996, "learning_rate": 6.781171834155164e-07, "loss": 0.6321, "mean_token_accuracy": 0.8042621850967407, "num_tokens": 5875301007.0, "step": 2825 }, { "epoch": 2.796541074737492, "grad_norm": 0.13298700749874115, "learning_rate": 6.466447198614806e-07, "loss": 0.6386, "mean_token_accuracy": 0.8018185943365097, "num_tokens": 5885713081.0, "step": 2830 }, { "epoch": 2.8014823965410747, "grad_norm": 0.13537032902240753, "learning_rate": 6.15910490095084e-07, "loss": 0.6311, "mean_token_accuracy": 0.80390265583992, "num_tokens": 5896120892.0, "step": 2835 }, { "epoch": 2.806423718344657, "grad_norm": 0.13865377008914948, "learning_rate": 5.85915425776326e-07, "loss": 0.6385, "mean_token_accuracy": 0.8017645820975303, "num_tokens": 5906521523.0, "step": 2840 }, { "epoch": 2.8113650401482397, "grad_norm": 0.133405864238739, "learning_rate": 5.566604361585626e-07, "loss": 0.631, "mean_token_accuracy": 0.8040564343333244, "num_tokens": 5916885991.0, "step": 2845 }, { "epoch": 2.8163063619518223, "grad_norm": 1.976336121559143, "learning_rate": 5.281464080609338e-07, "loss": 0.6337, "mean_token_accuracy": 0.8037149354815483, "num_tokens": 5927294150.0, "step": 2850 }, { "epoch": 2.821247683755405, "grad_norm": 0.13748565316200256, "learning_rate": 5.003742058415112e-07, "loss": 0.6276, "mean_token_accuracy": 0.8046556517481804, "num_tokens": 5937696658.0, "step": 2855 }, { "epoch": 2.826189005558987, "grad_norm": 0.13383837044239044, "learning_rate": 4.7334467137105933e-07, "loss": 0.6298, "mean_token_accuracy": 0.8045536518096924, "num_tokens": 5948099499.0, "step": 2860 }, { "epoch": 2.8311303273625694, "grad_norm": 0.13354896008968353, "learning_rate": 4.470586240075486e-07, "loss": 0.6492, "mean_token_accuracy": 0.7988908976316452, "num_tokens": 5958486114.0, "step": 2865 }, { "epoch": 2.836071649166152, "grad_norm": 0.13508053123950958, "learning_rate": 4.2151686057129156e-07, "loss": 0.6349, "mean_token_accuracy": 0.8028511360287667, "num_tokens": 5968885481.0, "step": 2870 }, { "epoch": 2.8410129709697345, "grad_norm": 0.13615567982196808, "learning_rate": 3.967201553208122e-07, "loss": 0.6493, "mean_token_accuracy": 0.7993659228086472, "num_tokens": 5979261107.0, "step": 2875 }, { "epoch": 2.845954292773317, "grad_norm": 0.13443627953529358, "learning_rate": 3.726692599293563e-07, "loss": 0.6226, "mean_token_accuracy": 0.8065914869308471, "num_tokens": 5989695075.0, "step": 2880 }, { "epoch": 2.850895614576899, "grad_norm": 0.13318121433258057, "learning_rate": 3.4936490346210713e-07, "loss": 0.6277, "mean_token_accuracy": 0.8050418853759765, "num_tokens": 6000080540.0, "step": 2885 }, { "epoch": 2.8558369363804816, "grad_norm": 0.13402459025382996, "learning_rate": 3.268077923541085e-07, "loss": 0.6278, "mean_token_accuracy": 0.8049208298325539, "num_tokens": 6010495720.0, "step": 2890 }, { "epoch": 2.860778258184064, "grad_norm": 0.13390561938285828, "learning_rate": 3.049986103888125e-07, "loss": 0.6264, "mean_token_accuracy": 0.8052194505929947, "num_tokens": 6020898745.0, "step": 2895 }, { "epoch": 2.8657195799876467, "grad_norm": 0.1322164535522461, "learning_rate": 2.8393801867738765e-07, "loss": 0.6317, "mean_token_accuracy": 0.8038971364498139, "num_tokens": 6031294064.0, "step": 2900 }, { "epoch": 2.8706609017912292, "grad_norm": 0.13253232836723328, "learning_rate": 2.636266556386546e-07, "loss": 0.6375, "mean_token_accuracy": 0.8022791400551796, "num_tokens": 6041694003.0, "step": 2905 }, { "epoch": 2.8756022235948118, "grad_norm": 0.13198673725128174, "learning_rate": 2.440651369797375e-07, "loss": 0.6345, "mean_token_accuracy": 0.80296840518713, "num_tokens": 6052100553.0, "step": 2910 }, { "epoch": 2.8805435453983943, "grad_norm": 0.13583111763000488, "learning_rate": 2.252540556774152e-07, "loss": 0.6376, "mean_token_accuracy": 0.8022321462631226, "num_tokens": 6062507192.0, "step": 2915 }, { "epoch": 2.8854848672019764, "grad_norm": 0.13603807985782623, "learning_rate": 2.0719398196012707e-07, "loss": 0.6416, "mean_token_accuracy": 0.8010483682155609, "num_tokens": 6072897644.0, "step": 2920 }, { "epoch": 2.890426189005559, "grad_norm": 0.13156232237815857, "learning_rate": 1.8988546329069268e-07, "loss": 0.6341, "mean_token_accuracy": 0.8029652521014213, "num_tokens": 6083301245.0, "step": 2925 }, { "epoch": 2.8953675108091415, "grad_norm": 0.13426810503005981, "learning_rate": 1.733290243497221e-07, "loss": 0.6384, "mean_token_accuracy": 0.8017426192760467, "num_tokens": 6093714163.0, "step": 2930 }, { "epoch": 2.900308832612724, "grad_norm": 0.13337218761444092, "learning_rate": 1.57525167019712e-07, "loss": 0.6338, "mean_token_accuracy": 0.8033397659659386, "num_tokens": 6104139692.0, "step": 2935 }, { "epoch": 2.9052501544163065, "grad_norm": 0.13580240309238434, "learning_rate": 1.4247437036981615e-07, "loss": 0.6393, "mean_token_accuracy": 0.8013502344489097, "num_tokens": 6114557025.0, "step": 2940 }, { "epoch": 2.9101914762198886, "grad_norm": 0.13431790471076965, "learning_rate": 1.281770906413432e-07, "loss": 0.6332, "mean_token_accuracy": 0.8035564810037613, "num_tokens": 6124910557.0, "step": 2945 }, { "epoch": 2.915132798023471, "grad_norm": 0.13268694281578064, "learning_rate": 1.1463376123391766e-07, "loss": 0.6379, "mean_token_accuracy": 0.802278782427311, "num_tokens": 6135277257.0, "step": 2950 }, { "epoch": 2.9200741198270537, "grad_norm": 0.13012386858463287, "learning_rate": 1.0184479269233216e-07, "loss": 0.6264, "mean_token_accuracy": 0.8052372932434082, "num_tokens": 6145692699.0, "step": 2955 }, { "epoch": 2.925015441630636, "grad_norm": 0.13214385509490967, "learning_rate": 8.981057269412674e-08, "loss": 0.6298, "mean_token_accuracy": 0.8040620595216751, "num_tokens": 6156075039.0, "step": 2960 }, { "epoch": 2.9299567634342187, "grad_norm": 0.3513474464416504, "learning_rate": 7.853146603780947e-08, "loss": 0.6322, "mean_token_accuracy": 0.8042141646146774, "num_tokens": 6166508847.0, "step": 2965 }, { "epoch": 2.9348980852378013, "grad_norm": 0.1361996829509735, "learning_rate": 6.800781463182082e-08, "loss": 0.6387, "mean_token_accuracy": 0.8021485671401024, "num_tokens": 6176898601.0, "step": 2970 }, { "epoch": 2.939839407041384, "grad_norm": 0.13293121755123138, "learning_rate": 5.8239937484155794e-08, "loss": 0.6285, "mean_token_accuracy": 0.8048635452985764, "num_tokens": 6187294229.0, "step": 2975 }, { "epoch": 2.944780728844966, "grad_norm": 0.13646461069583893, "learning_rate": 4.922813069269394e-08, "loss": 0.6374, "mean_token_accuracy": 0.802308914065361, "num_tokens": 6197703524.0, "step": 2980 }, { "epoch": 2.9497220506485484, "grad_norm": 0.13153176009655, "learning_rate": 4.097266743623151e-08, "loss": 0.6302, "mean_token_accuracy": 0.8041860401630402, "num_tokens": 6208108876.0, "step": 2985 }, { "epoch": 2.954663372452131, "grad_norm": 0.1305973380804062, "learning_rate": 3.3473797966199204e-08, "loss": 0.6168, "mean_token_accuracy": 0.8084435001015663, "num_tokens": 6218509795.0, "step": 2990 }, { "epoch": 2.9596046942557135, "grad_norm": 0.12855064868927002, "learning_rate": 2.6731749599065435e-08, "loss": 0.6366, "mean_token_accuracy": 0.8023294299840927, "num_tokens": 6228937505.0, "step": 2995 }, { "epoch": 2.964546016059296, "grad_norm": 0.13430272042751312, "learning_rate": 2.0746726709461316e-08, "loss": 0.6258, "mean_token_accuracy": 0.8053408190608025, "num_tokens": 6239277445.0, "step": 3000 }, { "epoch": 2.969487337862878, "grad_norm": 0.1332690417766571, "learning_rate": 1.5518910723966163e-08, "loss": 0.6336, "mean_token_accuracy": 0.8029686883091927, "num_tokens": 6249696839.0, "step": 3005 }, { "epoch": 2.9744286596664606, "grad_norm": 0.1333792507648468, "learning_rate": 1.1048460115634096e-08, "loss": 0.6361, "mean_token_accuracy": 0.8027278065681458, "num_tokens": 6260105370.0, "step": 3010 }, { "epoch": 2.979369981470043, "grad_norm": 0.13540172576904297, "learning_rate": 7.335510399161804e-09, "loss": 0.637, "mean_token_accuracy": 0.8022717341780663, "num_tokens": 6270523530.0, "step": 3015 }, { "epoch": 2.9843113032736257, "grad_norm": 0.131788432598114, "learning_rate": 4.380174126802916e-09, "loss": 0.6301, "mean_token_accuracy": 0.8042985036969185, "num_tokens": 6280889193.0, "step": 3020 }, { "epoch": 2.9892526250772082, "grad_norm": 0.13343243300914764, "learning_rate": 2.1825408849401873e-09, "loss": 0.6393, "mean_token_accuracy": 0.8015558466315269, "num_tokens": 6291299316.0, "step": 3025 }, { "epoch": 2.9941939468807908, "grad_norm": 0.13485004007816315, "learning_rate": 7.4267729138211e-10, "loss": 0.6356, "mean_token_accuracy": 0.8027144998311997, "num_tokens": 6301702825.0, "step": 3030 }, { "epoch": 2.9991352686843733, "grad_norm": 0.1348290592432022, "learning_rate": 6.062699333675425e-11, "loss": 0.6232, "mean_token_accuracy": 0.806241363286972, "num_tokens": 6312077683.0, "step": 3035 } ], "logging_steps": 5, "max_steps": 3036, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 320, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6636055544172904e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }